{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.487747048960998, "eval_steps": 87, "global_step": 90000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000541941165512219, "grad_norm": 18.66890525817871, "learning_rate": 3.3333333333333335e-07, "loss": 7.3063, "step": 10 }, { "epoch": 0.001083882331024438, "grad_norm": 14.986076354980469, "learning_rate": 6.666666666666667e-07, "loss": 6.602, "step": 20 }, { "epoch": 0.0016258234965366573, "grad_norm": 7.8340582847595215, "learning_rate": 1.0000000000000002e-06, "loss": 5.4589, "step": 30 }, { "epoch": 0.002167764662048876, "grad_norm": 3.6684656143188477, "learning_rate": 1.3333333333333334e-06, "loss": 4.4819, "step": 40 }, { "epoch": 0.0027097058275610954, "grad_norm": 1.7751961946487427, "learning_rate": 1.6666666666666667e-06, "loss": 3.8516, "step": 50 }, { "epoch": 0.0032516469930733145, "grad_norm": 0.8667094707489014, "learning_rate": 2.0000000000000003e-06, "loss": 3.471, "step": 60 }, { "epoch": 0.0037935881585855337, "grad_norm": 0.48009157180786133, "learning_rate": 2.3333333333333336e-06, "loss": 3.2603, "step": 70 }, { "epoch": 0.004335529324097752, "grad_norm": 0.31772586703300476, "learning_rate": 2.666666666666667e-06, "loss": 3.1565, "step": 80 }, { "epoch": 0.004714888139956306, "eval_loss": 3.0095248222351074, "eval_runtime": 22.3757, "eval_samples_per_second": 223.456, "eval_steps_per_second": 1.207, "step": 87 }, { "epoch": 0.0048774704896099716, "grad_norm": 0.22845230996608734, "learning_rate": 3e-06, "loss": 3.0993, "step": 90 }, { "epoch": 0.005419411655122191, "grad_norm": 0.1711948961019516, "learning_rate": 3.3333333333333333e-06, "loss": 3.067, "step": 100 }, { "epoch": 0.00596135282063441, "grad_norm": 0.24801534414291382, "learning_rate": 3.666666666666667e-06, "loss": 3.0429, "step": 110 }, { "epoch": 0.006503293986146629, "grad_norm": 0.1762855052947998, "learning_rate": 4.000000000000001e-06, "loss": 3.0244, "step": 120 }, { "epoch": 0.007045235151658848, "grad_norm": 0.20275895297527313, "learning_rate": 4.333333333333334e-06, "loss": 3.0101, "step": 130 }, { "epoch": 0.007587176317171067, "grad_norm": 0.20807678997516632, "learning_rate": 4.666666666666667e-06, "loss": 2.9962, "step": 140 }, { "epoch": 0.008129117482683286, "grad_norm": 0.21902145445346832, "learning_rate": 5e-06, "loss": 2.9879, "step": 150 }, { "epoch": 0.008671058648195505, "grad_norm": 0.2640894055366516, "learning_rate": 5.333333333333334e-06, "loss": 2.9807, "step": 160 }, { "epoch": 0.009212999813707724, "grad_norm": 0.2247193157672882, "learning_rate": 5.666666666666667e-06, "loss": 2.9764, "step": 170 }, { "epoch": 0.009429776279912611, "eval_loss": 2.909708023071289, "eval_runtime": 21.9944, "eval_samples_per_second": 227.331, "eval_steps_per_second": 1.228, "step": 174 }, { "epoch": 0.009754940979219943, "grad_norm": 0.1678290069103241, "learning_rate": 6e-06, "loss": 2.9725, "step": 180 }, { "epoch": 0.010296882144732162, "grad_norm": 0.16639265418052673, "learning_rate": 6.333333333333334e-06, "loss": 2.9687, "step": 190 }, { "epoch": 0.010838823310244381, "grad_norm": 0.16535356640815735, "learning_rate": 6.666666666666667e-06, "loss": 2.968, "step": 200 }, { "epoch": 0.0113807644757566, "grad_norm": 0.12626643478870392, "learning_rate": 7.000000000000001e-06, "loss": 2.9656, "step": 210 }, { "epoch": 0.01192270564126882, "grad_norm": 0.16154739260673523, "learning_rate": 7.333333333333334e-06, "loss": 2.9658, "step": 220 }, { "epoch": 0.012464646806781039, "grad_norm": 0.10952357947826385, "learning_rate": 7.666666666666667e-06, "loss": 2.9626, "step": 230 }, { "epoch": 0.013006587972293258, "grad_norm": 0.13569523394107819, "learning_rate": 8.000000000000001e-06, "loss": 2.9611, "step": 240 }, { "epoch": 0.013548529137805477, "grad_norm": 0.18444740772247314, "learning_rate": 8.333333333333334e-06, "loss": 2.9599, "step": 250 }, { "epoch": 0.014090470303317696, "grad_norm": 0.1738782823085785, "learning_rate": 8.666666666666668e-06, "loss": 2.9588, "step": 260 }, { "epoch": 0.014144664419868919, "eval_loss": 2.9037859439849854, "eval_runtime": 21.9954, "eval_samples_per_second": 227.321, "eval_steps_per_second": 1.228, "step": 261 }, { "epoch": 0.014632411468829916, "grad_norm": 0.1825179159641266, "learning_rate": 9e-06, "loss": 2.9562, "step": 270 }, { "epoch": 0.015174352634342135, "grad_norm": 0.1394873708486557, "learning_rate": 9.333333333333334e-06, "loss": 2.9553, "step": 280 }, { "epoch": 0.015716293799854352, "grad_norm": 0.11560714244842529, "learning_rate": 9.666666666666667e-06, "loss": 2.9563, "step": 290 }, { "epoch": 0.01625823496536657, "grad_norm": 0.16794171929359436, "learning_rate": 1e-05, "loss": 2.9527, "step": 300 }, { "epoch": 0.01680017613087879, "grad_norm": 0.11284048855304718, "learning_rate": 1.0333333333333333e-05, "loss": 2.9529, "step": 310 }, { "epoch": 0.01734211729639101, "grad_norm": 0.12263070046901703, "learning_rate": 1.0666666666666667e-05, "loss": 2.9521, "step": 320 }, { "epoch": 0.01788405846190323, "grad_norm": 0.22584529221057892, "learning_rate": 1.1000000000000001e-05, "loss": 2.9527, "step": 330 }, { "epoch": 0.018425999627415448, "grad_norm": 0.15804457664489746, "learning_rate": 1.1333333333333334e-05, "loss": 2.9509, "step": 340 }, { "epoch": 0.018859552559825223, "eval_loss": 2.8973894119262695, "eval_runtime": 21.9951, "eval_samples_per_second": 227.324, "eval_steps_per_second": 1.228, "step": 348 }, { "epoch": 0.018967940792927667, "grad_norm": 0.20428232848644257, "learning_rate": 1.1666666666666668e-05, "loss": 2.9489, "step": 350 }, { "epoch": 0.019509881958439886, "grad_norm": 0.18693846464157104, "learning_rate": 1.2e-05, "loss": 2.95, "step": 360 }, { "epoch": 0.020051823123952105, "grad_norm": 0.113795205950737, "learning_rate": 1.2333333333333334e-05, "loss": 2.9469, "step": 370 }, { "epoch": 0.020593764289464325, "grad_norm": 0.18272531032562256, "learning_rate": 1.2666666666666668e-05, "loss": 2.9474, "step": 380 }, { "epoch": 0.021135705454976544, "grad_norm": 0.14928747713565826, "learning_rate": 1.3000000000000001e-05, "loss": 2.9478, "step": 390 }, { "epoch": 0.021677646620488763, "grad_norm": 0.28558146953582764, "learning_rate": 1.3333333333333333e-05, "loss": 2.9453, "step": 400 }, { "epoch": 0.022219587786000982, "grad_norm": 0.2676473557949066, "learning_rate": 1.3666666666666666e-05, "loss": 2.9447, "step": 410 }, { "epoch": 0.0227615289515132, "grad_norm": 0.18305936455726624, "learning_rate": 1.4000000000000001e-05, "loss": 2.9433, "step": 420 }, { "epoch": 0.02330347011702542, "grad_norm": 0.21853986382484436, "learning_rate": 1.4333333333333334e-05, "loss": 2.9438, "step": 430 }, { "epoch": 0.02357444069978153, "eval_loss": 2.8930819034576416, "eval_runtime": 21.9945, "eval_samples_per_second": 227.33, "eval_steps_per_second": 1.228, "step": 435 }, { "epoch": 0.02384541128253764, "grad_norm": 0.218394935131073, "learning_rate": 1.4666666666666668e-05, "loss": 2.942, "step": 440 }, { "epoch": 0.02438735244804986, "grad_norm": 0.15290504693984985, "learning_rate": 1.5e-05, "loss": 2.9412, "step": 450 }, { "epoch": 0.024929293613562078, "grad_norm": 0.2329382598400116, "learning_rate": 1.5333333333333334e-05, "loss": 2.9393, "step": 460 }, { "epoch": 0.025471234779074297, "grad_norm": 0.2563980519771576, "learning_rate": 1.5666666666666667e-05, "loss": 2.9404, "step": 470 }, { "epoch": 0.026013175944586516, "grad_norm": 0.18090900778770447, "learning_rate": 1.6000000000000003e-05, "loss": 2.9384, "step": 480 }, { "epoch": 0.026555117110098735, "grad_norm": 0.10313646495342255, "learning_rate": 1.6333333333333335e-05, "loss": 2.9383, "step": 490 }, { "epoch": 0.027097058275610954, "grad_norm": 0.127348393201828, "learning_rate": 1.6666666666666667e-05, "loss": 2.9374, "step": 500 }, { "epoch": 0.027638999441123174, "grad_norm": 0.39012327790260315, "learning_rate": 1.7000000000000003e-05, "loss": 2.9369, "step": 510 }, { "epoch": 0.028180940606635393, "grad_norm": 0.26286575198173523, "learning_rate": 1.7333333333333336e-05, "loss": 2.9365, "step": 520 }, { "epoch": 0.028289328839737837, "eval_loss": 2.885981798171997, "eval_runtime": 35.0266, "eval_samples_per_second": 142.749, "eval_steps_per_second": 0.771, "step": 522 }, { "epoch": 0.028722881772147612, "grad_norm": 0.34882956743240356, "learning_rate": 1.7666666666666668e-05, "loss": 2.9374, "step": 530 }, { "epoch": 0.02926482293765983, "grad_norm": 0.2220120131969452, "learning_rate": 1.8e-05, "loss": 2.934, "step": 540 }, { "epoch": 0.02980676410317205, "grad_norm": 0.11112108826637268, "learning_rate": 1.8333333333333333e-05, "loss": 2.9332, "step": 550 }, { "epoch": 0.03034870526868427, "grad_norm": 0.1723136603832245, "learning_rate": 1.866666666666667e-05, "loss": 2.9332, "step": 560 }, { "epoch": 0.03089064643419649, "grad_norm": 0.1271258443593979, "learning_rate": 1.9e-05, "loss": 2.9326, "step": 570 }, { "epoch": 0.031432587599708704, "grad_norm": 0.2465275079011917, "learning_rate": 1.9333333333333333e-05, "loss": 2.9304, "step": 580 }, { "epoch": 0.03197452876522092, "grad_norm": 0.24456460773944855, "learning_rate": 1.9666666666666666e-05, "loss": 2.9304, "step": 590 }, { "epoch": 0.03251646993073314, "grad_norm": 0.18503615260124207, "learning_rate": 2e-05, "loss": 2.9272, "step": 600 }, { "epoch": 0.03300421697969414, "eval_loss": 2.8818912506103516, "eval_runtime": 21.9944, "eval_samples_per_second": 227.331, "eval_steps_per_second": 1.228, "step": 609 }, { "epoch": 0.03305841109624536, "grad_norm": 0.18611077964305878, "learning_rate": 2.0333333333333334e-05, "loss": 2.9287, "step": 610 }, { "epoch": 0.03360035226175758, "grad_norm": 0.2303771823644638, "learning_rate": 2.0666666666666666e-05, "loss": 2.9272, "step": 620 }, { "epoch": 0.0341422934272698, "grad_norm": 0.12826542556285858, "learning_rate": 2.1e-05, "loss": 2.926, "step": 630 }, { "epoch": 0.03468423459278202, "grad_norm": 0.18119564652442932, "learning_rate": 2.1333333333333335e-05, "loss": 2.9259, "step": 640 }, { "epoch": 0.03522617575829424, "grad_norm": 0.17282642424106598, "learning_rate": 2.1666666666666667e-05, "loss": 2.9264, "step": 650 }, { "epoch": 0.03576811692380646, "grad_norm": 0.18431398272514343, "learning_rate": 2.2000000000000003e-05, "loss": 2.9243, "step": 660 }, { "epoch": 0.03631005808931868, "grad_norm": 0.2258967012166977, "learning_rate": 2.2333333333333335e-05, "loss": 2.9244, "step": 670 }, { "epoch": 0.036851999254830896, "grad_norm": 0.14141803979873657, "learning_rate": 2.2666666666666668e-05, "loss": 2.9226, "step": 680 }, { "epoch": 0.037393940420343115, "grad_norm": 0.3774568736553192, "learning_rate": 2.3000000000000003e-05, "loss": 2.9226, "step": 690 }, { "epoch": 0.037719105119650445, "eval_loss": 2.8792285919189453, "eval_runtime": 21.9939, "eval_samples_per_second": 227.336, "eval_steps_per_second": 1.228, "step": 696 }, { "epoch": 0.037935881585855334, "grad_norm": 0.2437097430229187, "learning_rate": 2.3333333333333336e-05, "loss": 2.9214, "step": 700 }, { "epoch": 0.03847782275136755, "grad_norm": 0.18445032835006714, "learning_rate": 2.3666666666666668e-05, "loss": 2.9201, "step": 710 }, { "epoch": 0.03901976391687977, "grad_norm": 0.12405350804328918, "learning_rate": 2.4e-05, "loss": 2.919, "step": 720 }, { "epoch": 0.03956170508239199, "grad_norm": 0.14779530465602875, "learning_rate": 2.4333333333333336e-05, "loss": 2.919, "step": 730 }, { "epoch": 0.04010364624790421, "grad_norm": 0.16767142713069916, "learning_rate": 2.466666666666667e-05, "loss": 2.9153, "step": 740 }, { "epoch": 0.04064558741341643, "grad_norm": 0.22561559081077576, "learning_rate": 2.5e-05, "loss": 2.916, "step": 750 }, { "epoch": 0.04118752857892865, "grad_norm": 0.1435348242521286, "learning_rate": 2.5333333333333337e-05, "loss": 2.9134, "step": 760 }, { "epoch": 0.04172946974444087, "grad_norm": 0.25675374269485474, "learning_rate": 2.5666666666666666e-05, "loss": 2.9133, "step": 770 }, { "epoch": 0.04227141090995309, "grad_norm": 0.22575390338897705, "learning_rate": 2.6000000000000002e-05, "loss": 2.9124, "step": 780 }, { "epoch": 0.042433993259606756, "eval_loss": 2.874786853790283, "eval_runtime": 21.9951, "eval_samples_per_second": 227.324, "eval_steps_per_second": 1.228, "step": 783 }, { "epoch": 0.04281335207546531, "grad_norm": 0.17684337496757507, "learning_rate": 2.633333333333333e-05, "loss": 2.9118, "step": 790 }, { "epoch": 0.043355293240977526, "grad_norm": 0.27737951278686523, "learning_rate": 2.6666666666666667e-05, "loss": 2.9098, "step": 800 }, { "epoch": 0.043897234406489745, "grad_norm": 0.28385818004608154, "learning_rate": 2.7000000000000002e-05, "loss": 2.9101, "step": 810 }, { "epoch": 0.044439175572001964, "grad_norm": 0.21399657428264618, "learning_rate": 2.733333333333333e-05, "loss": 2.909, "step": 820 }, { "epoch": 0.04498111673751418, "grad_norm": 0.20526191592216492, "learning_rate": 2.7666666666666667e-05, "loss": 2.906, "step": 830 }, { "epoch": 0.0455230579030264, "grad_norm": 0.3085150420665741, "learning_rate": 2.8000000000000003e-05, "loss": 2.9058, "step": 840 }, { "epoch": 0.04606499906853862, "grad_norm": 0.18636156618595123, "learning_rate": 2.8333333333333335e-05, "loss": 2.9055, "step": 850 }, { "epoch": 0.04660694023405084, "grad_norm": 0.23989151418209076, "learning_rate": 2.8666666666666668e-05, "loss": 2.9046, "step": 860 }, { "epoch": 0.04714888139956306, "grad_norm": 0.23799440264701843, "learning_rate": 2.9e-05, "loss": 2.9056, "step": 870 }, { "epoch": 0.04714888139956306, "eval_loss": 2.871436595916748, "eval_runtime": 21.8935, "eval_samples_per_second": 228.379, "eval_steps_per_second": 1.233, "step": 870 }, { "epoch": 0.04769082256507528, "grad_norm": 0.1619342863559723, "learning_rate": 2.9333333333333336e-05, "loss": 2.9041, "step": 880 }, { "epoch": 0.0482327637305875, "grad_norm": 0.28320956230163574, "learning_rate": 2.9666666666666672e-05, "loss": 2.9025, "step": 890 }, { "epoch": 0.04877470489609972, "grad_norm": 0.18892525136470795, "learning_rate": 3e-05, "loss": 2.8997, "step": 900 }, { "epoch": 0.049316646061611936, "grad_norm": 0.29564598202705383, "learning_rate": 3.0333333333333337e-05, "loss": 2.9001, "step": 910 }, { "epoch": 0.049858587227124156, "grad_norm": 0.24769143760204315, "learning_rate": 3.066666666666667e-05, "loss": 2.8987, "step": 920 }, { "epoch": 0.050400528392636375, "grad_norm": 0.21904461085796356, "learning_rate": 3.1e-05, "loss": 2.8968, "step": 930 }, { "epoch": 0.050942469558148594, "grad_norm": 0.29892289638519287, "learning_rate": 3.1333333333333334e-05, "loss": 2.8972, "step": 940 }, { "epoch": 0.05148441072366081, "grad_norm": 0.20083844661712646, "learning_rate": 3.1666666666666666e-05, "loss": 2.8951, "step": 950 }, { "epoch": 0.051863769539519364, "eval_loss": 2.865743398666382, "eval_runtime": 21.7153, "eval_samples_per_second": 230.253, "eval_steps_per_second": 1.243, "step": 957 }, { "epoch": 0.05202635188917303, "grad_norm": 0.26588037610054016, "learning_rate": 3.2000000000000005e-05, "loss": 2.8932, "step": 960 }, { "epoch": 0.05256829305468525, "grad_norm": 0.23635846376419067, "learning_rate": 3.233333333333333e-05, "loss": 2.8911, "step": 970 }, { "epoch": 0.05311023422019747, "grad_norm": 0.34888023138046265, "learning_rate": 3.266666666666667e-05, "loss": 2.891, "step": 980 }, { "epoch": 0.05365217538570969, "grad_norm": 0.34451714158058167, "learning_rate": 3.3e-05, "loss": 2.8907, "step": 990 }, { "epoch": 0.05419411655122191, "grad_norm": 0.2428581416606903, "learning_rate": 3.3333333333333335e-05, "loss": 2.8909, "step": 1000 }, { "epoch": 0.05473605771673413, "grad_norm": 0.17025752365589142, "learning_rate": 3.366666666666667e-05, "loss": 2.8891, "step": 1010 }, { "epoch": 0.05527799888224635, "grad_norm": 0.23046477138996124, "learning_rate": 3.4000000000000007e-05, "loss": 2.8887, "step": 1020 }, { "epoch": 0.055819940047758566, "grad_norm": 0.28877273201942444, "learning_rate": 3.433333333333333e-05, "loss": 2.8855, "step": 1030 }, { "epoch": 0.056361881213270786, "grad_norm": 0.16221983730793, "learning_rate": 3.466666666666667e-05, "loss": 2.8857, "step": 1040 }, { "epoch": 0.056578657679475675, "eval_loss": 2.8612656593322754, "eval_runtime": 22.0002, "eval_samples_per_second": 227.27, "eval_steps_per_second": 1.227, "step": 1044 }, { "epoch": 0.056903822378783005, "grad_norm": 0.10896836966276169, "learning_rate": 3.5e-05, "loss": 2.8861, "step": 1050 }, { "epoch": 0.057445763544295224, "grad_norm": 0.2890329360961914, "learning_rate": 3.5333333333333336e-05, "loss": 2.8826, "step": 1060 }, { "epoch": 0.05798770470980744, "grad_norm": 0.4240993857383728, "learning_rate": 3.566666666666667e-05, "loss": 2.885, "step": 1070 }, { "epoch": 0.05852964587531966, "grad_norm": 0.33681583404541016, "learning_rate": 3.6e-05, "loss": 2.8837, "step": 1080 }, { "epoch": 0.05907158704083188, "grad_norm": 0.23618997633457184, "learning_rate": 3.633333333333333e-05, "loss": 2.8846, "step": 1090 }, { "epoch": 0.0596135282063441, "grad_norm": 0.1710646152496338, "learning_rate": 3.6666666666666666e-05, "loss": 2.881, "step": 1100 }, { "epoch": 0.06015546937185632, "grad_norm": 0.2047516405582428, "learning_rate": 3.7e-05, "loss": 2.8773, "step": 1110 }, { "epoch": 0.06069741053736854, "grad_norm": 0.20800012350082397, "learning_rate": 3.733333333333334e-05, "loss": 2.8787, "step": 1120 }, { "epoch": 0.06123935170288076, "grad_norm": 0.2583804428577423, "learning_rate": 3.766666666666667e-05, "loss": 2.876, "step": 1130 }, { "epoch": 0.06129354581943198, "eval_loss": 2.860222578048706, "eval_runtime": 26.0987, "eval_samples_per_second": 191.581, "eval_steps_per_second": 1.035, "step": 1131 }, { "epoch": 0.06178129286839298, "grad_norm": 0.1588805764913559, "learning_rate": 3.8e-05, "loss": 2.8741, "step": 1140 }, { "epoch": 0.062323234033905196, "grad_norm": 0.22128504514694214, "learning_rate": 3.8333333333333334e-05, "loss": 2.8781, "step": 1150 }, { "epoch": 0.06286517519941741, "grad_norm": 0.2544479966163635, "learning_rate": 3.866666666666667e-05, "loss": 2.8762, "step": 1160 }, { "epoch": 0.06340711636492963, "grad_norm": 0.3215446472167969, "learning_rate": 3.9000000000000006e-05, "loss": 2.8758, "step": 1170 }, { "epoch": 0.06394905753044185, "grad_norm": 0.20564351975917816, "learning_rate": 3.933333333333333e-05, "loss": 2.8725, "step": 1180 }, { "epoch": 0.06449099869595407, "grad_norm": 0.24028970301151276, "learning_rate": 3.966666666666667e-05, "loss": 2.8728, "step": 1190 }, { "epoch": 0.06503293986146629, "grad_norm": 0.21962830424308777, "learning_rate": 4e-05, "loss": 2.8702, "step": 1200 }, { "epoch": 0.0655748810269785, "grad_norm": 0.281931072473526, "learning_rate": 4.0333333333333336e-05, "loss": 2.869, "step": 1210 }, { "epoch": 0.06600843395938828, "eval_loss": 2.8557112216949463, "eval_runtime": 21.9913, "eval_samples_per_second": 227.363, "eval_steps_per_second": 1.228, "step": 1218 }, { "epoch": 0.06611682219249072, "grad_norm": 0.16926707327365875, "learning_rate": 4.066666666666667e-05, "loss": 2.8687, "step": 1220 }, { "epoch": 0.06665876335800294, "grad_norm": 0.3461899757385254, "learning_rate": 4.1e-05, "loss": 2.8666, "step": 1230 }, { "epoch": 0.06720070452351516, "grad_norm": 0.324008047580719, "learning_rate": 4.133333333333333e-05, "loss": 2.8691, "step": 1240 }, { "epoch": 0.06774264568902738, "grad_norm": 0.2589833438396454, "learning_rate": 4.166666666666667e-05, "loss": 2.8676, "step": 1250 }, { "epoch": 0.0682845868545396, "grad_norm": 0.21630503237247467, "learning_rate": 4.2e-05, "loss": 2.8649, "step": 1260 }, { "epoch": 0.06882652802005182, "grad_norm": 0.19127613306045532, "learning_rate": 4.233333333333334e-05, "loss": 2.8643, "step": 1270 }, { "epoch": 0.06936846918556404, "grad_norm": 0.2493632286787033, "learning_rate": 4.266666666666667e-05, "loss": 2.8633, "step": 1280 }, { "epoch": 0.06991041035107626, "grad_norm": 0.26412254571914673, "learning_rate": 4.3e-05, "loss": 2.8644, "step": 1290 }, { "epoch": 0.07045235151658848, "grad_norm": 0.27580294013023376, "learning_rate": 4.3333333333333334e-05, "loss": 2.8618, "step": 1300 }, { "epoch": 0.0707233220993446, "eval_loss": 2.8502135276794434, "eval_runtime": 21.9938, "eval_samples_per_second": 227.337, "eval_steps_per_second": 1.228, "step": 1305 }, { "epoch": 0.0709942926821007, "grad_norm": 0.37026703357696533, "learning_rate": 4.3666666666666666e-05, "loss": 2.8585, "step": 1310 }, { "epoch": 0.07153623384761292, "grad_norm": 0.3927730917930603, "learning_rate": 4.4000000000000006e-05, "loss": 2.8594, "step": 1320 }, { "epoch": 0.07207817501312513, "grad_norm": 0.21110251545906067, "learning_rate": 4.433333333333334e-05, "loss": 2.858, "step": 1330 }, { "epoch": 0.07262011617863735, "grad_norm": 0.29518288373947144, "learning_rate": 4.466666666666667e-05, "loss": 2.8537, "step": 1340 }, { "epoch": 0.07316205734414957, "grad_norm": 0.27836179733276367, "learning_rate": 4.5e-05, "loss": 2.8552, "step": 1350 }, { "epoch": 0.07370399850966179, "grad_norm": 0.39191293716430664, "learning_rate": 4.5333333333333335e-05, "loss": 2.8549, "step": 1360 }, { "epoch": 0.07424593967517401, "grad_norm": 0.1891399621963501, "learning_rate": 4.566666666666667e-05, "loss": 2.8531, "step": 1370 }, { "epoch": 0.07478788084068623, "grad_norm": 0.3418962061405182, "learning_rate": 4.600000000000001e-05, "loss": 2.8513, "step": 1380 }, { "epoch": 0.07532982200619845, "grad_norm": 0.2758435308933258, "learning_rate": 4.633333333333333e-05, "loss": 2.8523, "step": 1390 }, { "epoch": 0.07543821023930089, "eval_loss": 2.8491737842559814, "eval_runtime": 21.9942, "eval_samples_per_second": 227.333, "eval_steps_per_second": 1.228, "step": 1392 }, { "epoch": 0.07587176317171067, "grad_norm": 0.5267038941383362, "learning_rate": 4.666666666666667e-05, "loss": 2.8527, "step": 1400 }, { "epoch": 0.07641370433722289, "grad_norm": 0.31036752462387085, "learning_rate": 4.7e-05, "loss": 2.8491, "step": 1410 }, { "epoch": 0.0769556455027351, "grad_norm": 0.2229401171207428, "learning_rate": 4.7333333333333336e-05, "loss": 2.849, "step": 1420 }, { "epoch": 0.07749758666824733, "grad_norm": 0.40294215083122253, "learning_rate": 4.766666666666667e-05, "loss": 2.847, "step": 1430 }, { "epoch": 0.07803952783375954, "grad_norm": 0.25234729051589966, "learning_rate": 4.8e-05, "loss": 2.8497, "step": 1440 }, { "epoch": 0.07858146899927176, "grad_norm": 0.2245229333639145, "learning_rate": 4.8333333333333334e-05, "loss": 2.8429, "step": 1450 }, { "epoch": 0.07912341016478398, "grad_norm": 0.37367942929267883, "learning_rate": 4.866666666666667e-05, "loss": 2.8461, "step": 1460 }, { "epoch": 0.0796653513302962, "grad_norm": 0.36259761452674866, "learning_rate": 4.9e-05, "loss": 2.8394, "step": 1470 }, { "epoch": 0.0801530983792572, "eval_loss": 2.847864866256714, "eval_runtime": 21.994, "eval_samples_per_second": 227.334, "eval_steps_per_second": 1.228, "step": 1479 }, { "epoch": 0.08020729249580842, "grad_norm": 0.3754774034023285, "learning_rate": 4.933333333333334e-05, "loss": 2.8415, "step": 1480 }, { "epoch": 0.08074923366132064, "grad_norm": 0.5675404071807861, "learning_rate": 4.966666666666667e-05, "loss": 2.8442, "step": 1490 }, { "epoch": 0.08129117482683286, "grad_norm": 0.23455142974853516, "learning_rate": 5e-05, "loss": 2.8377, "step": 1500 }, { "epoch": 0.08183311599234508, "grad_norm": 0.34516745805740356, "learning_rate": 5.0333333333333335e-05, "loss": 2.8378, "step": 1510 }, { "epoch": 0.0823750571578573, "grad_norm": 0.26785463094711304, "learning_rate": 5.0666666666666674e-05, "loss": 2.8371, "step": 1520 }, { "epoch": 0.08291699832336952, "grad_norm": 0.39150965213775635, "learning_rate": 5.1000000000000006e-05, "loss": 2.8366, "step": 1530 }, { "epoch": 0.08345893948888174, "grad_norm": 0.322078138589859, "learning_rate": 5.133333333333333e-05, "loss": 2.8332, "step": 1540 }, { "epoch": 0.08400088065439396, "grad_norm": 0.34766703844070435, "learning_rate": 5.166666666666667e-05, "loss": 2.8366, "step": 1550 }, { "epoch": 0.08454282181990617, "grad_norm": 0.32657214999198914, "learning_rate": 5.2000000000000004e-05, "loss": 2.8305, "step": 1560 }, { "epoch": 0.08486798651921351, "eval_loss": 2.8430960178375244, "eval_runtime": 21.9941, "eval_samples_per_second": 227.334, "eval_steps_per_second": 1.228, "step": 1566 }, { "epoch": 0.0850847629854184, "grad_norm": 0.5440207719802856, "learning_rate": 5.2333333333333336e-05, "loss": 2.8311, "step": 1570 }, { "epoch": 0.08562670415093061, "grad_norm": 0.43800434470176697, "learning_rate": 5.266666666666666e-05, "loss": 2.834, "step": 1580 }, { "epoch": 0.08616864531644283, "grad_norm": 0.28220343589782715, "learning_rate": 5.300000000000001e-05, "loss": 2.8264, "step": 1590 }, { "epoch": 0.08671058648195505, "grad_norm": 0.5123216509819031, "learning_rate": 5.333333333333333e-05, "loss": 2.8261, "step": 1600 }, { "epoch": 0.08725252764746727, "grad_norm": 0.3581772744655609, "learning_rate": 5.3666666666666666e-05, "loss": 2.8278, "step": 1610 }, { "epoch": 0.08779446881297949, "grad_norm": 0.39921578764915466, "learning_rate": 5.4000000000000005e-05, "loss": 2.8234, "step": 1620 }, { "epoch": 0.08833640997849171, "grad_norm": 0.36354538798332214, "learning_rate": 5.433333333333334e-05, "loss": 2.8229, "step": 1630 }, { "epoch": 0.08887835114400393, "grad_norm": 0.30660825967788696, "learning_rate": 5.466666666666666e-05, "loss": 2.8225, "step": 1640 }, { "epoch": 0.08942029230951615, "grad_norm": 0.30880966782569885, "learning_rate": 5.500000000000001e-05, "loss": 2.8206, "step": 1650 }, { "epoch": 0.08958287465916981, "eval_loss": 2.8423779010772705, "eval_runtime": 21.999, "eval_samples_per_second": 227.283, "eval_steps_per_second": 1.227, "step": 1653 }, { "epoch": 0.08996223347502837, "grad_norm": 0.2573600113391876, "learning_rate": 5.5333333333333334e-05, "loss": 2.8159, "step": 1660 }, { "epoch": 0.09050417464054059, "grad_norm": 0.30160731077194214, "learning_rate": 5.566666666666667e-05, "loss": 2.8184, "step": 1670 }, { "epoch": 0.0910461158060528, "grad_norm": 0.2629443109035492, "learning_rate": 5.6000000000000006e-05, "loss": 2.8192, "step": 1680 }, { "epoch": 0.09158805697156502, "grad_norm": 0.23509126901626587, "learning_rate": 5.633333333333334e-05, "loss": 2.8148, "step": 1690 }, { "epoch": 0.09212999813707724, "grad_norm": 0.42312270402908325, "learning_rate": 5.666666666666667e-05, "loss": 2.813, "step": 1700 }, { "epoch": 0.09267193930258946, "grad_norm": 0.46546509861946106, "learning_rate": 5.6999999999999996e-05, "loss": 2.8086, "step": 1710 }, { "epoch": 0.09321388046810168, "grad_norm": 0.23622998595237732, "learning_rate": 5.7333333333333336e-05, "loss": 2.8111, "step": 1720 }, { "epoch": 0.0937558216336139, "grad_norm": 0.607647180557251, "learning_rate": 5.766666666666667e-05, "loss": 2.8102, "step": 1730 }, { "epoch": 0.09429776279912612, "grad_norm": 0.40518462657928467, "learning_rate": 5.8e-05, "loss": 2.8072, "step": 1740 }, { "epoch": 0.09429776279912612, "eval_loss": 2.834738254547119, "eval_runtime": 21.6631, "eval_samples_per_second": 230.807, "eval_steps_per_second": 1.246, "step": 1740 }, { "epoch": 0.09483970396463834, "grad_norm": 0.2947821021080017, "learning_rate": 5.833333333333334e-05, "loss": 2.807, "step": 1750 }, { "epoch": 0.09538164513015056, "grad_norm": 0.304720938205719, "learning_rate": 5.866666666666667e-05, "loss": 2.8086, "step": 1760 }, { "epoch": 0.09592358629566278, "grad_norm": 0.37076836824417114, "learning_rate": 5.9e-05, "loss": 2.8035, "step": 1770 }, { "epoch": 0.096465527461175, "grad_norm": 0.4996398389339447, "learning_rate": 5.9333333333333343e-05, "loss": 2.804, "step": 1780 }, { "epoch": 0.09700746862668722, "grad_norm": 0.37258633971214294, "learning_rate": 5.966666666666667e-05, "loss": 2.8074, "step": 1790 }, { "epoch": 0.09754940979219943, "grad_norm": 0.2905193567276001, "learning_rate": 6e-05, "loss": 2.8006, "step": 1800 }, { "epoch": 0.09809135095771165, "grad_norm": 0.27609148621559143, "learning_rate": 6.033333333333334e-05, "loss": 2.7965, "step": 1810 }, { "epoch": 0.09863329212322387, "grad_norm": 0.3159330189228058, "learning_rate": 6.066666666666667e-05, "loss": 2.7977, "step": 1820 }, { "epoch": 0.09901265093908243, "eval_loss": 2.8351926803588867, "eval_runtime": 21.9919, "eval_samples_per_second": 227.356, "eval_steps_per_second": 1.228, "step": 1827 }, { "epoch": 0.09917523328873609, "grad_norm": 0.3412172794342041, "learning_rate": 6.1e-05, "loss": 2.7945, "step": 1830 }, { "epoch": 0.09971717445424831, "grad_norm": 0.2713095247745514, "learning_rate": 6.133333333333334e-05, "loss": 2.7904, "step": 1840 }, { "epoch": 0.10025911561976053, "grad_norm": 0.3899465799331665, "learning_rate": 6.166666666666667e-05, "loss": 2.7907, "step": 1850 }, { "epoch": 0.10080105678527275, "grad_norm": 0.2539375424385071, "learning_rate": 6.2e-05, "loss": 2.7935, "step": 1860 }, { "epoch": 0.10134299795078497, "grad_norm": 0.6003013253211975, "learning_rate": 6.233333333333334e-05, "loss": 2.7854, "step": 1870 }, { "epoch": 0.10188493911629719, "grad_norm": 0.410022497177124, "learning_rate": 6.266666666666667e-05, "loss": 2.7879, "step": 1880 }, { "epoch": 0.10242688028180941, "grad_norm": 0.24878162145614624, "learning_rate": 6.3e-05, "loss": 2.7848, "step": 1890 }, { "epoch": 0.10296882144732163, "grad_norm": 0.5053821802139282, "learning_rate": 6.333333333333333e-05, "loss": 2.7876, "step": 1900 }, { "epoch": 0.10351076261283385, "grad_norm": 0.43480825424194336, "learning_rate": 6.366666666666668e-05, "loss": 2.7826, "step": 1910 }, { "epoch": 0.10372753907903873, "eval_loss": 2.837707996368408, "eval_runtime": 22.0381, "eval_samples_per_second": 226.88, "eval_steps_per_second": 1.225, "step": 1914 }, { "epoch": 0.10405270377834606, "grad_norm": 0.3400084674358368, "learning_rate": 6.400000000000001e-05, "loss": 2.7832, "step": 1920 }, { "epoch": 0.10459464494385828, "grad_norm": 0.2867070436477661, "learning_rate": 6.433333333333333e-05, "loss": 2.7821, "step": 1930 }, { "epoch": 0.1051365861093705, "grad_norm": 0.3295210003852844, "learning_rate": 6.466666666666666e-05, "loss": 2.776, "step": 1940 }, { "epoch": 0.10567852727488272, "grad_norm": 0.29053670167922974, "learning_rate": 6.500000000000001e-05, "loss": 2.7776, "step": 1950 }, { "epoch": 0.10622046844039494, "grad_norm": 0.4241558313369751, "learning_rate": 6.533333333333334e-05, "loss": 2.7788, "step": 1960 }, { "epoch": 0.10676240960590716, "grad_norm": 0.6300092339515686, "learning_rate": 6.566666666666666e-05, "loss": 2.7763, "step": 1970 }, { "epoch": 0.10730435077141938, "grad_norm": 0.29351305961608887, "learning_rate": 6.6e-05, "loss": 2.7713, "step": 1980 }, { "epoch": 0.1078462919369316, "grad_norm": 0.5574642419815063, "learning_rate": 6.633333333333334e-05, "loss": 2.7693, "step": 1990 }, { "epoch": 0.10838823310244382, "grad_norm": 0.34177178144454956, "learning_rate": 6.666666666666667e-05, "loss": 2.7661, "step": 2000 }, { "epoch": 0.10844242721899504, "eval_loss": 2.8290762901306152, "eval_runtime": 21.9941, "eval_samples_per_second": 227.334, "eval_steps_per_second": 1.228, "step": 2001 }, { "epoch": 0.10893017426795604, "grad_norm": 0.5755301117897034, "learning_rate": 6.7e-05, "loss": 2.7709, "step": 2010 }, { "epoch": 0.10947211543346826, "grad_norm": 0.4249889850616455, "learning_rate": 6.733333333333333e-05, "loss": 2.7636, "step": 2020 }, { "epoch": 0.11001405659898048, "grad_norm": 0.3331117033958435, "learning_rate": 6.766666666666667e-05, "loss": 2.768, "step": 2030 }, { "epoch": 0.1105559977644927, "grad_norm": 0.6131373643875122, "learning_rate": 6.800000000000001e-05, "loss": 2.7659, "step": 2040 }, { "epoch": 0.11109793893000491, "grad_norm": 0.36327242851257324, "learning_rate": 6.833333333333333e-05, "loss": 2.7612, "step": 2050 }, { "epoch": 0.11163988009551713, "grad_norm": 0.4649696946144104, "learning_rate": 6.866666666666666e-05, "loss": 2.7571, "step": 2060 }, { "epoch": 0.11218182126102935, "grad_norm": 0.4964756965637207, "learning_rate": 6.9e-05, "loss": 2.7529, "step": 2070 }, { "epoch": 0.11272376242654157, "grad_norm": 0.261391818523407, "learning_rate": 6.933333333333334e-05, "loss": 2.7493, "step": 2080 }, { "epoch": 0.11315731535895135, "eval_loss": 2.822695732116699, "eval_runtime": 21.997, "eval_samples_per_second": 227.304, "eval_steps_per_second": 1.227, "step": 2088 }, { "epoch": 0.11326570359205379, "grad_norm": 0.4503042995929718, "learning_rate": 6.966666666666668e-05, "loss": 2.7474, "step": 2090 }, { "epoch": 0.11380764475756601, "grad_norm": 0.5234399437904358, "learning_rate": 7e-05, "loss": 2.7477, "step": 2100 }, { "epoch": 0.11434958592307823, "grad_norm": 0.4587494432926178, "learning_rate": 7.033333333333334e-05, "loss": 2.7441, "step": 2110 }, { "epoch": 0.11489152708859045, "grad_norm": 0.3193627893924713, "learning_rate": 7.066666666666667e-05, "loss": 2.7529, "step": 2120 }, { "epoch": 0.11543346825410267, "grad_norm": 0.60213702917099, "learning_rate": 7.1e-05, "loss": 2.7452, "step": 2130 }, { "epoch": 0.11597540941961489, "grad_norm": 0.495370477437973, "learning_rate": 7.133333333333334e-05, "loss": 2.7431, "step": 2140 }, { "epoch": 0.1165173505851271, "grad_norm": 0.46439045667648315, "learning_rate": 7.166666666666667e-05, "loss": 2.7463, "step": 2150 }, { "epoch": 0.11705929175063932, "grad_norm": 0.4808029234409332, "learning_rate": 7.2e-05, "loss": 2.7399, "step": 2160 }, { "epoch": 0.11760123291615154, "grad_norm": 0.40961867570877075, "learning_rate": 7.233333333333335e-05, "loss": 2.7369, "step": 2170 }, { "epoch": 0.11787220349890765, "eval_loss": 2.823870897293091, "eval_runtime": 21.9942, "eval_samples_per_second": 227.333, "eval_steps_per_second": 1.228, "step": 2175 }, { "epoch": 0.11814317408166376, "grad_norm": 0.4309645891189575, "learning_rate": 7.266666666666667e-05, "loss": 2.7333, "step": 2180 }, { "epoch": 0.11868511524717598, "grad_norm": 0.46611249446868896, "learning_rate": 7.3e-05, "loss": 2.7364, "step": 2190 }, { "epoch": 0.1192270564126882, "grad_norm": 0.3425257205963135, "learning_rate": 7.333333333333333e-05, "loss": 2.7344, "step": 2200 }, { "epoch": 0.11976899757820042, "grad_norm": 0.6439690589904785, "learning_rate": 7.366666666666668e-05, "loss": 2.7389, "step": 2210 }, { "epoch": 0.12031093874371264, "grad_norm": 0.9094191193580627, "learning_rate": 7.4e-05, "loss": 2.727, "step": 2220 }, { "epoch": 0.12085287990922486, "grad_norm": 0.4872890114784241, "learning_rate": 7.433333333333333e-05, "loss": 2.7286, "step": 2230 }, { "epoch": 0.12139482107473708, "grad_norm": 0.4804534614086151, "learning_rate": 7.466666666666667e-05, "loss": 2.7233, "step": 2240 }, { "epoch": 0.1219367622402493, "grad_norm": 0.330377995967865, "learning_rate": 7.500000000000001e-05, "loss": 2.7211, "step": 2250 }, { "epoch": 0.12247870340576152, "grad_norm": 0.9292433857917786, "learning_rate": 7.533333333333334e-05, "loss": 2.7224, "step": 2260 }, { "epoch": 0.12258709163886396, "eval_loss": 2.818483352661133, "eval_runtime": 21.9931, "eval_samples_per_second": 227.344, "eval_steps_per_second": 1.228, "step": 2262 }, { "epoch": 0.12302064457127374, "grad_norm": 0.4465082287788391, "learning_rate": 7.566666666666667e-05, "loss": 2.7214, "step": 2270 }, { "epoch": 0.12356258573678595, "grad_norm": 0.300624817609787, "learning_rate": 7.6e-05, "loss": 2.722, "step": 2280 }, { "epoch": 0.12410452690229817, "grad_norm": 0.41431039571762085, "learning_rate": 7.633333333333334e-05, "loss": 2.7135, "step": 2290 }, { "epoch": 0.12464646806781039, "grad_norm": 0.3542834520339966, "learning_rate": 7.666666666666667e-05, "loss": 2.7118, "step": 2300 }, { "epoch": 0.1251884092333226, "grad_norm": 0.42152753472328186, "learning_rate": 7.7e-05, "loss": 2.7193, "step": 2310 }, { "epoch": 0.12573035039883482, "grad_norm": 0.4453124701976776, "learning_rate": 7.733333333333333e-05, "loss": 2.7154, "step": 2320 }, { "epoch": 0.12627229156434705, "grad_norm": 0.6320910453796387, "learning_rate": 7.766666666666667e-05, "loss": 2.7098, "step": 2330 }, { "epoch": 0.12681423272985926, "grad_norm": 0.3247159421443939, "learning_rate": 7.800000000000001e-05, "loss": 2.7067, "step": 2340 }, { "epoch": 0.12730197977882027, "eval_loss": 2.813124179840088, "eval_runtime": 21.9951, "eval_samples_per_second": 227.323, "eval_steps_per_second": 1.228, "step": 2349 }, { "epoch": 0.1273561738953715, "grad_norm": 0.34726181626319885, "learning_rate": 7.833333333333333e-05, "loss": 2.7054, "step": 2350 }, { "epoch": 0.1278981150608837, "grad_norm": 0.5048098564147949, "learning_rate": 7.866666666666666e-05, "loss": 2.7063, "step": 2360 }, { "epoch": 0.12844005622639593, "grad_norm": 0.4321523606777191, "learning_rate": 7.900000000000001e-05, "loss": 2.7011, "step": 2370 }, { "epoch": 0.12898199739190813, "grad_norm": 0.2796650826931, "learning_rate": 7.933333333333334e-05, "loss": 2.6997, "step": 2380 }, { "epoch": 0.12952393855742036, "grad_norm": 0.5107274651527405, "learning_rate": 7.966666666666666e-05, "loss": 2.6933, "step": 2390 }, { "epoch": 0.13006587972293257, "grad_norm": 0.6369110345840454, "learning_rate": 8e-05, "loss": 2.6989, "step": 2400 }, { "epoch": 0.1306078208884448, "grad_norm": 0.6331022381782532, "learning_rate": 8.033333333333334e-05, "loss": 2.6923, "step": 2410 }, { "epoch": 0.131149762053957, "grad_norm": 0.36579135060310364, "learning_rate": 8.066666666666667e-05, "loss": 2.6889, "step": 2420 }, { "epoch": 0.13169170321946924, "grad_norm": 0.9732735753059387, "learning_rate": 8.1e-05, "loss": 2.6948, "step": 2430 }, { "epoch": 0.13201686791877656, "eval_loss": 2.808922290802002, "eval_runtime": 21.9927, "eval_samples_per_second": 227.348, "eval_steps_per_second": 1.228, "step": 2436 }, { "epoch": 0.13223364438498145, "grad_norm": 0.5418347120285034, "learning_rate": 8.133333333333334e-05, "loss": 2.6871, "step": 2440 }, { "epoch": 0.13277558555049368, "grad_norm": 0.43290242552757263, "learning_rate": 8.166666666666667e-05, "loss": 2.6827, "step": 2450 }, { "epoch": 0.13331752671600589, "grad_norm": 0.6338348984718323, "learning_rate": 8.2e-05, "loss": 2.6821, "step": 2460 }, { "epoch": 0.13385946788151812, "grad_norm": 0.604308545589447, "learning_rate": 8.233333333333333e-05, "loss": 2.6774, "step": 2470 }, { "epoch": 0.13440140904703032, "grad_norm": 0.49863043427467346, "learning_rate": 8.266666666666667e-05, "loss": 2.6781, "step": 2480 }, { "epoch": 0.13494335021254256, "grad_norm": 0.9325143098831177, "learning_rate": 8.3e-05, "loss": 2.6793, "step": 2490 }, { "epoch": 0.13548529137805476, "grad_norm": 0.5320800542831421, "learning_rate": 8.333333333333334e-05, "loss": 2.6797, "step": 2500 }, { "epoch": 0.136027232543567, "grad_norm": 0.3204202353954315, "learning_rate": 8.366666666666668e-05, "loss": 2.6757, "step": 2510 }, { "epoch": 0.1365691737090792, "grad_norm": 0.2703516483306885, "learning_rate": 8.4e-05, "loss": 2.6667, "step": 2520 }, { "epoch": 0.13673175605873286, "eval_loss": 2.8014180660247803, "eval_runtime": 21.9979, "eval_samples_per_second": 227.295, "eval_steps_per_second": 1.227, "step": 2523 }, { "epoch": 0.13711111487459143, "grad_norm": 0.4166392385959625, "learning_rate": 8.433333333333334e-05, "loss": 2.672, "step": 2530 }, { "epoch": 0.13765305604010364, "grad_norm": 0.6374923586845398, "learning_rate": 8.466666666666667e-05, "loss": 2.666, "step": 2540 }, { "epoch": 0.13819499720561587, "grad_norm": 0.41108188033103943, "learning_rate": 8.5e-05, "loss": 2.6661, "step": 2550 }, { "epoch": 0.13873693837112808, "grad_norm": 0.6000506281852722, "learning_rate": 8.533333333333334e-05, "loss": 2.6677, "step": 2560 }, { "epoch": 0.1392788795366403, "grad_norm": 0.3874584436416626, "learning_rate": 8.566666666666667e-05, "loss": 2.6664, "step": 2570 }, { "epoch": 0.13982082070215252, "grad_norm": 1.0246257781982422, "learning_rate": 8.6e-05, "loss": 2.6662, "step": 2580 }, { "epoch": 0.14036276186766475, "grad_norm": 0.6086533069610596, "learning_rate": 8.633333333333334e-05, "loss": 2.6615, "step": 2590 }, { "epoch": 0.14090470303317695, "grad_norm": 0.2854156792163849, "learning_rate": 8.666666666666667e-05, "loss": 2.6512, "step": 2600 }, { "epoch": 0.1414466441986892, "grad_norm": 0.6454458236694336, "learning_rate": 8.7e-05, "loss": 2.6608, "step": 2610 }, { "epoch": 0.1414466441986892, "eval_loss": 2.8015899658203125, "eval_runtime": 21.9849, "eval_samples_per_second": 227.429, "eval_steps_per_second": 1.228, "step": 2610 }, { "epoch": 0.1419885853642014, "grad_norm": 0.553676426410675, "learning_rate": 8.733333333333333e-05, "loss": 2.6523, "step": 2620 }, { "epoch": 0.14253052652971362, "grad_norm": 0.6649917364120483, "learning_rate": 8.766666666666668e-05, "loss": 2.6496, "step": 2630 }, { "epoch": 0.14307246769522583, "grad_norm": 0.3380162715911865, "learning_rate": 8.800000000000001e-05, "loss": 2.6456, "step": 2640 }, { "epoch": 0.14361440886073806, "grad_norm": 0.4091057777404785, "learning_rate": 8.833333333333333e-05, "loss": 2.6397, "step": 2650 }, { "epoch": 0.14415635002625027, "grad_norm": 0.5197025537490845, "learning_rate": 8.866666666666668e-05, "loss": 2.6446, "step": 2660 }, { "epoch": 0.1446982911917625, "grad_norm": 0.3578343093395233, "learning_rate": 8.900000000000001e-05, "loss": 2.6438, "step": 2670 }, { "epoch": 0.1452402323572747, "grad_norm": 0.7891026735305786, "learning_rate": 8.933333333333334e-05, "loss": 2.6442, "step": 2680 }, { "epoch": 0.14578217352278694, "grad_norm": 0.49294915795326233, "learning_rate": 8.966666666666666e-05, "loss": 2.6435, "step": 2690 }, { "epoch": 0.14616153233864548, "eval_loss": 2.7972233295440674, "eval_runtime": 21.9971, "eval_samples_per_second": 227.303, "eval_steps_per_second": 1.227, "step": 2697 }, { "epoch": 0.14632411468829915, "grad_norm": 0.5119202136993408, "learning_rate": 9e-05, "loss": 2.6435, "step": 2700 }, { "epoch": 0.14686605585381138, "grad_norm": 0.5359740853309631, "learning_rate": 9.033333333333334e-05, "loss": 2.6324, "step": 2710 }, { "epoch": 0.14740799701932358, "grad_norm": 0.6155476570129395, "learning_rate": 9.066666666666667e-05, "loss": 2.6331, "step": 2720 }, { "epoch": 0.14794993818483582, "grad_norm": 0.9409157633781433, "learning_rate": 9.1e-05, "loss": 2.6363, "step": 2730 }, { "epoch": 0.14849187935034802, "grad_norm": 0.3532879650592804, "learning_rate": 9.133333333333334e-05, "loss": 2.6232, "step": 2740 }, { "epoch": 0.14903382051586025, "grad_norm": 0.38057300448417664, "learning_rate": 9.166666666666667e-05, "loss": 2.6234, "step": 2750 }, { "epoch": 0.14957576168137246, "grad_norm": 0.6639463901519775, "learning_rate": 9.200000000000001e-05, "loss": 2.6245, "step": 2760 }, { "epoch": 0.1501177028468847, "grad_norm": 0.6744067668914795, "learning_rate": 9.233333333333333e-05, "loss": 2.6265, "step": 2770 }, { "epoch": 0.1506596440123969, "grad_norm": 0.5296260714530945, "learning_rate": 9.266666666666666e-05, "loss": 2.6255, "step": 2780 }, { "epoch": 0.15087642047860178, "eval_loss": 2.7909934520721436, "eval_runtime": 21.9988, "eval_samples_per_second": 227.285, "eval_steps_per_second": 1.227, "step": 2784 }, { "epoch": 0.15120158517790913, "grad_norm": 1.1150288581848145, "learning_rate": 9.300000000000001e-05, "loss": 2.6247, "step": 2790 }, { "epoch": 0.15174352634342134, "grad_norm": 0.3209221661090851, "learning_rate": 9.333333333333334e-05, "loss": 2.6256, "step": 2800 }, { "epoch": 0.15228546750893357, "grad_norm": 0.7962296605110168, "learning_rate": 9.366666666666668e-05, "loss": 2.6113, "step": 2810 }, { "epoch": 0.15282740867444577, "grad_norm": 0.345284640789032, "learning_rate": 9.4e-05, "loss": 2.6234, "step": 2820 }, { "epoch": 0.153369349839958, "grad_norm": 0.3219437003135681, "learning_rate": 9.433333333333334e-05, "loss": 2.619, "step": 2830 }, { "epoch": 0.1539112910054702, "grad_norm": 0.3087829351425171, "learning_rate": 9.466666666666667e-05, "loss": 2.605, "step": 2840 }, { "epoch": 0.15445323217098245, "grad_norm": 0.3032431900501251, "learning_rate": 9.5e-05, "loss": 2.6083, "step": 2850 }, { "epoch": 0.15499517333649465, "grad_norm": 0.3768031597137451, "learning_rate": 9.533333333333334e-05, "loss": 2.6076, "step": 2860 }, { "epoch": 0.15553711450200688, "grad_norm": 0.37252676486968994, "learning_rate": 9.566666666666667e-05, "loss": 2.6045, "step": 2870 }, { "epoch": 0.1555913086185581, "eval_loss": 2.7903635501861572, "eval_runtime": 21.9938, "eval_samples_per_second": 227.337, "eval_steps_per_second": 1.228, "step": 2871 }, { "epoch": 0.1560790556675191, "grad_norm": 0.9097828269004822, "learning_rate": 9.6e-05, "loss": 2.6135, "step": 2880 }, { "epoch": 0.15662099683303132, "grad_norm": 0.5291810631752014, "learning_rate": 9.633333333333335e-05, "loss": 2.5981, "step": 2890 }, { "epoch": 0.15716293799854353, "grad_norm": 0.7748283743858337, "learning_rate": 9.666666666666667e-05, "loss": 2.5933, "step": 2900 }, { "epoch": 0.15770487916405576, "grad_norm": 0.4185350239276886, "learning_rate": 9.7e-05, "loss": 2.5929, "step": 2910 }, { "epoch": 0.15824682032956797, "grad_norm": 0.32910650968551636, "learning_rate": 9.733333333333335e-05, "loss": 2.5889, "step": 2920 }, { "epoch": 0.1587887614950802, "grad_norm": 0.32221078872680664, "learning_rate": 9.766666666666668e-05, "loss": 2.5938, "step": 2930 }, { "epoch": 0.1593307026605924, "grad_norm": 0.6913098096847534, "learning_rate": 9.8e-05, "loss": 2.5899, "step": 2940 }, { "epoch": 0.15987264382610464, "grad_norm": 1.0831592082977295, "learning_rate": 9.833333333333333e-05, "loss": 2.5864, "step": 2950 }, { "epoch": 0.1603061967585144, "eval_loss": 2.782402276992798, "eval_runtime": 21.9947, "eval_samples_per_second": 227.328, "eval_steps_per_second": 1.228, "step": 2958 }, { "epoch": 0.16041458499161684, "grad_norm": 0.6254389882087708, "learning_rate": 9.866666666666668e-05, "loss": 2.5797, "step": 2960 }, { "epoch": 0.16095652615712908, "grad_norm": 0.34385034441947937, "learning_rate": 9.900000000000001e-05, "loss": 2.5846, "step": 2970 }, { "epoch": 0.16149846732264128, "grad_norm": 0.7574624419212341, "learning_rate": 9.933333333333334e-05, "loss": 2.5805, "step": 2980 }, { "epoch": 0.16204040848815351, "grad_norm": 0.36409491300582886, "learning_rate": 9.966666666666667e-05, "loss": 2.5852, "step": 2990 }, { "epoch": 0.16258234965366572, "grad_norm": 0.7688263654708862, "learning_rate": 0.0001, "loss": 2.585, "step": 3000 }, { "epoch": 0.16312429081917795, "grad_norm": 0.9283438920974731, "learning_rate": 9.99999069936256e-05, "loss": 2.582, "step": 3010 }, { "epoch": 0.16366623198469016, "grad_norm": 0.5236392617225647, "learning_rate": 9.999962797488683e-05, "loss": 2.5818, "step": 3020 }, { "epoch": 0.1642081731502024, "grad_norm": 0.4604909121990204, "learning_rate": 9.999916294493705e-05, "loss": 2.574, "step": 3030 }, { "epoch": 0.1647501143157146, "grad_norm": 0.5703723430633545, "learning_rate": 9.999851190569852e-05, "loss": 2.5638, "step": 3040 }, { "epoch": 0.1650210848984707, "eval_loss": 2.777949571609497, "eval_runtime": 21.9996, "eval_samples_per_second": 227.277, "eval_steps_per_second": 1.227, "step": 3045 }, { "epoch": 0.16529205548122683, "grad_norm": 1.0797618627548218, "learning_rate": 9.99976748598624e-05, "loss": 2.5687, "step": 3050 }, { "epoch": 0.16583399664673903, "grad_norm": 0.7626991271972656, "learning_rate": 9.999665181088869e-05, "loss": 2.5726, "step": 3060 }, { "epoch": 0.16637593781225127, "grad_norm": 0.46589240431785583, "learning_rate": 9.999544276300629e-05, "loss": 2.5625, "step": 3070 }, { "epoch": 0.16691787897776347, "grad_norm": 0.4888695478439331, "learning_rate": 9.999404772121297e-05, "loss": 2.5547, "step": 3080 }, { "epoch": 0.1674598201432757, "grad_norm": 1.3562276363372803, "learning_rate": 9.999246669127524e-05, "loss": 2.5644, "step": 3090 }, { "epoch": 0.1680017613087879, "grad_norm": 0.3566310405731201, "learning_rate": 9.999069967972854e-05, "loss": 2.5584, "step": 3100 }, { "epoch": 0.16854370247430014, "grad_norm": 0.7351515293121338, "learning_rate": 9.998874669387696e-05, "loss": 2.5589, "step": 3110 }, { "epoch": 0.16908564363981235, "grad_norm": 0.5617663264274597, "learning_rate": 9.998660774179343e-05, "loss": 2.566, "step": 3120 }, { "epoch": 0.16962758480532458, "grad_norm": 0.7777019143104553, "learning_rate": 9.998428283231952e-05, "loss": 2.5553, "step": 3130 }, { "epoch": 0.16973597303842702, "eval_loss": 2.773970127105713, "eval_runtime": 21.9948, "eval_samples_per_second": 227.326, "eval_steps_per_second": 1.228, "step": 3132 }, { "epoch": 0.1701695259708368, "grad_norm": 0.8600534796714783, "learning_rate": 9.998177197506557e-05, "loss": 2.5529, "step": 3140 }, { "epoch": 0.17071146713634902, "grad_norm": 0.3683708906173706, "learning_rate": 9.997907518041047e-05, "loss": 2.5552, "step": 3150 }, { "epoch": 0.17125340830186123, "grad_norm": 0.5638754367828369, "learning_rate": 9.997619245950172e-05, "loss": 2.5465, "step": 3160 }, { "epoch": 0.17179534946737346, "grad_norm": 0.4635675251483917, "learning_rate": 9.997312382425543e-05, "loss": 2.5419, "step": 3170 }, { "epoch": 0.17233729063288566, "grad_norm": 0.48697274923324585, "learning_rate": 9.99698692873561e-05, "loss": 2.5409, "step": 3180 }, { "epoch": 0.1728792317983979, "grad_norm": 0.48942264914512634, "learning_rate": 9.99664288622568e-05, "loss": 2.5347, "step": 3190 }, { "epoch": 0.1734211729639101, "grad_norm": 0.3773297369480133, "learning_rate": 9.996280256317887e-05, "loss": 2.5432, "step": 3200 }, { "epoch": 0.17396311412942234, "grad_norm": 0.8151489496231079, "learning_rate": 9.995899040511207e-05, "loss": 2.5359, "step": 3210 }, { "epoch": 0.17445086117838332, "eval_loss": 2.7794876098632812, "eval_runtime": 21.6952, "eval_samples_per_second": 230.465, "eval_steps_per_second": 1.245, "step": 3219 }, { "epoch": 0.17450505529493454, "grad_norm": 0.6645236015319824, "learning_rate": 9.995499240381441e-05, "loss": 2.5412, "step": 3220 }, { "epoch": 0.17504699646044677, "grad_norm": 0.31391623616218567, "learning_rate": 9.995080857581208e-05, "loss": 2.5403, "step": 3230 }, { "epoch": 0.17558893762595898, "grad_norm": 0.6431202292442322, "learning_rate": 9.994643893839943e-05, "loss": 2.5346, "step": 3240 }, { "epoch": 0.1761308787914712, "grad_norm": 0.6171109676361084, "learning_rate": 9.994188350963887e-05, "loss": 2.5362, "step": 3250 }, { "epoch": 0.17667281995698342, "grad_norm": 0.3944205343723297, "learning_rate": 9.993714230836076e-05, "loss": 2.5302, "step": 3260 }, { "epoch": 0.17721476112249565, "grad_norm": 0.462694376707077, "learning_rate": 9.993221535416346e-05, "loss": 2.5301, "step": 3270 }, { "epoch": 0.17775670228800786, "grad_norm": 0.39359036087989807, "learning_rate": 9.992710266741307e-05, "loss": 2.5236, "step": 3280 }, { "epoch": 0.1782986434535201, "grad_norm": 0.6656551361083984, "learning_rate": 9.99218042692435e-05, "loss": 2.5282, "step": 3290 }, { "epoch": 0.1788405846190323, "grad_norm": 0.36592236161231995, "learning_rate": 9.991632018155627e-05, "loss": 2.5288, "step": 3300 }, { "epoch": 0.17916574931833962, "eval_loss": 2.763190746307373, "eval_runtime": 21.9963, "eval_samples_per_second": 227.311, "eval_steps_per_second": 1.227, "step": 3306 }, { "epoch": 0.17938252578454453, "grad_norm": 1.2581602334976196, "learning_rate": 9.991065042702054e-05, "loss": 2.5198, "step": 3310 }, { "epoch": 0.17992446695005673, "grad_norm": 0.9863691926002502, "learning_rate": 9.990479502907287e-05, "loss": 2.5248, "step": 3320 }, { "epoch": 0.18046640811556897, "grad_norm": 0.538944661617279, "learning_rate": 9.989875401191725e-05, "loss": 2.5226, "step": 3330 }, { "epoch": 0.18100834928108117, "grad_norm": 0.8944976925849915, "learning_rate": 9.989252740052489e-05, "loss": 2.512, "step": 3340 }, { "epoch": 0.1815502904465934, "grad_norm": 0.5679588913917542, "learning_rate": 9.988611522063423e-05, "loss": 2.5097, "step": 3350 }, { "epoch": 0.1820922316121056, "grad_norm": 0.40262141823768616, "learning_rate": 9.987951749875081e-05, "loss": 2.509, "step": 3360 }, { "epoch": 0.18263417277761784, "grad_norm": 0.3973683714866638, "learning_rate": 9.987273426214702e-05, "loss": 2.5156, "step": 3370 }, { "epoch": 0.18317611394313005, "grad_norm": 0.4501461982727051, "learning_rate": 9.98657655388622e-05, "loss": 2.5193, "step": 3380 }, { "epoch": 0.18371805510864228, "grad_norm": 0.3253689110279083, "learning_rate": 9.985861135770237e-05, "loss": 2.5033, "step": 3390 }, { "epoch": 0.18388063745829594, "eval_loss": 2.767366647720337, "eval_runtime": 22.0, "eval_samples_per_second": 227.273, "eval_steps_per_second": 1.227, "step": 3393 }, { "epoch": 0.18425999627415449, "grad_norm": 0.737877368927002, "learning_rate": 9.985127174824017e-05, "loss": 2.5085, "step": 3400 }, { "epoch": 0.18480193743966672, "grad_norm": 1.0722659826278687, "learning_rate": 9.984374674081472e-05, "loss": 2.4993, "step": 3410 }, { "epoch": 0.18534387860517892, "grad_norm": 0.7790729403495789, "learning_rate": 9.983603636653154e-05, "loss": 2.4982, "step": 3420 }, { "epoch": 0.18588581977069116, "grad_norm": 0.3918309211730957, "learning_rate": 9.982814065726233e-05, "loss": 2.4979, "step": 3430 }, { "epoch": 0.18642776093620336, "grad_norm": 0.31229421496391296, "learning_rate": 9.982005964564495e-05, "loss": 2.4927, "step": 3440 }, { "epoch": 0.1869697021017156, "grad_norm": 0.4041496813297272, "learning_rate": 9.981179336508322e-05, "loss": 2.4964, "step": 3450 }, { "epoch": 0.1875116432672278, "grad_norm": 0.3821089267730713, "learning_rate": 9.980334184974672e-05, "loss": 2.4913, "step": 3460 }, { "epoch": 0.18805358443274003, "grad_norm": 0.4665864408016205, "learning_rate": 9.979470513457084e-05, "loss": 2.487, "step": 3470 }, { "epoch": 0.18859552559825224, "grad_norm": 0.509825587272644, "learning_rate": 9.978588325525639e-05, "loss": 2.4932, "step": 3480 }, { "epoch": 0.18859552559825224, "eval_loss": 2.7670769691467285, "eval_runtime": 21.9697, "eval_samples_per_second": 227.587, "eval_steps_per_second": 1.229, "step": 3480 }, { "epoch": 0.18913746676376447, "grad_norm": 1.2350674867630005, "learning_rate": 9.977687624826966e-05, "loss": 2.4936, "step": 3490 }, { "epoch": 0.18967940792927668, "grad_norm": 0.7966161370277405, "learning_rate": 9.97676841508422e-05, "loss": 2.4861, "step": 3500 }, { "epoch": 0.1902213490947889, "grad_norm": 0.3995961844921112, "learning_rate": 9.975830700097056e-05, "loss": 2.488, "step": 3510 }, { "epoch": 0.19076329026030112, "grad_norm": 0.3042275309562683, "learning_rate": 9.974874483741632e-05, "loss": 2.4736, "step": 3520 }, { "epoch": 0.19130523142581332, "grad_norm": 0.31050169467926025, "learning_rate": 9.973899769970578e-05, "loss": 2.4825, "step": 3530 }, { "epoch": 0.19184717259132555, "grad_norm": 0.323839008808136, "learning_rate": 9.972906562812986e-05, "loss": 2.4788, "step": 3540 }, { "epoch": 0.19238911375683776, "grad_norm": 0.8257955312728882, "learning_rate": 9.971894866374397e-05, "loss": 2.4857, "step": 3550 }, { "epoch": 0.19293105492235, "grad_norm": 0.547508955001831, "learning_rate": 9.970864684836776e-05, "loss": 2.4832, "step": 3560 }, { "epoch": 0.19331041373820854, "eval_loss": 2.761841297149658, "eval_runtime": 21.9898, "eval_samples_per_second": 227.379, "eval_steps_per_second": 1.228, "step": 3567 }, { "epoch": 0.1934729960878622, "grad_norm": 0.7686718106269836, "learning_rate": 9.969816022458495e-05, "loss": 2.4779, "step": 3570 }, { "epoch": 0.19401493725337443, "grad_norm": 0.3649592995643616, "learning_rate": 9.968748883574324e-05, "loss": 2.4733, "step": 3580 }, { "epoch": 0.19455687841888664, "grad_norm": 0.7720659375190735, "learning_rate": 9.967663272595408e-05, "loss": 2.465, "step": 3590 }, { "epoch": 0.19509881958439887, "grad_norm": 0.47627437114715576, "learning_rate": 9.966559194009244e-05, "loss": 2.4662, "step": 3600 }, { "epoch": 0.19564076074991107, "grad_norm": 0.3539658486843109, "learning_rate": 9.965436652379671e-05, "loss": 2.4693, "step": 3610 }, { "epoch": 0.1961827019154233, "grad_norm": 1.2727680206298828, "learning_rate": 9.964295652346844e-05, "loss": 2.465, "step": 3620 }, { "epoch": 0.1967246430809355, "grad_norm": 0.34611645340919495, "learning_rate": 9.963136198627224e-05, "loss": 2.4634, "step": 3630 }, { "epoch": 0.19726658424644775, "grad_norm": 0.7994367480278015, "learning_rate": 9.961958296013543e-05, "loss": 2.4702, "step": 3640 }, { "epoch": 0.19780852541195995, "grad_norm": 1.1131157875061035, "learning_rate": 9.960761949374802e-05, "loss": 2.4639, "step": 3650 }, { "epoch": 0.19802530187816486, "eval_loss": 2.7547664642333984, "eval_runtime": 21.9926, "eval_samples_per_second": 227.35, "eval_steps_per_second": 1.228, "step": 3654 }, { "epoch": 0.19835046657747218, "grad_norm": 1.1642026901245117, "learning_rate": 9.959547163656238e-05, "loss": 2.4701, "step": 3660 }, { "epoch": 0.1988924077429844, "grad_norm": 0.38460174202919006, "learning_rate": 9.958313943879311e-05, "loss": 2.4701, "step": 3670 }, { "epoch": 0.19943434890849662, "grad_norm": 0.4307605028152466, "learning_rate": 9.957062295141675e-05, "loss": 2.46, "step": 3680 }, { "epoch": 0.19997629007400883, "grad_norm": 0.952154815196991, "learning_rate": 9.955792222617171e-05, "loss": 2.4643, "step": 3690 }, { "epoch": 0.20051823123952106, "grad_norm": 0.3698415756225586, "learning_rate": 9.95450373155579e-05, "loss": 2.4557, "step": 3700 }, { "epoch": 0.20106017240503327, "grad_norm": 0.3291034996509552, "learning_rate": 9.953196827283659e-05, "loss": 2.4598, "step": 3710 }, { "epoch": 0.2016021135705455, "grad_norm": 0.31008443236351013, "learning_rate": 9.95187151520302e-05, "loss": 2.4511, "step": 3720 }, { "epoch": 0.2021440547360577, "grad_norm": 0.5721209645271301, "learning_rate": 9.950527800792205e-05, "loss": 2.4536, "step": 3730 }, { "epoch": 0.20268599590156994, "grad_norm": 0.2794472575187683, "learning_rate": 9.949165689605615e-05, "loss": 2.4425, "step": 3740 }, { "epoch": 0.20274019001812116, "eval_loss": 2.7546393871307373, "eval_runtime": 22.0026, "eval_samples_per_second": 227.246, "eval_steps_per_second": 1.227, "step": 3741 }, { "epoch": 0.20322793706708214, "grad_norm": 0.844109058380127, "learning_rate": 9.947785187273695e-05, "loss": 2.4386, "step": 3750 }, { "epoch": 0.20376987823259438, "grad_norm": 0.3052925765514374, "learning_rate": 9.946386299502911e-05, "loss": 2.448, "step": 3760 }, { "epoch": 0.20431181939810658, "grad_norm": 0.5775868892669678, "learning_rate": 9.94496903207573e-05, "loss": 2.4449, "step": 3770 }, { "epoch": 0.20485376056361881, "grad_norm": 0.7839900255203247, "learning_rate": 9.943533390850595e-05, "loss": 2.4545, "step": 3780 }, { "epoch": 0.20539570172913102, "grad_norm": 0.4013843834400177, "learning_rate": 9.942079381761889e-05, "loss": 2.4563, "step": 3790 }, { "epoch": 0.20593764289464325, "grad_norm": 0.3987264633178711, "learning_rate": 9.940607010819937e-05, "loss": 2.4431, "step": 3800 }, { "epoch": 0.20647958406015546, "grad_norm": 0.3003256022930145, "learning_rate": 9.93911628411095e-05, "loss": 2.4424, "step": 3810 }, { "epoch": 0.2070215252256677, "grad_norm": 0.6472198367118835, "learning_rate": 9.937607207797024e-05, "loss": 2.4379, "step": 3820 }, { "epoch": 0.20745507815807745, "eval_loss": 2.7585620880126953, "eval_runtime": 21.9936, "eval_samples_per_second": 227.339, "eval_steps_per_second": 1.228, "step": 3828 }, { "epoch": 0.2075634663911799, "grad_norm": 0.4794847071170807, "learning_rate": 9.9360797881161e-05, "loss": 2.4339, "step": 3830 }, { "epoch": 0.20810540755669213, "grad_norm": 0.3864356577396393, "learning_rate": 9.934534031381946e-05, "loss": 2.4437, "step": 3840 }, { "epoch": 0.20864734872220433, "grad_norm": 0.4814181923866272, "learning_rate": 9.932969943984126e-05, "loss": 2.4322, "step": 3850 }, { "epoch": 0.20918928988771657, "grad_norm": 0.6743614077568054, "learning_rate": 9.931387532387981e-05, "loss": 2.4356, "step": 3860 }, { "epoch": 0.20973123105322877, "grad_norm": 0.6256967186927795, "learning_rate": 9.929786803134592e-05, "loss": 2.4571, "step": 3870 }, { "epoch": 0.210273172218741, "grad_norm": 0.45648303627967834, "learning_rate": 9.928167762840761e-05, "loss": 2.432, "step": 3880 }, { "epoch": 0.2108151133842532, "grad_norm": 0.4223450720310211, "learning_rate": 9.926530418198978e-05, "loss": 2.4279, "step": 3890 }, { "epoch": 0.21135705454976544, "grad_norm": 0.40104198455810547, "learning_rate": 9.924874775977402e-05, "loss": 2.4261, "step": 3900 }, { "epoch": 0.21189899571527765, "grad_norm": 0.7316650748252869, "learning_rate": 9.923200843019818e-05, "loss": 2.4293, "step": 3910 }, { "epoch": 0.21216996629803378, "eval_loss": 2.747528553009033, "eval_runtime": 22.0, "eval_samples_per_second": 227.273, "eval_steps_per_second": 1.227, "step": 3915 }, { "epoch": 0.21244093688078988, "grad_norm": 0.39364123344421387, "learning_rate": 9.921508626245628e-05, "loss": 2.4287, "step": 3920 }, { "epoch": 0.2129828780463021, "grad_norm": 0.5997620224952698, "learning_rate": 9.919798132649803e-05, "loss": 2.4304, "step": 3930 }, { "epoch": 0.21352481921181432, "grad_norm": 0.6328970193862915, "learning_rate": 9.91806936930287e-05, "loss": 2.4236, "step": 3940 }, { "epoch": 0.21406676037732653, "grad_norm": 0.8760083913803101, "learning_rate": 9.916322343350875e-05, "loss": 2.4235, "step": 3950 }, { "epoch": 0.21460870154283876, "grad_norm": 0.9582383036613464, "learning_rate": 9.914557062015352e-05, "loss": 2.4171, "step": 3960 }, { "epoch": 0.21515064270835096, "grad_norm": 0.7223474383354187, "learning_rate": 9.912773532593297e-05, "loss": 2.4194, "step": 3970 }, { "epoch": 0.2156925838738632, "grad_norm": 0.7395852208137512, "learning_rate": 9.910971762457138e-05, "loss": 2.412, "step": 3980 }, { "epoch": 0.2162345250393754, "grad_norm": 0.352475643157959, "learning_rate": 9.909151759054702e-05, "loss": 2.4086, "step": 3990 }, { "epoch": 0.21677646620488764, "grad_norm": 0.43535661697387695, "learning_rate": 9.907313529909185e-05, "loss": 2.4128, "step": 4000 }, { "epoch": 0.21688485443799008, "eval_loss": 2.742093563079834, "eval_runtime": 21.9979, "eval_samples_per_second": 227.294, "eval_steps_per_second": 1.227, "step": 4002 }, { "epoch": 0.21731840737039984, "grad_norm": 0.4158700406551361, "learning_rate": 9.905457082619124e-05, "loss": 2.4054, "step": 4010 }, { "epoch": 0.21786034853591207, "grad_norm": 0.6706676483154297, "learning_rate": 9.903582424858355e-05, "loss": 2.3992, "step": 4020 }, { "epoch": 0.21840228970142428, "grad_norm": 1.0080608129501343, "learning_rate": 9.901689564375998e-05, "loss": 2.4159, "step": 4030 }, { "epoch": 0.2189442308669365, "grad_norm": 0.8006541728973389, "learning_rate": 9.899778508996412e-05, "loss": 2.413, "step": 4040 }, { "epoch": 0.21948617203244872, "grad_norm": 0.7094182968139648, "learning_rate": 9.89784926661917e-05, "loss": 2.4095, "step": 4050 }, { "epoch": 0.22002811319796095, "grad_norm": 0.5873300433158875, "learning_rate": 9.895901845219013e-05, "loss": 2.4033, "step": 4060 }, { "epoch": 0.22057005436347316, "grad_norm": 0.65219646692276, "learning_rate": 9.893936252845842e-05, "loss": 2.3991, "step": 4070 }, { "epoch": 0.2211119955289854, "grad_norm": 0.5918698310852051, "learning_rate": 9.891952497624662e-05, "loss": 2.4142, "step": 4080 }, { "epoch": 0.22159974257794637, "eval_loss": 2.7375481128692627, "eval_runtime": 21.9899, "eval_samples_per_second": 227.377, "eval_steps_per_second": 1.228, "step": 4089 }, { "epoch": 0.2216539366944976, "grad_norm": 1.1681138277053833, "learning_rate": 9.889950587755549e-05, "loss": 2.4009, "step": 4090 }, { "epoch": 0.22219587786000983, "grad_norm": 0.3698543310165405, "learning_rate": 9.88793053151364e-05, "loss": 2.398, "step": 4100 }, { "epoch": 0.22273781902552203, "grad_norm": 0.3500048816204071, "learning_rate": 9.885892337249069e-05, "loss": 2.3997, "step": 4110 }, { "epoch": 0.22327976019103427, "grad_norm": 0.9475021958351135, "learning_rate": 9.88383601338695e-05, "loss": 2.4023, "step": 4120 }, { "epoch": 0.22382170135654647, "grad_norm": 0.37473389506340027, "learning_rate": 9.881761568427335e-05, "loss": 2.3969, "step": 4130 }, { "epoch": 0.2243636425220587, "grad_norm": 0.46967896819114685, "learning_rate": 9.879669010945189e-05, "loss": 2.3995, "step": 4140 }, { "epoch": 0.2249055836875709, "grad_norm": 0.40588507056236267, "learning_rate": 9.877558349590341e-05, "loss": 2.3983, "step": 4150 }, { "epoch": 0.22544752485308314, "grad_norm": 0.32905760407447815, "learning_rate": 9.875429593087454e-05, "loss": 2.3965, "step": 4160 }, { "epoch": 0.22598946601859535, "grad_norm": 0.7571531534194946, "learning_rate": 9.873282750235993e-05, "loss": 2.3858, "step": 4170 }, { "epoch": 0.2263146307179027, "eval_loss": 2.7395927906036377, "eval_runtime": 21.9951, "eval_samples_per_second": 227.323, "eval_steps_per_second": 1.228, "step": 4176 }, { "epoch": 0.22653140718410758, "grad_norm": 0.4100936949253082, "learning_rate": 9.871117829910181e-05, "loss": 2.3913, "step": 4180 }, { "epoch": 0.22707334834961979, "grad_norm": 1.1051249504089355, "learning_rate": 9.868934841058972e-05, "loss": 2.385, "step": 4190 }, { "epoch": 0.22761528951513202, "grad_norm": 0.4184013903141022, "learning_rate": 9.866733792706003e-05, "loss": 2.3901, "step": 4200 }, { "epoch": 0.22815723068064422, "grad_norm": 0.8634352087974548, "learning_rate": 9.864514693949563e-05, "loss": 2.382, "step": 4210 }, { "epoch": 0.22869917184615646, "grad_norm": 0.42947426438331604, "learning_rate": 9.86227755396256e-05, "loss": 2.3847, "step": 4220 }, { "epoch": 0.22924111301166866, "grad_norm": 0.7337902784347534, "learning_rate": 9.860022381992467e-05, "loss": 2.3877, "step": 4230 }, { "epoch": 0.2297830541771809, "grad_norm": 0.33461204171180725, "learning_rate": 9.857749187361308e-05, "loss": 2.3771, "step": 4240 }, { "epoch": 0.2303249953426931, "grad_norm": 0.4798398017883301, "learning_rate": 9.85545797946559e-05, "loss": 2.3745, "step": 4250 }, { "epoch": 0.23086693650820533, "grad_norm": 0.7971112728118896, "learning_rate": 9.853148767776293e-05, "loss": 2.3856, "step": 4260 }, { "epoch": 0.231029518857859, "eval_loss": 2.7393031120300293, "eval_runtime": 21.9959, "eval_samples_per_second": 227.316, "eval_steps_per_second": 1.228, "step": 4263 }, { "epoch": 0.23140887767371754, "grad_norm": 0.5765118598937988, "learning_rate": 9.85082156183881e-05, "loss": 2.3818, "step": 4270 }, { "epoch": 0.23195081883922977, "grad_norm": 0.9162944555282593, "learning_rate": 9.848476371272922e-05, "loss": 2.3756, "step": 4280 }, { "epoch": 0.23249276000474198, "grad_norm": 0.757483959197998, "learning_rate": 9.846113205772746e-05, "loss": 2.3754, "step": 4290 }, { "epoch": 0.2330347011702542, "grad_norm": 0.5934547185897827, "learning_rate": 9.8437320751067e-05, "loss": 2.3688, "step": 4300 }, { "epoch": 0.23357664233576642, "grad_norm": 0.540972888469696, "learning_rate": 9.841332989117469e-05, "loss": 2.3713, "step": 4310 }, { "epoch": 0.23411858350127865, "grad_norm": 0.41171032190322876, "learning_rate": 9.838915957721953e-05, "loss": 2.3675, "step": 4320 }, { "epoch": 0.23466052466679085, "grad_norm": 0.5488543510437012, "learning_rate": 9.83648099091123e-05, "loss": 2.369, "step": 4330 }, { "epoch": 0.2352024658323031, "grad_norm": 0.4907141923904419, "learning_rate": 9.834028098750525e-05, "loss": 2.3694, "step": 4340 }, { "epoch": 0.2357444069978153, "grad_norm": 0.3236653804779053, "learning_rate": 9.83155729137915e-05, "loss": 2.3596, "step": 4350 }, { "epoch": 0.2357444069978153, "eval_loss": 2.7318739891052246, "eval_runtime": 21.9575, "eval_samples_per_second": 227.713, "eval_steps_per_second": 1.23, "step": 4350 }, { "epoch": 0.23628634816332753, "grad_norm": 0.29620563983917236, "learning_rate": 9.82906857901048e-05, "loss": 2.3661, "step": 4360 }, { "epoch": 0.23682828932883973, "grad_norm": 0.9057065844535828, "learning_rate": 9.826561971931891e-05, "loss": 2.366, "step": 4370 }, { "epoch": 0.23737023049435196, "grad_norm": 0.9793679118156433, "learning_rate": 9.824037480504741e-05, "loss": 2.3657, "step": 4380 }, { "epoch": 0.23791217165986417, "grad_norm": 0.6782917976379395, "learning_rate": 9.821495115164309e-05, "loss": 2.3674, "step": 4390 }, { "epoch": 0.2384541128253764, "grad_norm": 0.3981853425502777, "learning_rate": 9.818934886419756e-05, "loss": 2.3678, "step": 4400 }, { "epoch": 0.2389960539908886, "grad_norm": 0.5394397974014282, "learning_rate": 9.816356804854089e-05, "loss": 2.3544, "step": 4410 }, { "epoch": 0.23953799515640084, "grad_norm": 0.45772433280944824, "learning_rate": 9.813760881124107e-05, "loss": 2.3546, "step": 4420 }, { "epoch": 0.24007993632191305, "grad_norm": 0.5437049269676208, "learning_rate": 9.811147125960364e-05, "loss": 2.3592, "step": 4430 }, { "epoch": 0.24045929513777162, "eval_loss": 2.728144407272339, "eval_runtime": 21.9944, "eval_samples_per_second": 227.331, "eval_steps_per_second": 1.228, "step": 4437 }, { "epoch": 0.24062187748742528, "grad_norm": 0.8073667883872986, "learning_rate": 9.808515550167124e-05, "loss": 2.3523, "step": 4440 }, { "epoch": 0.24116381865293748, "grad_norm": 0.40337878465652466, "learning_rate": 9.805866164622311e-05, "loss": 2.3574, "step": 4450 }, { "epoch": 0.24170575981844972, "grad_norm": 0.7153369188308716, "learning_rate": 9.803198980277467e-05, "loss": 2.3557, "step": 4460 }, { "epoch": 0.24224770098396192, "grad_norm": 0.37222856283187866, "learning_rate": 9.800514008157711e-05, "loss": 2.3553, "step": 4470 }, { "epoch": 0.24278964214947416, "grad_norm": 0.3131774067878723, "learning_rate": 9.79781125936169e-05, "loss": 2.3562, "step": 4480 }, { "epoch": 0.24333158331498636, "grad_norm": 0.5932138562202454, "learning_rate": 9.79509074506153e-05, "loss": 2.3422, "step": 4490 }, { "epoch": 0.2438735244804986, "grad_norm": 0.6757357716560364, "learning_rate": 9.792352476502796e-05, "loss": 2.349, "step": 4500 }, { "epoch": 0.2444154656460108, "grad_norm": 0.9189159870147705, "learning_rate": 9.789596465004437e-05, "loss": 2.341, "step": 4510 }, { "epoch": 0.24495740681152303, "grad_norm": 0.40294119715690613, "learning_rate": 9.786822721958751e-05, "loss": 2.3526, "step": 4520 }, { "epoch": 0.24517418327772791, "eval_loss": 2.7286596298217773, "eval_runtime": 21.9954, "eval_samples_per_second": 227.32, "eval_steps_per_second": 1.228, "step": 4524 }, { "epoch": 0.24549934797703524, "grad_norm": 0.4049138128757477, "learning_rate": 9.784031258831325e-05, "loss": 2.3427, "step": 4530 }, { "epoch": 0.24604128914254747, "grad_norm": 0.5206155776977539, "learning_rate": 9.781222087161003e-05, "loss": 2.347, "step": 4540 }, { "epoch": 0.24658323030805968, "grad_norm": 0.3420925736427307, "learning_rate": 9.77839521855982e-05, "loss": 2.3421, "step": 4550 }, { "epoch": 0.2471251714735719, "grad_norm": 1.2839010953903198, "learning_rate": 9.775550664712966e-05, "loss": 2.3492, "step": 4560 }, { "epoch": 0.2476671126390841, "grad_norm": 0.6513694524765015, "learning_rate": 9.772688437378738e-05, "loss": 2.3417, "step": 4570 }, { "epoch": 0.24820905380459635, "grad_norm": 0.7786572575569153, "learning_rate": 9.769808548388488e-05, "loss": 2.333, "step": 4580 }, { "epoch": 0.24875099497010855, "grad_norm": 0.8283511996269226, "learning_rate": 9.766911009646569e-05, "loss": 2.3358, "step": 4590 }, { "epoch": 0.24929293613562079, "grad_norm": 0.5858640074729919, "learning_rate": 9.763995833130299e-05, "loss": 2.3424, "step": 4600 }, { "epoch": 0.249834877301133, "grad_norm": 0.41306570172309875, "learning_rate": 9.761063030889898e-05, "loss": 2.3338, "step": 4610 }, { "epoch": 0.2498890714176842, "eval_loss": 2.723004102706909, "eval_runtime": 21.9968, "eval_samples_per_second": 227.306, "eval_steps_per_second": 1.227, "step": 4611 }, { "epoch": 0.2503768184666452, "grad_norm": 0.32257717847824097, "learning_rate": 9.758112615048448e-05, "loss": 2.3332, "step": 4620 }, { "epoch": 0.25091875963215743, "grad_norm": 0.31163015961647034, "learning_rate": 9.755144597801837e-05, "loss": 2.3306, "step": 4630 }, { "epoch": 0.25146070079766963, "grad_norm": 0.6164253950119019, "learning_rate": 9.752158991418708e-05, "loss": 2.3288, "step": 4640 }, { "epoch": 0.2520026419631819, "grad_norm": 0.6337904930114746, "learning_rate": 9.749155808240415e-05, "loss": 2.3271, "step": 4650 }, { "epoch": 0.2525445831286941, "grad_norm": 0.3386767506599426, "learning_rate": 9.746135060680966e-05, "loss": 2.3342, "step": 4660 }, { "epoch": 0.2530865242942063, "grad_norm": 0.9337306022644043, "learning_rate": 9.743096761226972e-05, "loss": 2.3299, "step": 4670 }, { "epoch": 0.2536284654597185, "grad_norm": 0.31421270966529846, "learning_rate": 9.7400409224376e-05, "loss": 2.3302, "step": 4680 }, { "epoch": 0.25417040662523077, "grad_norm": 0.33126044273376465, "learning_rate": 9.736967556944516e-05, "loss": 2.325, "step": 4690 }, { "epoch": 0.25460395955764054, "eval_loss": 2.722799301147461, "eval_runtime": 25.1071, "eval_samples_per_second": 199.147, "eval_steps_per_second": 1.075, "step": 4698 }, { "epoch": 0.254712347790743, "grad_norm": 0.3403181731700897, "learning_rate": 9.733876677451833e-05, "loss": 2.319, "step": 4700 }, { "epoch": 0.2552542889562552, "grad_norm": 0.4288688004016876, "learning_rate": 9.730768296736064e-05, "loss": 2.322, "step": 4710 }, { "epoch": 0.2557962301217674, "grad_norm": 1.1494555473327637, "learning_rate": 9.727642427646061e-05, "loss": 2.319, "step": 4720 }, { "epoch": 0.25633817128727965, "grad_norm": 1.3199281692504883, "learning_rate": 9.72449908310297e-05, "loss": 2.3299, "step": 4730 }, { "epoch": 0.25688011245279185, "grad_norm": 1.220360279083252, "learning_rate": 9.721338276100172e-05, "loss": 2.3441, "step": 4740 }, { "epoch": 0.25742205361830406, "grad_norm": 1.106679916381836, "learning_rate": 9.718160019703232e-05, "loss": 2.3379, "step": 4750 }, { "epoch": 0.25796399478381626, "grad_norm": 1.3295619487762451, "learning_rate": 9.714964327049842e-05, "loss": 2.3501, "step": 4760 }, { "epoch": 0.2585059359493285, "grad_norm": 0.6279563307762146, "learning_rate": 9.711751211349773e-05, "loss": 2.3166, "step": 4770 }, { "epoch": 0.25904787711484073, "grad_norm": 0.3395664095878601, "learning_rate": 9.70852068588481e-05, "loss": 2.3087, "step": 4780 }, { "epoch": 0.25931884769759683, "eval_loss": 2.721379518508911, "eval_runtime": 21.9931, "eval_samples_per_second": 227.344, "eval_steps_per_second": 1.228, "step": 4785 }, { "epoch": 0.25958981828035294, "grad_norm": 0.32632923126220703, "learning_rate": 9.705272764008709e-05, "loss": 2.3064, "step": 4790 }, { "epoch": 0.26013175944586514, "grad_norm": 0.3286370038986206, "learning_rate": 9.702007459147134e-05, "loss": 2.3013, "step": 4800 }, { "epoch": 0.2606737006113774, "grad_norm": 0.6520362496376038, "learning_rate": 9.698724784797604e-05, "loss": 2.3196, "step": 4810 }, { "epoch": 0.2612156417768896, "grad_norm": 0.6383447051048279, "learning_rate": 9.695424754529434e-05, "loss": 2.3109, "step": 4820 }, { "epoch": 0.2617575829424018, "grad_norm": 0.3725563585758209, "learning_rate": 9.692107381983684e-05, "loss": 2.3072, "step": 4830 }, { "epoch": 0.262299524107914, "grad_norm": 0.5179880857467651, "learning_rate": 9.688772680873103e-05, "loss": 2.3065, "step": 4840 }, { "epoch": 0.2628414652734263, "grad_norm": 0.5016129612922668, "learning_rate": 9.685420664982067e-05, "loss": 2.3088, "step": 4850 }, { "epoch": 0.2633834064389385, "grad_norm": 0.5394784212112427, "learning_rate": 9.682051348166523e-05, "loss": 2.2979, "step": 4860 }, { "epoch": 0.2639253476044507, "grad_norm": 0.5578838586807251, "learning_rate": 9.678664744353935e-05, "loss": 2.3034, "step": 4870 }, { "epoch": 0.26403373583755313, "eval_loss": 2.723337173461914, "eval_runtime": 21.9924, "eval_samples_per_second": 227.351, "eval_steps_per_second": 1.228, "step": 4872 }, { "epoch": 0.2644672887699629, "grad_norm": 0.34530109167099, "learning_rate": 9.675260867543224e-05, "loss": 2.3096, "step": 4880 }, { "epoch": 0.26500922993547515, "grad_norm": 0.7636291980743408, "learning_rate": 9.671839731804716e-05, "loss": 2.307, "step": 4890 }, { "epoch": 0.26555117110098736, "grad_norm": 0.3167097568511963, "learning_rate": 9.66840135128007e-05, "loss": 2.3068, "step": 4900 }, { "epoch": 0.26609311226649957, "grad_norm": 0.45626381039619446, "learning_rate": 9.664945740182235e-05, "loss": 2.2948, "step": 4910 }, { "epoch": 0.26663505343201177, "grad_norm": 0.695152759552002, "learning_rate": 9.661472912795383e-05, "loss": 2.2975, "step": 4920 }, { "epoch": 0.26717699459752403, "grad_norm": 0.30375921726226807, "learning_rate": 9.65798288347485e-05, "loss": 2.3041, "step": 4930 }, { "epoch": 0.26771893576303624, "grad_norm": 0.8904628753662109, "learning_rate": 9.654475666647078e-05, "loss": 2.2957, "step": 4940 }, { "epoch": 0.26826087692854844, "grad_norm": 1.199233889579773, "learning_rate": 9.650951276809561e-05, "loss": 2.2994, "step": 4950 }, { "epoch": 0.2687486239775094, "eval_loss": 2.7227141857147217, "eval_runtime": 21.9965, "eval_samples_per_second": 227.309, "eval_steps_per_second": 1.227, "step": 4959 }, { "epoch": 0.26880281809406065, "grad_norm": 2.123131513595581, "learning_rate": 9.647409728530772e-05, "loss": 2.3143, "step": 4960 }, { "epoch": 0.2693447592595729, "grad_norm": 0.7237464189529419, "learning_rate": 9.643851036450115e-05, "loss": 2.3183, "step": 4970 }, { "epoch": 0.2698867004250851, "grad_norm": 0.6150104403495789, "learning_rate": 9.640275215277858e-05, "loss": 2.3048, "step": 4980 }, { "epoch": 0.2704286415905973, "grad_norm": 0.3302861154079437, "learning_rate": 9.636682279795076e-05, "loss": 2.2903, "step": 4990 }, { "epoch": 0.2709705827561095, "grad_norm": 0.7235733270645142, "learning_rate": 9.633072244853587e-05, "loss": 2.2807, "step": 5000 }, { "epoch": 0.2715125239216218, "grad_norm": 0.36814624071121216, "learning_rate": 9.629445125375891e-05, "loss": 2.2886, "step": 5010 }, { "epoch": 0.272054465087134, "grad_norm": 0.5725374221801758, "learning_rate": 9.625800936355108e-05, "loss": 2.2996, "step": 5020 }, { "epoch": 0.2725964062526462, "grad_norm": 0.45517677068710327, "learning_rate": 9.62213969285492e-05, "loss": 2.2876, "step": 5030 }, { "epoch": 0.2731383474181584, "grad_norm": 0.5474647879600525, "learning_rate": 9.618461410009503e-05, "loss": 2.2838, "step": 5040 }, { "epoch": 0.2734635121174657, "eval_loss": 2.713071346282959, "eval_runtime": 21.9919, "eval_samples_per_second": 227.356, "eval_steps_per_second": 1.228, "step": 5046 }, { "epoch": 0.27368028858367066, "grad_norm": 0.3035077452659607, "learning_rate": 9.614766103023473e-05, "loss": 2.2759, "step": 5050 }, { "epoch": 0.27422222974918287, "grad_norm": 0.6675074100494385, "learning_rate": 9.611053787171804e-05, "loss": 2.2889, "step": 5060 }, { "epoch": 0.27476417091469507, "grad_norm": 0.5895007252693176, "learning_rate": 9.607324477799793e-05, "loss": 2.2786, "step": 5070 }, { "epoch": 0.2753061120802073, "grad_norm": 0.781353771686554, "learning_rate": 9.603578190322974e-05, "loss": 2.2852, "step": 5080 }, { "epoch": 0.27584805324571954, "grad_norm": 0.3827020823955536, "learning_rate": 9.599814940227062e-05, "loss": 2.2813, "step": 5090 }, { "epoch": 0.27638999441123174, "grad_norm": 0.6647722125053406, "learning_rate": 9.59603474306789e-05, "loss": 2.279, "step": 5100 }, { "epoch": 0.27693193557674395, "grad_norm": 0.3594939112663269, "learning_rate": 9.592237614471346e-05, "loss": 2.2697, "step": 5110 }, { "epoch": 0.27747387674225615, "grad_norm": 0.3628252446651459, "learning_rate": 9.588423570133301e-05, "loss": 2.2752, "step": 5120 }, { "epoch": 0.2780158179077684, "grad_norm": 0.322070449590683, "learning_rate": 9.584592625819555e-05, "loss": 2.2714, "step": 5130 }, { "epoch": 0.2781784002574221, "eval_loss": 2.711329698562622, "eval_runtime": 22.0018, "eval_samples_per_second": 227.255, "eval_steps_per_second": 1.227, "step": 5133 }, { "epoch": 0.2785577590732806, "grad_norm": 0.4778302311897278, "learning_rate": 9.580744797365761e-05, "loss": 2.2721, "step": 5140 }, { "epoch": 0.2790997002387928, "grad_norm": 0.44883444905281067, "learning_rate": 9.57688010067737e-05, "loss": 2.2807, "step": 5150 }, { "epoch": 0.27964164140430503, "grad_norm": 0.9123862981796265, "learning_rate": 9.572998551729552e-05, "loss": 2.2729, "step": 5160 }, { "epoch": 0.2801835825698173, "grad_norm": 2.519432544708252, "learning_rate": 9.569100166567143e-05, "loss": 2.2818, "step": 5170 }, { "epoch": 0.2807255237353295, "grad_norm": 1.1521965265274048, "learning_rate": 9.565184961304577e-05, "loss": 2.2977, "step": 5180 }, { "epoch": 0.2812674649008417, "grad_norm": 1.3088760375976562, "learning_rate": 9.561252952125808e-05, "loss": 2.2886, "step": 5190 }, { "epoch": 0.2818094060663539, "grad_norm": 0.45803970098495483, "learning_rate": 9.557304155284256e-05, "loss": 2.2879, "step": 5200 }, { "epoch": 0.28235134723186617, "grad_norm": 0.4502812623977661, "learning_rate": 9.553338587102732e-05, "loss": 2.2774, "step": 5210 }, { "epoch": 0.2828932883973784, "grad_norm": 0.3390588164329529, "learning_rate": 9.549356263973376e-05, "loss": 2.2749, "step": 5220 }, { "epoch": 0.2828932883973784, "eval_loss": 2.7108614444732666, "eval_runtime": 21.8696, "eval_samples_per_second": 228.628, "eval_steps_per_second": 1.235, "step": 5220 }, { "epoch": 0.2834352295628906, "grad_norm": 0.31600069999694824, "learning_rate": 9.545357202357584e-05, "loss": 2.2686, "step": 5230 }, { "epoch": 0.2839771707284028, "grad_norm": 0.6420486569404602, "learning_rate": 9.541341418785944e-05, "loss": 2.2669, "step": 5240 }, { "epoch": 0.28451911189391504, "grad_norm": 0.6971787810325623, "learning_rate": 9.537308929858167e-05, "loss": 2.2553, "step": 5250 }, { "epoch": 0.28506105305942725, "grad_norm": 0.35600554943084717, "learning_rate": 9.533259752243015e-05, "loss": 2.2607, "step": 5260 }, { "epoch": 0.28560299422493945, "grad_norm": 0.4857500195503235, "learning_rate": 9.529193902678236e-05, "loss": 2.2604, "step": 5270 }, { "epoch": 0.28614493539045166, "grad_norm": 0.41848480701446533, "learning_rate": 9.525111397970495e-05, "loss": 2.2627, "step": 5280 }, { "epoch": 0.28668687655596387, "grad_norm": 0.39947494864463806, "learning_rate": 9.521012254995298e-05, "loss": 2.259, "step": 5290 }, { "epoch": 0.2872288177214761, "grad_norm": 0.6807723045349121, "learning_rate": 9.516896490696936e-05, "loss": 2.2634, "step": 5300 }, { "epoch": 0.28760817653733467, "eval_loss": 2.706552267074585, "eval_runtime": 21.9925, "eval_samples_per_second": 227.35, "eval_steps_per_second": 1.228, "step": 5307 }, { "epoch": 0.28777075888698833, "grad_norm": 0.7820791602134705, "learning_rate": 9.512764122088394e-05, "loss": 2.2567, "step": 5310 }, { "epoch": 0.28831270005250054, "grad_norm": 0.6624732613563538, "learning_rate": 9.508615166251305e-05, "loss": 2.263, "step": 5320 }, { "epoch": 0.28885464121801274, "grad_norm": 0.3670085072517395, "learning_rate": 9.504449640335858e-05, "loss": 2.2528, "step": 5330 }, { "epoch": 0.289396582383525, "grad_norm": 0.36448076367378235, "learning_rate": 9.500267561560746e-05, "loss": 2.2564, "step": 5340 }, { "epoch": 0.2899385235490372, "grad_norm": 0.3871496021747589, "learning_rate": 9.496068947213073e-05, "loss": 2.2561, "step": 5350 }, { "epoch": 0.2904804647145494, "grad_norm": 0.5003486275672913, "learning_rate": 9.491853814648305e-05, "loss": 2.2539, "step": 5360 }, { "epoch": 0.2910224058800616, "grad_norm": 1.1620166301727295, "learning_rate": 9.487622181290183e-05, "loss": 2.2517, "step": 5370 }, { "epoch": 0.2915643470455739, "grad_norm": 0.725284218788147, "learning_rate": 9.483374064630656e-05, "loss": 2.2456, "step": 5380 }, { "epoch": 0.2921062882110861, "grad_norm": 0.541685938835144, "learning_rate": 9.479109482229812e-05, "loss": 2.2553, "step": 5390 }, { "epoch": 0.29232306467729097, "eval_loss": 2.7066264152526855, "eval_runtime": 21.9975, "eval_samples_per_second": 227.298, "eval_steps_per_second": 1.227, "step": 5394 }, { "epoch": 0.2926482293765983, "grad_norm": 0.40974506735801697, "learning_rate": 9.474828451715798e-05, "loss": 2.2516, "step": 5400 }, { "epoch": 0.2931901705421105, "grad_norm": 0.5276529788970947, "learning_rate": 9.470530990784752e-05, "loss": 2.2554, "step": 5410 }, { "epoch": 0.29373211170762276, "grad_norm": 0.9658313393592834, "learning_rate": 9.466217117200735e-05, "loss": 2.2501, "step": 5420 }, { "epoch": 0.29427405287313496, "grad_norm": 0.6076865196228027, "learning_rate": 9.461886848795642e-05, "loss": 2.2438, "step": 5430 }, { "epoch": 0.29481599403864717, "grad_norm": 0.5501825213432312, "learning_rate": 9.457540203469142e-05, "loss": 2.2419, "step": 5440 }, { "epoch": 0.29535793520415937, "grad_norm": 0.41925522685050964, "learning_rate": 9.453177199188603e-05, "loss": 2.2496, "step": 5450 }, { "epoch": 0.29589987636967163, "grad_norm": 0.4038546681404114, "learning_rate": 9.448797853989013e-05, "loss": 2.255, "step": 5460 }, { "epoch": 0.29644181753518384, "grad_norm": 0.3127026855945587, "learning_rate": 9.444402185972901e-05, "loss": 2.2396, "step": 5470 }, { "epoch": 0.29698375870069604, "grad_norm": 0.6538156867027283, "learning_rate": 9.439990213310277e-05, "loss": 2.2377, "step": 5480 }, { "epoch": 0.29703795281724726, "eval_loss": 2.7067112922668457, "eval_runtime": 21.9955, "eval_samples_per_second": 227.32, "eval_steps_per_second": 1.228, "step": 5481 }, { "epoch": 0.29752569986620825, "grad_norm": 0.6304920315742493, "learning_rate": 9.435561954238548e-05, "loss": 2.2417, "step": 5490 }, { "epoch": 0.2980676410317205, "grad_norm": 0.5496434569358826, "learning_rate": 9.431117427062434e-05, "loss": 2.2353, "step": 5500 }, { "epoch": 0.2986095821972327, "grad_norm": 0.7714524865150452, "learning_rate": 9.426656650153909e-05, "loss": 2.2292, "step": 5510 }, { "epoch": 0.2991515233627449, "grad_norm": 0.7220959663391113, "learning_rate": 9.422179641952113e-05, "loss": 2.2506, "step": 5520 }, { "epoch": 0.2996934645282571, "grad_norm": 0.5006898045539856, "learning_rate": 9.417686420963283e-05, "loss": 2.2222, "step": 5530 }, { "epoch": 0.3002354056937694, "grad_norm": 0.3837469220161438, "learning_rate": 9.413177005760672e-05, "loss": 2.2429, "step": 5540 }, { "epoch": 0.3007773468592816, "grad_norm": 0.34782886505126953, "learning_rate": 9.408651414984472e-05, "loss": 2.2423, "step": 5550 }, { "epoch": 0.3013192880247938, "grad_norm": 0.35922521352767944, "learning_rate": 9.404109667341746e-05, "loss": 2.2349, "step": 5560 }, { "epoch": 0.30175284095720356, "eval_loss": 2.698547124862671, "eval_runtime": 21.9987, "eval_samples_per_second": 227.287, "eval_steps_per_second": 1.227, "step": 5568 }, { "epoch": 0.301861229190306, "grad_norm": 0.4572945237159729, "learning_rate": 9.399551781606329e-05, "loss": 2.2244, "step": 5570 }, { "epoch": 0.30240317035581826, "grad_norm": 0.3920208513736725, "learning_rate": 9.394977776618779e-05, "loss": 2.2299, "step": 5580 }, { "epoch": 0.30294511152133047, "grad_norm": 0.9155771136283875, "learning_rate": 9.390387671286279e-05, "loss": 2.2424, "step": 5590 }, { "epoch": 0.3034870526868427, "grad_norm": 0.34311428666114807, "learning_rate": 9.38578148458256e-05, "loss": 2.2273, "step": 5600 }, { "epoch": 0.3040289938523549, "grad_norm": 0.39582574367523193, "learning_rate": 9.381159235547839e-05, "loss": 2.2342, "step": 5610 }, { "epoch": 0.30457093501786714, "grad_norm": 0.37435415387153625, "learning_rate": 9.376520943288716e-05, "loss": 2.2312, "step": 5620 }, { "epoch": 0.30511287618337934, "grad_norm": 0.5571390986442566, "learning_rate": 9.371866626978118e-05, "loss": 2.2193, "step": 5630 }, { "epoch": 0.30565481734889155, "grad_norm": 0.348417192697525, "learning_rate": 9.367196305855199e-05, "loss": 2.24, "step": 5640 }, { "epoch": 0.30619675851440376, "grad_norm": 0.9660398364067078, "learning_rate": 9.362509999225281e-05, "loss": 2.2291, "step": 5650 }, { "epoch": 0.3064677290971599, "eval_loss": 2.700347900390625, "eval_runtime": 21.9973, "eval_samples_per_second": 227.301, "eval_steps_per_second": 1.227, "step": 5655 }, { "epoch": 0.306738699679916, "grad_norm": 0.6863952875137329, "learning_rate": 9.357807726459754e-05, "loss": 2.2291, "step": 5660 }, { "epoch": 0.3072806408454282, "grad_norm": 0.4206200838088989, "learning_rate": 9.353089506996016e-05, "loss": 2.2224, "step": 5670 }, { "epoch": 0.3078225820109404, "grad_norm": 0.9796401262283325, "learning_rate": 9.348355360337374e-05, "loss": 2.2199, "step": 5680 }, { "epoch": 0.30836452317645263, "grad_norm": 0.8994840979576111, "learning_rate": 9.343605306052977e-05, "loss": 2.2317, "step": 5690 }, { "epoch": 0.3089064643419649, "grad_norm": 0.6637131571769714, "learning_rate": 9.338839363777728e-05, "loss": 2.2211, "step": 5700 }, { "epoch": 0.3094484055074771, "grad_norm": 0.6918546557426453, "learning_rate": 9.334057553212204e-05, "loss": 2.2204, "step": 5710 }, { "epoch": 0.3099903466729893, "grad_norm": 0.3313464820384979, "learning_rate": 9.32925989412258e-05, "loss": 2.2202, "step": 5720 }, { "epoch": 0.3105322878385015, "grad_norm": 0.4739852249622345, "learning_rate": 9.324446406340537e-05, "loss": 2.2164, "step": 5730 }, { "epoch": 0.31107422900401377, "grad_norm": 1.1123601198196411, "learning_rate": 9.319617109763188e-05, "loss": 2.2114, "step": 5740 }, { "epoch": 0.3111826172371162, "eval_loss": 2.697807550430298, "eval_runtime": 29.7475, "eval_samples_per_second": 168.081, "eval_steps_per_second": 0.908, "step": 5742 }, { "epoch": 0.311616170169526, "grad_norm": 0.33748936653137207, "learning_rate": 9.314772024352995e-05, "loss": 2.2157, "step": 5750 }, { "epoch": 0.3121581113350382, "grad_norm": 0.35050588846206665, "learning_rate": 9.309911170137682e-05, "loss": 2.2109, "step": 5760 }, { "epoch": 0.3127000525005504, "grad_norm": 0.5154600739479065, "learning_rate": 9.30503456721016e-05, "loss": 2.2026, "step": 5770 }, { "epoch": 0.31324199366606265, "grad_norm": 0.5823956727981567, "learning_rate": 9.300142235728432e-05, "loss": 2.2115, "step": 5780 }, { "epoch": 0.31378393483157485, "grad_norm": 0.7447882890701294, "learning_rate": 9.295234195915523e-05, "loss": 2.2098, "step": 5790 }, { "epoch": 0.31432587599708706, "grad_norm": 0.6827070116996765, "learning_rate": 9.290310468059389e-05, "loss": 2.2105, "step": 5800 }, { "epoch": 0.31486781716259926, "grad_norm": 0.42079001665115356, "learning_rate": 9.285371072512831e-05, "loss": 2.2102, "step": 5810 }, { "epoch": 0.3154097583281115, "grad_norm": 0.7562916874885559, "learning_rate": 9.280416029693419e-05, "loss": 2.2039, "step": 5820 }, { "epoch": 0.3158975053770725, "eval_loss": 2.691206455230713, "eval_runtime": 48.3032, "eval_samples_per_second": 103.513, "eval_steps_per_second": 0.559, "step": 5829 }, { "epoch": 0.3159516994936237, "grad_norm": 0.6581352949142456, "learning_rate": 9.275445360083398e-05, "loss": 2.2065, "step": 5830 }, { "epoch": 0.31649364065913593, "grad_norm": 0.8172852993011475, "learning_rate": 9.270459084229612e-05, "loss": 2.2078, "step": 5840 }, { "epoch": 0.31703558182464814, "grad_norm": 0.8475619554519653, "learning_rate": 9.265457222743414e-05, "loss": 2.2049, "step": 5850 }, { "epoch": 0.3175775229901604, "grad_norm": 0.37048637866973877, "learning_rate": 9.260439796300582e-05, "loss": 2.2131, "step": 5860 }, { "epoch": 0.3181194641556726, "grad_norm": 0.4438531994819641, "learning_rate": 9.255406825641233e-05, "loss": 2.1972, "step": 5870 }, { "epoch": 0.3186614053211848, "grad_norm": 0.7300620079040527, "learning_rate": 9.250358331569737e-05, "loss": 2.1957, "step": 5880 }, { "epoch": 0.319203346486697, "grad_norm": 0.42140352725982666, "learning_rate": 9.245294334954636e-05, "loss": 2.2043, "step": 5890 }, { "epoch": 0.3197452876522093, "grad_norm": 1.2300440073013306, "learning_rate": 9.24021485672855e-05, "loss": 2.2077, "step": 5900 }, { "epoch": 0.3202872288177215, "grad_norm": 0.9198090434074402, "learning_rate": 9.23511991788809e-05, "loss": 2.1974, "step": 5910 }, { "epoch": 0.3206123935170288, "eval_loss": 2.7107741832733154, "eval_runtime": 52.6522, "eval_samples_per_second": 94.963, "eval_steps_per_second": 0.513, "step": 5916 }, { "epoch": 0.3208291699832337, "grad_norm": 0.9036116600036621, "learning_rate": 9.230009539493787e-05, "loss": 2.2209, "step": 5920 }, { "epoch": 0.3213711111487459, "grad_norm": 0.5668349266052246, "learning_rate": 9.224883742669982e-05, "loss": 2.2166, "step": 5930 }, { "epoch": 0.32191305231425815, "grad_norm": 0.5860267281532288, "learning_rate": 9.219742548604756e-05, "loss": 2.1962, "step": 5940 }, { "epoch": 0.32245499347977036, "grad_norm": 0.3655729591846466, "learning_rate": 9.214585978549832e-05, "loss": 2.2041, "step": 5950 }, { "epoch": 0.32299693464528256, "grad_norm": 0.34334370493888855, "learning_rate": 9.209414053820495e-05, "loss": 2.198, "step": 5960 }, { "epoch": 0.32353887581079477, "grad_norm": 0.39599186182022095, "learning_rate": 9.2042267957955e-05, "loss": 2.1826, "step": 5970 }, { "epoch": 0.32408081697630703, "grad_norm": 0.46203556656837463, "learning_rate": 9.199024225916982e-05, "loss": 2.1947, "step": 5980 }, { "epoch": 0.32462275814181923, "grad_norm": 0.43823203444480896, "learning_rate": 9.193806365690371e-05, "loss": 2.1897, "step": 5990 }, { "epoch": 0.32516469930733144, "grad_norm": 0.3208545744419098, "learning_rate": 9.1885732366843e-05, "loss": 2.1952, "step": 6000 }, { "epoch": 0.00016258234965366573, "eval_loss": 2.6887643337249756, "eval_runtime": 22.0393, "eval_samples_per_second": 226.867, "eval_steps_per_second": 1.225, "step": 6003 }, { "epoch": 0.000541941165512219, "grad_norm": 0.34268251061439514, "learning_rate": 9.183324860530519e-05, "loss": 2.1881, "step": 6010 }, { "epoch": 0.001083882331024438, "grad_norm": 0.6927510499954224, "learning_rate": 9.178061258923802e-05, "loss": 2.1964, "step": 6020 }, { "epoch": 0.0016258234965366573, "grad_norm": 0.7226220965385437, "learning_rate": 9.172782453621862e-05, "loss": 2.1923, "step": 6030 }, { "epoch": 0.002167764662048876, "grad_norm": 0.6258693337440491, "learning_rate": 9.167488466445255e-05, "loss": 2.1977, "step": 6040 }, { "epoch": 0.0027097058275610954, "grad_norm": 0.9858429431915283, "learning_rate": 9.162179319277296e-05, "loss": 2.1936, "step": 6050 }, { "epoch": 0.0032516469930733145, "grad_norm": 0.6434946656227112, "learning_rate": 9.156855034063963e-05, "loss": 2.2008, "step": 6060 }, { "epoch": 0.0037935881585855337, "grad_norm": 0.49262088537216187, "learning_rate": 9.151515632813808e-05, "loss": 2.1885, "step": 6070 }, { "epoch": 0.004335529324097752, "grad_norm": 0.5512810945510864, "learning_rate": 9.14616113759787e-05, "loss": 2.1764, "step": 6080 }, { "epoch": 0.0048774704896099716, "grad_norm": 0.38034671545028687, "learning_rate": 9.140791570549583e-05, "loss": 2.1774, "step": 6090 }, { "epoch": 0.0048774704896099716, "eval_loss": 2.6898508071899414, "eval_runtime": 21.9661, "eval_samples_per_second": 227.624, "eval_steps_per_second": 1.229, "step": 6090 }, { "epoch": 0.005419411655122191, "grad_norm": 0.9372425079345703, "learning_rate": 9.135406953864675e-05, "loss": 2.1785, "step": 6100 }, { "epoch": 0.00596135282063441, "grad_norm": 0.36292022466659546, "learning_rate": 9.130007309801089e-05, "loss": 2.1849, "step": 6110 }, { "epoch": 0.006503293986146629, "grad_norm": 0.47751471400260925, "learning_rate": 9.12459266067888e-05, "loss": 2.1846, "step": 6120 }, { "epoch": 0.007045235151658848, "grad_norm": 0.5181723237037659, "learning_rate": 9.119163028880136e-05, "loss": 2.1772, "step": 6130 }, { "epoch": 0.007587176317171067, "grad_norm": 0.641890823841095, "learning_rate": 9.113718436848873e-05, "loss": 2.1719, "step": 6140 }, { "epoch": 0.008129117482683286, "grad_norm": 0.6258209347724915, "learning_rate": 9.108258907090944e-05, "loss": 2.1779, "step": 6150 }, { "epoch": 0.008671058648195505, "grad_norm": 0.41677314043045044, "learning_rate": 9.102784462173954e-05, "loss": 2.1814, "step": 6160 }, { "epoch": 0.009212999813707724, "grad_norm": 0.4094899594783783, "learning_rate": 9.097295124727161e-05, "loss": 2.169, "step": 6170 }, { "epoch": 0.009592358629566278, "eval_loss": 2.683100461959839, "eval_runtime": 22.9235, "eval_samples_per_second": 218.117, "eval_steps_per_second": 1.178, "step": 6177 }, { "epoch": 0.009754940979219943, "grad_norm": 0.6764087080955505, "learning_rate": 9.091790917441381e-05, "loss": 2.1747, "step": 6180 }, { "epoch": 0.010296882144732162, "grad_norm": 0.6722964644432068, "learning_rate": 9.086271863068893e-05, "loss": 2.1805, "step": 6190 }, { "epoch": 0.010838823310244381, "grad_norm": 0.5901824235916138, "learning_rate": 9.080737984423358e-05, "loss": 2.1904, "step": 6200 }, { "epoch": 0.0113807644757566, "grad_norm": 0.4733932614326477, "learning_rate": 9.075189304379703e-05, "loss": 2.1834, "step": 6210 }, { "epoch": 0.01192270564126882, "grad_norm": 0.6376433968544006, "learning_rate": 9.06962584587405e-05, "loss": 2.1765, "step": 6220 }, { "epoch": 0.012464646806781039, "grad_norm": 0.559138298034668, "learning_rate": 9.0640476319036e-05, "loss": 2.1715, "step": 6230 }, { "epoch": 0.013006587972293258, "grad_norm": 0.7046493887901306, "learning_rate": 9.05845468552655e-05, "loss": 2.1645, "step": 6240 }, { "epoch": 0.013548529137805477, "grad_norm": 1.1869386434555054, "learning_rate": 9.052847029861999e-05, "loss": 2.1667, "step": 6250 }, { "epoch": 0.014090470303317696, "grad_norm": 0.5609129071235657, "learning_rate": 9.047224688089845e-05, "loss": 2.1764, "step": 6260 }, { "epoch": 0.014307246769522584, "eval_loss": 2.6828575134277344, "eval_runtime": 21.9786, "eval_samples_per_second": 227.494, "eval_steps_per_second": 1.228, "step": 6264 }, { "epoch": 0.014632411468829916, "grad_norm": 0.3735623359680176, "learning_rate": 9.041587683450695e-05, "loss": 2.159, "step": 6270 }, { "epoch": 0.015174352634342135, "grad_norm": 0.5112239718437195, "learning_rate": 9.035936039245761e-05, "loss": 2.1625, "step": 6280 }, { "epoch": 0.015716293799854352, "grad_norm": 0.7340002059936523, "learning_rate": 9.030269778836777e-05, "loss": 2.1693, "step": 6290 }, { "epoch": 0.01625823496536657, "grad_norm": 0.5309348106384277, "learning_rate": 9.024588925645889e-05, "loss": 2.1689, "step": 6300 }, { "epoch": 0.01680017613087879, "grad_norm": 0.33672529458999634, "learning_rate": 9.018893503155569e-05, "loss": 2.1676, "step": 6310 }, { "epoch": 0.01734211729639101, "grad_norm": 0.43856242299079895, "learning_rate": 9.013183534908508e-05, "loss": 2.1618, "step": 6320 }, { "epoch": 0.01788405846190323, "grad_norm": 0.385219931602478, "learning_rate": 9.007459044507528e-05, "loss": 2.1699, "step": 6330 }, { "epoch": 0.018425999627415448, "grad_norm": 0.5022485256195068, "learning_rate": 9.001720055615476e-05, "loss": 2.1621, "step": 6340 }, { "epoch": 0.018967940792927667, "grad_norm": 0.38395243883132935, "learning_rate": 8.995966591955132e-05, "loss": 2.1586, "step": 6350 }, { "epoch": 0.01902213490947889, "eval_loss": 2.6814119815826416, "eval_runtime": 23.8855, "eval_samples_per_second": 209.332, "eval_steps_per_second": 1.13, "step": 6351 }, { "epoch": 0.019509881958439886, "grad_norm": 0.3883703649044037, "learning_rate": 8.990198677309109e-05, "loss": 2.1581, "step": 6360 }, { "epoch": 0.020051823123952105, "grad_norm": 0.3477310538291931, "learning_rate": 8.984416335519754e-05, "loss": 2.1587, "step": 6370 }, { "epoch": 0.020593764289464325, "grad_norm": 0.35369783639907837, "learning_rate": 8.978619590489055e-05, "loss": 2.164, "step": 6380 }, { "epoch": 0.021135705454976544, "grad_norm": 0.6048836708068848, "learning_rate": 8.972808466178529e-05, "loss": 2.1553, "step": 6390 }, { "epoch": 0.021677646620488763, "grad_norm": 0.5932040214538574, "learning_rate": 8.966982986609141e-05, "loss": 2.1536, "step": 6400 }, { "epoch": 0.022219587786000982, "grad_norm": 0.6168770790100098, "learning_rate": 8.961143175861187e-05, "loss": 2.1562, "step": 6410 }, { "epoch": 0.0227615289515132, "grad_norm": 0.41919735074043274, "learning_rate": 8.955289058074207e-05, "loss": 2.1598, "step": 6420 }, { "epoch": 0.02330347011702542, "grad_norm": 0.6106112003326416, "learning_rate": 8.94942065744688e-05, "loss": 2.1558, "step": 6430 }, { "epoch": 0.023737023049435195, "eval_loss": 2.683075189590454, "eval_runtime": 21.975, "eval_samples_per_second": 227.532, "eval_steps_per_second": 1.229, "step": 6438 }, { "epoch": 0.02384541128253764, "grad_norm": 0.7993687987327576, "learning_rate": 8.943537998236922e-05, "loss": 2.1442, "step": 6440 }, { "epoch": 0.02438735244804986, "grad_norm": 0.6228429079055786, "learning_rate": 8.937641104760994e-05, "loss": 2.1479, "step": 6450 }, { "epoch": 0.024929293613562078, "grad_norm": 0.47318562865257263, "learning_rate": 8.931730001394591e-05, "loss": 2.1512, "step": 6460 }, { "epoch": 0.025471234779074297, "grad_norm": 0.3833015263080597, "learning_rate": 8.92580471257195e-05, "loss": 2.1565, "step": 6470 }, { "epoch": 0.026013175944586516, "grad_norm": 0.357759565114975, "learning_rate": 8.919865262785941e-05, "loss": 2.15, "step": 6480 }, { "epoch": 0.026555117110098735, "grad_norm": 0.478135347366333, "learning_rate": 8.913911676587976e-05, "loss": 2.1399, "step": 6490 }, { "epoch": 0.027097058275610954, "grad_norm": 0.3849744498729706, "learning_rate": 8.907943978587896e-05, "loss": 2.1505, "step": 6500 }, { "epoch": 0.027638999441123174, "grad_norm": 0.7612195611000061, "learning_rate": 8.901962193453875e-05, "loss": 2.1517, "step": 6510 }, { "epoch": 0.028180940606635393, "grad_norm": 0.39389902353286743, "learning_rate": 8.895966345912322e-05, "loss": 2.1512, "step": 6520 }, { "epoch": 0.028451911189391502, "eval_loss": 2.684783935546875, "eval_runtime": 21.9732, "eval_samples_per_second": 227.55, "eval_steps_per_second": 1.229, "step": 6525 }, { "epoch": 0.028722881772147612, "grad_norm": 0.3195110559463501, "learning_rate": 8.889956460747773e-05, "loss": 2.1501, "step": 6530 }, { "epoch": 0.02926482293765983, "grad_norm": 0.34488433599472046, "learning_rate": 8.883932562802787e-05, "loss": 2.1392, "step": 6540 }, { "epoch": 0.02980676410317205, "grad_norm": 0.49418726563453674, "learning_rate": 8.877894676977848e-05, "loss": 2.148, "step": 6550 }, { "epoch": 0.03034870526868427, "grad_norm": 0.48197141289711, "learning_rate": 8.871842828231265e-05, "loss": 2.1441, "step": 6560 }, { "epoch": 0.03089064643419649, "grad_norm": 0.5270874500274658, "learning_rate": 8.865777041579057e-05, "loss": 2.1395, "step": 6570 }, { "epoch": 0.031432587599708704, "grad_norm": 0.42525890469551086, "learning_rate": 8.859697342094864e-05, "loss": 2.1438, "step": 6580 }, { "epoch": 0.03197452876522092, "grad_norm": 0.3241468071937561, "learning_rate": 8.85360375490983e-05, "loss": 2.1313, "step": 6590 }, { "epoch": 0.03251646993073314, "grad_norm": 0.4242691695690155, "learning_rate": 8.84749630521251e-05, "loss": 2.1338, "step": 6600 }, { "epoch": 0.03305841109624536, "grad_norm": 0.3246161937713623, "learning_rate": 8.84137501824876e-05, "loss": 2.1438, "step": 6610 }, { "epoch": 0.03316679932934781, "eval_loss": 2.6783671379089355, "eval_runtime": 21.9713, "eval_samples_per_second": 227.57, "eval_steps_per_second": 1.229, "step": 6612 }, { "epoch": 0.03360035226175758, "grad_norm": 0.37965840101242065, "learning_rate": 8.835239919321632e-05, "loss": 2.1324, "step": 6620 }, { "epoch": 0.0341422934272698, "grad_norm": 0.6797935962677002, "learning_rate": 8.829091033791274e-05, "loss": 2.1389, "step": 6630 }, { "epoch": 0.03468423459278202, "grad_norm": 0.7013537287712097, "learning_rate": 8.822928387074821e-05, "loss": 2.1369, "step": 6640 }, { "epoch": 0.03522617575829424, "grad_norm": 0.344716340303421, "learning_rate": 8.816752004646294e-05, "loss": 2.1294, "step": 6650 }, { "epoch": 0.03576811692380646, "grad_norm": 0.33730098605155945, "learning_rate": 8.810561912036489e-05, "loss": 2.148, "step": 6660 }, { "epoch": 0.03631005808931868, "grad_norm": 0.41755759716033936, "learning_rate": 8.804358134832874e-05, "loss": 2.133, "step": 6670 }, { "epoch": 0.036851999254830896, "grad_norm": 1.0376006364822388, "learning_rate": 8.798140698679489e-05, "loss": 2.1425, "step": 6680 }, { "epoch": 0.037393940420343115, "grad_norm": 0.673877477645874, "learning_rate": 8.791909629276827e-05, "loss": 2.1426, "step": 6690 }, { "epoch": 0.037881687469304114, "eval_loss": 2.6814136505126953, "eval_runtime": 21.9797, "eval_samples_per_second": 227.482, "eval_steps_per_second": 1.228, "step": 6699 }, { "epoch": 0.037935881585855334, "grad_norm": 0.5003832578659058, "learning_rate": 8.785664952381746e-05, "loss": 2.1438, "step": 6700 }, { "epoch": 0.03847782275136755, "grad_norm": 0.4365040957927704, "learning_rate": 8.77940669380734e-05, "loss": 2.1297, "step": 6710 }, { "epoch": 0.03901976391687977, "grad_norm": 0.3096308708190918, "learning_rate": 8.773134879422856e-05, "loss": 2.134, "step": 6720 }, { "epoch": 0.03956170508239199, "grad_norm": 0.9517742395401001, "learning_rate": 8.766849535153568e-05, "loss": 2.1274, "step": 6730 }, { "epoch": 0.04010364624790421, "grad_norm": 0.4467058777809143, "learning_rate": 8.760550686980681e-05, "loss": 2.1337, "step": 6740 }, { "epoch": 0.04064558741341643, "grad_norm": 0.619213879108429, "learning_rate": 8.754238360941218e-05, "loss": 2.1252, "step": 6750 }, { "epoch": 0.04118752857892865, "grad_norm": 0.5627840161323547, "learning_rate": 8.747912583127913e-05, "loss": 2.1229, "step": 6760 }, { "epoch": 0.04172946974444087, "grad_norm": 0.34763067960739136, "learning_rate": 8.741573379689109e-05, "loss": 2.1238, "step": 6770 }, { "epoch": 0.04227141090995309, "grad_norm": 0.45356321334838867, "learning_rate": 8.735220776828641e-05, "loss": 2.1258, "step": 6780 }, { "epoch": 0.04259657560926042, "eval_loss": 2.6720757484436035, "eval_runtime": 25.5872, "eval_samples_per_second": 195.41, "eval_steps_per_second": 1.055, "step": 6786 }, { "epoch": 0.04281335207546531, "grad_norm": 0.3579098880290985, "learning_rate": 8.728854800805733e-05, "loss": 2.122, "step": 6790 }, { "epoch": 0.043355293240977526, "grad_norm": 0.49912065267562866, "learning_rate": 8.722475477934894e-05, "loss": 2.1271, "step": 6800 }, { "epoch": 0.043897234406489745, "grad_norm": 0.5217333436012268, "learning_rate": 8.716082834585797e-05, "loss": 2.1158, "step": 6810 }, { "epoch": 0.044439175572001964, "grad_norm": 0.6708068251609802, "learning_rate": 8.709676897183176e-05, "loss": 2.1309, "step": 6820 }, { "epoch": 0.04498111673751418, "grad_norm": 0.4707426130771637, "learning_rate": 8.703257692206724e-05, "loss": 2.1155, "step": 6830 }, { "epoch": 0.0455230579030264, "grad_norm": 0.5237321257591248, "learning_rate": 8.696825246190972e-05, "loss": 2.1245, "step": 6840 }, { "epoch": 0.04606499906853862, "grad_norm": 0.6378241181373596, "learning_rate": 8.690379585725186e-05, "loss": 2.118, "step": 6850 }, { "epoch": 0.04660694023405084, "grad_norm": 0.33660611510276794, "learning_rate": 8.683920737453254e-05, "loss": 2.128, "step": 6860 }, { "epoch": 0.04714888139956306, "grad_norm": 0.2975222170352936, "learning_rate": 8.677448728073583e-05, "loss": 2.1156, "step": 6870 }, { "epoch": 0.04731146374921673, "eval_loss": 2.673781394958496, "eval_runtime": 22.7337, "eval_samples_per_second": 219.938, "eval_steps_per_second": 1.188, "step": 6873 }, { "epoch": 0.04769082256507528, "grad_norm": 0.34510669112205505, "learning_rate": 8.670963584338975e-05, "loss": 2.1145, "step": 6880 }, { "epoch": 0.0482327637305875, "grad_norm": 0.3134111166000366, "learning_rate": 8.664465333056526e-05, "loss": 2.1195, "step": 6890 }, { "epoch": 0.04877470489609972, "grad_norm": 0.37668564915657043, "learning_rate": 8.657954001087521e-05, "loss": 2.1195, "step": 6900 }, { "epoch": 0.049316646061611936, "grad_norm": 0.8455840945243835, "learning_rate": 8.651429615347309e-05, "loss": 2.1116, "step": 6910 }, { "epoch": 0.049858587227124156, "grad_norm": 1.0009831190109253, "learning_rate": 8.644892202805195e-05, "loss": 2.1119, "step": 6920 }, { "epoch": 0.050400528392636375, "grad_norm": 0.9167200326919556, "learning_rate": 8.638341790484341e-05, "loss": 2.1142, "step": 6930 }, { "epoch": 0.050942469558148594, "grad_norm": 0.6210402250289917, "learning_rate": 8.631778405461638e-05, "loss": 2.1147, "step": 6940 }, { "epoch": 0.05148441072366081, "grad_norm": 0.6486452221870422, "learning_rate": 8.625202074867607e-05, "loss": 2.107, "step": 6950 }, { "epoch": 0.05202635188917303, "grad_norm": 0.2966337502002716, "learning_rate": 8.618612825886272e-05, "loss": 2.0978, "step": 6960 }, { "epoch": 0.05202635188917303, "eval_loss": 2.667224168777466, "eval_runtime": 21.9683, "eval_samples_per_second": 227.601, "eval_steps_per_second": 1.229, "step": 6960 }, { "epoch": 0.05256829305468525, "grad_norm": 1.5529571771621704, "learning_rate": 8.612010685755066e-05, "loss": 2.1092, "step": 6970 }, { "epoch": 0.05311023422019747, "grad_norm": 0.6471717357635498, "learning_rate": 8.605395681764706e-05, "loss": 2.1193, "step": 6980 }, { "epoch": 0.05365217538570969, "grad_norm": 0.5466669797897339, "learning_rate": 8.598767841259078e-05, "loss": 2.1173, "step": 6990 }, { "epoch": 0.05419411655122191, "grad_norm": 0.3719353973865509, "learning_rate": 8.592127191635138e-05, "loss": 2.1186, "step": 7000 }, { "epoch": 0.05473605771673413, "grad_norm": 0.6376820206642151, "learning_rate": 8.585473760342786e-05, "loss": 2.1094, "step": 7010 }, { "epoch": 0.05527799888224635, "grad_norm": 0.36836645007133484, "learning_rate": 8.578807574884756e-05, "loss": 2.1028, "step": 7020 }, { "epoch": 0.055819940047758566, "grad_norm": 0.318914532661438, "learning_rate": 8.572128662816498e-05, "loss": 2.107, "step": 7030 }, { "epoch": 0.056361881213270786, "grad_norm": 0.5992175340652466, "learning_rate": 8.56543705174608e-05, "loss": 2.1014, "step": 7040 }, { "epoch": 0.056741240029129336, "eval_loss": 2.667701244354248, "eval_runtime": 21.977, "eval_samples_per_second": 227.51, "eval_steps_per_second": 1.229, "step": 7047 }, { "epoch": 0.056903822378783005, "grad_norm": 0.3516386151313782, "learning_rate": 8.558732769334055e-05, "loss": 2.1166, "step": 7050 }, { "epoch": 0.057445763544295224, "grad_norm": 1.2428702116012573, "learning_rate": 8.552015843293358e-05, "loss": 2.1083, "step": 7060 }, { "epoch": 0.05798770470980744, "grad_norm": 0.812322735786438, "learning_rate": 8.545286301389183e-05, "loss": 2.1043, "step": 7070 }, { "epoch": 0.05852964587531966, "grad_norm": 0.5382115840911865, "learning_rate": 8.538544171438879e-05, "loss": 2.0998, "step": 7080 }, { "epoch": 0.05907158704083188, "grad_norm": 0.7883318662643433, "learning_rate": 8.531789481311824e-05, "loss": 2.1118, "step": 7090 }, { "epoch": 0.0596135282063441, "grad_norm": 0.3522588312625885, "learning_rate": 8.525022258929319e-05, "loss": 2.1094, "step": 7100 }, { "epoch": 0.06015546937185632, "grad_norm": 0.5178406238555908, "learning_rate": 8.518242532264468e-05, "loss": 2.0969, "step": 7110 }, { "epoch": 0.06069741053736854, "grad_norm": 0.6928080916404724, "learning_rate": 8.511450329342061e-05, "loss": 2.0937, "step": 7120 }, { "epoch": 0.06123935170288076, "grad_norm": 0.33726730942726135, "learning_rate": 8.504645678238462e-05, "loss": 2.0893, "step": 7130 }, { "epoch": 0.06145612816908565, "eval_loss": 2.6699295043945312, "eval_runtime": 21.9715, "eval_samples_per_second": 227.567, "eval_steps_per_second": 1.229, "step": 7134 }, { "epoch": 0.06178129286839298, "grad_norm": 0.5048802495002747, "learning_rate": 8.497828607081488e-05, "loss": 2.1111, "step": 7140 }, { "epoch": 0.062323234033905196, "grad_norm": 0.3473956286907196, "learning_rate": 8.490999144050299e-05, "loss": 2.1016, "step": 7150 }, { "epoch": 0.06286517519941741, "grad_norm": 0.3585287928581238, "learning_rate": 8.484157317375276e-05, "loss": 2.091, "step": 7160 }, { "epoch": 0.06340711636492963, "grad_norm": 0.929985761642456, "learning_rate": 8.47730315533791e-05, "loss": 2.103, "step": 7170 }, { "epoch": 0.06394905753044185, "grad_norm": 0.5229865312576294, "learning_rate": 8.470436686270678e-05, "loss": 2.09, "step": 7180 }, { "epoch": 0.06449099869595407, "grad_norm": 0.6161647439002991, "learning_rate": 8.463557938556928e-05, "loss": 2.0994, "step": 7190 }, { "epoch": 0.06503293986146629, "grad_norm": 0.4682093858718872, "learning_rate": 8.456666940630772e-05, "loss": 2.094, "step": 7200 }, { "epoch": 0.0655748810269785, "grad_norm": 0.35867783427238464, "learning_rate": 8.449763720976947e-05, "loss": 2.0879, "step": 7210 }, { "epoch": 0.06611682219249072, "grad_norm": 0.44877171516418457, "learning_rate": 8.442848308130723e-05, "loss": 2.0922, "step": 7220 }, { "epoch": 0.06617101630904194, "eval_loss": 2.6712543964385986, "eval_runtime": 21.9738, "eval_samples_per_second": 227.544, "eval_steps_per_second": 1.229, "step": 7221 }, { "epoch": 0.06665876335800294, "grad_norm": 0.4711715877056122, "learning_rate": 8.435920730677762e-05, "loss": 2.0889, "step": 7230 }, { "epoch": 0.06720070452351516, "grad_norm": 0.5644111037254333, "learning_rate": 8.428981017254012e-05, "loss": 2.0885, "step": 7240 }, { "epoch": 0.06774264568902738, "grad_norm": 0.6288381814956665, "learning_rate": 8.42202919654559e-05, "loss": 2.0922, "step": 7250 }, { "epoch": 0.0682845868545396, "grad_norm": 0.39220157265663147, "learning_rate": 8.41506529728866e-05, "loss": 2.0756, "step": 7260 }, { "epoch": 0.06882652802005182, "grad_norm": 0.4057351052761078, "learning_rate": 8.408089348269307e-05, "loss": 2.0918, "step": 7270 }, { "epoch": 0.06936846918556404, "grad_norm": 0.4261212944984436, "learning_rate": 8.401101378323434e-05, "loss": 2.0806, "step": 7280 }, { "epoch": 0.06991041035107626, "grad_norm": 0.6397058367729187, "learning_rate": 8.394101416336627e-05, "loss": 2.0885, "step": 7290 }, { "epoch": 0.07045235151658848, "grad_norm": 0.7682284712791443, "learning_rate": 8.387089491244048e-05, "loss": 2.0817, "step": 7300 }, { "epoch": 0.07088590444899825, "eval_loss": 2.657864809036255, "eval_runtime": 21.9777, "eval_samples_per_second": 227.503, "eval_steps_per_second": 1.229, "step": 7308 }, { "epoch": 0.0709942926821007, "grad_norm": 0.4054642617702484, "learning_rate": 8.380065632030305e-05, "loss": 2.093, "step": 7310 }, { "epoch": 0.07153623384761292, "grad_norm": 0.42616334557533264, "learning_rate": 8.37302986772934e-05, "loss": 2.0864, "step": 7320 }, { "epoch": 0.07207817501312513, "grad_norm": 0.5650444030761719, "learning_rate": 8.365982227424306e-05, "loss": 2.0884, "step": 7330 }, { "epoch": 0.07262011617863735, "grad_norm": 0.9685772061347961, "learning_rate": 8.358922740247447e-05, "loss": 2.0863, "step": 7340 }, { "epoch": 0.07316205734414957, "grad_norm": 0.8171983957290649, "learning_rate": 8.351851435379974e-05, "loss": 2.0896, "step": 7350 }, { "epoch": 0.07370399850966179, "grad_norm": 0.6826300024986267, "learning_rate": 8.34476834205195e-05, "loss": 2.0914, "step": 7360 }, { "epoch": 0.07424593967517401, "grad_norm": 0.8995702266693115, "learning_rate": 8.337673489542172e-05, "loss": 2.0719, "step": 7370 }, { "epoch": 0.07478788084068623, "grad_norm": 0.3203958570957184, "learning_rate": 8.330566907178038e-05, "loss": 2.0861, "step": 7380 }, { "epoch": 0.07532982200619845, "grad_norm": 0.3850855827331543, "learning_rate": 8.323448624335435e-05, "loss": 2.0916, "step": 7390 }, { "epoch": 0.07560079258895457, "eval_loss": 2.671057939529419, "eval_runtime": 21.9786, "eval_samples_per_second": 227.494, "eval_steps_per_second": 1.228, "step": 7395 }, { "epoch": 0.07587176317171067, "grad_norm": 1.0707188844680786, "learning_rate": 8.316318670438614e-05, "loss": 2.0833, "step": 7400 }, { "epoch": 0.07641370433722289, "grad_norm": 0.36094823479652405, "learning_rate": 8.309177074960073e-05, "loss": 2.087, "step": 7410 }, { "epoch": 0.0769556455027351, "grad_norm": 0.34225502610206604, "learning_rate": 8.30202386742043e-05, "loss": 2.0898, "step": 7420 }, { "epoch": 0.07749758666824733, "grad_norm": 0.3515205681324005, "learning_rate": 8.294859077388301e-05, "loss": 2.0787, "step": 7430 }, { "epoch": 0.07803952783375954, "grad_norm": 0.4942980408668518, "learning_rate": 8.287682734480182e-05, "loss": 2.0727, "step": 7440 }, { "epoch": 0.07858146899927176, "grad_norm": 0.5919073224067688, "learning_rate": 8.280494868360325e-05, "loss": 2.085, "step": 7450 }, { "epoch": 0.07912341016478398, "grad_norm": 0.5980909466743469, "learning_rate": 8.27329550874061e-05, "loss": 2.0768, "step": 7460 }, { "epoch": 0.0796653513302962, "grad_norm": 0.4768329858779907, "learning_rate": 8.266084685380434e-05, "loss": 2.0824, "step": 7470 }, { "epoch": 0.08020729249580842, "grad_norm": 0.6061816215515137, "learning_rate": 8.258862428086572e-05, "loss": 2.0784, "step": 7480 }, { "epoch": 0.08031568072891086, "eval_loss": 2.6704938411712646, "eval_runtime": 21.9746, "eval_samples_per_second": 227.536, "eval_steps_per_second": 1.229, "step": 7482 }, { "epoch": 0.08074923366132064, "grad_norm": 0.4925580322742462, "learning_rate": 8.251628766713068e-05, "loss": 2.0779, "step": 7490 }, { "epoch": 0.08129117482683286, "grad_norm": 0.38380166888237, "learning_rate": 8.244383731161109e-05, "loss": 2.0689, "step": 7500 }, { "epoch": 0.08183311599234508, "grad_norm": 0.7923119068145752, "learning_rate": 8.237127351378889e-05, "loss": 2.0795, "step": 7510 }, { "epoch": 0.0823750571578573, "grad_norm": 0.6250379085540771, "learning_rate": 8.229859657361504e-05, "loss": 2.0803, "step": 7520 }, { "epoch": 0.08291699832336952, "grad_norm": 0.32212239503860474, "learning_rate": 8.222580679150813e-05, "loss": 2.0658, "step": 7530 }, { "epoch": 0.08345893948888174, "grad_norm": 0.35104089975357056, "learning_rate": 8.215290446835322e-05, "loss": 2.0689, "step": 7540 }, { "epoch": 0.08400088065439396, "grad_norm": 0.6346544623374939, "learning_rate": 8.207988990550055e-05, "loss": 2.0708, "step": 7550 }, { "epoch": 0.08454282181990617, "grad_norm": 0.3987348675727844, "learning_rate": 8.200676340476437e-05, "loss": 2.0642, "step": 7560 }, { "epoch": 0.08503056886886717, "eval_loss": 2.6780905723571777, "eval_runtime": 21.979, "eval_samples_per_second": 227.49, "eval_steps_per_second": 1.228, "step": 7569 }, { "epoch": 0.0850847629854184, "grad_norm": 0.9453224539756775, "learning_rate": 8.193352526842159e-05, "loss": 2.0647, "step": 7570 }, { "epoch": 0.08562670415093061, "grad_norm": 0.3680925667285919, "learning_rate": 8.186017579921055e-05, "loss": 2.0611, "step": 7580 }, { "epoch": 0.08616864531644283, "grad_norm": 0.8765574097633362, "learning_rate": 8.178671530032988e-05, "loss": 2.0722, "step": 7590 }, { "epoch": 0.08671058648195505, "grad_norm": 0.674678385257721, "learning_rate": 8.171314407543708e-05, "loss": 2.0738, "step": 7600 }, { "epoch": 0.08725252764746727, "grad_norm": 0.582229495048523, "learning_rate": 8.163946242864744e-05, "loss": 2.0725, "step": 7610 }, { "epoch": 0.08779446881297949, "grad_norm": 0.4998132288455963, "learning_rate": 8.15656706645326e-05, "loss": 2.0749, "step": 7620 }, { "epoch": 0.08833640997849171, "grad_norm": 0.38911086320877075, "learning_rate": 8.149176908811947e-05, "loss": 2.0665, "step": 7630 }, { "epoch": 0.08887835114400393, "grad_norm": 0.7126851081848145, "learning_rate": 8.141775800488877e-05, "loss": 2.0599, "step": 7640 }, { "epoch": 0.08942029230951615, "grad_norm": 0.6182229518890381, "learning_rate": 8.134363772077399e-05, "loss": 2.0657, "step": 7650 }, { "epoch": 0.08974545700882348, "eval_loss": 2.663647174835205, "eval_runtime": 21.9735, "eval_samples_per_second": 227.547, "eval_steps_per_second": 1.229, "step": 7656 }, { "epoch": 0.08996223347502837, "grad_norm": 0.5466020107269287, "learning_rate": 8.126940854215997e-05, "loss": 2.0764, "step": 7660 }, { "epoch": 0.09050417464054059, "grad_norm": 0.7600970268249512, "learning_rate": 8.119507077588165e-05, "loss": 2.0711, "step": 7670 }, { "epoch": 0.0910461158060528, "grad_norm": 0.3224121034145355, "learning_rate": 8.11206247292229e-05, "loss": 2.0595, "step": 7680 }, { "epoch": 0.09158805697156502, "grad_norm": 0.3413337171077728, "learning_rate": 8.10460707099151e-05, "loss": 2.0586, "step": 7690 }, { "epoch": 0.09212999813707724, "grad_norm": 0.337230384349823, "learning_rate": 8.097140902613601e-05, "loss": 2.0525, "step": 7700 }, { "epoch": 0.09267193930258946, "grad_norm": 1.0965609550476074, "learning_rate": 8.089663998650839e-05, "loss": 2.0618, "step": 7710 }, { "epoch": 0.09321388046810168, "grad_norm": 0.7565444707870483, "learning_rate": 8.082176390009878e-05, "loss": 2.0523, "step": 7720 }, { "epoch": 0.0937558216336139, "grad_norm": 0.49668484926223755, "learning_rate": 8.074678107641623e-05, "loss": 2.0595, "step": 7730 }, { "epoch": 0.09429776279912612, "grad_norm": 0.9314787983894348, "learning_rate": 8.067169182541099e-05, "loss": 2.0604, "step": 7740 }, { "epoch": 0.09446034514877978, "eval_loss": 2.659668445587158, "eval_runtime": 21.9747, "eval_samples_per_second": 227.535, "eval_steps_per_second": 1.229, "step": 7743 }, { "epoch": 0.09483970396463834, "grad_norm": 0.6480481624603271, "learning_rate": 8.059649645747325e-05, "loss": 2.0626, "step": 7750 }, { "epoch": 0.09538164513015056, "grad_norm": 0.9633429050445557, "learning_rate": 8.052119528343181e-05, "loss": 2.0628, "step": 7760 }, { "epoch": 0.09592358629566278, "grad_norm": 0.5564477443695068, "learning_rate": 8.044578861455286e-05, "loss": 2.0621, "step": 7770 }, { "epoch": 0.096465527461175, "grad_norm": 0.613134503364563, "learning_rate": 8.037027676253866e-05, "loss": 2.059, "step": 7780 }, { "epoch": 0.09700746862668722, "grad_norm": 0.7734301686286926, "learning_rate": 8.029466003952628e-05, "loss": 2.062, "step": 7790 }, { "epoch": 0.09754940979219943, "grad_norm": 0.31820887327194214, "learning_rate": 8.021893875808625e-05, "loss": 2.0541, "step": 7800 }, { "epoch": 0.09809135095771165, "grad_norm": 0.3358112573623657, "learning_rate": 8.014311323122131e-05, "loss": 2.0547, "step": 7810 }, { "epoch": 0.09863329212322387, "grad_norm": 0.5778986811637878, "learning_rate": 8.006718377236514e-05, "loss": 2.057, "step": 7820 }, { "epoch": 0.09917523328873609, "grad_norm": 0.318333238363266, "learning_rate": 7.999115069538099e-05, "loss": 2.0503, "step": 7830 }, { "epoch": 0.09917523328873609, "eval_loss": 2.6534852981567383, "eval_runtime": 21.973, "eval_samples_per_second": 227.552, "eval_steps_per_second": 1.229, "step": 7830 }, { "epoch": 0.09971717445424831, "grad_norm": 0.5033796429634094, "learning_rate": 7.991501431456047e-05, "loss": 2.0504, "step": 7840 }, { "epoch": 0.10025911561976053, "grad_norm": 0.39477086067199707, "learning_rate": 7.983877494462219e-05, "loss": 2.0601, "step": 7850 }, { "epoch": 0.10080105678527275, "grad_norm": 0.37355226278305054, "learning_rate": 7.976243290071045e-05, "loss": 2.0529, "step": 7860 }, { "epoch": 0.10134299795078497, "grad_norm": 0.6517744064331055, "learning_rate": 7.968598849839404e-05, "loss": 2.0555, "step": 7870 }, { "epoch": 0.10188493911629719, "grad_norm": 0.3515459895133972, "learning_rate": 7.960944205366478e-05, "loss": 2.0515, "step": 7880 }, { "epoch": 0.10242688028180941, "grad_norm": 0.590758740901947, "learning_rate": 7.953279388293634e-05, "loss": 2.0536, "step": 7890 }, { "epoch": 0.10296882144732163, "grad_norm": 0.3137708008289337, "learning_rate": 7.945604430304289e-05, "loss": 2.0495, "step": 7900 }, { "epoch": 0.10351076261283385, "grad_norm": 0.5091531872749329, "learning_rate": 7.93791936312377e-05, "loss": 2.0461, "step": 7910 }, { "epoch": 0.1038901214286924, "eval_loss": 2.6576335430145264, "eval_runtime": 21.9728, "eval_samples_per_second": 227.554, "eval_steps_per_second": 1.229, "step": 7917 }, { "epoch": 0.10405270377834606, "grad_norm": 0.33678263425827026, "learning_rate": 7.930224218519207e-05, "loss": 2.0508, "step": 7920 }, { "epoch": 0.10459464494385828, "grad_norm": 0.36633098125457764, "learning_rate": 7.922519028299376e-05, "loss": 2.0451, "step": 7930 }, { "epoch": 0.1051365861093705, "grad_norm": 0.531390905380249, "learning_rate": 7.914803824314579e-05, "loss": 2.0512, "step": 7940 }, { "epoch": 0.10567852727488272, "grad_norm": 0.6033332347869873, "learning_rate": 7.907078638456506e-05, "loss": 2.0421, "step": 7950 }, { "epoch": 0.10622046844039494, "grad_norm": 0.7366685271263123, "learning_rate": 7.899343502658123e-05, "loss": 2.0543, "step": 7960 }, { "epoch": 0.10676240960590716, "grad_norm": 0.5616899132728577, "learning_rate": 7.891598448893508e-05, "loss": 2.0458, "step": 7970 }, { "epoch": 0.10730435077141938, "grad_norm": 0.7124751806259155, "learning_rate": 7.88384350917775e-05, "loss": 2.0412, "step": 7980 }, { "epoch": 0.1078462919369316, "grad_norm": 0.5059435963630676, "learning_rate": 7.876078715566794e-05, "loss": 2.0452, "step": 7990 }, { "epoch": 0.10838823310244382, "grad_norm": 0.297770619392395, "learning_rate": 7.868304100157318e-05, "loss": 2.0514, "step": 8000 }, { "epoch": 0.1086050095686487, "eval_loss": 2.6553268432617188, "eval_runtime": 21.9655, "eval_samples_per_second": 227.63, "eval_steps_per_second": 1.229, "step": 8004 }, { "epoch": 0.10893017426795604, "grad_norm": 0.890073299407959, "learning_rate": 7.860519695086608e-05, "loss": 2.0421, "step": 8010 }, { "epoch": 0.10947211543346826, "grad_norm": 0.9192163348197937, "learning_rate": 7.852725532532405e-05, "loss": 2.0366, "step": 8020 }, { "epoch": 0.11001405659898048, "grad_norm": 0.3507506847381592, "learning_rate": 7.84492164471279e-05, "loss": 2.0436, "step": 8030 }, { "epoch": 0.1105559977644927, "grad_norm": 0.315594881772995, "learning_rate": 7.837108063886046e-05, "loss": 2.0421, "step": 8040 }, { "epoch": 0.11109793893000491, "grad_norm": 0.6328078508377075, "learning_rate": 7.829284822350516e-05, "loss": 2.0414, "step": 8050 }, { "epoch": 0.11163988009551713, "grad_norm": 0.6946396231651306, "learning_rate": 7.821451952444487e-05, "loss": 2.0334, "step": 8060 }, { "epoch": 0.11218182126102935, "grad_norm": 0.3898144066333771, "learning_rate": 7.813609486546036e-05, "loss": 2.0386, "step": 8070 }, { "epoch": 0.11272376242654157, "grad_norm": 0.8166786432266235, "learning_rate": 7.805757457072913e-05, "loss": 2.0515, "step": 8080 }, { "epoch": 0.11326570359205379, "grad_norm": 0.36218130588531494, "learning_rate": 7.797895896482396e-05, "loss": 2.0395, "step": 8090 }, { "epoch": 0.11331989770860501, "eval_loss": 2.6553170680999756, "eval_runtime": 21.9791, "eval_samples_per_second": 227.489, "eval_steps_per_second": 1.228, "step": 8091 }, { "epoch": 0.11380764475756601, "grad_norm": 0.34464749693870544, "learning_rate": 7.790024837271165e-05, "loss": 2.0413, "step": 8100 }, { "epoch": 0.11434958592307823, "grad_norm": 0.46647629141807556, "learning_rate": 7.782144311975158e-05, "loss": 2.047, "step": 8110 }, { "epoch": 0.11489152708859045, "grad_norm": 0.30626147985458374, "learning_rate": 7.77425435316945e-05, "loss": 2.0349, "step": 8120 }, { "epoch": 0.11543346825410267, "grad_norm": 0.3106520473957062, "learning_rate": 7.7663549934681e-05, "loss": 2.031, "step": 8130 }, { "epoch": 0.11597540941961489, "grad_norm": 0.6385337710380554, "learning_rate": 7.758446265524038e-05, "loss": 2.026, "step": 8140 }, { "epoch": 0.1165173505851271, "grad_norm": 0.7750070095062256, "learning_rate": 7.750528202028912e-05, "loss": 2.0243, "step": 8150 }, { "epoch": 0.11705929175063932, "grad_norm": 0.5177359580993652, "learning_rate": 7.74260083571296e-05, "loss": 2.0276, "step": 8160 }, { "epoch": 0.11760123291615154, "grad_norm": 0.9061049818992615, "learning_rate": 7.734664199344876e-05, "loss": 2.0382, "step": 8170 }, { "epoch": 0.11803478584856132, "eval_loss": 2.6514930725097656, "eval_runtime": 21.9691, "eval_samples_per_second": 227.592, "eval_steps_per_second": 1.229, "step": 8178 }, { "epoch": 0.11814317408166376, "grad_norm": 0.8364623188972473, "learning_rate": 7.726718325731671e-05, "loss": 2.0361, "step": 8180 }, { "epoch": 0.11868511524717598, "grad_norm": 0.8486116528511047, "learning_rate": 7.718763247718542e-05, "loss": 2.0466, "step": 8190 }, { "epoch": 0.1192270564126882, "grad_norm": 0.9383118152618408, "learning_rate": 7.710798998188731e-05, "loss": 2.0303, "step": 8200 }, { "epoch": 0.11976899757820042, "grad_norm": 0.6073142886161804, "learning_rate": 7.702825610063393e-05, "loss": 2.0357, "step": 8210 }, { "epoch": 0.12031093874371264, "grad_norm": 0.46166083216667175, "learning_rate": 7.694843116301458e-05, "loss": 2.0319, "step": 8220 }, { "epoch": 0.12085287990922486, "grad_norm": 0.5513178110122681, "learning_rate": 7.686851549899494e-05, "loss": 2.0273, "step": 8230 }, { "epoch": 0.12139482107473708, "grad_norm": 0.4025670289993286, "learning_rate": 7.678850943891573e-05, "loss": 2.0338, "step": 8240 }, { "epoch": 0.1219367622402493, "grad_norm": 0.7250736355781555, "learning_rate": 7.670841331349134e-05, "loss": 2.0223, "step": 8250 }, { "epoch": 0.12247870340576152, "grad_norm": 0.5293325185775757, "learning_rate": 7.662822745380845e-05, "loss": 2.0363, "step": 8260 }, { "epoch": 0.12274967398851762, "eval_loss": 2.6460092067718506, "eval_runtime": 21.9726, "eval_samples_per_second": 227.556, "eval_steps_per_second": 1.229, "step": 8265 }, { "epoch": 0.12302064457127374, "grad_norm": 0.4530751407146454, "learning_rate": 7.654795219132465e-05, "loss": 2.0366, "step": 8270 }, { "epoch": 0.12356258573678595, "grad_norm": 0.47878649830818176, "learning_rate": 7.64675878578671e-05, "loss": 2.0289, "step": 8280 }, { "epoch": 0.12410452690229817, "grad_norm": 0.5691394209861755, "learning_rate": 7.638713478563116e-05, "loss": 2.023, "step": 8290 }, { "epoch": 0.12464646806781039, "grad_norm": 0.5451269149780273, "learning_rate": 7.630659330717899e-05, "loss": 2.0293, "step": 8300 }, { "epoch": 0.1251884092333226, "grad_norm": 0.2928202450275421, "learning_rate": 7.622596375543815e-05, "loss": 2.0301, "step": 8310 }, { "epoch": 0.12573035039883482, "grad_norm": 0.4254739582538605, "learning_rate": 7.614524646370034e-05, "loss": 2.0301, "step": 8320 }, { "epoch": 0.12627229156434705, "grad_norm": 0.40174198150634766, "learning_rate": 7.606444176561989e-05, "loss": 2.0225, "step": 8330 }, { "epoch": 0.12681423272985926, "grad_norm": 0.32534322142601013, "learning_rate": 7.598354999521243e-05, "loss": 2.023, "step": 8340 }, { "epoch": 0.1273561738953715, "grad_norm": 0.6066051125526428, "learning_rate": 7.590257148685352e-05, "loss": 2.0251, "step": 8350 }, { "epoch": 0.12746456212847393, "eval_loss": 2.6490955352783203, "eval_runtime": 21.9681, "eval_samples_per_second": 227.602, "eval_steps_per_second": 1.229, "step": 8352 }, { "epoch": 0.1278981150608837, "grad_norm": 0.6870770454406738, "learning_rate": 7.582150657527732e-05, "loss": 2.021, "step": 8360 }, { "epoch": 0.12844005622639593, "grad_norm": 0.6110761761665344, "learning_rate": 7.574035559557506e-05, "loss": 2.0273, "step": 8370 }, { "epoch": 0.12898199739190813, "grad_norm": 0.32752725481987, "learning_rate": 7.565911888319375e-05, "loss": 2.0185, "step": 8380 }, { "epoch": 0.12952393855742036, "grad_norm": 0.5558075904846191, "learning_rate": 7.557779677393486e-05, "loss": 2.011, "step": 8390 }, { "epoch": 0.13006587972293257, "grad_norm": 0.46736517548561096, "learning_rate": 7.549638960395283e-05, "loss": 2.0255, "step": 8400 }, { "epoch": 0.1306078208884448, "grad_norm": 0.35495907068252563, "learning_rate": 7.541489770975365e-05, "loss": 2.0289, "step": 8410 }, { "epoch": 0.131149762053957, "grad_norm": 0.6544122099876404, "learning_rate": 7.533332142819358e-05, "loss": 2.0241, "step": 8420 }, { "epoch": 0.13169170321946924, "grad_norm": 0.4280942380428314, "learning_rate": 7.52516610964777e-05, "loss": 2.0245, "step": 8430 }, { "epoch": 0.13217945026843023, "eval_loss": 2.6533358097076416, "eval_runtime": 21.9742, "eval_samples_per_second": 227.54, "eval_steps_per_second": 1.229, "step": 8439 }, { "epoch": 0.13223364438498145, "grad_norm": 0.4303838610649109, "learning_rate": 7.516991705215853e-05, "loss": 2.0144, "step": 8440 }, { "epoch": 0.13277558555049368, "grad_norm": 0.7431203722953796, "learning_rate": 7.508808963313461e-05, "loss": 2.0236, "step": 8450 }, { "epoch": 0.13331752671600589, "grad_norm": 0.6149893403053284, "learning_rate": 7.500617917764908e-05, "loss": 2.0145, "step": 8460 }, { "epoch": 0.13385946788151812, "grad_norm": 0.3794841468334198, "learning_rate": 7.492418602428841e-05, "loss": 2.0113, "step": 8470 }, { "epoch": 0.13440140904703032, "grad_norm": 0.4046865403652191, "learning_rate": 7.484211051198085e-05, "loss": 2.0092, "step": 8480 }, { "epoch": 0.13494335021254256, "grad_norm": 0.4416908025741577, "learning_rate": 7.475995297999507e-05, "loss": 2.0194, "step": 8490 }, { "epoch": 0.13548529137805476, "grad_norm": 0.4587673246860504, "learning_rate": 7.467771376793887e-05, "loss": 2.0126, "step": 8500 }, { "epoch": 0.136027232543567, "grad_norm": 0.4561171233654022, "learning_rate": 7.459539321575758e-05, "loss": 2.0158, "step": 8510 }, { "epoch": 0.1365691737090792, "grad_norm": 0.4048430323600769, "learning_rate": 7.451299166373283e-05, "loss": 2.0094, "step": 8520 }, { "epoch": 0.13689433840838655, "eval_loss": 2.6495518684387207, "eval_runtime": 21.9712, "eval_samples_per_second": 227.571, "eval_steps_per_second": 1.229, "step": 8526 }, { "epoch": 0.13711111487459143, "grad_norm": 0.9034155011177063, "learning_rate": 7.443050945248102e-05, "loss": 2.0219, "step": 8530 }, { "epoch": 0.13765305604010364, "grad_norm": 0.7499250173568726, "learning_rate": 7.434794692295202e-05, "loss": 2.0136, "step": 8540 }, { "epoch": 0.13819499720561587, "grad_norm": 0.6840431094169617, "learning_rate": 7.426530441642766e-05, "loss": 2.0084, "step": 8550 }, { "epoch": 0.13873693837112808, "grad_norm": 0.44849923253059387, "learning_rate": 7.418258227452038e-05, "loss": 2.0216, "step": 8560 }, { "epoch": 0.1392788795366403, "grad_norm": 0.3296073079109192, "learning_rate": 7.40997808391718e-05, "loss": 2.0037, "step": 8570 }, { "epoch": 0.13982082070215252, "grad_norm": 0.34490448236465454, "learning_rate": 7.401690045265133e-05, "loss": 2.0104, "step": 8580 }, { "epoch": 0.14036276186766475, "grad_norm": 0.3491741418838501, "learning_rate": 7.39339414575547e-05, "loss": 2.0163, "step": 8590 }, { "epoch": 0.14090470303317695, "grad_norm": 0.37607693672180176, "learning_rate": 7.385090419680259e-05, "loss": 2.0061, "step": 8600 }, { "epoch": 0.1414466441986892, "grad_norm": 0.6395640969276428, "learning_rate": 7.37677890136392e-05, "loss": 2.0192, "step": 8610 }, { "epoch": 0.14160922654834285, "eval_loss": 2.6450469493865967, "eval_runtime": 21.9781, "eval_samples_per_second": 227.499, "eval_steps_per_second": 1.228, "step": 8613 }, { "epoch": 0.1419885853642014, "grad_norm": 0.44370248913764954, "learning_rate": 7.368459625163084e-05, "loss": 1.9993, "step": 8620 }, { "epoch": 0.14253052652971362, "grad_norm": 0.8615736961364746, "learning_rate": 7.360132625466452e-05, "loss": 1.9997, "step": 8630 }, { "epoch": 0.14307246769522583, "grad_norm": 0.45963114500045776, "learning_rate": 7.351797936694645e-05, "loss": 2.0108, "step": 8640 }, { "epoch": 0.14361440886073806, "grad_norm": 0.34752896428108215, "learning_rate": 7.34345559330007e-05, "loss": 2.0089, "step": 8650 }, { "epoch": 0.14415635002625027, "grad_norm": 0.36175432801246643, "learning_rate": 7.33510562976678e-05, "loss": 2.0071, "step": 8660 }, { "epoch": 0.1446982911917625, "grad_norm": 0.32326793670654297, "learning_rate": 7.326748080610324e-05, "loss": 1.9989, "step": 8670 }, { "epoch": 0.1452402323572747, "grad_norm": 1.1309995651245117, "learning_rate": 7.318382980377603e-05, "loss": 2.0054, "step": 8680 }, { "epoch": 0.14578217352278694, "grad_norm": 0.31617847084999084, "learning_rate": 7.310010363646736e-05, "loss": 2.0057, "step": 8690 }, { "epoch": 0.14632411468829915, "grad_norm": 0.34660130739212036, "learning_rate": 7.301630265026908e-05, "loss": 2.0027, "step": 8700 }, { "epoch": 0.14632411468829915, "eval_loss": 2.6555795669555664, "eval_runtime": 21.9664, "eval_samples_per_second": 227.62, "eval_steps_per_second": 1.229, "step": 8700 }, { "epoch": 0.14686605585381138, "grad_norm": 0.421053409576416, "learning_rate": 7.293242719158241e-05, "loss": 2.0116, "step": 8710 }, { "epoch": 0.14740799701932358, "grad_norm": 0.5713244080543518, "learning_rate": 7.284847760711628e-05, "loss": 2.0094, "step": 8720 }, { "epoch": 0.14794993818483582, "grad_norm": 0.5165607929229736, "learning_rate": 7.27644542438861e-05, "loss": 2.0045, "step": 8730 }, { "epoch": 0.14849187935034802, "grad_norm": 0.7992691993713379, "learning_rate": 7.268035744921225e-05, "loss": 1.9992, "step": 8740 }, { "epoch": 0.14903382051586025, "grad_norm": 0.42167598009109497, "learning_rate": 7.259618757071866e-05, "loss": 1.9992, "step": 8750 }, { "epoch": 0.14957576168137246, "grad_norm": 0.4957919418811798, "learning_rate": 7.251194495633132e-05, "loss": 2.0026, "step": 8760 }, { "epoch": 0.1501177028468847, "grad_norm": 0.512251079082489, "learning_rate": 7.24276299542769e-05, "loss": 1.9965, "step": 8770 }, { "epoch": 0.1506596440123969, "grad_norm": 0.30542322993278503, "learning_rate": 7.234324291308129e-05, "loss": 1.9981, "step": 8780 }, { "epoch": 0.15103900282825547, "eval_loss": 2.644069194793701, "eval_runtime": 21.9682, "eval_samples_per_second": 227.602, "eval_steps_per_second": 1.229, "step": 8787 }, { "epoch": 0.15120158517790913, "grad_norm": 0.7472847104072571, "learning_rate": 7.225878418156819e-05, "loss": 2.0041, "step": 8790 }, { "epoch": 0.15174352634342134, "grad_norm": 0.3889388144016266, "learning_rate": 7.217425410885759e-05, "loss": 1.9959, "step": 8800 }, { "epoch": 0.15228546750893357, "grad_norm": 0.40479448437690735, "learning_rate": 7.208965304436444e-05, "loss": 2.0035, "step": 8810 }, { "epoch": 0.15282740867444577, "grad_norm": 0.5486807227134705, "learning_rate": 7.200498133779706e-05, "loss": 2.0039, "step": 8820 }, { "epoch": 0.153369349839958, "grad_norm": 0.3424724042415619, "learning_rate": 7.192023933915586e-05, "loss": 2.0085, "step": 8830 }, { "epoch": 0.1539112910054702, "grad_norm": 0.3559320867061615, "learning_rate": 7.18354273987318e-05, "loss": 1.9998, "step": 8840 }, { "epoch": 0.15445323217098245, "grad_norm": 0.4217950105667114, "learning_rate": 7.175054586710486e-05, "loss": 2.0115, "step": 8850 }, { "epoch": 0.15499517333649465, "grad_norm": 0.8350193500518799, "learning_rate": 7.166559509514283e-05, "loss": 2.0065, "step": 8860 }, { "epoch": 0.15553711450200688, "grad_norm": 0.43232548236846924, "learning_rate": 7.158057543399957e-05, "loss": 2.0002, "step": 8870 }, { "epoch": 0.15575389096821177, "eval_loss": 2.642789602279663, "eval_runtime": 21.9675, "eval_samples_per_second": 227.609, "eval_steps_per_second": 1.229, "step": 8874 }, { "epoch": 0.1560790556675191, "grad_norm": 0.8233237862586975, "learning_rate": 7.149548723511377e-05, "loss": 2.004, "step": 8880 }, { "epoch": 0.15662099683303132, "grad_norm": 0.7288250923156738, "learning_rate": 7.141033085020747e-05, "loss": 1.997, "step": 8890 }, { "epoch": 0.15716293799854353, "grad_norm": 0.6390490531921387, "learning_rate": 7.132510663128448e-05, "loss": 2.0031, "step": 8900 }, { "epoch": 0.15770487916405576, "grad_norm": 0.41019633412361145, "learning_rate": 7.123981493062907e-05, "loss": 1.9946, "step": 8910 }, { "epoch": 0.15824682032956797, "grad_norm": 0.541032075881958, "learning_rate": 7.115445610080444e-05, "loss": 1.9958, "step": 8920 }, { "epoch": 0.1587887614950802, "grad_norm": 0.5177111029624939, "learning_rate": 7.106903049465123e-05, "loss": 1.9975, "step": 8930 }, { "epoch": 0.1593307026605924, "grad_norm": 0.3048165738582611, "learning_rate": 7.098353846528619e-05, "loss": 1.9908, "step": 8940 }, { "epoch": 0.15987264382610464, "grad_norm": 0.33911651372909546, "learning_rate": 7.089798036610058e-05, "loss": 2.002, "step": 8950 }, { "epoch": 0.16041458499161684, "grad_norm": 0.9156370162963867, "learning_rate": 7.081235655075878e-05, "loss": 1.9849, "step": 8960 }, { "epoch": 0.16046877910816806, "eval_loss": 2.637845754623413, "eval_runtime": 21.9743, "eval_samples_per_second": 227.538, "eval_steps_per_second": 1.229, "step": 8961 }, { "epoch": 0.16095652615712908, "grad_norm": 0.5576390027999878, "learning_rate": 7.072666737319683e-05, "loss": 1.9829, "step": 8970 }, { "epoch": 0.16149846732264128, "grad_norm": 0.36364230513572693, "learning_rate": 7.064091318762089e-05, "loss": 1.9983, "step": 8980 }, { "epoch": 0.16204040848815351, "grad_norm": 0.7000935077667236, "learning_rate": 7.055509434850597e-05, "loss": 1.9852, "step": 8990 }, { "epoch": 0.16258234965366572, "grad_norm": 0.6612693667411804, "learning_rate": 7.046921121059417e-05, "loss": 1.9829, "step": 9000 }, { "epoch": 0.16312429081917795, "grad_norm": 0.6157468557357788, "learning_rate": 7.038326412889353e-05, "loss": 1.9901, "step": 9010 }, { "epoch": 0.16366623198469016, "grad_norm": 0.33146920800209045, "learning_rate": 7.029725345867628e-05, "loss": 1.9893, "step": 9020 }, { "epoch": 0.1642081731502024, "grad_norm": 0.39117079973220825, "learning_rate": 7.02111795554776e-05, "loss": 1.9896, "step": 9030 }, { "epoch": 0.1647501143157146, "grad_norm": 0.4249345660209656, "learning_rate": 7.0125042775094e-05, "loss": 1.9823, "step": 9040 }, { "epoch": 0.1651836672481244, "eval_loss": 2.6504335403442383, "eval_runtime": 21.9717, "eval_samples_per_second": 227.566, "eval_steps_per_second": 1.229, "step": 9048 }, { "epoch": 0.16529205548122683, "grad_norm": 1.197481632232666, "learning_rate": 7.003884347358187e-05, "loss": 1.9937, "step": 9050 }, { "epoch": 0.16583399664673903, "grad_norm": 0.7394921183586121, "learning_rate": 6.995258200725613e-05, "loss": 1.9839, "step": 9060 }, { "epoch": 0.16637593781225127, "grad_norm": 0.6299876570701599, "learning_rate": 6.986625873268857e-05, "loss": 1.9854, "step": 9070 }, { "epoch": 0.16691787897776347, "grad_norm": 0.7766085863113403, "learning_rate": 6.977987400670654e-05, "loss": 1.9938, "step": 9080 }, { "epoch": 0.1674598201432757, "grad_norm": 0.40683385729789734, "learning_rate": 6.969342818639138e-05, "loss": 1.9844, "step": 9090 }, { "epoch": 0.1680017613087879, "grad_norm": 0.33413490653038025, "learning_rate": 6.960692162907695e-05, "loss": 1.9802, "step": 9100 }, { "epoch": 0.16854370247430014, "grad_norm": 0.3267943859100342, "learning_rate": 6.952035469234823e-05, "loss": 1.9908, "step": 9110 }, { "epoch": 0.16908564363981235, "grad_norm": 0.3775721788406372, "learning_rate": 6.943372773403972e-05, "loss": 1.9816, "step": 9120 }, { "epoch": 0.16962758480532458, "grad_norm": 0.42679643630981445, "learning_rate": 6.934704111223407e-05, "loss": 1.9814, "step": 9130 }, { "epoch": 0.16989855538808069, "eval_loss": 2.642576217651367, "eval_runtime": 21.9695, "eval_samples_per_second": 227.588, "eval_steps_per_second": 1.229, "step": 9135 }, { "epoch": 0.1701695259708368, "grad_norm": 0.542743444442749, "learning_rate": 6.926029518526054e-05, "loss": 1.995, "step": 9140 }, { "epoch": 0.17071146713634902, "grad_norm": 0.298638254404068, "learning_rate": 6.917349031169353e-05, "loss": 1.9888, "step": 9150 }, { "epoch": 0.17125340830186123, "grad_norm": 0.6379884481430054, "learning_rate": 6.908662685035111e-05, "loss": 1.9863, "step": 9160 }, { "epoch": 0.17179534946737346, "grad_norm": 0.3423231542110443, "learning_rate": 6.899970516029355e-05, "loss": 1.9858, "step": 9170 }, { "epoch": 0.17233729063288566, "grad_norm": 0.41656017303466797, "learning_rate": 6.891272560082177e-05, "loss": 1.9928, "step": 9180 }, { "epoch": 0.1728792317983979, "grad_norm": 0.41668909788131714, "learning_rate": 6.882568853147594e-05, "loss": 1.9793, "step": 9190 }, { "epoch": 0.1734211729639101, "grad_norm": 0.45455989241600037, "learning_rate": 6.873859431203393e-05, "loss": 1.9811, "step": 9200 }, { "epoch": 0.17396311412942234, "grad_norm": 0.421617716550827, "learning_rate": 6.865144330250984e-05, "loss": 1.9803, "step": 9210 }, { "epoch": 0.17450505529493454, "grad_norm": 0.33842185139656067, "learning_rate": 6.856423586315258e-05, "loss": 1.9708, "step": 9220 }, { "epoch": 0.17461344352803698, "eval_loss": 2.636101245880127, "eval_runtime": 21.9746, "eval_samples_per_second": 227.535, "eval_steps_per_second": 1.229, "step": 9222 }, { "epoch": 0.17504699646044677, "grad_norm": 0.4086952805519104, "learning_rate": 6.847697235444422e-05, "loss": 1.9812, "step": 9230 }, { "epoch": 0.17558893762595898, "grad_norm": 0.573617696762085, "learning_rate": 6.83896531370987e-05, "loss": 1.9738, "step": 9240 }, { "epoch": 0.1761308787914712, "grad_norm": 0.5594168901443481, "learning_rate": 6.830227857206014e-05, "loss": 1.9763, "step": 9250 }, { "epoch": 0.17667281995698342, "grad_norm": 0.820530116558075, "learning_rate": 6.821484902050152e-05, "loss": 1.9754, "step": 9260 }, { "epoch": 0.17721476112249565, "grad_norm": 0.33057236671447754, "learning_rate": 6.81273648438231e-05, "loss": 1.9775, "step": 9270 }, { "epoch": 0.17775670228800786, "grad_norm": 0.4354369640350342, "learning_rate": 6.803982640365092e-05, "loss": 1.9864, "step": 9280 }, { "epoch": 0.1782986434535201, "grad_norm": 0.3154381811618805, "learning_rate": 6.795223406183532e-05, "loss": 1.9862, "step": 9290 }, { "epoch": 0.1788405846190323, "grad_norm": 0.31054070591926575, "learning_rate": 6.78645881804495e-05, "loss": 1.9838, "step": 9300 }, { "epoch": 0.1793283316679933, "eval_loss": 2.6388654708862305, "eval_runtime": 21.9681, "eval_samples_per_second": 227.603, "eval_steps_per_second": 1.229, "step": 9309 }, { "epoch": 0.17938252578454453, "grad_norm": 0.3508462607860565, "learning_rate": 6.777688912178787e-05, "loss": 1.9744, "step": 9310 }, { "epoch": 0.17992446695005673, "grad_norm": 0.34929049015045166, "learning_rate": 6.768913724836477e-05, "loss": 1.9707, "step": 9320 }, { "epoch": 0.18046640811556897, "grad_norm": 0.32265743613243103, "learning_rate": 6.760133292291277e-05, "loss": 1.9676, "step": 9330 }, { "epoch": 0.18100834928108117, "grad_norm": 0.32434341311454773, "learning_rate": 6.751347650838134e-05, "loss": 1.96, "step": 9340 }, { "epoch": 0.1815502904465934, "grad_norm": 0.5159777998924255, "learning_rate": 6.742556836793518e-05, "loss": 1.9691, "step": 9350 }, { "epoch": 0.1820922316121056, "grad_norm": 0.30936720967292786, "learning_rate": 6.733760886495284e-05, "loss": 1.9696, "step": 9360 }, { "epoch": 0.18263417277761784, "grad_norm": 0.3295222222805023, "learning_rate": 6.724959836302518e-05, "loss": 1.9852, "step": 9370 }, { "epoch": 0.18317611394313005, "grad_norm": 0.631144106388092, "learning_rate": 6.716153722595392e-05, "loss": 1.9825, "step": 9380 }, { "epoch": 0.18371805510864228, "grad_norm": 0.4007103741168976, "learning_rate": 6.707342581775e-05, "loss": 1.9684, "step": 9390 }, { "epoch": 0.1840432198079496, "eval_loss": 2.635845422744751, "eval_runtime": 21.9618, "eval_samples_per_second": 227.668, "eval_steps_per_second": 1.229, "step": 9396 }, { "epoch": 0.18425999627415449, "grad_norm": 0.7023712396621704, "learning_rate": 6.69852645026322e-05, "loss": 1.973, "step": 9400 }, { "epoch": 0.18480193743966672, "grad_norm": 0.6182842254638672, "learning_rate": 6.689705364502562e-05, "loss": 1.9714, "step": 9410 }, { "epoch": 0.18534387860517892, "grad_norm": 0.5204935669898987, "learning_rate": 6.680879360956012e-05, "loss": 1.9647, "step": 9420 }, { "epoch": 0.18588581977069116, "grad_norm": 0.3946668803691864, "learning_rate": 6.672048476106886e-05, "loss": 1.9776, "step": 9430 }, { "epoch": 0.18642776093620336, "grad_norm": 0.44481825828552246, "learning_rate": 6.663212746458676e-05, "loss": 1.9718, "step": 9440 }, { "epoch": 0.1869697021017156, "grad_norm": 0.4806019365787506, "learning_rate": 6.6543722085349e-05, "loss": 1.9699, "step": 9450 }, { "epoch": 0.1875116432672278, "grad_norm": 0.3834541141986847, "learning_rate": 6.645526898878955e-05, "loss": 1.9617, "step": 9460 }, { "epoch": 0.18805358443274003, "grad_norm": 0.587719738483429, "learning_rate": 6.636676854053958e-05, "loss": 1.9663, "step": 9470 }, { "epoch": 0.18859552559825224, "grad_norm": 0.48372459411621094, "learning_rate": 6.627822110642603e-05, "loss": 1.9715, "step": 9480 }, { "epoch": 0.1887581079479059, "eval_loss": 2.629991292953491, "eval_runtime": 21.9683, "eval_samples_per_second": 227.601, "eval_steps_per_second": 1.229, "step": 9483 }, { "epoch": 0.18913746676376447, "grad_norm": 0.368070513010025, "learning_rate": 6.618962705247003e-05, "loss": 1.9678, "step": 9490 }, { "epoch": 0.18967940792927668, "grad_norm": 0.3025422692298889, "learning_rate": 6.610098674488546e-05, "loss": 1.9647, "step": 9500 }, { "epoch": 0.1902213490947889, "grad_norm": 0.4809657037258148, "learning_rate": 6.601230055007734e-05, "loss": 1.9742, "step": 9510 }, { "epoch": 0.19076329026030112, "grad_norm": 0.3071550726890564, "learning_rate": 6.592356883464043e-05, "loss": 1.9588, "step": 9520 }, { "epoch": 0.19130523142581332, "grad_norm": 0.3039422035217285, "learning_rate": 6.583479196535763e-05, "loss": 1.961, "step": 9530 }, { "epoch": 0.19184717259132555, "grad_norm": 1.0146269798278809, "learning_rate": 6.574597030919844e-05, "loss": 1.9722, "step": 9540 }, { "epoch": 0.19238911375683776, "grad_norm": 0.3183740973472595, "learning_rate": 6.565710423331757e-05, "loss": 1.9659, "step": 9550 }, { "epoch": 0.19293105492235, "grad_norm": 0.373046338558197, "learning_rate": 6.556819410505331e-05, "loss": 1.9615, "step": 9560 }, { "epoch": 0.1934729960878622, "grad_norm": 0.36389511823654175, "learning_rate": 6.547924029192603e-05, "loss": 1.9639, "step": 9570 }, { "epoch": 0.1934729960878622, "eval_loss": 2.6214120388031006, "eval_runtime": 21.9741, "eval_samples_per_second": 227.54, "eval_steps_per_second": 1.229, "step": 9570 }, { "epoch": 0.19401493725337443, "grad_norm": 0.3653751015663147, "learning_rate": 6.539024316163671e-05, "loss": 1.9665, "step": 9580 }, { "epoch": 0.19455687841888664, "grad_norm": 0.4899474084377289, "learning_rate": 6.530120308206536e-05, "loss": 1.968, "step": 9590 }, { "epoch": 0.19509881958439887, "grad_norm": 0.3284156322479248, "learning_rate": 6.521212042126951e-05, "loss": 1.9563, "step": 9600 }, { "epoch": 0.19564076074991107, "grad_norm": 0.6163231134414673, "learning_rate": 6.512299554748281e-05, "loss": 1.9764, "step": 9610 }, { "epoch": 0.1961827019154233, "grad_norm": 0.5178772807121277, "learning_rate": 6.503382882911322e-05, "loss": 1.97, "step": 9620 }, { "epoch": 0.1967246430809355, "grad_norm": 0.33963051438331604, "learning_rate": 6.494462063474181e-05, "loss": 1.9612, "step": 9630 }, { "epoch": 0.19726658424644775, "grad_norm": 0.2811043858528137, "learning_rate": 6.485537133312107e-05, "loss": 1.9682, "step": 9640 }, { "epoch": 0.19780852541195995, "grad_norm": 0.3501632511615753, "learning_rate": 6.476608129317342e-05, "loss": 1.9547, "step": 9650 }, { "epoch": 0.19818788422781852, "eval_loss": 2.6299521923065186, "eval_runtime": 21.9699, "eval_samples_per_second": 227.584, "eval_steps_per_second": 1.229, "step": 9657 }, { "epoch": 0.19835046657747218, "grad_norm": 0.3678341209888458, "learning_rate": 6.46767508839896e-05, "loss": 1.9613, "step": 9660 }, { "epoch": 0.1988924077429844, "grad_norm": 0.62614905834198, "learning_rate": 6.458738047482731e-05, "loss": 1.9626, "step": 9670 }, { "epoch": 0.19943434890849662, "grad_norm": 0.7425987124443054, "learning_rate": 6.449797043510954e-05, "loss": 1.9591, "step": 9680 }, { "epoch": 0.19997629007400883, "grad_norm": 0.3964003324508667, "learning_rate": 6.440852113442314e-05, "loss": 1.9638, "step": 9690 }, { "epoch": 0.20051823123952106, "grad_norm": 0.4931100308895111, "learning_rate": 6.431903294251721e-05, "loss": 1.9487, "step": 9700 }, { "epoch": 0.20106017240503327, "grad_norm": 0.31457212567329407, "learning_rate": 6.422950622930164e-05, "loss": 1.9618, "step": 9710 }, { "epoch": 0.2016021135705455, "grad_norm": 0.44526612758636475, "learning_rate": 6.413994136484553e-05, "loss": 1.9584, "step": 9720 }, { "epoch": 0.2021440547360577, "grad_norm": 0.3651030957698822, "learning_rate": 6.405033871937572e-05, "loss": 1.9614, "step": 9730 }, { "epoch": 0.20268599590156994, "grad_norm": 0.639445960521698, "learning_rate": 6.396069866327519e-05, "loss": 1.964, "step": 9740 }, { "epoch": 0.20290277236777482, "eval_loss": 2.6391215324401855, "eval_runtime": 21.9615, "eval_samples_per_second": 227.671, "eval_steps_per_second": 1.229, "step": 9744 }, { "epoch": 0.20322793706708214, "grad_norm": 0.45781099796295166, "learning_rate": 6.387102156708152e-05, "loss": 1.9654, "step": 9750 }, { "epoch": 0.20376987823259438, "grad_norm": 0.3018638789653778, "learning_rate": 6.37813078014855e-05, "loss": 1.9603, "step": 9760 }, { "epoch": 0.20431181939810658, "grad_norm": 0.2993408739566803, "learning_rate": 6.369155773732945e-05, "loss": 1.9519, "step": 9770 }, { "epoch": 0.20485376056361881, "grad_norm": 0.35188788175582886, "learning_rate": 6.360177174560567e-05, "loss": 1.945, "step": 9780 }, { "epoch": 0.20539570172913102, "grad_norm": 0.5056437849998474, "learning_rate": 6.351195019745508e-05, "loss": 1.9569, "step": 9790 }, { "epoch": 0.20593764289464325, "grad_norm": 0.4438895583152771, "learning_rate": 6.342209346416553e-05, "loss": 1.9503, "step": 9800 }, { "epoch": 0.20647958406015546, "grad_norm": 0.8153665661811829, "learning_rate": 6.333220191717026e-05, "loss": 1.9529, "step": 9810 }, { "epoch": 0.2070215252256677, "grad_norm": 0.47668522596359253, "learning_rate": 6.324227592804651e-05, "loss": 1.9515, "step": 9820 }, { "epoch": 0.2075634663911799, "grad_norm": 0.3650936782360077, "learning_rate": 6.315231586851382e-05, "loss": 1.9473, "step": 9830 }, { "epoch": 0.20761766050773112, "eval_loss": 2.6374318599700928, "eval_runtime": 21.9702, "eval_samples_per_second": 227.581, "eval_steps_per_second": 1.229, "step": 9831 }, { "epoch": 0.20810540755669213, "grad_norm": 0.4178442060947418, "learning_rate": 6.306232211043262e-05, "loss": 1.9593, "step": 9840 }, { "epoch": 0.20864734872220433, "grad_norm": 0.39025014638900757, "learning_rate": 6.297229502580257e-05, "loss": 1.957, "step": 9850 }, { "epoch": 0.20918928988771657, "grad_norm": 0.573856770992279, "learning_rate": 6.288223498676114e-05, "loss": 1.9455, "step": 9860 }, { "epoch": 0.20973123105322877, "grad_norm": 0.32634246349334717, "learning_rate": 6.279214236558201e-05, "loss": 1.9464, "step": 9870 }, { "epoch": 0.210273172218741, "grad_norm": 0.48771846294403076, "learning_rate": 6.270201753467351e-05, "loss": 1.9471, "step": 9880 }, { "epoch": 0.2108151133842532, "grad_norm": 0.3284076154232025, "learning_rate": 6.261186086657722e-05, "loss": 1.9512, "step": 9890 }, { "epoch": 0.21135705454976544, "grad_norm": 0.4014468789100647, "learning_rate": 6.252167273396614e-05, "loss": 1.9596, "step": 9900 }, { "epoch": 0.21189899571527765, "grad_norm": 0.2978026568889618, "learning_rate": 6.24314535096435e-05, "loss": 1.95, "step": 9910 }, { "epoch": 0.21233254864768744, "eval_loss": 2.6269800662994385, "eval_runtime": 21.9722, "eval_samples_per_second": 227.561, "eval_steps_per_second": 1.229, "step": 9918 }, { "epoch": 0.21244093688078988, "grad_norm": 0.36181050539016724, "learning_rate": 6.234120356654096e-05, "loss": 1.9543, "step": 9920 }, { "epoch": 0.2129828780463021, "grad_norm": 0.7839191555976868, "learning_rate": 6.225092327771723e-05, "loss": 1.9515, "step": 9930 }, { "epoch": 0.21352481921181432, "grad_norm": 1.2086142301559448, "learning_rate": 6.216061301635633e-05, "loss": 1.9502, "step": 9940 }, { "epoch": 0.21406676037732653, "grad_norm": 0.30762261152267456, "learning_rate": 6.207027315576635e-05, "loss": 1.9565, "step": 9950 }, { "epoch": 0.21460870154283876, "grad_norm": 0.31394830346107483, "learning_rate": 6.197990406937757e-05, "loss": 1.9466, "step": 9960 }, { "epoch": 0.21515064270835096, "grad_norm": 0.3644161820411682, "learning_rate": 6.188950613074122e-05, "loss": 1.954, "step": 9970 }, { "epoch": 0.2156925838738632, "grad_norm": 0.34890973567962646, "learning_rate": 6.179907971352766e-05, "loss": 1.936, "step": 9980 }, { "epoch": 0.2162345250393754, "grad_norm": 0.4451602101325989, "learning_rate": 6.170862519152505e-05, "loss": 1.9447, "step": 9990 }, { "epoch": 0.21677646620488764, "grad_norm": 0.33436113595962524, "learning_rate": 6.161814293863772e-05, "loss": 1.9497, "step": 10000 }, { "epoch": 0.21704743678764374, "eval_loss": 2.6255741119384766, "eval_runtime": 21.9675, "eval_samples_per_second": 227.609, "eval_steps_per_second": 1.229, "step": 10005 }, { "epoch": 0.21731840737039984, "grad_norm": 0.6699984669685364, "learning_rate": 6.152763332888464e-05, "loss": 1.9456, "step": 10010 }, { "epoch": 0.21786034853591207, "grad_norm": 0.3605382740497589, "learning_rate": 6.143709673639778e-05, "loss": 1.9341, "step": 10020 }, { "epoch": 0.21840228970142428, "grad_norm": 0.35184648633003235, "learning_rate": 6.134653353542074e-05, "loss": 1.9538, "step": 10030 }, { "epoch": 0.2189442308669365, "grad_norm": 0.3720100522041321, "learning_rate": 6.125594410030706e-05, "loss": 1.9441, "step": 10040 }, { "epoch": 0.21948617203244872, "grad_norm": 0.5039480924606323, "learning_rate": 6.116532880551876e-05, "loss": 1.9407, "step": 10050 }, { "epoch": 0.22002811319796095, "grad_norm": 0.3840113878250122, "learning_rate": 6.107468802562472e-05, "loss": 1.9444, "step": 10060 }, { "epoch": 0.22057005436347316, "grad_norm": 0.5373072028160095, "learning_rate": 6.098402213529916e-05, "loss": 1.948, "step": 10070 }, { "epoch": 0.2211119955289854, "grad_norm": 0.2886420786380768, "learning_rate": 6.089333150932014e-05, "loss": 1.9425, "step": 10080 }, { "epoch": 0.2216539366944976, "grad_norm": 0.2997380495071411, "learning_rate": 6.0802616522567914e-05, "loss": 1.9477, "step": 10090 }, { "epoch": 0.22176232492760004, "eval_loss": 2.6249098777770996, "eval_runtime": 21.9708, "eval_samples_per_second": 227.574, "eval_steps_per_second": 1.229, "step": 10092 }, { "epoch": 0.22219587786000983, "grad_norm": 0.3256387412548065, "learning_rate": 6.0711877550023474e-05, "loss": 1.9371, "step": 10100 }, { "epoch": 0.22273781902552203, "grad_norm": 0.37883371114730835, "learning_rate": 6.062111496676694e-05, "loss": 1.9464, "step": 10110 }, { "epoch": 0.22327976019103427, "grad_norm": 0.42293113470077515, "learning_rate": 6.053032914797605e-05, "loss": 1.9389, "step": 10120 }, { "epoch": 0.22382170135654647, "grad_norm": 0.9902483820915222, "learning_rate": 6.043952046892457e-05, "loss": 1.9459, "step": 10130 }, { "epoch": 0.2243636425220587, "grad_norm": 0.28801223635673523, "learning_rate": 6.034868930498076e-05, "loss": 1.9296, "step": 10140 }, { "epoch": 0.2249055836875709, "grad_norm": 0.4709838926792145, "learning_rate": 6.025783603160583e-05, "loss": 1.9471, "step": 10150 }, { "epoch": 0.22544752485308314, "grad_norm": 0.5633691549301147, "learning_rate": 6.016696102435241e-05, "loss": 1.9371, "step": 10160 }, { "epoch": 0.22598946601859535, "grad_norm": 0.6367583870887756, "learning_rate": 6.0076064658862884e-05, "loss": 1.9431, "step": 10170 }, { "epoch": 0.22647721306755636, "eval_loss": 2.6244375705718994, "eval_runtime": 21.9646, "eval_samples_per_second": 227.639, "eval_steps_per_second": 1.229, "step": 10179 }, { "epoch": 0.22653140718410758, "grad_norm": 0.6641575694084167, "learning_rate": 5.998514731086805e-05, "loss": 1.9408, "step": 10180 }, { "epoch": 0.22707334834961979, "grad_norm": 0.45785051584243774, "learning_rate": 5.9894209356185314e-05, "loss": 1.9407, "step": 10190 }, { "epoch": 0.22761528951513202, "grad_norm": 0.473785936832428, "learning_rate": 5.980325117071736e-05, "loss": 1.9439, "step": 10200 }, { "epoch": 0.22815723068064422, "grad_norm": 0.6695595979690552, "learning_rate": 5.971227313045043e-05, "loss": 1.945, "step": 10210 }, { "epoch": 0.22869917184615646, "grad_norm": 0.502714991569519, "learning_rate": 5.9621275611452874e-05, "loss": 1.9456, "step": 10220 }, { "epoch": 0.22924111301166866, "grad_norm": 0.6983989477157593, "learning_rate": 5.9530258989873555e-05, "loss": 1.9517, "step": 10230 }, { "epoch": 0.2297830541771809, "grad_norm": 0.3308335840702057, "learning_rate": 5.943922364194029e-05, "loss": 1.9387, "step": 10240 }, { "epoch": 0.2303249953426931, "grad_norm": 0.2856954038143158, "learning_rate": 5.93481699439583e-05, "loss": 1.9409, "step": 10250 }, { "epoch": 0.23086693650820533, "grad_norm": 0.3123374879360199, "learning_rate": 5.925709827230868e-05, "loss": 1.933, "step": 10260 }, { "epoch": 0.23119210120751266, "eval_loss": 2.6269350051879883, "eval_runtime": 21.9652, "eval_samples_per_second": 227.633, "eval_steps_per_second": 1.229, "step": 10266 }, { "epoch": 0.23140887767371754, "grad_norm": 0.3088303506374359, "learning_rate": 5.91660090034468e-05, "loss": 1.9344, "step": 10270 }, { "epoch": 0.23195081883922977, "grad_norm": 0.3397095501422882, "learning_rate": 5.907490251390079e-05, "loss": 1.9421, "step": 10280 }, { "epoch": 0.23249276000474198, "grad_norm": 0.40521690249443054, "learning_rate": 5.898377918026993e-05, "loss": 1.9369, "step": 10290 }, { "epoch": 0.2330347011702542, "grad_norm": 0.3461083769798279, "learning_rate": 5.889263937922315e-05, "loss": 1.937, "step": 10300 }, { "epoch": 0.23357664233576642, "grad_norm": 0.47172901034355164, "learning_rate": 5.8801483487497476e-05, "loss": 1.9437, "step": 10310 }, { "epoch": 0.23411858350127865, "grad_norm": 0.4379764795303345, "learning_rate": 5.87103118818964e-05, "loss": 1.92, "step": 10320 }, { "epoch": 0.23466052466679085, "grad_norm": 0.7908567190170288, "learning_rate": 5.861912493928837e-05, "loss": 1.9329, "step": 10330 }, { "epoch": 0.2352024658323031, "grad_norm": 0.5137506723403931, "learning_rate": 5.852792303660528e-05, "loss": 1.9414, "step": 10340 }, { "epoch": 0.2357444069978153, "grad_norm": 0.31868940591812134, "learning_rate": 5.8436706550840805e-05, "loss": 1.9387, "step": 10350 }, { "epoch": 0.23590698934746895, "eval_loss": 2.6262524127960205, "eval_runtime": 21.9674, "eval_samples_per_second": 227.61, "eval_steps_per_second": 1.229, "step": 10353 }, { "epoch": 0.23628634816332753, "grad_norm": 0.3951703608036041, "learning_rate": 5.834547585904898e-05, "loss": 1.9313, "step": 10360 }, { "epoch": 0.23682828932883973, "grad_norm": 0.3555956780910492, "learning_rate": 5.8254231338342446e-05, "loss": 1.9298, "step": 10370 }, { "epoch": 0.23737023049435196, "grad_norm": 0.3569830358028412, "learning_rate": 5.8162973365891106e-05, "loss": 1.9423, "step": 10380 }, { "epoch": 0.23791217165986417, "grad_norm": 0.34016862511634827, "learning_rate": 5.807170231892042e-05, "loss": 1.9239, "step": 10390 }, { "epoch": 0.2384541128253764, "grad_norm": 0.32383885979652405, "learning_rate": 5.7980418574709924e-05, "loss": 1.9378, "step": 10400 }, { "epoch": 0.2389960539908886, "grad_norm": 0.42014604806900024, "learning_rate": 5.788912251059162e-05, "loss": 1.9331, "step": 10410 }, { "epoch": 0.23953799515640084, "grad_norm": 0.3834390640258789, "learning_rate": 5.7797814503948414e-05, "loss": 1.9344, "step": 10420 }, { "epoch": 0.24007993632191305, "grad_norm": 0.33072102069854736, "learning_rate": 5.770649493221262e-05, "loss": 1.9281, "step": 10430 }, { "epoch": 0.24062187748742528, "grad_norm": 0.41425782442092896, "learning_rate": 5.7615164172864346e-05, "loss": 1.9247, "step": 10440 }, { "epoch": 0.24062187748742528, "eval_loss": 2.6209261417388916, "eval_runtime": 21.9653, "eval_samples_per_second": 227.632, "eval_steps_per_second": 1.229, "step": 10440 }, { "epoch": 0.24116381865293748, "grad_norm": 0.4126664400100708, "learning_rate": 5.7523822603429924e-05, "loss": 1.9254, "step": 10450 }, { "epoch": 0.24170575981844972, "grad_norm": 0.4077990651130676, "learning_rate": 5.7432470601480394e-05, "loss": 1.9298, "step": 10460 }, { "epoch": 0.24224770098396192, "grad_norm": 0.33790072798728943, "learning_rate": 5.7341108544629894e-05, "loss": 1.9214, "step": 10470 }, { "epoch": 0.24278964214947416, "grad_norm": 0.5326333045959473, "learning_rate": 5.724973681053417e-05, "loss": 1.9277, "step": 10480 }, { "epoch": 0.24333158331498636, "grad_norm": 0.8307605981826782, "learning_rate": 5.715835577688894e-05, "loss": 1.9399, "step": 10490 }, { "epoch": 0.2438735244804986, "grad_norm": 0.29457995295524597, "learning_rate": 5.706696582142834e-05, "loss": 1.9274, "step": 10500 }, { "epoch": 0.2444154656460108, "grad_norm": 0.47820374369621277, "learning_rate": 5.697556732192343e-05, "loss": 1.9332, "step": 10510 }, { "epoch": 0.24495740681152303, "grad_norm": 0.615271270275116, "learning_rate": 5.688416065618057e-05, "loss": 1.9358, "step": 10520 }, { "epoch": 0.24533676562738158, "eval_loss": 2.623511552810669, "eval_runtime": 21.9658, "eval_samples_per_second": 227.627, "eval_steps_per_second": 1.229, "step": 10527 }, { "epoch": 0.24549934797703524, "grad_norm": 0.3703593909740448, "learning_rate": 5.679274620203986e-05, "loss": 1.9345, "step": 10530 }, { "epoch": 0.24604128914254747, "grad_norm": 0.3069552779197693, "learning_rate": 5.670132433737363e-05, "loss": 1.9229, "step": 10540 }, { "epoch": 0.24658323030805968, "grad_norm": 0.30190664529800415, "learning_rate": 5.660989544008479e-05, "loss": 1.9288, "step": 10550 }, { "epoch": 0.2471251714735719, "grad_norm": 0.5308563113212585, "learning_rate": 5.651845988810538e-05, "loss": 1.9246, "step": 10560 }, { "epoch": 0.2476671126390841, "grad_norm": 0.33339574933052063, "learning_rate": 5.642701805939491e-05, "loss": 1.9239, "step": 10570 }, { "epoch": 0.24820905380459635, "grad_norm": 0.6166539788246155, "learning_rate": 5.633557033193884e-05, "loss": 1.9311, "step": 10580 }, { "epoch": 0.24875099497010855, "grad_norm": 0.3371501863002777, "learning_rate": 5.624411708374703e-05, "loss": 1.9148, "step": 10590 }, { "epoch": 0.24929293613562079, "grad_norm": 0.5741251111030579, "learning_rate": 5.615265869285212e-05, "loss": 1.936, "step": 10600 }, { "epoch": 0.249834877301133, "grad_norm": 0.8390110731124878, "learning_rate": 5.606119553730808e-05, "loss": 1.9235, "step": 10610 }, { "epoch": 0.2500516537673379, "eval_loss": 2.6314127445220947, "eval_runtime": 21.9728, "eval_samples_per_second": 227.554, "eval_steps_per_second": 1.229, "step": 10614 }, { "epoch": 0.2503768184666452, "grad_norm": 0.31572863459587097, "learning_rate": 5.596972799518849e-05, "loss": 1.9226, "step": 10620 }, { "epoch": 0.25091875963215743, "grad_norm": 0.3893384635448456, "learning_rate": 5.5878256444585134e-05, "loss": 1.9379, "step": 10630 }, { "epoch": 0.25146070079766963, "grad_norm": 0.4373924136161804, "learning_rate": 5.578678126360632e-05, "loss": 1.9296, "step": 10640 }, { "epoch": 0.2520026419631819, "grad_norm": 0.2836342751979828, "learning_rate": 5.5695302830375374e-05, "loss": 1.9231, "step": 10650 }, { "epoch": 0.2525445831286941, "grad_norm": 0.3092157244682312, "learning_rate": 5.5603821523029084e-05, "loss": 1.9191, "step": 10660 }, { "epoch": 0.2530865242942063, "grad_norm": 0.6483246088027954, "learning_rate": 5.551233771971611e-05, "loss": 1.9202, "step": 10670 }, { "epoch": 0.2536284654597185, "grad_norm": 0.34343624114990234, "learning_rate": 5.54208517985954e-05, "loss": 1.9297, "step": 10680 }, { "epoch": 0.25417040662523077, "grad_norm": 0.29993903636932373, "learning_rate": 5.532936413783469e-05, "loss": 1.9228, "step": 10690 }, { "epoch": 0.254712347790743, "grad_norm": 0.4231652319431305, "learning_rate": 5.5237875115608905e-05, "loss": 1.94, "step": 10700 }, { "epoch": 0.2547665419072942, "eval_loss": 2.6222281455993652, "eval_runtime": 21.972, "eval_samples_per_second": 227.563, "eval_steps_per_second": 1.229, "step": 10701 }, { "epoch": 0.2552542889562552, "grad_norm": 0.31495311856269836, "learning_rate": 5.51463851100986e-05, "loss": 1.9252, "step": 10710 }, { "epoch": 0.2557962301217674, "grad_norm": 0.5191118717193604, "learning_rate": 5.5054894499488353e-05, "loss": 1.9118, "step": 10720 }, { "epoch": 0.25633817128727965, "grad_norm": 0.6403262615203857, "learning_rate": 5.496340366196527e-05, "loss": 1.9308, "step": 10730 }, { "epoch": 0.25688011245279185, "grad_norm": 0.29355281591415405, "learning_rate": 5.4871912975717444e-05, "loss": 1.925, "step": 10740 }, { "epoch": 0.25742205361830406, "grad_norm": 0.4204306900501251, "learning_rate": 5.478042281893225e-05, "loss": 1.92, "step": 10750 }, { "epoch": 0.25796399478381626, "grad_norm": 0.5830504894256592, "learning_rate": 5.468893356979498e-05, "loss": 1.9133, "step": 10760 }, { "epoch": 0.2585059359493285, "grad_norm": 0.2985590994358063, "learning_rate": 5.459744560648707e-05, "loss": 1.9221, "step": 10770 }, { "epoch": 0.25904787711484073, "grad_norm": 0.29512789845466614, "learning_rate": 5.4505959307184675e-05, "loss": 1.9203, "step": 10780 }, { "epoch": 0.2594814300472505, "eval_loss": 2.625079870223999, "eval_runtime": 21.9676, "eval_samples_per_second": 227.608, "eval_steps_per_second": 1.229, "step": 10788 }, { "epoch": 0.25958981828035294, "grad_norm": 0.3882138133049011, "learning_rate": 5.441447505005714e-05, "loss": 1.9178, "step": 10790 }, { "epoch": 0.26013175944586514, "grad_norm": 0.5985729098320007, "learning_rate": 5.432299321326526e-05, "loss": 1.9214, "step": 10800 }, { "epoch": 0.2606737006113774, "grad_norm": 0.5359235405921936, "learning_rate": 5.423151417495991e-05, "loss": 1.9182, "step": 10810 }, { "epoch": 0.2612156417768896, "grad_norm": 0.2866165041923523, "learning_rate": 5.4140038313280364e-05, "loss": 1.9205, "step": 10820 }, { "epoch": 0.2617575829424018, "grad_norm": 0.7025465965270996, "learning_rate": 5.404856600635273e-05, "loss": 1.9221, "step": 10830 }, { "epoch": 0.262299524107914, "grad_norm": 0.48262158036231995, "learning_rate": 5.39570976322885e-05, "loss": 1.9164, "step": 10840 }, { "epoch": 0.2628414652734263, "grad_norm": 0.331333190202713, "learning_rate": 5.386563356918286e-05, "loss": 1.9203, "step": 10850 }, { "epoch": 0.2633834064389385, "grad_norm": 0.559256374835968, "learning_rate": 5.3774174195113145e-05, "loss": 1.919, "step": 10860 }, { "epoch": 0.2639253476044507, "grad_norm": 0.4558927118778229, "learning_rate": 5.368271988813741e-05, "loss": 1.9186, "step": 10870 }, { "epoch": 0.2641963181872068, "eval_loss": 2.6204073429107666, "eval_runtime": 21.9646, "eval_samples_per_second": 227.639, "eval_steps_per_second": 1.229, "step": 10875 }, { "epoch": 0.2644672887699629, "grad_norm": 0.32578545808792114, "learning_rate": 5.3591271026292645e-05, "loss": 1.9178, "step": 10880 }, { "epoch": 0.26500922993547515, "grad_norm": 0.5844838619232178, "learning_rate": 5.349982798759341e-05, "loss": 1.9306, "step": 10890 }, { "epoch": 0.26555117110098736, "grad_norm": 0.37786585092544556, "learning_rate": 5.340839115003019e-05, "loss": 1.9236, "step": 10900 }, { "epoch": 0.26609311226649957, "grad_norm": 0.33021968603134155, "learning_rate": 5.331696089156776e-05, "loss": 1.9097, "step": 10910 }, { "epoch": 0.26663505343201177, "grad_norm": 0.3873339891433716, "learning_rate": 5.322553759014383e-05, "loss": 1.9097, "step": 10920 }, { "epoch": 0.26717699459752403, "grad_norm": 0.32818225026130676, "learning_rate": 5.313412162366723e-05, "loss": 1.9136, "step": 10930 }, { "epoch": 0.26771893576303624, "grad_norm": 0.6563804149627686, "learning_rate": 5.304271337001652e-05, "loss": 1.9173, "step": 10940 }, { "epoch": 0.26826087692854844, "grad_norm": 0.3008669912815094, "learning_rate": 5.295131320703841e-05, "loss": 1.9104, "step": 10950 }, { "epoch": 0.26880281809406065, "grad_norm": 0.3501867353916168, "learning_rate": 5.2859921512546104e-05, "loss": 1.9232, "step": 10960 }, { "epoch": 0.2689112063271631, "eval_loss": 2.611741542816162, "eval_runtime": 21.9709, "eval_samples_per_second": 227.574, "eval_steps_per_second": 1.229, "step": 10962 }, { "epoch": 0.2693447592595729, "grad_norm": 0.2955268621444702, "learning_rate": 5.276853866431787e-05, "loss": 1.906, "step": 10970 }, { "epoch": 0.2698867004250851, "grad_norm": 0.44501546025276184, "learning_rate": 5.267716504009533e-05, "loss": 1.9085, "step": 10980 }, { "epoch": 0.2704286415905973, "grad_norm": 0.3195658326148987, "learning_rate": 5.258580101758203e-05, "loss": 1.9186, "step": 10990 }, { "epoch": 0.2709705827561095, "grad_norm": 0.5696297883987427, "learning_rate": 5.2494446974441837e-05, "loss": 1.9061, "step": 11000 }, { "epoch": 0.2715125239216218, "grad_norm": 0.29219719767570496, "learning_rate": 5.2403103288297314e-05, "loss": 1.9114, "step": 11010 }, { "epoch": 0.272054465087134, "grad_norm": 0.3206324279308319, "learning_rate": 5.231177033672824e-05, "loss": 1.9041, "step": 11020 }, { "epoch": 0.2725964062526462, "grad_norm": 0.33854442834854126, "learning_rate": 5.222044849727005e-05, "loss": 1.9057, "step": 11030 }, { "epoch": 0.2731383474181584, "grad_norm": 0.2936360538005829, "learning_rate": 5.212913814741219e-05, "loss": 1.918, "step": 11040 }, { "epoch": 0.27362609446711944, "eval_loss": 2.61767840385437, "eval_runtime": 21.6394, "eval_samples_per_second": 231.06, "eval_steps_per_second": 1.248, "step": 11049 }, { "epoch": 0.27368028858367066, "grad_norm": 0.34562787413597107, "learning_rate": 5.203783966459665e-05, "loss": 1.9068, "step": 11050 }, { "epoch": 0.27422222974918287, "grad_norm": 0.5706035494804382, "learning_rate": 5.1946553426216394e-05, "loss": 1.9123, "step": 11060 }, { "epoch": 0.27476417091469507, "grad_norm": 0.5709580183029175, "learning_rate": 5.1855279809613675e-05, "loss": 1.9214, "step": 11070 }, { "epoch": 0.2753061120802073, "grad_norm": 0.3960080146789551, "learning_rate": 5.1764019192078686e-05, "loss": 1.913, "step": 11080 }, { "epoch": 0.27584805324571954, "grad_norm": 0.4222663342952728, "learning_rate": 5.16727719508478e-05, "loss": 1.9182, "step": 11090 }, { "epoch": 0.27638999441123174, "grad_norm": 0.6161875128746033, "learning_rate": 5.158153846310214e-05, "loss": 1.9089, "step": 11100 }, { "epoch": 0.27693193557674395, "grad_norm": 0.5677464008331299, "learning_rate": 5.149031910596599e-05, "loss": 1.8998, "step": 11110 }, { "epoch": 0.27747387674225615, "grad_norm": 0.3052445352077484, "learning_rate": 5.139911425650518e-05, "loss": 1.9103, "step": 11120 }, { "epoch": 0.2780158179077684, "grad_norm": 0.2828851044178009, "learning_rate": 5.1307924291725583e-05, "loss": 1.907, "step": 11130 }, { "epoch": 0.27834098260707574, "eval_loss": 2.613119125366211, "eval_runtime": 21.9757, "eval_samples_per_second": 227.524, "eval_steps_per_second": 1.229, "step": 11136 }, { "epoch": 0.2785577590732806, "grad_norm": 0.33760419487953186, "learning_rate": 5.121674958857159e-05, "loss": 1.9194, "step": 11140 }, { "epoch": 0.2790997002387928, "grad_norm": 0.3404898941516876, "learning_rate": 5.112559052392444e-05, "loss": 1.9043, "step": 11150 }, { "epoch": 0.27964164140430503, "grad_norm": 0.3572676181793213, "learning_rate": 5.103444747460079e-05, "loss": 1.9091, "step": 11160 }, { "epoch": 0.2801835825698173, "grad_norm": 0.3610747456550598, "learning_rate": 5.0943320817351034e-05, "loss": 1.9049, "step": 11170 }, { "epoch": 0.2807255237353295, "grad_norm": 0.46232640743255615, "learning_rate": 5.085221092885785e-05, "loss": 1.9233, "step": 11180 }, { "epoch": 0.2812674649008417, "grad_norm": 0.27908554673194885, "learning_rate": 5.076111818573459e-05, "loss": 1.9064, "step": 11190 }, { "epoch": 0.2818094060663539, "grad_norm": 0.6692981719970703, "learning_rate": 5.0670042964523745e-05, "loss": 1.9067, "step": 11200 }, { "epoch": 0.28235134723186617, "grad_norm": 0.3264489769935608, "learning_rate": 5.057898564169534e-05, "loss": 1.8999, "step": 11210 }, { "epoch": 0.2828932883973784, "grad_norm": 0.5410310626029968, "learning_rate": 5.048794659364546e-05, "loss": 1.9098, "step": 11220 }, { "epoch": 0.28305587074703203, "eval_loss": 2.6135261058807373, "eval_runtime": 21.9689, "eval_samples_per_second": 227.594, "eval_steps_per_second": 1.229, "step": 11223 }, { "epoch": 0.2834352295628906, "grad_norm": 0.6051361560821533, "learning_rate": 5.0396926196694626e-05, "loss": 1.9124, "step": 11230 }, { "epoch": 0.2839771707284028, "grad_norm": 0.5583413243293762, "learning_rate": 5.030592482708626e-05, "loss": 1.9025, "step": 11240 }, { "epoch": 0.28451911189391504, "grad_norm": 0.6230108737945557, "learning_rate": 5.021494286098514e-05, "loss": 1.9113, "step": 11250 }, { "epoch": 0.28506105305942725, "grad_norm": 0.538650631904602, "learning_rate": 5.0123980674475824e-05, "loss": 1.9034, "step": 11260 }, { "epoch": 0.28560299422493945, "grad_norm": 0.6079558730125427, "learning_rate": 5.003303864356115e-05, "loss": 1.9154, "step": 11270 }, { "epoch": 0.28614493539045166, "grad_norm": 0.5191447138786316, "learning_rate": 4.994211714416058e-05, "loss": 1.8997, "step": 11280 }, { "epoch": 0.28668687655596387, "grad_norm": 0.2797117233276367, "learning_rate": 4.9851216552108745e-05, "loss": 1.8978, "step": 11290 }, { "epoch": 0.2872288177214761, "grad_norm": 0.29481279850006104, "learning_rate": 4.976033724315385e-05, "loss": 1.9152, "step": 11300 }, { "epoch": 0.28777075888698833, "grad_norm": 0.390680193901062, "learning_rate": 4.966947959295612e-05, "loss": 1.9112, "step": 11310 }, { "epoch": 0.28777075888698833, "eval_loss": 2.617300033569336, "eval_runtime": 21.9665, "eval_samples_per_second": 227.62, "eval_steps_per_second": 1.229, "step": 11310 }, { "epoch": 0.28831270005250054, "grad_norm": 0.6555575132369995, "learning_rate": 4.957864397708625e-05, "loss": 1.8965, "step": 11320 }, { "epoch": 0.28885464121801274, "grad_norm": 0.3799804449081421, "learning_rate": 4.948783077102385e-05, "loss": 1.9039, "step": 11330 }, { "epoch": 0.289396582383525, "grad_norm": 0.2934130132198334, "learning_rate": 4.9397040350155876e-05, "loss": 1.9078, "step": 11340 }, { "epoch": 0.2899385235490372, "grad_norm": 0.2916678190231323, "learning_rate": 4.930627308977517e-05, "loss": 1.892, "step": 11350 }, { "epoch": 0.2904804647145494, "grad_norm": 0.30668988823890686, "learning_rate": 4.921552936507876e-05, "loss": 1.9098, "step": 11360 }, { "epoch": 0.2910224058800616, "grad_norm": 0.4994325637817383, "learning_rate": 4.912480955116642e-05, "loss": 1.9045, "step": 11370 }, { "epoch": 0.2915643470455739, "grad_norm": 1.1822820901870728, "learning_rate": 4.9034114023039076e-05, "loss": 1.8957, "step": 11380 }, { "epoch": 0.2921062882110861, "grad_norm": 0.3069334626197815, "learning_rate": 4.894344315559729e-05, "loss": 1.9096, "step": 11390 }, { "epoch": 0.29248564702694463, "eval_loss": 2.6101443767547607, "eval_runtime": 21.9727, "eval_samples_per_second": 227.556, "eval_steps_per_second": 1.229, "step": 11397 }, { "epoch": 0.2926482293765983, "grad_norm": 0.3290071487426758, "learning_rate": 4.885279732363967e-05, "loss": 1.9007, "step": 11400 }, { "epoch": 0.2931901705421105, "grad_norm": 0.27592140436172485, "learning_rate": 4.876217690186131e-05, "loss": 1.8886, "step": 11410 }, { "epoch": 0.29373211170762276, "grad_norm": 0.5586705207824707, "learning_rate": 4.867158226485231e-05, "loss": 1.8874, "step": 11420 }, { "epoch": 0.29427405287313496, "grad_norm": 0.5295722484588623, "learning_rate": 4.858101378709616e-05, "loss": 1.8926, "step": 11430 }, { "epoch": 0.29481599403864717, "grad_norm": 0.3275451064109802, "learning_rate": 4.8490471842968267e-05, "loss": 1.908, "step": 11440 }, { "epoch": 0.29535793520415937, "grad_norm": 0.2957296371459961, "learning_rate": 4.8399956806734234e-05, "loss": 1.899, "step": 11450 }, { "epoch": 0.29589987636967163, "grad_norm": 0.30543988943099976, "learning_rate": 4.830946905254861e-05, "loss": 1.9042, "step": 11460 }, { "epoch": 0.29644181753518384, "grad_norm": 0.5068356394767761, "learning_rate": 4.821900895445302e-05, "loss": 1.8957, "step": 11470 }, { "epoch": 0.29698375870069604, "grad_norm": 0.3116234838962555, "learning_rate": 4.812857688637486e-05, "loss": 1.9, "step": 11480 }, { "epoch": 0.2972005351669009, "eval_loss": 2.6111230850219727, "eval_runtime": 21.9729, "eval_samples_per_second": 227.553, "eval_steps_per_second": 1.229, "step": 11484 }, { "epoch": 0.29752569986620825, "grad_norm": 0.31971922516822815, "learning_rate": 4.8038173222125645e-05, "loss": 1.9074, "step": 11490 }, { "epoch": 0.2980676410317205, "grad_norm": 0.3503149747848511, "learning_rate": 4.7947798335399416e-05, "loss": 1.899, "step": 11500 }, { "epoch": 0.2986095821972327, "grad_norm": 0.30250006914138794, "learning_rate": 4.7857452599771354e-05, "loss": 1.9063, "step": 11510 }, { "epoch": 0.2991515233627449, "grad_norm": 0.7795620560646057, "learning_rate": 4.77671363886961e-05, "loss": 1.9005, "step": 11520 }, { "epoch": 0.2996934645282571, "grad_norm": 0.5038977265357971, "learning_rate": 4.7676850075506185e-05, "loss": 1.8991, "step": 11530 }, { "epoch": 0.3002354056937694, "grad_norm": 0.4416648745536804, "learning_rate": 4.758659403341069e-05, "loss": 1.893, "step": 11540 }, { "epoch": 0.3007773468592816, "grad_norm": 0.3159092366695404, "learning_rate": 4.749636863549346e-05, "loss": 1.9063, "step": 11550 }, { "epoch": 0.3013192880247938, "grad_norm": 0.33458906412124634, "learning_rate": 4.740617425471168e-05, "loss": 1.9001, "step": 11560 }, { "epoch": 0.301861229190306, "grad_norm": 0.5140174627304077, "learning_rate": 4.731601126389438e-05, "loss": 1.899, "step": 11570 }, { "epoch": 0.3019154233068572, "eval_loss": 2.6166553497314453, "eval_runtime": 21.9725, "eval_samples_per_second": 227.558, "eval_steps_per_second": 1.229, "step": 11571 }, { "epoch": 0.30240317035581826, "grad_norm": 0.570547878742218, "learning_rate": 4.722588003574077e-05, "loss": 1.895, "step": 11580 }, { "epoch": 0.30294511152133047, "grad_norm": 0.44531741738319397, "learning_rate": 4.7135780942818817e-05, "loss": 1.9075, "step": 11590 }, { "epoch": 0.3034870526868427, "grad_norm": 0.47997337579727173, "learning_rate": 4.704571435756363e-05, "loss": 1.8995, "step": 11600 }, { "epoch": 0.3040289938523549, "grad_norm": 0.47242090106010437, "learning_rate": 4.6955680652275916e-05, "loss": 1.8903, "step": 11610 }, { "epoch": 0.30457093501786714, "grad_norm": 0.28598716855049133, "learning_rate": 4.6865680199120545e-05, "loss": 1.8922, "step": 11620 }, { "epoch": 0.30511287618337934, "grad_norm": 0.44576406478881836, "learning_rate": 4.677571337012484e-05, "loss": 1.8952, "step": 11630 }, { "epoch": 0.30565481734889155, "grad_norm": 0.3293391764163971, "learning_rate": 4.668578053717721e-05, "loss": 1.8935, "step": 11640 }, { "epoch": 0.30619675851440376, "grad_norm": 0.31350746750831604, "learning_rate": 4.65958820720255e-05, "loss": 1.8906, "step": 11650 }, { "epoch": 0.3066303114468136, "eval_loss": 2.620225191116333, "eval_runtime": 21.9691, "eval_samples_per_second": 227.592, "eval_steps_per_second": 1.229, "step": 11658 }, { "epoch": 0.306738699679916, "grad_norm": 0.3374342918395996, "learning_rate": 4.650601834627549e-05, "loss": 1.8892, "step": 11660 }, { "epoch": 0.3072806408454282, "grad_norm": 0.365682452917099, "learning_rate": 4.641618973138942e-05, "loss": 1.892, "step": 11670 }, { "epoch": 0.3078225820109404, "grad_norm": 0.3332476019859314, "learning_rate": 4.6326396598684296e-05, "loss": 1.8927, "step": 11680 }, { "epoch": 0.30836452317645263, "grad_norm": 1.3207186460494995, "learning_rate": 4.6236639319330524e-05, "loss": 1.8873, "step": 11690 }, { "epoch": 0.3089064643419649, "grad_norm": 0.3541896641254425, "learning_rate": 4.614691826435028e-05, "loss": 1.8908, "step": 11700 }, { "epoch": 0.3094484055074771, "grad_norm": 0.486902117729187, "learning_rate": 4.605723380461603e-05, "loss": 1.8982, "step": 11710 }, { "epoch": 0.3099903466729893, "grad_norm": 0.33585986495018005, "learning_rate": 4.596758631084892e-05, "loss": 1.901, "step": 11720 }, { "epoch": 0.3105322878385015, "grad_norm": 0.37316057085990906, "learning_rate": 4.587797615361735e-05, "loss": 1.8892, "step": 11730 }, { "epoch": 0.31107422900401377, "grad_norm": 0.4041992723941803, "learning_rate": 4.578840370333534e-05, "loss": 1.8928, "step": 11740 }, { "epoch": 0.31134519958676987, "eval_loss": 2.6146914958953857, "eval_runtime": 21.9662, "eval_samples_per_second": 227.622, "eval_steps_per_second": 1.229, "step": 11745 }, { "epoch": 0.311616170169526, "grad_norm": 0.2976398169994354, "learning_rate": 4.569886933026107e-05, "loss": 1.8848, "step": 11750 }, { "epoch": 0.3121581113350382, "grad_norm": 0.3247699737548828, "learning_rate": 4.5609373404495316e-05, "loss": 1.8998, "step": 11760 }, { "epoch": 0.3127000525005504, "grad_norm": 0.29927968978881836, "learning_rate": 4.55199162959799e-05, "loss": 1.8883, "step": 11770 }, { "epoch": 0.31324199366606265, "grad_norm": 0.301215797662735, "learning_rate": 4.543049837449626e-05, "loss": 1.8916, "step": 11780 }, { "epoch": 0.31378393483157485, "grad_norm": 0.3488280773162842, "learning_rate": 4.534112000966377e-05, "loss": 1.9006, "step": 11790 }, { "epoch": 0.31432587599708706, "grad_norm": 0.36820992827415466, "learning_rate": 4.5251781570938324e-05, "loss": 1.8888, "step": 11800 }, { "epoch": 0.31486781716259926, "grad_norm": 0.33846035599708557, "learning_rate": 4.51624834276108e-05, "loss": 1.8845, "step": 11810 }, { "epoch": 0.3154097583281115, "grad_norm": 0.3961305320262909, "learning_rate": 4.5073225948805476e-05, "loss": 1.8928, "step": 11820 }, { "epoch": 0.3159516994936237, "grad_norm": 0.29425087571144104, "learning_rate": 4.498400950347855e-05, "loss": 1.8951, "step": 11830 }, { "epoch": 0.31606008772672617, "eval_loss": 2.6105449199676514, "eval_runtime": 22.294, "eval_samples_per_second": 224.276, "eval_steps_per_second": 1.211, "step": 11832 }, { "epoch": 0.31649364065913593, "grad_norm": 0.32833293080329895, "learning_rate": 4.4894834460416626e-05, "loss": 1.8865, "step": 11840 }, { "epoch": 0.31703558182464814, "grad_norm": 0.274554580450058, "learning_rate": 4.480570118823511e-05, "loss": 1.8974, "step": 11850 }, { "epoch": 0.3175775229901604, "grad_norm": 0.3351888656616211, "learning_rate": 4.471661005537682e-05, "loss": 1.8834, "step": 11860 }, { "epoch": 0.3181194641556726, "grad_norm": 0.6972358226776123, "learning_rate": 4.462756143011031e-05, "loss": 1.8822, "step": 11870 }, { "epoch": 0.3186614053211848, "grad_norm": 0.36904025077819824, "learning_rate": 4.453855568052847e-05, "loss": 1.8972, "step": 11880 }, { "epoch": 0.319203346486697, "grad_norm": 0.27459755539894104, "learning_rate": 4.444959317454696e-05, "loss": 1.8854, "step": 11890 }, { "epoch": 0.3197452876522093, "grad_norm": 0.3487244248390198, "learning_rate": 4.436067427990266e-05, "loss": 1.8857, "step": 11900 }, { "epoch": 0.3202872288177215, "grad_norm": 0.3203682005405426, "learning_rate": 4.42717993641522e-05, "loss": 1.8869, "step": 11910 }, { "epoch": 0.32077497586668247, "eval_loss": 2.613398313522339, "eval_runtime": 21.9703, "eval_samples_per_second": 227.58, "eval_steps_per_second": 1.229, "step": 11919 }, { "epoch": 0.3208291699832337, "grad_norm": 0.30035504698753357, "learning_rate": 4.418296879467041e-05, "loss": 1.8942, "step": 11920 }, { "epoch": 0.3213711111487459, "grad_norm": 0.38128897547721863, "learning_rate": 4.409418293864881e-05, "loss": 1.8833, "step": 11930 }, { "epoch": 0.32191305231425815, "grad_norm": 0.31847649812698364, "learning_rate": 4.400544216309409e-05, "loss": 1.8946, "step": 11940 }, { "epoch": 0.32245499347977036, "grad_norm": 0.6311419606208801, "learning_rate": 4.3916746834826604e-05, "loss": 1.887, "step": 11950 }, { "epoch": 0.32299693464528256, "grad_norm": 0.5365369319915771, "learning_rate": 4.3828097320478825e-05, "loss": 1.881, "step": 11960 }, { "epoch": 0.32353887581079477, "grad_norm": 0.3272137939929962, "learning_rate": 4.3739493986493864e-05, "loss": 1.8887, "step": 11970 }, { "epoch": 0.32408081697630703, "grad_norm": 0.3870546519756317, "learning_rate": 4.3650937199123934e-05, "loss": 1.8752, "step": 11980 }, { "epoch": 0.32462275814181923, "grad_norm": 0.47655603289604187, "learning_rate": 4.356242732442887e-05, "loss": 1.8959, "step": 11990 }, { "epoch": 0.32516469930733144, "grad_norm": 0.5217764377593994, "learning_rate": 4.3473964728274516e-05, "loss": 1.8831, "step": 12000 }, { "epoch": 0.32548986400663876, "eval_loss": 2.6064906120300293, "eval_runtime": 21.97, "eval_samples_per_second": 227.583, "eval_steps_per_second": 1.229, "step": 12006 }, { "epoch": 0.32570664047284364, "grad_norm": 0.4419178068637848, "learning_rate": 4.338554977633138e-05, "loss": 1.8843, "step": 12010 }, { "epoch": 0.3262485816383559, "grad_norm": 0.38523489236831665, "learning_rate": 4.3297182834072944e-05, "loss": 1.8829, "step": 12020 }, { "epoch": 0.3267905228038681, "grad_norm": 0.537399411201477, "learning_rate": 4.3208864266774294e-05, "loss": 1.8814, "step": 12030 }, { "epoch": 0.3273324639693803, "grad_norm": 0.3097943365573883, "learning_rate": 4.312059443951051e-05, "loss": 1.8777, "step": 12040 }, { "epoch": 0.3278744051348925, "grad_norm": 0.3149316608905792, "learning_rate": 4.303237371715524e-05, "loss": 1.8874, "step": 12050 }, { "epoch": 0.3284163463004048, "grad_norm": 0.35441088676452637, "learning_rate": 4.2944202464379125e-05, "loss": 1.8776, "step": 12060 }, { "epoch": 0.328958287465917, "grad_norm": 0.34289848804473877, "learning_rate": 4.2856081045648285e-05, "loss": 1.8946, "step": 12070 }, { "epoch": 0.3295002286314292, "grad_norm": 0.3778747320175171, "learning_rate": 4.276800982522293e-05, "loss": 1.8779, "step": 12080 }, { "epoch": 0.3300421697969414, "grad_norm": 0.4332473874092102, "learning_rate": 4.26799891671557e-05, "loss": 1.8827, "step": 12090 }, { "epoch": 0.33020475214659506, "eval_loss": 2.6110384464263916, "eval_runtime": 21.9727, "eval_samples_per_second": 227.555, "eval_steps_per_second": 1.229, "step": 12093 }, { "epoch": 0.33058411096245366, "grad_norm": 0.334953248500824, "learning_rate": 4.2592019435290266e-05, "loss": 1.8724, "step": 12100 }, { "epoch": 0.33112605212796586, "grad_norm": 0.36350664496421814, "learning_rate": 4.2504100993259774e-05, "loss": 1.8744, "step": 12110 }, { "epoch": 0.33166799329347807, "grad_norm": 0.44730937480926514, "learning_rate": 4.241623420448533e-05, "loss": 1.8722, "step": 12120 }, { "epoch": 0.3322099344589903, "grad_norm": 0.6003024578094482, "learning_rate": 4.2328419432174605e-05, "loss": 1.8838, "step": 12130 }, { "epoch": 0.33275187562450254, "grad_norm": 0.27411770820617676, "learning_rate": 4.224065703932016e-05, "loss": 1.8716, "step": 12140 }, { "epoch": 0.33329381679001474, "grad_norm": 0.3935556411743164, "learning_rate": 4.215294738869808e-05, "loss": 1.8874, "step": 12150 }, { "epoch": 0.33383575795552695, "grad_norm": 0.32209470868110657, "learning_rate": 4.206529084286649e-05, "loss": 1.877, "step": 12160 }, { "epoch": 0.33437769912103915, "grad_norm": 0.5299956202507019, "learning_rate": 4.197768776416387e-05, "loss": 1.8806, "step": 12170 }, { "epoch": 0.3349196402865514, "grad_norm": 0.5041821002960205, "learning_rate": 4.1890138514707835e-05, "loss": 1.8764, "step": 12180 }, { "epoch": 0.3349196402865514, "eval_loss": 2.6070642471313477, "eval_runtime": 21.9587, "eval_samples_per_second": 227.7, "eval_steps_per_second": 1.23, "step": 12180 }, { "epoch": 0.3354615814520636, "grad_norm": 0.4416036307811737, "learning_rate": 4.180264345639339e-05, "loss": 1.8713, "step": 12190 }, { "epoch": 0.3360035226175758, "grad_norm": 0.2827119827270508, "learning_rate": 4.171520295089153e-05, "loss": 1.8777, "step": 12200 }, { "epoch": 0.33654546378308803, "grad_norm": 0.374304860830307, "learning_rate": 4.1627817359647846e-05, "loss": 1.8803, "step": 12210 }, { "epoch": 0.3370874049486003, "grad_norm": 0.27908772230148315, "learning_rate": 4.1540487043880824e-05, "loss": 1.8862, "step": 12220 }, { "epoch": 0.3376293461141125, "grad_norm": 0.41203275322914124, "learning_rate": 4.145321236458053e-05, "loss": 1.8731, "step": 12230 }, { "epoch": 0.3381712872796247, "grad_norm": 0.332907497882843, "learning_rate": 4.136599368250704e-05, "loss": 1.8844, "step": 12240 }, { "epoch": 0.3387132284451369, "grad_norm": 0.302048921585083, "learning_rate": 4.1278831358188915e-05, "loss": 1.8878, "step": 12250 }, { "epoch": 0.33925516961064917, "grad_norm": 0.40147966146469116, "learning_rate": 4.119172575192185e-05, "loss": 1.876, "step": 12260 }, { "epoch": 0.3396345284265077, "eval_loss": 2.6118130683898926, "eval_runtime": 21.9708, "eval_samples_per_second": 227.575, "eval_steps_per_second": 1.229, "step": 12267 }, { "epoch": 0.33979711077616137, "grad_norm": 0.2979309856891632, "learning_rate": 4.110467722376697e-05, "loss": 1.8913, "step": 12270 }, { "epoch": 0.3403390519416736, "grad_norm": 0.3392654061317444, "learning_rate": 4.1017686133549524e-05, "loss": 1.8759, "step": 12280 }, { "epoch": 0.3408809931071858, "grad_norm": 0.44583526253700256, "learning_rate": 4.093075284085738e-05, "loss": 1.8815, "step": 12290 }, { "epoch": 0.34142293427269804, "grad_norm": 0.3814590573310852, "learning_rate": 4.084387770503939e-05, "loss": 1.8834, "step": 12300 }, { "epoch": 0.34196487543821025, "grad_norm": 0.2939944565296173, "learning_rate": 4.0757061085204084e-05, "loss": 1.8648, "step": 12310 }, { "epoch": 0.34250681660372245, "grad_norm": 0.41222232580184937, "learning_rate": 4.0670303340218085e-05, "loss": 1.8683, "step": 12320 }, { "epoch": 0.34304875776923466, "grad_norm": 0.307704359292984, "learning_rate": 4.058360482870464e-05, "loss": 1.8837, "step": 12330 }, { "epoch": 0.3435906989347469, "grad_norm": 0.41536006331443787, "learning_rate": 4.049696590904218e-05, "loss": 1.8883, "step": 12340 }, { "epoch": 0.3441326401002591, "grad_norm": 0.29493269324302673, "learning_rate": 4.0410386939362774e-05, "loss": 1.8749, "step": 12350 }, { "epoch": 0.344349416566464, "eval_loss": 2.604987144470215, "eval_runtime": 21.9698, "eval_samples_per_second": 227.585, "eval_steps_per_second": 1.229, "step": 12354 }, { "epoch": 0.34467458126577133, "grad_norm": 0.3915794789791107, "learning_rate": 4.032386827755069e-05, "loss": 1.8712, "step": 12360 }, { "epoch": 0.34521652243128353, "grad_norm": 0.3644906282424927, "learning_rate": 4.0237410281240915e-05, "loss": 1.8795, "step": 12370 }, { "epoch": 0.3457584635967958, "grad_norm": 0.29800209403038025, "learning_rate": 4.015101330781764e-05, "loss": 1.8704, "step": 12380 }, { "epoch": 0.346300404762308, "grad_norm": 0.3081452250480652, "learning_rate": 4.0064677714412856e-05, "loss": 1.8776, "step": 12390 }, { "epoch": 0.3468423459278202, "grad_norm": 0.3264562487602234, "learning_rate": 3.997840385790481e-05, "loss": 1.8819, "step": 12400 }, { "epoch": 0.3473842870933324, "grad_norm": 0.30599161982536316, "learning_rate": 3.989219209491652e-05, "loss": 1.8903, "step": 12410 }, { "epoch": 0.34792622825884467, "grad_norm": 0.31660717725753784, "learning_rate": 3.98060427818144e-05, "loss": 1.8748, "step": 12420 }, { "epoch": 0.3484681694243569, "grad_norm": 0.41350027918815613, "learning_rate": 3.971995627470668e-05, "loss": 1.8775, "step": 12430 }, { "epoch": 0.3490101105898691, "grad_norm": 0.43306195735931396, "learning_rate": 3.963393292944195e-05, "loss": 1.8683, "step": 12440 }, { "epoch": 0.3490643047064203, "eval_loss": 2.6103334426879883, "eval_runtime": 21.967, "eval_samples_per_second": 227.614, "eval_steps_per_second": 1.229, "step": 12441 }, { "epoch": 0.3495520517553813, "grad_norm": 0.3944796323776245, "learning_rate": 3.954797310160777e-05, "loss": 1.8695, "step": 12450 }, { "epoch": 0.35009399292089355, "grad_norm": 0.3327912986278534, "learning_rate": 3.946207714652911e-05, "loss": 1.8703, "step": 12460 }, { "epoch": 0.35063593408640575, "grad_norm": 0.38429713249206543, "learning_rate": 3.937624541926689e-05, "loss": 1.8715, "step": 12470 }, { "epoch": 0.35117787525191796, "grad_norm": 0.6065689325332642, "learning_rate": 3.9290478274616605e-05, "loss": 1.8717, "step": 12480 }, { "epoch": 0.35171981641743016, "grad_norm": 0.31308409571647644, "learning_rate": 3.920477606710673e-05, "loss": 1.8792, "step": 12490 }, { "epoch": 0.3522617575829424, "grad_norm": 0.5742292404174805, "learning_rate": 3.911913915099734e-05, "loss": 1.8788, "step": 12500 }, { "epoch": 0.35280369874845463, "grad_norm": 0.4109736979007721, "learning_rate": 3.903356788027863e-05, "loss": 1.8666, "step": 12510 }, { "epoch": 0.35334563991396684, "grad_norm": 0.6786653399467468, "learning_rate": 3.894806260866941e-05, "loss": 1.8743, "step": 12520 }, { "epoch": 0.3537791928463766, "eval_loss": 2.6084859371185303, "eval_runtime": 21.9673, "eval_samples_per_second": 227.611, "eval_steps_per_second": 1.229, "step": 12528 }, { "epoch": 0.35388758107947904, "grad_norm": 0.2924114465713501, "learning_rate": 3.886262368961571e-05, "loss": 1.8715, "step": 12530 }, { "epoch": 0.3544295222449913, "grad_norm": 0.36380696296691895, "learning_rate": 3.877725147628925e-05, "loss": 1.8659, "step": 12540 }, { "epoch": 0.3549714634105035, "grad_norm": 0.3204795718193054, "learning_rate": 3.869194632158603e-05, "loss": 1.8751, "step": 12550 }, { "epoch": 0.3555134045760157, "grad_norm": 0.5478801131248474, "learning_rate": 3.8606708578124875e-05, "loss": 1.8603, "step": 12560 }, { "epoch": 0.3560553457415279, "grad_norm": 0.5209506154060364, "learning_rate": 3.852153859824593e-05, "loss": 1.868, "step": 12570 }, { "epoch": 0.3565972869070402, "grad_norm": 0.43026360869407654, "learning_rate": 3.8436436734009243e-05, "loss": 1.8737, "step": 12580 }, { "epoch": 0.3571392280725524, "grad_norm": 0.3033788800239563, "learning_rate": 3.83514033371933e-05, "loss": 1.874, "step": 12590 }, { "epoch": 0.3576811692380646, "grad_norm": 0.460555762052536, "learning_rate": 3.8266438759293555e-05, "loss": 1.8764, "step": 12600 }, { "epoch": 0.3582231104035768, "grad_norm": 0.335553377866745, "learning_rate": 3.818154335152101e-05, "loss": 1.8718, "step": 12610 }, { "epoch": 0.3584940809863329, "eval_loss": 2.607743740081787, "eval_runtime": 21.9725, "eval_samples_per_second": 227.557, "eval_steps_per_second": 1.229, "step": 12615 }, { "epoch": 0.35876505156908906, "grad_norm": 0.2863267958164215, "learning_rate": 3.8096717464800735e-05, "loss": 1.8626, "step": 12620 }, { "epoch": 0.35930699273460126, "grad_norm": 0.386030375957489, "learning_rate": 3.8011961449770403e-05, "loss": 1.8737, "step": 12630 }, { "epoch": 0.35984893390011347, "grad_norm": 0.5605409145355225, "learning_rate": 3.7927275656778936e-05, "loss": 1.8686, "step": 12640 }, { "epoch": 0.36039087506562567, "grad_norm": 0.32051485776901245, "learning_rate": 3.7842660435884916e-05, "loss": 1.8766, "step": 12650 }, { "epoch": 0.36093281623113793, "grad_norm": 0.31499314308166504, "learning_rate": 3.775811613685518e-05, "loss": 1.867, "step": 12660 }, { "epoch": 0.36147475739665014, "grad_norm": 0.280678927898407, "learning_rate": 3.767364310916353e-05, "loss": 1.8651, "step": 12670 }, { "epoch": 0.36201669856216234, "grad_norm": 0.26214084029197693, "learning_rate": 3.7589241701989005e-05, "loss": 1.8702, "step": 12680 }, { "epoch": 0.36255863972767455, "grad_norm": 0.31816792488098145, "learning_rate": 3.750491226421473e-05, "loss": 1.8672, "step": 12690 }, { "epoch": 0.3631005808931868, "grad_norm": 0.3030293881893158, "learning_rate": 3.7420655144426256e-05, "loss": 1.8554, "step": 12700 }, { "epoch": 0.36320896912628925, "eval_loss": 2.6091229915618896, "eval_runtime": 21.9746, "eval_samples_per_second": 227.536, "eval_steps_per_second": 1.229, "step": 12702 }, { "epoch": 0.363642522058699, "grad_norm": 0.4357650578022003, "learning_rate": 3.733647069091016e-05, "loss": 1.8616, "step": 12710 }, { "epoch": 0.3641844632242112, "grad_norm": 0.3486103415489197, "learning_rate": 3.725235925165278e-05, "loss": 1.8744, "step": 12720 }, { "epoch": 0.3647264043897234, "grad_norm": 0.35597971081733704, "learning_rate": 3.716832117433853e-05, "loss": 1.8693, "step": 12730 }, { "epoch": 0.3652683455552357, "grad_norm": 0.6880526542663574, "learning_rate": 3.7084356806348566e-05, "loss": 1.8561, "step": 12740 }, { "epoch": 0.3658102867207479, "grad_norm": 0.36243849992752075, "learning_rate": 3.7000466494759445e-05, "loss": 1.869, "step": 12750 }, { "epoch": 0.3663522278862601, "grad_norm": 0.35954001545906067, "learning_rate": 3.691665058634153e-05, "loss": 1.8669, "step": 12760 }, { "epoch": 0.3668941690517723, "grad_norm": 0.2994548976421356, "learning_rate": 3.683290942755767e-05, "loss": 1.8587, "step": 12770 }, { "epoch": 0.36743611021728456, "grad_norm": 0.5059475302696228, "learning_rate": 3.674924336456173e-05, "loss": 1.8584, "step": 12780 }, { "epoch": 0.36792385726624555, "eval_loss": 2.6061477661132812, "eval_runtime": 21.9728, "eval_samples_per_second": 227.554, "eval_steps_per_second": 1.229, "step": 12789 }, { "epoch": 0.36797805138279677, "grad_norm": 0.4944458305835724, "learning_rate": 3.6665652743197075e-05, "loss": 1.8711, "step": 12790 }, { "epoch": 0.36851999254830897, "grad_norm": 0.3134331703186035, "learning_rate": 3.658213790899537e-05, "loss": 1.8633, "step": 12800 }, { "epoch": 0.3690619337138212, "grad_norm": 0.35285812616348267, "learning_rate": 3.649869920717487e-05, "loss": 1.8843, "step": 12810 }, { "epoch": 0.36960387487933344, "grad_norm": 0.3203577399253845, "learning_rate": 3.641533698263917e-05, "loss": 1.859, "step": 12820 }, { "epoch": 0.37014581604484564, "grad_norm": 0.35432031750679016, "learning_rate": 3.6332051579975815e-05, "loss": 1.8704, "step": 12830 }, { "epoch": 0.37068775721035785, "grad_norm": 0.3153700828552246, "learning_rate": 3.624884334345465e-05, "loss": 1.8622, "step": 12840 }, { "epoch": 0.37122969837587005, "grad_norm": 0.7621405124664307, "learning_rate": 3.616571261702669e-05, "loss": 1.8665, "step": 12850 }, { "epoch": 0.3717716395413823, "grad_norm": 0.5706172585487366, "learning_rate": 3.6082659744322464e-05, "loss": 1.8578, "step": 12860 }, { "epoch": 0.3723135807068945, "grad_norm": 0.6530439853668213, "learning_rate": 3.59996850686507e-05, "loss": 1.8579, "step": 12870 }, { "epoch": 0.37263874540620184, "eval_loss": 2.6071505546569824, "eval_runtime": 21.9724, "eval_samples_per_second": 227.558, "eval_steps_per_second": 1.229, "step": 12876 }, { "epoch": 0.3728555218724067, "grad_norm": 0.49953022599220276, "learning_rate": 3.591678893299693e-05, "loss": 1.8671, "step": 12880 }, { "epoch": 0.37339746303791893, "grad_norm": 0.37402573227882385, "learning_rate": 3.583397168002196e-05, "loss": 1.8639, "step": 12890 }, { "epoch": 0.3739394042034312, "grad_norm": 0.281125545501709, "learning_rate": 3.575123365206057e-05, "loss": 1.8736, "step": 12900 }, { "epoch": 0.3744813453689434, "grad_norm": 0.29488998651504517, "learning_rate": 3.566857519112008e-05, "loss": 1.8704, "step": 12910 }, { "epoch": 0.3750232865344556, "grad_norm": 0.29372894763946533, "learning_rate": 3.558599663887886e-05, "loss": 1.868, "step": 12920 }, { "epoch": 0.3755652276999678, "grad_norm": 0.27853238582611084, "learning_rate": 3.550349833668499e-05, "loss": 1.8644, "step": 12930 }, { "epoch": 0.37610716886548007, "grad_norm": 0.2777658998966217, "learning_rate": 3.542108062555483e-05, "loss": 1.8621, "step": 12940 }, { "epoch": 0.3766491100309923, "grad_norm": 0.3306451737880707, "learning_rate": 3.5338743846171574e-05, "loss": 1.8612, "step": 12950 }, { "epoch": 0.3771910511965045, "grad_norm": 0.4671363830566406, "learning_rate": 3.525648833888393e-05, "loss": 1.857, "step": 12960 }, { "epoch": 0.37735363354615814, "eval_loss": 2.6051692962646484, "eval_runtime": 21.9698, "eval_samples_per_second": 227.585, "eval_steps_per_second": 1.229, "step": 12963 }, { "epoch": 0.3777329923620167, "grad_norm": 0.3648098111152649, "learning_rate": 3.5174314443704634e-05, "loss": 1.8643, "step": 12970 }, { "epoch": 0.37827493352752894, "grad_norm": 0.4720825254917145, "learning_rate": 3.5092222500309066e-05, "loss": 1.8606, "step": 12980 }, { "epoch": 0.37881687469304115, "grad_norm": 0.3330930173397064, "learning_rate": 3.501021284803384e-05, "loss": 1.8627, "step": 12990 }, { "epoch": 0.37935881585855336, "grad_norm": 0.45893749594688416, "learning_rate": 3.492828582587541e-05, "loss": 1.8577, "step": 13000 }, { "epoch": 0.37990075702406556, "grad_norm": 0.2954985201358795, "learning_rate": 3.4846441772488706e-05, "loss": 1.8512, "step": 13010 }, { "epoch": 0.3804426981895778, "grad_norm": 0.2998946011066437, "learning_rate": 3.476468102618564e-05, "loss": 1.8632, "step": 13020 }, { "epoch": 0.38098463935509, "grad_norm": 0.3708723187446594, "learning_rate": 3.4683003924933823e-05, "loss": 1.8694, "step": 13030 }, { "epoch": 0.38152658052060223, "grad_norm": 0.48674148321151733, "learning_rate": 3.4601410806355055e-05, "loss": 1.864, "step": 13040 }, { "epoch": 0.38206852168611444, "grad_norm": 0.35994434356689453, "learning_rate": 3.4519902007724026e-05, "loss": 1.8574, "step": 13050 }, { "epoch": 0.38206852168611444, "eval_loss": 2.6076242923736572, "eval_runtime": 21.9664, "eval_samples_per_second": 227.621, "eval_steps_per_second": 1.229, "step": 13050 }, { "epoch": 0.38261046285162664, "grad_norm": 0.30608904361724854, "learning_rate": 3.443847786596682e-05, "loss": 1.8665, "step": 13060 }, { "epoch": 0.3831524040171389, "grad_norm": 0.2905879616737366, "learning_rate": 3.435713871765969e-05, "loss": 1.8563, "step": 13070 }, { "epoch": 0.3836943451826511, "grad_norm": 0.34675753116607666, "learning_rate": 3.427588489902748e-05, "loss": 1.8582, "step": 13080 }, { "epoch": 0.3842362863481633, "grad_norm": 0.2855149209499359, "learning_rate": 3.419471674594226e-05, "loss": 1.8635, "step": 13090 }, { "epoch": 0.3847782275136755, "grad_norm": 0.3102046847343445, "learning_rate": 3.4113634593922126e-05, "loss": 1.8736, "step": 13100 }, { "epoch": 0.3853201686791878, "grad_norm": 0.34414398670196533, "learning_rate": 3.4032638778129576e-05, "loss": 1.8504, "step": 13110 }, { "epoch": 0.3858621098447, "grad_norm": 0.34666258096694946, "learning_rate": 3.395172963337029e-05, "loss": 1.8708, "step": 13120 }, { "epoch": 0.3864040510102122, "grad_norm": 0.2988355755805969, "learning_rate": 3.387090749409167e-05, "loss": 1.8635, "step": 13130 }, { "epoch": 0.38678340982607073, "eval_loss": 2.610823631286621, "eval_runtime": 21.9709, "eval_samples_per_second": 227.574, "eval_steps_per_second": 1.229, "step": 13137 }, { "epoch": 0.3869459921757244, "grad_norm": 0.2701932489871979, "learning_rate": 3.3790172694381385e-05, "loss": 1.8619, "step": 13140 }, { "epoch": 0.38748793334123666, "grad_norm": 0.31789588928222656, "learning_rate": 3.370952556796621e-05, "loss": 1.8527, "step": 13150 }, { "epoch": 0.38802987450674886, "grad_norm": 0.3052840232849121, "learning_rate": 3.362896644821042e-05, "loss": 1.8559, "step": 13160 }, { "epoch": 0.38857181567226107, "grad_norm": 0.2687411606311798, "learning_rate": 3.3548495668114536e-05, "loss": 1.8587, "step": 13170 }, { "epoch": 0.3891137568377733, "grad_norm": 0.5188722014427185, "learning_rate": 3.346811356031394e-05, "loss": 1.8567, "step": 13180 }, { "epoch": 0.38965569800328553, "grad_norm": 0.33732667565345764, "learning_rate": 3.3387820457077403e-05, "loss": 1.8539, "step": 13190 }, { "epoch": 0.39019763916879774, "grad_norm": 0.31062641739845276, "learning_rate": 3.3307616690305875e-05, "loss": 1.8569, "step": 13200 }, { "epoch": 0.39073958033430994, "grad_norm": 0.30725353956222534, "learning_rate": 3.322750259153096e-05, "loss": 1.8489, "step": 13210 }, { "epoch": 0.39128152149982215, "grad_norm": 0.3953590989112854, "learning_rate": 3.314747849191362e-05, "loss": 1.8627, "step": 13220 }, { "epoch": 0.3914982979660271, "eval_loss": 2.6103906631469727, "eval_runtime": 21.9715, "eval_samples_per_second": 227.567, "eval_steps_per_second": 1.229, "step": 13224 }, { "epoch": 0.3918234626653344, "grad_norm": 0.284915953874588, "learning_rate": 3.306754472224285e-05, "loss": 1.8606, "step": 13230 }, { "epoch": 0.3923654038308466, "grad_norm": 0.5332658886909485, "learning_rate": 3.2987701612934174e-05, "loss": 1.8526, "step": 13240 }, { "epoch": 0.3929073449963588, "grad_norm": 0.330483078956604, "learning_rate": 3.290794949402837e-05, "loss": 1.8603, "step": 13250 }, { "epoch": 0.393449286161871, "grad_norm": 0.4462834298610687, "learning_rate": 3.282828869519019e-05, "loss": 1.8646, "step": 13260 }, { "epoch": 0.3939912273273833, "grad_norm": 0.3231438100337982, "learning_rate": 3.2748719545706776e-05, "loss": 1.8528, "step": 13270 }, { "epoch": 0.3945331684928955, "grad_norm": 0.2627573311328888, "learning_rate": 3.266924237448655e-05, "loss": 1.857, "step": 13280 }, { "epoch": 0.3950751096584077, "grad_norm": 0.2972419261932373, "learning_rate": 3.2589857510057634e-05, "loss": 1.8477, "step": 13290 }, { "epoch": 0.3956170508239199, "grad_norm": 0.3084324598312378, "learning_rate": 3.251056528056658e-05, "loss": 1.8504, "step": 13300 }, { "epoch": 0.39615899198943216, "grad_norm": 0.3726852536201477, "learning_rate": 3.2431366013777156e-05, "loss": 1.8487, "step": 13310 }, { "epoch": 0.3962131861059834, "eval_loss": 2.607210397720337, "eval_runtime": 21.9736, "eval_samples_per_second": 227.546, "eval_steps_per_second": 1.229, "step": 13311 }, { "epoch": 0.39670093315494437, "grad_norm": 0.3420845568180084, "learning_rate": 3.235226003706872e-05, "loss": 1.8553, "step": 13320 }, { "epoch": 0.3972428743204566, "grad_norm": 0.3318046033382416, "learning_rate": 3.227324767743507e-05, "loss": 1.8592, "step": 13330 }, { "epoch": 0.3977848154859688, "grad_norm": 0.2706853449344635, "learning_rate": 3.2194329261482985e-05, "loss": 1.8574, "step": 13340 }, { "epoch": 0.39832675665148104, "grad_norm": 0.27932029962539673, "learning_rate": 3.211550511543095e-05, "loss": 1.8565, "step": 13350 }, { "epoch": 0.39886869781699325, "grad_norm": 0.29047316312789917, "learning_rate": 3.203677556510779e-05, "loss": 1.859, "step": 13360 }, { "epoch": 0.39941063898250545, "grad_norm": 0.3240620791912079, "learning_rate": 3.195814093595127e-05, "loss": 1.8506, "step": 13370 }, { "epoch": 0.39995258014801766, "grad_norm": 0.2864803969860077, "learning_rate": 3.18796015530068e-05, "loss": 1.8585, "step": 13380 }, { "epoch": 0.4004945213135299, "grad_norm": 0.28760528564453125, "learning_rate": 3.180115774092609e-05, "loss": 1.857, "step": 13390 }, { "epoch": 0.4009280742459397, "eval_loss": 2.60837984085083, "eval_runtime": 21.9751, "eval_samples_per_second": 227.53, "eval_steps_per_second": 1.229, "step": 13398 }, { "epoch": 0.4010364624790421, "grad_norm": 0.47491455078125, "learning_rate": 3.172280982396577e-05, "loss": 1.8524, "step": 13400 }, { "epoch": 0.4015784036445543, "grad_norm": 0.7039004564285278, "learning_rate": 3.164455812598609e-05, "loss": 1.8448, "step": 13410 }, { "epoch": 0.40212034481006653, "grad_norm": 0.5090378522872925, "learning_rate": 3.15664029704496e-05, "loss": 1.8595, "step": 13420 }, { "epoch": 0.4026622859755788, "grad_norm": 0.3467673361301422, "learning_rate": 3.148834468041973e-05, "loss": 1.8602, "step": 13430 }, { "epoch": 0.403204227141091, "grad_norm": 0.3230932056903839, "learning_rate": 3.141038357855953e-05, "loss": 1.8451, "step": 13440 }, { "epoch": 0.4037461683066032, "grad_norm": 0.3669589161872864, "learning_rate": 3.133251998713032e-05, "loss": 1.8655, "step": 13450 }, { "epoch": 0.4042881094721154, "grad_norm": 0.29886960983276367, "learning_rate": 3.1254754227990294e-05, "loss": 1.8554, "step": 13460 }, { "epoch": 0.40483005063762767, "grad_norm": 0.3277972638607025, "learning_rate": 3.1177086622593345e-05, "loss": 1.8566, "step": 13470 }, { "epoch": 0.4053719918031399, "grad_norm": 0.2972913980484009, "learning_rate": 3.109951749198755e-05, "loss": 1.8554, "step": 13480 }, { "epoch": 0.405642962385896, "eval_loss": 2.604696273803711, "eval_runtime": 22.1189, "eval_samples_per_second": 226.051, "eval_steps_per_second": 1.221, "step": 13485 }, { "epoch": 0.4059139329686521, "grad_norm": 0.41642656922340393, "learning_rate": 3.102204715681397e-05, "loss": 1.8526, "step": 13490 }, { "epoch": 0.4064558741341643, "grad_norm": 0.3331157863140106, "learning_rate": 3.0944675937305254e-05, "loss": 1.8621, "step": 13500 }, { "epoch": 0.40699781529967655, "grad_norm": 0.2806148827075958, "learning_rate": 3.086740415328436e-05, "loss": 1.8597, "step": 13510 }, { "epoch": 0.40753975646518875, "grad_norm": 0.328106552362442, "learning_rate": 3.0790232124163256e-05, "loss": 1.8543, "step": 13520 }, { "epoch": 0.40808169763070096, "grad_norm": 0.27871695160865784, "learning_rate": 3.0713160168941494e-05, "loss": 1.8546, "step": 13530 }, { "epoch": 0.40862363879621316, "grad_norm": 0.3508320748806, "learning_rate": 3.0636188606205e-05, "loss": 1.8454, "step": 13540 }, { "epoch": 0.4091655799617254, "grad_norm": 0.29558315873146057, "learning_rate": 3.0559317754124706e-05, "loss": 1.8595, "step": 13550 }, { "epoch": 0.40970752112723763, "grad_norm": 0.31710633635520935, "learning_rate": 3.048254793045524e-05, "loss": 1.8416, "step": 13560 }, { "epoch": 0.41024946229274983, "grad_norm": 0.31882941722869873, "learning_rate": 3.040587945253362e-05, "loss": 1.8512, "step": 13570 }, { "epoch": 0.4103578505258523, "eval_loss": 2.610496759414673, "eval_runtime": 21.9645, "eval_samples_per_second": 227.64, "eval_steps_per_second": 1.229, "step": 13572 }, { "epoch": 0.41079140345826204, "grad_norm": 0.28335651755332947, "learning_rate": 3.032931263727796e-05, "loss": 1.852, "step": 13580 }, { "epoch": 0.4113333446237743, "grad_norm": 0.40349459648132324, "learning_rate": 3.0252847801186135e-05, "loss": 1.8441, "step": 13590 }, { "epoch": 0.4118752857892865, "grad_norm": 0.2962387204170227, "learning_rate": 3.0176485260334398e-05, "loss": 1.8472, "step": 13600 }, { "epoch": 0.4124172269547987, "grad_norm": 0.27455824613571167, "learning_rate": 3.0100225330376282e-05, "loss": 1.848, "step": 13610 }, { "epoch": 0.4129591681203109, "grad_norm": 0.524045467376709, "learning_rate": 3.0024068326541056e-05, "loss": 1.8442, "step": 13620 }, { "epoch": 0.4135011092858232, "grad_norm": 0.28335145115852356, "learning_rate": 2.994801456363263e-05, "loss": 1.8395, "step": 13630 }, { "epoch": 0.4140430504513354, "grad_norm": 0.2653671205043793, "learning_rate": 2.987206435602809e-05, "loss": 1.8421, "step": 13640 }, { "epoch": 0.4145849916168476, "grad_norm": 0.37382206320762634, "learning_rate": 2.979621801767643e-05, "loss": 1.847, "step": 13650 }, { "epoch": 0.41507273866580857, "eval_loss": 2.604285717010498, "eval_runtime": 21.965, "eval_samples_per_second": 227.635, "eval_steps_per_second": 1.229, "step": 13659 }, { "epoch": 0.4151269327823598, "grad_norm": 0.2956138849258423, "learning_rate": 2.972047586209739e-05, "loss": 1.8429, "step": 13660 }, { "epoch": 0.41566887394787205, "grad_norm": 0.3020017147064209, "learning_rate": 2.9644838202379988e-05, "loss": 1.8395, "step": 13670 }, { "epoch": 0.41621081511338426, "grad_norm": 0.36557555198669434, "learning_rate": 2.956930535118129e-05, "loss": 1.8314, "step": 13680 }, { "epoch": 0.41675275627889646, "grad_norm": 0.2863655686378479, "learning_rate": 2.9493877620725208e-05, "loss": 1.8437, "step": 13690 }, { "epoch": 0.41729469744440867, "grad_norm": 0.30269670486450195, "learning_rate": 2.9418555322800983e-05, "loss": 1.8538, "step": 13700 }, { "epoch": 0.41783663860992093, "grad_norm": 0.3189536929130554, "learning_rate": 2.9343338768762175e-05, "loss": 1.832, "step": 13710 }, { "epoch": 0.41837857977543313, "grad_norm": 0.3305031657218933, "learning_rate": 2.9268228269525178e-05, "loss": 1.8469, "step": 13720 }, { "epoch": 0.41892052094094534, "grad_norm": 0.30250081419944763, "learning_rate": 2.9193224135567965e-05, "loss": 1.8563, "step": 13730 }, { "epoch": 0.41946246210645755, "grad_norm": 0.2988894581794739, "learning_rate": 2.9118326676928938e-05, "loss": 1.8563, "step": 13740 }, { "epoch": 0.4197876268057649, "eval_loss": 2.606349229812622, "eval_runtime": 21.9714, "eval_samples_per_second": 227.568, "eval_steps_per_second": 1.229, "step": 13746 }, { "epoch": 0.4200044032719698, "grad_norm": 0.32497337460517883, "learning_rate": 2.904353620320542e-05, "loss": 1.8507, "step": 13750 }, { "epoch": 0.420546344437482, "grad_norm": 0.4472458064556122, "learning_rate": 2.8968853023552555e-05, "loss": 1.8343, "step": 13760 }, { "epoch": 0.4210882856029942, "grad_norm": 0.4618549942970276, "learning_rate": 2.8894277446682028e-05, "loss": 1.8497, "step": 13770 }, { "epoch": 0.4216302267685064, "grad_norm": 0.3211345970630646, "learning_rate": 2.8819809780860625e-05, "loss": 1.8459, "step": 13780 }, { "epoch": 0.4221721679340187, "grad_norm": 0.2894679307937622, "learning_rate": 2.87454503339092e-05, "loss": 1.8519, "step": 13790 }, { "epoch": 0.4227141090995309, "grad_norm": 0.2763760983943939, "learning_rate": 2.867119941320114e-05, "loss": 1.8481, "step": 13800 }, { "epoch": 0.4232560502650431, "grad_norm": 0.318680077791214, "learning_rate": 2.859705732566129e-05, "loss": 1.8437, "step": 13810 }, { "epoch": 0.4237979914305553, "grad_norm": 0.39189639687538147, "learning_rate": 2.852302437776465e-05, "loss": 1.8552, "step": 13820 }, { "epoch": 0.42433993259606756, "grad_norm": 0.45934727787971497, "learning_rate": 2.844910087553503e-05, "loss": 1.8426, "step": 13830 }, { "epoch": 0.4245025149457212, "eval_loss": 2.608975648880005, "eval_runtime": 21.9733, "eval_samples_per_second": 227.548, "eval_steps_per_second": 1.229, "step": 13833 }, { "epoch": 0.42488187376157976, "grad_norm": 0.2927630543708801, "learning_rate": 2.8375287124543835e-05, "loss": 1.855, "step": 13840 }, { "epoch": 0.42542381492709197, "grad_norm": 0.27992621064186096, "learning_rate": 2.830158342990884e-05, "loss": 1.845, "step": 13850 }, { "epoch": 0.4259657560926042, "grad_norm": 0.33510464429855347, "learning_rate": 2.8227990096292827e-05, "loss": 1.8371, "step": 13860 }, { "epoch": 0.42650769725811644, "grad_norm": 0.4821888208389282, "learning_rate": 2.8154507427902467e-05, "loss": 1.8491, "step": 13870 }, { "epoch": 0.42704963842362864, "grad_norm": 0.29919731616973877, "learning_rate": 2.808113572848692e-05, "loss": 1.8429, "step": 13880 }, { "epoch": 0.42759157958914085, "grad_norm": 0.31553396582603455, "learning_rate": 2.8007875301336662e-05, "loss": 1.8436, "step": 13890 }, { "epoch": 0.42813352075465305, "grad_norm": 0.30432793498039246, "learning_rate": 2.7934726449282213e-05, "loss": 1.8395, "step": 13900 }, { "epoch": 0.4286754619201653, "grad_norm": 0.3555530905723572, "learning_rate": 2.7861689474692898e-05, "loss": 1.842, "step": 13910 }, { "epoch": 0.4292174030856775, "grad_norm": 0.32901936769485474, "learning_rate": 2.7788764679475538e-05, "loss": 1.8336, "step": 13920 }, { "epoch": 0.4292174030856775, "eval_loss": 2.611562728881836, "eval_runtime": 21.9692, "eval_samples_per_second": 227.591, "eval_steps_per_second": 1.229, "step": 13920 }, { "epoch": 0.4297593442511897, "grad_norm": 0.5692172646522522, "learning_rate": 2.7715952365073324e-05, "loss": 1.8485, "step": 13930 }, { "epoch": 0.43030128541670193, "grad_norm": 0.28073015809059143, "learning_rate": 2.7643252832464423e-05, "loss": 1.8312, "step": 13940 }, { "epoch": 0.4308432265822142, "grad_norm": 0.4343844950199127, "learning_rate": 2.7570666382160843e-05, "loss": 1.8512, "step": 13950 }, { "epoch": 0.4313851677477264, "grad_norm": 0.26866263151168823, "learning_rate": 2.7498193314207137e-05, "loss": 1.8367, "step": 13960 }, { "epoch": 0.4319271089132386, "grad_norm": 0.5220692753791809, "learning_rate": 2.742583392817918e-05, "loss": 1.8317, "step": 13970 }, { "epoch": 0.4324690500787508, "grad_norm": 0.3568936288356781, "learning_rate": 2.7353588523182943e-05, "loss": 1.8356, "step": 13980 }, { "epoch": 0.43301099124426307, "grad_norm": 0.28348490595817566, "learning_rate": 2.7281457397853237e-05, "loss": 1.8416, "step": 13990 }, { "epoch": 0.43355293240977527, "grad_norm": 0.32432541251182556, "learning_rate": 2.720944085035248e-05, "loss": 1.8346, "step": 14000 }, { "epoch": 0.4339322912256338, "eval_loss": 2.6060945987701416, "eval_runtime": 21.9781, "eval_samples_per_second": 227.499, "eval_steps_per_second": 1.228, "step": 14007 }, { "epoch": 0.4340948735752875, "grad_norm": 0.30824974179267883, "learning_rate": 2.7137539178369464e-05, "loss": 1.8423, "step": 14010 }, { "epoch": 0.4346368147407997, "grad_norm": 0.387539803981781, "learning_rate": 2.7065752679118128e-05, "loss": 1.8532, "step": 14020 }, { "epoch": 0.43517875590631194, "grad_norm": 0.3984427750110626, "learning_rate": 2.6994081649336366e-05, "loss": 1.8505, "step": 14030 }, { "epoch": 0.43572069707182415, "grad_norm": 0.39461550116539, "learning_rate": 2.6922526385284737e-05, "loss": 1.8425, "step": 14040 }, { "epoch": 0.43626263823733635, "grad_norm": 0.26579055190086365, "learning_rate": 2.685108718274525e-05, "loss": 1.8379, "step": 14050 }, { "epoch": 0.43680457940284856, "grad_norm": 0.3582479655742645, "learning_rate": 2.6779764337020195e-05, "loss": 1.8338, "step": 14060 }, { "epoch": 0.4373465205683608, "grad_norm": 0.4719529151916504, "learning_rate": 2.6708558142930862e-05, "loss": 1.8382, "step": 14070 }, { "epoch": 0.437888461733873, "grad_norm": 0.3371265232563019, "learning_rate": 2.6637468894816366e-05, "loss": 1.844, "step": 14080 }, { "epoch": 0.43843040289938523, "grad_norm": 0.27364712953567505, "learning_rate": 2.656649688653242e-05, "loss": 1.8391, "step": 14090 }, { "epoch": 0.4386471793655901, "eval_loss": 2.606126308441162, "eval_runtime": 21.9732, "eval_samples_per_second": 227.549, "eval_steps_per_second": 1.229, "step": 14094 }, { "epoch": 0.43897234406489744, "grad_norm": 0.3573056757450104, "learning_rate": 2.6495642411450082e-05, "loss": 1.8407, "step": 14100 }, { "epoch": 0.4395142852304097, "grad_norm": 0.26993247866630554, "learning_rate": 2.64249057624546e-05, "loss": 1.8321, "step": 14110 }, { "epoch": 0.4400562263959219, "grad_norm": 0.48967689275741577, "learning_rate": 2.6354287231944154e-05, "loss": 1.8435, "step": 14120 }, { "epoch": 0.4405981675614341, "grad_norm": 0.3284403383731842, "learning_rate": 2.6283787111828666e-05, "loss": 1.8477, "step": 14130 }, { "epoch": 0.4411401087269463, "grad_norm": 0.33071354031562805, "learning_rate": 2.6213405693528638e-05, "loss": 1.8433, "step": 14140 }, { "epoch": 0.4416820498924586, "grad_norm": 0.2832528352737427, "learning_rate": 2.6143143267973846e-05, "loss": 1.846, "step": 14150 }, { "epoch": 0.4422239910579708, "grad_norm": 0.3607860207557678, "learning_rate": 2.6073000125602236e-05, "loss": 1.8488, "step": 14160 }, { "epoch": 0.442765932223483, "grad_norm": 0.26745522022247314, "learning_rate": 2.600297655635866e-05, "loss": 1.8377, "step": 14170 }, { "epoch": 0.4433078733889952, "grad_norm": 0.28432849049568176, "learning_rate": 2.5933072849693706e-05, "loss": 1.8432, "step": 14180 }, { "epoch": 0.4433620675055464, "eval_loss": 2.6039960384368896, "eval_runtime": 21.9648, "eval_samples_per_second": 227.637, "eval_steps_per_second": 1.229, "step": 14181 }, { "epoch": 0.44384981455450745, "grad_norm": 0.274472177028656, "learning_rate": 2.5863289294562497e-05, "loss": 1.8296, "step": 14190 }, { "epoch": 0.44439175572001965, "grad_norm": 0.30612727999687195, "learning_rate": 2.5793626179423514e-05, "loss": 1.8461, "step": 14200 }, { "epoch": 0.44493369688553186, "grad_norm": 0.2670454978942871, "learning_rate": 2.5724083792237363e-05, "loss": 1.8464, "step": 14210 }, { "epoch": 0.44547563805104406, "grad_norm": 0.3071325719356537, "learning_rate": 2.5654662420465613e-05, "loss": 1.8413, "step": 14220 }, { "epoch": 0.4460175792165563, "grad_norm": 0.35536590218544006, "learning_rate": 2.5585362351069586e-05, "loss": 1.8479, "step": 14230 }, { "epoch": 0.44655952038206853, "grad_norm": 0.3007301390171051, "learning_rate": 2.5516183870509212e-05, "loss": 1.8459, "step": 14240 }, { "epoch": 0.44710146154758074, "grad_norm": 0.3410910665988922, "learning_rate": 2.544712726474182e-05, "loss": 1.8482, "step": 14250 }, { "epoch": 0.44764340271309294, "grad_norm": 0.3722754418849945, "learning_rate": 2.5378192819220954e-05, "loss": 1.8384, "step": 14260 }, { "epoch": 0.44807695564550276, "eval_loss": 2.605301856994629, "eval_runtime": 22.0831, "eval_samples_per_second": 226.417, "eval_steps_per_second": 1.223, "step": 14268 }, { "epoch": 0.4481853438786052, "grad_norm": 0.27585405111312866, "learning_rate": 2.5309380818895133e-05, "loss": 1.8416, "step": 14270 }, { "epoch": 0.4487272850441174, "grad_norm": 0.2805611193180084, "learning_rate": 2.524069154820684e-05, "loss": 1.8404, "step": 14280 }, { "epoch": 0.4492692262096296, "grad_norm": 0.4039418697357178, "learning_rate": 2.5172125291091147e-05, "loss": 1.8429, "step": 14290 }, { "epoch": 0.4498111673751418, "grad_norm": 0.27978357672691345, "learning_rate": 2.510368233097472e-05, "loss": 1.8414, "step": 14300 }, { "epoch": 0.4503531085406541, "grad_norm": 0.2985621690750122, "learning_rate": 2.5035362950774504e-05, "loss": 1.8439, "step": 14310 }, { "epoch": 0.4508950497061663, "grad_norm": 0.28868043422698975, "learning_rate": 2.496716743289659e-05, "loss": 1.8448, "step": 14320 }, { "epoch": 0.4514369908716785, "grad_norm": 0.3035377860069275, "learning_rate": 2.4899096059235144e-05, "loss": 1.8442, "step": 14330 }, { "epoch": 0.4519789320371907, "grad_norm": 0.29208043217658997, "learning_rate": 2.4831149111171117e-05, "loss": 1.8354, "step": 14340 }, { "epoch": 0.45252087320270296, "grad_norm": 0.36014124751091003, "learning_rate": 2.476332686957113e-05, "loss": 1.8325, "step": 14350 }, { "epoch": 0.45279184378545906, "eval_loss": 2.5989575386047363, "eval_runtime": 21.9695, "eval_samples_per_second": 227.588, "eval_steps_per_second": 1.229, "step": 14355 }, { "epoch": 0.45306281436821516, "grad_norm": 0.4234754741191864, "learning_rate": 2.4695629614786373e-05, "loss": 1.8427, "step": 14360 }, { "epoch": 0.45360475553372737, "grad_norm": 0.2819061577320099, "learning_rate": 2.462805762665128e-05, "loss": 1.8368, "step": 14370 }, { "epoch": 0.45414669669923957, "grad_norm": 0.4023802876472473, "learning_rate": 2.4560611184482604e-05, "loss": 1.8363, "step": 14380 }, { "epoch": 0.45468863786475183, "grad_norm": 0.4065036475658417, "learning_rate": 2.4493290567078052e-05, "loss": 1.8344, "step": 14390 }, { "epoch": 0.45523057903026404, "grad_norm": 0.3405419886112213, "learning_rate": 2.442609605271524e-05, "loss": 1.8358, "step": 14400 }, { "epoch": 0.45577252019577624, "grad_norm": 0.3849346339702606, "learning_rate": 2.4359027919150578e-05, "loss": 1.8321, "step": 14410 }, { "epoch": 0.45631446136128845, "grad_norm": 0.27683576941490173, "learning_rate": 2.4292086443617964e-05, "loss": 1.8423, "step": 14420 }, { "epoch": 0.4568564025268007, "grad_norm": 0.30656400322914124, "learning_rate": 2.4225271902827808e-05, "loss": 1.8341, "step": 14430 }, { "epoch": 0.4573983436923129, "grad_norm": 0.46740400791168213, "learning_rate": 2.4158584572965827e-05, "loss": 1.8442, "step": 14440 }, { "epoch": 0.45750673192541536, "eval_loss": 2.6014182567596436, "eval_runtime": 21.98, "eval_samples_per_second": 227.479, "eval_steps_per_second": 1.228, "step": 14442 }, { "epoch": 0.4579402848578251, "grad_norm": 0.4761998951435089, "learning_rate": 2.4092024729691855e-05, "loss": 1.8453, "step": 14450 }, { "epoch": 0.4584822260233373, "grad_norm": 0.3128609359264374, "learning_rate": 2.4025592648138807e-05, "loss": 1.832, "step": 14460 }, { "epoch": 0.4590241671888496, "grad_norm": 0.31973832845687866, "learning_rate": 2.3959288602911398e-05, "loss": 1.8325, "step": 14470 }, { "epoch": 0.4595661083543618, "grad_norm": 0.2859000265598297, "learning_rate": 2.3893112868085134e-05, "loss": 1.8248, "step": 14480 }, { "epoch": 0.460108049519874, "grad_norm": 0.2959967255592346, "learning_rate": 2.382706571720516e-05, "loss": 1.8338, "step": 14490 }, { "epoch": 0.4606499906853862, "grad_norm": 0.26421812176704407, "learning_rate": 2.376114742328507e-05, "loss": 1.8357, "step": 14500 }, { "epoch": 0.46119193185089846, "grad_norm": 0.29020607471466064, "learning_rate": 2.3695358258805813e-05, "loss": 1.8357, "step": 14510 }, { "epoch": 0.46173387301641067, "grad_norm": 0.5411327481269836, "learning_rate": 2.3629698495714577e-05, "loss": 1.8333, "step": 14520 }, { "epoch": 0.46222162006537165, "eval_loss": 2.5997419357299805, "eval_runtime": 21.9696, "eval_samples_per_second": 227.587, "eval_steps_per_second": 1.229, "step": 14529 }, { "epoch": 0.4622758141819229, "grad_norm": 0.4464700222015381, "learning_rate": 2.356416840542364e-05, "loss": 1.8359, "step": 14530 }, { "epoch": 0.4628177553474351, "grad_norm": 0.37165167927742004, "learning_rate": 2.3498768258809296e-05, "loss": 1.8324, "step": 14540 }, { "epoch": 0.46335969651294734, "grad_norm": 0.3330148458480835, "learning_rate": 2.343349832621067e-05, "loss": 1.8297, "step": 14550 }, { "epoch": 0.46390163767845954, "grad_norm": 0.5441980361938477, "learning_rate": 2.3368358877428643e-05, "loss": 1.8247, "step": 14560 }, { "epoch": 0.46444357884397175, "grad_norm": 0.2869478464126587, "learning_rate": 2.3303350181724716e-05, "loss": 1.8367, "step": 14570 }, { "epoch": 0.46498552000948395, "grad_norm": 0.30734243988990784, "learning_rate": 2.3238472507819923e-05, "loss": 1.8368, "step": 14580 }, { "epoch": 0.4655274611749962, "grad_norm": 0.27206000685691833, "learning_rate": 2.3173726123893675e-05, "loss": 1.8442, "step": 14590 }, { "epoch": 0.4660694023405084, "grad_norm": 0.34638717770576477, "learning_rate": 2.3109111297582744e-05, "loss": 1.8407, "step": 14600 }, { "epoch": 0.4666113435060206, "grad_norm": 0.29559314250946045, "learning_rate": 2.3044628295980027e-05, "loss": 1.8357, "step": 14610 }, { "epoch": 0.46693650820532795, "eval_loss": 2.6009130477905273, "eval_runtime": 21.9702, "eval_samples_per_second": 227.581, "eval_steps_per_second": 1.229, "step": 14616 }, { "epoch": 0.46715328467153283, "grad_norm": 0.31058135628700256, "learning_rate": 2.2980277385633533e-05, "loss": 1.8294, "step": 14620 }, { "epoch": 0.4676952258370451, "grad_norm": 0.27876853942871094, "learning_rate": 2.291605883254525e-05, "loss": 1.8333, "step": 14630 }, { "epoch": 0.4682371670025573, "grad_norm": 0.3003843128681183, "learning_rate": 2.2851972902170053e-05, "loss": 1.8333, "step": 14640 }, { "epoch": 0.4687791081680695, "grad_norm": 0.4129401743412018, "learning_rate": 2.2788019859414646e-05, "loss": 1.8185, "step": 14650 }, { "epoch": 0.4693210493335817, "grad_norm": 0.2741876542568207, "learning_rate": 2.2724199968636357e-05, "loss": 1.8215, "step": 14660 }, { "epoch": 0.46986299049909397, "grad_norm": 0.34926167130470276, "learning_rate": 2.266051349364216e-05, "loss": 1.8269, "step": 14670 }, { "epoch": 0.4704049316646062, "grad_norm": 0.3961677849292755, "learning_rate": 2.2596960697687518e-05, "loss": 1.8275, "step": 14680 }, { "epoch": 0.4709468728301184, "grad_norm": 0.3206305503845215, "learning_rate": 2.2533541843475344e-05, "loss": 1.836, "step": 14690 }, { "epoch": 0.4714888139956306, "grad_norm": 0.349286824464798, "learning_rate": 2.247025719315484e-05, "loss": 1.8297, "step": 14700 }, { "epoch": 0.47165139634528425, "eval_loss": 2.6037356853485107, "eval_runtime": 22.0801, "eval_samples_per_second": 226.448, "eval_steps_per_second": 1.223, "step": 14703 }, { "epoch": 0.47203075516114285, "grad_norm": 0.3154524862766266, "learning_rate": 2.240710700832052e-05, "loss": 1.8323, "step": 14710 }, { "epoch": 0.47257269632665505, "grad_norm": 0.30774760246276855, "learning_rate": 2.2344091550011033e-05, "loss": 1.8456, "step": 14720 }, { "epoch": 0.47311463749216726, "grad_norm": 0.3301653265953064, "learning_rate": 2.228121107870812e-05, "loss": 1.8384, "step": 14730 }, { "epoch": 0.47365657865767946, "grad_norm": 0.30092665553092957, "learning_rate": 2.2218465854335535e-05, "loss": 1.8373, "step": 14740 }, { "epoch": 0.4741985198231917, "grad_norm": 0.28236475586891174, "learning_rate": 2.215585613625798e-05, "loss": 1.8184, "step": 14750 }, { "epoch": 0.4747404609887039, "grad_norm": 0.28573015332221985, "learning_rate": 2.209338218328006e-05, "loss": 1.841, "step": 14760 }, { "epoch": 0.47528240215421613, "grad_norm": 0.3064768314361572, "learning_rate": 2.2031044253645117e-05, "loss": 1.8275, "step": 14770 }, { "epoch": 0.47582434331972834, "grad_norm": 0.2896689772605896, "learning_rate": 2.1968842605034262e-05, "loss": 1.8326, "step": 14780 }, { "epoch": 0.4763662844852406, "grad_norm": 0.28397008776664734, "learning_rate": 2.190677749456526e-05, "loss": 1.8242, "step": 14790 }, { "epoch": 0.4763662844852406, "eval_loss": 2.599595785140991, "eval_runtime": 22.3123, "eval_samples_per_second": 224.092, "eval_steps_per_second": 1.21, "step": 14790 }, { "epoch": 0.4769082256507528, "grad_norm": 0.7356740236282349, "learning_rate": 2.1844849178791486e-05, "loss": 1.8301, "step": 14800 }, { "epoch": 0.477450166816265, "grad_norm": 0.40485429763793945, "learning_rate": 2.1783057913700865e-05, "loss": 1.825, "step": 14810 }, { "epoch": 0.4779921079817772, "grad_norm": 0.2894582748413086, "learning_rate": 2.17214039547148e-05, "loss": 1.8295, "step": 14820 }, { "epoch": 0.4785340491472894, "grad_norm": 0.28514474630355835, "learning_rate": 2.1659887556687102e-05, "loss": 1.8327, "step": 14830 }, { "epoch": 0.4790759903128017, "grad_norm": 0.3327902555465698, "learning_rate": 2.1598508973903004e-05, "loss": 1.8393, "step": 14840 }, { "epoch": 0.4796179314783139, "grad_norm": 0.28141072392463684, "learning_rate": 2.1537268460078018e-05, "loss": 1.8279, "step": 14850 }, { "epoch": 0.4801598726438261, "grad_norm": 0.5223854184150696, "learning_rate": 2.147616626835694e-05, "loss": 1.8246, "step": 14860 }, { "epoch": 0.4807018138093383, "grad_norm": 0.28995490074157715, "learning_rate": 2.141520265131284e-05, "loss": 1.8196, "step": 14870 }, { "epoch": 0.4810811726251969, "eval_loss": 2.6000826358795166, "eval_runtime": 21.9715, "eval_samples_per_second": 227.568, "eval_steps_per_second": 1.229, "step": 14877 }, { "epoch": 0.48124375497485056, "grad_norm": 0.3166589140892029, "learning_rate": 2.1354377860945925e-05, "loss": 1.829, "step": 14880 }, { "epoch": 0.48178569614036276, "grad_norm": 0.304440975189209, "learning_rate": 2.1293692148682553e-05, "loss": 1.8353, "step": 14890 }, { "epoch": 0.48232763730587497, "grad_norm": 0.3413919508457184, "learning_rate": 2.1233145765374202e-05, "loss": 1.8222, "step": 14900 }, { "epoch": 0.4828695784713872, "grad_norm": 0.31630587577819824, "learning_rate": 2.1172738961296396e-05, "loss": 1.8281, "step": 14910 }, { "epoch": 0.48341151963689943, "grad_norm": 0.3274076581001282, "learning_rate": 2.1112471986147723e-05, "loss": 1.8278, "step": 14920 }, { "epoch": 0.48395346080241164, "grad_norm": 0.3373514711856842, "learning_rate": 2.1052345089048765e-05, "loss": 1.818, "step": 14930 }, { "epoch": 0.48449540196792384, "grad_norm": 0.43474531173706055, "learning_rate": 2.0992358518541025e-05, "loss": 1.8377, "step": 14940 }, { "epoch": 0.48503734313343605, "grad_norm": 0.28481611609458923, "learning_rate": 2.093251252258602e-05, "loss": 1.8339, "step": 14950 }, { "epoch": 0.4855792842989483, "grad_norm": 0.38067948818206787, "learning_rate": 2.0872807348564134e-05, "loss": 1.8251, "step": 14960 }, { "epoch": 0.4857960607651532, "eval_loss": 2.606252431869507, "eval_runtime": 21.966, "eval_samples_per_second": 227.624, "eval_steps_per_second": 1.229, "step": 14964 }, { "epoch": 0.4861212254644605, "grad_norm": 0.30156803131103516, "learning_rate": 2.0813243243273694e-05, "loss": 1.832, "step": 14970 }, { "epoch": 0.4866631666299727, "grad_norm": 0.2849683463573456, "learning_rate": 2.075382045292987e-05, "loss": 1.8327, "step": 14980 }, { "epoch": 0.4872051077954849, "grad_norm": 0.2862844169139862, "learning_rate": 2.0694539223163674e-05, "loss": 1.8226, "step": 14990 }, { "epoch": 0.4877470489609972, "grad_norm": 0.3448987603187561, "learning_rate": 2.0635399799021005e-05, "loss": 1.827, "step": 15000 }, { "epoch": 0.000541941165512219, "grad_norm": 0.3310023248195648, "learning_rate": 2.0576402424961567e-05, "loss": 1.821, "step": 15010 }, { "epoch": 0.001083882331024438, "grad_norm": 0.2753497064113617, "learning_rate": 2.0517547344857874e-05, "loss": 1.8207, "step": 15020 }, { "epoch": 0.0016258234965366573, "grad_norm": 0.2612669765949249, "learning_rate": 2.04588348019943e-05, "loss": 1.8235, "step": 15030 }, { "epoch": 0.002167764662048876, "grad_norm": 0.2987896502017975, "learning_rate": 2.0400265039065938e-05, "loss": 1.8197, "step": 15040 }, { "epoch": 0.0027097058275610954, "grad_norm": 0.27388012409210205, "learning_rate": 2.0341838298177776e-05, "loss": 1.8277, "step": 15050 }, { "epoch": 0.002763899944112317, "eval_loss": 2.6046245098114014, "eval_runtime": 23.5773, "eval_samples_per_second": 212.068, "eval_steps_per_second": 1.145, "step": 15051 }, { "epoch": 0.0032516469930733145, "grad_norm": 0.3995117247104645, "learning_rate": 2.0283554820843547e-05, "loss": 1.8286, "step": 15060 }, { "epoch": 0.0037935881585855337, "grad_norm": 0.2957726716995239, "learning_rate": 2.022541484798479e-05, "loss": 1.8287, "step": 15070 }, { "epoch": 0.004335529324097752, "grad_norm": 0.2851552963256836, "learning_rate": 2.0167418619929908e-05, "loss": 1.8285, "step": 15080 }, { "epoch": 0.0048774704896099716, "grad_norm": 0.3845987021923065, "learning_rate": 2.010956637641303e-05, "loss": 1.8201, "step": 15090 }, { "epoch": 0.005419411655122191, "grad_norm": 0.2890310287475586, "learning_rate": 2.0051858356573155e-05, "loss": 1.8243, "step": 15100 }, { "epoch": 0.00596135282063441, "grad_norm": 0.27605000138282776, "learning_rate": 1.9994294798953134e-05, "loss": 1.8272, "step": 15110 }, { "epoch": 0.006503293986146629, "grad_norm": 0.39410722255706787, "learning_rate": 1.9936875941498646e-05, "loss": 1.8233, "step": 15120 }, { "epoch": 0.007045235151658848, "grad_norm": 0.37865981459617615, "learning_rate": 1.9879602021557226e-05, "loss": 1.8267, "step": 15130 }, { "epoch": 0.007478788084068624, "eval_loss": 2.604631185531616, "eval_runtime": 23.8764, "eval_samples_per_second": 209.412, "eval_steps_per_second": 1.131, "step": 15138 }, { "epoch": 0.007587176317171067, "grad_norm": 0.28183528780937195, "learning_rate": 1.98224732758773e-05, "loss": 1.8224, "step": 15140 }, { "epoch": 0.008129117482683286, "grad_norm": 0.2929738759994507, "learning_rate": 1.97654899406072e-05, "loss": 1.8245, "step": 15150 }, { "epoch": 0.008671058648195505, "grad_norm": 0.28582409024238586, "learning_rate": 1.9708652251294206e-05, "loss": 1.8136, "step": 15160 }, { "epoch": 0.009212999813707724, "grad_norm": 0.2901137173175812, "learning_rate": 1.9651960442883528e-05, "loss": 1.8254, "step": 15170 }, { "epoch": 0.009754940979219943, "grad_norm": 0.26637405157089233, "learning_rate": 1.9595414749717363e-05, "loss": 1.8244, "step": 15180 }, { "epoch": 0.010296882144732162, "grad_norm": 0.2925315201282501, "learning_rate": 1.9539015405533935e-05, "loss": 1.8162, "step": 15190 }, { "epoch": 0.010838823310244381, "grad_norm": 0.3221004009246826, "learning_rate": 1.9482762643466504e-05, "loss": 1.8219, "step": 15200 }, { "epoch": 0.0113807644757566, "grad_norm": 0.3065180480480194, "learning_rate": 1.9426656696042424e-05, "loss": 1.8196, "step": 15210 }, { "epoch": 0.01192270564126882, "grad_norm": 0.3092059791088104, "learning_rate": 1.9370697795182187e-05, "loss": 1.8217, "step": 15220 }, { "epoch": 0.01219367622402493, "eval_loss": 2.6005666255950928, "eval_runtime": 22.3369, "eval_samples_per_second": 223.845, "eval_steps_per_second": 1.209, "step": 15225 }, { "epoch": 0.012464646806781039, "grad_norm": 0.3684033155441284, "learning_rate": 1.9314886172198426e-05, "loss": 1.8316, "step": 15230 }, { "epoch": 0.013006587972293258, "grad_norm": 0.3613276779651642, "learning_rate": 1.9259222057794996e-05, "loss": 1.8332, "step": 15240 }, { "epoch": 0.013548529137805477, "grad_norm": 0.26722803711891174, "learning_rate": 1.9203705682066007e-05, "loss": 1.8249, "step": 15250 }, { "epoch": 0.014090470303317696, "grad_norm": 0.30705708265304565, "learning_rate": 1.9148337274494862e-05, "loss": 1.8228, "step": 15260 }, { "epoch": 0.014632411468829916, "grad_norm": 0.32335802912712097, "learning_rate": 1.9093117063953354e-05, "loss": 1.8176, "step": 15270 }, { "epoch": 0.015174352634342135, "grad_norm": 0.33310064673423767, "learning_rate": 1.9038045278700653e-05, "loss": 1.824, "step": 15280 }, { "epoch": 0.015716293799854352, "grad_norm": 0.27640849351882935, "learning_rate": 1.8983122146382406e-05, "loss": 1.8284, "step": 15290 }, { "epoch": 0.01625823496536657, "grad_norm": 0.27379941940307617, "learning_rate": 1.892834789402979e-05, "loss": 1.8175, "step": 15300 }, { "epoch": 0.01680017613087879, "grad_norm": 0.32370367646217346, "learning_rate": 1.8873722748058552e-05, "loss": 1.8272, "step": 15310 }, { "epoch": 0.016908564363981235, "eval_loss": 2.6025352478027344, "eval_runtime": 25.0337, "eval_samples_per_second": 199.731, "eval_steps_per_second": 1.079, "step": 15312 }, { "epoch": 0.01734211729639101, "grad_norm": 0.3098716139793396, "learning_rate": 1.8819246934268125e-05, "loss": 1.8247, "step": 15320 }, { "epoch": 0.01788405846190323, "grad_norm": 0.2977130115032196, "learning_rate": 1.8764920677840632e-05, "loss": 1.8229, "step": 15330 }, { "epoch": 0.018425999627415448, "grad_norm": 0.28470513224601746, "learning_rate": 1.871074420333999e-05, "loss": 1.817, "step": 15340 }, { "epoch": 0.018967940792927667, "grad_norm": 0.2921582758426666, "learning_rate": 1.8656717734710975e-05, "loss": 1.8392, "step": 15350 }, { "epoch": 0.019509881958439886, "grad_norm": 0.30813974142074585, "learning_rate": 1.8602841495278294e-05, "loss": 1.8216, "step": 15360 }, { "epoch": 0.020051823123952105, "grad_norm": 0.27432745695114136, "learning_rate": 1.8549115707745656e-05, "loss": 1.8257, "step": 15370 }, { "epoch": 0.020593764289464325, "grad_norm": 0.2718896269798279, "learning_rate": 1.84955405941949e-05, "loss": 1.8297, "step": 15380 }, { "epoch": 0.021135705454976544, "grad_norm": 0.2647911310195923, "learning_rate": 1.8442116376084985e-05, "loss": 1.8202, "step": 15390 }, { "epoch": 0.021623452503937542, "eval_loss": 2.602659225463867, "eval_runtime": 26.4511, "eval_samples_per_second": 189.028, "eval_steps_per_second": 1.021, "step": 15399 }, { "epoch": 0.021677646620488763, "grad_norm": 0.3035239279270172, "learning_rate": 1.8388843274251156e-05, "loss": 1.825, "step": 15400 }, { "epoch": 0.022219587786000982, "grad_norm": 0.31842127442359924, "learning_rate": 1.8335721508903987e-05, "loss": 1.8171, "step": 15410 }, { "epoch": 0.0227615289515132, "grad_norm": 0.3104911744594574, "learning_rate": 1.8282751299628486e-05, "loss": 1.8218, "step": 15420 }, { "epoch": 0.02330347011702542, "grad_norm": 0.3664672076702118, "learning_rate": 1.822993286538321e-05, "loss": 1.8127, "step": 15430 }, { "epoch": 0.02384541128253764, "grad_norm": 0.2692658007144928, "learning_rate": 1.8177266424499313e-05, "loss": 1.8264, "step": 15440 }, { "epoch": 0.02438735244804986, "grad_norm": 0.3049471378326416, "learning_rate": 1.812475219467966e-05, "loss": 1.8147, "step": 15450 }, { "epoch": 0.024929293613562078, "grad_norm": 0.32675981521606445, "learning_rate": 1.8072390392997954e-05, "loss": 1.8286, "step": 15460 }, { "epoch": 0.025471234779074297, "grad_norm": 0.2658865451812744, "learning_rate": 1.8020181235897797e-05, "loss": 1.8234, "step": 15470 }, { "epoch": 0.026013175944586516, "grad_norm": 0.28163376450538635, "learning_rate": 1.796812493919185e-05, "loss": 1.8217, "step": 15480 }, { "epoch": 0.026338340643893846, "eval_loss": 2.600201368331909, "eval_runtime": 22.1652, "eval_samples_per_second": 225.579, "eval_steps_per_second": 1.218, "step": 15486 }, { "epoch": 0.026555117110098735, "grad_norm": 0.32032302021980286, "learning_rate": 1.791622171806088e-05, "loss": 1.8177, "step": 15490 }, { "epoch": 0.027097058275610954, "grad_norm": 0.28977230191230774, "learning_rate": 1.78644717870529e-05, "loss": 1.8137, "step": 15500 }, { "epoch": 0.027638999441123174, "grad_norm": 0.332872599363327, "learning_rate": 1.781287536008229e-05, "loss": 1.8214, "step": 15510 }, { "epoch": 0.028180940606635393, "grad_norm": 0.27174341678619385, "learning_rate": 1.77614326504289e-05, "loss": 1.8277, "step": 15520 }, { "epoch": 0.028722881772147612, "grad_norm": 0.27996736764907837, "learning_rate": 1.771014387073715e-05, "loss": 1.8163, "step": 15530 }, { "epoch": 0.02926482293765983, "grad_norm": 0.26476001739501953, "learning_rate": 1.765900923301523e-05, "loss": 1.8268, "step": 15540 }, { "epoch": 0.02980676410317205, "grad_norm": 0.3071003556251526, "learning_rate": 1.760802894863412e-05, "loss": 1.8231, "step": 15550 }, { "epoch": 0.03034870526868427, "grad_norm": 0.31128978729248047, "learning_rate": 1.7557203228326737e-05, "loss": 1.8213, "step": 15560 }, { "epoch": 0.03089064643419649, "grad_norm": 0.27923986315727234, "learning_rate": 1.7506532282187166e-05, "loss": 1.8192, "step": 15570 }, { "epoch": 0.031053228783850154, "eval_loss": 2.6021640300750732, "eval_runtime": 22.1843, "eval_samples_per_second": 225.384, "eval_steps_per_second": 1.217, "step": 15573 }, { "epoch": 0.031432587599708704, "grad_norm": 0.324139267206192, "learning_rate": 1.7456016319669642e-05, "loss": 1.827, "step": 15580 }, { "epoch": 0.03197452876522092, "grad_norm": 0.27704742550849915, "learning_rate": 1.7405655549587816e-05, "loss": 1.8165, "step": 15590 }, { "epoch": 0.03251646993073314, "grad_norm": 0.3016372323036194, "learning_rate": 1.7355450180113808e-05, "loss": 1.8272, "step": 15600 }, { "epoch": 0.03305841109624536, "grad_norm": 0.2791915833950043, "learning_rate": 1.730540041877733e-05, "loss": 1.8166, "step": 15610 }, { "epoch": 0.03360035226175758, "grad_norm": 0.40646281838417053, "learning_rate": 1.7255506472464936e-05, "loss": 1.828, "step": 15620 }, { "epoch": 0.0341422934272698, "grad_norm": 0.32504037022590637, "learning_rate": 1.7205768547419077e-05, "loss": 1.8265, "step": 15630 }, { "epoch": 0.03468423459278202, "grad_norm": 0.2760820984840393, "learning_rate": 1.7156186849237244e-05, "loss": 1.8341, "step": 15640 }, { "epoch": 0.03522617575829424, "grad_norm": 0.2752606272697449, "learning_rate": 1.7106761582871205e-05, "loss": 1.8145, "step": 15650 }, { "epoch": 0.03576811692380646, "grad_norm": 0.3181304633617401, "learning_rate": 1.7057492952626025e-05, "loss": 1.8322, "step": 15660 }, { "epoch": 0.03576811692380646, "eval_loss": 2.6028239727020264, "eval_runtime": 21.9861, "eval_samples_per_second": 227.417, "eval_steps_per_second": 1.228, "step": 15660 }, { "epoch": 0.03631005808931868, "grad_norm": 0.2869409918785095, "learning_rate": 1.7008381162159358e-05, "loss": 1.8151, "step": 15670 }, { "epoch": 0.036851999254830896, "grad_norm": 0.30195239186286926, "learning_rate": 1.6959426414480516e-05, "loss": 1.8268, "step": 15680 }, { "epoch": 0.037393940420343115, "grad_norm": 0.2631888687610626, "learning_rate": 1.6910628911949644e-05, "loss": 1.8166, "step": 15690 }, { "epoch": 0.037935881585855334, "grad_norm": 0.29582157731056213, "learning_rate": 1.6861988856276946e-05, "loss": 1.8089, "step": 15700 }, { "epoch": 0.03847782275136755, "grad_norm": 0.27693384885787964, "learning_rate": 1.6813506448521727e-05, "loss": 1.812, "step": 15710 }, { "epoch": 0.03901976391687977, "grad_norm": 0.282247930765152, "learning_rate": 1.6765181889091675e-05, "loss": 1.819, "step": 15720 }, { "epoch": 0.03956170508239199, "grad_norm": 0.3580980896949768, "learning_rate": 1.671701537774202e-05, "loss": 1.8174, "step": 15730 }, { "epoch": 0.04010364624790421, "grad_norm": 0.2820577621459961, "learning_rate": 1.666900711357463e-05, "loss": 1.8327, "step": 15740 }, { "epoch": 0.04048300506376277, "eval_loss": 2.6001157760620117, "eval_runtime": 25.7544, "eval_samples_per_second": 194.141, "eval_steps_per_second": 1.048, "step": 15747 }, { "epoch": 0.04064558741341643, "grad_norm": 0.3680673837661743, "learning_rate": 1.6621157295037298e-05, "loss": 1.8087, "step": 15750 }, { "epoch": 0.04118752857892865, "grad_norm": 0.3296014964580536, "learning_rate": 1.65734661199228e-05, "loss": 1.8183, "step": 15760 }, { "epoch": 0.04172946974444087, "grad_norm": 0.29352930188179016, "learning_rate": 1.652593378536816e-05, "loss": 1.8075, "step": 15770 }, { "epoch": 0.04227141090995309, "grad_norm": 0.36422020196914673, "learning_rate": 1.6478560487853866e-05, "loss": 1.8151, "step": 15780 }, { "epoch": 0.04281335207546531, "grad_norm": 0.30587121844291687, "learning_rate": 1.6431346423202945e-05, "loss": 1.8174, "step": 15790 }, { "epoch": 0.043355293240977526, "grad_norm": 0.29669710993766785, "learning_rate": 1.6384291786580247e-05, "loss": 1.8186, "step": 15800 }, { "epoch": 0.043897234406489745, "grad_norm": 0.3331429362297058, "learning_rate": 1.633739677249159e-05, "loss": 1.8162, "step": 15810 }, { "epoch": 0.044439175572001964, "grad_norm": 0.3026489019393921, "learning_rate": 1.6290661574782995e-05, "loss": 1.8103, "step": 15820 }, { "epoch": 0.04498111673751418, "grad_norm": 0.29594072699546814, "learning_rate": 1.624408638663985e-05, "loss": 1.8055, "step": 15830 }, { "epoch": 0.04519789320371907, "eval_loss": 2.599421977996826, "eval_runtime": 23.9096, "eval_samples_per_second": 209.121, "eval_steps_per_second": 1.129, "step": 15834 }, { "epoch": 0.0455230579030264, "grad_norm": 0.286182165145874, "learning_rate": 1.619767140058614e-05, "loss": 1.8213, "step": 15840 }, { "epoch": 0.04606499906853862, "grad_norm": 0.3165307343006134, "learning_rate": 1.6151416808483603e-05, "loss": 1.816, "step": 15850 }, { "epoch": 0.04660694023405084, "grad_norm": 0.3465806245803833, "learning_rate": 1.6105322801531005e-05, "loss": 1.8199, "step": 15860 }, { "epoch": 0.04714888139956306, "grad_norm": 0.31671446561813354, "learning_rate": 1.605938957026329e-05, "loss": 1.8164, "step": 15870 }, { "epoch": 0.04769082256507528, "grad_norm": 0.2874152362346649, "learning_rate": 1.6013617304550827e-05, "loss": 1.8194, "step": 15880 }, { "epoch": 0.0482327637305875, "grad_norm": 0.28032568097114563, "learning_rate": 1.5968006193598626e-05, "loss": 1.8211, "step": 15890 }, { "epoch": 0.04877470489609972, "grad_norm": 0.3768242597579956, "learning_rate": 1.5922556425945532e-05, "loss": 1.8175, "step": 15900 }, { "epoch": 0.049316646061611936, "grad_norm": 0.32226991653442383, "learning_rate": 1.5877268189463456e-05, "loss": 1.8246, "step": 15910 }, { "epoch": 0.049858587227124156, "grad_norm": 0.29384303092956543, "learning_rate": 1.583214167135661e-05, "loss": 1.8125, "step": 15920 }, { "epoch": 0.049912781343675376, "eval_loss": 2.5980074405670166, "eval_runtime": 24.6083, "eval_samples_per_second": 203.184, "eval_steps_per_second": 1.097, "step": 15921 }, { "epoch": 0.050400528392636375, "grad_norm": 0.30090948939323425, "learning_rate": 1.57871770581607e-05, "loss": 1.8066, "step": 15930 }, { "epoch": 0.050942469558148594, "grad_norm": 0.28371649980545044, "learning_rate": 1.5742374535742233e-05, "loss": 1.8208, "step": 15940 }, { "epoch": 0.05148441072366081, "grad_norm": 0.2772349417209625, "learning_rate": 1.569773428929765e-05, "loss": 1.8237, "step": 15950 }, { "epoch": 0.05202635188917303, "grad_norm": 0.28294837474823, "learning_rate": 1.5653256503352603e-05, "loss": 1.8141, "step": 15960 }, { "epoch": 0.05256829305468525, "grad_norm": 0.264106810092926, "learning_rate": 1.5608941361761224e-05, "loss": 1.8101, "step": 15970 }, { "epoch": 0.05311023422019747, "grad_norm": 0.2991582155227661, "learning_rate": 1.5564789047705296e-05, "loss": 1.8162, "step": 15980 }, { "epoch": 0.05365217538570969, "grad_norm": 0.2891395688056946, "learning_rate": 1.5520799743693585e-05, "loss": 1.8405, "step": 15990 }, { "epoch": 0.05419411655122191, "grad_norm": 0.29393982887268066, "learning_rate": 1.5476973631561003e-05, "loss": 1.8179, "step": 16000 }, { "epoch": 0.05462766948363168, "eval_loss": 2.5983400344848633, "eval_runtime": 24.2547, "eval_samples_per_second": 206.145, "eval_steps_per_second": 1.113, "step": 16008 }, { "epoch": 0.05473605771673413, "grad_norm": 0.2763077914714813, "learning_rate": 1.5433310892467897e-05, "loss": 1.8226, "step": 16010 }, { "epoch": 0.05527799888224635, "grad_norm": 0.27592843770980835, "learning_rate": 1.5389811706899284e-05, "loss": 1.8173, "step": 16020 }, { "epoch": 0.055819940047758566, "grad_norm": 0.31375107169151306, "learning_rate": 1.5346476254664132e-05, "loss": 1.8118, "step": 16030 }, { "epoch": 0.056361881213270786, "grad_norm": 0.3110925853252411, "learning_rate": 1.5303304714894568e-05, "loss": 1.8171, "step": 16040 }, { "epoch": 0.056903822378783005, "grad_norm": 0.2970430254936218, "learning_rate": 1.526029726604521e-05, "loss": 1.8095, "step": 16050 }, { "epoch": 0.057445763544295224, "grad_norm": 0.27963101863861084, "learning_rate": 1.5217454085892358e-05, "loss": 1.8202, "step": 16060 }, { "epoch": 0.05798770470980744, "grad_norm": 0.31473496556282043, "learning_rate": 1.5174775351533289e-05, "loss": 1.831, "step": 16070 }, { "epoch": 0.05852964587531966, "grad_norm": 0.2634471654891968, "learning_rate": 1.5132261239385533e-05, "loss": 1.8102, "step": 16080 }, { "epoch": 0.05907158704083188, "grad_norm": 0.36205658316612244, "learning_rate": 1.5089911925186124e-05, "loss": 1.814, "step": 16090 }, { "epoch": 0.05934255762358799, "eval_loss": 2.5985541343688965, "eval_runtime": 28.1025, "eval_samples_per_second": 177.92, "eval_steps_per_second": 0.961, "step": 16095 }, { "epoch": 0.0596135282063441, "grad_norm": 0.5245100855827332, "learning_rate": 1.5047727583990898e-05, "loss": 1.8204, "step": 16100 }, { "epoch": 0.06015546937185632, "grad_norm": 0.316148966550827, "learning_rate": 1.5005708390173756e-05, "loss": 1.8179, "step": 16110 }, { "epoch": 0.06069741053736854, "grad_norm": 0.29313069581985474, "learning_rate": 1.4963854517425934e-05, "loss": 1.8179, "step": 16120 }, { "epoch": 0.06123935170288076, "grad_norm": 0.26670292019844055, "learning_rate": 1.4922166138755289e-05, "loss": 1.8124, "step": 16130 }, { "epoch": 0.06178129286839298, "grad_norm": 0.3401847183704376, "learning_rate": 1.4880643426485602e-05, "loss": 1.8163, "step": 16140 }, { "epoch": 0.062323234033905196, "grad_norm": 0.31569597125053406, "learning_rate": 1.4839286552255843e-05, "loss": 1.8118, "step": 16150 }, { "epoch": 0.06286517519941741, "grad_norm": 0.2905227839946747, "learning_rate": 1.4798095687019482e-05, "loss": 1.8281, "step": 16160 }, { "epoch": 0.06340711636492963, "grad_norm": 0.27679443359375, "learning_rate": 1.4757071001043765e-05, "loss": 1.8123, "step": 16170 }, { "epoch": 0.06394905753044185, "grad_norm": 0.2783887982368469, "learning_rate": 1.4716212663909016e-05, "loss": 1.806, "step": 16180 }, { "epoch": 0.0640574457635443, "eval_loss": 2.598423957824707, "eval_runtime": 22.218, "eval_samples_per_second": 225.043, "eval_steps_per_second": 1.215, "step": 16182 }, { "epoch": 0.06449099869595407, "grad_norm": 0.2765551507472992, "learning_rate": 1.4675520844507932e-05, "loss": 1.8033, "step": 16190 }, { "epoch": 0.06503293986146629, "grad_norm": 0.2806571424007416, "learning_rate": 1.4634995711044892e-05, "loss": 1.8129, "step": 16200 }, { "epoch": 0.0655748810269785, "grad_norm": 0.3304462134838104, "learning_rate": 1.4594637431035272e-05, "loss": 1.8113, "step": 16210 }, { "epoch": 0.06611682219249072, "grad_norm": 0.29869329929351807, "learning_rate": 1.4554446171304736e-05, "loss": 1.8068, "step": 16220 }, { "epoch": 0.06665876335800294, "grad_norm": 0.2936442196369171, "learning_rate": 1.451442209798852e-05, "loss": 1.8089, "step": 16230 }, { "epoch": 0.06720070452351516, "grad_norm": 0.2957615256309509, "learning_rate": 1.4474565376530819e-05, "loss": 1.7961, "step": 16240 }, { "epoch": 0.06774264568902738, "grad_norm": 0.2832295298576355, "learning_rate": 1.4434876171684023e-05, "loss": 1.814, "step": 16250 }, { "epoch": 0.0682845868545396, "grad_norm": 0.2817908525466919, "learning_rate": 1.4395354647508113e-05, "loss": 1.8179, "step": 16260 }, { "epoch": 0.0687723339035006, "eval_loss": 2.5987160205841064, "eval_runtime": 31.5857, "eval_samples_per_second": 158.299, "eval_steps_per_second": 0.855, "step": 16269 }, { "epoch": 0.06882652802005182, "grad_norm": 0.27101555466651917, "learning_rate": 1.4356000967369915e-05, "loss": 1.8071, "step": 16270 }, { "epoch": 0.06936846918556404, "grad_norm": 0.29149556159973145, "learning_rate": 1.4316815293942438e-05, "loss": 1.8095, "step": 16280 }, { "epoch": 0.06991041035107626, "grad_norm": 0.3395109176635742, "learning_rate": 1.4277797789204242e-05, "loss": 1.8077, "step": 16290 }, { "epoch": 0.07045235151658848, "grad_norm": 0.2934078276157379, "learning_rate": 1.4238948614438738e-05, "loss": 1.8234, "step": 16300 }, { "epoch": 0.0709942926821007, "grad_norm": 0.40953195095062256, "learning_rate": 1.4200267930233512e-05, "loss": 1.8193, "step": 16310 }, { "epoch": 0.07153623384761292, "grad_norm": 0.30236881971359253, "learning_rate": 1.4161755896479711e-05, "loss": 1.804, "step": 16320 }, { "epoch": 0.07207817501312513, "grad_norm": 0.3711455762386322, "learning_rate": 1.4123412672371278e-05, "loss": 1.801, "step": 16330 }, { "epoch": 0.07262011617863735, "grad_norm": 0.33262383937835693, "learning_rate": 1.4085238416404434e-05, "loss": 1.8189, "step": 16340 }, { "epoch": 0.07316205734414957, "grad_norm": 0.2707761228084564, "learning_rate": 1.4047233286376901e-05, "loss": 1.8234, "step": 16350 }, { "epoch": 0.07348722204345691, "eval_loss": 2.5961382389068604, "eval_runtime": 23.0319, "eval_samples_per_second": 217.09, "eval_steps_per_second": 1.172, "step": 16356 }, { "epoch": 0.07370399850966179, "grad_norm": 0.277217835187912, "learning_rate": 1.4009397439387317e-05, "loss": 1.8101, "step": 16360 }, { "epoch": 0.07424593967517401, "grad_norm": 0.324625700712204, "learning_rate": 1.3971731031834595e-05, "loss": 1.8177, "step": 16370 }, { "epoch": 0.07478788084068623, "grad_norm": 0.27004820108413696, "learning_rate": 1.3934234219417198e-05, "loss": 1.8239, "step": 16380 }, { "epoch": 0.07532982200619845, "grad_norm": 0.31389129161834717, "learning_rate": 1.3896907157132564e-05, "loss": 1.8044, "step": 16390 }, { "epoch": 0.07587176317171067, "grad_norm": 0.334043025970459, "learning_rate": 1.3859749999276485e-05, "loss": 1.8114, "step": 16400 }, { "epoch": 0.07641370433722289, "grad_norm": 0.28280240297317505, "learning_rate": 1.382276289944239e-05, "loss": 1.81, "step": 16410 }, { "epoch": 0.0769556455027351, "grad_norm": 0.2850959002971649, "learning_rate": 1.3785946010520792e-05, "loss": 1.8064, "step": 16420 }, { "epoch": 0.07749758666824733, "grad_norm": 0.33790093660354614, "learning_rate": 1.3749299484698571e-05, "loss": 1.8038, "step": 16430 }, { "epoch": 0.07803952783375954, "grad_norm": 0.3782959282398224, "learning_rate": 1.3712823473458422e-05, "loss": 1.814, "step": 16440 }, { "epoch": 0.0782021101834132, "eval_loss": 2.5989608764648438, "eval_runtime": 23.6056, "eval_samples_per_second": 211.814, "eval_steps_per_second": 1.144, "step": 16443 }, { "epoch": 0.07858146899927176, "grad_norm": 0.34481021761894226, "learning_rate": 1.36765181275782e-05, "loss": 1.8129, "step": 16450 }, { "epoch": 0.07912341016478398, "grad_norm": 0.2746882438659668, "learning_rate": 1.3640383597130288e-05, "loss": 1.8063, "step": 16460 }, { "epoch": 0.0796653513302962, "grad_norm": 0.38598158955574036, "learning_rate": 1.3604420031480968e-05, "loss": 1.8072, "step": 16470 }, { "epoch": 0.08020729249580842, "grad_norm": 0.267660915851593, "learning_rate": 1.356862757928985e-05, "loss": 1.7979, "step": 16480 }, { "epoch": 0.08074923366132064, "grad_norm": 0.33238664269447327, "learning_rate": 1.3533006388509187e-05, "loss": 1.8071, "step": 16490 }, { "epoch": 0.08129117482683286, "grad_norm": 0.28862419724464417, "learning_rate": 1.3497556606383354e-05, "loss": 1.8117, "step": 16500 }, { "epoch": 0.08183311599234508, "grad_norm": 0.36335626244544983, "learning_rate": 1.3462278379448148e-05, "loss": 1.8215, "step": 16510 }, { "epoch": 0.0823750571578573, "grad_norm": 0.31064873933792114, "learning_rate": 1.3427171853530232e-05, "loss": 1.803, "step": 16520 }, { "epoch": 0.08291699832336952, "grad_norm": 0.3698948919773102, "learning_rate": 1.3392237173746527e-05, "loss": 1.8072, "step": 16530 }, { "epoch": 0.08291699832336952, "eval_loss": 2.5981791019439697, "eval_runtime": 21.9635, "eval_samples_per_second": 227.65, "eval_steps_per_second": 1.229, "step": 16530 }, { "epoch": 0.08345893948888174, "grad_norm": 0.272014319896698, "learning_rate": 1.3357474484503613e-05, "loss": 1.8132, "step": 16540 }, { "epoch": 0.08400088065439396, "grad_norm": 0.2674700617790222, "learning_rate": 1.3322883929497105e-05, "loss": 1.8112, "step": 16550 }, { "epoch": 0.08454282181990617, "grad_norm": 0.3143172264099121, "learning_rate": 1.3288465651711113e-05, "loss": 1.8035, "step": 16560 }, { "epoch": 0.0850847629854184, "grad_norm": 0.2932548224925995, "learning_rate": 1.3254219793417596e-05, "loss": 1.8099, "step": 16570 }, { "epoch": 0.08562670415093061, "grad_norm": 0.28410300612449646, "learning_rate": 1.3220146496175808e-05, "loss": 1.8086, "step": 16580 }, { "epoch": 0.08616864531644283, "grad_norm": 0.2889862358570099, "learning_rate": 1.3186245900831692e-05, "loss": 1.8116, "step": 16590 }, { "epoch": 0.08671058648195505, "grad_norm": 0.26894381642341614, "learning_rate": 1.3152518147517309e-05, "loss": 1.8031, "step": 16600 }, { "epoch": 0.08725252764746727, "grad_norm": 0.265250027179718, "learning_rate": 1.3118963375650279e-05, "loss": 1.8048, "step": 16610 }, { "epoch": 0.08763188646332583, "eval_loss": 2.597559690475464, "eval_runtime": 27.4632, "eval_samples_per_second": 182.062, "eval_steps_per_second": 0.983, "step": 16617 }, { "epoch": 0.08779446881297949, "grad_norm": 0.27796122431755066, "learning_rate": 1.3085581723933146e-05, "loss": 1.8049, "step": 16620 }, { "epoch": 0.08833640997849171, "grad_norm": 0.3169678747653961, "learning_rate": 1.3052373330352873e-05, "loss": 1.8094, "step": 16630 }, { "epoch": 0.08887835114400393, "grad_norm": 0.3725588321685791, "learning_rate": 1.3019338332180223e-05, "loss": 1.8171, "step": 16640 }, { "epoch": 0.08942029230951615, "grad_norm": 0.3112504780292511, "learning_rate": 1.2986476865969215e-05, "loss": 1.8162, "step": 16650 }, { "epoch": 0.08996223347502837, "grad_norm": 0.33968761563301086, "learning_rate": 1.2953789067556545e-05, "loss": 1.8147, "step": 16660 }, { "epoch": 0.09050417464054059, "grad_norm": 0.3981771469116211, "learning_rate": 1.2921275072061061e-05, "loss": 1.8085, "step": 16670 }, { "epoch": 0.0910461158060528, "grad_norm": 0.35043928027153015, "learning_rate": 1.2888935013883141e-05, "loss": 1.8226, "step": 16680 }, { "epoch": 0.09158805697156502, "grad_norm": 0.26825150847435, "learning_rate": 1.2856769026704188e-05, "loss": 1.8066, "step": 16690 }, { "epoch": 0.09212999813707724, "grad_norm": 0.27858951687812805, "learning_rate": 1.2824777243486063e-05, "loss": 1.8188, "step": 16700 }, { "epoch": 0.09234677460328213, "eval_loss": 2.594632863998413, "eval_runtime": 22.2955, "eval_samples_per_second": 224.26, "eval_steps_per_second": 1.211, "step": 16704 }, { "epoch": 0.09267193930258946, "grad_norm": 0.27273324131965637, "learning_rate": 1.2792959796470527e-05, "loss": 1.804, "step": 16710 }, { "epoch": 0.09321388046810168, "grad_norm": 0.3499986529350281, "learning_rate": 1.2761316817178725e-05, "loss": 1.8051, "step": 16720 }, { "epoch": 0.0937558216336139, "grad_norm": 0.26582643389701843, "learning_rate": 1.2729848436410596e-05, "loss": 1.8208, "step": 16730 }, { "epoch": 0.09429776279912612, "grad_norm": 0.3342505693435669, "learning_rate": 1.2698554784244355e-05, "loss": 1.8129, "step": 16740 }, { "epoch": 0.09483970396463834, "grad_norm": 0.28335967659950256, "learning_rate": 1.2667435990035972e-05, "loss": 1.8078, "step": 16750 }, { "epoch": 0.09538164513015056, "grad_norm": 0.2727366089820862, "learning_rate": 1.2636492182418594e-05, "loss": 1.8062, "step": 16760 }, { "epoch": 0.09592358629566278, "grad_norm": 0.31147339940071106, "learning_rate": 1.2605723489302087e-05, "loss": 1.8107, "step": 16770 }, { "epoch": 0.096465527461175, "grad_norm": 0.2873106002807617, "learning_rate": 1.2575130037872418e-05, "loss": 1.8043, "step": 16780 }, { "epoch": 0.09700746862668722, "grad_norm": 0.32366591691970825, "learning_rate": 1.2544711954591192e-05, "loss": 1.8257, "step": 16790 }, { "epoch": 0.09706166274323844, "eval_loss": 2.5952253341674805, "eval_runtime": 27.6353, "eval_samples_per_second": 180.928, "eval_steps_per_second": 0.977, "step": 16791 }, { "epoch": 0.09754940979219943, "grad_norm": 0.2702518105506897, "learning_rate": 1.2514469365195094e-05, "loss": 1.8183, "step": 16800 }, { "epoch": 0.09809135095771165, "grad_norm": 0.28163909912109375, "learning_rate": 1.2484402394695408e-05, "loss": 1.8045, "step": 16810 }, { "epoch": 0.09863329212322387, "grad_norm": 0.2960811257362366, "learning_rate": 1.2454511167377447e-05, "loss": 1.8087, "step": 16820 }, { "epoch": 0.09917523328873609, "grad_norm": 0.28118273615837097, "learning_rate": 1.2424795806800102e-05, "loss": 1.812, "step": 16830 }, { "epoch": 0.09971717445424831, "grad_norm": 0.30942973494529724, "learning_rate": 1.2395256435795278e-05, "loss": 1.8082, "step": 16840 }, { "epoch": 0.10025911561976053, "grad_norm": 0.33988916873931885, "learning_rate": 1.2365893176467404e-05, "loss": 1.812, "step": 16850 }, { "epoch": 0.10080105678527275, "grad_norm": 0.3102359473705292, "learning_rate": 1.2336706150192946e-05, "loss": 1.8144, "step": 16860 }, { "epoch": 0.10134299795078497, "grad_norm": 0.3185904920101166, "learning_rate": 1.2307695477619872e-05, "loss": 1.805, "step": 16870 }, { "epoch": 0.10177655088319475, "eval_loss": 2.5944910049438477, "eval_runtime": 22.1385, "eval_samples_per_second": 225.85, "eval_steps_per_second": 1.22, "step": 16878 }, { "epoch": 0.10188493911629719, "grad_norm": 0.28458863496780396, "learning_rate": 1.2278861278667187e-05, "loss": 1.795, "step": 16880 }, { "epoch": 0.10242688028180941, "grad_norm": 0.31006863713264465, "learning_rate": 1.2250203672524424e-05, "loss": 1.7943, "step": 16890 }, { "epoch": 0.10296882144732163, "grad_norm": 0.28029975295066833, "learning_rate": 1.2221722777651119e-05, "loss": 1.8163, "step": 16900 }, { "epoch": 0.10351076261283385, "grad_norm": 0.27603140473365784, "learning_rate": 1.219341871177639e-05, "loss": 1.814, "step": 16910 }, { "epoch": 0.10405270377834606, "grad_norm": 0.2757927179336548, "learning_rate": 1.2165291591898383e-05, "loss": 1.8138, "step": 16920 }, { "epoch": 0.10459464494385828, "grad_norm": 0.2880472242832184, "learning_rate": 1.2137341534283842e-05, "loss": 1.8064, "step": 16930 }, { "epoch": 0.1051365861093705, "grad_norm": 0.2712019979953766, "learning_rate": 1.2109568654467594e-05, "loss": 1.8005, "step": 16940 }, { "epoch": 0.10567852727488272, "grad_norm": 0.2719828188419342, "learning_rate": 1.2081973067252051e-05, "loss": 1.8146, "step": 16950 }, { "epoch": 0.10622046844039494, "grad_norm": 0.3635210394859314, "learning_rate": 1.205455488670681e-05, "loss": 1.8189, "step": 16960 }, { "epoch": 0.10649143902315104, "eval_loss": 2.596562385559082, "eval_runtime": 28.8968, "eval_samples_per_second": 173.029, "eval_steps_per_second": 0.934, "step": 16965 }, { "epoch": 0.10676240960590716, "grad_norm": 0.2864342927932739, "learning_rate": 1.2027314226168121e-05, "loss": 1.8097, "step": 16970 }, { "epoch": 0.10730435077141938, "grad_norm": 0.3084823489189148, "learning_rate": 1.2000251198238424e-05, "loss": 1.8093, "step": 16980 }, { "epoch": 0.1078462919369316, "grad_norm": 0.3098483383655548, "learning_rate": 1.1973365914785909e-05, "loss": 1.8137, "step": 16990 }, { "epoch": 0.10838823310244382, "grad_norm": 0.30631518363952637, "learning_rate": 1.1946658486944022e-05, "loss": 1.8042, "step": 17000 }, { "epoch": 0.10893017426795604, "grad_norm": 0.33063915371894836, "learning_rate": 1.1920129025111034e-05, "loss": 1.8086, "step": 17010 }, { "epoch": 0.10947211543346826, "grad_norm": 0.2645542025566101, "learning_rate": 1.1893777638949576e-05, "loss": 1.8219, "step": 17020 }, { "epoch": 0.11001405659898048, "grad_norm": 0.27368003129959106, "learning_rate": 1.1867604437386164e-05, "loss": 1.8101, "step": 17030 }, { "epoch": 0.1105559977644927, "grad_norm": 0.2701696455478668, "learning_rate": 1.1841609528610796e-05, "loss": 1.8174, "step": 17040 }, { "epoch": 0.11109793893000491, "grad_norm": 0.28612881898880005, "learning_rate": 1.1815793020076448e-05, "loss": 1.8077, "step": 17050 }, { "epoch": 0.11120632716310735, "eval_loss": 2.595874071121216, "eval_runtime": 29.3622, "eval_samples_per_second": 170.287, "eval_steps_per_second": 0.92, "step": 17052 }, { "epoch": 0.11163988009551713, "grad_norm": 0.33459609746932983, "learning_rate": 1.1790155018498666e-05, "loss": 1.7929, "step": 17060 }, { "epoch": 0.11218182126102935, "grad_norm": 0.27914193272590637, "learning_rate": 1.176469562985513e-05, "loss": 1.8074, "step": 17070 }, { "epoch": 0.11272376242654157, "grad_norm": 0.26985886693000793, "learning_rate": 1.1739414959385191e-05, "loss": 1.8024, "step": 17080 }, { "epoch": 0.11326570359205379, "grad_norm": 0.4287962019443512, "learning_rate": 1.1714313111589445e-05, "loss": 1.7996, "step": 17090 }, { "epoch": 0.11380764475756601, "grad_norm": 0.29455164074897766, "learning_rate": 1.1689390190229312e-05, "loss": 1.8117, "step": 17100 }, { "epoch": 0.11434958592307823, "grad_norm": 0.26276010274887085, "learning_rate": 1.1664646298326582e-05, "loss": 1.8055, "step": 17110 }, { "epoch": 0.11489152708859045, "grad_norm": 0.2993430495262146, "learning_rate": 1.1640081538163036e-05, "loss": 1.8146, "step": 17120 }, { "epoch": 0.11543346825410267, "grad_norm": 0.3164224922657013, "learning_rate": 1.1615696011279965e-05, "loss": 1.8064, "step": 17130 }, { "epoch": 0.11592121530306367, "eval_loss": 2.596324920654297, "eval_runtime": 25.6516, "eval_samples_per_second": 194.919, "eval_steps_per_second": 1.053, "step": 17139 }, { "epoch": 0.11597540941961489, "grad_norm": 0.3307249844074249, "learning_rate": 1.1591489818477795e-05, "loss": 1.8039, "step": 17140 }, { "epoch": 0.1165173505851271, "grad_norm": 0.29287073016166687, "learning_rate": 1.1567463059815642e-05, "loss": 1.8227, "step": 17150 }, { "epoch": 0.11705929175063932, "grad_norm": 0.282429039478302, "learning_rate": 1.1543615834610914e-05, "loss": 1.8041, "step": 17160 }, { "epoch": 0.11760123291615154, "grad_norm": 0.2842390835285187, "learning_rate": 1.1519948241438899e-05, "loss": 1.8088, "step": 17170 }, { "epoch": 0.11814317408166376, "grad_norm": 0.2744342088699341, "learning_rate": 1.149646037813236e-05, "loss": 1.8148, "step": 17180 }, { "epoch": 0.11868511524717598, "grad_norm": 0.28901031613349915, "learning_rate": 1.1473152341781114e-05, "loss": 1.8048, "step": 17190 }, { "epoch": 0.1192270564126882, "grad_norm": 0.2894619107246399, "learning_rate": 1.1450024228731648e-05, "loss": 1.8114, "step": 17200 }, { "epoch": 0.11976899757820042, "grad_norm": 0.2956114709377289, "learning_rate": 1.1427076134586722e-05, "loss": 1.8023, "step": 17210 }, { "epoch": 0.12031093874371264, "grad_norm": 0.28053778409957886, "learning_rate": 1.1404308154204944e-05, "loss": 1.8051, "step": 17220 }, { "epoch": 0.12063610344301996, "eval_loss": 2.5927584171295166, "eval_runtime": 23.5866, "eval_samples_per_second": 211.984, "eval_steps_per_second": 1.145, "step": 17226 }, { "epoch": 0.12085287990922486, "grad_norm": 0.3586195707321167, "learning_rate": 1.1381720381700439e-05, "loss": 1.8011, "step": 17230 }, { "epoch": 0.12139482107473708, "grad_norm": 0.2799775004386902, "learning_rate": 1.1359312910442378e-05, "loss": 1.8082, "step": 17240 }, { "epoch": 0.1219367622402493, "grad_norm": 0.2748625874519348, "learning_rate": 1.1337085833054672e-05, "loss": 1.7877, "step": 17250 }, { "epoch": 0.12247870340576152, "grad_norm": 0.27648496627807617, "learning_rate": 1.131503924141553e-05, "loss": 1.7975, "step": 17260 }, { "epoch": 0.12302064457127374, "grad_norm": 0.27565595507621765, "learning_rate": 1.1293173226657108e-05, "loss": 1.8062, "step": 17270 }, { "epoch": 0.12356258573678595, "grad_norm": 0.2683922052383423, "learning_rate": 1.1271487879165137e-05, "loss": 1.804, "step": 17280 }, { "epoch": 0.12410452690229817, "grad_norm": 0.27080050110816956, "learning_rate": 1.1249983288578525e-05, "loss": 1.8092, "step": 17290 }, { "epoch": 0.12464646806781039, "grad_norm": 0.3873322308063507, "learning_rate": 1.1228659543789005e-05, "loss": 1.8062, "step": 17300 }, { "epoch": 0.1251884092333226, "grad_norm": 0.3005862534046173, "learning_rate": 1.1207516732940766e-05, "loss": 1.8053, "step": 17310 }, { "epoch": 0.12535099158297627, "eval_loss": 2.597273826599121, "eval_runtime": 26.7305, "eval_samples_per_second": 187.052, "eval_steps_per_second": 1.01, "step": 17313 }, { "epoch": 0.12573035039883482, "grad_norm": 0.27348408102989197, "learning_rate": 1.118655494343008e-05, "loss": 1.8239, "step": 17320 }, { "epoch": 0.12627229156434705, "grad_norm": 0.3401775360107422, "learning_rate": 1.1165774261904948e-05, "loss": 1.7989, "step": 17330 }, { "epoch": 0.12681423272985926, "grad_norm": 0.2691876292228699, "learning_rate": 1.114517477426475e-05, "loss": 1.8121, "step": 17340 }, { "epoch": 0.1273561738953715, "grad_norm": 0.2959340214729309, "learning_rate": 1.1124756565659861e-05, "loss": 1.7961, "step": 17350 }, { "epoch": 0.1278981150608837, "grad_norm": 0.28132373094558716, "learning_rate": 1.1104519720491333e-05, "loss": 1.8025, "step": 17360 }, { "epoch": 0.12844005622639593, "grad_norm": 0.322965145111084, "learning_rate": 1.108446432241052e-05, "loss": 1.811, "step": 17370 }, { "epoch": 0.12898199739190813, "grad_norm": 0.28159090876579285, "learning_rate": 1.106459045431875e-05, "loss": 1.806, "step": 17380 }, { "epoch": 0.12952393855742036, "grad_norm": 0.25992992520332336, "learning_rate": 1.1044898198366984e-05, "loss": 1.8067, "step": 17390 }, { "epoch": 0.13006587972293257, "grad_norm": 0.3366539180278778, "learning_rate": 1.1025387635955451e-05, "loss": 1.8015, "step": 17400 }, { "epoch": 0.13006587972293257, "eval_loss": 2.593247890472412, "eval_runtime": 21.9708, "eval_samples_per_second": 227.575, "eval_steps_per_second": 1.229, "step": 17400 }, { "epoch": 0.1306078208884448, "grad_norm": 0.2966313660144806, "learning_rate": 1.1006058847733338e-05, "loss": 1.8048, "step": 17410 }, { "epoch": 0.131149762053957, "grad_norm": 0.27395039796829224, "learning_rate": 1.098691191359845e-05, "loss": 1.8117, "step": 17420 }, { "epoch": 0.13169170321946924, "grad_norm": 0.331516295671463, "learning_rate": 1.096794691269686e-05, "loss": 1.805, "step": 17430 }, { "epoch": 0.13223364438498145, "grad_norm": 0.30107611417770386, "learning_rate": 1.0949163923422624e-05, "loss": 1.8032, "step": 17440 }, { "epoch": 0.13277558555049368, "grad_norm": 0.2857365310192108, "learning_rate": 1.0930563023417417e-05, "loss": 1.8035, "step": 17450 }, { "epoch": 0.13331752671600589, "grad_norm": 0.2834301292896271, "learning_rate": 1.0912144289570228e-05, "loss": 1.7969, "step": 17460 }, { "epoch": 0.13385946788151812, "grad_norm": 0.275591641664505, "learning_rate": 1.089390779801704e-05, "loss": 1.7935, "step": 17470 }, { "epoch": 0.13440140904703032, "grad_norm": 0.27701419591903687, "learning_rate": 1.0875853624140517e-05, "loss": 1.8203, "step": 17480 }, { "epoch": 0.1347807678628889, "eval_loss": 2.59383487701416, "eval_runtime": 26.2522, "eval_samples_per_second": 190.46, "eval_steps_per_second": 1.028, "step": 17487 }, { "epoch": 0.13494335021254256, "grad_norm": 0.3042493164539337, "learning_rate": 1.0857981842569686e-05, "loss": 1.7945, "step": 17490 }, { "epoch": 0.13548529137805476, "grad_norm": 0.3667808473110199, "learning_rate": 1.0840292527179657e-05, "loss": 1.8075, "step": 17500 }, { "epoch": 0.136027232543567, "grad_norm": 0.28393447399139404, "learning_rate": 1.0822785751091268e-05, "loss": 1.7968, "step": 17510 }, { "epoch": 0.1365691737090792, "grad_norm": 0.2786491811275482, "learning_rate": 1.0805461586670825e-05, "loss": 1.8161, "step": 17520 }, { "epoch": 0.13711111487459143, "grad_norm": 0.27336838841438293, "learning_rate": 1.0788320105529776e-05, "loss": 1.7943, "step": 17530 }, { "epoch": 0.13765305604010364, "grad_norm": 0.2629176676273346, "learning_rate": 1.0771361378524431e-05, "loss": 1.8007, "step": 17540 }, { "epoch": 0.13819499720561587, "grad_norm": 0.26536351442337036, "learning_rate": 1.0754585475755682e-05, "loss": 1.7939, "step": 17550 }, { "epoch": 0.13873693837112808, "grad_norm": 0.30199551582336426, "learning_rate": 1.0737992466568667e-05, "loss": 1.7971, "step": 17560 }, { "epoch": 0.1392788795366403, "grad_norm": 0.2917165160179138, "learning_rate": 1.072158241955252e-05, "loss": 1.8053, "step": 17570 }, { "epoch": 0.1394956560028452, "eval_loss": 2.5910274982452393, "eval_runtime": 27.7651, "eval_samples_per_second": 180.082, "eval_steps_per_second": 0.972, "step": 17574 }, { "epoch": 0.13982082070215252, "grad_norm": 0.3207840025424957, "learning_rate": 1.0705355402540095e-05, "loss": 1.8075, "step": 17580 }, { "epoch": 0.14036276186766475, "grad_norm": 0.2685806155204773, "learning_rate": 1.0689311482607655e-05, "loss": 1.8048, "step": 17590 }, { "epoch": 0.14090470303317695, "grad_norm": 0.2922206521034241, "learning_rate": 1.0673450726074611e-05, "loss": 1.7965, "step": 17600 }, { "epoch": 0.1414466441986892, "grad_norm": 0.3959106504917145, "learning_rate": 1.0657773198503267e-05, "loss": 1.8087, "step": 17610 }, { "epoch": 0.1419885853642014, "grad_norm": 0.33581146597862244, "learning_rate": 1.0642278964698486e-05, "loss": 1.8066, "step": 17620 }, { "epoch": 0.14253052652971362, "grad_norm": 0.2724241614341736, "learning_rate": 1.0626968088707508e-05, "loss": 1.8005, "step": 17630 }, { "epoch": 0.14307246769522583, "grad_norm": 0.2701176404953003, "learning_rate": 1.0611840633819626e-05, "loss": 1.8039, "step": 17640 }, { "epoch": 0.14361440886073806, "grad_norm": 0.2956571877002716, "learning_rate": 1.0596896662565925e-05, "loss": 1.7962, "step": 17650 }, { "epoch": 0.14415635002625027, "grad_norm": 0.29787176847457886, "learning_rate": 1.0582136236719073e-05, "loss": 1.8094, "step": 17660 }, { "epoch": 0.1442105441428015, "eval_loss": 2.5948381423950195, "eval_runtime": 22.6082, "eval_samples_per_second": 221.158, "eval_steps_per_second": 1.194, "step": 17661 }, { "epoch": 0.1446982911917625, "grad_norm": 0.2762181758880615, "learning_rate": 1.0567559417292992e-05, "loss": 1.8069, "step": 17670 }, { "epoch": 0.1452402323572747, "grad_norm": 0.2777063548564911, "learning_rate": 1.0553166264542675e-05, "loss": 1.8085, "step": 17680 }, { "epoch": 0.14578217352278694, "grad_norm": 0.2686801552772522, "learning_rate": 1.05389568379639e-05, "loss": 1.7995, "step": 17690 }, { "epoch": 0.14632411468829915, "grad_norm": 0.3426153361797333, "learning_rate": 1.0524931196292987e-05, "loss": 1.8079, "step": 17700 }, { "epoch": 0.14686605585381138, "grad_norm": 0.30656588077545166, "learning_rate": 1.0511089397506558e-05, "loss": 1.8001, "step": 17710 }, { "epoch": 0.14740799701932358, "grad_norm": 0.26832064986228943, "learning_rate": 1.0497431498821308e-05, "loss": 1.8022, "step": 17720 }, { "epoch": 0.14794993818483582, "grad_norm": 0.31563130021095276, "learning_rate": 1.0483957556693748e-05, "loss": 1.7997, "step": 17730 }, { "epoch": 0.14849187935034802, "grad_norm": 0.2889915406703949, "learning_rate": 1.0470667626820001e-05, "loss": 1.7999, "step": 17740 }, { "epoch": 0.1489254322827578, "eval_loss": 2.596463918685913, "eval_runtime": 22.7533, "eval_samples_per_second": 219.749, "eval_steps_per_second": 1.187, "step": 17748 }, { "epoch": 0.14903382051586025, "grad_norm": 0.2632191777229309, "learning_rate": 1.0457561764135531e-05, "loss": 1.7958, "step": 17750 }, { "epoch": 0.14957576168137246, "grad_norm": 0.3954158425331116, "learning_rate": 1.0444640022814952e-05, "loss": 1.8029, "step": 17760 }, { "epoch": 0.1501177028468847, "grad_norm": 0.32601505517959595, "learning_rate": 1.0431902456271798e-05, "loss": 1.8048, "step": 17770 }, { "epoch": 0.1506596440123969, "grad_norm": 0.3632460832595825, "learning_rate": 1.0419349117158271e-05, "loss": 1.7984, "step": 17780 }, { "epoch": 0.15120158517790913, "grad_norm": 0.2885991036891937, "learning_rate": 1.040698005736507e-05, "loss": 1.7987, "step": 17790 }, { "epoch": 0.15174352634342134, "grad_norm": 0.3226932883262634, "learning_rate": 1.0394795328021151e-05, "loss": 1.8114, "step": 17800 }, { "epoch": 0.15228546750893357, "grad_norm": 0.2789970934391022, "learning_rate": 1.038279497949351e-05, "loss": 1.7995, "step": 17810 }, { "epoch": 0.15282740867444577, "grad_norm": 0.2944728136062622, "learning_rate": 1.0370979061386985e-05, "loss": 1.8049, "step": 17820 }, { "epoch": 0.153369349839958, "grad_norm": 0.28693804144859314, "learning_rate": 1.0359347622544055e-05, "loss": 1.79, "step": 17830 }, { "epoch": 0.1536403204227141, "eval_loss": 2.597463369369507, "eval_runtime": 23.0476, "eval_samples_per_second": 216.942, "eval_steps_per_second": 1.171, "step": 17835 }, { "epoch": 0.1539112910054702, "grad_norm": 0.2763684391975403, "learning_rate": 1.0347900711044624e-05, "loss": 1.81, "step": 17840 }, { "epoch": 0.15445323217098245, "grad_norm": 0.2787637412548065, "learning_rate": 1.0336638374205845e-05, "loss": 1.8125, "step": 17850 }, { "epoch": 0.15499517333649465, "grad_norm": 0.2788551449775696, "learning_rate": 1.0325560658581893e-05, "loss": 1.816, "step": 17860 }, { "epoch": 0.15553711450200688, "grad_norm": 0.2845081686973572, "learning_rate": 1.0314667609963802e-05, "loss": 1.7967, "step": 17870 }, { "epoch": 0.1560790556675191, "grad_norm": 0.37821102142333984, "learning_rate": 1.0303959273379258e-05, "loss": 1.7942, "step": 17880 }, { "epoch": 0.15662099683303132, "grad_norm": 0.2836483418941498, "learning_rate": 1.0293435693092411e-05, "loss": 1.814, "step": 17890 }, { "epoch": 0.15716293799854353, "grad_norm": 0.2846856415271759, "learning_rate": 1.0283096912603723e-05, "loss": 1.8124, "step": 17900 }, { "epoch": 0.15770487916405576, "grad_norm": 0.2603934705257416, "learning_rate": 1.027294297464974e-05, "loss": 1.8014, "step": 17910 }, { "epoch": 0.15824682032956797, "grad_norm": 0.34290361404418945, "learning_rate": 1.0262973921202948e-05, "loss": 1.7978, "step": 17920 }, { "epoch": 0.1583552085626704, "eval_loss": 2.5988216400146484, "eval_runtime": 25.8064, "eval_samples_per_second": 193.75, "eval_steps_per_second": 1.046, "step": 17922 }, { "epoch": 0.1587887614950802, "grad_norm": 0.28911086916923523, "learning_rate": 1.0253189793471598e-05, "loss": 1.8057, "step": 17930 }, { "epoch": 0.1593307026605924, "grad_norm": 0.297972708940506, "learning_rate": 1.0243590631899517e-05, "loss": 1.8037, "step": 17940 }, { "epoch": 0.15987264382610464, "grad_norm": 0.27581244707107544, "learning_rate": 1.023417647616596e-05, "loss": 1.8009, "step": 17950 }, { "epoch": 0.16041458499161684, "grad_norm": 0.2763090133666992, "learning_rate": 1.0224947365185444e-05, "loss": 1.8058, "step": 17960 }, { "epoch": 0.16095652615712908, "grad_norm": 0.285412073135376, "learning_rate": 1.0215903337107571e-05, "loss": 1.8179, "step": 17970 }, { "epoch": 0.16149846732264128, "grad_norm": 0.31604015827178955, "learning_rate": 1.0207044429316883e-05, "loss": 1.8074, "step": 17980 }, { "epoch": 0.16204040848815351, "grad_norm": 0.2986457645893097, "learning_rate": 1.0198370678432713e-05, "loss": 1.7948, "step": 17990 }, { "epoch": 0.16258234965366572, "grad_norm": 0.28448864817619324, "learning_rate": 1.0189882120309019e-05, "loss": 1.794, "step": 18000 }, { "epoch": 0.16307009670262673, "eval_loss": 2.5937376022338867, "eval_runtime": 24.5121, "eval_samples_per_second": 203.981, "eval_steps_per_second": 1.101, "step": 18009 }, { "epoch": 0.16312429081917795, "grad_norm": 0.3744131922721863, "learning_rate": 1.0181578790034248e-05, "loss": 1.7963, "step": 18010 }, { "epoch": 0.16366623198469016, "grad_norm": 0.35869526863098145, "learning_rate": 1.0173460721931186e-05, "loss": 1.7989, "step": 18020 }, { "epoch": 0.1642081731502024, "grad_norm": 0.35815170407295227, "learning_rate": 1.0165527949556814e-05, "loss": 1.8044, "step": 18030 }, { "epoch": 0.1647501143157146, "grad_norm": 0.3258884847164154, "learning_rate": 1.015778050570217e-05, "loss": 1.7979, "step": 18040 }, { "epoch": 0.16529205548122683, "grad_norm": 0.2814440429210663, "learning_rate": 1.0150218422392213e-05, "loss": 1.7945, "step": 18050 }, { "epoch": 0.16583399664673903, "grad_norm": 0.2600628733634949, "learning_rate": 1.0142841730885705e-05, "loss": 1.8, "step": 18060 }, { "epoch": 0.16637593781225127, "grad_norm": 0.2741418182849884, "learning_rate": 1.0135650461675054e-05, "loss": 1.7987, "step": 18070 }, { "epoch": 0.16691787897776347, "grad_norm": 0.286240816116333, "learning_rate": 1.0128644644486213e-05, "loss": 1.8017, "step": 18080 }, { "epoch": 0.1674598201432757, "grad_norm": 0.2839854061603546, "learning_rate": 1.0121824308278545e-05, "loss": 1.793, "step": 18090 }, { "epoch": 0.16778498484258303, "eval_loss": 2.5955307483673096, "eval_runtime": 23.2564, "eval_samples_per_second": 214.994, "eval_steps_per_second": 1.161, "step": 18096 }, { "epoch": 0.1680017613087879, "grad_norm": 0.2708841860294342, "learning_rate": 1.0115189481244706e-05, "loss": 1.7934, "step": 18100 }, { "epoch": 0.16854370247430014, "grad_norm": 0.32778841257095337, "learning_rate": 1.0108740190810519e-05, "loss": 1.8111, "step": 18110 }, { "epoch": 0.16908564363981235, "grad_norm": 0.28490176796913147, "learning_rate": 1.0102476463634891e-05, "loss": 1.8004, "step": 18120 }, { "epoch": 0.16962758480532458, "grad_norm": 0.28157317638397217, "learning_rate": 1.0096398325609658e-05, "loss": 1.8033, "step": 18130 }, { "epoch": 0.1701695259708368, "grad_norm": 0.29462599754333496, "learning_rate": 1.0090505801859521e-05, "loss": 1.8047, "step": 18140 }, { "epoch": 0.17071146713634902, "grad_norm": 0.3192684054374695, "learning_rate": 1.0084798916741909e-05, "loss": 1.8049, "step": 18150 }, { "epoch": 0.17125340830186123, "grad_norm": 0.2903258204460144, "learning_rate": 1.0079277693846895e-05, "loss": 1.7911, "step": 18160 }, { "epoch": 0.17179534946737346, "grad_norm": 0.26754647493362427, "learning_rate": 1.0073942155997108e-05, "loss": 1.7938, "step": 18170 }, { "epoch": 0.17233729063288566, "grad_norm": 0.26055288314819336, "learning_rate": 1.0068792325247608e-05, "loss": 1.8061, "step": 18180 }, { "epoch": 0.17249987298253933, "eval_loss": 2.5973074436187744, "eval_runtime": 25.1042, "eval_samples_per_second": 199.169, "eval_steps_per_second": 1.076, "step": 18183 }, { "epoch": 0.1728792317983979, "grad_norm": 0.3081805109977722, "learning_rate": 1.0063828222885822e-05, "loss": 1.81, "step": 18190 }, { "epoch": 0.1734211729639101, "grad_norm": 0.30028587579727173, "learning_rate": 1.0059049869431447e-05, "loss": 1.7991, "step": 18200 }, { "epoch": 0.17396311412942234, "grad_norm": 0.29933491349220276, "learning_rate": 1.0054457284636363e-05, "loss": 1.7965, "step": 18210 }, { "epoch": 0.17450505529493454, "grad_norm": 0.2657584547996521, "learning_rate": 1.0050050487484561e-05, "loss": 1.7991, "step": 18220 }, { "epoch": 0.17504699646044677, "grad_norm": 0.28480470180511475, "learning_rate": 1.004582949619204e-05, "loss": 1.7998, "step": 18230 }, { "epoch": 0.17558893762595898, "grad_norm": 0.3122563660144806, "learning_rate": 1.004179432820677e-05, "loss": 1.8084, "step": 18240 }, { "epoch": 0.1761308787914712, "grad_norm": 0.2768636643886566, "learning_rate": 1.0037945000208584e-05, "loss": 1.8065, "step": 18250 }, { "epoch": 0.17667281995698342, "grad_norm": 0.2690199017524719, "learning_rate": 1.0034281528109125e-05, "loss": 1.8072, "step": 18260 }, { "epoch": 0.17721476112249565, "grad_norm": 0.27862584590911865, "learning_rate": 1.0030803927051793e-05, "loss": 1.7843, "step": 18270 }, { "epoch": 0.17721476112249565, "eval_loss": 2.597975730895996, "eval_runtime": 21.9736, "eval_samples_per_second": 227.545, "eval_steps_per_second": 1.229, "step": 18270 }, { "epoch": 0.17775670228800786, "grad_norm": 0.3340103328227997, "learning_rate": 1.0027512211411644e-05, "loss": 1.7988, "step": 18280 }, { "epoch": 0.1782986434535201, "grad_norm": 0.33370035886764526, "learning_rate": 1.0024406394795373e-05, "loss": 1.8053, "step": 18290 }, { "epoch": 0.1788405846190323, "grad_norm": 0.2686094343662262, "learning_rate": 1.0021486490041246e-05, "loss": 1.7932, "step": 18300 }, { "epoch": 0.17938252578454453, "grad_norm": 0.27461278438568115, "learning_rate": 1.0018752509219017e-05, "loss": 1.8024, "step": 18310 }, { "epoch": 0.17992446695005673, "grad_norm": 0.26994603872299194, "learning_rate": 1.0016204463629918e-05, "loss": 1.794, "step": 18320 }, { "epoch": 0.18046640811556897, "grad_norm": 0.28709709644317627, "learning_rate": 1.0013842363806594e-05, "loss": 1.7986, "step": 18330 }, { "epoch": 0.18100834928108117, "grad_norm": 0.3366316556930542, "learning_rate": 1.0011666219513065e-05, "loss": 1.794, "step": 18340 }, { "epoch": 0.1815502904465934, "grad_norm": 0.3002452552318573, "learning_rate": 1.0009676039744664e-05, "loss": 1.8091, "step": 18350 }, { "epoch": 0.18192964926245195, "eval_loss": 2.598811149597168, "eval_runtime": 26.2348, "eval_samples_per_second": 190.586, "eval_steps_per_second": 1.029, "step": 18357 }, { "epoch": 0.1820922316121056, "grad_norm": 0.276909738779068, "learning_rate": 1.0007871832728043e-05, "loss": 1.7919, "step": 18360 }, { "epoch": 0.18263417277761784, "grad_norm": 0.2699563503265381, "learning_rate": 1.0006253605921098e-05, "loss": 1.7975, "step": 18370 }, { "epoch": 0.18317611394313005, "grad_norm": 0.30202600359916687, "learning_rate": 1.0004821366012962e-05, "loss": 1.7966, "step": 18380 }, { "epoch": 0.18371805510864228, "grad_norm": 0.2897915244102478, "learning_rate": 1.0003575118923961e-05, "loss": 1.8013, "step": 18390 }, { "epoch": 0.18425999627415449, "grad_norm": 0.2939542233943939, "learning_rate": 1.0002514869805599e-05, "loss": 1.8084, "step": 18400 }, { "epoch": 0.18480193743966672, "grad_norm": 0.2901172637939453, "learning_rate": 1.0001640623040548e-05, "loss": 1.8072, "step": 18410 }, { "epoch": 0.18534387860517892, "grad_norm": 0.4139680564403534, "learning_rate": 1.0000952382242605e-05, "loss": 1.798, "step": 18420 }, { "epoch": 0.18588581977069116, "grad_norm": 0.2750343978404999, "learning_rate": 1.000045015025669e-05, "loss": 1.7949, "step": 18430 }, { "epoch": 0.18642776093620336, "grad_norm": 0.28022781014442444, "learning_rate": 1.0000133929158849e-05, "loss": 1.8095, "step": 18440 }, { "epoch": 0.18664453740240824, "eval_loss": 2.5955660343170166, "eval_runtime": 23.3631, "eval_samples_per_second": 214.012, "eval_steps_per_second": 1.156, "step": 18444 }, { "epoch": 0.1869697021017156, "grad_norm": 0.26084405183792114, "learning_rate": 1.000000372025621e-05, "loss": 1.815, "step": 18450 }, { "epoch": 1.0004335529324098, "grad_norm": 7.159980297088623, "learning_rate": 9.350101516641391e-05, "loss": 1.9598, "step": 18460 }, { "epoch": 1.000975494097922, "grad_norm": 6.029938697814941, "learning_rate": 9.349281376788221e-05, "loss": 2.0867, "step": 18470 }, { "epoch": 1.0015174352634342, "grad_norm": 4.406125068664551, "learning_rate": 9.348460760102786e-05, "loss": 2.0069, "step": 18480 }, { "epoch": 1.0020593764289465, "grad_norm": 4.879397392272949, "learning_rate": 9.347639666686739e-05, "loss": 2.0173, "step": 18490 }, { "epoch": 1.0026013175944586, "grad_norm": 3.420335292816162, "learning_rate": 9.346818096641794e-05, "loss": 1.9867, "step": 18500 }, { "epoch": 1.003143258759971, "grad_norm": 1.2602205276489258, "learning_rate": 9.345996050069725e-05, "loss": 1.9279, "step": 18510 }, { "epoch": 1.003685199925483, "grad_norm": 3.904205560684204, "learning_rate": 9.345173527072363e-05, "loss": 1.9165, "step": 18520 }, { "epoch": 1.0042271410909953, "grad_norm": 1.9543260335922241, "learning_rate": 9.344350527751599e-05, "loss": 1.9421, "step": 18530 }, { "epoch": 1.0042813352075466, "eval_loss": 2.611534595489502, "eval_runtime": 59.2714, "eval_samples_per_second": 84.358, "eval_steps_per_second": 0.456, "step": 18531 }, { "epoch": 1.0047690822565076, "grad_norm": 1.9394625425338745, "learning_rate": 9.343527052209384e-05, "loss": 1.8943, "step": 18540 }, { "epoch": 1.0053110234220197, "grad_norm": 1.3751671314239502, "learning_rate": 9.342703100547722e-05, "loss": 1.8667, "step": 18550 }, { "epoch": 1.005852964587532, "grad_norm": 1.4693511724472046, "learning_rate": 9.341878672868683e-05, "loss": 1.8609, "step": 18560 }, { "epoch": 1.0063949057530441, "grad_norm": 1.3482840061187744, "learning_rate": 9.341053769274396e-05, "loss": 1.8552, "step": 18570 }, { "epoch": 1.0069368469185564, "grad_norm": 1.061438798904419, "learning_rate": 9.340228389867041e-05, "loss": 1.8599, "step": 18580 }, { "epoch": 1.0074787880840685, "grad_norm": 0.778508722782135, "learning_rate": 9.339402534748868e-05, "loss": 1.8528, "step": 18590 }, { "epoch": 1.0080207292495809, "grad_norm": 0.43868595361709595, "learning_rate": 9.338576204022178e-05, "loss": 1.8457, "step": 18600 }, { "epoch": 1.008562670415093, "grad_norm": 0.531323254108429, "learning_rate": 9.337749397789332e-05, "loss": 1.8489, "step": 18610 }, { "epoch": 1.008996223347503, "eval_loss": 2.614398717880249, "eval_runtime": 21.9914, "eval_samples_per_second": 227.361, "eval_steps_per_second": 1.228, "step": 18618 }, { "epoch": 1.0091046115806053, "grad_norm": 0.6396978497505188, "learning_rate": 9.336922116152755e-05, "loss": 1.844, "step": 18620 }, { "epoch": 1.0096465527461176, "grad_norm": 0.44661280512809753, "learning_rate": 9.336094359214923e-05, "loss": 1.8426, "step": 18630 }, { "epoch": 1.0101884939116297, "grad_norm": 0.55703204870224, "learning_rate": 9.335266127078376e-05, "loss": 1.8513, "step": 18640 }, { "epoch": 1.010730435077142, "grad_norm": 0.8314570784568787, "learning_rate": 9.334437419845713e-05, "loss": 1.8503, "step": 18650 }, { "epoch": 1.011272376242654, "grad_norm": 0.5193946957588196, "learning_rate": 9.333608237619589e-05, "loss": 1.8619, "step": 18660 }, { "epoch": 1.0118143174081664, "grad_norm": 0.3355676829814911, "learning_rate": 9.33277858050272e-05, "loss": 1.8414, "step": 18670 }, { "epoch": 1.0123562585736785, "grad_norm": 0.4459117650985718, "learning_rate": 9.331948448597882e-05, "loss": 1.8392, "step": 18680 }, { "epoch": 1.0128981997391908, "grad_norm": 0.7514176368713379, "learning_rate": 9.331117842007906e-05, "loss": 1.842, "step": 18690 }, { "epoch": 1.0134401409047031, "grad_norm": 0.4279676377773285, "learning_rate": 9.330286760835686e-05, "loss": 1.8378, "step": 18700 }, { "epoch": 1.013711111487459, "eval_loss": 2.6156516075134277, "eval_runtime": 21.9911, "eval_samples_per_second": 227.364, "eval_steps_per_second": 1.228, "step": 18705 }, { "epoch": 1.0139820820702152, "grad_norm": 0.43919897079467773, "learning_rate": 9.32945520518417e-05, "loss": 1.8378, "step": 18710 }, { "epoch": 1.0145240232357275, "grad_norm": 0.4081036150455475, "learning_rate": 9.32862317515637e-05, "loss": 1.8381, "step": 18720 }, { "epoch": 1.0150659644012396, "grad_norm": 1.0370370149612427, "learning_rate": 9.327790670855352e-05, "loss": 1.8482, "step": 18730 }, { "epoch": 1.015607905566752, "grad_norm": 0.717692494392395, "learning_rate": 9.326957692384247e-05, "loss": 1.8335, "step": 18740 }, { "epoch": 1.016149846732264, "grad_norm": 0.5903939604759216, "learning_rate": 9.326124239846237e-05, "loss": 1.8393, "step": 18750 }, { "epoch": 1.0166917878977764, "grad_norm": 0.5651223063468933, "learning_rate": 9.325290313344565e-05, "loss": 1.8397, "step": 18760 }, { "epoch": 1.0172337290632885, "grad_norm": 0.7575017809867859, "learning_rate": 9.32445591298254e-05, "loss": 1.8482, "step": 18770 }, { "epoch": 1.0177756702288008, "grad_norm": 1.311630129814148, "learning_rate": 9.32362103886352e-05, "loss": 1.844, "step": 18780 }, { "epoch": 1.018317611394313, "grad_norm": 1.2291172742843628, "learning_rate": 9.322785691090925e-05, "loss": 1.8475, "step": 18790 }, { "epoch": 1.0184259996274154, "eval_loss": 2.618657350540161, "eval_runtime": 21.9861, "eval_samples_per_second": 227.417, "eval_steps_per_second": 1.228, "step": 18792 }, { "epoch": 1.0188595525598252, "grad_norm": 0.6886726021766663, "learning_rate": 9.321949869768236e-05, "loss": 1.8358, "step": 18800 }, { "epoch": 1.0194014937253375, "grad_norm": 0.8392327427864075, "learning_rate": 9.32111357499899e-05, "loss": 1.8462, "step": 18810 }, { "epoch": 1.0199434348908496, "grad_norm": 0.35495704412460327, "learning_rate": 9.320276806886784e-05, "loss": 1.834, "step": 18820 }, { "epoch": 1.020485376056362, "grad_norm": 0.45716392993927, "learning_rate": 9.319439565535274e-05, "loss": 1.8272, "step": 18830 }, { "epoch": 1.021027317221874, "grad_norm": 0.31467896699905396, "learning_rate": 9.318601851048172e-05, "loss": 1.8398, "step": 18840 }, { "epoch": 1.0215692583873863, "grad_norm": 0.7633432149887085, "learning_rate": 9.31776366352925e-05, "loss": 1.839, "step": 18850 }, { "epoch": 1.0221111995528986, "grad_norm": 0.40369927883148193, "learning_rate": 9.316925003082344e-05, "loss": 1.8429, "step": 18860 }, { "epoch": 1.0226531407184107, "grad_norm": 0.7707934975624084, "learning_rate": 9.316085869811338e-05, "loss": 1.8314, "step": 18870 }, { "epoch": 1.0231408877673718, "eval_loss": 2.6104660034179688, "eval_runtime": 21.9934, "eval_samples_per_second": 227.341, "eval_steps_per_second": 1.228, "step": 18879 }, { "epoch": 1.023195081883923, "grad_norm": 0.791067361831665, "learning_rate": 9.315246263820181e-05, "loss": 1.8436, "step": 18880 }, { "epoch": 1.0237370230494351, "grad_norm": 0.3280937671661377, "learning_rate": 9.314406185212883e-05, "loss": 1.8302, "step": 18890 }, { "epoch": 1.0242789642149475, "grad_norm": 0.4388424754142761, "learning_rate": 9.313565634093507e-05, "loss": 1.8393, "step": 18900 }, { "epoch": 1.0248209053804596, "grad_norm": 0.4343729019165039, "learning_rate": 9.312724610566177e-05, "loss": 1.8377, "step": 18910 }, { "epoch": 1.0253628465459719, "grad_norm": 0.7564290761947632, "learning_rate": 9.311883114735077e-05, "loss": 1.8383, "step": 18920 }, { "epoch": 1.025904787711484, "grad_norm": 0.3451434075832367, "learning_rate": 9.311041146704443e-05, "loss": 1.8312, "step": 18930 }, { "epoch": 1.0264467288769963, "grad_norm": 0.44154074788093567, "learning_rate": 9.31019870657858e-05, "loss": 1.832, "step": 18940 }, { "epoch": 1.0269886700425086, "grad_norm": 0.5464621782302856, "learning_rate": 9.309355794461843e-05, "loss": 1.832, "step": 18950 }, { "epoch": 1.0275306112080207, "grad_norm": 0.737061619758606, "learning_rate": 9.308512410458648e-05, "loss": 1.8379, "step": 18960 }, { "epoch": 1.0278557759073281, "eval_loss": 2.598432779312134, "eval_runtime": 21.9905, "eval_samples_per_second": 227.371, "eval_steps_per_second": 1.228, "step": 18966 }, { "epoch": 1.028072552373533, "grad_norm": 0.5460038185119629, "learning_rate": 9.307668554673471e-05, "loss": 1.8363, "step": 18970 }, { "epoch": 1.028614493539045, "grad_norm": 0.365409791469574, "learning_rate": 9.306824227210845e-05, "loss": 1.8447, "step": 18980 }, { "epoch": 1.0291564347045574, "grad_norm": 0.37528592348098755, "learning_rate": 9.305979428175361e-05, "loss": 1.8298, "step": 18990 }, { "epoch": 1.0296983758700695, "grad_norm": 0.37354302406311035, "learning_rate": 9.30513415767167e-05, "loss": 1.8439, "step": 19000 }, { "epoch": 1.0302403170355818, "grad_norm": 0.4735754430294037, "learning_rate": 9.304288415804479e-05, "loss": 1.8267, "step": 19010 }, { "epoch": 1.0307822582010941, "grad_norm": 0.30185240507125854, "learning_rate": 9.303442202678555e-05, "loss": 1.8362, "step": 19020 }, { "epoch": 1.0313241993666062, "grad_norm": 0.3827216625213623, "learning_rate": 9.302595518398723e-05, "loss": 1.8311, "step": 19030 }, { "epoch": 1.0318661405321186, "grad_norm": 0.3216041624546051, "learning_rate": 9.301748363069872e-05, "loss": 1.8422, "step": 19040 }, { "epoch": 1.0324080816976307, "grad_norm": 0.34851735830307007, "learning_rate": 9.300900736796934e-05, "loss": 1.8369, "step": 19050 }, { "epoch": 1.0325706640472845, "eval_loss": 2.6064109802246094, "eval_runtime": 21.9894, "eval_samples_per_second": 227.383, "eval_steps_per_second": 1.228, "step": 19053 }, { "epoch": 1.032950022863143, "grad_norm": 0.6215195059776306, "learning_rate": 9.300052639684918e-05, "loss": 1.834, "step": 19060 }, { "epoch": 1.033491964028655, "grad_norm": 0.6960977911949158, "learning_rate": 9.299204071838878e-05, "loss": 1.8224, "step": 19070 }, { "epoch": 1.0340339051941674, "grad_norm": 0.680069088935852, "learning_rate": 9.298355033363931e-05, "loss": 1.8277, "step": 19080 }, { "epoch": 1.0345758463596795, "grad_norm": 0.8072498440742493, "learning_rate": 9.297505524365254e-05, "loss": 1.8475, "step": 19090 }, { "epoch": 1.0351177875251918, "grad_norm": 0.547581672668457, "learning_rate": 9.296655544948081e-05, "loss": 1.8269, "step": 19100 }, { "epoch": 1.035659728690704, "grad_norm": 0.33486223220825195, "learning_rate": 9.295805095217701e-05, "loss": 1.8352, "step": 19110 }, { "epoch": 1.0362016698562162, "grad_norm": 0.5130099654197693, "learning_rate": 9.294954175279466e-05, "loss": 1.8332, "step": 19120 }, { "epoch": 1.0367436110217285, "grad_norm": 0.5673821568489075, "learning_rate": 9.294102785238785e-05, "loss": 1.8172, "step": 19130 }, { "epoch": 1.0372855521872406, "grad_norm": 0.5935150980949402, "learning_rate": 9.293250925201122e-05, "loss": 1.83, "step": 19140 }, { "epoch": 1.0372855521872406, "eval_loss": 2.619783878326416, "eval_runtime": 21.976, "eval_samples_per_second": 227.521, "eval_steps_per_second": 1.229, "step": 19140 }, { "epoch": 1.037827493352753, "grad_norm": 0.8336159586906433, "learning_rate": 9.292398595272003e-05, "loss": 1.8289, "step": 19150 }, { "epoch": 1.038369434518265, "grad_norm": 0.7801002264022827, "learning_rate": 9.291545795557011e-05, "loss": 1.8325, "step": 19160 }, { "epoch": 1.0389113756837773, "grad_norm": 0.3641831576824188, "learning_rate": 9.290692526161787e-05, "loss": 1.8289, "step": 19170 }, { "epoch": 1.0394533168492897, "grad_norm": 0.34061098098754883, "learning_rate": 9.289838787192032e-05, "loss": 1.8234, "step": 19180 }, { "epoch": 1.0399952580148017, "grad_norm": 0.31556975841522217, "learning_rate": 9.288984578753502e-05, "loss": 1.8252, "step": 19190 }, { "epoch": 1.040537199180314, "grad_norm": 0.3090665638446808, "learning_rate": 9.28812990095201e-05, "loss": 1.8194, "step": 19200 }, { "epoch": 1.0410791403458262, "grad_norm": 0.29080694913864136, "learning_rate": 9.287274753893435e-05, "loss": 1.8342, "step": 19210 }, { "epoch": 1.0416210815113385, "grad_norm": 0.5005635619163513, "learning_rate": 9.286419137683704e-05, "loss": 1.8401, "step": 19220 }, { "epoch": 1.042000440327197, "eval_loss": 2.6065304279327393, "eval_runtime": 21.9863, "eval_samples_per_second": 227.414, "eval_steps_per_second": 1.228, "step": 19227 }, { "epoch": 1.0421630226768506, "grad_norm": 0.6064873933792114, "learning_rate": 9.28556305242881e-05, "loss": 1.822, "step": 19230 }, { "epoch": 1.0427049638423629, "grad_norm": 0.30239003896713257, "learning_rate": 9.284706498234798e-05, "loss": 1.8317, "step": 19240 }, { "epoch": 1.043246905007875, "grad_norm": 0.45904672145843506, "learning_rate": 9.28384947520778e-05, "loss": 1.8207, "step": 19250 }, { "epoch": 1.0437888461733873, "grad_norm": 0.4452885389328003, "learning_rate": 9.282991983453917e-05, "loss": 1.832, "step": 19260 }, { "epoch": 1.0443307873388996, "grad_norm": 0.5682956576347351, "learning_rate": 9.28213402307943e-05, "loss": 1.8331, "step": 19270 }, { "epoch": 1.0448727285044117, "grad_norm": 0.886755108833313, "learning_rate": 9.281275594190599e-05, "loss": 1.8377, "step": 19280 }, { "epoch": 1.045414669669924, "grad_norm": 0.9180141687393188, "learning_rate": 9.280416696893766e-05, "loss": 1.8277, "step": 19290 }, { "epoch": 1.0459566108354361, "grad_norm": 0.3015201687812805, "learning_rate": 9.279557331295325e-05, "loss": 1.8243, "step": 19300 }, { "epoch": 1.0464985520009484, "grad_norm": 0.36977553367614746, "learning_rate": 9.278697497501731e-05, "loss": 1.8161, "step": 19310 }, { "epoch": 1.0467153284671533, "eval_loss": 2.6029727458953857, "eval_runtime": 21.9909, "eval_samples_per_second": 227.367, "eval_steps_per_second": 1.228, "step": 19314 }, { "epoch": 1.0470404931664605, "grad_norm": 0.7295284867286682, "learning_rate": 9.277837195619496e-05, "loss": 1.8169, "step": 19320 }, { "epoch": 1.0475824343319728, "grad_norm": 0.3548785448074341, "learning_rate": 9.276976425755192e-05, "loss": 1.8258, "step": 19330 }, { "epoch": 1.0481243754974852, "grad_norm": 0.581323504447937, "learning_rate": 9.276115188015445e-05, "loss": 1.8386, "step": 19340 }, { "epoch": 1.0486663166629973, "grad_norm": 0.33570852875709534, "learning_rate": 9.275253482506945e-05, "loss": 1.8224, "step": 19350 }, { "epoch": 1.0492082578285096, "grad_norm": 0.3555627167224884, "learning_rate": 9.274391309336432e-05, "loss": 1.8235, "step": 19360 }, { "epoch": 1.0497501989940217, "grad_norm": 0.31758037209510803, "learning_rate": 9.27352866861071e-05, "loss": 1.8283, "step": 19370 }, { "epoch": 1.050292140159534, "grad_norm": 0.3359634280204773, "learning_rate": 9.272665560436642e-05, "loss": 1.8207, "step": 19380 }, { "epoch": 1.050834081325046, "grad_norm": 0.2850840389728546, "learning_rate": 9.271801984921142e-05, "loss": 1.8291, "step": 19390 }, { "epoch": 1.0513760224905584, "grad_norm": 0.29905441403388977, "learning_rate": 9.270937942171189e-05, "loss": 1.8188, "step": 19400 }, { "epoch": 1.0514302166071097, "eval_loss": 2.616773843765259, "eval_runtime": 22.0168, "eval_samples_per_second": 227.1, "eval_steps_per_second": 1.226, "step": 19401 }, { "epoch": 1.0519179636560705, "grad_norm": 0.3919777274131775, "learning_rate": 9.270073432293814e-05, "loss": 1.8129, "step": 19410 }, { "epoch": 1.0524599048215828, "grad_norm": 0.37216314673423767, "learning_rate": 9.26920845539611e-05, "loss": 1.8217, "step": 19420 }, { "epoch": 1.0530018459870951, "grad_norm": 0.3143322765827179, "learning_rate": 9.268343011585227e-05, "loss": 1.8239, "step": 19430 }, { "epoch": 1.0535437871526072, "grad_norm": 0.6800436973571777, "learning_rate": 9.267477100968372e-05, "loss": 1.8202, "step": 19440 }, { "epoch": 1.0540857283181195, "grad_norm": 0.35689041018486023, "learning_rate": 9.266610723652813e-05, "loss": 1.82, "step": 19450 }, { "epoch": 1.0546276694836316, "grad_norm": 0.29921287298202515, "learning_rate": 9.265743879745867e-05, "loss": 1.8124, "step": 19460 }, { "epoch": 1.055169610649144, "grad_norm": 0.47158485651016235, "learning_rate": 9.264876569354921e-05, "loss": 1.8158, "step": 19470 }, { "epoch": 1.055711551814656, "grad_norm": 0.32789716124534607, "learning_rate": 9.26400879258741e-05, "loss": 1.8313, "step": 19480 }, { "epoch": 1.0561451047470658, "eval_loss": 2.605008840560913, "eval_runtime": 21.9889, "eval_samples_per_second": 227.388, "eval_steps_per_second": 1.228, "step": 19488 }, { "epoch": 1.0562534929801684, "grad_norm": 0.3259328007698059, "learning_rate": 9.263140549550832e-05, "loss": 1.8178, "step": 19490 }, { "epoch": 1.0567954341456804, "grad_norm": 0.27712419629096985, "learning_rate": 9.26227184035274e-05, "loss": 1.8237, "step": 19500 }, { "epoch": 1.0573373753111928, "grad_norm": 0.7220262289047241, "learning_rate": 9.261402665100746e-05, "loss": 1.8208, "step": 19510 }, { "epoch": 1.057879316476705, "grad_norm": 0.4974667429924011, "learning_rate": 9.260533023902522e-05, "loss": 1.8214, "step": 19520 }, { "epoch": 1.0584212576422172, "grad_norm": 0.4250205457210541, "learning_rate": 9.259662916865792e-05, "loss": 1.8159, "step": 19530 }, { "epoch": 1.0589631988077295, "grad_norm": 0.4620699882507324, "learning_rate": 9.258792344098344e-05, "loss": 1.806, "step": 19540 }, { "epoch": 1.0595051399732416, "grad_norm": 0.36918604373931885, "learning_rate": 9.257921305708018e-05, "loss": 1.8078, "step": 19550 }, { "epoch": 1.060047081138754, "grad_norm": 0.3109738528728485, "learning_rate": 9.257049801802716e-05, "loss": 1.8172, "step": 19560 }, { "epoch": 1.060589022304266, "grad_norm": 0.5203297138214111, "learning_rate": 9.256177832490398e-05, "loss": 1.816, "step": 19570 }, { "epoch": 1.0608599928870222, "eval_loss": 2.603193759918213, "eval_runtime": 21.9951, "eval_samples_per_second": 227.323, "eval_steps_per_second": 1.228, "step": 19575 }, { "epoch": 1.0611309634697783, "grad_norm": 0.29931849241256714, "learning_rate": 9.255305397879076e-05, "loss": 1.8155, "step": 19580 }, { "epoch": 1.0616729046352906, "grad_norm": 0.3699570298194885, "learning_rate": 9.254432498076826e-05, "loss": 1.8154, "step": 19590 }, { "epoch": 1.0622148458008027, "grad_norm": 0.2735179364681244, "learning_rate": 9.253559133191779e-05, "loss": 1.8209, "step": 19600 }, { "epoch": 1.062756786966315, "grad_norm": 0.31726008653640747, "learning_rate": 9.252685303332123e-05, "loss": 1.8278, "step": 19610 }, { "epoch": 1.0632987281318271, "grad_norm": 0.6221383213996887, "learning_rate": 9.251811008606102e-05, "loss": 1.825, "step": 19620 }, { "epoch": 1.0638406692973394, "grad_norm": 0.8196062445640564, "learning_rate": 9.250936249122023e-05, "loss": 1.8196, "step": 19630 }, { "epoch": 1.0643826104628515, "grad_norm": 0.2812905013561249, "learning_rate": 9.250061024988246e-05, "loss": 1.8123, "step": 19640 }, { "epoch": 1.0649245516283639, "grad_norm": 0.4467778205871582, "learning_rate": 9.249185336313191e-05, "loss": 1.8098, "step": 19650 }, { "epoch": 1.0654664927938762, "grad_norm": 0.36650586128234863, "learning_rate": 9.248309183205334e-05, "loss": 1.8245, "step": 19660 }, { "epoch": 1.0655748810269785, "eval_loss": 2.5961737632751465, "eval_runtime": 21.9861, "eval_samples_per_second": 227.416, "eval_steps_per_second": 1.228, "step": 19662 }, { "epoch": 1.0660084339593883, "grad_norm": 0.351523220539093, "learning_rate": 9.247432565773209e-05, "loss": 1.8098, "step": 19670 }, { "epoch": 1.0665503751249006, "grad_norm": 0.4025458097457886, "learning_rate": 9.246555484125407e-05, "loss": 1.8208, "step": 19680 }, { "epoch": 1.0670923162904127, "grad_norm": 0.5277726650238037, "learning_rate": 9.245677938370578e-05, "loss": 1.8235, "step": 19690 }, { "epoch": 1.067634257455925, "grad_norm": 0.3863029181957245, "learning_rate": 9.244799928617427e-05, "loss": 1.8074, "step": 19700 }, { "epoch": 1.068176198621437, "grad_norm": 0.4278218746185303, "learning_rate": 9.24392145497472e-05, "loss": 1.8179, "step": 19710 }, { "epoch": 1.0687181397869494, "grad_norm": 0.30992093682289124, "learning_rate": 9.243042517551277e-05, "loss": 1.8083, "step": 19720 }, { "epoch": 1.0692600809524615, "grad_norm": 0.92547607421875, "learning_rate": 9.242163116455979e-05, "loss": 1.8194, "step": 19730 }, { "epoch": 1.0698020221179738, "grad_norm": 0.297558069229126, "learning_rate": 9.241283251797759e-05, "loss": 1.8245, "step": 19740 }, { "epoch": 1.0702897691669349, "eval_loss": 2.59582781791687, "eval_runtime": 21.9957, "eval_samples_per_second": 227.317, "eval_steps_per_second": 1.228, "step": 19749 }, { "epoch": 1.0703439632834861, "grad_norm": 0.34201860427856445, "learning_rate": 9.240402923685613e-05, "loss": 1.8202, "step": 19750 }, { "epoch": 1.0708859044489982, "grad_norm": 0.5197858810424805, "learning_rate": 9.239522132228594e-05, "loss": 1.8104, "step": 19760 }, { "epoch": 1.0714278456145105, "grad_norm": 0.5215007662773132, "learning_rate": 9.238640877535809e-05, "loss": 1.8128, "step": 19770 }, { "epoch": 1.0719697867800226, "grad_norm": 0.3624344766139984, "learning_rate": 9.237759159716421e-05, "loss": 1.8203, "step": 19780 }, { "epoch": 1.072511727945535, "grad_norm": 0.3879830241203308, "learning_rate": 9.236876978879657e-05, "loss": 1.8119, "step": 19790 }, { "epoch": 1.073053669111047, "grad_norm": 0.3035085201263428, "learning_rate": 9.235994335134798e-05, "loss": 1.8118, "step": 19800 }, { "epoch": 1.0735956102765594, "grad_norm": 1.0123307704925537, "learning_rate": 9.23511122859118e-05, "loss": 1.8119, "step": 19810 }, { "epoch": 1.0741375514420715, "grad_norm": 0.46915149688720703, "learning_rate": 9.234227659358197e-05, "loss": 1.8163, "step": 19820 }, { "epoch": 1.0746794926075838, "grad_norm": 0.48051413893699646, "learning_rate": 9.233343627545307e-05, "loss": 1.8178, "step": 19830 }, { "epoch": 1.0750046573068912, "eval_loss": 2.6102840900421143, "eval_runtime": 21.9902, "eval_samples_per_second": 227.374, "eval_steps_per_second": 1.228, "step": 19836 }, { "epoch": 1.075221433773096, "grad_norm": 0.7908685207366943, "learning_rate": 9.232459133262016e-05, "loss": 1.8174, "step": 19840 }, { "epoch": 1.0757633749386082, "grad_norm": 0.4490140974521637, "learning_rate": 9.231574176617893e-05, "loss": 1.821, "step": 19850 }, { "epoch": 1.0763053161041205, "grad_norm": 0.6310442686080933, "learning_rate": 9.230688757722562e-05, "loss": 1.8002, "step": 19860 }, { "epoch": 1.0768472572696326, "grad_norm": 0.47709372639656067, "learning_rate": 9.229802876685702e-05, "loss": 1.817, "step": 19870 }, { "epoch": 1.077389198435145, "grad_norm": 0.3235110640525818, "learning_rate": 9.228916533617057e-05, "loss": 1.8022, "step": 19880 }, { "epoch": 1.077931139600657, "grad_norm": 0.2803106904029846, "learning_rate": 9.228029728626421e-05, "loss": 1.8075, "step": 19890 }, { "epoch": 1.0784730807661693, "grad_norm": 0.290698766708374, "learning_rate": 9.227142461823648e-05, "loss": 1.8064, "step": 19900 }, { "epoch": 1.0790150219316816, "grad_norm": 0.7563320398330688, "learning_rate": 9.226254733318648e-05, "loss": 1.8079, "step": 19910 }, { "epoch": 1.0795569630971937, "grad_norm": 0.4314441382884979, "learning_rate": 9.22536654322139e-05, "loss": 1.7955, "step": 19920 }, { "epoch": 1.0797195454468473, "eval_loss": 2.596864938735962, "eval_runtime": 21.9923, "eval_samples_per_second": 227.352, "eval_steps_per_second": 1.228, "step": 19923 }, { "epoch": 1.080098904262706, "grad_norm": 0.30512261390686035, "learning_rate": 9.224477891641897e-05, "loss": 1.8072, "step": 19930 }, { "epoch": 1.0806408454282181, "grad_norm": 0.5094988346099854, "learning_rate": 9.223588778690255e-05, "loss": 1.8176, "step": 19940 }, { "epoch": 1.0811827865937305, "grad_norm": 0.47923043370246887, "learning_rate": 9.222699204476599e-05, "loss": 1.8203, "step": 19950 }, { "epoch": 1.0817247277592426, "grad_norm": 0.3491983711719513, "learning_rate": 9.221809169111129e-05, "loss": 1.8097, "step": 19960 }, { "epoch": 1.0822666689247549, "grad_norm": 0.5780982971191406, "learning_rate": 9.220918672704099e-05, "loss": 1.8065, "step": 19970 }, { "epoch": 1.0828086100902672, "grad_norm": 0.37517765164375305, "learning_rate": 9.220027715365817e-05, "loss": 1.8102, "step": 19980 }, { "epoch": 1.0833505512557793, "grad_norm": 0.582490086555481, "learning_rate": 9.219136297206652e-05, "loss": 1.8215, "step": 19990 }, { "epoch": 1.0838924924212916, "grad_norm": 0.5589390397071838, "learning_rate": 9.218244418337028e-05, "loss": 1.8206, "step": 20000 }, { "epoch": 1.0844344335868037, "grad_norm": 0.5232866406440735, "learning_rate": 9.217352078867431e-05, "loss": 1.8069, "step": 20010 }, { "epoch": 1.0844344335868037, "eval_loss": 2.6093404293060303, "eval_runtime": 21.9876, "eval_samples_per_second": 227.401, "eval_steps_per_second": 1.228, "step": 20010 }, { "epoch": 1.084976374752316, "grad_norm": 0.3265039026737213, "learning_rate": 9.216459278908398e-05, "loss": 1.7983, "step": 20020 }, { "epoch": 1.085518315917828, "grad_norm": 0.3922523558139801, "learning_rate": 9.215566018570523e-05, "loss": 1.7976, "step": 20030 }, { "epoch": 1.0860602570833404, "grad_norm": 0.35444027185440063, "learning_rate": 9.214672297964461e-05, "loss": 1.7977, "step": 20040 }, { "epoch": 1.0866021982488525, "grad_norm": 0.5408755540847778, "learning_rate": 9.213778117200926e-05, "loss": 1.8022, "step": 20050 }, { "epoch": 1.0871441394143648, "grad_norm": 0.5063872337341309, "learning_rate": 9.212883476390677e-05, "loss": 1.8088, "step": 20060 }, { "epoch": 1.0876860805798771, "grad_norm": 0.39630988240242004, "learning_rate": 9.211988375644543e-05, "loss": 1.7955, "step": 20070 }, { "epoch": 1.0882280217453892, "grad_norm": 0.3347756564617157, "learning_rate": 9.211092815073407e-05, "loss": 1.7925, "step": 20080 }, { "epoch": 1.0887699629109016, "grad_norm": 0.3801024854183197, "learning_rate": 9.210196794788203e-05, "loss": 1.8096, "step": 20090 }, { "epoch": 1.08914932172676, "eval_loss": 2.6098368167877197, "eval_runtime": 21.9921, "eval_samples_per_second": 227.354, "eval_steps_per_second": 1.228, "step": 20097 }, { "epoch": 1.0893119040764137, "grad_norm": 0.4123273491859436, "learning_rate": 9.20930031489993e-05, "loss": 1.7988, "step": 20100 }, { "epoch": 1.089853845241926, "grad_norm": 0.6030639410018921, "learning_rate": 9.208403375519637e-05, "loss": 1.7927, "step": 20110 }, { "epoch": 1.090395786407438, "grad_norm": 0.3275871276855469, "learning_rate": 9.207505976758434e-05, "loss": 1.7937, "step": 20120 }, { "epoch": 1.0909377275729504, "grad_norm": 0.4877399206161499, "learning_rate": 9.206608118727488e-05, "loss": 1.811, "step": 20130 }, { "epoch": 1.0914796687384625, "grad_norm": 0.2985506057739258, "learning_rate": 9.20570980153802e-05, "loss": 1.812, "step": 20140 }, { "epoch": 1.0920216099039748, "grad_norm": 0.29198411107063293, "learning_rate": 9.204811025301311e-05, "loss": 1.803, "step": 20150 }, { "epoch": 1.092563551069487, "grad_norm": 0.3116692006587982, "learning_rate": 9.203911790128696e-05, "loss": 1.8113, "step": 20160 }, { "epoch": 1.0931054922349992, "grad_norm": 0.3624420464038849, "learning_rate": 9.20301209613157e-05, "loss": 1.794, "step": 20170 }, { "epoch": 1.0936474334005115, "grad_norm": 0.5469610095024109, "learning_rate": 9.202111943421381e-05, "loss": 1.8038, "step": 20180 }, { "epoch": 1.0938642098667164, "eval_loss": 2.607426404953003, "eval_runtime": 21.9928, "eval_samples_per_second": 227.347, "eval_steps_per_second": 1.228, "step": 20184 }, { "epoch": 1.0941893745660236, "grad_norm": 0.5207812786102295, "learning_rate": 9.201211332109639e-05, "loss": 1.7874, "step": 20190 }, { "epoch": 1.094731315731536, "grad_norm": 0.3748219311237335, "learning_rate": 9.200310262307905e-05, "loss": 1.7988, "step": 20200 }, { "epoch": 1.095273256897048, "grad_norm": 0.34070441126823425, "learning_rate": 9.199408734127801e-05, "loss": 1.8057, "step": 20210 }, { "epoch": 1.0958151980625603, "grad_norm": 0.32609522342681885, "learning_rate": 9.198506747681005e-05, "loss": 1.8109, "step": 20220 }, { "epoch": 1.0963571392280727, "grad_norm": 0.4092683792114258, "learning_rate": 9.19760430307925e-05, "loss": 1.7954, "step": 20230 }, { "epoch": 1.0968990803935847, "grad_norm": 0.5064677000045776, "learning_rate": 9.196701400434327e-05, "loss": 1.8074, "step": 20240 }, { "epoch": 1.097441021559097, "grad_norm": 0.3224587142467499, "learning_rate": 9.195798039858084e-05, "loss": 1.7973, "step": 20250 }, { "epoch": 1.0979829627246092, "grad_norm": 0.5034688711166382, "learning_rate": 9.194894221462427e-05, "loss": 1.7994, "step": 20260 }, { "epoch": 1.0985249038901215, "grad_norm": 0.3772341310977936, "learning_rate": 9.193989945359314e-05, "loss": 1.7817, "step": 20270 }, { "epoch": 1.0985790980066727, "eval_loss": 2.6105637550354004, "eval_runtime": 21.9937, "eval_samples_per_second": 227.338, "eval_steps_per_second": 1.228, "step": 20271 }, { "epoch": 1.0990668450556336, "grad_norm": 0.8809638619422913, "learning_rate": 9.193085211660764e-05, "loss": 1.8056, "step": 20280 }, { "epoch": 1.0996087862211459, "grad_norm": 0.6711058020591736, "learning_rate": 9.192180020478852e-05, "loss": 1.7951, "step": 20290 }, { "epoch": 1.1001507273866582, "grad_norm": 0.8903740048408508, "learning_rate": 9.19127437192571e-05, "loss": 1.8076, "step": 20300 }, { "epoch": 1.1006926685521703, "grad_norm": 0.8000921010971069, "learning_rate": 9.190368266113524e-05, "loss": 1.811, "step": 20310 }, { "epoch": 1.1012346097176826, "grad_norm": 0.4838610589504242, "learning_rate": 9.189461703154538e-05, "loss": 1.809, "step": 20320 }, { "epoch": 1.1017765508831947, "grad_norm": 0.5514103174209595, "learning_rate": 9.188554683161056e-05, "loss": 1.7987, "step": 20330 }, { "epoch": 1.102318492048707, "grad_norm": 0.2846459448337555, "learning_rate": 9.187647206245434e-05, "loss": 1.8004, "step": 20340 }, { "epoch": 1.1028604332142191, "grad_norm": 0.45695552229881287, "learning_rate": 9.186739272520085e-05, "loss": 1.7999, "step": 20350 }, { "epoch": 1.1032939861466289, "eval_loss": 2.5855958461761475, "eval_runtime": 21.9945, "eval_samples_per_second": 227.33, "eval_steps_per_second": 1.228, "step": 20358 }, { "epoch": 1.1034023743797314, "grad_norm": 0.3189384937286377, "learning_rate": 9.185830882097482e-05, "loss": 1.8023, "step": 20360 }, { "epoch": 1.1039443155452435, "grad_norm": 0.5680913925170898, "learning_rate": 9.184922035090151e-05, "loss": 1.8008, "step": 20370 }, { "epoch": 1.1044862567107558, "grad_norm": 0.31937944889068604, "learning_rate": 9.184012731610676e-05, "loss": 1.7983, "step": 20380 }, { "epoch": 1.1050281978762682, "grad_norm": 0.9807746410369873, "learning_rate": 9.1831029717717e-05, "loss": 1.7981, "step": 20390 }, { "epoch": 1.1055701390417803, "grad_norm": 1.0870219469070435, "learning_rate": 9.182192755685917e-05, "loss": 1.7879, "step": 20400 }, { "epoch": 1.1061120802072926, "grad_norm": 0.45273005962371826, "learning_rate": 9.181282083466082e-05, "loss": 1.7992, "step": 20410 }, { "epoch": 1.1066540213728047, "grad_norm": 0.7691323757171631, "learning_rate": 9.180370955225006e-05, "loss": 1.7918, "step": 20420 }, { "epoch": 1.107195962538317, "grad_norm": 0.5909658670425415, "learning_rate": 9.179459371075554e-05, "loss": 1.802, "step": 20430 }, { "epoch": 1.107737903703829, "grad_norm": 0.3207308053970337, "learning_rate": 9.178547331130649e-05, "loss": 1.7934, "step": 20440 }, { "epoch": 1.1080088742865852, "eval_loss": 2.5942413806915283, "eval_runtime": 21.9938, "eval_samples_per_second": 227.337, "eval_steps_per_second": 1.228, "step": 20445 }, { "epoch": 1.1082798448693414, "grad_norm": 0.3061155378818512, "learning_rate": 9.177634835503272e-05, "loss": 1.7894, "step": 20450 }, { "epoch": 1.1088217860348535, "grad_norm": 0.35638898611068726, "learning_rate": 9.176721884306459e-05, "loss": 1.7893, "step": 20460 }, { "epoch": 1.1093637272003658, "grad_norm": 0.351608008146286, "learning_rate": 9.1758084776533e-05, "loss": 1.7951, "step": 20470 }, { "epoch": 1.1099056683658781, "grad_norm": 0.3291257321834564, "learning_rate": 9.174894615656948e-05, "loss": 1.7911, "step": 20480 }, { "epoch": 1.1104476095313902, "grad_norm": 0.2714458703994751, "learning_rate": 9.173980298430604e-05, "loss": 1.795, "step": 20490 }, { "epoch": 1.1109895506969025, "grad_norm": 0.3953942358493805, "learning_rate": 9.173065526087531e-05, "loss": 1.7912, "step": 20500 }, { "epoch": 1.1115314918624146, "grad_norm": 0.3956896662712097, "learning_rate": 9.17215029874105e-05, "loss": 1.7911, "step": 20510 }, { "epoch": 1.112073433027927, "grad_norm": 0.3793238401412964, "learning_rate": 9.17123461650453e-05, "loss": 1.7972, "step": 20520 }, { "epoch": 1.112615374193439, "grad_norm": 0.4785431921482086, "learning_rate": 9.170318479491406e-05, "loss": 1.7961, "step": 20530 }, { "epoch": 1.1127237624265416, "eval_loss": 2.584057331085205, "eval_runtime": 21.9901, "eval_samples_per_second": 227.375, "eval_steps_per_second": 1.228, "step": 20532 }, { "epoch": 1.1131573153589513, "grad_norm": 0.4006010890007019, "learning_rate": 9.169401887815164e-05, "loss": 1.7861, "step": 20540 }, { "epoch": 1.1136992565244637, "grad_norm": 0.2974483072757721, "learning_rate": 9.168484841589346e-05, "loss": 1.7885, "step": 20550 }, { "epoch": 1.1142411976899758, "grad_norm": 0.7173396348953247, "learning_rate": 9.167567340927552e-05, "loss": 1.8022, "step": 20560 }, { "epoch": 1.114783138855488, "grad_norm": 0.46824324131011963, "learning_rate": 9.166649385943441e-05, "loss": 1.7895, "step": 20570 }, { "epoch": 1.1153250800210002, "grad_norm": 0.6654873490333557, "learning_rate": 9.165730976750722e-05, "loss": 1.7898, "step": 20580 }, { "epoch": 1.1158670211865125, "grad_norm": 0.27924737334251404, "learning_rate": 9.164812113463165e-05, "loss": 1.783, "step": 20590 }, { "epoch": 1.1164089623520246, "grad_norm": 0.5301341414451599, "learning_rate": 9.163892796194594e-05, "loss": 1.7854, "step": 20600 }, { "epoch": 1.116950903517537, "grad_norm": 0.357898086309433, "learning_rate": 9.162973025058891e-05, "loss": 1.7834, "step": 20610 }, { "epoch": 1.117438650566498, "eval_loss": 2.5919032096862793, "eval_runtime": 21.993, "eval_samples_per_second": 227.345, "eval_steps_per_second": 1.228, "step": 20619 }, { "epoch": 1.1174928446830492, "grad_norm": 0.3866696059703827, "learning_rate": 9.162052800169992e-05, "loss": 1.7898, "step": 20620 }, { "epoch": 1.1180347858485613, "grad_norm": 0.30722272396087646, "learning_rate": 9.161132121641892e-05, "loss": 1.7837, "step": 20630 }, { "epoch": 1.1185767270140736, "grad_norm": 0.36760374903678894, "learning_rate": 9.160210989588639e-05, "loss": 1.7846, "step": 20640 }, { "epoch": 1.1191186681795857, "grad_norm": 0.48524531722068787, "learning_rate": 9.15928940412434e-05, "loss": 1.7822, "step": 20650 }, { "epoch": 1.119660609345098, "grad_norm": 0.41821709275245667, "learning_rate": 9.158367365363157e-05, "loss": 1.7902, "step": 20660 }, { "epoch": 1.1202025505106101, "grad_norm": 0.5472697615623474, "learning_rate": 9.157444873419307e-05, "loss": 1.7975, "step": 20670 }, { "epoch": 1.1207444916761224, "grad_norm": 0.3065122663974762, "learning_rate": 9.156521928407066e-05, "loss": 1.7851, "step": 20680 }, { "epoch": 1.1212864328416345, "grad_norm": 0.5732535719871521, "learning_rate": 9.155598530440763e-05, "loss": 1.7849, "step": 20690 }, { "epoch": 1.1218283740071469, "grad_norm": 0.3151445686817169, "learning_rate": 9.154674679634786e-05, "loss": 1.7931, "step": 20700 }, { "epoch": 1.122153538706454, "eval_loss": 2.5944461822509766, "eval_runtime": 21.9928, "eval_samples_per_second": 227.348, "eval_steps_per_second": 1.228, "step": 20706 }, { "epoch": 1.1223703151726592, "grad_norm": 0.3001176714897156, "learning_rate": 9.153750376103575e-05, "loss": 1.7924, "step": 20710 }, { "epoch": 1.1229122563381713, "grad_norm": 0.2985573709011078, "learning_rate": 9.152825619961633e-05, "loss": 1.7838, "step": 20720 }, { "epoch": 1.1234541975036836, "grad_norm": 0.3202672600746155, "learning_rate": 9.151900411323509e-05, "loss": 1.7899, "step": 20730 }, { "epoch": 1.1239961386691957, "grad_norm": 0.3624779284000397, "learning_rate": 9.150974750303818e-05, "loss": 1.779, "step": 20740 }, { "epoch": 1.124538079834708, "grad_norm": 0.47234293818473816, "learning_rate": 9.150048637017226e-05, "loss": 1.7977, "step": 20750 }, { "epoch": 1.12508002100022, "grad_norm": 0.4014720618724823, "learning_rate": 9.149122071578457e-05, "loss": 1.7887, "step": 20760 }, { "epoch": 1.1256219621657324, "grad_norm": 0.8474875688552856, "learning_rate": 9.148195054102289e-05, "loss": 1.7889, "step": 20770 }, { "epoch": 1.1261639033312445, "grad_norm": 0.3223975598812103, "learning_rate": 9.147267584703554e-05, "loss": 1.7931, "step": 20780 }, { "epoch": 1.1267058444967568, "grad_norm": 0.5686293840408325, "learning_rate": 9.146339663497148e-05, "loss": 1.7829, "step": 20790 }, { "epoch": 1.1268684268464104, "eval_loss": 2.596271276473999, "eval_runtime": 21.9897, "eval_samples_per_second": 227.38, "eval_steps_per_second": 1.228, "step": 20793 }, { "epoch": 1.1272477856622691, "grad_norm": 0.4797728359699249, "learning_rate": 9.145411290598013e-05, "loss": 1.7919, "step": 20800 }, { "epoch": 1.1277897268277812, "grad_norm": 0.33035770058631897, "learning_rate": 9.144482466121157e-05, "loss": 1.7981, "step": 20810 }, { "epoch": 1.1283316679932935, "grad_norm": 0.6441662311553955, "learning_rate": 9.143553190181633e-05, "loss": 1.7875, "step": 20820 }, { "epoch": 1.1288736091588056, "grad_norm": 0.5361613035202026, "learning_rate": 9.14262346289456e-05, "loss": 1.7806, "step": 20830 }, { "epoch": 1.129415550324318, "grad_norm": 0.32189682126045227, "learning_rate": 9.141693284375106e-05, "loss": 1.7967, "step": 20840 }, { "epoch": 1.12995749148983, "grad_norm": 0.28092676401138306, "learning_rate": 9.140762654738499e-05, "loss": 1.7902, "step": 20850 }, { "epoch": 1.1304994326553424, "grad_norm": 0.29651352763175964, "learning_rate": 9.139831574100022e-05, "loss": 1.7905, "step": 20860 }, { "epoch": 1.1310413738208545, "grad_norm": 0.3914618492126465, "learning_rate": 9.138900042575012e-05, "loss": 1.7863, "step": 20870 }, { "epoch": 1.1315833149863668, "grad_norm": 0.33878007531166077, "learning_rate": 9.137968060278863e-05, "loss": 1.7872, "step": 20880 }, { "epoch": 1.1315833149863668, "eval_loss": 2.5968291759490967, "eval_runtime": 21.9881, "eval_samples_per_second": 227.396, "eval_steps_per_second": 1.228, "step": 20880 }, { "epoch": 1.132125256151879, "grad_norm": 0.584201455116272, "learning_rate": 9.137035627327026e-05, "loss": 1.7768, "step": 20890 }, { "epoch": 1.1326671973173912, "grad_norm": 0.3373468816280365, "learning_rate": 9.136102743835004e-05, "loss": 1.7857, "step": 20900 }, { "epoch": 1.1332091384829035, "grad_norm": 0.2604641020298004, "learning_rate": 9.13516940991836e-05, "loss": 1.7847, "step": 20910 }, { "epoch": 1.1337510796484156, "grad_norm": 0.28360849618911743, "learning_rate": 9.134235625692714e-05, "loss": 1.7903, "step": 20920 }, { "epoch": 1.134293020813928, "grad_norm": 0.4277685284614563, "learning_rate": 9.133301391273736e-05, "loss": 1.7813, "step": 20930 }, { "epoch": 1.1348349619794402, "grad_norm": 0.6790282726287842, "learning_rate": 9.132366706777155e-05, "loss": 1.792, "step": 20940 }, { "epoch": 1.1353769031449523, "grad_norm": 0.4232507050037384, "learning_rate": 9.13143157231876e-05, "loss": 1.7832, "step": 20950 }, { "epoch": 1.1359188443104646, "grad_norm": 0.5046867728233337, "learning_rate": 9.130495988014384e-05, "loss": 1.7758, "step": 20960 }, { "epoch": 1.1362982031263231, "eval_loss": 2.590829372406006, "eval_runtime": 21.9886, "eval_samples_per_second": 227.391, "eval_steps_per_second": 1.228, "step": 20967 }, { "epoch": 1.1364607854759767, "grad_norm": 0.3382951021194458, "learning_rate": 9.129559953979928e-05, "loss": 1.7858, "step": 20970 }, { "epoch": 1.137002726641489, "grad_norm": 0.33747395873069763, "learning_rate": 9.128623470331343e-05, "loss": 1.7828, "step": 20980 }, { "epoch": 1.1375446678070011, "grad_norm": 0.2978461682796478, "learning_rate": 9.127686537184636e-05, "loss": 1.7839, "step": 20990 }, { "epoch": 1.1380866089725135, "grad_norm": 0.5556627511978149, "learning_rate": 9.126749154655872e-05, "loss": 1.7826, "step": 21000 }, { "epoch": 1.1386285501380256, "grad_norm": 0.3020962178707123, "learning_rate": 9.125811322861168e-05, "loss": 1.7779, "step": 21010 }, { "epoch": 1.1391704913035379, "grad_norm": 0.5155685544013977, "learning_rate": 9.1248730419167e-05, "loss": 1.7912, "step": 21020 }, { "epoch": 1.1397124324690502, "grad_norm": 1.0742664337158203, "learning_rate": 9.123934311938696e-05, "loss": 1.7943, "step": 21030 }, { "epoch": 1.1402543736345623, "grad_norm": 0.6324855089187622, "learning_rate": 9.122995133043442e-05, "loss": 1.7778, "step": 21040 }, { "epoch": 1.1407963148000746, "grad_norm": 0.33419859409332275, "learning_rate": 9.122055505347283e-05, "loss": 1.7839, "step": 21050 }, { "epoch": 1.1410130912662795, "eval_loss": 2.5992467403411865, "eval_runtime": 21.9929, "eval_samples_per_second": 227.346, "eval_steps_per_second": 1.228, "step": 21054 }, { "epoch": 1.1413382559655867, "grad_norm": 0.30402234196662903, "learning_rate": 9.12111542896661e-05, "loss": 1.7818, "step": 21060 }, { "epoch": 1.141880197131099, "grad_norm": 0.28443703055381775, "learning_rate": 9.120174904017882e-05, "loss": 1.7698, "step": 21070 }, { "epoch": 1.142422138296611, "grad_norm": 0.29675841331481934, "learning_rate": 9.119233930617603e-05, "loss": 1.7764, "step": 21080 }, { "epoch": 1.1429640794621234, "grad_norm": 0.6041000485420227, "learning_rate": 9.118292508882338e-05, "loss": 1.7697, "step": 21090 }, { "epoch": 1.1435060206276355, "grad_norm": 0.3296681046485901, "learning_rate": 9.117350638928706e-05, "loss": 1.7884, "step": 21100 }, { "epoch": 1.1440479617931478, "grad_norm": 0.5075874924659729, "learning_rate": 9.11640832087338e-05, "loss": 1.7874, "step": 21110 }, { "epoch": 1.1445899029586601, "grad_norm": 0.5272268056869507, "learning_rate": 9.115465554833095e-05, "loss": 1.7755, "step": 21120 }, { "epoch": 1.1451318441241722, "grad_norm": 0.6501821875572205, "learning_rate": 9.114522340924631e-05, "loss": 1.7839, "step": 21130 }, { "epoch": 1.1456737852896846, "grad_norm": 0.2923777103424072, "learning_rate": 9.113578679264835e-05, "loss": 1.7769, "step": 21140 }, { "epoch": 1.1457279794062356, "eval_loss": 2.5842106342315674, "eval_runtime": 22.0002, "eval_samples_per_second": 227.27, "eval_steps_per_second": 1.227, "step": 21141 }, { "epoch": 1.1462157264551966, "grad_norm": 0.3021109402179718, "learning_rate": 9.1126345699706e-05, "loss": 1.7834, "step": 21150 }, { "epoch": 1.146757667620709, "grad_norm": 0.30015039443969727, "learning_rate": 9.111690013158877e-05, "loss": 1.7723, "step": 21160 }, { "epoch": 1.147299608786221, "grad_norm": 0.7381042838096619, "learning_rate": 9.110745008946678e-05, "loss": 1.7824, "step": 21170 }, { "epoch": 1.1478415499517334, "grad_norm": 0.39448803663253784, "learning_rate": 9.109799557451062e-05, "loss": 1.7873, "step": 21180 }, { "epoch": 1.1483834911172455, "grad_norm": 0.2745918035507202, "learning_rate": 9.108853658789149e-05, "loss": 1.7832, "step": 21190 }, { "epoch": 1.1489254322827578, "grad_norm": 0.3459675908088684, "learning_rate": 9.107907313078115e-05, "loss": 1.7695, "step": 21200 }, { "epoch": 1.14946737344827, "grad_norm": 0.30421921610832214, "learning_rate": 9.106960520435183e-05, "loss": 1.7735, "step": 21210 }, { "epoch": 1.1500093146137822, "grad_norm": 0.2987232804298401, "learning_rate": 9.106013280977645e-05, "loss": 1.7777, "step": 21220 }, { "epoch": 1.150442867546192, "eval_loss": 2.5747172832489014, "eval_runtime": 21.9936, "eval_samples_per_second": 227.339, "eval_steps_per_second": 1.228, "step": 21228 }, { "epoch": 1.1505512557792945, "grad_norm": 0.29645800590515137, "learning_rate": 9.105065594822838e-05, "loss": 1.7794, "step": 21230 }, { "epoch": 1.1510931969448066, "grad_norm": 0.3672481179237366, "learning_rate": 9.104117462088154e-05, "loss": 1.7751, "step": 21240 }, { "epoch": 1.151635138110319, "grad_norm": 0.27135440707206726, "learning_rate": 9.103168882891047e-05, "loss": 1.7841, "step": 21250 }, { "epoch": 1.1521770792758312, "grad_norm": 0.5033676624298096, "learning_rate": 9.102219857349024e-05, "loss": 1.776, "step": 21260 }, { "epoch": 1.1527190204413433, "grad_norm": 0.4527705907821655, "learning_rate": 9.101270385579643e-05, "loss": 1.776, "step": 21270 }, { "epoch": 1.1532609616068557, "grad_norm": 0.4339083433151245, "learning_rate": 9.100320467700521e-05, "loss": 1.7749, "step": 21280 }, { "epoch": 1.1538029027723677, "grad_norm": 0.28354117274284363, "learning_rate": 9.099370103829332e-05, "loss": 1.7802, "step": 21290 }, { "epoch": 1.15434484393788, "grad_norm": 0.2814236283302307, "learning_rate": 9.098419294083801e-05, "loss": 1.7651, "step": 21300 }, { "epoch": 1.1548867851033922, "grad_norm": 0.32581958174705505, "learning_rate": 9.09746803858171e-05, "loss": 1.7867, "step": 21310 }, { "epoch": 1.1551577556861483, "eval_loss": 2.5878617763519287, "eval_runtime": 21.9939, "eval_samples_per_second": 227.336, "eval_steps_per_second": 1.228, "step": 21315 }, { "epoch": 1.1554287262689045, "grad_norm": 0.742839515209198, "learning_rate": 9.096516337440898e-05, "loss": 1.7793, "step": 21320 }, { "epoch": 1.1559706674344166, "grad_norm": 0.27201151847839355, "learning_rate": 9.095564190779257e-05, "loss": 1.7819, "step": 21330 }, { "epoch": 1.1565126085999289, "grad_norm": 0.2983904778957367, "learning_rate": 9.094611598714733e-05, "loss": 1.7799, "step": 21340 }, { "epoch": 1.1570545497654412, "grad_norm": 0.2776441276073456, "learning_rate": 9.09365856136533e-05, "loss": 1.7716, "step": 21350 }, { "epoch": 1.1575964909309533, "grad_norm": 0.27620989084243774, "learning_rate": 9.092705078849108e-05, "loss": 1.7662, "step": 21360 }, { "epoch": 1.1581384320964656, "grad_norm": 0.4133933484554291, "learning_rate": 9.091751151284178e-05, "loss": 1.7788, "step": 21370 }, { "epoch": 1.1586803732619777, "grad_norm": 0.587293267250061, "learning_rate": 9.090796778788709e-05, "loss": 1.7711, "step": 21380 }, { "epoch": 1.15922231442749, "grad_norm": 0.3694206178188324, "learning_rate": 9.089841961480927e-05, "loss": 1.7688, "step": 21390 }, { "epoch": 1.1597642555930021, "grad_norm": 0.5002974271774292, "learning_rate": 9.088886699479105e-05, "loss": 1.7743, "step": 21400 }, { "epoch": 1.1598726438261047, "eval_loss": 2.5802183151245117, "eval_runtime": 21.9908, "eval_samples_per_second": 227.368, "eval_steps_per_second": 1.228, "step": 21402 }, { "epoch": 1.1603061967585144, "grad_norm": 0.4875272810459137, "learning_rate": 9.087930992901581e-05, "loss": 1.7803, "step": 21410 }, { "epoch": 1.1608481379240265, "grad_norm": 0.33327093720436096, "learning_rate": 9.086974841866743e-05, "loss": 1.7743, "step": 21420 }, { "epoch": 1.1613900790895388, "grad_norm": 0.6208361983299255, "learning_rate": 9.086018246493037e-05, "loss": 1.7694, "step": 21430 }, { "epoch": 1.1619320202550512, "grad_norm": 0.4120174050331116, "learning_rate": 9.085061206898957e-05, "loss": 1.7802, "step": 21440 }, { "epoch": 1.1624739614205633, "grad_norm": 0.3111993074417114, "learning_rate": 9.08410372320306e-05, "loss": 1.7863, "step": 21450 }, { "epoch": 1.1630159025860756, "grad_norm": 0.2776833772659302, "learning_rate": 9.083145795523955e-05, "loss": 1.7794, "step": 21460 }, { "epoch": 1.1635578437515877, "grad_norm": 0.7861410975456238, "learning_rate": 9.082187423980304e-05, "loss": 1.7772, "step": 21470 }, { "epoch": 1.1640997849171, "grad_norm": 0.3405260741710663, "learning_rate": 9.081228608690828e-05, "loss": 1.7832, "step": 21480 }, { "epoch": 1.164587531966061, "eval_loss": 2.5907578468322754, "eval_runtime": 21.9898, "eval_samples_per_second": 227.378, "eval_steps_per_second": 1.228, "step": 21489 }, { "epoch": 1.164641726082612, "grad_norm": 0.28428077697753906, "learning_rate": 9.080269349774301e-05, "loss": 1.7735, "step": 21490 }, { "epoch": 1.1651836672481244, "grad_norm": 0.28834155201911926, "learning_rate": 9.079309647349549e-05, "loss": 1.7815, "step": 21500 }, { "epoch": 1.1657256084136365, "grad_norm": 0.35341453552246094, "learning_rate": 9.078349501535461e-05, "loss": 1.7785, "step": 21510 }, { "epoch": 1.1662675495791488, "grad_norm": 0.5031281113624573, "learning_rate": 9.077388912450969e-05, "loss": 1.7797, "step": 21520 }, { "epoch": 1.1668094907446611, "grad_norm": 0.439062237739563, "learning_rate": 9.076427880215072e-05, "loss": 1.7657, "step": 21530 }, { "epoch": 1.1673514319101732, "grad_norm": 0.356479287147522, "learning_rate": 9.075466404946814e-05, "loss": 1.7763, "step": 21540 }, { "epoch": 1.1678933730756855, "grad_norm": 0.3379020094871521, "learning_rate": 9.074504486765304e-05, "loss": 1.7652, "step": 21550 }, { "epoch": 1.1684353142411976, "grad_norm": 0.3034111559391022, "learning_rate": 9.073542125789695e-05, "loss": 1.7658, "step": 21560 }, { "epoch": 1.16897725540671, "grad_norm": 0.4461290240287781, "learning_rate": 9.072579322139202e-05, "loss": 1.7678, "step": 21570 }, { "epoch": 1.1693024201060171, "eval_loss": 2.5929572582244873, "eval_runtime": 21.9943, "eval_samples_per_second": 227.332, "eval_steps_per_second": 1.228, "step": 21576 }, { "epoch": 1.1695191965722223, "grad_norm": 0.25855252146720886, "learning_rate": 9.071616075933095e-05, "loss": 1.7704, "step": 21580 }, { "epoch": 1.1700611377377343, "grad_norm": 0.27765700221061707, "learning_rate": 9.070652387290695e-05, "loss": 1.7575, "step": 21590 }, { "epoch": 1.1706030789032467, "grad_norm": 0.26034823060035706, "learning_rate": 9.069688256331377e-05, "loss": 1.7625, "step": 21600 }, { "epoch": 1.1711450200687588, "grad_norm": 1.0148969888687134, "learning_rate": 9.068723683174578e-05, "loss": 1.7771, "step": 21610 }, { "epoch": 1.171686961234271, "grad_norm": 0.3433249592781067, "learning_rate": 9.067758667939782e-05, "loss": 1.769, "step": 21620 }, { "epoch": 1.1722289023997832, "grad_norm": 0.3450257480144501, "learning_rate": 9.066793210746533e-05, "loss": 1.7761, "step": 21630 }, { "epoch": 1.1727708435652955, "grad_norm": 0.29265645146369934, "learning_rate": 9.065827311714426e-05, "loss": 1.7633, "step": 21640 }, { "epoch": 1.1733127847308076, "grad_norm": 0.29442882537841797, "learning_rate": 9.064860970963112e-05, "loss": 1.7706, "step": 21650 }, { "epoch": 1.17385472589632, "grad_norm": 0.4089371860027313, "learning_rate": 9.063894188612298e-05, "loss": 1.7673, "step": 21660 }, { "epoch": 1.1740173082459735, "eval_loss": 2.585028648376465, "eval_runtime": 21.9891, "eval_samples_per_second": 227.386, "eval_steps_per_second": 1.228, "step": 21663 }, { "epoch": 1.1743966670618322, "grad_norm": 0.5681344866752625, "learning_rate": 9.062926964781746e-05, "loss": 1.7797, "step": 21670 }, { "epoch": 1.1749386082273443, "grad_norm": 0.5952972769737244, "learning_rate": 9.061959299591269e-05, "loss": 1.7781, "step": 21680 }, { "epoch": 1.1754805493928566, "grad_norm": 0.30679190158843994, "learning_rate": 9.060991193160739e-05, "loss": 1.7685, "step": 21690 }, { "epoch": 1.1760224905583687, "grad_norm": 0.37840238213539124, "learning_rate": 9.06002264561008e-05, "loss": 1.7613, "step": 21700 }, { "epoch": 1.176564431723881, "grad_norm": 0.6132546663284302, "learning_rate": 9.059053657059272e-05, "loss": 1.7842, "step": 21710 }, { "epoch": 1.1771063728893931, "grad_norm": 0.37106719613075256, "learning_rate": 9.058084227628351e-05, "loss": 1.763, "step": 21720 }, { "epoch": 1.1776483140549054, "grad_norm": 0.2650188207626343, "learning_rate": 9.057114357437401e-05, "loss": 1.7582, "step": 21730 }, { "epoch": 1.1781902552204175, "grad_norm": 0.4426226019859314, "learning_rate": 9.056144046606568e-05, "loss": 1.7725, "step": 21740 }, { "epoch": 1.1787321963859299, "grad_norm": 0.39338555932044983, "learning_rate": 9.05517329525605e-05, "loss": 1.7712, "step": 21750 }, { "epoch": 1.1787321963859299, "eval_loss": 2.59128999710083, "eval_runtime": 21.9834, "eval_samples_per_second": 227.444, "eval_steps_per_second": 1.228, "step": 21750 }, { "epoch": 1.1792741375514422, "grad_norm": 0.3727186918258667, "learning_rate": 9.054202103506098e-05, "loss": 1.7642, "step": 21760 }, { "epoch": 1.1798160787169543, "grad_norm": 0.4082069993019104, "learning_rate": 9.053230471477023e-05, "loss": 1.7736, "step": 21770 }, { "epoch": 1.1803580198824666, "grad_norm": 0.29831764101982117, "learning_rate": 9.052258399289182e-05, "loss": 1.7592, "step": 21780 }, { "epoch": 1.1808999610479787, "grad_norm": 0.4205126166343689, "learning_rate": 9.051285887062993e-05, "loss": 1.7677, "step": 21790 }, { "epoch": 1.181441902213491, "grad_norm": 0.38999685645103455, "learning_rate": 9.050312934918926e-05, "loss": 1.7681, "step": 21800 }, { "epoch": 1.181983843379003, "grad_norm": 0.34131819009780884, "learning_rate": 9.049339542977507e-05, "loss": 1.7568, "step": 21810 }, { "epoch": 1.1825257845445154, "grad_norm": 0.5073003768920898, "learning_rate": 9.048365711359317e-05, "loss": 1.7684, "step": 21820 }, { "epoch": 1.1830677257100275, "grad_norm": 0.47667044401168823, "learning_rate": 9.047391440184985e-05, "loss": 1.7634, "step": 21830 }, { "epoch": 1.1834470845258862, "eval_loss": 2.5854649543762207, "eval_runtime": 21.9926, "eval_samples_per_second": 227.349, "eval_steps_per_second": 1.228, "step": 21837 }, { "epoch": 1.1836096668755398, "grad_norm": 0.8095766305923462, "learning_rate": 9.046416729575205e-05, "loss": 1.7581, "step": 21840 }, { "epoch": 1.1841516080410521, "grad_norm": 0.3507765531539917, "learning_rate": 9.045441579650717e-05, "loss": 1.7646, "step": 21850 }, { "epoch": 1.1846935492065642, "grad_norm": 0.29202473163604736, "learning_rate": 9.044465990532318e-05, "loss": 1.7758, "step": 21860 }, { "epoch": 1.1852354903720765, "grad_norm": 0.265829861164093, "learning_rate": 9.043489962340861e-05, "loss": 1.7639, "step": 21870 }, { "epoch": 1.1857774315375886, "grad_norm": 0.2809160649776459, "learning_rate": 9.042513495197252e-05, "loss": 1.7516, "step": 21880 }, { "epoch": 1.186319372703101, "grad_norm": 0.3107336163520813, "learning_rate": 9.04153658922245e-05, "loss": 1.7647, "step": 21890 }, { "epoch": 1.186861313868613, "grad_norm": 0.5341174006462097, "learning_rate": 9.040559244537473e-05, "loss": 1.7625, "step": 21900 }, { "epoch": 1.1874032550341254, "grad_norm": 0.25601211190223694, "learning_rate": 9.039581461263388e-05, "loss": 1.7634, "step": 21910 }, { "epoch": 1.1879451961996375, "grad_norm": 0.6476098299026489, "learning_rate": 9.038603239521318e-05, "loss": 1.7637, "step": 21920 }, { "epoch": 1.1881619726658426, "eval_loss": 2.5872724056243896, "eval_runtime": 21.9906, "eval_samples_per_second": 227.37, "eval_steps_per_second": 1.228, "step": 21924 }, { "epoch": 1.1884871373651498, "grad_norm": 0.30727824568748474, "learning_rate": 9.037624579432442e-05, "loss": 1.7608, "step": 21930 }, { "epoch": 1.189029078530662, "grad_norm": 0.256548672914505, "learning_rate": 9.036645481117992e-05, "loss": 1.7675, "step": 21940 }, { "epoch": 1.1895710196961742, "grad_norm": 0.3580315113067627, "learning_rate": 9.035665944699254e-05, "loss": 1.7716, "step": 21950 }, { "epoch": 1.1901129608616865, "grad_norm": 0.2569289207458496, "learning_rate": 9.034685970297571e-05, "loss": 1.7627, "step": 21960 }, { "epoch": 1.1906549020271986, "grad_norm": 0.3389548361301422, "learning_rate": 9.033705558034335e-05, "loss": 1.7617, "step": 21970 }, { "epoch": 1.191196843192711, "grad_norm": 0.35166749358177185, "learning_rate": 9.032724708030995e-05, "loss": 1.7634, "step": 21980 }, { "epoch": 1.1917387843582232, "grad_norm": 0.47258156538009644, "learning_rate": 9.031743420409058e-05, "loss": 1.7606, "step": 21990 }, { "epoch": 1.1922807255237353, "grad_norm": 0.3378153443336487, "learning_rate": 9.030761695290077e-05, "loss": 1.7602, "step": 22000 }, { "epoch": 1.1928226666892476, "grad_norm": 0.5784263610839844, "learning_rate": 9.029779532795668e-05, "loss": 1.7585, "step": 22010 }, { "epoch": 1.1928768608057987, "eval_loss": 2.6090340614318848, "eval_runtime": 21.9879, "eval_samples_per_second": 227.398, "eval_steps_per_second": 1.228, "step": 22011 }, { "epoch": 1.1933646078547597, "grad_norm": 0.5027700066566467, "learning_rate": 9.028796933047495e-05, "loss": 1.7604, "step": 22020 }, { "epoch": 1.193906549020272, "grad_norm": 0.3202950060367584, "learning_rate": 9.027813896167278e-05, "loss": 1.7642, "step": 22030 }, { "epoch": 1.1944484901857841, "grad_norm": 0.3492630422115326, "learning_rate": 9.026830422276792e-05, "loss": 1.7612, "step": 22040 }, { "epoch": 1.1949904313512965, "grad_norm": 0.6746916770935059, "learning_rate": 9.025846511497864e-05, "loss": 1.7562, "step": 22050 }, { "epoch": 1.1955323725168086, "grad_norm": 0.42550233006477356, "learning_rate": 9.02486216395238e-05, "loss": 1.7618, "step": 22060 }, { "epoch": 1.1960743136823209, "grad_norm": 0.4293383061885834, "learning_rate": 9.023877379762274e-05, "loss": 1.763, "step": 22070 }, { "epoch": 1.1966162548478332, "grad_norm": 0.3266971707344055, "learning_rate": 9.022892159049537e-05, "loss": 1.7669, "step": 22080 }, { "epoch": 1.1971581960133453, "grad_norm": 0.99581378698349, "learning_rate": 9.021906501936213e-05, "loss": 1.7648, "step": 22090 }, { "epoch": 1.197591748945755, "eval_loss": 2.58486270904541, "eval_runtime": 21.9939, "eval_samples_per_second": 227.336, "eval_steps_per_second": 1.228, "step": 22098 }, { "epoch": 1.1977001371788576, "grad_norm": 0.5003217458724976, "learning_rate": 9.020920408544404e-05, "loss": 1.7659, "step": 22100 }, { "epoch": 1.1982420783443697, "grad_norm": 0.5149915218353271, "learning_rate": 9.019933878996259e-05, "loss": 1.7496, "step": 22110 }, { "epoch": 1.198784019509882, "grad_norm": 0.3475677967071533, "learning_rate": 9.018946913413989e-05, "loss": 1.7642, "step": 22120 }, { "epoch": 1.199325960675394, "grad_norm": 0.3139243423938751, "learning_rate": 9.017959511919853e-05, "loss": 1.7556, "step": 22130 }, { "epoch": 1.1998679018409064, "grad_norm": 0.32470259070396423, "learning_rate": 9.016971674636165e-05, "loss": 1.7616, "step": 22140 }, { "epoch": 1.2004098430064185, "grad_norm": 0.6316733360290527, "learning_rate": 9.015983401685296e-05, "loss": 1.7633, "step": 22150 }, { "epoch": 1.2009517841719308, "grad_norm": 0.6149379014968872, "learning_rate": 9.014994693189667e-05, "loss": 1.7599, "step": 22160 }, { "epoch": 1.2014937253374431, "grad_norm": 0.4740849435329437, "learning_rate": 9.014005549271757e-05, "loss": 1.7587, "step": 22170 }, { "epoch": 1.2020356665029552, "grad_norm": 0.4013853371143341, "learning_rate": 9.013015970054096e-05, "loss": 1.7523, "step": 22180 }, { "epoch": 1.2023066370857114, "eval_loss": 2.5968024730682373, "eval_runtime": 21.9857, "eval_samples_per_second": 227.42, "eval_steps_per_second": 1.228, "step": 22185 }, { "epoch": 1.2025776076684676, "grad_norm": 0.5497053861618042, "learning_rate": 9.012025955659269e-05, "loss": 1.7546, "step": 22190 }, { "epoch": 1.2031195488339796, "grad_norm": 0.3100571632385254, "learning_rate": 9.011035506209912e-05, "loss": 1.7637, "step": 22200 }, { "epoch": 1.203661489999492, "grad_norm": 0.298776239156723, "learning_rate": 9.010044621828722e-05, "loss": 1.7663, "step": 22210 }, { "epoch": 1.204203431165004, "grad_norm": 0.36760690808296204, "learning_rate": 9.009053302638444e-05, "loss": 1.7575, "step": 22220 }, { "epoch": 1.2047453723305164, "grad_norm": 0.2956767678260803, "learning_rate": 9.008061548761876e-05, "loss": 1.7624, "step": 22230 }, { "epoch": 1.2052873134960285, "grad_norm": 0.5647885203361511, "learning_rate": 9.007069360321873e-05, "loss": 1.7409, "step": 22240 }, { "epoch": 1.2058292546615408, "grad_norm": 0.26180022954940796, "learning_rate": 9.006076737441347e-05, "loss": 1.7539, "step": 22250 }, { "epoch": 1.206371195827053, "grad_norm": 0.29279986023902893, "learning_rate": 9.005083680243254e-05, "loss": 1.7551, "step": 22260 }, { "epoch": 1.2069131369925652, "grad_norm": 0.3438403904438019, "learning_rate": 9.004090188850612e-05, "loss": 1.7585, "step": 22270 }, { "epoch": 1.2070215252256677, "eval_loss": 2.595259666442871, "eval_runtime": 21.992, "eval_samples_per_second": 227.356, "eval_steps_per_second": 1.228, "step": 22272 }, { "epoch": 1.2074550781580775, "grad_norm": 0.46787571907043457, "learning_rate": 9.003096263386492e-05, "loss": 1.7648, "step": 22280 }, { "epoch": 1.2079970193235896, "grad_norm": 0.5319798588752747, "learning_rate": 9.002101903974016e-05, "loss": 1.7574, "step": 22290 }, { "epoch": 1.208538960489102, "grad_norm": 0.45188024640083313, "learning_rate": 9.00110711073636e-05, "loss": 1.7565, "step": 22300 }, { "epoch": 1.2090809016546142, "grad_norm": 0.4394822120666504, "learning_rate": 9.000111883796756e-05, "loss": 1.7594, "step": 22310 }, { "epoch": 1.2096228428201263, "grad_norm": 0.290712833404541, "learning_rate": 8.999116223278486e-05, "loss": 1.7592, "step": 22320 }, { "epoch": 1.2101647839856386, "grad_norm": 0.6723989248275757, "learning_rate": 8.998120129304892e-05, "loss": 1.752, "step": 22330 }, { "epoch": 1.2107067251511507, "grad_norm": 0.3058725595474243, "learning_rate": 8.997123601999364e-05, "loss": 1.7487, "step": 22340 }, { "epoch": 1.211248666316663, "grad_norm": 0.25992143154144287, "learning_rate": 8.996126641485345e-05, "loss": 1.7464, "step": 22350 }, { "epoch": 1.211736413365624, "eval_loss": 2.6008663177490234, "eval_runtime": 21.9912, "eval_samples_per_second": 227.363, "eval_steps_per_second": 1.228, "step": 22359 }, { "epoch": 1.2117906074821752, "grad_norm": 0.30580416321754456, "learning_rate": 8.995129247886339e-05, "loss": 1.7577, "step": 22360 }, { "epoch": 1.2123325486476875, "grad_norm": 0.2877110540866852, "learning_rate": 8.994131421325893e-05, "loss": 1.7556, "step": 22370 }, { "epoch": 1.2128744898131996, "grad_norm": 0.390491247177124, "learning_rate": 8.993133161927618e-05, "loss": 1.7553, "step": 22380 }, { "epoch": 1.2134164309787119, "grad_norm": 0.3429940342903137, "learning_rate": 8.992134469815173e-05, "loss": 1.7507, "step": 22390 }, { "epoch": 1.2139583721442242, "grad_norm": 0.29374656081199646, "learning_rate": 8.99113534511227e-05, "loss": 1.7548, "step": 22400 }, { "epoch": 1.2145003133097363, "grad_norm": 0.37793880701065063, "learning_rate": 8.99013578794268e-05, "loss": 1.7504, "step": 22410 }, { "epoch": 1.2150422544752486, "grad_norm": 0.312223345041275, "learning_rate": 8.989135798430218e-05, "loss": 1.7373, "step": 22420 }, { "epoch": 1.2155841956407607, "grad_norm": 0.355259507894516, "learning_rate": 8.988135376698764e-05, "loss": 1.7562, "step": 22430 }, { "epoch": 1.216126136806273, "grad_norm": 0.5126851797103882, "learning_rate": 8.987134522872242e-05, "loss": 1.7551, "step": 22440 }, { "epoch": 1.2164513015055802, "eval_loss": 2.5885868072509766, "eval_runtime": 21.9949, "eval_samples_per_second": 227.325, "eval_steps_per_second": 1.228, "step": 22446 }, { "epoch": 1.2166680779717851, "grad_norm": 0.3533921241760254, "learning_rate": 8.986133237074636e-05, "loss": 1.7599, "step": 22450 }, { "epoch": 1.2172100191372974, "grad_norm": 0.3277350962162018, "learning_rate": 8.98513151942998e-05, "loss": 1.7425, "step": 22460 }, { "epoch": 1.2177519603028095, "grad_norm": 0.45636269450187683, "learning_rate": 8.984129370062362e-05, "loss": 1.7538, "step": 22470 }, { "epoch": 1.2182939014683218, "grad_norm": 0.7295756340026855, "learning_rate": 8.983126789095925e-05, "loss": 1.7601, "step": 22480 }, { "epoch": 1.2188358426338342, "grad_norm": 0.5607370734214783, "learning_rate": 8.982123776654862e-05, "loss": 1.7585, "step": 22490 }, { "epoch": 1.2193777837993462, "grad_norm": 0.29513418674468994, "learning_rate": 8.981120332863423e-05, "loss": 1.7626, "step": 22500 }, { "epoch": 1.2199197249648586, "grad_norm": 0.2580159306526184, "learning_rate": 8.980116457845911e-05, "loss": 1.756, "step": 22510 }, { "epoch": 1.2204616661303707, "grad_norm": 0.31364554166793823, "learning_rate": 8.979112151726684e-05, "loss": 1.7503, "step": 22520 }, { "epoch": 1.221003607295883, "grad_norm": 0.8860167860984802, "learning_rate": 8.978107414630146e-05, "loss": 1.7357, "step": 22530 }, { "epoch": 1.2211661896455366, "eval_loss": 2.586221933364868, "eval_runtime": 21.9909, "eval_samples_per_second": 227.367, "eval_steps_per_second": 1.228, "step": 22533 }, { "epoch": 1.221545548461395, "grad_norm": 0.550439178943634, "learning_rate": 8.977102246680762e-05, "loss": 1.7482, "step": 22540 }, { "epoch": 1.2220874896269074, "grad_norm": 0.8875564932823181, "learning_rate": 8.976096648003048e-05, "loss": 1.7466, "step": 22550 }, { "epoch": 1.2226294307924195, "grad_norm": 0.5859338045120239, "learning_rate": 8.975090618721573e-05, "loss": 1.7587, "step": 22560 }, { "epoch": 1.2231713719579318, "grad_norm": 0.27543067932128906, "learning_rate": 8.97408415896096e-05, "loss": 1.7534, "step": 22570 }, { "epoch": 1.2237133131234441, "grad_norm": 0.3466237485408783, "learning_rate": 8.973077268845884e-05, "loss": 1.7601, "step": 22580 }, { "epoch": 1.2242552542889562, "grad_norm": 0.2768082916736603, "learning_rate": 8.972069948501074e-05, "loss": 1.7504, "step": 22590 }, { "epoch": 1.2247971954544685, "grad_norm": 0.45373255014419556, "learning_rate": 8.971062198051315e-05, "loss": 1.7479, "step": 22600 }, { "epoch": 1.2253391366199806, "grad_norm": 0.6204401850700378, "learning_rate": 8.970054017621437e-05, "loss": 1.7478, "step": 22610 }, { "epoch": 1.225881077785493, "grad_norm": 0.3263530135154724, "learning_rate": 8.969045407336336e-05, "loss": 1.7486, "step": 22620 }, { "epoch": 1.225881077785493, "eval_loss": 2.593569040298462, "eval_runtime": 21.9692, "eval_samples_per_second": 227.591, "eval_steps_per_second": 1.229, "step": 22620 }, { "epoch": 1.2264230189510053, "grad_norm": 0.48239004611968994, "learning_rate": 8.968036367320952e-05, "loss": 1.7489, "step": 22630 }, { "epoch": 1.2269649601165173, "grad_norm": 0.3203703761100769, "learning_rate": 8.967026897700277e-05, "loss": 1.7437, "step": 22640 }, { "epoch": 1.2275069012820297, "grad_norm": 0.6340882182121277, "learning_rate": 8.966016998599362e-05, "loss": 1.7371, "step": 22650 }, { "epoch": 1.2280488424475418, "grad_norm": 0.30104759335517883, "learning_rate": 8.96500667014331e-05, "loss": 1.7637, "step": 22660 }, { "epoch": 1.228590783613054, "grad_norm": 0.2778589129447937, "learning_rate": 8.963995912457275e-05, "loss": 1.7441, "step": 22670 }, { "epoch": 1.2291327247785662, "grad_norm": 0.3374054431915283, "learning_rate": 8.962984725666465e-05, "loss": 1.7528, "step": 22680 }, { "epoch": 1.2296746659440785, "grad_norm": 0.25601983070373535, "learning_rate": 8.961973109896144e-05, "loss": 1.7444, "step": 22690 }, { "epoch": 1.2302166071095906, "grad_norm": 0.2800886929035187, "learning_rate": 8.960961065271622e-05, "loss": 1.7444, "step": 22700 }, { "epoch": 1.2305959659254493, "eval_loss": 2.58610463142395, "eval_runtime": 21.9907, "eval_samples_per_second": 227.368, "eval_steps_per_second": 1.228, "step": 22707 }, { "epoch": 1.230758548275103, "grad_norm": 0.4286305904388428, "learning_rate": 8.95994859191827e-05, "loss": 1.7398, "step": 22710 }, { "epoch": 1.2313004894406152, "grad_norm": 0.5830126404762268, "learning_rate": 8.95893568996151e-05, "loss": 1.7458, "step": 22720 }, { "epoch": 1.2318424306061273, "grad_norm": 0.30274108052253723, "learning_rate": 8.957922359526812e-05, "loss": 1.7443, "step": 22730 }, { "epoch": 1.2323843717716396, "grad_norm": 0.5309381484985352, "learning_rate": 8.956908600739707e-05, "loss": 1.7477, "step": 22740 }, { "epoch": 1.2329263129371517, "grad_norm": 0.2998747229576111, "learning_rate": 8.95589441372577e-05, "loss": 1.7574, "step": 22750 }, { "epoch": 1.233468254102664, "grad_norm": 0.3685966432094574, "learning_rate": 8.954879798610637e-05, "loss": 1.74, "step": 22760 }, { "epoch": 1.2340101952681761, "grad_norm": 0.6951805949211121, "learning_rate": 8.953864755519995e-05, "loss": 1.7444, "step": 22770 }, { "epoch": 1.2345521364336884, "grad_norm": 0.3382275700569153, "learning_rate": 8.952849284579585e-05, "loss": 1.7352, "step": 22780 }, { "epoch": 1.2350940775992005, "grad_norm": 0.26488932967185974, "learning_rate": 8.951833385915193e-05, "loss": 1.7461, "step": 22790 }, { "epoch": 1.2353108540654056, "eval_loss": 2.575146436691284, "eval_runtime": 21.9928, "eval_samples_per_second": 227.347, "eval_steps_per_second": 1.228, "step": 22794 }, { "epoch": 1.2356360187647129, "grad_norm": 0.46747535467147827, "learning_rate": 8.950817059652669e-05, "loss": 1.7437, "step": 22800 }, { "epoch": 1.2361779599302252, "grad_norm": 0.29368457198143005, "learning_rate": 8.949800305917909e-05, "loss": 1.7458, "step": 22810 }, { "epoch": 1.2367199010957373, "grad_norm": 0.3844955265522003, "learning_rate": 8.948783124836866e-05, "loss": 1.7403, "step": 22820 }, { "epoch": 1.2372618422612496, "grad_norm": 0.6568012237548828, "learning_rate": 8.94776551653554e-05, "loss": 1.7358, "step": 22830 }, { "epoch": 1.2378037834267617, "grad_norm": 0.34444165229797363, "learning_rate": 8.946747481139992e-05, "loss": 1.7468, "step": 22840 }, { "epoch": 1.238345724592274, "grad_norm": 0.44042837619781494, "learning_rate": 8.945729018776331e-05, "loss": 1.7455, "step": 22850 }, { "epoch": 1.238887665757786, "grad_norm": 0.32380804419517517, "learning_rate": 8.944710129570719e-05, "loss": 1.7432, "step": 22860 }, { "epoch": 1.2394296069232984, "grad_norm": 0.608935534954071, "learning_rate": 8.943690813649369e-05, "loss": 1.7578, "step": 22870 }, { "epoch": 1.2399715480888105, "grad_norm": 0.5230217576026917, "learning_rate": 8.942671071138554e-05, "loss": 1.7458, "step": 22880 }, { "epoch": 1.2400257422053618, "eval_loss": 2.581331253051758, "eval_runtime": 21.9957, "eval_samples_per_second": 227.317, "eval_steps_per_second": 1.228, "step": 22881 }, { "epoch": 1.2405134892543228, "grad_norm": 0.5061838030815125, "learning_rate": 8.941650902164595e-05, "loss": 1.7455, "step": 22890 }, { "epoch": 1.2410554304198351, "grad_norm": 0.405821293592453, "learning_rate": 8.940630306853861e-05, "loss": 1.7405, "step": 22900 }, { "epoch": 1.2415973715853472, "grad_norm": 0.45560845732688904, "learning_rate": 8.939609285332785e-05, "loss": 1.7459, "step": 22910 }, { "epoch": 1.2421393127508595, "grad_norm": 0.5665388703346252, "learning_rate": 8.938587837727842e-05, "loss": 1.7552, "step": 22920 }, { "epoch": 1.2426812539163716, "grad_norm": 0.37847816944122314, "learning_rate": 8.937565964165569e-05, "loss": 1.7422, "step": 22930 }, { "epoch": 1.243223195081884, "grad_norm": 0.4069490432739258, "learning_rate": 8.936543664772546e-05, "loss": 1.7435, "step": 22940 }, { "epoch": 1.2437651362473963, "grad_norm": 0.3240896761417389, "learning_rate": 8.935520939675414e-05, "loss": 1.7512, "step": 22950 }, { "epoch": 1.2443070774129084, "grad_norm": 0.4123472273349762, "learning_rate": 8.934497789000865e-05, "loss": 1.7425, "step": 22960 }, { "epoch": 1.2447406303453181, "eval_loss": 2.5666730403900146, "eval_runtime": 21.9906, "eval_samples_per_second": 227.37, "eval_steps_per_second": 1.228, "step": 22968 }, { "epoch": 1.2448490185784207, "grad_norm": 0.2759721577167511, "learning_rate": 8.933474212875642e-05, "loss": 1.736, "step": 22970 }, { "epoch": 1.2453909597439328, "grad_norm": 0.438054621219635, "learning_rate": 8.932450211426537e-05, "loss": 1.7408, "step": 22980 }, { "epoch": 1.245932900909445, "grad_norm": 0.3276619613170624, "learning_rate": 8.931425784780405e-05, "loss": 1.748, "step": 22990 }, { "epoch": 1.2464748420749572, "grad_norm": 0.27382031083106995, "learning_rate": 8.930400933064144e-05, "loss": 1.7394, "step": 23000 }, { "epoch": 1.2470167832404695, "grad_norm": 0.24581308662891388, "learning_rate": 8.929375656404707e-05, "loss": 1.7361, "step": 23010 }, { "epoch": 1.2475587244059816, "grad_norm": 0.2953229546546936, "learning_rate": 8.928349954929103e-05, "loss": 1.7587, "step": 23020 }, { "epoch": 1.248100665571494, "grad_norm": 0.48332518339157104, "learning_rate": 8.927323828764393e-05, "loss": 1.7334, "step": 23030 }, { "epoch": 1.2486426067370062, "grad_norm": 0.8373629450798035, "learning_rate": 8.926297278037685e-05, "loss": 1.7477, "step": 23040 }, { "epoch": 1.2491845479025183, "grad_norm": 0.2879384458065033, "learning_rate": 8.925270302876146e-05, "loss": 1.7373, "step": 23050 }, { "epoch": 1.2494555184852745, "eval_loss": 2.5645227432250977, "eval_runtime": 21.9888, "eval_samples_per_second": 227.388, "eval_steps_per_second": 1.228, "step": 23055 }, { "epoch": 1.2497264890680306, "grad_norm": 0.2931196689605713, "learning_rate": 8.924242903406993e-05, "loss": 1.747, "step": 23060 }, { "epoch": 1.2502684302335427, "grad_norm": 0.5951820015907288, "learning_rate": 8.923215079757496e-05, "loss": 1.7395, "step": 23070 }, { "epoch": 1.250810371399055, "grad_norm": 0.30423420667648315, "learning_rate": 8.922186832054977e-05, "loss": 1.7422, "step": 23080 }, { "epoch": 1.2513523125645671, "grad_norm": 0.3447738289833069, "learning_rate": 8.92115816042681e-05, "loss": 1.7358, "step": 23090 }, { "epoch": 1.2518942537300795, "grad_norm": 0.3407246470451355, "learning_rate": 8.920129065000424e-05, "loss": 1.7486, "step": 23100 }, { "epoch": 1.2524361948955915, "grad_norm": 0.33367615938186646, "learning_rate": 8.919099545903299e-05, "loss": 1.7355, "step": 23110 }, { "epoch": 1.2529781360611039, "grad_norm": 0.32768481969833374, "learning_rate": 8.918069603262965e-05, "loss": 1.7406, "step": 23120 }, { "epoch": 1.2535200772266162, "grad_norm": 0.31800198554992676, "learning_rate": 8.91703923720701e-05, "loss": 1.7452, "step": 23130 }, { "epoch": 1.2540620183921283, "grad_norm": 0.5581735372543335, "learning_rate": 8.916008447863068e-05, "loss": 1.743, "step": 23140 }, { "epoch": 1.2541704066252308, "eval_loss": 2.5848233699798584, "eval_runtime": 21.9922, "eval_samples_per_second": 227.353, "eval_steps_per_second": 1.228, "step": 23142 }, { "epoch": 1.2546039595576406, "grad_norm": 0.5674453377723694, "learning_rate": 8.914977235358831e-05, "loss": 1.7393, "step": 23150 }, { "epoch": 1.2551459007231527, "grad_norm": 0.3421599864959717, "learning_rate": 8.913945599822043e-05, "loss": 1.7402, "step": 23160 }, { "epoch": 1.255687841888665, "grad_norm": 0.31315505504608154, "learning_rate": 8.912913541380492e-05, "loss": 1.7363, "step": 23170 }, { "epoch": 1.2562297830541773, "grad_norm": 0.30993425846099854, "learning_rate": 8.91188106016203e-05, "loss": 1.7405, "step": 23180 }, { "epoch": 1.2567717242196894, "grad_norm": 0.2763616144657135, "learning_rate": 8.910848156294555e-05, "loss": 1.7464, "step": 23190 }, { "epoch": 1.2573136653852015, "grad_norm": 0.3239862322807312, "learning_rate": 8.90981482990602e-05, "loss": 1.7422, "step": 23200 }, { "epoch": 1.2578556065507138, "grad_norm": 0.3442568778991699, "learning_rate": 8.908781081124427e-05, "loss": 1.7433, "step": 23210 }, { "epoch": 1.2583975477162261, "grad_norm": 0.2833685278892517, "learning_rate": 8.907746910077834e-05, "loss": 1.7337, "step": 23220 }, { "epoch": 1.2588852947651872, "eval_loss": 2.577096939086914, "eval_runtime": 21.9824, "eval_samples_per_second": 227.454, "eval_steps_per_second": 1.228, "step": 23229 }, { "epoch": 1.2589394888817382, "grad_norm": 0.2630921006202698, "learning_rate": 8.906712316894346e-05, "loss": 1.7534, "step": 23230 }, { "epoch": 1.2594814300472505, "grad_norm": 0.5170078277587891, "learning_rate": 8.90567730170213e-05, "loss": 1.7483, "step": 23240 }, { "epoch": 1.2600233712127626, "grad_norm": 0.3840513527393341, "learning_rate": 8.904641864629394e-05, "loss": 1.7339, "step": 23250 }, { "epoch": 1.260565312378275, "grad_norm": 0.29101109504699707, "learning_rate": 8.903606005804406e-05, "loss": 1.7207, "step": 23260 }, { "epoch": 1.2611072535437873, "grad_norm": 0.34224191308021545, "learning_rate": 8.902569725355482e-05, "loss": 1.7333, "step": 23270 }, { "epoch": 1.2616491947092994, "grad_norm": 0.8239234089851379, "learning_rate": 8.901533023410994e-05, "loss": 1.7417, "step": 23280 }, { "epoch": 1.2621911358748115, "grad_norm": 0.7893701791763306, "learning_rate": 8.900495900099362e-05, "loss": 1.7398, "step": 23290 }, { "epoch": 1.2627330770403238, "grad_norm": 0.31907394528388977, "learning_rate": 8.899458355549061e-05, "loss": 1.739, "step": 23300 }, { "epoch": 1.263275018205836, "grad_norm": 0.26405349373817444, "learning_rate": 8.898420389888619e-05, "loss": 1.747, "step": 23310 }, { "epoch": 1.2636001829051433, "eval_loss": 2.575762987136841, "eval_runtime": 21.9898, "eval_samples_per_second": 227.378, "eval_steps_per_second": 1.228, "step": 23316 }, { "epoch": 1.2638169593713482, "grad_norm": 0.2704724669456482, "learning_rate": 8.897382003246614e-05, "loss": 1.7478, "step": 23320 }, { "epoch": 1.2643589005368605, "grad_norm": 0.28200745582580566, "learning_rate": 8.896343195751677e-05, "loss": 1.7414, "step": 23330 }, { "epoch": 1.2649008417023726, "grad_norm": 0.367416113615036, "learning_rate": 8.895303967532489e-05, "loss": 1.7448, "step": 23340 }, { "epoch": 1.265442782867885, "grad_norm": 0.41314128041267395, "learning_rate": 8.894264318717786e-05, "loss": 1.7301, "step": 23350 }, { "epoch": 1.2659847240333972, "grad_norm": 0.346835196018219, "learning_rate": 8.893224249436357e-05, "loss": 1.7309, "step": 23360 }, { "epoch": 1.2665266651989093, "grad_norm": 0.37419700622558594, "learning_rate": 8.892183759817039e-05, "loss": 1.7363, "step": 23370 }, { "epoch": 1.2670686063644216, "grad_norm": 0.5215455293655396, "learning_rate": 8.891142849988725e-05, "loss": 1.7392, "step": 23380 }, { "epoch": 1.2676105475299337, "grad_norm": 0.32529744505882263, "learning_rate": 8.890101520080357e-05, "loss": 1.7349, "step": 23390 }, { "epoch": 1.268152488695446, "grad_norm": 0.5487125515937805, "learning_rate": 8.889059770220931e-05, "loss": 1.734, "step": 23400 }, { "epoch": 1.2683150710450997, "eval_loss": 2.5602195262908936, "eval_runtime": 21.9906, "eval_samples_per_second": 227.37, "eval_steps_per_second": 1.228, "step": 23403 }, { "epoch": 1.2686944298609582, "grad_norm": 0.465263694524765, "learning_rate": 8.888017600539493e-05, "loss": 1.7317, "step": 23410 }, { "epoch": 1.2692363710264705, "grad_norm": 0.3183075487613678, "learning_rate": 8.886975011165146e-05, "loss": 1.7385, "step": 23420 }, { "epoch": 1.2697783121919826, "grad_norm": 0.5280638933181763, "learning_rate": 8.885932002227039e-05, "loss": 1.732, "step": 23430 }, { "epoch": 1.2703202533574949, "grad_norm": 0.3372764587402344, "learning_rate": 8.884888573854375e-05, "loss": 1.7314, "step": 23440 }, { "epoch": 1.2708621945230072, "grad_norm": 0.374603807926178, "learning_rate": 8.883844726176412e-05, "loss": 1.731, "step": 23450 }, { "epoch": 1.2714041356885193, "grad_norm": 0.3078954219818115, "learning_rate": 8.882800459322453e-05, "loss": 1.7377, "step": 23460 }, { "epoch": 1.2719460768540316, "grad_norm": 0.2972983419895172, "learning_rate": 8.881755773421863e-05, "loss": 1.734, "step": 23470 }, { "epoch": 1.2724880180195437, "grad_norm": 0.30043935775756836, "learning_rate": 8.880710668604047e-05, "loss": 1.7393, "step": 23480 }, { "epoch": 1.273029959185056, "grad_norm": 0.4024638533592224, "learning_rate": 8.879665144998473e-05, "loss": 1.738, "step": 23490 }, { "epoch": 1.273029959185056, "eval_loss": 2.574789047241211, "eval_runtime": 21.9656, "eval_samples_per_second": 227.628, "eval_steps_per_second": 1.229, "step": 23490 }, { "epoch": 1.273571900350568, "grad_norm": 0.33888739347457886, "learning_rate": 8.878619202734653e-05, "loss": 1.7342, "step": 23500 }, { "epoch": 1.2741138415160804, "grad_norm": 0.40043386816978455, "learning_rate": 8.877572841942153e-05, "loss": 1.7383, "step": 23510 }, { "epoch": 1.2746557826815925, "grad_norm": 0.46869125962257385, "learning_rate": 8.876526062750597e-05, "loss": 1.7412, "step": 23520 }, { "epoch": 1.2751977238471048, "grad_norm": 0.41995322704315186, "learning_rate": 8.875478865289649e-05, "loss": 1.7332, "step": 23530 }, { "epoch": 1.2757396650126172, "grad_norm": 0.28642240166664124, "learning_rate": 8.874431249689033e-05, "loss": 1.7313, "step": 23540 }, { "epoch": 1.2762816061781292, "grad_norm": 0.5792128443717957, "learning_rate": 8.873383216078527e-05, "loss": 1.7324, "step": 23550 }, { "epoch": 1.2768235473436416, "grad_norm": 0.6733444929122925, "learning_rate": 8.872334764587952e-05, "loss": 1.7357, "step": 23560 }, { "epoch": 1.2773654885091537, "grad_norm": 0.27996379137039185, "learning_rate": 8.87128589534719e-05, "loss": 1.723, "step": 23570 }, { "epoch": 1.2777448473250121, "eval_loss": 2.5722544193267822, "eval_runtime": 21.9913, "eval_samples_per_second": 227.362, "eval_steps_per_second": 1.228, "step": 23577 }, { "epoch": 1.277907429674666, "grad_norm": 0.29412564635276794, "learning_rate": 8.870236608486165e-05, "loss": 1.7301, "step": 23580 }, { "epoch": 1.2784493708401783, "grad_norm": 0.2967008352279663, "learning_rate": 8.869186904134862e-05, "loss": 1.7411, "step": 23590 }, { "epoch": 1.2789913120056904, "grad_norm": 1.056755542755127, "learning_rate": 8.868136782423314e-05, "loss": 1.735, "step": 23600 }, { "epoch": 1.2795332531712025, "grad_norm": 0.6758991479873657, "learning_rate": 8.867086243481603e-05, "loss": 1.7431, "step": 23610 }, { "epoch": 1.2800751943367148, "grad_norm": 0.33271247148513794, "learning_rate": 8.866035287439868e-05, "loss": 1.7338, "step": 23620 }, { "epoch": 1.280617135502227, "grad_norm": 0.40084099769592285, "learning_rate": 8.864983914428293e-05, "loss": 1.7385, "step": 23630 }, { "epoch": 1.2811590766677392, "grad_norm": 0.3270736038684845, "learning_rate": 8.863932124577123e-05, "loss": 1.716, "step": 23640 }, { "epoch": 1.2817010178332515, "grad_norm": 0.3113623559474945, "learning_rate": 8.862879918016643e-05, "loss": 1.7371, "step": 23650 }, { "epoch": 1.2822429589987636, "grad_norm": 0.6056118011474609, "learning_rate": 8.861827294877201e-05, "loss": 1.7375, "step": 23660 }, { "epoch": 1.2824597354649687, "eval_loss": 2.576855182647705, "eval_runtime": 21.9914, "eval_samples_per_second": 227.361, "eval_steps_per_second": 1.228, "step": 23664 }, { "epoch": 1.282784900164276, "grad_norm": 0.3462965488433838, "learning_rate": 8.86077425528919e-05, "loss": 1.7308, "step": 23670 }, { "epoch": 1.2833268413297882, "grad_norm": 0.2960757315158844, "learning_rate": 8.859720799383054e-05, "loss": 1.7263, "step": 23680 }, { "epoch": 1.2838687824953003, "grad_norm": 0.29280513525009155, "learning_rate": 8.858666927289292e-05, "loss": 1.7369, "step": 23690 }, { "epoch": 1.2844107236608124, "grad_norm": 0.602401614189148, "learning_rate": 8.857612639138451e-05, "loss": 1.7398, "step": 23700 }, { "epoch": 1.2849526648263248, "grad_norm": 0.4261922240257263, "learning_rate": 8.856557935061137e-05, "loss": 1.7272, "step": 23710 }, { "epoch": 1.285494605991837, "grad_norm": 0.32072654366493225, "learning_rate": 8.855502815187996e-05, "loss": 1.7529, "step": 23720 }, { "epoch": 1.2860365471573492, "grad_norm": 0.45794838666915894, "learning_rate": 8.854447279649737e-05, "loss": 1.7268, "step": 23730 }, { "epoch": 1.2865784883228615, "grad_norm": 0.8380472660064697, "learning_rate": 8.85339132857711e-05, "loss": 1.732, "step": 23740 }, { "epoch": 1.2871204294883736, "grad_norm": 0.25734448432922363, "learning_rate": 8.852334962100926e-05, "loss": 1.7341, "step": 23750 }, { "epoch": 1.2871746236049249, "eval_loss": 2.560034990310669, "eval_runtime": 21.9846, "eval_samples_per_second": 227.432, "eval_steps_per_second": 1.228, "step": 23751 }, { "epoch": 1.287662370653886, "grad_norm": 0.32123035192489624, "learning_rate": 8.851278180352041e-05, "loss": 1.7288, "step": 23760 }, { "epoch": 1.2882043118193982, "grad_norm": 0.41775065660476685, "learning_rate": 8.850220983461365e-05, "loss": 1.7305, "step": 23770 }, { "epoch": 1.2887462529849103, "grad_norm": 0.3736642599105835, "learning_rate": 8.849163371559858e-05, "loss": 1.7276, "step": 23780 }, { "epoch": 1.2892881941504226, "grad_norm": 0.3616468906402588, "learning_rate": 8.848105344778532e-05, "loss": 1.731, "step": 23790 }, { "epoch": 1.2898301353159347, "grad_norm": 0.39535510540008545, "learning_rate": 8.847046903248453e-05, "loss": 1.7199, "step": 23800 }, { "epoch": 1.290372076481447, "grad_norm": 0.27410778403282166, "learning_rate": 8.845988047100736e-05, "loss": 1.729, "step": 23810 }, { "epoch": 1.2909140176469591, "grad_norm": 0.2933545708656311, "learning_rate": 8.844928776466547e-05, "loss": 1.7338, "step": 23820 }, { "epoch": 1.2914559588124714, "grad_norm": 0.44660070538520813, "learning_rate": 8.843869091477102e-05, "loss": 1.7316, "step": 23830 }, { "epoch": 1.2918895117448812, "eval_loss": 2.552804946899414, "eval_runtime": 21.9898, "eval_samples_per_second": 227.378, "eval_steps_per_second": 1.228, "step": 23838 }, { "epoch": 1.2919978999779835, "grad_norm": 0.3844490945339203, "learning_rate": 8.842808992263672e-05, "loss": 1.7361, "step": 23840 }, { "epoch": 1.2925398411434958, "grad_norm": 0.5550277829170227, "learning_rate": 8.841748478957577e-05, "loss": 1.7385, "step": 23850 }, { "epoch": 1.2930817823090082, "grad_norm": 0.32363656163215637, "learning_rate": 8.840687551690189e-05, "loss": 1.7243, "step": 23860 }, { "epoch": 1.2936237234745203, "grad_norm": 0.4406464993953705, "learning_rate": 8.839626210592931e-05, "loss": 1.7437, "step": 23870 }, { "epoch": 1.2941656646400326, "grad_norm": 0.3115149140357971, "learning_rate": 8.838564455797275e-05, "loss": 1.7447, "step": 23880 }, { "epoch": 1.2947076058055447, "grad_norm": 0.27961403131484985, "learning_rate": 8.837502287434752e-05, "loss": 1.7287, "step": 23890 }, { "epoch": 1.295249546971057, "grad_norm": 0.49664953351020813, "learning_rate": 8.836439705636935e-05, "loss": 1.7304, "step": 23900 }, { "epoch": 1.2957914881365693, "grad_norm": 0.6754089593887329, "learning_rate": 8.835376710535451e-05, "loss": 1.7334, "step": 23910 }, { "epoch": 1.2963334293020814, "grad_norm": 0.2822556793689728, "learning_rate": 8.834313302261982e-05, "loss": 1.7226, "step": 23920 }, { "epoch": 1.2966043998848376, "eval_loss": 2.5520572662353516, "eval_runtime": 21.9902, "eval_samples_per_second": 227.373, "eval_steps_per_second": 1.228, "step": 23925 }, { "epoch": 1.2968753704675935, "grad_norm": 0.41642260551452637, "learning_rate": 8.833249480948257e-05, "loss": 1.7223, "step": 23930 }, { "epoch": 1.2974173116331058, "grad_norm": 0.6077197790145874, "learning_rate": 8.832185246726057e-05, "loss": 1.7211, "step": 23940 }, { "epoch": 1.2979592527986181, "grad_norm": 0.7251920700073242, "learning_rate": 8.831120599727217e-05, "loss": 1.7233, "step": 23950 }, { "epoch": 1.2985011939641302, "grad_norm": 0.7429440021514893, "learning_rate": 8.83005554008362e-05, "loss": 1.727, "step": 23960 }, { "epoch": 1.2990431351296425, "grad_norm": 0.33626994490623474, "learning_rate": 8.828990067927199e-05, "loss": 1.7237, "step": 23970 }, { "epoch": 1.2995850762951546, "grad_norm": 0.4045674502849579, "learning_rate": 8.827924183389941e-05, "loss": 1.7197, "step": 23980 }, { "epoch": 1.300127017460667, "grad_norm": 0.43031617999076843, "learning_rate": 8.826857886603885e-05, "loss": 1.7181, "step": 23990 }, { "epoch": 1.3006689586261793, "grad_norm": 0.4407658874988556, "learning_rate": 8.825791177701116e-05, "loss": 1.7297, "step": 24000 }, { "epoch": 1.3012108997916914, "grad_norm": 0.4655308425426483, "learning_rate": 8.824724056813775e-05, "loss": 1.7271, "step": 24010 }, { "epoch": 1.3013192880247937, "eval_loss": 2.550110340118408, "eval_runtime": 21.9873, "eval_samples_per_second": 227.404, "eval_steps_per_second": 1.228, "step": 24012 }, { "epoch": 1.3017528409572034, "grad_norm": 0.47410663962364197, "learning_rate": 8.823656524074054e-05, "loss": 1.7271, "step": 24020 }, { "epoch": 1.3022947821227158, "grad_norm": 0.397776335477829, "learning_rate": 8.822588579614192e-05, "loss": 1.7268, "step": 24030 }, { "epoch": 1.302836723288228, "grad_norm": 0.4908923804759979, "learning_rate": 8.821520223566483e-05, "loss": 1.7312, "step": 24040 }, { "epoch": 1.3033786644537402, "grad_norm": 0.475315660238266, "learning_rate": 8.820451456063268e-05, "loss": 1.7358, "step": 24050 }, { "epoch": 1.3039206056192525, "grad_norm": 0.3519163131713867, "learning_rate": 8.819382277236943e-05, "loss": 1.7313, "step": 24060 }, { "epoch": 1.3044625467847646, "grad_norm": 0.6225571632385254, "learning_rate": 8.818312687219953e-05, "loss": 1.7269, "step": 24070 }, { "epoch": 1.305004487950277, "grad_norm": 0.3213563859462738, "learning_rate": 8.817242686144793e-05, "loss": 1.7273, "step": 24080 }, { "epoch": 1.3055464291157892, "grad_norm": 0.3261583149433136, "learning_rate": 8.816172274144013e-05, "loss": 1.7208, "step": 24090 }, { "epoch": 1.30603417616475, "eval_loss": 2.560994863510132, "eval_runtime": 21.9922, "eval_samples_per_second": 227.353, "eval_steps_per_second": 1.228, "step": 24099 }, { "epoch": 1.3060883702813013, "grad_norm": 0.35049372911453247, "learning_rate": 8.815101451350207e-05, "loss": 1.7158, "step": 24100 }, { "epoch": 1.3066303114468136, "grad_norm": 0.253828227519989, "learning_rate": 8.814030217896026e-05, "loss": 1.7282, "step": 24110 }, { "epoch": 1.3071722526123257, "grad_norm": 0.32618460059165955, "learning_rate": 8.81295857391417e-05, "loss": 1.7182, "step": 24120 }, { "epoch": 1.307714193777838, "grad_norm": 0.2585597634315491, "learning_rate": 8.811886519537391e-05, "loss": 1.7322, "step": 24130 }, { "epoch": 1.3082561349433501, "grad_norm": 0.29455363750457764, "learning_rate": 8.810814054898488e-05, "loss": 1.71, "step": 24140 }, { "epoch": 1.3087980761088625, "grad_norm": 0.32605740427970886, "learning_rate": 8.809741180130313e-05, "loss": 1.7158, "step": 24150 }, { "epoch": 1.3093400172743745, "grad_norm": 0.422584593296051, "learning_rate": 8.808667895365771e-05, "loss": 1.717, "step": 24160 }, { "epoch": 1.3098819584398869, "grad_norm": 0.3378809988498688, "learning_rate": 8.807594200737815e-05, "loss": 1.7302, "step": 24170 }, { "epoch": 1.3104238996053992, "grad_norm": 0.4901425242424011, "learning_rate": 8.806520096379448e-05, "loss": 1.7179, "step": 24180 }, { "epoch": 1.3107490643047064, "eval_loss": 2.562967300415039, "eval_runtime": 21.9941, "eval_samples_per_second": 227.334, "eval_steps_per_second": 1.228, "step": 24186 }, { "epoch": 1.3109658407709113, "grad_norm": 0.39655086398124695, "learning_rate": 8.805445582423728e-05, "loss": 1.7215, "step": 24190 }, { "epoch": 1.3115077819364236, "grad_norm": 0.3672724962234497, "learning_rate": 8.804370659003762e-05, "loss": 1.7214, "step": 24200 }, { "epoch": 1.3120497231019357, "grad_norm": 0.4074071943759918, "learning_rate": 8.803295326252701e-05, "loss": 1.7172, "step": 24210 }, { "epoch": 1.312591664267448, "grad_norm": 0.4320417046546936, "learning_rate": 8.802219584303758e-05, "loss": 1.7183, "step": 24220 }, { "epoch": 1.3131336054329603, "grad_norm": 0.5554030537605286, "learning_rate": 8.80114343329019e-05, "loss": 1.7136, "step": 24230 }, { "epoch": 1.3136755465984724, "grad_norm": 0.3394809663295746, "learning_rate": 8.800066873345306e-05, "loss": 1.7206, "step": 24240 }, { "epoch": 1.3142174877639845, "grad_norm": 0.2953517436981201, "learning_rate": 8.798989904602465e-05, "loss": 1.7286, "step": 24250 }, { "epoch": 1.3147594289294968, "grad_norm": 0.3113909959793091, "learning_rate": 8.797912527195078e-05, "loss": 1.7155, "step": 24260 }, { "epoch": 1.3153013700950091, "grad_norm": 0.26612791419029236, "learning_rate": 8.796834741256605e-05, "loss": 1.7277, "step": 24270 }, { "epoch": 1.3154639524446627, "eval_loss": 2.5562336444854736, "eval_runtime": 21.9909, "eval_samples_per_second": 227.367, "eval_steps_per_second": 1.228, "step": 24273 }, { "epoch": 1.3158433112605212, "grad_norm": 0.3873152434825897, "learning_rate": 8.795756546920556e-05, "loss": 1.7147, "step": 24280 }, { "epoch": 1.3163852524260335, "grad_norm": 0.30190661549568176, "learning_rate": 8.794677944320497e-05, "loss": 1.7135, "step": 24290 }, { "epoch": 1.3169271935915456, "grad_norm": 0.6682907342910767, "learning_rate": 8.793598933590036e-05, "loss": 1.729, "step": 24300 }, { "epoch": 1.317469134757058, "grad_norm": 0.280906617641449, "learning_rate": 8.79251951486284e-05, "loss": 1.7061, "step": 24310 }, { "epoch": 1.3180110759225703, "grad_norm": 0.5581380724906921, "learning_rate": 8.79143968827262e-05, "loss": 1.7343, "step": 24320 }, { "epoch": 1.3185530170880824, "grad_norm": 0.4470410645008087, "learning_rate": 8.790359453953145e-05, "loss": 1.7278, "step": 24330 }, { "epoch": 1.3190949582535945, "grad_norm": 0.4987182021141052, "learning_rate": 8.789278812038222e-05, "loss": 1.7266, "step": 24340 }, { "epoch": 1.3196368994191068, "grad_norm": 1.2388598918914795, "learning_rate": 8.788197762661723e-05, "loss": 1.7193, "step": 24350 }, { "epoch": 1.320178840584619, "grad_norm": 0.8786284327507019, "learning_rate": 8.78711630595756e-05, "loss": 1.7217, "step": 24360 }, { "epoch": 1.320178840584619, "eval_loss": 2.568453073501587, "eval_runtime": 21.9891, "eval_samples_per_second": 227.386, "eval_steps_per_second": 1.228, "step": 24360 }, { "epoch": 1.3207207817501312, "grad_norm": 0.41298526525497437, "learning_rate": 8.7860344420597e-05, "loss": 1.7191, "step": 24370 }, { "epoch": 1.3212627229156435, "grad_norm": 0.27922841906547546, "learning_rate": 8.78495217110216e-05, "loss": 1.7214, "step": 24380 }, { "epoch": 1.3218046640811556, "grad_norm": 0.3357425034046173, "learning_rate": 8.783869493219008e-05, "loss": 1.7165, "step": 24390 }, { "epoch": 1.322346605246668, "grad_norm": 0.6415514945983887, "learning_rate": 8.782786408544358e-05, "loss": 1.7259, "step": 24400 }, { "epoch": 1.3228885464121802, "grad_norm": 0.5723456144332886, "learning_rate": 8.78170291721238e-05, "loss": 1.7221, "step": 24410 }, { "epoch": 1.3234304875776923, "grad_norm": 0.27751269936561584, "learning_rate": 8.780619019357295e-05, "loss": 1.7241, "step": 24420 }, { "epoch": 1.3239724287432046, "grad_norm": 0.5434432029724121, "learning_rate": 8.779534715113368e-05, "loss": 1.7284, "step": 24430 }, { "epoch": 1.3245143699087167, "grad_norm": 0.3848007917404175, "learning_rate": 8.778450004614918e-05, "loss": 1.7164, "step": 24440 }, { "epoch": 1.3248937287245752, "eval_loss": 2.563822031021118, "eval_runtime": 21.9907, "eval_samples_per_second": 227.369, "eval_steps_per_second": 1.228, "step": 24447 }, { "epoch": 1.325056311074229, "grad_norm": 0.3684322237968445, "learning_rate": 8.777364887996315e-05, "loss": 1.7197, "step": 24450 }, { "epoch": 1.3255982522397411, "grad_norm": 0.3419678807258606, "learning_rate": 8.77627936539198e-05, "loss": 1.7221, "step": 24460 }, { "epoch": 1.3261401934052535, "grad_norm": 0.2709847688674927, "learning_rate": 8.77519343693638e-05, "loss": 1.7159, "step": 24470 }, { "epoch": 1.3266821345707656, "grad_norm": 0.29516398906707764, "learning_rate": 8.774107102764038e-05, "loss": 1.72, "step": 24480 }, { "epoch": 1.3272240757362779, "grad_norm": 0.29573437571525574, "learning_rate": 8.773020363009521e-05, "loss": 1.7187, "step": 24490 }, { "epoch": 1.3277660169017902, "grad_norm": 0.44336891174316406, "learning_rate": 8.771933217807453e-05, "loss": 1.7106, "step": 24500 }, { "epoch": 1.3283079580673023, "grad_norm": 0.3083667755126953, "learning_rate": 8.770845667292503e-05, "loss": 1.7229, "step": 24510 }, { "epoch": 1.3288498992328146, "grad_norm": 0.4367120862007141, "learning_rate": 8.769757711599391e-05, "loss": 1.7231, "step": 24520 }, { "epoch": 1.3293918403983267, "grad_norm": 0.40371039509773254, "learning_rate": 8.768669350862892e-05, "loss": 1.7188, "step": 24530 }, { "epoch": 1.3296086168645316, "eval_loss": 2.572169542312622, "eval_runtime": 21.9901, "eval_samples_per_second": 227.375, "eval_steps_per_second": 1.228, "step": 24534 }, { "epoch": 1.329933781563839, "grad_norm": 0.7102428674697876, "learning_rate": 8.767580585217823e-05, "loss": 1.7157, "step": 24540 }, { "epoch": 1.3304757227293513, "grad_norm": 0.35714074969291687, "learning_rate": 8.766491414799057e-05, "loss": 1.7066, "step": 24550 }, { "epoch": 1.3310176638948634, "grad_norm": 0.29054415225982666, "learning_rate": 8.765401839741517e-05, "loss": 1.7183, "step": 24560 }, { "epoch": 1.3315596050603755, "grad_norm": 0.26257362961769104, "learning_rate": 8.764311860180175e-05, "loss": 1.7163, "step": 24570 }, { "epoch": 1.3321015462258878, "grad_norm": 0.3010044991970062, "learning_rate": 8.763221476250051e-05, "loss": 1.7215, "step": 24580 }, { "epoch": 1.3326434873914001, "grad_norm": 0.33944013714790344, "learning_rate": 8.762130688086219e-05, "loss": 1.72, "step": 24590 }, { "epoch": 1.3331854285569122, "grad_norm": 0.5436154007911682, "learning_rate": 8.761039495823799e-05, "loss": 1.7121, "step": 24600 }, { "epoch": 1.3337273697224246, "grad_norm": 0.4238379895687103, "learning_rate": 8.759947899597964e-05, "loss": 1.7195, "step": 24610 }, { "epoch": 1.3342693108879367, "grad_norm": 0.2585737705230713, "learning_rate": 8.758855899543939e-05, "loss": 1.7015, "step": 24620 }, { "epoch": 1.334323505004488, "eval_loss": 2.5573627948760986, "eval_runtime": 21.9891, "eval_samples_per_second": 227.386, "eval_steps_per_second": 1.228, "step": 24621 }, { "epoch": 1.334811252053449, "grad_norm": 0.35536953806877136, "learning_rate": 8.75776349579699e-05, "loss": 1.7164, "step": 24630 }, { "epoch": 1.3353531932189613, "grad_norm": 0.4927321672439575, "learning_rate": 8.756670688492445e-05, "loss": 1.7181, "step": 24640 }, { "epoch": 1.3358951343844734, "grad_norm": 0.3061029613018036, "learning_rate": 8.755577477765674e-05, "loss": 1.7136, "step": 24650 }, { "epoch": 1.3364370755499855, "grad_norm": 0.27562111616134644, "learning_rate": 8.7544838637521e-05, "loss": 1.7222, "step": 24660 }, { "epoch": 1.3369790167154978, "grad_norm": 0.46605604887008667, "learning_rate": 8.753389846587194e-05, "loss": 1.7085, "step": 24670 }, { "epoch": 1.33752095788101, "grad_norm": 0.47294819355010986, "learning_rate": 8.752295426406479e-05, "loss": 1.711, "step": 24680 }, { "epoch": 1.3380628990465222, "grad_norm": 0.42514827847480774, "learning_rate": 8.751200603345524e-05, "loss": 1.7081, "step": 24690 }, { "epoch": 1.3386048402120345, "grad_norm": 0.46494096517562866, "learning_rate": 8.750105377539957e-05, "loss": 1.6989, "step": 24700 }, { "epoch": 1.3390383931444443, "eval_loss": 2.5702195167541504, "eval_runtime": 21.9879, "eval_samples_per_second": 227.398, "eval_steps_per_second": 1.228, "step": 24708 }, { "epoch": 1.3391467813775466, "grad_norm": 0.36518365144729614, "learning_rate": 8.749009749125445e-05, "loss": 1.7115, "step": 24710 }, { "epoch": 1.339688722543059, "grad_norm": 0.32633692026138306, "learning_rate": 8.747913718237712e-05, "loss": 1.7166, "step": 24720 }, { "epoch": 1.3402306637085712, "grad_norm": 0.370088130235672, "learning_rate": 8.746817285012527e-05, "loss": 1.714, "step": 24730 }, { "epoch": 1.3407726048740833, "grad_norm": 0.25269201397895813, "learning_rate": 8.745720449585714e-05, "loss": 1.7212, "step": 24740 }, { "epoch": 1.3413145460395957, "grad_norm": 0.3450757563114166, "learning_rate": 8.744623212093142e-05, "loss": 1.7241, "step": 24750 }, { "epoch": 1.3418564872051078, "grad_norm": 0.26472437381744385, "learning_rate": 8.743525572670734e-05, "loss": 1.7087, "step": 24760 }, { "epoch": 1.34239842837062, "grad_norm": 0.5389896035194397, "learning_rate": 8.74242753145446e-05, "loss": 1.7157, "step": 24770 }, { "epoch": 1.3429403695361322, "grad_norm": 0.6463091969490051, "learning_rate": 8.74132908858034e-05, "loss": 1.7056, "step": 24780 }, { "epoch": 1.3434823107016445, "grad_norm": 0.40934550762176514, "learning_rate": 8.740230244184448e-05, "loss": 1.7137, "step": 24790 }, { "epoch": 1.3437532812844006, "eval_loss": 2.5633254051208496, "eval_runtime": 21.9886, "eval_samples_per_second": 227.391, "eval_steps_per_second": 1.228, "step": 24795 }, { "epoch": 1.3440242518671566, "grad_norm": 0.4404658377170563, "learning_rate": 8.739130998402898e-05, "loss": 1.7126, "step": 24800 }, { "epoch": 1.3445661930326689, "grad_norm": 0.37851405143737793, "learning_rate": 8.738031351371863e-05, "loss": 1.7127, "step": 24810 }, { "epoch": 1.3451081341981812, "grad_norm": 0.30251404643058777, "learning_rate": 8.736931303227563e-05, "loss": 1.7293, "step": 24820 }, { "epoch": 1.3456500753636933, "grad_norm": 0.5295489430427551, "learning_rate": 8.735830854106267e-05, "loss": 1.7092, "step": 24830 }, { "epoch": 1.3461920165292056, "grad_norm": 0.54742431640625, "learning_rate": 8.734730004144292e-05, "loss": 1.7201, "step": 24840 }, { "epoch": 1.3467339576947177, "grad_norm": 0.6740279793739319, "learning_rate": 8.733628753478009e-05, "loss": 1.7187, "step": 24850 }, { "epoch": 1.34727589886023, "grad_norm": 0.4922647476196289, "learning_rate": 8.732527102243835e-05, "loss": 1.7185, "step": 24860 }, { "epoch": 1.3478178400257423, "grad_norm": 0.34301790595054626, "learning_rate": 8.731425050578238e-05, "loss": 1.709, "step": 24870 }, { "epoch": 1.3483597811912544, "grad_norm": 0.6136478185653687, "learning_rate": 8.730322598617734e-05, "loss": 1.7129, "step": 24880 }, { "epoch": 1.3484681694243568, "eval_loss": 2.5651490688323975, "eval_runtime": 21.9924, "eval_samples_per_second": 227.351, "eval_steps_per_second": 1.228, "step": 24882 }, { "epoch": 1.3489017223567665, "grad_norm": 0.44811177253723145, "learning_rate": 8.729219746498892e-05, "loss": 1.7175, "step": 24890 }, { "epoch": 1.3494436635222788, "grad_norm": 0.35493898391723633, "learning_rate": 8.728116494358325e-05, "loss": 1.7221, "step": 24900 }, { "epoch": 1.3499856046877912, "grad_norm": 0.3304137885570526, "learning_rate": 8.727012842332706e-05, "loss": 1.7252, "step": 24910 }, { "epoch": 1.3505275458533033, "grad_norm": 0.2520520091056824, "learning_rate": 8.725908790558746e-05, "loss": 1.7111, "step": 24920 }, { "epoch": 1.3510694870188156, "grad_norm": 0.48782554268836975, "learning_rate": 8.724804339173208e-05, "loss": 1.7112, "step": 24930 }, { "epoch": 1.3516114281843277, "grad_norm": 0.3623749017715454, "learning_rate": 8.72369948831291e-05, "loss": 1.7057, "step": 24940 }, { "epoch": 1.35215336934984, "grad_norm": 0.31124410033226013, "learning_rate": 8.722594238114716e-05, "loss": 1.702, "step": 24950 }, { "epoch": 1.3526953105153523, "grad_norm": 0.3612881600856781, "learning_rate": 8.721488588715539e-05, "loss": 1.7136, "step": 24960 }, { "epoch": 1.3531830575643131, "eval_loss": 2.5710861682891846, "eval_runtime": 21.9923, "eval_samples_per_second": 227.352, "eval_steps_per_second": 1.228, "step": 24969 }, { "epoch": 1.3532372516808644, "grad_norm": 0.3691853880882263, "learning_rate": 8.720382540252341e-05, "loss": 1.7128, "step": 24970 }, { "epoch": 1.3537791928463765, "grad_norm": 0.33288514614105225, "learning_rate": 8.719276092862137e-05, "loss": 1.6998, "step": 24980 }, { "epoch": 1.3543211340118888, "grad_norm": 0.28774330019950867, "learning_rate": 8.718169246681986e-05, "loss": 1.7154, "step": 24990 }, { "epoch": 1.3548630751774011, "grad_norm": 0.44046953320503235, "learning_rate": 8.717062001849001e-05, "loss": 1.6982, "step": 25000 }, { "epoch": 1.3554050163429132, "grad_norm": 0.40893176198005676, "learning_rate": 8.715954358500342e-05, "loss": 1.7179, "step": 25010 }, { "epoch": 1.3559469575084255, "grad_norm": 0.46221449971199036, "learning_rate": 8.714846316773218e-05, "loss": 1.7108, "step": 25020 }, { "epoch": 1.3564888986739376, "grad_norm": 0.4240012466907501, "learning_rate": 8.713737876804891e-05, "loss": 1.7114, "step": 25030 }, { "epoch": 1.35703083983945, "grad_norm": 0.39423850178718567, "learning_rate": 8.71262903873267e-05, "loss": 1.7035, "step": 25040 }, { "epoch": 1.3575727810049623, "grad_norm": 0.26944607496261597, "learning_rate": 8.711519802693911e-05, "loss": 1.7109, "step": 25050 }, { "epoch": 1.3578979457042695, "eval_loss": 2.564357280731201, "eval_runtime": 21.9966, "eval_samples_per_second": 227.308, "eval_steps_per_second": 1.227, "step": 25056 }, { "epoch": 1.3581147221704744, "grad_norm": 0.353392630815506, "learning_rate": 8.710410168826021e-05, "loss": 1.7087, "step": 25060 }, { "epoch": 1.3586566633359867, "grad_norm": 0.41630539298057556, "learning_rate": 8.709300137266456e-05, "loss": 1.7018, "step": 25070 }, { "epoch": 1.3591986045014988, "grad_norm": 0.3117140233516693, "learning_rate": 8.708189708152727e-05, "loss": 1.7139, "step": 25080 }, { "epoch": 1.359740545667011, "grad_norm": 0.522264301776886, "learning_rate": 8.707078881622381e-05, "loss": 1.7087, "step": 25090 }, { "epoch": 1.3602824868325232, "grad_norm": 0.2718927562236786, "learning_rate": 8.70596765781303e-05, "loss": 1.702, "step": 25100 }, { "epoch": 1.3608244279980355, "grad_norm": 0.5048019289970398, "learning_rate": 8.704856036862325e-05, "loss": 1.7174, "step": 25110 }, { "epoch": 1.3613663691635476, "grad_norm": 0.29533931612968445, "learning_rate": 8.703744018907966e-05, "loss": 1.7028, "step": 25120 }, { "epoch": 1.36190831032906, "grad_norm": 0.6198928356170654, "learning_rate": 8.702631604087709e-05, "loss": 1.7008, "step": 25130 }, { "epoch": 1.3624502514945722, "grad_norm": 0.3209061026573181, "learning_rate": 8.701518792539351e-05, "loss": 1.7072, "step": 25140 }, { "epoch": 1.3626128338442258, "eval_loss": 2.5585379600524902, "eval_runtime": 21.9846, "eval_samples_per_second": 227.432, "eval_steps_per_second": 1.228, "step": 25143 }, { "epoch": 1.3629921926600843, "grad_norm": 0.2755369246006012, "learning_rate": 8.700405584400747e-05, "loss": 1.7093, "step": 25150 }, { "epoch": 1.3635341338255966, "grad_norm": 0.24208010733127594, "learning_rate": 8.699291979809794e-05, "loss": 1.7136, "step": 25160 }, { "epoch": 1.3640760749911087, "grad_norm": 0.44155067205429077, "learning_rate": 8.69817797890444e-05, "loss": 1.6991, "step": 25170 }, { "epoch": 1.364618016156621, "grad_norm": 0.24571721255779266, "learning_rate": 8.697063581822682e-05, "loss": 1.6956, "step": 25180 }, { "epoch": 1.3651599573221331, "grad_norm": 0.2612999975681305, "learning_rate": 8.695948788702568e-05, "loss": 1.7069, "step": 25190 }, { "epoch": 1.3657018984876454, "grad_norm": 0.47256600856781006, "learning_rate": 8.694833599682194e-05, "loss": 1.6981, "step": 25200 }, { "epoch": 1.3662438396531575, "grad_norm": 0.3326663374900818, "learning_rate": 8.693718014899705e-05, "loss": 1.7145, "step": 25210 }, { "epoch": 1.3667857808186699, "grad_norm": 0.6608891487121582, "learning_rate": 8.692602034493292e-05, "loss": 1.6981, "step": 25220 }, { "epoch": 1.3673277219841822, "grad_norm": 0.3640691041946411, "learning_rate": 8.691485658601203e-05, "loss": 1.7105, "step": 25230 }, { "epoch": 1.3673277219841822, "eval_loss": 2.5621073246002197, "eval_runtime": 21.9283, "eval_samples_per_second": 228.016, "eval_steps_per_second": 1.231, "step": 25230 }, { "epoch": 1.3678696631496943, "grad_norm": 0.26905468106269836, "learning_rate": 8.690368887361724e-05, "loss": 1.7063, "step": 25240 }, { "epoch": 1.3684116043152066, "grad_norm": 0.33112001419067383, "learning_rate": 8.689251720913199e-05, "loss": 1.7043, "step": 25250 }, { "epoch": 1.3689535454807187, "grad_norm": 0.6896092891693115, "learning_rate": 8.688134159394017e-05, "loss": 1.7025, "step": 25260 }, { "epoch": 1.369495486646231, "grad_norm": 0.26553961634635925, "learning_rate": 8.687016202942617e-05, "loss": 1.7046, "step": 25270 }, { "epoch": 1.3700374278117433, "grad_norm": 0.6590327024459839, "learning_rate": 8.685897851697486e-05, "loss": 1.6894, "step": 25280 }, { "epoch": 1.3705793689772554, "grad_norm": 0.264779657125473, "learning_rate": 8.684779105797164e-05, "loss": 1.7135, "step": 25290 }, { "epoch": 1.3711213101427675, "grad_norm": 0.29008379578590393, "learning_rate": 8.68365996538023e-05, "loss": 1.7019, "step": 25300 }, { "epoch": 1.3716632513082798, "grad_norm": 0.28447601199150085, "learning_rate": 8.682540430585322e-05, "loss": 1.7155, "step": 25310 }, { "epoch": 1.3720426101241383, "eval_loss": 2.554466962814331, "eval_runtime": 21.9935, "eval_samples_per_second": 227.34, "eval_steps_per_second": 1.228, "step": 25317 }, { "epoch": 1.3722051924737921, "grad_norm": 0.4446844160556793, "learning_rate": 8.681420501551123e-05, "loss": 1.6982, "step": 25320 }, { "epoch": 1.3727471336393042, "grad_norm": 0.3506196439266205, "learning_rate": 8.680300178416364e-05, "loss": 1.6931, "step": 25330 }, { "epoch": 1.3732890748048165, "grad_norm": 0.28431805968284607, "learning_rate": 8.679179461319828e-05, "loss": 1.7094, "step": 25340 }, { "epoch": 1.3738310159703286, "grad_norm": 0.3544039726257324, "learning_rate": 8.678058350400342e-05, "loss": 1.6993, "step": 25350 }, { "epoch": 1.374372957135841, "grad_norm": 0.4839727580547333, "learning_rate": 8.676936845796785e-05, "loss": 1.712, "step": 25360 }, { "epoch": 1.3749148983013533, "grad_norm": 0.34992125630378723, "learning_rate": 8.675814947648085e-05, "loss": 1.7031, "step": 25370 }, { "epoch": 1.3754568394668654, "grad_norm": 0.37806737422943115, "learning_rate": 8.674692656093216e-05, "loss": 1.7053, "step": 25380 }, { "epoch": 1.3759987806323775, "grad_norm": 0.24221035838127136, "learning_rate": 8.673569971271206e-05, "loss": 1.702, "step": 25390 }, { "epoch": 1.3765407217978898, "grad_norm": 0.5913215279579163, "learning_rate": 8.672446893321125e-05, "loss": 1.6931, "step": 25400 }, { "epoch": 1.3767574982640947, "eval_loss": 2.552272081375122, "eval_runtime": 21.9916, "eval_samples_per_second": 227.36, "eval_steps_per_second": 1.228, "step": 25404 }, { "epoch": 1.377082662963402, "grad_norm": 0.29546642303466797, "learning_rate": 8.671323422382097e-05, "loss": 1.7001, "step": 25410 }, { "epoch": 1.3776246041289142, "grad_norm": 0.48717227578163147, "learning_rate": 8.67019955859329e-05, "loss": 1.7133, "step": 25420 }, { "epoch": 1.3781665452944265, "grad_norm": 0.3169041872024536, "learning_rate": 8.669075302093928e-05, "loss": 1.6972, "step": 25430 }, { "epoch": 1.3787084864599386, "grad_norm": 0.36612847447395325, "learning_rate": 8.667950653023274e-05, "loss": 1.6897, "step": 25440 }, { "epoch": 1.379250427625451, "grad_norm": 0.2583743929862976, "learning_rate": 8.666825611520648e-05, "loss": 1.7026, "step": 25450 }, { "epoch": 1.3797923687909632, "grad_norm": 0.34147560596466064, "learning_rate": 8.665700177725415e-05, "loss": 1.6969, "step": 25460 }, { "epoch": 1.3803343099564753, "grad_norm": 0.43001192808151245, "learning_rate": 8.664574351776988e-05, "loss": 1.708, "step": 25470 }, { "epoch": 1.3808762511219876, "grad_norm": 0.41845470666885376, "learning_rate": 8.663448133814826e-05, "loss": 1.7032, "step": 25480 }, { "epoch": 1.3814181922874997, "grad_norm": 0.3702855706214905, "learning_rate": 8.662321523978448e-05, "loss": 1.6965, "step": 25490 }, { "epoch": 1.381472386404051, "eval_loss": 2.5554749965667725, "eval_runtime": 21.99, "eval_samples_per_second": 227.376, "eval_steps_per_second": 1.228, "step": 25491 }, { "epoch": 1.381960133453012, "grad_norm": 0.4040187895298004, "learning_rate": 8.661194522407408e-05, "loss": 1.7025, "step": 25500 }, { "epoch": 1.3825020746185241, "grad_norm": 0.5149408578872681, "learning_rate": 8.660067129241315e-05, "loss": 1.698, "step": 25510 }, { "epoch": 1.3830440157840365, "grad_norm": 0.328133761882782, "learning_rate": 8.658939344619824e-05, "loss": 1.7068, "step": 25520 }, { "epoch": 1.3835859569495486, "grad_norm": 0.34496381878852844, "learning_rate": 8.657811168682644e-05, "loss": 1.7131, "step": 25530 }, { "epoch": 1.3841278981150609, "grad_norm": 0.29763221740722656, "learning_rate": 8.656682601569524e-05, "loss": 1.6992, "step": 25540 }, { "epoch": 1.3846698392805732, "grad_norm": 0.3687681257724762, "learning_rate": 8.655553643420268e-05, "loss": 1.7021, "step": 25550 }, { "epoch": 1.3852117804460853, "grad_norm": 0.36310508847236633, "learning_rate": 8.654424294374729e-05, "loss": 1.6958, "step": 25560 }, { "epoch": 1.3857537216115976, "grad_norm": 0.7248652577400208, "learning_rate": 8.653294554572802e-05, "loss": 1.7009, "step": 25570 }, { "epoch": 1.3861872745440074, "eval_loss": 2.5556063652038574, "eval_runtime": 21.9953, "eval_samples_per_second": 227.321, "eval_steps_per_second": 1.228, "step": 25578 }, { "epoch": 1.3862956627771097, "grad_norm": 0.2832872271537781, "learning_rate": 8.652164424154438e-05, "loss": 1.6906, "step": 25580 }, { "epoch": 1.386837603942622, "grad_norm": 0.6250040531158447, "learning_rate": 8.651033903259629e-05, "loss": 1.6885, "step": 25590 }, { "epoch": 1.3873795451081343, "grad_norm": 0.36408352851867676, "learning_rate": 8.649902992028419e-05, "loss": 1.697, "step": 25600 }, { "epoch": 1.3879214862736464, "grad_norm": 0.344611257314682, "learning_rate": 8.648771690600905e-05, "loss": 1.7002, "step": 25610 }, { "epoch": 1.3884634274391585, "grad_norm": 0.5186106562614441, "learning_rate": 8.647639999117221e-05, "loss": 1.6938, "step": 25620 }, { "epoch": 1.3890053686046708, "grad_norm": 0.3214333951473236, "learning_rate": 8.646507917717562e-05, "loss": 1.6959, "step": 25630 }, { "epoch": 1.3895473097701831, "grad_norm": 0.31978529691696167, "learning_rate": 8.645375446542162e-05, "loss": 1.6994, "step": 25640 }, { "epoch": 1.3900892509356952, "grad_norm": 0.3708828389644623, "learning_rate": 8.644242585731309e-05, "loss": 1.7054, "step": 25650 }, { "epoch": 1.3906311921012076, "grad_norm": 0.2623041272163391, "learning_rate": 8.643109335425335e-05, "loss": 1.6969, "step": 25660 }, { "epoch": 1.3909021626839637, "eval_loss": 2.5598154067993164, "eval_runtime": 21.9908, "eval_samples_per_second": 227.368, "eval_steps_per_second": 1.228, "step": 25665 }, { "epoch": 1.3911731332667197, "grad_norm": 0.23545025289058685, "learning_rate": 8.641975695764621e-05, "loss": 1.6934, "step": 25670 }, { "epoch": 1.391715074432232, "grad_norm": 0.31226447224617004, "learning_rate": 8.640841666889599e-05, "loss": 1.705, "step": 25680 }, { "epoch": 1.3922570155977443, "grad_norm": 0.25354868173599243, "learning_rate": 8.639707248940748e-05, "loss": 1.6927, "step": 25690 }, { "epoch": 1.3927989567632564, "grad_norm": 0.2758951783180237, "learning_rate": 8.638572442058596e-05, "loss": 1.6954, "step": 25700 }, { "epoch": 1.3933408979287685, "grad_norm": 0.2809247672557831, "learning_rate": 8.637437246383714e-05, "loss": 1.691, "step": 25710 }, { "epoch": 1.3938828390942808, "grad_norm": 0.27451246976852417, "learning_rate": 8.636301662056727e-05, "loss": 1.6959, "step": 25720 }, { "epoch": 1.394424780259793, "grad_norm": 0.5022933483123779, "learning_rate": 8.635165689218309e-05, "loss": 1.6908, "step": 25730 }, { "epoch": 1.3949667214253052, "grad_norm": 0.23696884512901306, "learning_rate": 8.634029328009174e-05, "loss": 1.6908, "step": 25740 }, { "epoch": 1.3955086625908175, "grad_norm": 0.2913939356803894, "learning_rate": 8.632892578570094e-05, "loss": 1.6963, "step": 25750 }, { "epoch": 1.3956170508239198, "eval_loss": 2.5667457580566406, "eval_runtime": 21.9931, "eval_samples_per_second": 227.344, "eval_steps_per_second": 1.228, "step": 25752 }, { "epoch": 1.3960506037563296, "grad_norm": 0.28175458312034607, "learning_rate": 8.631755441041884e-05, "loss": 1.7013, "step": 25760 }, { "epoch": 1.396592544921842, "grad_norm": 0.4736407399177551, "learning_rate": 8.630617915565405e-05, "loss": 1.7113, "step": 25770 }, { "epoch": 1.3971344860873542, "grad_norm": 0.33866068720817566, "learning_rate": 8.629480002281572e-05, "loss": 1.6952, "step": 25780 }, { "epoch": 1.3976764272528663, "grad_norm": 0.2977088689804077, "learning_rate": 8.628341701331341e-05, "loss": 1.7036, "step": 25790 }, { "epoch": 1.3982183684183787, "grad_norm": 0.26200008392333984, "learning_rate": 8.627203012855723e-05, "loss": 1.708, "step": 25800 }, { "epoch": 1.3987603095838907, "grad_norm": 0.6198749542236328, "learning_rate": 8.626063936995774e-05, "loss": 1.6924, "step": 25810 }, { "epoch": 1.399302250749403, "grad_norm": 0.6624993681907654, "learning_rate": 8.624924473892595e-05, "loss": 1.6978, "step": 25820 }, { "epoch": 1.3998441919149152, "grad_norm": 0.32168132066726685, "learning_rate": 8.62378462368734e-05, "loss": 1.7015, "step": 25830 }, { "epoch": 1.4003319389638762, "eval_loss": 2.560290575027466, "eval_runtime": 21.9932, "eval_samples_per_second": 227.343, "eval_steps_per_second": 1.228, "step": 25839 }, { "epoch": 1.4003861330804275, "grad_norm": 0.39742761850357056, "learning_rate": 8.622644386521207e-05, "loss": 1.6899, "step": 25840 }, { "epoch": 1.4009280742459396, "grad_norm": 0.4070427417755127, "learning_rate": 8.621503762535445e-05, "loss": 1.6905, "step": 25850 }, { "epoch": 1.4014700154114519, "grad_norm": 0.33487892150878906, "learning_rate": 8.620362751871349e-05, "loss": 1.6978, "step": 25860 }, { "epoch": 1.4020119565769642, "grad_norm": 0.2614098787307739, "learning_rate": 8.619221354670264e-05, "loss": 1.682, "step": 25870 }, { "epoch": 1.4025538977424763, "grad_norm": 0.3438650369644165, "learning_rate": 8.618079571073578e-05, "loss": 1.6924, "step": 25880 }, { "epoch": 1.4030958389079886, "grad_norm": 0.3044370710849762, "learning_rate": 8.616937401222734e-05, "loss": 1.6948, "step": 25890 }, { "epoch": 1.4036377800735007, "grad_norm": 0.3689815402030945, "learning_rate": 8.615794845259215e-05, "loss": 1.7012, "step": 25900 }, { "epoch": 1.404179721239013, "grad_norm": 0.2958444356918335, "learning_rate": 8.61465190332456e-05, "loss": 1.6935, "step": 25910 }, { "epoch": 1.4047216624045253, "grad_norm": 0.4378800392150879, "learning_rate": 8.613508575560348e-05, "loss": 1.6843, "step": 25920 }, { "epoch": 1.4050468271038326, "eval_loss": 2.5524935722351074, "eval_runtime": 21.9905, "eval_samples_per_second": 227.371, "eval_steps_per_second": 1.228, "step": 25926 }, { "epoch": 1.4052636035700374, "grad_norm": 0.2674407362937927, "learning_rate": 8.612364862108211e-05, "loss": 1.6896, "step": 25930 }, { "epoch": 1.4058055447355495, "grad_norm": 0.37812310457229614, "learning_rate": 8.611220763109829e-05, "loss": 1.6967, "step": 25940 }, { "epoch": 1.4063474859010618, "grad_norm": 0.25375431776046753, "learning_rate": 8.610076278706926e-05, "loss": 1.6858, "step": 25950 }, { "epoch": 1.4068894270665742, "grad_norm": 0.30164968967437744, "learning_rate": 8.608931409041273e-05, "loss": 1.695, "step": 25960 }, { "epoch": 1.4074313682320863, "grad_norm": 0.3200394809246063, "learning_rate": 8.607786154254699e-05, "loss": 1.6881, "step": 25970 }, { "epoch": 1.4079733093975986, "grad_norm": 0.3104444742202759, "learning_rate": 8.606640514489067e-05, "loss": 1.6972, "step": 25980 }, { "epoch": 1.4085152505631107, "grad_norm": 0.3415814936161041, "learning_rate": 8.605494489886297e-05, "loss": 1.6989, "step": 25990 }, { "epoch": 1.409057191728623, "grad_norm": 0.3201243281364441, "learning_rate": 8.604348080588351e-05, "loss": 1.6955, "step": 26000 }, { "epoch": 1.4095991328941353, "grad_norm": 0.2451564520597458, "learning_rate": 8.603201286737243e-05, "loss": 1.6988, "step": 26010 }, { "epoch": 1.409761715243789, "eval_loss": 2.557467222213745, "eval_runtime": 21.9947, "eval_samples_per_second": 227.328, "eval_steps_per_second": 1.228, "step": 26013 }, { "epoch": 1.4101410740596474, "grad_norm": 0.31296998262405396, "learning_rate": 8.602054108475032e-05, "loss": 1.6994, "step": 26020 }, { "epoch": 1.4106830152251595, "grad_norm": 0.6278992891311646, "learning_rate": 8.600906545943827e-05, "loss": 1.7028, "step": 26030 }, { "epoch": 1.4112249563906718, "grad_norm": 0.3286595642566681, "learning_rate": 8.59975859928578e-05, "loss": 1.7018, "step": 26040 }, { "epoch": 1.4117668975561841, "grad_norm": 0.26818498969078064, "learning_rate": 8.598610268643097e-05, "loss": 1.686, "step": 26050 }, { "epoch": 1.4123088387216962, "grad_norm": 0.3351867198944092, "learning_rate": 8.597461554158025e-05, "loss": 1.6841, "step": 26060 }, { "epoch": 1.4128507798872085, "grad_norm": 0.43873295187950134, "learning_rate": 8.596312455972866e-05, "loss": 1.7027, "step": 26070 }, { "epoch": 1.4133927210527206, "grad_norm": 0.3023853600025177, "learning_rate": 8.595162974229963e-05, "loss": 1.6949, "step": 26080 }, { "epoch": 1.413934662218233, "grad_norm": 0.30039265751838684, "learning_rate": 8.594013109071705e-05, "loss": 1.6887, "step": 26090 }, { "epoch": 1.4144766033837453, "grad_norm": 0.32320213317871094, "learning_rate": 8.59286286064054e-05, "loss": 1.6913, "step": 26100 }, { "epoch": 1.4144766033837453, "eval_loss": 2.557053327560425, "eval_runtime": 21.9348, "eval_samples_per_second": 227.948, "eval_steps_per_second": 1.231, "step": 26100 }, { "epoch": 1.4150185445492574, "grad_norm": 0.2611085772514343, "learning_rate": 8.591712229078949e-05, "loss": 1.6902, "step": 26110 }, { "epoch": 1.4155604857147697, "grad_norm": 0.6301568150520325, "learning_rate": 8.59056121452947e-05, "loss": 1.6871, "step": 26120 }, { "epoch": 1.4161024268802818, "grad_norm": 0.29995056986808777, "learning_rate": 8.589409817134685e-05, "loss": 1.6999, "step": 26130 }, { "epoch": 1.416644368045794, "grad_norm": 0.30796995759010315, "learning_rate": 8.588258037037227e-05, "loss": 1.6911, "step": 26140 }, { "epoch": 1.4171863092113062, "grad_norm": 0.28592872619628906, "learning_rate": 8.58710587437977e-05, "loss": 1.6882, "step": 26150 }, { "epoch": 1.4177282503768185, "grad_norm": 0.264588862657547, "learning_rate": 8.58595332930504e-05, "loss": 1.6967, "step": 26160 }, { "epoch": 1.4182701915423306, "grad_norm": 0.3666386306285858, "learning_rate": 8.58480040195581e-05, "loss": 1.682, "step": 26170 }, { "epoch": 1.418812132707843, "grad_norm": 0.38330572843551636, "learning_rate": 8.5836470924749e-05, "loss": 1.6829, "step": 26180 }, { "epoch": 1.4191914915237014, "eval_loss": 2.5586886405944824, "eval_runtime": 21.9928, "eval_samples_per_second": 227.348, "eval_steps_per_second": 1.228, "step": 26187 }, { "epoch": 1.4193540738733552, "grad_norm": 0.32240378856658936, "learning_rate": 8.582493401005175e-05, "loss": 1.6968, "step": 26190 }, { "epoch": 1.4198960150388673, "grad_norm": 0.3624284267425537, "learning_rate": 8.581339327689549e-05, "loss": 1.7, "step": 26200 }, { "epoch": 1.4204379562043796, "grad_norm": 0.3740536868572235, "learning_rate": 8.580184872670987e-05, "loss": 1.6942, "step": 26210 }, { "epoch": 1.4209798973698917, "grad_norm": 0.5000831484794617, "learning_rate": 8.579030036092495e-05, "loss": 1.6869, "step": 26220 }, { "epoch": 1.421521838535404, "grad_norm": 0.4658829867839813, "learning_rate": 8.57787481809713e-05, "loss": 1.6873, "step": 26230 }, { "epoch": 1.4220637797009164, "grad_norm": 0.3881395161151886, "learning_rate": 8.576719218827996e-05, "loss": 1.6912, "step": 26240 }, { "epoch": 1.4226057208664284, "grad_norm": 0.6568459868431091, "learning_rate": 8.575563238428241e-05, "loss": 1.6982, "step": 26250 }, { "epoch": 1.4231476620319405, "grad_norm": 0.2987072765827179, "learning_rate": 8.574406877041067e-05, "loss": 1.6951, "step": 26260 }, { "epoch": 1.4236896031974529, "grad_norm": 0.3211478590965271, "learning_rate": 8.573250134809715e-05, "loss": 1.7003, "step": 26270 }, { "epoch": 1.4239063796636577, "eval_loss": 2.5507450103759766, "eval_runtime": 21.9925, "eval_samples_per_second": 227.35, "eval_steps_per_second": 1.228, "step": 26274 }, { "epoch": 1.4242315443629652, "grad_norm": 0.327347993850708, "learning_rate": 8.57209301187748e-05, "loss": 1.6815, "step": 26280 }, { "epoch": 1.4247734855284773, "grad_norm": 0.3275546133518219, "learning_rate": 8.5709355083877e-05, "loss": 1.6945, "step": 26290 }, { "epoch": 1.4253154266939896, "grad_norm": 0.36542415618896484, "learning_rate": 8.569777624483763e-05, "loss": 1.695, "step": 26300 }, { "epoch": 1.4258573678595017, "grad_norm": 0.52955561876297, "learning_rate": 8.568619360309102e-05, "loss": 1.6802, "step": 26310 }, { "epoch": 1.426399309025014, "grad_norm": 0.9618374109268188, "learning_rate": 8.567460716007195e-05, "loss": 1.6978, "step": 26320 }, { "epoch": 1.4269412501905263, "grad_norm": 0.7096937298774719, "learning_rate": 8.566301691721574e-05, "loss": 1.6956, "step": 26330 }, { "epoch": 1.4274831913560384, "grad_norm": 0.2932894825935364, "learning_rate": 8.56514228759581e-05, "loss": 1.6919, "step": 26340 }, { "epoch": 1.4280251325215505, "grad_norm": 0.38948243856430054, "learning_rate": 8.563982503773527e-05, "loss": 1.6941, "step": 26350 }, { "epoch": 1.4285670736870628, "grad_norm": 0.2520905137062073, "learning_rate": 8.562822340398395e-05, "loss": 1.6801, "step": 26360 }, { "epoch": 1.428621267803614, "eval_loss": 2.5581541061401367, "eval_runtime": 21.991, "eval_samples_per_second": 227.366, "eval_steps_per_second": 1.228, "step": 26361 }, { "epoch": 1.4291090148525751, "grad_norm": 0.26686540246009827, "learning_rate": 8.56166179761413e-05, "loss": 1.6883, "step": 26370 }, { "epoch": 1.4296509560180872, "grad_norm": 0.45527172088623047, "learning_rate": 8.560500875564491e-05, "loss": 1.6885, "step": 26380 }, { "epoch": 1.4301928971835995, "grad_norm": 0.33151310682296753, "learning_rate": 8.559339574393294e-05, "loss": 1.6901, "step": 26390 }, { "epoch": 1.4307348383491116, "grad_norm": 0.4582711458206177, "learning_rate": 8.558177894244392e-05, "loss": 1.6876, "step": 26400 }, { "epoch": 1.431276779514624, "grad_norm": 0.23842747509479523, "learning_rate": 8.557015835261688e-05, "loss": 1.7054, "step": 26410 }, { "epoch": 1.4318187206801363, "grad_norm": 0.4057098627090454, "learning_rate": 8.555853397589136e-05, "loss": 1.7013, "step": 26420 }, { "epoch": 1.4323606618456484, "grad_norm": 0.4992932081222534, "learning_rate": 8.554690581370732e-05, "loss": 1.6931, "step": 26430 }, { "epoch": 1.4329026030111607, "grad_norm": 0.4780980944633484, "learning_rate": 8.553527386750521e-05, "loss": 1.6896, "step": 26440 }, { "epoch": 1.4333361559435704, "eval_loss": 2.556183099746704, "eval_runtime": 21.9936, "eval_samples_per_second": 227.339, "eval_steps_per_second": 1.228, "step": 26448 }, { "epoch": 1.4334445441766728, "grad_norm": 0.5446142554283142, "learning_rate": 8.552363813872594e-05, "loss": 1.6788, "step": 26450 }, { "epoch": 1.433986485342185, "grad_norm": 0.48127421736717224, "learning_rate": 8.551199862881091e-05, "loss": 1.6734, "step": 26460 }, { "epoch": 1.4345284265076972, "grad_norm": 0.30122366547584534, "learning_rate": 8.550035533920194e-05, "loss": 1.6897, "step": 26470 }, { "epoch": 1.4350703676732095, "grad_norm": 0.36052316427230835, "learning_rate": 8.548870827134139e-05, "loss": 1.6886, "step": 26480 }, { "epoch": 1.4356123088387216, "grad_norm": 0.47297677397727966, "learning_rate": 8.547705742667203e-05, "loss": 1.6943, "step": 26490 }, { "epoch": 1.436154250004234, "grad_norm": 0.26182791590690613, "learning_rate": 8.54654028066371e-05, "loss": 1.6938, "step": 26500 }, { "epoch": 1.4366961911697462, "grad_norm": 0.3834492564201355, "learning_rate": 8.545374441268033e-05, "loss": 1.6822, "step": 26510 }, { "epoch": 1.4372381323352583, "grad_norm": 0.29216572642326355, "learning_rate": 8.544208224624592e-05, "loss": 1.6838, "step": 26520 }, { "epoch": 1.4377800735007706, "grad_norm": 0.2463953047990799, "learning_rate": 8.543041630877856e-05, "loss": 1.6817, "step": 26530 }, { "epoch": 1.4380510440835268, "eval_loss": 2.5639803409576416, "eval_runtime": 21.9922, "eval_samples_per_second": 227.353, "eval_steps_per_second": 1.228, "step": 26535 }, { "epoch": 1.4383220146662827, "grad_norm": 0.6637621521949768, "learning_rate": 8.54187466017233e-05, "loss": 1.6862, "step": 26540 }, { "epoch": 1.438863955831795, "grad_norm": 0.33043143153190613, "learning_rate": 8.540707312652581e-05, "loss": 1.704, "step": 26550 }, { "epoch": 1.4394058969973074, "grad_norm": 0.36580994725227356, "learning_rate": 8.53953958846321e-05, "loss": 1.6747, "step": 26560 }, { "epoch": 1.4399478381628195, "grad_norm": 0.5655596256256104, "learning_rate": 8.538371487748871e-05, "loss": 1.688, "step": 26570 }, { "epoch": 1.4404897793283316, "grad_norm": 0.41566041111946106, "learning_rate": 8.537203010654266e-05, "loss": 1.6908, "step": 26580 }, { "epoch": 1.4410317204938439, "grad_norm": 0.3370110094547272, "learning_rate": 8.536034157324135e-05, "loss": 1.6906, "step": 26590 }, { "epoch": 1.4415736616593562, "grad_norm": 0.4993593692779541, "learning_rate": 8.534864927903274e-05, "loss": 1.6829, "step": 26600 }, { "epoch": 1.4421156028248683, "grad_norm": 0.28498584032058716, "learning_rate": 8.533695322536523e-05, "loss": 1.6933, "step": 26610 }, { "epoch": 1.4426575439903806, "grad_norm": 0.7398360371589661, "learning_rate": 8.532525341368767e-05, "loss": 1.686, "step": 26620 }, { "epoch": 1.442765932223483, "eval_loss": 2.5482335090637207, "eval_runtime": 21.988, "eval_samples_per_second": 227.397, "eval_steps_per_second": 1.228, "step": 26622 }, { "epoch": 1.4431994851558927, "grad_norm": 0.40166711807250977, "learning_rate": 8.531354984544936e-05, "loss": 1.6862, "step": 26630 }, { "epoch": 1.443741426321405, "grad_norm": 0.43468061089515686, "learning_rate": 8.530184252210012e-05, "loss": 1.6952, "step": 26640 }, { "epoch": 1.4442833674869173, "grad_norm": 0.39255911111831665, "learning_rate": 8.529013144509017e-05, "loss": 1.6928, "step": 26650 }, { "epoch": 1.4448253086524294, "grad_norm": 0.3616562783718109, "learning_rate": 8.527841661587024e-05, "loss": 1.6716, "step": 26660 }, { "epoch": 1.4453672498179415, "grad_norm": 0.5667160153388977, "learning_rate": 8.526669803589153e-05, "loss": 1.6842, "step": 26670 }, { "epoch": 1.4459091909834538, "grad_norm": 0.6671094298362732, "learning_rate": 8.525497570660567e-05, "loss": 1.6756, "step": 26680 }, { "epoch": 1.4464511321489661, "grad_norm": 0.4949670732021332, "learning_rate": 8.524324962946477e-05, "loss": 1.6857, "step": 26690 }, { "epoch": 1.4469930733144782, "grad_norm": 0.2640099823474884, "learning_rate": 8.52315198059214e-05, "loss": 1.6939, "step": 26700 }, { "epoch": 1.4474808203634393, "eval_loss": 2.541543960571289, "eval_runtime": 21.9948, "eval_samples_per_second": 227.326, "eval_steps_per_second": 1.228, "step": 26709 }, { "epoch": 1.4475350144799906, "grad_norm": 0.24292974174022675, "learning_rate": 8.521978623742861e-05, "loss": 1.6733, "step": 26710 }, { "epoch": 1.4480769556455027, "grad_norm": 0.40531477332115173, "learning_rate": 8.520804892543993e-05, "loss": 1.6769, "step": 26720 }, { "epoch": 1.448618896811015, "grad_norm": 0.37671342492103577, "learning_rate": 8.519630787140926e-05, "loss": 1.6894, "step": 26730 }, { "epoch": 1.4491608379765273, "grad_norm": 0.3520098328590393, "learning_rate": 8.51845630767911e-05, "loss": 1.6852, "step": 26740 }, { "epoch": 1.4497027791420394, "grad_norm": 0.3429430425167084, "learning_rate": 8.517281454304031e-05, "loss": 1.6855, "step": 26750 }, { "epoch": 1.4502447203075517, "grad_norm": 0.22939632833003998, "learning_rate": 8.516106227161223e-05, "loss": 1.6898, "step": 26760 }, { "epoch": 1.4507866614730638, "grad_norm": 0.2420610934495926, "learning_rate": 8.514930626396273e-05, "loss": 1.6847, "step": 26770 }, { "epoch": 1.451328602638576, "grad_norm": 0.3113659620285034, "learning_rate": 8.513754652154805e-05, "loss": 1.6867, "step": 26780 }, { "epoch": 1.4518705438040882, "grad_norm": 0.34163153171539307, "learning_rate": 8.512578304582495e-05, "loss": 1.695, "step": 26790 }, { "epoch": 1.4521957085033956, "eval_loss": 2.5400216579437256, "eval_runtime": 21.9954, "eval_samples_per_second": 227.32, "eval_steps_per_second": 1.228, "step": 26796 }, { "epoch": 1.4524124849696005, "grad_norm": 0.26067569851875305, "learning_rate": 8.511401583825066e-05, "loss": 1.6843, "step": 26800 }, { "epoch": 1.4529544261351126, "grad_norm": 0.34139981865882874, "learning_rate": 8.51022449002828e-05, "loss": 1.68, "step": 26810 }, { "epoch": 1.453496367300625, "grad_norm": 0.40918228030204773, "learning_rate": 8.509047023337954e-05, "loss": 1.6758, "step": 26820 }, { "epoch": 1.4540383084661372, "grad_norm": 0.33097583055496216, "learning_rate": 8.507869183899947e-05, "loss": 1.6781, "step": 26830 }, { "epoch": 1.4545802496316493, "grad_norm": 0.5380516648292542, "learning_rate": 8.506690971860167e-05, "loss": 1.6753, "step": 26840 }, { "epoch": 1.4551221907971617, "grad_norm": 0.3190267086029053, "learning_rate": 8.50551238736456e-05, "loss": 1.6841, "step": 26850 }, { "epoch": 1.4556641319626737, "grad_norm": 0.40389493107795715, "learning_rate": 8.504333430559127e-05, "loss": 1.687, "step": 26860 }, { "epoch": 1.456206073128186, "grad_norm": 0.3580215573310852, "learning_rate": 8.503154101589915e-05, "loss": 1.6818, "step": 26870 }, { "epoch": 1.4567480142936984, "grad_norm": 0.2765572667121887, "learning_rate": 8.501974400603009e-05, "loss": 1.6859, "step": 26880 }, { "epoch": 1.456910596643352, "eval_loss": 2.5574100017547607, "eval_runtime": 21.9907, "eval_samples_per_second": 227.368, "eval_steps_per_second": 1.228, "step": 26883 }, { "epoch": 1.4572899554592105, "grad_norm": 0.2352159172296524, "learning_rate": 8.500794327744547e-05, "loss": 1.679, "step": 26890 }, { "epoch": 1.4578318966247226, "grad_norm": 0.241908460855484, "learning_rate": 8.499613883160712e-05, "loss": 1.6749, "step": 26900 }, { "epoch": 1.4583738377902349, "grad_norm": 0.46047091484069824, "learning_rate": 8.498433066997733e-05, "loss": 1.6867, "step": 26910 }, { "epoch": 1.4589157789557472, "grad_norm": 0.4493439793586731, "learning_rate": 8.497251879401884e-05, "loss": 1.6741, "step": 26920 }, { "epoch": 1.4594577201212593, "grad_norm": 0.26538506150245667, "learning_rate": 8.496070320519484e-05, "loss": 1.6759, "step": 26930 }, { "epoch": 1.4599996612867716, "grad_norm": 0.3610468804836273, "learning_rate": 8.494888390496901e-05, "loss": 1.6859, "step": 26940 }, { "epoch": 1.4605416024522837, "grad_norm": 0.3829333782196045, "learning_rate": 8.493706089480546e-05, "loss": 1.6795, "step": 26950 }, { "epoch": 1.461083543617796, "grad_norm": 0.26342180371284485, "learning_rate": 8.492523417616878e-05, "loss": 1.6823, "step": 26960 }, { "epoch": 1.4616254847833083, "grad_norm": 0.3344675302505493, "learning_rate": 8.491340375052403e-05, "loss": 1.6862, "step": 26970 }, { "epoch": 1.4616254847833083, "eval_loss": 2.554945230484009, "eval_runtime": 21.9254, "eval_samples_per_second": 228.046, "eval_steps_per_second": 1.231, "step": 26970 }, { "epoch": 1.4621674259488204, "grad_norm": 0.28914472460746765, "learning_rate": 8.490156961933666e-05, "loss": 1.6751, "step": 26980 }, { "epoch": 1.4627093671143325, "grad_norm": 0.46620985865592957, "learning_rate": 8.488973178407269e-05, "loss": 1.689, "step": 26990 }, { "epoch": 1.4632513082798448, "grad_norm": 0.2889445722103119, "learning_rate": 8.487789024619852e-05, "loss": 1.6843, "step": 27000 }, { "epoch": 1.0005419411655123, "grad_norm": 0.23382456600666046, "learning_rate": 8.486604500718102e-05, "loss": 1.6714, "step": 27010 }, { "epoch": 1.0010838823310244, "grad_norm": 0.554624617099762, "learning_rate": 8.485419606848751e-05, "loss": 1.675, "step": 27020 }, { "epoch": 1.0016258234965367, "grad_norm": 0.48615115880966187, "learning_rate": 8.484234343158583e-05, "loss": 1.6822, "step": 27030 }, { "epoch": 1.0021677646620488, "grad_norm": 0.348832368850708, "learning_rate": 8.48304870979442e-05, "loss": 1.6722, "step": 27040 }, { "epoch": 1.0027097058275611, "grad_norm": 0.5944726467132568, "learning_rate": 8.481862706903132e-05, "loss": 1.6743, "step": 27050 }, { "epoch": 1.0030890646434196, "eval_loss": 2.5551207065582275, "eval_runtime": 22.1704, "eval_samples_per_second": 225.526, "eval_steps_per_second": 1.218, "step": 27057 }, { "epoch": 1.0032516469930732, "grad_norm": 0.34727635979652405, "learning_rate": 8.48067633463164e-05, "loss": 1.6697, "step": 27060 }, { "epoch": 1.0037935881585855, "grad_norm": 0.37410902976989746, "learning_rate": 8.479489593126904e-05, "loss": 1.666, "step": 27070 }, { "epoch": 1.0043355293240976, "grad_norm": 0.3332069516181946, "learning_rate": 8.478302482535933e-05, "loss": 1.6891, "step": 27080 }, { "epoch": 1.00487747048961, "grad_norm": 0.2845593988895416, "learning_rate": 8.477115003005781e-05, "loss": 1.6745, "step": 27090 }, { "epoch": 1.0054194116551223, "grad_norm": 0.367365300655365, "learning_rate": 8.475927154683547e-05, "loss": 1.6853, "step": 27100 }, { "epoch": 1.0059613528206344, "grad_norm": 0.2617562711238861, "learning_rate": 8.474738937716378e-05, "loss": 1.6763, "step": 27110 }, { "epoch": 1.0065032939861467, "grad_norm": 0.4524533748626709, "learning_rate": 8.473550352251465e-05, "loss": 1.6789, "step": 27120 }, { "epoch": 1.0070452351516588, "grad_norm": 0.39219167828559875, "learning_rate": 8.472361398436043e-05, "loss": 1.6915, "step": 27130 }, { "epoch": 1.007587176317171, "grad_norm": 0.47083818912506104, "learning_rate": 8.471172076417397e-05, "loss": 1.6807, "step": 27140 }, { "epoch": 1.007803952783376, "eval_loss": 2.5521044731140137, "eval_runtime": 21.9818, "eval_samples_per_second": 227.461, "eval_steps_per_second": 1.228, "step": 27144 }, { "epoch": 1.0081291174826832, "grad_norm": 0.33303186297416687, "learning_rate": 8.469982386342851e-05, "loss": 1.6765, "step": 27150 }, { "epoch": 1.0086710586481955, "grad_norm": 0.28193533420562744, "learning_rate": 8.468792328359783e-05, "loss": 1.674, "step": 27160 }, { "epoch": 1.0092129998137078, "grad_norm": 0.35703226923942566, "learning_rate": 8.467601902615613e-05, "loss": 1.6827, "step": 27170 }, { "epoch": 1.00975494097922, "grad_norm": 0.36111465096473694, "learning_rate": 8.4664111092578e-05, "loss": 1.6728, "step": 27180 }, { "epoch": 1.0102968821447322, "grad_norm": 0.28821465373039246, "learning_rate": 8.465219948433858e-05, "loss": 1.6828, "step": 27190 }, { "epoch": 1.0108388233102443, "grad_norm": 0.3027511537075043, "learning_rate": 8.464028420291344e-05, "loss": 1.6751, "step": 27200 }, { "epoch": 1.0113807644757566, "grad_norm": 0.28520819544792175, "learning_rate": 8.462836524977856e-05, "loss": 1.6664, "step": 27210 }, { "epoch": 1.0119227056412687, "grad_norm": 0.4790101647377014, "learning_rate": 8.461644262641044e-05, "loss": 1.6729, "step": 27220 }, { "epoch": 1.012464646806781, "grad_norm": 0.43564823269844055, "learning_rate": 8.460451633428599e-05, "loss": 1.6749, "step": 27230 }, { "epoch": 1.0125188409233323, "eval_loss": 2.557983636856079, "eval_runtime": 21.9815, "eval_samples_per_second": 227.464, "eval_steps_per_second": 1.228, "step": 27231 }, { "epoch": 1.0130065879722931, "grad_norm": 0.24843677878379822, "learning_rate": 8.459258637488259e-05, "loss": 1.6682, "step": 27240 }, { "epoch": 1.0135485291378055, "grad_norm": 0.2693169414997101, "learning_rate": 8.458065274967806e-05, "loss": 1.6843, "step": 27250 }, { "epoch": 1.0140904703033178, "grad_norm": 0.2424820065498352, "learning_rate": 8.45687154601507e-05, "loss": 1.6775, "step": 27260 }, { "epoch": 1.0146324114688299, "grad_norm": 0.37894406914711, "learning_rate": 8.455677450777922e-05, "loss": 1.6716, "step": 27270 }, { "epoch": 1.0151743526343422, "grad_norm": 0.771432638168335, "learning_rate": 8.454482989404286e-05, "loss": 1.6872, "step": 27280 }, { "epoch": 1.0157162937998543, "grad_norm": 0.28843340277671814, "learning_rate": 8.453288162042124e-05, "loss": 1.681, "step": 27290 }, { "epoch": 1.0162582349653666, "grad_norm": 0.31491175293922424, "learning_rate": 8.452092968839446e-05, "loss": 1.6717, "step": 27300 }, { "epoch": 1.0168001761308787, "grad_norm": 0.38847339153289795, "learning_rate": 8.450897409944309e-05, "loss": 1.6786, "step": 27310 }, { "epoch": 1.0172337290632885, "eval_loss": 2.552642345428467, "eval_runtime": 21.9929, "eval_samples_per_second": 227.346, "eval_steps_per_second": 1.228, "step": 27318 }, { "epoch": 1.017342117296391, "grad_norm": 0.6366086602210999, "learning_rate": 8.44970148550481e-05, "loss": 1.6791, "step": 27320 }, { "epoch": 1.0178840584619033, "grad_norm": 0.5009103417396545, "learning_rate": 8.448505195669099e-05, "loss": 1.6762, "step": 27330 }, { "epoch": 1.0184259996274154, "grad_norm": 0.31960320472717285, "learning_rate": 8.447308540585364e-05, "loss": 1.676, "step": 27340 }, { "epoch": 1.0189679407929277, "grad_norm": 0.5080009698867798, "learning_rate": 8.446111520401845e-05, "loss": 1.6768, "step": 27350 }, { "epoch": 1.0195098819584398, "grad_norm": 0.44143038988113403, "learning_rate": 8.44491413526682e-05, "loss": 1.6816, "step": 27360 }, { "epoch": 1.0200518231239522, "grad_norm": 0.3140455186367035, "learning_rate": 8.443716385328618e-05, "loss": 1.6696, "step": 27370 }, { "epoch": 1.0205937642894642, "grad_norm": 0.5069706439971924, "learning_rate": 8.442518270735611e-05, "loss": 1.6752, "step": 27380 }, { "epoch": 1.0211357054549766, "grad_norm": 1.0147614479064941, "learning_rate": 8.441319791636215e-05, "loss": 1.6684, "step": 27390 }, { "epoch": 1.0216776466204887, "grad_norm": 0.2649192214012146, "learning_rate": 8.440120948178895e-05, "loss": 1.6823, "step": 27400 }, { "epoch": 1.0219486172032448, "eval_loss": 2.5535383224487305, "eval_runtime": 21.989, "eval_samples_per_second": 227.386, "eval_steps_per_second": 1.228, "step": 27405 }, { "epoch": 1.022219587786001, "grad_norm": 0.2985130250453949, "learning_rate": 8.438921740512153e-05, "loss": 1.6679, "step": 27410 }, { "epoch": 1.0227615289515133, "grad_norm": 0.3355660140514374, "learning_rate": 8.437722168784548e-05, "loss": 1.6648, "step": 27420 }, { "epoch": 1.0233034701170254, "grad_norm": 0.30278173089027405, "learning_rate": 8.436522233144675e-05, "loss": 1.6676, "step": 27430 }, { "epoch": 1.0238454112825377, "grad_norm": 0.24776387214660645, "learning_rate": 8.435321933741177e-05, "loss": 1.6718, "step": 27440 }, { "epoch": 1.0243873524480498, "grad_norm": 0.25117209553718567, "learning_rate": 8.434121270722741e-05, "loss": 1.6791, "step": 27450 }, { "epoch": 1.024929293613562, "grad_norm": 0.23679675161838531, "learning_rate": 8.4329202442381e-05, "loss": 1.6793, "step": 27460 }, { "epoch": 1.0254712347790742, "grad_norm": 0.4516010582447052, "learning_rate": 8.431718854436034e-05, "loss": 1.6673, "step": 27470 }, { "epoch": 1.0260131759445865, "grad_norm": 0.2465139925479889, "learning_rate": 8.430517101465364e-05, "loss": 1.6743, "step": 27480 }, { "epoch": 1.0265551171100988, "grad_norm": 0.3209734857082367, "learning_rate": 8.429314985474958e-05, "loss": 1.67, "step": 27490 }, { "epoch": 1.0266635053432012, "eval_loss": 2.5473275184631348, "eval_runtime": 21.9881, "eval_samples_per_second": 227.396, "eval_steps_per_second": 1.228, "step": 27492 }, { "epoch": 1.027097058275611, "grad_norm": 0.2319590449333191, "learning_rate": 8.42811250661373e-05, "loss": 1.6803, "step": 27500 }, { "epoch": 1.0276389994411232, "grad_norm": 0.3213670253753662, "learning_rate": 8.426909665030637e-05, "loss": 1.6656, "step": 27510 }, { "epoch": 1.0281809406066353, "grad_norm": 0.29917410016059875, "learning_rate": 8.425706460874682e-05, "loss": 1.6738, "step": 27520 }, { "epoch": 1.0287228817721477, "grad_norm": 0.24403968453407288, "learning_rate": 8.424502894294913e-05, "loss": 1.6695, "step": 27530 }, { "epoch": 1.0292648229376598, "grad_norm": 0.3891294002532959, "learning_rate": 8.423298965440423e-05, "loss": 1.6619, "step": 27540 }, { "epoch": 1.029806764103172, "grad_norm": 0.4318540692329407, "learning_rate": 8.422094674460348e-05, "loss": 1.6751, "step": 27550 }, { "epoch": 1.0303487052686842, "grad_norm": 0.3217497766017914, "learning_rate": 8.420890021503872e-05, "loss": 1.6667, "step": 27560 }, { "epoch": 1.0308906464341965, "grad_norm": 0.31174400448799133, "learning_rate": 8.419685006720221e-05, "loss": 1.6779, "step": 27570 }, { "epoch": 1.0313783934831575, "eval_loss": 2.553981304168701, "eval_runtime": 22.0351, "eval_samples_per_second": 226.91, "eval_steps_per_second": 1.225, "step": 27579 }, { "epoch": 1.0314325875997088, "grad_norm": 0.25709420442581177, "learning_rate": 8.418479630258666e-05, "loss": 1.681, "step": 27580 }, { "epoch": 1.0319745287652209, "grad_norm": 0.2779182195663452, "learning_rate": 8.417273892268527e-05, "loss": 1.6807, "step": 27590 }, { "epoch": 1.0325164699307332, "grad_norm": 0.41596314311027527, "learning_rate": 8.416067792899162e-05, "loss": 1.6761, "step": 27600 }, { "epoch": 1.0330584110962453, "grad_norm": 0.2902078628540039, "learning_rate": 8.41486133229998e-05, "loss": 1.6773, "step": 27610 }, { "epoch": 1.0336003522617576, "grad_norm": 0.2825920879840851, "learning_rate": 8.413654510620432e-05, "loss": 1.6655, "step": 27620 }, { "epoch": 1.0341422934272697, "grad_norm": 0.3873823285102844, "learning_rate": 8.412447328010012e-05, "loss": 1.6638, "step": 27630 }, { "epoch": 1.034684234592782, "grad_norm": 1.103782057762146, "learning_rate": 8.411239784618263e-05, "loss": 1.6825, "step": 27640 }, { "epoch": 1.0352261757582943, "grad_norm": 0.632357120513916, "learning_rate": 8.410031880594768e-05, "loss": 1.6624, "step": 27650 }, { "epoch": 1.0357681169238064, "grad_norm": 0.3010210394859314, "learning_rate": 8.408823616089157e-05, "loss": 1.6669, "step": 27660 }, { "epoch": 1.0360932816231139, "eval_loss": 2.547511100769043, "eval_runtime": 21.993, "eval_samples_per_second": 227.345, "eval_steps_per_second": 1.228, "step": 27666 }, { "epoch": 1.0363100580893188, "grad_norm": 0.5328647494316101, "learning_rate": 8.407614991251106e-05, "loss": 1.6691, "step": 27670 }, { "epoch": 1.0368519992548308, "grad_norm": 0.328873872756958, "learning_rate": 8.406406006230333e-05, "loss": 1.6752, "step": 27680 }, { "epoch": 1.0373939404203432, "grad_norm": 0.5136923789978027, "learning_rate": 8.405196661176604e-05, "loss": 1.6724, "step": 27690 }, { "epoch": 1.0379358815858553, "grad_norm": 0.2645358443260193, "learning_rate": 8.403986956239724e-05, "loss": 1.67, "step": 27700 }, { "epoch": 1.0384778227513676, "grad_norm": 0.31197217106819153, "learning_rate": 8.402776891569547e-05, "loss": 1.6693, "step": 27710 }, { "epoch": 1.0390197639168797, "grad_norm": 0.3356460928916931, "learning_rate": 8.401566467315973e-05, "loss": 1.668, "step": 27720 }, { "epoch": 1.039561705082392, "grad_norm": 0.3273829221725464, "learning_rate": 8.400355683628941e-05, "loss": 1.6722, "step": 27730 }, { "epoch": 1.0401036462479043, "grad_norm": 0.26531314849853516, "learning_rate": 8.399144540658441e-05, "loss": 1.6708, "step": 27740 }, { "epoch": 1.0406455874134164, "grad_norm": 0.2471926361322403, "learning_rate": 8.397933038554502e-05, "loss": 1.6663, "step": 27750 }, { "epoch": 1.04080816976307, "eval_loss": 2.5492753982543945, "eval_runtime": 21.9881, "eval_samples_per_second": 227.396, "eval_steps_per_second": 1.228, "step": 27753 }, { "epoch": 1.0411875285789287, "grad_norm": 0.35514238476753235, "learning_rate": 8.396721177467197e-05, "loss": 1.6734, "step": 27760 }, { "epoch": 1.0417294697444408, "grad_norm": 0.33591434359550476, "learning_rate": 8.39550895754665e-05, "loss": 1.6738, "step": 27770 }, { "epoch": 1.0422714109099531, "grad_norm": 0.2625053822994232, "learning_rate": 8.394296378943027e-05, "loss": 1.6687, "step": 27780 }, { "epoch": 1.0428133520754652, "grad_norm": 0.2579818665981293, "learning_rate": 8.39308344180653e-05, "loss": 1.6667, "step": 27790 }, { "epoch": 1.0433552932409775, "grad_norm": 0.25735151767730713, "learning_rate": 8.39187014628742e-05, "loss": 1.6705, "step": 27800 }, { "epoch": 1.0438972344064898, "grad_norm": 0.259867399930954, "learning_rate": 8.390656492535992e-05, "loss": 1.6795, "step": 27810 }, { "epoch": 1.044439175572002, "grad_norm": 0.26979291439056396, "learning_rate": 8.389442480702585e-05, "loss": 1.6748, "step": 27820 }, { "epoch": 1.0449811167375143, "grad_norm": 0.3980228006839752, "learning_rate": 8.38822811093759e-05, "loss": 1.6651, "step": 27830 }, { "epoch": 1.0455230579030264, "grad_norm": 0.39179670810699463, "learning_rate": 8.387013383391435e-05, "loss": 1.6701, "step": 27840 }, { "epoch": 1.0455230579030264, "eval_loss": 2.563586950302124, "eval_runtime": 21.9807, "eval_samples_per_second": 227.472, "eval_steps_per_second": 1.228, "step": 27840 }, { "epoch": 1.0460649990685387, "grad_norm": 0.5658113956451416, "learning_rate": 8.385798298214596e-05, "loss": 1.672, "step": 27850 }, { "epoch": 1.0466069402340508, "grad_norm": 0.5975947380065918, "learning_rate": 8.384582855557594e-05, "loss": 1.6522, "step": 27860 }, { "epoch": 1.047148881399563, "grad_norm": 0.28900066018104553, "learning_rate": 8.383367055570991e-05, "loss": 1.6596, "step": 27870 }, { "epoch": 1.0476908225650752, "grad_norm": 0.38573452830314636, "learning_rate": 8.382150898405396e-05, "loss": 1.6625, "step": 27880 }, { "epoch": 1.0482327637305875, "grad_norm": 0.2802741229534149, "learning_rate": 8.380934384211458e-05, "loss": 1.6651, "step": 27890 }, { "epoch": 1.0487747048960998, "grad_norm": 0.31623005867004395, "learning_rate": 8.37971751313988e-05, "loss": 1.6644, "step": 27900 }, { "epoch": 1.049316646061612, "grad_norm": 0.2634221911430359, "learning_rate": 8.378500285341397e-05, "loss": 1.6651, "step": 27910 }, { "epoch": 1.0498585872271242, "grad_norm": 0.2507796287536621, "learning_rate": 8.377282700966795e-05, "loss": 1.6699, "step": 27920 }, { "epoch": 1.0502379460429827, "eval_loss": 2.5567116737365723, "eval_runtime": 21.9902, "eval_samples_per_second": 227.374, "eval_steps_per_second": 1.228, "step": 27927 }, { "epoch": 1.0504005283926363, "grad_norm": 0.38170045614242554, "learning_rate": 8.376064760166907e-05, "loss": 1.674, "step": 27930 }, { "epoch": 1.0509424695581486, "grad_norm": 0.31428414583206177, "learning_rate": 8.3748464630926e-05, "loss": 1.6715, "step": 27940 }, { "epoch": 1.0514844107236607, "grad_norm": 0.30629944801330566, "learning_rate": 8.373627809894796e-05, "loss": 1.6685, "step": 27950 }, { "epoch": 1.052026351889173, "grad_norm": 0.4108351171016693, "learning_rate": 8.372408800724454e-05, "loss": 1.6599, "step": 27960 }, { "epoch": 1.0525682930546854, "grad_norm": 0.4083997309207916, "learning_rate": 8.371189435732581e-05, "loss": 1.6629, "step": 27970 }, { "epoch": 1.0531102342201974, "grad_norm": 0.5288533568382263, "learning_rate": 8.369969715070226e-05, "loss": 1.6598, "step": 27980 }, { "epoch": 1.0536521753857098, "grad_norm": 0.35696470737457275, "learning_rate": 8.368749638888484e-05, "loss": 1.6646, "step": 27990 }, { "epoch": 1.0541941165512219, "grad_norm": 0.3221840262413025, "learning_rate": 8.36752920733849e-05, "loss": 1.6577, "step": 28000 }, { "epoch": 1.0547360577167342, "grad_norm": 0.35849282145500183, "learning_rate": 8.366308420571429e-05, "loss": 1.6604, "step": 28010 }, { "epoch": 1.054952834182939, "eval_loss": 2.547173500061035, "eval_runtime": 21.9831, "eval_samples_per_second": 227.447, "eval_steps_per_second": 1.228, "step": 28014 }, { "epoch": 1.0552779988822463, "grad_norm": 0.2584594488143921, "learning_rate": 8.365087278738524e-05, "loss": 1.6827, "step": 28020 }, { "epoch": 1.0558199400477586, "grad_norm": 0.2509235441684723, "learning_rate": 8.363865781991046e-05, "loss": 1.6627, "step": 28030 }, { "epoch": 1.0563618812132707, "grad_norm": 0.36739498376846313, "learning_rate": 8.36264393048031e-05, "loss": 1.666, "step": 28040 }, { "epoch": 1.056903822378783, "grad_norm": 0.41700538992881775, "learning_rate": 8.36142172435767e-05, "loss": 1.6799, "step": 28050 }, { "epoch": 1.0574457635442953, "grad_norm": 0.31023573875427246, "learning_rate": 8.360199163774531e-05, "loss": 1.6701, "step": 28060 }, { "epoch": 1.0579877047098074, "grad_norm": 0.23943783342838287, "learning_rate": 8.358976248882337e-05, "loss": 1.656, "step": 28070 }, { "epoch": 1.0585296458753197, "grad_norm": 0.8542354702949524, "learning_rate": 8.357752979832578e-05, "loss": 1.6668, "step": 28080 }, { "epoch": 1.0590715870408318, "grad_norm": 0.4590936601161957, "learning_rate": 8.356529356776786e-05, "loss": 1.6735, "step": 28090 }, { "epoch": 1.0596135282063441, "grad_norm": 0.61350017786026, "learning_rate": 8.355305379866541e-05, "loss": 1.667, "step": 28100 }, { "epoch": 1.0596677223228954, "eval_loss": 2.5593972206115723, "eval_runtime": 21.9806, "eval_samples_per_second": 227.474, "eval_steps_per_second": 1.228, "step": 28101 }, { "epoch": 1.0601554693718562, "grad_norm": 0.7067118883132935, "learning_rate": 8.354081049253462e-05, "loss": 1.6765, "step": 28110 }, { "epoch": 1.0606974105373685, "grad_norm": 0.27008959650993347, "learning_rate": 8.35285636508921e-05, "loss": 1.6628, "step": 28120 }, { "epoch": 1.0612393517028809, "grad_norm": 0.2994672358036041, "learning_rate": 8.351631327525501e-05, "loss": 1.6551, "step": 28130 }, { "epoch": 1.061781292868393, "grad_norm": 0.3203209340572357, "learning_rate": 8.350405936714083e-05, "loss": 1.6561, "step": 28140 }, { "epoch": 1.0623232340339053, "grad_norm": 0.3424260914325714, "learning_rate": 8.349180192806753e-05, "loss": 1.67, "step": 28150 }, { "epoch": 1.0628651751994174, "grad_norm": 0.2762835621833801, "learning_rate": 8.34795409595535e-05, "loss": 1.6691, "step": 28160 }, { "epoch": 1.0634071163649297, "grad_norm": 0.3313772976398468, "learning_rate": 8.346727646311758e-05, "loss": 1.6552, "step": 28170 }, { "epoch": 1.0639490575304418, "grad_norm": 0.2399122714996338, "learning_rate": 8.345500844027905e-05, "loss": 1.6666, "step": 28180 }, { "epoch": 1.0643826104628515, "eval_loss": 2.549626350402832, "eval_runtime": 21.9895, "eval_samples_per_second": 227.381, "eval_steps_per_second": 1.228, "step": 28188 }, { "epoch": 1.064490998695954, "grad_norm": 0.4699346721172333, "learning_rate": 8.344273689255761e-05, "loss": 1.668, "step": 28190 }, { "epoch": 1.0650329398614662, "grad_norm": 0.25492924451828003, "learning_rate": 8.343046182147342e-05, "loss": 1.6632, "step": 28200 }, { "epoch": 1.0655748810269785, "grad_norm": 0.4417177438735962, "learning_rate": 8.341818322854705e-05, "loss": 1.6678, "step": 28210 }, { "epoch": 1.0661168221924908, "grad_norm": 0.28891265392303467, "learning_rate": 8.340590111529952e-05, "loss": 1.6606, "step": 28220 }, { "epoch": 1.066658763358003, "grad_norm": 0.29153016209602356, "learning_rate": 8.339361548325226e-05, "loss": 1.6658, "step": 28230 }, { "epoch": 1.0672007045235152, "grad_norm": 0.2998339533805847, "learning_rate": 8.338132633392723e-05, "loss": 1.6612, "step": 28240 }, { "epoch": 1.0677426456890273, "grad_norm": 0.7401821613311768, "learning_rate": 8.336903366884668e-05, "loss": 1.6711, "step": 28250 }, { "epoch": 1.0682845868545396, "grad_norm": 0.3623445928096771, "learning_rate": 8.335673748953342e-05, "loss": 1.6542, "step": 28260 }, { "epoch": 1.0688265280200517, "grad_norm": 0.48223739862442017, "learning_rate": 8.334443779751063e-05, "loss": 1.6577, "step": 28270 }, { "epoch": 1.069097498602808, "eval_loss": 2.5425007343292236, "eval_runtime": 21.9868, "eval_samples_per_second": 227.409, "eval_steps_per_second": 1.228, "step": 28275 }, { "epoch": 1.069368469185564, "grad_norm": 0.4789433479309082, "learning_rate": 8.333213459430195e-05, "loss": 1.6562, "step": 28280 }, { "epoch": 1.0699104103510764, "grad_norm": 0.5614416003227234, "learning_rate": 8.331982788143142e-05, "loss": 1.6689, "step": 28290 }, { "epoch": 1.0704523515165885, "grad_norm": 0.4019344449043274, "learning_rate": 8.330751766042358e-05, "loss": 1.6564, "step": 28300 }, { "epoch": 1.0709942926821008, "grad_norm": 0.3532206118106842, "learning_rate": 8.329520393280335e-05, "loss": 1.6645, "step": 28310 }, { "epoch": 1.0715362338476129, "grad_norm": 0.3050895631313324, "learning_rate": 8.32828867000961e-05, "loss": 1.6581, "step": 28320 }, { "epoch": 1.0720781750131252, "grad_norm": 0.4680069386959076, "learning_rate": 8.327056596382766e-05, "loss": 1.6648, "step": 28330 }, { "epoch": 1.0726201161786373, "grad_norm": 0.5191404819488525, "learning_rate": 8.325824172552422e-05, "loss": 1.6579, "step": 28340 }, { "epoch": 1.0731620573441496, "grad_norm": 0.30772343277931213, "learning_rate": 8.324591398671249e-05, "loss": 1.6623, "step": 28350 }, { "epoch": 1.0737039985096617, "grad_norm": 0.351131796836853, "learning_rate": 8.323358274891956e-05, "loss": 1.6632, "step": 28360 }, { "epoch": 1.0738123867427642, "eval_loss": 2.5413732528686523, "eval_runtime": 21.9839, "eval_samples_per_second": 227.439, "eval_steps_per_second": 1.228, "step": 28362 }, { "epoch": 1.074245939675174, "grad_norm": 0.42983412742614746, "learning_rate": 8.322124801367298e-05, "loss": 1.652, "step": 28370 }, { "epoch": 1.0747878808406863, "grad_norm": 0.2206849902868271, "learning_rate": 8.320890978250073e-05, "loss": 1.6608, "step": 28380 }, { "epoch": 1.0753298220061984, "grad_norm": 0.2790736258029938, "learning_rate": 8.31965680569312e-05, "loss": 1.6622, "step": 28390 }, { "epoch": 1.0758717631717107, "grad_norm": 0.44509637355804443, "learning_rate": 8.318422283849323e-05, "loss": 1.6611, "step": 28400 }, { "epoch": 1.0764137043372228, "grad_norm": 0.24993574619293213, "learning_rate": 8.317187412871611e-05, "loss": 1.6573, "step": 28410 }, { "epoch": 1.0769556455027351, "grad_norm": 0.325591504573822, "learning_rate": 8.315952192912952e-05, "loss": 1.6501, "step": 28420 }, { "epoch": 1.0774975866682472, "grad_norm": 0.2710714638233185, "learning_rate": 8.314716624126362e-05, "loss": 1.6609, "step": 28430 }, { "epoch": 1.0780395278337596, "grad_norm": 0.5176997780799866, "learning_rate": 8.313480706664898e-05, "loss": 1.6707, "step": 28440 }, { "epoch": 1.0785272748827206, "eval_loss": 2.5304479598999023, "eval_runtime": 21.9868, "eval_samples_per_second": 227.409, "eval_steps_per_second": 1.228, "step": 28449 }, { "epoch": 1.0785814689992717, "grad_norm": 0.5030739307403564, "learning_rate": 8.312244440681657e-05, "loss": 1.6555, "step": 28450 }, { "epoch": 1.079123410164784, "grad_norm": 0.44651323556900024, "learning_rate": 8.311007826329786e-05, "loss": 1.6696, "step": 28460 }, { "epoch": 1.0796653513302963, "grad_norm": 0.6648367047309875, "learning_rate": 8.309770863762468e-05, "loss": 1.6589, "step": 28470 }, { "epoch": 1.0802072924958084, "grad_norm": 0.24871893227100372, "learning_rate": 8.308533553132935e-05, "loss": 1.6527, "step": 28480 }, { "epoch": 1.0807492336613207, "grad_norm": 0.4465760588645935, "learning_rate": 8.307295894594456e-05, "loss": 1.6585, "step": 28490 }, { "epoch": 1.0812911748268328, "grad_norm": 0.313731849193573, "learning_rate": 8.306057888300352e-05, "loss": 1.6579, "step": 28500 }, { "epoch": 1.081833115992345, "grad_norm": 0.2809997797012329, "learning_rate": 8.304819534403981e-05, "loss": 1.6533, "step": 28510 }, { "epoch": 1.0823750571578572, "grad_norm": 0.24525237083435059, "learning_rate": 8.303580833058742e-05, "loss": 1.6607, "step": 28520 }, { "epoch": 1.0829169983233695, "grad_norm": 0.2652153968811035, "learning_rate": 8.302341784418081e-05, "loss": 1.6687, "step": 28530 }, { "epoch": 1.083242163022677, "eval_loss": 2.5320417881011963, "eval_runtime": 21.9915, "eval_samples_per_second": 227.361, "eval_steps_per_second": 1.228, "step": 28536 }, { "epoch": 1.0834589394888818, "grad_norm": 0.24303193390369415, "learning_rate": 8.301102388635487e-05, "loss": 1.6608, "step": 28540 }, { "epoch": 1.084000880654394, "grad_norm": 0.26834824681282043, "learning_rate": 8.299862645864491e-05, "loss": 1.6616, "step": 28550 }, { "epoch": 1.0845428218199062, "grad_norm": 0.2935827374458313, "learning_rate": 8.298622556258664e-05, "loss": 1.6588, "step": 28560 }, { "epoch": 1.0850847629854183, "grad_norm": 0.27826014161109924, "learning_rate": 8.297382119971627e-05, "loss": 1.6731, "step": 28570 }, { "epoch": 1.0856267041509307, "grad_norm": 0.35834673047065735, "learning_rate": 8.296141337157038e-05, "loss": 1.657, "step": 28580 }, { "epoch": 1.0861686453164427, "grad_norm": 0.2979893386363983, "learning_rate": 8.294900207968597e-05, "loss": 1.6546, "step": 28590 }, { "epoch": 1.086710586481955, "grad_norm": 0.2854683995246887, "learning_rate": 8.293658732560056e-05, "loss": 1.6656, "step": 28600 }, { "epoch": 1.0872525276474674, "grad_norm": 0.37426939606666565, "learning_rate": 8.292416911085198e-05, "loss": 1.6513, "step": 28610 }, { "epoch": 1.0877944688129795, "grad_norm": 0.34777402877807617, "learning_rate": 8.291174743697857e-05, "loss": 1.6653, "step": 28620 }, { "epoch": 1.087957051162633, "eval_loss": 2.53008770942688, "eval_runtime": 21.9855, "eval_samples_per_second": 227.422, "eval_steps_per_second": 1.228, "step": 28623 }, { "epoch": 1.0883364099784918, "grad_norm": 0.28725841641426086, "learning_rate": 8.289932230551907e-05, "loss": 1.651, "step": 28630 }, { "epoch": 1.0888783511440039, "grad_norm": 0.28935521841049194, "learning_rate": 8.288689371801265e-05, "loss": 1.6616, "step": 28640 }, { "epoch": 1.0894202923095162, "grad_norm": 0.30465996265411377, "learning_rate": 8.287446167599891e-05, "loss": 1.6554, "step": 28650 }, { "epoch": 1.0899622334750283, "grad_norm": 0.5029714703559875, "learning_rate": 8.286202618101788e-05, "loss": 1.668, "step": 28660 }, { "epoch": 1.0905041746405406, "grad_norm": 0.2679192125797272, "learning_rate": 8.284958723461001e-05, "loss": 1.661, "step": 28670 }, { "epoch": 1.0910461158060527, "grad_norm": 0.3121162950992584, "learning_rate": 8.283714483831617e-05, "loss": 1.6601, "step": 28680 }, { "epoch": 1.091588056971565, "grad_norm": 0.2876189947128296, "learning_rate": 8.28246989936777e-05, "loss": 1.65, "step": 28690 }, { "epoch": 1.0921299981370773, "grad_norm": 0.30062171816825867, "learning_rate": 8.281224970223632e-05, "loss": 1.6531, "step": 28700 }, { "epoch": 1.0926719393025894, "grad_norm": 0.44612184166908264, "learning_rate": 8.27997969655342e-05, "loss": 1.6509, "step": 28710 }, { "epoch": 1.0926719393025894, "eval_loss": 2.5466320514678955, "eval_runtime": 21.9837, "eval_samples_per_second": 227.441, "eval_steps_per_second": 1.228, "step": 28710 }, { "epoch": 1.0932138804681018, "grad_norm": 0.2449900060892105, "learning_rate": 8.27873407851139e-05, "loss": 1.6529, "step": 28720 }, { "epoch": 1.0937558216336138, "grad_norm": 0.4357602894306183, "learning_rate": 8.27748811625185e-05, "loss": 1.6595, "step": 28730 }, { "epoch": 1.0942977627991262, "grad_norm": 0.34103602170944214, "learning_rate": 8.276241809929137e-05, "loss": 1.6567, "step": 28740 }, { "epoch": 1.0948397039646383, "grad_norm": 0.3430241346359253, "learning_rate": 8.274995159697646e-05, "loss": 1.6582, "step": 28750 }, { "epoch": 1.0953816451301506, "grad_norm": 0.40874654054641724, "learning_rate": 8.273748165711799e-05, "loss": 1.6566, "step": 28760 }, { "epoch": 1.0959235862956627, "grad_norm": 0.26068803668022156, "learning_rate": 8.272500828126075e-05, "loss": 1.6484, "step": 28770 }, { "epoch": 1.096465527461175, "grad_norm": 0.3570903241634369, "learning_rate": 8.271253147094981e-05, "loss": 1.6562, "step": 28780 }, { "epoch": 1.0970074686266873, "grad_norm": 0.2552138864994049, "learning_rate": 8.270005122773083e-05, "loss": 1.657, "step": 28790 }, { "epoch": 1.0973868274425458, "eval_loss": 2.531879186630249, "eval_runtime": 21.9807, "eval_samples_per_second": 227.473, "eval_steps_per_second": 1.228, "step": 28797 }, { "epoch": 1.0975494097921994, "grad_norm": 0.374850869178772, "learning_rate": 8.268756755314973e-05, "loss": 1.6501, "step": 28800 }, { "epoch": 1.0980913509577117, "grad_norm": 0.2875063419342041, "learning_rate": 8.2675080448753e-05, "loss": 1.6515, "step": 28810 }, { "epoch": 1.0986332921232238, "grad_norm": 0.2932276725769043, "learning_rate": 8.266258991608743e-05, "loss": 1.6496, "step": 28820 }, { "epoch": 1.0991752332887361, "grad_norm": 0.28365346789360046, "learning_rate": 8.265009595670034e-05, "loss": 1.6466, "step": 28830 }, { "epoch": 1.0997171744542482, "grad_norm": 0.32967546582221985, "learning_rate": 8.263759857213939e-05, "loss": 1.6494, "step": 28840 }, { "epoch": 1.1002591156197605, "grad_norm": 0.5435857772827148, "learning_rate": 8.262509776395274e-05, "loss": 1.661, "step": 28850 }, { "epoch": 1.1008010567852728, "grad_norm": 0.26168474555015564, "learning_rate": 8.26125935336889e-05, "loss": 1.6467, "step": 28860 }, { "epoch": 1.101342997950785, "grad_norm": 0.5927068591117859, "learning_rate": 8.260008588289687e-05, "loss": 1.6555, "step": 28870 }, { "epoch": 1.1018849391162973, "grad_norm": 0.28580188751220703, "learning_rate": 8.258757481312601e-05, "loss": 1.6536, "step": 28880 }, { "epoch": 1.1021017155825021, "eval_loss": 2.5326905250549316, "eval_runtime": 21.9872, "eval_samples_per_second": 227.405, "eval_steps_per_second": 1.228, "step": 28884 }, { "epoch": 1.1024268802818094, "grad_norm": 0.23324058949947357, "learning_rate": 8.257506032592617e-05, "loss": 1.6618, "step": 28890 }, { "epoch": 1.1029688214473217, "grad_norm": 0.37625429034233093, "learning_rate": 8.256254242284755e-05, "loss": 1.6533, "step": 28900 }, { "epoch": 1.1035107626128338, "grad_norm": 0.2690766453742981, "learning_rate": 8.255002110544087e-05, "loss": 1.6568, "step": 28910 }, { "epoch": 1.104052703778346, "grad_norm": 0.3211461007595062, "learning_rate": 8.253749637525717e-05, "loss": 1.6532, "step": 28920 }, { "epoch": 1.1045946449438584, "grad_norm": 0.30158957839012146, "learning_rate": 8.252496823384798e-05, "loss": 1.6473, "step": 28930 }, { "epoch": 1.1051365861093705, "grad_norm": 0.26189902424812317, "learning_rate": 8.251243668276524e-05, "loss": 1.6586, "step": 28940 }, { "epoch": 1.1056785272748828, "grad_norm": 0.3414219915866852, "learning_rate": 8.249990172356128e-05, "loss": 1.6553, "step": 28950 }, { "epoch": 1.106220468440395, "grad_norm": 0.2272077202796936, "learning_rate": 8.248736335778888e-05, "loss": 1.662, "step": 28960 }, { "epoch": 1.1067624096059072, "grad_norm": 0.3499011695384979, "learning_rate": 8.247482158700126e-05, "loss": 1.6621, "step": 28970 }, { "epoch": 1.1068166037224585, "eval_loss": 2.527860403060913, "eval_runtime": 21.9887, "eval_samples_per_second": 227.389, "eval_steps_per_second": 1.228, "step": 28971 }, { "epoch": 1.1073043507714193, "grad_norm": 0.44124558568000793, "learning_rate": 8.246227641275199e-05, "loss": 1.6541, "step": 28980 }, { "epoch": 1.1078462919369316, "grad_norm": 0.807167649269104, "learning_rate": 8.244972783659518e-05, "loss": 1.6499, "step": 28990 }, { "epoch": 1.1083882331024437, "grad_norm": 0.5441895723342896, "learning_rate": 8.243717586008525e-05, "loss": 1.6535, "step": 29000 }, { "epoch": 1.108930174267956, "grad_norm": 0.4006600081920624, "learning_rate": 8.24246204847771e-05, "loss": 1.6554, "step": 29010 }, { "epoch": 1.1094721154334684, "grad_norm": 0.26644378900527954, "learning_rate": 8.241206171222602e-05, "loss": 1.6445, "step": 29020 }, { "epoch": 1.1100140565989804, "grad_norm": 0.3243505358695984, "learning_rate": 8.239949954398777e-05, "loss": 1.6578, "step": 29030 }, { "epoch": 1.1105559977644928, "grad_norm": 0.39492732286453247, "learning_rate": 8.238693398161844e-05, "loss": 1.6574, "step": 29040 }, { "epoch": 1.1110979389300049, "grad_norm": 0.36204707622528076, "learning_rate": 8.237436502667467e-05, "loss": 1.647, "step": 29050 }, { "epoch": 1.1115314918624146, "eval_loss": 2.537062644958496, "eval_runtime": 21.9899, "eval_samples_per_second": 227.377, "eval_steps_per_second": 1.228, "step": 29058 }, { "epoch": 1.1116398800955172, "grad_norm": 0.4409829080104828, "learning_rate": 8.236179268071337e-05, "loss": 1.6599, "step": 29060 }, { "epoch": 1.1121818212610293, "grad_norm": 0.4123667776584625, "learning_rate": 8.2349216945292e-05, "loss": 1.6573, "step": 29070 }, { "epoch": 1.1127237624265416, "grad_norm": 0.29760655760765076, "learning_rate": 8.233663782196837e-05, "loss": 1.6542, "step": 29080 }, { "epoch": 1.1132657035920537, "grad_norm": 0.34215226769447327, "learning_rate": 8.232405531230074e-05, "loss": 1.6521, "step": 29090 }, { "epoch": 1.113807644757566, "grad_norm": 0.6061283349990845, "learning_rate": 8.231146941784776e-05, "loss": 1.6578, "step": 29100 }, { "epoch": 1.1143495859230783, "grad_norm": 0.5324410200119019, "learning_rate": 8.229888014016853e-05, "loss": 1.6522, "step": 29110 }, { "epoch": 1.1148915270885904, "grad_norm": 0.3367980420589447, "learning_rate": 8.228628748082256e-05, "loss": 1.6521, "step": 29120 }, { "epoch": 1.1154334682541027, "grad_norm": 0.34855327010154724, "learning_rate": 8.227369144136974e-05, "loss": 1.6478, "step": 29130 }, { "epoch": 1.1159754094196148, "grad_norm": 0.5364739298820496, "learning_rate": 8.226109202337043e-05, "loss": 1.6525, "step": 29140 }, { "epoch": 1.116246380002371, "eval_loss": 2.545180320739746, "eval_runtime": 21.9866, "eval_samples_per_second": 227.411, "eval_steps_per_second": 1.228, "step": 29145 }, { "epoch": 1.1165173505851271, "grad_norm": 0.36383602023124695, "learning_rate": 8.224848922838541e-05, "loss": 1.66, "step": 29150 }, { "epoch": 1.1170592917506392, "grad_norm": 0.5313844680786133, "learning_rate": 8.223588305797585e-05, "loss": 1.6447, "step": 29160 }, { "epoch": 1.1176012329161515, "grad_norm": 0.2722112536430359, "learning_rate": 8.222327351370332e-05, "loss": 1.6562, "step": 29170 }, { "epoch": 1.1181431740816639, "grad_norm": 0.39539259672164917, "learning_rate": 8.221066059712988e-05, "loss": 1.6494, "step": 29180 }, { "epoch": 1.118685115247176, "grad_norm": 0.4637277126312256, "learning_rate": 8.219804430981794e-05, "loss": 1.6445, "step": 29190 }, { "epoch": 1.1192270564126883, "grad_norm": 0.3050798177719116, "learning_rate": 8.218542465333035e-05, "loss": 1.6477, "step": 29200 }, { "epoch": 1.1197689975782004, "grad_norm": 0.3634278476238251, "learning_rate": 8.217280162923036e-05, "loss": 1.662, "step": 29210 }, { "epoch": 1.1203109387437127, "grad_norm": 0.2769427001476288, "learning_rate": 8.216017523908172e-05, "loss": 1.655, "step": 29220 }, { "epoch": 1.1208528799092248, "grad_norm": 0.2443336397409439, "learning_rate": 8.214754548444846e-05, "loss": 1.6374, "step": 29230 }, { "epoch": 1.1209612681423273, "eval_loss": 2.541795015335083, "eval_runtime": 21.9864, "eval_samples_per_second": 227.413, "eval_steps_per_second": 1.228, "step": 29232 }, { "epoch": 1.121394821074737, "grad_norm": 0.2958986163139343, "learning_rate": 8.213491236689514e-05, "loss": 1.6575, "step": 29240 }, { "epoch": 1.1219367622402494, "grad_norm": 0.2536433935165405, "learning_rate": 8.212227588798667e-05, "loss": 1.6449, "step": 29250 }, { "epoch": 1.1224787034057615, "grad_norm": 0.2808546721935272, "learning_rate": 8.210963604928842e-05, "loss": 1.664, "step": 29260 }, { "epoch": 1.1230206445712738, "grad_norm": 0.24417255818843842, "learning_rate": 8.209699285236618e-05, "loss": 1.6619, "step": 29270 }, { "epoch": 1.123562585736786, "grad_norm": 0.22852517664432526, "learning_rate": 8.208434629878607e-05, "loss": 1.6587, "step": 29280 }, { "epoch": 1.1241045269022982, "grad_norm": 0.3301936388015747, "learning_rate": 8.207169639011474e-05, "loss": 1.6574, "step": 29290 }, { "epoch": 1.1246464680678103, "grad_norm": 0.3570113480091095, "learning_rate": 8.205904312791921e-05, "loss": 1.648, "step": 29300 }, { "epoch": 1.1251884092333226, "grad_norm": 0.4051576256752014, "learning_rate": 8.20463865137669e-05, "loss": 1.6463, "step": 29310 }, { "epoch": 1.1256761562822837, "eval_loss": 2.5262796878814697, "eval_runtime": 21.9856, "eval_samples_per_second": 227.421, "eval_steps_per_second": 1.228, "step": 29319 }, { "epoch": 1.1257303503988347, "grad_norm": 0.3884764611721039, "learning_rate": 8.203372654922563e-05, "loss": 1.6512, "step": 29320 }, { "epoch": 1.126272291564347, "grad_norm": 0.7175701260566711, "learning_rate": 8.20210632358637e-05, "loss": 1.6392, "step": 29330 }, { "epoch": 1.1268142327298594, "grad_norm": 0.44228026270866394, "learning_rate": 8.200839657524976e-05, "loss": 1.6573, "step": 29340 }, { "epoch": 1.1273561738953715, "grad_norm": 0.25877845287323, "learning_rate": 8.199572656895291e-05, "loss": 1.6494, "step": 29350 }, { "epoch": 1.1278981150608838, "grad_norm": 0.42708685994148254, "learning_rate": 8.198305321854267e-05, "loss": 1.6489, "step": 29360 }, { "epoch": 1.1284400562263959, "grad_norm": 0.6025623083114624, "learning_rate": 8.197037652558895e-05, "loss": 1.6507, "step": 29370 }, { "epoch": 1.1289819973919082, "grad_norm": 0.4976705312728882, "learning_rate": 8.195769649166205e-05, "loss": 1.6528, "step": 29380 }, { "epoch": 1.1295239385574203, "grad_norm": 0.27502158284187317, "learning_rate": 8.194501311833277e-05, "loss": 1.659, "step": 29390 }, { "epoch": 1.1300658797229326, "grad_norm": 0.26740071177482605, "learning_rate": 8.193232640717223e-05, "loss": 1.6435, "step": 29400 }, { "epoch": 1.13039104442224, "eval_loss": 2.532335042953491, "eval_runtime": 21.9808, "eval_samples_per_second": 227.471, "eval_steps_per_second": 1.228, "step": 29406 }, { "epoch": 1.1306078208884447, "grad_norm": 0.21525554358959198, "learning_rate": 8.191963635975204e-05, "loss": 1.651, "step": 29410 }, { "epoch": 1.131149762053957, "grad_norm": 0.3878322243690491, "learning_rate": 8.190694297764417e-05, "loss": 1.6369, "step": 29420 }, { "epoch": 1.1316917032194693, "grad_norm": 0.2476329803466797, "learning_rate": 8.189424626242102e-05, "loss": 1.6483, "step": 29430 }, { "epoch": 1.1322336443849814, "grad_norm": 0.40698474645614624, "learning_rate": 8.18815462156554e-05, "loss": 1.6495, "step": 29440 }, { "epoch": 1.1327755855504937, "grad_norm": 0.2684831917285919, "learning_rate": 8.186884283892056e-05, "loss": 1.6424, "step": 29450 }, { "epoch": 1.1333175267160058, "grad_norm": 0.46887609362602234, "learning_rate": 8.185613613379011e-05, "loss": 1.6439, "step": 29460 }, { "epoch": 1.1338594678815181, "grad_norm": 0.34955888986587524, "learning_rate": 8.184342610183812e-05, "loss": 1.6492, "step": 29470 }, { "epoch": 1.1344014090470302, "grad_norm": 0.3082946240901947, "learning_rate": 8.183071274463903e-05, "loss": 1.6533, "step": 29480 }, { "epoch": 1.1349433502125426, "grad_norm": 0.35093745589256287, "learning_rate": 8.181799606376773e-05, "loss": 1.6564, "step": 29490 }, { "epoch": 1.1351059325621962, "eval_loss": 2.538316249847412, "eval_runtime": 21.9822, "eval_samples_per_second": 227.456, "eval_steps_per_second": 1.228, "step": 29493 }, { "epoch": 1.1354852913780547, "grad_norm": 0.26975366473197937, "learning_rate": 8.180527606079953e-05, "loss": 1.6604, "step": 29500 }, { "epoch": 1.136027232543567, "grad_norm": 0.2648986577987671, "learning_rate": 8.17925527373101e-05, "loss": 1.6453, "step": 29510 }, { "epoch": 1.1365691737090793, "grad_norm": 0.2807252109050751, "learning_rate": 8.177982609487556e-05, "loss": 1.634, "step": 29520 }, { "epoch": 1.1371111148745914, "grad_norm": 0.36864936351776123, "learning_rate": 8.176709613507243e-05, "loss": 1.6542, "step": 29530 }, { "epoch": 1.1376530560401037, "grad_norm": 0.3327690362930298, "learning_rate": 8.175436285947764e-05, "loss": 1.6508, "step": 29540 }, { "epoch": 1.1381949972056158, "grad_norm": 0.44101572036743164, "learning_rate": 8.174162626966853e-05, "loss": 1.6418, "step": 29550 }, { "epoch": 1.138736938371128, "grad_norm": 0.24692606925964355, "learning_rate": 8.172888636722288e-05, "loss": 1.6477, "step": 29560 }, { "epoch": 1.1392788795366404, "grad_norm": 0.41825541853904724, "learning_rate": 8.171614315371881e-05, "loss": 1.6572, "step": 29570 }, { "epoch": 1.1398208207021525, "grad_norm": 0.3095550835132599, "learning_rate": 8.170339663073492e-05, "loss": 1.6457, "step": 29580 }, { "epoch": 1.1398208207021525, "eval_loss": 2.5319855213165283, "eval_runtime": 21.9838, "eval_samples_per_second": 227.44, "eval_steps_per_second": 1.228, "step": 29580 }, { "epoch": 1.1403627618676648, "grad_norm": 0.5016186833381653, "learning_rate": 8.16906467998502e-05, "loss": 1.6549, "step": 29590 }, { "epoch": 1.140904703033177, "grad_norm": 0.4376733899116516, "learning_rate": 8.1677893662644e-05, "loss": 1.6502, "step": 29600 }, { "epoch": 1.1414466441986892, "grad_norm": 0.285548597574234, "learning_rate": 8.16651372206962e-05, "loss": 1.6387, "step": 29610 }, { "epoch": 1.1419885853642013, "grad_norm": 0.22487492859363556, "learning_rate": 8.165237747558694e-05, "loss": 1.6547, "step": 29620 }, { "epoch": 1.1425305265297137, "grad_norm": 0.37438520789146423, "learning_rate": 8.16396144288969e-05, "loss": 1.6615, "step": 29630 }, { "epoch": 1.1430724676952257, "grad_norm": 0.2509768009185791, "learning_rate": 8.162684808220708e-05, "loss": 1.6561, "step": 29640 }, { "epoch": 1.143614408860738, "grad_norm": 0.32760903239250183, "learning_rate": 8.161407843709889e-05, "loss": 1.6438, "step": 29650 }, { "epoch": 1.1441563500262504, "grad_norm": 0.29007697105407715, "learning_rate": 8.160130549515423e-05, "loss": 1.6537, "step": 29660 }, { "epoch": 1.1445357088421089, "eval_loss": 2.5295541286468506, "eval_runtime": 21.9895, "eval_samples_per_second": 227.382, "eval_steps_per_second": 1.228, "step": 29667 }, { "epoch": 1.1446982911917625, "grad_norm": 0.3338105380535126, "learning_rate": 8.158852925795534e-05, "loss": 1.6412, "step": 29670 }, { "epoch": 1.1452402323572748, "grad_norm": 0.2998930811882019, "learning_rate": 8.157574972708488e-05, "loss": 1.6408, "step": 29680 }, { "epoch": 1.1457821735227869, "grad_norm": 0.2588035464286804, "learning_rate": 8.156296690412593e-05, "loss": 1.6518, "step": 29690 }, { "epoch": 1.1463241146882992, "grad_norm": 0.24682050943374634, "learning_rate": 8.155018079066193e-05, "loss": 1.6513, "step": 29700 }, { "epoch": 1.1468660558538113, "grad_norm": 0.34920454025268555, "learning_rate": 8.153739138827684e-05, "loss": 1.6486, "step": 29710 }, { "epoch": 1.1474079970193236, "grad_norm": 0.24906253814697266, "learning_rate": 8.15245986985549e-05, "loss": 1.6509, "step": 29720 }, { "epoch": 1.1479499381848357, "grad_norm": 0.3515869677066803, "learning_rate": 8.151180272308085e-05, "loss": 1.6521, "step": 29730 }, { "epoch": 1.148491879350348, "grad_norm": 0.2527877688407898, "learning_rate": 8.149900346343975e-05, "loss": 1.6504, "step": 29740 }, { "epoch": 1.1490338205158603, "grad_norm": 0.29912281036376953, "learning_rate": 8.148620092121718e-05, "loss": 1.6499, "step": 29750 }, { "epoch": 1.1492505969820652, "eval_loss": 2.5310330390930176, "eval_runtime": 21.9865, "eval_samples_per_second": 227.412, "eval_steps_per_second": 1.228, "step": 29754 }, { "epoch": 1.1495757616813724, "grad_norm": 0.34561511874198914, "learning_rate": 8.147339509799902e-05, "loss": 1.6445, "step": 29760 }, { "epoch": 1.1501177028468847, "grad_norm": 0.2445221096277237, "learning_rate": 8.146058599537162e-05, "loss": 1.6496, "step": 29770 }, { "epoch": 1.1506596440123968, "grad_norm": 0.2948252856731415, "learning_rate": 8.144777361492168e-05, "loss": 1.6419, "step": 29780 }, { "epoch": 1.1512015851779092, "grad_norm": 0.297454833984375, "learning_rate": 8.14349579582364e-05, "loss": 1.6407, "step": 29790 }, { "epoch": 1.1517435263434213, "grad_norm": 0.2834935188293457, "learning_rate": 8.142213902690329e-05, "loss": 1.6598, "step": 29800 }, { "epoch": 1.1522854675089336, "grad_norm": 0.4210936427116394, "learning_rate": 8.140931682251029e-05, "loss": 1.6401, "step": 29810 }, { "epoch": 1.1528274086744457, "grad_norm": 0.33858510851860046, "learning_rate": 8.13964913466458e-05, "loss": 1.646, "step": 29820 }, { "epoch": 1.153369349839958, "grad_norm": 0.3140000104904175, "learning_rate": 8.138366260089855e-05, "loss": 1.6428, "step": 29830 }, { "epoch": 1.1539112910054703, "grad_norm": 0.24602891504764557, "learning_rate": 8.137083058685774e-05, "loss": 1.6408, "step": 29840 }, { "epoch": 1.1539654851220214, "eval_loss": 2.540917158126831, "eval_runtime": 21.9857, "eval_samples_per_second": 227.421, "eval_steps_per_second": 1.228, "step": 29841 }, { "epoch": 1.1544532321709824, "grad_norm": 0.23379047214984894, "learning_rate": 8.135799530611292e-05, "loss": 1.6488, "step": 29850 }, { "epoch": 1.1549951733364947, "grad_norm": 0.3018862307071686, "learning_rate": 8.134515676025407e-05, "loss": 1.6542, "step": 29860 }, { "epoch": 1.1555371145020068, "grad_norm": 0.24227862060070038, "learning_rate": 8.133231495087159e-05, "loss": 1.6456, "step": 29870 }, { "epoch": 1.1560790556675191, "grad_norm": 0.39517369866371155, "learning_rate": 8.131946987955627e-05, "loss": 1.6424, "step": 29880 }, { "epoch": 1.1566209968330314, "grad_norm": 0.3524185121059418, "learning_rate": 8.130662154789926e-05, "loss": 1.6485, "step": 29890 }, { "epoch": 1.1571629379985435, "grad_norm": 0.43569809198379517, "learning_rate": 8.12937699574922e-05, "loss": 1.6421, "step": 29900 }, { "epoch": 1.1577048791640558, "grad_norm": 0.3304058909416199, "learning_rate": 8.128091510992705e-05, "loss": 1.652, "step": 29910 }, { "epoch": 1.158246820329568, "grad_norm": 0.4421805143356323, "learning_rate": 8.126805700679628e-05, "loss": 1.6417, "step": 29920 }, { "epoch": 1.1586803732619777, "eval_loss": 2.530564785003662, "eval_runtime": 21.9874, "eval_samples_per_second": 227.403, "eval_steps_per_second": 1.228, "step": 29928 }, { "epoch": 1.1587887614950803, "grad_norm": 0.3330616056919098, "learning_rate": 8.125519564969263e-05, "loss": 1.6476, "step": 29930 }, { "epoch": 1.1593307026605923, "grad_norm": 0.36129215359687805, "learning_rate": 8.124233104020932e-05, "loss": 1.6414, "step": 29940 }, { "epoch": 1.1598726438261047, "grad_norm": 0.24318821728229523, "learning_rate": 8.122946317993999e-05, "loss": 1.6465, "step": 29950 }, { "epoch": 1.1604145849916168, "grad_norm": 0.2985212206840515, "learning_rate": 8.121659207047864e-05, "loss": 1.6386, "step": 29960 }, { "epoch": 1.160956526157129, "grad_norm": 0.33473873138427734, "learning_rate": 8.120371771341968e-05, "loss": 1.6402, "step": 29970 }, { "epoch": 1.1614984673226414, "grad_norm": 0.2525531053543091, "learning_rate": 8.119084011035794e-05, "loss": 1.6486, "step": 29980 }, { "epoch": 1.1620404084881535, "grad_norm": 0.36281898617744446, "learning_rate": 8.117795926288862e-05, "loss": 1.6468, "step": 29990 }, { "epoch": 1.1625823496536658, "grad_norm": 0.355827659368515, "learning_rate": 8.116507517260737e-05, "loss": 1.6445, "step": 30000 }, { "epoch": 1.163124290819178, "grad_norm": 0.38084909319877625, "learning_rate": 8.11521878411102e-05, "loss": 1.6413, "step": 30010 }, { "epoch": 1.163395261401934, "eval_loss": 2.527758836746216, "eval_runtime": 21.9849, "eval_samples_per_second": 227.428, "eval_steps_per_second": 1.228, "step": 30015 }, { "epoch": 1.1636662319846902, "grad_norm": 0.2793338894844055, "learning_rate": 8.113929726999354e-05, "loss": 1.6487, "step": 30020 }, { "epoch": 1.1642081731502023, "grad_norm": 0.31127527356147766, "learning_rate": 8.112640346085424e-05, "loss": 1.6481, "step": 30030 }, { "epoch": 1.1647501143157146, "grad_norm": 0.26151424646377563, "learning_rate": 8.11135064152895e-05, "loss": 1.6467, "step": 30040 }, { "epoch": 1.1652920554812267, "grad_norm": 0.3043818771839142, "learning_rate": 8.110060613489693e-05, "loss": 1.6401, "step": 30050 }, { "epoch": 1.165833996646739, "grad_norm": 0.26495102047920227, "learning_rate": 8.108770262127463e-05, "loss": 1.6412, "step": 30060 }, { "epoch": 1.1663759378122514, "grad_norm": 0.31619176268577576, "learning_rate": 8.107479587602097e-05, "loss": 1.6383, "step": 30070 }, { "epoch": 1.1669178789777634, "grad_norm": 0.22748631238937378, "learning_rate": 8.106188590073481e-05, "loss": 1.637, "step": 30080 }, { "epoch": 1.1674598201432758, "grad_norm": 0.4346810579299927, "learning_rate": 8.104897269701538e-05, "loss": 1.647, "step": 30090 }, { "epoch": 1.1680017613087879, "grad_norm": 0.2576606571674347, "learning_rate": 8.103605626646229e-05, "loss": 1.6389, "step": 30100 }, { "epoch": 1.1681101495418904, "eval_loss": 2.51259708404541, "eval_runtime": 21.9879, "eval_samples_per_second": 227.398, "eval_steps_per_second": 1.228, "step": 30102 }, { "epoch": 1.1685437024743002, "grad_norm": 0.31433582305908203, "learning_rate": 8.10231366106756e-05, "loss": 1.6312, "step": 30110 }, { "epoch": 1.1690856436398123, "grad_norm": 0.5255588889122009, "learning_rate": 8.101021373125573e-05, "loss": 1.634, "step": 30120 }, { "epoch": 1.1696275848053246, "grad_norm": 0.23931747674942017, "learning_rate": 8.099728762980349e-05, "loss": 1.6314, "step": 30130 }, { "epoch": 1.1701695259708367, "grad_norm": 0.2556639015674591, "learning_rate": 8.098435830792013e-05, "loss": 1.6428, "step": 30140 }, { "epoch": 1.170711467136349, "grad_norm": 0.4357036352157593, "learning_rate": 8.097142576720728e-05, "loss": 1.6342, "step": 30150 }, { "epoch": 1.1712534083018613, "grad_norm": 0.37833094596862793, "learning_rate": 8.095849000926696e-05, "loss": 1.6384, "step": 30160 }, { "epoch": 1.1717953494673734, "grad_norm": 0.2795352041721344, "learning_rate": 8.094555103570161e-05, "loss": 1.6411, "step": 30170 }, { "epoch": 1.1723372906328857, "grad_norm": 0.3068492114543915, "learning_rate": 8.093260884811403e-05, "loss": 1.6403, "step": 30180 }, { "epoch": 1.1728250376818468, "eval_loss": 2.527621269226074, "eval_runtime": 21.9849, "eval_samples_per_second": 227.429, "eval_steps_per_second": 1.228, "step": 30189 }, { "epoch": 1.1728792317983978, "grad_norm": 0.6460037231445312, "learning_rate": 8.091966344810746e-05, "loss": 1.6299, "step": 30190 }, { "epoch": 1.1734211729639101, "grad_norm": 0.5416850447654724, "learning_rate": 8.090671483728553e-05, "loss": 1.6407, "step": 30200 }, { "epoch": 1.1739631141294224, "grad_norm": 0.23775923252105713, "learning_rate": 8.08937630172522e-05, "loss": 1.6432, "step": 30210 }, { "epoch": 1.1745050552949345, "grad_norm": 0.26255470514297485, "learning_rate": 8.088080798961196e-05, "loss": 1.6431, "step": 30220 }, { "epoch": 1.1750469964604469, "grad_norm": 0.2868313491344452, "learning_rate": 8.086784975596959e-05, "loss": 1.638, "step": 30230 }, { "epoch": 1.175588937625959, "grad_norm": 0.3102096617221832, "learning_rate": 8.085488831793029e-05, "loss": 1.6405, "step": 30240 }, { "epoch": 1.1761308787914713, "grad_norm": 0.3971962630748749, "learning_rate": 8.084192367709967e-05, "loss": 1.6356, "step": 30250 }, { "epoch": 1.1766728199569834, "grad_norm": 0.39245739579200745, "learning_rate": 8.082895583508374e-05, "loss": 1.6448, "step": 30260 }, { "epoch": 1.1772147611224957, "grad_norm": 0.31180277466773987, "learning_rate": 8.081598479348892e-05, "loss": 1.6339, "step": 30270 }, { "epoch": 1.177539925821803, "eval_loss": 2.5259082317352295, "eval_runtime": 21.9866, "eval_samples_per_second": 227.411, "eval_steps_per_second": 1.228, "step": 30276 }, { "epoch": 1.1777567022880078, "grad_norm": 0.3723970353603363, "learning_rate": 8.080301055392196e-05, "loss": 1.6444, "step": 30280 }, { "epoch": 1.17829864345352, "grad_norm": 0.24248439073562622, "learning_rate": 8.079003311799008e-05, "loss": 1.6399, "step": 30290 }, { "epoch": 1.1788405846190324, "grad_norm": 0.545952320098877, "learning_rate": 8.077705248730089e-05, "loss": 1.6446, "step": 30300 }, { "epoch": 1.1793825257845445, "grad_norm": 0.27861401438713074, "learning_rate": 8.076406866346233e-05, "loss": 1.6314, "step": 30310 }, { "epoch": 1.1799244669500568, "grad_norm": 0.3368987441062927, "learning_rate": 8.075108164808281e-05, "loss": 1.6445, "step": 30320 }, { "epoch": 1.180466408115569, "grad_norm": 0.5238324999809265, "learning_rate": 8.073809144277109e-05, "loss": 1.6504, "step": 30330 }, { "epoch": 1.1810083492810812, "grad_norm": 0.4157116413116455, "learning_rate": 8.072509804913634e-05, "loss": 1.6359, "step": 30340 }, { "epoch": 1.1815502904465933, "grad_norm": 0.4345547556877136, "learning_rate": 8.071210146878813e-05, "loss": 1.6455, "step": 30350 }, { "epoch": 1.1820922316121056, "grad_norm": 0.3230676054954529, "learning_rate": 8.069910170333643e-05, "loss": 1.6376, "step": 30360 }, { "epoch": 1.1822548139617592, "eval_loss": 2.5333669185638428, "eval_runtime": 21.9869, "eval_samples_per_second": 227.408, "eval_steps_per_second": 1.228, "step": 30363 }, { "epoch": 1.1826341727776177, "grad_norm": 0.3938463032245636, "learning_rate": 8.068609875439159e-05, "loss": 1.6364, "step": 30370 }, { "epoch": 1.18317611394313, "grad_norm": 0.29343992471694946, "learning_rate": 8.067309262356435e-05, "loss": 1.6359, "step": 30380 }, { "epoch": 1.1837180551086424, "grad_norm": 0.25108593702316284, "learning_rate": 8.066008331246586e-05, "loss": 1.6415, "step": 30390 }, { "epoch": 1.1842599962741545, "grad_norm": 0.6940186619758606, "learning_rate": 8.064707082270765e-05, "loss": 1.6353, "step": 30400 }, { "epoch": 1.1848019374396668, "grad_norm": 0.33529943227767944, "learning_rate": 8.063405515590166e-05, "loss": 1.6275, "step": 30410 }, { "epoch": 1.1853438786051789, "grad_norm": 0.3404565453529358, "learning_rate": 8.06210363136602e-05, "loss": 1.638, "step": 30420 }, { "epoch": 1.1858858197706912, "grad_norm": 0.4092163145542145, "learning_rate": 8.0608014297596e-05, "loss": 1.6319, "step": 30430 }, { "epoch": 1.1864277609362033, "grad_norm": 0.5055906772613525, "learning_rate": 8.059498910932216e-05, "loss": 1.6342, "step": 30440 }, { "epoch": 1.1869697021017156, "grad_norm": 0.38912099599838257, "learning_rate": 8.05819607504522e-05, "loss": 1.6364, "step": 30450 }, { "epoch": 1.1869697021017156, "eval_loss": 2.5219056606292725, "eval_runtime": 21.9805, "eval_samples_per_second": 227.474, "eval_steps_per_second": 1.228, "step": 30450 }, { "epoch": 1.1875116432672277, "grad_norm": 0.47896572947502136, "learning_rate": 8.056892922260001e-05, "loss": 1.6426, "step": 30460 }, { "epoch": 1.18805358443274, "grad_norm": 0.2856700122356415, "learning_rate": 8.055589452737988e-05, "loss": 1.6501, "step": 30470 }, { "epoch": 1.1885955255982523, "grad_norm": 0.2293747514486313, "learning_rate": 8.054285666640649e-05, "loss": 1.6401, "step": 30480 }, { "epoch": 1.1891374667637644, "grad_norm": 0.46321722865104675, "learning_rate": 8.052981564129489e-05, "loss": 1.6478, "step": 30490 }, { "epoch": 1.1896794079292767, "grad_norm": 0.39135417342185974, "learning_rate": 8.051677145366058e-05, "loss": 1.6481, "step": 30500 }, { "epoch": 1.1902213490947888, "grad_norm": 0.4103468954563141, "learning_rate": 8.050372410511941e-05, "loss": 1.6359, "step": 30510 }, { "epoch": 1.1907632902603011, "grad_norm": 0.2855057418346405, "learning_rate": 8.049067359728763e-05, "loss": 1.6489, "step": 30520 }, { "epoch": 1.1913052314258132, "grad_norm": 0.3194611966609955, "learning_rate": 8.047761993178186e-05, "loss": 1.6374, "step": 30530 }, { "epoch": 1.191684590241672, "eval_loss": 2.5290892124176025, "eval_runtime": 21.9827, "eval_samples_per_second": 227.452, "eval_steps_per_second": 1.228, "step": 30537 }, { "epoch": 1.1918471725913256, "grad_norm": 0.47105491161346436, "learning_rate": 8.046456311021916e-05, "loss": 1.6399, "step": 30540 }, { "epoch": 1.1923891137568376, "grad_norm": 0.3898428976535797, "learning_rate": 8.045150313421693e-05, "loss": 1.6325, "step": 30550 }, { "epoch": 1.19293105492235, "grad_norm": 0.23930542171001434, "learning_rate": 8.0438440005393e-05, "loss": 1.6297, "step": 30560 }, { "epoch": 1.1934729960878623, "grad_norm": 0.31366342306137085, "learning_rate": 8.042537372536556e-05, "loss": 1.634, "step": 30570 }, { "epoch": 1.1940149372533744, "grad_norm": 0.22141672670841217, "learning_rate": 8.041230429575319e-05, "loss": 1.6477, "step": 30580 }, { "epoch": 1.1945568784188867, "grad_norm": 0.5529452562332153, "learning_rate": 8.039923171817492e-05, "loss": 1.6397, "step": 30590 }, { "epoch": 1.1950988195843988, "grad_norm": 0.4657527208328247, "learning_rate": 8.038615599425008e-05, "loss": 1.6408, "step": 30600 }, { "epoch": 1.195640760749911, "grad_norm": 0.4929318130016327, "learning_rate": 8.037307712559847e-05, "loss": 1.6377, "step": 30610 }, { "epoch": 1.1961827019154234, "grad_norm": 0.38976770639419556, "learning_rate": 8.03599951138402e-05, "loss": 1.6393, "step": 30620 }, { "epoch": 1.196399478381628, "eval_loss": 2.534869432449341, "eval_runtime": 21.9803, "eval_samples_per_second": 227.476, "eval_steps_per_second": 1.228, "step": 30624 }, { "epoch": 1.1967246430809355, "grad_norm": 0.2525932788848877, "learning_rate": 8.034690996059584e-05, "loss": 1.6411, "step": 30630 }, { "epoch": 1.1972665842464478, "grad_norm": 0.2530060112476349, "learning_rate": 8.033382166748633e-05, "loss": 1.6505, "step": 30640 }, { "epoch": 1.19780852541196, "grad_norm": 0.46789705753326416, "learning_rate": 8.032073023613299e-05, "loss": 1.6235, "step": 30650 }, { "epoch": 1.1983504665774722, "grad_norm": 0.547118604183197, "learning_rate": 8.03076356681575e-05, "loss": 1.6362, "step": 30660 }, { "epoch": 1.1988924077429843, "grad_norm": 0.33731144666671753, "learning_rate": 8.029453796518198e-05, "loss": 1.6387, "step": 30670 }, { "epoch": 1.1994343489084967, "grad_norm": 0.22524616122245789, "learning_rate": 8.028143712882893e-05, "loss": 1.6338, "step": 30680 }, { "epoch": 1.1999762900740087, "grad_norm": 0.46151962876319885, "learning_rate": 8.02683331607212e-05, "loss": 1.6475, "step": 30690 }, { "epoch": 1.200518231239521, "grad_norm": 0.313071072101593, "learning_rate": 8.025522606248206e-05, "loss": 1.6416, "step": 30700 }, { "epoch": 1.2010601724050334, "grad_norm": 0.480162113904953, "learning_rate": 8.024211583573516e-05, "loss": 1.6333, "step": 30710 }, { "epoch": 1.2011143665215844, "eval_loss": 2.5296213626861572, "eval_runtime": 21.9849, "eval_samples_per_second": 227.429, "eval_steps_per_second": 1.228, "step": 30711 }, { "epoch": 1.2016021135705455, "grad_norm": 0.3885667622089386, "learning_rate": 8.022900248210455e-05, "loss": 1.6343, "step": 30720 }, { "epoch": 1.2021440547360578, "grad_norm": 0.34139159321784973, "learning_rate": 8.021588600321465e-05, "loss": 1.6332, "step": 30730 }, { "epoch": 1.2026859959015699, "grad_norm": 0.2892656922340393, "learning_rate": 8.020276640069025e-05, "loss": 1.6362, "step": 30740 }, { "epoch": 1.2032279370670822, "grad_norm": 0.4062507748603821, "learning_rate": 8.018964367615659e-05, "loss": 1.6388, "step": 30750 }, { "epoch": 1.2037698782325943, "grad_norm": 0.3714730739593506, "learning_rate": 8.017651783123922e-05, "loss": 1.63, "step": 30760 }, { "epoch": 1.2043118193981066, "grad_norm": 0.5153228044509888, "learning_rate": 8.016338886756412e-05, "loss": 1.6387, "step": 30770 }, { "epoch": 1.2048537605636187, "grad_norm": 0.27551186084747314, "learning_rate": 8.015025678675767e-05, "loss": 1.634, "step": 30780 }, { "epoch": 1.205395701729131, "grad_norm": 0.8653702139854431, "learning_rate": 8.013712159044658e-05, "loss": 1.6362, "step": 30790 }, { "epoch": 1.2058292546615408, "eval_loss": 2.5252997875213623, "eval_runtime": 21.986, "eval_samples_per_second": 227.418, "eval_steps_per_second": 1.228, "step": 30798 }, { "epoch": 1.2059376428946433, "grad_norm": 0.8155677914619446, "learning_rate": 8.012398328025804e-05, "loss": 1.6397, "step": 30800 }, { "epoch": 1.2064795840601554, "grad_norm": 0.49000170826911926, "learning_rate": 8.01108418578195e-05, "loss": 1.6407, "step": 30810 }, { "epoch": 1.2070215252256677, "grad_norm": 0.37893688678741455, "learning_rate": 8.009769732475889e-05, "loss": 1.6368, "step": 30820 }, { "epoch": 1.2075634663911798, "grad_norm": 0.3986130356788635, "learning_rate": 8.008454968270452e-05, "loss": 1.634, "step": 30830 }, { "epoch": 1.2081054075566922, "grad_norm": 0.33178648352622986, "learning_rate": 8.0071398933285e-05, "loss": 1.6315, "step": 30840 }, { "epoch": 1.2086473487222043, "grad_norm": 0.3417208790779114, "learning_rate": 8.005824507812947e-05, "loss": 1.6246, "step": 30850 }, { "epoch": 1.2091892898877166, "grad_norm": 0.4329122006893158, "learning_rate": 8.004508811886732e-05, "loss": 1.6451, "step": 30860 }, { "epoch": 1.2097312310532287, "grad_norm": 0.4540995657444, "learning_rate": 8.003192805712839e-05, "loss": 1.6188, "step": 30870 }, { "epoch": 1.210273172218741, "grad_norm": 0.41036659479141235, "learning_rate": 8.001876489454289e-05, "loss": 1.6392, "step": 30880 }, { "epoch": 1.2105441428014971, "eval_loss": 2.5213656425476074, "eval_runtime": 21.9838, "eval_samples_per_second": 227.441, "eval_steps_per_second": 1.228, "step": 30885 }, { "epoch": 1.2108151133842533, "grad_norm": 0.2853230834007263, "learning_rate": 8.00055986327414e-05, "loss": 1.6335, "step": 30890 }, { "epoch": 1.2113570545497654, "grad_norm": 0.23493582010269165, "learning_rate": 7.999242927335493e-05, "loss": 1.6369, "step": 30900 }, { "epoch": 1.2118989957152777, "grad_norm": 0.4317961037158966, "learning_rate": 7.997925681801484e-05, "loss": 1.6346, "step": 30910 }, { "epoch": 1.2124409368807898, "grad_norm": 0.29279986023902893, "learning_rate": 7.996608126835285e-05, "loss": 1.6326, "step": 30920 }, { "epoch": 1.2129828780463021, "grad_norm": 0.3215675950050354, "learning_rate": 7.995290262600109e-05, "loss": 1.6445, "step": 30930 }, { "epoch": 1.2135248192118144, "grad_norm": 0.6567270755767822, "learning_rate": 7.99397208925921e-05, "loss": 1.6359, "step": 30940 }, { "epoch": 1.2140667603773265, "grad_norm": 0.5213804841041565, "learning_rate": 7.992653606975877e-05, "loss": 1.6262, "step": 30950 }, { "epoch": 1.2146087015428388, "grad_norm": 0.3358005881309509, "learning_rate": 7.991334815913437e-05, "loss": 1.6371, "step": 30960 }, { "epoch": 1.215150642708351, "grad_norm": 0.26169344782829285, "learning_rate": 7.990015716235255e-05, "loss": 1.6329, "step": 30970 }, { "epoch": 1.2152590309414535, "eval_loss": 2.521756172180176, "eval_runtime": 21.9857, "eval_samples_per_second": 227.421, "eval_steps_per_second": 1.228, "step": 30972 }, { "epoch": 1.2156925838738633, "grad_norm": 0.3503156304359436, "learning_rate": 7.988696308104738e-05, "loss": 1.6396, "step": 30980 }, { "epoch": 1.2162345250393753, "grad_norm": 0.4474554657936096, "learning_rate": 7.987376591685325e-05, "loss": 1.6303, "step": 30990 }, { "epoch": 1.2167764662048877, "grad_norm": 0.25015437602996826, "learning_rate": 7.986056567140502e-05, "loss": 1.6229, "step": 31000 }, { "epoch": 1.2173184073703998, "grad_norm": 0.40038561820983887, "learning_rate": 7.984736234633784e-05, "loss": 1.6299, "step": 31010 }, { "epoch": 1.217860348535912, "grad_norm": 0.3585070073604584, "learning_rate": 7.983415594328729e-05, "loss": 1.644, "step": 31020 }, { "epoch": 1.2184022897014244, "grad_norm": 0.2341560274362564, "learning_rate": 7.98209464638893e-05, "loss": 1.6389, "step": 31030 }, { "epoch": 1.2189442308669365, "grad_norm": 0.32264870405197144, "learning_rate": 7.980773390978024e-05, "loss": 1.635, "step": 31040 }, { "epoch": 1.2194861720324488, "grad_norm": 0.46334415674209595, "learning_rate": 7.979451828259681e-05, "loss": 1.6315, "step": 31050 }, { "epoch": 1.2199739190814096, "eval_loss": 2.524176597595215, "eval_runtime": 21.9898, "eval_samples_per_second": 227.379, "eval_steps_per_second": 1.228, "step": 31059 }, { "epoch": 1.220028113197961, "grad_norm": 0.2632630467414856, "learning_rate": 7.978129958397612e-05, "loss": 1.6317, "step": 31060 }, { "epoch": 1.2205700543634732, "grad_norm": 0.5674657225608826, "learning_rate": 7.97680778155556e-05, "loss": 1.6294, "step": 31070 }, { "epoch": 1.2211119955289853, "grad_norm": 0.4556896686553955, "learning_rate": 7.975485297897312e-05, "loss": 1.633, "step": 31080 }, { "epoch": 1.2216539366944976, "grad_norm": 0.362857848405838, "learning_rate": 7.974162507586696e-05, "loss": 1.632, "step": 31090 }, { "epoch": 1.2221958778600097, "grad_norm": 0.3741185963153839, "learning_rate": 7.972839410787568e-05, "loss": 1.6396, "step": 31100 }, { "epoch": 1.222737819025522, "grad_norm": 0.5633114576339722, "learning_rate": 7.971516007663831e-05, "loss": 1.6354, "step": 31110 }, { "epoch": 1.2232797601910343, "grad_norm": 0.3228178322315216, "learning_rate": 7.970192298379421e-05, "loss": 1.6383, "step": 31120 }, { "epoch": 1.2238217013565464, "grad_norm": 0.39119747281074524, "learning_rate": 7.968868283098314e-05, "loss": 1.6271, "step": 31130 }, { "epoch": 1.2243636425220588, "grad_norm": 0.28056901693344116, "learning_rate": 7.967543961984522e-05, "loss": 1.6356, "step": 31140 }, { "epoch": 1.224688807221366, "eval_loss": 2.5236127376556396, "eval_runtime": 21.9859, "eval_samples_per_second": 227.418, "eval_steps_per_second": 1.228, "step": 31146 }, { "epoch": 1.2249055836875709, "grad_norm": 0.4328171908855438, "learning_rate": 7.966219335202097e-05, "loss": 1.6377, "step": 31150 }, { "epoch": 1.2254475248530832, "grad_norm": 0.3190264105796814, "learning_rate": 7.964894402915127e-05, "loss": 1.6327, "step": 31160 }, { "epoch": 1.2259894660185953, "grad_norm": 0.38925376534461975, "learning_rate": 7.963569165287743e-05, "loss": 1.6092, "step": 31170 }, { "epoch": 1.2265314071841076, "grad_norm": 0.2989320755004883, "learning_rate": 7.962243622484104e-05, "loss": 1.6305, "step": 31180 }, { "epoch": 1.2270733483496197, "grad_norm": 0.21814024448394775, "learning_rate": 7.960917774668415e-05, "loss": 1.6351, "step": 31190 }, { "epoch": 1.227615289515132, "grad_norm": 0.23537899553775787, "learning_rate": 7.95959162200492e-05, "loss": 1.6261, "step": 31200 }, { "epoch": 1.2281572306806443, "grad_norm": 0.3734976649284363, "learning_rate": 7.958265164657889e-05, "loss": 1.6297, "step": 31210 }, { "epoch": 1.2286991718461564, "grad_norm": 0.3379276990890503, "learning_rate": 7.956938402791644e-05, "loss": 1.622, "step": 31220 }, { "epoch": 1.2292411130116687, "grad_norm": 0.3800952434539795, "learning_rate": 7.955611336570537e-05, "loss": 1.6307, "step": 31230 }, { "epoch": 1.2294036953613223, "eval_loss": 2.5284194946289062, "eval_runtime": 21.9883, "eval_samples_per_second": 227.393, "eval_steps_per_second": 1.228, "step": 31233 }, { "epoch": 1.2297830541771808, "grad_norm": 0.36053764820098877, "learning_rate": 7.954283966158957e-05, "loss": 1.6258, "step": 31240 }, { "epoch": 1.2303249953426931, "grad_norm": 0.24756401777267456, "learning_rate": 7.952956291721335e-05, "loss": 1.6383, "step": 31250 }, { "epoch": 1.2308669365082054, "grad_norm": 0.24404959380626678, "learning_rate": 7.951628313422139e-05, "loss": 1.6229, "step": 31260 }, { "epoch": 1.2314088776737175, "grad_norm": 0.2366773635149002, "learning_rate": 7.95030003142587e-05, "loss": 1.6288, "step": 31270 }, { "epoch": 1.2319508188392299, "grad_norm": 0.6960218548774719, "learning_rate": 7.948971445897072e-05, "loss": 1.6308, "step": 31280 }, { "epoch": 1.232492760004742, "grad_norm": 0.24987642467021942, "learning_rate": 7.947642557000324e-05, "loss": 1.6346, "step": 31290 }, { "epoch": 1.2330347011702543, "grad_norm": 0.3360191583633423, "learning_rate": 7.946313364900242e-05, "loss": 1.6323, "step": 31300 }, { "epoch": 1.2335766423357664, "grad_norm": 0.3646528124809265, "learning_rate": 7.944983869761481e-05, "loss": 1.6235, "step": 31310 }, { "epoch": 1.2341185835012787, "grad_norm": 0.29639899730682373, "learning_rate": 7.943654071748734e-05, "loss": 1.6291, "step": 31320 }, { "epoch": 1.2341185835012787, "eval_loss": 2.529906749725342, "eval_runtime": 21.9492, "eval_samples_per_second": 227.799, "eval_steps_per_second": 1.23, "step": 31320 }, { "epoch": 1.2346605246667908, "grad_norm": 0.2218153476715088, "learning_rate": 7.942323971026729e-05, "loss": 1.6226, "step": 31330 }, { "epoch": 1.235202465832303, "grad_norm": 0.25773563981056213, "learning_rate": 7.940993567760235e-05, "loss": 1.6332, "step": 31340 }, { "epoch": 1.2357444069978154, "grad_norm": 0.26830193400382996, "learning_rate": 7.939662862114053e-05, "loss": 1.6398, "step": 31350 }, { "epoch": 1.2362863481633275, "grad_norm": 0.3047340214252472, "learning_rate": 7.938331854253031e-05, "loss": 1.6377, "step": 31360 }, { "epoch": 1.2368282893288398, "grad_norm": 0.39592602849006653, "learning_rate": 7.937000544342042e-05, "loss": 1.6235, "step": 31370 }, { "epoch": 1.237370230494352, "grad_norm": 0.39517736434936523, "learning_rate": 7.935668932546009e-05, "loss": 1.639, "step": 31380 }, { "epoch": 1.2379121716598642, "grad_norm": 0.3887443542480469, "learning_rate": 7.934337019029881e-05, "loss": 1.6261, "step": 31390 }, { "epoch": 1.2384541128253763, "grad_norm": 0.5671885013580322, "learning_rate": 7.933004803958654e-05, "loss": 1.6313, "step": 31400 }, { "epoch": 1.238833471641235, "eval_loss": 2.5245509147644043, "eval_runtime": 21.9815, "eval_samples_per_second": 227.464, "eval_steps_per_second": 1.228, "step": 31407 }, { "epoch": 1.2389960539908886, "grad_norm": 0.3325177729129791, "learning_rate": 7.931672287497353e-05, "loss": 1.6346, "step": 31410 }, { "epoch": 1.2395379951564007, "grad_norm": 0.36562344431877136, "learning_rate": 7.930339469811045e-05, "loss": 1.6241, "step": 31420 }, { "epoch": 1.240079936321913, "grad_norm": 0.2752797603607178, "learning_rate": 7.929006351064838e-05, "loss": 1.6228, "step": 31430 }, { "epoch": 1.2406218774874254, "grad_norm": 0.27838945388793945, "learning_rate": 7.927672931423869e-05, "loss": 1.6249, "step": 31440 }, { "epoch": 1.2411638186529375, "grad_norm": 0.40533244609832764, "learning_rate": 7.926339211053316e-05, "loss": 1.6243, "step": 31450 }, { "epoch": 1.2417057598184498, "grad_norm": 0.47712355852127075, "learning_rate": 7.925005190118397e-05, "loss": 1.6253, "step": 31460 }, { "epoch": 1.2422477009839619, "grad_norm": 0.3724726140499115, "learning_rate": 7.923670868784364e-05, "loss": 1.6356, "step": 31470 }, { "epoch": 1.2427896421494742, "grad_norm": 0.26425930857658386, "learning_rate": 7.922336247216505e-05, "loss": 1.6291, "step": 31480 }, { "epoch": 1.2433315833149863, "grad_norm": 0.2903880476951599, "learning_rate": 7.92100132558015e-05, "loss": 1.6299, "step": 31490 }, { "epoch": 1.2435483597811912, "eval_loss": 2.520765781402588, "eval_runtime": 21.9829, "eval_samples_per_second": 227.449, "eval_steps_per_second": 1.228, "step": 31494 }, { "epoch": 1.2438735244804986, "grad_norm": 0.3055003881454468, "learning_rate": 7.91966610404066e-05, "loss": 1.6376, "step": 31500 }, { "epoch": 1.2444154656460107, "grad_norm": 0.37320762872695923, "learning_rate": 7.918330582763438e-05, "loss": 1.6298, "step": 31510 }, { "epoch": 1.244957406811523, "grad_norm": 0.2776867747306824, "learning_rate": 7.916994761913923e-05, "loss": 1.6266, "step": 31520 }, { "epoch": 1.2454993479770353, "grad_norm": 0.2319355010986328, "learning_rate": 7.91565864165759e-05, "loss": 1.6252, "step": 31530 }, { "epoch": 1.2460412891425474, "grad_norm": 0.5147307515144348, "learning_rate": 7.914322222159953e-05, "loss": 1.6261, "step": 31540 }, { "epoch": 1.2465832303080597, "grad_norm": 0.31287848949432373, "learning_rate": 7.912985503586562e-05, "loss": 1.6304, "step": 31550 }, { "epoch": 1.2471251714735718, "grad_norm": 0.2941262423992157, "learning_rate": 7.911648486103002e-05, "loss": 1.6325, "step": 31560 }, { "epoch": 1.2476671126390841, "grad_norm": 0.2733207941055298, "learning_rate": 7.910311169874898e-05, "loss": 1.6273, "step": 31570 }, { "epoch": 1.2482090538045965, "grad_norm": 0.40679219365119934, "learning_rate": 7.908973555067911e-05, "loss": 1.6264, "step": 31580 }, { "epoch": 1.2482632479211475, "eval_loss": 2.5255589485168457, "eval_runtime": 21.9824, "eval_samples_per_second": 227.454, "eval_steps_per_second": 1.228, "step": 31581 }, { "epoch": 1.2487509949701086, "grad_norm": 0.27531698346138, "learning_rate": 7.907635641847739e-05, "loss": 1.6209, "step": 31590 }, { "epoch": 1.2492929361356209, "grad_norm": 0.26964595913887024, "learning_rate": 7.906297430380114e-05, "loss": 1.6253, "step": 31600 }, { "epoch": 1.249834877301133, "grad_norm": 0.25738173723220825, "learning_rate": 7.904958920830813e-05, "loss": 1.6205, "step": 31610 }, { "epoch": 1.2503768184666453, "grad_norm": 0.2674032151699066, "learning_rate": 7.903620113365644e-05, "loss": 1.6191, "step": 31620 }, { "epoch": 1.2509187596321574, "grad_norm": 0.42425525188446045, "learning_rate": 7.902281008150447e-05, "loss": 1.6345, "step": 31630 }, { "epoch": 1.2514607007976697, "grad_norm": 0.4274544417858124, "learning_rate": 7.90094160535111e-05, "loss": 1.6372, "step": 31640 }, { "epoch": 1.2520026419631818, "grad_norm": 0.36170271039009094, "learning_rate": 7.89960190513355e-05, "loss": 1.6288, "step": 31650 }, { "epoch": 1.252544583128694, "grad_norm": 0.35101965069770813, "learning_rate": 7.898261907663725e-05, "loss": 1.6218, "step": 31660 }, { "epoch": 1.2529781360611039, "eval_loss": 2.5185720920562744, "eval_runtime": 21.9807, "eval_samples_per_second": 227.472, "eval_steps_per_second": 1.228, "step": 31668 }, { "epoch": 1.2530865242942064, "grad_norm": 0.2534119188785553, "learning_rate": 7.896921613107626e-05, "loss": 1.6279, "step": 31670 }, { "epoch": 1.2536284654597185, "grad_norm": 0.4205189049243927, "learning_rate": 7.895581021631286e-05, "loss": 1.623, "step": 31680 }, { "epoch": 1.2541704066252308, "grad_norm": 0.29453709721565247, "learning_rate": 7.894240133400767e-05, "loss": 1.6259, "step": 31690 }, { "epoch": 1.254712347790743, "grad_norm": 0.3749118149280548, "learning_rate": 7.892898948582177e-05, "loss": 1.6186, "step": 31700 }, { "epoch": 1.2552542889562552, "grad_norm": 0.3719863295555115, "learning_rate": 7.891557467341653e-05, "loss": 1.6233, "step": 31710 }, { "epoch": 1.2557962301217673, "grad_norm": 0.2719629406929016, "learning_rate": 7.890215689845374e-05, "loss": 1.642, "step": 31720 }, { "epoch": 1.2563381712872796, "grad_norm": 0.29956310987472534, "learning_rate": 7.888873616259552e-05, "loss": 1.6394, "step": 31730 }, { "epoch": 1.2568801124527917, "grad_norm": 0.6171798706054688, "learning_rate": 7.887531246750438e-05, "loss": 1.6224, "step": 31740 }, { "epoch": 1.257422053618304, "grad_norm": 0.42021244764328003, "learning_rate": 7.886188581484318e-05, "loss": 1.6289, "step": 31750 }, { "epoch": 1.2576930242010602, "eval_loss": 2.518956422805786, "eval_runtime": 21.9853, "eval_samples_per_second": 227.425, "eval_steps_per_second": 1.228, "step": 31755 }, { "epoch": 1.2579639947838164, "grad_norm": 0.3666936159133911, "learning_rate": 7.884845620627518e-05, "loss": 1.6277, "step": 31760 }, { "epoch": 1.2585059359493285, "grad_norm": 0.3201642334461212, "learning_rate": 7.883502364346396e-05, "loss": 1.6311, "step": 31770 }, { "epoch": 1.2590478771148408, "grad_norm": 0.2944391369819641, "learning_rate": 7.882158812807349e-05, "loss": 1.6259, "step": 31780 }, { "epoch": 1.2595898182803529, "grad_norm": 0.2937580943107605, "learning_rate": 7.88081496617681e-05, "loss": 1.626, "step": 31790 }, { "epoch": 1.2601317594458652, "grad_norm": 0.3028720021247864, "learning_rate": 7.87947082462125e-05, "loss": 1.6325, "step": 31800 }, { "epoch": 1.2606737006113775, "grad_norm": 0.2833790183067322, "learning_rate": 7.878126388307173e-05, "loss": 1.6247, "step": 31810 }, { "epoch": 1.2612156417768896, "grad_norm": 0.264164000749588, "learning_rate": 7.876781657401125e-05, "loss": 1.627, "step": 31820 }, { "epoch": 1.2617575829424017, "grad_norm": 0.3081631064414978, "learning_rate": 7.875436632069687e-05, "loss": 1.6225, "step": 31830 }, { "epoch": 1.262299524107914, "grad_norm": 0.4150921404361725, "learning_rate": 7.874091312479468e-05, "loss": 1.6336, "step": 31840 }, { "epoch": 1.2624079123410166, "eval_loss": 2.5215582847595215, "eval_runtime": 21.9862, "eval_samples_per_second": 227.416, "eval_steps_per_second": 1.228, "step": 31842 }, { "epoch": 1.2628414652734263, "grad_norm": 0.689495861530304, "learning_rate": 7.872745698797128e-05, "loss": 1.6334, "step": 31850 }, { "epoch": 1.2633834064389384, "grad_norm": 0.501124918460846, "learning_rate": 7.87139979118935e-05, "loss": 1.6275, "step": 31860 }, { "epoch": 1.2639253476044507, "grad_norm": 0.2936415672302246, "learning_rate": 7.870053589822863e-05, "loss": 1.6368, "step": 31870 }, { "epoch": 1.2644672887699628, "grad_norm": 0.3334285616874695, "learning_rate": 7.868707094864427e-05, "loss": 1.6255, "step": 31880 }, { "epoch": 1.2650092299354752, "grad_norm": 0.2995965778827667, "learning_rate": 7.867360306480839e-05, "loss": 1.6214, "step": 31890 }, { "epoch": 1.2655511711009875, "grad_norm": 0.3712153732776642, "learning_rate": 7.866013224838933e-05, "loss": 1.634, "step": 31900 }, { "epoch": 1.2660931122664996, "grad_norm": 0.30742502212524414, "learning_rate": 7.864665850105583e-05, "loss": 1.6193, "step": 31910 }, { "epoch": 1.2666350534320117, "grad_norm": 0.27019500732421875, "learning_rate": 7.863318182447693e-05, "loss": 1.627, "step": 31920 }, { "epoch": 1.2671228004809727, "eval_loss": 2.5232045650482178, "eval_runtime": 21.9899, "eval_samples_per_second": 227.377, "eval_steps_per_second": 1.228, "step": 31929 }, { "epoch": 1.267176994597524, "grad_norm": 0.3056955337524414, "learning_rate": 7.861970222032207e-05, "loss": 1.6083, "step": 31930 }, { "epoch": 1.2677189357630363, "grad_norm": 0.28661999106407166, "learning_rate": 7.860621969026106e-05, "loss": 1.6225, "step": 31940 }, { "epoch": 1.2682608769285484, "grad_norm": 0.26536825299263, "learning_rate": 7.859273423596403e-05, "loss": 1.6166, "step": 31950 }, { "epoch": 1.2688028180940607, "grad_norm": 0.5294485092163086, "learning_rate": 7.85792458591015e-05, "loss": 1.6225, "step": 31960 }, { "epoch": 1.2693447592595728, "grad_norm": 0.3292939364910126, "learning_rate": 7.85657545613444e-05, "loss": 1.6334, "step": 31970 }, { "epoch": 1.2698867004250851, "grad_norm": 0.23607225716114044, "learning_rate": 7.85522603443639e-05, "loss": 1.6311, "step": 31980 }, { "epoch": 1.2704286415905974, "grad_norm": 0.28158169984817505, "learning_rate": 7.853876320983165e-05, "loss": 1.622, "step": 31990 }, { "epoch": 1.2709705827561095, "grad_norm": 0.2377581000328064, "learning_rate": 7.852526315941961e-05, "loss": 1.623, "step": 32000 }, { "epoch": 1.2715125239216218, "grad_norm": 0.514994740486145, "learning_rate": 7.851176019480012e-05, "loss": 1.6112, "step": 32010 }, { "epoch": 1.271837688620929, "eval_loss": 2.515641689300537, "eval_runtime": 21.988, "eval_samples_per_second": 227.397, "eval_steps_per_second": 1.228, "step": 32016 }, { "epoch": 1.272054465087134, "grad_norm": 0.3797670900821686, "learning_rate": 7.849825431764585e-05, "loss": 1.6232, "step": 32020 }, { "epoch": 1.2725964062526463, "grad_norm": 0.44352027773857117, "learning_rate": 7.848474552962984e-05, "loss": 1.6295, "step": 32030 }, { "epoch": 1.2731383474181583, "grad_norm": 0.2417132705450058, "learning_rate": 7.847123383242552e-05, "loss": 1.6249, "step": 32040 }, { "epoch": 1.2736802885836707, "grad_norm": 0.4180681109428406, "learning_rate": 7.845771922770667e-05, "loss": 1.6065, "step": 32050 }, { "epoch": 1.2742222297491828, "grad_norm": 0.29426735639572144, "learning_rate": 7.84442017171474e-05, "loss": 1.6243, "step": 32060 }, { "epoch": 1.274764170914695, "grad_norm": 0.37015318870544434, "learning_rate": 7.84306813024222e-05, "loss": 1.6223, "step": 32070 }, { "epoch": 1.2753061120802074, "grad_norm": 0.2594332993030548, "learning_rate": 7.841715798520592e-05, "loss": 1.6248, "step": 32080 }, { "epoch": 1.2758480532457195, "grad_norm": 0.23385967314243317, "learning_rate": 7.840363176717377e-05, "loss": 1.6098, "step": 32090 }, { "epoch": 1.2763899944112318, "grad_norm": 0.38536781072616577, "learning_rate": 7.839010265000136e-05, "loss": 1.6224, "step": 32100 }, { "epoch": 1.2765525767608854, "eval_loss": 2.518597364425659, "eval_runtime": 21.9837, "eval_samples_per_second": 227.441, "eval_steps_per_second": 1.228, "step": 32103 }, { "epoch": 1.276931935576744, "grad_norm": 0.2687205374240875, "learning_rate": 7.837657063536456e-05, "loss": 1.6277, "step": 32110 }, { "epoch": 1.2774738767422562, "grad_norm": 0.29153600335121155, "learning_rate": 7.83630357249397e-05, "loss": 1.6221, "step": 32120 }, { "epoch": 1.2780158179077685, "grad_norm": 0.42591148614883423, "learning_rate": 7.834949792040337e-05, "loss": 1.6185, "step": 32130 }, { "epoch": 1.2785577590732806, "grad_norm": 0.5398927927017212, "learning_rate": 7.833595722343263e-05, "loss": 1.6177, "step": 32140 }, { "epoch": 1.2790997002387927, "grad_norm": 0.3706096410751343, "learning_rate": 7.832241363570482e-05, "loss": 1.6266, "step": 32150 }, { "epoch": 1.279641641404305, "grad_norm": 0.34234559535980225, "learning_rate": 7.830886715889766e-05, "loss": 1.6242, "step": 32160 }, { "epoch": 1.2801835825698173, "grad_norm": 0.288562148809433, "learning_rate": 7.829531779468925e-05, "loss": 1.6247, "step": 32170 }, { "epoch": 1.2807255237353294, "grad_norm": 0.5007647275924683, "learning_rate": 7.8281765544758e-05, "loss": 1.6259, "step": 32180 }, { "epoch": 1.2812674649008418, "grad_norm": 0.2426513284444809, "learning_rate": 7.826821041078271e-05, "loss": 1.6284, "step": 32190 }, { "epoch": 1.2812674649008418, "eval_loss": 2.516087293624878, "eval_runtime": 21.9703, "eval_samples_per_second": 227.58, "eval_steps_per_second": 1.229, "step": 32190 }, { "epoch": 1.2818094060663539, "grad_norm": 0.2416008710861206, "learning_rate": 7.825465239444255e-05, "loss": 1.6283, "step": 32200 }, { "epoch": 1.2823513472318662, "grad_norm": 0.3502400517463684, "learning_rate": 7.824109149741701e-05, "loss": 1.6253, "step": 32210 }, { "epoch": 1.2828932883973785, "grad_norm": 0.2979862689971924, "learning_rate": 7.822752772138594e-05, "loss": 1.6324, "step": 32220 }, { "epoch": 1.2834352295628906, "grad_norm": 0.355049192905426, "learning_rate": 7.821396106802958e-05, "loss": 1.6229, "step": 32230 }, { "epoch": 1.2839771707284027, "grad_norm": 0.29975321888923645, "learning_rate": 7.820039153902852e-05, "loss": 1.6195, "step": 32240 }, { "epoch": 1.284519111893915, "grad_norm": 0.3482322692871094, "learning_rate": 7.81868191360637e-05, "loss": 1.6228, "step": 32250 }, { "epoch": 1.2850610530594273, "grad_norm": 0.20890609920024872, "learning_rate": 7.817324386081637e-05, "loss": 1.6212, "step": 32260 }, { "epoch": 1.2856029942249394, "grad_norm": 0.35245707631111145, "learning_rate": 7.81596657149682e-05, "loss": 1.613, "step": 32270 }, { "epoch": 1.285982353040798, "eval_loss": 2.5217950344085693, "eval_runtime": 21.9904, "eval_samples_per_second": 227.372, "eval_steps_per_second": 1.228, "step": 32277 }, { "epoch": 1.2861449353904517, "grad_norm": 0.3934645354747772, "learning_rate": 7.814608470020118e-05, "loss": 1.6187, "step": 32280 }, { "epoch": 1.2866868765559638, "grad_norm": 0.2991849482059479, "learning_rate": 7.81325008181977e-05, "loss": 1.6208, "step": 32290 }, { "epoch": 1.2872288177214761, "grad_norm": 0.31971707940101624, "learning_rate": 7.811891407064044e-05, "loss": 1.6314, "step": 32300 }, { "epoch": 1.2877707588869884, "grad_norm": 0.32109859585762024, "learning_rate": 7.810532445921248e-05, "loss": 1.6099, "step": 32310 }, { "epoch": 1.2883127000525005, "grad_norm": 0.4225994348526001, "learning_rate": 7.809173198559724e-05, "loss": 1.6177, "step": 32320 }, { "epoch": 1.2888546412180126, "grad_norm": 0.2910853326320648, "learning_rate": 7.807813665147847e-05, "loss": 1.6291, "step": 32330 }, { "epoch": 1.289396582383525, "grad_norm": 0.3097500205039978, "learning_rate": 7.806453845854036e-05, "loss": 1.6157, "step": 32340 }, { "epoch": 1.2899385235490373, "grad_norm": 0.2302774041891098, "learning_rate": 7.805093740846736e-05, "loss": 1.635, "step": 32350 }, { "epoch": 1.2904804647145494, "grad_norm": 0.49598991870880127, "learning_rate": 7.80373335029443e-05, "loss": 1.6259, "step": 32360 }, { "epoch": 1.2906972411807542, "eval_loss": 2.5177066326141357, "eval_runtime": 21.9888, "eval_samples_per_second": 227.389, "eval_steps_per_second": 1.228, "step": 32364 }, { "epoch": 1.2910224058800617, "grad_norm": 0.32611334323883057, "learning_rate": 7.802372674365639e-05, "loss": 1.6188, "step": 32370 }, { "epoch": 1.2915643470455738, "grad_norm": 0.31275832653045654, "learning_rate": 7.801011713228915e-05, "loss": 1.6243, "step": 32380 }, { "epoch": 1.292106288211086, "grad_norm": 0.31753793358802795, "learning_rate": 7.799650467052853e-05, "loss": 1.6231, "step": 32390 }, { "epoch": 1.2926482293765984, "grad_norm": 0.25154951214790344, "learning_rate": 7.798288936006073e-05, "loss": 1.6303, "step": 32400 }, { "epoch": 1.2931901705421105, "grad_norm": 0.264728844165802, "learning_rate": 7.796927120257237e-05, "loss": 1.6155, "step": 32410 }, { "epoch": 1.2937321117076228, "grad_norm": 0.27522847056388855, "learning_rate": 7.795565019975045e-05, "loss": 1.6155, "step": 32420 }, { "epoch": 1.294274052873135, "grad_norm": 0.3371419906616211, "learning_rate": 7.794202635328222e-05, "loss": 1.614, "step": 32430 }, { "epoch": 1.2948159940386472, "grad_norm": 0.25964683294296265, "learning_rate": 7.792839966485537e-05, "loss": 1.6086, "step": 32440 }, { "epoch": 1.2953579352041593, "grad_norm": 0.22096186876296997, "learning_rate": 7.791477013615794e-05, "loss": 1.608, "step": 32450 }, { "epoch": 1.2954121293207106, "eval_loss": 2.525634288787842, "eval_runtime": 21.9845, "eval_samples_per_second": 227.433, "eval_steps_per_second": 1.228, "step": 32451 }, { "epoch": 1.2958998763696716, "grad_norm": 0.4005647301673889, "learning_rate": 7.790113776887825e-05, "loss": 1.6187, "step": 32460 }, { "epoch": 1.2964418175351837, "grad_norm": 0.24895216524600983, "learning_rate": 7.788750256470506e-05, "loss": 1.6125, "step": 32470 }, { "epoch": 1.296983758700696, "grad_norm": 0.2178104668855667, "learning_rate": 7.787386452532739e-05, "loss": 1.6205, "step": 32480 }, { "epoch": 1.2975256998662084, "grad_norm": 0.30151644349098206, "learning_rate": 7.78602236524347e-05, "loss": 1.6093, "step": 32490 }, { "epoch": 1.2980676410317205, "grad_norm": 0.3658519983291626, "learning_rate": 7.784657994771676e-05, "loss": 1.6158, "step": 32500 }, { "epoch": 1.2986095821972328, "grad_norm": 0.40358707308769226, "learning_rate": 7.783293341286368e-05, "loss": 1.6271, "step": 32510 }, { "epoch": 1.2991515233627449, "grad_norm": 0.4737723469734192, "learning_rate": 7.781928404956594e-05, "loss": 1.6102, "step": 32520 }, { "epoch": 1.2996934645282572, "grad_norm": 0.3582918345928192, "learning_rate": 7.780563185951437e-05, "loss": 1.6199, "step": 32530 }, { "epoch": 1.300127017460667, "eval_loss": 2.526118040084839, "eval_runtime": 21.9907, "eval_samples_per_second": 227.369, "eval_steps_per_second": 1.228, "step": 32538 }, { "epoch": 1.3002354056937695, "grad_norm": 0.282720148563385, "learning_rate": 7.779197684440014e-05, "loss": 1.6162, "step": 32540 }, { "epoch": 1.3007773468592816, "grad_norm": 0.3246510624885559, "learning_rate": 7.777831900591477e-05, "loss": 1.6151, "step": 32550 }, { "epoch": 1.3013192880247937, "grad_norm": 0.311619371175766, "learning_rate": 7.776465834575013e-05, "loss": 1.6199, "step": 32560 }, { "epoch": 1.301861229190306, "grad_norm": 0.4281388223171234, "learning_rate": 7.775099486559845e-05, "loss": 1.6132, "step": 32570 }, { "epoch": 1.3024031703558183, "grad_norm": 0.39166897535324097, "learning_rate": 7.773732856715229e-05, "loss": 1.6158, "step": 32580 }, { "epoch": 1.3029451115213304, "grad_norm": 0.26505813002586365, "learning_rate": 7.772365945210459e-05, "loss": 1.6237, "step": 32590 }, { "epoch": 1.3034870526868427, "grad_norm": 0.4248243570327759, "learning_rate": 7.770998752214863e-05, "loss": 1.6218, "step": 32600 }, { "epoch": 1.3040289938523548, "grad_norm": 0.21477878093719482, "learning_rate": 7.769631277897801e-05, "loss": 1.6209, "step": 32610 }, { "epoch": 1.3045709350178671, "grad_norm": 0.29866376519203186, "learning_rate": 7.768263522428667e-05, "loss": 1.6122, "step": 32620 }, { "epoch": 1.3048419056006233, "eval_loss": 2.5290870666503906, "eval_runtime": 21.9868, "eval_samples_per_second": 227.409, "eval_steps_per_second": 1.228, "step": 32625 }, { "epoch": 1.3051128761833795, "grad_norm": 0.2718803882598877, "learning_rate": 7.766895485976899e-05, "loss": 1.6174, "step": 32630 }, { "epoch": 1.3056548173488915, "grad_norm": 0.22970491647720337, "learning_rate": 7.765527168711958e-05, "loss": 1.6211, "step": 32640 }, { "epoch": 1.3061967585144036, "grad_norm": 0.36656203866004944, "learning_rate": 7.764158570803348e-05, "loss": 1.6204, "step": 32650 }, { "epoch": 1.306738699679916, "grad_norm": 0.31001171469688416, "learning_rate": 7.762789692420604e-05, "loss": 1.6245, "step": 32660 }, { "epoch": 1.3072806408454283, "grad_norm": 0.3693181872367859, "learning_rate": 7.761420533733297e-05, "loss": 1.616, "step": 32670 }, { "epoch": 1.3078225820109404, "grad_norm": 0.3442032039165497, "learning_rate": 7.760051094911032e-05, "loss": 1.6168, "step": 32680 }, { "epoch": 1.3083645231764527, "grad_norm": 0.26000893115997314, "learning_rate": 7.75868137612345e-05, "loss": 1.6162, "step": 32690 }, { "epoch": 1.3089064643419648, "grad_norm": 0.27503514289855957, "learning_rate": 7.757311377540226e-05, "loss": 1.6104, "step": 32700 }, { "epoch": 1.309448405507477, "grad_norm": 0.395526647567749, "learning_rate": 7.75594109933107e-05, "loss": 1.6291, "step": 32710 }, { "epoch": 1.3095567937405796, "eval_loss": 2.5185840129852295, "eval_runtime": 21.986, "eval_samples_per_second": 227.418, "eval_steps_per_second": 1.228, "step": 32712 }, { "epoch": 1.3099903466729894, "grad_norm": 0.31528306007385254, "learning_rate": 7.75457054166572e-05, "loss": 1.6118, "step": 32720 }, { "epoch": 1.3105322878385015, "grad_norm": 0.3324233889579773, "learning_rate": 7.753199704713963e-05, "loss": 1.6176, "step": 32730 }, { "epoch": 1.3110742290040138, "grad_norm": 0.2883225679397583, "learning_rate": 7.75182858864561e-05, "loss": 1.6269, "step": 32740 }, { "epoch": 1.311616170169526, "grad_norm": 0.4456532597541809, "learning_rate": 7.750457193630507e-05, "loss": 1.6139, "step": 32750 }, { "epoch": 1.3121581113350382, "grad_norm": 0.26752784848213196, "learning_rate": 7.749085519838537e-05, "loss": 1.6183, "step": 32760 }, { "epoch": 1.3127000525005503, "grad_norm": 0.46080008149147034, "learning_rate": 7.747713567439617e-05, "loss": 1.6097, "step": 32770 }, { "epoch": 1.3132419936660626, "grad_norm": 0.2891246974468231, "learning_rate": 7.746341336603698e-05, "loss": 1.6118, "step": 32780 }, { "epoch": 1.3137839348315747, "grad_norm": 0.4009568989276886, "learning_rate": 7.744968827500769e-05, "loss": 1.63, "step": 32790 }, { "epoch": 1.3142716818805358, "eval_loss": 2.504746913909912, "eval_runtime": 21.9893, "eval_samples_per_second": 227.384, "eval_steps_per_second": 1.228, "step": 32799 }, { "epoch": 1.314325875997087, "grad_norm": 0.31230422854423523, "learning_rate": 7.743596040300848e-05, "loss": 1.6139, "step": 32800 }, { "epoch": 1.3148678171625994, "grad_norm": 0.4773191809654236, "learning_rate": 7.742222975173991e-05, "loss": 1.619, "step": 32810 }, { "epoch": 1.3154097583281115, "grad_norm": 0.36002716422080994, "learning_rate": 7.740849632290284e-05, "loss": 1.6178, "step": 32820 }, { "epoch": 1.3159516994936238, "grad_norm": 0.22423742711544037, "learning_rate": 7.739476011819854e-05, "loss": 1.6269, "step": 32830 }, { "epoch": 1.3164936406591359, "grad_norm": 0.43875738978385925, "learning_rate": 7.73810211393286e-05, "loss": 1.6173, "step": 32840 }, { "epoch": 1.3170355818246482, "grad_norm": 0.24483536183834076, "learning_rate": 7.736727938799492e-05, "loss": 1.6307, "step": 32850 }, { "epoch": 1.3175775229901605, "grad_norm": 0.24819006025791168, "learning_rate": 7.73535348658998e-05, "loss": 1.61, "step": 32860 }, { "epoch": 1.3181194641556726, "grad_norm": 0.2164914309978485, "learning_rate": 7.733978757474579e-05, "loss": 1.6216, "step": 32870 }, { "epoch": 1.3186614053211847, "grad_norm": 0.2493513524532318, "learning_rate": 7.732603751623591e-05, "loss": 1.614, "step": 32880 }, { "epoch": 1.3189865700204921, "eval_loss": 2.5233335494995117, "eval_runtime": 21.9884, "eval_samples_per_second": 227.392, "eval_steps_per_second": 1.228, "step": 32886 }, { "epoch": 1.319203346486697, "grad_norm": 0.2988925278186798, "learning_rate": 7.731228469207342e-05, "loss": 1.6109, "step": 32890 }, { "epoch": 1.3197452876522093, "grad_norm": 0.3509455621242523, "learning_rate": 7.7298529103962e-05, "loss": 1.6183, "step": 32900 }, { "epoch": 1.3202872288177214, "grad_norm": 0.33325713872909546, "learning_rate": 7.728477075360558e-05, "loss": 1.6207, "step": 32910 }, { "epoch": 1.3208291699832337, "grad_norm": 0.22428090870380402, "learning_rate": 7.72710096427085e-05, "loss": 1.6048, "step": 32920 }, { "epoch": 1.3213711111487458, "grad_norm": 0.4815099835395813, "learning_rate": 7.725724577297547e-05, "loss": 1.6148, "step": 32930 }, { "epoch": 1.3219130523142582, "grad_norm": 0.2657027542591095, "learning_rate": 7.724347914611142e-05, "loss": 1.6154, "step": 32940 }, { "epoch": 1.3224549934797705, "grad_norm": 0.4155304729938507, "learning_rate": 7.722970976382179e-05, "loss": 1.622, "step": 32950 }, { "epoch": 1.3229969346452826, "grad_norm": 0.49975696206092834, "learning_rate": 7.721593762781221e-05, "loss": 1.6162, "step": 32960 }, { "epoch": 1.3235388758107947, "grad_norm": 0.376020222902298, "learning_rate": 7.720216273978872e-05, "loss": 1.6212, "step": 32970 }, { "epoch": 1.3237014581604485, "eval_loss": 2.526895523071289, "eval_runtime": 21.9876, "eval_samples_per_second": 227.401, "eval_steps_per_second": 1.228, "step": 32973 }, { "epoch": 1.324080816976307, "grad_norm": 0.22721335291862488, "learning_rate": 7.71883851014577e-05, "loss": 1.6171, "step": 32980 }, { "epoch": 1.3246227581418193, "grad_norm": 0.2434106022119522, "learning_rate": 7.717460471452588e-05, "loss": 1.6035, "step": 32990 }, { "epoch": 1.3251646993073314, "grad_norm": 0.4650270938873291, "learning_rate": 7.716082158070031e-05, "loss": 1.6103, "step": 33000 }, { "epoch": 1.3257066404728437, "grad_norm": 0.2861434817314148, "learning_rate": 7.714703570168835e-05, "loss": 1.6158, "step": 33010 }, { "epoch": 1.3262485816383558, "grad_norm": 0.7163834571838379, "learning_rate": 7.713324707919777e-05, "loss": 1.6211, "step": 33020 }, { "epoch": 1.326790522803868, "grad_norm": 0.685391902923584, "learning_rate": 7.711945571493663e-05, "loss": 1.624, "step": 33030 }, { "epoch": 1.3273324639693804, "grad_norm": 0.33094045519828796, "learning_rate": 7.710566161061337e-05, "loss": 1.6251, "step": 33040 }, { "epoch": 1.3278744051348925, "grad_norm": 0.2844617962837219, "learning_rate": 7.70918647679367e-05, "loss": 1.6184, "step": 33050 }, { "epoch": 1.3284163463004048, "grad_norm": 0.3345138430595398, "learning_rate": 7.707806518861575e-05, "loss": 1.6111, "step": 33060 }, { "epoch": 1.3284163463004048, "eval_loss": 2.527109384536743, "eval_runtime": 21.9601, "eval_samples_per_second": 227.686, "eval_steps_per_second": 1.23, "step": 33060 }, { "epoch": 1.328958287465917, "grad_norm": 0.48074135184288025, "learning_rate": 7.706426287435991e-05, "loss": 1.611, "step": 33070 }, { "epoch": 1.3295002286314292, "grad_norm": 0.31359565258026123, "learning_rate": 7.7050457826879e-05, "loss": 1.6273, "step": 33080 }, { "epoch": 1.3300421697969413, "grad_norm": 0.23901283740997314, "learning_rate": 7.703665004788312e-05, "loss": 1.6191, "step": 33090 }, { "epoch": 1.3305841109624537, "grad_norm": 0.5294537544250488, "learning_rate": 7.70228395390827e-05, "loss": 1.6146, "step": 33100 }, { "epoch": 1.3311260521279658, "grad_norm": 0.2701181173324585, "learning_rate": 7.700902630218852e-05, "loss": 1.6156, "step": 33110 }, { "epoch": 1.331667993293478, "grad_norm": 0.25721871852874756, "learning_rate": 7.699521033891171e-05, "loss": 1.6191, "step": 33120 }, { "epoch": 1.3322099344589904, "grad_norm": 0.35411280393600464, "learning_rate": 7.698139165096375e-05, "loss": 1.6178, "step": 33130 }, { "epoch": 1.3327518756245025, "grad_norm": 0.4553796052932739, "learning_rate": 7.696757024005642e-05, "loss": 1.6116, "step": 33140 }, { "epoch": 1.3331312344403612, "eval_loss": 2.523940086364746, "eval_runtime": 21.9819, "eval_samples_per_second": 227.46, "eval_steps_per_second": 1.228, "step": 33147 }, { "epoch": 1.3332938167900148, "grad_norm": 0.22905197739601135, "learning_rate": 7.695374610790187e-05, "loss": 1.6081, "step": 33150 }, { "epoch": 1.333835757955527, "grad_norm": 0.3004699945449829, "learning_rate": 7.693991925621256e-05, "loss": 1.6163, "step": 33160 }, { "epoch": 1.3343776991210392, "grad_norm": 0.3088211119174957, "learning_rate": 7.69260896867013e-05, "loss": 1.6104, "step": 33170 }, { "epoch": 1.3349196402865515, "grad_norm": 0.2766474187374115, "learning_rate": 7.691225740108126e-05, "loss": 1.6134, "step": 33180 }, { "epoch": 1.3354615814520636, "grad_norm": 0.4962151050567627, "learning_rate": 7.68984224010659e-05, "loss": 1.6066, "step": 33190 }, { "epoch": 1.3360035226175757, "grad_norm": 0.3378833830356598, "learning_rate": 7.688458468836903e-05, "loss": 1.6132, "step": 33200 }, { "epoch": 1.336545463783088, "grad_norm": 0.34599658846855164, "learning_rate": 7.687074426470484e-05, "loss": 1.6161, "step": 33210 }, { "epoch": 1.3370874049486003, "grad_norm": 0.3025360703468323, "learning_rate": 7.68569011317878e-05, "loss": 1.6168, "step": 33220 }, { "epoch": 1.3376293461141124, "grad_norm": 0.32311034202575684, "learning_rate": 7.684305529133273e-05, "loss": 1.6177, "step": 33230 }, { "epoch": 1.3378461225803173, "eval_loss": 2.5135555267333984, "eval_runtime": 21.987, "eval_samples_per_second": 227.407, "eval_steps_per_second": 1.228, "step": 33234 }, { "epoch": 1.3381712872796248, "grad_norm": 0.25993430614471436, "learning_rate": 7.682920674505481e-05, "loss": 1.612, "step": 33240 }, { "epoch": 1.3387132284451368, "grad_norm": 0.35539132356643677, "learning_rate": 7.681535549466954e-05, "loss": 1.6175, "step": 33250 }, { "epoch": 1.3392551696106492, "grad_norm": 0.24199128150939941, "learning_rate": 7.680150154189275e-05, "loss": 1.6121, "step": 33260 }, { "epoch": 1.3397971107761615, "grad_norm": 0.4272856116294861, "learning_rate": 7.678764488844059e-05, "loss": 1.6128, "step": 33270 }, { "epoch": 1.3403390519416736, "grad_norm": 0.23739628493785858, "learning_rate": 7.677378553602958e-05, "loss": 1.6096, "step": 33280 }, { "epoch": 1.3408809931071857, "grad_norm": 0.2996535897254944, "learning_rate": 7.675992348637654e-05, "loss": 1.6071, "step": 33290 }, { "epoch": 1.341422934272698, "grad_norm": 0.38013362884521484, "learning_rate": 7.674605874119865e-05, "loss": 1.6145, "step": 33300 }, { "epoch": 1.3419648754382103, "grad_norm": 0.5391820073127747, "learning_rate": 7.673219130221342e-05, "loss": 1.6162, "step": 33310 }, { "epoch": 1.3425068166037224, "grad_norm": 0.5446195006370544, "learning_rate": 7.671832117113868e-05, "loss": 1.6039, "step": 33320 }, { "epoch": 1.3425610107202737, "eval_loss": 2.505586624145508, "eval_runtime": 21.9853, "eval_samples_per_second": 227.425, "eval_steps_per_second": 1.228, "step": 33321 }, { "epoch": 1.3430487577692347, "grad_norm": 0.3738638460636139, "learning_rate": 7.670444834969262e-05, "loss": 1.6136, "step": 33330 }, { "epoch": 1.3435906989347468, "grad_norm": 0.2111603319644928, "learning_rate": 7.669057283959371e-05, "loss": 1.6126, "step": 33340 }, { "epoch": 1.3441326401002591, "grad_norm": 0.27275893092155457, "learning_rate": 7.667669464256081e-05, "loss": 1.6204, "step": 33350 }, { "epoch": 1.3446745812657714, "grad_norm": 0.3274075984954834, "learning_rate": 7.66628137603131e-05, "loss": 1.6052, "step": 33360 }, { "epoch": 1.3452165224312835, "grad_norm": 0.22435316443443298, "learning_rate": 7.664893019457007e-05, "loss": 1.6097, "step": 33370 }, { "epoch": 1.3457584635967959, "grad_norm": 0.41898712515830994, "learning_rate": 7.663504394705155e-05, "loss": 1.6118, "step": 33380 }, { "epoch": 1.346300404762308, "grad_norm": 0.2666551172733307, "learning_rate": 7.662115501947772e-05, "loss": 1.5982, "step": 33390 }, { "epoch": 1.3468423459278203, "grad_norm": 0.4487745463848114, "learning_rate": 7.660726341356908e-05, "loss": 1.6143, "step": 33400 }, { "epoch": 1.34727589886023, "eval_loss": 2.506096363067627, "eval_runtime": 21.9887, "eval_samples_per_second": 227.39, "eval_steps_per_second": 1.228, "step": 33408 }, { "epoch": 1.3473842870933324, "grad_norm": 0.21905028820037842, "learning_rate": 7.659336913104645e-05, "loss": 1.6089, "step": 33410 }, { "epoch": 1.3479262282588447, "grad_norm": 0.21832507848739624, "learning_rate": 7.657947217363099e-05, "loss": 1.6071, "step": 33420 }, { "epoch": 1.3484681694243568, "grad_norm": 0.26740700006484985, "learning_rate": 7.656557254304423e-05, "loss": 1.6158, "step": 33430 }, { "epoch": 1.349010110589869, "grad_norm": 0.5225000977516174, "learning_rate": 7.655167024100798e-05, "loss": 1.6123, "step": 33440 }, { "epoch": 1.3495520517553814, "grad_norm": 0.3178088068962097, "learning_rate": 7.653776526924435e-05, "loss": 1.6182, "step": 33450 }, { "epoch": 1.3500939929208935, "grad_norm": 0.21684478223323822, "learning_rate": 7.65238576294759e-05, "loss": 1.6098, "step": 33460 }, { "epoch": 1.3506359340864058, "grad_norm": 0.5009759664535522, "learning_rate": 7.650994732342539e-05, "loss": 1.6073, "step": 33470 }, { "epoch": 1.351177875251918, "grad_norm": 0.5376147031784058, "learning_rate": 7.649603435281601e-05, "loss": 1.609, "step": 33480 }, { "epoch": 1.3517198164174302, "grad_norm": 0.49498310685157776, "learning_rate": 7.648211871937121e-05, "loss": 1.6146, "step": 33490 }, { "epoch": 1.3519907870001862, "eval_loss": 2.51250958442688, "eval_runtime": 21.983, "eval_samples_per_second": 227.448, "eval_steps_per_second": 1.228, "step": 33495 }, { "epoch": 1.3522617575829425, "grad_norm": 0.4050725996494293, "learning_rate": 7.64682004248148e-05, "loss": 1.5954, "step": 33500 }, { "epoch": 1.3528036987484546, "grad_norm": 0.2994046211242676, "learning_rate": 7.645427947087096e-05, "loss": 1.6119, "step": 33510 }, { "epoch": 1.3533456399139667, "grad_norm": 0.2522560656070709, "learning_rate": 7.64403558592641e-05, "loss": 1.613, "step": 33520 }, { "epoch": 1.353887581079479, "grad_norm": 0.20905892550945282, "learning_rate": 7.642642959171905e-05, "loss": 1.5925, "step": 33530 }, { "epoch": 1.3544295222449914, "grad_norm": 0.259552925825119, "learning_rate": 7.641250066996092e-05, "loss": 1.6032, "step": 33540 }, { "epoch": 1.3549714634105035, "grad_norm": 0.2907361686229706, "learning_rate": 7.639856909571517e-05, "loss": 1.6111, "step": 33550 }, { "epoch": 1.3555134045760158, "grad_norm": 0.2905162274837494, "learning_rate": 7.638463487070759e-05, "loss": 1.6076, "step": 33560 }, { "epoch": 1.3560553457415279, "grad_norm": 0.3089943826198578, "learning_rate": 7.637069799666428e-05, "loss": 1.6011, "step": 33570 }, { "epoch": 1.3565972869070402, "grad_norm": 0.34541210532188416, "learning_rate": 7.635675847531169e-05, "loss": 1.6062, "step": 33580 }, { "epoch": 1.3567056751401427, "eval_loss": 2.5141220092773438, "eval_runtime": 21.9917, "eval_samples_per_second": 227.359, "eval_steps_per_second": 1.228, "step": 33582 }, { "epoch": 1.3571392280725525, "grad_norm": 0.3026711642742157, "learning_rate": 7.634281630837656e-05, "loss": 1.6096, "step": 33590 }, { "epoch": 1.3576811692380646, "grad_norm": 0.37327638268470764, "learning_rate": 7.632887149758604e-05, "loss": 1.6083, "step": 33600 }, { "epoch": 1.3582231104035767, "grad_norm": 0.24328859150409698, "learning_rate": 7.63149240446675e-05, "loss": 1.6124, "step": 33610 }, { "epoch": 1.358765051569089, "grad_norm": 0.26186954975128174, "learning_rate": 7.630097395134873e-05, "loss": 1.5939, "step": 33620 }, { "epoch": 1.3593069927346013, "grad_norm": 0.31656497716903687, "learning_rate": 7.628702121935776e-05, "loss": 1.6096, "step": 33630 }, { "epoch": 1.3598489339001134, "grad_norm": 0.3277546763420105, "learning_rate": 7.627306585042302e-05, "loss": 1.6049, "step": 33640 }, { "epoch": 1.3603908750656257, "grad_norm": 0.25422918796539307, "learning_rate": 7.625910784627326e-05, "loss": 1.6149, "step": 33650 }, { "epoch": 1.3609328162311378, "grad_norm": 0.22749896347522736, "learning_rate": 7.62451472086375e-05, "loss": 1.6047, "step": 33660 }, { "epoch": 1.3614205632800989, "eval_loss": 2.5274205207824707, "eval_runtime": 21.9881, "eval_samples_per_second": 227.396, "eval_steps_per_second": 1.228, "step": 33669 }, { "epoch": 1.3614747573966501, "grad_norm": 0.3570140302181244, "learning_rate": 7.623118393924515e-05, "loss": 1.6078, "step": 33670 }, { "epoch": 1.3620166985621625, "grad_norm": 0.583695113658905, "learning_rate": 7.62172180398259e-05, "loss": 1.6051, "step": 33680 }, { "epoch": 1.3625586397276745, "grad_norm": 0.2449539452791214, "learning_rate": 7.620324951210981e-05, "loss": 1.6079, "step": 33690 }, { "epoch": 1.3631005808931869, "grad_norm": 0.33772215247154236, "learning_rate": 7.618927835782724e-05, "loss": 1.6193, "step": 33700 }, { "epoch": 1.363642522058699, "grad_norm": 0.28392404317855835, "learning_rate": 7.617530457870883e-05, "loss": 1.6043, "step": 33710 }, { "epoch": 1.3641844632242113, "grad_norm": 0.34906795620918274, "learning_rate": 7.616132817648565e-05, "loss": 1.6015, "step": 33720 }, { "epoch": 1.3647264043897234, "grad_norm": 0.23826880753040314, "learning_rate": 7.614734915288899e-05, "loss": 1.6132, "step": 33730 }, { "epoch": 1.3652683455552357, "grad_norm": 0.4068436026573181, "learning_rate": 7.613336750965053e-05, "loss": 1.6078, "step": 33740 }, { "epoch": 1.3658102867207478, "grad_norm": 0.45196202397346497, "learning_rate": 7.611938324850226e-05, "loss": 1.5997, "step": 33750 }, { "epoch": 1.3661354514200552, "eval_loss": 2.5162079334259033, "eval_runtime": 21.9869, "eval_samples_per_second": 227.408, "eval_steps_per_second": 1.228, "step": 33756 }, { "epoch": 1.36635222788626, "grad_norm": 0.2863083779811859, "learning_rate": 7.610539637117647e-05, "loss": 1.6013, "step": 33760 }, { "epoch": 1.3668941690517724, "grad_norm": 0.3467727303504944, "learning_rate": 7.609140687940585e-05, "loss": 1.6113, "step": 33770 }, { "epoch": 1.3674361102172845, "grad_norm": 0.8201264142990112, "learning_rate": 7.607741477492327e-05, "loss": 1.6092, "step": 33780 }, { "epoch": 1.3679780513827968, "grad_norm": 0.3005310893058777, "learning_rate": 7.606342005946207e-05, "loss": 1.6119, "step": 33790 }, { "epoch": 1.368519992548309, "grad_norm": 0.31185653805732727, "learning_rate": 7.604942273475585e-05, "loss": 1.6117, "step": 33800 }, { "epoch": 1.3690619337138212, "grad_norm": 0.4523196816444397, "learning_rate": 7.603542280253853e-05, "loss": 1.6103, "step": 33810 }, { "epoch": 1.3696038748793335, "grad_norm": 0.2638733386993408, "learning_rate": 7.602142026454435e-05, "loss": 1.6264, "step": 33820 }, { "epoch": 1.3701458160448456, "grad_norm": 0.25433018803596497, "learning_rate": 7.60074151225079e-05, "loss": 1.6129, "step": 33830 }, { "epoch": 1.3706877572103577, "grad_norm": 0.22881822288036346, "learning_rate": 7.599340737816406e-05, "loss": 1.5969, "step": 33840 }, { "epoch": 1.3708503395600116, "eval_loss": 2.5215718746185303, "eval_runtime": 21.9866, "eval_samples_per_second": 227.411, "eval_steps_per_second": 1.228, "step": 33843 }, { "epoch": 1.37122969837587, "grad_norm": 0.47086840867996216, "learning_rate": 7.597939703324807e-05, "loss": 1.6004, "step": 33850 }, { "epoch": 1.3717716395413824, "grad_norm": 0.25351575016975403, "learning_rate": 7.596538408949546e-05, "loss": 1.6025, "step": 33860 }, { "epoch": 1.3723135807068945, "grad_norm": 0.47730979323387146, "learning_rate": 7.595136854864208e-05, "loss": 1.6075, "step": 33870 }, { "epoch": 1.3728555218724068, "grad_norm": 0.22560712695121765, "learning_rate": 7.593735041242414e-05, "loss": 1.6045, "step": 33880 }, { "epoch": 1.3733974630379189, "grad_norm": 0.3164345920085907, "learning_rate": 7.592332968257812e-05, "loss": 1.612, "step": 33890 }, { "epoch": 1.3739394042034312, "grad_norm": 0.5792235136032104, "learning_rate": 7.590930636084087e-05, "loss": 1.6049, "step": 33900 }, { "epoch": 1.3744813453689435, "grad_norm": 0.2980400025844574, "learning_rate": 7.589528044894951e-05, "loss": 1.6049, "step": 33910 }, { "epoch": 1.3750232865344556, "grad_norm": 0.2572561800479889, "learning_rate": 7.588125194864154e-05, "loss": 1.5972, "step": 33920 }, { "epoch": 1.3755652276999677, "grad_norm": 0.3515624701976776, "learning_rate": 7.586722086165471e-05, "loss": 1.6086, "step": 33930 }, { "epoch": 1.3755652276999677, "eval_loss": 2.5238847732543945, "eval_runtime": 21.9722, "eval_samples_per_second": 227.56, "eval_steps_per_second": 1.229, "step": 33930 }, { "epoch": 1.37610716886548, "grad_norm": 0.22956761717796326, "learning_rate": 7.585318718972719e-05, "loss": 1.5948, "step": 33940 }, { "epoch": 1.3766491100309923, "grad_norm": 0.2527923583984375, "learning_rate": 7.583915093459736e-05, "loss": 1.5908, "step": 33950 }, { "epoch": 1.3771910511965044, "grad_norm": 0.3912854790687561, "learning_rate": 7.5825112098004e-05, "loss": 1.5966, "step": 33960 }, { "epoch": 1.3777329923620167, "grad_norm": 0.2859535813331604, "learning_rate": 7.581107068168615e-05, "loss": 1.6188, "step": 33970 }, { "epoch": 1.3782749335275288, "grad_norm": 0.29767870903015137, "learning_rate": 7.579702668738323e-05, "loss": 1.604, "step": 33980 }, { "epoch": 1.3788168746930412, "grad_norm": 0.3692917227745056, "learning_rate": 7.578298011683493e-05, "loss": 1.6166, "step": 33990 }, { "epoch": 1.3793588158585535, "grad_norm": 0.40221714973449707, "learning_rate": 7.576893097178128e-05, "loss": 1.6078, "step": 34000 }, { "epoch": 1.3799007570240656, "grad_norm": 0.4148884415626526, "learning_rate": 7.575487925396264e-05, "loss": 1.6028, "step": 34010 }, { "epoch": 1.380280115839924, "eval_loss": 2.53521466255188, "eval_runtime": 21.9888, "eval_samples_per_second": 227.389, "eval_steps_per_second": 1.228, "step": 34017 }, { "epoch": 1.3804426981895779, "grad_norm": 0.5252642035484314, "learning_rate": 7.574082496511966e-05, "loss": 1.6146, "step": 34020 }, { "epoch": 1.38098463935509, "grad_norm": 0.4250527322292328, "learning_rate": 7.572676810699333e-05, "loss": 1.6074, "step": 34030 }, { "epoch": 1.3815265805206023, "grad_norm": 0.31851446628570557, "learning_rate": 7.571270868132496e-05, "loss": 1.5978, "step": 34040 }, { "epoch": 1.3820685216861144, "grad_norm": 0.22338920831680298, "learning_rate": 7.569864668985617e-05, "loss": 1.5961, "step": 34050 }, { "epoch": 1.3826104628516267, "grad_norm": 0.2865997552871704, "learning_rate": 7.568458213432888e-05, "loss": 1.6073, "step": 34060 }, { "epoch": 1.3831524040171388, "grad_norm": 0.3791174590587616, "learning_rate": 7.567051501648536e-05, "loss": 1.6018, "step": 34070 }, { "epoch": 1.383694345182651, "grad_norm": 0.3320346474647522, "learning_rate": 7.565644533806818e-05, "loss": 1.5995, "step": 34080 }, { "epoch": 1.3842362863481634, "grad_norm": 0.2221497744321823, "learning_rate": 7.564237310082024e-05, "loss": 1.5972, "step": 34090 }, { "epoch": 1.3847782275136755, "grad_norm": 0.27021268010139465, "learning_rate": 7.562829830648474e-05, "loss": 1.6121, "step": 34100 }, { "epoch": 1.3849950039798804, "eval_loss": 2.520078420639038, "eval_runtime": 21.9871, "eval_samples_per_second": 227.407, "eval_steps_per_second": 1.228, "step": 34104 }, { "epoch": 1.3853201686791878, "grad_norm": 0.35636231303215027, "learning_rate": 7.56142209568052e-05, "loss": 1.6015, "step": 34110 }, { "epoch": 1.3858621098447, "grad_norm": 0.45814478397369385, "learning_rate": 7.560014105352546e-05, "loss": 1.6131, "step": 34120 }, { "epoch": 1.3864040510102122, "grad_norm": 0.6515801548957825, "learning_rate": 7.55860585983897e-05, "loss": 1.6189, "step": 34130 }, { "epoch": 1.3869459921757243, "grad_norm": 0.32424890995025635, "learning_rate": 7.557197359314237e-05, "loss": 1.6162, "step": 34140 }, { "epoch": 1.3874879333412367, "grad_norm": 0.24668356776237488, "learning_rate": 7.555788603952825e-05, "loss": 1.607, "step": 34150 }, { "epoch": 1.3880298745067488, "grad_norm": 0.2475864738225937, "learning_rate": 7.554379593929248e-05, "loss": 1.6014, "step": 34160 }, { "epoch": 1.388571815672261, "grad_norm": 0.2331823706626892, "learning_rate": 7.552970329418045e-05, "loss": 1.6068, "step": 34170 }, { "epoch": 1.3891137568377734, "grad_norm": 0.2390824258327484, "learning_rate": 7.551560810593792e-05, "loss": 1.5906, "step": 34180 }, { "epoch": 1.3896556980032855, "grad_norm": 0.26939457654953003, "learning_rate": 7.550151037631092e-05, "loss": 1.6275, "step": 34190 }, { "epoch": 1.3897098921198368, "eval_loss": 2.518341064453125, "eval_runtime": 21.9824, "eval_samples_per_second": 227.455, "eval_steps_per_second": 1.228, "step": 34191 }, { "epoch": 1.3901976391687978, "grad_norm": 0.262103796005249, "learning_rate": 7.548741010704583e-05, "loss": 1.5979, "step": 34200 }, { "epoch": 1.3907395803343099, "grad_norm": 0.34173309803009033, "learning_rate": 7.547330729988931e-05, "loss": 1.5959, "step": 34210 }, { "epoch": 1.3912815214998222, "grad_norm": 0.22995412349700928, "learning_rate": 7.545920195658837e-05, "loss": 1.605, "step": 34220 }, { "epoch": 1.3918234626653345, "grad_norm": 0.2825547158718109, "learning_rate": 7.544509407889033e-05, "loss": 1.5997, "step": 34230 }, { "epoch": 1.3923654038308466, "grad_norm": 0.3113856613636017, "learning_rate": 7.54309836685428e-05, "loss": 1.6061, "step": 34240 }, { "epoch": 1.3929073449963587, "grad_norm": 0.32774263620376587, "learning_rate": 7.54168707272937e-05, "loss": 1.5979, "step": 34250 }, { "epoch": 1.393449286161871, "grad_norm": 0.3357396721839905, "learning_rate": 7.540275525689131e-05, "loss": 1.6033, "step": 34260 }, { "epoch": 1.3939912273273833, "grad_norm": 0.35256245732307434, "learning_rate": 7.538863725908416e-05, "loss": 1.6, "step": 34270 }, { "epoch": 1.394424780259793, "eval_loss": 2.519773244857788, "eval_runtime": 21.9811, "eval_samples_per_second": 227.468, "eval_steps_per_second": 1.228, "step": 34278 }, { "epoch": 1.3945331684928954, "grad_norm": 0.31579533219337463, "learning_rate": 7.537451673562116e-05, "loss": 1.5972, "step": 34280 }, { "epoch": 1.3950751096584078, "grad_norm": 0.3092842996120453, "learning_rate": 7.536039368825147e-05, "loss": 1.5975, "step": 34290 }, { "epoch": 1.3956170508239198, "grad_norm": 0.4527590572834015, "learning_rate": 7.534626811872463e-05, "loss": 1.6089, "step": 34300 }, { "epoch": 1.3961589919894322, "grad_norm": 0.30094006657600403, "learning_rate": 7.53321400287904e-05, "loss": 1.6083, "step": 34310 }, { "epoch": 1.3967009331549445, "grad_norm": 0.3692034184932709, "learning_rate": 7.531800942019895e-05, "loss": 1.6021, "step": 34320 }, { "epoch": 1.3972428743204566, "grad_norm": 0.37879860401153564, "learning_rate": 7.530387629470072e-05, "loss": 1.6015, "step": 34330 }, { "epoch": 1.3977848154859687, "grad_norm": 0.5758041739463806, "learning_rate": 7.528974065404644e-05, "loss": 1.598, "step": 34340 }, { "epoch": 1.398326756651481, "grad_norm": 0.29847633838653564, "learning_rate": 7.527560249998716e-05, "loss": 1.6055, "step": 34350 }, { "epoch": 1.3988686978169933, "grad_norm": 0.22990530729293823, "learning_rate": 7.526146183427428e-05, "loss": 1.5928, "step": 34360 }, { "epoch": 1.3991396683997492, "eval_loss": 2.5169806480407715, "eval_runtime": 21.9875, "eval_samples_per_second": 227.402, "eval_steps_per_second": 1.228, "step": 34365 }, { "epoch": 1.3994106389825054, "grad_norm": 0.20360521972179413, "learning_rate": 7.524731865865947e-05, "loss": 1.5949, "step": 34370 }, { "epoch": 1.3999525801480177, "grad_norm": 0.24859194457530975, "learning_rate": 7.523317297489473e-05, "loss": 1.6024, "step": 34380 }, { "epoch": 1.4004945213135298, "grad_norm": 0.33308878540992737, "learning_rate": 7.521902478473238e-05, "loss": 1.598, "step": 34390 }, { "epoch": 1.4010364624790421, "grad_norm": 0.27835598587989807, "learning_rate": 7.520487408992501e-05, "loss": 1.5965, "step": 34400 }, { "epoch": 1.4015784036445544, "grad_norm": 0.36223334074020386, "learning_rate": 7.519072089222557e-05, "loss": 1.6055, "step": 34410 }, { "epoch": 1.4021203448100665, "grad_norm": 0.3996673822402954, "learning_rate": 7.517656519338728e-05, "loss": 1.6096, "step": 34420 }, { "epoch": 1.4026622859755788, "grad_norm": 0.25303345918655396, "learning_rate": 7.516240699516367e-05, "loss": 1.614, "step": 34430 }, { "epoch": 1.403204227141091, "grad_norm": 0.3061133325099945, "learning_rate": 7.514824629930865e-05, "loss": 1.6032, "step": 34440 }, { "epoch": 1.4037461683066033, "grad_norm": 0.3153916895389557, "learning_rate": 7.513408310757632e-05, "loss": 1.5952, "step": 34450 }, { "epoch": 1.4038545565397056, "eval_loss": 2.5186572074890137, "eval_runtime": 21.9793, "eval_samples_per_second": 227.487, "eval_steps_per_second": 1.228, "step": 34452 }, { "epoch": 1.4042881094721154, "grad_norm": 0.24946312606334686, "learning_rate": 7.51199174217212e-05, "loss": 1.6027, "step": 34460 }, { "epoch": 1.4048300506376277, "grad_norm": 0.23080390691757202, "learning_rate": 7.510574924349807e-05, "loss": 1.604, "step": 34470 }, { "epoch": 1.4053719918031398, "grad_norm": 0.36294880509376526, "learning_rate": 7.509157857466202e-05, "loss": 1.608, "step": 34480 }, { "epoch": 1.405913932968652, "grad_norm": 0.3913600742816925, "learning_rate": 7.507740541696844e-05, "loss": 1.6031, "step": 34490 }, { "epoch": 1.4064558741341644, "grad_norm": 0.36812445521354675, "learning_rate": 7.506322977217305e-05, "loss": 1.596, "step": 34500 }, { "epoch": 1.4069978152996765, "grad_norm": 0.5203189253807068, "learning_rate": 7.504905164203184e-05, "loss": 1.6107, "step": 34510 }, { "epoch": 1.4075397564651888, "grad_norm": 0.2861349880695343, "learning_rate": 7.503487102830116e-05, "loss": 1.6116, "step": 34520 }, { "epoch": 1.408081697630701, "grad_norm": 0.3074442446231842, "learning_rate": 7.502068793273765e-05, "loss": 1.602, "step": 34530 }, { "epoch": 1.408569444679662, "eval_loss": 2.5188817977905273, "eval_runtime": 21.988, "eval_samples_per_second": 227.396, "eval_steps_per_second": 1.228, "step": 34539 }, { "epoch": 1.4086236387962132, "grad_norm": 0.3228742778301239, "learning_rate": 7.500650235709826e-05, "loss": 1.5973, "step": 34540 }, { "epoch": 1.4091655799617255, "grad_norm": 0.3475671708583832, "learning_rate": 7.49923143031402e-05, "loss": 1.5987, "step": 34550 }, { "epoch": 1.4097075211272376, "grad_norm": 0.3168829083442688, "learning_rate": 7.497812377262102e-05, "loss": 1.6001, "step": 34560 }, { "epoch": 1.4102494622927497, "grad_norm": 0.2580108046531677, "learning_rate": 7.496393076729862e-05, "loss": 1.6122, "step": 34570 }, { "epoch": 1.410791403458262, "grad_norm": 0.28347161412239075, "learning_rate": 7.494973528893117e-05, "loss": 1.609, "step": 34580 }, { "epoch": 1.4113333446237744, "grad_norm": 0.25245901942253113, "learning_rate": 7.49355373392771e-05, "loss": 1.6036, "step": 34590 }, { "epoch": 1.4118752857892864, "grad_norm": 0.29171431064605713, "learning_rate": 7.492133692009524e-05, "loss": 1.5972, "step": 34600 }, { "epoch": 1.4124172269547988, "grad_norm": 0.3764426112174988, "learning_rate": 7.490713403314462e-05, "loss": 1.6124, "step": 34610 }, { "epoch": 1.4129591681203109, "grad_norm": 0.2615674138069153, "learning_rate": 7.489292868018469e-05, "loss": 1.6089, "step": 34620 }, { "epoch": 1.4132843328196183, "eval_loss": 2.516463279724121, "eval_runtime": 21.9878, "eval_samples_per_second": 227.399, "eval_steps_per_second": 1.228, "step": 34626 }, { "epoch": 1.4135011092858232, "grad_norm": 0.2315002828836441, "learning_rate": 7.487872086297513e-05, "loss": 1.6043, "step": 34630 }, { "epoch": 1.4140430504513355, "grad_norm": 0.25207579135894775, "learning_rate": 7.48645105832759e-05, "loss": 1.5948, "step": 34640 }, { "epoch": 1.4145849916168476, "grad_norm": 0.29729408025741577, "learning_rate": 7.485029784284737e-05, "loss": 1.6036, "step": 34650 }, { "epoch": 1.4151269327823597, "grad_norm": 0.2224244922399521, "learning_rate": 7.483608264345011e-05, "loss": 1.592, "step": 34660 }, { "epoch": 1.415668873947872, "grad_norm": 0.3714025616645813, "learning_rate": 7.482186498684504e-05, "loss": 1.5983, "step": 34670 }, { "epoch": 1.4162108151133843, "grad_norm": 0.29220449924468994, "learning_rate": 7.480764487479342e-05, "loss": 1.5968, "step": 34680 }, { "epoch": 1.4167527562788964, "grad_norm": 0.2586628198623657, "learning_rate": 7.479342230905674e-05, "loss": 1.5986, "step": 34690 }, { "epoch": 1.4172946974444087, "grad_norm": 0.2448311746120453, "learning_rate": 7.477919729139684e-05, "loss": 1.5924, "step": 34700 }, { "epoch": 1.4178366386099208, "grad_norm": 0.43788665533065796, "learning_rate": 7.476496982357585e-05, "loss": 1.6003, "step": 34710 }, { "epoch": 1.4179992209595746, "eval_loss": 2.5183908939361572, "eval_runtime": 21.9844, "eval_samples_per_second": 227.434, "eval_steps_per_second": 1.228, "step": 34713 }, { "epoch": 1.4183785797754331, "grad_norm": 0.3235478103160858, "learning_rate": 7.47507399073562e-05, "loss": 1.6102, "step": 34720 }, { "epoch": 1.4189205209409455, "grad_norm": 0.2464754730463028, "learning_rate": 7.473650754450066e-05, "loss": 1.5979, "step": 34730 }, { "epoch": 1.4194624621064575, "grad_norm": 0.4333379566669464, "learning_rate": 7.472227273677225e-05, "loss": 1.6088, "step": 34740 }, { "epoch": 1.4200044032719699, "grad_norm": 0.2818497121334076, "learning_rate": 7.47080354859343e-05, "loss": 1.6013, "step": 34750 }, { "epoch": 1.420546344437482, "grad_norm": 0.2427944540977478, "learning_rate": 7.469379579375049e-05, "loss": 1.5914, "step": 34760 }, { "epoch": 1.4210882856029943, "grad_norm": 0.29250580072402954, "learning_rate": 7.467955366198474e-05, "loss": 1.6074, "step": 34770 }, { "epoch": 1.4216302267685064, "grad_norm": 0.26771166920661926, "learning_rate": 7.466530909240135e-05, "loss": 1.6108, "step": 34780 }, { "epoch": 1.4221721679340187, "grad_norm": 0.29733744263648987, "learning_rate": 7.465106208676485e-05, "loss": 1.5965, "step": 34790 }, { "epoch": 1.4227141090995308, "grad_norm": 0.2804502546787262, "learning_rate": 7.463681264684007e-05, "loss": 1.5984, "step": 34800 }, { "epoch": 1.4227141090995308, "eval_loss": 2.5124473571777344, "eval_runtime": 21.9639, "eval_samples_per_second": 227.646, "eval_steps_per_second": 1.229, "step": 34800 }, { "epoch": 1.423256050265043, "grad_norm": 0.46871018409729004, "learning_rate": 7.46225607743922e-05, "loss": 1.5917, "step": 34810 }, { "epoch": 1.4237979914305554, "grad_norm": 0.26123303174972534, "learning_rate": 7.46083064711867e-05, "loss": 1.5963, "step": 34820 }, { "epoch": 1.4243399325960675, "grad_norm": 0.22404921054840088, "learning_rate": 7.459404973898932e-05, "loss": 1.6005, "step": 34830 }, { "epoch": 1.4248818737615798, "grad_norm": 0.26554566621780396, "learning_rate": 7.457979057956615e-05, "loss": 1.6021, "step": 34840 }, { "epoch": 1.425423814927092, "grad_norm": 0.2421078085899353, "learning_rate": 7.45655289946835e-05, "loss": 1.6013, "step": 34850 }, { "epoch": 1.4259657560926042, "grad_norm": 0.22835715115070343, "learning_rate": 7.455126498610807e-05, "loss": 1.5989, "step": 34860 }, { "epoch": 1.4265076972581165, "grad_norm": 0.37547022104263306, "learning_rate": 7.453699855560683e-05, "loss": 1.6129, "step": 34870 }, { "epoch": 1.4270496384236286, "grad_norm": 0.4508250057697296, "learning_rate": 7.452272970494702e-05, "loss": 1.5987, "step": 34880 }, { "epoch": 1.4274289972394871, "eval_loss": 2.5165185928344727, "eval_runtime": 22.3018, "eval_samples_per_second": 224.198, "eval_steps_per_second": 1.211, "step": 34887 }, { "epoch": 1.4275915795891407, "grad_norm": 0.28928378224372864, "learning_rate": 7.450845843589622e-05, "loss": 1.5965, "step": 34890 }, { "epoch": 1.428133520754653, "grad_norm": 0.24763907492160797, "learning_rate": 7.449418475022228e-05, "loss": 1.5987, "step": 34900 }, { "epoch": 1.4286754619201654, "grad_norm": 0.28781482577323914, "learning_rate": 7.447990864969336e-05, "loss": 1.59, "step": 34910 }, { "epoch": 1.4292174030856775, "grad_norm": 0.4051133990287781, "learning_rate": 7.446563013607795e-05, "loss": 1.5962, "step": 34920 }, { "epoch": 1.4297593442511898, "grad_norm": 0.472084641456604, "learning_rate": 7.445134921114477e-05, "loss": 1.5906, "step": 34930 }, { "epoch": 1.4303012854167019, "grad_norm": 0.495980441570282, "learning_rate": 7.443706587666292e-05, "loss": 1.6111, "step": 34940 }, { "epoch": 1.4308432265822142, "grad_norm": 0.41576093435287476, "learning_rate": 7.442278013440173e-05, "loss": 1.5951, "step": 34950 }, { "epoch": 1.4313851677477265, "grad_norm": 0.48290589451789856, "learning_rate": 7.440849198613084e-05, "loss": 1.5979, "step": 34960 }, { "epoch": 1.4319271089132386, "grad_norm": 0.3737516701221466, "learning_rate": 7.439420143362025e-05, "loss": 1.6024, "step": 34970 }, { "epoch": 1.4321438853794435, "eval_loss": 2.513838768005371, "eval_runtime": 21.9922, "eval_samples_per_second": 227.353, "eval_steps_per_second": 1.228, "step": 34974 }, { "epoch": 1.4324690500787507, "grad_norm": 0.22067669034004211, "learning_rate": 7.437990847864018e-05, "loss": 1.6073, "step": 34980 }, { "epoch": 1.433010991244263, "grad_norm": 0.35069945454597473, "learning_rate": 7.436561312296118e-05, "loss": 1.5947, "step": 34990 }, { "epoch": 1.4335529324097753, "grad_norm": 0.3185942471027374, "learning_rate": 7.435131536835412e-05, "loss": 1.5965, "step": 35000 }, { "epoch": 1.4340948735752874, "grad_norm": 0.3867363929748535, "learning_rate": 7.433701521659012e-05, "loss": 1.6054, "step": 35010 }, { "epoch": 1.4346368147407997, "grad_norm": 0.2255895733833313, "learning_rate": 7.432271266944063e-05, "loss": 1.5981, "step": 35020 }, { "epoch": 1.4351787559063118, "grad_norm": 0.5799124836921692, "learning_rate": 7.43084077286774e-05, "loss": 1.5975, "step": 35030 }, { "epoch": 1.4357206970718241, "grad_norm": 0.32704293727874756, "learning_rate": 7.429410039607241e-05, "loss": 1.598, "step": 35040 }, { "epoch": 1.4362626382373365, "grad_norm": 0.2843274772167206, "learning_rate": 7.427979067339805e-05, "loss": 1.6012, "step": 35050 }, { "epoch": 1.4368045794028486, "grad_norm": 0.21767373383045197, "learning_rate": 7.426547856242692e-05, "loss": 1.5984, "step": 35060 }, { "epoch": 1.4368587735193998, "eval_loss": 2.5202910900115967, "eval_runtime": 21.9854, "eval_samples_per_second": 227.423, "eval_steps_per_second": 1.228, "step": 35061 }, { "epoch": 1.4373465205683609, "grad_norm": 0.3761364221572876, "learning_rate": 7.425116406493196e-05, "loss": 1.6041, "step": 35070 }, { "epoch": 1.437888461733873, "grad_norm": 0.3286113142967224, "learning_rate": 7.423684718268637e-05, "loss": 1.5982, "step": 35080 }, { "epoch": 1.4384304028993853, "grad_norm": 0.3064782917499542, "learning_rate": 7.422252791746369e-05, "loss": 1.5936, "step": 35090 }, { "epoch": 1.4389723440648974, "grad_norm": 0.2235429883003235, "learning_rate": 7.420820627103768e-05, "loss": 1.5973, "step": 35100 }, { "epoch": 1.4395142852304097, "grad_norm": 0.33427155017852783, "learning_rate": 7.419388224518247e-05, "loss": 1.5904, "step": 35110 }, { "epoch": 1.4400562263959218, "grad_norm": 0.26824381947517395, "learning_rate": 7.417955584167246e-05, "loss": 1.6038, "step": 35120 }, { "epoch": 1.440598167561434, "grad_norm": 0.3559771776199341, "learning_rate": 7.416522706228235e-05, "loss": 1.5985, "step": 35130 }, { "epoch": 1.4411401087269464, "grad_norm": 0.43836888670921326, "learning_rate": 7.415089590878713e-05, "loss": 1.6008, "step": 35140 }, { "epoch": 1.4415736616593562, "eval_loss": 2.522282838821411, "eval_runtime": 21.9813, "eval_samples_per_second": 227.466, "eval_steps_per_second": 1.228, "step": 35148 }, { "epoch": 1.4416820498924585, "grad_norm": 0.2667721211910248, "learning_rate": 7.413656238296205e-05, "loss": 1.5959, "step": 35150 }, { "epoch": 1.4422239910579708, "grad_norm": 0.34478846192359924, "learning_rate": 7.412222648658271e-05, "loss": 1.5947, "step": 35160 }, { "epoch": 1.442765932223483, "grad_norm": 0.2894307076931, "learning_rate": 7.410788822142497e-05, "loss": 1.5881, "step": 35170 }, { "epoch": 1.4433078733889952, "grad_norm": 0.44409897923469543, "learning_rate": 7.409354758926501e-05, "loss": 1.5902, "step": 35180 }, { "epoch": 1.4438498145545076, "grad_norm": 0.3519600033760071, "learning_rate": 7.407920459187925e-05, "loss": 1.6024, "step": 35190 }, { "epoch": 1.4443917557200197, "grad_norm": 0.2908567786216736, "learning_rate": 7.406485923104447e-05, "loss": 1.5973, "step": 35200 }, { "epoch": 1.4449336968855317, "grad_norm": 0.33565568923950195, "learning_rate": 7.405051150853771e-05, "loss": 1.5992, "step": 35210 }, { "epoch": 1.445475638051044, "grad_norm": 0.3194481134414673, "learning_rate": 7.403616142613627e-05, "loss": 1.5931, "step": 35220 }, { "epoch": 1.4460175792165564, "grad_norm": 0.25139880180358887, "learning_rate": 7.402180898561781e-05, "loss": 1.599, "step": 35230 }, { "epoch": 1.4462885497993123, "eval_loss": 2.5243048667907715, "eval_runtime": 21.9879, "eval_samples_per_second": 227.398, "eval_steps_per_second": 1.228, "step": 35235 }, { "epoch": 1.4465595203820685, "grad_norm": 0.4910086989402771, "learning_rate": 7.400745418876024e-05, "loss": 1.5972, "step": 35240 }, { "epoch": 1.4471014615475808, "grad_norm": 0.2239486426115036, "learning_rate": 7.399309703734178e-05, "loss": 1.5989, "step": 35250 }, { "epoch": 1.4476434027130929, "grad_norm": 0.3110259175300598, "learning_rate": 7.39787375331409e-05, "loss": 1.6041, "step": 35260 }, { "epoch": 1.4481853438786052, "grad_norm": 0.394132137298584, "learning_rate": 7.396437567793642e-05, "loss": 1.5946, "step": 35270 }, { "epoch": 1.4487272850441175, "grad_norm": 0.3766900599002838, "learning_rate": 7.39500114735074e-05, "loss": 1.5982, "step": 35280 }, { "epoch": 1.4492692262096296, "grad_norm": 0.37917500734329224, "learning_rate": 7.393564492163326e-05, "loss": 1.599, "step": 35290 }, { "epoch": 1.4498111673751417, "grad_norm": 0.30892854928970337, "learning_rate": 7.392127602409361e-05, "loss": 1.5915, "step": 35300 }, { "epoch": 1.450353108540654, "grad_norm": 0.23008306324481964, "learning_rate": 7.390690478266844e-05, "loss": 1.6001, "step": 35310 }, { "epoch": 1.4508950497061663, "grad_norm": 0.23139609396457672, "learning_rate": 7.389253119913801e-05, "loss": 1.5941, "step": 35320 }, { "epoch": 1.4510034379392687, "eval_loss": 2.5347366333007812, "eval_runtime": 22.3055, "eval_samples_per_second": 224.16, "eval_steps_per_second": 1.21, "step": 35322 }, { "epoch": 1.4514369908716784, "grad_norm": 0.3306918740272522, "learning_rate": 7.387815527528283e-05, "loss": 1.6064, "step": 35330 }, { "epoch": 1.4519789320371908, "grad_norm": 0.3398854732513428, "learning_rate": 7.386377701288372e-05, "loss": 1.5957, "step": 35340 }, { "epoch": 1.4525208732027028, "grad_norm": 0.4003507196903229, "learning_rate": 7.384939641372184e-05, "loss": 1.5979, "step": 35350 }, { "epoch": 1.4530628143682152, "grad_norm": 0.23540987074375153, "learning_rate": 7.383501347957854e-05, "loss": 1.5936, "step": 35360 }, { "epoch": 1.4536047555337275, "grad_norm": 0.23208670318126678, "learning_rate": 7.382062821223555e-05, "loss": 1.5963, "step": 35370 }, { "epoch": 1.4541466966992396, "grad_norm": 0.2580191195011139, "learning_rate": 7.380624061347485e-05, "loss": 1.5969, "step": 35380 }, { "epoch": 1.4546886378647519, "grad_norm": 0.4060412645339966, "learning_rate": 7.37918506850787e-05, "loss": 1.599, "step": 35390 }, { "epoch": 1.455230579030264, "grad_norm": 0.2602030634880066, "learning_rate": 7.377745842882971e-05, "loss": 1.5982, "step": 35400 }, { "epoch": 1.455718326079225, "eval_loss": 2.522488832473755, "eval_runtime": 21.9903, "eval_samples_per_second": 227.373, "eval_steps_per_second": 1.228, "step": 35409 }, { "epoch": 1.4557725201957763, "grad_norm": 0.31275516748428345, "learning_rate": 7.376306384651066e-05, "loss": 1.5937, "step": 35410 }, { "epoch": 1.4563144613612884, "grad_norm": 0.34258827567100525, "learning_rate": 7.374866693990474e-05, "loss": 1.5911, "step": 35420 }, { "epoch": 1.4568564025268007, "grad_norm": 0.2919900417327881, "learning_rate": 7.373426771079534e-05, "loss": 1.5827, "step": 35430 }, { "epoch": 1.4573983436923128, "grad_norm": 0.4928779900074005, "learning_rate": 7.37198661609662e-05, "loss": 1.5888, "step": 35440 }, { "epoch": 1.4579402848578251, "grad_norm": 0.25346246361732483, "learning_rate": 7.370546229220133e-05, "loss": 1.5951, "step": 35450 }, { "epoch": 1.4584822260233374, "grad_norm": 0.21547643840312958, "learning_rate": 7.369105610628498e-05, "loss": 1.5987, "step": 35460 }, { "epoch": 1.4590241671888495, "grad_norm": 0.26982036232948303, "learning_rate": 7.367664760500177e-05, "loss": 1.5837, "step": 35470 }, { "epoch": 1.4595661083543618, "grad_norm": 0.34236806631088257, "learning_rate": 7.366223679013652e-05, "loss": 1.595, "step": 35480 }, { "epoch": 1.460108049519874, "grad_norm": 0.41274112462997437, "learning_rate": 7.364782366347439e-05, "loss": 1.5924, "step": 35490 }, { "epoch": 1.4604332142191814, "eval_loss": 2.516339063644409, "eval_runtime": 21.9874, "eval_samples_per_second": 227.403, "eval_steps_per_second": 1.228, "step": 35496 }, { "epoch": 1.4606499906853863, "grad_norm": 0.6666343212127686, "learning_rate": 7.363340822680085e-05, "loss": 1.6145, "step": 35500 }, { "epoch": 1.4611919318508986, "grad_norm": 0.57722008228302, "learning_rate": 7.36189904819016e-05, "loss": 1.5893, "step": 35510 }, { "epoch": 1.4617338730164107, "grad_norm": 0.3001037836074829, "learning_rate": 7.360457043056263e-05, "loss": 1.5977, "step": 35520 }, { "epoch": 1.4622758141819228, "grad_norm": 0.3289230763912201, "learning_rate": 7.359014807457026e-05, "loss": 1.5955, "step": 35530 }, { "epoch": 1.462817755347435, "grad_norm": 0.26572784781455994, "learning_rate": 7.357572341571103e-05, "loss": 1.5957, "step": 35540 }, { "epoch": 1.4633596965129474, "grad_norm": 0.3806922435760498, "learning_rate": 7.356129645577186e-05, "loss": 1.5925, "step": 35550 }, { "epoch": 1.4639016376784595, "grad_norm": 0.31142738461494446, "learning_rate": 7.354686719653986e-05, "loss": 1.5968, "step": 35560 }, { "epoch": 1.4644435788439718, "grad_norm": 0.26502105593681335, "learning_rate": 7.353243563980247e-05, "loss": 1.6025, "step": 35570 }, { "epoch": 1.464985520009484, "grad_norm": 0.3371651768684387, "learning_rate": 7.351800178734741e-05, "loss": 1.5903, "step": 35580 }, { "epoch": 1.4651481023591377, "eval_loss": 2.5116758346557617, "eval_runtime": 21.9879, "eval_samples_per_second": 227.398, "eval_steps_per_second": 1.228, "step": 35583 }, { "epoch": 1.4655274611749962, "grad_norm": 0.3568730056285858, "learning_rate": 7.350356564096269e-05, "loss": 1.5913, "step": 35590 }, { "epoch": 1.4660694023405085, "grad_norm": 0.29186302423477173, "learning_rate": 7.348912720243661e-05, "loss": 1.5942, "step": 35600 }, { "epoch": 1.4666113435060206, "grad_norm": 0.2645188868045807, "learning_rate": 7.347468647355771e-05, "loss": 1.5962, "step": 35610 }, { "epoch": 1.4671532846715327, "grad_norm": 0.24721835553646088, "learning_rate": 7.346024345611485e-05, "loss": 1.5911, "step": 35620 }, { "epoch": 1.467695225837045, "grad_norm": 0.3433522582054138, "learning_rate": 7.344579815189718e-05, "loss": 1.5934, "step": 35630 }, { "epoch": 1.4682371670025574, "grad_norm": 0.28682151436805725, "learning_rate": 7.343135056269412e-05, "loss": 1.5839, "step": 35640 }, { "epoch": 1.4687791081680694, "grad_norm": 0.28299185633659363, "learning_rate": 7.341690069029538e-05, "loss": 1.5926, "step": 35650 }, { "epoch": 1.4693210493335818, "grad_norm": 0.3892780840396881, "learning_rate": 7.340244853649095e-05, "loss": 1.589, "step": 35660 }, { "epoch": 1.4698629904990939, "grad_norm": 0.296934574842453, "learning_rate": 7.338799410307107e-05, "loss": 1.6075, "step": 35670 }, { "epoch": 1.4698629904990939, "eval_loss": 2.520735740661621, "eval_runtime": 21.9779, "eval_samples_per_second": 227.501, "eval_steps_per_second": 1.229, "step": 35670 }, { "epoch": 1.4704049316646062, "grad_norm": 0.2969922423362732, "learning_rate": 7.337353739182631e-05, "loss": 1.5992, "step": 35680 }, { "epoch": 1.4709468728301185, "grad_norm": 0.24185840785503387, "learning_rate": 7.335907840454753e-05, "loss": 1.5935, "step": 35690 }, { "epoch": 1.4714888139956306, "grad_norm": 0.272232323884964, "learning_rate": 7.334461714302582e-05, "loss": 1.6053, "step": 35700 }, { "epoch": 1.472030755161143, "grad_norm": 0.2702069878578186, "learning_rate": 7.333015360905257e-05, "loss": 1.5961, "step": 35710 }, { "epoch": 1.472572696326655, "grad_norm": 0.28312546014785767, "learning_rate": 7.331568780441948e-05, "loss": 1.5832, "step": 35720 }, { "epoch": 1.4731146374921673, "grad_norm": 0.2772080898284912, "learning_rate": 7.330121973091849e-05, "loss": 1.6027, "step": 35730 }, { "epoch": 1.4736565786576794, "grad_norm": 0.2347124218940735, "learning_rate": 7.328674939034188e-05, "loss": 1.5872, "step": 35740 }, { "epoch": 1.4741985198231917, "grad_norm": 0.30288901925086975, "learning_rate": 7.327227678448215e-05, "loss": 1.5851, "step": 35750 }, { "epoch": 1.4745778786390502, "eval_loss": 2.5225229263305664, "eval_runtime": 21.6393, "eval_samples_per_second": 231.061, "eval_steps_per_second": 1.248, "step": 35757 }, { "epoch": 1.4747404609887038, "grad_norm": 0.3605928122997284, "learning_rate": 7.325780191513208e-05, "loss": 1.5919, "step": 35760 }, { "epoch": 1.4752824021542161, "grad_norm": 0.30377334356307983, "learning_rate": 7.324332478408478e-05, "loss": 1.5886, "step": 35770 }, { "epoch": 1.4758243433197284, "grad_norm": 0.32762688398361206, "learning_rate": 7.322884539313361e-05, "loss": 1.5903, "step": 35780 }, { "epoch": 1.4763662844852405, "grad_norm": 0.29475733637809753, "learning_rate": 7.321436374407222e-05, "loss": 1.59, "step": 35790 }, { "epoch": 1.4769082256507529, "grad_norm": 0.2471809685230255, "learning_rate": 7.319987983869454e-05, "loss": 1.5876, "step": 35800 }, { "epoch": 1.477450166816265, "grad_norm": 0.36135897040367126, "learning_rate": 7.318539367879475e-05, "loss": 1.5946, "step": 35810 }, { "epoch": 1.4779921079817773, "grad_norm": 0.42719608545303345, "learning_rate": 7.317090526616733e-05, "loss": 1.593, "step": 35820 }, { "epoch": 1.4785340491472894, "grad_norm": 0.22060419619083405, "learning_rate": 7.315641460260708e-05, "loss": 1.5905, "step": 35830 }, { "epoch": 1.4790759903128017, "grad_norm": 0.27766963839530945, "learning_rate": 7.3141921689909e-05, "loss": 1.5923, "step": 35840 }, { "epoch": 1.4792927667790066, "eval_loss": 2.5205538272857666, "eval_runtime": 21.9888, "eval_samples_per_second": 227.388, "eval_steps_per_second": 1.228, "step": 35844 }, { "epoch": 1.4796179314783138, "grad_norm": 0.21527595818042755, "learning_rate": 7.312742652986844e-05, "loss": 1.5926, "step": 35850 }, { "epoch": 1.480159872643826, "grad_norm": 0.2089533805847168, "learning_rate": 7.311292912428097e-05, "loss": 1.582, "step": 35860 }, { "epoch": 1.4807018138093384, "grad_norm": 0.2418367564678192, "learning_rate": 7.309842947494248e-05, "loss": 1.5862, "step": 35870 }, { "epoch": 1.4812437549748505, "grad_norm": 0.2674960494041443, "learning_rate": 7.308392758364912e-05, "loss": 1.5865, "step": 35880 }, { "epoch": 1.4817856961403628, "grad_norm": 0.24217753112316132, "learning_rate": 7.306942345219733e-05, "loss": 1.5894, "step": 35890 }, { "epoch": 1.482327637305875, "grad_norm": 0.2955038249492645, "learning_rate": 7.305491708238381e-05, "loss": 1.5957, "step": 35900 }, { "epoch": 1.4828695784713872, "grad_norm": 0.4298487603664398, "learning_rate": 7.304040847600555e-05, "loss": 1.5872, "step": 35910 }, { "epoch": 1.4834115196368995, "grad_norm": 0.3208651542663574, "learning_rate": 7.30258976348598e-05, "loss": 1.5896, "step": 35920 }, { "epoch": 1.4839534608024116, "grad_norm": 0.3353649079799652, "learning_rate": 7.301138456074414e-05, "loss": 1.5863, "step": 35930 }, { "epoch": 1.484007654918963, "eval_loss": 2.5121777057647705, "eval_runtime": 21.989, "eval_samples_per_second": 227.386, "eval_steps_per_second": 1.228, "step": 35931 }, { "epoch": 1.4844954019679237, "grad_norm": 0.2195397913455963, "learning_rate": 7.299686925545633e-05, "loss": 1.5874, "step": 35940 }, { "epoch": 1.485037343133436, "grad_norm": 0.27811160683631897, "learning_rate": 7.298235172079451e-05, "loss": 1.5921, "step": 35950 }, { "epoch": 1.4855792842989484, "grad_norm": 0.3784521818161011, "learning_rate": 7.296783195855703e-05, "loss": 1.5812, "step": 35960 }, { "epoch": 1.4861212254644605, "grad_norm": 0.20319710671901703, "learning_rate": 7.295330997054251e-05, "loss": 1.5943, "step": 35970 }, { "epoch": 1.4866631666299728, "grad_norm": 0.45995742082595825, "learning_rate": 7.293878575854993e-05, "loss": 1.5899, "step": 35980 }, { "epoch": 1.4872051077954849, "grad_norm": 0.4735308289527893, "learning_rate": 7.292425932437843e-05, "loss": 1.5944, "step": 35990 }, { "epoch": 1.4877470489609972, "grad_norm": 0.2938894033432007, "learning_rate": 7.290973066982752e-05, "loss": 1.5885, "step": 36000 }, { "epoch": 1.4882889901265095, "grad_norm": 0.4378818869590759, "learning_rate": 7.289519979669693e-05, "loss": 1.5856, "step": 36010 }, { "epoch": 1.4887225430589193, "eval_loss": 2.5050690174102783, "eval_runtime": 21.9834, "eval_samples_per_second": 227.444, "eval_steps_per_second": 1.228, "step": 36018 }, { "epoch": 1.4888309312920216, "grad_norm": 0.5193374752998352, "learning_rate": 7.288066670678665e-05, "loss": 1.6, "step": 36020 }, { "epoch": 1.4893728724575337, "grad_norm": 0.3998728394508362, "learning_rate": 7.286613140189704e-05, "loss": 1.5935, "step": 36030 }, { "epoch": 1.489914813623046, "grad_norm": 0.2564713954925537, "learning_rate": 7.285159388382864e-05, "loss": 1.5882, "step": 36040 }, { "epoch": 1.4904567547885583, "grad_norm": 0.2355422079563141, "learning_rate": 7.283705415438228e-05, "loss": 1.5895, "step": 36050 }, { "epoch": 1.4909986959540704, "grad_norm": 0.226087749004364, "learning_rate": 7.282251221535908e-05, "loss": 1.5908, "step": 36060 }, { "epoch": 1.4915406371195827, "grad_norm": 0.3110443353652954, "learning_rate": 7.280796806856048e-05, "loss": 1.5861, "step": 36070 }, { "epoch": 1.4920825782850948, "grad_norm": 0.2450808882713318, "learning_rate": 7.27934217157881e-05, "loss": 1.5803, "step": 36080 }, { "epoch": 1.4926245194506071, "grad_norm": 0.27178075909614563, "learning_rate": 7.277887315884388e-05, "loss": 1.5887, "step": 36090 }, { "epoch": 1.4931664606161195, "grad_norm": 0.35005462169647217, "learning_rate": 7.276432239953004e-05, "loss": 1.5939, "step": 36100 }, { "epoch": 1.4934374311988754, "eval_loss": 2.5150177478790283, "eval_runtime": 21.9887, "eval_samples_per_second": 227.39, "eval_steps_per_second": 1.228, "step": 36105 }, { "epoch": 1.4937084017816316, "grad_norm": 0.27534955739974976, "learning_rate": 7.274976943964906e-05, "loss": 1.5847, "step": 36110 }, { "epoch": 1.4942503429471439, "grad_norm": 0.29644301533699036, "learning_rate": 7.273521428100372e-05, "loss": 1.5859, "step": 36120 }, { "epoch": 1.494792284112656, "grad_norm": 0.37286531925201416, "learning_rate": 7.272065692539701e-05, "loss": 1.6031, "step": 36130 }, { "epoch": 1.4953342252781683, "grad_norm": 0.3895072340965271, "learning_rate": 7.270609737463229e-05, "loss": 1.5959, "step": 36140 }, { "epoch": 1.4958761664436804, "grad_norm": 0.3287084996700287, "learning_rate": 7.269153563051306e-05, "loss": 1.5838, "step": 36150 }, { "epoch": 1.4964181076091927, "grad_norm": 0.4030684530735016, "learning_rate": 7.267697169484323e-05, "loss": 1.58, "step": 36160 }, { "epoch": 1.4969600487747048, "grad_norm": 0.32397374510765076, "learning_rate": 7.26624055694269e-05, "loss": 1.5884, "step": 36170 }, { "epoch": 1.497501989940217, "grad_norm": 0.268887996673584, "learning_rate": 7.264783725606843e-05, "loss": 1.5928, "step": 36180 }, { "epoch": 1.4980439311057294, "grad_norm": 0.32079851627349854, "learning_rate": 7.263326675657251e-05, "loss": 1.5925, "step": 36190 }, { "epoch": 1.4981523193388318, "eval_loss": 2.512747049331665, "eval_runtime": 21.9853, "eval_samples_per_second": 227.425, "eval_steps_per_second": 1.228, "step": 36192 }, { "epoch": 1.4985858722712415, "grad_norm": 0.27530887722969055, "learning_rate": 7.261869407274406e-05, "loss": 1.5871, "step": 36200 }, { "epoch": 1.4991278134367538, "grad_norm": 0.39330941438674927, "learning_rate": 7.260411920638828e-05, "loss": 1.5856, "step": 36210 }, { "epoch": 1.499669754602266, "grad_norm": 0.23756785690784454, "learning_rate": 7.258954215931064e-05, "loss": 1.5878, "step": 36220 }, { "epoch": 1.5002116957677782, "grad_norm": 0.3410539925098419, "learning_rate": 7.257496293331688e-05, "loss": 1.6006, "step": 36230 }, { "epoch": 1.5007536369332906, "grad_norm": 0.3402417302131653, "learning_rate": 7.256038153021303e-05, "loss": 1.5809, "step": 36240 }, { "epoch": 1.5012955780988027, "grad_norm": 0.25963348150253296, "learning_rate": 7.254579795180534e-05, "loss": 1.58, "step": 36250 }, { "epoch": 1.5018375192643147, "grad_norm": 0.37318065762519836, "learning_rate": 7.253121219990038e-05, "loss": 1.5912, "step": 36260 }, { "epoch": 1.502379460429827, "grad_norm": 0.45932379364967346, "learning_rate": 7.251662427630496e-05, "loss": 1.5933, "step": 36270 }, { "epoch": 1.502867207478788, "eval_loss": 2.5038368701934814, "eval_runtime": 21.9923, "eval_samples_per_second": 227.352, "eval_steps_per_second": 1.228, "step": 36279 }, { "epoch": 1.5029214015953394, "grad_norm": 0.3073781430721283, "learning_rate": 7.25020341828262e-05, "loss": 1.5869, "step": 36280 }, { "epoch": 1.5034633427608515, "grad_norm": 0.29523077607154846, "learning_rate": 7.24874419212714e-05, "loss": 1.5864, "step": 36290 }, { "epoch": 1.5040052839263638, "grad_norm": 0.3561224341392517, "learning_rate": 7.247284749344824e-05, "loss": 1.5904, "step": 36300 }, { "epoch": 1.5045472250918759, "grad_norm": 0.26353198289871216, "learning_rate": 7.245825090116457e-05, "loss": 1.5867, "step": 36310 }, { "epoch": 1.5050891662573882, "grad_norm": 0.2485145479440689, "learning_rate": 7.244365214622859e-05, "loss": 1.5951, "step": 36320 }, { "epoch": 1.5056311074229005, "grad_norm": 0.21996235847473145, "learning_rate": 7.242905123044872e-05, "loss": 1.5847, "step": 36330 }, { "epoch": 1.5061730485884126, "grad_norm": 0.32352975010871887, "learning_rate": 7.241444815563364e-05, "loss": 1.5844, "step": 36340 }, { "epoch": 1.5067149897539247, "grad_norm": 0.3241994380950928, "learning_rate": 7.239984292359232e-05, "loss": 1.5895, "step": 36350 }, { "epoch": 1.507256930919437, "grad_norm": 0.7218349575996399, "learning_rate": 7.238523553613402e-05, "loss": 1.5877, "step": 36360 }, { "epoch": 1.5075820956187442, "eval_loss": 2.5044937133789062, "eval_runtime": 21.9867, "eval_samples_per_second": 227.411, "eval_steps_per_second": 1.228, "step": 36366 }, { "epoch": 1.5077988720849493, "grad_norm": 0.5906317830085754, "learning_rate": 7.237062599506821e-05, "loss": 1.589, "step": 36370 }, { "epoch": 1.5083408132504617, "grad_norm": 0.43497881293296814, "learning_rate": 7.23560143022047e-05, "loss": 1.5833, "step": 36380 }, { "epoch": 1.5088827544159737, "grad_norm": 0.3339081406593323, "learning_rate": 7.234140045935345e-05, "loss": 1.6007, "step": 36390 }, { "epoch": 1.5094246955814858, "grad_norm": 0.2763802111148834, "learning_rate": 7.23267844683248e-05, "loss": 1.5971, "step": 36400 }, { "epoch": 1.5099666367469982, "grad_norm": 0.3865683376789093, "learning_rate": 7.231216633092934e-05, "loss": 1.5844, "step": 36410 }, { "epoch": 1.5105085779125105, "grad_norm": 0.32255837321281433, "learning_rate": 7.229754604897786e-05, "loss": 1.5878, "step": 36420 }, { "epoch": 1.5110505190780226, "grad_norm": 0.29356804490089417, "learning_rate": 7.228292362428148e-05, "loss": 1.5826, "step": 36430 }, { "epoch": 1.5115924602435347, "grad_norm": 0.2429652065038681, "learning_rate": 7.226829905865156e-05, "loss": 1.5873, "step": 36440 }, { "epoch": 1.512134401409047, "grad_norm": 0.39031389355659485, "learning_rate": 7.225367235389972e-05, "loss": 1.5894, "step": 36450 }, { "epoch": 1.5122969837587008, "eval_loss": 2.497527837753296, "eval_runtime": 21.9849, "eval_samples_per_second": 227.429, "eval_steps_per_second": 1.228, "step": 36453 }, { "epoch": 1.5126763425745593, "grad_norm": 0.27026987075805664, "learning_rate": 7.223904351183786e-05, "loss": 1.5877, "step": 36460 }, { "epoch": 1.5132182837400716, "grad_norm": 0.26341304183006287, "learning_rate": 7.222441253427813e-05, "loss": 1.5977, "step": 36470 }, { "epoch": 1.5137602249055837, "grad_norm": 0.2732945382595062, "learning_rate": 7.220977942303295e-05, "loss": 1.5945, "step": 36480 }, { "epoch": 1.5143021660710958, "grad_norm": 0.29458683729171753, "learning_rate": 7.219514417991505e-05, "loss": 1.5861, "step": 36490 }, { "epoch": 1.5148441072366081, "grad_norm": 0.5213008522987366, "learning_rate": 7.218050680673729e-05, "loss": 1.5707, "step": 36500 }, { "epoch": 1.5153860484021204, "grad_norm": 0.4199194014072418, "learning_rate": 7.216586730531296e-05, "loss": 1.5796, "step": 36510 }, { "epoch": 1.5159279895676325, "grad_norm": 0.23940829932689667, "learning_rate": 7.215122567745552e-05, "loss": 1.5849, "step": 36520 }, { "epoch": 1.5164699307331446, "grad_norm": 0.2722623944282532, "learning_rate": 7.213658192497869e-05, "loss": 1.5892, "step": 36530 }, { "epoch": 1.517011871898657, "grad_norm": 0.24977056682109833, "learning_rate": 7.212193604969652e-05, "loss": 1.5806, "step": 36540 }, { "epoch": 1.517011871898657, "eval_loss": 2.511324644088745, "eval_runtime": 21.9284, "eval_samples_per_second": 228.015, "eval_steps_per_second": 1.231, "step": 36540 }, { "epoch": 1.5175538130641693, "grad_norm": 0.4285825788974762, "learning_rate": 7.21072880534232e-05, "loss": 1.5742, "step": 36550 }, { "epoch": 1.5180957542296816, "grad_norm": 0.72999107837677, "learning_rate": 7.209263793797335e-05, "loss": 1.5943, "step": 36560 }, { "epoch": 1.5186376953951937, "grad_norm": 0.4707547426223755, "learning_rate": 7.207798570516172e-05, "loss": 1.5901, "step": 36570 }, { "epoch": 1.5191796365607058, "grad_norm": 0.25756052136421204, "learning_rate": 7.206333135680336e-05, "loss": 1.5844, "step": 36580 }, { "epoch": 1.519721577726218, "grad_norm": 0.21552801132202148, "learning_rate": 7.204867489471359e-05, "loss": 1.5989, "step": 36590 }, { "epoch": 1.5202635188917304, "grad_norm": 0.3025147616863251, "learning_rate": 7.203401632070798e-05, "loss": 1.5821, "step": 36600 }, { "epoch": 1.5208054600572425, "grad_norm": 0.3975871801376343, "learning_rate": 7.201935563660239e-05, "loss": 1.5842, "step": 36610 }, { "epoch": 1.5213474012227548, "grad_norm": 0.2734328806400299, "learning_rate": 7.200469284421292e-05, "loss": 1.5842, "step": 36620 }, { "epoch": 1.5217267600386133, "eval_loss": 2.4983890056610107, "eval_runtime": 21.9821, "eval_samples_per_second": 227.458, "eval_steps_per_second": 1.228, "step": 36627 }, { "epoch": 1.521889342388267, "grad_norm": 0.23720048367977142, "learning_rate": 7.199002794535593e-05, "loss": 1.5873, "step": 36630 }, { "epoch": 1.5224312835537792, "grad_norm": 0.26733702421188354, "learning_rate": 7.197536094184803e-05, "loss": 1.5868, "step": 36640 }, { "epoch": 1.5229732247192915, "grad_norm": 0.3890104293823242, "learning_rate": 7.196069183550612e-05, "loss": 1.5894, "step": 36650 }, { "epoch": 1.5235151658848036, "grad_norm": 0.23871298134326935, "learning_rate": 7.194602062814733e-05, "loss": 1.5899, "step": 36660 }, { "epoch": 1.5240571070503157, "grad_norm": 0.20540666580200195, "learning_rate": 7.19313473215891e-05, "loss": 1.585, "step": 36670 }, { "epoch": 1.524599048215828, "grad_norm": 0.2062886506319046, "learning_rate": 7.191667191764906e-05, "loss": 1.5882, "step": 36680 }, { "epoch": 1.5251409893813404, "grad_norm": 0.2321225106716156, "learning_rate": 7.190199441814516e-05, "loss": 1.5901, "step": 36690 }, { "epoch": 1.5256829305468527, "grad_norm": 0.4270740747451782, "learning_rate": 7.188731482489556e-05, "loss": 1.5668, "step": 36700 }, { "epoch": 1.5262248717123648, "grad_norm": 0.34541916847229004, "learning_rate": 7.187263313971872e-05, "loss": 1.5925, "step": 36710 }, { "epoch": 1.5264416481785696, "eval_loss": 2.493448495864868, "eval_runtime": 21.9846, "eval_samples_per_second": 227.432, "eval_steps_per_second": 1.228, "step": 36714 }, { "epoch": 1.5267668128778769, "grad_norm": 0.44076773524284363, "learning_rate": 7.185794936443334e-05, "loss": 1.5788, "step": 36720 }, { "epoch": 1.5273087540433892, "grad_norm": 0.2881830334663391, "learning_rate": 7.184326350085839e-05, "loss": 1.5871, "step": 36730 }, { "epoch": 1.5278506952089015, "grad_norm": 0.4518699645996094, "learning_rate": 7.182857555081307e-05, "loss": 1.5941, "step": 36740 }, { "epoch": 1.5283926363744136, "grad_norm": 0.48421424627304077, "learning_rate": 7.181388551611688e-05, "loss": 1.5865, "step": 36750 }, { "epoch": 1.5289345775399257, "grad_norm": 0.2462109625339508, "learning_rate": 7.179919339858955e-05, "loss": 1.5774, "step": 36760 }, { "epoch": 1.529476518705438, "grad_norm": 0.25756916403770447, "learning_rate": 7.178449920005108e-05, "loss": 1.5809, "step": 36770 }, { "epoch": 1.5300184598709503, "grad_norm": 0.28613096475601196, "learning_rate": 7.176980292232173e-05, "loss": 1.5815, "step": 36780 }, { "epoch": 1.5305604010364626, "grad_norm": 0.3218177258968353, "learning_rate": 7.175510456722198e-05, "loss": 1.5914, "step": 36790 }, { "epoch": 1.5311023422019747, "grad_norm": 0.42091110348701477, "learning_rate": 7.174040413657262e-05, "loss": 1.5778, "step": 36800 }, { "epoch": 1.5311565363185258, "eval_loss": 2.505159854888916, "eval_runtime": 21.9856, "eval_samples_per_second": 227.422, "eval_steps_per_second": 1.228, "step": 36801 }, { "epoch": 1.5316442833674868, "grad_norm": 0.23221740126609802, "learning_rate": 7.17257016321947e-05, "loss": 1.5965, "step": 36810 }, { "epoch": 1.5321862245329991, "grad_norm": 0.28910213708877563, "learning_rate": 7.171099705590946e-05, "loss": 1.5775, "step": 36820 }, { "epoch": 1.5327281656985114, "grad_norm": 0.25105226039886475, "learning_rate": 7.169629040953846e-05, "loss": 1.5871, "step": 36830 }, { "epoch": 1.5332701068640235, "grad_norm": 0.2661377489566803, "learning_rate": 7.168158169490347e-05, "loss": 1.5805, "step": 36840 }, { "epoch": 1.5338120480295356, "grad_norm": 0.28218016028404236, "learning_rate": 7.166687091382659e-05, "loss": 1.5916, "step": 36850 }, { "epoch": 1.534353989195048, "grad_norm": 0.3052498400211334, "learning_rate": 7.165215806813009e-05, "loss": 1.5885, "step": 36860 }, { "epoch": 1.5348959303605603, "grad_norm": 0.2858087718486786, "learning_rate": 7.163744315963656e-05, "loss": 1.5752, "step": 36870 }, { "epoch": 1.5354378715260726, "grad_norm": 0.2672872245311737, "learning_rate": 7.162272619016879e-05, "loss": 1.5828, "step": 36880 }, { "epoch": 1.5358714244584823, "eval_loss": 2.495072364807129, "eval_runtime": 21.9812, "eval_samples_per_second": 227.467, "eval_steps_per_second": 1.228, "step": 36888 }, { "epoch": 1.5359798126915847, "grad_norm": 0.3383979797363281, "learning_rate": 7.160800716154987e-05, "loss": 1.5818, "step": 36890 }, { "epoch": 1.5365217538570968, "grad_norm": 0.3658377528190613, "learning_rate": 7.159328607560312e-05, "loss": 1.5777, "step": 36900 }, { "epoch": 1.537063695022609, "grad_norm": 0.2234833836555481, "learning_rate": 7.157856293415216e-05, "loss": 1.5839, "step": 36910 }, { "epoch": 1.5376056361881214, "grad_norm": 0.2896660566329956, "learning_rate": 7.156383773902076e-05, "loss": 1.5878, "step": 36920 }, { "epoch": 1.5381475773536335, "grad_norm": 0.37888798117637634, "learning_rate": 7.15491104920331e-05, "loss": 1.5934, "step": 36930 }, { "epoch": 1.5386895185191458, "grad_norm": 0.49281513690948486, "learning_rate": 7.153438119501346e-05, "loss": 1.5918, "step": 36940 }, { "epoch": 1.539231459684658, "grad_norm": 0.5974851846694946, "learning_rate": 7.151964984978645e-05, "loss": 1.59, "step": 36950 }, { "epoch": 1.5397734008501702, "grad_norm": 0.28145140409469604, "learning_rate": 7.150491645817695e-05, "loss": 1.5886, "step": 36960 }, { "epoch": 1.5403153420156825, "grad_norm": 0.2434787005186081, "learning_rate": 7.149018102201007e-05, "loss": 1.5792, "step": 36970 }, { "epoch": 1.5405863125984385, "eval_loss": 2.5023746490478516, "eval_runtime": 21.9846, "eval_samples_per_second": 227.432, "eval_steps_per_second": 1.228, "step": 36975 }, { "epoch": 1.5408572831811946, "grad_norm": 0.2465183436870575, "learning_rate": 7.147544354311114e-05, "loss": 1.5838, "step": 36980 }, { "epoch": 1.5413992243467067, "grad_norm": 0.23497800529003143, "learning_rate": 7.146070402330578e-05, "loss": 1.5924, "step": 36990 }, { "epoch": 1.541941165512219, "grad_norm": 0.3130892217159271, "learning_rate": 7.14459624644199e-05, "loss": 1.5842, "step": 37000 }, { "epoch": 1.5424831066777314, "grad_norm": 0.26590847969055176, "learning_rate": 7.143121886827955e-05, "loss": 1.5869, "step": 37010 }, { "epoch": 1.5430250478432437, "grad_norm": 0.22026236355304718, "learning_rate": 7.141647323671119e-05, "loss": 1.5772, "step": 37020 }, { "epoch": 1.5435669890087558, "grad_norm": 0.3443725109100342, "learning_rate": 7.140172557154138e-05, "loss": 1.5823, "step": 37030 }, { "epoch": 1.5441089301742679, "grad_norm": 0.3271442949771881, "learning_rate": 7.1386975874597e-05, "loss": 1.5902, "step": 37040 }, { "epoch": 1.5446508713397802, "grad_norm": 0.3126424252986908, "learning_rate": 7.137222414770522e-05, "loss": 1.5913, "step": 37050 }, { "epoch": 1.5451928125052925, "grad_norm": 0.3820621073246002, "learning_rate": 7.135747039269339e-05, "loss": 1.5824, "step": 37060 }, { "epoch": 1.5453012007383948, "eval_loss": 2.5038020610809326, "eval_runtime": 21.984, "eval_samples_per_second": 227.438, "eval_steps_per_second": 1.228, "step": 37062 }, { "epoch": 1.5457347536708046, "grad_norm": 0.29371535778045654, "learning_rate": 7.134271461138913e-05, "loss": 1.5818, "step": 37070 }, { "epoch": 1.5462766948363167, "grad_norm": 0.3977007567882538, "learning_rate": 7.132795680562035e-05, "loss": 1.5777, "step": 37080 }, { "epoch": 1.546818636001829, "grad_norm": 0.21876034140586853, "learning_rate": 7.131319697721515e-05, "loss": 1.5834, "step": 37090 }, { "epoch": 1.5473605771673413, "grad_norm": 0.28337687253952026, "learning_rate": 7.129843512800196e-05, "loss": 1.5842, "step": 37100 }, { "epoch": 1.5479025183328536, "grad_norm": 0.2559991180896759, "learning_rate": 7.128367125980938e-05, "loss": 1.5773, "step": 37110 }, { "epoch": 1.5484444594983657, "grad_norm": 0.442965567111969, "learning_rate": 7.126890537446628e-05, "loss": 1.5765, "step": 37120 }, { "epoch": 1.5489864006638778, "grad_norm": 0.4674750566482544, "learning_rate": 7.125413747380183e-05, "loss": 1.5938, "step": 37130 }, { "epoch": 1.5495283418293901, "grad_norm": 0.30193501710891724, "learning_rate": 7.12393675596454e-05, "loss": 1.585, "step": 37140 }, { "epoch": 1.5500160888783512, "eval_loss": 2.501018524169922, "eval_runtime": 21.9863, "eval_samples_per_second": 227.415, "eval_steps_per_second": 1.228, "step": 37149 }, { "epoch": 1.5500702829949025, "grad_norm": 0.4235888719558716, "learning_rate": 7.122459563382663e-05, "loss": 1.5755, "step": 37150 }, { "epoch": 1.5506122241604146, "grad_norm": 0.2661120593547821, "learning_rate": 7.120982169817538e-05, "loss": 1.5908, "step": 37160 }, { "epoch": 1.5511541653259266, "grad_norm": 0.2776598334312439, "learning_rate": 7.119504575452179e-05, "loss": 1.5824, "step": 37170 }, { "epoch": 1.551696106491439, "grad_norm": 0.44518810510635376, "learning_rate": 7.118026780469626e-05, "loss": 1.5958, "step": 37180 }, { "epoch": 1.5522380476569513, "grad_norm": 0.19919413328170776, "learning_rate": 7.116548785052938e-05, "loss": 1.5844, "step": 37190 }, { "epoch": 1.5527799888224636, "grad_norm": 0.27062350511550903, "learning_rate": 7.115070589385206e-05, "loss": 1.5944, "step": 37200 }, { "epoch": 1.5533219299879757, "grad_norm": 0.25841081142425537, "learning_rate": 7.113592193649542e-05, "loss": 1.5645, "step": 37210 }, { "epoch": 1.5538638711534878, "grad_norm": 0.22730769217014313, "learning_rate": 7.112113598029083e-05, "loss": 1.5833, "step": 37220 }, { "epoch": 1.554405812319, "grad_norm": 0.22557532787322998, "learning_rate": 7.110634802706988e-05, "loss": 1.5742, "step": 37230 }, { "epoch": 1.5547309770183073, "eval_loss": 2.501530885696411, "eval_runtime": 21.9894, "eval_samples_per_second": 227.383, "eval_steps_per_second": 1.228, "step": 37236 }, { "epoch": 1.5549477534845124, "grad_norm": 0.24140417575836182, "learning_rate": 7.109155807866449e-05, "loss": 1.5807, "step": 37240 }, { "epoch": 1.5554896946500245, "grad_norm": 0.33359408378601074, "learning_rate": 7.107676613690673e-05, "loss": 1.5806, "step": 37250 }, { "epoch": 1.5560316358155368, "grad_norm": 0.3180777132511139, "learning_rate": 7.1061972203629e-05, "loss": 1.577, "step": 37260 }, { "epoch": 1.556573576981049, "grad_norm": 0.29410409927368164, "learning_rate": 7.104717628066387e-05, "loss": 1.5777, "step": 37270 }, { "epoch": 1.5571155181465612, "grad_norm": 0.4459421932697296, "learning_rate": 7.103237836984421e-05, "loss": 1.5918, "step": 37280 }, { "epoch": 1.5576574593120736, "grad_norm": 0.24809594452381134, "learning_rate": 7.101757847300312e-05, "loss": 1.583, "step": 37290 }, { "epoch": 1.5581994004775856, "grad_norm": 0.2839214503765106, "learning_rate": 7.100277659197396e-05, "loss": 1.5854, "step": 37300 }, { "epoch": 1.5587413416430977, "grad_norm": 0.32646796107292175, "learning_rate": 7.098797272859032e-05, "loss": 1.5824, "step": 37310 }, { "epoch": 1.55928328280861, "grad_norm": 0.3208850920200348, "learning_rate": 7.097316688468602e-05, "loss": 1.5793, "step": 37320 }, { "epoch": 1.5594458651582639, "eval_loss": 2.494997024536133, "eval_runtime": 21.9879, "eval_samples_per_second": 227.397, "eval_steps_per_second": 1.228, "step": 37323 }, { "epoch": 1.5598252239741224, "grad_norm": 0.2501738965511322, "learning_rate": 7.095835906209515e-05, "loss": 1.59, "step": 37330 }, { "epoch": 1.5603671651396347, "grad_norm": 0.3885467052459717, "learning_rate": 7.094354926265206e-05, "loss": 1.5841, "step": 37340 }, { "epoch": 1.5609091063051468, "grad_norm": 0.48911768198013306, "learning_rate": 7.09287374881913e-05, "loss": 1.5772, "step": 37350 }, { "epoch": 1.5614510474706589, "grad_norm": 0.3899608552455902, "learning_rate": 7.09139237405477e-05, "loss": 1.5811, "step": 37360 }, { "epoch": 1.5619929886361712, "grad_norm": 0.44720959663391113, "learning_rate": 7.089910802155631e-05, "loss": 1.5769, "step": 37370 }, { "epoch": 1.5625349298016835, "grad_norm": 0.3570062816143036, "learning_rate": 7.088429033305245e-05, "loss": 1.5878, "step": 37380 }, { "epoch": 1.5630768709671956, "grad_norm": 0.3376300036907196, "learning_rate": 7.086947067687167e-05, "loss": 1.5816, "step": 37390 }, { "epoch": 1.5636188121327077, "grad_norm": 0.24573005735874176, "learning_rate": 7.085464905484974e-05, "loss": 1.58, "step": 37400 }, { "epoch": 1.56416075329822, "grad_norm": 0.3362915813922882, "learning_rate": 7.083982546882275e-05, "loss": 1.5777, "step": 37410 }, { "epoch": 1.56416075329822, "eval_loss": 2.491037368774414, "eval_runtime": 21.982, "eval_samples_per_second": 227.459, "eval_steps_per_second": 1.228, "step": 37410 }, { "epoch": 1.5647026944637323, "grad_norm": 0.37921565771102905, "learning_rate": 7.082499992062693e-05, "loss": 1.5786, "step": 37420 }, { "epoch": 1.5652446356292447, "grad_norm": 0.3702291250228882, "learning_rate": 7.081017241209883e-05, "loss": 1.5851, "step": 37430 }, { "epoch": 1.5657865767947567, "grad_norm": 0.23378178477287292, "learning_rate": 7.07953429450752e-05, "loss": 1.5818, "step": 37440 }, { "epoch": 1.5663285179602688, "grad_norm": 0.3024660646915436, "learning_rate": 7.078051152139308e-05, "loss": 1.5752, "step": 37450 }, { "epoch": 1.5668704591257812, "grad_norm": 0.25815534591674805, "learning_rate": 7.076567814288971e-05, "loss": 1.5895, "step": 37460 }, { "epoch": 1.5674124002912935, "grad_norm": 0.35706350207328796, "learning_rate": 7.075084281140256e-05, "loss": 1.567, "step": 37470 }, { "epoch": 1.5679543414568056, "grad_norm": 0.34599047899246216, "learning_rate": 7.07360055287694e-05, "loss": 1.5831, "step": 37480 }, { "epoch": 1.5684962826223177, "grad_norm": 0.22840653359889984, "learning_rate": 7.072116629682819e-05, "loss": 1.5795, "step": 37490 }, { "epoch": 1.5688756414381764, "eval_loss": 2.498384714126587, "eval_runtime": 21.9865, "eval_samples_per_second": 227.412, "eval_steps_per_second": 1.228, "step": 37497 }, { "epoch": 1.56903822378783, "grad_norm": 0.24497516453266144, "learning_rate": 7.070632511741717e-05, "loss": 1.5712, "step": 37500 }, { "epoch": 1.5695801649533423, "grad_norm": 0.3624199628829956, "learning_rate": 7.069148199237476e-05, "loss": 1.5803, "step": 37510 }, { "epoch": 2.000325164699307, "grad_norm": 0.2853184640407562, "learning_rate": 2.0000000000000002e-07, "loss": 1.5294, "step": 37520 }, { "epoch": 2.0008671058648195, "grad_norm": 0.2135554403066635, "learning_rate": 5.333333333333333e-07, "loss": 1.5365, "step": 37530 }, { "epoch": 2.001409047030332, "grad_norm": 0.20927861332893372, "learning_rate": 8.666666666666667e-07, "loss": 1.5365, "step": 37540 }, { "epoch": 2.001950988195844, "grad_norm": 0.17029166221618652, "learning_rate": 1.2000000000000002e-06, "loss": 1.5374, "step": 37550 }, { "epoch": 2.002492929361356, "grad_norm": 0.16770167648792267, "learning_rate": 1.5333333333333334e-06, "loss": 1.5412, "step": 37560 }, { "epoch": 2.0030348705268683, "grad_norm": 0.17810305953025818, "learning_rate": 1.8666666666666669e-06, "loss": 1.5236, "step": 37570 }, { "epoch": 2.0035768116923807, "grad_norm": 0.18917031586170197, "learning_rate": 2.2e-06, "loss": 1.5386, "step": 37580 }, { "epoch": 2.0037935881585853, "eval_loss": 2.4802141189575195, "eval_runtime": 22.3304, "eval_samples_per_second": 223.91, "eval_steps_per_second": 1.209, "step": 37584 }, { "epoch": 2.004118752857893, "grad_norm": 0.1666443794965744, "learning_rate": 2.5333333333333334e-06, "loss": 1.5323, "step": 37590 }, { "epoch": 2.0046606940234053, "grad_norm": 0.16958613693714142, "learning_rate": 2.8666666666666666e-06, "loss": 1.5331, "step": 37600 }, { "epoch": 2.005202635188917, "grad_norm": 0.16641965508460999, "learning_rate": 3.2000000000000003e-06, "loss": 1.5381, "step": 37610 }, { "epoch": 2.0057445763544295, "grad_norm": 0.18020287156105042, "learning_rate": 3.5333333333333335e-06, "loss": 1.5293, "step": 37620 }, { "epoch": 2.006286517519942, "grad_norm": 0.16953766345977783, "learning_rate": 3.866666666666667e-06, "loss": 1.5372, "step": 37630 }, { "epoch": 2.006828458685454, "grad_norm": 0.16563622653484344, "learning_rate": 4.2000000000000004e-06, "loss": 1.53, "step": 37640 }, { "epoch": 2.007370399850966, "grad_norm": 0.1674739271402359, "learning_rate": 4.533333333333334e-06, "loss": 1.5224, "step": 37650 }, { "epoch": 2.0079123410164783, "grad_norm": 0.17399495840072632, "learning_rate": 4.866666666666667e-06, "loss": 1.5198, "step": 37660 }, { "epoch": 2.0084542821819906, "grad_norm": 0.166429802775383, "learning_rate": 5.2e-06, "loss": 1.5246, "step": 37670 }, { "epoch": 2.008508476298542, "eval_loss": 2.476987600326538, "eval_runtime": 27.0724, "eval_samples_per_second": 184.69, "eval_steps_per_second": 0.997, "step": 37671 }, { "epoch": 2.008996223347503, "grad_norm": 0.17933695018291473, "learning_rate": 5.5333333333333334e-06, "loss": 1.5297, "step": 37680 }, { "epoch": 2.0095381645130153, "grad_norm": 0.16821593046188354, "learning_rate": 5.866666666666667e-06, "loss": 1.5318, "step": 37690 }, { "epoch": 2.010080105678527, "grad_norm": 0.17084164917469025, "learning_rate": 6.2e-06, "loss": 1.5276, "step": 37700 }, { "epoch": 2.0106220468440394, "grad_norm": 0.17240796983242035, "learning_rate": 6.533333333333333e-06, "loss": 1.5238, "step": 37710 }, { "epoch": 2.0111639880095518, "grad_norm": 0.1693965643644333, "learning_rate": 6.866666666666667e-06, "loss": 1.5267, "step": 37720 }, { "epoch": 2.011705929175064, "grad_norm": 0.17528323829174042, "learning_rate": 7.2e-06, "loss": 1.5202, "step": 37730 }, { "epoch": 2.012247870340576, "grad_norm": 0.16867630183696747, "learning_rate": 7.533333333333334e-06, "loss": 1.5173, "step": 37740 }, { "epoch": 2.0127898115060883, "grad_norm": 0.17133750021457672, "learning_rate": 7.866666666666667e-06, "loss": 1.5128, "step": 37750 }, { "epoch": 2.013223364438498, "eval_loss": 2.475126028060913, "eval_runtime": 21.9812, "eval_samples_per_second": 227.467, "eval_steps_per_second": 1.228, "step": 37758 }, { "epoch": 2.0133317526716006, "grad_norm": 0.17712301015853882, "learning_rate": 8.200000000000001e-06, "loss": 1.5215, "step": 37760 }, { "epoch": 2.013873693837113, "grad_norm": 0.19257335364818573, "learning_rate": 8.533333333333334e-06, "loss": 1.5109, "step": 37770 }, { "epoch": 2.014415635002625, "grad_norm": 0.2007877379655838, "learning_rate": 8.866666666666668e-06, "loss": 1.5275, "step": 37780 }, { "epoch": 2.014957576168137, "grad_norm": 0.16571199893951416, "learning_rate": 9.2e-06, "loss": 1.5206, "step": 37790 }, { "epoch": 2.0154995173336494, "grad_norm": 0.17200104892253876, "learning_rate": 9.533333333333334e-06, "loss": 1.5205, "step": 37800 }, { "epoch": 2.0160414584991617, "grad_norm": 0.1716819703578949, "learning_rate": 9.866666666666667e-06, "loss": 1.5087, "step": 37810 }, { "epoch": 2.016583399664674, "grad_norm": 0.21321678161621094, "learning_rate": 1.02e-05, "loss": 1.5171, "step": 37820 }, { "epoch": 2.017125340830186, "grad_norm": 0.16358011960983276, "learning_rate": 1.0533333333333335e-05, "loss": 1.5169, "step": 37830 }, { "epoch": 2.0176672819956982, "grad_norm": 0.16521596908569336, "learning_rate": 1.0866666666666667e-05, "loss": 1.5277, "step": 37840 }, { "epoch": 2.0179382525784546, "eval_loss": 2.476113796234131, "eval_runtime": 21.9762, "eval_samples_per_second": 227.519, "eval_steps_per_second": 1.229, "step": 37845 }, { "epoch": 2.0182092231612105, "grad_norm": 0.17164713144302368, "learning_rate": 1.1200000000000001e-05, "loss": 1.5241, "step": 37850 }, { "epoch": 2.018751164326723, "grad_norm": 0.21223784983158112, "learning_rate": 1.1533333333333334e-05, "loss": 1.5176, "step": 37860 }, { "epoch": 2.019293105492235, "grad_norm": 0.17650015652179718, "learning_rate": 1.1866666666666668e-05, "loss": 1.5128, "step": 37870 }, { "epoch": 2.019835046657747, "grad_norm": 0.23301972448825836, "learning_rate": 1.22e-05, "loss": 1.5302, "step": 37880 }, { "epoch": 2.0203769878232594, "grad_norm": 0.16798456013202667, "learning_rate": 1.2533333333333332e-05, "loss": 1.5146, "step": 37890 }, { "epoch": 2.0209189289887717, "grad_norm": 0.1657019704580307, "learning_rate": 1.2866666666666668e-05, "loss": 1.523, "step": 37900 }, { "epoch": 2.021460870154284, "grad_norm": 0.1911483108997345, "learning_rate": 1.32e-05, "loss": 1.5141, "step": 37910 }, { "epoch": 2.0220028113197963, "grad_norm": 0.22926867008209229, "learning_rate": 1.3533333333333335e-05, "loss": 1.5156, "step": 37920 }, { "epoch": 2.022544752485308, "grad_norm": 0.25321751832962036, "learning_rate": 1.3866666666666667e-05, "loss": 1.5193, "step": 37930 }, { "epoch": 2.0226531407184107, "eval_loss": 2.470311403274536, "eval_runtime": 21.9756, "eval_samples_per_second": 227.525, "eval_steps_per_second": 1.229, "step": 37932 }, { "epoch": 2.0230866936508205, "grad_norm": 0.20465686917304993, "learning_rate": 1.42e-05, "loss": 1.514, "step": 37940 }, { "epoch": 2.023628634816333, "grad_norm": 0.20215226709842682, "learning_rate": 1.4533333333333335e-05, "loss": 1.5079, "step": 37950 }, { "epoch": 2.024170575981845, "grad_norm": 0.18972061574459076, "learning_rate": 1.4866666666666668e-05, "loss": 1.5137, "step": 37960 }, { "epoch": 2.024712517147357, "grad_norm": 0.18428048491477966, "learning_rate": 1.52e-05, "loss": 1.5206, "step": 37970 }, { "epoch": 2.0252544583128693, "grad_norm": 0.21038727462291718, "learning_rate": 1.5533333333333333e-05, "loss": 1.515, "step": 37980 }, { "epoch": 2.0257963994783816, "grad_norm": 0.20297273993492126, "learning_rate": 1.586666666666667e-05, "loss": 1.5159, "step": 37990 }, { "epoch": 2.026338340643894, "grad_norm": 0.16452449560165405, "learning_rate": 1.62e-05, "loss": 1.5092, "step": 38000 }, { "epoch": 2.0268802818094063, "grad_norm": 0.25832870602607727, "learning_rate": 1.6533333333333333e-05, "loss": 1.5097, "step": 38010 }, { "epoch": 2.027368028858367, "eval_loss": 2.469764232635498, "eval_runtime": 21.9777, "eval_samples_per_second": 227.503, "eval_steps_per_second": 1.229, "step": 38019 }, { "epoch": 2.027422222974918, "grad_norm": 0.17639535665512085, "learning_rate": 1.6866666666666666e-05, "loss": 1.52, "step": 38020 }, { "epoch": 2.0279641641404305, "grad_norm": 0.2649475336074829, "learning_rate": 1.7199999999999998e-05, "loss": 1.5037, "step": 38030 }, { "epoch": 2.0285061053059428, "grad_norm": 0.19192355871200562, "learning_rate": 1.7533333333333334e-05, "loss": 1.501, "step": 38040 }, { "epoch": 2.029048046471455, "grad_norm": 0.19619537889957428, "learning_rate": 1.7866666666666666e-05, "loss": 1.5126, "step": 38050 }, { "epoch": 2.029589987636967, "grad_norm": 0.1901368945837021, "learning_rate": 1.8200000000000002e-05, "loss": 1.525, "step": 38060 }, { "epoch": 2.0301319288024793, "grad_norm": 0.1913931667804718, "learning_rate": 1.8533333333333334e-05, "loss": 1.5136, "step": 38070 }, { "epoch": 2.0306738699679916, "grad_norm": 0.1890285611152649, "learning_rate": 1.886666666666667e-05, "loss": 1.5198, "step": 38080 }, { "epoch": 2.031215811133504, "grad_norm": 0.19151568412780762, "learning_rate": 1.9200000000000003e-05, "loss": 1.5177, "step": 38090 }, { "epoch": 2.0317577522990162, "grad_norm": 0.16688157618045807, "learning_rate": 1.9533333333333335e-05, "loss": 1.5084, "step": 38100 }, { "epoch": 2.0320829169983234, "eval_loss": 2.4747180938720703, "eval_runtime": 21.9798, "eval_samples_per_second": 227.481, "eval_steps_per_second": 1.228, "step": 38106 }, { "epoch": 2.032299693464528, "grad_norm": 0.21504206955432892, "learning_rate": 1.9866666666666667e-05, "loss": 1.5145, "step": 38110 }, { "epoch": 2.0328416346300404, "grad_norm": 0.19307386875152588, "learning_rate": 2.0200000000000003e-05, "loss": 1.514, "step": 38120 }, { "epoch": 2.0333835757955527, "grad_norm": 0.17073208093643188, "learning_rate": 2.0533333333333336e-05, "loss": 1.5158, "step": 38130 }, { "epoch": 2.033925516961065, "grad_norm": 0.22306814789772034, "learning_rate": 2.0866666666666668e-05, "loss": 1.5055, "step": 38140 }, { "epoch": 2.034467458126577, "grad_norm": 0.170424222946167, "learning_rate": 2.12e-05, "loss": 1.5103, "step": 38150 }, { "epoch": 2.0350093992920892, "grad_norm": 0.18648818135261536, "learning_rate": 2.1533333333333333e-05, "loss": 1.5109, "step": 38160 }, { "epoch": 2.0355513404576016, "grad_norm": 0.18736661970615387, "learning_rate": 2.186666666666667e-05, "loss": 1.507, "step": 38170 }, { "epoch": 2.036093281623114, "grad_norm": 0.20201878249645233, "learning_rate": 2.22e-05, "loss": 1.5157, "step": 38180 }, { "epoch": 2.036635222788626, "grad_norm": 0.19277946650981903, "learning_rate": 2.2533333333333333e-05, "loss": 1.5152, "step": 38190 }, { "epoch": 2.0367978051382796, "eval_loss": 2.474329710006714, "eval_runtime": 21.9764, "eval_samples_per_second": 227.517, "eval_steps_per_second": 1.229, "step": 38193 }, { "epoch": 2.037177163954138, "grad_norm": 0.1907971054315567, "learning_rate": 2.2866666666666666e-05, "loss": 1.5019, "step": 38200 }, { "epoch": 2.0377191051196504, "grad_norm": 0.2295379936695099, "learning_rate": 2.32e-05, "loss": 1.5046, "step": 38210 }, { "epoch": 2.0382610462851627, "grad_norm": 0.18477369844913483, "learning_rate": 2.3533333333333334e-05, "loss": 1.5057, "step": 38220 }, { "epoch": 2.038802987450675, "grad_norm": 0.19044053554534912, "learning_rate": 2.3866666666666666e-05, "loss": 1.5217, "step": 38230 }, { "epoch": 2.0393449286161873, "grad_norm": 0.18540959060192108, "learning_rate": 2.4200000000000002e-05, "loss": 1.52, "step": 38240 }, { "epoch": 2.039886869781699, "grad_norm": 0.23876672983169556, "learning_rate": 2.4533333333333334e-05, "loss": 1.5129, "step": 38250 }, { "epoch": 2.0404288109472115, "grad_norm": 0.19681920111179352, "learning_rate": 2.486666666666667e-05, "loss": 1.502, "step": 38260 }, { "epoch": 2.040970752112724, "grad_norm": 0.17679736018180847, "learning_rate": 2.5200000000000003e-05, "loss": 1.5122, "step": 38270 }, { "epoch": 2.041512693278236, "grad_norm": 0.18017658591270447, "learning_rate": 2.553333333333334e-05, "loss": 1.5145, "step": 38280 }, { "epoch": 2.041512693278236, "eval_loss": 2.4778501987457275, "eval_runtime": 22.0279, "eval_samples_per_second": 226.985, "eval_steps_per_second": 1.226, "step": 38280 }, { "epoch": 2.042054634443748, "grad_norm": 0.17411205172538757, "learning_rate": 2.5866666666666667e-05, "loss": 1.5064, "step": 38290 }, { "epoch": 2.0425965756092603, "grad_norm": 0.18675434589385986, "learning_rate": 2.6200000000000003e-05, "loss": 1.5071, "step": 38300 }, { "epoch": 2.0431385167747727, "grad_norm": 0.17742466926574707, "learning_rate": 2.6533333333333332e-05, "loss": 1.5147, "step": 38310 }, { "epoch": 2.043680457940285, "grad_norm": 0.2024219036102295, "learning_rate": 2.6866666666666668e-05, "loss": 1.5201, "step": 38320 }, { "epoch": 2.0442223991057973, "grad_norm": 0.19301582872867584, "learning_rate": 2.7200000000000004e-05, "loss": 1.5131, "step": 38330 }, { "epoch": 2.044764340271309, "grad_norm": 0.186081200838089, "learning_rate": 2.7533333333333333e-05, "loss": 1.5113, "step": 38340 }, { "epoch": 2.0453062814368215, "grad_norm": 0.2549179792404175, "learning_rate": 2.786666666666667e-05, "loss": 1.5133, "step": 38350 }, { "epoch": 2.045848222602334, "grad_norm": 0.23150603473186493, "learning_rate": 2.8199999999999998e-05, "loss": 1.5087, "step": 38360 }, { "epoch": 2.0462275814181923, "eval_loss": 2.479970932006836, "eval_runtime": 21.9756, "eval_samples_per_second": 227.525, "eval_steps_per_second": 1.229, "step": 38367 }, { "epoch": 2.046390163767846, "grad_norm": 0.199720561504364, "learning_rate": 2.8533333333333333e-05, "loss": 1.5038, "step": 38370 }, { "epoch": 2.046932104933358, "grad_norm": 0.16712939739227295, "learning_rate": 2.886666666666667e-05, "loss": 1.5047, "step": 38380 }, { "epoch": 2.0474740460988703, "grad_norm": 0.17325983941555023, "learning_rate": 2.9199999999999998e-05, "loss": 1.517, "step": 38390 }, { "epoch": 2.0480159872643826, "grad_norm": 0.19841617345809937, "learning_rate": 2.9533333333333334e-05, "loss": 1.5113, "step": 38400 }, { "epoch": 2.048557928429895, "grad_norm": 0.21291446685791016, "learning_rate": 2.986666666666667e-05, "loss": 1.5056, "step": 38410 }, { "epoch": 2.0490998695954072, "grad_norm": 0.33324041962623596, "learning_rate": 3.02e-05, "loss": 1.4991, "step": 38420 }, { "epoch": 2.049641810760919, "grad_norm": 0.19952350854873657, "learning_rate": 3.0533333333333335e-05, "loss": 1.5244, "step": 38430 }, { "epoch": 2.0501837519264314, "grad_norm": 0.19988487660884857, "learning_rate": 3.086666666666667e-05, "loss": 1.509, "step": 38440 }, { "epoch": 2.0507256930919437, "grad_norm": 0.3386477530002594, "learning_rate": 3.12e-05, "loss": 1.5031, "step": 38450 }, { "epoch": 2.0509424695581484, "eval_loss": 2.480910301208496, "eval_runtime": 21.9771, "eval_samples_per_second": 227.51, "eval_steps_per_second": 1.229, "step": 38454 }, { "epoch": 2.051267634257456, "grad_norm": 0.23942768573760986, "learning_rate": 3.153333333333334e-05, "loss": 1.5143, "step": 38460 }, { "epoch": 2.051809575422968, "grad_norm": 0.2345375418663025, "learning_rate": 3.1866666666666664e-05, "loss": 1.5095, "step": 38470 }, { "epoch": 2.0523515165884803, "grad_norm": 0.23772574961185455, "learning_rate": 3.2200000000000003e-05, "loss": 1.5036, "step": 38480 }, { "epoch": 2.0528934577539926, "grad_norm": 0.21411636471748352, "learning_rate": 3.253333333333333e-05, "loss": 1.5111, "step": 38490 }, { "epoch": 2.053435398919505, "grad_norm": 0.17845569550991058, "learning_rate": 3.286666666666667e-05, "loss": 1.5151, "step": 38500 }, { "epoch": 2.053977340085017, "grad_norm": 0.23491686582565308, "learning_rate": 3.32e-05, "loss": 1.5025, "step": 38510 }, { "epoch": 2.054519281250529, "grad_norm": 0.2281605452299118, "learning_rate": 3.353333333333333e-05, "loss": 1.5121, "step": 38520 }, { "epoch": 2.0550612224160414, "grad_norm": 0.19572101533412933, "learning_rate": 3.3866666666666665e-05, "loss": 1.5169, "step": 38530 }, { "epoch": 2.0556031635815537, "grad_norm": 0.2043548971414566, "learning_rate": 3.4200000000000005e-05, "loss": 1.507, "step": 38540 }, { "epoch": 2.055657357698105, "eval_loss": 2.4816620349884033, "eval_runtime": 22.0156, "eval_samples_per_second": 227.111, "eval_steps_per_second": 1.226, "step": 38541 }, { "epoch": 2.056145104747066, "grad_norm": 0.3473966717720032, "learning_rate": 3.453333333333334e-05, "loss": 1.503, "step": 38550 }, { "epoch": 2.056687045912578, "grad_norm": 0.17651808261871338, "learning_rate": 3.486666666666667e-05, "loss": 1.5191, "step": 38560 }, { "epoch": 2.05722898707809, "grad_norm": 0.31023022532463074, "learning_rate": 3.52e-05, "loss": 1.5095, "step": 38570 }, { "epoch": 2.0577709282436025, "grad_norm": 0.22835835814476013, "learning_rate": 3.5533333333333334e-05, "loss": 1.496, "step": 38580 }, { "epoch": 2.058312869409115, "grad_norm": 0.28630900382995605, "learning_rate": 3.586666666666667e-05, "loss": 1.5129, "step": 38590 }, { "epoch": 2.058854810574627, "grad_norm": 0.2917551100254059, "learning_rate": 3.62e-05, "loss": 1.5131, "step": 38600 }, { "epoch": 2.059396751740139, "grad_norm": 0.23212890326976776, "learning_rate": 3.653333333333334e-05, "loss": 1.5075, "step": 38610 }, { "epoch": 2.0599386929056513, "grad_norm": 0.3044704496860504, "learning_rate": 3.6866666666666664e-05, "loss": 1.5153, "step": 38620 }, { "epoch": 2.060372245838061, "eval_loss": 2.466387987136841, "eval_runtime": 21.9823, "eval_samples_per_second": 227.456, "eval_steps_per_second": 1.228, "step": 38628 }, { "epoch": 2.0604806340711637, "grad_norm": 0.1840297132730484, "learning_rate": 3.72e-05, "loss": 1.5098, "step": 38630 }, { "epoch": 2.061022575236676, "grad_norm": 0.20662543177604675, "learning_rate": 3.7533333333333335e-05, "loss": 1.5006, "step": 38640 }, { "epoch": 2.0615645164021883, "grad_norm": 0.227204829454422, "learning_rate": 3.786666666666667e-05, "loss": 1.5052, "step": 38650 }, { "epoch": 2.0621064575677, "grad_norm": 0.377437561750412, "learning_rate": 3.82e-05, "loss": 1.5002, "step": 38660 }, { "epoch": 2.0626483987332125, "grad_norm": 0.3552536964416504, "learning_rate": 3.853333333333334e-05, "loss": 1.5104, "step": 38670 }, { "epoch": 2.063190339898725, "grad_norm": 0.31461700797080994, "learning_rate": 3.8866666666666665e-05, "loss": 1.5048, "step": 38680 }, { "epoch": 2.063732281064237, "grad_norm": 0.20412826538085938, "learning_rate": 3.9200000000000004e-05, "loss": 1.5058, "step": 38690 }, { "epoch": 2.064274222229749, "grad_norm": 0.22955581545829773, "learning_rate": 3.9533333333333337e-05, "loss": 1.5037, "step": 38700 }, { "epoch": 2.0648161633952613, "grad_norm": 0.2234533727169037, "learning_rate": 3.986666666666667e-05, "loss": 1.5108, "step": 38710 }, { "epoch": 2.0650871339780177, "eval_loss": 2.4609971046447754, "eval_runtime": 21.9738, "eval_samples_per_second": 227.543, "eval_steps_per_second": 1.229, "step": 38715 }, { "epoch": 2.0653581045607736, "grad_norm": 0.18891902267932892, "learning_rate": 4.02e-05, "loss": 1.504, "step": 38720 }, { "epoch": 2.065900045726286, "grad_norm": 0.21943628787994385, "learning_rate": 4.0533333333333334e-05, "loss": 1.5037, "step": 38730 }, { "epoch": 2.0664419868917983, "grad_norm": 0.24250586330890656, "learning_rate": 4.086666666666667e-05, "loss": 1.5022, "step": 38740 }, { "epoch": 2.06698392805731, "grad_norm": 0.2112322300672531, "learning_rate": 4.12e-05, "loss": 1.5052, "step": 38750 }, { "epoch": 2.0675258692228224, "grad_norm": 0.24614650011062622, "learning_rate": 4.153333333333334e-05, "loss": 1.5012, "step": 38760 }, { "epoch": 2.0680678103883348, "grad_norm": 0.25870656967163086, "learning_rate": 4.186666666666667e-05, "loss": 1.5078, "step": 38770 }, { "epoch": 2.068609751553847, "grad_norm": 0.18944492936134338, "learning_rate": 4.22e-05, "loss": 1.5, "step": 38780 }, { "epoch": 2.069151692719359, "grad_norm": 0.2570621073246002, "learning_rate": 4.2533333333333335e-05, "loss": 1.5076, "step": 38790 }, { "epoch": 2.0696936338848713, "grad_norm": 0.19097241759300232, "learning_rate": 4.286666666666667e-05, "loss": 1.5103, "step": 38800 }, { "epoch": 2.069802022117974, "eval_loss": 2.472243309020996, "eval_runtime": 21.9768, "eval_samples_per_second": 227.513, "eval_steps_per_second": 1.229, "step": 38802 }, { "epoch": 2.0702355750503836, "grad_norm": 0.2385208159685135, "learning_rate": 4.32e-05, "loss": 1.5022, "step": 38810 }, { "epoch": 2.070777516215896, "grad_norm": 0.3327140212059021, "learning_rate": 4.353333333333334e-05, "loss": 1.5071, "step": 38820 }, { "epoch": 2.071319457381408, "grad_norm": 0.36782822012901306, "learning_rate": 4.3866666666666665e-05, "loss": 1.5097, "step": 38830 }, { "epoch": 2.07186139854692, "grad_norm": 0.35685569047927856, "learning_rate": 4.4200000000000004e-05, "loss": 1.5072, "step": 38840 }, { "epoch": 2.0724033397124324, "grad_norm": 0.20131441950798035, "learning_rate": 4.4533333333333336e-05, "loss": 1.4922, "step": 38850 }, { "epoch": 2.0729452808779447, "grad_norm": 0.31305959820747375, "learning_rate": 4.486666666666667e-05, "loss": 1.509, "step": 38860 }, { "epoch": 2.073487222043457, "grad_norm": 0.2946673333644867, "learning_rate": 4.52e-05, "loss": 1.5089, "step": 38870 }, { "epoch": 2.0740291632089694, "grad_norm": 0.29778286814689636, "learning_rate": 4.553333333333333e-05, "loss": 1.4924, "step": 38880 }, { "epoch": 2.07451691025793, "eval_loss": 2.4727067947387695, "eval_runtime": 21.9813, "eval_samples_per_second": 227.466, "eval_steps_per_second": 1.228, "step": 38889 }, { "epoch": 2.0745711043744812, "grad_norm": 0.2734017074108124, "learning_rate": 4.5866666666666666e-05, "loss": 1.5117, "step": 38890 }, { "epoch": 2.0751130455399935, "grad_norm": 0.1901269257068634, "learning_rate": 4.6200000000000005e-05, "loss": 1.503, "step": 38900 }, { "epoch": 2.075654986705506, "grad_norm": 0.2383534014225006, "learning_rate": 4.653333333333334e-05, "loss": 1.4965, "step": 38910 }, { "epoch": 2.076196927871018, "grad_norm": 0.27332353591918945, "learning_rate": 4.686666666666667e-05, "loss": 1.5121, "step": 38920 }, { "epoch": 2.07673886903653, "grad_norm": 0.25885239243507385, "learning_rate": 4.72e-05, "loss": 1.5025, "step": 38930 }, { "epoch": 2.0772808102020424, "grad_norm": 0.19907046854496002, "learning_rate": 4.7533333333333334e-05, "loss": 1.5126, "step": 38940 }, { "epoch": 2.0778227513675547, "grad_norm": 0.19673128426074982, "learning_rate": 4.7866666666666674e-05, "loss": 1.5149, "step": 38950 }, { "epoch": 2.078364692533067, "grad_norm": 0.3096723258495331, "learning_rate": 4.82e-05, "loss": 1.509, "step": 38960 }, { "epoch": 2.0789066336985793, "grad_norm": 0.22866037487983704, "learning_rate": 4.853333333333334e-05, "loss": 1.5085, "step": 38970 }, { "epoch": 2.0792317983978865, "eval_loss": 2.4707653522491455, "eval_runtime": 21.9775, "eval_samples_per_second": 227.506, "eval_steps_per_second": 1.229, "step": 38976 }, { "epoch": 2.079448574864091, "grad_norm": 0.19826719164848328, "learning_rate": 4.886666666666667e-05, "loss": 1.5061, "step": 38980 }, { "epoch": 2.0799905160296035, "grad_norm": 0.3995167016983032, "learning_rate": 4.92e-05, "loss": 1.4995, "step": 38990 }, { "epoch": 2.080532457195116, "grad_norm": 0.3492770791053772, "learning_rate": 4.9533333333333336e-05, "loss": 1.5092, "step": 39000 }, { "epoch": 2.081074398360628, "grad_norm": 0.248097226023674, "learning_rate": 4.986666666666667e-05, "loss": 1.5146, "step": 39010 }, { "epoch": 2.08161633952614, "grad_norm": 0.3634330630302429, "learning_rate": 5.02e-05, "loss": 1.5122, "step": 39020 }, { "epoch": 2.0821582806916523, "grad_norm": 0.30074241757392883, "learning_rate": 5.053333333333333e-05, "loss": 1.5094, "step": 39030 }, { "epoch": 2.0827002218571646, "grad_norm": 0.19916889071464539, "learning_rate": 5.086666666666667e-05, "loss": 1.5129, "step": 39040 }, { "epoch": 2.083242163022677, "grad_norm": 0.19030417501926422, "learning_rate": 5.1200000000000004e-05, "loss": 1.5048, "step": 39050 }, { "epoch": 2.0837841041881893, "grad_norm": 0.2428630143404007, "learning_rate": 5.153333333333333e-05, "loss": 1.5089, "step": 39060 }, { "epoch": 2.0839466865378427, "eval_loss": 2.4720537662506104, "eval_runtime": 21.9794, "eval_samples_per_second": 227.486, "eval_steps_per_second": 1.228, "step": 39063 }, { "epoch": 2.084326045353701, "grad_norm": 0.2563911974430084, "learning_rate": 5.1866666666666676e-05, "loss": 1.5078, "step": 39070 }, { "epoch": 2.0848679865192135, "grad_norm": 0.20653724670410156, "learning_rate": 5.22e-05, "loss": 1.4982, "step": 39080 }, { "epoch": 2.0854099276847258, "grad_norm": 0.29985037446022034, "learning_rate": 5.2533333333333334e-05, "loss": 1.5188, "step": 39090 }, { "epoch": 2.085951868850238, "grad_norm": 0.2325274795293808, "learning_rate": 5.2866666666666666e-05, "loss": 1.4888, "step": 39100 }, { "epoch": 2.08649381001575, "grad_norm": 0.35616472363471985, "learning_rate": 5.3200000000000006e-05, "loss": 1.503, "step": 39110 }, { "epoch": 2.0870357511812623, "grad_norm": 0.23784513771533966, "learning_rate": 5.353333333333334e-05, "loss": 1.4995, "step": 39120 }, { "epoch": 2.0875776923467746, "grad_norm": 0.2512984275817871, "learning_rate": 5.3866666666666664e-05, "loss": 1.4988, "step": 39130 }, { "epoch": 2.088119633512287, "grad_norm": 0.31452298164367676, "learning_rate": 5.420000000000001e-05, "loss": 1.5064, "step": 39140 }, { "epoch": 2.0886615746777992, "grad_norm": 0.2067795991897583, "learning_rate": 5.4533333333333335e-05, "loss": 1.503, "step": 39150 }, { "epoch": 2.0886615746777992, "eval_loss": 2.478317975997925, "eval_runtime": 21.954, "eval_samples_per_second": 227.749, "eval_steps_per_second": 1.23, "step": 39150 }, { "epoch": 2.089203515843311, "grad_norm": 0.2181154042482376, "learning_rate": 5.486666666666667e-05, "loss": 1.5082, "step": 39160 }, { "epoch": 2.0897454570088234, "grad_norm": 0.25218093395233154, "learning_rate": 5.520000000000001e-05, "loss": 1.4949, "step": 39170 }, { "epoch": 2.0902873981743357, "grad_norm": 0.26879268884658813, "learning_rate": 5.553333333333334e-05, "loss": 1.5169, "step": 39180 }, { "epoch": 2.090829339339848, "grad_norm": 0.42360979318618774, "learning_rate": 5.5866666666666665e-05, "loss": 1.5001, "step": 39190 }, { "epoch": 2.09137128050536, "grad_norm": 0.3156111538410187, "learning_rate": 5.620000000000001e-05, "loss": 1.4959, "step": 39200 }, { "epoch": 2.0919132216708722, "grad_norm": 0.3810129761695862, "learning_rate": 5.6533333333333336e-05, "loss": 1.5044, "step": 39210 }, { "epoch": 2.0924551628363846, "grad_norm": 0.2980895936489105, "learning_rate": 5.686666666666667e-05, "loss": 1.509, "step": 39220 }, { "epoch": 2.092997104001897, "grad_norm": 0.2537708878517151, "learning_rate": 5.72e-05, "loss": 1.502, "step": 39230 }, { "epoch": 2.0933764628177554, "eval_loss": 2.486907482147217, "eval_runtime": 21.9802, "eval_samples_per_second": 227.478, "eval_steps_per_second": 1.228, "step": 39237 }, { "epoch": 2.093539045167409, "grad_norm": 0.2361438274383545, "learning_rate": 5.753333333333334e-05, "loss": 1.5046, "step": 39240 }, { "epoch": 2.094080986332921, "grad_norm": 0.3751475214958191, "learning_rate": 5.7866666666666666e-05, "loss": 1.5045, "step": 39250 }, { "epoch": 2.0946229274984334, "grad_norm": 0.28046801686286926, "learning_rate": 5.82e-05, "loss": 1.5143, "step": 39260 }, { "epoch": 2.0951648686639457, "grad_norm": 0.3462556302547455, "learning_rate": 5.853333333333334e-05, "loss": 1.5155, "step": 39270 }, { "epoch": 2.095706809829458, "grad_norm": 0.4051414728164673, "learning_rate": 5.886666666666667e-05, "loss": 1.5008, "step": 39280 }, { "epoch": 2.0962487509949703, "grad_norm": 0.348838746547699, "learning_rate": 5.92e-05, "loss": 1.5027, "step": 39290 }, { "epoch": 2.096790692160482, "grad_norm": 0.26631638407707214, "learning_rate": 5.953333333333334e-05, "loss": 1.5169, "step": 39300 }, { "epoch": 2.0973326333259945, "grad_norm": 0.20438840985298157, "learning_rate": 5.9866666666666674e-05, "loss": 1.5017, "step": 39310 }, { "epoch": 2.097874574491507, "grad_norm": 0.23914705216884613, "learning_rate": 6.02e-05, "loss": 1.5111, "step": 39320 }, { "epoch": 2.0980913509577115, "eval_loss": 2.485321521759033, "eval_runtime": 21.983, "eval_samples_per_second": 227.448, "eval_steps_per_second": 1.228, "step": 39324 }, { "epoch": 2.098416515657019, "grad_norm": 0.4190734028816223, "learning_rate": 6.053333333333333e-05, "loss": 1.5052, "step": 39330 }, { "epoch": 2.098958456822531, "grad_norm": 0.2981734871864319, "learning_rate": 6.086666666666667e-05, "loss": 1.5005, "step": 39340 }, { "epoch": 2.0995003979880433, "grad_norm": 0.28054559230804443, "learning_rate": 6.12e-05, "loss": 1.4988, "step": 39350 }, { "epoch": 2.1000423391535556, "grad_norm": 0.4990901052951813, "learning_rate": 6.153333333333333e-05, "loss": 1.4869, "step": 39360 }, { "epoch": 2.100584280319068, "grad_norm": 0.30956992506980896, "learning_rate": 6.186666666666668e-05, "loss": 1.4983, "step": 39370 }, { "epoch": 2.1011262214845803, "grad_norm": 0.25511854887008667, "learning_rate": 6.220000000000001e-05, "loss": 1.5014, "step": 39380 }, { "epoch": 2.101668162650092, "grad_norm": 0.5350297689437866, "learning_rate": 6.253333333333333e-05, "loss": 1.4979, "step": 39390 }, { "epoch": 2.1022101038156045, "grad_norm": 0.20907965302467346, "learning_rate": 6.286666666666667e-05, "loss": 1.5125, "step": 39400 }, { "epoch": 2.102752044981117, "grad_norm": 0.2567403018474579, "learning_rate": 6.32e-05, "loss": 1.4964, "step": 39410 }, { "epoch": 2.102806239097668, "eval_loss": 2.490506887435913, "eval_runtime": 21.9797, "eval_samples_per_second": 227.483, "eval_steps_per_second": 1.228, "step": 39411 }, { "epoch": 2.103293986146629, "grad_norm": 0.25373193621635437, "learning_rate": 6.353333333333334e-05, "loss": 1.5058, "step": 39420 }, { "epoch": 2.103835927312141, "grad_norm": 0.20478834211826324, "learning_rate": 6.386666666666667e-05, "loss": 1.4987, "step": 39430 }, { "epoch": 2.1043778684776533, "grad_norm": 0.2046637237071991, "learning_rate": 6.42e-05, "loss": 1.51, "step": 39440 }, { "epoch": 2.1049198096431656, "grad_norm": 0.2921828329563141, "learning_rate": 6.453333333333333e-05, "loss": 1.505, "step": 39450 }, { "epoch": 2.105461750808678, "grad_norm": 0.22420616447925568, "learning_rate": 6.486666666666667e-05, "loss": 1.5004, "step": 39460 }, { "epoch": 2.1060036919741902, "grad_norm": 0.39737066626548767, "learning_rate": 6.52e-05, "loss": 1.5068, "step": 39470 }, { "epoch": 2.106545633139702, "grad_norm": 0.21135340631008148, "learning_rate": 6.553333333333333e-05, "loss": 1.4962, "step": 39480 }, { "epoch": 2.1070875743052144, "grad_norm": 0.2920219600200653, "learning_rate": 6.586666666666666e-05, "loss": 1.5102, "step": 39490 }, { "epoch": 2.107521127237624, "eval_loss": 2.477649450302124, "eval_runtime": 22.0366, "eval_samples_per_second": 226.895, "eval_steps_per_second": 1.225, "step": 39498 }, { "epoch": 2.1076295154707267, "grad_norm": 0.34153836965560913, "learning_rate": 6.620000000000001e-05, "loss": 1.5078, "step": 39500 }, { "epoch": 2.108171456636239, "grad_norm": 0.27279841899871826, "learning_rate": 6.653333333333334e-05, "loss": 1.505, "step": 39510 }, { "epoch": 2.108713397801751, "grad_norm": 0.2870520055294037, "learning_rate": 6.686666666666666e-05, "loss": 1.5043, "step": 39520 }, { "epoch": 2.1092553389672632, "grad_norm": 0.34095126390457153, "learning_rate": 6.720000000000001e-05, "loss": 1.5083, "step": 39530 }, { "epoch": 2.1097972801327756, "grad_norm": 0.2083679437637329, "learning_rate": 6.753333333333334e-05, "loss": 1.4983, "step": 39540 }, { "epoch": 2.110339221298288, "grad_norm": 0.281099796295166, "learning_rate": 6.786666666666667e-05, "loss": 1.5089, "step": 39550 }, { "epoch": 2.1108811624638, "grad_norm": 0.29480496048927307, "learning_rate": 6.82e-05, "loss": 1.5133, "step": 39560 }, { "epoch": 2.111423103629312, "grad_norm": 0.38620585203170776, "learning_rate": 6.853333333333334e-05, "loss": 1.5034, "step": 39570 }, { "epoch": 2.1119650447948244, "grad_norm": 0.41479256749153137, "learning_rate": 6.886666666666667e-05, "loss": 1.5107, "step": 39580 }, { "epoch": 2.1122360153775808, "eval_loss": 2.4839091300964355, "eval_runtime": 22.0638, "eval_samples_per_second": 226.615, "eval_steps_per_second": 1.224, "step": 39585 }, { "epoch": 2.1125069859603367, "grad_norm": 0.3035687804222107, "learning_rate": 6.92e-05, "loss": 1.4977, "step": 39590 }, { "epoch": 2.113048927125849, "grad_norm": 0.34999310970306396, "learning_rate": 6.953333333333333e-05, "loss": 1.5053, "step": 39600 }, { "epoch": 2.113590868291361, "grad_norm": 0.27466028928756714, "learning_rate": 6.986666666666667e-05, "loss": 1.5103, "step": 39610 }, { "epoch": 2.114132809456873, "grad_norm": 0.4131866693496704, "learning_rate": 7.02e-05, "loss": 1.4978, "step": 39620 }, { "epoch": 2.1146747506223855, "grad_norm": 0.3529013991355896, "learning_rate": 7.053333333333334e-05, "loss": 1.5053, "step": 39630 }, { "epoch": 2.115216691787898, "grad_norm": 0.30995792150497437, "learning_rate": 7.086666666666666e-05, "loss": 1.513, "step": 39640 }, { "epoch": 2.11575863295341, "grad_norm": 0.3839752972126007, "learning_rate": 7.12e-05, "loss": 1.505, "step": 39650 }, { "epoch": 2.116300574118922, "grad_norm": 0.2367662787437439, "learning_rate": 7.153333333333334e-05, "loss": 1.5036, "step": 39660 }, { "epoch": 2.1168425152844343, "grad_norm": 0.22004197537899017, "learning_rate": 7.186666666666667e-05, "loss": 1.5015, "step": 39670 }, { "epoch": 2.116950903517537, "eval_loss": 2.4732017517089844, "eval_runtime": 21.9785, "eval_samples_per_second": 227.495, "eval_steps_per_second": 1.228, "step": 39672 }, { "epoch": 2.1173844564499467, "grad_norm": 0.2898196280002594, "learning_rate": 7.22e-05, "loss": 1.5044, "step": 39680 }, { "epoch": 2.117926397615459, "grad_norm": 0.30908989906311035, "learning_rate": 7.253333333333334e-05, "loss": 1.5071, "step": 39690 }, { "epoch": 2.1184683387809713, "grad_norm": 0.6186519265174866, "learning_rate": 7.286666666666667e-05, "loss": 1.522, "step": 39700 }, { "epoch": 2.119010279946483, "grad_norm": 0.47445744276046753, "learning_rate": 7.32e-05, "loss": 1.4992, "step": 39710 }, { "epoch": 2.1195522211119955, "grad_norm": 0.3463587760925293, "learning_rate": 7.353333333333334e-05, "loss": 1.512, "step": 39720 }, { "epoch": 2.120094162277508, "grad_norm": 0.3156580626964569, "learning_rate": 7.386666666666667e-05, "loss": 1.5026, "step": 39730 }, { "epoch": 2.12063610344302, "grad_norm": 0.3200650215148926, "learning_rate": 7.42e-05, "loss": 1.504, "step": 39740 }, { "epoch": 2.121178044608532, "grad_norm": 0.3450500965118408, "learning_rate": 7.453333333333333e-05, "loss": 1.4982, "step": 39750 }, { "epoch": 2.121665791657493, "eval_loss": 2.47827410697937, "eval_runtime": 22.0331, "eval_samples_per_second": 226.931, "eval_steps_per_second": 1.225, "step": 39759 }, { "epoch": 2.1217199857740443, "grad_norm": 0.221024289727211, "learning_rate": 7.486666666666668e-05, "loss": 1.5028, "step": 39760 }, { "epoch": 2.1222619269395566, "grad_norm": 0.2555961012840271, "learning_rate": 7.52e-05, "loss": 1.5123, "step": 39770 }, { "epoch": 2.122803868105069, "grad_norm": 0.34122392535209656, "learning_rate": 7.553333333333333e-05, "loss": 1.5043, "step": 39780 }, { "epoch": 2.1233458092705813, "grad_norm": 0.29069313406944275, "learning_rate": 7.586666666666668e-05, "loss": 1.4978, "step": 39790 }, { "epoch": 2.123887750436093, "grad_norm": 0.3182983696460724, "learning_rate": 7.620000000000001e-05, "loss": 1.5068, "step": 39800 }, { "epoch": 2.1244296916016054, "grad_norm": 0.5020242929458618, "learning_rate": 7.653333333333333e-05, "loss": 1.497, "step": 39810 }, { "epoch": 2.1249716327671178, "grad_norm": 0.27356261014938354, "learning_rate": 7.686666666666667e-05, "loss": 1.5029, "step": 39820 }, { "epoch": 2.12551357393263, "grad_norm": 0.315500408411026, "learning_rate": 7.72e-05, "loss": 1.5193, "step": 39830 }, { "epoch": 2.126055515098142, "grad_norm": 0.36083605885505676, "learning_rate": 7.753333333333334e-05, "loss": 1.5013, "step": 39840 }, { "epoch": 2.1263806797974496, "eval_loss": 2.4653656482696533, "eval_runtime": 21.9768, "eval_samples_per_second": 227.513, "eval_steps_per_second": 1.229, "step": 39846 }, { "epoch": 2.1265974562636543, "grad_norm": 0.3310469090938568, "learning_rate": 7.786666666666667e-05, "loss": 1.5136, "step": 39850 }, { "epoch": 2.1271393974291666, "grad_norm": 0.3014518618583679, "learning_rate": 7.82e-05, "loss": 1.5079, "step": 39860 }, { "epoch": 2.127681338594679, "grad_norm": 0.434340238571167, "learning_rate": 7.853333333333334e-05, "loss": 1.5123, "step": 39870 }, { "epoch": 2.128223279760191, "grad_norm": 0.34903663396835327, "learning_rate": 7.886666666666667e-05, "loss": 1.5069, "step": 39880 }, { "epoch": 2.128765220925703, "grad_norm": 0.2542589604854584, "learning_rate": 7.920000000000001e-05, "loss": 1.5123, "step": 39890 }, { "epoch": 2.1293071620912154, "grad_norm": 0.26018011569976807, "learning_rate": 7.953333333333333e-05, "loss": 1.5098, "step": 39900 }, { "epoch": 2.1298491032567277, "grad_norm": 0.19746781885623932, "learning_rate": 7.986666666666667e-05, "loss": 1.5074, "step": 39910 }, { "epoch": 2.13039104442224, "grad_norm": 0.2585922181606293, "learning_rate": 8.020000000000001e-05, "loss": 1.5005, "step": 39920 }, { "epoch": 2.1309329855877523, "grad_norm": 0.3514852523803711, "learning_rate": 8.053333333333334e-05, "loss": 1.5007, "step": 39930 }, { "epoch": 2.1310955679374057, "eval_loss": 2.4711849689483643, "eval_runtime": 21.9689, "eval_samples_per_second": 227.594, "eval_steps_per_second": 1.229, "step": 39933 }, { "epoch": 2.131474926753264, "grad_norm": 0.4120597839355469, "learning_rate": 8.086666666666666e-05, "loss": 1.5033, "step": 39940 }, { "epoch": 2.1320168679187765, "grad_norm": 0.27820542454719543, "learning_rate": 8.120000000000001e-05, "loss": 1.5124, "step": 39950 }, { "epoch": 2.132558809084289, "grad_norm": 0.33248940110206604, "learning_rate": 8.153333333333334e-05, "loss": 1.5004, "step": 39960 }, { "epoch": 2.133100750249801, "grad_norm": 0.4663317799568176, "learning_rate": 8.186666666666667e-05, "loss": 1.5161, "step": 39970 }, { "epoch": 2.133642691415313, "grad_norm": 0.463692843914032, "learning_rate": 8.22e-05, "loss": 1.5041, "step": 39980 }, { "epoch": 2.1341846325808254, "grad_norm": 0.3125864267349243, "learning_rate": 8.253333333333334e-05, "loss": 1.496, "step": 39990 }, { "epoch": 2.1347265737463377, "grad_norm": 0.4677470922470093, "learning_rate": 8.286666666666667e-05, "loss": 1.5, "step": 40000 }, { "epoch": 2.13526851491185, "grad_norm": 0.24888941645622253, "learning_rate": 8.32e-05, "loss": 1.5033, "step": 40010 }, { "epoch": 2.1358104560773623, "grad_norm": 0.26266810297966003, "learning_rate": 8.353333333333334e-05, "loss": 1.5042, "step": 40020 }, { "epoch": 2.1358104560773623, "eval_loss": 2.4674057960510254, "eval_runtime": 21.5786, "eval_samples_per_second": 231.711, "eval_steps_per_second": 1.251, "step": 40020 }, { "epoch": 2.136352397242874, "grad_norm": 0.36174437403678894, "learning_rate": 8.386666666666667e-05, "loss": 1.5109, "step": 40030 }, { "epoch": 2.1368943384083865, "grad_norm": 0.2121952325105667, "learning_rate": 8.42e-05, "loss": 1.4932, "step": 40040 }, { "epoch": 2.137436279573899, "grad_norm": 0.3189486861228943, "learning_rate": 8.453333333333335e-05, "loss": 1.51, "step": 40050 }, { "epoch": 2.137978220739411, "grad_norm": 0.36640292406082153, "learning_rate": 8.486666666666668e-05, "loss": 1.5087, "step": 40060 }, { "epoch": 2.138520161904923, "grad_norm": 0.22334055602550507, "learning_rate": 8.52e-05, "loss": 1.5019, "step": 40070 }, { "epoch": 2.1390621030704353, "grad_norm": 0.4532851576805115, "learning_rate": 8.553333333333333e-05, "loss": 1.5119, "step": 40080 }, { "epoch": 2.1396040442359476, "grad_norm": 0.30973002314567566, "learning_rate": 8.586666666666668e-05, "loss": 1.5028, "step": 40090 }, { "epoch": 2.14014598540146, "grad_norm": 0.27160486578941345, "learning_rate": 8.620000000000001e-05, "loss": 1.5112, "step": 40100 }, { "epoch": 2.1405253442173184, "eval_loss": 2.4754085540771484, "eval_runtime": 21.9771, "eval_samples_per_second": 227.51, "eval_steps_per_second": 1.229, "step": 40107 }, { "epoch": 2.1406879265669723, "grad_norm": 0.3385959267616272, "learning_rate": 8.653333333333333e-05, "loss": 1.5036, "step": 40110 }, { "epoch": 2.141229867732484, "grad_norm": 0.5644890069961548, "learning_rate": 8.686666666666667e-05, "loss": 1.5046, "step": 40120 }, { "epoch": 2.1417718088979965, "grad_norm": 0.4575164020061493, "learning_rate": 8.72e-05, "loss": 1.5059, "step": 40130 }, { "epoch": 2.1423137500635088, "grad_norm": 0.656599223613739, "learning_rate": 8.753333333333334e-05, "loss": 1.5027, "step": 40140 }, { "epoch": 2.142855691229021, "grad_norm": 0.4436124563217163, "learning_rate": 8.786666666666667e-05, "loss": 1.5026, "step": 40150 }, { "epoch": 2.1433976323945334, "grad_norm": 0.35058170557022095, "learning_rate": 8.82e-05, "loss": 1.5074, "step": 40160 }, { "epoch": 2.1439395735600453, "grad_norm": 0.2605305016040802, "learning_rate": 8.853333333333333e-05, "loss": 1.5077, "step": 40170 }, { "epoch": 2.1444815147255576, "grad_norm": 0.3072412312030792, "learning_rate": 8.886666666666668e-05, "loss": 1.5117, "step": 40180 }, { "epoch": 2.14502345589107, "grad_norm": 0.20943394303321838, "learning_rate": 8.92e-05, "loss": 1.5003, "step": 40190 }, { "epoch": 2.1452402323572746, "eval_loss": 2.478358507156372, "eval_runtime": 24.6282, "eval_samples_per_second": 203.019, "eval_steps_per_second": 1.096, "step": 40194 }, { "epoch": 2.1455653970565822, "grad_norm": 0.4068165719509125, "learning_rate": 8.953333333333333e-05, "loss": 1.5108, "step": 40200 }, { "epoch": 2.146107338222094, "grad_norm": 0.35715365409851074, "learning_rate": 8.986666666666666e-05, "loss": 1.5062, "step": 40210 }, { "epoch": 2.1466492793876064, "grad_norm": 0.4496524930000305, "learning_rate": 9.020000000000001e-05, "loss": 1.5063, "step": 40220 }, { "epoch": 2.1471912205531187, "grad_norm": 0.606382429599762, "learning_rate": 9.053333333333334e-05, "loss": 1.5041, "step": 40230 }, { "epoch": 2.147733161718631, "grad_norm": 0.3925984799861908, "learning_rate": 9.086666666666666e-05, "loss": 1.5084, "step": 40240 }, { "epoch": 2.148275102884143, "grad_norm": 0.29136770963668823, "learning_rate": 9.120000000000001e-05, "loss": 1.5013, "step": 40250 }, { "epoch": 2.1488170440496552, "grad_norm": 0.35544130206108093, "learning_rate": 9.153333333333334e-05, "loss": 1.5072, "step": 40260 }, { "epoch": 2.1493589852151676, "grad_norm": 0.3096676766872406, "learning_rate": 9.186666666666667e-05, "loss": 1.5037, "step": 40270 }, { "epoch": 2.14990092638068, "grad_norm": 0.2778821587562561, "learning_rate": 9.22e-05, "loss": 1.5112, "step": 40280 }, { "epoch": 2.149955120497231, "eval_loss": 2.4636154174804688, "eval_runtime": 24.7898, "eval_samples_per_second": 201.695, "eval_steps_per_second": 1.089, "step": 40281 }, { "epoch": 2.150442867546192, "grad_norm": 0.7011622786521912, "learning_rate": 9.253333333333334e-05, "loss": 1.5166, "step": 40290 }, { "epoch": 2.150984808711704, "grad_norm": 0.5509504079818726, "learning_rate": 9.286666666666667e-05, "loss": 1.5078, "step": 40300 }, { "epoch": 2.1515267498772164, "grad_norm": 0.28685396909713745, "learning_rate": 9.320000000000002e-05, "loss": 1.5159, "step": 40310 }, { "epoch": 2.1520686910427287, "grad_norm": 0.3149552345275879, "learning_rate": 9.353333333333333e-05, "loss": 1.5044, "step": 40320 }, { "epoch": 2.152610632208241, "grad_norm": 0.458866685628891, "learning_rate": 9.386666666666667e-05, "loss": 1.4973, "step": 40330 }, { "epoch": 2.1531525733737533, "grad_norm": 0.21605221927165985, "learning_rate": 9.42e-05, "loss": 1.5009, "step": 40340 }, { "epoch": 2.153694514539265, "grad_norm": 0.6214112043380737, "learning_rate": 9.453333333333335e-05, "loss": 1.5062, "step": 40350 }, { "epoch": 2.1542364557047775, "grad_norm": 0.5758342146873474, "learning_rate": 9.486666666666666e-05, "loss": 1.5175, "step": 40360 }, { "epoch": 2.1546700086371873, "eval_loss": 2.45977520942688, "eval_runtime": 21.979, "eval_samples_per_second": 227.489, "eval_steps_per_second": 1.228, "step": 40368 }, { "epoch": 2.15477839687029, "grad_norm": 0.3867745101451874, "learning_rate": 9.52e-05, "loss": 1.513, "step": 40370 }, { "epoch": 2.155320338035802, "grad_norm": 0.25635677576065063, "learning_rate": 9.553333333333334e-05, "loss": 1.5077, "step": 40380 }, { "epoch": 2.155862279201314, "grad_norm": 0.5329334735870361, "learning_rate": 9.586666666666667e-05, "loss": 1.5127, "step": 40390 }, { "epoch": 2.1564042203668263, "grad_norm": 0.4407355785369873, "learning_rate": 9.620000000000001e-05, "loss": 1.5063, "step": 40400 }, { "epoch": 2.1569461615323386, "grad_norm": 0.3090643584728241, "learning_rate": 9.653333333333334e-05, "loss": 1.5083, "step": 40410 }, { "epoch": 2.157488102697851, "grad_norm": 0.487301766872406, "learning_rate": 9.686666666666667e-05, "loss": 1.5051, "step": 40420 }, { "epoch": 2.1580300438633633, "grad_norm": 0.5774275064468384, "learning_rate": 9.72e-05, "loss": 1.5058, "step": 40430 }, { "epoch": 2.158571985028875, "grad_norm": 0.30334457755088806, "learning_rate": 9.753333333333334e-05, "loss": 1.514, "step": 40440 }, { "epoch": 2.1591139261943875, "grad_norm": 0.5308240056037903, "learning_rate": 9.786666666666667e-05, "loss": 1.5075, "step": 40450 }, { "epoch": 2.1593848967771434, "eval_loss": 2.4678995609283447, "eval_runtime": 22.396, "eval_samples_per_second": 223.254, "eval_steps_per_second": 1.206, "step": 40455 }, { "epoch": 2.1596558673599, "grad_norm": 0.5267515182495117, "learning_rate": 9.82e-05, "loss": 1.5033, "step": 40460 }, { "epoch": 2.160197808525412, "grad_norm": 0.4302433133125305, "learning_rate": 9.853333333333333e-05, "loss": 1.5039, "step": 40470 }, { "epoch": 2.160739749690924, "grad_norm": 0.2622862160205841, "learning_rate": 9.886666666666668e-05, "loss": 1.5074, "step": 40480 }, { "epoch": 2.1612816908564363, "grad_norm": 0.34495052695274353, "learning_rate": 9.92e-05, "loss": 1.5148, "step": 40490 }, { "epoch": 2.1618236320219486, "grad_norm": 0.28749775886535645, "learning_rate": 9.953333333333333e-05, "loss": 1.5168, "step": 40500 }, { "epoch": 2.162365573187461, "grad_norm": 0.34906288981437683, "learning_rate": 9.986666666666668e-05, "loss": 1.5094, "step": 40510 }, { "epoch": 2.1629075143529732, "grad_norm": 0.6776122450828552, "learning_rate": 9.999999899660717e-05, "loss": 1.5113, "step": 40520 }, { "epoch": 2.163449455518485, "grad_norm": 0.3558565080165863, "learning_rate": 9.99999928647622e-05, "loss": 1.5072, "step": 40530 }, { "epoch": 2.1639913966839974, "grad_norm": 0.2457047551870346, "learning_rate": 9.999998115851347e-05, "loss": 1.5147, "step": 40540 }, { "epoch": 2.1640997849171, "eval_loss": 2.471486806869507, "eval_runtime": 21.974, "eval_samples_per_second": 227.542, "eval_steps_per_second": 1.229, "step": 40542 }, { "epoch": 2.1645333378495097, "grad_norm": 0.520976722240448, "learning_rate": 9.999996387786247e-05, "loss": 1.5066, "step": 40550 }, { "epoch": 2.165075279015022, "grad_norm": 0.3190830945968628, "learning_rate": 9.99999410228113e-05, "loss": 1.5005, "step": 40560 }, { "epoch": 2.1656172201805344, "grad_norm": 0.31958481669425964, "learning_rate": 9.99999125933628e-05, "loss": 1.5157, "step": 40570 }, { "epoch": 2.1661591613460462, "grad_norm": 0.5813402533531189, "learning_rate": 9.999987858952052e-05, "loss": 1.5099, "step": 40580 }, { "epoch": 2.1667011025115586, "grad_norm": 0.34184807538986206, "learning_rate": 9.999983901128863e-05, "loss": 1.5005, "step": 40590 }, { "epoch": 2.167243043677071, "grad_norm": 0.3223097622394562, "learning_rate": 9.999979385867205e-05, "loss": 1.5134, "step": 40600 }, { "epoch": 2.167784984842583, "grad_norm": 0.20804668962955475, "learning_rate": 9.99997431316764e-05, "loss": 1.4992, "step": 40610 }, { "epoch": 2.168326926008095, "grad_norm": 0.25401726365089417, "learning_rate": 9.999968683030792e-05, "loss": 1.5153, "step": 40620 }, { "epoch": 2.168814673057056, "eval_loss": 2.468203067779541, "eval_runtime": 21.9753, "eval_samples_per_second": 227.529, "eval_steps_per_second": 1.229, "step": 40629 }, { "epoch": 2.1688688671736074, "grad_norm": 0.2348717451095581, "learning_rate": 9.999962495457362e-05, "loss": 1.5125, "step": 40630 }, { "epoch": 2.1694108083391197, "grad_norm": 0.43220508098602295, "learning_rate": 9.999955750448114e-05, "loss": 1.5078, "step": 40640 }, { "epoch": 2.169952749504632, "grad_norm": 0.28150442242622375, "learning_rate": 9.999948448003884e-05, "loss": 1.5077, "step": 40650 }, { "epoch": 2.170494690670144, "grad_norm": 0.24436496198177338, "learning_rate": 9.999940588125579e-05, "loss": 1.5158, "step": 40660 }, { "epoch": 2.171036631835656, "grad_norm": 0.3555525243282318, "learning_rate": 9.999932170814168e-05, "loss": 1.5009, "step": 40670 }, { "epoch": 2.1715785730011685, "grad_norm": 0.6536611318588257, "learning_rate": 9.999923196070698e-05, "loss": 1.5179, "step": 40680 }, { "epoch": 2.172120514166681, "grad_norm": 0.555479884147644, "learning_rate": 9.99991366389628e-05, "loss": 1.5034, "step": 40690 }, { "epoch": 2.172662455332193, "grad_norm": 0.27514633536338806, "learning_rate": 9.999903574292093e-05, "loss": 1.4983, "step": 40700 }, { "epoch": 2.173204396497705, "grad_norm": 0.2602129876613617, "learning_rate": 9.999892927259388e-05, "loss": 1.5167, "step": 40710 }, { "epoch": 2.1735295611970127, "eval_loss": 2.479363441467285, "eval_runtime": 21.9755, "eval_samples_per_second": 227.526, "eval_steps_per_second": 1.229, "step": 40716 }, { "epoch": 2.1737463376632173, "grad_norm": 0.3722988963127136, "learning_rate": 9.999881722799482e-05, "loss": 1.5071, "step": 40720 }, { "epoch": 2.1742882788287297, "grad_norm": 0.22509793937206268, "learning_rate": 9.999869960913767e-05, "loss": 1.5103, "step": 40730 }, { "epoch": 2.174830219994242, "grad_norm": 0.5048813819885254, "learning_rate": 9.999857641603697e-05, "loss": 1.5164, "step": 40740 }, { "epoch": 2.1753721611597543, "grad_norm": 0.3289371430873871, "learning_rate": 9.999844764870799e-05, "loss": 1.5028, "step": 40750 }, { "epoch": 2.175914102325266, "grad_norm": 0.5190215706825256, "learning_rate": 9.999831330716668e-05, "loss": 1.5106, "step": 40760 }, { "epoch": 2.1764560434907785, "grad_norm": 0.2816135883331299, "learning_rate": 9.999817339142969e-05, "loss": 1.5065, "step": 40770 }, { "epoch": 2.176997984656291, "grad_norm": 0.7596124410629272, "learning_rate": 9.999802790151434e-05, "loss": 1.5103, "step": 40780 }, { "epoch": 2.177539925821803, "grad_norm": 0.34131622314453125, "learning_rate": 9.999787683743863e-05, "loss": 1.5063, "step": 40790 }, { "epoch": 2.178081866987315, "grad_norm": 0.3968164026737213, "learning_rate": 9.999772019922132e-05, "loss": 1.5084, "step": 40800 }, { "epoch": 2.178244449336969, "eval_loss": 2.4752957820892334, "eval_runtime": 21.9803, "eval_samples_per_second": 227.477, "eval_steps_per_second": 1.228, "step": 40803 }, { "epoch": 2.1786238081528273, "grad_norm": 0.28055036067962646, "learning_rate": 9.999755798688178e-05, "loss": 1.5242, "step": 40810 }, { "epoch": 2.1791657493183396, "grad_norm": 0.35142403841018677, "learning_rate": 9.999739020044013e-05, "loss": 1.5095, "step": 40820 }, { "epoch": 2.179707690483852, "grad_norm": 0.2656286656856537, "learning_rate": 9.999721683991714e-05, "loss": 1.5034, "step": 40830 }, { "epoch": 2.1802496316493643, "grad_norm": 0.2500777840614319, "learning_rate": 9.999703790533428e-05, "loss": 1.5072, "step": 40840 }, { "epoch": 2.180791572814876, "grad_norm": 0.23317931592464447, "learning_rate": 9.999685339671372e-05, "loss": 1.5045, "step": 40850 }, { "epoch": 2.1813335139803884, "grad_norm": 0.6272380352020264, "learning_rate": 9.999666331407832e-05, "loss": 1.506, "step": 40860 }, { "epoch": 2.1818754551459008, "grad_norm": 0.28550583124160767, "learning_rate": 9.999646765745162e-05, "loss": 1.5038, "step": 40870 }, { "epoch": 2.182417396311413, "grad_norm": 0.3291584253311157, "learning_rate": 9.999626642685788e-05, "loss": 1.5097, "step": 40880 }, { "epoch": 2.182959337476925, "grad_norm": 0.6882104873657227, "learning_rate": 9.999605962232201e-05, "loss": 1.4979, "step": 40890 }, { "epoch": 2.182959337476925, "eval_loss": 2.482081174850464, "eval_runtime": 21.9465, "eval_samples_per_second": 227.827, "eval_steps_per_second": 1.23, "step": 40890 }, { "epoch": 2.1835012786424373, "grad_norm": 0.4152418375015259, "learning_rate": 9.999584724386959e-05, "loss": 1.4999, "step": 40900 }, { "epoch": 2.1840432198079496, "grad_norm": 0.40763190388679504, "learning_rate": 9.9995629291527e-05, "loss": 1.5092, "step": 40910 }, { "epoch": 2.184585160973462, "grad_norm": 0.29160091280937195, "learning_rate": 9.99954057653212e-05, "loss": 1.5012, "step": 40920 }, { "epoch": 2.185127102138974, "grad_norm": 0.40323200821876526, "learning_rate": 9.999517666527988e-05, "loss": 1.5059, "step": 40930 }, { "epoch": 2.185669043304486, "grad_norm": 0.30431511998176575, "learning_rate": 9.999494199143142e-05, "loss": 1.5109, "step": 40940 }, { "epoch": 2.1862109844699984, "grad_norm": 0.2627016007900238, "learning_rate": 9.999470174380489e-05, "loss": 1.5064, "step": 40950 }, { "epoch": 2.1867529256355107, "grad_norm": 0.47459253668785095, "learning_rate": 9.999445592243008e-05, "loss": 1.51, "step": 40960 }, { "epoch": 2.187294866801023, "grad_norm": 0.352264404296875, "learning_rate": 9.999420452733739e-05, "loss": 1.5102, "step": 40970 }, { "epoch": 2.1876742256168815, "eval_loss": 2.473921298980713, "eval_runtime": 21.9773, "eval_samples_per_second": 227.507, "eval_steps_per_second": 1.229, "step": 40977 }, { "epoch": 2.1878368079665353, "grad_norm": 0.35294270515441895, "learning_rate": 9.9993947558558e-05, "loss": 1.5141, "step": 40980 }, { "epoch": 2.188378749132047, "grad_norm": 0.4508706033229828, "learning_rate": 9.999368501612373e-05, "loss": 1.503, "step": 40990 }, { "epoch": 2.1889206902975595, "grad_norm": 0.5512154698371887, "learning_rate": 9.999341690006711e-05, "loss": 1.5284, "step": 41000 }, { "epoch": 2.189462631463072, "grad_norm": 0.4073977768421173, "learning_rate": 9.999314321042134e-05, "loss": 1.4996, "step": 41010 }, { "epoch": 2.190004572628584, "grad_norm": 0.458518385887146, "learning_rate": 9.999286394722031e-05, "loss": 1.5099, "step": 41020 }, { "epoch": 2.190546513794096, "grad_norm": 0.25982314348220825, "learning_rate": 9.999257911049866e-05, "loss": 1.5042, "step": 41030 }, { "epoch": 2.1910884549596084, "grad_norm": 0.2903514802455902, "learning_rate": 9.999228870029165e-05, "loss": 1.5033, "step": 41040 }, { "epoch": 2.1916303961251207, "grad_norm": 0.21735897660255432, "learning_rate": 9.999199271663524e-05, "loss": 1.5171, "step": 41050 }, { "epoch": 2.192172337290633, "grad_norm": 0.39294907450675964, "learning_rate": 9.999169115956612e-05, "loss": 1.5007, "step": 41060 }, { "epoch": 2.1923891137568376, "eval_loss": 2.489570140838623, "eval_runtime": 21.9756, "eval_samples_per_second": 227.525, "eval_steps_per_second": 1.229, "step": 41064 }, { "epoch": 2.1927142784561453, "grad_norm": 0.5847845673561096, "learning_rate": 9.999138402912161e-05, "loss": 1.5158, "step": 41070 }, { "epoch": 2.193256219621657, "grad_norm": 0.33957940340042114, "learning_rate": 9.99910713253398e-05, "loss": 1.5152, "step": 41080 }, { "epoch": 2.1937981607871695, "grad_norm": 0.3208034038543701, "learning_rate": 9.999075304825941e-05, "loss": 1.509, "step": 41090 }, { "epoch": 2.194340101952682, "grad_norm": 0.3303355872631073, "learning_rate": 9.999042919791985e-05, "loss": 1.5075, "step": 41100 }, { "epoch": 2.194882043118194, "grad_norm": 0.278929203748703, "learning_rate": 9.999009977436125e-05, "loss": 1.5121, "step": 41110 }, { "epoch": 2.195423984283706, "grad_norm": 0.21701566874980927, "learning_rate": 9.998976477762442e-05, "loss": 1.5121, "step": 41120 }, { "epoch": 2.1959659254492183, "grad_norm": 0.35072654485702515, "learning_rate": 9.998942420775086e-05, "loss": 1.5021, "step": 41130 }, { "epoch": 2.1965078666147306, "grad_norm": 0.36139407753944397, "learning_rate": 9.998907806478275e-05, "loss": 1.5029, "step": 41140 }, { "epoch": 2.197049807780243, "grad_norm": 0.31903785467147827, "learning_rate": 9.998872634876297e-05, "loss": 1.506, "step": 41150 }, { "epoch": 2.1971040018967942, "eval_loss": 2.4797050952911377, "eval_runtime": 21.9824, "eval_samples_per_second": 227.454, "eval_steps_per_second": 1.228, "step": 41151 }, { "epoch": 2.1975917489457553, "grad_norm": 0.6769224405288696, "learning_rate": 9.99883690597351e-05, "loss": 1.5126, "step": 41160 }, { "epoch": 2.198133690111267, "grad_norm": 0.44375690817832947, "learning_rate": 9.998800619774339e-05, "loss": 1.5127, "step": 41170 }, { "epoch": 2.1986756312767795, "grad_norm": 0.22975300252437592, "learning_rate": 9.998763776283277e-05, "loss": 1.5153, "step": 41180 }, { "epoch": 2.1992175724422918, "grad_norm": 0.250010222196579, "learning_rate": 9.998726375504892e-05, "loss": 1.5109, "step": 41190 }, { "epoch": 2.199759513607804, "grad_norm": 0.2538135349750519, "learning_rate": 9.998688417443815e-05, "loss": 1.5127, "step": 41200 }, { "epoch": 2.2003014547733164, "grad_norm": 0.25529077649116516, "learning_rate": 9.998649902104748e-05, "loss": 1.5056, "step": 41210 }, { "epoch": 2.2008433959388283, "grad_norm": 0.3090484142303467, "learning_rate": 9.998610829492462e-05, "loss": 1.5212, "step": 41220 }, { "epoch": 2.2013853371043406, "grad_norm": 0.2540747821331024, "learning_rate": 9.998571199611799e-05, "loss": 1.5252, "step": 41230 }, { "epoch": 2.2018188900367504, "eval_loss": 2.49056077003479, "eval_runtime": 21.9807, "eval_samples_per_second": 227.473, "eval_steps_per_second": 1.228, "step": 41238 }, { "epoch": 2.201927278269853, "grad_norm": 0.35519057512283325, "learning_rate": 9.998531012467664e-05, "loss": 1.5082, "step": 41240 }, { "epoch": 2.202469219435365, "grad_norm": 0.24406082928180695, "learning_rate": 9.998490268065038e-05, "loss": 1.5048, "step": 41250 }, { "epoch": 2.203011160600877, "grad_norm": 0.3896248936653137, "learning_rate": 9.998448966408971e-05, "loss": 1.4991, "step": 41260 }, { "epoch": 2.2035531017663894, "grad_norm": 0.278812974691391, "learning_rate": 9.998407107504575e-05, "loss": 1.5014, "step": 41270 }, { "epoch": 2.2040950429319017, "grad_norm": 0.5823280811309814, "learning_rate": 9.998364691357036e-05, "loss": 1.5131, "step": 41280 }, { "epoch": 2.204636984097414, "grad_norm": 0.3981192111968994, "learning_rate": 9.998321717971609e-05, "loss": 1.5001, "step": 41290 }, { "epoch": 2.205178925262926, "grad_norm": 0.6286061406135559, "learning_rate": 9.998278187353616e-05, "loss": 1.5109, "step": 41300 }, { "epoch": 2.2057208664284382, "grad_norm": 0.2553674280643463, "learning_rate": 9.998234099508454e-05, "loss": 1.5113, "step": 41310 }, { "epoch": 2.2062628075939505, "grad_norm": 0.48403024673461914, "learning_rate": 9.998189454441579e-05, "loss": 1.5138, "step": 41320 }, { "epoch": 2.2065337781767065, "eval_loss": 2.478606700897217, "eval_runtime": 21.9762, "eval_samples_per_second": 227.519, "eval_steps_per_second": 1.229, "step": 41325 }, { "epoch": 2.206804748759463, "grad_norm": 0.3329886794090271, "learning_rate": 9.998144252158523e-05, "loss": 1.5122, "step": 41330 }, { "epoch": 2.207346689924975, "grad_norm": 0.2454531043767929, "learning_rate": 9.998098492664888e-05, "loss": 1.5094, "step": 41340 }, { "epoch": 2.207888631090487, "grad_norm": 0.3834342360496521, "learning_rate": 9.998052175966339e-05, "loss": 1.5042, "step": 41350 }, { "epoch": 2.2084305722559994, "grad_norm": 0.27599623799324036, "learning_rate": 9.998005302068616e-05, "loss": 1.5139, "step": 41360 }, { "epoch": 2.2089725134215117, "grad_norm": 0.2955540716648102, "learning_rate": 9.997957870977525e-05, "loss": 1.5212, "step": 41370 }, { "epoch": 2.209514454587024, "grad_norm": 0.25166115164756775, "learning_rate": 9.99790988269894e-05, "loss": 1.5021, "step": 41380 }, { "epoch": 2.2100563957525363, "grad_norm": 0.3993496298789978, "learning_rate": 9.997861337238807e-05, "loss": 1.4986, "step": 41390 }, { "epoch": 2.210598336918048, "grad_norm": 0.3200553059577942, "learning_rate": 9.99781223460314e-05, "loss": 1.5055, "step": 41400 }, { "epoch": 2.2111402780835605, "grad_norm": 0.3932972550392151, "learning_rate": 9.99776257479802e-05, "loss": 1.5082, "step": 41410 }, { "epoch": 2.211248666316663, "eval_loss": 2.4762344360351562, "eval_runtime": 21.9757, "eval_samples_per_second": 227.524, "eval_steps_per_second": 1.229, "step": 41412 }, { "epoch": 2.211682219249073, "grad_norm": 0.22166556119918823, "learning_rate": 9.997712357829599e-05, "loss": 1.5199, "step": 41420 }, { "epoch": 2.212224160414585, "grad_norm": 0.4338929057121277, "learning_rate": 9.997661583704098e-05, "loss": 1.51, "step": 41430 }, { "epoch": 2.212766101580097, "grad_norm": 0.2761380672454834, "learning_rate": 9.99761025242781e-05, "loss": 1.524, "step": 41440 }, { "epoch": 2.2133080427456093, "grad_norm": 0.2140953242778778, "learning_rate": 9.997558364007087e-05, "loss": 1.5159, "step": 41450 }, { "epoch": 2.2138499839111216, "grad_norm": 0.2510915696620941, "learning_rate": 9.997505918448364e-05, "loss": 1.5055, "step": 41460 }, { "epoch": 2.214391925076634, "grad_norm": 0.38897550106048584, "learning_rate": 9.997452915758131e-05, "loss": 1.4971, "step": 41470 }, { "epoch": 2.2149338662421463, "grad_norm": 0.29425960779190063, "learning_rate": 9.997399355942958e-05, "loss": 1.5033, "step": 41480 }, { "epoch": 2.215475807407658, "grad_norm": 0.43580734729766846, "learning_rate": 9.997345239009477e-05, "loss": 1.5049, "step": 41490 }, { "epoch": 2.215963554456619, "eval_loss": 2.4717535972595215, "eval_runtime": 22.0352, "eval_samples_per_second": 226.909, "eval_steps_per_second": 1.225, "step": 41499 }, { "epoch": 2.2160177485731705, "grad_norm": 0.25963518023490906, "learning_rate": 9.997290564964395e-05, "loss": 1.4924, "step": 41500 }, { "epoch": 2.216559689738683, "grad_norm": 0.23817171156406403, "learning_rate": 9.997235333814482e-05, "loss": 1.497, "step": 41510 }, { "epoch": 2.217101630904195, "grad_norm": 0.19800572097301483, "learning_rate": 9.997179545566582e-05, "loss": 1.5142, "step": 41520 }, { "epoch": 2.217643572069707, "grad_norm": 0.21633438766002655, "learning_rate": 9.997123200227606e-05, "loss": 1.5033, "step": 41530 }, { "epoch": 2.2181855132352193, "grad_norm": 0.24905604124069214, "learning_rate": 9.99706629780453e-05, "loss": 1.4983, "step": 41540 }, { "epoch": 2.2187274544007316, "grad_norm": 0.47959622740745544, "learning_rate": 9.997008838304404e-05, "loss": 1.5122, "step": 41550 }, { "epoch": 2.219269395566244, "grad_norm": 0.5474976301193237, "learning_rate": 9.996950821734351e-05, "loss": 1.5153, "step": 41560 }, { "epoch": 2.2198113367317562, "grad_norm": 0.38861730694770813, "learning_rate": 9.996892248101553e-05, "loss": 1.5109, "step": 41570 }, { "epoch": 2.220353277897268, "grad_norm": 0.48361384868621826, "learning_rate": 9.996833117413266e-05, "loss": 1.503, "step": 41580 }, { "epoch": 2.2206784425965758, "eval_loss": 2.471097946166992, "eval_runtime": 21.9727, "eval_samples_per_second": 227.555, "eval_steps_per_second": 1.229, "step": 41586 }, { "epoch": 2.2208952190627804, "grad_norm": 0.24642570316791534, "learning_rate": 9.996773429676815e-05, "loss": 1.5177, "step": 41590 }, { "epoch": 2.2214371602282927, "grad_norm": 0.2803477942943573, "learning_rate": 9.996713184899595e-05, "loss": 1.514, "step": 41600 }, { "epoch": 2.221979101393805, "grad_norm": 0.555143415927887, "learning_rate": 9.996652383089069e-05, "loss": 1.5011, "step": 41610 }, { "epoch": 2.2225210425593174, "grad_norm": 0.2646414041519165, "learning_rate": 9.996591024252768e-05, "loss": 1.5026, "step": 41620 }, { "epoch": 2.2230629837248292, "grad_norm": 0.5564790964126587, "learning_rate": 9.996529108398294e-05, "loss": 1.5104, "step": 41630 }, { "epoch": 2.2236049248903416, "grad_norm": 0.5015528798103333, "learning_rate": 9.996466635533316e-05, "loss": 1.5151, "step": 41640 }, { "epoch": 2.224146866055854, "grad_norm": 0.3904198408126831, "learning_rate": 9.996403605665572e-05, "loss": 1.5045, "step": 41650 }, { "epoch": 2.224688807221366, "grad_norm": 0.27617716789245605, "learning_rate": 9.996340018802872e-05, "loss": 1.5048, "step": 41660 }, { "epoch": 2.225230748386878, "grad_norm": 0.2563170790672302, "learning_rate": 9.99627587495309e-05, "loss": 1.507, "step": 41670 }, { "epoch": 2.225393330736532, "eval_loss": 2.475794553756714, "eval_runtime": 21.9846, "eval_samples_per_second": 227.432, "eval_steps_per_second": 1.228, "step": 41673 }, { "epoch": 2.2257726895523904, "grad_norm": 0.20860429108142853, "learning_rate": 9.996211174124174e-05, "loss": 1.5048, "step": 41680 }, { "epoch": 2.2263146307179027, "grad_norm": 0.2962222397327423, "learning_rate": 9.996145916324138e-05, "loss": 1.4984, "step": 41690 }, { "epoch": 2.226856571883415, "grad_norm": 0.285791277885437, "learning_rate": 9.996080101561066e-05, "loss": 1.5111, "step": 41700 }, { "epoch": 2.2273985130489273, "grad_norm": 0.2907065451145172, "learning_rate": 9.996013729843113e-05, "loss": 1.5012, "step": 41710 }, { "epoch": 2.227940454214439, "grad_norm": 0.3030034303665161, "learning_rate": 9.995946801178498e-05, "loss": 1.5042, "step": 41720 }, { "epoch": 2.2284823953799515, "grad_norm": 0.3327134847640991, "learning_rate": 9.995879315575511e-05, "loss": 1.5058, "step": 41730 }, { "epoch": 2.229024336545464, "grad_norm": 0.2516813576221466, "learning_rate": 9.995811273042515e-05, "loss": 1.5054, "step": 41740 }, { "epoch": 2.229566277710976, "grad_norm": 0.43475955724716187, "learning_rate": 9.995742673587937e-05, "loss": 1.5044, "step": 41750 }, { "epoch": 2.230108218876488, "grad_norm": 0.4820566773414612, "learning_rate": 9.995673517220277e-05, "loss": 1.4987, "step": 41760 }, { "epoch": 2.230108218876488, "eval_loss": 2.476296901702881, "eval_runtime": 21.9717, "eval_samples_per_second": 227.565, "eval_steps_per_second": 1.229, "step": 41760 }, { "epoch": 2.2306501600420003, "grad_norm": 0.503626823425293, "learning_rate": 9.995603803948098e-05, "loss": 1.5076, "step": 41770 }, { "epoch": 2.2311921012075127, "grad_norm": 0.3283330500125885, "learning_rate": 9.995533533780038e-05, "loss": 1.5055, "step": 41780 }, { "epoch": 2.231734042373025, "grad_norm": 0.5820431113243103, "learning_rate": 9.9954627067248e-05, "loss": 1.5118, "step": 41790 }, { "epoch": 2.2322759835385373, "grad_norm": 0.29778963327407837, "learning_rate": 9.995391322791162e-05, "loss": 1.5087, "step": 41800 }, { "epoch": 2.232817924704049, "grad_norm": 0.4030333459377289, "learning_rate": 9.995319381987962e-05, "loss": 1.4998, "step": 41810 }, { "epoch": 2.2333598658695615, "grad_norm": 0.22505435347557068, "learning_rate": 9.995246884324115e-05, "loss": 1.5041, "step": 41820 }, { "epoch": 2.233901807035074, "grad_norm": 0.4648926258087158, "learning_rate": 9.995173829808599e-05, "loss": 1.4911, "step": 41830 }, { "epoch": 2.234443748200586, "grad_norm": 0.2858235239982605, "learning_rate": 9.995100218450467e-05, "loss": 1.5019, "step": 41840 }, { "epoch": 2.2348231070164446, "eval_loss": 2.4754035472869873, "eval_runtime": 21.9771, "eval_samples_per_second": 227.51, "eval_steps_per_second": 1.229, "step": 41847 }, { "epoch": 2.2349856893660984, "grad_norm": 0.48889511823654175, "learning_rate": 9.995026050258835e-05, "loss": 1.5035, "step": 41850 }, { "epoch": 2.2355276305316103, "grad_norm": 0.32360124588012695, "learning_rate": 9.994951325242891e-05, "loss": 1.4995, "step": 41860 }, { "epoch": 2.2360695716971226, "grad_norm": 0.3390207588672638, "learning_rate": 9.994876043411891e-05, "loss": 1.5077, "step": 41870 }, { "epoch": 2.236611512862635, "grad_norm": 0.3765084743499756, "learning_rate": 9.994800204775163e-05, "loss": 1.5077, "step": 41880 }, { "epoch": 2.2371534540281472, "grad_norm": 0.26570597290992737, "learning_rate": 9.994723809342099e-05, "loss": 1.5082, "step": 41890 }, { "epoch": 2.237695395193659, "grad_norm": 0.2624066472053528, "learning_rate": 9.994646857122165e-05, "loss": 1.5153, "step": 41900 }, { "epoch": 2.2382373363591714, "grad_norm": 0.5075863003730774, "learning_rate": 9.994569348124891e-05, "loss": 1.5177, "step": 41910 }, { "epoch": 2.2387792775246838, "grad_norm": 0.2840348184108734, "learning_rate": 9.99449128235988e-05, "loss": 1.507, "step": 41920 }, { "epoch": 2.239321218690196, "grad_norm": 0.23727837204933167, "learning_rate": 9.994412659836802e-05, "loss": 1.5062, "step": 41930 }, { "epoch": 2.2395379951564007, "eval_loss": 2.478437662124634, "eval_runtime": 21.9787, "eval_samples_per_second": 227.493, "eval_steps_per_second": 1.228, "step": 41934 }, { "epoch": 2.239863159855708, "grad_norm": 0.5313767790794373, "learning_rate": 9.994333480565397e-05, "loss": 1.5117, "step": 41940 }, { "epoch": 2.2404051010212203, "grad_norm": 0.28299862146377563, "learning_rate": 9.994253744555473e-05, "loss": 1.5064, "step": 41950 }, { "epoch": 2.2409470421867326, "grad_norm": 0.2567594349384308, "learning_rate": 9.994173451816906e-05, "loss": 1.5007, "step": 41960 }, { "epoch": 2.241488983352245, "grad_norm": 0.24751020967960358, "learning_rate": 9.994092602359645e-05, "loss": 1.5152, "step": 41970 }, { "epoch": 2.242030924517757, "grad_norm": 0.5088745951652527, "learning_rate": 9.994011196193702e-05, "loss": 1.5042, "step": 41980 }, { "epoch": 2.242572865683269, "grad_norm": 0.2354600876569748, "learning_rate": 9.993929233329164e-05, "loss": 1.5111, "step": 41990 }, { "epoch": 2.2431148068487814, "grad_norm": 0.22222284972667694, "learning_rate": 9.993846713776183e-05, "loss": 1.5087, "step": 42000 }, { "epoch": 2.2436567480142937, "grad_norm": 0.20365022122859955, "learning_rate": 9.993763637544983e-05, "loss": 1.5099, "step": 42010 }, { "epoch": 2.244198689179806, "grad_norm": 0.2721654176712036, "learning_rate": 9.993680004645851e-05, "loss": 1.5031, "step": 42020 }, { "epoch": 2.2442528832963573, "eval_loss": 2.4772350788116455, "eval_runtime": 21.9795, "eval_samples_per_second": 227.485, "eval_steps_per_second": 1.228, "step": 42021 }, { "epoch": 2.2447406303453183, "grad_norm": 0.3124418556690216, "learning_rate": 9.993595815089151e-05, "loss": 1.503, "step": 42030 }, { "epoch": 2.24528257151083, "grad_norm": 0.304231196641922, "learning_rate": 9.993511068885311e-05, "loss": 1.4958, "step": 42040 }, { "epoch": 2.2458245126763425, "grad_norm": 0.23490315675735474, "learning_rate": 9.99342576604483e-05, "loss": 1.5052, "step": 42050 }, { "epoch": 2.246366453841855, "grad_norm": 0.5602021217346191, "learning_rate": 9.99333990657827e-05, "loss": 1.5007, "step": 42060 }, { "epoch": 2.246908395007367, "grad_norm": 0.25727346539497375, "learning_rate": 9.993253490496272e-05, "loss": 1.5107, "step": 42070 }, { "epoch": 2.247450336172879, "grad_norm": 0.2954048216342926, "learning_rate": 9.99316651780954e-05, "loss": 1.5127, "step": 42080 }, { "epoch": 2.2479922773383914, "grad_norm": 0.23384593427181244, "learning_rate": 9.993078988528848e-05, "loss": 1.5024, "step": 42090 }, { "epoch": 2.2485342185039037, "grad_norm": 0.2479962408542633, "learning_rate": 9.992990902665037e-05, "loss": 1.5007, "step": 42100 }, { "epoch": 2.2489677714363134, "eval_loss": 2.471235990524292, "eval_runtime": 21.9804, "eval_samples_per_second": 227.476, "eval_steps_per_second": 1.228, "step": 42108 }, { "epoch": 2.249076159669416, "grad_norm": 0.26295939087867737, "learning_rate": 9.99290226022902e-05, "loss": 1.5132, "step": 42110 }, { "epoch": 2.2496181008349283, "grad_norm": 0.6523755192756653, "learning_rate": 9.992813061231775e-05, "loss": 1.5043, "step": 42120 }, { "epoch": 2.25016004200044, "grad_norm": 0.3719354569911957, "learning_rate": 9.992723305684357e-05, "loss": 1.5014, "step": 42130 }, { "epoch": 2.2507019831659525, "grad_norm": 0.2825568616390228, "learning_rate": 9.99263299359788e-05, "loss": 1.5012, "step": 42140 }, { "epoch": 2.251243924331465, "grad_norm": 0.4288165867328644, "learning_rate": 9.992542124983531e-05, "loss": 1.5053, "step": 42150 }, { "epoch": 2.251785865496977, "grad_norm": 0.22682829201221466, "learning_rate": 9.992450699852571e-05, "loss": 1.501, "step": 42160 }, { "epoch": 2.252327806662489, "grad_norm": 0.409411758184433, "learning_rate": 9.992358718216321e-05, "loss": 1.5048, "step": 42170 }, { "epoch": 2.2528697478280013, "grad_norm": 0.6447529792785645, "learning_rate": 9.992266180086178e-05, "loss": 1.5141, "step": 42180 }, { "epoch": 2.2534116889935136, "grad_norm": 0.35520994663238525, "learning_rate": 9.992173085473603e-05, "loss": 1.5043, "step": 42190 }, { "epoch": 2.2536826595762696, "eval_loss": 2.463776111602783, "eval_runtime": 21.9731, "eval_samples_per_second": 227.551, "eval_steps_per_second": 1.229, "step": 42195 }, { "epoch": 2.253953630159026, "grad_norm": 0.24560990929603577, "learning_rate": 9.992079434390129e-05, "loss": 1.5081, "step": 42200 }, { "epoch": 2.2544955713245383, "grad_norm": 0.41237837076187134, "learning_rate": 9.991985226847358e-05, "loss": 1.5058, "step": 42210 }, { "epoch": 2.25503751249005, "grad_norm": 0.2179434448480606, "learning_rate": 9.99189046285696e-05, "loss": 1.5061, "step": 42220 }, { "epoch": 2.2555794536555624, "grad_norm": 0.2224990427494049, "learning_rate": 9.991795142430672e-05, "loss": 1.4924, "step": 42230 }, { "epoch": 2.2561213948210748, "grad_norm": 0.3237124979496002, "learning_rate": 9.991699265580304e-05, "loss": 1.5058, "step": 42240 }, { "epoch": 2.256663335986587, "grad_norm": 0.2218666821718216, "learning_rate": 9.991602832317731e-05, "loss": 1.4995, "step": 42250 }, { "epoch": 2.2572052771520994, "grad_norm": 0.38904523849487305, "learning_rate": 9.991505842654901e-05, "loss": 1.503, "step": 42260 }, { "epoch": 2.2577472183176113, "grad_norm": 0.341505765914917, "learning_rate": 9.991408296603827e-05, "loss": 1.5107, "step": 42270 }, { "epoch": 2.2582891594831236, "grad_norm": 0.242193341255188, "learning_rate": 9.991310194176593e-05, "loss": 1.495, "step": 42280 }, { "epoch": 2.258397547716226, "eval_loss": 2.4719505310058594, "eval_runtime": 21.9979, "eval_samples_per_second": 227.294, "eval_steps_per_second": 1.227, "step": 42282 }, { "epoch": 2.258831100648636, "grad_norm": 0.26852932572364807, "learning_rate": 9.991211535385352e-05, "loss": 1.504, "step": 42290 }, { "epoch": 2.259373041814148, "grad_norm": 0.32731330394744873, "learning_rate": 9.991112320242322e-05, "loss": 1.4927, "step": 42300 }, { "epoch": 2.25991498297966, "grad_norm": 0.30170223116874695, "learning_rate": 9.991012548759801e-05, "loss": 1.5, "step": 42310 }, { "epoch": 2.2604569241451724, "grad_norm": 0.2903915345668793, "learning_rate": 9.990912220950141e-05, "loss": 1.4963, "step": 42320 }, { "epoch": 2.2609988653106847, "grad_norm": 0.2501979470252991, "learning_rate": 9.990811336825771e-05, "loss": 1.5035, "step": 42330 }, { "epoch": 2.261540806476197, "grad_norm": 0.3588942289352417, "learning_rate": 9.990709896399191e-05, "loss": 1.506, "step": 42340 }, { "epoch": 2.262082747641709, "grad_norm": 0.2870718240737915, "learning_rate": 9.990607899682967e-05, "loss": 1.4929, "step": 42350 }, { "epoch": 2.2626246888072212, "grad_norm": 0.2925523817539215, "learning_rate": 9.990505346689732e-05, "loss": 1.5084, "step": 42360 }, { "epoch": 2.2631124358561823, "eval_loss": 2.4687459468841553, "eval_runtime": 22.2759, "eval_samples_per_second": 224.458, "eval_steps_per_second": 1.212, "step": 42369 }, { "epoch": 2.2631666299727335, "grad_norm": 0.28121358156204224, "learning_rate": 9.99040223743219e-05, "loss": 1.5021, "step": 42370 }, { "epoch": 2.263708571138246, "grad_norm": 0.40164533257484436, "learning_rate": 9.990298571923114e-05, "loss": 1.4933, "step": 42380 }, { "epoch": 2.264250512303758, "grad_norm": 0.1876879632472992, "learning_rate": 9.990194350175346e-05, "loss": 1.5118, "step": 42390 }, { "epoch": 2.26479245346927, "grad_norm": 0.3837462067604065, "learning_rate": 9.990089572201796e-05, "loss": 1.4987, "step": 42400 }, { "epoch": 2.2653343946347824, "grad_norm": 0.3707353472709656, "learning_rate": 9.989984238015445e-05, "loss": 1.5162, "step": 42410 }, { "epoch": 2.2658763358002947, "grad_norm": 0.5873448252677917, "learning_rate": 9.98987834762934e-05, "loss": 1.4921, "step": 42420 }, { "epoch": 2.266418276965807, "grad_norm": 0.43937060236930847, "learning_rate": 9.989771901056598e-05, "loss": 1.5083, "step": 42430 }, { "epoch": 2.2669602181313193, "grad_norm": 0.3484452962875366, "learning_rate": 9.989664898310405e-05, "loss": 1.5054, "step": 42440 }, { "epoch": 2.267502159296831, "grad_norm": 0.3318488597869873, "learning_rate": 9.98955733940402e-05, "loss": 1.4976, "step": 42450 }, { "epoch": 2.267827323996139, "eval_loss": 2.4685611724853516, "eval_runtime": 22.0052, "eval_samples_per_second": 227.219, "eval_steps_per_second": 1.227, "step": 42456 }, { "epoch": 2.2680441004623435, "grad_norm": 0.3194591999053955, "learning_rate": 9.989449224350758e-05, "loss": 1.504, "step": 42460 }, { "epoch": 2.268586041627856, "grad_norm": 0.2176416665315628, "learning_rate": 9.989340553164021e-05, "loss": 1.5006, "step": 42470 }, { "epoch": 2.269127982793368, "grad_norm": 0.2666696608066559, "learning_rate": 9.989231325857266e-05, "loss": 1.5008, "step": 42480 }, { "epoch": 2.2696699239588805, "grad_norm": 0.5212827324867249, "learning_rate": 9.989121542444023e-05, "loss": 1.5013, "step": 42490 }, { "epoch": 2.2702118651243923, "grad_norm": 0.33473333716392517, "learning_rate": 9.989011202937896e-05, "loss": 1.5156, "step": 42500 }, { "epoch": 2.2707538062899046, "grad_norm": 0.46315068006515503, "learning_rate": 9.988900307352549e-05, "loss": 1.4865, "step": 42510 }, { "epoch": 2.271295747455417, "grad_norm": 0.3576599657535553, "learning_rate": 9.98878885570172e-05, "loss": 1.5077, "step": 42520 }, { "epoch": 2.2718376886209293, "grad_norm": 0.22602027654647827, "learning_rate": 9.988676847999218e-05, "loss": 1.5185, "step": 42530 }, { "epoch": 2.272379629786441, "grad_norm": 0.2150840014219284, "learning_rate": 9.988564284258916e-05, "loss": 1.5062, "step": 42540 }, { "epoch": 2.272542212136095, "eval_loss": 2.4602785110473633, "eval_runtime": 21.997, "eval_samples_per_second": 227.304, "eval_steps_per_second": 1.227, "step": 42543 }, { "epoch": 2.2729215709519535, "grad_norm": 0.3397773206233978, "learning_rate": 9.988451164494757e-05, "loss": 1.5026, "step": 42550 }, { "epoch": 2.273463512117466, "grad_norm": 0.213679701089859, "learning_rate": 9.988337488720753e-05, "loss": 1.5014, "step": 42560 }, { "epoch": 2.274005453282978, "grad_norm": 0.21069695055484772, "learning_rate": 9.98822325695099e-05, "loss": 1.5098, "step": 42570 }, { "epoch": 2.27454739444849, "grad_norm": 0.38944995403289795, "learning_rate": 9.988108469199613e-05, "loss": 1.496, "step": 42580 }, { "epoch": 2.2750893356140023, "grad_norm": 0.36275213956832886, "learning_rate": 9.987993125480848e-05, "loss": 1.5009, "step": 42590 }, { "epoch": 2.2756312767795146, "grad_norm": 0.27559274435043335, "learning_rate": 9.987877225808976e-05, "loss": 1.4994, "step": 42600 }, { "epoch": 2.276173217945027, "grad_norm": 0.365401029586792, "learning_rate": 9.987760770198359e-05, "loss": 1.5061, "step": 42610 }, { "epoch": 2.2767151591105392, "grad_norm": 0.29308363795280457, "learning_rate": 9.987643758663422e-05, "loss": 1.4963, "step": 42620 }, { "epoch": 2.277257100276051, "grad_norm": 0.43470004200935364, "learning_rate": 9.98752619121866e-05, "loss": 1.496, "step": 42630 }, { "epoch": 2.277257100276051, "eval_loss": 2.4641952514648438, "eval_runtime": 22.0028, "eval_samples_per_second": 227.243, "eval_steps_per_second": 1.227, "step": 42630 }, { "epoch": 2.2777990414415634, "grad_norm": 0.21174076199531555, "learning_rate": 9.987408067878636e-05, "loss": 1.5095, "step": 42640 }, { "epoch": 2.2783409826070757, "grad_norm": 0.4398151934146881, "learning_rate": 9.987289388657982e-05, "loss": 1.4995, "step": 42650 }, { "epoch": 2.278882923772588, "grad_norm": 0.41599544882774353, "learning_rate": 9.987170153571402e-05, "loss": 1.4949, "step": 42660 }, { "epoch": 2.2794248649381004, "grad_norm": 0.34385424852371216, "learning_rate": 9.987050362633663e-05, "loss": 1.5031, "step": 42670 }, { "epoch": 2.2799668061036122, "grad_norm": 0.242776557803154, "learning_rate": 9.986930015859607e-05, "loss": 1.4902, "step": 42680 }, { "epoch": 2.2805087472691246, "grad_norm": 0.4597446322441101, "learning_rate": 9.986809113264142e-05, "loss": 1.5039, "step": 42690 }, { "epoch": 2.281050688434637, "grad_norm": 0.25635120272636414, "learning_rate": 9.986687654862242e-05, "loss": 1.5071, "step": 42700 }, { "epoch": 2.281592629600149, "grad_norm": 0.23346717655658722, "learning_rate": 9.986565640668955e-05, "loss": 1.5025, "step": 42710 }, { "epoch": 2.2819719884160077, "eval_loss": 2.4699313640594482, "eval_runtime": 22.0353, "eval_samples_per_second": 226.908, "eval_steps_per_second": 1.225, "step": 42717 }, { "epoch": 2.282134570765661, "grad_norm": 0.4288314878940582, "learning_rate": 9.986443070699393e-05, "loss": 1.5069, "step": 42720 }, { "epoch": 2.2826765119311734, "grad_norm": 0.3048160672187805, "learning_rate": 9.986319944968745e-05, "loss": 1.5084, "step": 42730 }, { "epoch": 2.2832184530966857, "grad_norm": 0.2685145139694214, "learning_rate": 9.986196263492259e-05, "loss": 1.5078, "step": 42740 }, { "epoch": 2.283760394262198, "grad_norm": 0.36465343832969666, "learning_rate": 9.986072026285257e-05, "loss": 1.4972, "step": 42750 }, { "epoch": 2.28430233542771, "grad_norm": 0.27710703015327454, "learning_rate": 9.985947233363128e-05, "loss": 1.4929, "step": 42760 }, { "epoch": 2.284844276593222, "grad_norm": 0.5926934480667114, "learning_rate": 9.985821884741333e-05, "loss": 1.5015, "step": 42770 }, { "epoch": 2.2853862177587345, "grad_norm": 0.34573689103126526, "learning_rate": 9.985695980435398e-05, "loss": 1.5047, "step": 42780 }, { "epoch": 2.285928158924247, "grad_norm": 0.33093443512916565, "learning_rate": 9.985569520460918e-05, "loss": 1.4997, "step": 42790 }, { "epoch": 2.286470100089759, "grad_norm": 0.5260155200958252, "learning_rate": 9.985442504833562e-05, "loss": 1.5034, "step": 42800 }, { "epoch": 2.286686876555964, "eval_loss": 2.4743220806121826, "eval_runtime": 22.0148, "eval_samples_per_second": 227.12, "eval_steps_per_second": 1.226, "step": 42804 }, { "epoch": 2.287012041255271, "grad_norm": 0.22900910675525665, "learning_rate": 9.985314933569063e-05, "loss": 1.5057, "step": 42810 }, { "epoch": 2.2875539824207833, "grad_norm": 0.47526514530181885, "learning_rate": 9.985186806683222e-05, "loss": 1.4987, "step": 42820 }, { "epoch": 2.2880959235862957, "grad_norm": 0.2705422043800354, "learning_rate": 9.985058124191914e-05, "loss": 1.5031, "step": 42830 }, { "epoch": 2.288637864751808, "grad_norm": 0.21489684283733368, "learning_rate": 9.984928886111076e-05, "loss": 1.5001, "step": 42840 }, { "epoch": 2.2891798059173203, "grad_norm": 0.44066211581230164, "learning_rate": 9.98479909245672e-05, "loss": 1.5006, "step": 42850 }, { "epoch": 2.289721747082832, "grad_norm": 0.4437292516231537, "learning_rate": 9.984668743244921e-05, "loss": 1.5121, "step": 42860 }, { "epoch": 2.2902636882483445, "grad_norm": 0.41992589831352234, "learning_rate": 9.984537838491833e-05, "loss": 1.4998, "step": 42870 }, { "epoch": 2.290805629413857, "grad_norm": 0.26407718658447266, "learning_rate": 9.984406378213664e-05, "loss": 1.5009, "step": 42880 }, { "epoch": 2.291347570579369, "grad_norm": 0.35871899127960205, "learning_rate": 9.984274362426703e-05, "loss": 1.5003, "step": 42890 }, { "epoch": 2.2914017646959204, "eval_loss": 2.4577038288116455, "eval_runtime": 22.0057, "eval_samples_per_second": 227.214, "eval_steps_per_second": 1.227, "step": 42891 }, { "epoch": 2.2918895117448814, "grad_norm": 0.38924890756607056, "learning_rate": 9.984141791147303e-05, "loss": 1.5098, "step": 42900 }, { "epoch": 2.2924314529103933, "grad_norm": 0.46913039684295654, "learning_rate": 9.984008664391888e-05, "loss": 1.5006, "step": 42910 }, { "epoch": 2.2929733940759056, "grad_norm": 0.6839142441749573, "learning_rate": 9.983874982176944e-05, "loss": 1.5011, "step": 42920 }, { "epoch": 2.293515335241418, "grad_norm": 0.37098947167396545, "learning_rate": 9.983740744519037e-05, "loss": 1.5078, "step": 42930 }, { "epoch": 2.2940572764069302, "grad_norm": 0.4213247001171112, "learning_rate": 9.983605951434792e-05, "loss": 1.4979, "step": 42940 }, { "epoch": 2.294599217572442, "grad_norm": 0.37451714277267456, "learning_rate": 9.983470602940907e-05, "loss": 1.5055, "step": 42950 }, { "epoch": 2.2951411587379544, "grad_norm": 0.3335314989089966, "learning_rate": 9.983334699054149e-05, "loss": 1.4985, "step": 42960 }, { "epoch": 2.2956830999034668, "grad_norm": 0.5755605697631836, "learning_rate": 9.983198239791353e-05, "loss": 1.5072, "step": 42970 }, { "epoch": 2.2961166528358765, "eval_loss": 2.46994686126709, "eval_runtime": 22.0001, "eval_samples_per_second": 227.272, "eval_steps_per_second": 1.227, "step": 42978 }, { "epoch": 2.296225041068979, "grad_norm": 0.4055348038673401, "learning_rate": 9.983061225169423e-05, "loss": 1.514, "step": 42980 }, { "epoch": 2.296766982234491, "grad_norm": 0.41009873151779175, "learning_rate": 9.982923655205335e-05, "loss": 1.4959, "step": 42990 }, { "epoch": 2.2973089234000033, "grad_norm": 0.30407077074050903, "learning_rate": 9.982785529916124e-05, "loss": 1.4958, "step": 43000 }, { "epoch": 2.2978508645655156, "grad_norm": 0.3343801498413086, "learning_rate": 9.982646849318906e-05, "loss": 1.5003, "step": 43010 }, { "epoch": 2.298392805731028, "grad_norm": 0.4572197496891022, "learning_rate": 9.982507613430856e-05, "loss": 1.4998, "step": 43020 }, { "epoch": 2.29893474689654, "grad_norm": 0.3537229895591736, "learning_rate": 9.982367822269225e-05, "loss": 1.5059, "step": 43030 }, { "epoch": 2.299476688062052, "grad_norm": 0.4736888110637665, "learning_rate": 9.982227475851328e-05, "loss": 1.4988, "step": 43040 }, { "epoch": 2.3000186292275644, "grad_norm": 0.32492560148239136, "learning_rate": 9.982086574194552e-05, "loss": 1.5023, "step": 43050 }, { "epoch": 2.3005605703930767, "grad_norm": 0.19111861288547516, "learning_rate": 9.98194511731635e-05, "loss": 1.5075, "step": 43060 }, { "epoch": 2.3008315409758326, "eval_loss": 2.4626097679138184, "eval_runtime": 22.0143, "eval_samples_per_second": 227.125, "eval_steps_per_second": 1.226, "step": 43065 }, { "epoch": 2.301102511558589, "grad_norm": 0.416725754737854, "learning_rate": 9.981803105234246e-05, "loss": 1.5009, "step": 43070 }, { "epoch": 2.3016444527241013, "grad_norm": 0.5482866168022156, "learning_rate": 9.981660537965833e-05, "loss": 1.5044, "step": 43080 }, { "epoch": 2.302186393889613, "grad_norm": 0.22884666919708252, "learning_rate": 9.981517415528766e-05, "loss": 1.4965, "step": 43090 }, { "epoch": 2.3027283350551255, "grad_norm": 0.3131319284439087, "learning_rate": 9.98137373794078e-05, "loss": 1.5089, "step": 43100 }, { "epoch": 2.303270276220638, "grad_norm": 0.22370652854442596, "learning_rate": 9.981229505219673e-05, "loss": 1.5051, "step": 43110 }, { "epoch": 2.30381221738615, "grad_norm": 0.2758335471153259, "learning_rate": 9.98108471738331e-05, "loss": 1.5001, "step": 43120 }, { "epoch": 2.3043541585516625, "grad_norm": 0.309836208820343, "learning_rate": 9.980939374449627e-05, "loss": 1.5022, "step": 43130 }, { "epoch": 2.3048960997171744, "grad_norm": 0.2783421576023102, "learning_rate": 9.980793476436628e-05, "loss": 1.4996, "step": 43140 }, { "epoch": 2.3054380408826867, "grad_norm": 0.26890337467193604, "learning_rate": 9.980647023362388e-05, "loss": 1.5025, "step": 43150 }, { "epoch": 2.305546429115789, "eval_loss": 2.4663755893707275, "eval_runtime": 21.9786, "eval_samples_per_second": 227.494, "eval_steps_per_second": 1.228, "step": 43152 }, { "epoch": 2.305979982048199, "grad_norm": 0.26960644125938416, "learning_rate": 9.980500015245045e-05, "loss": 1.5067, "step": 43160 }, { "epoch": 2.3065219232137113, "grad_norm": 0.32048168778419495, "learning_rate": 9.980352452102815e-05, "loss": 1.4944, "step": 43170 }, { "epoch": 2.307063864379223, "grad_norm": 0.36742621660232544, "learning_rate": 9.980204333953974e-05, "loss": 1.4997, "step": 43180 }, { "epoch": 2.3076058055447355, "grad_norm": 0.38924074172973633, "learning_rate": 9.980055660816872e-05, "loss": 1.5018, "step": 43190 }, { "epoch": 2.308147746710248, "grad_norm": 0.7526839971542358, "learning_rate": 9.979906432709925e-05, "loss": 1.5036, "step": 43200 }, { "epoch": 2.30868968787576, "grad_norm": 0.49918413162231445, "learning_rate": 9.97975664965162e-05, "loss": 1.5053, "step": 43210 }, { "epoch": 2.309231629041272, "grad_norm": 0.21481403708457947, "learning_rate": 9.979606311660506e-05, "loss": 1.5091, "step": 43220 }, { "epoch": 2.3097735702067843, "grad_norm": 0.26321297883987427, "learning_rate": 9.979455418755215e-05, "loss": 1.4983, "step": 43230 }, { "epoch": 2.3102613172557454, "eval_loss": 2.472028970718384, "eval_runtime": 21.9804, "eval_samples_per_second": 227.476, "eval_steps_per_second": 1.228, "step": 43239 }, { "epoch": 2.3103155113722966, "grad_norm": 0.2163982391357422, "learning_rate": 9.979303970954434e-05, "loss": 1.5075, "step": 43240 }, { "epoch": 2.310857452537809, "grad_norm": 0.2770186960697174, "learning_rate": 9.979151968276922e-05, "loss": 1.5033, "step": 43250 }, { "epoch": 2.3113993937033213, "grad_norm": 0.3074730932712555, "learning_rate": 9.978999410741514e-05, "loss": 1.4884, "step": 43260 }, { "epoch": 2.311941334868833, "grad_norm": 0.30169418454170227, "learning_rate": 9.978846298367103e-05, "loss": 1.4956, "step": 43270 }, { "epoch": 2.3124832760343454, "grad_norm": 0.23095916211605072, "learning_rate": 9.978692631172657e-05, "loss": 1.4884, "step": 43280 }, { "epoch": 2.3130252171998578, "grad_norm": 0.28143152594566345, "learning_rate": 9.978538409177212e-05, "loss": 1.494, "step": 43290 }, { "epoch": 2.31356715836537, "grad_norm": 0.3327721953392029, "learning_rate": 9.978383632399876e-05, "loss": 1.5017, "step": 43300 }, { "epoch": 2.3141090995308824, "grad_norm": 0.2959439754486084, "learning_rate": 9.978228300859817e-05, "loss": 1.4953, "step": 43310 }, { "epoch": 2.3146510406963943, "grad_norm": 0.3569033741950989, "learning_rate": 9.978072414576277e-05, "loss": 1.509, "step": 43320 }, { "epoch": 2.314976205395702, "eval_loss": 2.471214771270752, "eval_runtime": 21.9774, "eval_samples_per_second": 227.506, "eval_steps_per_second": 1.229, "step": 43326 }, { "epoch": 2.3151929818619066, "grad_norm": 0.37210530042648315, "learning_rate": 9.97791597356857e-05, "loss": 1.4907, "step": 43330 }, { "epoch": 2.315734923027419, "grad_norm": 0.22837452590465546, "learning_rate": 9.977758977856074e-05, "loss": 1.494, "step": 43340 }, { "epoch": 2.316276864192931, "grad_norm": 0.351477712392807, "learning_rate": 9.977601427458235e-05, "loss": 1.4953, "step": 43350 }, { "epoch": 2.316818805358443, "grad_norm": 0.4676024913787842, "learning_rate": 9.97744332239457e-05, "loss": 1.5027, "step": 43360 }, { "epoch": 2.3173607465239554, "grad_norm": 0.3546915054321289, "learning_rate": 9.977284662684668e-05, "loss": 1.509, "step": 43370 }, { "epoch": 2.3179026876894677, "grad_norm": 0.45756760239601135, "learning_rate": 9.977125448348178e-05, "loss": 1.4956, "step": 43380 }, { "epoch": 2.31844462885498, "grad_norm": 0.44374728202819824, "learning_rate": 9.976965679404827e-05, "loss": 1.4851, "step": 43390 }, { "epoch": 2.318986570020492, "grad_norm": 0.2877369225025177, "learning_rate": 9.976805355874404e-05, "loss": 1.4979, "step": 43400 }, { "epoch": 2.3195285111860042, "grad_norm": 0.4321652352809906, "learning_rate": 9.976644477776768e-05, "loss": 1.5, "step": 43410 }, { "epoch": 2.319691093535658, "eval_loss": 2.4682838916778564, "eval_runtime": 21.9705, "eval_samples_per_second": 227.578, "eval_steps_per_second": 1.229, "step": 43413 }, { "epoch": 2.3200704523515165, "grad_norm": 0.3509621024131775, "learning_rate": 9.976483045131853e-05, "loss": 1.4899, "step": 43420 }, { "epoch": 2.320612393517029, "grad_norm": 0.2721761167049408, "learning_rate": 9.976321057959651e-05, "loss": 1.4967, "step": 43430 }, { "epoch": 2.321154334682541, "grad_norm": 0.27356627583503723, "learning_rate": 9.976158516280231e-05, "loss": 1.5021, "step": 43440 }, { "epoch": 2.321696275848053, "grad_norm": 0.30389806628227234, "learning_rate": 9.975995420113729e-05, "loss": 1.4943, "step": 43450 }, { "epoch": 2.3222382170135654, "grad_norm": 0.35005050897598267, "learning_rate": 9.975831769480345e-05, "loss": 1.5019, "step": 43460 }, { "epoch": 2.3227801581790777, "grad_norm": 0.19983713328838348, "learning_rate": 9.975667564400355e-05, "loss": 1.5049, "step": 43470 }, { "epoch": 2.32332209934459, "grad_norm": 0.27769649028778076, "learning_rate": 9.975502804894097e-05, "loss": 1.4946, "step": 43480 }, { "epoch": 2.3238640405101023, "grad_norm": 0.20963497459888458, "learning_rate": 9.975337490981984e-05, "loss": 1.4937, "step": 43490 }, { "epoch": 2.324405981675614, "grad_norm": 0.21975292265415192, "learning_rate": 9.975171622684492e-05, "loss": 1.4993, "step": 43500 }, { "epoch": 2.324405981675614, "eval_loss": 2.477883815765381, "eval_runtime": 21.9663, "eval_samples_per_second": 227.622, "eval_steps_per_second": 1.229, "step": 43500 }, { "epoch": 2.3249479228411265, "grad_norm": 0.3812916874885559, "learning_rate": 9.97500520002217e-05, "loss": 1.5059, "step": 43510 }, { "epoch": 2.325489864006639, "grad_norm": 0.29676496982574463, "learning_rate": 9.974838223015631e-05, "loss": 1.4986, "step": 43520 }, { "epoch": 2.326031805172151, "grad_norm": 0.3537111282348633, "learning_rate": 9.97467069168556e-05, "loss": 1.4903, "step": 43530 }, { "epoch": 2.3265737463376635, "grad_norm": 0.2975006699562073, "learning_rate": 9.974502606052711e-05, "loss": 1.4985, "step": 43540 }, { "epoch": 2.3271156875031753, "grad_norm": 0.42832067608833313, "learning_rate": 9.974333966137907e-05, "loss": 1.4866, "step": 43550 }, { "epoch": 2.3276576286686876, "grad_norm": 0.29303500056266785, "learning_rate": 9.974164771962035e-05, "loss": 1.496, "step": 43560 }, { "epoch": 2.3281995698342, "grad_norm": 0.330950528383255, "learning_rate": 9.973995023546055e-05, "loss": 1.4929, "step": 43570 }, { "epoch": 2.3287415109997123, "grad_norm": 0.20558315515518188, "learning_rate": 9.973824720911e-05, "loss": 1.5115, "step": 43580 }, { "epoch": 2.3291208698155708, "eval_loss": 2.4667203426361084, "eval_runtime": 21.9785, "eval_samples_per_second": 227.495, "eval_steps_per_second": 1.228, "step": 43587 }, { "epoch": 2.329283452165224, "grad_norm": 0.36295872926712036, "learning_rate": 9.973653864077958e-05, "loss": 1.4857, "step": 43590 }, { "epoch": 2.3298253933307365, "grad_norm": 0.6293714046478271, "learning_rate": 9.973482453068099e-05, "loss": 1.497, "step": 43600 }, { "epoch": 2.3303673344962488, "grad_norm": 0.4836002290248871, "learning_rate": 9.973310487902657e-05, "loss": 1.5107, "step": 43610 }, { "epoch": 2.330909275661761, "grad_norm": 0.3348879814147949, "learning_rate": 9.973137968602932e-05, "loss": 1.4975, "step": 43620 }, { "epoch": 2.331451216827273, "grad_norm": 0.24856191873550415, "learning_rate": 9.972964895190295e-05, "loss": 1.4994, "step": 43630 }, { "epoch": 2.3319931579927853, "grad_norm": 0.2245061695575714, "learning_rate": 9.972791267686188e-05, "loss": 1.494, "step": 43640 }, { "epoch": 2.3325350991582976, "grad_norm": 0.2887219786643982, "learning_rate": 9.972617086112116e-05, "loss": 1.5022, "step": 43650 }, { "epoch": 2.33307704032381, "grad_norm": 0.5362899303436279, "learning_rate": 9.97244235048966e-05, "loss": 1.5091, "step": 43660 }, { "epoch": 2.3336189814893222, "grad_norm": 0.4345687925815582, "learning_rate": 9.972267060840461e-05, "loss": 1.5024, "step": 43670 }, { "epoch": 2.333835757955527, "eval_loss": 2.4626216888427734, "eval_runtime": 21.9768, "eval_samples_per_second": 227.513, "eval_steps_per_second": 1.229, "step": 43674 }, { "epoch": 2.334160922654834, "grad_norm": 0.42070385813713074, "learning_rate": 9.972091217186236e-05, "loss": 1.5046, "step": 43680 }, { "epoch": 2.3347028638203464, "grad_norm": 0.5176911950111389, "learning_rate": 9.971914819548766e-05, "loss": 1.5029, "step": 43690 }, { "epoch": 2.3352448049858587, "grad_norm": 0.3545333743095398, "learning_rate": 9.971737867949903e-05, "loss": 1.4942, "step": 43700 }, { "epoch": 2.335786746151371, "grad_norm": 0.2512124180793762, "learning_rate": 9.971560362411569e-05, "loss": 1.5021, "step": 43710 }, { "epoch": 2.3363286873168834, "grad_norm": 0.32333534955978394, "learning_rate": 9.971382302955748e-05, "loss": 1.4935, "step": 43720 }, { "epoch": 2.3368706284823952, "grad_norm": 0.431264728307724, "learning_rate": 9.971203689604504e-05, "loss": 1.5011, "step": 43730 }, { "epoch": 2.3374125696479076, "grad_norm": 0.26169353723526, "learning_rate": 9.971024522379957e-05, "loss": 1.5027, "step": 43740 }, { "epoch": 2.33795451081342, "grad_norm": 0.44165289402008057, "learning_rate": 9.970844801304303e-05, "loss": 1.4827, "step": 43750 }, { "epoch": 2.338496451978932, "grad_norm": 0.3183143734931946, "learning_rate": 9.970664526399806e-05, "loss": 1.4988, "step": 43760 }, { "epoch": 2.3385506460954835, "eval_loss": 2.4591445922851562, "eval_runtime": 21.9816, "eval_samples_per_second": 227.463, "eval_steps_per_second": 1.228, "step": 43761 }, { "epoch": 2.3390383931444445, "grad_norm": 0.22885079681873322, "learning_rate": 9.970483697688798e-05, "loss": 1.4994, "step": 43770 }, { "epoch": 2.3395803343099564, "grad_norm": 0.20680442452430725, "learning_rate": 9.970302315193677e-05, "loss": 1.4886, "step": 43780 }, { "epoch": 2.3401222754754687, "grad_norm": 0.28351494669914246, "learning_rate": 9.970120378936914e-05, "loss": 1.4918, "step": 43790 }, { "epoch": 2.340664216640981, "grad_norm": 0.3337201774120331, "learning_rate": 9.969937888941046e-05, "loss": 1.4897, "step": 43800 }, { "epoch": 2.3412061578064933, "grad_norm": 0.37120136618614197, "learning_rate": 9.969754845228676e-05, "loss": 1.4988, "step": 43810 }, { "epoch": 2.341748098972005, "grad_norm": 0.2082194685935974, "learning_rate": 9.969571247822486e-05, "loss": 1.5011, "step": 43820 }, { "epoch": 2.3422900401375175, "grad_norm": 0.3116413652896881, "learning_rate": 9.969387096745211e-05, "loss": 1.4943, "step": 43830 }, { "epoch": 2.34283198130303, "grad_norm": 0.18814930319786072, "learning_rate": 9.969202392019668e-05, "loss": 1.5032, "step": 43840 }, { "epoch": 2.3432655342354396, "eval_loss": 2.4612669944763184, "eval_runtime": 21.9782, "eval_samples_per_second": 227.498, "eval_steps_per_second": 1.228, "step": 43848 }, { "epoch": 2.343373922468542, "grad_norm": 0.19654281437397003, "learning_rate": 9.969017133668738e-05, "loss": 1.4942, "step": 43850 }, { "epoch": 2.343915863634054, "grad_norm": 0.49275675415992737, "learning_rate": 9.968831321715365e-05, "loss": 1.4915, "step": 43860 }, { "epoch": 2.3444578047995663, "grad_norm": 0.26238903403282166, "learning_rate": 9.968644956182572e-05, "loss": 1.5072, "step": 43870 }, { "epoch": 2.3449997459650787, "grad_norm": 0.43797948956489563, "learning_rate": 9.968458037093442e-05, "loss": 1.4991, "step": 43880 }, { "epoch": 2.345541687130591, "grad_norm": 0.34569740295410156, "learning_rate": 9.968270564471131e-05, "loss": 1.5006, "step": 43890 }, { "epoch": 2.3460836282961033, "grad_norm": 0.3247714340686798, "learning_rate": 9.96808253833886e-05, "loss": 1.5022, "step": 43900 }, { "epoch": 2.346625569461615, "grad_norm": 0.3044470250606537, "learning_rate": 9.967893958719924e-05, "loss": 1.4978, "step": 43910 }, { "epoch": 2.3471675106271275, "grad_norm": 0.26124030351638794, "learning_rate": 9.967704825637681e-05, "loss": 1.5, "step": 43920 }, { "epoch": 2.34770945179264, "grad_norm": 0.2891956567764282, "learning_rate": 9.967515139115562e-05, "loss": 1.5099, "step": 43930 }, { "epoch": 2.3479804223753957, "eval_loss": 2.4739420413970947, "eval_runtime": 21.9776, "eval_samples_per_second": 227.505, "eval_steps_per_second": 1.229, "step": 43935 }, { "epoch": 2.348251392958152, "grad_norm": 0.3045171797275543, "learning_rate": 9.967324899177062e-05, "loss": 1.4922, "step": 43940 }, { "epoch": 2.3487933341236644, "grad_norm": 0.26287245750427246, "learning_rate": 9.96713410584575e-05, "loss": 1.5083, "step": 43950 }, { "epoch": 2.3493352752891763, "grad_norm": 0.37551262974739075, "learning_rate": 9.96694275914526e-05, "loss": 1.4992, "step": 43960 }, { "epoch": 2.3498772164546886, "grad_norm": 0.4330720901489258, "learning_rate": 9.966750859099294e-05, "loss": 1.4883, "step": 43970 }, { "epoch": 2.350419157620201, "grad_norm": 0.21787714958190918, "learning_rate": 9.966558405731624e-05, "loss": 1.4937, "step": 43980 }, { "epoch": 2.3509610987857132, "grad_norm": 0.28836148977279663, "learning_rate": 9.96636539906609e-05, "loss": 1.4988, "step": 43990 }, { "epoch": 2.351503039951225, "grad_norm": 0.29318341612815857, "learning_rate": 9.966171839126601e-05, "loss": 1.5102, "step": 44000 }, { "epoch": 2.3520449811167374, "grad_norm": 0.5823042988777161, "learning_rate": 9.965977725937138e-05, "loss": 1.4957, "step": 44010 }, { "epoch": 2.3525869222822497, "grad_norm": 0.5775142908096313, "learning_rate": 9.96578305952174e-05, "loss": 1.4923, "step": 44020 }, { "epoch": 2.3526953105153523, "eval_loss": 2.4744277000427246, "eval_runtime": 21.978, "eval_samples_per_second": 227.5, "eval_steps_per_second": 1.228, "step": 44022 }, { "epoch": 2.353128863447762, "grad_norm": 0.6233053803443909, "learning_rate": 9.965587839904527e-05, "loss": 1.507, "step": 44030 }, { "epoch": 2.353670804613274, "grad_norm": 0.5592018365859985, "learning_rate": 9.965392067109679e-05, "loss": 1.4945, "step": 44040 }, { "epoch": 2.3542127457787863, "grad_norm": 0.3838133215904236, "learning_rate": 9.965195741161449e-05, "loss": 1.4825, "step": 44050 }, { "epoch": 2.3547546869442986, "grad_norm": 0.2884215712547302, "learning_rate": 9.964998862084157e-05, "loss": 1.4858, "step": 44060 }, { "epoch": 2.355296628109811, "grad_norm": 0.22418469190597534, "learning_rate": 9.96480142990219e-05, "loss": 1.493, "step": 44070 }, { "epoch": 2.355838569275323, "grad_norm": 0.34609487652778625, "learning_rate": 9.964603444640007e-05, "loss": 1.5015, "step": 44080 }, { "epoch": 2.356380510440835, "grad_norm": 0.2725040912628174, "learning_rate": 9.964404906322132e-05, "loss": 1.4937, "step": 44090 }, { "epoch": 2.3569224516063474, "grad_norm": 0.6061669588088989, "learning_rate": 9.96420581497316e-05, "loss": 1.5041, "step": 44100 }, { "epoch": 2.3574101986553084, "eval_loss": 2.4810047149658203, "eval_runtime": 21.9811, "eval_samples_per_second": 227.468, "eval_steps_per_second": 1.228, "step": 44109 }, { "epoch": 2.3574643927718597, "grad_norm": 0.509492814540863, "learning_rate": 9.964006170617754e-05, "loss": 1.5008, "step": 44110 }, { "epoch": 2.358006333937372, "grad_norm": 0.4258333146572113, "learning_rate": 9.963805973280643e-05, "loss": 1.4977, "step": 44120 }, { "epoch": 2.3585482751028843, "grad_norm": 0.27747806906700134, "learning_rate": 9.96360522298663e-05, "loss": 1.4924, "step": 44130 }, { "epoch": 2.359090216268396, "grad_norm": 0.339024156332016, "learning_rate": 9.963403919760579e-05, "loss": 1.494, "step": 44140 }, { "epoch": 2.3596321574339085, "grad_norm": 0.3902308940887451, "learning_rate": 9.963202063627429e-05, "loss": 1.5009, "step": 44150 }, { "epoch": 2.360174098599421, "grad_norm": 0.20376642048358917, "learning_rate": 9.962999654612185e-05, "loss": 1.5112, "step": 44160 }, { "epoch": 2.360716039764933, "grad_norm": 0.28291308879852295, "learning_rate": 9.96279669273992e-05, "loss": 1.5011, "step": 44170 }, { "epoch": 2.3612579809304455, "grad_norm": 0.3442187011241913, "learning_rate": 9.962593178035776e-05, "loss": 1.5062, "step": 44180 }, { "epoch": 2.3617999220959573, "grad_norm": 0.23570683598518372, "learning_rate": 9.962389110524964e-05, "loss": 1.4994, "step": 44190 }, { "epoch": 2.362125086795265, "eval_loss": 2.45691180229187, "eval_runtime": 21.9778, "eval_samples_per_second": 227.503, "eval_steps_per_second": 1.229, "step": 44196 }, { "epoch": 2.3623418632614697, "grad_norm": 0.38584163784980774, "learning_rate": 9.962184490232763e-05, "loss": 1.5001, "step": 44200 }, { "epoch": 2.362883804426982, "grad_norm": 0.2459559589624405, "learning_rate": 9.96197931718452e-05, "loss": 1.4962, "step": 44210 }, { "epoch": 2.3634257455924943, "grad_norm": 0.217377707362175, "learning_rate": 9.96177359140565e-05, "loss": 1.4944, "step": 44220 }, { "epoch": 2.363967686758006, "grad_norm": 0.5001499652862549, "learning_rate": 9.96156731292164e-05, "loss": 1.4937, "step": 44230 }, { "epoch": 2.3645096279235185, "grad_norm": 0.33417242765426636, "learning_rate": 9.961360481758043e-05, "loss": 1.4935, "step": 44240 }, { "epoch": 2.365051569089031, "grad_norm": 0.4141455590724945, "learning_rate": 9.961153097940475e-05, "loss": 1.5026, "step": 44250 }, { "epoch": 2.365593510254543, "grad_norm": 0.6103190779685974, "learning_rate": 9.960945161494633e-05, "loss": 1.4783, "step": 44260 }, { "epoch": 2.366135451420055, "grad_norm": 0.4958433508872986, "learning_rate": 9.96073667244627e-05, "loss": 1.4915, "step": 44270 }, { "epoch": 2.3666773925855673, "grad_norm": 0.30558812618255615, "learning_rate": 9.960527630821217e-05, "loss": 1.4999, "step": 44280 }, { "epoch": 2.366839974935221, "eval_loss": 2.4537668228149414, "eval_runtime": 21.9779, "eval_samples_per_second": 227.501, "eval_steps_per_second": 1.229, "step": 44283 }, { "epoch": 2.3672193337510796, "grad_norm": 0.36779430508613586, "learning_rate": 9.960318036645363e-05, "loss": 1.4988, "step": 44290 }, { "epoch": 2.367761274916592, "grad_norm": 0.2908129096031189, "learning_rate": 9.960107889944679e-05, "loss": 1.5003, "step": 44300 }, { "epoch": 2.3683032160821043, "grad_norm": 0.5681326389312744, "learning_rate": 9.959897190745192e-05, "loss": 1.4934, "step": 44310 }, { "epoch": 2.368845157247616, "grad_norm": 0.40029388666152954, "learning_rate": 9.959685939073002e-05, "loss": 1.5063, "step": 44320 }, { "epoch": 2.3693870984131284, "grad_norm": 0.42044511437416077, "learning_rate": 9.959474134954283e-05, "loss": 1.4933, "step": 44330 }, { "epoch": 2.3699290395786408, "grad_norm": 0.590247392654419, "learning_rate": 9.959261778415267e-05, "loss": 1.4867, "step": 44340 }, { "epoch": 2.370470980744153, "grad_norm": 0.2779798209667206, "learning_rate": 9.959048869482262e-05, "loss": 1.5081, "step": 44350 }, { "epoch": 2.3710129219096654, "grad_norm": 0.48375606536865234, "learning_rate": 9.958835408181643e-05, "loss": 1.4951, "step": 44360 }, { "epoch": 2.3715548630751773, "grad_norm": 0.251369446516037, "learning_rate": 9.958621394539854e-05, "loss": 1.4872, "step": 44370 }, { "epoch": 2.3715548630751773, "eval_loss": 2.4517953395843506, "eval_runtime": 21.9682, "eval_samples_per_second": 227.602, "eval_steps_per_second": 1.229, "step": 44370 }, { "epoch": 2.3720968042406896, "grad_norm": 0.24449922144412994, "learning_rate": 9.9584068285834e-05, "loss": 1.4872, "step": 44380 }, { "epoch": 2.372638745406202, "grad_norm": 0.33364132046699524, "learning_rate": 9.95819171033887e-05, "loss": 1.4842, "step": 44390 }, { "epoch": 2.373180686571714, "grad_norm": 0.30811458826065063, "learning_rate": 9.957976039832901e-05, "loss": 1.4935, "step": 44400 }, { "epoch": 2.373722627737226, "grad_norm": 0.2445499747991562, "learning_rate": 9.957759817092218e-05, "loss": 1.5017, "step": 44410 }, { "epoch": 2.3742645689027384, "grad_norm": 0.2053236961364746, "learning_rate": 9.957543042143601e-05, "loss": 1.5071, "step": 44420 }, { "epoch": 2.3748065100682507, "grad_norm": 0.4515707492828369, "learning_rate": 9.957325715013905e-05, "loss": 1.4962, "step": 44430 }, { "epoch": 2.375348451233763, "grad_norm": 0.26038962602615356, "learning_rate": 9.957107835730052e-05, "loss": 1.4925, "step": 44440 }, { "epoch": 2.375890392399275, "grad_norm": 0.3523120880126953, "learning_rate": 9.95688940431903e-05, "loss": 1.4928, "step": 44450 }, { "epoch": 2.376269751215134, "eval_loss": 2.4564239978790283, "eval_runtime": 21.9781, "eval_samples_per_second": 227.499, "eval_steps_per_second": 1.228, "step": 44457 }, { "epoch": 2.3764323335647872, "grad_norm": 0.24596655368804932, "learning_rate": 9.956670420807899e-05, "loss": 1.5008, "step": 44460 }, { "epoch": 2.3769742747302995, "grad_norm": 0.47213152050971985, "learning_rate": 9.956450885223785e-05, "loss": 1.5027, "step": 44470 }, { "epoch": 2.377516215895812, "grad_norm": 0.3502696752548218, "learning_rate": 9.956230797593884e-05, "loss": 1.4894, "step": 44480 }, { "epoch": 2.378058157061324, "grad_norm": 0.31429505348205566, "learning_rate": 9.95601015794546e-05, "loss": 1.4878, "step": 44490 }, { "epoch": 2.378600098226836, "grad_norm": 0.22514714300632477, "learning_rate": 9.95578896630584e-05, "loss": 1.4924, "step": 44500 }, { "epoch": 2.3791420393923484, "grad_norm": 0.27507612109184265, "learning_rate": 9.95556722270243e-05, "loss": 1.5104, "step": 44510 }, { "epoch": 2.3796839805578607, "grad_norm": 0.3192688226699829, "learning_rate": 9.955344927162698e-05, "loss": 1.4992, "step": 44520 }, { "epoch": 2.380225921723373, "grad_norm": 0.20219798386096954, "learning_rate": 9.955122079714177e-05, "loss": 1.51, "step": 44530 }, { "epoch": 2.3807678628888853, "grad_norm": 0.21116623282432556, "learning_rate": 9.954898680384476e-05, "loss": 1.4943, "step": 44540 }, { "epoch": 2.38098463935509, "eval_loss": 2.4590439796447754, "eval_runtime": 22.3001, "eval_samples_per_second": 224.215, "eval_steps_per_second": 1.211, "step": 44544 }, { "epoch": 2.381309804054397, "grad_norm": 0.3956089913845062, "learning_rate": 9.954674729201268e-05, "loss": 1.4822, "step": 44550 }, { "epoch": 2.3818517452199095, "grad_norm": 0.5214465856552124, "learning_rate": 9.954450226192295e-05, "loss": 1.4843, "step": 44560 }, { "epoch": 2.382393686385422, "grad_norm": 0.4375009536743164, "learning_rate": 9.954225171385366e-05, "loss": 1.509, "step": 44570 }, { "epoch": 2.382935627550934, "grad_norm": 0.22339291870594025, "learning_rate": 9.953999564808362e-05, "loss": 1.4914, "step": 44580 }, { "epoch": 2.3834775687164464, "grad_norm": 0.2312263697385788, "learning_rate": 9.953773406489229e-05, "loss": 1.5064, "step": 44590 }, { "epoch": 2.3840195098819583, "grad_norm": 0.21803376078605652, "learning_rate": 9.953546696455984e-05, "loss": 1.5093, "step": 44600 }, { "epoch": 2.3845614510474706, "grad_norm": 0.33443108201026917, "learning_rate": 9.953319434736708e-05, "loss": 1.4916, "step": 44610 }, { "epoch": 2.385103392212983, "grad_norm": 0.30027446150779724, "learning_rate": 9.953091621359556e-05, "loss": 1.4997, "step": 44620 }, { "epoch": 2.3856453333784953, "grad_norm": 0.32896170020103455, "learning_rate": 9.952863256352746e-05, "loss": 1.4882, "step": 44630 }, { "epoch": 2.3856995274950465, "eval_loss": 2.4726407527923584, "eval_runtime": 21.9784, "eval_samples_per_second": 227.496, "eval_steps_per_second": 1.228, "step": 44631 }, { "epoch": 2.386187274544007, "grad_norm": 0.41596585512161255, "learning_rate": 9.95263433974457e-05, "loss": 1.4935, "step": 44640 }, { "epoch": 2.3867292157095195, "grad_norm": 0.2066296935081482, "learning_rate": 9.952404871563383e-05, "loss": 1.4963, "step": 44650 }, { "epoch": 2.3872711568750318, "grad_norm": 0.5098561644554138, "learning_rate": 9.952174851837609e-05, "loss": 1.4899, "step": 44660 }, { "epoch": 2.387813098040544, "grad_norm": 0.3089795410633087, "learning_rate": 9.951944280595745e-05, "loss": 1.4808, "step": 44670 }, { "epoch": 2.388355039206056, "grad_norm": 0.3919314742088318, "learning_rate": 9.951713157866352e-05, "loss": 1.4988, "step": 44680 }, { "epoch": 2.3888969803715683, "grad_norm": 0.6062659025192261, "learning_rate": 9.95148148367806e-05, "loss": 1.494, "step": 44690 }, { "epoch": 2.3894389215370806, "grad_norm": 0.40539994835853577, "learning_rate": 9.951249258059569e-05, "loss": 1.4863, "step": 44700 }, { "epoch": 2.389980862702593, "grad_norm": 0.3886335492134094, "learning_rate": 9.951016481039646e-05, "loss": 1.4878, "step": 44710 }, { "epoch": 2.3904144156350027, "eval_loss": 2.4730005264282227, "eval_runtime": 21.9758, "eval_samples_per_second": 227.523, "eval_steps_per_second": 1.229, "step": 44718 }, { "epoch": 2.3905228038681052, "grad_norm": 0.39864251017570496, "learning_rate": 9.950783152647124e-05, "loss": 1.4932, "step": 44720 }, { "epoch": 2.391064745033617, "grad_norm": 0.21343019604682922, "learning_rate": 9.950549272910909e-05, "loss": 1.4889, "step": 44730 }, { "epoch": 2.3916066861991294, "grad_norm": 0.33363330364227295, "learning_rate": 9.950314841859973e-05, "loss": 1.484, "step": 44740 }, { "epoch": 2.3921486273646417, "grad_norm": 0.41146817803382874, "learning_rate": 9.950079859523354e-05, "loss": 1.497, "step": 44750 }, { "epoch": 2.392690568530154, "grad_norm": 0.27292779088020325, "learning_rate": 9.949844325930165e-05, "loss": 1.4942, "step": 44760 }, { "epoch": 2.3932325096956664, "grad_norm": 0.2181469202041626, "learning_rate": 9.94960824110958e-05, "loss": 1.4911, "step": 44770 }, { "epoch": 2.3937744508611782, "grad_norm": 0.21608904004096985, "learning_rate": 9.94937160509084e-05, "loss": 1.4981, "step": 44780 }, { "epoch": 2.3943163920266906, "grad_norm": 0.2778734266757965, "learning_rate": 9.949134417903267e-05, "loss": 1.4898, "step": 44790 }, { "epoch": 2.394858333192203, "grad_norm": 0.3089962303638458, "learning_rate": 9.948896679576238e-05, "loss": 1.4905, "step": 44800 }, { "epoch": 2.395129303774959, "eval_loss": 2.4715840816497803, "eval_runtime": 21.9797, "eval_samples_per_second": 227.482, "eval_steps_per_second": 1.228, "step": 44805 }, { "epoch": 2.395400274357715, "grad_norm": 0.3231702148914337, "learning_rate": 9.948658390139203e-05, "loss": 1.5066, "step": 44810 }, { "epoch": 2.3959422155232275, "grad_norm": 0.40103909373283386, "learning_rate": 9.94841954962168e-05, "loss": 1.4987, "step": 44820 }, { "epoch": 2.3964841566887394, "grad_norm": 0.40768495202064514, "learning_rate": 9.948180158053257e-05, "loss": 1.4982, "step": 44830 }, { "epoch": 2.3970260978542517, "grad_norm": 0.23787713050842285, "learning_rate": 9.947940215463589e-05, "loss": 1.4838, "step": 44840 }, { "epoch": 2.397568039019764, "grad_norm": 0.3806591331958771, "learning_rate": 9.947699721882396e-05, "loss": 1.4936, "step": 44850 }, { "epoch": 2.3981099801852763, "grad_norm": 0.26466140151023865, "learning_rate": 9.947458677339473e-05, "loss": 1.4965, "step": 44860 }, { "epoch": 2.398651921350788, "grad_norm": 0.2688174247741699, "learning_rate": 9.947217081864678e-05, "loss": 1.494, "step": 44870 }, { "epoch": 2.3991938625163005, "grad_norm": 0.5081690549850464, "learning_rate": 9.94697493548794e-05, "loss": 1.4951, "step": 44880 }, { "epoch": 2.399735803681813, "grad_norm": 0.23584547638893127, "learning_rate": 9.946732238239251e-05, "loss": 1.4915, "step": 44890 }, { "epoch": 2.3998441919149154, "eval_loss": 2.4753541946411133, "eval_runtime": 21.9826, "eval_samples_per_second": 227.453, "eval_steps_per_second": 1.228, "step": 44892 }, { "epoch": 2.400277744847325, "grad_norm": 0.41271650791168213, "learning_rate": 9.946488990148679e-05, "loss": 1.4985, "step": 44900 }, { "epoch": 2.400819686012837, "grad_norm": 0.24468545615673065, "learning_rate": 9.946245191246358e-05, "loss": 1.4974, "step": 44910 }, { "epoch": 2.4013616271783493, "grad_norm": 0.3811487853527069, "learning_rate": 9.946000841562482e-05, "loss": 1.4896, "step": 44920 }, { "epoch": 2.4019035683438617, "grad_norm": 0.4173762798309326, "learning_rate": 9.945755941127327e-05, "loss": 1.4897, "step": 44930 }, { "epoch": 2.402445509509374, "grad_norm": 0.5052602291107178, "learning_rate": 9.945510489971228e-05, "loss": 1.4964, "step": 44940 }, { "epoch": 2.4029874506748863, "grad_norm": 0.3579169809818268, "learning_rate": 9.945264488124589e-05, "loss": 1.4941, "step": 44950 }, { "epoch": 2.403529391840398, "grad_norm": 0.2974871098995209, "learning_rate": 9.945017935617885e-05, "loss": 1.5033, "step": 44960 }, { "epoch": 2.4040713330059105, "grad_norm": 0.34116071462631226, "learning_rate": 9.944770832481656e-05, "loss": 1.4966, "step": 44970 }, { "epoch": 2.4045590800548715, "eval_loss": 2.4695467948913574, "eval_runtime": 21.9753, "eval_samples_per_second": 227.528, "eval_steps_per_second": 1.229, "step": 44979 }, { "epoch": 2.404613274171423, "grad_norm": 0.2582961916923523, "learning_rate": 9.944523178746516e-05, "loss": 1.499, "step": 44980 }, { "epoch": 2.405155215336935, "grad_norm": 0.32894718647003174, "learning_rate": 9.94427497444314e-05, "loss": 1.4877, "step": 44990 }, { "epoch": 2.4056971565024474, "grad_norm": 0.38005682826042175, "learning_rate": 9.944026219602274e-05, "loss": 1.4999, "step": 45000 }, { "epoch": 2.0005419411655123, "grad_norm": 0.2491222321987152, "learning_rate": 9.943776914254736e-05, "loss": 1.5076, "step": 45010 }, { "epoch": 2.0010838823310246, "grad_norm": 0.3733494281768799, "learning_rate": 9.943527058431406e-05, "loss": 1.517, "step": 45020 }, { "epoch": 2.0016258234965365, "grad_norm": 0.33631569147109985, "learning_rate": 9.943276652163235e-05, "loss": 1.5062, "step": 45030 }, { "epoch": 2.002167764662049, "grad_norm": 0.21265359222888947, "learning_rate": 9.943025695481244e-05, "loss": 1.5177, "step": 45040 }, { "epoch": 2.002709705827561, "grad_norm": 0.18703655898571014, "learning_rate": 9.94277418841652e-05, "loss": 1.5098, "step": 45050 }, { "epoch": 2.0032516469930735, "grad_norm": 0.44634345173835754, "learning_rate": 9.942522131000216e-05, "loss": 1.5068, "step": 45060 }, { "epoch": 2.0035768116923807, "eval_loss": 2.45697283744812, "eval_runtime": 28.4848, "eval_samples_per_second": 175.532, "eval_steps_per_second": 0.948, "step": 45066 }, { "epoch": 2.0037935881585853, "grad_norm": 0.22693060338497162, "learning_rate": 9.942269523263559e-05, "loss": 1.5085, "step": 45070 }, { "epoch": 2.0043355293240976, "grad_norm": 0.22670039534568787, "learning_rate": 9.942016365237841e-05, "loss": 1.5155, "step": 45080 }, { "epoch": 2.00487747048961, "grad_norm": 0.2845227122306824, "learning_rate": 9.94176265695442e-05, "loss": 1.5119, "step": 45090 }, { "epoch": 2.0054194116551223, "grad_norm": 0.2120792120695114, "learning_rate": 9.941508398444725e-05, "loss": 1.5108, "step": 45100 }, { "epoch": 2.0059613528206346, "grad_norm": 0.2501295804977417, "learning_rate": 9.941253589740255e-05, "loss": 1.5151, "step": 45110 }, { "epoch": 2.0065032939861465, "grad_norm": 0.21968966722488403, "learning_rate": 9.940998230872569e-05, "loss": 1.5051, "step": 45120 }, { "epoch": 2.007045235151659, "grad_norm": 0.2440696805715561, "learning_rate": 9.940742321873304e-05, "loss": 1.5188, "step": 45130 }, { "epoch": 2.007587176317171, "grad_norm": 0.26428186893463135, "learning_rate": 9.940485862774162e-05, "loss": 1.5097, "step": 45140 }, { "epoch": 2.0081291174826834, "grad_norm": 0.2735063135623932, "learning_rate": 9.940228853606908e-05, "loss": 1.5153, "step": 45150 }, { "epoch": 2.008291699832337, "eval_loss": 2.474893093109131, "eval_runtime": 22.0407, "eval_samples_per_second": 226.853, "eval_steps_per_second": 1.225, "step": 45153 }, { "epoch": 2.0086710586481953, "grad_norm": 0.21670082211494446, "learning_rate": 9.939971294403382e-05, "loss": 1.5133, "step": 45160 }, { "epoch": 2.0092129998137076, "grad_norm": 0.219325989484787, "learning_rate": 9.939713185195486e-05, "loss": 1.5115, "step": 45170 }, { "epoch": 2.00975494097922, "grad_norm": 0.21777480840682983, "learning_rate": 9.939454526015199e-05, "loss": 1.5066, "step": 45180 }, { "epoch": 2.0102968821447322, "grad_norm": 0.24728938937187195, "learning_rate": 9.939195316894558e-05, "loss": 1.5173, "step": 45190 }, { "epoch": 2.0108388233102445, "grad_norm": 0.3391672670841217, "learning_rate": 9.938935557865676e-05, "loss": 1.4985, "step": 45200 }, { "epoch": 2.0113807644757564, "grad_norm": 0.22963690757751465, "learning_rate": 9.938675248960726e-05, "loss": 1.5031, "step": 45210 }, { "epoch": 2.0119227056412687, "grad_norm": 0.2681106626987457, "learning_rate": 9.93841439021196e-05, "loss": 1.5047, "step": 45220 }, { "epoch": 2.012464646806781, "grad_norm": 0.37070515751838684, "learning_rate": 9.938152981651687e-05, "loss": 1.5039, "step": 45230 }, { "epoch": 2.0130065879722934, "grad_norm": 0.342690110206604, "learning_rate": 9.937891023312292e-05, "loss": 1.5114, "step": 45240 }, { "epoch": 2.0130065879722934, "eval_loss": 2.4622411727905273, "eval_runtime": 21.7527, "eval_samples_per_second": 229.857, "eval_steps_per_second": 1.241, "step": 45240 }, { "epoch": 2.0135485291378057, "grad_norm": 0.4903299808502197, "learning_rate": 9.937628515226225e-05, "loss": 1.5174, "step": 45250 }, { "epoch": 2.0140904703033176, "grad_norm": 0.2701393961906433, "learning_rate": 9.937365457426003e-05, "loss": 1.5235, "step": 45260 }, { "epoch": 2.01463241146883, "grad_norm": 0.21151518821716309, "learning_rate": 9.937101849944213e-05, "loss": 1.4979, "step": 45270 }, { "epoch": 2.015174352634342, "grad_norm": 0.1973683089017868, "learning_rate": 9.936837692813511e-05, "loss": 1.5211, "step": 45280 }, { "epoch": 2.0157162937998545, "grad_norm": 0.2013273984193802, "learning_rate": 9.936572986066616e-05, "loss": 1.5038, "step": 45290 }, { "epoch": 2.0162582349653664, "grad_norm": 0.28360608220100403, "learning_rate": 9.936307729736324e-05, "loss": 1.5059, "step": 45300 }, { "epoch": 2.0168001761308787, "grad_norm": 0.2819204032421112, "learning_rate": 9.93604192385549e-05, "loss": 1.5104, "step": 45310 }, { "epoch": 2.017342117296391, "grad_norm": 0.20827263593673706, "learning_rate": 9.935775568457041e-05, "loss": 1.4944, "step": 45320 }, { "epoch": 2.0177214761122495, "eval_loss": 2.4640448093414307, "eval_runtime": 22.083, "eval_samples_per_second": 226.419, "eval_steps_per_second": 1.223, "step": 45327 }, { "epoch": 2.0178840584619033, "grad_norm": 0.21158994734287262, "learning_rate": 9.935508663573973e-05, "loss": 1.5125, "step": 45330 }, { "epoch": 2.0184259996274156, "grad_norm": 0.3482116162776947, "learning_rate": 9.935241209239349e-05, "loss": 1.5073, "step": 45340 }, { "epoch": 2.0189679407929275, "grad_norm": 0.2513381540775299, "learning_rate": 9.9349732054863e-05, "loss": 1.4914, "step": 45350 }, { "epoch": 2.01950988195844, "grad_norm": 0.21970421075820923, "learning_rate": 9.934704652348024e-05, "loss": 1.511, "step": 45360 }, { "epoch": 2.020051823123952, "grad_norm": 0.20519542694091797, "learning_rate": 9.934435549857793e-05, "loss": 1.4999, "step": 45370 }, { "epoch": 2.0205937642894645, "grad_norm": 0.2705243229866028, "learning_rate": 9.934165898048934e-05, "loss": 1.5099, "step": 45380 }, { "epoch": 2.0211357054549763, "grad_norm": 0.3955739140510559, "learning_rate": 9.933895696954857e-05, "loss": 1.5111, "step": 45390 }, { "epoch": 2.0216776466204887, "grad_norm": 0.39523282647132874, "learning_rate": 9.933624946609031e-05, "loss": 1.5154, "step": 45400 }, { "epoch": 2.022219587786001, "grad_norm": 0.22369325160980225, "learning_rate": 9.933353647044995e-05, "loss": 1.5127, "step": 45410 }, { "epoch": 2.022436364252206, "eval_loss": 2.4598445892333984, "eval_runtime": 22.0511, "eval_samples_per_second": 226.746, "eval_steps_per_second": 1.224, "step": 45414 }, { "epoch": 2.0227615289515133, "grad_norm": 0.29554933309555054, "learning_rate": 9.933081798296358e-05, "loss": 1.507, "step": 45420 }, { "epoch": 2.0233034701170256, "grad_norm": 0.2695947289466858, "learning_rate": 9.932809400396793e-05, "loss": 1.505, "step": 45430 }, { "epoch": 2.0238454112825375, "grad_norm": 0.25676560401916504, "learning_rate": 9.932536453380045e-05, "loss": 1.5104, "step": 45440 }, { "epoch": 2.02438735244805, "grad_norm": 0.21625731885433197, "learning_rate": 9.932262957279926e-05, "loss": 1.5112, "step": 45450 }, { "epoch": 2.024929293613562, "grad_norm": 0.22126367688179016, "learning_rate": 9.931988912130313e-05, "loss": 1.4907, "step": 45460 }, { "epoch": 2.0254712347790744, "grad_norm": 0.2600845396518707, "learning_rate": 9.931714317965158e-05, "loss": 1.5057, "step": 45470 }, { "epoch": 2.0260131759445863, "grad_norm": 0.2164200097322464, "learning_rate": 9.931439174818472e-05, "loss": 1.5125, "step": 45480 }, { "epoch": 2.0265551171100986, "grad_norm": 0.3730428218841553, "learning_rate": 9.931163482724341e-05, "loss": 1.5166, "step": 45490 }, { "epoch": 2.027097058275611, "grad_norm": 0.2146274298429489, "learning_rate": 9.930887241716915e-05, "loss": 1.5042, "step": 45500 }, { "epoch": 2.027151252392162, "eval_loss": 2.4563684463500977, "eval_runtime": 22.0541, "eval_samples_per_second": 226.715, "eval_steps_per_second": 1.224, "step": 45501 }, { "epoch": 2.0276389994411232, "grad_norm": 0.2864300310611725, "learning_rate": 9.930610451830417e-05, "loss": 1.5001, "step": 45510 }, { "epoch": 2.0281809406066356, "grad_norm": 0.37505003809928894, "learning_rate": 9.93033311309913e-05, "loss": 1.5074, "step": 45520 }, { "epoch": 2.0287228817721474, "grad_norm": 0.3186861276626587, "learning_rate": 9.93005522555741e-05, "loss": 1.5096, "step": 45530 }, { "epoch": 2.0292648229376598, "grad_norm": 0.31974560022354126, "learning_rate": 9.929776789239685e-05, "loss": 1.5028, "step": 45540 }, { "epoch": 2.029806764103172, "grad_norm": 0.22991715371608734, "learning_rate": 9.929497804180442e-05, "loss": 1.4912, "step": 45550 }, { "epoch": 2.0303487052686844, "grad_norm": 0.38136276602745056, "learning_rate": 9.929218270414243e-05, "loss": 1.5082, "step": 45560 }, { "epoch": 2.0308906464341967, "grad_norm": 0.2910080850124359, "learning_rate": 9.928938187975714e-05, "loss": 1.5081, "step": 45570 }, { "epoch": 2.0314325875997086, "grad_norm": 0.20356157422065735, "learning_rate": 9.928657556899551e-05, "loss": 1.4984, "step": 45580 }, { "epoch": 2.0318661405321183, "eval_loss": 2.454059600830078, "eval_runtime": 22.0277, "eval_samples_per_second": 226.987, "eval_steps_per_second": 1.226, "step": 45588 }, { "epoch": 2.031974528765221, "grad_norm": 0.32182246446609497, "learning_rate": 9.928376377220517e-05, "loss": 1.507, "step": 45590 }, { "epoch": 2.032516469930733, "grad_norm": 0.19996988773345947, "learning_rate": 9.928094648973443e-05, "loss": 1.5189, "step": 45600 }, { "epoch": 2.0330584110962455, "grad_norm": 0.2483881115913391, "learning_rate": 9.92781237219323e-05, "loss": 1.5088, "step": 45610 }, { "epoch": 2.0336003522617574, "grad_norm": 0.3291274309158325, "learning_rate": 9.927529546914842e-05, "loss": 1.5016, "step": 45620 }, { "epoch": 2.0341422934272697, "grad_norm": 0.21742436289787292, "learning_rate": 9.927246173173318e-05, "loss": 1.507, "step": 45630 }, { "epoch": 2.034684234592782, "grad_norm": 0.34620094299316406, "learning_rate": 9.926962251003758e-05, "loss": 1.5015, "step": 45640 }, { "epoch": 2.0352261757582943, "grad_norm": 0.2289854735136032, "learning_rate": 9.926677780441335e-05, "loss": 1.5091, "step": 45650 }, { "epoch": 2.0357681169238067, "grad_norm": 0.32256433367729187, "learning_rate": 9.926392761521286e-05, "loss": 1.5123, "step": 45660 }, { "epoch": 2.0363100580893185, "grad_norm": 0.22222593426704407, "learning_rate": 9.926107194278921e-05, "loss": 1.5074, "step": 45670 }, { "epoch": 2.036581028672075, "eval_loss": 2.465963125228882, "eval_runtime": 21.9821, "eval_samples_per_second": 227.458, "eval_steps_per_second": 1.228, "step": 45675 }, { "epoch": 2.036851999254831, "grad_norm": 0.23388569056987762, "learning_rate": 9.925821078749612e-05, "loss": 1.5062, "step": 45680 }, { "epoch": 2.037393940420343, "grad_norm": 0.22175799310207367, "learning_rate": 9.925534414968802e-05, "loss": 1.4969, "step": 45690 }, { "epoch": 2.0379358815858555, "grad_norm": 0.5243973135948181, "learning_rate": 9.925247202972004e-05, "loss": 1.5039, "step": 45700 }, { "epoch": 2.0384778227513674, "grad_norm": 0.3757253885269165, "learning_rate": 9.924959442794794e-05, "loss": 1.5076, "step": 45710 }, { "epoch": 2.0390197639168797, "grad_norm": 0.4267645478248596, "learning_rate": 9.92467113447282e-05, "loss": 1.5162, "step": 45720 }, { "epoch": 2.039561705082392, "grad_norm": 0.19706867635250092, "learning_rate": 9.924382278041796e-05, "loss": 1.5014, "step": 45730 }, { "epoch": 2.0401036462479043, "grad_norm": 0.3324958086013794, "learning_rate": 9.924092873537506e-05, "loss": 1.5019, "step": 45740 }, { "epoch": 2.0406455874134166, "grad_norm": 0.2539781630039215, "learning_rate": 9.923802920995794e-05, "loss": 1.5161, "step": 45750 }, { "epoch": 2.0411875285789285, "grad_norm": 0.4110063314437866, "learning_rate": 9.923512420452585e-05, "loss": 1.4998, "step": 45760 }, { "epoch": 2.041295916812031, "eval_loss": 2.4641757011413574, "eval_runtime": 21.9849, "eval_samples_per_second": 227.429, "eval_steps_per_second": 1.228, "step": 45762 }, { "epoch": 2.041729469744441, "grad_norm": 0.22075022757053375, "learning_rate": 9.923221371943862e-05, "loss": 1.4926, "step": 45770 }, { "epoch": 2.042271410909953, "grad_norm": 0.3489071726799011, "learning_rate": 9.92292977550568e-05, "loss": 1.5091, "step": 45780 }, { "epoch": 2.0428133520754654, "grad_norm": 0.364443302154541, "learning_rate": 9.922637631174158e-05, "loss": 1.5006, "step": 45790 }, { "epoch": 2.0433552932409773, "grad_norm": 0.26170814037323, "learning_rate": 9.922344938985489e-05, "loss": 1.5044, "step": 45800 }, { "epoch": 2.0438972344064896, "grad_norm": 0.2701292634010315, "learning_rate": 9.922051698975927e-05, "loss": 1.5028, "step": 45810 }, { "epoch": 2.044439175572002, "grad_norm": 0.5133768320083618, "learning_rate": 9.921757911181801e-05, "loss": 1.514, "step": 45820 }, { "epoch": 2.0449811167375143, "grad_norm": 0.20944929122924805, "learning_rate": 9.921463575639503e-05, "loss": 1.5047, "step": 45830 }, { "epoch": 2.0455230579030266, "grad_norm": 0.2594199776649475, "learning_rate": 9.921168692385491e-05, "loss": 1.5131, "step": 45840 }, { "epoch": 2.0460108049519876, "eval_loss": 2.4567441940307617, "eval_runtime": 21.9911, "eval_samples_per_second": 227.365, "eval_steps_per_second": 1.228, "step": 45849 }, { "epoch": 2.0460649990685384, "grad_norm": 0.4213975667953491, "learning_rate": 9.920873261456297e-05, "loss": 1.4995, "step": 45850 }, { "epoch": 2.0466069402340508, "grad_norm": 0.33704066276550293, "learning_rate": 9.920577282888515e-05, "loss": 1.5053, "step": 45860 }, { "epoch": 2.047148881399563, "grad_norm": 0.3914563059806824, "learning_rate": 9.920280756718815e-05, "loss": 1.508, "step": 45870 }, { "epoch": 2.0476908225650754, "grad_norm": 0.22689972817897797, "learning_rate": 9.919983682983924e-05, "loss": 1.5135, "step": 45880 }, { "epoch": 2.0482327637305877, "grad_norm": 0.24155005812644958, "learning_rate": 9.919686061720645e-05, "loss": 1.5056, "step": 45890 }, { "epoch": 2.0487747048960996, "grad_norm": 0.22122272849082947, "learning_rate": 9.919387892965845e-05, "loss": 1.5075, "step": 45900 }, { "epoch": 2.049316646061612, "grad_norm": 0.18150360882282257, "learning_rate": 9.919089176756458e-05, "loss": 1.5084, "step": 45910 }, { "epoch": 2.049858587227124, "grad_norm": 0.31947770714759827, "learning_rate": 9.918789913129491e-05, "loss": 1.5126, "step": 45920 }, { "epoch": 2.0504005283926365, "grad_norm": 0.21400408446788788, "learning_rate": 9.918490102122014e-05, "loss": 1.5074, "step": 45930 }, { "epoch": 2.0507256930919437, "eval_loss": 2.472001552581787, "eval_runtime": 21.9893, "eval_samples_per_second": 227.383, "eval_steps_per_second": 1.228, "step": 45936 }, { "epoch": 2.0509424695581484, "grad_norm": 0.4493444263935089, "learning_rate": 9.918189743771165e-05, "loss": 1.5229, "step": 45940 }, { "epoch": 2.0514844107236607, "grad_norm": 0.28981345891952515, "learning_rate": 9.917888838114155e-05, "loss": 1.5128, "step": 45950 }, { "epoch": 2.052026351889173, "grad_norm": 0.3488079011440277, "learning_rate": 9.917587385188255e-05, "loss": 1.4949, "step": 45960 }, { "epoch": 2.0525682930546854, "grad_norm": 0.25213801860809326, "learning_rate": 9.917285385030808e-05, "loss": 1.5046, "step": 45970 }, { "epoch": 2.0531102342201977, "grad_norm": 0.1905914545059204, "learning_rate": 9.916982837679226e-05, "loss": 1.5046, "step": 45980 }, { "epoch": 2.0536521753857095, "grad_norm": 0.2938050329685211, "learning_rate": 9.916679743170986e-05, "loss": 1.5087, "step": 45990 }, { "epoch": 2.054194116551222, "grad_norm": 0.3296414613723755, "learning_rate": 9.916376101543636e-05, "loss": 1.5114, "step": 46000 }, { "epoch": 2.054736057716734, "grad_norm": 0.32699063420295715, "learning_rate": 9.916071912834789e-05, "loss": 1.5049, "step": 46010 }, { "epoch": 2.0552779988822465, "grad_norm": 0.28838619589805603, "learning_rate": 9.915767177082125e-05, "loss": 1.5045, "step": 46020 }, { "epoch": 2.0554405812319, "eval_loss": 2.4637975692749023, "eval_runtime": 21.9861, "eval_samples_per_second": 227.417, "eval_steps_per_second": 1.228, "step": 46023 }, { "epoch": 2.0558199400477584, "grad_norm": 0.23613585531711578, "learning_rate": 9.915461894323395e-05, "loss": 1.5072, "step": 46030 }, { "epoch": 2.0563618812132707, "grad_norm": 0.33887359499931335, "learning_rate": 9.915156064596414e-05, "loss": 1.497, "step": 46040 }, { "epoch": 2.056903822378783, "grad_norm": 0.24096645414829254, "learning_rate": 9.914849687939071e-05, "loss": 1.4923, "step": 46050 }, { "epoch": 2.0574457635442953, "grad_norm": 0.29144373536109924, "learning_rate": 9.914542764389314e-05, "loss": 1.4964, "step": 46060 }, { "epoch": 2.0579877047098076, "grad_norm": 0.34292078018188477, "learning_rate": 9.914235293985167e-05, "loss": 1.514, "step": 46070 }, { "epoch": 2.0585296458753195, "grad_norm": 0.286112904548645, "learning_rate": 9.913927276764715e-05, "loss": 1.5068, "step": 46080 }, { "epoch": 2.059071587040832, "grad_norm": 0.2864058017730713, "learning_rate": 9.913618712766117e-05, "loss": 1.5067, "step": 46090 }, { "epoch": 2.059613528206344, "grad_norm": 0.2555348575115204, "learning_rate": 9.913309602027593e-05, "loss": 1.5068, "step": 46100 }, { "epoch": 2.0601554693718565, "grad_norm": 0.21725402772426605, "learning_rate": 9.912999944587437e-05, "loss": 1.5055, "step": 46110 }, { "epoch": 2.0601554693718565, "eval_loss": 2.46505069732666, "eval_runtime": 22.3497, "eval_samples_per_second": 223.717, "eval_steps_per_second": 1.208, "step": 46110 }, { "epoch": 2.0606974105373683, "grad_norm": 0.3962418735027313, "learning_rate": 9.912689740484007e-05, "loss": 1.5052, "step": 46120 }, { "epoch": 2.0612393517028806, "grad_norm": 0.22441263496875763, "learning_rate": 9.91237898975573e-05, "loss": 1.5036, "step": 46130 }, { "epoch": 2.061781292868393, "grad_norm": 0.3478442430496216, "learning_rate": 9.9120676924411e-05, "loss": 1.5032, "step": 46140 }, { "epoch": 2.0623232340339053, "grad_norm": 0.41713225841522217, "learning_rate": 9.911755848578681e-05, "loss": 1.5111, "step": 46150 }, { "epoch": 2.0628651751994176, "grad_norm": 0.20105630159378052, "learning_rate": 9.9114434582071e-05, "loss": 1.5153, "step": 46160 }, { "epoch": 2.0634071163649295, "grad_norm": 0.48680949211120605, "learning_rate": 9.911130521365057e-05, "loss": 1.5127, "step": 46170 }, { "epoch": 2.0639490575304418, "grad_norm": 0.2089950144290924, "learning_rate": 9.910817038091315e-05, "loss": 1.4976, "step": 46180 }, { "epoch": 2.064490998695954, "grad_norm": 0.35045289993286133, "learning_rate": 9.910503008424709e-05, "loss": 1.4987, "step": 46190 }, { "epoch": 2.0648703575118126, "eval_loss": 2.4648759365081787, "eval_runtime": 22.0369, "eval_samples_per_second": 226.892, "eval_steps_per_second": 1.225, "step": 46197 }, { "epoch": 2.0650329398614664, "grad_norm": 0.30531641840934753, "learning_rate": 9.910188432404139e-05, "loss": 1.5034, "step": 46200 }, { "epoch": 2.0655748810269783, "grad_norm": 0.2192566692829132, "learning_rate": 9.909873310068571e-05, "loss": 1.5177, "step": 46210 }, { "epoch": 2.0661168221924906, "grad_norm": 0.39566564559936523, "learning_rate": 9.909557641457047e-05, "loss": 1.5048, "step": 46220 }, { "epoch": 2.066658763358003, "grad_norm": 0.4346673786640167, "learning_rate": 9.909241426608664e-05, "loss": 1.4999, "step": 46230 }, { "epoch": 2.0672007045235152, "grad_norm": 0.2114131599664688, "learning_rate": 9.908924665562595e-05, "loss": 1.5104, "step": 46240 }, { "epoch": 2.0677426456890275, "grad_norm": 0.21276456117630005, "learning_rate": 9.908607358358082e-05, "loss": 1.5089, "step": 46250 }, { "epoch": 2.0682845868545394, "grad_norm": 0.20837821066379547, "learning_rate": 9.90828950503443e-05, "loss": 1.508, "step": 46260 }, { "epoch": 2.0688265280200517, "grad_norm": 0.22510722279548645, "learning_rate": 9.907971105631014e-05, "loss": 1.4973, "step": 46270 }, { "epoch": 2.069368469185564, "grad_norm": 0.2420571893453598, "learning_rate": 9.907652160187272e-05, "loss": 1.5006, "step": 46280 }, { "epoch": 2.069585245651769, "eval_loss": 2.455404043197632, "eval_runtime": 22.0244, "eval_samples_per_second": 227.021, "eval_steps_per_second": 1.226, "step": 46284 }, { "epoch": 2.0699104103510764, "grad_norm": 0.21104340255260468, "learning_rate": 9.907332668742718e-05, "loss": 1.5047, "step": 46290 }, { "epoch": 2.0704523515165887, "grad_norm": 0.3561536967754364, "learning_rate": 9.907012631336927e-05, "loss": 1.504, "step": 46300 }, { "epoch": 2.0709942926821006, "grad_norm": 0.19159241020679474, "learning_rate": 9.906692048009546e-05, "loss": 1.5021, "step": 46310 }, { "epoch": 2.071536233847613, "grad_norm": 0.4784151315689087, "learning_rate": 9.906370918800283e-05, "loss": 1.4997, "step": 46320 }, { "epoch": 2.072078175013125, "grad_norm": 0.2709428369998932, "learning_rate": 9.906049243748925e-05, "loss": 1.5038, "step": 46330 }, { "epoch": 2.0726201161786375, "grad_norm": 0.3371290862560272, "learning_rate": 9.905727022895313e-05, "loss": 1.5185, "step": 46340 }, { "epoch": 2.0731620573441494, "grad_norm": 0.3282695710659027, "learning_rate": 9.905404256279367e-05, "loss": 1.5043, "step": 46350 }, { "epoch": 2.0737039985096617, "grad_norm": 0.22218751907348633, "learning_rate": 9.905080943941068e-05, "loss": 1.5047, "step": 46360 }, { "epoch": 2.074245939675174, "grad_norm": 0.4120001494884491, "learning_rate": 9.904757085920466e-05, "loss": 1.4932, "step": 46370 }, { "epoch": 2.0743001337917253, "eval_loss": 2.4604697227478027, "eval_runtime": 22.0263, "eval_samples_per_second": 227.001, "eval_steps_per_second": 1.226, "step": 46371 }, { "epoch": 2.0747878808406863, "grad_norm": 0.2726476788520813, "learning_rate": 9.90443268225768e-05, "loss": 1.4971, "step": 46380 }, { "epoch": 2.0753298220061986, "grad_norm": 0.25668570399284363, "learning_rate": 9.904107732992896e-05, "loss": 1.5062, "step": 46390 }, { "epoch": 2.0758717631717105, "grad_norm": 0.3078915476799011, "learning_rate": 9.903782238166364e-05, "loss": 1.504, "step": 46400 }, { "epoch": 2.076413704337223, "grad_norm": 0.23746353387832642, "learning_rate": 9.903456197818411e-05, "loss": 1.5084, "step": 46410 }, { "epoch": 2.076955645502735, "grad_norm": 0.1999620497226715, "learning_rate": 9.90312961198942e-05, "loss": 1.4992, "step": 46420 }, { "epoch": 2.0774975866682475, "grad_norm": 0.3216739594936371, "learning_rate": 9.90280248071985e-05, "loss": 1.5011, "step": 46430 }, { "epoch": 2.0780395278337593, "grad_norm": 0.21200136840343475, "learning_rate": 9.902474804050224e-05, "loss": 1.4959, "step": 46440 }, { "epoch": 2.0785814689992717, "grad_norm": 0.21161618828773499, "learning_rate": 9.902146582021133e-05, "loss": 1.5019, "step": 46450 }, { "epoch": 2.0790150219316814, "eval_loss": 2.4619641304016113, "eval_runtime": 21.9868, "eval_samples_per_second": 227.409, "eval_steps_per_second": 1.228, "step": 46458 }, { "epoch": 2.079123410164784, "grad_norm": 0.28815415501594543, "learning_rate": 9.901817814673236e-05, "loss": 1.4998, "step": 46460 }, { "epoch": 2.0796653513302963, "grad_norm": 0.21010233461856842, "learning_rate": 9.901488502047257e-05, "loss": 1.5081, "step": 46470 }, { "epoch": 2.0802072924958086, "grad_norm": 0.21125809848308563, "learning_rate": 9.901158644183993e-05, "loss": 1.5111, "step": 46480 }, { "epoch": 2.0807492336613205, "grad_norm": 0.2528863251209259, "learning_rate": 9.900828241124303e-05, "loss": 1.4949, "step": 46490 }, { "epoch": 2.081291174826833, "grad_norm": 0.5834783911705017, "learning_rate": 9.900497292909119e-05, "loss": 1.5182, "step": 46500 }, { "epoch": 2.081833115992345, "grad_norm": 0.47388049960136414, "learning_rate": 9.900165799579434e-05, "loss": 1.4988, "step": 46510 }, { "epoch": 2.0823750571578574, "grad_norm": 0.2034447342157364, "learning_rate": 9.899833761176312e-05, "loss": 1.4929, "step": 46520 }, { "epoch": 2.0829169983233697, "grad_norm": 0.2005794793367386, "learning_rate": 9.899501177740889e-05, "loss": 1.4957, "step": 46530 }, { "epoch": 2.0834589394888816, "grad_norm": 0.3143729269504547, "learning_rate": 9.899168049314358e-05, "loss": 1.4937, "step": 46540 }, { "epoch": 2.083729910071638, "eval_loss": 2.4592928886413574, "eval_runtime": 21.99, "eval_samples_per_second": 227.376, "eval_steps_per_second": 1.228, "step": 46545 }, { "epoch": 2.084000880654394, "grad_norm": 0.3189544379711151, "learning_rate": 9.898834375937991e-05, "loss": 1.5108, "step": 46550 }, { "epoch": 2.0845428218199062, "grad_norm": 0.4922240674495697, "learning_rate": 9.898500157653118e-05, "loss": 1.51, "step": 46560 }, { "epoch": 2.0850847629854186, "grad_norm": 0.18532593548297882, "learning_rate": 9.898165394501142e-05, "loss": 1.5036, "step": 46570 }, { "epoch": 2.0856267041509304, "grad_norm": 0.26253679394721985, "learning_rate": 9.897830086523531e-05, "loss": 1.5052, "step": 46580 }, { "epoch": 2.0861686453164427, "grad_norm": 0.36160239577293396, "learning_rate": 9.897494233761823e-05, "loss": 1.5125, "step": 46590 }, { "epoch": 2.086710586481955, "grad_norm": 0.25832414627075195, "learning_rate": 9.897157836257621e-05, "loss": 1.5106, "step": 46600 }, { "epoch": 2.0872525276474674, "grad_norm": 0.32197335362434387, "learning_rate": 9.896820894052598e-05, "loss": 1.5029, "step": 46610 }, { "epoch": 2.0877944688129797, "grad_norm": 0.18866315484046936, "learning_rate": 9.896483407188492e-05, "loss": 1.502, "step": 46620 }, { "epoch": 2.0883364099784916, "grad_norm": 0.25076818466186523, "learning_rate": 9.896145375707106e-05, "loss": 1.5093, "step": 46630 }, { "epoch": 2.088444798211594, "eval_loss": 2.461345911026001, "eval_runtime": 26.7525, "eval_samples_per_second": 186.898, "eval_steps_per_second": 1.009, "step": 46632 }, { "epoch": 2.088878351144004, "grad_norm": 0.20240730047225952, "learning_rate": 9.89580679965032e-05, "loss": 1.508, "step": 46640 }, { "epoch": 2.089420292309516, "grad_norm": 0.2117396742105484, "learning_rate": 9.895467679060071e-05, "loss": 1.4989, "step": 46650 }, { "epoch": 2.0899622334750285, "grad_norm": 0.3446267545223236, "learning_rate": 9.89512801397837e-05, "loss": 1.5041, "step": 46660 }, { "epoch": 2.0905041746405404, "grad_norm": 0.22244545817375183, "learning_rate": 9.89478780444729e-05, "loss": 1.5003, "step": 46670 }, { "epoch": 2.0910461158060527, "grad_norm": 0.18350328505039215, "learning_rate": 9.894447050508981e-05, "loss": 1.5068, "step": 46680 }, { "epoch": 2.091588056971565, "grad_norm": 0.2232082635164261, "learning_rate": 9.894105752205648e-05, "loss": 1.5096, "step": 46690 }, { "epoch": 2.0921299981370773, "grad_norm": 0.2229575365781784, "learning_rate": 9.893763909579571e-05, "loss": 1.4963, "step": 46700 }, { "epoch": 2.0926719393025897, "grad_norm": 0.22202491760253906, "learning_rate": 9.893421522673098e-05, "loss": 1.4979, "step": 46710 }, { "epoch": 2.0931596863515503, "eval_loss": 2.4592254161834717, "eval_runtime": 22.1699, "eval_samples_per_second": 225.531, "eval_steps_per_second": 1.218, "step": 46719 }, { "epoch": 2.0932138804681015, "grad_norm": 0.28266942501068115, "learning_rate": 9.89307859152864e-05, "loss": 1.5001, "step": 46720 }, { "epoch": 2.093755821633614, "grad_norm": 0.24115784466266632, "learning_rate": 9.89273511618868e-05, "loss": 1.501, "step": 46730 }, { "epoch": 2.094297762799126, "grad_norm": 0.3385097086429596, "learning_rate": 9.892391096695766e-05, "loss": 1.5128, "step": 46740 }, { "epoch": 2.0948397039646385, "grad_norm": 0.2886834144592285, "learning_rate": 9.89204653309251e-05, "loss": 1.4963, "step": 46750 }, { "epoch": 2.0953816451301503, "grad_norm": 0.40250498056411743, "learning_rate": 9.891701425421599e-05, "loss": 1.5119, "step": 46760 }, { "epoch": 2.0959235862956627, "grad_norm": 0.20079436898231506, "learning_rate": 9.891355773725783e-05, "loss": 1.5029, "step": 46770 }, { "epoch": 2.096465527461175, "grad_norm": 0.26996469497680664, "learning_rate": 9.891009578047879e-05, "loss": 1.4942, "step": 46780 }, { "epoch": 2.0970074686266873, "grad_norm": 0.31331729888916016, "learning_rate": 9.890662838430771e-05, "loss": 1.5042, "step": 46790 }, { "epoch": 2.0975494097921996, "grad_norm": 0.27219855785369873, "learning_rate": 9.890315554917415e-05, "loss": 1.5004, "step": 46800 }, { "epoch": 2.097874574491507, "eval_loss": 2.4539170265197754, "eval_runtime": 22.0178, "eval_samples_per_second": 227.089, "eval_steps_per_second": 1.226, "step": 46806 }, { "epoch": 2.0980913509577115, "grad_norm": 0.2593035399913788, "learning_rate": 9.889967727550829e-05, "loss": 1.495, "step": 46810 }, { "epoch": 2.098633292123224, "grad_norm": 0.219623863697052, "learning_rate": 9.889619356374097e-05, "loss": 1.5117, "step": 46820 }, { "epoch": 2.099175233288736, "grad_norm": 0.3427010774612427, "learning_rate": 9.88927044143038e-05, "loss": 1.5115, "step": 46830 }, { "epoch": 2.0997171744542484, "grad_norm": 0.2844060957431793, "learning_rate": 9.888920982762895e-05, "loss": 1.5038, "step": 46840 }, { "epoch": 2.1002591156197603, "grad_norm": 0.6262246966362, "learning_rate": 9.888570980414935e-05, "loss": 1.4964, "step": 46850 }, { "epoch": 2.1008010567852726, "grad_norm": 0.3818177878856659, "learning_rate": 9.888220434429856e-05, "loss": 1.499, "step": 46860 }, { "epoch": 2.101342997950785, "grad_norm": 0.19796334207057953, "learning_rate": 9.887869344851081e-05, "loss": 1.5019, "step": 46870 }, { "epoch": 2.1018849391162973, "grad_norm": 0.40566131472587585, "learning_rate": 9.8875177117221e-05, "loss": 1.4951, "step": 46880 }, { "epoch": 2.1024268802818096, "grad_norm": 0.2360570877790451, "learning_rate": 9.887165535086473e-05, "loss": 1.501, "step": 46890 }, { "epoch": 2.102589462631463, "eval_loss": 2.4633328914642334, "eval_runtime": 22.3971, "eval_samples_per_second": 223.243, "eval_steps_per_second": 1.206, "step": 46893 }, { "epoch": 2.1029688214473214, "grad_norm": 0.41195404529571533, "learning_rate": 9.88681281498783e-05, "loss": 1.5036, "step": 46900 }, { "epoch": 2.1035107626128338, "grad_norm": 0.30758100748062134, "learning_rate": 9.886459551469858e-05, "loss": 1.5054, "step": 46910 }, { "epoch": 2.104052703778346, "grad_norm": 0.40162351727485657, "learning_rate": 9.886105744576322e-05, "loss": 1.4949, "step": 46920 }, { "epoch": 2.1045946449438584, "grad_norm": 0.32191726565361023, "learning_rate": 9.88575139435105e-05, "loss": 1.5065, "step": 46930 }, { "epoch": 2.1051365861093707, "grad_norm": 0.5494133830070496, "learning_rate": 9.885396500837934e-05, "loss": 1.5051, "step": 46940 }, { "epoch": 2.1056785272748826, "grad_norm": 0.23451875150203705, "learning_rate": 9.88504106408094e-05, "loss": 1.4936, "step": 46950 }, { "epoch": 2.106220468440395, "grad_norm": 0.2888700067996979, "learning_rate": 9.884685084124098e-05, "loss": 1.4968, "step": 46960 }, { "epoch": 2.106762409605907, "grad_norm": 0.24672222137451172, "learning_rate": 9.884328561011503e-05, "loss": 1.4998, "step": 46970 }, { "epoch": 2.1073043507714195, "grad_norm": 0.2256728857755661, "learning_rate": 9.883971494787322e-05, "loss": 1.5046, "step": 46980 }, { "epoch": 2.1073043507714195, "eval_loss": 2.471791982650757, "eval_runtime": 21.9693, "eval_samples_per_second": 227.591, "eval_steps_per_second": 1.229, "step": 46980 }, { "epoch": 2.1078462919369314, "grad_norm": 0.21322891116142273, "learning_rate": 9.883613885495785e-05, "loss": 1.5042, "step": 46990 }, { "epoch": 2.1083882331024437, "grad_norm": 0.35615795850753784, "learning_rate": 9.88325573318119e-05, "loss": 1.4933, "step": 47000 }, { "epoch": 2.108930174267956, "grad_norm": 0.3552355170249939, "learning_rate": 9.882897037887907e-05, "loss": 1.5039, "step": 47010 }, { "epoch": 2.1094721154334684, "grad_norm": 0.6598255038261414, "learning_rate": 9.882537799660368e-05, "loss": 1.4987, "step": 47020 }, { "epoch": 2.1100140565989807, "grad_norm": 0.22862312197685242, "learning_rate": 9.882178018543071e-05, "loss": 1.5196, "step": 47030 }, { "epoch": 2.1105559977644925, "grad_norm": 0.3337612450122833, "learning_rate": 9.881817694580588e-05, "loss": 1.5018, "step": 47040 }, { "epoch": 2.111097938930005, "grad_norm": 0.24085493385791779, "learning_rate": 9.881456827817553e-05, "loss": 1.5019, "step": 47050 }, { "epoch": 2.111639880095517, "grad_norm": 0.20780259370803833, "learning_rate": 9.881095418298668e-05, "loss": 1.5074, "step": 47060 }, { "epoch": 2.1120192389113757, "eval_loss": 2.4535341262817383, "eval_runtime": 21.9889, "eval_samples_per_second": 227.388, "eval_steps_per_second": 1.228, "step": 47067 }, { "epoch": 2.1121818212610295, "grad_norm": 0.41703495383262634, "learning_rate": 9.880733466068704e-05, "loss": 1.5105, "step": 47070 }, { "epoch": 2.1127237624265414, "grad_norm": 0.3521629571914673, "learning_rate": 9.880370971172497e-05, "loss": 1.5062, "step": 47080 }, { "epoch": 2.1132657035920537, "grad_norm": 0.3287181854248047, "learning_rate": 9.880007933654953e-05, "loss": 1.4956, "step": 47090 }, { "epoch": 2.113807644757566, "grad_norm": 0.3003343939781189, "learning_rate": 9.87964435356104e-05, "loss": 1.5089, "step": 47100 }, { "epoch": 2.1143495859230783, "grad_norm": 0.2599339187145233, "learning_rate": 9.879280230935801e-05, "loss": 1.4982, "step": 47110 }, { "epoch": 2.1148915270885906, "grad_norm": 0.22060526907444, "learning_rate": 9.878915565824341e-05, "loss": 1.4989, "step": 47120 }, { "epoch": 2.1154334682541025, "grad_norm": 0.28959226608276367, "learning_rate": 9.878550358271829e-05, "loss": 1.5042, "step": 47130 }, { "epoch": 2.115975409419615, "grad_norm": 0.31336092948913574, "learning_rate": 9.878184608323509e-05, "loss": 1.4997, "step": 47140 }, { "epoch": 2.116517350585127, "grad_norm": 0.23946624994277954, "learning_rate": 9.877818316024689e-05, "loss": 1.5056, "step": 47150 }, { "epoch": 2.116734127051332, "eval_loss": 2.464033365249634, "eval_runtime": 21.9877, "eval_samples_per_second": 227.4, "eval_steps_per_second": 1.228, "step": 47154 }, { "epoch": 2.1170592917506394, "grad_norm": 0.23651756346225739, "learning_rate": 9.877451481420742e-05, "loss": 1.502, "step": 47160 }, { "epoch": 2.1176012329161518, "grad_norm": 0.2244928926229477, "learning_rate": 9.877084104557111e-05, "loss": 1.5014, "step": 47170 }, { "epoch": 2.1181431740816636, "grad_norm": 0.24885980784893036, "learning_rate": 9.876716185479303e-05, "loss": 1.5075, "step": 47180 }, { "epoch": 2.118685115247176, "grad_norm": 0.22047527134418488, "learning_rate": 9.876347724232898e-05, "loss": 1.5042, "step": 47190 }, { "epoch": 2.1192270564126883, "grad_norm": 0.2784343659877777, "learning_rate": 9.875978720863536e-05, "loss": 1.5095, "step": 47200 }, { "epoch": 2.1197689975782006, "grad_norm": 0.41502806544303894, "learning_rate": 9.875609175416929e-05, "loss": 1.498, "step": 47210 }, { "epoch": 2.1203109387437125, "grad_norm": 0.23642586171627045, "learning_rate": 9.875239087938853e-05, "loss": 1.4923, "step": 47220 }, { "epoch": 2.1208528799092248, "grad_norm": 0.3005571663379669, "learning_rate": 9.874868458475155e-05, "loss": 1.5045, "step": 47230 }, { "epoch": 2.121394821074737, "grad_norm": 0.2985764443874359, "learning_rate": 9.874497287071747e-05, "loss": 1.5013, "step": 47240 }, { "epoch": 2.1214490151912884, "eval_loss": 2.4620718955993652, "eval_runtime": 22.3871, "eval_samples_per_second": 223.343, "eval_steps_per_second": 1.206, "step": 47241 }, { "epoch": 2.1219367622402494, "grad_norm": 0.3452117443084717, "learning_rate": 9.874125573774608e-05, "loss": 1.5131, "step": 47250 }, { "epoch": 2.1224787034057617, "grad_norm": 0.33107510209083557, "learning_rate": 9.873753318629781e-05, "loss": 1.4932, "step": 47260 }, { "epoch": 2.1230206445712736, "grad_norm": 0.20669598877429962, "learning_rate": 9.873380521683383e-05, "loss": 1.5066, "step": 47270 }, { "epoch": 2.123562585736786, "grad_norm": 0.2573641538619995, "learning_rate": 9.873007182981592e-05, "loss": 1.5003, "step": 47280 }, { "epoch": 2.1241045269022982, "grad_norm": 0.2514735460281372, "learning_rate": 9.87263330257066e-05, "loss": 1.4996, "step": 47290 }, { "epoch": 2.1246464680678105, "grad_norm": 0.7629513740539551, "learning_rate": 9.872258880496896e-05, "loss": 1.5085, "step": 47300 }, { "epoch": 2.1251884092333224, "grad_norm": 0.2581771910190582, "learning_rate": 9.871883916806686e-05, "loss": 1.5017, "step": 47310 }, { "epoch": 2.1257303503988347, "grad_norm": 0.22503337264060974, "learning_rate": 9.871508411546475e-05, "loss": 1.4997, "step": 47320 }, { "epoch": 2.1261639033312445, "eval_loss": 2.4661858081817627, "eval_runtime": 25.2283, "eval_samples_per_second": 198.19, "eval_steps_per_second": 1.07, "step": 47328 }, { "epoch": 2.126272291564347, "grad_norm": 0.3019942045211792, "learning_rate": 9.871132364762782e-05, "loss": 1.5063, "step": 47330 }, { "epoch": 2.1268142327298594, "grad_norm": 0.3621060848236084, "learning_rate": 9.87075577650219e-05, "loss": 1.4999, "step": 47340 }, { "epoch": 2.1273561738953717, "grad_norm": 0.18868303298950195, "learning_rate": 9.870378646811347e-05, "loss": 1.5056, "step": 47350 }, { "epoch": 2.1278981150608836, "grad_norm": 0.20302945375442505, "learning_rate": 9.870000975736974e-05, "loss": 1.5063, "step": 47360 }, { "epoch": 2.128440056226396, "grad_norm": 0.41298550367355347, "learning_rate": 9.86962276332585e-05, "loss": 1.5067, "step": 47370 }, { "epoch": 2.128981997391908, "grad_norm": 0.25545287132263184, "learning_rate": 9.869244009624831e-05, "loss": 1.4994, "step": 47380 }, { "epoch": 2.1295239385574205, "grad_norm": 0.33961889147758484, "learning_rate": 9.868864714680832e-05, "loss": 1.5066, "step": 47390 }, { "epoch": 2.1300658797229324, "grad_norm": 0.29086834192276, "learning_rate": 9.868484878540842e-05, "loss": 1.505, "step": 47400 }, { "epoch": 2.1306078208884447, "grad_norm": 0.2222570925951004, "learning_rate": 9.868104501251909e-05, "loss": 1.4965, "step": 47410 }, { "epoch": 2.130878791471201, "eval_loss": 2.467705249786377, "eval_runtime": 22.0169, "eval_samples_per_second": 227.099, "eval_steps_per_second": 1.226, "step": 47415 }, { "epoch": 2.131149762053957, "grad_norm": 0.25193503499031067, "learning_rate": 9.867723582861155e-05, "loss": 1.5039, "step": 47420 }, { "epoch": 2.1316917032194693, "grad_norm": 0.2993335723876953, "learning_rate": 9.867342123415768e-05, "loss": 1.5052, "step": 47430 }, { "epoch": 2.1322336443849816, "grad_norm": 0.2402217835187912, "learning_rate": 9.866960122962998e-05, "loss": 1.4947, "step": 47440 }, { "epoch": 2.1327755855504935, "grad_norm": 0.47997650504112244, "learning_rate": 9.866577581550169e-05, "loss": 1.5065, "step": 47450 }, { "epoch": 2.133317526716006, "grad_norm": 0.3690037429332733, "learning_rate": 9.866194499224665e-05, "loss": 1.5024, "step": 47460 }, { "epoch": 2.133859467881518, "grad_norm": 0.2865743935108185, "learning_rate": 9.865810876033946e-05, "loss": 1.4975, "step": 47470 }, { "epoch": 2.1344014090470305, "grad_norm": 0.2986805737018585, "learning_rate": 9.865426712025527e-05, "loss": 1.4964, "step": 47480 }, { "epoch": 2.1349433502125423, "grad_norm": 0.44685614109039307, "learning_rate": 9.865042007247001e-05, "loss": 1.4987, "step": 47490 }, { "epoch": 2.1354852913780547, "grad_norm": 0.26483583450317383, "learning_rate": 9.864656761746021e-05, "loss": 1.4933, "step": 47500 }, { "epoch": 2.135593679611157, "eval_loss": 2.4674875736236572, "eval_runtime": 22.0484, "eval_samples_per_second": 226.774, "eval_steps_per_second": 1.225, "step": 47502 }, { "epoch": 2.136027232543567, "grad_norm": 0.2297901213169098, "learning_rate": 9.864270975570313e-05, "loss": 1.5029, "step": 47510 }, { "epoch": 2.1365691737090793, "grad_norm": 0.251094251871109, "learning_rate": 9.863884648767662e-05, "loss": 1.5057, "step": 47520 }, { "epoch": 2.1371111148745916, "grad_norm": 0.36560431122779846, "learning_rate": 9.863497781385928e-05, "loss": 1.4962, "step": 47530 }, { "epoch": 2.1376530560401035, "grad_norm": 0.3538174033164978, "learning_rate": 9.863110373473033e-05, "loss": 1.4831, "step": 47540 }, { "epoch": 2.138194997205616, "grad_norm": 0.20020203292369843, "learning_rate": 9.862722425076968e-05, "loss": 1.5092, "step": 47550 }, { "epoch": 2.138736938371128, "grad_norm": 0.3080503046512604, "learning_rate": 9.862333936245789e-05, "loss": 1.5111, "step": 47560 }, { "epoch": 2.1392788795366404, "grad_norm": 0.3943610191345215, "learning_rate": 9.861944907027624e-05, "loss": 1.5024, "step": 47570 }, { "epoch": 2.1398208207021527, "grad_norm": 0.25787243247032166, "learning_rate": 9.861555337470658e-05, "loss": 1.4955, "step": 47580 }, { "epoch": 2.1403085677511133, "eval_loss": 2.4601619243621826, "eval_runtime": 22.0199, "eval_samples_per_second": 227.067, "eval_steps_per_second": 1.226, "step": 47589 }, { "epoch": 2.1403627618676646, "grad_norm": 0.2594373822212219, "learning_rate": 9.861165227623154e-05, "loss": 1.5006, "step": 47590 }, { "epoch": 2.140904703033177, "grad_norm": 0.1998678594827652, "learning_rate": 9.860774577533438e-05, "loss": 1.493, "step": 47600 }, { "epoch": 2.1414466441986892, "grad_norm": 0.30177435278892517, "learning_rate": 9.860383387249897e-05, "loss": 1.5081, "step": 47610 }, { "epoch": 2.1419885853642016, "grad_norm": 0.3916855752468109, "learning_rate": 9.859991656820994e-05, "loss": 1.4935, "step": 47620 }, { "epoch": 2.1425305265297134, "grad_norm": 0.3234284520149231, "learning_rate": 9.859599386295255e-05, "loss": 1.5059, "step": 47630 }, { "epoch": 2.1430724676952257, "grad_norm": 0.4442438781261444, "learning_rate": 9.85920657572127e-05, "loss": 1.5093, "step": 47640 }, { "epoch": 2.143614408860738, "grad_norm": 0.194505512714386, "learning_rate": 9.858813225147702e-05, "loss": 1.4962, "step": 47650 }, { "epoch": 2.1441563500262504, "grad_norm": 0.31255653500556946, "learning_rate": 9.858419334623273e-05, "loss": 1.5066, "step": 47660 }, { "epoch": 2.1446982911917627, "grad_norm": 0.2588372230529785, "learning_rate": 9.858024904196782e-05, "loss": 1.502, "step": 47670 }, { "epoch": 2.14502345589107, "eval_loss": 2.4643874168395996, "eval_runtime": 21.985, "eval_samples_per_second": 227.428, "eval_steps_per_second": 1.228, "step": 47676 }, { "epoch": 2.1452402323572746, "grad_norm": 0.27704402804374695, "learning_rate": 9.857629933917084e-05, "loss": 1.5025, "step": 47680 }, { "epoch": 2.145782173522787, "grad_norm": 0.34092339873313904, "learning_rate": 9.857234423833111e-05, "loss": 1.4959, "step": 47690 }, { "epoch": 2.146324114688299, "grad_norm": 0.37414854764938354, "learning_rate": 9.856838373993852e-05, "loss": 1.494, "step": 47700 }, { "epoch": 2.1468660558538115, "grad_norm": 0.19338767230510712, "learning_rate": 9.856441784448373e-05, "loss": 1.5082, "step": 47710 }, { "epoch": 2.1474079970193234, "grad_norm": 0.23796340823173523, "learning_rate": 9.8560446552458e-05, "loss": 1.4824, "step": 47720 }, { "epoch": 2.1479499381848357, "grad_norm": 0.3311995565891266, "learning_rate": 9.855646986435325e-05, "loss": 1.5069, "step": 47730 }, { "epoch": 2.148491879350348, "grad_norm": 0.2240724116563797, "learning_rate": 9.855248778066212e-05, "loss": 1.4987, "step": 47740 }, { "epoch": 2.1490338205158603, "grad_norm": 0.22334522008895874, "learning_rate": 9.854850030187791e-05, "loss": 1.4981, "step": 47750 }, { "epoch": 2.1495757616813727, "grad_norm": 0.21262875199317932, "learning_rate": 9.854450742849452e-05, "loss": 1.5029, "step": 47760 }, { "epoch": 2.149738344031026, "eval_loss": 2.4613311290740967, "eval_runtime": 21.9832, "eval_samples_per_second": 227.447, "eval_steps_per_second": 1.228, "step": 47763 }, { "epoch": 2.1501177028468845, "grad_norm": 0.2315114289522171, "learning_rate": 9.854050916100664e-05, "loss": 1.5089, "step": 47770 }, { "epoch": 2.150659644012397, "grad_norm": 0.34692510962486267, "learning_rate": 9.853650549990949e-05, "loss": 1.488, "step": 47780 }, { "epoch": 2.151201585177909, "grad_norm": 0.2978134751319885, "learning_rate": 9.853249644569907e-05, "loss": 1.5069, "step": 47790 }, { "epoch": 2.1517435263434215, "grad_norm": 0.2450045347213745, "learning_rate": 9.852848199887198e-05, "loss": 1.4962, "step": 47800 }, { "epoch": 2.152285467508934, "grad_norm": 0.27408814430236816, "learning_rate": 9.852446215992552e-05, "loss": 1.5035, "step": 47810 }, { "epoch": 2.1528274086744457, "grad_norm": 0.288369745016098, "learning_rate": 9.852043692935766e-05, "loss": 1.5054, "step": 47820 }, { "epoch": 2.153369349839958, "grad_norm": 0.33964666724205017, "learning_rate": 9.851640630766703e-05, "loss": 1.5055, "step": 47830 }, { "epoch": 2.1539112910054703, "grad_norm": 0.3584079444408417, "learning_rate": 9.851237029535289e-05, "loss": 1.4909, "step": 47840 }, { "epoch": 2.1544532321709826, "grad_norm": 0.3552767336368561, "learning_rate": 9.850832889291525e-05, "loss": 1.4932, "step": 47850 }, { "epoch": 2.1544532321709826, "eval_loss": 2.4597816467285156, "eval_runtime": 21.9841, "eval_samples_per_second": 227.437, "eval_steps_per_second": 1.228, "step": 47850 }, { "epoch": 2.1549951733364945, "grad_norm": 0.24502573907375336, "learning_rate": 9.850428210085471e-05, "loss": 1.4988, "step": 47860 }, { "epoch": 2.155537114502007, "grad_norm": 0.26566943526268005, "learning_rate": 9.850022991967258e-05, "loss": 1.5044, "step": 47870 }, { "epoch": 2.156079055667519, "grad_norm": 0.22309494018554688, "learning_rate": 9.849617234987083e-05, "loss": 1.5052, "step": 47880 }, { "epoch": 2.1566209968330314, "grad_norm": 0.3310026526451111, "learning_rate": 9.849210939195209e-05, "loss": 1.5026, "step": 47890 }, { "epoch": 2.1571629379985433, "grad_norm": 0.24902848899364471, "learning_rate": 9.848804104641966e-05, "loss": 1.5058, "step": 47900 }, { "epoch": 2.1577048791640556, "grad_norm": 0.21098916232585907, "learning_rate": 9.848396731377751e-05, "loss": 1.5041, "step": 47910 }, { "epoch": 2.158246820329568, "grad_norm": 0.23908697068691254, "learning_rate": 9.84798881945303e-05, "loss": 1.5018, "step": 47920 }, { "epoch": 2.1587887614950803, "grad_norm": 0.41047078371047974, "learning_rate": 9.847580368918329e-05, "loss": 1.5021, "step": 47930 }, { "epoch": 2.1591681203109387, "eval_loss": 2.4657371044158936, "eval_runtime": 21.9884, "eval_samples_per_second": 227.392, "eval_steps_per_second": 1.228, "step": 47937 }, { "epoch": 2.1593307026605926, "grad_norm": 0.4476509988307953, "learning_rate": 9.847171379824248e-05, "loss": 1.5102, "step": 47940 }, { "epoch": 2.1598726438261044, "grad_norm": 0.43281930685043335, "learning_rate": 9.846761852221449e-05, "loss": 1.4999, "step": 47950 }, { "epoch": 2.1604145849916168, "grad_norm": 0.20760992169380188, "learning_rate": 9.846351786160665e-05, "loss": 1.5029, "step": 47960 }, { "epoch": 2.160956526157129, "grad_norm": 0.24444232881069183, "learning_rate": 9.84594118169269e-05, "loss": 1.5091, "step": 47970 }, { "epoch": 2.1614984673226414, "grad_norm": 0.25241169333457947, "learning_rate": 9.84553003886839e-05, "loss": 1.5019, "step": 47980 }, { "epoch": 2.1620404084881537, "grad_norm": 0.26885291934013367, "learning_rate": 9.845118357738696e-05, "loss": 1.5068, "step": 47990 }, { "epoch": 2.1625823496536656, "grad_norm": 0.26971253752708435, "learning_rate": 9.844706138354603e-05, "loss": 1.5013, "step": 48000 }, { "epoch": 2.163124290819178, "grad_norm": 0.20452235639095306, "learning_rate": 9.844293380767178e-05, "loss": 1.4947, "step": 48010 }, { "epoch": 2.16366623198469, "grad_norm": 0.19715271890163422, "learning_rate": 9.843880085027551e-05, "loss": 1.496, "step": 48020 }, { "epoch": 2.163883008450895, "eval_loss": 2.4602131843566895, "eval_runtime": 21.9834, "eval_samples_per_second": 227.444, "eval_steps_per_second": 1.228, "step": 48024 }, { "epoch": 2.1642081731502025, "grad_norm": 0.30456554889678955, "learning_rate": 9.843466251186916e-05, "loss": 1.5103, "step": 48030 }, { "epoch": 2.1647501143157144, "grad_norm": 0.24360065162181854, "learning_rate": 9.843051879296539e-05, "loss": 1.5072, "step": 48040 }, { "epoch": 2.1652920554812267, "grad_norm": 0.2575078308582306, "learning_rate": 9.842636969407753e-05, "loss": 1.5027, "step": 48050 }, { "epoch": 2.165833996646739, "grad_norm": 0.1880762130022049, "learning_rate": 9.842221521571951e-05, "loss": 1.4936, "step": 48060 }, { "epoch": 2.1663759378122514, "grad_norm": 0.24595960974693298, "learning_rate": 9.841805535840602e-05, "loss": 1.501, "step": 48070 }, { "epoch": 2.1669178789777637, "grad_norm": 0.21159407496452332, "learning_rate": 9.841389012265231e-05, "loss": 1.4968, "step": 48080 }, { "epoch": 2.1674598201432755, "grad_norm": 0.4702402353286743, "learning_rate": 9.840971950897437e-05, "loss": 1.4929, "step": 48090 }, { "epoch": 2.168001761308788, "grad_norm": 0.23953235149383545, "learning_rate": 9.840554351788885e-05, "loss": 1.4948, "step": 48100 }, { "epoch": 2.1685437024743, "grad_norm": 0.21100491285324097, "learning_rate": 9.840136214991305e-05, "loss": 1.4927, "step": 48110 }, { "epoch": 2.1685978965908514, "eval_loss": 2.4579579830169678, "eval_runtime": 21.9856, "eval_samples_per_second": 227.422, "eval_steps_per_second": 1.228, "step": 48111 }, { "epoch": 2.1690856436398125, "grad_norm": 0.2241096794605255, "learning_rate": 9.839717540556495e-05, "loss": 1.4958, "step": 48120 }, { "epoch": 2.1696275848053244, "grad_norm": 0.19508574903011322, "learning_rate": 9.839298328536316e-05, "loss": 1.4886, "step": 48130 }, { "epoch": 2.1701695259708367, "grad_norm": 0.3244903087615967, "learning_rate": 9.838878578982699e-05, "loss": 1.4895, "step": 48140 }, { "epoch": 2.170711467136349, "grad_norm": 0.3189353346824646, "learning_rate": 9.838458291947641e-05, "loss": 1.4833, "step": 48150 }, { "epoch": 2.1712534083018613, "grad_norm": 0.20821264386177063, "learning_rate": 9.838037467483207e-05, "loss": 1.498, "step": 48160 }, { "epoch": 2.1717953494673736, "grad_norm": 0.21814101934432983, "learning_rate": 9.837616105641523e-05, "loss": 1.4939, "step": 48170 }, { "epoch": 2.1723372906328855, "grad_norm": 0.2125815451145172, "learning_rate": 9.837194206474789e-05, "loss": 1.4896, "step": 48180 }, { "epoch": 2.172879231798398, "grad_norm": 0.20504556596279144, "learning_rate": 9.83677177003527e-05, "loss": 1.498, "step": 48190 }, { "epoch": 2.1733127847308076, "eval_loss": 2.4657247066497803, "eval_runtime": 22.1285, "eval_samples_per_second": 225.953, "eval_steps_per_second": 1.22, "step": 48198 }, { "epoch": 2.17342117296391, "grad_norm": 0.21012401580810547, "learning_rate": 9.836348796375288e-05, "loss": 1.4979, "step": 48200 }, { "epoch": 2.1739631141294224, "grad_norm": 0.42915037274360657, "learning_rate": 9.835925285547245e-05, "loss": 1.4883, "step": 48210 }, { "epoch": 2.1745050552949348, "grad_norm": 0.350690633058548, "learning_rate": 9.835501237603603e-05, "loss": 1.5056, "step": 48220 }, { "epoch": 2.1750469964604466, "grad_norm": 0.23836711049079895, "learning_rate": 9.835076652596891e-05, "loss": 1.5043, "step": 48230 }, { "epoch": 2.175588937625959, "grad_norm": 0.4392627477645874, "learning_rate": 9.834651530579703e-05, "loss": 1.489, "step": 48240 }, { "epoch": 2.1761308787914713, "grad_norm": 0.1950349509716034, "learning_rate": 9.834225871604701e-05, "loss": 1.4914, "step": 48250 }, { "epoch": 2.1766728199569836, "grad_norm": 0.36042729020118713, "learning_rate": 9.833799675724619e-05, "loss": 1.5052, "step": 48260 }, { "epoch": 2.1772147611224955, "grad_norm": 0.3218774199485779, "learning_rate": 9.833372942992248e-05, "loss": 1.4963, "step": 48270 }, { "epoch": 2.1777567022880078, "grad_norm": 0.17845819890499115, "learning_rate": 9.832945673460448e-05, "loss": 1.494, "step": 48280 }, { "epoch": 2.178027672870764, "eval_loss": 2.4591383934020996, "eval_runtime": 22.2137, "eval_samples_per_second": 225.086, "eval_steps_per_second": 1.215, "step": 48285 }, { "epoch": 2.17829864345352, "grad_norm": 0.36038216948509216, "learning_rate": 9.832517867182151e-05, "loss": 1.4997, "step": 48290 }, { "epoch": 2.1788405846190324, "grad_norm": 0.2766604721546173, "learning_rate": 9.832089524210352e-05, "loss": 1.4955, "step": 48300 }, { "epoch": 2.1793825257845447, "grad_norm": 0.2935325801372528, "learning_rate": 9.831660644598109e-05, "loss": 1.5001, "step": 48310 }, { "epoch": 2.1799244669500566, "grad_norm": 0.3200201094150543, "learning_rate": 9.831231228398553e-05, "loss": 1.4945, "step": 48320 }, { "epoch": 2.180466408115569, "grad_norm": 0.27418839931488037, "learning_rate": 9.830801275664876e-05, "loss": 1.5014, "step": 48330 }, { "epoch": 2.1810083492810812, "grad_norm": 0.26256710290908813, "learning_rate": 9.830370786450339e-05, "loss": 1.5014, "step": 48340 }, { "epoch": 2.1815502904465935, "grad_norm": 0.4114410877227783, "learning_rate": 9.82993976080827e-05, "loss": 1.51, "step": 48350 }, { "epoch": 2.1820922316121054, "grad_norm": 0.3471300005912781, "learning_rate": 9.829508198792062e-05, "loss": 1.5092, "step": 48360 }, { "epoch": 2.1826341727776177, "grad_norm": 0.356444776058197, "learning_rate": 9.829076100455176e-05, "loss": 1.4959, "step": 48370 }, { "epoch": 2.1827425610107203, "eval_loss": 2.4686012268066406, "eval_runtime": 22.0175, "eval_samples_per_second": 227.092, "eval_steps_per_second": 1.226, "step": 48372 }, { "epoch": 2.18317611394313, "grad_norm": 0.2611772418022156, "learning_rate": 9.828643465851137e-05, "loss": 1.4966, "step": 48380 }, { "epoch": 2.1837180551086424, "grad_norm": 0.43987488746643066, "learning_rate": 9.828210295033537e-05, "loss": 1.4965, "step": 48390 }, { "epoch": 2.1842599962741547, "grad_norm": 0.24238485097885132, "learning_rate": 9.82777658805604e-05, "loss": 1.4908, "step": 48400 }, { "epoch": 2.1848019374396666, "grad_norm": 0.2008477747440338, "learning_rate": 9.827342344972366e-05, "loss": 1.4946, "step": 48410 }, { "epoch": 2.185343878605179, "grad_norm": 0.21971365809440613, "learning_rate": 9.826907565836311e-05, "loss": 1.4978, "step": 48420 }, { "epoch": 2.185885819770691, "grad_norm": 0.35851284861564636, "learning_rate": 9.826472250701732e-05, "loss": 1.4988, "step": 48430 }, { "epoch": 2.1864277609362035, "grad_norm": 0.24326647818088531, "learning_rate": 9.826036399622553e-05, "loss": 1.4949, "step": 48440 }, { "epoch": 2.186969702101716, "grad_norm": 0.23622386157512665, "learning_rate": 9.82560001265277e-05, "loss": 1.4959, "step": 48450 }, { "epoch": 2.1874574491506764, "eval_loss": 2.4709348678588867, "eval_runtime": 22.0771, "eval_samples_per_second": 226.479, "eval_steps_per_second": 1.223, "step": 48459 }, { "epoch": 2.1875116432672277, "grad_norm": 0.22727850079536438, "learning_rate": 9.825163089846434e-05, "loss": 1.4841, "step": 48460 }, { "epoch": 2.18805358443274, "grad_norm": 0.291652649641037, "learning_rate": 9.824725631257674e-05, "loss": 1.4959, "step": 48470 }, { "epoch": 2.1885955255982523, "grad_norm": 0.27845892310142517, "learning_rate": 9.824287636940678e-05, "loss": 1.4919, "step": 48480 }, { "epoch": 2.1891374667637646, "grad_norm": 0.3905391991138458, "learning_rate": 9.823849106949704e-05, "loss": 1.4948, "step": 48490 }, { "epoch": 2.1896794079292765, "grad_norm": 0.2136894315481186, "learning_rate": 9.823410041339075e-05, "loss": 1.4967, "step": 48500 }, { "epoch": 2.190221349094789, "grad_norm": 0.3162279725074768, "learning_rate": 9.82297044016318e-05, "loss": 1.4927, "step": 48510 }, { "epoch": 2.190763290260301, "grad_norm": 0.24722278118133545, "learning_rate": 9.822530303476476e-05, "loss": 1.4965, "step": 48520 }, { "epoch": 2.1913052314258135, "grad_norm": 0.2130030393600464, "learning_rate": 9.822089631333484e-05, "loss": 1.5057, "step": 48530 }, { "epoch": 2.1918471725913253, "grad_norm": 0.2408854067325592, "learning_rate": 9.821648423788794e-05, "loss": 1.5172, "step": 48540 }, { "epoch": 2.192172337290633, "eval_loss": 2.4656124114990234, "eval_runtime": 22.0104, "eval_samples_per_second": 227.165, "eval_steps_per_second": 1.227, "step": 48546 }, { "epoch": 2.1923891137568376, "grad_norm": 0.21235163509845734, "learning_rate": 9.821206680897059e-05, "loss": 1.4993, "step": 48550 }, { "epoch": 2.19293105492235, "grad_norm": 0.365730345249176, "learning_rate": 9.820764402713002e-05, "loss": 1.4979, "step": 48560 }, { "epoch": 2.1934729960878623, "grad_norm": 0.26309072971343994, "learning_rate": 9.82032158929141e-05, "loss": 1.4962, "step": 48570 }, { "epoch": 2.1940149372533746, "grad_norm": 0.3084958493709564, "learning_rate": 9.819878240687136e-05, "loss": 1.4912, "step": 48580 }, { "epoch": 2.1945568784188865, "grad_norm": 0.2697133719921112, "learning_rate": 9.8194343569551e-05, "loss": 1.4936, "step": 48590 }, { "epoch": 2.195098819584399, "grad_norm": 0.31936559081077576, "learning_rate": 9.81898993815029e-05, "loss": 1.4938, "step": 48600 }, { "epoch": 2.195640760749911, "grad_norm": 0.2950705289840698, "learning_rate": 9.818544984327756e-05, "loss": 1.501, "step": 48610 }, { "epoch": 2.1961827019154234, "grad_norm": 0.49501827359199524, "learning_rate": 9.818099495542619e-05, "loss": 1.5048, "step": 48620 }, { "epoch": 2.1967246430809357, "grad_norm": 0.22042915225028992, "learning_rate": 9.817653471850066e-05, "loss": 1.4881, "step": 48630 }, { "epoch": 2.196887225430589, "eval_loss": 2.4624252319335938, "eval_runtime": 22.0712, "eval_samples_per_second": 226.54, "eval_steps_per_second": 1.223, "step": 48633 }, { "epoch": 2.1972665842464476, "grad_norm": 0.26132088899612427, "learning_rate": 9.817206913305344e-05, "loss": 1.497, "step": 48640 }, { "epoch": 2.19780852541196, "grad_norm": 0.19975581765174866, "learning_rate": 9.816759819963773e-05, "loss": 1.483, "step": 48650 }, { "epoch": 2.1983504665774722, "grad_norm": 0.2904106080532074, "learning_rate": 9.816312191880738e-05, "loss": 1.4892, "step": 48660 }, { "epoch": 2.1988924077429846, "grad_norm": 0.2334350049495697, "learning_rate": 9.815864029111688e-05, "loss": 1.4904, "step": 48670 }, { "epoch": 2.1994343489084964, "grad_norm": 0.24541063606739044, "learning_rate": 9.81541533171214e-05, "loss": 1.4901, "step": 48680 }, { "epoch": 2.1999762900740087, "grad_norm": 0.1976650059223175, "learning_rate": 9.814966099737676e-05, "loss": 1.479, "step": 48690 }, { "epoch": 2.200518231239521, "grad_norm": 0.22315511107444763, "learning_rate": 9.814516333243945e-05, "loss": 1.5053, "step": 48700 }, { "epoch": 2.2010601724050334, "grad_norm": 0.3041779100894928, "learning_rate": 9.814066032286664e-05, "loss": 1.4908, "step": 48710 }, { "epoch": 2.2016021135705457, "grad_norm": 0.3959655463695526, "learning_rate": 9.813615196921611e-05, "loss": 1.497, "step": 48720 }, { "epoch": 2.2016021135705457, "eval_loss": 2.467682361602783, "eval_runtime": 21.8632, "eval_samples_per_second": 228.695, "eval_steps_per_second": 1.235, "step": 48720 }, { "epoch": 2.2021440547360576, "grad_norm": 0.2074437290430069, "learning_rate": 9.813163827204634e-05, "loss": 1.4933, "step": 48730 }, { "epoch": 2.20268599590157, "grad_norm": 0.24793848395347595, "learning_rate": 9.812711923191651e-05, "loss": 1.5001, "step": 48740 }, { "epoch": 2.203227937067082, "grad_norm": 0.2648943364620209, "learning_rate": 9.812259484938638e-05, "loss": 1.4939, "step": 48750 }, { "epoch": 2.2037698782325945, "grad_norm": 0.4681974947452545, "learning_rate": 9.811806512501641e-05, "loss": 1.4988, "step": 48760 }, { "epoch": 2.2043118193981064, "grad_norm": 0.30370041728019714, "learning_rate": 9.811353005936774e-05, "loss": 1.4922, "step": 48770 }, { "epoch": 2.2048537605636187, "grad_norm": 0.26227137446403503, "learning_rate": 9.810898965300213e-05, "loss": 1.4986, "step": 48780 }, { "epoch": 2.205395701729131, "grad_norm": 0.3216424286365509, "learning_rate": 9.810444390648207e-05, "loss": 1.493, "step": 48790 }, { "epoch": 2.2059376428946433, "grad_norm": 0.19285954535007477, "learning_rate": 9.809989282037062e-05, "loss": 1.4856, "step": 48800 }, { "epoch": 2.206317001710502, "eval_loss": 2.4649438858032227, "eval_runtime": 22.0562, "eval_samples_per_second": 226.694, "eval_steps_per_second": 1.224, "step": 48807 }, { "epoch": 2.2064795840601557, "grad_norm": 0.5123102068901062, "learning_rate": 9.809533639523156e-05, "loss": 1.494, "step": 48810 }, { "epoch": 2.2070215252256675, "grad_norm": 0.23556865751743317, "learning_rate": 9.809077463162935e-05, "loss": 1.4987, "step": 48820 }, { "epoch": 2.20756346639118, "grad_norm": 0.25429365038871765, "learning_rate": 9.808620753012906e-05, "loss": 1.492, "step": 48830 }, { "epoch": 2.208105407556692, "grad_norm": 0.2279432713985443, "learning_rate": 9.808163509129643e-05, "loss": 1.4876, "step": 48840 }, { "epoch": 2.2086473487222045, "grad_norm": 0.23476704955101013, "learning_rate": 9.807705731569789e-05, "loss": 1.4923, "step": 48850 }, { "epoch": 2.209189289887717, "grad_norm": 0.19318480789661407, "learning_rate": 9.80724742039005e-05, "loss": 1.4884, "step": 48860 }, { "epoch": 2.2097312310532287, "grad_norm": 0.2585408091545105, "learning_rate": 9.806788575647204e-05, "loss": 1.5005, "step": 48870 }, { "epoch": 2.210273172218741, "grad_norm": 0.23720701038837433, "learning_rate": 9.806329197398085e-05, "loss": 1.4892, "step": 48880 }, { "epoch": 2.2108151133842533, "grad_norm": 0.3471413552761078, "learning_rate": 9.805869285699602e-05, "loss": 1.4984, "step": 48890 }, { "epoch": 2.211031889850458, "eval_loss": 2.4734652042388916, "eval_runtime": 22.0944, "eval_samples_per_second": 226.302, "eval_steps_per_second": 1.222, "step": 48894 }, { "epoch": 2.2113570545497656, "grad_norm": 0.4286750555038452, "learning_rate": 9.805408840608725e-05, "loss": 1.4975, "step": 48900 }, { "epoch": 2.2118989957152775, "grad_norm": 0.4188242256641388, "learning_rate": 9.804947862182494e-05, "loss": 1.4967, "step": 48910 }, { "epoch": 2.21244093688079, "grad_norm": 0.3151668310165405, "learning_rate": 9.804486350478013e-05, "loss": 1.4959, "step": 48920 }, { "epoch": 2.212982878046302, "grad_norm": 0.24722276628017426, "learning_rate": 9.804024305552451e-05, "loss": 1.486, "step": 48930 }, { "epoch": 2.2135248192118144, "grad_norm": 0.33431771397590637, "learning_rate": 9.803561727463042e-05, "loss": 1.4907, "step": 48940 }, { "epoch": 2.2140667603773263, "grad_norm": 0.20495231449604034, "learning_rate": 9.803098616267093e-05, "loss": 1.5058, "step": 48950 }, { "epoch": 2.2146087015428386, "grad_norm": 0.2133825570344925, "learning_rate": 9.802634972021967e-05, "loss": 1.4913, "step": 48960 }, { "epoch": 2.215150642708351, "grad_norm": 0.32575109601020813, "learning_rate": 9.802170794785103e-05, "loss": 1.4912, "step": 48970 }, { "epoch": 2.2156925838738633, "grad_norm": 0.24184833467006683, "learning_rate": 9.801706084613997e-05, "loss": 1.4915, "step": 48980 }, { "epoch": 2.2157467779904145, "eval_loss": 2.473459243774414, "eval_runtime": 22.0863, "eval_samples_per_second": 226.385, "eval_steps_per_second": 1.222, "step": 48981 }, { "epoch": 2.2162345250393756, "grad_norm": 0.281916081905365, "learning_rate": 9.801240841566221e-05, "loss": 1.4939, "step": 48990 }, { "epoch": 2.2167764662048874, "grad_norm": 0.24919818341732025, "learning_rate": 9.8007750656994e-05, "loss": 1.5073, "step": 49000 }, { "epoch": 2.2173184073703998, "grad_norm": 0.3815664052963257, "learning_rate": 9.800308757071238e-05, "loss": 1.4974, "step": 49010 }, { "epoch": 2.217860348535912, "grad_norm": 0.21346397697925568, "learning_rate": 9.799841915739496e-05, "loss": 1.49, "step": 49020 }, { "epoch": 2.2184022897014244, "grad_norm": 0.2835428714752197, "learning_rate": 9.799374541762005e-05, "loss": 1.4981, "step": 49030 }, { "epoch": 2.2189442308669367, "grad_norm": 0.20683638751506805, "learning_rate": 9.798906635196665e-05, "loss": 1.5014, "step": 49040 }, { "epoch": 2.2194861720324486, "grad_norm": 0.28130945563316345, "learning_rate": 9.798438196101432e-05, "loss": 1.4794, "step": 49050 }, { "epoch": 2.220028113197961, "grad_norm": 0.6789763569831848, "learning_rate": 9.797969224534338e-05, "loss": 1.504, "step": 49060 }, { "epoch": 2.2204616661303707, "eval_loss": 2.4563019275665283, "eval_runtime": 22.0507, "eval_samples_per_second": 226.75, "eval_steps_per_second": 1.224, "step": 49068 }, { "epoch": 2.220570054363473, "grad_norm": 0.5758925676345825, "learning_rate": 9.797499720553476e-05, "loss": 1.4929, "step": 49070 }, { "epoch": 2.2211119955289855, "grad_norm": 0.26593759655952454, "learning_rate": 9.797029684217008e-05, "loss": 1.4907, "step": 49080 }, { "epoch": 2.2216539366944974, "grad_norm": 0.2138567566871643, "learning_rate": 9.796559115583158e-05, "loss": 1.4922, "step": 49090 }, { "epoch": 2.2221958778600097, "grad_norm": 0.31835222244262695, "learning_rate": 9.796088014710218e-05, "loss": 1.4927, "step": 49100 }, { "epoch": 2.222737819025522, "grad_norm": 0.19409584999084473, "learning_rate": 9.795616381656546e-05, "loss": 1.4954, "step": 49110 }, { "epoch": 2.2232797601910343, "grad_norm": 0.273907870054245, "learning_rate": 9.795144216480566e-05, "loss": 1.4999, "step": 49120 }, { "epoch": 2.2238217013565467, "grad_norm": 0.19406692683696747, "learning_rate": 9.794671519240768e-05, "loss": 1.4948, "step": 49130 }, { "epoch": 2.2243636425220585, "grad_norm": 0.262638121843338, "learning_rate": 9.79419828999571e-05, "loss": 1.4956, "step": 49140 }, { "epoch": 2.224905583687571, "grad_norm": 0.26331815123558044, "learning_rate": 9.79372452880401e-05, "loss": 1.4954, "step": 49150 }, { "epoch": 2.2251765542703272, "eval_loss": 2.458817958831787, "eval_runtime": 22.1352, "eval_samples_per_second": 225.885, "eval_steps_per_second": 1.22, "step": 49155 }, { "epoch": 2.225447524853083, "grad_norm": 0.35624048113822937, "learning_rate": 9.793250235724358e-05, "loss": 1.4944, "step": 49160 }, { "epoch": 2.2259894660185955, "grad_norm": 0.24932223558425903, "learning_rate": 9.792775410815506e-05, "loss": 1.4867, "step": 49170 }, { "epoch": 2.2265314071841074, "grad_norm": 0.2496650665998459, "learning_rate": 9.792300054136272e-05, "loss": 1.4798, "step": 49180 }, { "epoch": 2.2270733483496197, "grad_norm": 0.29679879546165466, "learning_rate": 9.791824165745543e-05, "loss": 1.493, "step": 49190 }, { "epoch": 2.227615289515132, "grad_norm": 0.3611978590488434, "learning_rate": 9.79134774570227e-05, "loss": 1.4927, "step": 49200 }, { "epoch": 2.2281572306806443, "grad_norm": 0.289468377828598, "learning_rate": 9.790870794065469e-05, "loss": 1.5052, "step": 49210 }, { "epoch": 2.2286991718461566, "grad_norm": 0.31869956851005554, "learning_rate": 9.790393310894225e-05, "loss": 1.5031, "step": 49220 }, { "epoch": 2.2292411130116685, "grad_norm": 0.1822601705789566, "learning_rate": 9.789915296247682e-05, "loss": 1.4961, "step": 49230 }, { "epoch": 2.229783054177181, "grad_norm": 0.19582431018352509, "learning_rate": 9.789436750185059e-05, "loss": 1.4834, "step": 49240 }, { "epoch": 2.2298914424102834, "eval_loss": 2.4611740112304688, "eval_runtime": 23.4689, "eval_samples_per_second": 213.048, "eval_steps_per_second": 1.15, "step": 49242 }, { "epoch": 2.230324995342693, "grad_norm": 0.3001343607902527, "learning_rate": 9.788957672765634e-05, "loss": 1.4975, "step": 49250 }, { "epoch": 2.2308669365082054, "grad_norm": 0.32307741045951843, "learning_rate": 9.788478064048753e-05, "loss": 1.4875, "step": 49260 }, { "epoch": 2.2314088776737178, "grad_norm": 0.21440020203590393, "learning_rate": 9.787997924093829e-05, "loss": 1.5025, "step": 49270 }, { "epoch": 2.2319508188392296, "grad_norm": 0.21042481064796448, "learning_rate": 9.787517252960335e-05, "loss": 1.5006, "step": 49280 }, { "epoch": 2.232492760004742, "grad_norm": 0.22425493597984314, "learning_rate": 9.787036050707822e-05, "loss": 1.4948, "step": 49290 }, { "epoch": 2.2330347011702543, "grad_norm": 0.24447576701641083, "learning_rate": 9.786554317395894e-05, "loss": 1.4939, "step": 49300 }, { "epoch": 2.2335766423357666, "grad_norm": 0.3083951771259308, "learning_rate": 9.786072053084228e-05, "loss": 1.4949, "step": 49310 }, { "epoch": 2.2341185835012785, "grad_norm": 0.24689245223999023, "learning_rate": 9.785589257832565e-05, "loss": 1.489, "step": 49320 }, { "epoch": 2.2346063305502395, "eval_loss": 2.45898175239563, "eval_runtime": 22.1974, "eval_samples_per_second": 225.252, "eval_steps_per_second": 1.216, "step": 49329 }, { "epoch": 2.2346605246667908, "grad_norm": 0.3437364995479584, "learning_rate": 9.78510593170071e-05, "loss": 1.4934, "step": 49330 }, { "epoch": 2.235202465832303, "grad_norm": 0.34142521023750305, "learning_rate": 9.784622074748537e-05, "loss": 1.4972, "step": 49340 }, { "epoch": 2.2357444069978154, "grad_norm": 0.3373830020427704, "learning_rate": 9.784137687035983e-05, "loss": 1.4975, "step": 49350 }, { "epoch": 2.2362863481633277, "grad_norm": 0.2561146318912506, "learning_rate": 9.78365276862305e-05, "loss": 1.4934, "step": 49360 }, { "epoch": 2.2368282893288396, "grad_norm": 0.19827355444431305, "learning_rate": 9.783167319569813e-05, "loss": 1.4969, "step": 49370 }, { "epoch": 2.237370230494352, "grad_norm": 0.2824994921684265, "learning_rate": 9.782681339936404e-05, "loss": 1.4843, "step": 49380 }, { "epoch": 2.2379121716598642, "grad_norm": 0.21020928025245667, "learning_rate": 9.782194829783022e-05, "loss": 1.4916, "step": 49390 }, { "epoch": 2.2384541128253765, "grad_norm": 0.30868810415267944, "learning_rate": 9.781707789169937e-05, "loss": 1.4969, "step": 49400 }, { "epoch": 2.2389960539908884, "grad_norm": 0.41937023401260376, "learning_rate": 9.781220218157479e-05, "loss": 1.4893, "step": 49410 }, { "epoch": 2.239321218690196, "eval_loss": 2.4543631076812744, "eval_runtime": 22.0555, "eval_samples_per_second": 226.7, "eval_steps_per_second": 1.224, "step": 49416 }, { "epoch": 2.2395379951564007, "grad_norm": 0.6382551789283752, "learning_rate": 9.780732116806052e-05, "loss": 1.5002, "step": 49420 }, { "epoch": 2.240079936321913, "grad_norm": 0.2832179069519043, "learning_rate": 9.780243485176111e-05, "loss": 1.4967, "step": 49430 }, { "epoch": 2.2406218774874254, "grad_norm": 0.19235193729400635, "learning_rate": 9.779754323328192e-05, "loss": 1.4979, "step": 49440 }, { "epoch": 2.2411638186529377, "grad_norm": 0.26842355728149414, "learning_rate": 9.779264631322888e-05, "loss": 1.4917, "step": 49450 }, { "epoch": 2.2417057598184496, "grad_norm": 0.28281205892562866, "learning_rate": 9.778774409220859e-05, "loss": 1.4907, "step": 49460 }, { "epoch": 2.242247700983962, "grad_norm": 0.3516809344291687, "learning_rate": 9.778283657082832e-05, "loss": 1.4887, "step": 49470 }, { "epoch": 2.242789642149474, "grad_norm": 0.23774586617946625, "learning_rate": 9.777792374969603e-05, "loss": 1.4881, "step": 49480 }, { "epoch": 2.2433315833149865, "grad_norm": 0.32771846652030945, "learning_rate": 9.777300562942027e-05, "loss": 1.4883, "step": 49490 }, { "epoch": 2.243873524480499, "grad_norm": 0.25199854373931885, "learning_rate": 9.776808221061024e-05, "loss": 1.4944, "step": 49500 }, { "epoch": 2.244036106830152, "eval_loss": 2.4585046768188477, "eval_runtime": 22.068, "eval_samples_per_second": 226.572, "eval_steps_per_second": 1.223, "step": 49503 }, { "epoch": 2.2444154656460107, "grad_norm": 0.26867803931236267, "learning_rate": 9.776315349387589e-05, "loss": 1.4877, "step": 49510 }, { "epoch": 2.244957406811523, "grad_norm": 0.2837635576725006, "learning_rate": 9.775821947982775e-05, "loss": 1.4914, "step": 49520 }, { "epoch": 2.2454993479770353, "grad_norm": 0.24937689304351807, "learning_rate": 9.775328016907701e-05, "loss": 1.4903, "step": 49530 }, { "epoch": 2.2460412891425476, "grad_norm": 0.20561334490776062, "learning_rate": 9.774833556223554e-05, "loss": 1.4971, "step": 49540 }, { "epoch": 2.2465832303080595, "grad_norm": 0.2335055023431778, "learning_rate": 9.774338565991586e-05, "loss": 1.4979, "step": 49550 }, { "epoch": 2.247125171473572, "grad_norm": 0.3294164836406708, "learning_rate": 9.773843046273111e-05, "loss": 1.4996, "step": 49560 }, { "epoch": 2.247667112639084, "grad_norm": 0.3552034795284271, "learning_rate": 9.773346997129518e-05, "loss": 1.5011, "step": 49570 }, { "epoch": 2.2482090538045965, "grad_norm": 0.36698946356773376, "learning_rate": 9.77285041862225e-05, "loss": 1.5012, "step": 49580 }, { "epoch": 2.2487509949701083, "grad_norm": 0.2022208720445633, "learning_rate": 9.772353310812824e-05, "loss": 1.5007, "step": 49590 }, { "epoch": 2.2487509949701083, "eval_loss": 2.4587907791137695, "eval_runtime": 22.0232, "eval_samples_per_second": 227.033, "eval_steps_per_second": 1.226, "step": 49590 }, { "epoch": 2.2492929361356206, "grad_norm": 0.3783475160598755, "learning_rate": 9.771855673762818e-05, "loss": 1.4963, "step": 49600 }, { "epoch": 2.249834877301133, "grad_norm": 0.4129343628883362, "learning_rate": 9.771357507533878e-05, "loss": 1.4945, "step": 49610 }, { "epoch": 2.2503768184666453, "grad_norm": 0.2977985739707947, "learning_rate": 9.770858812187715e-05, "loss": 1.4925, "step": 49620 }, { "epoch": 2.2509187596321576, "grad_norm": 0.22732873260974884, "learning_rate": 9.770359587786105e-05, "loss": 1.4973, "step": 49630 }, { "epoch": 2.2514607007976695, "grad_norm": 0.27571314573287964, "learning_rate": 9.769859834390887e-05, "loss": 1.4836, "step": 49640 }, { "epoch": 2.252002641963182, "grad_norm": 0.19114813208580017, "learning_rate": 9.769359552063972e-05, "loss": 1.4889, "step": 49650 }, { "epoch": 2.252544583128694, "grad_norm": 0.193264439702034, "learning_rate": 9.76885874086733e-05, "loss": 1.5037, "step": 49660 }, { "epoch": 2.2530865242942064, "grad_norm": 0.2822969853878021, "learning_rate": 9.768357400863004e-05, "loss": 1.4948, "step": 49670 }, { "epoch": 2.253465883110065, "eval_loss": 2.4479169845581055, "eval_runtime": 21.9892, "eval_samples_per_second": 227.384, "eval_steps_per_second": 1.228, "step": 49677 }, { "epoch": 2.2536284654597187, "grad_norm": 0.2823607325553894, "learning_rate": 9.767855532113091e-05, "loss": 1.4953, "step": 49680 }, { "epoch": 2.2541704066252306, "grad_norm": 0.29018524289131165, "learning_rate": 9.767353134679765e-05, "loss": 1.4909, "step": 49690 }, { "epoch": 2.254712347790743, "grad_norm": 0.2224309891462326, "learning_rate": 9.76685020862526e-05, "loss": 1.4952, "step": 49700 }, { "epoch": 2.2552542889562552, "grad_norm": 0.25766855478286743, "learning_rate": 9.766346754011875e-05, "loss": 1.4953, "step": 49710 }, { "epoch": 2.2557962301217676, "grad_norm": 0.2575187385082245, "learning_rate": 9.765842770901979e-05, "loss": 1.4876, "step": 49720 }, { "epoch": 2.25633817128728, "grad_norm": 0.3350069522857666, "learning_rate": 9.765338259358e-05, "loss": 1.4925, "step": 49730 }, { "epoch": 2.2568801124527917, "grad_norm": 0.21751070022583008, "learning_rate": 9.764833219442435e-05, "loss": 1.4812, "step": 49740 }, { "epoch": 2.257422053618304, "grad_norm": 0.19443929195404053, "learning_rate": 9.764327651217847e-05, "loss": 1.4924, "step": 49750 }, { "epoch": 2.2579639947838164, "grad_norm": 0.21549266576766968, "learning_rate": 9.763821554746864e-05, "loss": 1.484, "step": 49760 }, { "epoch": 2.258180771250021, "eval_loss": 2.451836585998535, "eval_runtime": 21.9854, "eval_samples_per_second": 227.423, "eval_steps_per_second": 1.228, "step": 49764 }, { "epoch": 2.2585059359493287, "grad_norm": 0.30445384979248047, "learning_rate": 9.763314930092178e-05, "loss": 1.4924, "step": 49770 }, { "epoch": 2.2590478771148406, "grad_norm": 0.3584153950214386, "learning_rate": 9.762807777316548e-05, "loss": 1.4855, "step": 49780 }, { "epoch": 2.259589818280353, "grad_norm": 0.2762848138809204, "learning_rate": 9.762300096482799e-05, "loss": 1.5014, "step": 49790 }, { "epoch": 2.260131759445865, "grad_norm": 0.22628115117549896, "learning_rate": 9.761791887653817e-05, "loss": 1.4932, "step": 49800 }, { "epoch": 2.2606737006113775, "grad_norm": 0.3497247099876404, "learning_rate": 9.761283150892561e-05, "loss": 1.5017, "step": 49810 }, { "epoch": 2.2612156417768894, "grad_norm": 0.40231335163116455, "learning_rate": 9.760773886262046e-05, "loss": 1.4924, "step": 49820 }, { "epoch": 2.2617575829424017, "grad_norm": 0.2883796989917755, "learning_rate": 9.760264093825361e-05, "loss": 1.4829, "step": 49830 }, { "epoch": 2.262299524107914, "grad_norm": 0.4345490038394928, "learning_rate": 9.759753773645658e-05, "loss": 1.4926, "step": 49840 }, { "epoch": 2.2628414652734263, "grad_norm": 0.2081182599067688, "learning_rate": 9.759242925786151e-05, "loss": 1.5033, "step": 49850 }, { "epoch": 2.2628956593899776, "eval_loss": 2.4485983848571777, "eval_runtime": 22.0488, "eval_samples_per_second": 226.769, "eval_steps_per_second": 1.225, "step": 49851 }, { "epoch": 2.2633834064389386, "grad_norm": 0.29944297671318054, "learning_rate": 9.758731550310122e-05, "loss": 1.4909, "step": 49860 }, { "epoch": 2.2639253476044505, "grad_norm": 0.37277737259864807, "learning_rate": 9.758219647280919e-05, "loss": 1.4931, "step": 49870 }, { "epoch": 2.264467288769963, "grad_norm": 0.2762742340564728, "learning_rate": 9.757707216761953e-05, "loss": 1.4982, "step": 49880 }, { "epoch": 2.265009229935475, "grad_norm": 0.22631752490997314, "learning_rate": 9.7571942588167e-05, "loss": 1.4871, "step": 49890 }, { "epoch": 2.2655511711009875, "grad_norm": 0.22785073518753052, "learning_rate": 9.756680773508708e-05, "loss": 1.4866, "step": 49900 }, { "epoch": 2.2660931122665, "grad_norm": 0.3055456876754761, "learning_rate": 9.756166760901581e-05, "loss": 1.4829, "step": 49910 }, { "epoch": 2.2666350534320117, "grad_norm": 0.25249767303466797, "learning_rate": 9.755652221058993e-05, "loss": 1.501, "step": 49920 }, { "epoch": 2.267176994597524, "grad_norm": 0.25835105776786804, "learning_rate": 9.755137154044686e-05, "loss": 1.4893, "step": 49930 }, { "epoch": 2.2676105475299337, "eval_loss": 2.449587345123291, "eval_runtime": 22.013, "eval_samples_per_second": 227.138, "eval_steps_per_second": 1.227, "step": 49938 }, { "epoch": 2.2677189357630363, "grad_norm": 0.3590300679206848, "learning_rate": 9.754621559922461e-05, "loss": 1.4947, "step": 49940 }, { "epoch": 2.2682608769285486, "grad_norm": 0.24853801727294922, "learning_rate": 9.75410543875619e-05, "loss": 1.4931, "step": 49950 }, { "epoch": 2.2688028180940605, "grad_norm": 0.19558899104595184, "learning_rate": 9.753588790609807e-05, "loss": 1.4906, "step": 49960 }, { "epoch": 2.269344759259573, "grad_norm": 0.2478579878807068, "learning_rate": 9.75307161554731e-05, "loss": 1.4903, "step": 49970 }, { "epoch": 2.269886700425085, "grad_norm": 0.32022833824157715, "learning_rate": 9.752553913632768e-05, "loss": 1.4904, "step": 49980 }, { "epoch": 2.2704286415905974, "grad_norm": 0.22632542252540588, "learning_rate": 9.752035684930309e-05, "loss": 1.4888, "step": 49990 }, { "epoch": 2.2709705827561093, "grad_norm": 0.29843834042549133, "learning_rate": 9.75151692950413e-05, "loss": 1.4982, "step": 50000 }, { "epoch": 2.2715125239216216, "grad_norm": 0.24948932230472565, "learning_rate": 9.750997647418492e-05, "loss": 1.4907, "step": 50010 }, { "epoch": 2.272054465087134, "grad_norm": 0.26221540570259094, "learning_rate": 9.750477838737721e-05, "loss": 1.4931, "step": 50020 }, { "epoch": 2.27232543566989, "eval_loss": 2.4510858058929443, "eval_runtime": 22.1311, "eval_samples_per_second": 225.927, "eval_steps_per_second": 1.22, "step": 50025 }, { "epoch": 2.2725964062526463, "grad_norm": 0.4738941192626953, "learning_rate": 9.74995750352621e-05, "loss": 1.5008, "step": 50030 }, { "epoch": 2.2731383474181586, "grad_norm": 0.29958027601242065, "learning_rate": 9.749436641848415e-05, "loss": 1.4866, "step": 50040 }, { "epoch": 2.2736802885836704, "grad_norm": 0.21620671451091766, "learning_rate": 9.748915253768856e-05, "loss": 1.4908, "step": 50050 }, { "epoch": 2.2742222297491828, "grad_norm": 0.21653667092323303, "learning_rate": 9.748393339352125e-05, "loss": 1.4871, "step": 50060 }, { "epoch": 2.274764170914695, "grad_norm": 0.2310364544391632, "learning_rate": 9.747870898662871e-05, "loss": 1.4929, "step": 50070 }, { "epoch": 2.2753061120802074, "grad_norm": 0.23432140052318573, "learning_rate": 9.747347931765812e-05, "loss": 1.4955, "step": 50080 }, { "epoch": 2.2758480532457197, "grad_norm": 0.22089755535125732, "learning_rate": 9.746824438725731e-05, "loss": 1.489, "step": 50090 }, { "epoch": 2.2763899944112316, "grad_norm": 0.3632870018482208, "learning_rate": 9.746300419607479e-05, "loss": 1.4839, "step": 50100 }, { "epoch": 2.276931935576744, "grad_norm": 0.18700088560581207, "learning_rate": 9.745775874475963e-05, "loss": 1.4952, "step": 50110 }, { "epoch": 2.2770403238098464, "eval_loss": 2.4573004245758057, "eval_runtime": 22.0595, "eval_samples_per_second": 226.66, "eval_steps_per_second": 1.224, "step": 50112 }, { "epoch": 2.277473876742256, "grad_norm": 0.23492306470870972, "learning_rate": 9.745250803396166e-05, "loss": 1.4832, "step": 50120 }, { "epoch": 2.2780158179077685, "grad_norm": 0.35017403960227966, "learning_rate": 9.744725206433131e-05, "loss": 1.4903, "step": 50130 }, { "epoch": 2.278557759073281, "grad_norm": 0.2380804866552353, "learning_rate": 9.744199083651968e-05, "loss": 1.492, "step": 50140 }, { "epoch": 2.2790997002387927, "grad_norm": 0.2782959043979645, "learning_rate": 9.743672435117846e-05, "loss": 1.4848, "step": 50150 }, { "epoch": 2.279641641404305, "grad_norm": 0.19856207072734833, "learning_rate": 9.743145260896009e-05, "loss": 1.4947, "step": 50160 }, { "epoch": 2.2801835825698173, "grad_norm": 0.3704603612422943, "learning_rate": 9.742617561051758e-05, "loss": 1.4797, "step": 50170 }, { "epoch": 2.2807255237353297, "grad_norm": 0.38659003376960754, "learning_rate": 9.742089335650462e-05, "loss": 1.4944, "step": 50180 }, { "epoch": 2.2812674649008415, "grad_norm": 0.2731989324092865, "learning_rate": 9.741560584757559e-05, "loss": 1.4955, "step": 50190 }, { "epoch": 2.2817552119498026, "eval_loss": 2.4447662830352783, "eval_runtime": 22.0833, "eval_samples_per_second": 226.415, "eval_steps_per_second": 1.223, "step": 50199 }, { "epoch": 2.281809406066354, "grad_norm": 0.312407523393631, "learning_rate": 9.741031308438543e-05, "loss": 1.4969, "step": 50200 }, { "epoch": 2.282351347231866, "grad_norm": 0.2855091691017151, "learning_rate": 9.740501506758983e-05, "loss": 1.4872, "step": 50210 }, { "epoch": 2.2828932883973785, "grad_norm": 0.33906736969947815, "learning_rate": 9.739971179784508e-05, "loss": 1.4888, "step": 50220 }, { "epoch": 2.2834352295628904, "grad_norm": 0.21002908051013947, "learning_rate": 9.739440327580809e-05, "loss": 1.49, "step": 50230 }, { "epoch": 2.2839771707284027, "grad_norm": 0.24895620346069336, "learning_rate": 9.738908950213648e-05, "loss": 1.4906, "step": 50240 }, { "epoch": 2.284519111893915, "grad_norm": 0.2253609001636505, "learning_rate": 9.738377047748852e-05, "loss": 1.5001, "step": 50250 }, { "epoch": 2.2850610530594273, "grad_norm": 0.30024558305740356, "learning_rate": 9.737844620252307e-05, "loss": 1.4887, "step": 50260 }, { "epoch": 2.2856029942249396, "grad_norm": 0.3111526370048523, "learning_rate": 9.737311667789967e-05, "loss": 1.4861, "step": 50270 }, { "epoch": 2.2861449353904515, "grad_norm": 0.22137415409088135, "learning_rate": 9.736778190427859e-05, "loss": 1.4972, "step": 50280 }, { "epoch": 2.286470100089759, "eval_loss": 2.4479713439941406, "eval_runtime": 21.9852, "eval_samples_per_second": 227.425, "eval_steps_per_second": 1.228, "step": 50286 }, { "epoch": 2.286686876555964, "grad_norm": 0.18520957231521606, "learning_rate": 9.73624418823206e-05, "loss": 1.4849, "step": 50290 }, { "epoch": 2.287228817721476, "grad_norm": 0.30916085839271545, "learning_rate": 9.735709661268723e-05, "loss": 1.4852, "step": 50300 }, { "epoch": 2.2877707588869884, "grad_norm": 0.5676615238189697, "learning_rate": 9.735174609604063e-05, "loss": 1.4893, "step": 50310 }, { "epoch": 2.2883127000525008, "grad_norm": 0.40237462520599365, "learning_rate": 9.734639033304361e-05, "loss": 1.4901, "step": 50320 }, { "epoch": 2.2888546412180126, "grad_norm": 0.24099163711071014, "learning_rate": 9.734102932435959e-05, "loss": 1.4842, "step": 50330 }, { "epoch": 2.289396582383525, "grad_norm": 0.2044031023979187, "learning_rate": 9.73356630706527e-05, "loss": 1.4834, "step": 50340 }, { "epoch": 2.2899385235490373, "grad_norm": 0.22731153666973114, "learning_rate": 9.733029157258765e-05, "loss": 1.471, "step": 50350 }, { "epoch": 2.2904804647145496, "grad_norm": 0.19987347722053528, "learning_rate": 9.732491483082987e-05, "loss": 1.4786, "step": 50360 }, { "epoch": 2.2910224058800615, "grad_norm": 0.23623400926589966, "learning_rate": 9.73195328460454e-05, "loss": 1.4864, "step": 50370 }, { "epoch": 2.2911849882297153, "eval_loss": 2.4462499618530273, "eval_runtime": 22.0809, "eval_samples_per_second": 226.44, "eval_steps_per_second": 1.223, "step": 50373 }, { "epoch": 2.2915643470455738, "grad_norm": 0.2596568167209625, "learning_rate": 9.731414561890093e-05, "loss": 1.4828, "step": 50380 }, { "epoch": 2.292106288211086, "grad_norm": 0.22793002426624298, "learning_rate": 9.730875315006381e-05, "loss": 1.4874, "step": 50390 }, { "epoch": 2.2926482293765984, "grad_norm": 0.34478330612182617, "learning_rate": 9.730335544020204e-05, "loss": 1.494, "step": 50400 }, { "epoch": 2.2931901705421103, "grad_norm": 0.3564873933792114, "learning_rate": 9.729795248998425e-05, "loss": 1.4937, "step": 50410 }, { "epoch": 2.2937321117076226, "grad_norm": 0.21651984751224518, "learning_rate": 9.729254430007976e-05, "loss": 1.4791, "step": 50420 }, { "epoch": 2.294274052873135, "grad_norm": 0.25641965866088867, "learning_rate": 9.728713087115848e-05, "loss": 1.4991, "step": 50430 }, { "epoch": 2.2948159940386472, "grad_norm": 0.3314838707447052, "learning_rate": 9.728171220389104e-05, "loss": 1.5014, "step": 50440 }, { "epoch": 2.2953579352041595, "grad_norm": 0.18577730655670166, "learning_rate": 9.727628829894866e-05, "loss": 1.4913, "step": 50450 }, { "epoch": 2.2958998763696714, "grad_norm": 0.3247391879558563, "learning_rate": 9.727085915700321e-05, "loss": 1.4837, "step": 50460 }, { "epoch": 2.2958998763696714, "eval_loss": 2.445127010345459, "eval_runtime": 22.0909, "eval_samples_per_second": 226.337, "eval_steps_per_second": 1.222, "step": 50460 }, { "epoch": 2.2964418175351837, "grad_norm": 0.2847256660461426, "learning_rate": 9.726542477872726e-05, "loss": 1.4776, "step": 50470 }, { "epoch": 2.296983758700696, "grad_norm": 0.23234505951404572, "learning_rate": 9.725998516479399e-05, "loss": 1.4889, "step": 50480 }, { "epoch": 2.2975256998662084, "grad_norm": 0.20872457325458527, "learning_rate": 9.725454031587725e-05, "loss": 1.4975, "step": 50490 }, { "epoch": 2.2980676410317207, "grad_norm": 0.24152375757694244, "learning_rate": 9.724909023265147e-05, "loss": 1.4884, "step": 50500 }, { "epoch": 2.2986095821972325, "grad_norm": 0.45855534076690674, "learning_rate": 9.724363491579185e-05, "loss": 1.4975, "step": 50510 }, { "epoch": 2.299151523362745, "grad_norm": 0.37530308961868286, "learning_rate": 9.723817436597413e-05, "loss": 1.4934, "step": 50520 }, { "epoch": 2.299693464528257, "grad_norm": 0.3856462240219116, "learning_rate": 9.723270858387474e-05, "loss": 1.4977, "step": 50530 }, { "epoch": 2.3002354056937695, "grad_norm": 0.21550019085407257, "learning_rate": 9.722723757017078e-05, "loss": 1.5, "step": 50540 }, { "epoch": 2.300614764509628, "eval_loss": 2.45145320892334, "eval_runtime": 22.1166, "eval_samples_per_second": 226.074, "eval_steps_per_second": 1.221, "step": 50547 }, { "epoch": 2.300777346859282, "grad_norm": 0.2878018617630005, "learning_rate": 9.722176132553995e-05, "loss": 1.498, "step": 50550 }, { "epoch": 2.3013192880247937, "grad_norm": 0.3644629418849945, "learning_rate": 9.721627985066064e-05, "loss": 1.4972, "step": 50560 }, { "epoch": 2.301861229190306, "grad_norm": 0.22482427954673767, "learning_rate": 9.721079314621186e-05, "loss": 1.4867, "step": 50570 }, { "epoch": 2.3024031703558183, "grad_norm": 0.3207145631313324, "learning_rate": 9.72053012128733e-05, "loss": 1.481, "step": 50580 }, { "epoch": 2.3029451115213306, "grad_norm": 0.26092955470085144, "learning_rate": 9.719980405132527e-05, "loss": 1.4841, "step": 50590 }, { "epoch": 2.3034870526868425, "grad_norm": 0.30404138565063477, "learning_rate": 9.71943016622487e-05, "loss": 1.4924, "step": 50600 }, { "epoch": 2.304028993852355, "grad_norm": 0.23371928930282593, "learning_rate": 9.718879404632525e-05, "loss": 1.4786, "step": 50610 }, { "epoch": 2.304570935017867, "grad_norm": 0.2730085551738739, "learning_rate": 9.718328120423715e-05, "loss": 1.4862, "step": 50620 }, { "epoch": 2.3051128761833795, "grad_norm": 0.19008232653141022, "learning_rate": 9.717776313666731e-05, "loss": 1.4935, "step": 50630 }, { "epoch": 2.305329652649584, "eval_loss": 2.4463045597076416, "eval_runtime": 22.0762, "eval_samples_per_second": 226.488, "eval_steps_per_second": 1.223, "step": 50634 }, { "epoch": 2.3056548173488913, "grad_norm": 0.30709657073020935, "learning_rate": 9.717223984429931e-05, "loss": 1.4934, "step": 50640 }, { "epoch": 2.3061967585144036, "grad_norm": 0.3597027361392975, "learning_rate": 9.716671132781731e-05, "loss": 1.488, "step": 50650 }, { "epoch": 2.306738699679916, "grad_norm": 0.2394603192806244, "learning_rate": 9.71611775879062e-05, "loss": 1.4865, "step": 50660 }, { "epoch": 2.3072806408454283, "grad_norm": 0.2578514814376831, "learning_rate": 9.715563862525145e-05, "loss": 1.4828, "step": 50670 }, { "epoch": 2.3078225820109406, "grad_norm": 0.2434462159872055, "learning_rate": 9.715009444053921e-05, "loss": 1.4906, "step": 50680 }, { "epoch": 2.3083645231764525, "grad_norm": 0.3057672381401062, "learning_rate": 9.714454503445626e-05, "loss": 1.4889, "step": 50690 }, { "epoch": 2.308906464341965, "grad_norm": 0.2835559546947479, "learning_rate": 9.713899040769004e-05, "loss": 1.4879, "step": 50700 }, { "epoch": 2.309448405507477, "grad_norm": 0.23302173614501953, "learning_rate": 9.713343056092866e-05, "loss": 1.4891, "step": 50710 }, { "epoch": 2.3099903466729894, "grad_norm": 0.2765030562877655, "learning_rate": 9.71278654948608e-05, "loss": 1.4879, "step": 50720 }, { "epoch": 2.3100445407895407, "eval_loss": 2.4514873027801514, "eval_runtime": 22.1411, "eval_samples_per_second": 225.825, "eval_steps_per_second": 1.219, "step": 50721 }, { "epoch": 2.3105322878385017, "grad_norm": 0.3144146502017975, "learning_rate": 9.712229521017588e-05, "loss": 1.4854, "step": 50730 }, { "epoch": 2.3110742290040136, "grad_norm": 0.2929839491844177, "learning_rate": 9.71167197075639e-05, "loss": 1.4764, "step": 50740 }, { "epoch": 2.311616170169526, "grad_norm": 0.38944029808044434, "learning_rate": 9.711113898771554e-05, "loss": 1.4945, "step": 50750 }, { "epoch": 2.3121581113350382, "grad_norm": 0.28745847940444946, "learning_rate": 9.71055530513221e-05, "loss": 1.4949, "step": 50760 }, { "epoch": 2.3127000525005506, "grad_norm": 0.26115939021110535, "learning_rate": 9.709996189907557e-05, "loss": 1.4907, "step": 50770 }, { "epoch": 2.313241993666063, "grad_norm": 0.24977637827396393, "learning_rate": 9.709436553166853e-05, "loss": 1.4829, "step": 50780 }, { "epoch": 2.3137839348315747, "grad_norm": 0.35427191853523254, "learning_rate": 9.708876394979424e-05, "loss": 1.4819, "step": 50790 }, { "epoch": 2.314325875997087, "grad_norm": 0.24472922086715698, "learning_rate": 9.708315715414661e-05, "loss": 1.4968, "step": 50800 }, { "epoch": 2.314759428929497, "eval_loss": 2.440610647201538, "eval_runtime": 21.9959, "eval_samples_per_second": 227.315, "eval_steps_per_second": 1.227, "step": 50808 }, { "epoch": 2.3148678171625994, "grad_norm": 0.338789165019989, "learning_rate": 9.707754514542017e-05, "loss": 1.4834, "step": 50810 }, { "epoch": 2.3154097583281117, "grad_norm": 0.30206480622291565, "learning_rate": 9.707192792431014e-05, "loss": 1.4907, "step": 50820 }, { "epoch": 2.3159516994936236, "grad_norm": 0.23182201385498047, "learning_rate": 9.70663054915123e-05, "loss": 1.4917, "step": 50830 }, { "epoch": 2.316493640659136, "grad_norm": 0.3595997989177704, "learning_rate": 9.70606778477232e-05, "loss": 1.4962, "step": 50840 }, { "epoch": 2.317035581824648, "grad_norm": 0.21878936886787415, "learning_rate": 9.705504499363993e-05, "loss": 1.4847, "step": 50850 }, { "epoch": 2.3175775229901605, "grad_norm": 0.19247514009475708, "learning_rate": 9.704940692996027e-05, "loss": 1.4883, "step": 50860 }, { "epoch": 2.3181194641556724, "grad_norm": 0.20777463912963867, "learning_rate": 9.704376365738262e-05, "loss": 1.5016, "step": 50870 }, { "epoch": 2.3186614053211847, "grad_norm": 0.2756713628768921, "learning_rate": 9.703811517660609e-05, "loss": 1.494, "step": 50880 }, { "epoch": 2.319203346486697, "grad_norm": 0.3496546745300293, "learning_rate": 9.703246148833037e-05, "loss": 1.495, "step": 50890 }, { "epoch": 2.319474317069453, "eval_loss": 2.451153039932251, "eval_runtime": 22.0437, "eval_samples_per_second": 226.823, "eval_steps_per_second": 1.225, "step": 50895 }, { "epoch": 2.3197452876522093, "grad_norm": 0.28930479288101196, "learning_rate": 9.702680259325579e-05, "loss": 1.4995, "step": 50900 }, { "epoch": 2.3202872288177216, "grad_norm": 0.1946384757757187, "learning_rate": 9.702113849208337e-05, "loss": 1.4856, "step": 50910 }, { "epoch": 2.3208291699832335, "grad_norm": 0.23787322640419006, "learning_rate": 9.701546918551475e-05, "loss": 1.4841, "step": 50920 }, { "epoch": 2.321371111148746, "grad_norm": 0.2119310200214386, "learning_rate": 9.700979467425222e-05, "loss": 1.4882, "step": 50930 }, { "epoch": 2.321913052314258, "grad_norm": 0.23019666969776154, "learning_rate": 9.700411495899872e-05, "loss": 1.4834, "step": 50940 }, { "epoch": 2.3224549934797705, "grad_norm": 0.22270162403583527, "learning_rate": 9.699843004045782e-05, "loss": 1.4819, "step": 50950 }, { "epoch": 2.322996934645283, "grad_norm": 0.3237341642379761, "learning_rate": 9.699273991933373e-05, "loss": 1.4903, "step": 50960 }, { "epoch": 2.3235388758107947, "grad_norm": 0.26717808842658997, "learning_rate": 9.698704459633136e-05, "loss": 1.4825, "step": 50970 }, { "epoch": 2.324080816976307, "grad_norm": 0.27529168128967285, "learning_rate": 9.698134407215618e-05, "loss": 1.4881, "step": 50980 }, { "epoch": 2.3241892052094095, "eval_loss": 2.4506008625030518, "eval_runtime": 22.0206, "eval_samples_per_second": 227.06, "eval_steps_per_second": 1.226, "step": 50982 }, { "epoch": 2.3246227581418193, "grad_norm": 0.17906807363033295, "learning_rate": 9.697563834751436e-05, "loss": 1.4853, "step": 50990 }, { "epoch": 2.3251646993073316, "grad_norm": 0.2530919313430786, "learning_rate": 9.69699274231127e-05, "loss": 1.4873, "step": 51000 }, { "epoch": 2.3257066404728435, "grad_norm": 0.20605182647705078, "learning_rate": 9.696421129965865e-05, "loss": 1.4754, "step": 51010 }, { "epoch": 2.326248581638356, "grad_norm": 0.3275115489959717, "learning_rate": 9.69584899778603e-05, "loss": 1.4704, "step": 51020 }, { "epoch": 2.326790522803868, "grad_norm": 0.4294751286506653, "learning_rate": 9.695276345842638e-05, "loss": 1.4987, "step": 51030 }, { "epoch": 2.3273324639693804, "grad_norm": 0.22030548751354218, "learning_rate": 9.694703174206624e-05, "loss": 1.4899, "step": 51040 }, { "epoch": 2.3278744051348923, "grad_norm": 0.263075053691864, "learning_rate": 9.694129482948994e-05, "loss": 1.4819, "step": 51050 }, { "epoch": 2.3284163463004046, "grad_norm": 0.26459792256355286, "learning_rate": 9.693555272140813e-05, "loss": 1.499, "step": 51060 }, { "epoch": 2.3289040933493657, "eval_loss": 2.4480111598968506, "eval_runtime": 22.0391, "eval_samples_per_second": 226.869, "eval_steps_per_second": 1.225, "step": 51069 }, { "epoch": 2.328958287465917, "grad_norm": 0.19399993121623993, "learning_rate": 9.692980541853211e-05, "loss": 1.4926, "step": 51070 }, { "epoch": 2.3295002286314292, "grad_norm": 0.2647286057472229, "learning_rate": 9.692405292157384e-05, "loss": 1.4846, "step": 51080 }, { "epoch": 2.3300421697969416, "grad_norm": 0.2626708149909973, "learning_rate": 9.69182952312459e-05, "loss": 1.4875, "step": 51090 }, { "epoch": 2.3305841109624534, "grad_norm": 0.3158663213253021, "learning_rate": 9.691253234826155e-05, "loss": 1.4902, "step": 51100 }, { "epoch": 2.3311260521279658, "grad_norm": 0.18909795582294464, "learning_rate": 9.690676427333467e-05, "loss": 1.4898, "step": 51110 }, { "epoch": 2.331667993293478, "grad_norm": 0.3357420265674591, "learning_rate": 9.690099100717974e-05, "loss": 1.4999, "step": 51120 }, { "epoch": 2.3322099344589904, "grad_norm": 0.36199477314949036, "learning_rate": 9.689521255051198e-05, "loss": 1.4957, "step": 51130 }, { "epoch": 2.3327518756245027, "grad_norm": 0.36792802810668945, "learning_rate": 9.688942890404718e-05, "loss": 1.4897, "step": 51140 }, { "epoch": 2.3332938167900146, "grad_norm": 0.22224745154380798, "learning_rate": 9.68836400685018e-05, "loss": 1.4866, "step": 51150 }, { "epoch": 2.3336189814893222, "eval_loss": 2.449270248413086, "eval_runtime": 22.033, "eval_samples_per_second": 226.933, "eval_steps_per_second": 1.225, "step": 51156 }, { "epoch": 2.333835757955527, "grad_norm": 0.20259080827236176, "learning_rate": 9.687784604459292e-05, "loss": 1.4879, "step": 51160 }, { "epoch": 2.334377699121039, "grad_norm": 0.41556867957115173, "learning_rate": 9.687204683303829e-05, "loss": 1.4895, "step": 51170 }, { "epoch": 2.3349196402865515, "grad_norm": 0.24334754049777985, "learning_rate": 9.686624243455627e-05, "loss": 1.4751, "step": 51180 }, { "epoch": 2.335461581452064, "grad_norm": 0.20898622274398804, "learning_rate": 9.686043284986593e-05, "loss": 1.4863, "step": 51190 }, { "epoch": 2.3360035226175757, "grad_norm": 0.22658079862594604, "learning_rate": 9.685461807968688e-05, "loss": 1.4837, "step": 51200 }, { "epoch": 2.336545463783088, "grad_norm": 0.29090017080307007, "learning_rate": 9.684879812473948e-05, "loss": 1.4827, "step": 51210 }, { "epoch": 2.3370874049486003, "grad_norm": 0.27824729681015015, "learning_rate": 9.684297298574465e-05, "loss": 1.4882, "step": 51220 }, { "epoch": 2.3376293461141127, "grad_norm": 0.2605385482311249, "learning_rate": 9.683714266342398e-05, "loss": 1.4894, "step": 51230 }, { "epoch": 2.3381712872796245, "grad_norm": 0.34890076518058777, "learning_rate": 9.683130715849973e-05, "loss": 1.4949, "step": 51240 }, { "epoch": 2.3383338696292784, "eval_loss": 2.450643539428711, "eval_runtime": 22.0782, "eval_samples_per_second": 226.468, "eval_steps_per_second": 1.223, "step": 51243 }, { "epoch": 2.338713228445137, "grad_norm": 0.2740577459335327, "learning_rate": 9.682546647169478e-05, "loss": 1.4953, "step": 51250 }, { "epoch": 2.339255169610649, "grad_norm": 0.29385995864868164, "learning_rate": 9.68196206037326e-05, "loss": 1.4833, "step": 51260 }, { "epoch": 2.3397971107761615, "grad_norm": 0.2904885709285736, "learning_rate": 9.681376955533739e-05, "loss": 1.4861, "step": 51270 }, { "epoch": 2.3403390519416734, "grad_norm": 0.19600605964660645, "learning_rate": 9.680791332723396e-05, "loss": 1.4891, "step": 51280 }, { "epoch": 2.3408809931071857, "grad_norm": 0.35659798979759216, "learning_rate": 9.680205192014772e-05, "loss": 1.481, "step": 51290 }, { "epoch": 2.341422934272698, "grad_norm": 0.20791636407375336, "learning_rate": 9.679618533480478e-05, "loss": 1.4846, "step": 51300 }, { "epoch": 2.3419648754382103, "grad_norm": 0.36341404914855957, "learning_rate": 9.679031357193186e-05, "loss": 1.487, "step": 51310 }, { "epoch": 2.3425068166037226, "grad_norm": 0.2511788308620453, "learning_rate": 9.678443663225633e-05, "loss": 1.4911, "step": 51320 }, { "epoch": 2.3430487577692345, "grad_norm": 0.22106388211250305, "learning_rate": 9.67785545165062e-05, "loss": 1.4855, "step": 51330 }, { "epoch": 2.3430487577692345, "eval_loss": 2.449833631515503, "eval_runtime": 22.0541, "eval_samples_per_second": 226.716, "eval_steps_per_second": 1.224, "step": 51330 }, { "epoch": 2.343590698934747, "grad_norm": 0.2117205709218979, "learning_rate": 9.677266722541012e-05, "loss": 1.4836, "step": 51340 }, { "epoch": 2.344132640100259, "grad_norm": 0.261532187461853, "learning_rate": 9.67667747596974e-05, "loss": 1.4843, "step": 51350 }, { "epoch": 2.3446745812657714, "grad_norm": 0.25580722093582153, "learning_rate": 9.676087712009794e-05, "loss": 1.4842, "step": 51360 }, { "epoch": 2.3452165224312838, "grad_norm": 0.21774953603744507, "learning_rate": 9.675497430734233e-05, "loss": 1.4816, "step": 51370 }, { "epoch": 2.3457584635967956, "grad_norm": 0.4116745591163635, "learning_rate": 9.67490663221618e-05, "loss": 1.4927, "step": 51380 }, { "epoch": 2.346300404762308, "grad_norm": 0.35664743185043335, "learning_rate": 9.674315316528816e-05, "loss": 1.4871, "step": 51390 }, { "epoch": 2.3468423459278203, "grad_norm": 0.25654247403144836, "learning_rate": 9.673723483745395e-05, "loss": 1.4728, "step": 51400 }, { "epoch": 2.3473842870933326, "grad_norm": 0.32242023944854736, "learning_rate": 9.673131133939229e-05, "loss": 1.4841, "step": 51410 }, { "epoch": 2.347763645909191, "eval_loss": 2.4569506645202637, "eval_runtime": 21.6699, "eval_samples_per_second": 230.735, "eval_steps_per_second": 1.246, "step": 51417 }, { "epoch": 2.347926228258845, "grad_norm": 0.19315354526042938, "learning_rate": 9.672538267183699e-05, "loss": 1.4803, "step": 51420 }, { "epoch": 2.3484681694243568, "grad_norm": 0.2572130262851715, "learning_rate": 9.67194488355224e-05, "loss": 1.4969, "step": 51430 }, { "epoch": 2.349010110589869, "grad_norm": 0.5099160671234131, "learning_rate": 9.671350983118365e-05, "loss": 1.4876, "step": 51440 }, { "epoch": 2.3495520517553814, "grad_norm": 0.2558972239494324, "learning_rate": 9.670756565955637e-05, "loss": 1.4961, "step": 51450 }, { "epoch": 2.3500939929208937, "grad_norm": 0.5650994181632996, "learning_rate": 9.670161632137696e-05, "loss": 1.4858, "step": 51460 }, { "epoch": 2.3506359340864056, "grad_norm": 0.45410850644111633, "learning_rate": 9.669566181738236e-05, "loss": 1.4921, "step": 51470 }, { "epoch": 2.351177875251918, "grad_norm": 0.37703588604927063, "learning_rate": 9.668970214831019e-05, "loss": 1.4899, "step": 51480 }, { "epoch": 2.35171981641743, "grad_norm": 0.20002073049545288, "learning_rate": 9.668373731489872e-05, "loss": 1.4873, "step": 51490 }, { "epoch": 2.3522617575829425, "grad_norm": 0.2333013415336609, "learning_rate": 9.667776731788685e-05, "loss": 1.4952, "step": 51500 }, { "epoch": 2.352478534049147, "eval_loss": 2.4526426792144775, "eval_runtime": 22.019, "eval_samples_per_second": 227.077, "eval_steps_per_second": 1.226, "step": 51504 }, { "epoch": 2.3528036987484544, "grad_norm": 0.28557243943214417, "learning_rate": 9.667179215801411e-05, "loss": 1.4851, "step": 51510 }, { "epoch": 2.3533456399139667, "grad_norm": 0.2601202130317688, "learning_rate": 9.666581183602069e-05, "loss": 1.4857, "step": 51520 }, { "epoch": 2.353887581079479, "grad_norm": 0.37464073300361633, "learning_rate": 9.665982635264736e-05, "loss": 1.491, "step": 51530 }, { "epoch": 2.3544295222449914, "grad_norm": 0.3240549862384796, "learning_rate": 9.665383570863565e-05, "loss": 1.478, "step": 51540 }, { "epoch": 2.3549714634105037, "grad_norm": 0.18953123688697815, "learning_rate": 9.66478399047276e-05, "loss": 1.4791, "step": 51550 }, { "epoch": 2.3555134045760155, "grad_norm": 0.24826766550540924, "learning_rate": 9.664183894166595e-05, "loss": 1.4875, "step": 51560 }, { "epoch": 2.356055345741528, "grad_norm": 0.2706245481967926, "learning_rate": 9.66358328201941e-05, "loss": 1.4802, "step": 51570 }, { "epoch": 2.35659728690704, "grad_norm": 0.3040546178817749, "learning_rate": 9.662982154105604e-05, "loss": 1.4826, "step": 51580 }, { "epoch": 2.3571392280725525, "grad_norm": 0.28579655289649963, "learning_rate": 9.66238051049964e-05, "loss": 1.4826, "step": 51590 }, { "epoch": 2.3571934221891038, "eval_loss": 2.4456629753112793, "eval_runtime": 22.0034, "eval_samples_per_second": 227.238, "eval_steps_per_second": 1.227, "step": 51591 }, { "epoch": 2.357681169238065, "grad_norm": 0.2639320194721222, "learning_rate": 9.661778351276052e-05, "loss": 1.4805, "step": 51600 }, { "epoch": 2.3582231104035767, "grad_norm": 0.3113354742527008, "learning_rate": 9.66117567650943e-05, "loss": 1.4745, "step": 51610 }, { "epoch": 2.358765051569089, "grad_norm": 0.2081727683544159, "learning_rate": 9.660572486274432e-05, "loss": 1.4801, "step": 51620 }, { "epoch": 2.3593069927346013, "grad_norm": 0.2256453037261963, "learning_rate": 9.659968780645777e-05, "loss": 1.4801, "step": 51630 }, { "epoch": 2.3598489339001136, "grad_norm": 0.23762857913970947, "learning_rate": 9.659364559698249e-05, "loss": 1.4833, "step": 51640 }, { "epoch": 2.3603908750656255, "grad_norm": 0.28543922305107117, "learning_rate": 9.6587598235067e-05, "loss": 1.4877, "step": 51650 }, { "epoch": 2.360932816231138, "grad_norm": 0.32550597190856934, "learning_rate": 9.658154572146039e-05, "loss": 1.4831, "step": 51660 }, { "epoch": 2.36147475739665, "grad_norm": 0.3355502188205719, "learning_rate": 9.65754880569124e-05, "loss": 1.4936, "step": 51670 }, { "epoch": 2.36190831032906, "eval_loss": 2.46115779876709, "eval_runtime": 22.0455, "eval_samples_per_second": 226.804, "eval_steps_per_second": 1.225, "step": 51678 }, { "epoch": 2.3620166985621625, "grad_norm": 0.3635583519935608, "learning_rate": 9.65694252421735e-05, "loss": 1.4941, "step": 51680 }, { "epoch": 2.3625586397276743, "grad_norm": 0.2427382469177246, "learning_rate": 9.656335727799464e-05, "loss": 1.4888, "step": 51690 }, { "epoch": 2.3631005808931866, "grad_norm": 0.178348109126091, "learning_rate": 9.655728416512754e-05, "loss": 1.4835, "step": 51700 }, { "epoch": 2.363642522058699, "grad_norm": 0.22014687955379486, "learning_rate": 9.65512059043245e-05, "loss": 1.485, "step": 51710 }, { "epoch": 2.3641844632242113, "grad_norm": 0.21226602792739868, "learning_rate": 9.654512249633848e-05, "loss": 1.4826, "step": 51720 }, { "epoch": 2.3647264043897236, "grad_norm": 0.28715527057647705, "learning_rate": 9.653903394192304e-05, "loss": 1.4772, "step": 51730 }, { "epoch": 2.3652683455552355, "grad_norm": 0.24136418104171753, "learning_rate": 9.653294024183243e-05, "loss": 1.4856, "step": 51740 }, { "epoch": 2.365810286720748, "grad_norm": 0.20212812721729279, "learning_rate": 9.65268413968215e-05, "loss": 1.4721, "step": 51750 }, { "epoch": 2.36635222788626, "grad_norm": 0.2756584882736206, "learning_rate": 9.652073740764576e-05, "loss": 1.4901, "step": 51760 }, { "epoch": 2.366623198469016, "eval_loss": 2.4501538276672363, "eval_runtime": 22.0902, "eval_samples_per_second": 226.345, "eval_steps_per_second": 1.222, "step": 51765 }, { "epoch": 2.3668941690517724, "grad_norm": 0.27957209944725037, "learning_rate": 9.651462827506131e-05, "loss": 1.484, "step": 51770 }, { "epoch": 2.3674361102172847, "grad_norm": 0.3119787871837616, "learning_rate": 9.650851399982495e-05, "loss": 1.4874, "step": 51780 }, { "epoch": 2.3679780513827966, "grad_norm": 0.37608903646469116, "learning_rate": 9.65023945826941e-05, "loss": 1.4897, "step": 51790 }, { "epoch": 2.368519992548309, "grad_norm": 0.1750231236219406, "learning_rate": 9.64962700244268e-05, "loss": 1.4893, "step": 51800 }, { "epoch": 2.3690619337138212, "grad_norm": 0.20804855227470398, "learning_rate": 9.649014032578174e-05, "loss": 1.4854, "step": 51810 }, { "epoch": 2.3696038748793335, "grad_norm": 0.19923941791057587, "learning_rate": 9.648400548751819e-05, "loss": 1.4891, "step": 51820 }, { "epoch": 2.370145816044846, "grad_norm": 0.3422909379005432, "learning_rate": 9.647786551039617e-05, "loss": 1.4859, "step": 51830 }, { "epoch": 2.3706877572103577, "grad_norm": 0.2580278813838959, "learning_rate": 9.647172039517623e-05, "loss": 1.4886, "step": 51840 }, { "epoch": 2.37122969837587, "grad_norm": 0.21753491461277008, "learning_rate": 9.646557014261966e-05, "loss": 1.4819, "step": 51850 }, { "epoch": 2.3713380866089726, "eval_loss": 2.452852964401245, "eval_runtime": 22.005, "eval_samples_per_second": 227.221, "eval_steps_per_second": 1.227, "step": 51852 }, { "epoch": 2.3717716395413824, "grad_norm": 0.3423318862915039, "learning_rate": 9.645941475348825e-05, "loss": 1.4947, "step": 51860 }, { "epoch": 2.3723135807068947, "grad_norm": 0.28139030933380127, "learning_rate": 9.645325422854454e-05, "loss": 1.4803, "step": 51870 }, { "epoch": 2.3728555218724066, "grad_norm": 0.21756911277770996, "learning_rate": 9.644708856855168e-05, "loss": 1.4883, "step": 51880 }, { "epoch": 2.373397463037919, "grad_norm": 0.22105887532234192, "learning_rate": 9.644091777427344e-05, "loss": 1.4905, "step": 51890 }, { "epoch": 2.373939404203431, "grad_norm": 0.2563340365886688, "learning_rate": 9.643474184647422e-05, "loss": 1.4769, "step": 51900 }, { "epoch": 2.3744813453689435, "grad_norm": 0.19484004378318787, "learning_rate": 9.642856078591906e-05, "loss": 1.4858, "step": 51910 }, { "epoch": 2.3750232865344554, "grad_norm": 0.4082275629043579, "learning_rate": 9.642237459337366e-05, "loss": 1.4934, "step": 51920 }, { "epoch": 2.3755652276999677, "grad_norm": 0.26656705141067505, "learning_rate": 9.641618326960435e-05, "loss": 1.4833, "step": 51930 }, { "epoch": 2.3760529747489287, "eval_loss": 2.4559152126312256, "eval_runtime": 22.0417, "eval_samples_per_second": 226.843, "eval_steps_per_second": 1.225, "step": 51939 }, { "epoch": 2.37610716886548, "grad_norm": 0.38334181904792786, "learning_rate": 9.640998681537805e-05, "loss": 1.4832, "step": 51940 }, { "epoch": 2.3766491100309923, "grad_norm": 0.34898388385772705, "learning_rate": 9.640378523146238e-05, "loss": 1.489, "step": 51950 }, { "epoch": 2.3771910511965046, "grad_norm": 0.23095417022705078, "learning_rate": 9.639757851862553e-05, "loss": 1.4816, "step": 51960 }, { "epoch": 2.3777329923620165, "grad_norm": 0.19512107968330383, "learning_rate": 9.63913666776364e-05, "loss": 1.4864, "step": 51970 }, { "epoch": 2.378274933527529, "grad_norm": 0.4121658205986023, "learning_rate": 9.638514970926447e-05, "loss": 1.4825, "step": 51980 }, { "epoch": 2.378816874693041, "grad_norm": 0.3080250024795532, "learning_rate": 9.637892761427987e-05, "loss": 1.4921, "step": 51990 }, { "epoch": 2.3793588158585535, "grad_norm": 0.25781115889549255, "learning_rate": 9.637270039345335e-05, "loss": 1.4752, "step": 52000 }, { "epoch": 2.379900757024066, "grad_norm": 0.2879182994365692, "learning_rate": 9.636646804755635e-05, "loss": 1.4941, "step": 52010 }, { "epoch": 2.3804426981895777, "grad_norm": 0.2381771355867386, "learning_rate": 9.636023057736088e-05, "loss": 1.4927, "step": 52020 }, { "epoch": 2.3807678628888853, "eval_loss": 2.451416492462158, "eval_runtime": 22.0461, "eval_samples_per_second": 226.797, "eval_steps_per_second": 1.225, "step": 52026 }, { "epoch": 2.38098463935509, "grad_norm": 0.33801183104515076, "learning_rate": 9.63539879836396e-05, "loss": 1.4843, "step": 52030 }, { "epoch": 2.3815265805206023, "grad_norm": 0.19671766459941864, "learning_rate": 9.634774026716585e-05, "loss": 1.4758, "step": 52040 }, { "epoch": 2.3820685216861146, "grad_norm": 0.17954659461975098, "learning_rate": 9.634148742871353e-05, "loss": 1.4805, "step": 52050 }, { "epoch": 2.3826104628516265, "grad_norm": 0.31026336550712585, "learning_rate": 9.633522946905725e-05, "loss": 1.4882, "step": 52060 }, { "epoch": 2.383152404017139, "grad_norm": 0.19763007760047913, "learning_rate": 9.632896638897219e-05, "loss": 1.4804, "step": 52070 }, { "epoch": 2.383694345182651, "grad_norm": 0.37613585591316223, "learning_rate": 9.63226981892342e-05, "loss": 1.4732, "step": 52080 }, { "epoch": 2.3842362863481634, "grad_norm": 0.3250291347503662, "learning_rate": 9.631642487061978e-05, "loss": 1.4788, "step": 52090 }, { "epoch": 2.3847782275136753, "grad_norm": 0.4253854751586914, "learning_rate": 9.631014643390602e-05, "loss": 1.4871, "step": 52100 }, { "epoch": 2.3853201686791876, "grad_norm": 0.1921018362045288, "learning_rate": 9.630386287987067e-05, "loss": 1.4825, "step": 52110 }, { "epoch": 2.3854827510288414, "eval_loss": 2.4489946365356445, "eval_runtime": 21.9884, "eval_samples_per_second": 227.393, "eval_steps_per_second": 1.228, "step": 52113 }, { "epoch": 2.3858621098447, "grad_norm": 0.2642991840839386, "learning_rate": 9.629757420929212e-05, "loss": 1.4773, "step": 52120 }, { "epoch": 2.3864040510102122, "grad_norm": 0.20693929493427277, "learning_rate": 9.629128042294936e-05, "loss": 1.4714, "step": 52130 }, { "epoch": 2.3869459921757246, "grad_norm": 0.2861025631427765, "learning_rate": 9.628498152162205e-05, "loss": 1.4816, "step": 52140 }, { "epoch": 2.3874879333412364, "grad_norm": 0.1987767517566681, "learning_rate": 9.627867750609047e-05, "loss": 1.4794, "step": 52150 }, { "epoch": 2.3880298745067488, "grad_norm": 0.19934241473674774, "learning_rate": 9.627236837713553e-05, "loss": 1.4802, "step": 52160 }, { "epoch": 2.388571815672261, "grad_norm": 0.2977670133113861, "learning_rate": 9.626605413553881e-05, "loss": 1.4917, "step": 52170 }, { "epoch": 2.3891137568377734, "grad_norm": 0.22710345685482025, "learning_rate": 9.625973478208243e-05, "loss": 1.4835, "step": 52180 }, { "epoch": 2.3896556980032857, "grad_norm": 0.2273036539554596, "learning_rate": 9.625341031754925e-05, "loss": 1.4893, "step": 52190 }, { "epoch": 2.3901976391687976, "grad_norm": 0.2654477655887604, "learning_rate": 9.624708074272272e-05, "loss": 1.4888, "step": 52200 }, { "epoch": 2.3901976391687976, "eval_loss": 2.4439258575439453, "eval_runtime": 21.9105, "eval_samples_per_second": 228.201, "eval_steps_per_second": 1.232, "step": 52200 }, { "epoch": 2.39073958033431, "grad_norm": 0.2779911160469055, "learning_rate": 9.624074605838688e-05, "loss": 1.4723, "step": 52210 }, { "epoch": 2.391281521499822, "grad_norm": 0.20526061952114105, "learning_rate": 9.623440626532649e-05, "loss": 1.4886, "step": 52220 }, { "epoch": 2.3918234626653345, "grad_norm": 0.23855385184288025, "learning_rate": 9.622806136432684e-05, "loss": 1.4846, "step": 52230 }, { "epoch": 2.392365403830847, "grad_norm": 0.1878088265657425, "learning_rate": 9.622171135617397e-05, "loss": 1.4781, "step": 52240 }, { "epoch": 2.3929073449963587, "grad_norm": 0.28239238262176514, "learning_rate": 9.621535624165446e-05, "loss": 1.4894, "step": 52250 }, { "epoch": 2.393449286161871, "grad_norm": 0.24423588812351227, "learning_rate": 9.620899602155557e-05, "loss": 1.4842, "step": 52260 }, { "epoch": 2.3939912273273833, "grad_norm": 0.20243407785892487, "learning_rate": 9.620263069666514e-05, "loss": 1.4767, "step": 52270 }, { "epoch": 2.3945331684928957, "grad_norm": 0.3433877229690552, "learning_rate": 9.619626026777172e-05, "loss": 1.4847, "step": 52280 }, { "epoch": 2.394912527308754, "eval_loss": 2.4462528228759766, "eval_runtime": 22.0531, "eval_samples_per_second": 226.726, "eval_steps_per_second": 1.224, "step": 52287 }, { "epoch": 2.3950751096584075, "grad_norm": 0.2804941236972809, "learning_rate": 9.618988473566442e-05, "loss": 1.4887, "step": 52290 }, { "epoch": 2.39561705082392, "grad_norm": 0.21407003700733185, "learning_rate": 9.618350410113304e-05, "loss": 1.4823, "step": 52300 }, { "epoch": 2.396158991989432, "grad_norm": 0.2938377261161804, "learning_rate": 9.617711836496797e-05, "loss": 1.489, "step": 52310 }, { "epoch": 2.3967009331549445, "grad_norm": 0.19561982154846191, "learning_rate": 9.617072752796025e-05, "loss": 1.4788, "step": 52320 }, { "epoch": 2.3972428743204564, "grad_norm": 0.2788771688938141, "learning_rate": 9.616433159090154e-05, "loss": 1.4788, "step": 52330 }, { "epoch": 2.3977848154859687, "grad_norm": 0.24922244250774384, "learning_rate": 9.615793055458415e-05, "loss": 1.4827, "step": 52340 }, { "epoch": 2.398326756651481, "grad_norm": 0.18937751650810242, "learning_rate": 9.615152441980104e-05, "loss": 1.4771, "step": 52350 }, { "epoch": 2.3988686978169933, "grad_norm": 0.19057206809520721, "learning_rate": 9.614511318734572e-05, "loss": 1.4782, "step": 52360 }, { "epoch": 2.3994106389825056, "grad_norm": 0.40637481212615967, "learning_rate": 9.613869685801242e-05, "loss": 1.4894, "step": 52370 }, { "epoch": 2.3996274154487103, "eval_loss": 2.4485349655151367, "eval_runtime": 25.128, "eval_samples_per_second": 198.981, "eval_steps_per_second": 1.074, "step": 52374 }, { "epoch": 2.3999525801480175, "grad_norm": 0.18058116734027863, "learning_rate": 9.613227543259595e-05, "loss": 1.4847, "step": 52380 }, { "epoch": 2.40049452131353, "grad_norm": 0.2915930151939392, "learning_rate": 9.61258489118918e-05, "loss": 1.4771, "step": 52390 }, { "epoch": 2.401036462479042, "grad_norm": 0.20396968722343445, "learning_rate": 9.611941729669602e-05, "loss": 1.4799, "step": 52400 }, { "epoch": 2.4015784036445544, "grad_norm": 0.18857981264591217, "learning_rate": 9.611298058780536e-05, "loss": 1.4857, "step": 52410 }, { "epoch": 2.4021203448100668, "grad_norm": 0.3981879651546478, "learning_rate": 9.610653878601715e-05, "loss": 1.4748, "step": 52420 }, { "epoch": 2.4026622859755786, "grad_norm": 0.17841531336307526, "learning_rate": 9.610009189212938e-05, "loss": 1.4865, "step": 52430 }, { "epoch": 2.403204227141091, "grad_norm": 0.37024664878845215, "learning_rate": 9.609363990694067e-05, "loss": 1.4806, "step": 52440 }, { "epoch": 2.4037461683066033, "grad_norm": 0.19459642469882965, "learning_rate": 9.608718283125026e-05, "loss": 1.4754, "step": 52450 }, { "epoch": 2.4042881094721156, "grad_norm": 0.2543799579143524, "learning_rate": 9.608072066585803e-05, "loss": 1.4872, "step": 52460 }, { "epoch": 2.404342303588667, "eval_loss": 2.44846773147583, "eval_runtime": 21.9857, "eval_samples_per_second": 227.42, "eval_steps_per_second": 1.228, "step": 52461 }, { "epoch": 2.404830050637628, "grad_norm": 0.31468695402145386, "learning_rate": 9.607425341156447e-05, "loss": 1.4677, "step": 52470 }, { "epoch": 2.4053719918031398, "grad_norm": 0.31996336579322815, "learning_rate": 9.606778106917071e-05, "loss": 1.4961, "step": 52480 }, { "epoch": 2.405913932968652, "grad_norm": 0.22491812705993652, "learning_rate": 9.606130363947856e-05, "loss": 1.4923, "step": 52490 }, { "epoch": 2.4064558741341644, "grad_norm": 0.18299773335456848, "learning_rate": 9.605482112329037e-05, "loss": 1.4828, "step": 52500 }, { "epoch": 2.4069978152996767, "grad_norm": 0.17865203320980072, "learning_rate": 9.604833352140918e-05, "loss": 1.4884, "step": 52510 }, { "epoch": 2.4075397564651886, "grad_norm": 0.31600865721702576, "learning_rate": 9.604184083463863e-05, "loss": 1.4814, "step": 52520 }, { "epoch": 2.408081697630701, "grad_norm": 0.22451327741146088, "learning_rate": 9.603534306378305e-05, "loss": 1.4824, "step": 52530 }, { "epoch": 2.408623638796213, "grad_norm": 0.1985584944486618, "learning_rate": 9.602884020964734e-05, "loss": 1.4852, "step": 52540 }, { "epoch": 2.409057191728623, "eval_loss": 2.45249605178833, "eval_runtime": 22.1056, "eval_samples_per_second": 226.187, "eval_steps_per_second": 1.221, "step": 52548 }, { "epoch": 2.4091655799617255, "grad_norm": 0.2913689911365509, "learning_rate": 9.6022332273037e-05, "loss": 1.4858, "step": 52550 }, { "epoch": 2.4097075211272374, "grad_norm": 0.28544992208480835, "learning_rate": 9.601581925475825e-05, "loss": 1.4845, "step": 52560 }, { "epoch": 2.4102494622927497, "grad_norm": 0.34636637568473816, "learning_rate": 9.600930115561791e-05, "loss": 1.4848, "step": 52570 }, { "epoch": 2.410791403458262, "grad_norm": 0.22824904322624207, "learning_rate": 9.600277797642335e-05, "loss": 1.4804, "step": 52580 }, { "epoch": 2.4113333446237744, "grad_norm": 0.2189302295446396, "learning_rate": 9.59962497179827e-05, "loss": 1.4794, "step": 52590 }, { "epoch": 2.4118752857892867, "grad_norm": 0.1900554746389389, "learning_rate": 9.59897163811046e-05, "loss": 1.4769, "step": 52600 }, { "epoch": 2.4124172269547985, "grad_norm": 0.2690756618976593, "learning_rate": 9.59831779665984e-05, "loss": 1.4868, "step": 52610 }, { "epoch": 2.412959168120311, "grad_norm": 0.22319385409355164, "learning_rate": 9.597663447527407e-05, "loss": 1.4726, "step": 52620 }, { "epoch": 2.413501109285823, "grad_norm": 0.22841401398181915, "learning_rate": 9.597008590794211e-05, "loss": 1.4806, "step": 52630 }, { "epoch": 2.413772079868579, "eval_loss": 2.4524683952331543, "eval_runtime": 22.0707, "eval_samples_per_second": 226.544, "eval_steps_per_second": 1.223, "step": 52635 }, { "epoch": 2.4140430504513355, "grad_norm": 0.20807218551635742, "learning_rate": 9.596353226541382e-05, "loss": 1.4779, "step": 52640 }, { "epoch": 2.414584991616848, "grad_norm": 0.4142747223377228, "learning_rate": 9.595697354850101e-05, "loss": 1.4986, "step": 52650 }, { "epoch": 2.4151269327823597, "grad_norm": 0.21264301240444183, "learning_rate": 9.59504097580161e-05, "loss": 1.4877, "step": 52660 }, { "epoch": 2.415668873947872, "grad_norm": 0.5480301976203918, "learning_rate": 9.594384089477224e-05, "loss": 1.4897, "step": 52670 }, { "epoch": 2.4162108151133843, "grad_norm": 0.42489802837371826, "learning_rate": 9.593726695958313e-05, "loss": 1.4774, "step": 52680 }, { "epoch": 2.4167527562788966, "grad_norm": 0.17096111178398132, "learning_rate": 9.593068795326312e-05, "loss": 1.4863, "step": 52690 }, { "epoch": 2.4172946974444085, "grad_norm": 0.19619131088256836, "learning_rate": 9.592410387662717e-05, "loss": 1.4682, "step": 52700 }, { "epoch": 2.417836638609921, "grad_norm": 0.18955209851264954, "learning_rate": 9.591751473049095e-05, "loss": 1.4765, "step": 52710 }, { "epoch": 2.418378579775433, "grad_norm": 0.1831616461277008, "learning_rate": 9.591092051567063e-05, "loss": 1.4783, "step": 52720 }, { "epoch": 2.4184869680085357, "eval_loss": 2.446958303451538, "eval_runtime": 22.1086, "eval_samples_per_second": 226.156, "eval_steps_per_second": 1.221, "step": 52722 }, { "epoch": 2.4189205209409455, "grad_norm": 0.2192278653383255, "learning_rate": 9.590432123298307e-05, "loss": 1.4874, "step": 52730 }, { "epoch": 2.4194624621064573, "grad_norm": 0.19387726485729218, "learning_rate": 9.589771688324582e-05, "loss": 1.4803, "step": 52740 }, { "epoch": 2.4200044032719696, "grad_norm": 0.280454158782959, "learning_rate": 9.589110746727692e-05, "loss": 1.4762, "step": 52750 }, { "epoch": 2.420546344437482, "grad_norm": 0.18469981849193573, "learning_rate": 9.588449298589518e-05, "loss": 1.478, "step": 52760 }, { "epoch": 2.4210882856029943, "grad_norm": 0.21699224412441254, "learning_rate": 9.587787343991996e-05, "loss": 1.473, "step": 52770 }, { "epoch": 2.4216302267685066, "grad_norm": 0.373770147562027, "learning_rate": 9.587124883017126e-05, "loss": 1.4781, "step": 52780 }, { "epoch": 2.4221721679340185, "grad_norm": 0.305627703666687, "learning_rate": 9.586461915746968e-05, "loss": 1.4829, "step": 52790 }, { "epoch": 2.4227141090995308, "grad_norm": 0.23528113961219788, "learning_rate": 9.585798442263651e-05, "loss": 1.4835, "step": 52800 }, { "epoch": 2.423201856148492, "eval_loss": 2.444547653198242, "eval_runtime": 22.0829, "eval_samples_per_second": 226.42, "eval_steps_per_second": 1.223, "step": 52809 }, { "epoch": 2.423256050265043, "grad_norm": 0.33448222279548645, "learning_rate": 9.58513446264936e-05, "loss": 1.4654, "step": 52810 }, { "epoch": 2.4237979914305554, "grad_norm": 0.2976095974445343, "learning_rate": 9.584469976986349e-05, "loss": 1.4784, "step": 52820 }, { "epoch": 2.4243399325960677, "grad_norm": 0.19375227391719818, "learning_rate": 9.58380498535693e-05, "loss": 1.4728, "step": 52830 }, { "epoch": 2.4248818737615796, "grad_norm": 0.30406102538108826, "learning_rate": 9.583139487843479e-05, "loss": 1.4812, "step": 52840 }, { "epoch": 2.425423814927092, "grad_norm": 0.3576555550098419, "learning_rate": 9.582473484528436e-05, "loss": 1.4949, "step": 52850 }, { "epoch": 2.4259657560926042, "grad_norm": 0.21541939675807953, "learning_rate": 9.581806975494303e-05, "loss": 1.4831, "step": 52860 }, { "epoch": 2.4265076972581165, "grad_norm": 0.20319156348705292, "learning_rate": 9.581139960823642e-05, "loss": 1.471, "step": 52870 }, { "epoch": 2.427049638423629, "grad_norm": 0.3167076110839844, "learning_rate": 9.58047244059908e-05, "loss": 1.4771, "step": 52880 }, { "epoch": 2.4275915795891407, "grad_norm": 0.41747766733169556, "learning_rate": 9.579804414903311e-05, "loss": 1.4705, "step": 52890 }, { "epoch": 2.4279167442884484, "eval_loss": 2.454636573791504, "eval_runtime": 21.9836, "eval_samples_per_second": 227.442, "eval_steps_per_second": 1.228, "step": 52896 }, { "epoch": 2.428133520754653, "grad_norm": 0.37934955954551697, "learning_rate": 9.579135883819082e-05, "loss": 1.4835, "step": 52900 }, { "epoch": 2.4286754619201654, "grad_norm": 0.1990528702735901, "learning_rate": 9.578466847429208e-05, "loss": 1.4811, "step": 52910 }, { "epoch": 2.4292174030856777, "grad_norm": 0.1967354118824005, "learning_rate": 9.57779730581657e-05, "loss": 1.4862, "step": 52920 }, { "epoch": 2.4297593442511896, "grad_norm": 0.22842636704444885, "learning_rate": 9.577127259064106e-05, "loss": 1.4771, "step": 52930 }, { "epoch": 2.430301285416702, "grad_norm": 0.34596872329711914, "learning_rate": 9.576456707254817e-05, "loss": 1.4861, "step": 52940 }, { "epoch": 2.430843226582214, "grad_norm": 0.2179035097360611, "learning_rate": 9.57578565047177e-05, "loss": 1.4817, "step": 52950 }, { "epoch": 2.4313851677477265, "grad_norm": 0.3425596356391907, "learning_rate": 9.57511408879809e-05, "loss": 1.4933, "step": 52960 }, { "epoch": 2.4319271089132384, "grad_norm": 0.33133429288864136, "learning_rate": 9.574442022316972e-05, "loss": 1.4704, "step": 52970 }, { "epoch": 2.4324690500787507, "grad_norm": 0.2703513205051422, "learning_rate": 9.573769451111665e-05, "loss": 1.4756, "step": 52980 }, { "epoch": 2.4326316324284045, "eval_loss": 2.4538090229034424, "eval_runtime": 22.1163, "eval_samples_per_second": 226.078, "eval_steps_per_second": 1.221, "step": 52983 }, { "epoch": 2.433010991244263, "grad_norm": 0.2822078764438629, "learning_rate": 9.573096375265484e-05, "loss": 1.4883, "step": 52990 }, { "epoch": 2.4335529324097753, "grad_norm": 0.2804018259048462, "learning_rate": 9.572422794861808e-05, "loss": 1.4807, "step": 53000 }, { "epoch": 2.4340948735752876, "grad_norm": 0.41560617089271545, "learning_rate": 9.571748709984076e-05, "loss": 1.4775, "step": 53010 }, { "epoch": 2.4346368147407995, "grad_norm": 0.43294498324394226, "learning_rate": 9.571074120715794e-05, "loss": 1.4794, "step": 53020 }, { "epoch": 2.435178755906312, "grad_norm": 0.22881726920604706, "learning_rate": 9.570399027140523e-05, "loss": 1.4722, "step": 53030 }, { "epoch": 2.435720697071824, "grad_norm": 0.25796639919281006, "learning_rate": 9.569723429341892e-05, "loss": 1.4782, "step": 53040 }, { "epoch": 2.4362626382373365, "grad_norm": 0.2922258973121643, "learning_rate": 9.569047327403593e-05, "loss": 1.4862, "step": 53050 }, { "epoch": 2.436804579402849, "grad_norm": 0.2601679265499115, "learning_rate": 9.568370721409376e-05, "loss": 1.4967, "step": 53060 }, { "epoch": 2.4373465205683607, "grad_norm": 0.2096012830734253, "learning_rate": 9.567693611443057e-05, "loss": 1.4771, "step": 53070 }, { "epoch": 2.4373465205683607, "eval_loss": 2.448545217514038, "eval_runtime": 22.001, "eval_samples_per_second": 227.262, "eval_steps_per_second": 1.227, "step": 53070 }, { "epoch": 2.437888461733873, "grad_norm": 0.18229645490646362, "learning_rate": 9.567015997588516e-05, "loss": 1.4764, "step": 53080 }, { "epoch": 2.4384304028993853, "grad_norm": 0.20466655492782593, "learning_rate": 9.566337879929687e-05, "loss": 1.478, "step": 53090 }, { "epoch": 2.4389723440648976, "grad_norm": 0.27803587913513184, "learning_rate": 9.565659258550576e-05, "loss": 1.4613, "step": 53100 }, { "epoch": 2.43951428523041, "grad_norm": 0.2388664335012436, "learning_rate": 9.564980133535249e-05, "loss": 1.4798, "step": 53110 }, { "epoch": 2.440056226395922, "grad_norm": 0.20684485137462616, "learning_rate": 9.564300504967832e-05, "loss": 1.4779, "step": 53120 }, { "epoch": 2.440598167561434, "grad_norm": 0.22506524622440338, "learning_rate": 9.563620372932513e-05, "loss": 1.4851, "step": 53130 }, { "epoch": 2.4411401087269464, "grad_norm": 0.2067488580942154, "learning_rate": 9.562939737513544e-05, "loss": 1.476, "step": 53140 }, { "epoch": 2.4416820498924587, "grad_norm": 0.27859237790107727, "learning_rate": 9.56225859879524e-05, "loss": 1.4776, "step": 53150 }, { "epoch": 2.4420614087083172, "eval_loss": 2.4477126598358154, "eval_runtime": 22.0385, "eval_samples_per_second": 226.876, "eval_steps_per_second": 1.225, "step": 53157 }, { "epoch": 2.4422239910579706, "grad_norm": 0.31960025429725647, "learning_rate": 9.561576956861978e-05, "loss": 1.4773, "step": 53160 }, { "epoch": 2.442765932223483, "grad_norm": 0.23332948982715607, "learning_rate": 9.560894811798198e-05, "loss": 1.4791, "step": 53170 }, { "epoch": 2.4433078733889952, "grad_norm": 0.20001184940338135, "learning_rate": 9.560212163688395e-05, "loss": 1.4825, "step": 53180 }, { "epoch": 2.4438498145545076, "grad_norm": 0.2940216064453125, "learning_rate": 9.559529012617141e-05, "loss": 1.4872, "step": 53190 }, { "epoch": 2.4443917557200194, "grad_norm": 0.25465646386146545, "learning_rate": 9.558845358669055e-05, "loss": 1.4896, "step": 53200 }, { "epoch": 2.4449336968855317, "grad_norm": 0.2544267177581787, "learning_rate": 9.55816120192883e-05, "loss": 1.4844, "step": 53210 }, { "epoch": 2.445475638051044, "grad_norm": 0.30805036425590515, "learning_rate": 9.557476542481212e-05, "loss": 1.4777, "step": 53220 }, { "epoch": 2.4460175792165564, "grad_norm": 0.23450466990470886, "learning_rate": 9.556791380411017e-05, "loss": 1.486, "step": 53230 }, { "epoch": 2.4465595203820687, "grad_norm": 0.2862785756587982, "learning_rate": 9.556105715803116e-05, "loss": 1.4807, "step": 53240 }, { "epoch": 2.4467762968482734, "eval_loss": 2.448235273361206, "eval_runtime": 22.0612, "eval_samples_per_second": 226.642, "eval_steps_per_second": 1.224, "step": 53244 }, { "epoch": 2.4471014615475806, "grad_norm": 0.23192840814590454, "learning_rate": 9.555419548742452e-05, "loss": 1.4753, "step": 53250 }, { "epoch": 2.447643402713093, "grad_norm": 0.3470098078250885, "learning_rate": 9.554732879314019e-05, "loss": 1.4847, "step": 53260 }, { "epoch": 2.448185343878605, "grad_norm": 0.3917534351348877, "learning_rate": 9.554045707602882e-05, "loss": 1.481, "step": 53270 }, { "epoch": 2.4487272850441175, "grad_norm": 0.36082231998443604, "learning_rate": 9.553358033694164e-05, "loss": 1.4753, "step": 53280 }, { "epoch": 2.44926922620963, "grad_norm": 0.24314887821674347, "learning_rate": 9.552669857673049e-05, "loss": 1.488, "step": 53290 }, { "epoch": 2.4498111673751417, "grad_norm": 0.27214857935905457, "learning_rate": 9.551981179624789e-05, "loss": 1.4761, "step": 53300 }, { "epoch": 2.450353108540654, "grad_norm": 0.22485631704330444, "learning_rate": 9.55129199963469e-05, "loss": 1.4849, "step": 53310 }, { "epoch": 2.4508950497061663, "grad_norm": 0.2175171673297882, "learning_rate": 9.55060231778813e-05, "loss": 1.4772, "step": 53320 }, { "epoch": 2.4514369908716787, "grad_norm": 0.5146223306655884, "learning_rate": 9.549912134170539e-05, "loss": 1.4912, "step": 53330 }, { "epoch": 2.45149118498823, "eval_loss": 2.4793365001678467, "eval_runtime": 22.2104, "eval_samples_per_second": 225.12, "eval_steps_per_second": 1.216, "step": 53331 }, { "epoch": 2.4519789320371905, "grad_norm": 0.2823374271392822, "learning_rate": 9.549221448867415e-05, "loss": 1.4967, "step": 53340 }, { "epoch": 2.452520873202703, "grad_norm": 0.26054081320762634, "learning_rate": 9.54853026196432e-05, "loss": 1.4803, "step": 53350 }, { "epoch": 2.453062814368215, "grad_norm": 0.27415192127227783, "learning_rate": 9.547838573546872e-05, "loss": 1.487, "step": 53360 }, { "epoch": 2.4536047555337275, "grad_norm": 0.19018392264842987, "learning_rate": 9.547146383700756e-05, "loss": 1.4945, "step": 53370 }, { "epoch": 2.4541466966992393, "grad_norm": 0.23459668457508087, "learning_rate": 9.546453692511715e-05, "loss": 1.4807, "step": 53380 }, { "epoch": 2.4546886378647517, "grad_norm": 0.30195194482803345, "learning_rate": 9.545760500065562e-05, "loss": 1.4858, "step": 53390 }, { "epoch": 2.455230579030264, "grad_norm": 0.23426523804664612, "learning_rate": 9.54506680644816e-05, "loss": 1.4805, "step": 53400 }, { "epoch": 2.4557725201957763, "grad_norm": 0.23431915044784546, "learning_rate": 9.544372611745444e-05, "loss": 1.4824, "step": 53410 }, { "epoch": 2.456206073128186, "eval_loss": 2.459233283996582, "eval_runtime": 22.0413, "eval_samples_per_second": 226.847, "eval_steps_per_second": 1.225, "step": 53418 }, { "epoch": 2.4563144613612886, "grad_norm": 0.19480261206626892, "learning_rate": 9.54367791604341e-05, "loss": 1.4798, "step": 53420 }, { "epoch": 2.4568564025268005, "grad_norm": 0.23288105428218842, "learning_rate": 9.54298271942811e-05, "loss": 1.4837, "step": 53430 }, { "epoch": 2.457398343692313, "grad_norm": 0.21174949407577515, "learning_rate": 9.542287021985665e-05, "loss": 1.4851, "step": 53440 }, { "epoch": 2.457940284857825, "grad_norm": 0.4950149953365326, "learning_rate": 9.541590823802252e-05, "loss": 1.4755, "step": 53450 }, { "epoch": 2.4584822260233374, "grad_norm": 0.2723096013069153, "learning_rate": 9.540894124964115e-05, "loss": 1.4802, "step": 53460 }, { "epoch": 2.4590241671888498, "grad_norm": 0.17027664184570312, "learning_rate": 9.54019692555756e-05, "loss": 1.4746, "step": 53470 }, { "epoch": 2.4595661083543616, "grad_norm": 0.30771180987358093, "learning_rate": 9.539499225668948e-05, "loss": 1.4835, "step": 53480 }, { "epoch": 2.460108049519874, "grad_norm": 0.21357007324695587, "learning_rate": 9.538801025384709e-05, "loss": 1.4775, "step": 53490 }, { "epoch": 2.4606499906853863, "grad_norm": 0.28452086448669434, "learning_rate": 9.538102324791336e-05, "loss": 1.4832, "step": 53500 }, { "epoch": 2.460920961268142, "eval_loss": 2.4530394077301025, "eval_runtime": 22.1982, "eval_samples_per_second": 225.244, "eval_steps_per_second": 1.216, "step": 53505 }, { "epoch": 2.4611919318508986, "grad_norm": 0.20882080495357513, "learning_rate": 9.537403123975378e-05, "loss": 1.4729, "step": 53510 }, { "epoch": 2.461733873016411, "grad_norm": 0.2761918306350708, "learning_rate": 9.536703423023449e-05, "loss": 1.4739, "step": 53520 }, { "epoch": 2.4622758141819228, "grad_norm": 0.1824912279844284, "learning_rate": 9.536003222022225e-05, "loss": 1.48, "step": 53530 }, { "epoch": 2.462817755347435, "grad_norm": 0.22570142149925232, "learning_rate": 9.535302521058445e-05, "loss": 1.4832, "step": 53540 }, { "epoch": 2.4633596965129474, "grad_norm": 0.3082813620567322, "learning_rate": 9.534601320218909e-05, "loss": 1.4899, "step": 53550 }, { "epoch": 2.4639016376784597, "grad_norm": 0.17578811943531036, "learning_rate": 9.533899619590477e-05, "loss": 1.4744, "step": 53560 }, { "epoch": 2.4644435788439716, "grad_norm": 0.20538325607776642, "learning_rate": 9.533197419260073e-05, "loss": 1.4834, "step": 53570 }, { "epoch": 2.464985520009484, "grad_norm": 0.2888244688510895, "learning_rate": 9.532494719314686e-05, "loss": 1.4833, "step": 53580 }, { "epoch": 2.465527461174996, "grad_norm": 0.26800277829170227, "learning_rate": 9.531791519841359e-05, "loss": 1.4856, "step": 53590 }, { "epoch": 2.4656358494080988, "eval_loss": 2.4533004760742188, "eval_runtime": 22.8599, "eval_samples_per_second": 218.724, "eval_steps_per_second": 1.181, "step": 53592 }, { "epoch": 2.4660694023405085, "grad_norm": 0.24594108760356903, "learning_rate": 9.531087820927201e-05, "loss": 1.4797, "step": 53600 }, { "epoch": 2.4666113435060204, "grad_norm": 0.18168674409389496, "learning_rate": 9.530383622659386e-05, "loss": 1.4837, "step": 53610 }, { "epoch": 2.4671532846715327, "grad_norm": 0.20581193268299103, "learning_rate": 9.529678925125148e-05, "loss": 1.4761, "step": 53620 }, { "epoch": 2.467695225837045, "grad_norm": 0.33261099457740784, "learning_rate": 9.528973728411778e-05, "loss": 1.4789, "step": 53630 }, { "epoch": 2.4682371670025574, "grad_norm": 0.3050486147403717, "learning_rate": 9.528268032606636e-05, "loss": 1.4759, "step": 53640 }, { "epoch": 2.4687791081680697, "grad_norm": 0.24290646612644196, "learning_rate": 9.527561837797136e-05, "loss": 1.4944, "step": 53650 }, { "epoch": 2.4693210493335815, "grad_norm": 0.26528191566467285, "learning_rate": 9.526855144070763e-05, "loss": 1.4721, "step": 53660 }, { "epoch": 2.469862990499094, "grad_norm": 0.1806386113166809, "learning_rate": 9.52614795151506e-05, "loss": 1.4691, "step": 53670 }, { "epoch": 2.470350737548055, "eval_loss": 2.45682430267334, "eval_runtime": 21.9989, "eval_samples_per_second": 227.284, "eval_steps_per_second": 1.227, "step": 53679 }, { "epoch": 2.470404931664606, "grad_norm": 0.27701669931411743, "learning_rate": 9.525440260217627e-05, "loss": 1.4868, "step": 53680 }, { "epoch": 2.4709468728301185, "grad_norm": 0.23009248077869415, "learning_rate": 9.52473207026613e-05, "loss": 1.4797, "step": 53690 }, { "epoch": 2.471488813995631, "grad_norm": 0.19565734267234802, "learning_rate": 9.524023381748298e-05, "loss": 1.4742, "step": 53700 }, { "epoch": 2.4720307551611427, "grad_norm": 0.3481823801994324, "learning_rate": 9.523314194751921e-05, "loss": 1.4823, "step": 53710 }, { "epoch": 2.472572696326655, "grad_norm": 0.1881093680858612, "learning_rate": 9.522604509364849e-05, "loss": 1.4707, "step": 53720 }, { "epoch": 2.4731146374921673, "grad_norm": 0.4059028625488281, "learning_rate": 9.521894325674994e-05, "loss": 1.4749, "step": 53730 }, { "epoch": 2.4736565786576796, "grad_norm": 0.3592798709869385, "learning_rate": 9.521183643770333e-05, "loss": 1.477, "step": 53740 }, { "epoch": 2.474198519823192, "grad_norm": 0.2864842712879181, "learning_rate": 9.520472463738899e-05, "loss": 1.4784, "step": 53750 }, { "epoch": 2.474740460988704, "grad_norm": 0.20083869993686676, "learning_rate": 9.519760785668791e-05, "loss": 1.4775, "step": 53760 }, { "epoch": 2.4750656256880115, "eval_loss": 2.4518940448760986, "eval_runtime": 33.4746, "eval_samples_per_second": 149.367, "eval_steps_per_second": 0.807, "step": 53766 }, { "epoch": 2.475282402154216, "grad_norm": 0.25543951988220215, "learning_rate": 9.519048609648169e-05, "loss": 1.4624, "step": 53770 }, { "epoch": 2.4758243433197284, "grad_norm": 0.2484443336725235, "learning_rate": 9.518335935765256e-05, "loss": 1.483, "step": 53780 }, { "epoch": 2.4763662844852408, "grad_norm": 0.3281131088733673, "learning_rate": 9.517622764108331e-05, "loss": 1.4762, "step": 53790 }, { "epoch": 2.4769082256507526, "grad_norm": 0.23684971034526825, "learning_rate": 9.516909094765741e-05, "loss": 1.4772, "step": 53800 }, { "epoch": 2.477450166816265, "grad_norm": 0.2768543064594269, "learning_rate": 9.516194927825894e-05, "loss": 1.4877, "step": 53810 }, { "epoch": 2.4779921079817773, "grad_norm": 0.20820960402488708, "learning_rate": 9.515480263377253e-05, "loss": 1.4711, "step": 53820 }, { "epoch": 2.4785340491472896, "grad_norm": 0.18678176403045654, "learning_rate": 9.514765101508353e-05, "loss": 1.4923, "step": 53830 }, { "epoch": 2.4790759903128015, "grad_norm": 0.22122722864151, "learning_rate": 9.514049442307782e-05, "loss": 1.4837, "step": 53840 }, { "epoch": 2.4796179314783138, "grad_norm": 0.3165682256221771, "learning_rate": 9.513333285864192e-05, "loss": 1.4856, "step": 53850 }, { "epoch": 2.4797805138279676, "eval_loss": 2.4556586742401123, "eval_runtime": 33.9488, "eval_samples_per_second": 147.281, "eval_steps_per_second": 0.795, "step": 53853 }, { "epoch": 2.480159872643826, "grad_norm": 0.2486167550086975, "learning_rate": 9.5126166322663e-05, "loss": 1.4829, "step": 53860 }, { "epoch": 2.4807018138093384, "grad_norm": 0.21120484173297882, "learning_rate": 9.51189948160288e-05, "loss": 1.479, "step": 53870 }, { "epoch": 2.4812437549748507, "grad_norm": 0.18009105324745178, "learning_rate": 9.511181833962772e-05, "loss": 1.4698, "step": 53880 }, { "epoch": 2.4817856961403626, "grad_norm": 0.19406986236572266, "learning_rate": 9.51046368943487e-05, "loss": 1.484, "step": 53890 }, { "epoch": 2.482327637305875, "grad_norm": 0.22672545909881592, "learning_rate": 9.50974504810814e-05, "loss": 1.4773, "step": 53900 }, { "epoch": 2.4828695784713872, "grad_norm": 0.21448560059070587, "learning_rate": 9.509025910071602e-05, "loss": 1.4781, "step": 53910 }, { "epoch": 2.4834115196368995, "grad_norm": 0.3304313123226166, "learning_rate": 9.508306275414339e-05, "loss": 1.472, "step": 53920 }, { "epoch": 2.483953460802412, "grad_norm": 0.29814818501472473, "learning_rate": 9.507586144225497e-05, "loss": 1.4737, "step": 53930 }, { "epoch": 2.4844954019679237, "grad_norm": 0.20384559035301208, "learning_rate": 9.506865516594282e-05, "loss": 1.4748, "step": 53940 }, { "epoch": 2.4844954019679237, "eval_loss": 2.4536936283111572, "eval_runtime": 21.8133, "eval_samples_per_second": 229.218, "eval_steps_per_second": 1.238, "step": 53940 }, { "epoch": 2.485037343133436, "grad_norm": 0.1892993301153183, "learning_rate": 9.506144392609965e-05, "loss": 1.4846, "step": 53950 }, { "epoch": 2.4855792842989484, "grad_norm": 0.37276574969291687, "learning_rate": 9.505422772361872e-05, "loss": 1.4809, "step": 53960 }, { "epoch": 2.4861212254644607, "grad_norm": 0.5390621423721313, "learning_rate": 9.504700655939396e-05, "loss": 1.4752, "step": 53970 }, { "epoch": 2.4866631666299726, "grad_norm": 0.26775336265563965, "learning_rate": 9.50397804343199e-05, "loss": 1.4858, "step": 53980 }, { "epoch": 2.487205107795485, "grad_norm": 0.18470676243305206, "learning_rate": 9.503254934929165e-05, "loss": 1.4741, "step": 53990 }, { "epoch": 2.487747048960997, "grad_norm": 0.24282990396022797, "learning_rate": 9.502531330520501e-05, "loss": 1.4781, "step": 54000 }, { "epoch": 2.4882889901265095, "grad_norm": 0.19152526557445526, "learning_rate": 9.501807230295634e-05, "loss": 1.4714, "step": 54010 }, { "epoch": 2.4888309312920214, "grad_norm": 0.24509835243225098, "learning_rate": 9.50108263434426e-05, "loss": 1.4673, "step": 54020 }, { "epoch": 2.4892102901078803, "eval_loss": 2.4450418949127197, "eval_runtime": 21.9853, "eval_samples_per_second": 227.425, "eval_steps_per_second": 1.228, "step": 54027 }, { "epoch": 2.4893728724575337, "grad_norm": 0.20996253192424774, "learning_rate": 9.500357542756139e-05, "loss": 1.4696, "step": 54030 }, { "epoch": 2.489914813623046, "grad_norm": 0.3065512776374817, "learning_rate": 9.499631955621097e-05, "loss": 1.4837, "step": 54040 }, { "epoch": 2.4904567547885583, "grad_norm": 0.2927444577217102, "learning_rate": 9.498905873029012e-05, "loss": 1.4833, "step": 54050 }, { "epoch": 2.4909986959540706, "grad_norm": 0.3556462228298187, "learning_rate": 9.498179295069827e-05, "loss": 1.4744, "step": 54060 }, { "epoch": 2.4915406371195825, "grad_norm": 0.3211055099964142, "learning_rate": 9.49745222183355e-05, "loss": 1.4723, "step": 54070 }, { "epoch": 2.492082578285095, "grad_norm": 0.510176956653595, "learning_rate": 9.496724653410249e-05, "loss": 1.4706, "step": 54080 }, { "epoch": 2.492624519450607, "grad_norm": 0.20977604389190674, "learning_rate": 9.495996589890048e-05, "loss": 1.4692, "step": 54090 }, { "epoch": 2.4931664606161195, "grad_norm": 0.19488461315631866, "learning_rate": 9.495268031363138e-05, "loss": 1.4821, "step": 54100 }, { "epoch": 2.493708401781632, "grad_norm": 0.21856850385665894, "learning_rate": 9.494538977919771e-05, "loss": 1.4704, "step": 54110 }, { "epoch": 2.4939251782478364, "eval_loss": 2.4471306800842285, "eval_runtime": 21.9864, "eval_samples_per_second": 227.414, "eval_steps_per_second": 1.228, "step": 54114 }, { "epoch": 2.4942503429471437, "grad_norm": 0.21702884137630463, "learning_rate": 9.493809429650256e-05, "loss": 1.4802, "step": 54120 }, { "epoch": 2.494792284112656, "grad_norm": 0.17151543498039246, "learning_rate": 9.49307938664497e-05, "loss": 1.4798, "step": 54130 }, { "epoch": 2.4953342252781683, "grad_norm": 0.20608776807785034, "learning_rate": 9.492348848994345e-05, "loss": 1.4736, "step": 54140 }, { "epoch": 2.4958761664436806, "grad_norm": 0.29367008805274963, "learning_rate": 9.491617816788879e-05, "loss": 1.4799, "step": 54150 }, { "epoch": 2.496418107609193, "grad_norm": 0.38167864084243774, "learning_rate": 9.490886290119128e-05, "loss": 1.4636, "step": 54160 }, { "epoch": 2.496960048774705, "grad_norm": 0.18490886688232422, "learning_rate": 9.490154269075708e-05, "loss": 1.4694, "step": 54170 }, { "epoch": 2.497501989940217, "grad_norm": 0.44868624210357666, "learning_rate": 9.489421753749303e-05, "loss": 1.4783, "step": 54180 }, { "epoch": 2.4980439311057294, "grad_norm": 0.34574294090270996, "learning_rate": 9.488688744230649e-05, "loss": 1.4775, "step": 54190 }, { "epoch": 2.4985858722712417, "grad_norm": 0.34734290838241577, "learning_rate": 9.487955240610553e-05, "loss": 1.4725, "step": 54200 }, { "epoch": 2.498640066387793, "eval_loss": 2.442676305770874, "eval_runtime": 21.9936, "eval_samples_per_second": 227.339, "eval_steps_per_second": 1.228, "step": 54201 }, { "epoch": 2.4991278134367536, "grad_norm": 0.18172703683376312, "learning_rate": 9.487221242979874e-05, "loss": 1.4862, "step": 54210 }, { "epoch": 2.499669754602266, "grad_norm": 0.4136309027671814, "learning_rate": 9.486486751429539e-05, "loss": 1.4723, "step": 54220 }, { "epoch": 2.5002116957677782, "grad_norm": 0.2756052315235138, "learning_rate": 9.485751766050533e-05, "loss": 1.4846, "step": 54230 }, { "epoch": 2.5007536369332906, "grad_norm": 0.34378984570503235, "learning_rate": 9.485016286933902e-05, "loss": 1.4631, "step": 54240 }, { "epoch": 2.5012955780988024, "grad_norm": 0.2502152919769287, "learning_rate": 9.484280314170754e-05, "loss": 1.4684, "step": 54250 }, { "epoch": 2.5018375192643147, "grad_norm": 0.20602858066558838, "learning_rate": 9.483543847852258e-05, "loss": 1.461, "step": 54260 }, { "epoch": 2.502379460429827, "grad_norm": 0.19559136033058167, "learning_rate": 9.482806888069648e-05, "loss": 1.4821, "step": 54270 }, { "epoch": 2.5029214015953394, "grad_norm": 0.19327175617218018, "learning_rate": 9.48206943491421e-05, "loss": 1.4777, "step": 54280 }, { "epoch": 2.503354954527749, "eval_loss": 2.4406850337982178, "eval_runtime": 21.9849, "eval_samples_per_second": 227.429, "eval_steps_per_second": 1.228, "step": 54288 }, { "epoch": 2.5034633427608517, "grad_norm": 0.19267085194587708, "learning_rate": 9.4813314884773e-05, "loss": 1.48, "step": 54290 }, { "epoch": 2.5040052839263636, "grad_norm": 0.21109183132648468, "learning_rate": 9.48059304885033e-05, "loss": 1.4782, "step": 54300 }, { "epoch": 2.504547225091876, "grad_norm": 0.2512279152870178, "learning_rate": 9.479854116124775e-05, "loss": 1.4898, "step": 54310 }, { "epoch": 2.505089166257388, "grad_norm": 0.24899804592132568, "learning_rate": 9.479114690392172e-05, "loss": 1.4642, "step": 54320 }, { "epoch": 2.5056311074229005, "grad_norm": 0.20213483273983002, "learning_rate": 9.478374771744115e-05, "loss": 1.4686, "step": 54330 }, { "epoch": 2.506173048588413, "grad_norm": 0.1799444556236267, "learning_rate": 9.477634360272265e-05, "loss": 1.4745, "step": 54340 }, { "epoch": 2.5067149897539247, "grad_norm": 0.347083181142807, "learning_rate": 9.476893456068338e-05, "loss": 1.48, "step": 54350 }, { "epoch": 2.507256930919437, "grad_norm": 0.2862820625305176, "learning_rate": 9.476152059224117e-05, "loss": 1.4804, "step": 54360 }, { "epoch": 2.5077988720849493, "grad_norm": 0.285040020942688, "learning_rate": 9.475410169831442e-05, "loss": 1.4798, "step": 54370 }, { "epoch": 2.5080698426677053, "eval_loss": 2.4463353157043457, "eval_runtime": 22.1709, "eval_samples_per_second": 225.521, "eval_steps_per_second": 1.218, "step": 54375 }, { "epoch": 2.5083408132504617, "grad_norm": 0.2216855138540268, "learning_rate": 9.474667787982213e-05, "loss": 1.4854, "step": 54380 }, { "epoch": 2.508882754415974, "grad_norm": 0.307620108127594, "learning_rate": 9.473924913768396e-05, "loss": 1.4787, "step": 54390 }, { "epoch": 2.509424695581486, "grad_norm": 0.30470430850982666, "learning_rate": 9.473181547282013e-05, "loss": 1.4721, "step": 54400 }, { "epoch": 2.509966636746998, "grad_norm": 0.35153964161872864, "learning_rate": 9.47243768861515e-05, "loss": 1.4701, "step": 54410 }, { "epoch": 2.5105085779125105, "grad_norm": 0.24849040806293488, "learning_rate": 9.471693337859953e-05, "loss": 1.4609, "step": 54420 }, { "epoch": 2.5110505190780223, "grad_norm": 0.3956458270549774, "learning_rate": 9.470948495108628e-05, "loss": 1.4819, "step": 54430 }, { "epoch": 2.5115924602435347, "grad_norm": 0.21388791501522064, "learning_rate": 9.470203160453445e-05, "loss": 1.4825, "step": 54440 }, { "epoch": 2.512134401409047, "grad_norm": 0.35142725706100464, "learning_rate": 9.46945733398673e-05, "loss": 1.4787, "step": 54450 }, { "epoch": 2.5126763425745593, "grad_norm": 0.22184672951698303, "learning_rate": 9.468711015800874e-05, "loss": 1.4662, "step": 54460 }, { "epoch": 2.512784730807662, "eval_loss": 2.446195363998413, "eval_runtime": 22.0811, "eval_samples_per_second": 226.438, "eval_steps_per_second": 1.223, "step": 54462 }, { "epoch": 2.5132182837400716, "grad_norm": 0.27088162302970886, "learning_rate": 9.46796420598833e-05, "loss": 1.473, "step": 54470 }, { "epoch": 2.5137602249055835, "grad_norm": 0.2419961541891098, "learning_rate": 9.467216904641606e-05, "loss": 1.4824, "step": 54480 }, { "epoch": 2.514302166071096, "grad_norm": 0.2746829688549042, "learning_rate": 9.466469111853275e-05, "loss": 1.4729, "step": 54490 }, { "epoch": 2.514844107236608, "grad_norm": 0.2866189479827881, "learning_rate": 9.465720827715972e-05, "loss": 1.476, "step": 54500 }, { "epoch": 2.5153860484021204, "grad_norm": 0.19014035165309906, "learning_rate": 9.464972052322388e-05, "loss": 1.4803, "step": 54510 }, { "epoch": 2.5159279895676327, "grad_norm": 0.2546852231025696, "learning_rate": 9.464222785765284e-05, "loss": 1.4749, "step": 54520 }, { "epoch": 2.5164699307331446, "grad_norm": 0.3189975917339325, "learning_rate": 9.46347302813747e-05, "loss": 1.4838, "step": 54530 }, { "epoch": 2.517011871898657, "grad_norm": 0.25641825795173645, "learning_rate": 9.462722779531825e-05, "loss": 1.4775, "step": 54540 }, { "epoch": 2.517499618947618, "eval_loss": 2.4492218494415283, "eval_runtime": 22.0696, "eval_samples_per_second": 226.556, "eval_steps_per_second": 1.223, "step": 54549 }, { "epoch": 2.5175538130641693, "grad_norm": 0.36368492245674133, "learning_rate": 9.461972040041286e-05, "loss": 1.4834, "step": 54550 }, { "epoch": 2.5180957542296816, "grad_norm": 0.41073617339134216, "learning_rate": 9.461220809758854e-05, "loss": 1.475, "step": 54560 }, { "epoch": 2.518637695395194, "grad_norm": 0.42227840423583984, "learning_rate": 9.460469088777585e-05, "loss": 1.4782, "step": 54570 }, { "epoch": 2.5191796365607058, "grad_norm": 0.24830390512943268, "learning_rate": 9.459716877190599e-05, "loss": 1.4789, "step": 54580 }, { "epoch": 2.519721577726218, "grad_norm": 0.2122548371553421, "learning_rate": 9.458964175091078e-05, "loss": 1.4714, "step": 54590 }, { "epoch": 2.5202635188917304, "grad_norm": 0.1787809282541275, "learning_rate": 9.458210982572264e-05, "loss": 1.4732, "step": 54600 }, { "epoch": 2.5208054600572423, "grad_norm": 0.19361905753612518, "learning_rate": 9.457457299727458e-05, "loss": 1.4837, "step": 54610 }, { "epoch": 2.521347401222755, "grad_norm": 0.459349662065506, "learning_rate": 9.456703126650023e-05, "loss": 1.4909, "step": 54620 }, { "epoch": 2.521889342388267, "grad_norm": 0.31457701325416565, "learning_rate": 9.455948463433384e-05, "loss": 1.4623, "step": 54630 }, { "epoch": 2.5222145070875746, "eval_loss": 2.4459049701690674, "eval_runtime": 22.0427, "eval_samples_per_second": 226.832, "eval_steps_per_second": 1.225, "step": 54636 }, { "epoch": 2.522431283553779, "grad_norm": 0.2660439610481262, "learning_rate": 9.455193310171022e-05, "loss": 1.4708, "step": 54640 }, { "epoch": 2.5229732247192915, "grad_norm": 0.22614122927188873, "learning_rate": 9.454437666956486e-05, "loss": 1.4763, "step": 54650 }, { "epoch": 2.5235151658848034, "grad_norm": 0.23727241158485413, "learning_rate": 9.45368153388338e-05, "loss": 1.4875, "step": 54660 }, { "epoch": 2.5240571070503157, "grad_norm": 0.20123521983623505, "learning_rate": 9.452924911045372e-05, "loss": 1.4907, "step": 54670 }, { "epoch": 2.524599048215828, "grad_norm": 0.28257882595062256, "learning_rate": 9.452167798536186e-05, "loss": 1.4782, "step": 54680 }, { "epoch": 2.5251409893813404, "grad_norm": 0.16655373573303223, "learning_rate": 9.451410196449613e-05, "loss": 1.4794, "step": 54690 }, { "epoch": 2.5256829305468527, "grad_norm": 0.28501424193382263, "learning_rate": 9.450652104879499e-05, "loss": 1.4713, "step": 54700 }, { "epoch": 2.5262248717123645, "grad_norm": 0.24294120073318481, "learning_rate": 9.449893523919754e-05, "loss": 1.4853, "step": 54710 }, { "epoch": 2.526766812877877, "grad_norm": 0.5156522393226624, "learning_rate": 9.44913445366435e-05, "loss": 1.4686, "step": 54720 }, { "epoch": 2.5269293952275307, "eval_loss": 2.447787046432495, "eval_runtime": 22.1759, "eval_samples_per_second": 225.47, "eval_steps_per_second": 1.218, "step": 54723 }, { "epoch": 2.527308754043389, "grad_norm": 0.21539968252182007, "learning_rate": 9.448374894207314e-05, "loss": 1.4801, "step": 54730 }, { "epoch": 2.5278506952089015, "grad_norm": 0.268772155046463, "learning_rate": 9.447614845642738e-05, "loss": 1.4644, "step": 54740 }, { "epoch": 2.528392636374414, "grad_norm": 0.1783050000667572, "learning_rate": 9.446854308064774e-05, "loss": 1.4827, "step": 54750 }, { "epoch": 2.5289345775399257, "grad_norm": 0.17679934203624725, "learning_rate": 9.446093281567635e-05, "loss": 1.4651, "step": 54760 }, { "epoch": 2.529476518705438, "grad_norm": 0.2454293668270111, "learning_rate": 9.445331766245592e-05, "loss": 1.4663, "step": 54770 }, { "epoch": 2.5300184598709503, "grad_norm": 0.1850767731666565, "learning_rate": 9.444569762192977e-05, "loss": 1.4661, "step": 54780 }, { "epoch": 2.5305604010364626, "grad_norm": 0.2817839980125427, "learning_rate": 9.443807269504187e-05, "loss": 1.4777, "step": 54790 }, { "epoch": 2.531102342201975, "grad_norm": 0.2719174921512604, "learning_rate": 9.443044288273675e-05, "loss": 1.4745, "step": 54800 }, { "epoch": 2.531644283367487, "grad_norm": 0.7066235542297363, "learning_rate": 9.442280818595955e-05, "loss": 1.466, "step": 54810 }, { "epoch": 2.531644283367487, "eval_loss": 2.447768211364746, "eval_runtime": 21.9147, "eval_samples_per_second": 228.157, "eval_steps_per_second": 1.232, "step": 54810 }, { "epoch": 2.532186224532999, "grad_norm": 0.3116154968738556, "learning_rate": 9.441516860565602e-05, "loss": 1.4752, "step": 54820 }, { "epoch": 2.5327281656985114, "grad_norm": 0.29827791452407837, "learning_rate": 9.440752414277254e-05, "loss": 1.4729, "step": 54830 }, { "epoch": 2.5332701068640233, "grad_norm": 0.2395966500043869, "learning_rate": 9.439987479825607e-05, "loss": 1.4742, "step": 54840 }, { "epoch": 2.5338120480295356, "grad_norm": 0.21512676775455475, "learning_rate": 9.439222057305414e-05, "loss": 1.4761, "step": 54850 }, { "epoch": 2.534353989195048, "grad_norm": 0.19757066667079926, "learning_rate": 9.438456146811496e-05, "loss": 1.4736, "step": 54860 }, { "epoch": 2.5348959303605603, "grad_norm": 0.2103499174118042, "learning_rate": 9.43768974843873e-05, "loss": 1.4746, "step": 54870 }, { "epoch": 2.5354378715260726, "grad_norm": 0.19325312972068787, "learning_rate": 9.436922862282052e-05, "loss": 1.4561, "step": 54880 }, { "epoch": 2.5359798126915845, "grad_norm": 0.2294219732284546, "learning_rate": 9.436155488436464e-05, "loss": 1.4783, "step": 54890 }, { "epoch": 2.5363591715074434, "eval_loss": 2.4521801471710205, "eval_runtime": 22.0914, "eval_samples_per_second": 226.332, "eval_steps_per_second": 1.222, "step": 54897 }, { "epoch": 2.5365217538570968, "grad_norm": 0.40045541524887085, "learning_rate": 9.435387626997024e-05, "loss": 1.4763, "step": 54900 }, { "epoch": 2.537063695022609, "grad_norm": 0.2468022257089615, "learning_rate": 9.434619278058848e-05, "loss": 1.4752, "step": 54910 }, { "epoch": 2.5376056361881214, "grad_norm": 0.2977105975151062, "learning_rate": 9.433850441717122e-05, "loss": 1.4786, "step": 54920 }, { "epoch": 2.5381475773536337, "grad_norm": 0.20533820986747742, "learning_rate": 9.433081118067078e-05, "loss": 1.479, "step": 54930 }, { "epoch": 2.5386895185191456, "grad_norm": 0.2988840937614441, "learning_rate": 9.432311307204024e-05, "loss": 1.4668, "step": 54940 }, { "epoch": 2.539231459684658, "grad_norm": 0.24603962898254395, "learning_rate": 9.431541009223316e-05, "loss": 1.4821, "step": 54950 }, { "epoch": 2.5397734008501702, "grad_norm": 0.1960807591676712, "learning_rate": 9.430770224220376e-05, "loss": 1.4664, "step": 54960 }, { "epoch": 2.5403153420156825, "grad_norm": 0.20046474039554596, "learning_rate": 9.429998952290688e-05, "loss": 1.4732, "step": 54970 }, { "epoch": 2.540857283181195, "grad_norm": 0.1977715790271759, "learning_rate": 9.429227193529791e-05, "loss": 1.4807, "step": 54980 }, { "epoch": 2.5410740596473995, "eval_loss": 2.445849657058716, "eval_runtime": 21.9883, "eval_samples_per_second": 227.394, "eval_steps_per_second": 1.228, "step": 54984 }, { "epoch": 2.5413992243467067, "grad_norm": 0.20431387424468994, "learning_rate": 9.42845494803329e-05, "loss": 1.4758, "step": 54990 }, { "epoch": 2.541941165512219, "grad_norm": 0.3124183416366577, "learning_rate": 9.427682215896846e-05, "loss": 1.4776, "step": 55000 }, { "epoch": 2.5424831066777314, "grad_norm": 0.17832711338996887, "learning_rate": 9.426908997216179e-05, "loss": 1.4732, "step": 55010 }, { "epoch": 2.5430250478432437, "grad_norm": 0.2197619527578354, "learning_rate": 9.426135292087076e-05, "loss": 1.4652, "step": 55020 }, { "epoch": 2.543566989008756, "grad_norm": 0.3124410808086395, "learning_rate": 9.425361100605378e-05, "loss": 1.4776, "step": 55030 }, { "epoch": 2.544108930174268, "grad_norm": 0.28143149614334106, "learning_rate": 9.424586422866989e-05, "loss": 1.4755, "step": 55040 }, { "epoch": 2.54465087133978, "grad_norm": 0.3505375385284424, "learning_rate": 9.423811258967872e-05, "loss": 1.4826, "step": 55050 }, { "epoch": 2.5451928125052925, "grad_norm": 0.30467939376831055, "learning_rate": 9.423035609004054e-05, "loss": 1.4683, "step": 55060 }, { "epoch": 2.5457347536708044, "grad_norm": 0.2334168255329132, "learning_rate": 9.422259473071615e-05, "loss": 1.4676, "step": 55070 }, { "epoch": 2.545788947787356, "eval_loss": 2.4590961933135986, "eval_runtime": 22.0238, "eval_samples_per_second": 227.027, "eval_steps_per_second": 1.226, "step": 55071 }, { "epoch": 2.5462766948363167, "grad_norm": 0.24799096584320068, "learning_rate": 9.421482851266702e-05, "loss": 1.4726, "step": 55080 }, { "epoch": 2.546818636001829, "grad_norm": 0.17815372347831726, "learning_rate": 9.420705743685519e-05, "loss": 1.4807, "step": 55090 }, { "epoch": 2.5473605771673413, "grad_norm": 0.1867280900478363, "learning_rate": 9.419928150424328e-05, "loss": 1.479, "step": 55100 }, { "epoch": 2.5479025183328536, "grad_norm": 0.18417419493198395, "learning_rate": 9.419150071579459e-05, "loss": 1.4774, "step": 55110 }, { "epoch": 2.5484444594983655, "grad_norm": 0.24251243472099304, "learning_rate": 9.418371507247294e-05, "loss": 1.4771, "step": 55120 }, { "epoch": 2.548986400663878, "grad_norm": 0.22498652338981628, "learning_rate": 9.417592457524278e-05, "loss": 1.4618, "step": 55130 }, { "epoch": 2.54952834182939, "grad_norm": 0.3402760624885559, "learning_rate": 9.416812922506917e-05, "loss": 1.4772, "step": 55140 }, { "epoch": 2.5500702829949025, "grad_norm": 0.22304527461528778, "learning_rate": 9.416032902291778e-05, "loss": 1.4924, "step": 55150 }, { "epoch": 2.5505038359273122, "eval_loss": 2.4431533813476562, "eval_runtime": 22.327, "eval_samples_per_second": 223.944, "eval_steps_per_second": 1.209, "step": 55158 }, { "epoch": 2.5506122241604148, "grad_norm": 0.17721644043922424, "learning_rate": 9.415252396975482e-05, "loss": 1.4695, "step": 55160 }, { "epoch": 2.5511541653259266, "grad_norm": 0.2341170608997345, "learning_rate": 9.414471406654718e-05, "loss": 1.4676, "step": 55170 }, { "epoch": 2.551696106491439, "grad_norm": 0.17687572538852692, "learning_rate": 9.413689931426232e-05, "loss": 1.4775, "step": 55180 }, { "epoch": 2.5522380476569513, "grad_norm": 0.3460327982902527, "learning_rate": 9.412907971386828e-05, "loss": 1.4701, "step": 55190 }, { "epoch": 2.5527799888224636, "grad_norm": 0.3097844123840332, "learning_rate": 9.412125526633373e-05, "loss": 1.4664, "step": 55200 }, { "epoch": 2.553321929987976, "grad_norm": 0.2532496452331543, "learning_rate": 9.411342597262794e-05, "loss": 1.4723, "step": 55210 }, { "epoch": 2.553863871153488, "grad_norm": 0.18894581496715546, "learning_rate": 9.410559183372072e-05, "loss": 1.4768, "step": 55220 }, { "epoch": 2.554405812319, "grad_norm": 0.3138901889324188, "learning_rate": 9.409775285058259e-05, "loss": 1.4703, "step": 55230 }, { "epoch": 2.5549477534845124, "grad_norm": 0.5384250283241272, "learning_rate": 9.408990902418458e-05, "loss": 1.4734, "step": 55240 }, { "epoch": 2.5552187240672684, "eval_loss": 2.4425833225250244, "eval_runtime": 22.3536, "eval_samples_per_second": 223.678, "eval_steps_per_second": 1.208, "step": 55245 }, { "epoch": 2.0001083882331026, "grad_norm": 0.246513232588768, "learning_rate": 9.408206035549835e-05, "loss": 1.4546, "step": 55250 }, { "epoch": 2.000650329398615, "grad_norm": 0.18351303040981293, "learning_rate": 9.407420684549616e-05, "loss": 1.4714, "step": 55260 }, { "epoch": 2.0011922705641267, "grad_norm": 0.38743826746940613, "learning_rate": 9.406634849515087e-05, "loss": 1.4758, "step": 55270 }, { "epoch": 2.001734211729639, "grad_norm": 0.2079382985830307, "learning_rate": 9.405848530543593e-05, "loss": 1.4662, "step": 55280 }, { "epoch": 2.0022761528951514, "grad_norm": 0.19639058411121368, "learning_rate": 9.40506172773254e-05, "loss": 1.4805, "step": 55290 }, { "epoch": 2.0028180940606637, "grad_norm": 0.19277872145175934, "learning_rate": 9.404274441179397e-05, "loss": 1.4692, "step": 55300 }, { "epoch": 2.0033600352261756, "grad_norm": 0.3094417452812195, "learning_rate": 9.403486670981685e-05, "loss": 1.4718, "step": 55310 }, { "epoch": 2.003901976391688, "grad_norm": 0.2434214949607849, "learning_rate": 9.402698417236991e-05, "loss": 1.4711, "step": 55320 }, { "epoch": 2.0044439175572, "grad_norm": 0.37863555550575256, "learning_rate": 9.401909680042962e-05, "loss": 1.4704, "step": 55330 }, { "epoch": 2.0045523057903027, "eval_loss": 2.450153112411499, "eval_runtime": 62.0654, "eval_samples_per_second": 80.56, "eval_steps_per_second": 0.435, "step": 55332 }, { "epoch": 2.0049858587227125, "grad_norm": 0.7027290463447571, "learning_rate": 9.401120459497302e-05, "loss": 1.4786, "step": 55340 }, { "epoch": 2.005527799888225, "grad_norm": 0.42648592591285706, "learning_rate": 9.400330755697774e-05, "loss": 1.4755, "step": 55350 }, { "epoch": 2.0060697410537367, "grad_norm": 0.22446554899215698, "learning_rate": 9.399540568742209e-05, "loss": 1.4791, "step": 55360 }, { "epoch": 2.006611682219249, "grad_norm": 0.2071634829044342, "learning_rate": 9.398749898728487e-05, "loss": 1.4712, "step": 55370 }, { "epoch": 2.0071536233847613, "grad_norm": 0.22032909095287323, "learning_rate": 9.397958745754554e-05, "loss": 1.4703, "step": 55380 }, { "epoch": 2.0076955645502736, "grad_norm": 0.24659113585948944, "learning_rate": 9.397167109918416e-05, "loss": 1.4654, "step": 55390 }, { "epoch": 2.0082375057157855, "grad_norm": 0.23570233583450317, "learning_rate": 9.396374991318135e-05, "loss": 1.4684, "step": 55400 }, { "epoch": 2.008779446881298, "grad_norm": 0.3380047380924225, "learning_rate": 9.395582390051838e-05, "loss": 1.4691, "step": 55410 }, { "epoch": 2.009267193930259, "eval_loss": 2.4444944858551025, "eval_runtime": 24.1261, "eval_samples_per_second": 207.245, "eval_steps_per_second": 1.119, "step": 55419 }, { "epoch": 2.00932138804681, "grad_norm": 0.2984738051891327, "learning_rate": 9.394789306217707e-05, "loss": 1.4741, "step": 55420 }, { "epoch": 2.0098633292123225, "grad_norm": 0.2363492250442505, "learning_rate": 9.393995739913985e-05, "loss": 1.474, "step": 55430 }, { "epoch": 2.010405270377835, "grad_norm": 0.18736572563648224, "learning_rate": 9.393201691238976e-05, "loss": 1.4761, "step": 55440 }, { "epoch": 2.0109472115433467, "grad_norm": 0.27000486850738525, "learning_rate": 9.392407160291045e-05, "loss": 1.4624, "step": 55450 }, { "epoch": 2.011489152708859, "grad_norm": 0.26010996103286743, "learning_rate": 9.391612147168617e-05, "loss": 1.4704, "step": 55460 }, { "epoch": 2.0120310938743713, "grad_norm": 0.2281525433063507, "learning_rate": 9.390816651970168e-05, "loss": 1.465, "step": 55470 }, { "epoch": 2.0125730350398836, "grad_norm": 0.20982374250888824, "learning_rate": 9.390020674794245e-05, "loss": 1.4664, "step": 55480 }, { "epoch": 2.013114976205396, "grad_norm": 0.20972926914691925, "learning_rate": 9.389224215739452e-05, "loss": 1.4597, "step": 55490 }, { "epoch": 2.013656917370908, "grad_norm": 0.21081332862377167, "learning_rate": 9.388427274904447e-05, "loss": 1.4585, "step": 55500 }, { "epoch": 2.0139820820702155, "eval_loss": 2.451312780380249, "eval_runtime": 21.9802, "eval_samples_per_second": 227.477, "eval_steps_per_second": 1.228, "step": 55506 }, { "epoch": 2.01419885853642, "grad_norm": 0.22025637328624725, "learning_rate": 9.387629852387952e-05, "loss": 1.4756, "step": 55510 }, { "epoch": 2.0147407997019324, "grad_norm": 0.18817798793315887, "learning_rate": 9.38683194828875e-05, "loss": 1.4721, "step": 55520 }, { "epoch": 2.0152827408674447, "grad_norm": 0.30482780933380127, "learning_rate": 9.386033562705681e-05, "loss": 1.4679, "step": 55530 }, { "epoch": 2.0158246820329566, "grad_norm": 0.20792457461357117, "learning_rate": 9.385234695737647e-05, "loss": 1.4554, "step": 55540 }, { "epoch": 2.016366623198469, "grad_norm": 0.39416491985321045, "learning_rate": 9.384435347483606e-05, "loss": 1.4665, "step": 55550 }, { "epoch": 2.0169085643639812, "grad_norm": 0.29534631967544556, "learning_rate": 9.383635518042579e-05, "loss": 1.4604, "step": 55560 }, { "epoch": 2.0174505055294936, "grad_norm": 0.20709377527236938, "learning_rate": 9.382835207513644e-05, "loss": 1.4783, "step": 55570 }, { "epoch": 2.017992446695006, "grad_norm": 0.1838233321905136, "learning_rate": 9.382034415995942e-05, "loss": 1.4729, "step": 55580 }, { "epoch": 2.0185343878605178, "grad_norm": 0.20121820271015167, "learning_rate": 9.381233143588671e-05, "loss": 1.469, "step": 55590 }, { "epoch": 2.0186969702101716, "eval_loss": 2.4466960430145264, "eval_runtime": 22.0147, "eval_samples_per_second": 227.121, "eval_steps_per_second": 1.226, "step": 55593 }, { "epoch": 2.01907632902603, "grad_norm": 0.24472562968730927, "learning_rate": 9.380431390391089e-05, "loss": 1.4665, "step": 55600 }, { "epoch": 2.0196182701915424, "grad_norm": 0.2527831196784973, "learning_rate": 9.379629156502513e-05, "loss": 1.4773, "step": 55610 }, { "epoch": 2.0201602113570547, "grad_norm": 0.23660998046398163, "learning_rate": 9.37882644202232e-05, "loss": 1.4686, "step": 55620 }, { "epoch": 2.0207021525225666, "grad_norm": 0.2622048258781433, "learning_rate": 9.378023247049949e-05, "loss": 1.4686, "step": 55630 }, { "epoch": 2.021244093688079, "grad_norm": 0.1906578242778778, "learning_rate": 9.377219571684895e-05, "loss": 1.4706, "step": 55640 }, { "epoch": 2.021786034853591, "grad_norm": 0.2571401000022888, "learning_rate": 9.376415416026712e-05, "loss": 1.4626, "step": 55650 }, { "epoch": 2.0223279760191035, "grad_norm": 0.37901771068573, "learning_rate": 9.375610780175017e-05, "loss": 1.4766, "step": 55660 }, { "epoch": 2.022869917184616, "grad_norm": 0.23881062865257263, "learning_rate": 9.374805664229484e-05, "loss": 1.4653, "step": 55670 }, { "epoch": 2.0234118583501277, "grad_norm": 0.5407462120056152, "learning_rate": 9.37400006828985e-05, "loss": 1.4641, "step": 55680 }, { "epoch": 2.0234118583501277, "eval_loss": 2.4462928771972656, "eval_runtime": 21.8259, "eval_samples_per_second": 229.086, "eval_steps_per_second": 1.237, "step": 55680 }, { "epoch": 2.02395379951564, "grad_norm": 0.24240484833717346, "learning_rate": 9.373193992455907e-05, "loss": 1.4575, "step": 55690 }, { "epoch": 2.0244957406811523, "grad_norm": 0.3244767487049103, "learning_rate": 9.372387436827507e-05, "loss": 1.4706, "step": 55700 }, { "epoch": 2.0250376818466647, "grad_norm": 0.22140353918075562, "learning_rate": 9.371580401504564e-05, "loss": 1.4656, "step": 55710 }, { "epoch": 2.0255796230121765, "grad_norm": 0.21475926041603088, "learning_rate": 9.370772886587049e-05, "loss": 1.4706, "step": 55720 }, { "epoch": 2.026121564177689, "grad_norm": 0.2393941879272461, "learning_rate": 9.369964892174995e-05, "loss": 1.4622, "step": 55730 }, { "epoch": 2.026663505343201, "grad_norm": 0.23679640889167786, "learning_rate": 9.369156418368491e-05, "loss": 1.4604, "step": 55740 }, { "epoch": 2.0272054465087135, "grad_norm": 0.2113851010799408, "learning_rate": 9.368347465267688e-05, "loss": 1.4713, "step": 55750 }, { "epoch": 2.027747387674226, "grad_norm": 0.25252246856689453, "learning_rate": 9.367538032972797e-05, "loss": 1.4595, "step": 55760 }, { "epoch": 2.0281267464900843, "eval_loss": 2.453479051589966, "eval_runtime": 21.9786, "eval_samples_per_second": 227.494, "eval_steps_per_second": 1.228, "step": 55767 }, { "epoch": 2.0282893288397377, "grad_norm": 0.23864135146141052, "learning_rate": 9.366728121584084e-05, "loss": 1.4588, "step": 55770 }, { "epoch": 2.02883127000525, "grad_norm": 0.24793609976768494, "learning_rate": 9.365917731201879e-05, "loss": 1.4593, "step": 55780 }, { "epoch": 2.0293732111707623, "grad_norm": 0.21740341186523438, "learning_rate": 9.365106861926571e-05, "loss": 1.4745, "step": 55790 }, { "epoch": 2.0299151523362746, "grad_norm": 0.19946753978729248, "learning_rate": 9.364295513858604e-05, "loss": 1.4723, "step": 55800 }, { "epoch": 2.030457093501787, "grad_norm": 0.2608671486377716, "learning_rate": 9.363483687098487e-05, "loss": 1.4651, "step": 55810 }, { "epoch": 2.030999034667299, "grad_norm": 0.2773636281490326, "learning_rate": 9.362671381746784e-05, "loss": 1.4797, "step": 55820 }, { "epoch": 2.031540975832811, "grad_norm": 0.3309532105922699, "learning_rate": 9.361858597904119e-05, "loss": 1.4591, "step": 55830 }, { "epoch": 2.0320829169983234, "grad_norm": 0.343264102935791, "learning_rate": 9.36104533567118e-05, "loss": 1.4657, "step": 55840 }, { "epoch": 2.0326248581638358, "grad_norm": 0.21975140273571014, "learning_rate": 9.360231595148708e-05, "loss": 1.4684, "step": 55850 }, { "epoch": 2.0328416346300404, "eval_loss": 2.453883409500122, "eval_runtime": 21.9811, "eval_samples_per_second": 227.468, "eval_steps_per_second": 1.228, "step": 55854 }, { "epoch": 2.0331667993293476, "grad_norm": 0.2824975848197937, "learning_rate": 9.359417376437503e-05, "loss": 1.4678, "step": 55860 }, { "epoch": 2.03370874049486, "grad_norm": 0.41226598620414734, "learning_rate": 9.35860267963843e-05, "loss": 1.4693, "step": 55870 }, { "epoch": 2.0342506816603723, "grad_norm": 0.37096548080444336, "learning_rate": 9.35778750485241e-05, "loss": 1.461, "step": 55880 }, { "epoch": 2.0347926228258846, "grad_norm": 0.3635852038860321, "learning_rate": 9.356971852180421e-05, "loss": 1.4598, "step": 55890 }, { "epoch": 2.035334563991397, "grad_norm": 0.32016879320144653, "learning_rate": 9.356155721723506e-05, "loss": 1.4612, "step": 55900 }, { "epoch": 2.0358765051569088, "grad_norm": 0.23647364974021912, "learning_rate": 9.355339113582761e-05, "loss": 1.4744, "step": 55910 }, { "epoch": 2.036418446322421, "grad_norm": 0.40886321663856506, "learning_rate": 9.354522027859347e-05, "loss": 1.4683, "step": 55920 }, { "epoch": 2.0369603874879334, "grad_norm": 0.3399202525615692, "learning_rate": 9.353704464654477e-05, "loss": 1.4559, "step": 55930 }, { "epoch": 2.0375023286534457, "grad_norm": 0.2759416401386261, "learning_rate": 9.35288642406943e-05, "loss": 1.4592, "step": 55940 }, { "epoch": 2.037556522769997, "eval_loss": 2.454275608062744, "eval_runtime": 21.9844, "eval_samples_per_second": 227.434, "eval_steps_per_second": 1.228, "step": 55941 }, { "epoch": 2.0380442698189576, "grad_norm": 0.193961039185524, "learning_rate": 9.352067906205538e-05, "loss": 1.4633, "step": 55950 }, { "epoch": 2.03858621098447, "grad_norm": 0.23220661282539368, "learning_rate": 9.3512489111642e-05, "loss": 1.4639, "step": 55960 }, { "epoch": 2.039128152149982, "grad_norm": 0.24919536709785461, "learning_rate": 9.350429439046867e-05, "loss": 1.4727, "step": 55970 }, { "epoch": 2.0396700933154945, "grad_norm": 0.3676759898662567, "learning_rate": 9.349609489955053e-05, "loss": 1.477, "step": 55980 }, { "epoch": 2.040212034481007, "grad_norm": 0.5084336996078491, "learning_rate": 9.348789063990328e-05, "loss": 1.4609, "step": 55990 }, { "epoch": 2.0407539756465187, "grad_norm": 0.33119305968284607, "learning_rate": 9.347968161254321e-05, "loss": 1.4632, "step": 56000 }, { "epoch": 2.041295916812031, "grad_norm": 0.23832644522190094, "learning_rate": 9.347146781848726e-05, "loss": 1.4682, "step": 56010 }, { "epoch": 2.0418378579775434, "grad_norm": 0.2528223395347595, "learning_rate": 9.346324925875293e-05, "loss": 1.4668, "step": 56020 }, { "epoch": 2.042271410909953, "eval_loss": 2.461331844329834, "eval_runtime": 21.9887, "eval_samples_per_second": 227.39, "eval_steps_per_second": 1.228, "step": 56028 }, { "epoch": 2.0423797991430557, "grad_norm": 0.30627909302711487, "learning_rate": 9.345502593435824e-05, "loss": 1.4561, "step": 56030 }, { "epoch": 2.0429217403085675, "grad_norm": 0.275127112865448, "learning_rate": 9.34467978463219e-05, "loss": 1.4664, "step": 56040 }, { "epoch": 2.04346368147408, "grad_norm": 0.25963136553764343, "learning_rate": 9.343856499566315e-05, "loss": 1.471, "step": 56050 }, { "epoch": 2.044005622639592, "grad_norm": 0.18673470616340637, "learning_rate": 9.343032738340187e-05, "loss": 1.4709, "step": 56060 }, { "epoch": 2.0445475638051045, "grad_norm": 0.23496182262897491, "learning_rate": 9.342208501055849e-05, "loss": 1.4652, "step": 56070 }, { "epoch": 2.045089504970617, "grad_norm": 0.23610053956508636, "learning_rate": 9.341383787815402e-05, "loss": 1.4712, "step": 56080 }, { "epoch": 2.0456314461361287, "grad_norm": 0.2762158215045929, "learning_rate": 9.340558598721008e-05, "loss": 1.4671, "step": 56090 }, { "epoch": 2.046173387301641, "grad_norm": 0.23105528950691223, "learning_rate": 9.339732933874891e-05, "loss": 1.4625, "step": 56100 }, { "epoch": 2.0467153284671533, "grad_norm": 0.4698254466056824, "learning_rate": 9.338906793379327e-05, "loss": 1.4586, "step": 56110 }, { "epoch": 2.0469862990499093, "eval_loss": 2.449103832244873, "eval_runtime": 22.0267, "eval_samples_per_second": 226.997, "eval_steps_per_second": 1.226, "step": 56115 }, { "epoch": 2.0472572696326656, "grad_norm": 0.311443954706192, "learning_rate": 9.338080177336656e-05, "loss": 1.4692, "step": 56120 }, { "epoch": 2.0477992107981775, "grad_norm": 0.3137112557888031, "learning_rate": 9.337253085849276e-05, "loss": 1.4666, "step": 56130 }, { "epoch": 2.04834115196369, "grad_norm": 0.27474987506866455, "learning_rate": 9.336425519019644e-05, "loss": 1.4654, "step": 56140 }, { "epoch": 2.048883093129202, "grad_norm": 0.21653681993484497, "learning_rate": 9.335597476950275e-05, "loss": 1.4556, "step": 56150 }, { "epoch": 2.0494250342947145, "grad_norm": 0.21602825820446014, "learning_rate": 9.334768959743742e-05, "loss": 1.4701, "step": 56160 }, { "epoch": 2.0499669754602268, "grad_norm": 0.2874417006969452, "learning_rate": 9.333939967502681e-05, "loss": 1.4702, "step": 56170 }, { "epoch": 2.0505089166257386, "grad_norm": 0.3574830889701843, "learning_rate": 9.333110500329781e-05, "loss": 1.4587, "step": 56180 }, { "epoch": 2.051050857791251, "grad_norm": 0.33918556571006775, "learning_rate": 9.332280558327795e-05, "loss": 1.4667, "step": 56190 }, { "epoch": 2.0515927989567633, "grad_norm": 0.25858092308044434, "learning_rate": 9.33145014159953e-05, "loss": 1.4654, "step": 56200 }, { "epoch": 2.051701187189866, "eval_loss": 2.4660325050354004, "eval_runtime": 21.9813, "eval_samples_per_second": 227.466, "eval_steps_per_second": 1.228, "step": 56202 }, { "epoch": 2.0521347401222756, "grad_norm": 0.24299031496047974, "learning_rate": 9.330619250247855e-05, "loss": 1.4677, "step": 56210 }, { "epoch": 2.052676681287788, "grad_norm": 0.2615341246128082, "learning_rate": 9.3297878843757e-05, "loss": 1.4633, "step": 56220 }, { "epoch": 2.0532186224533, "grad_norm": 0.34123480319976807, "learning_rate": 9.328956044086049e-05, "loss": 1.4689, "step": 56230 }, { "epoch": 2.053760563618812, "grad_norm": 0.3530578315258026, "learning_rate": 9.328123729481947e-05, "loss": 1.4615, "step": 56240 }, { "epoch": 2.0543025047843244, "grad_norm": 0.18291862308979034, "learning_rate": 9.327290940666497e-05, "loss": 1.4621, "step": 56250 }, { "epoch": 2.0548444459498367, "grad_norm": 0.19218549132347107, "learning_rate": 9.326457677742861e-05, "loss": 1.4736, "step": 56260 }, { "epoch": 2.0553863871153486, "grad_norm": 0.26161691546440125, "learning_rate": 9.325623940814263e-05, "loss": 1.4674, "step": 56270 }, { "epoch": 2.055928328280861, "grad_norm": 0.2795359194278717, "learning_rate": 9.324789729983979e-05, "loss": 1.4581, "step": 56280 }, { "epoch": 2.056416075329822, "eval_loss": 2.465681314468384, "eval_runtime": 21.9832, "eval_samples_per_second": 227.447, "eval_steps_per_second": 1.228, "step": 56289 }, { "epoch": 2.0564702694463732, "grad_norm": 0.19691872596740723, "learning_rate": 9.323955045355349e-05, "loss": 1.4692, "step": 56290 }, { "epoch": 2.0570122106118855, "grad_norm": 0.33033034205436707, "learning_rate": 9.323119887031769e-05, "loss": 1.4676, "step": 56300 }, { "epoch": 2.057554151777398, "grad_norm": 0.2493034452199936, "learning_rate": 9.322284255116696e-05, "loss": 1.4552, "step": 56310 }, { "epoch": 2.0580960929429097, "grad_norm": 0.3580038249492645, "learning_rate": 9.321448149713645e-05, "loss": 1.4593, "step": 56320 }, { "epoch": 2.058638034108422, "grad_norm": 0.2858411967754364, "learning_rate": 9.320611570926189e-05, "loss": 1.4693, "step": 56330 }, { "epoch": 2.0591799752739344, "grad_norm": 0.35059741139411926, "learning_rate": 9.319774518857958e-05, "loss": 1.468, "step": 56340 }, { "epoch": 2.0597219164394467, "grad_norm": 0.30402854084968567, "learning_rate": 9.318936993612643e-05, "loss": 1.4626, "step": 56350 }, { "epoch": 2.0602638576049586, "grad_norm": 0.33084359765052795, "learning_rate": 9.318098995293993e-05, "loss": 1.4748, "step": 56360 }, { "epoch": 2.060805798770471, "grad_norm": 0.21346881985664368, "learning_rate": 9.317260524005817e-05, "loss": 1.4647, "step": 56370 }, { "epoch": 2.061130963469778, "eval_loss": 2.4557716846466064, "eval_runtime": 21.9864, "eval_samples_per_second": 227.413, "eval_steps_per_second": 1.228, "step": 56376 }, { "epoch": 2.061347739935983, "grad_norm": 0.32445028424263, "learning_rate": 9.31642157985198e-05, "loss": 1.4523, "step": 56380 }, { "epoch": 2.0618896811014955, "grad_norm": 0.2711530029773712, "learning_rate": 9.315582162936407e-05, "loss": 1.4613, "step": 56390 }, { "epoch": 2.062431622267008, "grad_norm": 0.23968590795993805, "learning_rate": 9.314742273363082e-05, "loss": 1.4578, "step": 56400 }, { "epoch": 2.0629735634325197, "grad_norm": 0.29744964838027954, "learning_rate": 9.313901911236046e-05, "loss": 1.4645, "step": 56410 }, { "epoch": 2.063515504598032, "grad_norm": 0.23724764585494995, "learning_rate": 9.313061076659398e-05, "loss": 1.4636, "step": 56420 }, { "epoch": 2.0640574457635443, "grad_norm": 0.18018679320812225, "learning_rate": 9.312219769737299e-05, "loss": 1.4586, "step": 56430 }, { "epoch": 2.0645993869290566, "grad_norm": 0.25777846574783325, "learning_rate": 9.311377990573967e-05, "loss": 1.464, "step": 56440 }, { "epoch": 2.065141328094569, "grad_norm": 0.2517983317375183, "learning_rate": 9.310535739273675e-05, "loss": 1.4611, "step": 56450 }, { "epoch": 2.065683269260081, "grad_norm": 0.32330676913261414, "learning_rate": 9.30969301594076e-05, "loss": 1.4483, "step": 56460 }, { "epoch": 2.0658458516097347, "eval_loss": 2.4468648433685303, "eval_runtime": 23.3942, "eval_samples_per_second": 213.728, "eval_steps_per_second": 1.154, "step": 56463 }, { "epoch": 2.066225210425593, "grad_norm": 0.3357566297054291, "learning_rate": 9.308849820679614e-05, "loss": 1.4712, "step": 56470 }, { "epoch": 2.0667671515911055, "grad_norm": 0.18558350205421448, "learning_rate": 9.308006153594692e-05, "loss": 1.4578, "step": 56480 }, { "epoch": 2.067309092756618, "grad_norm": 0.324517160654068, "learning_rate": 9.307162014790496e-05, "loss": 1.457, "step": 56490 }, { "epoch": 2.0678510339221297, "grad_norm": 0.2188093215227127, "learning_rate": 9.3063174043716e-05, "loss": 1.4654, "step": 56500 }, { "epoch": 2.068392975087642, "grad_norm": 0.19635216891765594, "learning_rate": 9.30547232244263e-05, "loss": 1.4495, "step": 56510 }, { "epoch": 2.0689349162531543, "grad_norm": 0.18253515660762787, "learning_rate": 9.304626769108271e-05, "loss": 1.4619, "step": 56520 }, { "epoch": 2.0694768574186666, "grad_norm": 0.19728650152683258, "learning_rate": 9.303780744473265e-05, "loss": 1.4629, "step": 56530 }, { "epoch": 2.0700187985841785, "grad_norm": 0.27253544330596924, "learning_rate": 9.302934248642414e-05, "loss": 1.4604, "step": 56540 }, { "epoch": 2.070560739749691, "grad_norm": 0.22469353675842285, "learning_rate": 9.30208728172058e-05, "loss": 1.4624, "step": 56550 }, { "epoch": 2.070560739749691, "eval_loss": 2.449016809463501, "eval_runtime": 21.9605, "eval_samples_per_second": 227.682, "eval_steps_per_second": 1.229, "step": 56550 }, { "epoch": 2.071102680915203, "grad_norm": 0.4223212003707886, "learning_rate": 9.301239843812681e-05, "loss": 1.4569, "step": 56560 }, { "epoch": 2.0716446220807154, "grad_norm": 0.1832091063261032, "learning_rate": 9.30039193502369e-05, "loss": 1.4669, "step": 56570 }, { "epoch": 2.0721865632462277, "grad_norm": 0.19555889070034027, "learning_rate": 9.29954355545865e-05, "loss": 1.4459, "step": 56580 }, { "epoch": 2.0727285044117396, "grad_norm": 0.25337639451026917, "learning_rate": 9.29869470522265e-05, "loss": 1.4591, "step": 56590 }, { "epoch": 2.073270445577252, "grad_norm": 0.2375129759311676, "learning_rate": 9.29784538442084e-05, "loss": 1.4576, "step": 56600 }, { "epoch": 2.0738123867427642, "grad_norm": 0.3068578243255615, "learning_rate": 9.296995593158433e-05, "loss": 1.4535, "step": 56610 }, { "epoch": 2.0743543279082766, "grad_norm": 0.37019553780555725, "learning_rate": 9.296145331540696e-05, "loss": 1.4583, "step": 56620 }, { "epoch": 2.074896269073789, "grad_norm": 0.26135745644569397, "learning_rate": 9.295294599672959e-05, "loss": 1.463, "step": 56630 }, { "epoch": 2.0752756278896474, "eval_loss": 2.45114803314209, "eval_runtime": 21.9951, "eval_samples_per_second": 227.323, "eval_steps_per_second": 1.228, "step": 56637 }, { "epoch": 2.0754382102393008, "grad_norm": 0.23497585952281952, "learning_rate": 9.294443397660603e-05, "loss": 1.4568, "step": 56640 }, { "epoch": 2.075980151404813, "grad_norm": 0.20370684564113617, "learning_rate": 9.293591725609072e-05, "loss": 1.4567, "step": 56650 }, { "epoch": 2.0765220925703254, "grad_norm": 0.2905084490776062, "learning_rate": 9.292739583623869e-05, "loss": 1.4572, "step": 56660 }, { "epoch": 2.0770640337358377, "grad_norm": 0.17985448241233826, "learning_rate": 9.291886971810554e-05, "loss": 1.465, "step": 56670 }, { "epoch": 2.0776059749013496, "grad_norm": 0.2356506884098053, "learning_rate": 9.291033890274743e-05, "loss": 1.4714, "step": 56680 }, { "epoch": 2.078147916066862, "grad_norm": 0.22777162492275238, "learning_rate": 9.290180339122111e-05, "loss": 1.458, "step": 56690 }, { "epoch": 2.078689857232374, "grad_norm": 0.17766469717025757, "learning_rate": 9.289326318458396e-05, "loss": 1.466, "step": 56700 }, { "epoch": 2.0792317983978865, "grad_norm": 0.18408410251140594, "learning_rate": 9.288471828389387e-05, "loss": 1.4562, "step": 56710 }, { "epoch": 2.079773739563399, "grad_norm": 0.2577512264251709, "learning_rate": 9.287616869020935e-05, "loss": 1.4582, "step": 56720 }, { "epoch": 2.0799905160296035, "eval_loss": 2.459651231765747, "eval_runtime": 21.9706, "eval_samples_per_second": 227.577, "eval_steps_per_second": 1.229, "step": 56724 }, { "epoch": 2.0803156807289107, "grad_norm": 0.19766086339950562, "learning_rate": 9.286761440458952e-05, "loss": 1.4558, "step": 56730 }, { "epoch": 2.080857621894423, "grad_norm": 0.33140110969543457, "learning_rate": 9.285905542809401e-05, "loss": 1.4656, "step": 56740 }, { "epoch": 2.0813995630599353, "grad_norm": 0.26236221194267273, "learning_rate": 9.285049176178308e-05, "loss": 1.4667, "step": 56750 }, { "epoch": 2.0819415042254477, "grad_norm": 0.39471328258514404, "learning_rate": 9.284192340671756e-05, "loss": 1.4631, "step": 56760 }, { "epoch": 2.0824834453909595, "grad_norm": 0.19031473994255066, "learning_rate": 9.283335036395886e-05, "loss": 1.4657, "step": 56770 }, { "epoch": 2.083025386556472, "grad_norm": 0.27123594284057617, "learning_rate": 9.282477263456899e-05, "loss": 1.4586, "step": 56780 }, { "epoch": 2.083567327721984, "grad_norm": 0.4629122018814087, "learning_rate": 9.281619021961049e-05, "loss": 1.4637, "step": 56790 }, { "epoch": 2.0841092688874965, "grad_norm": 0.2233474850654602, "learning_rate": 9.280760312014651e-05, "loss": 1.4612, "step": 56800 }, { "epoch": 2.084651210053009, "grad_norm": 0.25030142068862915, "learning_rate": 9.279901133724082e-05, "loss": 1.446, "step": 56810 }, { "epoch": 2.0847054041695596, "eval_loss": 2.456845998764038, "eval_runtime": 21.9864, "eval_samples_per_second": 227.413, "eval_steps_per_second": 1.228, "step": 56811 }, { "epoch": 2.0851931512185207, "grad_norm": 0.2584102153778076, "learning_rate": 9.279041487195772e-05, "loss": 1.468, "step": 56820 }, { "epoch": 2.085735092384033, "grad_norm": 0.2037636637687683, "learning_rate": 9.27818137253621e-05, "loss": 1.457, "step": 56830 }, { "epoch": 2.0862770335495453, "grad_norm": 0.2464050054550171, "learning_rate": 9.27732078985194e-05, "loss": 1.4455, "step": 56840 }, { "epoch": 2.0868189747150576, "grad_norm": 0.24278301000595093, "learning_rate": 9.27645973924957e-05, "loss": 1.4556, "step": 56850 }, { "epoch": 2.08736091588057, "grad_norm": 0.2788692116737366, "learning_rate": 9.275598220835765e-05, "loss": 1.4473, "step": 56860 }, { "epoch": 2.087902857046082, "grad_norm": 0.21799428761005402, "learning_rate": 9.274736234717243e-05, "loss": 1.4548, "step": 56870 }, { "epoch": 2.088444798211594, "grad_norm": 0.2989203929901123, "learning_rate": 9.273873781000786e-05, "loss": 1.4623, "step": 56880 }, { "epoch": 2.0889867393771064, "grad_norm": 0.21010935306549072, "learning_rate": 9.273010859793231e-05, "loss": 1.456, "step": 56890 }, { "epoch": 2.089420292309516, "eval_loss": 2.455754041671753, "eval_runtime": 21.9865, "eval_samples_per_second": 227.412, "eval_steps_per_second": 1.228, "step": 56898 }, { "epoch": 2.0895286805426188, "grad_norm": 0.18887412548065186, "learning_rate": 9.272147471201467e-05, "loss": 1.456, "step": 56900 }, { "epoch": 2.0900706217081306, "grad_norm": 0.26745525002479553, "learning_rate": 9.271283615332455e-05, "loss": 1.4589, "step": 56910 }, { "epoch": 2.090612562873643, "grad_norm": 0.18780305981636047, "learning_rate": 9.270419292293201e-05, "loss": 1.4585, "step": 56920 }, { "epoch": 2.0911545040391553, "grad_norm": 0.21684403717517853, "learning_rate": 9.269554502190775e-05, "loss": 1.4464, "step": 56930 }, { "epoch": 2.0916964452046676, "grad_norm": 0.4031422436237335, "learning_rate": 9.268689245132302e-05, "loss": 1.4465, "step": 56940 }, { "epoch": 2.09223838637018, "grad_norm": 0.31991732120513916, "learning_rate": 9.267823521224967e-05, "loss": 1.4656, "step": 56950 }, { "epoch": 2.0927803275356918, "grad_norm": 0.29410219192504883, "learning_rate": 9.266957330576015e-05, "loss": 1.452, "step": 56960 }, { "epoch": 2.093322268701204, "grad_norm": 0.3743274211883545, "learning_rate": 9.26609067329274e-05, "loss": 1.4552, "step": 56970 }, { "epoch": 2.0938642098667164, "grad_norm": 0.5883728265762329, "learning_rate": 9.265223549482505e-05, "loss": 1.4578, "step": 56980 }, { "epoch": 2.0941351804494723, "eval_loss": 2.460325002670288, "eval_runtime": 22.2713, "eval_samples_per_second": 224.505, "eval_steps_per_second": 1.212, "step": 56985 }, { "epoch": 2.0944061510322287, "grad_norm": 0.3245681822299957, "learning_rate": 9.264355959252722e-05, "loss": 1.4606, "step": 56990 }, { "epoch": 2.0949480921977406, "grad_norm": 0.255845844745636, "learning_rate": 9.263487902710869e-05, "loss": 1.4688, "step": 57000 }, { "epoch": 2.095490033363253, "grad_norm": 0.17143049836158752, "learning_rate": 9.262619379964474e-05, "loss": 1.4586, "step": 57010 }, { "epoch": 2.096031974528765, "grad_norm": 0.2582264244556427, "learning_rate": 9.261750391121122e-05, "loss": 1.4485, "step": 57020 }, { "epoch": 2.0965739156942775, "grad_norm": 0.3058115541934967, "learning_rate": 9.260880936288466e-05, "loss": 1.4608, "step": 57030 }, { "epoch": 2.09711585685979, "grad_norm": 0.3971554934978485, "learning_rate": 9.260011015574207e-05, "loss": 1.4587, "step": 57040 }, { "epoch": 2.0976577980253017, "grad_norm": 0.21815316379070282, "learning_rate": 9.25914062908611e-05, "loss": 1.4634, "step": 57050 }, { "epoch": 2.098199739190814, "grad_norm": 0.18554462492465973, "learning_rate": 9.258269776931989e-05, "loss": 1.4547, "step": 57060 }, { "epoch": 2.0987416803563264, "grad_norm": 0.2115841954946518, "learning_rate": 9.257398459219727e-05, "loss": 1.4534, "step": 57070 }, { "epoch": 2.098850068589429, "eval_loss": 2.4590704441070557, "eval_runtime": 21.9818, "eval_samples_per_second": 227.461, "eval_steps_per_second": 1.228, "step": 57072 }, { "epoch": 2.0992836215218387, "grad_norm": 0.3519175350666046, "learning_rate": 9.256526676057257e-05, "loss": 1.4481, "step": 57080 }, { "epoch": 2.0998255626873505, "grad_norm": 0.3369831144809723, "learning_rate": 9.25565442755257e-05, "loss": 1.4422, "step": 57090 }, { "epoch": 2.100367503852863, "grad_norm": 0.3334982395172119, "learning_rate": 9.254781713813719e-05, "loss": 1.444, "step": 57100 }, { "epoch": 2.100909445018375, "grad_norm": 0.3039768636226654, "learning_rate": 9.253908534948811e-05, "loss": 1.4585, "step": 57110 }, { "epoch": 2.1014513861838875, "grad_norm": 0.3292928636074066, "learning_rate": 9.253034891066011e-05, "loss": 1.4428, "step": 57120 }, { "epoch": 2.1019933273494, "grad_norm": 0.2547774910926819, "learning_rate": 9.252160782273544e-05, "loss": 1.4511, "step": 57130 }, { "epoch": 2.1025352685149117, "grad_norm": 0.19933480024337769, "learning_rate": 9.251286208679688e-05, "loss": 1.456, "step": 57140 }, { "epoch": 2.103077209680424, "grad_norm": 0.20023803412914276, "learning_rate": 9.250411170392783e-05, "loss": 1.4615, "step": 57150 }, { "epoch": 2.103564956729385, "eval_loss": 2.44368839263916, "eval_runtime": 21.9837, "eval_samples_per_second": 227.441, "eval_steps_per_second": 1.228, "step": 57159 }, { "epoch": 2.1036191508459363, "grad_norm": 0.3000018894672394, "learning_rate": 9.249535667521225e-05, "loss": 1.4423, "step": 57160 }, { "epoch": 2.1041610920114486, "grad_norm": 0.18277384340763092, "learning_rate": 9.248659700173469e-05, "loss": 1.4596, "step": 57170 }, { "epoch": 2.1047030331769605, "grad_norm": 0.33772820234298706, "learning_rate": 9.247783268458023e-05, "loss": 1.4496, "step": 57180 }, { "epoch": 2.105244974342473, "grad_norm": 0.2701638340950012, "learning_rate": 9.246906372483456e-05, "loss": 1.4604, "step": 57190 }, { "epoch": 2.105786915507985, "grad_norm": 0.33614516258239746, "learning_rate": 9.246029012358398e-05, "loss": 1.4474, "step": 57200 }, { "epoch": 2.1063288566734975, "grad_norm": 0.18986083567142487, "learning_rate": 9.245151188191526e-05, "loss": 1.448, "step": 57210 }, { "epoch": 2.1068707978390098, "grad_norm": 0.19249024987220764, "learning_rate": 9.244272900091586e-05, "loss": 1.4549, "step": 57220 }, { "epoch": 2.1074127390045216, "grad_norm": 0.21236182749271393, "learning_rate": 9.243394148167376e-05, "loss": 1.4536, "step": 57230 }, { "epoch": 2.107954680170034, "grad_norm": 0.37698227167129517, "learning_rate": 9.242514932527751e-05, "loss": 1.4607, "step": 57240 }, { "epoch": 2.108279844869341, "eval_loss": 2.428705930709839, "eval_runtime": 21.9743, "eval_samples_per_second": 227.539, "eval_steps_per_second": 1.229, "step": 57246 }, { "epoch": 2.1084966213355463, "grad_norm": 0.26461127400398254, "learning_rate": 9.241635253281624e-05, "loss": 1.4558, "step": 57250 }, { "epoch": 2.1090385625010586, "grad_norm": 0.20196294784545898, "learning_rate": 9.240755110537966e-05, "loss": 1.4561, "step": 57260 }, { "epoch": 2.109580503666571, "grad_norm": 0.40879595279693604, "learning_rate": 9.239874504405806e-05, "loss": 1.4488, "step": 57270 }, { "epoch": 2.1101224448320828, "grad_norm": 0.2242775410413742, "learning_rate": 9.238993434994229e-05, "loss": 1.4513, "step": 57280 }, { "epoch": 2.110664385997595, "grad_norm": 0.25892990827560425, "learning_rate": 9.238111902412379e-05, "loss": 1.4625, "step": 57290 }, { "epoch": 2.1112063271631074, "grad_norm": 0.1938840001821518, "learning_rate": 9.237229906769456e-05, "loss": 1.4614, "step": 57300 }, { "epoch": 2.1117482683286197, "grad_norm": 0.2417358160018921, "learning_rate": 9.236347448174717e-05, "loss": 1.4616, "step": 57310 }, { "epoch": 2.1122902094941316, "grad_norm": 0.26493963599205017, "learning_rate": 9.235464526737477e-05, "loss": 1.4459, "step": 57320 }, { "epoch": 2.112832150659644, "grad_norm": 0.19729004800319672, "learning_rate": 9.234581142567112e-05, "loss": 1.4565, "step": 57330 }, { "epoch": 2.1129947330092977, "eval_loss": 2.458986520767212, "eval_runtime": 21.9806, "eval_samples_per_second": 227.473, "eval_steps_per_second": 1.228, "step": 57333 }, { "epoch": 2.1133740918251562, "grad_norm": 0.2965938448905945, "learning_rate": 9.233697295773046e-05, "loss": 1.4564, "step": 57340 }, { "epoch": 2.1139160329906685, "grad_norm": 0.281646728515625, "learning_rate": 9.23281298646477e-05, "loss": 1.4464, "step": 57350 }, { "epoch": 2.114457974156181, "grad_norm": 0.2671820819377899, "learning_rate": 9.231928214751828e-05, "loss": 1.4495, "step": 57360 }, { "epoch": 2.1149999153216927, "grad_norm": 0.20213744044303894, "learning_rate": 9.23104298074382e-05, "loss": 1.4672, "step": 57370 }, { "epoch": 2.115541856487205, "grad_norm": 0.3106236755847931, "learning_rate": 9.230157284550407e-05, "loss": 1.4501, "step": 57380 }, { "epoch": 2.1160837976527174, "grad_norm": 0.26210352778434753, "learning_rate": 9.229271126281304e-05, "loss": 1.454, "step": 57390 }, { "epoch": 2.1166257388182297, "grad_norm": 0.2264351099729538, "learning_rate": 9.228384506046286e-05, "loss": 1.4441, "step": 57400 }, { "epoch": 2.1171676799837416, "grad_norm": 0.19489522278308868, "learning_rate": 9.22749742395518e-05, "loss": 1.4577, "step": 57410 }, { "epoch": 2.117709621149254, "grad_norm": 0.20375080406665802, "learning_rate": 9.226609880117877e-05, "loss": 1.4462, "step": 57420 }, { "epoch": 2.117709621149254, "eval_loss": 2.4545302391052246, "eval_runtime": 21.9693, "eval_samples_per_second": 227.591, "eval_steps_per_second": 1.229, "step": 57420 }, { "epoch": 2.118251562314766, "grad_norm": 0.29413971304893494, "learning_rate": 9.225721874644324e-05, "loss": 1.465, "step": 57430 }, { "epoch": 2.1187935034802785, "grad_norm": 0.2669055461883545, "learning_rate": 9.224833407644519e-05, "loss": 1.454, "step": 57440 }, { "epoch": 2.119335444645791, "grad_norm": 0.2955906391143799, "learning_rate": 9.223944479228522e-05, "loss": 1.4574, "step": 57450 }, { "epoch": 2.1198773858113027, "grad_norm": 0.3815632462501526, "learning_rate": 9.223055089506451e-05, "loss": 1.4574, "step": 57460 }, { "epoch": 2.120419326976815, "grad_norm": 0.3034198582172394, "learning_rate": 9.22216523858848e-05, "loss": 1.4515, "step": 57470 }, { "epoch": 2.1209612681423273, "grad_norm": 0.22253744304180145, "learning_rate": 9.22127492658484e-05, "loss": 1.4485, "step": 57480 }, { "epoch": 2.1215032093078396, "grad_norm": 0.39392632246017456, "learning_rate": 9.220384153605816e-05, "loss": 1.4488, "step": 57490 }, { "epoch": 2.122045150473352, "grad_norm": 0.19380737841129303, "learning_rate": 9.219492919761758e-05, "loss": 1.448, "step": 57500 }, { "epoch": 2.1224245092892104, "eval_loss": 2.4559643268585205, "eval_runtime": 21.9784, "eval_samples_per_second": 227.496, "eval_steps_per_second": 1.228, "step": 57507 }, { "epoch": 2.122587091638864, "grad_norm": 0.1623711884021759, "learning_rate": 9.218601225163063e-05, "loss": 1.4629, "step": 57510 }, { "epoch": 2.123129032804376, "grad_norm": 0.27015969157218933, "learning_rate": 9.217709069920196e-05, "loss": 1.4466, "step": 57520 }, { "epoch": 2.1236709739698885, "grad_norm": 0.20501737296581268, "learning_rate": 9.216816454143667e-05, "loss": 1.4536, "step": 57530 }, { "epoch": 2.124212915135401, "grad_norm": 0.22715558111667633, "learning_rate": 9.215923377944055e-05, "loss": 1.4488, "step": 57540 }, { "epoch": 2.1247548563009127, "grad_norm": 0.27195459604263306, "learning_rate": 9.215029841431984e-05, "loss": 1.4442, "step": 57550 }, { "epoch": 2.125296797466425, "grad_norm": 0.2515755295753479, "learning_rate": 9.21413584471815e-05, "loss": 1.4584, "step": 57560 }, { "epoch": 2.1258387386319373, "grad_norm": 0.2585810720920563, "learning_rate": 9.213241387913289e-05, "loss": 1.4515, "step": 57570 }, { "epoch": 2.1263806797974496, "grad_norm": 0.3669833540916443, "learning_rate": 9.212346471128207e-05, "loss": 1.4564, "step": 57580 }, { "epoch": 2.1269226209629615, "grad_norm": 0.2513576149940491, "learning_rate": 9.211451094473764e-05, "loss": 1.4557, "step": 57590 }, { "epoch": 2.1271393974291666, "eval_loss": 2.456778049468994, "eval_runtime": 22.6763, "eval_samples_per_second": 220.495, "eval_steps_per_second": 1.191, "step": 57594 }, { "epoch": 2.127464562128474, "grad_norm": 0.2454679310321808, "learning_rate": 9.210555258060871e-05, "loss": 1.4549, "step": 57600 }, { "epoch": 2.128006503293986, "grad_norm": 0.3271390199661255, "learning_rate": 9.209658962000502e-05, "loss": 1.4548, "step": 57610 }, { "epoch": 2.1285484444594984, "grad_norm": 0.2133026272058487, "learning_rate": 9.208762206403687e-05, "loss": 1.4508, "step": 57620 }, { "epoch": 2.1290903856250107, "grad_norm": 0.20085935294628143, "learning_rate": 9.20786499138151e-05, "loss": 1.4557, "step": 57630 }, { "epoch": 2.1296323267905226, "grad_norm": 0.3363330662250519, "learning_rate": 9.206967317045119e-05, "loss": 1.4512, "step": 57640 }, { "epoch": 2.130174267956035, "grad_norm": 0.2005142867565155, "learning_rate": 9.20606918350571e-05, "loss": 1.4578, "step": 57650 }, { "epoch": 2.1307162091215472, "grad_norm": 0.32124650478363037, "learning_rate": 9.205170590874539e-05, "loss": 1.4407, "step": 57660 }, { "epoch": 2.1312581502870596, "grad_norm": 0.406781405210495, "learning_rate": 9.204271539262922e-05, "loss": 1.4442, "step": 57670 }, { "epoch": 2.131800091452572, "grad_norm": 0.19096383452415466, "learning_rate": 9.20337202878223e-05, "loss": 1.4632, "step": 57680 }, { "epoch": 2.1318542855691227, "eval_loss": 2.4498584270477295, "eval_runtime": 21.9755, "eval_samples_per_second": 227.526, "eval_steps_per_second": 1.229, "step": 57681 }, { "epoch": 2.1323420326180837, "grad_norm": 0.21396368741989136, "learning_rate": 9.202472059543888e-05, "loss": 1.4473, "step": 57690 }, { "epoch": 2.132883973783596, "grad_norm": 0.33212918043136597, "learning_rate": 9.201571631659384e-05, "loss": 1.4547, "step": 57700 }, { "epoch": 2.1334259149491084, "grad_norm": 0.3928041458129883, "learning_rate": 9.200670745240255e-05, "loss": 1.4552, "step": 57710 }, { "epoch": 2.1339678561146207, "grad_norm": 0.2717365622520447, "learning_rate": 9.199769400398101e-05, "loss": 1.4431, "step": 57720 }, { "epoch": 2.1345097972801326, "grad_norm": 0.19520139694213867, "learning_rate": 9.198867597244577e-05, "loss": 1.4426, "step": 57730 }, { "epoch": 2.135051738445645, "grad_norm": 0.4766680896282196, "learning_rate": 9.197965335891394e-05, "loss": 1.4433, "step": 57740 }, { "epoch": 2.135593679611157, "grad_norm": 0.5358844995498657, "learning_rate": 9.19706261645032e-05, "loss": 1.4486, "step": 57750 }, { "epoch": 2.1361356207766695, "grad_norm": 0.40660426020622253, "learning_rate": 9.196159439033179e-05, "loss": 1.4481, "step": 57760 }, { "epoch": 2.1365691737090793, "eval_loss": 2.4472365379333496, "eval_runtime": 21.9775, "eval_samples_per_second": 227.505, "eval_steps_per_second": 1.229, "step": 57768 }, { "epoch": 2.136677561942182, "grad_norm": 0.21572063863277435, "learning_rate": 9.195255803751855e-05, "loss": 1.4457, "step": 57770 }, { "epoch": 2.1372195031076937, "grad_norm": 0.29680702090263367, "learning_rate": 9.194351710718285e-05, "loss": 1.4545, "step": 57780 }, { "epoch": 2.137761444273206, "grad_norm": 0.27061325311660767, "learning_rate": 9.193447160044465e-05, "loss": 1.4451, "step": 57790 }, { "epoch": 2.1383033854387183, "grad_norm": 0.2027037888765335, "learning_rate": 9.192542151842447e-05, "loss": 1.4497, "step": 57800 }, { "epoch": 2.1388453266042307, "grad_norm": 0.20895957946777344, "learning_rate": 9.191636686224336e-05, "loss": 1.4519, "step": 57810 }, { "epoch": 2.1393872677697425, "grad_norm": 0.18710966408252716, "learning_rate": 9.1907307633023e-05, "loss": 1.4492, "step": 57820 }, { "epoch": 2.139929208935255, "grad_norm": 0.270098477602005, "learning_rate": 9.189824383188562e-05, "loss": 1.4581, "step": 57830 }, { "epoch": 2.140471150100767, "grad_norm": 0.21558840572834015, "learning_rate": 9.1889175459954e-05, "loss": 1.4398, "step": 57840 }, { "epoch": 2.1410130912662795, "grad_norm": 0.2560511827468872, "learning_rate": 9.188010251835147e-05, "loss": 1.4501, "step": 57850 }, { "epoch": 2.1412840618490354, "eval_loss": 2.448354482650757, "eval_runtime": 21.9774, "eval_samples_per_second": 227.506, "eval_steps_per_second": 1.229, "step": 57855 }, { "epoch": 2.141555032431792, "grad_norm": 0.3320198357105255, "learning_rate": 9.187102500820195e-05, "loss": 1.4466, "step": 57860 }, { "epoch": 2.1420969735973037, "grad_norm": 0.37881776690483093, "learning_rate": 9.186194293062993e-05, "loss": 1.446, "step": 57870 }, { "epoch": 2.142638914762816, "grad_norm": 0.38807186484336853, "learning_rate": 9.185285628676045e-05, "loss": 1.4483, "step": 57880 }, { "epoch": 2.1431808559283283, "grad_norm": 0.27356836199760437, "learning_rate": 9.184376507771916e-05, "loss": 1.4483, "step": 57890 }, { "epoch": 2.1437227970938406, "grad_norm": 0.31562650203704834, "learning_rate": 9.18346693046322e-05, "loss": 1.4557, "step": 57900 }, { "epoch": 2.144264738259353, "grad_norm": 0.3229880630970001, "learning_rate": 9.182556896862632e-05, "loss": 1.4511, "step": 57910 }, { "epoch": 2.144806679424865, "grad_norm": 0.36681440472602844, "learning_rate": 9.181646407082885e-05, "loss": 1.4448, "step": 57920 }, { "epoch": 2.145348620590377, "grad_norm": 0.4348771274089813, "learning_rate": 9.180735461236764e-05, "loss": 1.4456, "step": 57930 }, { "epoch": 2.1458905617558894, "grad_norm": 0.35928380489349365, "learning_rate": 9.179824059437113e-05, "loss": 1.448, "step": 57940 }, { "epoch": 2.145998949988992, "eval_loss": 2.451070547103882, "eval_runtime": 21.9847, "eval_samples_per_second": 227.431, "eval_steps_per_second": 1.228, "step": 57942 }, { "epoch": 2.1464325029214018, "grad_norm": 0.3611794710159302, "learning_rate": 9.178912201796836e-05, "loss": 1.4492, "step": 57950 }, { "epoch": 2.1469744440869136, "grad_norm": 0.21141478419303894, "learning_rate": 9.177999888428886e-05, "loss": 1.4474, "step": 57960 }, { "epoch": 2.147516385252426, "grad_norm": 0.2665724456310272, "learning_rate": 9.17708711944628e-05, "loss": 1.4562, "step": 57970 }, { "epoch": 2.1480583264179383, "grad_norm": 0.3022826015949249, "learning_rate": 9.176173894962086e-05, "loss": 1.4337, "step": 57980 }, { "epoch": 2.1486002675834506, "grad_norm": 0.27852869033813477, "learning_rate": 9.175260215089429e-05, "loss": 1.4481, "step": 57990 }, { "epoch": 2.149142208748963, "grad_norm": 0.22829607129096985, "learning_rate": 9.174346079941492e-05, "loss": 1.4468, "step": 58000 }, { "epoch": 2.1496841499144748, "grad_norm": 0.24403288960456848, "learning_rate": 9.173431489631517e-05, "loss": 1.4571, "step": 58010 }, { "epoch": 2.150226091079987, "grad_norm": 0.23664097487926483, "learning_rate": 9.172516444272796e-05, "loss": 1.4461, "step": 58020 }, { "epoch": 2.150713838128948, "eval_loss": 2.4530248641967773, "eval_runtime": 21.9794, "eval_samples_per_second": 227.486, "eval_steps_per_second": 1.228, "step": 58029 }, { "epoch": 2.1507680322454994, "grad_norm": 0.19086076319217682, "learning_rate": 9.171600943978683e-05, "loss": 1.4514, "step": 58030 }, { "epoch": 2.1513099734110117, "grad_norm": 0.22935578227043152, "learning_rate": 9.170684988862586e-05, "loss": 1.456, "step": 58040 }, { "epoch": 2.1518519145765236, "grad_norm": 0.21930955350399017, "learning_rate": 9.16976857903797e-05, "loss": 1.4513, "step": 58050 }, { "epoch": 2.152393855742036, "grad_norm": 0.2644178867340088, "learning_rate": 9.168851714618352e-05, "loss": 1.4377, "step": 58060 }, { "epoch": 2.152935796907548, "grad_norm": 0.3892003297805786, "learning_rate": 9.167934395717314e-05, "loss": 1.4409, "step": 58070 }, { "epoch": 2.1534777380730605, "grad_norm": 0.40704429149627686, "learning_rate": 9.167016622448488e-05, "loss": 1.4479, "step": 58080 }, { "epoch": 2.154019679238573, "grad_norm": 0.33170846104621887, "learning_rate": 9.166098394925562e-05, "loss": 1.4459, "step": 58090 }, { "epoch": 2.1545616204040847, "grad_norm": 0.18887291848659515, "learning_rate": 9.165179713262286e-05, "loss": 1.4583, "step": 58100 }, { "epoch": 2.155103561569597, "grad_norm": 0.2687866687774658, "learning_rate": 9.164260577572456e-05, "loss": 1.4504, "step": 58110 }, { "epoch": 2.1554287262689042, "eval_loss": 2.450887680053711, "eval_runtime": 21.9808, "eval_samples_per_second": 227.471, "eval_steps_per_second": 1.228, "step": 58116 }, { "epoch": 2.1556455027351094, "grad_norm": 0.21473592519760132, "learning_rate": 9.163340987969938e-05, "loss": 1.4491, "step": 58120 }, { "epoch": 2.1561874439006217, "grad_norm": 0.1952633261680603, "learning_rate": 9.162420944568641e-05, "loss": 1.4505, "step": 58130 }, { "epoch": 2.156729385066134, "grad_norm": 0.21670879423618317, "learning_rate": 9.161500447482539e-05, "loss": 1.4447, "step": 58140 }, { "epoch": 2.157271326231646, "grad_norm": 0.27638787031173706, "learning_rate": 9.160579496825656e-05, "loss": 1.4444, "step": 58150 }, { "epoch": 2.157813267397158, "grad_norm": 0.2775803804397583, "learning_rate": 9.15965809271208e-05, "loss": 1.4442, "step": 58160 }, { "epoch": 2.1583552085626705, "grad_norm": 0.2165958732366562, "learning_rate": 9.158736235255949e-05, "loss": 1.4533, "step": 58170 }, { "epoch": 2.158897149728183, "grad_norm": 0.3519152104854584, "learning_rate": 9.157813924571455e-05, "loss": 1.4455, "step": 58180 }, { "epoch": 2.1594390908936947, "grad_norm": 0.23166511952877045, "learning_rate": 9.156891160772854e-05, "loss": 1.4437, "step": 58190 }, { "epoch": 2.159981032059207, "grad_norm": 0.32799333333969116, "learning_rate": 9.155967943974453e-05, "loss": 1.4477, "step": 58200 }, { "epoch": 2.160143614408861, "eval_loss": 2.4706532955169678, "eval_runtime": 21.9762, "eval_samples_per_second": 227.518, "eval_steps_per_second": 1.229, "step": 58203 }, { "epoch": 2.1605229732247193, "grad_norm": 0.26824527978897095, "learning_rate": 9.155044274290614e-05, "loss": 1.4423, "step": 58210 }, { "epoch": 2.1610649143902316, "grad_norm": 0.3500238358974457, "learning_rate": 9.15412015183576e-05, "loss": 1.4509, "step": 58220 }, { "epoch": 2.1616068555557435, "grad_norm": 0.38133561611175537, "learning_rate": 9.153195576724367e-05, "loss": 1.4519, "step": 58230 }, { "epoch": 2.162148796721256, "grad_norm": 0.27178552746772766, "learning_rate": 9.152270549070964e-05, "loss": 1.4541, "step": 58240 }, { "epoch": 2.162690737886768, "grad_norm": 0.24917040765285492, "learning_rate": 9.151345068990146e-05, "loss": 1.449, "step": 58250 }, { "epoch": 2.1632326790522804, "grad_norm": 0.39437514543533325, "learning_rate": 9.150419136596551e-05, "loss": 1.4438, "step": 58260 }, { "epoch": 2.1637746202177928, "grad_norm": 0.346034437417984, "learning_rate": 9.149492752004882e-05, "loss": 1.4544, "step": 58270 }, { "epoch": 2.1643165613833046, "grad_norm": 0.3629694879055023, "learning_rate": 9.148565915329896e-05, "loss": 1.4454, "step": 58280 }, { "epoch": 2.164858502548817, "grad_norm": 0.20559094846248627, "learning_rate": 9.147638626686404e-05, "loss": 1.4417, "step": 58290 }, { "epoch": 2.164858502548817, "eval_loss": 2.461625099182129, "eval_runtime": 21.9403, "eval_samples_per_second": 227.891, "eval_steps_per_second": 1.231, "step": 58290 }, { "epoch": 2.1654004437143293, "grad_norm": 0.1901310533285141, "learning_rate": 9.146710886189276e-05, "loss": 1.4454, "step": 58300 }, { "epoch": 2.1659423848798416, "grad_norm": 0.2069934457540512, "learning_rate": 9.145782693953435e-05, "loss": 1.4521, "step": 58310 }, { "epoch": 2.166484326045354, "grad_norm": 0.34269529581069946, "learning_rate": 9.144854050093863e-05, "loss": 1.4463, "step": 58320 }, { "epoch": 2.1670262672108658, "grad_norm": 0.2547290325164795, "learning_rate": 9.143924954725595e-05, "loss": 1.4447, "step": 58330 }, { "epoch": 2.167568208376378, "grad_norm": 0.2823292016983032, "learning_rate": 9.142995407963724e-05, "loss": 1.4402, "step": 58340 }, { "epoch": 2.1681101495418904, "grad_norm": 0.3173072934150696, "learning_rate": 9.1420654099234e-05, "loss": 1.4404, "step": 58350 }, { "epoch": 2.1686520907074027, "grad_norm": 0.23170779645442963, "learning_rate": 9.141134960719824e-05, "loss": 1.4542, "step": 58360 }, { "epoch": 2.1691940318729146, "grad_norm": 0.33306044340133667, "learning_rate": 9.140204060468257e-05, "loss": 1.4494, "step": 58370 }, { "epoch": 2.1695733906887735, "eval_loss": 2.4640088081359863, "eval_runtime": 21.9887, "eval_samples_per_second": 227.39, "eval_steps_per_second": 1.228, "step": 58377 }, { "epoch": 2.169735973038427, "grad_norm": 0.23135629296302795, "learning_rate": 9.139272709284015e-05, "loss": 1.4456, "step": 58380 }, { "epoch": 2.1702779142039392, "grad_norm": 0.22279199957847595, "learning_rate": 9.138340907282472e-05, "loss": 1.4514, "step": 58390 }, { "epoch": 2.1708198553694515, "grad_norm": 0.27086111903190613, "learning_rate": 9.137408654579051e-05, "loss": 1.4481, "step": 58400 }, { "epoch": 2.171361796534964, "grad_norm": 0.21697980165481567, "learning_rate": 9.136475951289239e-05, "loss": 1.4424, "step": 58410 }, { "epoch": 2.1719037377004757, "grad_norm": 0.32999521493911743, "learning_rate": 9.135542797528574e-05, "loss": 1.4552, "step": 58420 }, { "epoch": 2.172445678865988, "grad_norm": 0.3084947466850281, "learning_rate": 9.134609193412652e-05, "loss": 1.433, "step": 58430 }, { "epoch": 2.1729876200315004, "grad_norm": 0.21755863726139069, "learning_rate": 9.13367513905712e-05, "loss": 1.4449, "step": 58440 }, { "epoch": 2.1735295611970127, "grad_norm": 0.1901983916759491, "learning_rate": 9.13274063457769e-05, "loss": 1.4482, "step": 58450 }, { "epoch": 2.1740715023625246, "grad_norm": 0.18798936903476715, "learning_rate": 9.131805680090122e-05, "loss": 1.4472, "step": 58460 }, { "epoch": 2.1742882788287297, "eval_loss": 2.459465503692627, "eval_runtime": 21.9819, "eval_samples_per_second": 227.459, "eval_steps_per_second": 1.228, "step": 58464 }, { "epoch": 2.174613443528037, "grad_norm": 0.2203335165977478, "learning_rate": 9.130870275710235e-05, "loss": 1.4538, "step": 58470 }, { "epoch": 2.175155384693549, "grad_norm": 0.22482015192508698, "learning_rate": 9.1299344215539e-05, "loss": 1.4545, "step": 58480 }, { "epoch": 2.1756973258590615, "grad_norm": 0.3685661852359772, "learning_rate": 9.12899811773705e-05, "loss": 1.441, "step": 58490 }, { "epoch": 2.176239267024574, "grad_norm": 0.27606692910194397, "learning_rate": 9.128061364375668e-05, "loss": 1.4412, "step": 58500 }, { "epoch": 2.1767812081900857, "grad_norm": 0.2645034193992615, "learning_rate": 9.127124161585795e-05, "loss": 1.4463, "step": 58510 }, { "epoch": 2.177323149355598, "grad_norm": 0.4667447805404663, "learning_rate": 9.126186509483529e-05, "loss": 1.4512, "step": 58520 }, { "epoch": 2.1778650905211103, "grad_norm": 0.5272287726402283, "learning_rate": 9.125248408185021e-05, "loss": 1.4451, "step": 58530 }, { "epoch": 2.1784070316866226, "grad_norm": 0.2895019054412842, "learning_rate": 9.12430985780648e-05, "loss": 1.4515, "step": 58540 }, { "epoch": 2.178948972852135, "grad_norm": 0.25505638122558594, "learning_rate": 9.123370858464169e-05, "loss": 1.4515, "step": 58550 }, { "epoch": 2.179003166968686, "eval_loss": 2.462846279144287, "eval_runtime": 21.9798, "eval_samples_per_second": 227.481, "eval_steps_per_second": 1.228, "step": 58551 }, { "epoch": 2.179490914017647, "grad_norm": 0.24811875820159912, "learning_rate": 9.122431410274406e-05, "loss": 1.4464, "step": 58560 }, { "epoch": 2.180032855183159, "grad_norm": 0.22253675758838654, "learning_rate": 9.12149151335357e-05, "loss": 1.4418, "step": 58570 }, { "epoch": 2.1805747963486715, "grad_norm": 0.3881581127643585, "learning_rate": 9.120551167818084e-05, "loss": 1.4429, "step": 58580 }, { "epoch": 2.181116737514184, "grad_norm": 0.19040873646736145, "learning_rate": 9.119610373784442e-05, "loss": 1.4412, "step": 58590 }, { "epoch": 2.1816586786796957, "grad_norm": 0.18105448782444, "learning_rate": 9.118669131369179e-05, "loss": 1.4504, "step": 58600 }, { "epoch": 2.182200619845208, "grad_norm": 0.23432651162147522, "learning_rate": 9.117727440688896e-05, "loss": 1.4373, "step": 58610 }, { "epoch": 2.1827425610107203, "grad_norm": 0.22334584593772888, "learning_rate": 9.116785301860244e-05, "loss": 1.4414, "step": 58620 }, { "epoch": 2.1832845021762326, "grad_norm": 0.3361223638057709, "learning_rate": 9.115842714999931e-05, "loss": 1.4394, "step": 58630 }, { "epoch": 2.1837180551086424, "eval_loss": 2.464482545852661, "eval_runtime": 21.9821, "eval_samples_per_second": 227.458, "eval_steps_per_second": 1.228, "step": 58638 }, { "epoch": 2.183826443341745, "grad_norm": 0.5349717140197754, "learning_rate": 9.11489968022472e-05, "loss": 1.4439, "step": 58640 }, { "epoch": 2.184368384507257, "grad_norm": 0.5961711406707764, "learning_rate": 9.113956197651434e-05, "loss": 1.4424, "step": 58650 }, { "epoch": 2.184910325672769, "grad_norm": 0.3221561312675476, "learning_rate": 9.113012267396943e-05, "loss": 1.4393, "step": 58660 }, { "epoch": 2.1854522668382814, "grad_norm": 0.36589309573173523, "learning_rate": 9.112067889578178e-05, "loss": 1.4436, "step": 58670 }, { "epoch": 2.1859942080037937, "grad_norm": 0.21223586797714233, "learning_rate": 9.111123064312125e-05, "loss": 1.4466, "step": 58680 }, { "epoch": 2.1865361491693056, "grad_norm": 0.21100690960884094, "learning_rate": 9.110177791715824e-05, "loss": 1.4415, "step": 58690 }, { "epoch": 2.187078090334818, "grad_norm": 0.3193996548652649, "learning_rate": 9.109232071906373e-05, "loss": 1.4573, "step": 58700 }, { "epoch": 2.1876200315003302, "grad_norm": 0.18448546528816223, "learning_rate": 9.108285905000922e-05, "loss": 1.4535, "step": 58710 }, { "epoch": 2.1881619726658426, "grad_norm": 0.3342556953430176, "learning_rate": 9.107339291116679e-05, "loss": 1.4356, "step": 58720 }, { "epoch": 2.1884329432485985, "eval_loss": 2.4611220359802246, "eval_runtime": 21.982, "eval_samples_per_second": 227.459, "eval_steps_per_second": 1.228, "step": 58725 }, { "epoch": 2.188703913831355, "grad_norm": 0.43229833245277405, "learning_rate": 9.106392230370906e-05, "loss": 1.4644, "step": 58730 }, { "epoch": 2.1892458549968667, "grad_norm": 0.36450695991516113, "learning_rate": 9.10544472288092e-05, "loss": 1.4381, "step": 58740 }, { "epoch": 2.189787796162379, "grad_norm": 0.18473106622695923, "learning_rate": 9.104496768764096e-05, "loss": 1.4404, "step": 58750 }, { "epoch": 2.1903297373278914, "grad_norm": 0.3914441168308258, "learning_rate": 9.103548368137863e-05, "loss": 1.4448, "step": 58760 }, { "epoch": 2.1908716784934037, "grad_norm": 0.38350018858909607, "learning_rate": 9.102599521119701e-05, "loss": 1.4437, "step": 58770 }, { "epoch": 2.191413619658916, "grad_norm": 0.1921490877866745, "learning_rate": 9.101650227827152e-05, "loss": 1.4433, "step": 58780 }, { "epoch": 2.191955560824428, "grad_norm": 0.25018230080604553, "learning_rate": 9.100700488377809e-05, "loss": 1.4451, "step": 58790 }, { "epoch": 2.19249750198994, "grad_norm": 0.5067526698112488, "learning_rate": 9.099750302889323e-05, "loss": 1.4481, "step": 58800 }, { "epoch": 2.1930394431554525, "grad_norm": 0.27210572361946106, "learning_rate": 9.098799671479397e-05, "loss": 1.4553, "step": 58810 }, { "epoch": 2.193147831388555, "eval_loss": 2.46246600151062, "eval_runtime": 21.9768, "eval_samples_per_second": 227.512, "eval_steps_per_second": 1.229, "step": 58812 }, { "epoch": 2.193581384320965, "grad_norm": 0.1989605873823166, "learning_rate": 9.097848594265793e-05, "loss": 1.4463, "step": 58820 }, { "epoch": 2.1941233254864767, "grad_norm": 0.18576236069202423, "learning_rate": 9.096897071366326e-05, "loss": 1.4411, "step": 58830 }, { "epoch": 2.194665266651989, "grad_norm": 0.2913901209831238, "learning_rate": 9.095945102898865e-05, "loss": 1.4448, "step": 58840 }, { "epoch": 2.1952072078175013, "grad_norm": 0.5270187854766846, "learning_rate": 9.094992688981337e-05, "loss": 1.454, "step": 58850 }, { "epoch": 2.1957491489830137, "grad_norm": 0.2984652519226074, "learning_rate": 9.094039829731721e-05, "loss": 1.4452, "step": 58860 }, { "epoch": 2.1962910901485255, "grad_norm": 0.3765738308429718, "learning_rate": 9.093086525268059e-05, "loss": 1.4389, "step": 58870 }, { "epoch": 2.196833031314038, "grad_norm": 0.31733348965644836, "learning_rate": 9.092132775708432e-05, "loss": 1.4405, "step": 58880 }, { "epoch": 2.19737497247955, "grad_norm": 0.19566883146762848, "learning_rate": 9.091178581170996e-05, "loss": 1.451, "step": 58890 }, { "epoch": 2.197862719528511, "eval_loss": 2.460601568222046, "eval_runtime": 21.9826, "eval_samples_per_second": 227.453, "eval_steps_per_second": 1.228, "step": 58899 }, { "epoch": 2.1979169136450625, "grad_norm": 0.2175428867340088, "learning_rate": 9.090223941773949e-05, "loss": 1.4446, "step": 58900 }, { "epoch": 2.198458854810575, "grad_norm": 0.17789192497730255, "learning_rate": 9.089268857635546e-05, "loss": 1.4443, "step": 58910 }, { "epoch": 2.1990007959760867, "grad_norm": 0.17181305587291718, "learning_rate": 9.0883133288741e-05, "loss": 1.4535, "step": 58920 }, { "epoch": 2.199542737141599, "grad_norm": 0.3057771325111389, "learning_rate": 9.087357355607977e-05, "loss": 1.4402, "step": 58930 }, { "epoch": 2.2000846783071113, "grad_norm": 0.3295128643512726, "learning_rate": 9.086400937955601e-05, "loss": 1.4578, "step": 58940 }, { "epoch": 2.2006266194726236, "grad_norm": 0.3450293242931366, "learning_rate": 9.085444076035448e-05, "loss": 1.451, "step": 58950 }, { "epoch": 2.201168560638136, "grad_norm": 0.31713640689849854, "learning_rate": 9.084486769966047e-05, "loss": 1.4599, "step": 58960 }, { "epoch": 2.201710501803648, "grad_norm": 0.23452381789684296, "learning_rate": 9.083529019865988e-05, "loss": 1.4549, "step": 58970 }, { "epoch": 2.20225244296916, "grad_norm": 0.19139593839645386, "learning_rate": 9.082570825853912e-05, "loss": 1.4405, "step": 58980 }, { "epoch": 2.2025776076684673, "eval_loss": 2.4606852531433105, "eval_runtime": 21.9818, "eval_samples_per_second": 227.461, "eval_steps_per_second": 1.228, "step": 58986 }, { "epoch": 2.2027943841346724, "grad_norm": 0.20362070202827454, "learning_rate": 9.081612188048518e-05, "loss": 1.4418, "step": 58990 }, { "epoch": 2.2033363253001848, "grad_norm": 0.24732381105422974, "learning_rate": 9.080653106568555e-05, "loss": 1.4289, "step": 59000 }, { "epoch": 2.2038782664656966, "grad_norm": 0.2842128872871399, "learning_rate": 9.07969358153283e-05, "loss": 1.4486, "step": 59010 }, { "epoch": 2.204420207631209, "grad_norm": 0.4157765805721283, "learning_rate": 9.078733613060205e-05, "loss": 1.4461, "step": 59020 }, { "epoch": 2.2049621487967213, "grad_norm": 0.5429375171661377, "learning_rate": 9.0777732012696e-05, "loss": 1.4433, "step": 59030 }, { "epoch": 2.2055040899622336, "grad_norm": 0.29770663380622864, "learning_rate": 9.076812346279982e-05, "loss": 1.4464, "step": 59040 }, { "epoch": 2.206046031127746, "grad_norm": 0.22405195236206055, "learning_rate": 9.075851048210379e-05, "loss": 1.4495, "step": 59050 }, { "epoch": 2.2065879722932578, "grad_norm": 0.43941786885261536, "learning_rate": 9.074889307179873e-05, "loss": 1.4481, "step": 59060 }, { "epoch": 2.20712991345877, "grad_norm": 0.20217150449752808, "learning_rate": 9.073927123307602e-05, "loss": 1.4482, "step": 59070 }, { "epoch": 2.207292495808424, "eval_loss": 2.457340717315674, "eval_runtime": 21.9825, "eval_samples_per_second": 227.453, "eval_steps_per_second": 1.228, "step": 59073 }, { "epoch": 2.2076718546242824, "grad_norm": 0.34927546977996826, "learning_rate": 9.072964496712755e-05, "loss": 1.4482, "step": 59080 }, { "epoch": 2.2082137957897947, "grad_norm": 0.2522357702255249, "learning_rate": 9.072001427514578e-05, "loss": 1.4505, "step": 59090 }, { "epoch": 2.2087557369553066, "grad_norm": 0.2815626263618469, "learning_rate": 9.07103791583237e-05, "loss": 1.4479, "step": 59100 }, { "epoch": 2.209297678120819, "grad_norm": 0.22079609334468842, "learning_rate": 9.070073961785491e-05, "loss": 1.4494, "step": 59110 }, { "epoch": 2.209839619286331, "grad_norm": 0.2886086702346802, "learning_rate": 9.06910956549335e-05, "loss": 1.4356, "step": 59120 }, { "epoch": 2.2103815604518435, "grad_norm": 0.39057257771492004, "learning_rate": 9.068144727075409e-05, "loss": 1.4445, "step": 59130 }, { "epoch": 2.210923501617356, "grad_norm": 0.41756001114845276, "learning_rate": 9.067179446651195e-05, "loss": 1.4419, "step": 59140 }, { "epoch": 2.2114654427828677, "grad_norm": 0.30022770166397095, "learning_rate": 9.066213724340274e-05, "loss": 1.4551, "step": 59150 }, { "epoch": 2.21200738394838, "grad_norm": 0.20884348452091217, "learning_rate": 9.065247560262282e-05, "loss": 1.456, "step": 59160 }, { "epoch": 2.21200738394838, "eval_loss": 2.453835964202881, "eval_runtime": 21.9755, "eval_samples_per_second": 227.526, "eval_steps_per_second": 1.229, "step": 59160 }, { "epoch": 2.2125493251138924, "grad_norm": 0.3450928330421448, "learning_rate": 9.0642809545369e-05, "loss": 1.4548, "step": 59170 }, { "epoch": 2.2130912662794047, "grad_norm": 0.3694857358932495, "learning_rate": 9.063313907283868e-05, "loss": 1.4562, "step": 59180 }, { "epoch": 2.213633207444917, "grad_norm": 0.28221815824508667, "learning_rate": 9.06234641862298e-05, "loss": 1.4502, "step": 59190 }, { "epoch": 2.214175148610429, "grad_norm": 0.207607701420784, "learning_rate": 9.061378488674084e-05, "loss": 1.4369, "step": 59200 }, { "epoch": 2.214717089775941, "grad_norm": 0.3098548948764801, "learning_rate": 9.060410117557083e-05, "loss": 1.4371, "step": 59210 }, { "epoch": 2.2152590309414535, "grad_norm": 0.3138673007488251, "learning_rate": 9.059441305391932e-05, "loss": 1.443, "step": 59220 }, { "epoch": 2.215800972106966, "grad_norm": 0.3069899380207062, "learning_rate": 9.058472052298649e-05, "loss": 1.4424, "step": 59230 }, { "epoch": 2.2163429132724777, "grad_norm": 0.4762488305568695, "learning_rate": 9.057502358397296e-05, "loss": 1.4312, "step": 59240 }, { "epoch": 2.2167222720883366, "eval_loss": 2.4593636989593506, "eval_runtime": 21.9809, "eval_samples_per_second": 227.471, "eval_steps_per_second": 1.228, "step": 59247 }, { "epoch": 2.21688485443799, "grad_norm": 0.2531556487083435, "learning_rate": 9.056532223807995e-05, "loss": 1.4506, "step": 59250 }, { "epoch": 2.2174267956035023, "grad_norm": 0.42446276545524597, "learning_rate": 9.055561648650924e-05, "loss": 1.4452, "step": 59260 }, { "epoch": 2.2179687367690146, "grad_norm": 0.2571756839752197, "learning_rate": 9.054590633046313e-05, "loss": 1.4384, "step": 59270 }, { "epoch": 2.2185106779345265, "grad_norm": 0.29887983202934265, "learning_rate": 9.053619177114446e-05, "loss": 1.4408, "step": 59280 }, { "epoch": 2.219052619100039, "grad_norm": 0.22026404738426208, "learning_rate": 9.052647280975664e-05, "loss": 1.4522, "step": 59290 }, { "epoch": 2.219594560265551, "grad_norm": 0.20031945407390594, "learning_rate": 9.051674944750362e-05, "loss": 1.4536, "step": 59300 }, { "epoch": 2.2201365014310634, "grad_norm": 0.2620982527732849, "learning_rate": 9.050702168558987e-05, "loss": 1.4432, "step": 59310 }, { "epoch": 2.2206784425965758, "grad_norm": 0.35358208417892456, "learning_rate": 9.049728952522042e-05, "loss": 1.444, "step": 59320 }, { "epoch": 2.2212203837620876, "grad_norm": 0.36926135420799255, "learning_rate": 9.048755296760087e-05, "loss": 1.4593, "step": 59330 }, { "epoch": 2.2214371602282927, "eval_loss": 2.448256015777588, "eval_runtime": 21.981, "eval_samples_per_second": 227.47, "eval_steps_per_second": 1.228, "step": 59334 }, { "epoch": 2.2217623249276, "grad_norm": 0.17719149589538574, "learning_rate": 9.047781201393731e-05, "loss": 1.4395, "step": 59340 }, { "epoch": 2.2223042660931123, "grad_norm": 0.23782901465892792, "learning_rate": 9.046806666543645e-05, "loss": 1.4385, "step": 59350 }, { "epoch": 2.2228462072586246, "grad_norm": 0.36395078897476196, "learning_rate": 9.045831692330546e-05, "loss": 1.4533, "step": 59360 }, { "epoch": 2.223388148424137, "grad_norm": 0.4966432452201843, "learning_rate": 9.044856278875212e-05, "loss": 1.4494, "step": 59370 }, { "epoch": 2.2239300895896488, "grad_norm": 0.4597684442996979, "learning_rate": 9.043880426298475e-05, "loss": 1.4468, "step": 59380 }, { "epoch": 2.224472030755161, "grad_norm": 0.2446877360343933, "learning_rate": 9.042904134721214e-05, "loss": 1.4415, "step": 59390 }, { "epoch": 2.2250139719206734, "grad_norm": 0.2874411940574646, "learning_rate": 9.041927404264371e-05, "loss": 1.4472, "step": 59400 }, { "epoch": 2.2255559130861857, "grad_norm": 0.5566506385803223, "learning_rate": 9.040950235048938e-05, "loss": 1.446, "step": 59410 }, { "epoch": 2.2260978542516976, "grad_norm": 0.19569170475006104, "learning_rate": 9.039972627195965e-05, "loss": 1.4365, "step": 59420 }, { "epoch": 2.226152048368249, "eval_loss": 2.458207607269287, "eval_runtime": 21.9788, "eval_samples_per_second": 227.492, "eval_steps_per_second": 1.228, "step": 59421 }, { "epoch": 2.22663979541721, "grad_norm": 0.41606298089027405, "learning_rate": 9.038994580826549e-05, "loss": 1.4496, "step": 59430 }, { "epoch": 2.2271817365827222, "grad_norm": 0.21181747317314148, "learning_rate": 9.03801609606185e-05, "loss": 1.4425, "step": 59440 }, { "epoch": 2.2277236777482345, "grad_norm": 0.3672482371330261, "learning_rate": 9.03703717302308e-05, "loss": 1.4444, "step": 59450 }, { "epoch": 2.228265618913747, "grad_norm": 0.2655334174633026, "learning_rate": 9.0360578118315e-05, "loss": 1.4421, "step": 59460 }, { "epoch": 2.2288075600792587, "grad_norm": 0.20638707280158997, "learning_rate": 9.035078012608431e-05, "loss": 1.4422, "step": 59470 }, { "epoch": 2.229349501244771, "grad_norm": 0.5288010239601135, "learning_rate": 9.034097775475244e-05, "loss": 1.4441, "step": 59480 }, { "epoch": 2.2298914424102834, "grad_norm": 0.37558361887931824, "learning_rate": 9.03311710055337e-05, "loss": 1.4386, "step": 59490 }, { "epoch": 2.2304333835757957, "grad_norm": 0.30273330211639404, "learning_rate": 9.032135987964287e-05, "loss": 1.4426, "step": 59500 }, { "epoch": 2.2308669365082054, "eval_loss": 2.4551827907562256, "eval_runtime": 21.9794, "eval_samples_per_second": 227.485, "eval_steps_per_second": 1.228, "step": 59508 }, { "epoch": 2.2309753247413076, "grad_norm": 0.34168776869773865, "learning_rate": 9.031154437829533e-05, "loss": 1.4485, "step": 59510 }, { "epoch": 2.23151726590682, "grad_norm": 0.2830340266227722, "learning_rate": 9.030172450270699e-05, "loss": 1.446, "step": 59520 }, { "epoch": 2.232059207072332, "grad_norm": 0.18490228056907654, "learning_rate": 9.029190025409426e-05, "loss": 1.4459, "step": 59530 }, { "epoch": 2.2326011482378445, "grad_norm": 0.32973939180374146, "learning_rate": 9.028207163367417e-05, "loss": 1.4491, "step": 59540 }, { "epoch": 2.233143089403357, "grad_norm": 0.2619604766368866, "learning_rate": 9.027223864266423e-05, "loss": 1.4446, "step": 59550 }, { "epoch": 2.2336850305688687, "grad_norm": 0.3706190884113312, "learning_rate": 9.02624012822825e-05, "loss": 1.4369, "step": 59560 }, { "epoch": 2.234226971734381, "grad_norm": 0.1908445507287979, "learning_rate": 9.025255955374758e-05, "loss": 1.4321, "step": 59570 }, { "epoch": 2.2347689128998933, "grad_norm": 0.25038012862205505, "learning_rate": 9.024271345827864e-05, "loss": 1.4451, "step": 59580 }, { "epoch": 2.2353108540654056, "grad_norm": 0.22101671993732452, "learning_rate": 9.023286299709538e-05, "loss": 1.4376, "step": 59590 }, { "epoch": 2.2355818246481616, "eval_loss": 2.449619770050049, "eval_runtime": 21.9821, "eval_samples_per_second": 227.458, "eval_steps_per_second": 1.228, "step": 59595 }, { "epoch": 2.235852795230918, "grad_norm": 0.2644299268722534, "learning_rate": 9.022300817141799e-05, "loss": 1.4417, "step": 59600 }, { "epoch": 2.23639473639643, "grad_norm": 0.18024158477783203, "learning_rate": 9.02131489824673e-05, "loss": 1.4442, "step": 59610 }, { "epoch": 2.236936677561942, "grad_norm": 0.2031889110803604, "learning_rate": 9.020328543146457e-05, "loss": 1.4512, "step": 59620 }, { "epoch": 2.2374786187274545, "grad_norm": 0.2275354415178299, "learning_rate": 9.01934175196317e-05, "loss": 1.4478, "step": 59630 }, { "epoch": 2.2380205598929668, "grad_norm": 0.1913514882326126, "learning_rate": 9.018354524819104e-05, "loss": 1.4532, "step": 59640 }, { "epoch": 2.2385625010584786, "grad_norm": 0.17917923629283905, "learning_rate": 9.017366861836555e-05, "loss": 1.4534, "step": 59650 }, { "epoch": 2.239104442223991, "grad_norm": 0.1910235583782196, "learning_rate": 9.016378763137872e-05, "loss": 1.4452, "step": 59660 }, { "epoch": 2.2396463833895033, "grad_norm": 0.2699836194515228, "learning_rate": 9.015390228845454e-05, "loss": 1.4452, "step": 59670 }, { "epoch": 2.2401883245550156, "grad_norm": 0.5121615529060364, "learning_rate": 9.014401259081754e-05, "loss": 1.4472, "step": 59680 }, { "epoch": 2.240296712788118, "eval_loss": 2.445277452468872, "eval_runtime": 21.9791, "eval_samples_per_second": 227.489, "eval_steps_per_second": 1.228, "step": 59682 }, { "epoch": 2.240730265720528, "grad_norm": 0.3202587962150574, "learning_rate": 9.013411853969286e-05, "loss": 1.447, "step": 59690 }, { "epoch": 2.24127220688604, "grad_norm": 0.29666054248809814, "learning_rate": 9.012422013630611e-05, "loss": 1.4478, "step": 59700 }, { "epoch": 2.241814148051552, "grad_norm": 0.2042292356491089, "learning_rate": 9.011431738188348e-05, "loss": 1.4473, "step": 59710 }, { "epoch": 2.2423560892170644, "grad_norm": 0.1927926242351532, "learning_rate": 9.010441027765165e-05, "loss": 1.4467, "step": 59720 }, { "epoch": 2.2428980303825767, "grad_norm": 0.3930176794528961, "learning_rate": 9.009449882483788e-05, "loss": 1.4493, "step": 59730 }, { "epoch": 2.2434399715480886, "grad_norm": 0.31526270508766174, "learning_rate": 9.008458302466995e-05, "loss": 1.4487, "step": 59740 }, { "epoch": 2.243981912713601, "grad_norm": 0.2965898811817169, "learning_rate": 9.007466287837622e-05, "loss": 1.4528, "step": 59750 }, { "epoch": 2.2445238538791132, "grad_norm": 0.3894706964492798, "learning_rate": 9.006473838718551e-05, "loss": 1.443, "step": 59760 }, { "epoch": 2.2450116009280743, "eval_loss": 2.4487032890319824, "eval_runtime": 21.9873, "eval_samples_per_second": 227.404, "eval_steps_per_second": 1.228, "step": 59769 }, { "epoch": 2.2450657950446256, "grad_norm": 0.2878785729408264, "learning_rate": 9.005480955232726e-05, "loss": 1.4379, "step": 59770 }, { "epoch": 2.245607736210138, "grad_norm": 0.2061561793088913, "learning_rate": 9.004487637503139e-05, "loss": 1.4331, "step": 59780 }, { "epoch": 2.2461496773756497, "grad_norm": 0.24609437584877014, "learning_rate": 9.00349388565284e-05, "loss": 1.4486, "step": 59790 }, { "epoch": 2.246691618541162, "grad_norm": 0.24656355381011963, "learning_rate": 9.002499699804927e-05, "loss": 1.442, "step": 59800 }, { "epoch": 2.2472335597066744, "grad_norm": 0.23344773054122925, "learning_rate": 9.001505080082558e-05, "loss": 1.452, "step": 59810 }, { "epoch": 2.2477755008721867, "grad_norm": 0.2287297546863556, "learning_rate": 9.000510026608942e-05, "loss": 1.4439, "step": 59820 }, { "epoch": 2.248317442037699, "grad_norm": 0.2048652619123459, "learning_rate": 8.999514539507342e-05, "loss": 1.4392, "step": 59830 }, { "epoch": 2.248859383203211, "grad_norm": 0.2534552812576294, "learning_rate": 8.998518618901073e-05, "loss": 1.4449, "step": 59840 }, { "epoch": 2.249401324368723, "grad_norm": 0.22183406352996826, "learning_rate": 8.997522264913508e-05, "loss": 1.4556, "step": 59850 }, { "epoch": 2.2497264890680304, "eval_loss": 2.4434633255004883, "eval_runtime": 21.9826, "eval_samples_per_second": 227.452, "eval_steps_per_second": 1.228, "step": 59856 }, { "epoch": 2.2499432655342355, "grad_norm": 0.26273661851882935, "learning_rate": 8.996525477668068e-05, "loss": 1.4403, "step": 59860 }, { "epoch": 2.250485206699748, "grad_norm": 0.2545994222164154, "learning_rate": 8.995528257288233e-05, "loss": 1.4377, "step": 59870 }, { "epoch": 2.2510271478652597, "grad_norm": 0.2006346583366394, "learning_rate": 8.994530603897534e-05, "loss": 1.4457, "step": 59880 }, { "epoch": 2.251569089030772, "grad_norm": 0.17895658314228058, "learning_rate": 8.993532517619554e-05, "loss": 1.4424, "step": 59890 }, { "epoch": 2.2521110301962843, "grad_norm": 0.2239982634782791, "learning_rate": 8.992533998577936e-05, "loss": 1.4329, "step": 59900 }, { "epoch": 2.2526529713617967, "grad_norm": 0.2703670263290405, "learning_rate": 8.991535046896367e-05, "loss": 1.456, "step": 59910 }, { "epoch": 2.2531949125273085, "grad_norm": 0.23329605162143707, "learning_rate": 8.990535662698596e-05, "loss": 1.4411, "step": 59920 }, { "epoch": 2.253736853692821, "grad_norm": 0.25912684202194214, "learning_rate": 8.989535846108421e-05, "loss": 1.446, "step": 59930 }, { "epoch": 2.254278794858333, "grad_norm": 0.22521668672561646, "learning_rate": 8.988535597249696e-05, "loss": 1.4516, "step": 59940 }, { "epoch": 2.254441377207987, "eval_loss": 2.432748556137085, "eval_runtime": 21.9825, "eval_samples_per_second": 227.454, "eval_steps_per_second": 1.228, "step": 59943 }, { "epoch": 2.2548207360238455, "grad_norm": 0.20753563940525055, "learning_rate": 8.987534916246327e-05, "loss": 1.4364, "step": 59950 }, { "epoch": 2.255362677189358, "grad_norm": 0.18667367100715637, "learning_rate": 8.986533803222272e-05, "loss": 1.4418, "step": 59960 }, { "epoch": 2.2559046183548697, "grad_norm": 0.2076788991689682, "learning_rate": 8.985532258301549e-05, "loss": 1.4435, "step": 59970 }, { "epoch": 2.256446559520382, "grad_norm": 0.2706129550933838, "learning_rate": 8.98453028160822e-05, "loss": 1.4415, "step": 59980 }, { "epoch": 2.2569885006858943, "grad_norm": 0.21341146528720856, "learning_rate": 8.98352787326641e-05, "loss": 1.4393, "step": 59990 }, { "epoch": 2.2575304418514066, "grad_norm": 0.21545018255710602, "learning_rate": 8.982525033400289e-05, "loss": 1.4484, "step": 60000 }, { "epoch": 2.258072383016919, "grad_norm": 0.23385457694530487, "learning_rate": 8.98152176213409e-05, "loss": 1.4451, "step": 60010 }, { "epoch": 2.258614324182431, "grad_norm": 0.20415639877319336, "learning_rate": 8.980518059592088e-05, "loss": 1.4394, "step": 60020 }, { "epoch": 2.259156265347943, "grad_norm": 0.25711020827293396, "learning_rate": 8.97951392589862e-05, "loss": 1.4322, "step": 60030 }, { "epoch": 2.259156265347943, "eval_loss": 2.4509191513061523, "eval_runtime": 21.9779, "eval_samples_per_second": 227.502, "eval_steps_per_second": 1.229, "step": 60030 }, { "epoch": 2.2596982065134554, "grad_norm": 0.3049508333206177, "learning_rate": 8.978509361178073e-05, "loss": 1.4373, "step": 60040 }, { "epoch": 2.2602401476789677, "grad_norm": 0.2312183976173401, "learning_rate": 8.97750436555489e-05, "loss": 1.4361, "step": 60050 }, { "epoch": 2.26078208884448, "grad_norm": 0.2977278232574463, "learning_rate": 8.976498939153562e-05, "loss": 1.4418, "step": 60060 }, { "epoch": 2.261324030009992, "grad_norm": 0.18335352838039398, "learning_rate": 8.975493082098639e-05, "loss": 1.4427, "step": 60070 }, { "epoch": 2.2618659711755043, "grad_norm": 0.22939863801002502, "learning_rate": 8.974486794514723e-05, "loss": 1.4411, "step": 60080 }, { "epoch": 2.2624079123410166, "grad_norm": 0.17828980088233948, "learning_rate": 8.973480076526469e-05, "loss": 1.4399, "step": 60090 }, { "epoch": 2.262949853506529, "grad_norm": 0.2951532304286957, "learning_rate": 8.97247292825858e-05, "loss": 1.4469, "step": 60100 }, { "epoch": 2.2634917946720408, "grad_norm": 0.2627731263637543, "learning_rate": 8.971465349835824e-05, "loss": 1.4371, "step": 60110 }, { "epoch": 2.2638711534878997, "eval_loss": 2.4536399841308594, "eval_runtime": 21.9812, "eval_samples_per_second": 227.467, "eval_steps_per_second": 1.228, "step": 60117 }, { "epoch": 2.264033735837553, "grad_norm": 0.1896529346704483, "learning_rate": 8.970457341383011e-05, "loss": 1.4413, "step": 60120 }, { "epoch": 2.2645756770030654, "grad_norm": 0.1765110045671463, "learning_rate": 8.969448903025008e-05, "loss": 1.445, "step": 60130 }, { "epoch": 2.2651176181685777, "grad_norm": 0.19842223823070526, "learning_rate": 8.96844003488674e-05, "loss": 1.4459, "step": 60140 }, { "epoch": 2.2656595593340896, "grad_norm": 0.23995903134346008, "learning_rate": 8.967430737093179e-05, "loss": 1.4476, "step": 60150 }, { "epoch": 2.266201500499602, "grad_norm": 0.3694940209388733, "learning_rate": 8.966421009769352e-05, "loss": 1.4348, "step": 60160 }, { "epoch": 2.266743441665114, "grad_norm": 0.27862706780433655, "learning_rate": 8.965410853040338e-05, "loss": 1.4425, "step": 60170 }, { "epoch": 2.2672853828306265, "grad_norm": 0.2391398549079895, "learning_rate": 8.964400267031274e-05, "loss": 1.4437, "step": 60180 }, { "epoch": 2.267827323996139, "grad_norm": 0.17560423910617828, "learning_rate": 8.963389251867346e-05, "loss": 1.4396, "step": 60190 }, { "epoch": 2.2683692651616507, "grad_norm": 0.2710578143596649, "learning_rate": 8.962377807673795e-05, "loss": 1.4407, "step": 60200 }, { "epoch": 2.268586041627856, "eval_loss": 2.433633327484131, "eval_runtime": 21.9783, "eval_samples_per_second": 227.497, "eval_steps_per_second": 1.228, "step": 60204 }, { "epoch": 2.268911206327163, "grad_norm": 0.2406301647424698, "learning_rate": 8.961365934575913e-05, "loss": 1.4453, "step": 60210 }, { "epoch": 2.2694531474926753, "grad_norm": 0.48485299944877625, "learning_rate": 8.960353632699046e-05, "loss": 1.4345, "step": 60220 }, { "epoch": 2.2699950886581877, "grad_norm": 0.270643025636673, "learning_rate": 8.959340902168594e-05, "loss": 1.4527, "step": 60230 }, { "epoch": 2.2705370298237, "grad_norm": 0.22174952924251556, "learning_rate": 8.95832774311001e-05, "loss": 1.4415, "step": 60240 }, { "epoch": 2.271078970989212, "grad_norm": 0.4398956298828125, "learning_rate": 8.957314155648801e-05, "loss": 1.4397, "step": 60250 }, { "epoch": 2.271620912154724, "grad_norm": 0.2305718958377838, "learning_rate": 8.956300139910525e-05, "loss": 1.4484, "step": 60260 }, { "epoch": 2.2721628533202365, "grad_norm": 0.2533299922943115, "learning_rate": 8.955285696020793e-05, "loss": 1.4524, "step": 60270 }, { "epoch": 2.272704794485749, "grad_norm": 0.20475125312805176, "learning_rate": 8.954270824105268e-05, "loss": 1.4427, "step": 60280 }, { "epoch": 2.2732467356512607, "grad_norm": 0.20677226781845093, "learning_rate": 8.953255524289671e-05, "loss": 1.4397, "step": 60290 }, { "epoch": 2.273300929767812, "eval_loss": 2.436509132385254, "eval_runtime": 22.2954, "eval_samples_per_second": 224.261, "eval_steps_per_second": 1.211, "step": 60291 }, { "epoch": 2.273788676816773, "grad_norm": 0.21641433238983154, "learning_rate": 8.952239796699774e-05, "loss": 1.4478, "step": 60300 }, { "epoch": 2.2743306179822853, "grad_norm": 0.18868723511695862, "learning_rate": 8.951223641461398e-05, "loss": 1.4441, "step": 60310 }, { "epoch": 2.2748725591477976, "grad_norm": 0.22199149429798126, "learning_rate": 8.950207058700422e-05, "loss": 1.4376, "step": 60320 }, { "epoch": 2.2754145003133095, "grad_norm": 0.21569089591503143, "learning_rate": 8.949190048542773e-05, "loss": 1.4365, "step": 60330 }, { "epoch": 2.275956441478822, "grad_norm": 0.2327711433172226, "learning_rate": 8.948172611114438e-05, "loss": 1.4375, "step": 60340 }, { "epoch": 2.276498382644334, "grad_norm": 0.25674811005592346, "learning_rate": 8.947154746541448e-05, "loss": 1.442, "step": 60350 }, { "epoch": 2.2770403238098464, "grad_norm": 0.26655909419059753, "learning_rate": 8.946136454949895e-05, "loss": 1.4425, "step": 60360 }, { "epoch": 2.2775822649753588, "grad_norm": 0.2611353099346161, "learning_rate": 8.94511773646592e-05, "loss": 1.4407, "step": 60370 }, { "epoch": 2.2780158179077685, "eval_loss": 2.4322497844696045, "eval_runtime": 21.9808, "eval_samples_per_second": 227.471, "eval_steps_per_second": 1.228, "step": 60378 }, { "epoch": 2.2781242061408706, "grad_norm": 0.22814051806926727, "learning_rate": 8.944098591215717e-05, "loss": 1.4372, "step": 60380 }, { "epoch": 2.278666147306383, "grad_norm": 0.3651120066642761, "learning_rate": 8.943079019325533e-05, "loss": 1.4415, "step": 60390 }, { "epoch": 2.2792080884718953, "grad_norm": 0.3389890193939209, "learning_rate": 8.94205902092167e-05, "loss": 1.4422, "step": 60400 }, { "epoch": 2.2797500296374076, "grad_norm": 0.32138633728027344, "learning_rate": 8.941038596130477e-05, "loss": 1.428, "step": 60410 }, { "epoch": 2.28029197080292, "grad_norm": 0.2286197692155838, "learning_rate": 8.940017745078363e-05, "loss": 1.4454, "step": 60420 }, { "epoch": 2.2808339119684318, "grad_norm": 0.26349860429763794, "learning_rate": 8.938996467891786e-05, "loss": 1.4401, "step": 60430 }, { "epoch": 2.281375853133944, "grad_norm": 0.1832720935344696, "learning_rate": 8.937974764697258e-05, "loss": 1.451, "step": 60440 }, { "epoch": 2.2819177942994564, "grad_norm": 0.18660111725330353, "learning_rate": 8.936952635621341e-05, "loss": 1.4453, "step": 60450 }, { "epoch": 2.2824597354649687, "grad_norm": 0.22778889536857605, "learning_rate": 8.935930080790655e-05, "loss": 1.4451, "step": 60460 }, { "epoch": 2.2827307060477247, "eval_loss": 2.4412660598754883, "eval_runtime": 21.9845, "eval_samples_per_second": 227.433, "eval_steps_per_second": 1.228, "step": 60465 }, { "epoch": 2.283001676630481, "grad_norm": 0.4049639105796814, "learning_rate": 8.934907100331865e-05, "loss": 1.4458, "step": 60470 }, { "epoch": 2.283543617795993, "grad_norm": 0.20592905580997467, "learning_rate": 8.933883694371699e-05, "loss": 1.441, "step": 60480 }, { "epoch": 2.2840855589615052, "grad_norm": 0.5759056806564331, "learning_rate": 8.932859863036927e-05, "loss": 1.4342, "step": 60490 }, { "epoch": 2.2846275001270175, "grad_norm": 0.3856758177280426, "learning_rate": 8.931835606454382e-05, "loss": 1.4307, "step": 60500 }, { "epoch": 2.28516944129253, "grad_norm": 0.24121706187725067, "learning_rate": 8.930810924750939e-05, "loss": 1.4466, "step": 60510 }, { "epoch": 2.2857113824580417, "grad_norm": 0.3475324511528015, "learning_rate": 8.929785818053534e-05, "loss": 1.4371, "step": 60520 }, { "epoch": 2.286253323623554, "grad_norm": 0.2951948642730713, "learning_rate": 8.928760286489155e-05, "loss": 1.4513, "step": 60530 }, { "epoch": 2.2867952647890664, "grad_norm": 0.27565380930900574, "learning_rate": 8.927734330184835e-05, "loss": 1.4397, "step": 60540 }, { "epoch": 2.2873372059545787, "grad_norm": 0.22539940476417542, "learning_rate": 8.92670794926767e-05, "loss": 1.4414, "step": 60550 }, { "epoch": 2.2874455941876812, "eval_loss": 2.4488258361816406, "eval_runtime": 21.9792, "eval_samples_per_second": 227.487, "eval_steps_per_second": 1.228, "step": 60552 }, { "epoch": 2.2878791471200906, "grad_norm": 0.2256643921136856, "learning_rate": 8.9256811438648e-05, "loss": 1.4383, "step": 60560 }, { "epoch": 2.288421088285603, "grad_norm": 0.23651210963726044, "learning_rate": 8.924653914103423e-05, "loss": 1.44, "step": 60570 }, { "epoch": 2.288963029451115, "grad_norm": 0.2763255834579468, "learning_rate": 8.923626260110789e-05, "loss": 1.4377, "step": 60580 }, { "epoch": 2.2895049706166275, "grad_norm": 0.34174492955207825, "learning_rate": 8.922598182014197e-05, "loss": 1.4544, "step": 60590 }, { "epoch": 2.29004691178214, "grad_norm": 0.26863643527030945, "learning_rate": 8.921569679941e-05, "loss": 1.4435, "step": 60600 }, { "epoch": 2.2905888529476517, "grad_norm": 0.24815820157527924, "learning_rate": 8.920540754018608e-05, "loss": 1.4385, "step": 60610 }, { "epoch": 2.291130794113164, "grad_norm": 0.3145918548107147, "learning_rate": 8.919511404374476e-05, "loss": 1.446, "step": 60620 }, { "epoch": 2.2916727352786763, "grad_norm": 0.2556162178516388, "learning_rate": 8.918481631136119e-05, "loss": 1.4419, "step": 60630 }, { "epoch": 2.2921604823276374, "eval_loss": 2.4316673278808594, "eval_runtime": 21.9778, "eval_samples_per_second": 227.503, "eval_steps_per_second": 1.229, "step": 60639 }, { "epoch": 2.2922146764441886, "grad_norm": 0.32888561487197876, "learning_rate": 8.9174514344311e-05, "loss": 1.4446, "step": 60640 }, { "epoch": 2.292756617609701, "grad_norm": 0.289276123046875, "learning_rate": 8.916420814387033e-05, "loss": 1.4418, "step": 60650 }, { "epoch": 2.293298558775213, "grad_norm": 0.4594111740589142, "learning_rate": 8.915389771131587e-05, "loss": 1.4414, "step": 60660 }, { "epoch": 2.293840499940725, "grad_norm": 0.2204999476671219, "learning_rate": 8.914358304792486e-05, "loss": 1.4378, "step": 60670 }, { "epoch": 2.2943824411062375, "grad_norm": 0.22972345352172852, "learning_rate": 8.913326415497501e-05, "loss": 1.4386, "step": 60680 }, { "epoch": 2.2949243822717498, "grad_norm": 0.2072676122188568, "learning_rate": 8.912294103374459e-05, "loss": 1.4439, "step": 60690 }, { "epoch": 2.295466323437262, "grad_norm": 0.18969523906707764, "learning_rate": 8.911261368551238e-05, "loss": 1.4416, "step": 60700 }, { "epoch": 2.296008264602774, "grad_norm": 0.24862347543239594, "learning_rate": 8.910228211155771e-05, "loss": 1.4567, "step": 60710 }, { "epoch": 2.2965502057682863, "grad_norm": 0.3798186779022217, "learning_rate": 8.909194631316035e-05, "loss": 1.4435, "step": 60720 }, { "epoch": 2.2968753704675935, "eval_loss": 2.450904369354248, "eval_runtime": 21.9806, "eval_samples_per_second": 227.474, "eval_steps_per_second": 1.228, "step": 60726 }, { "epoch": 2.2970921469337986, "grad_norm": 0.32553133368492126, "learning_rate": 8.908160629160073e-05, "loss": 1.4286, "step": 60730 }, { "epoch": 2.297634088099311, "grad_norm": 0.3055340647697449, "learning_rate": 8.907126204815966e-05, "loss": 1.4405, "step": 60740 }, { "epoch": 2.298176029264823, "grad_norm": 0.28653863072395325, "learning_rate": 8.906091358411858e-05, "loss": 1.4405, "step": 60750 }, { "epoch": 2.298717970430335, "grad_norm": 0.2887609004974365, "learning_rate": 8.905056090075942e-05, "loss": 1.4377, "step": 60760 }, { "epoch": 2.2992599115958474, "grad_norm": 0.23055769503116608, "learning_rate": 8.90402039993646e-05, "loss": 1.4483, "step": 60770 }, { "epoch": 2.2998018527613597, "grad_norm": 0.20625241100788116, "learning_rate": 8.90298428812171e-05, "loss": 1.4344, "step": 60780 }, { "epoch": 2.3003437939268716, "grad_norm": 0.1865055412054062, "learning_rate": 8.901947754760039e-05, "loss": 1.4477, "step": 60790 }, { "epoch": 2.300885735092384, "grad_norm": 0.36700624227523804, "learning_rate": 8.90091079997985e-05, "loss": 1.4487, "step": 60800 }, { "epoch": 2.3014276762578962, "grad_norm": 0.2842310667037964, "learning_rate": 8.899873423909599e-05, "loss": 1.4478, "step": 60810 }, { "epoch": 2.30159025860755, "eval_loss": 2.4509172439575195, "eval_runtime": 35.9865, "eval_samples_per_second": 138.941, "eval_steps_per_second": 0.75, "step": 60813 }, { "epoch": 2.3019696174234086, "grad_norm": 0.24928165972232819, "learning_rate": 8.898835626677786e-05, "loss": 1.4322, "step": 60820 }, { "epoch": 2.302511558588921, "grad_norm": 0.22808949649333954, "learning_rate": 8.897797408412973e-05, "loss": 1.4377, "step": 60830 }, { "epoch": 2.3030534997544327, "grad_norm": 0.22761550545692444, "learning_rate": 8.896758769243769e-05, "loss": 1.4548, "step": 60840 }, { "epoch": 2.303595440919945, "grad_norm": 0.24886593222618103, "learning_rate": 8.895719709298837e-05, "loss": 1.4398, "step": 60850 }, { "epoch": 2.3041373820854574, "grad_norm": 0.1771778017282486, "learning_rate": 8.894680228706888e-05, "loss": 1.4439, "step": 60860 }, { "epoch": 2.3046793232509697, "grad_norm": 0.28259819746017456, "learning_rate": 8.893640327596692e-05, "loss": 1.4408, "step": 60870 }, { "epoch": 2.305221264416482, "grad_norm": 0.23001182079315186, "learning_rate": 8.892600006097065e-05, "loss": 1.4418, "step": 60880 }, { "epoch": 2.305763205581994, "grad_norm": 0.21615125238895416, "learning_rate": 8.891559264336879e-05, "loss": 1.4419, "step": 60890 }, { "epoch": 2.306305146747506, "grad_norm": 0.2845569849014282, "learning_rate": 8.890518102445055e-05, "loss": 1.4417, "step": 60900 }, { "epoch": 2.306305146747506, "eval_loss": 2.452852487564087, "eval_runtime": 21.8629, "eval_samples_per_second": 228.698, "eval_steps_per_second": 1.235, "step": 60900 }, { "epoch": 2.3068470879130185, "grad_norm": 0.21112145483493805, "learning_rate": 8.88947652055057e-05, "loss": 1.4355, "step": 60910 }, { "epoch": 2.307389029078531, "grad_norm": 0.2809900641441345, "learning_rate": 8.888434518782449e-05, "loss": 1.441, "step": 60920 }, { "epoch": 2.3079309702440427, "grad_norm": 0.19691738486289978, "learning_rate": 8.88739209726977e-05, "loss": 1.4439, "step": 60930 }, { "epoch": 2.308472911409555, "grad_norm": 0.5515458583831787, "learning_rate": 8.886349256141665e-05, "loss": 1.4473, "step": 60940 }, { "epoch": 2.3090148525750673, "grad_norm": 0.34075281023979187, "learning_rate": 8.885305995527317e-05, "loss": 1.4532, "step": 60950 }, { "epoch": 2.3095567937405796, "grad_norm": 0.2703841030597687, "learning_rate": 8.884262315555958e-05, "loss": 1.4428, "step": 60960 }, { "epoch": 2.3100987349060915, "grad_norm": 0.24917836487293243, "learning_rate": 8.88321821635688e-05, "loss": 1.4448, "step": 60970 }, { "epoch": 2.310640676071604, "grad_norm": 0.22919951379299164, "learning_rate": 8.882173698059414e-05, "loss": 1.4484, "step": 60980 }, { "epoch": 2.3110200348874628, "eval_loss": 2.4533286094665527, "eval_runtime": 21.9805, "eval_samples_per_second": 227.474, "eval_steps_per_second": 1.228, "step": 60987 }, { "epoch": 2.311182617237116, "grad_norm": 0.2503993511199951, "learning_rate": 8.881128760792957e-05, "loss": 1.4377, "step": 60990 }, { "epoch": 2.3117245584026285, "grad_norm": 0.2338937520980835, "learning_rate": 8.880083404686947e-05, "loss": 1.4339, "step": 61000 }, { "epoch": 2.312266499568141, "grad_norm": 0.2059098482131958, "learning_rate": 8.879037629870878e-05, "loss": 1.4293, "step": 61010 }, { "epoch": 2.3128084407336527, "grad_norm": 0.20788943767547607, "learning_rate": 8.8779914364743e-05, "loss": 1.429, "step": 61020 }, { "epoch": 2.313350381899165, "grad_norm": 0.2890428900718689, "learning_rate": 8.87694482462681e-05, "loss": 1.4375, "step": 61030 }, { "epoch": 2.3138923230646773, "grad_norm": 0.235799640417099, "learning_rate": 8.875897794458053e-05, "loss": 1.4417, "step": 61040 }, { "epoch": 2.3144342642301896, "grad_norm": 0.39794206619262695, "learning_rate": 8.874850346097736e-05, "loss": 1.4408, "step": 61050 }, { "epoch": 2.314976205395702, "grad_norm": 0.46668577194213867, "learning_rate": 8.87380247967561e-05, "loss": 1.4364, "step": 61060 }, { "epoch": 2.315518146561214, "grad_norm": 0.21078045666217804, "learning_rate": 8.872754195321479e-05, "loss": 1.4361, "step": 61070 }, { "epoch": 2.315734923027419, "eval_loss": 2.4492809772491455, "eval_runtime": 21.9892, "eval_samples_per_second": 227.384, "eval_steps_per_second": 1.228, "step": 61074 }, { "epoch": 2.316060087726726, "grad_norm": 0.27754735946655273, "learning_rate": 8.871705493165203e-05, "loss": 1.4314, "step": 61080 }, { "epoch": 2.3166020288922384, "grad_norm": 0.22259338200092316, "learning_rate": 8.870656373336689e-05, "loss": 1.4417, "step": 61090 }, { "epoch": 2.3171439700577507, "grad_norm": 0.21421173214912415, "learning_rate": 8.869606835965896e-05, "loss": 1.4432, "step": 61100 }, { "epoch": 2.317685911223263, "grad_norm": 0.2673332691192627, "learning_rate": 8.868556881182838e-05, "loss": 1.4422, "step": 61110 }, { "epoch": 2.318227852388775, "grad_norm": 0.18308067321777344, "learning_rate": 8.867506509117578e-05, "loss": 1.426, "step": 61120 }, { "epoch": 2.3187697935542873, "grad_norm": 0.3170804977416992, "learning_rate": 8.866455719900234e-05, "loss": 1.4377, "step": 61130 }, { "epoch": 2.3193117347197996, "grad_norm": 0.28330197930336, "learning_rate": 8.865404513660968e-05, "loss": 1.437, "step": 61140 }, { "epoch": 2.319853675885312, "grad_norm": 0.17759016156196594, "learning_rate": 8.864352890530005e-05, "loss": 1.439, "step": 61150 }, { "epoch": 2.3203956170508238, "grad_norm": 0.33059704303741455, "learning_rate": 8.863300850637611e-05, "loss": 1.4346, "step": 61160 }, { "epoch": 2.320449811167375, "eval_loss": 2.450498580932617, "eval_runtime": 21.984, "eval_samples_per_second": 227.438, "eval_steps_per_second": 1.228, "step": 61161 }, { "epoch": 2.320937558216336, "grad_norm": 0.26688456535339355, "learning_rate": 8.862248394114111e-05, "loss": 1.4378, "step": 61170 }, { "epoch": 2.3214794993818484, "grad_norm": 0.25356730818748474, "learning_rate": 8.861195521089875e-05, "loss": 1.439, "step": 61180 }, { "epoch": 2.3220214405473607, "grad_norm": 0.33288466930389404, "learning_rate": 8.860142231695331e-05, "loss": 1.4367, "step": 61190 }, { "epoch": 2.3225633817128726, "grad_norm": 0.22826845943927765, "learning_rate": 8.859088526060957e-05, "loss": 1.4443, "step": 61200 }, { "epoch": 2.323105322878385, "grad_norm": 0.214629128575325, "learning_rate": 8.85803440431728e-05, "loss": 1.4391, "step": 61210 }, { "epoch": 2.323647264043897, "grad_norm": 0.21806223690509796, "learning_rate": 8.856979866594879e-05, "loss": 1.4335, "step": 61220 }, { "epoch": 2.3241892052094095, "grad_norm": 0.20262044668197632, "learning_rate": 8.855924913024388e-05, "loss": 1.4399, "step": 61230 }, { "epoch": 2.324731146374922, "grad_norm": 0.267966628074646, "learning_rate": 8.854869543736487e-05, "loss": 1.4412, "step": 61240 }, { "epoch": 2.3251646993073316, "eval_loss": 2.4549691677093506, "eval_runtime": 21.9814, "eval_samples_per_second": 227.465, "eval_steps_per_second": 1.228, "step": 61248 }, { "epoch": 2.3252730875404337, "grad_norm": 0.23619084060192108, "learning_rate": 8.853813758861915e-05, "loss": 1.442, "step": 61250 }, { "epoch": 2.325815028705946, "grad_norm": 0.4099680185317993, "learning_rate": 8.852757558531453e-05, "loss": 1.4374, "step": 61260 }, { "epoch": 2.3263569698714583, "grad_norm": 0.20198297500610352, "learning_rate": 8.851700942875943e-05, "loss": 1.4314, "step": 61270 }, { "epoch": 2.3268989110369707, "grad_norm": 0.22473087906837463, "learning_rate": 8.850643912026269e-05, "loss": 1.4323, "step": 61280 }, { "epoch": 2.327440852202483, "grad_norm": 0.19374053180217743, "learning_rate": 8.849586466113376e-05, "loss": 1.4348, "step": 61290 }, { "epoch": 2.327982793367995, "grad_norm": 0.20663167536258698, "learning_rate": 8.848528605268255e-05, "loss": 1.4335, "step": 61300 }, { "epoch": 2.328524734533507, "grad_norm": 0.2488778531551361, "learning_rate": 8.847470329621945e-05, "loss": 1.4479, "step": 61310 }, { "epoch": 2.3290666756990195, "grad_norm": 0.181378573179245, "learning_rate": 8.846411639305546e-05, "loss": 1.4368, "step": 61320 }, { "epoch": 2.329608616864532, "grad_norm": 0.2096365988254547, "learning_rate": 8.845352534450202e-05, "loss": 1.4265, "step": 61330 }, { "epoch": 2.3298795874472877, "eval_loss": 2.452427625656128, "eval_runtime": 21.9811, "eval_samples_per_second": 227.468, "eval_steps_per_second": 1.228, "step": 61335 }, { "epoch": 2.3301505580300437, "grad_norm": 0.3072015345096588, "learning_rate": 8.84429301518711e-05, "loss": 1.4511, "step": 61340 }, { "epoch": 2.330692499195556, "grad_norm": 0.21584992110729218, "learning_rate": 8.843233081647519e-05, "loss": 1.4421, "step": 61350 }, { "epoch": 2.3312344403610683, "grad_norm": 0.5617396831512451, "learning_rate": 8.842172733962727e-05, "loss": 1.4423, "step": 61360 }, { "epoch": 2.3317763815265806, "grad_norm": 0.194200336933136, "learning_rate": 8.841111972264088e-05, "loss": 1.4333, "step": 61370 }, { "epoch": 2.3323183226920925, "grad_norm": 0.457123339176178, "learning_rate": 8.840050796683005e-05, "loss": 1.441, "step": 61380 }, { "epoch": 2.332860263857605, "grad_norm": 0.24595646560192108, "learning_rate": 8.838989207350928e-05, "loss": 1.4481, "step": 61390 }, { "epoch": 2.333402205023117, "grad_norm": 0.22267189621925354, "learning_rate": 8.837927204399367e-05, "loss": 1.4456, "step": 61400 }, { "epoch": 2.3339441461886294, "grad_norm": 0.18372011184692383, "learning_rate": 8.836864787959873e-05, "loss": 1.4442, "step": 61410 }, { "epoch": 2.3344860873541418, "grad_norm": 0.18260882794857025, "learning_rate": 8.835801958164059e-05, "loss": 1.4382, "step": 61420 }, { "epoch": 2.334594475587244, "eval_loss": 2.4512012004852295, "eval_runtime": 21.9808, "eval_samples_per_second": 227.471, "eval_steps_per_second": 1.228, "step": 61422 }, { "epoch": 2.3350280285196536, "grad_norm": 0.238543301820755, "learning_rate": 8.834738715143577e-05, "loss": 1.4325, "step": 61430 }, { "epoch": 2.335569969685166, "grad_norm": 0.47301405668258667, "learning_rate": 8.833675059030144e-05, "loss": 1.4422, "step": 61440 }, { "epoch": 2.3361119108506783, "grad_norm": 0.23695412278175354, "learning_rate": 8.832610989955518e-05, "loss": 1.4361, "step": 61450 }, { "epoch": 2.3366538520161906, "grad_norm": 0.23388248682022095, "learning_rate": 8.831546508051511e-05, "loss": 1.441, "step": 61460 }, { "epoch": 2.337195793181703, "grad_norm": 0.2939755320549011, "learning_rate": 8.830481613449985e-05, "loss": 1.4408, "step": 61470 }, { "epoch": 2.3377377343472148, "grad_norm": 0.3122168779373169, "learning_rate": 8.82941630628286e-05, "loss": 1.4366, "step": 61480 }, { "epoch": 2.338279675512727, "grad_norm": 0.4362255036830902, "learning_rate": 8.828350586682096e-05, "loss": 1.4309, "step": 61490 }, { "epoch": 2.3388216166782394, "grad_norm": 0.3685493469238281, "learning_rate": 8.82728445477971e-05, "loss": 1.4369, "step": 61500 }, { "epoch": 2.3393093637272004, "eval_loss": 2.454096555709839, "eval_runtime": 21.9765, "eval_samples_per_second": 227.516, "eval_steps_per_second": 1.229, "step": 61509 }, { "epoch": 2.3393635578437517, "grad_norm": 0.3012941777706146, "learning_rate": 8.826217910707774e-05, "loss": 1.4327, "step": 61510 }, { "epoch": 2.339905499009264, "grad_norm": 0.4149247407913208, "learning_rate": 8.825150954598403e-05, "loss": 1.4361, "step": 61520 }, { "epoch": 2.340447440174776, "grad_norm": 0.17556257545948029, "learning_rate": 8.824083586583767e-05, "loss": 1.4273, "step": 61530 }, { "epoch": 2.3409893813402882, "grad_norm": 0.42168980836868286, "learning_rate": 8.823015806796089e-05, "loss": 1.4306, "step": 61540 }, { "epoch": 2.3415313225058005, "grad_norm": 0.29091402888298035, "learning_rate": 8.82194761536764e-05, "loss": 1.4425, "step": 61550 }, { "epoch": 2.342073263671313, "grad_norm": 0.33587974309921265, "learning_rate": 8.820879012430742e-05, "loss": 1.4383, "step": 61560 }, { "epoch": 2.3426152048368247, "grad_norm": 0.2704252600669861, "learning_rate": 8.81980999811777e-05, "loss": 1.4397, "step": 61570 }, { "epoch": 2.343157146002337, "grad_norm": 0.21759971976280212, "learning_rate": 8.81874057256115e-05, "loss": 1.4436, "step": 61580 }, { "epoch": 2.3436990871678494, "grad_norm": 0.28754401206970215, "learning_rate": 8.817670735893356e-05, "loss": 1.4314, "step": 61590 }, { "epoch": 2.3440242518671566, "eval_loss": 2.4510679244995117, "eval_runtime": 21.9791, "eval_samples_per_second": 227.489, "eval_steps_per_second": 1.228, "step": 61596 }, { "epoch": 2.3442410283333617, "grad_norm": 0.39755532145500183, "learning_rate": 8.816600488246914e-05, "loss": 1.441, "step": 61600 }, { "epoch": 2.3447829694988735, "grad_norm": 0.27198687195777893, "learning_rate": 8.815529829754403e-05, "loss": 1.4492, "step": 61610 }, { "epoch": 2.345324910664386, "grad_norm": 0.2784078121185303, "learning_rate": 8.814458760548452e-05, "loss": 1.4396, "step": 61620 }, { "epoch": 2.345866851829898, "grad_norm": 0.205166295170784, "learning_rate": 8.81338728076174e-05, "loss": 1.4402, "step": 61630 }, { "epoch": 2.3464087929954105, "grad_norm": 0.23504230380058289, "learning_rate": 8.812315390526997e-05, "loss": 1.4417, "step": 61640 }, { "epoch": 2.346950734160923, "grad_norm": 0.21687854826450348, "learning_rate": 8.811243089977002e-05, "loss": 1.4379, "step": 61650 }, { "epoch": 2.3474926753264347, "grad_norm": 0.27661964297294617, "learning_rate": 8.810170379244591e-05, "loss": 1.446, "step": 61660 }, { "epoch": 2.348034616491947, "grad_norm": 0.23024803400039673, "learning_rate": 8.809097258462645e-05, "loss": 1.4403, "step": 61670 }, { "epoch": 2.3485765576574593, "grad_norm": 0.4340742230415344, "learning_rate": 8.808023727764095e-05, "loss": 1.439, "step": 61680 }, { "epoch": 2.348739140007113, "eval_loss": 2.4533681869506836, "eval_runtime": 21.9799, "eval_samples_per_second": 227.481, "eval_steps_per_second": 1.228, "step": 61683 }, { "epoch": 2.3491184988229716, "grad_norm": 0.2405279576778412, "learning_rate": 8.806949787281929e-05, "loss": 1.444, "step": 61690 }, { "epoch": 2.349660439988484, "grad_norm": 0.3794168531894684, "learning_rate": 8.805875437149182e-05, "loss": 1.4397, "step": 61700 }, { "epoch": 2.350202381153996, "grad_norm": 0.272239625453949, "learning_rate": 8.804800677498935e-05, "loss": 1.4343, "step": 61710 }, { "epoch": 2.350744322319508, "grad_norm": 0.20025727152824402, "learning_rate": 8.803725508464332e-05, "loss": 1.4387, "step": 61720 }, { "epoch": 2.3512862634850205, "grad_norm": 0.24284347891807556, "learning_rate": 8.802649930178553e-05, "loss": 1.4398, "step": 61730 }, { "epoch": 2.3518282046505328, "grad_norm": 0.20169271528720856, "learning_rate": 8.801573942774842e-05, "loss": 1.4471, "step": 61740 }, { "epoch": 2.352370145816045, "grad_norm": 0.2248934954404831, "learning_rate": 8.800497546386484e-05, "loss": 1.4281, "step": 61750 }, { "epoch": 2.352912086981557, "grad_norm": 0.2882291376590729, "learning_rate": 8.79942074114682e-05, "loss": 1.4445, "step": 61760 }, { "epoch": 2.3534540281470693, "grad_norm": 0.544728696346283, "learning_rate": 8.798343527189238e-05, "loss": 1.4378, "step": 61770 }, { "epoch": 2.3534540281470693, "eval_loss": 2.448328971862793, "eval_runtime": 21.7136, "eval_samples_per_second": 230.271, "eval_steps_per_second": 1.243, "step": 61770 }, { "epoch": 2.3539959693125816, "grad_norm": 0.3125400245189667, "learning_rate": 8.797265904647179e-05, "loss": 1.4284, "step": 61780 }, { "epoch": 2.354537910478094, "grad_norm": 0.3059818744659424, "learning_rate": 8.796187873654138e-05, "loss": 1.4296, "step": 61790 }, { "epoch": 2.355079851643606, "grad_norm": 0.2530340254306793, "learning_rate": 8.795109434343652e-05, "loss": 1.4302, "step": 61800 }, { "epoch": 2.355621792809118, "grad_norm": 0.2673911452293396, "learning_rate": 8.794030586849315e-05, "loss": 1.4459, "step": 61810 }, { "epoch": 2.3561637339746304, "grad_norm": 0.3139052391052246, "learning_rate": 8.792951331304771e-05, "loss": 1.4379, "step": 61820 }, { "epoch": 2.3567056751401427, "grad_norm": 0.19884108006954193, "learning_rate": 8.791871667843713e-05, "loss": 1.4348, "step": 61830 }, { "epoch": 2.3572476163056546, "grad_norm": 0.2218913733959198, "learning_rate": 8.790791596599882e-05, "loss": 1.4465, "step": 61840 }, { "epoch": 2.357789557471167, "grad_norm": 0.18263356387615204, "learning_rate": 8.789711117707078e-05, "loss": 1.4378, "step": 61850 }, { "epoch": 2.3581689162870254, "eval_loss": 2.4493706226348877, "eval_runtime": 21.983, "eval_samples_per_second": 227.448, "eval_steps_per_second": 1.228, "step": 61857 }, { "epoch": 2.3583314986366792, "grad_norm": 0.2500559091567993, "learning_rate": 8.788630231299142e-05, "loss": 1.4311, "step": 61860 }, { "epoch": 2.3588734398021916, "grad_norm": 0.27540549635887146, "learning_rate": 8.787548937509971e-05, "loss": 1.431, "step": 61870 }, { "epoch": 2.359415380967704, "grad_norm": 0.23163899779319763, "learning_rate": 8.786467236473511e-05, "loss": 1.4425, "step": 61880 }, { "epoch": 2.3599573221332157, "grad_norm": 0.19727367162704468, "learning_rate": 8.785385128323759e-05, "loss": 1.45, "step": 61890 }, { "epoch": 2.360499263298728, "grad_norm": 0.2537407875061035, "learning_rate": 8.784302613194758e-05, "loss": 1.4446, "step": 61900 }, { "epoch": 2.3610412044642404, "grad_norm": 0.208670973777771, "learning_rate": 8.783219691220611e-05, "loss": 1.4393, "step": 61910 }, { "epoch": 2.3615831456297527, "grad_norm": 0.18386749923229218, "learning_rate": 8.782136362535462e-05, "loss": 1.4468, "step": 61920 }, { "epoch": 2.362125086795265, "grad_norm": 0.19878651201725006, "learning_rate": 8.781052627273512e-05, "loss": 1.444, "step": 61930 }, { "epoch": 2.362667027960777, "grad_norm": 0.27927711606025696, "learning_rate": 8.779968485569004e-05, "loss": 1.4331, "step": 61940 }, { "epoch": 2.362883804426982, "eval_loss": 2.4520583152770996, "eval_runtime": 21.9818, "eval_samples_per_second": 227.461, "eval_steps_per_second": 1.228, "step": 61944 }, { "epoch": 2.363208969126289, "grad_norm": 0.5157132744789124, "learning_rate": 8.778883937556243e-05, "loss": 1.4434, "step": 61950 }, { "epoch": 2.3637509102918015, "grad_norm": 0.6511501669883728, "learning_rate": 8.777798983369573e-05, "loss": 1.4315, "step": 61960 }, { "epoch": 2.364292851457314, "grad_norm": 0.21835680305957794, "learning_rate": 8.776713623143397e-05, "loss": 1.4358, "step": 61970 }, { "epoch": 2.3648347926228257, "grad_norm": 0.1703735738992691, "learning_rate": 8.775627857012165e-05, "loss": 1.4404, "step": 61980 }, { "epoch": 2.365376733788338, "grad_norm": 0.17746131122112274, "learning_rate": 8.774541685110373e-05, "loss": 1.4322, "step": 61990 }, { "epoch": 2.3659186749538503, "grad_norm": 0.34184667468070984, "learning_rate": 8.773455107572574e-05, "loss": 1.4246, "step": 62000 }, { "epoch": 2.3664606161193626, "grad_norm": 0.2798058092594147, "learning_rate": 8.772368124533369e-05, "loss": 1.4387, "step": 62010 }, { "epoch": 2.3670025572848745, "grad_norm": 0.40121662616729736, "learning_rate": 8.771280736127407e-05, "loss": 1.4394, "step": 62020 }, { "epoch": 2.367544498450387, "grad_norm": 0.26767289638519287, "learning_rate": 8.770192942489392e-05, "loss": 1.4371, "step": 62030 }, { "epoch": 2.367598692566938, "eval_loss": 2.441128969192505, "eval_runtime": 21.982, "eval_samples_per_second": 227.459, "eval_steps_per_second": 1.228, "step": 62031 }, { "epoch": 2.368086439615899, "grad_norm": 0.32896319031715393, "learning_rate": 8.76910474375407e-05, "loss": 1.4328, "step": 62040 }, { "epoch": 2.3686283807814115, "grad_norm": 0.4783940017223358, "learning_rate": 8.768016140056247e-05, "loss": 1.4453, "step": 62050 }, { "epoch": 2.369170321946924, "grad_norm": 0.29626142978668213, "learning_rate": 8.766927131530774e-05, "loss": 1.4437, "step": 62060 }, { "epoch": 2.3697122631124357, "grad_norm": 0.31796202063560486, "learning_rate": 8.765837718312549e-05, "loss": 1.4328, "step": 62070 }, { "epoch": 2.370254204277948, "grad_norm": 0.6041905283927917, "learning_rate": 8.764747900536529e-05, "loss": 1.4374, "step": 62080 }, { "epoch": 2.3707961454434603, "grad_norm": 0.2997055649757385, "learning_rate": 8.76365767833771e-05, "loss": 1.4417, "step": 62090 }, { "epoch": 2.3713380866089726, "grad_norm": 0.23351405560970306, "learning_rate": 8.76256705185115e-05, "loss": 1.4327, "step": 62100 }, { "epoch": 2.371880027774485, "grad_norm": 0.18303707242012024, "learning_rate": 8.761476021211947e-05, "loss": 1.4324, "step": 62110 }, { "epoch": 2.3723135807068947, "eval_loss": 2.431164503097534, "eval_runtime": 21.98, "eval_samples_per_second": 227.48, "eval_steps_per_second": 1.228, "step": 62118 }, { "epoch": 2.372421968939997, "grad_norm": 0.17883995175361633, "learning_rate": 8.760384586555255e-05, "loss": 1.4199, "step": 62120 }, { "epoch": 2.372963910105509, "grad_norm": 0.1785135418176651, "learning_rate": 8.759292748016275e-05, "loss": 1.4307, "step": 62130 }, { "epoch": 2.3735058512710214, "grad_norm": 0.3247034549713135, "learning_rate": 8.75820050573026e-05, "loss": 1.4367, "step": 62140 }, { "epoch": 2.3740477924365337, "grad_norm": 0.2257276326417923, "learning_rate": 8.757107859832512e-05, "loss": 1.4524, "step": 62150 }, { "epoch": 2.374589733602046, "grad_norm": 0.34088021516799927, "learning_rate": 8.756014810458384e-05, "loss": 1.4359, "step": 62160 }, { "epoch": 2.375131674767558, "grad_norm": 0.2236095666885376, "learning_rate": 8.754921357743277e-05, "loss": 1.439, "step": 62170 }, { "epoch": 2.3756736159330702, "grad_norm": 0.2399052530527115, "learning_rate": 8.753827501822643e-05, "loss": 1.4326, "step": 62180 }, { "epoch": 2.3762155570985826, "grad_norm": 0.18329595029354095, "learning_rate": 8.752733242831985e-05, "loss": 1.4373, "step": 62190 }, { "epoch": 2.376757498264095, "grad_norm": 0.2353314459323883, "learning_rate": 8.751638580906856e-05, "loss": 1.441, "step": 62200 }, { "epoch": 2.377028468846851, "eval_loss": 2.4490318298339844, "eval_runtime": 21.9823, "eval_samples_per_second": 227.456, "eval_steps_per_second": 1.228, "step": 62205 }, { "epoch": 2.3772994394296068, "grad_norm": 0.30364421010017395, "learning_rate": 8.750543516182855e-05, "loss": 1.4362, "step": 62210 }, { "epoch": 2.377841380595119, "grad_norm": 0.1798836588859558, "learning_rate": 8.749448048795637e-05, "loss": 1.4259, "step": 62220 }, { "epoch": 2.3783833217606314, "grad_norm": 0.24086004495620728, "learning_rate": 8.748352178880902e-05, "loss": 1.4362, "step": 62230 }, { "epoch": 2.3789252629261437, "grad_norm": 0.23006141185760498, "learning_rate": 8.747255906574402e-05, "loss": 1.4397, "step": 62240 }, { "epoch": 2.3794672040916556, "grad_norm": 0.23331311345100403, "learning_rate": 8.74615923201194e-05, "loss": 1.4479, "step": 62250 }, { "epoch": 2.380009145257168, "grad_norm": 0.2259208709001541, "learning_rate": 8.745062155329363e-05, "loss": 1.4515, "step": 62260 }, { "epoch": 2.38055108642268, "grad_norm": 0.3277144730091095, "learning_rate": 8.743964676662576e-05, "loss": 1.4405, "step": 62270 }, { "epoch": 2.3810930275881925, "grad_norm": 0.25958386063575745, "learning_rate": 8.742866796147528e-05, "loss": 1.4254, "step": 62280 }, { "epoch": 2.381634968753705, "grad_norm": 0.2229194939136505, "learning_rate": 8.741768513920222e-05, "loss": 1.426, "step": 62290 }, { "epoch": 2.381743356986807, "eval_loss": 2.435068368911743, "eval_runtime": 21.9789, "eval_samples_per_second": 227.491, "eval_steps_per_second": 1.228, "step": 62292 }, { "epoch": 2.3821769099192167, "grad_norm": 0.4171644449234009, "learning_rate": 8.740669830116706e-05, "loss": 1.4468, "step": 62300 }, { "epoch": 2.382718851084729, "grad_norm": 0.17529958486557007, "learning_rate": 8.739570744873081e-05, "loss": 1.4307, "step": 62310 }, { "epoch": 2.3832607922502413, "grad_norm": 0.25872156023979187, "learning_rate": 8.738471258325499e-05, "loss": 1.4456, "step": 62320 }, { "epoch": 2.3838027334157537, "grad_norm": 0.18120713531970978, "learning_rate": 8.737371370610156e-05, "loss": 1.4495, "step": 62330 }, { "epoch": 2.384344674581266, "grad_norm": 0.3255040645599365, "learning_rate": 8.736271081863302e-05, "loss": 1.4378, "step": 62340 }, { "epoch": 2.384886615746778, "grad_norm": 0.22683602571487427, "learning_rate": 8.735170392221237e-05, "loss": 1.4417, "step": 62350 }, { "epoch": 2.38542855691229, "grad_norm": 0.5052164196968079, "learning_rate": 8.73406930182031e-05, "loss": 1.4369, "step": 62360 }, { "epoch": 2.3859704980778025, "grad_norm": 0.27900445461273193, "learning_rate": 8.732967810796918e-05, "loss": 1.4283, "step": 62370 }, { "epoch": 2.3864582451267635, "eval_loss": 2.445622682571411, "eval_runtime": 21.9893, "eval_samples_per_second": 227.383, "eval_steps_per_second": 1.228, "step": 62379 }, { "epoch": 2.386512439243315, "grad_norm": 0.30575114488601685, "learning_rate": 8.73186591928751e-05, "loss": 1.438, "step": 62380 }, { "epoch": 2.387054380408827, "grad_norm": 0.2561473846435547, "learning_rate": 8.730763627428585e-05, "loss": 1.4324, "step": 62390 }, { "epoch": 2.387596321574339, "grad_norm": 0.2688450515270233, "learning_rate": 8.729660935356686e-05, "loss": 1.4184, "step": 62400 }, { "epoch": 2.3881382627398513, "grad_norm": 0.3544863164424896, "learning_rate": 8.728557843208413e-05, "loss": 1.4436, "step": 62410 }, { "epoch": 2.3886802039053636, "grad_norm": 0.30758488178253174, "learning_rate": 8.727454351120413e-05, "loss": 1.427, "step": 62420 }, { "epoch": 2.389222145070876, "grad_norm": 0.41708388924598694, "learning_rate": 8.726350459229379e-05, "loss": 1.4349, "step": 62430 }, { "epoch": 2.389764086236388, "grad_norm": 0.5974102020263672, "learning_rate": 8.725246167672057e-05, "loss": 1.4272, "step": 62440 }, { "epoch": 2.3903060274019, "grad_norm": 0.19719624519348145, "learning_rate": 8.724141476585244e-05, "loss": 1.4359, "step": 62450 }, { "epoch": 2.3908479685674124, "grad_norm": 0.3106327950954437, "learning_rate": 8.723036386105781e-05, "loss": 1.4333, "step": 62460 }, { "epoch": 2.3911731332667197, "eval_loss": 2.4473304748535156, "eval_runtime": 21.9738, "eval_samples_per_second": 227.544, "eval_steps_per_second": 1.229, "step": 62466 }, { "epoch": 2.3913899097329248, "grad_norm": 0.2692268192768097, "learning_rate": 8.721930896370563e-05, "loss": 1.421, "step": 62470 }, { "epoch": 2.3919318508984366, "grad_norm": 0.2162933647632599, "learning_rate": 8.720825007516535e-05, "loss": 1.4401, "step": 62480 }, { "epoch": 2.392473792063949, "grad_norm": 0.3203500509262085, "learning_rate": 8.719718719680687e-05, "loss": 1.4334, "step": 62490 }, { "epoch": 2.3930157332294613, "grad_norm": 0.3285951018333435, "learning_rate": 8.718612033000066e-05, "loss": 1.4321, "step": 62500 }, { "epoch": 2.3935576743949736, "grad_norm": 0.4296458065509796, "learning_rate": 8.717504947611757e-05, "loss": 1.4345, "step": 62510 }, { "epoch": 2.394099615560486, "grad_norm": 0.1817467212677002, "learning_rate": 8.716397463652907e-05, "loss": 1.4386, "step": 62520 }, { "epoch": 2.3946415567259978, "grad_norm": 0.3981388807296753, "learning_rate": 8.7152895812607e-05, "loss": 1.4317, "step": 62530 }, { "epoch": 2.39518349789151, "grad_norm": 0.2751355767250061, "learning_rate": 8.71418130057238e-05, "loss": 1.4464, "step": 62540 }, { "epoch": 2.3957254390570224, "grad_norm": 0.2315666377544403, "learning_rate": 8.713072621725235e-05, "loss": 1.4424, "step": 62550 }, { "epoch": 2.3958880214066762, "eval_loss": 2.4334030151367188, "eval_runtime": 22.0386, "eval_samples_per_second": 226.874, "eval_steps_per_second": 1.225, "step": 62553 }, { "epoch": 2.3962673802225347, "grad_norm": 0.2237350344657898, "learning_rate": 8.711963544856606e-05, "loss": 1.4372, "step": 62560 }, { "epoch": 2.396809321388047, "grad_norm": 0.17813153564929962, "learning_rate": 8.710854070103876e-05, "loss": 1.4269, "step": 62570 }, { "epoch": 2.397351262553559, "grad_norm": 0.2021496742963791, "learning_rate": 8.709744197604483e-05, "loss": 1.4307, "step": 62580 }, { "epoch": 2.397893203719071, "grad_norm": 0.1962280124425888, "learning_rate": 8.708633927495916e-05, "loss": 1.4401, "step": 62590 }, { "epoch": 2.3984351448845835, "grad_norm": 0.29893505573272705, "learning_rate": 8.707523259915707e-05, "loss": 1.4374, "step": 62600 }, { "epoch": 2.398977086050096, "grad_norm": 0.24217230081558228, "learning_rate": 8.706412195001444e-05, "loss": 1.4346, "step": 62610 }, { "epoch": 2.3995190272156077, "grad_norm": 0.24626600742340088, "learning_rate": 8.705300732890756e-05, "loss": 1.4344, "step": 62620 }, { "epoch": 2.40006096838112, "grad_norm": 0.17191511392593384, "learning_rate": 8.704188873721332e-05, "loss": 1.443, "step": 62630 }, { "epoch": 2.4006029095466324, "grad_norm": 0.20966501533985138, "learning_rate": 8.703076617630901e-05, "loss": 1.4345, "step": 62640 }, { "epoch": 2.4006029095466324, "eval_loss": 2.4349613189697266, "eval_runtime": 21.971, "eval_samples_per_second": 227.572, "eval_steps_per_second": 1.229, "step": 62640 }, { "epoch": 2.4011448507121447, "grad_norm": 0.190885990858078, "learning_rate": 8.701963964757245e-05, "loss": 1.4334, "step": 62650 }, { "epoch": 2.4016867918776565, "grad_norm": 0.26069727540016174, "learning_rate": 8.700850915238195e-05, "loss": 1.4388, "step": 62660 }, { "epoch": 2.402228733043169, "grad_norm": 0.33938780426979065, "learning_rate": 8.699737469211629e-05, "loss": 1.4331, "step": 62670 }, { "epoch": 2.402770674208681, "grad_norm": 0.23580268025398254, "learning_rate": 8.698623626815478e-05, "loss": 1.4318, "step": 62680 }, { "epoch": 2.4033126153741935, "grad_norm": 0.23255738615989685, "learning_rate": 8.697509388187721e-05, "loss": 1.4455, "step": 62690 }, { "epoch": 2.403854556539706, "grad_norm": 0.3351677656173706, "learning_rate": 8.696394753466381e-05, "loss": 1.4373, "step": 62700 }, { "epoch": 2.4043964977052177, "grad_norm": 0.18452972173690796, "learning_rate": 8.695279722789536e-05, "loss": 1.4417, "step": 62710 }, { "epoch": 2.40493843887073, "grad_norm": 0.21630007028579712, "learning_rate": 8.694164296295311e-05, "loss": 1.434, "step": 62720 }, { "epoch": 2.4053177976865885, "eval_loss": 2.435746908187866, "eval_runtime": 22.0344, "eval_samples_per_second": 226.918, "eval_steps_per_second": 1.225, "step": 62727 }, { "epoch": 2.4054803800362423, "grad_norm": 0.3593086004257202, "learning_rate": 8.693048474121883e-05, "loss": 1.4383, "step": 62730 }, { "epoch": 2.4060223212017546, "grad_norm": 0.4216225743293762, "learning_rate": 8.69193225640747e-05, "loss": 1.4458, "step": 62740 }, { "epoch": 2.406564262367267, "grad_norm": 0.2987191379070282, "learning_rate": 8.690815643290348e-05, "loss": 1.4488, "step": 62750 }, { "epoch": 2.407106203532779, "grad_norm": 0.2709275186061859, "learning_rate": 8.68969863490884e-05, "loss": 1.4483, "step": 62760 }, { "epoch": 2.407648144698291, "grad_norm": 0.17814864218235016, "learning_rate": 8.68858123140131e-05, "loss": 1.4451, "step": 62770 }, { "epoch": 2.4081900858638035, "grad_norm": 0.24348965287208557, "learning_rate": 8.687463432906182e-05, "loss": 1.4463, "step": 62780 }, { "epoch": 2.4087320270293158, "grad_norm": 0.21512000262737274, "learning_rate": 8.686345239561921e-05, "loss": 1.4508, "step": 62790 }, { "epoch": 2.409273968194828, "grad_norm": 0.3042374551296234, "learning_rate": 8.685226651507047e-05, "loss": 1.4606, "step": 62800 }, { "epoch": 2.40981590936034, "grad_norm": 0.22630225121974945, "learning_rate": 8.684107668880124e-05, "loss": 1.4499, "step": 62810 }, { "epoch": 2.410032685826545, "eval_loss": 2.45141339302063, "eval_runtime": 21.9807, "eval_samples_per_second": 227.472, "eval_steps_per_second": 1.228, "step": 62814 }, { "epoch": 2.4103578505258523, "grad_norm": 0.22491927444934845, "learning_rate": 8.682988291819766e-05, "loss": 1.4527, "step": 62820 }, { "epoch": 2.4108997916913646, "grad_norm": 0.18473847210407257, "learning_rate": 8.68186852046464e-05, "loss": 1.4372, "step": 62830 }, { "epoch": 2.411441732856877, "grad_norm": 0.18476863205432892, "learning_rate": 8.680748354953454e-05, "loss": 1.4481, "step": 62840 }, { "epoch": 2.411983674022389, "grad_norm": 0.20057182013988495, "learning_rate": 8.679627795424973e-05, "loss": 1.442, "step": 62850 }, { "epoch": 2.412525615187901, "grad_norm": 0.2545701563358307, "learning_rate": 8.678506842018002e-05, "loss": 1.4425, "step": 62860 }, { "epoch": 2.4130675563534134, "grad_norm": 0.18435277044773102, "learning_rate": 8.677385494871406e-05, "loss": 1.4428, "step": 62870 }, { "epoch": 2.4136094975189257, "grad_norm": 0.3235138952732086, "learning_rate": 8.676263754124089e-05, "loss": 1.4548, "step": 62880 }, { "epoch": 2.4141514386844376, "grad_norm": 0.36199185252189636, "learning_rate": 8.675141619915008e-05, "loss": 1.4421, "step": 62890 }, { "epoch": 2.41469337984995, "grad_norm": 0.19079086184501648, "learning_rate": 8.674019092383168e-05, "loss": 1.4568, "step": 62900 }, { "epoch": 2.414747573966501, "eval_loss": 2.44244122505188, "eval_runtime": 21.9822, "eval_samples_per_second": 227.456, "eval_steps_per_second": 1.228, "step": 62901 }, { "epoch": 2.4152353210154622, "grad_norm": 0.20537631213665009, "learning_rate": 8.672896171667623e-05, "loss": 1.454, "step": 62910 }, { "epoch": 2.4157772621809745, "grad_norm": 0.1923794001340866, "learning_rate": 8.671772857907476e-05, "loss": 1.444, "step": 62920 }, { "epoch": 2.416319203346487, "grad_norm": 0.3296695649623871, "learning_rate": 8.670649151241876e-05, "loss": 1.4404, "step": 62930 }, { "epoch": 2.4168611445119987, "grad_norm": 0.4826582670211792, "learning_rate": 8.669525051810028e-05, "loss": 1.4398, "step": 62940 }, { "epoch": 2.417403085677511, "grad_norm": 0.23408730328083038, "learning_rate": 8.668400559751175e-05, "loss": 1.4405, "step": 62950 }, { "epoch": 2.4179450268430234, "grad_norm": 0.461852103471756, "learning_rate": 8.667275675204617e-05, "loss": 1.4522, "step": 62960 }, { "epoch": 2.4184869680085357, "grad_norm": 0.22658464312553406, "learning_rate": 8.6661503983097e-05, "loss": 1.4552, "step": 62970 }, { "epoch": 2.419028909174048, "grad_norm": 0.1908838152885437, "learning_rate": 8.665024729205816e-05, "loss": 1.4352, "step": 62980 }, { "epoch": 2.4194624621064573, "eval_loss": 2.4345483779907227, "eval_runtime": 21.9764, "eval_samples_per_second": 227.517, "eval_steps_per_second": 1.229, "step": 62988 }, { "epoch": 2.41957085033956, "grad_norm": 0.24472688138484955, "learning_rate": 8.663898668032412e-05, "loss": 1.4473, "step": 62990 }, { "epoch": 2.420112791505072, "grad_norm": 0.2447074055671692, "learning_rate": 8.662772214928976e-05, "loss": 1.4536, "step": 63000 }, { "epoch": 3.0005419411655123, "grad_norm": 0.16394081711769104, "learning_rate": 8.661645370035048e-05, "loss": 1.4525, "step": 63010 }, { "epoch": 3.0010838823310246, "grad_norm": 0.2631014585494995, "learning_rate": 8.660518133490221e-05, "loss": 1.4492, "step": 63020 }, { "epoch": 3.0016258234965365, "grad_norm": 0.1863064467906952, "learning_rate": 8.659390505434127e-05, "loss": 1.4496, "step": 63030 }, { "epoch": 3.002167764662049, "grad_norm": 0.1799326092004776, "learning_rate": 8.658262486006455e-05, "loss": 1.4565, "step": 63040 }, { "epoch": 3.002709705827561, "grad_norm": 0.18670876324176788, "learning_rate": 8.657134075346938e-05, "loss": 1.4478, "step": 63050 }, { "epoch": 3.0032516469930735, "grad_norm": 0.23742614686489105, "learning_rate": 8.65600527359536e-05, "loss": 1.4542, "step": 63060 }, { "epoch": 3.0037935881585853, "grad_norm": 0.21548855304718018, "learning_rate": 8.654876080891547e-05, "loss": 1.4522, "step": 63070 }, { "epoch": 3.0040645587413417, "eval_loss": 2.447054624557495, "eval_runtime": 22.3909, "eval_samples_per_second": 223.305, "eval_steps_per_second": 1.206, "step": 63075 }, { "epoch": 3.0043355293240976, "grad_norm": 0.20155511796474457, "learning_rate": 8.653746497375385e-05, "loss": 1.4611, "step": 63080 }, { "epoch": 3.00487747048961, "grad_norm": 0.21091540157794952, "learning_rate": 8.652616523186797e-05, "loss": 1.444, "step": 63090 }, { "epoch": 3.0054194116551223, "grad_norm": 0.20185105502605438, "learning_rate": 8.651486158465764e-05, "loss": 1.453, "step": 63100 }, { "epoch": 3.0059613528206346, "grad_norm": 0.21705321967601776, "learning_rate": 8.650355403352307e-05, "loss": 1.4463, "step": 63110 }, { "epoch": 3.0065032939861465, "grad_norm": 0.22515122592449188, "learning_rate": 8.649224257986499e-05, "loss": 1.4393, "step": 63120 }, { "epoch": 3.007045235151659, "grad_norm": 0.2835371494293213, "learning_rate": 8.648092722508463e-05, "loss": 1.449, "step": 63130 }, { "epoch": 3.007587176317171, "grad_norm": 0.36949828267097473, "learning_rate": 8.64696079705837e-05, "loss": 1.4525, "step": 63140 }, { "epoch": 3.0081291174826834, "grad_norm": 0.23466630280017853, "learning_rate": 8.645828481776434e-05, "loss": 1.4465, "step": 63150 }, { "epoch": 3.0086710586481953, "grad_norm": 0.2540377676486969, "learning_rate": 8.644695776802925e-05, "loss": 1.454, "step": 63160 }, { "epoch": 3.008779446881298, "eval_loss": 2.4413795471191406, "eval_runtime": 21.9823, "eval_samples_per_second": 227.456, "eval_steps_per_second": 1.228, "step": 63162 }, { "epoch": 3.0092129998137076, "grad_norm": 0.3684237599372864, "learning_rate": 8.643562682278154e-05, "loss": 1.4465, "step": 63170 }, { "epoch": 3.00975494097922, "grad_norm": 0.1901058703660965, "learning_rate": 8.642429198342488e-05, "loss": 1.4547, "step": 63180 }, { "epoch": 3.0102968821447322, "grad_norm": 0.18713931739330292, "learning_rate": 8.641295325136336e-05, "loss": 1.441, "step": 63190 }, { "epoch": 3.0108388233102445, "grad_norm": 0.18807746469974518, "learning_rate": 8.640161062800155e-05, "loss": 1.4493, "step": 63200 }, { "epoch": 3.0113807644757564, "grad_norm": 0.19221310317516327, "learning_rate": 8.639026411474457e-05, "loss": 1.44, "step": 63210 }, { "epoch": 3.0119227056412687, "grad_norm": 0.2589902877807617, "learning_rate": 8.637891371299796e-05, "loss": 1.4451, "step": 63220 }, { "epoch": 3.012464646806781, "grad_norm": 0.22735221683979034, "learning_rate": 8.636755942416774e-05, "loss": 1.4497, "step": 63230 }, { "epoch": 3.0130065879722934, "grad_norm": 0.17304742336273193, "learning_rate": 8.635620124966043e-05, "loss": 1.4586, "step": 63240 }, { "epoch": 3.0134943350212544, "eval_loss": 2.4555513858795166, "eval_runtime": 21.9864, "eval_samples_per_second": 227.414, "eval_steps_per_second": 1.228, "step": 63249 }, { "epoch": 3.0135485291378057, "grad_norm": 0.19638432562351227, "learning_rate": 8.634483919088306e-05, "loss": 1.4583, "step": 63250 }, { "epoch": 3.0140904703033176, "grad_norm": 0.3353283703327179, "learning_rate": 8.633347324924309e-05, "loss": 1.4459, "step": 63260 }, { "epoch": 3.01463241146883, "grad_norm": 0.23562084138393402, "learning_rate": 8.63221034261485e-05, "loss": 1.449, "step": 63270 }, { "epoch": 3.015174352634342, "grad_norm": 0.23831287026405334, "learning_rate": 8.63107297230077e-05, "loss": 1.4461, "step": 63280 }, { "epoch": 3.0157162937998545, "grad_norm": 0.2882630228996277, "learning_rate": 8.629935214122968e-05, "loss": 1.453, "step": 63290 }, { "epoch": 3.0162582349653664, "grad_norm": 0.26988014578819275, "learning_rate": 8.628797068222378e-05, "loss": 1.451, "step": 63300 }, { "epoch": 3.0168001761308787, "grad_norm": 0.2622480094432831, "learning_rate": 8.627658534739992e-05, "loss": 1.4587, "step": 63310 }, { "epoch": 3.017342117296391, "grad_norm": 0.2087298184633255, "learning_rate": 8.626519613816844e-05, "loss": 1.4502, "step": 63320 }, { "epoch": 3.0178840584619033, "grad_norm": 0.20758210122585297, "learning_rate": 8.62538030559402e-05, "loss": 1.4498, "step": 63330 }, { "epoch": 3.0182092231612105, "eval_loss": 2.4477882385253906, "eval_runtime": 21.9833, "eval_samples_per_second": 227.445, "eval_steps_per_second": 1.228, "step": 63336 }, { "epoch": 3.0184259996274156, "grad_norm": 0.1754673719406128, "learning_rate": 8.624240610212656e-05, "loss": 1.4539, "step": 63340 }, { "epoch": 3.0189679407929275, "grad_norm": 0.15522533655166626, "learning_rate": 8.623100527813928e-05, "loss": 1.4482, "step": 63350 }, { "epoch": 3.01950988195844, "grad_norm": 0.22570842504501343, "learning_rate": 8.621960058539062e-05, "loss": 1.4564, "step": 63360 }, { "epoch": 3.020051823123952, "grad_norm": 0.20296502113342285, "learning_rate": 8.620819202529342e-05, "loss": 1.4586, "step": 63370 }, { "epoch": 3.0205937642894645, "grad_norm": 0.2882785201072693, "learning_rate": 8.619677959926089e-05, "loss": 1.4478, "step": 63380 }, { "epoch": 3.0211357054549763, "grad_norm": 0.3313170075416565, "learning_rate": 8.618536330870673e-05, "loss": 1.4577, "step": 63390 }, { "epoch": 3.0216776466204887, "grad_norm": 0.19241797924041748, "learning_rate": 8.617394315504516e-05, "loss": 1.4574, "step": 63400 }, { "epoch": 3.022219587786001, "grad_norm": 0.32396382093429565, "learning_rate": 8.616251913969085e-05, "loss": 1.4519, "step": 63410 }, { "epoch": 3.0227615289515133, "grad_norm": 0.1714586317539215, "learning_rate": 8.615109126405897e-05, "loss": 1.4567, "step": 63420 }, { "epoch": 3.0229241113011667, "eval_loss": 2.4477341175079346, "eval_runtime": 21.9816, "eval_samples_per_second": 227.463, "eval_steps_per_second": 1.228, "step": 63423 }, { "epoch": 3.0233034701170256, "grad_norm": 0.1809176504611969, "learning_rate": 8.613965952956515e-05, "loss": 1.446, "step": 63430 }, { "epoch": 3.0238454112825375, "grad_norm": 0.20341655611991882, "learning_rate": 8.61282239376255e-05, "loss": 1.4551, "step": 63440 }, { "epoch": 3.02438735244805, "grad_norm": 0.1895596832036972, "learning_rate": 8.611678448965661e-05, "loss": 1.4487, "step": 63450 }, { "epoch": 3.024929293613562, "grad_norm": 0.2253957986831665, "learning_rate": 8.610534118707556e-05, "loss": 1.4559, "step": 63460 }, { "epoch": 3.0254712347790744, "grad_norm": 0.2183631956577301, "learning_rate": 8.609389403129988e-05, "loss": 1.4415, "step": 63470 }, { "epoch": 3.0260131759445863, "grad_norm": 0.3052237629890442, "learning_rate": 8.608244302374762e-05, "loss": 1.4455, "step": 63480 }, { "epoch": 3.0265551171100986, "grad_norm": 0.28673261404037476, "learning_rate": 8.607098816583725e-05, "loss": 1.4617, "step": 63490 }, { "epoch": 3.027097058275611, "grad_norm": 0.20791488885879517, "learning_rate": 8.605952945898777e-05, "loss": 1.4532, "step": 63500 }, { "epoch": 3.0276389994411232, "grad_norm": 0.34028980135917664, "learning_rate": 8.604806690461863e-05, "loss": 1.4535, "step": 63510 }, { "epoch": 3.0276389994411232, "eval_loss": 2.4497005939483643, "eval_runtime": 21.9833, "eval_samples_per_second": 227.446, "eval_steps_per_second": 1.228, "step": 63510 }, { "epoch": 3.0281809406066356, "grad_norm": 0.22710201144218445, "learning_rate": 8.603660050414974e-05, "loss": 1.4425, "step": 63520 }, { "epoch": 3.0287228817721474, "grad_norm": 0.17469532787799835, "learning_rate": 8.602513025900155e-05, "loss": 1.4445, "step": 63530 }, { "epoch": 3.0292648229376598, "grad_norm": 0.21239697933197021, "learning_rate": 8.601365617059491e-05, "loss": 1.4371, "step": 63540 }, { "epoch": 3.029806764103172, "grad_norm": 0.1866355538368225, "learning_rate": 8.60021782403512e-05, "loss": 1.4426, "step": 63550 }, { "epoch": 3.0303487052686844, "grad_norm": 0.175773486495018, "learning_rate": 8.599069646969223e-05, "loss": 1.4477, "step": 63560 }, { "epoch": 3.0308906464341967, "grad_norm": 0.2518947124481201, "learning_rate": 8.597921086004035e-05, "loss": 1.4465, "step": 63570 }, { "epoch": 3.0314325875997086, "grad_norm": 0.3670835494995117, "learning_rate": 8.596772141281833e-05, "loss": 1.4487, "step": 63580 }, { "epoch": 3.031974528765221, "grad_norm": 0.3510816991329193, "learning_rate": 8.59562281294494e-05, "loss": 1.4518, "step": 63590 }, { "epoch": 3.0323538875810794, "eval_loss": 2.4546306133270264, "eval_runtime": 22.1411, "eval_samples_per_second": 225.824, "eval_steps_per_second": 1.219, "step": 63597 }, { "epoch": 3.032516469930733, "grad_norm": 0.3301527500152588, "learning_rate": 8.594473101135734e-05, "loss": 1.4655, "step": 63600 }, { "epoch": 3.0330584110962455, "grad_norm": 0.2271365225315094, "learning_rate": 8.593323005996638e-05, "loss": 1.4479, "step": 63610 }, { "epoch": 3.0336003522617574, "grad_norm": 0.3536682724952698, "learning_rate": 8.592172527670114e-05, "loss": 1.4523, "step": 63620 }, { "epoch": 3.0341422934272697, "grad_norm": 0.4474346935749054, "learning_rate": 8.591021666298684e-05, "loss": 1.4426, "step": 63630 }, { "epoch": 3.034684234592782, "grad_norm": 0.41748547554016113, "learning_rate": 8.589870422024909e-05, "loss": 1.4496, "step": 63640 }, { "epoch": 3.0352261757582943, "grad_norm": 0.39396533370018005, "learning_rate": 8.5887187949914e-05, "loss": 1.4458, "step": 63650 }, { "epoch": 3.0357681169238067, "grad_norm": 0.30107173323631287, "learning_rate": 8.58756678534082e-05, "loss": 1.4475, "step": 63660 }, { "epoch": 3.0363100580893185, "grad_norm": 0.21877449750900269, "learning_rate": 8.586414393215869e-05, "loss": 1.4491, "step": 63670 }, { "epoch": 3.036851999254831, "grad_norm": 0.3609618842601776, "learning_rate": 8.585261618759303e-05, "loss": 1.4532, "step": 63680 }, { "epoch": 3.037068775721036, "eval_loss": 2.4435439109802246, "eval_runtime": 21.984, "eval_samples_per_second": 227.438, "eval_steps_per_second": 1.228, "step": 63684 }, { "epoch": 3.037393940420343, "grad_norm": 0.21839331090450287, "learning_rate": 8.584108462113922e-05, "loss": 1.4429, "step": 63690 }, { "epoch": 3.0379358815858555, "grad_norm": 0.24262754619121552, "learning_rate": 8.582954923422578e-05, "loss": 1.4498, "step": 63700 }, { "epoch": 3.0384778227513674, "grad_norm": 0.18923676013946533, "learning_rate": 8.581801002828159e-05, "loss": 1.4449, "step": 63710 }, { "epoch": 3.0390197639168797, "grad_norm": 0.2811407446861267, "learning_rate": 8.580646700473614e-05, "loss": 1.4465, "step": 63720 }, { "epoch": 3.039561705082392, "grad_norm": 0.2581160366535187, "learning_rate": 8.579492016501929e-05, "loss": 1.4538, "step": 63730 }, { "epoch": 3.0401036462479043, "grad_norm": 0.4067544639110565, "learning_rate": 8.578336951056145e-05, "loss": 1.4401, "step": 63740 }, { "epoch": 3.0406455874134166, "grad_norm": 0.2783118784427643, "learning_rate": 8.577181504279342e-05, "loss": 1.4468, "step": 63750 }, { "epoch": 3.0411875285789285, "grad_norm": 0.2964227795600891, "learning_rate": 8.576025676314654e-05, "loss": 1.4491, "step": 63760 }, { "epoch": 3.041729469744441, "grad_norm": 0.2709335684776306, "learning_rate": 8.57486946730526e-05, "loss": 1.4501, "step": 63770 }, { "epoch": 3.041783663860992, "eval_loss": 2.4320621490478516, "eval_runtime": 21.9837, "eval_samples_per_second": 227.441, "eval_steps_per_second": 1.228, "step": 63771 }, { "epoch": 3.042271410909953, "grad_norm": 0.31969088315963745, "learning_rate": 8.573712877394387e-05, "loss": 1.4542, "step": 63780 }, { "epoch": 3.0428133520754654, "grad_norm": 0.43009239435195923, "learning_rate": 8.572555906725309e-05, "loss": 1.4478, "step": 63790 }, { "epoch": 3.0433552932409773, "grad_norm": 0.24113118648529053, "learning_rate": 8.571398555441344e-05, "loss": 1.4368, "step": 63800 }, { "epoch": 3.0438972344064896, "grad_norm": 0.30968958139419556, "learning_rate": 8.570240823685858e-05, "loss": 1.451, "step": 63810 }, { "epoch": 3.044439175572002, "grad_norm": 0.2378184050321579, "learning_rate": 8.569082711602271e-05, "loss": 1.443, "step": 63820 }, { "epoch": 3.0449811167375143, "grad_norm": 0.19775545597076416, "learning_rate": 8.567924219334042e-05, "loss": 1.452, "step": 63830 }, { "epoch": 3.0455230579030266, "grad_norm": 0.17699836194515228, "learning_rate": 8.566765347024679e-05, "loss": 1.452, "step": 63840 }, { "epoch": 3.0460649990685384, "grad_norm": 0.1834602653980255, "learning_rate": 8.565606094817741e-05, "loss": 1.4525, "step": 63850 }, { "epoch": 3.046498552000948, "eval_loss": 2.435060977935791, "eval_runtime": 21.9853, "eval_samples_per_second": 227.424, "eval_steps_per_second": 1.228, "step": 63858 }, { "epoch": 3.0466069402340508, "grad_norm": 0.2576271891593933, "learning_rate": 8.56444646285683e-05, "loss": 1.4452, "step": 63860 }, { "epoch": 3.047148881399563, "grad_norm": 0.20896773040294647, "learning_rate": 8.563286451285595e-05, "loss": 1.4548, "step": 63870 }, { "epoch": 3.0476908225650754, "grad_norm": 0.2274290919303894, "learning_rate": 8.562126060247733e-05, "loss": 1.4541, "step": 63880 }, { "epoch": 3.0482327637305877, "grad_norm": 0.2640198767185211, "learning_rate": 8.560965289886987e-05, "loss": 1.4511, "step": 63890 }, { "epoch": 3.0487747048960996, "grad_norm": 0.20334435999393463, "learning_rate": 8.559804140347156e-05, "loss": 1.4497, "step": 63900 }, { "epoch": 3.049316646061612, "grad_norm": 0.28822678327560425, "learning_rate": 8.558642611772069e-05, "loss": 1.4519, "step": 63910 }, { "epoch": 3.049858587227124, "grad_norm": 0.30483877658843994, "learning_rate": 8.557480704305614e-05, "loss": 1.4519, "step": 63920 }, { "epoch": 3.0504005283926365, "grad_norm": 0.29864802956581116, "learning_rate": 8.556318418091724e-05, "loss": 1.449, "step": 63930 }, { "epoch": 3.0509424695581484, "grad_norm": 0.43397045135498047, "learning_rate": 8.555155753274379e-05, "loss": 1.4485, "step": 63940 }, { "epoch": 3.051213440140905, "eval_loss": 2.4513673782348633, "eval_runtime": 21.9827, "eval_samples_per_second": 227.451, "eval_steps_per_second": 1.228, "step": 63945 }, { "epoch": 3.0514844107236607, "grad_norm": 0.3487168550491333, "learning_rate": 8.553992709997602e-05, "loss": 1.4545, "step": 63950 }, { "epoch": 3.052026351889173, "grad_norm": 0.18993450701236725, "learning_rate": 8.552829288405467e-05, "loss": 1.4491, "step": 63960 }, { "epoch": 3.0525682930546854, "grad_norm": 0.22048941254615784, "learning_rate": 8.551665488642096e-05, "loss": 1.442, "step": 63970 }, { "epoch": 3.0531102342201977, "grad_norm": 0.2803003191947937, "learning_rate": 8.55050131085165e-05, "loss": 1.4496, "step": 63980 }, { "epoch": 3.0536521753857095, "grad_norm": 0.43568509817123413, "learning_rate": 8.549336755178347e-05, "loss": 1.4516, "step": 63990 }, { "epoch": 3.054194116551222, "grad_norm": 0.24292457103729248, "learning_rate": 8.548171821766448e-05, "loss": 1.4408, "step": 64000 }, { "epoch": 3.054736057716734, "grad_norm": 0.1853036731481552, "learning_rate": 8.547006510760254e-05, "loss": 1.4507, "step": 64010 }, { "epoch": 3.0552779988822465, "grad_norm": 0.2446528673171997, "learning_rate": 8.545840822304125e-05, "loss": 1.4424, "step": 64020 }, { "epoch": 3.0558199400477584, "grad_norm": 0.17944113910198212, "learning_rate": 8.544674756542457e-05, "loss": 1.455, "step": 64030 }, { "epoch": 3.055928328280861, "eval_loss": 2.433335304260254, "eval_runtime": 21.9868, "eval_samples_per_second": 227.409, "eval_steps_per_second": 1.228, "step": 64032 }, { "epoch": 3.0563618812132707, "grad_norm": 0.3536889851093292, "learning_rate": 8.5435083136197e-05, "loss": 1.4485, "step": 64040 }, { "epoch": 3.056903822378783, "grad_norm": 0.1827295571565628, "learning_rate": 8.542341493680345e-05, "loss": 1.4464, "step": 64050 }, { "epoch": 3.0574457635442953, "grad_norm": 0.1838574856519699, "learning_rate": 8.541174296868935e-05, "loss": 1.4464, "step": 64060 }, { "epoch": 3.0579877047098076, "grad_norm": 0.2586890757083893, "learning_rate": 8.540006723330057e-05, "loss": 1.4497, "step": 64070 }, { "epoch": 3.0585296458753195, "grad_norm": 0.3454444408416748, "learning_rate": 8.538838773208344e-05, "loss": 1.4462, "step": 64080 }, { "epoch": 3.059071587040832, "grad_norm": 0.27531322836875916, "learning_rate": 8.537670446648477e-05, "loss": 1.4482, "step": 64090 }, { "epoch": 3.059613528206344, "grad_norm": 0.2372499257326126, "learning_rate": 8.536501743795183e-05, "loss": 1.4447, "step": 64100 }, { "epoch": 3.0601554693718565, "grad_norm": 0.20991750061511993, "learning_rate": 8.535332664793237e-05, "loss": 1.4462, "step": 64110 }, { "epoch": 3.0606432164208175, "eval_loss": 2.4356119632720947, "eval_runtime": 21.9868, "eval_samples_per_second": 227.409, "eval_steps_per_second": 1.228, "step": 64119 }, { "epoch": 3.0606974105373683, "grad_norm": 0.23890291154384613, "learning_rate": 8.534163209787459e-05, "loss": 1.4582, "step": 64120 }, { "epoch": 3.0612393517028806, "grad_norm": 0.25897133350372314, "learning_rate": 8.532993378922716e-05, "loss": 1.4452, "step": 64130 }, { "epoch": 3.061781292868393, "grad_norm": 0.2815474569797516, "learning_rate": 8.53182317234392e-05, "loss": 1.4509, "step": 64140 }, { "epoch": 3.0623232340339053, "grad_norm": 0.22710613906383514, "learning_rate": 8.530652590196033e-05, "loss": 1.4503, "step": 64150 }, { "epoch": 3.0628651751994176, "grad_norm": 0.26313063502311707, "learning_rate": 8.529481632624059e-05, "loss": 1.4433, "step": 64160 }, { "epoch": 3.0634071163649295, "grad_norm": 0.17818008363246918, "learning_rate": 8.528310299773055e-05, "loss": 1.4435, "step": 64170 }, { "epoch": 3.0639490575304418, "grad_norm": 0.1890927255153656, "learning_rate": 8.527138591788118e-05, "loss": 1.4366, "step": 64180 }, { "epoch": 3.064490998695954, "grad_norm": 0.27579453587532043, "learning_rate": 8.525966508814396e-05, "loss": 1.4499, "step": 64190 }, { "epoch": 3.0650329398614664, "grad_norm": 0.17526105046272278, "learning_rate": 8.524794050997079e-05, "loss": 1.4415, "step": 64200 }, { "epoch": 3.0653581045607736, "eval_loss": 2.440138101577759, "eval_runtime": 21.9841, "eval_samples_per_second": 227.437, "eval_steps_per_second": 1.228, "step": 64206 }, { "epoch": 3.0655748810269783, "grad_norm": 0.23076623678207397, "learning_rate": 8.523621218481407e-05, "loss": 1.4393, "step": 64210 }, { "epoch": 3.0661168221924906, "grad_norm": 0.29393690824508667, "learning_rate": 8.522448011412665e-05, "loss": 1.4491, "step": 64220 }, { "epoch": 3.066658763358003, "grad_norm": 0.16649694740772247, "learning_rate": 8.521274429936187e-05, "loss": 1.4364, "step": 64230 }, { "epoch": 3.0672007045235152, "grad_norm": 0.23092569410800934, "learning_rate": 8.520100474197348e-05, "loss": 1.444, "step": 64240 }, { "epoch": 3.0677426456890275, "grad_norm": 0.24326132237911224, "learning_rate": 8.518926144341577e-05, "loss": 1.4569, "step": 64250 }, { "epoch": 3.0682845868545394, "grad_norm": 0.27092161774635315, "learning_rate": 8.51775144051434e-05, "loss": 1.443, "step": 64260 }, { "epoch": 3.0688265280200517, "grad_norm": 0.3469485640525818, "learning_rate": 8.516576362861159e-05, "loss": 1.4355, "step": 64270 }, { "epoch": 3.069368469185564, "grad_norm": 0.22915403544902802, "learning_rate": 8.515400911527592e-05, "loss": 1.4623, "step": 64280 }, { "epoch": 3.0699104103510764, "grad_norm": 0.2217090129852295, "learning_rate": 8.514225086659253e-05, "loss": 1.4463, "step": 64290 }, { "epoch": 3.0700729927007298, "eval_loss": 2.4342753887176514, "eval_runtime": 21.983, "eval_samples_per_second": 227.448, "eval_steps_per_second": 1.228, "step": 64293 }, { "epoch": 3.0704523515165887, "grad_norm": 0.3960467278957367, "learning_rate": 8.513048888401795e-05, "loss": 1.4428, "step": 64300 }, { "epoch": 3.0709942926821006, "grad_norm": 0.20913895964622498, "learning_rate": 8.511872316900925e-05, "loss": 1.4455, "step": 64310 }, { "epoch": 3.071536233847613, "grad_norm": 0.3125520944595337, "learning_rate": 8.510695372302385e-05, "loss": 1.4572, "step": 64320 }, { "epoch": 3.072078175013125, "grad_norm": 0.3635540306568146, "learning_rate": 8.509518054751976e-05, "loss": 1.452, "step": 64330 }, { "epoch": 3.0726201161786375, "grad_norm": 0.2593584358692169, "learning_rate": 8.508340364395536e-05, "loss": 1.4375, "step": 64340 }, { "epoch": 3.0731620573441494, "grad_norm": 0.22683855891227722, "learning_rate": 8.507162301378952e-05, "loss": 1.4461, "step": 64350 }, { "epoch": 3.0737039985096617, "grad_norm": 0.18193486332893372, "learning_rate": 8.505983865848158e-05, "loss": 1.4501, "step": 64360 }, { "epoch": 3.074245939675174, "grad_norm": 0.35355544090270996, "learning_rate": 8.504805057949132e-05, "loss": 1.444, "step": 64370 }, { "epoch": 3.0747878808406863, "grad_norm": 0.16228243708610535, "learning_rate": 8.503625877827904e-05, "loss": 1.4489, "step": 64380 }, { "epoch": 3.0747878808406863, "eval_loss": 2.4377474784851074, "eval_runtime": 21.9803, "eval_samples_per_second": 227.477, "eval_steps_per_second": 1.228, "step": 64380 }, { "epoch": 3.0753298220061986, "grad_norm": 0.26819851994514465, "learning_rate": 8.50244632563054e-05, "loss": 1.4487, "step": 64390 }, { "epoch": 3.0758717631717105, "grad_norm": 0.21382486820220947, "learning_rate": 8.501266401503164e-05, "loss": 1.4461, "step": 64400 }, { "epoch": 3.076413704337223, "grad_norm": 0.20011401176452637, "learning_rate": 8.500086105591935e-05, "loss": 1.4511, "step": 64410 }, { "epoch": 3.076955645502735, "grad_norm": 0.2521159052848816, "learning_rate": 8.498905438043061e-05, "loss": 1.4421, "step": 64420 }, { "epoch": 3.0774975866682475, "grad_norm": 0.27458107471466064, "learning_rate": 8.497724399002805e-05, "loss": 1.4556, "step": 64430 }, { "epoch": 3.0780395278337593, "grad_norm": 0.17625468969345093, "learning_rate": 8.496542988617463e-05, "loss": 1.4562, "step": 64440 }, { "epoch": 3.0785814689992717, "grad_norm": 0.23053233325481415, "learning_rate": 8.495361207033387e-05, "loss": 1.4438, "step": 64450 }, { "epoch": 3.079123410164784, "grad_norm": 0.20668716728687286, "learning_rate": 8.494179054396968e-05, "loss": 1.4421, "step": 64460 }, { "epoch": 3.0795027689806425, "eval_loss": 2.4462552070617676, "eval_runtime": 208.061, "eval_samples_per_second": 24.031, "eval_steps_per_second": 0.13, "step": 64467 }, { "epoch": 3.0796653513302963, "grad_norm": 0.2157292515039444, "learning_rate": 8.492996530854646e-05, "loss": 1.441, "step": 64470 }, { "epoch": 3.0802072924958086, "grad_norm": 0.1837170124053955, "learning_rate": 8.491813636552911e-05, "loss": 1.4421, "step": 64480 }, { "epoch": 3.0807492336613205, "grad_norm": 0.2104165405035019, "learning_rate": 8.490630371638291e-05, "loss": 1.4516, "step": 64490 }, { "epoch": 3.081291174826833, "grad_norm": 0.24644435942173004, "learning_rate": 8.489446736257365e-05, "loss": 1.4451, "step": 64500 }, { "epoch": 3.081833115992345, "grad_norm": 0.18532828986644745, "learning_rate": 8.488262730556754e-05, "loss": 1.4551, "step": 64510 }, { "epoch": 3.0823750571578574, "grad_norm": 0.20310962200164795, "learning_rate": 8.487078354683132e-05, "loss": 1.4506, "step": 64520 }, { "epoch": 3.0829169983233697, "grad_norm": 0.22750219702720642, "learning_rate": 8.48589360878321e-05, "loss": 1.4364, "step": 64530 }, { "epoch": 3.0834589394888816, "grad_norm": 0.23471009731292725, "learning_rate": 8.484708493003753e-05, "loss": 1.455, "step": 64540 }, { "epoch": 3.084000880654394, "grad_norm": 0.17168253660202026, "learning_rate": 8.483523007491565e-05, "loss": 1.4408, "step": 64550 }, { "epoch": 3.084217657120599, "eval_loss": 2.4336390495300293, "eval_runtime": 21.9861, "eval_samples_per_second": 227.416, "eval_steps_per_second": 1.228, "step": 64554 }, { "epoch": 3.0845428218199062, "grad_norm": 0.21957682073116302, "learning_rate": 8.4823371523935e-05, "loss": 1.4431, "step": 64560 }, { "epoch": 3.0850847629854186, "grad_norm": 0.25939157605171204, "learning_rate": 8.481150927856458e-05, "loss": 1.4492, "step": 64570 }, { "epoch": 3.0856267041509304, "grad_norm": 0.22554215788841248, "learning_rate": 8.479964334027381e-05, "loss": 1.4551, "step": 64580 }, { "epoch": 3.0861686453164427, "grad_norm": 0.20122438669204712, "learning_rate": 8.478777371053259e-05, "loss": 1.4445, "step": 64590 }, { "epoch": 3.086710586481955, "grad_norm": 0.18250145018100739, "learning_rate": 8.477590039081131e-05, "loss": 1.4475, "step": 64600 }, { "epoch": 3.0872525276474674, "grad_norm": 0.18288695812225342, "learning_rate": 8.476402338258073e-05, "loss": 1.4484, "step": 64610 }, { "epoch": 3.0877944688129797, "grad_norm": 0.19832782447338104, "learning_rate": 8.475214268731219e-05, "loss": 1.4513, "step": 64620 }, { "epoch": 3.0883364099784916, "grad_norm": 0.38973701000213623, "learning_rate": 8.474025830647737e-05, "loss": 1.442, "step": 64630 }, { "epoch": 3.088878351144004, "grad_norm": 0.3162194490432739, "learning_rate": 8.472837024154847e-05, "loss": 1.4464, "step": 64640 }, { "epoch": 3.088932545260555, "eval_loss": 2.423926830291748, "eval_runtime": 21.9851, "eval_samples_per_second": 227.426, "eval_steps_per_second": 1.228, "step": 64641 }, { "epoch": 3.089420292309516, "grad_norm": 0.1702577918767929, "learning_rate": 8.471647849399815e-05, "loss": 1.4542, "step": 64650 }, { "epoch": 3.0899622334750285, "grad_norm": 0.315075546503067, "learning_rate": 8.470458306529946e-05, "loss": 1.4416, "step": 64660 }, { "epoch": 3.0905041746405404, "grad_norm": 0.25835493206977844, "learning_rate": 8.4692683956926e-05, "loss": 1.436, "step": 64670 }, { "epoch": 3.0910461158060527, "grad_norm": 0.6356673240661621, "learning_rate": 8.468078117035176e-05, "loss": 1.4472, "step": 64680 }, { "epoch": 3.091588056971565, "grad_norm": 0.3583969175815582, "learning_rate": 8.466887470705121e-05, "loss": 1.4487, "step": 64690 }, { "epoch": 3.0921299981370773, "grad_norm": 0.21938195824623108, "learning_rate": 8.465696456849928e-05, "loss": 1.4513, "step": 64700 }, { "epoch": 3.0926719393025897, "grad_norm": 0.26139944791793823, "learning_rate": 8.464505075617133e-05, "loss": 1.4505, "step": 64710 }, { "epoch": 3.0932138804681015, "grad_norm": 0.2597460150718689, "learning_rate": 8.46331332715432e-05, "loss": 1.4483, "step": 64720 }, { "epoch": 3.0936474334005113, "eval_loss": 2.4286162853240967, "eval_runtime": 21.6785, "eval_samples_per_second": 230.643, "eval_steps_per_second": 1.245, "step": 64728 }, { "epoch": 3.093755821633614, "grad_norm": 0.21105916798114777, "learning_rate": 8.462121211609117e-05, "loss": 1.4437, "step": 64730 }, { "epoch": 3.094297762799126, "grad_norm": 0.1735154688358307, "learning_rate": 8.4609287291292e-05, "loss": 1.4413, "step": 64740 }, { "epoch": 3.0948397039646385, "grad_norm": 0.1659323126077652, "learning_rate": 8.459735879862286e-05, "loss": 1.4488, "step": 64750 }, { "epoch": 3.0953816451301503, "grad_norm": 0.19731490314006805, "learning_rate": 8.45854266395614e-05, "loss": 1.4478, "step": 64760 }, { "epoch": 3.0959235862956627, "grad_norm": 0.19939690828323364, "learning_rate": 8.457349081558576e-05, "loss": 1.4378, "step": 64770 }, { "epoch": 3.096465527461175, "grad_norm": 0.24715344607830048, "learning_rate": 8.456155132817443e-05, "loss": 1.4477, "step": 64780 }, { "epoch": 3.0970074686266873, "grad_norm": 0.20696981251239777, "learning_rate": 8.45496081788065e-05, "loss": 1.4452, "step": 64790 }, { "epoch": 3.0975494097921996, "grad_norm": 0.19315387308597565, "learning_rate": 8.45376613689614e-05, "loss": 1.4469, "step": 64800 }, { "epoch": 3.0980913509577115, "grad_norm": 0.23861078917980194, "learning_rate": 8.452571090011905e-05, "loss": 1.4624, "step": 64810 }, { "epoch": 3.098362321540468, "eval_loss": 2.4268784523010254, "eval_runtime": 22.037, "eval_samples_per_second": 226.891, "eval_steps_per_second": 1.225, "step": 64815 }, { "epoch": 3.098633292123224, "grad_norm": 0.1983337700366974, "learning_rate": 8.45137567737598e-05, "loss": 1.4448, "step": 64820 }, { "epoch": 3.099175233288736, "grad_norm": 0.2253299057483673, "learning_rate": 8.450179899136451e-05, "loss": 1.4471, "step": 64830 }, { "epoch": 3.0997171744542484, "grad_norm": 0.2381051778793335, "learning_rate": 8.448983755441447e-05, "loss": 1.4461, "step": 64840 }, { "epoch": 3.1002591156197603, "grad_norm": 0.23918847739696503, "learning_rate": 8.447787246439135e-05, "loss": 1.4571, "step": 64850 }, { "epoch": 3.1008010567852726, "grad_norm": 0.17684470117092133, "learning_rate": 8.446590372277738e-05, "loss": 1.4455, "step": 64860 }, { "epoch": 3.101342997950785, "grad_norm": 0.1668468713760376, "learning_rate": 8.445393133105519e-05, "loss": 1.4437, "step": 64870 }, { "epoch": 3.1018849391162973, "grad_norm": 0.18969334661960602, "learning_rate": 8.444195529070785e-05, "loss": 1.4375, "step": 64880 }, { "epoch": 3.1024268802818096, "grad_norm": 0.1837807595729828, "learning_rate": 8.442997560321894e-05, "loss": 1.4407, "step": 64890 }, { "epoch": 3.1029688214473214, "grad_norm": 0.22503748536109924, "learning_rate": 8.44179922700724e-05, "loss": 1.4465, "step": 64900 }, { "epoch": 3.103077209680424, "eval_loss": 2.4285776615142822, "eval_runtime": 21.9808, "eval_samples_per_second": 227.472, "eval_steps_per_second": 1.228, "step": 64902 }, { "epoch": 3.1035107626128338, "grad_norm": 0.20925331115722656, "learning_rate": 8.44060052927527e-05, "loss": 1.4417, "step": 64910 }, { "epoch": 3.104052703778346, "grad_norm": 0.36361682415008545, "learning_rate": 8.439401467274474e-05, "loss": 1.4505, "step": 64920 }, { "epoch": 3.1045946449438584, "grad_norm": 0.2141452133655548, "learning_rate": 8.438202041153385e-05, "loss": 1.4502, "step": 64930 }, { "epoch": 3.1051365861093707, "grad_norm": 0.22404509782791138, "learning_rate": 8.437002251060585e-05, "loss": 1.4467, "step": 64940 }, { "epoch": 3.1056785272748826, "grad_norm": 0.26203832030296326, "learning_rate": 8.435802097144696e-05, "loss": 1.4514, "step": 64950 }, { "epoch": 3.106220468440395, "grad_norm": 0.265337198972702, "learning_rate": 8.434601579554389e-05, "loss": 1.4438, "step": 64960 }, { "epoch": 3.106762409605907, "grad_norm": 0.1931227445602417, "learning_rate": 8.433400698438381e-05, "loss": 1.4518, "step": 64970 }, { "epoch": 3.1073043507714195, "grad_norm": 0.2169187366962433, "learning_rate": 8.432199453945427e-05, "loss": 1.4367, "step": 64980 }, { "epoch": 3.1077920978203806, "eval_loss": 2.420952796936035, "eval_runtime": 21.9789, "eval_samples_per_second": 227.491, "eval_steps_per_second": 1.228, "step": 64989 }, { "epoch": 3.1078462919369314, "grad_norm": 0.2237025499343872, "learning_rate": 8.430997846224338e-05, "loss": 1.4437, "step": 64990 }, { "epoch": 3.1083882331024437, "grad_norm": 0.18218779563903809, "learning_rate": 8.42979587542396e-05, "loss": 1.442, "step": 65000 }, { "epoch": 3.108930174267956, "grad_norm": 0.2928388714790344, "learning_rate": 8.428593541693188e-05, "loss": 1.4455, "step": 65010 }, { "epoch": 3.1094721154334684, "grad_norm": 0.22536158561706543, "learning_rate": 8.427390845180963e-05, "loss": 1.4388, "step": 65020 }, { "epoch": 3.1100140565989807, "grad_norm": 0.3034621477127075, "learning_rate": 8.426187786036269e-05, "loss": 1.4458, "step": 65030 }, { "epoch": 3.1105559977644925, "grad_norm": 0.21954983472824097, "learning_rate": 8.424984364408138e-05, "loss": 1.4349, "step": 65040 }, { "epoch": 3.111097938930005, "grad_norm": 0.18350431323051453, "learning_rate": 8.423780580445642e-05, "loss": 1.4489, "step": 65050 }, { "epoch": 3.111639880095517, "grad_norm": 0.19277605414390564, "learning_rate": 8.4225764342979e-05, "loss": 1.4507, "step": 65060 }, { "epoch": 3.1121818212610295, "grad_norm": 0.25510692596435547, "learning_rate": 8.42137192611408e-05, "loss": 1.4467, "step": 65070 }, { "epoch": 3.1125069859603367, "eval_loss": 2.427020788192749, "eval_runtime": 21.9917, "eval_samples_per_second": 227.358, "eval_steps_per_second": 1.228, "step": 65076 }, { "epoch": 3.1127237624265414, "grad_norm": 0.2290118932723999, "learning_rate": 8.42016705604339e-05, "loss": 1.4456, "step": 65080 }, { "epoch": 3.1132657035920537, "grad_norm": 0.19887521862983704, "learning_rate": 8.41896182423508e-05, "loss": 1.4479, "step": 65090 }, { "epoch": 3.113807644757566, "grad_norm": 0.17075598239898682, "learning_rate": 8.417756230838455e-05, "loss": 1.443, "step": 65100 }, { "epoch": 3.1143495859230783, "grad_norm": 0.2500585913658142, "learning_rate": 8.416550276002853e-05, "loss": 1.44, "step": 65110 }, { "epoch": 3.1148915270885906, "grad_norm": 0.2130533754825592, "learning_rate": 8.415343959877668e-05, "loss": 1.4478, "step": 65120 }, { "epoch": 3.1154334682541025, "grad_norm": 0.3005780279636383, "learning_rate": 8.41413728261233e-05, "loss": 1.4438, "step": 65130 }, { "epoch": 3.115975409419615, "grad_norm": 0.23416976630687714, "learning_rate": 8.412930244356316e-05, "loss": 1.4435, "step": 65140 }, { "epoch": 3.116517350585127, "grad_norm": 0.26363202929496765, "learning_rate": 8.41172284525915e-05, "loss": 1.4469, "step": 65150 }, { "epoch": 3.1170592917506394, "grad_norm": 0.20009048283100128, "learning_rate": 8.4105150854704e-05, "loss": 1.4412, "step": 65160 }, { "epoch": 3.117221874100293, "eval_loss": 2.424807071685791, "eval_runtime": 21.9875, "eval_samples_per_second": 227.402, "eval_steps_per_second": 1.228, "step": 65163 }, { "epoch": 3.1176012329161518, "grad_norm": 0.2043033242225647, "learning_rate": 8.409306965139677e-05, "loss": 1.4525, "step": 65170 }, { "epoch": 3.1181431740816636, "grad_norm": 0.27456438541412354, "learning_rate": 8.408098484416639e-05, "loss": 1.4432, "step": 65180 }, { "epoch": 3.118685115247176, "grad_norm": 0.4368920624256134, "learning_rate": 8.406889643450984e-05, "loss": 1.4449, "step": 65190 }, { "epoch": 3.1192270564126883, "grad_norm": 0.3747996389865875, "learning_rate": 8.405680442392464e-05, "loss": 1.4441, "step": 65200 }, { "epoch": 3.1197689975782006, "grad_norm": 0.3094594478607178, "learning_rate": 8.404470881390863e-05, "loss": 1.4518, "step": 65210 }, { "epoch": 3.1203109387437125, "grad_norm": 0.30118393898010254, "learning_rate": 8.40326096059602e-05, "loss": 1.4454, "step": 65220 }, { "epoch": 3.1208528799092248, "grad_norm": 0.20605798065662384, "learning_rate": 8.402050680157816e-05, "loss": 1.4495, "step": 65230 }, { "epoch": 3.121394821074737, "grad_norm": 0.21210525929927826, "learning_rate": 8.400840040226172e-05, "loss": 1.4417, "step": 65240 }, { "epoch": 3.1219367622402494, "grad_norm": 0.17496547102928162, "learning_rate": 8.399629040951057e-05, "loss": 1.4494, "step": 65250 }, { "epoch": 3.1219367622402494, "eval_loss": 2.42055082321167, "eval_runtime": 21.5444, "eval_samples_per_second": 232.079, "eval_steps_per_second": 1.253, "step": 65250 }, { "epoch": 3.1224787034057617, "grad_norm": 0.26712679862976074, "learning_rate": 8.398417682482486e-05, "loss": 1.4512, "step": 65260 }, { "epoch": 3.1230206445712736, "grad_norm": 0.2958221733570099, "learning_rate": 8.397205964970515e-05, "loss": 1.4454, "step": 65270 }, { "epoch": 3.123562585736786, "grad_norm": 0.2679630219936371, "learning_rate": 8.39599388856525e-05, "loss": 1.4397, "step": 65280 }, { "epoch": 3.1241045269022982, "grad_norm": 0.16526180505752563, "learning_rate": 8.394781453416832e-05, "loss": 1.4498, "step": 65290 }, { "epoch": 3.1246464680678105, "grad_norm": 0.20681805908679962, "learning_rate": 8.393568659675458e-05, "loss": 1.4472, "step": 65300 }, { "epoch": 3.1251884092333224, "grad_norm": 0.19305351376533508, "learning_rate": 8.392355507491361e-05, "loss": 1.4432, "step": 65310 }, { "epoch": 3.1257303503988347, "grad_norm": 0.21879535913467407, "learning_rate": 8.391141997014819e-05, "loss": 1.4487, "step": 65320 }, { "epoch": 3.126272291564347, "grad_norm": 0.22270238399505615, "learning_rate": 8.389928128396161e-05, "loss": 1.4372, "step": 65330 }, { "epoch": 3.1266516503802055, "eval_loss": 2.4269917011260986, "eval_runtime": 21.979, "eval_samples_per_second": 227.49, "eval_steps_per_second": 1.228, "step": 65337 }, { "epoch": 3.1268142327298594, "grad_norm": 0.2375665158033371, "learning_rate": 8.388713901785753e-05, "loss": 1.4447, "step": 65340 }, { "epoch": 3.1273561738953717, "grad_norm": 0.30421313643455505, "learning_rate": 8.387499317334007e-05, "loss": 1.4416, "step": 65350 }, { "epoch": 3.1278981150608836, "grad_norm": 0.3451899290084839, "learning_rate": 8.386284375191381e-05, "loss": 1.4467, "step": 65360 }, { "epoch": 3.128440056226396, "grad_norm": 0.26824066042900085, "learning_rate": 8.385069075508379e-05, "loss": 1.4496, "step": 65370 }, { "epoch": 3.128981997391908, "grad_norm": 0.17959991097450256, "learning_rate": 8.383853418435546e-05, "loss": 1.4531, "step": 65380 }, { "epoch": 3.1295239385574205, "grad_norm": 0.18542522192001343, "learning_rate": 8.38263740412347e-05, "loss": 1.4485, "step": 65390 }, { "epoch": 3.1300658797229324, "grad_norm": 0.1759880632162094, "learning_rate": 8.38142103272279e-05, "loss": 1.4377, "step": 65400 }, { "epoch": 3.1306078208884447, "grad_norm": 0.24404051899909973, "learning_rate": 8.380204304384181e-05, "loss": 1.4517, "step": 65410 }, { "epoch": 3.131149762053957, "grad_norm": 0.18275314569473267, "learning_rate": 8.37898721925837e-05, "loss": 1.4604, "step": 65420 }, { "epoch": 3.131366538520162, "eval_loss": 2.424156665802002, "eval_runtime": 21.9886, "eval_samples_per_second": 227.39, "eval_steps_per_second": 1.228, "step": 65424 }, { "epoch": 3.1316917032194693, "grad_norm": 0.22716642916202545, "learning_rate": 8.377769777496118e-05, "loss": 1.4573, "step": 65430 }, { "epoch": 3.1322336443849816, "grad_norm": 0.2442425787448883, "learning_rate": 8.376551979248242e-05, "loss": 1.4396, "step": 65440 }, { "epoch": 3.1327755855504935, "grad_norm": 0.25765150785446167, "learning_rate": 8.375333824665594e-05, "loss": 1.4532, "step": 65450 }, { "epoch": 3.133317526716006, "grad_norm": 0.22376912832260132, "learning_rate": 8.374115313899077e-05, "loss": 1.4431, "step": 65460 }, { "epoch": 3.133859467881518, "grad_norm": 0.36618417501449585, "learning_rate": 8.372896447099634e-05, "loss": 1.4448, "step": 65470 }, { "epoch": 3.1344014090470305, "grad_norm": 0.35767969489097595, "learning_rate": 8.371677224418248e-05, "loss": 1.4337, "step": 65480 }, { "epoch": 3.1349433502125423, "grad_norm": 0.48897460103034973, "learning_rate": 8.370457646005957e-05, "loss": 1.4542, "step": 65490 }, { "epoch": 3.1354852913780547, "grad_norm": 0.39159682393074036, "learning_rate": 8.369237712013835e-05, "loss": 1.4522, "step": 65500 }, { "epoch": 3.136027232543567, "grad_norm": 0.36611253023147583, "learning_rate": 8.368017422593003e-05, "loss": 1.441, "step": 65510 }, { "epoch": 3.1360814266601182, "eval_loss": 2.427338123321533, "eval_runtime": 22.8212, "eval_samples_per_second": 219.095, "eval_steps_per_second": 1.183, "step": 65511 }, { "epoch": 3.1365691737090793, "grad_norm": 0.26878681778907776, "learning_rate": 8.366796777894624e-05, "loss": 1.4446, "step": 65520 }, { "epoch": 3.1371111148745916, "grad_norm": 0.2367294430732727, "learning_rate": 8.365575778069907e-05, "loss": 1.4538, "step": 65530 }, { "epoch": 3.1376530560401035, "grad_norm": 0.29069700837135315, "learning_rate": 8.364354423270102e-05, "loss": 1.4405, "step": 65540 }, { "epoch": 3.138194997205616, "grad_norm": 0.22299885749816895, "learning_rate": 8.363132713646509e-05, "loss": 1.4406, "step": 65550 }, { "epoch": 3.138736938371128, "grad_norm": 0.2363450676202774, "learning_rate": 8.361910649350465e-05, "loss": 1.4323, "step": 65560 }, { "epoch": 3.1392788795366404, "grad_norm": 0.2508092224597931, "learning_rate": 8.360688230533356e-05, "loss": 1.4348, "step": 65570 }, { "epoch": 3.1398208207021527, "grad_norm": 0.20643532276153564, "learning_rate": 8.359465457346607e-05, "loss": 1.4358, "step": 65580 }, { "epoch": 3.1403627618676646, "grad_norm": 0.22416460514068604, "learning_rate": 8.358242329941692e-05, "loss": 1.4501, "step": 65590 }, { "epoch": 3.1407963148000744, "eval_loss": 2.4230735301971436, "eval_runtime": 23.1943, "eval_samples_per_second": 215.571, "eval_steps_per_second": 1.164, "step": 65598 }, { "epoch": 3.140904703033177, "grad_norm": 0.3181062936782837, "learning_rate": 8.357018848470128e-05, "loss": 1.4362, "step": 65600 }, { "epoch": 3.1414466441986892, "grad_norm": 0.23214562237262726, "learning_rate": 8.35579501308347e-05, "loss": 1.4405, "step": 65610 }, { "epoch": 3.1419885853642016, "grad_norm": 0.35660114884376526, "learning_rate": 8.354570823933327e-05, "loss": 1.4464, "step": 65620 }, { "epoch": 3.1425305265297134, "grad_norm": 0.38225075602531433, "learning_rate": 8.353346281171343e-05, "loss": 1.4411, "step": 65630 }, { "epoch": 3.1430724676952257, "grad_norm": 0.2181611806154251, "learning_rate": 8.35212138494921e-05, "loss": 1.442, "step": 65640 }, { "epoch": 3.143614408860738, "grad_norm": 0.23916418850421906, "learning_rate": 8.35089613541866e-05, "loss": 1.4485, "step": 65650 }, { "epoch": 3.1441563500262504, "grad_norm": 0.19111785292625427, "learning_rate": 8.349670532731478e-05, "loss": 1.4476, "step": 65660 }, { "epoch": 3.1446982911917627, "grad_norm": 0.31324857473373413, "learning_rate": 8.34844457703948e-05, "loss": 1.4374, "step": 65670 }, { "epoch": 3.1452402323572746, "grad_norm": 0.3937076926231384, "learning_rate": 8.347218268494535e-05, "loss": 1.4406, "step": 65680 }, { "epoch": 3.145511202940031, "eval_loss": 2.4279704093933105, "eval_runtime": 22.8461, "eval_samples_per_second": 218.856, "eval_steps_per_second": 1.182, "step": 65685 }, { "epoch": 3.145782173522787, "grad_norm": 0.27373912930488586, "learning_rate": 8.345991607248553e-05, "loss": 1.4465, "step": 65690 }, { "epoch": 3.146324114688299, "grad_norm": 0.16764716804027557, "learning_rate": 8.344764593453485e-05, "loss": 1.4447, "step": 65700 }, { "epoch": 3.1468660558538115, "grad_norm": 0.21356230974197388, "learning_rate": 8.343537227261332e-05, "loss": 1.4484, "step": 65710 }, { "epoch": 3.1474079970193234, "grad_norm": 0.17575284838676453, "learning_rate": 8.342309508824132e-05, "loss": 1.4387, "step": 65720 }, { "epoch": 3.1479499381848357, "grad_norm": 0.20323576033115387, "learning_rate": 8.34108143829397e-05, "loss": 1.4325, "step": 65730 }, { "epoch": 3.148491879350348, "grad_norm": 0.30600157380104065, "learning_rate": 8.339853015822974e-05, "loss": 1.4434, "step": 65740 }, { "epoch": 3.1490338205158603, "grad_norm": 0.22637711465358734, "learning_rate": 8.338624241563316e-05, "loss": 1.4399, "step": 65750 }, { "epoch": 3.1495757616813727, "grad_norm": 0.17684820294380188, "learning_rate": 8.33739511566721e-05, "loss": 1.4412, "step": 65760 }, { "epoch": 3.1501177028468845, "grad_norm": 0.20868733525276184, "learning_rate": 8.336165638286916e-05, "loss": 1.4387, "step": 65770 }, { "epoch": 3.150226091079987, "eval_loss": 2.425462245941162, "eval_runtime": 22.0377, "eval_samples_per_second": 226.884, "eval_steps_per_second": 1.225, "step": 65772 }, { "epoch": 3.150659644012397, "grad_norm": 0.23098771274089813, "learning_rate": 8.334935809574738e-05, "loss": 1.4372, "step": 65780 }, { "epoch": 3.151201585177909, "grad_norm": 0.17905355989933014, "learning_rate": 8.33370562968302e-05, "loss": 1.4408, "step": 65790 }, { "epoch": 3.1517435263434215, "grad_norm": 0.21767891943454742, "learning_rate": 8.332475098764149e-05, "loss": 1.4326, "step": 65800 }, { "epoch": 3.152285467508934, "grad_norm": 0.2932087779045105, "learning_rate": 8.331244216970561e-05, "loss": 1.4417, "step": 65810 }, { "epoch": 3.1528274086744457, "grad_norm": 0.2508069574832916, "learning_rate": 8.330012984454732e-05, "loss": 1.4398, "step": 65820 }, { "epoch": 3.153369349839958, "grad_norm": 0.20580008625984192, "learning_rate": 8.32878140136918e-05, "loss": 1.4417, "step": 65830 }, { "epoch": 3.1539112910054703, "grad_norm": 0.222931370139122, "learning_rate": 8.327549467866472e-05, "loss": 1.4447, "step": 65840 }, { "epoch": 3.1544532321709826, "grad_norm": 0.17937925457954407, "learning_rate": 8.32631718409921e-05, "loss": 1.4547, "step": 65850 }, { "epoch": 3.154940979219943, "eval_loss": 2.4275054931640625, "eval_runtime": 21.995, "eval_samples_per_second": 227.324, "eval_steps_per_second": 1.228, "step": 65859 }, { "epoch": 3.1549951733364945, "grad_norm": 0.27297332882881165, "learning_rate": 8.325084550220046e-05, "loss": 1.4369, "step": 65860 }, { "epoch": 3.155537114502007, "grad_norm": 0.2378816157579422, "learning_rate": 8.323851566381672e-05, "loss": 1.4453, "step": 65870 }, { "epoch": 3.156079055667519, "grad_norm": 0.30926477909088135, "learning_rate": 8.322618232736827e-05, "loss": 1.4455, "step": 65880 }, { "epoch": 3.1566209968330314, "grad_norm": 0.19149400293827057, "learning_rate": 8.32138454943829e-05, "loss": 1.4509, "step": 65890 }, { "epoch": 3.1571629379985433, "grad_norm": 0.18199436366558075, "learning_rate": 8.320150516638884e-05, "loss": 1.4574, "step": 65900 }, { "epoch": 3.1577048791640556, "grad_norm": 0.3855782747268677, "learning_rate": 8.318916134491477e-05, "loss": 1.4557, "step": 65910 }, { "epoch": 3.158246820329568, "grad_norm": 0.29194483160972595, "learning_rate": 8.317681403148978e-05, "loss": 1.4476, "step": 65920 }, { "epoch": 3.1587887614950803, "grad_norm": 0.3535638451576233, "learning_rate": 8.316446322764338e-05, "loss": 1.4398, "step": 65930 }, { "epoch": 3.1593307026605926, "grad_norm": 0.36056360602378845, "learning_rate": 8.315210893490556e-05, "loss": 1.44, "step": 65940 }, { "epoch": 3.1596558673599, "eval_loss": 2.4279699325561523, "eval_runtime": 21.9818, "eval_samples_per_second": 227.461, "eval_steps_per_second": 1.228, "step": 65946 }, { "epoch": 3.1598726438261044, "grad_norm": 0.17839354276657104, "learning_rate": 8.313975115480671e-05, "loss": 1.4455, "step": 65950 }, { "epoch": 3.1604145849916168, "grad_norm": 0.21037691831588745, "learning_rate": 8.312738988887766e-05, "loss": 1.4449, "step": 65960 }, { "epoch": 3.160956526157129, "grad_norm": 0.25134557485580444, "learning_rate": 8.311502513864966e-05, "loss": 1.4403, "step": 65970 }, { "epoch": 3.1614984673226414, "grad_norm": 0.17751184105873108, "learning_rate": 8.310265690565443e-05, "loss": 1.4473, "step": 65980 }, { "epoch": 3.1620404084881537, "grad_norm": 0.1860496550798416, "learning_rate": 8.309028519142406e-05, "loss": 1.4408, "step": 65990 }, { "epoch": 3.1625823496536656, "grad_norm": 0.22845673561096191, "learning_rate": 8.30779099974911e-05, "loss": 1.4459, "step": 66000 }, { "epoch": 3.163124290819178, "grad_norm": 0.4244441092014313, "learning_rate": 8.306553132538856e-05, "loss": 1.4389, "step": 66010 }, { "epoch": 3.16366623198469, "grad_norm": 0.3140546679496765, "learning_rate": 8.305314917664985e-05, "loss": 1.4415, "step": 66020 }, { "epoch": 3.1642081731502025, "grad_norm": 0.34219682216644287, "learning_rate": 8.304076355280883e-05, "loss": 1.4586, "step": 66030 }, { "epoch": 3.164370755499856, "eval_loss": 2.432257890701294, "eval_runtime": 21.9773, "eval_samples_per_second": 227.508, "eval_steps_per_second": 1.229, "step": 66033 }, { "epoch": 3.1647501143157144, "grad_norm": 0.17820283770561218, "learning_rate": 8.302837445539974e-05, "loss": 1.4485, "step": 66040 }, { "epoch": 3.1652920554812267, "grad_norm": 0.22140522301197052, "learning_rate": 8.301598188595732e-05, "loss": 1.4522, "step": 66050 }, { "epoch": 3.165833996646739, "grad_norm": 0.24713575839996338, "learning_rate": 8.300358584601671e-05, "loss": 1.4491, "step": 66060 }, { "epoch": 3.1663759378122514, "grad_norm": 0.32958516478538513, "learning_rate": 8.299118633711344e-05, "loss": 1.44, "step": 66070 }, { "epoch": 3.1669178789777637, "grad_norm": 0.24236957728862762, "learning_rate": 8.297878336078354e-05, "loss": 1.4442, "step": 66080 }, { "epoch": 3.1674598201432755, "grad_norm": 0.2900751531124115, "learning_rate": 8.296637691856342e-05, "loss": 1.4535, "step": 66090 }, { "epoch": 3.168001761308788, "grad_norm": 0.2060800939798355, "learning_rate": 8.295396701198996e-05, "loss": 1.4472, "step": 66100 }, { "epoch": 3.1685437024743, "grad_norm": 0.45042285323143005, "learning_rate": 8.294155364260045e-05, "loss": 1.4494, "step": 66110 }, { "epoch": 3.1690856436398125, "grad_norm": 0.21350212395191193, "learning_rate": 8.292913681193254e-05, "loss": 1.447, "step": 66120 }, { "epoch": 3.1690856436398125, "eval_loss": 2.430042266845703, "eval_runtime": 21.5428, "eval_samples_per_second": 232.096, "eval_steps_per_second": 1.253, "step": 66120 }, { "epoch": 3.1696275848053244, "grad_norm": 0.19495175778865814, "learning_rate": 8.291671652152445e-05, "loss": 1.448, "step": 66130 }, { "epoch": 3.1701695259708367, "grad_norm": 0.22730892896652222, "learning_rate": 8.290429277291471e-05, "loss": 1.4446, "step": 66140 }, { "epoch": 3.170711467136349, "grad_norm": 0.23822692036628723, "learning_rate": 8.289186556764233e-05, "loss": 1.4448, "step": 66150 }, { "epoch": 3.1712534083018613, "grad_norm": 0.21188586950302124, "learning_rate": 8.287943490724673e-05, "loss": 1.441, "step": 66160 }, { "epoch": 3.1717953494673736, "grad_norm": 0.279965877532959, "learning_rate": 8.286700079326777e-05, "loss": 1.4367, "step": 66170 }, { "epoch": 3.1723372906328855, "grad_norm": 0.16774581372737885, "learning_rate": 8.285456322724577e-05, "loss": 1.4426, "step": 66180 }, { "epoch": 3.172879231798398, "grad_norm": 0.2504456341266632, "learning_rate": 8.284212221072139e-05, "loss": 1.4505, "step": 66190 }, { "epoch": 3.17342117296391, "grad_norm": 0.17107921838760376, "learning_rate": 8.282967774523579e-05, "loss": 1.4483, "step": 66200 }, { "epoch": 3.1738005317797686, "eval_loss": 2.4283711910247803, "eval_runtime": 21.9868, "eval_samples_per_second": 227.409, "eval_steps_per_second": 1.228, "step": 66207 }, { "epoch": 3.1739631141294224, "grad_norm": 0.2413926124572754, "learning_rate": 8.281722983233054e-05, "loss": 1.4405, "step": 66210 }, { "epoch": 3.1745050552949348, "grad_norm": 0.22221823036670685, "learning_rate": 8.280477847354763e-05, "loss": 1.4362, "step": 66220 }, { "epoch": 3.1750469964604466, "grad_norm": 0.2293311357498169, "learning_rate": 8.279232367042946e-05, "loss": 1.4406, "step": 66230 }, { "epoch": 3.175588937625959, "grad_norm": 0.22770948708057404, "learning_rate": 8.27798654245189e-05, "loss": 1.4537, "step": 66240 }, { "epoch": 3.1761308787914713, "grad_norm": 0.2080654501914978, "learning_rate": 8.276740373735922e-05, "loss": 1.4435, "step": 66250 }, { "epoch": 3.1766728199569836, "grad_norm": 0.23802123963832855, "learning_rate": 8.275493861049414e-05, "loss": 1.4486, "step": 66260 }, { "epoch": 3.1772147611224955, "grad_norm": 0.18610985577106476, "learning_rate": 8.274247004546775e-05, "loss": 1.4385, "step": 66270 }, { "epoch": 3.1777567022880078, "grad_norm": 0.2491571456193924, "learning_rate": 8.272999804382461e-05, "loss": 1.4416, "step": 66280 }, { "epoch": 3.17829864345352, "grad_norm": 0.41166606545448303, "learning_rate": 8.271752260710972e-05, "loss": 1.4539, "step": 66290 }, { "epoch": 3.1785154199197247, "eval_loss": 2.4275481700897217, "eval_runtime": 21.9832, "eval_samples_per_second": 227.446, "eval_steps_per_second": 1.228, "step": 66294 }, { "epoch": 3.1788405846190324, "grad_norm": 0.2705402672290802, "learning_rate": 8.270504373686846e-05, "loss": 1.4452, "step": 66300 }, { "epoch": 3.1793825257845447, "grad_norm": 0.16571412980556488, "learning_rate": 8.269256143464666e-05, "loss": 1.4402, "step": 66310 }, { "epoch": 3.1799244669500566, "grad_norm": 0.17406509816646576, "learning_rate": 8.268007570199058e-05, "loss": 1.4416, "step": 66320 }, { "epoch": 3.180466408115569, "grad_norm": 0.26004037261009216, "learning_rate": 8.26675865404469e-05, "loss": 1.4407, "step": 66330 }, { "epoch": 3.1810083492810812, "grad_norm": 0.18530094623565674, "learning_rate": 8.265509395156272e-05, "loss": 1.4352, "step": 66340 }, { "epoch": 3.1815502904465935, "grad_norm": 0.22706040740013123, "learning_rate": 8.264259793688555e-05, "loss": 1.4528, "step": 66350 }, { "epoch": 3.1820922316121054, "grad_norm": 0.3016396462917328, "learning_rate": 8.263009849796338e-05, "loss": 1.4534, "step": 66360 }, { "epoch": 3.1826341727776177, "grad_norm": 0.2243020385503769, "learning_rate": 8.261759563634458e-05, "loss": 1.4415, "step": 66370 }, { "epoch": 3.18317611394313, "grad_norm": 0.18285973370075226, "learning_rate": 8.260508935357791e-05, "loss": 1.4299, "step": 66380 }, { "epoch": 3.1832303080596813, "eval_loss": 2.425734043121338, "eval_runtime": 21.9789, "eval_samples_per_second": 227.491, "eval_steps_per_second": 1.228, "step": 66381 }, { "epoch": 3.1837180551086424, "grad_norm": 0.2271660417318344, "learning_rate": 8.259257965121263e-05, "loss": 1.4353, "step": 66390 }, { "epoch": 3.1842599962741547, "grad_norm": 0.1780574768781662, "learning_rate": 8.258006653079838e-05, "loss": 1.4335, "step": 66400 }, { "epoch": 3.1848019374396666, "grad_norm": 0.2370023876428604, "learning_rate": 8.256754999388522e-05, "loss": 1.4438, "step": 66410 }, { "epoch": 3.185343878605179, "grad_norm": 0.34876930713653564, "learning_rate": 8.255503004202365e-05, "loss": 1.4481, "step": 66420 }, { "epoch": 3.185885819770691, "grad_norm": 0.19301725924015045, "learning_rate": 8.254250667676461e-05, "loss": 1.4412, "step": 66430 }, { "epoch": 3.1864277609362035, "grad_norm": 0.22976385056972504, "learning_rate": 8.252997989965942e-05, "loss": 1.4483, "step": 66440 }, { "epoch": 3.186969702101716, "grad_norm": 0.17878347635269165, "learning_rate": 8.251744971225982e-05, "loss": 1.4393, "step": 66450 }, { "epoch": 3.1875116432672277, "grad_norm": 0.2477036416530609, "learning_rate": 8.250491611611803e-05, "loss": 1.4437, "step": 66460 }, { "epoch": 3.1879451961996375, "eval_loss": 2.425063371658325, "eval_runtime": 21.9818, "eval_samples_per_second": 227.46, "eval_steps_per_second": 1.228, "step": 66468 }, { "epoch": 3.18805358443274, "grad_norm": 0.26957419514656067, "learning_rate": 8.249237911278665e-05, "loss": 1.4446, "step": 66470 }, { "epoch": 3.1885955255982523, "grad_norm": 0.19030284881591797, "learning_rate": 8.24798387038187e-05, "loss": 1.4414, "step": 66480 }, { "epoch": 3.1891374667637646, "grad_norm": 0.18507073819637299, "learning_rate": 8.246729489076763e-05, "loss": 1.4501, "step": 66490 }, { "epoch": 3.1896794079292765, "grad_norm": 0.17628097534179688, "learning_rate": 8.245474767518734e-05, "loss": 1.444, "step": 66500 }, { "epoch": 3.190221349094789, "grad_norm": 0.259546160697937, "learning_rate": 8.24421970586321e-05, "loss": 1.44, "step": 66510 }, { "epoch": 3.190763290260301, "grad_norm": 0.22335298359394073, "learning_rate": 8.242964304265662e-05, "loss": 1.4356, "step": 66520 }, { "epoch": 3.1913052314258135, "grad_norm": 0.22630515694618225, "learning_rate": 8.241708562881604e-05, "loss": 1.4469, "step": 66530 }, { "epoch": 3.1918471725913253, "grad_norm": 0.2385028600692749, "learning_rate": 8.240452481866595e-05, "loss": 1.4392, "step": 66540 }, { "epoch": 3.1923891137568376, "grad_norm": 0.22308200597763062, "learning_rate": 8.239196061376229e-05, "loss": 1.4402, "step": 66550 }, { "epoch": 3.192660084339594, "eval_loss": 2.420628309249878, "eval_runtime": 21.9793, "eval_samples_per_second": 227.487, "eval_steps_per_second": 1.228, "step": 66555 }, { "epoch": 3.19293105492235, "grad_norm": 0.388051837682724, "learning_rate": 8.237939301566148e-05, "loss": 1.4462, "step": 66560 }, { "epoch": 3.1934729960878623, "grad_norm": 0.18918316066265106, "learning_rate": 8.236682202592032e-05, "loss": 1.4424, "step": 66570 }, { "epoch": 3.1940149372533746, "grad_norm": 0.392870157957077, "learning_rate": 8.235424764609607e-05, "loss": 1.447, "step": 66580 }, { "epoch": 3.1945568784188865, "grad_norm": 0.46847355365753174, "learning_rate": 8.234166987774639e-05, "loss": 1.4523, "step": 66590 }, { "epoch": 3.195098819584399, "grad_norm": 0.23610202968120575, "learning_rate": 8.232908872242932e-05, "loss": 1.4373, "step": 66600 }, { "epoch": 3.195640760749911, "grad_norm": 0.39547818899154663, "learning_rate": 8.231650418170343e-05, "loss": 1.4522, "step": 66610 }, { "epoch": 3.1961827019154234, "grad_norm": 0.3930874168872833, "learning_rate": 8.230391625712759e-05, "loss": 1.4393, "step": 66620 }, { "epoch": 3.1967246430809357, "grad_norm": 0.2097681313753128, "learning_rate": 8.229132495026113e-05, "loss": 1.4432, "step": 66630 }, { "epoch": 3.1972665842464476, "grad_norm": 0.19665232300758362, "learning_rate": 8.227873026266383e-05, "loss": 1.436, "step": 66640 }, { "epoch": 3.19737497247955, "eval_loss": 2.4246163368225098, "eval_runtime": 21.9864, "eval_samples_per_second": 227.414, "eval_steps_per_second": 1.228, "step": 66642 }, { "epoch": 3.19780852541196, "grad_norm": 0.2388431578874588, "learning_rate": 8.226613219589583e-05, "loss": 1.4458, "step": 66650 }, { "epoch": 3.1983504665774722, "grad_norm": 0.22420534491539001, "learning_rate": 8.225353075151781e-05, "loss": 1.4418, "step": 66660 }, { "epoch": 3.1988924077429846, "grad_norm": 0.22993390262126923, "learning_rate": 8.224092593109068e-05, "loss": 1.4308, "step": 66670 }, { "epoch": 3.1994343489084964, "grad_norm": 0.21597042679786682, "learning_rate": 8.222831773617592e-05, "loss": 1.4508, "step": 66680 }, { "epoch": 3.1999762900740087, "grad_norm": 0.21598871052265167, "learning_rate": 8.221570616833538e-05, "loss": 1.4428, "step": 66690 }, { "epoch": 3.200518231239521, "grad_norm": 0.20474159717559814, "learning_rate": 8.220309122913132e-05, "loss": 1.4461, "step": 66700 }, { "epoch": 3.2010601724050334, "grad_norm": 0.2574845850467682, "learning_rate": 8.219047292012642e-05, "loss": 1.4409, "step": 66710 }, { "epoch": 3.2016021135705457, "grad_norm": 0.17870694398880005, "learning_rate": 8.217785124288378e-05, "loss": 1.4424, "step": 66720 }, { "epoch": 3.2020898606195063, "eval_loss": 2.422410726547241, "eval_runtime": 21.9819, "eval_samples_per_second": 227.46, "eval_steps_per_second": 1.228, "step": 66729 }, { "epoch": 3.2021440547360576, "grad_norm": 0.2235659956932068, "learning_rate": 8.216522619896693e-05, "loss": 1.4361, "step": 66730 }, { "epoch": 3.20268599590157, "grad_norm": 0.16267363727092743, "learning_rate": 8.215259778993979e-05, "loss": 1.4447, "step": 66740 }, { "epoch": 3.203227937067082, "grad_norm": 0.26697710156440735, "learning_rate": 8.213996601736673e-05, "loss": 1.4388, "step": 66750 }, { "epoch": 3.2037698782325945, "grad_norm": 0.3052631616592407, "learning_rate": 8.21273308828125e-05, "loss": 1.4425, "step": 66760 }, { "epoch": 3.2043118193981064, "grad_norm": 0.22814327478408813, "learning_rate": 8.21146923878423e-05, "loss": 1.4351, "step": 66770 }, { "epoch": 3.2048537605636187, "grad_norm": 0.19500422477722168, "learning_rate": 8.210205053402172e-05, "loss": 1.4411, "step": 66780 }, { "epoch": 3.205395701729131, "grad_norm": 0.1982443779706955, "learning_rate": 8.20894053229168e-05, "loss": 1.4501, "step": 66790 }, { "epoch": 3.2059376428946433, "grad_norm": 0.2346677929162979, "learning_rate": 8.207675675609395e-05, "loss": 1.443, "step": 66800 }, { "epoch": 3.2064795840601557, "grad_norm": 0.18981340527534485, "learning_rate": 8.206410483512004e-05, "loss": 1.4277, "step": 66810 }, { "epoch": 3.206804748759463, "eval_loss": 2.4243276119232178, "eval_runtime": 21.9797, "eval_samples_per_second": 227.482, "eval_steps_per_second": 1.228, "step": 66816 }, { "epoch": 3.2070215252256675, "grad_norm": 0.20465369522571564, "learning_rate": 8.20514495615623e-05, "loss": 1.4445, "step": 66820 }, { "epoch": 3.20756346639118, "grad_norm": 0.18857480585575104, "learning_rate": 8.203879093698845e-05, "loss": 1.439, "step": 66830 }, { "epoch": 3.208105407556692, "grad_norm": 0.19271798431873322, "learning_rate": 8.202612896296657e-05, "loss": 1.4458, "step": 66840 }, { "epoch": 3.2086473487222045, "grad_norm": 0.42280739545822144, "learning_rate": 8.201346364106516e-05, "loss": 1.4374, "step": 66850 }, { "epoch": 3.209189289887717, "grad_norm": 0.21561777591705322, "learning_rate": 8.200079497285316e-05, "loss": 1.4565, "step": 66860 }, { "epoch": 3.2097312310532287, "grad_norm": 0.20918765664100647, "learning_rate": 8.198812295989991e-05, "loss": 1.4396, "step": 66870 }, { "epoch": 3.210273172218741, "grad_norm": 0.33416321873664856, "learning_rate": 8.197544760377514e-05, "loss": 1.4401, "step": 66880 }, { "epoch": 3.2108151133842533, "grad_norm": 0.2590186297893524, "learning_rate": 8.196276890604906e-05, "loss": 1.4388, "step": 66890 }, { "epoch": 3.2113570545497656, "grad_norm": 0.20215663313865662, "learning_rate": 8.195008686829222e-05, "loss": 1.4443, "step": 66900 }, { "epoch": 3.211519636899419, "eval_loss": 2.431623935699463, "eval_runtime": 21.9765, "eval_samples_per_second": 227.515, "eval_steps_per_second": 1.229, "step": 66903 }, { "epoch": 3.2118989957152775, "grad_norm": 0.3255484700202942, "learning_rate": 8.193740149207561e-05, "loss": 1.44, "step": 66910 }, { "epoch": 3.21244093688079, "grad_norm": 0.33131662011146545, "learning_rate": 8.192471277897068e-05, "loss": 1.4452, "step": 66920 }, { "epoch": 3.212982878046302, "grad_norm": 0.2151406705379486, "learning_rate": 8.191202073054922e-05, "loss": 1.4488, "step": 66930 }, { "epoch": 3.2135248192118144, "grad_norm": 0.2985169291496277, "learning_rate": 8.189932534838346e-05, "loss": 1.442, "step": 66940 }, { "epoch": 3.2140667603773263, "grad_norm": 0.21713115274906158, "learning_rate": 8.188662663404607e-05, "loss": 1.44, "step": 66950 }, { "epoch": 3.2146087015428386, "grad_norm": 0.25025808811187744, "learning_rate": 8.18739245891101e-05, "loss": 1.4394, "step": 66960 }, { "epoch": 3.215150642708351, "grad_norm": 0.4430973529815674, "learning_rate": 8.186121921514903e-05, "loss": 1.4407, "step": 66970 }, { "epoch": 3.2156925838738633, "grad_norm": 0.25356000661849976, "learning_rate": 8.184851051373673e-05, "loss": 1.4364, "step": 66980 }, { "epoch": 3.2162345250393756, "grad_norm": 0.3219984471797943, "learning_rate": 8.183579848644753e-05, "loss": 1.4377, "step": 66990 }, { "epoch": 3.2162345250393756, "eval_loss": 2.4254963397979736, "eval_runtime": 21.8545, "eval_samples_per_second": 228.785, "eval_steps_per_second": 1.235, "step": 66990 }, { "epoch": 3.2167764662048874, "grad_norm": 0.17054729163646698, "learning_rate": 8.18230831348561e-05, "loss": 1.4393, "step": 67000 }, { "epoch": 3.2173184073703998, "grad_norm": 0.26265397667884827, "learning_rate": 8.181036446053761e-05, "loss": 1.4521, "step": 67010 }, { "epoch": 3.217860348535912, "grad_norm": 0.3703189492225647, "learning_rate": 8.179764246506755e-05, "loss": 1.4443, "step": 67020 }, { "epoch": 3.2184022897014244, "grad_norm": 0.21587614715099335, "learning_rate": 8.17849171500219e-05, "loss": 1.4459, "step": 67030 }, { "epoch": 3.2189442308669367, "grad_norm": 0.21136698126792908, "learning_rate": 8.1772188516977e-05, "loss": 1.4417, "step": 67040 }, { "epoch": 3.2194861720324486, "grad_norm": 0.38758090138435364, "learning_rate": 8.175945656750962e-05, "loss": 1.4469, "step": 67050 }, { "epoch": 3.220028113197961, "grad_norm": 0.3118115961551666, "learning_rate": 8.174672130319694e-05, "loss": 1.4373, "step": 67060 }, { "epoch": 3.220570054363473, "grad_norm": 0.19323886930942535, "learning_rate": 8.173398272561654e-05, "loss": 1.4394, "step": 67070 }, { "epoch": 3.2209494131793317, "eval_loss": 2.42934513092041, "eval_runtime": 24.88, "eval_samples_per_second": 200.964, "eval_steps_per_second": 1.085, "step": 67077 }, { "epoch": 3.2211119955289855, "grad_norm": 0.22981129586696625, "learning_rate": 8.172124083634643e-05, "loss": 1.4441, "step": 67080 }, { "epoch": 3.2216539366944974, "grad_norm": 0.18165723979473114, "learning_rate": 8.170849563696501e-05, "loss": 1.4517, "step": 67090 }, { "epoch": 3.2221958778600097, "grad_norm": 0.19143837690353394, "learning_rate": 8.169574712905111e-05, "loss": 1.4327, "step": 67100 }, { "epoch": 3.222737819025522, "grad_norm": 0.3314666152000427, "learning_rate": 8.168299531418396e-05, "loss": 1.4345, "step": 67110 }, { "epoch": 3.2232797601910343, "grad_norm": 0.2969511151313782, "learning_rate": 8.167024019394321e-05, "loss": 1.4327, "step": 67120 }, { "epoch": 3.2238217013565467, "grad_norm": 0.2874842882156372, "learning_rate": 8.165748176990887e-05, "loss": 1.4512, "step": 67130 }, { "epoch": 3.2243636425220585, "grad_norm": 0.2401866316795349, "learning_rate": 8.164472004366145e-05, "loss": 1.4391, "step": 67140 }, { "epoch": 3.224905583687571, "grad_norm": 0.26989611983299255, "learning_rate": 8.163195501678177e-05, "loss": 1.4425, "step": 67150 }, { "epoch": 3.225447524853083, "grad_norm": 0.21331489086151123, "learning_rate": 8.161918669085113e-05, "loss": 1.4442, "step": 67160 }, { "epoch": 3.225664301319288, "eval_loss": 2.4356348514556885, "eval_runtime": 26.8873, "eval_samples_per_second": 185.961, "eval_steps_per_second": 1.004, "step": 67164 }, { "epoch": 3.2259894660185955, "grad_norm": 0.28904426097869873, "learning_rate": 8.160641506745123e-05, "loss": 1.4346, "step": 67170 }, { "epoch": 3.2265314071841074, "grad_norm": 0.3067649304866791, "learning_rate": 8.159364014816412e-05, "loss": 1.449, "step": 67180 }, { "epoch": 3.2270733483496197, "grad_norm": 0.23630201816558838, "learning_rate": 8.158086193457234e-05, "loss": 1.4384, "step": 67190 }, { "epoch": 3.227615289515132, "grad_norm": 0.21013253927230835, "learning_rate": 8.156808042825878e-05, "loss": 1.4325, "step": 67200 }, { "epoch": 3.2281572306806443, "grad_norm": 0.18870244920253754, "learning_rate": 8.155529563080676e-05, "loss": 1.4469, "step": 67210 }, { "epoch": 3.2286991718461566, "grad_norm": 0.266428142786026, "learning_rate": 8.154250754380002e-05, "loss": 1.442, "step": 67220 }, { "epoch": 3.2292411130116685, "grad_norm": 0.22033743560314178, "learning_rate": 8.152971616882269e-05, "loss": 1.4389, "step": 67230 }, { "epoch": 3.229783054177181, "grad_norm": 0.29417768120765686, "learning_rate": 8.151692150745928e-05, "loss": 1.4398, "step": 67240 }, { "epoch": 3.230324995342693, "grad_norm": 0.2595650255680084, "learning_rate": 8.150412356129478e-05, "loss": 1.4334, "step": 67250 }, { "epoch": 3.2303791894592444, "eval_loss": 2.4399569034576416, "eval_runtime": 22.4553, "eval_samples_per_second": 222.665, "eval_steps_per_second": 1.202, "step": 67251 }, { "epoch": 3.2308669365082054, "grad_norm": 0.16750743985176086, "learning_rate": 8.14913223319145e-05, "loss": 1.4424, "step": 67260 }, { "epoch": 3.2314088776737178, "grad_norm": 0.17600034177303314, "learning_rate": 8.147851782090425e-05, "loss": 1.4473, "step": 67270 }, { "epoch": 3.2319508188392296, "grad_norm": 0.18144869804382324, "learning_rate": 8.146571002985013e-05, "loss": 1.4402, "step": 67280 }, { "epoch": 3.232492760004742, "grad_norm": 0.19220831990242004, "learning_rate": 8.145289896033879e-05, "loss": 1.4406, "step": 67290 }, { "epoch": 3.2330347011702543, "grad_norm": 0.22534775733947754, "learning_rate": 8.144008461395716e-05, "loss": 1.4429, "step": 67300 }, { "epoch": 3.2335766423357666, "grad_norm": 0.18956543505191803, "learning_rate": 8.142726699229265e-05, "loss": 1.4538, "step": 67310 }, { "epoch": 3.2341185835012785, "grad_norm": 0.2575107514858246, "learning_rate": 8.141444609693302e-05, "loss": 1.4445, "step": 67320 }, { "epoch": 3.2346605246667908, "grad_norm": 0.37571388483047485, "learning_rate": 8.14016219294665e-05, "loss": 1.4412, "step": 67330 }, { "epoch": 3.2350940775992005, "eval_loss": 2.430121660232544, "eval_runtime": 24.4358, "eval_samples_per_second": 204.618, "eval_steps_per_second": 1.105, "step": 67338 }, { "epoch": 3.235202465832303, "grad_norm": 0.2669738531112671, "learning_rate": 8.138879449148168e-05, "loss": 1.4511, "step": 67340 }, { "epoch": 3.2357444069978154, "grad_norm": 0.16547146439552307, "learning_rate": 8.137596378456757e-05, "loss": 1.4397, "step": 67350 }, { "epoch": 3.2362863481633277, "grad_norm": 0.168304443359375, "learning_rate": 8.136312981031358e-05, "loss": 1.4388, "step": 67360 }, { "epoch": 3.2368282893288396, "grad_norm": 0.43757399916648865, "learning_rate": 8.135029257030953e-05, "loss": 1.4345, "step": 67370 }, { "epoch": 3.237370230494352, "grad_norm": 0.26308125257492065, "learning_rate": 8.133745206614561e-05, "loss": 1.4538, "step": 67380 }, { "epoch": 3.2379121716598642, "grad_norm": 0.18845511972904205, "learning_rate": 8.132460829941252e-05, "loss": 1.4393, "step": 67390 }, { "epoch": 3.2384541128253765, "grad_norm": 0.17935842275619507, "learning_rate": 8.13117612717012e-05, "loss": 1.437, "step": 67400 }, { "epoch": 3.2389960539908884, "grad_norm": 0.18170326948165894, "learning_rate": 8.129891098460316e-05, "loss": 1.4439, "step": 67410 }, { "epoch": 3.2395379951564007, "grad_norm": 0.2227085679769516, "learning_rate": 8.128605743971018e-05, "loss": 1.4394, "step": 67420 }, { "epoch": 3.239808965739157, "eval_loss": 2.421604633331299, "eval_runtime": 22.8082, "eval_samples_per_second": 219.219, "eval_steps_per_second": 1.184, "step": 67425 }, { "epoch": 3.240079936321913, "grad_norm": 0.26854372024536133, "learning_rate": 8.127320063861455e-05, "loss": 1.4398, "step": 67430 }, { "epoch": 3.2406218774874254, "grad_norm": 0.2939850389957428, "learning_rate": 8.126034058290887e-05, "loss": 1.4404, "step": 67440 }, { "epoch": 3.2411638186529377, "grad_norm": 0.20707334578037262, "learning_rate": 8.124747727418623e-05, "loss": 1.4451, "step": 67450 }, { "epoch": 3.2417057598184496, "grad_norm": 0.17989417910575867, "learning_rate": 8.123461071404005e-05, "loss": 1.4282, "step": 67460 }, { "epoch": 3.242247700983962, "grad_norm": 0.2010103315114975, "learning_rate": 8.122174090406418e-05, "loss": 1.4304, "step": 67470 }, { "epoch": 3.242789642149474, "grad_norm": 0.24504514038562775, "learning_rate": 8.120886784585292e-05, "loss": 1.4444, "step": 67480 }, { "epoch": 3.2433315833149865, "grad_norm": 0.16585427522659302, "learning_rate": 8.119599154100087e-05, "loss": 1.4337, "step": 67490 }, { "epoch": 3.243873524480499, "grad_norm": 0.2048460990190506, "learning_rate": 8.118311199110314e-05, "loss": 1.4403, "step": 67500 }, { "epoch": 3.2444154656460107, "grad_norm": 0.18525651097297668, "learning_rate": 8.117022919775516e-05, "loss": 1.4412, "step": 67510 }, { "epoch": 3.2445238538791132, "eval_loss": 2.4250948429107666, "eval_runtime": 24.9863, "eval_samples_per_second": 200.11, "eval_steps_per_second": 1.081, "step": 67512 }, { "epoch": 3.244957406811523, "grad_norm": 0.1921282261610031, "learning_rate": 8.115734316255281e-05, "loss": 1.4357, "step": 67520 }, { "epoch": 3.2454993479770353, "grad_norm": 0.3551950454711914, "learning_rate": 8.114445388709236e-05, "loss": 1.449, "step": 67530 }, { "epoch": 3.2460412891425476, "grad_norm": 0.26464539766311646, "learning_rate": 8.113156137297048e-05, "loss": 1.4367, "step": 67540 }, { "epoch": 3.2465832303080595, "grad_norm": 0.24829427897930145, "learning_rate": 8.111866562178419e-05, "loss": 1.4472, "step": 67550 }, { "epoch": 3.247125171473572, "grad_norm": 0.2007465362548828, "learning_rate": 8.110576663513105e-05, "loss": 1.4395, "step": 67560 }, { "epoch": 3.247667112639084, "grad_norm": 0.17801623046398163, "learning_rate": 8.109286441460885e-05, "loss": 1.4425, "step": 67570 }, { "epoch": 3.2482090538045965, "grad_norm": 0.2939807176589966, "learning_rate": 8.107995896181588e-05, "loss": 1.4434, "step": 67580 }, { "epoch": 3.2487509949701083, "grad_norm": 0.18530774116516113, "learning_rate": 8.106705027835083e-05, "loss": 1.4438, "step": 67590 }, { "epoch": 3.2492387420190694, "eval_loss": 2.4217963218688965, "eval_runtime": 23.4048, "eval_samples_per_second": 213.631, "eval_steps_per_second": 1.154, "step": 67599 }, { "epoch": 3.2492929361356206, "grad_norm": 0.2213825136423111, "learning_rate": 8.105413836581277e-05, "loss": 1.4492, "step": 67600 }, { "epoch": 3.249834877301133, "grad_norm": 0.2034333199262619, "learning_rate": 8.104122322580116e-05, "loss": 1.4339, "step": 67610 }, { "epoch": 3.2503768184666453, "grad_norm": 0.16729718446731567, "learning_rate": 8.102830485991589e-05, "loss": 1.4311, "step": 67620 }, { "epoch": 3.2509187596321576, "grad_norm": 0.18632495403289795, "learning_rate": 8.101538326975721e-05, "loss": 1.4385, "step": 67630 }, { "epoch": 3.2514607007976695, "grad_norm": 0.3231146037578583, "learning_rate": 8.100245845692579e-05, "loss": 1.4494, "step": 67640 }, { "epoch": 3.252002641963182, "grad_norm": 0.37084832787513733, "learning_rate": 8.09895304230227e-05, "loss": 1.4487, "step": 67650 }, { "epoch": 3.252544583128694, "grad_norm": 0.37098175287246704, "learning_rate": 8.097659916964943e-05, "loss": 1.4386, "step": 67660 }, { "epoch": 3.2530865242942064, "grad_norm": 0.2461938112974167, "learning_rate": 8.096366469840785e-05, "loss": 1.4404, "step": 67670 }, { "epoch": 3.2536284654597187, "grad_norm": 0.21850334107875824, "learning_rate": 8.095072701090019e-05, "loss": 1.4517, "step": 67680 }, { "epoch": 3.253953630159026, "eval_loss": 2.423842191696167, "eval_runtime": 21.9821, "eval_samples_per_second": 227.458, "eval_steps_per_second": 1.228, "step": 67686 }, { "epoch": 3.2541704066252306, "grad_norm": 0.18838092684745789, "learning_rate": 8.093778610872912e-05, "loss": 1.4467, "step": 67690 }, { "epoch": 3.254712347790743, "grad_norm": 0.21146613359451294, "learning_rate": 8.092484199349775e-05, "loss": 1.4391, "step": 67700 }, { "epoch": 3.2552542889562552, "grad_norm": 0.21148332953453064, "learning_rate": 8.091189466680948e-05, "loss": 1.4415, "step": 67710 }, { "epoch": 3.2557962301217676, "grad_norm": 0.18964509665966034, "learning_rate": 8.089894413026823e-05, "loss": 1.4444, "step": 67720 }, { "epoch": 3.25633817128728, "grad_norm": 0.21399128437042236, "learning_rate": 8.08859903854782e-05, "loss": 1.4514, "step": 67730 }, { "epoch": 3.2568801124527917, "grad_norm": 0.20721645653247833, "learning_rate": 8.087303343404406e-05, "loss": 1.4518, "step": 67740 }, { "epoch": 3.257422053618304, "grad_norm": 0.2715248763561249, "learning_rate": 8.086007327757088e-05, "loss": 1.4342, "step": 67750 }, { "epoch": 3.2579639947838164, "grad_norm": 0.19381947815418243, "learning_rate": 8.08471099176641e-05, "loss": 1.4455, "step": 67760 }, { "epoch": 3.2585059359493287, "grad_norm": 0.169798344373703, "learning_rate": 8.083414335592955e-05, "loss": 1.4354, "step": 67770 }, { "epoch": 3.258668518298982, "eval_loss": 2.425126075744629, "eval_runtime": 23.083, "eval_samples_per_second": 216.61, "eval_steps_per_second": 1.17, "step": 67773 }, { "epoch": 3.2590478771148406, "grad_norm": 0.17293022572994232, "learning_rate": 8.08211735939735e-05, "loss": 1.4359, "step": 67780 }, { "epoch": 3.259589818280353, "grad_norm": 0.28945857286453247, "learning_rate": 8.080820063340254e-05, "loss": 1.4333, "step": 67790 }, { "epoch": 3.260131759445865, "grad_norm": 0.18180948495864868, "learning_rate": 8.079522447582375e-05, "loss": 1.4428, "step": 67800 }, { "epoch": 3.2606737006113775, "grad_norm": 0.2261655181646347, "learning_rate": 8.078224512284455e-05, "loss": 1.4295, "step": 67810 }, { "epoch": 3.2612156417768894, "grad_norm": 0.5620535612106323, "learning_rate": 8.076926257607274e-05, "loss": 1.4416, "step": 67820 }, { "epoch": 3.2617575829424017, "grad_norm": 0.24003808200359344, "learning_rate": 8.075627683711658e-05, "loss": 1.4422, "step": 67830 }, { "epoch": 3.262299524107914, "grad_norm": 0.2081241011619568, "learning_rate": 8.074328790758466e-05, "loss": 1.4396, "step": 67840 }, { "epoch": 3.2628414652734263, "grad_norm": 0.22388330101966858, "learning_rate": 8.073029578908601e-05, "loss": 1.4407, "step": 67850 }, { "epoch": 3.2633834064389386, "grad_norm": 0.27499768137931824, "learning_rate": 8.071730048323002e-05, "loss": 1.4343, "step": 67860 }, { "epoch": 3.2633834064389386, "eval_loss": 2.4300835132598877, "eval_runtime": 22.2873, "eval_samples_per_second": 224.343, "eval_steps_per_second": 1.211, "step": 67860 }, { "epoch": 3.2639253476044505, "grad_norm": 0.17892539501190186, "learning_rate": 8.070430199162648e-05, "loss": 1.4281, "step": 67870 }, { "epoch": 3.264467288769963, "grad_norm": 0.24364718794822693, "learning_rate": 8.069130031588562e-05, "loss": 1.4381, "step": 67880 }, { "epoch": 3.265009229935475, "grad_norm": 0.22364751994609833, "learning_rate": 8.067829545761804e-05, "loss": 1.4383, "step": 67890 }, { "epoch": 3.2655511711009875, "grad_norm": 0.3235653340816498, "learning_rate": 8.066528741843468e-05, "loss": 1.4419, "step": 67900 }, { "epoch": 3.2660931122665, "grad_norm": 0.2588692605495453, "learning_rate": 8.065227619994695e-05, "loss": 1.4372, "step": 67910 }, { "epoch": 3.2666350534320117, "grad_norm": 0.22628279030323029, "learning_rate": 8.063926180376661e-05, "loss": 1.4345, "step": 67920 }, { "epoch": 3.267176994597524, "grad_norm": 0.29548385739326477, "learning_rate": 8.062624423150584e-05, "loss": 1.4383, "step": 67930 }, { "epoch": 3.2677189357630363, "grad_norm": 0.21970342099666595, "learning_rate": 8.061322348477717e-05, "loss": 1.4512, "step": 67940 }, { "epoch": 3.268098294578895, "eval_loss": 2.420971393585205, "eval_runtime": 28.5235, "eval_samples_per_second": 175.294, "eval_steps_per_second": 0.947, "step": 67947 }, { "epoch": 3.2682608769285486, "grad_norm": 0.19080619513988495, "learning_rate": 8.06001995651936e-05, "loss": 1.4366, "step": 67950 }, { "epoch": 3.2688028180940605, "grad_norm": 0.21438750624656677, "learning_rate": 8.058717247436845e-05, "loss": 1.4318, "step": 67960 }, { "epoch": 3.269344759259573, "grad_norm": 0.24113547801971436, "learning_rate": 8.057414221391545e-05, "loss": 1.443, "step": 67970 }, { "epoch": 3.269886700425085, "grad_norm": 0.22619077563285828, "learning_rate": 8.056110878544875e-05, "loss": 1.4333, "step": 67980 }, { "epoch": 3.2704286415905974, "grad_norm": 0.192766472697258, "learning_rate": 8.054807219058287e-05, "loss": 1.4479, "step": 67990 }, { "epoch": 3.2709705827561093, "grad_norm": 0.22505764663219452, "learning_rate": 8.053503243093275e-05, "loss": 1.4371, "step": 68000 }, { "epoch": 3.2715125239216216, "grad_norm": 0.18467269837856293, "learning_rate": 8.052198950811364e-05, "loss": 1.4294, "step": 68010 }, { "epoch": 3.272054465087134, "grad_norm": 0.21085874736309052, "learning_rate": 8.050894342374128e-05, "loss": 1.4519, "step": 68020 }, { "epoch": 3.2725964062526463, "grad_norm": 0.3685001730918884, "learning_rate": 8.049589417943176e-05, "loss": 1.4364, "step": 68030 }, { "epoch": 3.272813182718851, "eval_loss": 2.4268741607666016, "eval_runtime": 23.9572, "eval_samples_per_second": 208.705, "eval_steps_per_second": 1.127, "step": 68034 }, { "epoch": 3.2731383474181586, "grad_norm": 0.21414147317409515, "learning_rate": 8.048284177680158e-05, "loss": 1.4309, "step": 68040 }, { "epoch": 3.2736802885836704, "grad_norm": 0.23878353834152222, "learning_rate": 8.046978621746759e-05, "loss": 1.4443, "step": 68050 }, { "epoch": 3.2742222297491828, "grad_norm": 0.18432900309562683, "learning_rate": 8.045672750304703e-05, "loss": 1.429, "step": 68060 }, { "epoch": 3.274764170914695, "grad_norm": 0.18248611688613892, "learning_rate": 8.044366563515762e-05, "loss": 1.4367, "step": 68070 }, { "epoch": 3.2753061120802074, "grad_norm": 0.22367575764656067, "learning_rate": 8.043060061541737e-05, "loss": 1.4467, "step": 68080 }, { "epoch": 3.2758480532457197, "grad_norm": 0.17164652049541473, "learning_rate": 8.041753244544472e-05, "loss": 1.4351, "step": 68090 }, { "epoch": 3.2763899944112316, "grad_norm": 0.2587142288684845, "learning_rate": 8.04044611268585e-05, "loss": 1.4501, "step": 68100 }, { "epoch": 3.276931935576744, "grad_norm": 0.21978062391281128, "learning_rate": 8.039138666127793e-05, "loss": 1.4343, "step": 68110 }, { "epoch": 3.277473876742256, "grad_norm": 0.1821766346693039, "learning_rate": 8.037830905032264e-05, "loss": 1.4463, "step": 68120 }, { "epoch": 3.2775280708588075, "eval_loss": 2.42914080619812, "eval_runtime": 25.1974, "eval_samples_per_second": 198.433, "eval_steps_per_second": 1.072, "step": 68121 }, { "epoch": 3.2780158179077685, "grad_norm": 0.20630845427513123, "learning_rate": 8.036522829561259e-05, "loss": 1.4386, "step": 68130 }, { "epoch": 3.278557759073281, "grad_norm": 0.18165987730026245, "learning_rate": 8.035214439876818e-05, "loss": 1.4445, "step": 68140 }, { "epoch": 3.2790997002387927, "grad_norm": 0.2866304814815521, "learning_rate": 8.03390573614102e-05, "loss": 1.4429, "step": 68150 }, { "epoch": 3.279641641404305, "grad_norm": 0.4309597909450531, "learning_rate": 8.032596718515982e-05, "loss": 1.4471, "step": 68160 }, { "epoch": 3.2801835825698173, "grad_norm": 0.16174156963825226, "learning_rate": 8.031287387163854e-05, "loss": 1.4416, "step": 68170 }, { "epoch": 3.2807255237353297, "grad_norm": 0.20739299058914185, "learning_rate": 8.029977742246837e-05, "loss": 1.434, "step": 68180 }, { "epoch": 3.2812674649008415, "grad_norm": 0.30324587225914, "learning_rate": 8.02866778392716e-05, "loss": 1.4353, "step": 68190 }, { "epoch": 3.281809406066354, "grad_norm": 0.2629935145378113, "learning_rate": 8.027357512367097e-05, "loss": 1.4326, "step": 68200 }, { "epoch": 3.2822429589987636, "eval_loss": 2.420156955718994, "eval_runtime": 25.5157, "eval_samples_per_second": 195.958, "eval_steps_per_second": 1.058, "step": 68208 }, { "epoch": 3.282351347231866, "grad_norm": 0.23528800904750824, "learning_rate": 8.026046927728959e-05, "loss": 1.4362, "step": 68210 }, { "epoch": 3.2828932883973785, "grad_norm": 0.18353603780269623, "learning_rate": 8.024736030175092e-05, "loss": 1.4437, "step": 68220 }, { "epoch": 3.2834352295628904, "grad_norm": 0.3570035994052887, "learning_rate": 8.02342481986789e-05, "loss": 1.4461, "step": 68230 }, { "epoch": 3.2839771707284027, "grad_norm": 0.43758606910705566, "learning_rate": 8.022113296969773e-05, "loss": 1.4389, "step": 68240 }, { "epoch": 3.284519111893915, "grad_norm": 0.2364160418510437, "learning_rate": 8.020801461643214e-05, "loss": 1.4388, "step": 68250 }, { "epoch": 3.2850610530594273, "grad_norm": 0.3303424119949341, "learning_rate": 8.019489314050715e-05, "loss": 1.4482, "step": 68260 }, { "epoch": 3.2856029942249396, "grad_norm": 0.3738381266593933, "learning_rate": 8.018176854354815e-05, "loss": 1.4299, "step": 68270 }, { "epoch": 3.2861449353904515, "grad_norm": 0.1708504557609558, "learning_rate": 8.016864082718102e-05, "loss": 1.4301, "step": 68280 }, { "epoch": 3.286686876555964, "grad_norm": 0.22790126502513885, "learning_rate": 8.015550999303192e-05, "loss": 1.4363, "step": 68290 }, { "epoch": 3.28695784713872, "eval_loss": 2.4150733947753906, "eval_runtime": 25.2805, "eval_samples_per_second": 197.781, "eval_steps_per_second": 1.068, "step": 68295 }, { "epoch": 3.287228817721476, "grad_norm": 0.17331331968307495, "learning_rate": 8.014237604272744e-05, "loss": 1.4265, "step": 68300 }, { "epoch": 3.2877707588869884, "grad_norm": 0.2437063455581665, "learning_rate": 8.012923897789461e-05, "loss": 1.4362, "step": 68310 }, { "epoch": 3.2883127000525008, "grad_norm": 0.1904052495956421, "learning_rate": 8.011609880016074e-05, "loss": 1.4315, "step": 68320 }, { "epoch": 3.2888546412180126, "grad_norm": 0.17929191887378693, "learning_rate": 8.010295551115358e-05, "loss": 1.4352, "step": 68330 }, { "epoch": 3.289396582383525, "grad_norm": 0.24011602997779846, "learning_rate": 8.00898091125013e-05, "loss": 1.4416, "step": 68340 }, { "epoch": 3.2899385235490373, "grad_norm": 0.26743006706237793, "learning_rate": 8.007665960583237e-05, "loss": 1.4295, "step": 68350 }, { "epoch": 3.2904804647145496, "grad_norm": 0.21712139248847961, "learning_rate": 8.006350699277573e-05, "loss": 1.4409, "step": 68360 }, { "epoch": 3.2910224058800615, "grad_norm": 0.19347068667411804, "learning_rate": 8.005035127496068e-05, "loss": 1.4342, "step": 68370 }, { "epoch": 3.2915643470455738, "grad_norm": 0.2079339623451233, "learning_rate": 8.003719245401684e-05, "loss": 1.4394, "step": 68380 }, { "epoch": 3.2916727352786763, "eval_loss": 2.4259541034698486, "eval_runtime": 29.0409, "eval_samples_per_second": 172.171, "eval_steps_per_second": 0.93, "step": 68382 }, { "epoch": 3.292106288211086, "grad_norm": 0.17019985616207123, "learning_rate": 8.002403053157432e-05, "loss": 1.4358, "step": 68390 }, { "epoch": 3.2926482293765984, "grad_norm": 0.18729358911514282, "learning_rate": 8.001086550926354e-05, "loss": 1.431, "step": 68400 }, { "epoch": 3.2931901705421103, "grad_norm": 0.3095646798610687, "learning_rate": 7.999769738871533e-05, "loss": 1.4412, "step": 68410 }, { "epoch": 3.2937321117076226, "grad_norm": 0.17975299060344696, "learning_rate": 7.998452617156088e-05, "loss": 1.4424, "step": 68420 }, { "epoch": 3.294274052873135, "grad_norm": 0.15972350537776947, "learning_rate": 7.997135185943182e-05, "loss": 1.4329, "step": 68430 }, { "epoch": 3.2948159940386472, "grad_norm": 0.19751180708408356, "learning_rate": 7.995817445396009e-05, "loss": 1.437, "step": 68440 }, { "epoch": 3.2953579352041595, "grad_norm": 0.17910368740558624, "learning_rate": 7.994499395677807e-05, "loss": 1.4297, "step": 68450 }, { "epoch": 3.2958998763696714, "grad_norm": 0.24977631866931915, "learning_rate": 7.99318103695185e-05, "loss": 1.4352, "step": 68460 }, { "epoch": 3.2963876234186325, "eval_loss": 2.4190785884857178, "eval_runtime": 26.6502, "eval_samples_per_second": 187.616, "eval_steps_per_second": 1.013, "step": 68469 }, { "epoch": 3.2964418175351837, "grad_norm": 0.2186552733182907, "learning_rate": 7.99186236938145e-05, "loss": 1.4364, "step": 68470 }, { "epoch": 3.296983758700696, "grad_norm": 0.29373136162757874, "learning_rate": 7.990543393129959e-05, "loss": 1.4334, "step": 68480 }, { "epoch": 3.2975256998662084, "grad_norm": 0.17339174449443817, "learning_rate": 7.989224108360765e-05, "loss": 1.4425, "step": 68490 }, { "epoch": 3.2980676410317207, "grad_norm": 0.16692684590816498, "learning_rate": 7.987904515237297e-05, "loss": 1.445, "step": 68500 }, { "epoch": 3.2986095821972325, "grad_norm": 0.17328135669231415, "learning_rate": 7.986584613923017e-05, "loss": 1.4348, "step": 68510 }, { "epoch": 3.299151523362745, "grad_norm": 0.2851394712924957, "learning_rate": 7.985264404581431e-05, "loss": 1.45, "step": 68520 }, { "epoch": 3.299693464528257, "grad_norm": 0.18991175293922424, "learning_rate": 7.983943887376083e-05, "loss": 1.4392, "step": 68530 }, { "epoch": 3.3002354056937695, "grad_norm": 0.18527284264564514, "learning_rate": 7.982623062470547e-05, "loss": 1.4445, "step": 68540 }, { "epoch": 3.300777346859282, "grad_norm": 0.1802045851945877, "learning_rate": 7.981301930028446e-05, "loss": 1.4487, "step": 68550 }, { "epoch": 3.301102511558589, "eval_loss": 2.417846441268921, "eval_runtime": 26.4831, "eval_samples_per_second": 188.799, "eval_steps_per_second": 1.02, "step": 68556 }, { "epoch": 3.3013192880247937, "grad_norm": 0.15754951536655426, "learning_rate": 7.979980490213435e-05, "loss": 1.4363, "step": 68560 }, { "epoch": 3.301861229190306, "grad_norm": 0.31062695384025574, "learning_rate": 7.978658743189205e-05, "loss": 1.4359, "step": 68570 }, { "epoch": 3.3024031703558183, "grad_norm": 0.2875862717628479, "learning_rate": 7.977336689119495e-05, "loss": 1.4459, "step": 68580 }, { "epoch": 3.3029451115213306, "grad_norm": 0.18160034716129303, "learning_rate": 7.97601432816807e-05, "loss": 1.4378, "step": 68590 }, { "epoch": 3.3034870526868425, "grad_norm": 0.24987049400806427, "learning_rate": 7.97469166049874e-05, "loss": 1.4372, "step": 68600 }, { "epoch": 3.304028993852355, "grad_norm": 0.2157442718744278, "learning_rate": 7.973368686275353e-05, "loss": 1.4379, "step": 68610 }, { "epoch": 3.304570935017867, "grad_norm": 0.20178145170211792, "learning_rate": 7.972045405661788e-05, "loss": 1.4259, "step": 68620 }, { "epoch": 3.3051128761833795, "grad_norm": 0.24957998096942902, "learning_rate": 7.970721818821972e-05, "loss": 1.4474, "step": 68630 }, { "epoch": 3.3056548173488913, "grad_norm": 0.16413570940494537, "learning_rate": 7.969397925919863e-05, "loss": 1.434, "step": 68640 }, { "epoch": 3.305817399698545, "eval_loss": 2.419210433959961, "eval_runtime": 24.5355, "eval_samples_per_second": 203.786, "eval_steps_per_second": 1.1, "step": 68643 }, { "epoch": 3.3061967585144036, "grad_norm": 0.31925585865974426, "learning_rate": 7.968073727119461e-05, "loss": 1.4409, "step": 68650 }, { "epoch": 3.306738699679916, "grad_norm": 0.193577840924263, "learning_rate": 7.966749222584802e-05, "loss": 1.4379, "step": 68660 }, { "epoch": 3.3072806408454283, "grad_norm": 0.3227108418941498, "learning_rate": 7.965424412479958e-05, "loss": 1.4412, "step": 68670 }, { "epoch": 3.3078225820109406, "grad_norm": 0.2504270076751709, "learning_rate": 7.964099296969042e-05, "loss": 1.4256, "step": 68680 }, { "epoch": 3.3083645231764525, "grad_norm": 0.23025809228420258, "learning_rate": 7.962773876216202e-05, "loss": 1.4289, "step": 68690 }, { "epoch": 3.308906464341965, "grad_norm": 0.17569345235824585, "learning_rate": 7.961448150385628e-05, "loss": 1.4433, "step": 68700 }, { "epoch": 3.309448405507477, "grad_norm": 0.24907849729061127, "learning_rate": 7.960122119641542e-05, "loss": 1.443, "step": 68710 }, { "epoch": 3.3099903466729894, "grad_norm": 0.30965548753738403, "learning_rate": 7.958795784148208e-05, "loss": 1.4377, "step": 68720 }, { "epoch": 3.3105322878385017, "grad_norm": 0.30555593967437744, "learning_rate": 7.95746914406993e-05, "loss": 1.441, "step": 68730 }, { "epoch": 3.3105322878385017, "eval_loss": 2.420576572418213, "eval_runtime": 21.9481, "eval_samples_per_second": 227.81, "eval_steps_per_second": 1.23, "step": 68730 }, { "epoch": 3.3110742290040136, "grad_norm": 0.2148512899875641, "learning_rate": 7.956142199571042e-05, "loss": 1.4404, "step": 68740 }, { "epoch": 3.311616170169526, "grad_norm": 0.3420555591583252, "learning_rate": 7.954814950815922e-05, "loss": 1.4432, "step": 68750 }, { "epoch": 3.3121581113350382, "grad_norm": 0.22187262773513794, "learning_rate": 7.953487397968984e-05, "loss": 1.4425, "step": 68760 }, { "epoch": 3.3127000525005506, "grad_norm": 0.2587040662765503, "learning_rate": 7.95215954119468e-05, "loss": 1.451, "step": 68770 }, { "epoch": 3.313241993666063, "grad_norm": 0.20211346447467804, "learning_rate": 7.950831380657496e-05, "loss": 1.4466, "step": 68780 }, { "epoch": 3.3137839348315747, "grad_norm": 0.2853049337863922, "learning_rate": 7.949502916521962e-05, "loss": 1.4423, "step": 68790 }, { "epoch": 3.314325875997087, "grad_norm": 0.17778904736042023, "learning_rate": 7.948174148952642e-05, "loss": 1.4383, "step": 68800 }, { "epoch": 3.3148678171625994, "grad_norm": 0.30810484290122986, "learning_rate": 7.946845078114137e-05, "loss": 1.4441, "step": 68810 }, { "epoch": 3.315247175978458, "eval_loss": 2.4207398891448975, "eval_runtime": 32.3649, "eval_samples_per_second": 154.488, "eval_steps_per_second": 0.834, "step": 68817 }, { "epoch": 3.3154097583281117, "grad_norm": 0.19365157186985016, "learning_rate": 7.945515704171088e-05, "loss": 1.4428, "step": 68820 }, { "epoch": 3.3159516994936236, "grad_norm": 0.17585448920726776, "learning_rate": 7.944186027288169e-05, "loss": 1.4359, "step": 68830 }, { "epoch": 3.316493640659136, "grad_norm": 0.1955270618200302, "learning_rate": 7.942856047630098e-05, "loss": 1.4375, "step": 68840 }, { "epoch": 3.317035581824648, "grad_norm": 0.17522399127483368, "learning_rate": 7.941525765361624e-05, "loss": 1.4442, "step": 68850 }, { "epoch": 3.3175775229901605, "grad_norm": 0.20333018898963928, "learning_rate": 7.940195180647539e-05, "loss": 1.4478, "step": 68860 }, { "epoch": 3.3181194641556724, "grad_norm": 0.22244490683078766, "learning_rate": 7.93886429365267e-05, "loss": 1.428, "step": 68870 }, { "epoch": 3.3186614053211847, "grad_norm": 0.22800850868225098, "learning_rate": 7.93753310454188e-05, "loss": 1.4382, "step": 68880 }, { "epoch": 3.319203346486697, "grad_norm": 0.3180639147758484, "learning_rate": 7.93620161348007e-05, "loss": 1.4318, "step": 68890 }, { "epoch": 3.3197452876522093, "grad_norm": 0.4034172594547272, "learning_rate": 7.934869820632183e-05, "loss": 1.4457, "step": 68900 }, { "epoch": 3.319962064118414, "eval_loss": 2.4232521057128906, "eval_runtime": 24.2726, "eval_samples_per_second": 205.994, "eval_steps_per_second": 1.112, "step": 68904 }, { "epoch": 3.3202872288177216, "grad_norm": 0.32735925912857056, "learning_rate": 7.933537726163195e-05, "loss": 1.4403, "step": 68910 }, { "epoch": 3.3208291699832335, "grad_norm": 0.2666130065917969, "learning_rate": 7.932205330238118e-05, "loss": 1.4348, "step": 68920 }, { "epoch": 3.321371111148746, "grad_norm": 0.20888814330101013, "learning_rate": 7.930872633022006e-05, "loss": 1.4446, "step": 68930 }, { "epoch": 3.321913052314258, "grad_norm": 0.26774919033050537, "learning_rate": 7.929539634679941e-05, "loss": 1.4374, "step": 68940 }, { "epoch": 3.3224549934797705, "grad_norm": 0.18306075036525726, "learning_rate": 7.928206335377057e-05, "loss": 1.4408, "step": 68950 }, { "epoch": 3.322996934645283, "grad_norm": 0.2182762324810028, "learning_rate": 7.926872735278514e-05, "loss": 1.439, "step": 68960 }, { "epoch": 3.3235388758107947, "grad_norm": 0.23973847925662994, "learning_rate": 7.925538834549514e-05, "loss": 1.4403, "step": 68970 }, { "epoch": 3.324080816976307, "grad_norm": 0.2488727569580078, "learning_rate": 7.924204633355295e-05, "loss": 1.4386, "step": 68980 }, { "epoch": 3.3246227581418193, "grad_norm": 0.2777588963508606, "learning_rate": 7.922870131861127e-05, "loss": 1.4477, "step": 68990 }, { "epoch": 3.3246769522583706, "eval_loss": 2.42183518409729, "eval_runtime": 21.8322, "eval_samples_per_second": 229.02, "eval_steps_per_second": 1.237, "step": 68991 }, { "epoch": 3.3251646993073316, "grad_norm": 0.18954722583293915, "learning_rate": 7.921535330232328e-05, "loss": 1.4356, "step": 69000 }, { "epoch": 3.3257066404728435, "grad_norm": 0.2268332540988922, "learning_rate": 7.920200228634245e-05, "loss": 1.4413, "step": 69010 }, { "epoch": 3.326248581638356, "grad_norm": 0.21182286739349365, "learning_rate": 7.918864827232268e-05, "loss": 1.4295, "step": 69020 }, { "epoch": 3.326790522803868, "grad_norm": 0.16312585771083832, "learning_rate": 7.917529126191815e-05, "loss": 1.4398, "step": 69030 }, { "epoch": 3.3273324639693804, "grad_norm": 0.2508258819580078, "learning_rate": 7.916193125678349e-05, "loss": 1.431, "step": 69040 }, { "epoch": 3.3278744051348923, "grad_norm": 0.3282955586910248, "learning_rate": 7.914856825857371e-05, "loss": 1.4384, "step": 69050 }, { "epoch": 3.3284163463004046, "grad_norm": 0.20805367827415466, "learning_rate": 7.913520226894413e-05, "loss": 1.4365, "step": 69060 }, { "epoch": 3.328958287465917, "grad_norm": 0.32144641876220703, "learning_rate": 7.912183328955047e-05, "loss": 1.439, "step": 69070 }, { "epoch": 3.3293918403983267, "eval_loss": 2.420147657394409, "eval_runtime": 23.884, "eval_samples_per_second": 209.345, "eval_steps_per_second": 1.13, "step": 69078 }, { "epoch": 3.3295002286314292, "grad_norm": 0.27090176939964294, "learning_rate": 7.910846132204883e-05, "loss": 1.4386, "step": 69080 }, { "epoch": 3.3300421697969416, "grad_norm": 0.25403621792793274, "learning_rate": 7.909508636809567e-05, "loss": 1.4382, "step": 69090 }, { "epoch": 3.3305841109624534, "grad_norm": 0.23677849769592285, "learning_rate": 7.908170842934783e-05, "loss": 1.4414, "step": 69100 }, { "epoch": 3.3311260521279658, "grad_norm": 0.2112101912498474, "learning_rate": 7.90683275074625e-05, "loss": 1.4353, "step": 69110 }, { "epoch": 3.331667993293478, "grad_norm": 0.18061186373233795, "learning_rate": 7.905494360409725e-05, "loss": 1.4271, "step": 69120 }, { "epoch": 3.3322099344589904, "grad_norm": 0.34150585532188416, "learning_rate": 7.904155672091002e-05, "loss": 1.4275, "step": 69130 }, { "epoch": 3.3327518756245027, "grad_norm": 0.20496203005313873, "learning_rate": 7.902816685955912e-05, "loss": 1.4328, "step": 69140 }, { "epoch": 3.3332938167900146, "grad_norm": 0.43717578053474426, "learning_rate": 7.901477402170323e-05, "loss": 1.4455, "step": 69150 }, { "epoch": 3.333835757955527, "grad_norm": 0.29195889830589294, "learning_rate": 7.90013782090014e-05, "loss": 1.4471, "step": 69160 }, { "epoch": 3.3341067285382833, "eval_loss": 2.4220991134643555, "eval_runtime": 23.5219, "eval_samples_per_second": 212.568, "eval_steps_per_second": 1.148, "step": 69165 }, { "epoch": 3.334377699121039, "grad_norm": 0.27426832914352417, "learning_rate": 7.898797942311304e-05, "loss": 1.4368, "step": 69170 }, { "epoch": 3.3349196402865515, "grad_norm": 0.21939682960510254, "learning_rate": 7.897457766569793e-05, "loss": 1.4365, "step": 69180 }, { "epoch": 3.335461581452064, "grad_norm": 0.20194536447525024, "learning_rate": 7.896117293841622e-05, "loss": 1.4327, "step": 69190 }, { "epoch": 3.3360035226175757, "grad_norm": 0.22306720912456512, "learning_rate": 7.894776524292845e-05, "loss": 1.4425, "step": 69200 }, { "epoch": 3.336545463783088, "grad_norm": 0.547161877155304, "learning_rate": 7.893435458089549e-05, "loss": 1.4359, "step": 69210 }, { "epoch": 3.3370874049486003, "grad_norm": 0.17057354748249054, "learning_rate": 7.89209409539786e-05, "loss": 1.4307, "step": 69220 }, { "epoch": 3.3376293461141127, "grad_norm": 0.4212014973163605, "learning_rate": 7.890752436383939e-05, "loss": 1.4284, "step": 69230 }, { "epoch": 3.3381712872796245, "grad_norm": 0.272225558757782, "learning_rate": 7.889410481213986e-05, "loss": 1.4391, "step": 69240 }, { "epoch": 3.338713228445137, "grad_norm": 0.2661522626876831, "learning_rate": 7.888068230054236e-05, "loss": 1.4344, "step": 69250 }, { "epoch": 3.3388216166782394, "eval_loss": 2.4209210872650146, "eval_runtime": 23.9348, "eval_samples_per_second": 208.901, "eval_steps_per_second": 1.128, "step": 69252 }, { "epoch": 3.339255169610649, "grad_norm": 0.21756187081336975, "learning_rate": 7.886725683070963e-05, "loss": 1.4455, "step": 69260 }, { "epoch": 3.3397971107761615, "grad_norm": 0.1777390092611313, "learning_rate": 7.885382840430475e-05, "loss": 1.4404, "step": 69270 }, { "epoch": 3.3403390519416734, "grad_norm": 0.21643613278865814, "learning_rate": 7.884039702299113e-05, "loss": 1.4348, "step": 69280 }, { "epoch": 3.3408809931071857, "grad_norm": 0.21973682940006256, "learning_rate": 7.882696268843267e-05, "loss": 1.4467, "step": 69290 }, { "epoch": 3.341422934272698, "grad_norm": 0.2839016020298004, "learning_rate": 7.881352540229351e-05, "loss": 1.4296, "step": 69300 }, { "epoch": 3.3419648754382103, "grad_norm": 0.17551878094673157, "learning_rate": 7.880008516623822e-05, "loss": 1.4323, "step": 69310 }, { "epoch": 3.3425068166037226, "grad_norm": 0.19211049377918243, "learning_rate": 7.878664198193169e-05, "loss": 1.436, "step": 69320 }, { "epoch": 3.3430487577692345, "grad_norm": 0.2430991530418396, "learning_rate": 7.877319585103922e-05, "loss": 1.4405, "step": 69330 }, { "epoch": 3.3435365048181955, "eval_loss": 2.417872190475464, "eval_runtime": 28.3658, "eval_samples_per_second": 176.269, "eval_steps_per_second": 0.952, "step": 69339 }, { "epoch": 3.343590698934747, "grad_norm": 0.2997911274433136, "learning_rate": 7.875974677522648e-05, "loss": 1.4357, "step": 69340 }, { "epoch": 3.344132640100259, "grad_norm": 0.18667905032634735, "learning_rate": 7.874629475615946e-05, "loss": 1.4338, "step": 69350 }, { "epoch": 3.3446745812657714, "grad_norm": 0.20530866086483002, "learning_rate": 7.873283979550452e-05, "loss": 1.4477, "step": 69360 }, { "epoch": 3.3452165224312838, "grad_norm": 0.16726155579090118, "learning_rate": 7.871938189492844e-05, "loss": 1.4393, "step": 69370 }, { "epoch": 3.3457584635967956, "grad_norm": 0.16196750104427338, "learning_rate": 7.870592105609832e-05, "loss": 1.4274, "step": 69380 }, { "epoch": 3.346300404762308, "grad_norm": 0.18552835285663605, "learning_rate": 7.86924572806816e-05, "loss": 1.4396, "step": 69390 }, { "epoch": 3.3468423459278203, "grad_norm": 0.19698499143123627, "learning_rate": 7.867899057034616e-05, "loss": 1.4374, "step": 69400 }, { "epoch": 3.3473842870933326, "grad_norm": 0.23695054650306702, "learning_rate": 7.866552092676015e-05, "loss": 1.4411, "step": 69410 }, { "epoch": 3.347926228258845, "grad_norm": 0.2107638120651245, "learning_rate": 7.865204835159217e-05, "loss": 1.4317, "step": 69420 }, { "epoch": 3.348251392958152, "eval_loss": 2.422180652618408, "eval_runtime": 29.2437, "eval_samples_per_second": 170.977, "eval_steps_per_second": 0.923, "step": 69426 }, { "epoch": 3.3484681694243568, "grad_norm": 0.24402372539043427, "learning_rate": 7.863857284651111e-05, "loss": 1.4371, "step": 69430 }, { "epoch": 3.349010110589869, "grad_norm": 0.2646619379520416, "learning_rate": 7.862509441318627e-05, "loss": 1.4186, "step": 69440 }, { "epoch": 3.3495520517553814, "grad_norm": 0.3436126708984375, "learning_rate": 7.861161305328733e-05, "loss": 1.4437, "step": 69450 }, { "epoch": 3.3500939929208937, "grad_norm": 0.1728588491678238, "learning_rate": 7.859812876848426e-05, "loss": 1.4393, "step": 69460 }, { "epoch": 3.3506359340864056, "grad_norm": 0.2518586814403534, "learning_rate": 7.858464156044745e-05, "loss": 1.4365, "step": 69470 }, { "epoch": 3.351177875251918, "grad_norm": 0.2023596614599228, "learning_rate": 7.857115143084763e-05, "loss": 1.4384, "step": 69480 }, { "epoch": 3.35171981641743, "grad_norm": 0.22805948555469513, "learning_rate": 7.855765838135592e-05, "loss": 1.4287, "step": 69490 }, { "epoch": 3.3522617575829425, "grad_norm": 0.18878723680973053, "learning_rate": 7.854416241364376e-05, "loss": 1.4451, "step": 69500 }, { "epoch": 3.3528036987484544, "grad_norm": 0.22859229147434235, "learning_rate": 7.8530663529383e-05, "loss": 1.4299, "step": 69510 }, { "epoch": 3.3529662810981082, "eval_loss": 2.420332908630371, "eval_runtime": 22.0276, "eval_samples_per_second": 226.988, "eval_steps_per_second": 1.226, "step": 69513 }, { "epoch": 3.3533456399139667, "grad_norm": 0.23224987089633942, "learning_rate": 7.851716173024578e-05, "loss": 1.4408, "step": 69520 }, { "epoch": 3.353887581079479, "grad_norm": 0.2047203630208969, "learning_rate": 7.850365701790466e-05, "loss": 1.4358, "step": 69530 }, { "epoch": 3.3544295222449914, "grad_norm": 0.40388110280036926, "learning_rate": 7.849014939403256e-05, "loss": 1.43, "step": 69540 }, { "epoch": 3.3549714634105037, "grad_norm": 0.21261848509311676, "learning_rate": 7.847663886030274e-05, "loss": 1.4404, "step": 69550 }, { "epoch": 3.3555134045760155, "grad_norm": 0.2009463608264923, "learning_rate": 7.846312541838883e-05, "loss": 1.4355, "step": 69560 }, { "epoch": 3.356055345741528, "grad_norm": 0.22957339882850647, "learning_rate": 7.844960906996481e-05, "loss": 1.4421, "step": 69570 }, { "epoch": 3.35659728690704, "grad_norm": 0.2667635679244995, "learning_rate": 7.843608981670501e-05, "loss": 1.4367, "step": 69580 }, { "epoch": 3.3571392280725525, "grad_norm": 0.19165495038032532, "learning_rate": 7.842256766028416e-05, "loss": 1.4375, "step": 69590 }, { "epoch": 3.357681169238065, "grad_norm": 0.2126622498035431, "learning_rate": 7.840904260237732e-05, "loss": 1.4342, "step": 69600 }, { "epoch": 3.357681169238065, "eval_loss": 2.420774459838867, "eval_runtime": 21.9323, "eval_samples_per_second": 227.974, "eval_steps_per_second": 1.231, "step": 69600 }, { "epoch": 3.3582231104035767, "grad_norm": 0.21864204108715057, "learning_rate": 7.839551464465992e-05, "loss": 1.4502, "step": 69610 }, { "epoch": 3.358765051569089, "grad_norm": 0.1772039234638214, "learning_rate": 7.838198378880772e-05, "loss": 1.4457, "step": 69620 }, { "epoch": 3.3593069927346013, "grad_norm": 0.16376861929893494, "learning_rate": 7.83684500364969e-05, "loss": 1.4307, "step": 69630 }, { "epoch": 3.3598489339001136, "grad_norm": 0.19408194720745087, "learning_rate": 7.835491338940395e-05, "loss": 1.4434, "step": 69640 }, { "epoch": 3.3603908750656255, "grad_norm": 0.18771915137767792, "learning_rate": 7.834137384920572e-05, "loss": 1.4245, "step": 69650 }, { "epoch": 3.360932816231138, "grad_norm": 0.1942000687122345, "learning_rate": 7.832783141757943e-05, "loss": 1.4336, "step": 69660 }, { "epoch": 3.36147475739665, "grad_norm": 0.22460921108722687, "learning_rate": 7.831428609620267e-05, "loss": 1.4373, "step": 69670 }, { "epoch": 3.3620166985621625, "grad_norm": 0.182665154337883, "learning_rate": 7.830073788675336e-05, "loss": 1.439, "step": 69680 }, { "epoch": 3.362396057378021, "eval_loss": 2.4225962162017822, "eval_runtime": 26.9741, "eval_samples_per_second": 185.363, "eval_steps_per_second": 1.001, "step": 69687 }, { "epoch": 3.3625586397276743, "grad_norm": 0.18876463174819946, "learning_rate": 7.828718679090981e-05, "loss": 1.4367, "step": 69690 }, { "epoch": 3.3631005808931866, "grad_norm": 0.2040254920721054, "learning_rate": 7.827363281035067e-05, "loss": 1.4358, "step": 69700 }, { "epoch": 3.363642522058699, "grad_norm": 0.375626802444458, "learning_rate": 7.826007594675493e-05, "loss": 1.436, "step": 69710 }, { "epoch": 3.3641844632242113, "grad_norm": 0.1845981627702713, "learning_rate": 7.824651620180196e-05, "loss": 1.4379, "step": 69720 }, { "epoch": 3.3647264043897236, "grad_norm": 0.1836758852005005, "learning_rate": 7.82329535771715e-05, "loss": 1.4381, "step": 69730 }, { "epoch": 3.3652683455552355, "grad_norm": 0.2422439604997635, "learning_rate": 7.821938807454361e-05, "loss": 1.4424, "step": 69740 }, { "epoch": 3.365810286720748, "grad_norm": 0.2933938205242157, "learning_rate": 7.820581969559877e-05, "loss": 1.4292, "step": 69750 }, { "epoch": 3.36635222788626, "grad_norm": 0.4664517343044281, "learning_rate": 7.819224844201769e-05, "loss": 1.4266, "step": 69760 }, { "epoch": 3.3668941690517724, "grad_norm": 0.211371049284935, "learning_rate": 7.817867431548158e-05, "loss": 1.4308, "step": 69770 }, { "epoch": 3.367110945517977, "eval_loss": 2.440751791000366, "eval_runtime": 22.2008, "eval_samples_per_second": 225.217, "eval_steps_per_second": 1.216, "step": 69774 }, { "epoch": 3.3674361102172847, "grad_norm": 0.39874711632728577, "learning_rate": 7.816509731767191e-05, "loss": 1.4285, "step": 69780 }, { "epoch": 3.3679780513827966, "grad_norm": 0.23130638897418976, "learning_rate": 7.815151745027058e-05, "loss": 1.4322, "step": 69790 }, { "epoch": 3.368519992548309, "grad_norm": 0.18918637931346893, "learning_rate": 7.81379347149598e-05, "loss": 1.4355, "step": 69800 }, { "epoch": 3.3690619337138212, "grad_norm": 0.27294501662254333, "learning_rate": 7.812434911342209e-05, "loss": 1.4453, "step": 69810 }, { "epoch": 3.3696038748793335, "grad_norm": 0.19342243671417236, "learning_rate": 7.811076064734043e-05, "loss": 1.4399, "step": 69820 }, { "epoch": 3.370145816044846, "grad_norm": 0.2188844233751297, "learning_rate": 7.809716931839804e-05, "loss": 1.4368, "step": 69830 }, { "epoch": 3.3706877572103577, "grad_norm": 0.19374194741249084, "learning_rate": 7.808357512827862e-05, "loss": 1.4304, "step": 69840 }, { "epoch": 3.37122969837587, "grad_norm": 0.2000492513179779, "learning_rate": 7.806997807866614e-05, "loss": 1.426, "step": 69850 }, { "epoch": 3.3717716395413824, "grad_norm": 0.2283279299736023, "learning_rate": 7.805637817124493e-05, "loss": 1.4398, "step": 69860 }, { "epoch": 3.3718258336579336, "eval_loss": 2.4315900802612305, "eval_runtime": 24.6052, "eval_samples_per_second": 203.209, "eval_steps_per_second": 1.097, "step": 69861 }, { "epoch": 3.3723135807068947, "grad_norm": 0.21723055839538574, "learning_rate": 7.804277540769967e-05, "loss": 1.4309, "step": 69870 }, { "epoch": 3.3728555218724066, "grad_norm": 0.19172869622707367, "learning_rate": 7.802916978971546e-05, "loss": 1.4248, "step": 69880 }, { "epoch": 3.373397463037919, "grad_norm": 0.16879266500473022, "learning_rate": 7.801556131897769e-05, "loss": 1.4338, "step": 69890 }, { "epoch": 3.373939404203431, "grad_norm": 0.3514922857284546, "learning_rate": 7.800194999717207e-05, "loss": 1.4281, "step": 69900 }, { "epoch": 3.3744813453689435, "grad_norm": 0.28702405095100403, "learning_rate": 7.798833582598476e-05, "loss": 1.4389, "step": 69910 }, { "epoch": 3.3750232865344554, "grad_norm": 0.2494763731956482, "learning_rate": 7.797471880710223e-05, "loss": 1.4392, "step": 69920 }, { "epoch": 3.3755652276999677, "grad_norm": 0.2236117422580719, "learning_rate": 7.796109894221127e-05, "loss": 1.4325, "step": 69930 }, { "epoch": 3.37610716886548, "grad_norm": 0.1991259753704071, "learning_rate": 7.794747623299906e-05, "loss": 1.4294, "step": 69940 }, { "epoch": 3.3765407217978898, "eval_loss": 2.4242312908172607, "eval_runtime": 22.1933, "eval_samples_per_second": 225.294, "eval_steps_per_second": 1.217, "step": 69948 }, { "epoch": 3.3766491100309923, "grad_norm": 0.3077497184276581, "learning_rate": 7.793385068115312e-05, "loss": 1.4422, "step": 69950 }, { "epoch": 3.3771910511965046, "grad_norm": 0.17531952261924744, "learning_rate": 7.792022228836133e-05, "loss": 1.4326, "step": 69960 }, { "epoch": 3.3777329923620165, "grad_norm": 0.1661709100008011, "learning_rate": 7.790659105631192e-05, "loss": 1.4322, "step": 69970 }, { "epoch": 3.378274933527529, "grad_norm": 0.21228908002376556, "learning_rate": 7.789295698669345e-05, "loss": 1.4374, "step": 69980 }, { "epoch": 3.378816874693041, "grad_norm": 0.22769136726856232, "learning_rate": 7.787932008119487e-05, "loss": 1.4357, "step": 69990 }, { "epoch": 3.3793588158585535, "grad_norm": 0.16828486323356628, "learning_rate": 7.786568034150545e-05, "loss": 1.4477, "step": 70000 }, { "epoch": 3.379900757024066, "grad_norm": 0.32244718074798584, "learning_rate": 7.785203776931482e-05, "loss": 1.4328, "step": 70010 }, { "epoch": 3.3804426981895777, "grad_norm": 0.2961633801460266, "learning_rate": 7.783839236631294e-05, "loss": 1.4321, "step": 70020 }, { "epoch": 3.38098463935509, "grad_norm": 0.1998753696680069, "learning_rate": 7.78247441341902e-05, "loss": 1.4415, "step": 70030 }, { "epoch": 3.3812556099378464, "eval_loss": 2.421795606613159, "eval_runtime": 24.1321, "eval_samples_per_second": 207.193, "eval_steps_per_second": 1.119, "step": 70035 }, { "epoch": 3.3815265805206023, "grad_norm": 0.16955262422561646, "learning_rate": 7.781109307463725e-05, "loss": 1.4366, "step": 70040 }, { "epoch": 3.3820685216861146, "grad_norm": 0.18801350891590118, "learning_rate": 7.77974391893451e-05, "loss": 1.4322, "step": 70050 }, { "epoch": 3.3826104628516265, "grad_norm": 0.1863052397966385, "learning_rate": 7.778378248000517e-05, "loss": 1.4334, "step": 70060 }, { "epoch": 3.383152404017139, "grad_norm": 0.3966461420059204, "learning_rate": 7.77701229483092e-05, "loss": 1.4332, "step": 70070 }, { "epoch": 3.383694345182651, "grad_norm": 0.2498437464237213, "learning_rate": 7.775646059594924e-05, "loss": 1.4432, "step": 70080 }, { "epoch": 3.3842362863481634, "grad_norm": 0.22239406406879425, "learning_rate": 7.774279542461776e-05, "loss": 1.4272, "step": 70090 }, { "epoch": 3.3847782275136753, "grad_norm": 0.26137417554855347, "learning_rate": 7.77291274360075e-05, "loss": 1.4275, "step": 70100 }, { "epoch": 3.3853201686791876, "grad_norm": 0.28286683559417725, "learning_rate": 7.771545663181161e-05, "loss": 1.4328, "step": 70110 }, { "epoch": 3.3858621098447, "grad_norm": 0.22739897668361664, "learning_rate": 7.770178301372361e-05, "loss": 1.4253, "step": 70120 }, { "epoch": 3.3859704980778025, "eval_loss": 2.4276702404022217, "eval_runtime": 21.9823, "eval_samples_per_second": 227.455, "eval_steps_per_second": 1.228, "step": 70122 }, { "epoch": 3.3864040510102122, "grad_norm": 0.20265692472457886, "learning_rate": 7.768810658343724e-05, "loss": 1.4306, "step": 70130 }, { "epoch": 3.3869459921757246, "grad_norm": 0.2867891490459442, "learning_rate": 7.767442734264677e-05, "loss": 1.4319, "step": 70140 }, { "epoch": 3.3874879333412364, "grad_norm": 0.1854824721813202, "learning_rate": 7.766074529304666e-05, "loss": 1.4365, "step": 70150 }, { "epoch": 3.3880298745067488, "grad_norm": 0.3422788679599762, "learning_rate": 7.764706043633183e-05, "loss": 1.4379, "step": 70160 }, { "epoch": 3.388571815672261, "grad_norm": 0.17778363823890686, "learning_rate": 7.763337277419745e-05, "loss": 1.4323, "step": 70170 }, { "epoch": 3.3891137568377734, "grad_norm": 0.1999989151954651, "learning_rate": 7.761968230833913e-05, "loss": 1.4236, "step": 70180 }, { "epoch": 3.3896556980032857, "grad_norm": 0.24541445076465607, "learning_rate": 7.760598904045277e-05, "loss": 1.4302, "step": 70190 }, { "epoch": 3.3901976391687976, "grad_norm": 0.2255592793226242, "learning_rate": 7.759229297223463e-05, "loss": 1.441, "step": 70200 }, { "epoch": 3.3906853862177586, "eval_loss": 2.4177823066711426, "eval_runtime": 23.5061, "eval_samples_per_second": 212.711, "eval_steps_per_second": 1.149, "step": 70209 }, { "epoch": 3.39073958033431, "grad_norm": 0.19751523435115814, "learning_rate": 7.757859410538131e-05, "loss": 1.4337, "step": 70210 }, { "epoch": 3.391281521499822, "grad_norm": 0.3200652301311493, "learning_rate": 7.75648924415898e-05, "loss": 1.4221, "step": 70220 }, { "epoch": 3.3918234626653345, "grad_norm": 0.2096845954656601, "learning_rate": 7.755118798255738e-05, "loss": 1.4365, "step": 70230 }, { "epoch": 3.392365403830847, "grad_norm": 0.2815345525741577, "learning_rate": 7.753748072998169e-05, "loss": 1.4421, "step": 70240 }, { "epoch": 3.3929073449963587, "grad_norm": 0.21538645029067993, "learning_rate": 7.752377068556073e-05, "loss": 1.4339, "step": 70250 }, { "epoch": 3.393449286161871, "grad_norm": 0.1685652881860733, "learning_rate": 7.751005785099286e-05, "loss": 1.4168, "step": 70260 }, { "epoch": 3.3939912273273833, "grad_norm": 0.16810309886932373, "learning_rate": 7.749634222797674e-05, "loss": 1.437, "step": 70270 }, { "epoch": 3.3945331684928957, "grad_norm": 0.32125574350357056, "learning_rate": 7.748262381821143e-05, "loss": 1.435, "step": 70280 }, { "epoch": 3.3950751096584075, "grad_norm": 0.2670728266239166, "learning_rate": 7.746890262339627e-05, "loss": 1.4457, "step": 70290 }, { "epoch": 3.395400274357715, "eval_loss": 2.4280776977539062, "eval_runtime": 25.8886, "eval_samples_per_second": 193.135, "eval_steps_per_second": 1.043, "step": 70296 }, { "epoch": 3.39561705082392, "grad_norm": 0.3197952210903168, "learning_rate": 7.745517864523102e-05, "loss": 1.4286, "step": 70300 }, { "epoch": 3.396158991989432, "grad_norm": 0.19565744698047638, "learning_rate": 7.744145188541573e-05, "loss": 1.4334, "step": 70310 }, { "epoch": 3.3967009331549445, "grad_norm": 0.22424980998039246, "learning_rate": 7.742772234565081e-05, "loss": 1.432, "step": 70320 }, { "epoch": 3.3972428743204564, "grad_norm": 0.27934402227401733, "learning_rate": 7.741399002763702e-05, "loss": 1.4402, "step": 70330 }, { "epoch": 3.3977848154859687, "grad_norm": 0.16731417179107666, "learning_rate": 7.740025493307543e-05, "loss": 1.4352, "step": 70340 }, { "epoch": 3.398326756651481, "grad_norm": 0.24355627596378326, "learning_rate": 7.738651706366754e-05, "loss": 1.4329, "step": 70350 }, { "epoch": 3.3988686978169933, "grad_norm": 0.1672547459602356, "learning_rate": 7.73727764211151e-05, "loss": 1.4408, "step": 70360 }, { "epoch": 3.3994106389825056, "grad_norm": 0.21636275947093964, "learning_rate": 7.735903300712025e-05, "loss": 1.4255, "step": 70370 }, { "epoch": 3.3999525801480175, "grad_norm": 0.21118460595607758, "learning_rate": 7.734528682338546e-05, "loss": 1.4435, "step": 70380 }, { "epoch": 3.4001151624976713, "eval_loss": 2.429720163345337, "eval_runtime": 24.7724, "eval_samples_per_second": 201.837, "eval_steps_per_second": 1.09, "step": 70383 }, { "epoch": 3.40049452131353, "grad_norm": 0.30819687247276306, "learning_rate": 7.733153787161356e-05, "loss": 1.4255, "step": 70390 }, { "epoch": 3.401036462479042, "grad_norm": 0.20422889292240143, "learning_rate": 7.73177861535077e-05, "loss": 1.4364, "step": 70400 }, { "epoch": 3.4015784036445544, "grad_norm": 0.3542598783969879, "learning_rate": 7.73040316707714e-05, "loss": 1.4333, "step": 70410 }, { "epoch": 3.4021203448100668, "grad_norm": 0.20476211607456207, "learning_rate": 7.729027442510847e-05, "loss": 1.4459, "step": 70420 }, { "epoch": 3.4026622859755786, "grad_norm": 0.22746922075748444, "learning_rate": 7.727651441822312e-05, "loss": 1.4267, "step": 70430 }, { "epoch": 3.403204227141091, "grad_norm": 0.23665057122707367, "learning_rate": 7.726275165181988e-05, "loss": 1.4302, "step": 70440 }, { "epoch": 3.4037461683066033, "grad_norm": 0.2850460112094879, "learning_rate": 7.724898612760362e-05, "loss": 1.4315, "step": 70450 }, { "epoch": 3.4042881094721156, "grad_norm": 0.17364229261875153, "learning_rate": 7.723521784727956e-05, "loss": 1.4364, "step": 70460 }, { "epoch": 3.404830050637628, "grad_norm": 0.21966204047203064, "learning_rate": 7.722144681255325e-05, "loss": 1.4284, "step": 70470 }, { "epoch": 3.404830050637628, "eval_loss": 2.4222443103790283, "eval_runtime": 21.8336, "eval_samples_per_second": 229.005, "eval_steps_per_second": 1.237, "step": 70470 }, { "epoch": 3.4053719918031398, "grad_norm": 0.23222704231739044, "learning_rate": 7.720767302513059e-05, "loss": 1.4334, "step": 70480 }, { "epoch": 3.405913932968652, "grad_norm": 0.19313640892505646, "learning_rate": 7.719389648671779e-05, "loss": 1.4361, "step": 70490 }, { "epoch": 3.4064558741341644, "grad_norm": 0.17659373581409454, "learning_rate": 7.718011719902146e-05, "loss": 1.4402, "step": 70500 }, { "epoch": 3.4069978152996767, "grad_norm": 0.2190655916929245, "learning_rate": 7.716633516374852e-05, "loss": 1.4326, "step": 70510 }, { "epoch": 3.4075397564651886, "grad_norm": 0.2351011484861374, "learning_rate": 7.715255038260621e-05, "loss": 1.4198, "step": 70520 }, { "epoch": 3.408081697630701, "grad_norm": 0.2913278341293335, "learning_rate": 7.713876285730213e-05, "loss": 1.4363, "step": 70530 }, { "epoch": 3.408623638796213, "grad_norm": 0.21653717756271362, "learning_rate": 7.712497258954422e-05, "loss": 1.4284, "step": 70540 }, { "epoch": 3.4091655799617255, "grad_norm": 0.2118827849626541, "learning_rate": 7.711117958104077e-05, "loss": 1.4306, "step": 70550 }, { "epoch": 3.409544938777584, "eval_loss": 2.4244723320007324, "eval_runtime": 23.3419, "eval_samples_per_second": 214.207, "eval_steps_per_second": 1.157, "step": 70557 }, { "epoch": 3.4097075211272374, "grad_norm": 0.3481065630912781, "learning_rate": 7.70973838335004e-05, "loss": 1.4373, "step": 70560 }, { "epoch": 3.4102494622927497, "grad_norm": 0.16835106909275055, "learning_rate": 7.708358534863205e-05, "loss": 1.433, "step": 70570 }, { "epoch": 3.410791403458262, "grad_norm": 0.3342517912387848, "learning_rate": 7.706978412814501e-05, "loss": 1.4379, "step": 70580 }, { "epoch": 3.4113333446237744, "grad_norm": 0.19866886734962463, "learning_rate": 7.705598017374894e-05, "loss": 1.4306, "step": 70590 }, { "epoch": 3.4118752857892867, "grad_norm": 0.29506245255470276, "learning_rate": 7.704217348715381e-05, "loss": 1.4344, "step": 70600 }, { "epoch": 3.4124172269547985, "grad_norm": 0.2535003125667572, "learning_rate": 7.702836407006993e-05, "loss": 1.441, "step": 70610 }, { "epoch": 3.412959168120311, "grad_norm": 0.23304949700832367, "learning_rate": 7.701455192420793e-05, "loss": 1.429, "step": 70620 }, { "epoch": 3.413501109285823, "grad_norm": 0.2439218908548355, "learning_rate": 7.700073705127883e-05, "loss": 1.4379, "step": 70630 }, { "epoch": 3.4140430504513355, "grad_norm": 0.2127150297164917, "learning_rate": 7.698691945299392e-05, "loss": 1.4366, "step": 70640 }, { "epoch": 3.41425982691754, "eval_loss": 2.4175617694854736, "eval_runtime": 21.9871, "eval_samples_per_second": 227.406, "eval_steps_per_second": 1.228, "step": 70644 }, { "epoch": 3.414584991616848, "grad_norm": 0.17583894729614258, "learning_rate": 7.697309913106491e-05, "loss": 1.4296, "step": 70650 }, { "epoch": 3.4151269327823597, "grad_norm": 0.1831618696451187, "learning_rate": 7.695927608720376e-05, "loss": 1.435, "step": 70660 }, { "epoch": 3.415668873947872, "grad_norm": 0.22078917920589447, "learning_rate": 7.694545032312284e-05, "loss": 1.4256, "step": 70670 }, { "epoch": 3.4162108151133843, "grad_norm": 0.19926197826862335, "learning_rate": 7.69316218405348e-05, "loss": 1.4341, "step": 70680 }, { "epoch": 3.4167527562788966, "grad_norm": 0.20558814704418182, "learning_rate": 7.691779064115267e-05, "loss": 1.4258, "step": 70690 }, { "epoch": 3.4172946974444085, "grad_norm": 0.1772947460412979, "learning_rate": 7.690395672668979e-05, "loss": 1.4311, "step": 70700 }, { "epoch": 3.417836638609921, "grad_norm": 0.29191502928733826, "learning_rate": 7.689012009885986e-05, "loss": 1.4376, "step": 70710 }, { "epoch": 3.418378579775433, "grad_norm": 0.21399341523647308, "learning_rate": 7.687628075937689e-05, "loss": 1.4347, "step": 70720 }, { "epoch": 3.4189205209409455, "grad_norm": 0.2068105787038803, "learning_rate": 7.686243870995522e-05, "loss": 1.4269, "step": 70730 }, { "epoch": 3.4189747150574967, "eval_loss": 2.414785861968994, "eval_runtime": 22.7197, "eval_samples_per_second": 220.073, "eval_steps_per_second": 1.188, "step": 70731 }, { "epoch": 3.4194624621064573, "grad_norm": 0.22110241651535034, "learning_rate": 7.684859395230956e-05, "loss": 1.4331, "step": 70740 }, { "epoch": 3.4200044032719696, "grad_norm": 0.1638895571231842, "learning_rate": 7.683474648815496e-05, "loss": 1.419, "step": 70750 }, { "epoch": 3.420546344437482, "grad_norm": 0.20719309151172638, "learning_rate": 7.682089631920674e-05, "loss": 1.4246, "step": 70760 }, { "epoch": 3.4210882856029943, "grad_norm": 0.19159828126430511, "learning_rate": 7.680704344718063e-05, "loss": 1.4245, "step": 70770 }, { "epoch": 3.4216302267685066, "grad_norm": 0.29333052039146423, "learning_rate": 7.679318787379264e-05, "loss": 1.4397, "step": 70780 }, { "epoch": 3.4221721679340185, "grad_norm": 0.22254757583141327, "learning_rate": 7.677932960075917e-05, "loss": 1.4231, "step": 70790 }, { "epoch": 3.4227141090995308, "grad_norm": 0.2216396927833557, "learning_rate": 7.67654686297969e-05, "loss": 1.433, "step": 70800 }, { "epoch": 3.423256050265043, "grad_norm": 0.27683907747268677, "learning_rate": 7.675160496262288e-05, "loss": 1.4404, "step": 70810 }, { "epoch": 3.423689603197453, "eval_loss": 2.416546583175659, "eval_runtime": 21.9857, "eval_samples_per_second": 227.421, "eval_steps_per_second": 1.228, "step": 70818 }, { "epoch": 3.4237979914305554, "grad_norm": 0.3137832581996918, "learning_rate": 7.673773860095445e-05, "loss": 1.434, "step": 70820 }, { "epoch": 3.4243399325960677, "grad_norm": 0.1663368195295334, "learning_rate": 7.672386954650936e-05, "loss": 1.427, "step": 70830 }, { "epoch": 3.4248818737615796, "grad_norm": 0.18768776953220367, "learning_rate": 7.670999780100563e-05, "loss": 1.4314, "step": 70840 }, { "epoch": 3.425423814927092, "grad_norm": 0.19177640974521637, "learning_rate": 7.669612336616162e-05, "loss": 1.4275, "step": 70850 }, { "epoch": 3.4259657560926042, "grad_norm": 0.2142564207315445, "learning_rate": 7.668224624369603e-05, "loss": 1.4467, "step": 70860 }, { "epoch": 3.4265076972581165, "grad_norm": 0.24929867684841156, "learning_rate": 7.666836643532793e-05, "loss": 1.4374, "step": 70870 }, { "epoch": 3.427049638423629, "grad_norm": 0.20959332585334778, "learning_rate": 7.665448394277664e-05, "loss": 1.4289, "step": 70880 }, { "epoch": 3.4275915795891407, "grad_norm": 0.23564212024211884, "learning_rate": 7.664059876776195e-05, "loss": 1.4368, "step": 70890 }, { "epoch": 3.428133520754653, "grad_norm": 0.29305994510650635, "learning_rate": 7.662671091200378e-05, "loss": 1.4361, "step": 70900 }, { "epoch": 3.4284044913374094, "eval_loss": 2.4167122840881348, "eval_runtime": 22.6755, "eval_samples_per_second": 220.502, "eval_steps_per_second": 1.191, "step": 70905 }, { "epoch": 3.4286754619201654, "grad_norm": 0.19599109888076782, "learning_rate": 7.66128203772226e-05, "loss": 1.4286, "step": 70910 }, { "epoch": 3.4292174030856777, "grad_norm": 0.2723342478275299, "learning_rate": 7.659892716513904e-05, "loss": 1.4253, "step": 70920 }, { "epoch": 3.4297593442511896, "grad_norm": 0.25713953375816345, "learning_rate": 7.658503127747415e-05, "loss": 1.4388, "step": 70930 }, { "epoch": 3.430301285416702, "grad_norm": 0.24171973764896393, "learning_rate": 7.657113271594931e-05, "loss": 1.4361, "step": 70940 }, { "epoch": 3.430843226582214, "grad_norm": 0.17661350965499878, "learning_rate": 7.65572314822862e-05, "loss": 1.4331, "step": 70950 }, { "epoch": 3.4313851677477265, "grad_norm": 0.23326051235198975, "learning_rate": 7.654332757820684e-05, "loss": 1.4239, "step": 70960 }, { "epoch": 3.4319271089132384, "grad_norm": 0.1615256667137146, "learning_rate": 7.652942100543361e-05, "loss": 1.4351, "step": 70970 }, { "epoch": 3.4324690500787507, "grad_norm": 0.19447456300258636, "learning_rate": 7.651551176568916e-05, "loss": 1.4277, "step": 70980 }, { "epoch": 3.433010991244263, "grad_norm": 0.3392447829246521, "learning_rate": 7.650159986069653e-05, "loss": 1.4348, "step": 70990 }, { "epoch": 3.4331193794773656, "eval_loss": 2.42305850982666, "eval_runtime": 22.2077, "eval_samples_per_second": 225.147, "eval_steps_per_second": 1.216, "step": 70992 }, { "epoch": 3.4335529324097753, "grad_norm": 0.2070346623659134, "learning_rate": 7.648768529217907e-05, "loss": 1.4279, "step": 71000 }, { "epoch": 3.4340948735752876, "grad_norm": 0.2491535097360611, "learning_rate": 7.647376806186043e-05, "loss": 1.4386, "step": 71010 }, { "epoch": 3.4346368147407995, "grad_norm": 0.3890012502670288, "learning_rate": 7.645984817146464e-05, "loss": 1.429, "step": 71020 }, { "epoch": 3.435178755906312, "grad_norm": 0.19839666783809662, "learning_rate": 7.644592562271603e-05, "loss": 1.4271, "step": 71030 }, { "epoch": 3.435720697071824, "grad_norm": 0.24999631941318512, "learning_rate": 7.643200041733926e-05, "loss": 1.4357, "step": 71040 }, { "epoch": 3.4362626382373365, "grad_norm": 0.18307673931121826, "learning_rate": 7.641807255705932e-05, "loss": 1.4381, "step": 71050 }, { "epoch": 3.436804579402849, "grad_norm": 0.3571625351905823, "learning_rate": 7.640414204360154e-05, "loss": 1.4406, "step": 71060 }, { "epoch": 3.4373465205683607, "grad_norm": 0.23355412483215332, "learning_rate": 7.639020887869157e-05, "loss": 1.4392, "step": 71070 }, { "epoch": 3.4378342676173217, "eval_loss": 2.4152307510375977, "eval_runtime": 22.2007, "eval_samples_per_second": 225.218, "eval_steps_per_second": 1.216, "step": 71079 }, { "epoch": 3.437888461733873, "grad_norm": 0.20043282210826874, "learning_rate": 7.63762730640554e-05, "loss": 1.4451, "step": 71080 }, { "epoch": 3.4384304028993853, "grad_norm": 0.19627268612384796, "learning_rate": 7.636233460141934e-05, "loss": 1.4379, "step": 71090 }, { "epoch": 3.4389723440648976, "grad_norm": 0.24721965193748474, "learning_rate": 7.634839349251e-05, "loss": 1.4345, "step": 71100 }, { "epoch": 3.43951428523041, "grad_norm": 0.2560265362262726, "learning_rate": 7.633444973905435e-05, "loss": 1.4404, "step": 71110 }, { "epoch": 3.440056226395922, "grad_norm": 0.21362638473510742, "learning_rate": 7.63205033427797e-05, "loss": 1.4416, "step": 71120 }, { "epoch": 3.440598167561434, "grad_norm": 0.24023322761058807, "learning_rate": 7.630655430541367e-05, "loss": 1.4308, "step": 71130 }, { "epoch": 3.4411401087269464, "grad_norm": 0.20237214863300323, "learning_rate": 7.629260262868419e-05, "loss": 1.4432, "step": 71140 }, { "epoch": 3.4416820498924587, "grad_norm": 0.18272891640663147, "learning_rate": 7.627864831431955e-05, "loss": 1.4389, "step": 71150 }, { "epoch": 3.4422239910579706, "grad_norm": 0.2650783061981201, "learning_rate": 7.626469136404834e-05, "loss": 1.439, "step": 71160 }, { "epoch": 3.4425491557572783, "eval_loss": 2.4182510375976562, "eval_runtime": 22.3053, "eval_samples_per_second": 224.162, "eval_steps_per_second": 1.21, "step": 71166 }, { "epoch": 3.442765932223483, "grad_norm": 0.2870977222919464, "learning_rate": 7.625073177959945e-05, "loss": 1.4253, "step": 71170 }, { "epoch": 3.4433078733889952, "grad_norm": 0.20810657739639282, "learning_rate": 7.62367695627022e-05, "loss": 1.4484, "step": 71180 }, { "epoch": 3.4438498145545076, "grad_norm": 0.177906796336174, "learning_rate": 7.622280471508611e-05, "loss": 1.441, "step": 71190 }, { "epoch": 3.4443917557200194, "grad_norm": 0.17265500128269196, "learning_rate": 7.620883723848114e-05, "loss": 1.4263, "step": 71200 }, { "epoch": 3.4449336968855317, "grad_norm": 0.23366230726242065, "learning_rate": 7.61948671346175e-05, "loss": 1.4195, "step": 71210 }, { "epoch": 3.445475638051044, "grad_norm": 0.18871016800403595, "learning_rate": 7.618089440522571e-05, "loss": 1.4275, "step": 71220 }, { "epoch": 3.4460175792165564, "grad_norm": 0.1917314976453781, "learning_rate": 7.61669190520367e-05, "loss": 1.4325, "step": 71230 }, { "epoch": 3.4465595203820687, "grad_norm": 0.2467162013053894, "learning_rate": 7.615294107678165e-05, "loss": 1.4374, "step": 71240 }, { "epoch": 3.4471014615475806, "grad_norm": 0.1812368631362915, "learning_rate": 7.61389604811921e-05, "loss": 1.4434, "step": 71250 }, { "epoch": 3.4472640438972344, "eval_loss": 2.420837163925171, "eval_runtime": 23.1164, "eval_samples_per_second": 216.296, "eval_steps_per_second": 1.168, "step": 71253 }, { "epoch": 3.447643402713093, "grad_norm": 0.24200530350208282, "learning_rate": 7.61249772669999e-05, "loss": 1.4313, "step": 71260 }, { "epoch": 3.448185343878605, "grad_norm": 0.19202779233455658, "learning_rate": 7.61109914359372e-05, "loss": 1.4212, "step": 71270 }, { "epoch": 3.4487272850441175, "grad_norm": 0.34746161103248596, "learning_rate": 7.609700298973659e-05, "loss": 1.4312, "step": 71280 }, { "epoch": 3.44926922620963, "grad_norm": 0.19444984197616577, "learning_rate": 7.608301193013082e-05, "loss": 1.416, "step": 71290 }, { "epoch": 3.4498111673751417, "grad_norm": 0.17890185117721558, "learning_rate": 7.606901825885305e-05, "loss": 1.431, "step": 71300 }, { "epoch": 3.450353108540654, "grad_norm": 0.1906130313873291, "learning_rate": 7.605502197763678e-05, "loss": 1.4435, "step": 71310 }, { "epoch": 3.4508950497061663, "grad_norm": 0.2459091991186142, "learning_rate": 7.60410230882158e-05, "loss": 1.4252, "step": 71320 }, { "epoch": 3.4514369908716787, "grad_norm": 0.3785655200481415, "learning_rate": 7.602702159232424e-05, "loss": 1.4277, "step": 71330 }, { "epoch": 3.4519789320371905, "grad_norm": 0.2577289044857025, "learning_rate": 7.601301749169652e-05, "loss": 1.429, "step": 71340 }, { "epoch": 3.4519789320371905, "eval_loss": 2.4179348945617676, "eval_runtime": 21.9382, "eval_samples_per_second": 227.913, "eval_steps_per_second": 1.231, "step": 71340 }, { "epoch": 3.452520873202703, "grad_norm": 0.2082802802324295, "learning_rate": 7.599901078806744e-05, "loss": 1.4214, "step": 71350 }, { "epoch": 3.453062814368215, "grad_norm": 0.251081645488739, "learning_rate": 7.598500148317206e-05, "loss": 1.4394, "step": 71360 }, { "epoch": 3.4536047555337275, "grad_norm": 0.1895270198583603, "learning_rate": 7.597098957874582e-05, "loss": 1.4283, "step": 71370 }, { "epoch": 3.4541466966992393, "grad_norm": 0.252121239900589, "learning_rate": 7.595697507652442e-05, "loss": 1.4305, "step": 71380 }, { "epoch": 3.4546886378647517, "grad_norm": 0.32456740736961365, "learning_rate": 7.594295797824393e-05, "loss": 1.4383, "step": 71390 }, { "epoch": 3.455230579030264, "grad_norm": 0.22235190868377686, "learning_rate": 7.592893828564073e-05, "loss": 1.4284, "step": 71400 }, { "epoch": 3.4557725201957763, "grad_norm": 0.19792044162750244, "learning_rate": 7.591491600045155e-05, "loss": 1.4249, "step": 71410 }, { "epoch": 3.4563144613612886, "grad_norm": 0.26035961508750916, "learning_rate": 7.590089112441336e-05, "loss": 1.4245, "step": 71420 }, { "epoch": 3.456693820177147, "eval_loss": 2.4162979125976562, "eval_runtime": 21.987, "eval_samples_per_second": 227.408, "eval_steps_per_second": 1.228, "step": 71427 }, { "epoch": 3.4568564025268005, "grad_norm": 0.18416984379291534, "learning_rate": 7.58868636592635e-05, "loss": 1.4349, "step": 71430 }, { "epoch": 3.457398343692313, "grad_norm": 0.21309533715248108, "learning_rate": 7.587283360673969e-05, "loss": 1.4194, "step": 71440 }, { "epoch": 3.457940284857825, "grad_norm": 0.17019227147102356, "learning_rate": 7.585880096857985e-05, "loss": 1.4342, "step": 71450 }, { "epoch": 3.4584822260233374, "grad_norm": 0.16507531702518463, "learning_rate": 7.58447657465223e-05, "loss": 1.424, "step": 71460 }, { "epoch": 3.4590241671888498, "grad_norm": 0.21044619381427765, "learning_rate": 7.583072794230567e-05, "loss": 1.428, "step": 71470 }, { "epoch": 3.4595661083543616, "grad_norm": 0.2250647246837616, "learning_rate": 7.581668755766891e-05, "loss": 1.4316, "step": 71480 }, { "epoch": 3.460108049519874, "grad_norm": 0.21987579762935638, "learning_rate": 7.580264459435129e-05, "loss": 1.4411, "step": 71490 }, { "epoch": 3.4606499906853863, "grad_norm": 0.21779508888721466, "learning_rate": 7.578859905409234e-05, "loss": 1.4381, "step": 71500 }, { "epoch": 3.4611919318508986, "grad_norm": 0.3549853265285492, "learning_rate": 7.577455093863202e-05, "loss": 1.4346, "step": 71510 }, { "epoch": 3.4614087083171032, "eval_loss": 2.4182517528533936, "eval_runtime": 21.9784, "eval_samples_per_second": 227.496, "eval_steps_per_second": 1.228, "step": 71514 }, { "epoch": 3.461733873016411, "grad_norm": 0.46638888120651245, "learning_rate": 7.57605002497105e-05, "loss": 1.4274, "step": 71520 }, { "epoch": 3.4622758141819228, "grad_norm": 0.28370195627212524, "learning_rate": 7.574644698906836e-05, "loss": 1.4282, "step": 71530 }, { "epoch": 3.462817755347435, "grad_norm": 0.2209625244140625, "learning_rate": 7.573239115844644e-05, "loss": 1.4407, "step": 71540 }, { "epoch": 3.4633596965129474, "grad_norm": 0.2586290240287781, "learning_rate": 7.571833275958591e-05, "loss": 1.437, "step": 71550 }, { "epoch": 3.4639016376784597, "grad_norm": 0.17665983736515045, "learning_rate": 7.570427179422827e-05, "loss": 1.4312, "step": 71560 }, { "epoch": 3.4644435788439716, "grad_norm": 0.1650412678718567, "learning_rate": 7.569020826411532e-05, "loss": 1.4262, "step": 71570 }, { "epoch": 3.464985520009484, "grad_norm": 0.19921010732650757, "learning_rate": 7.56761421709892e-05, "loss": 1.4333, "step": 71580 }, { "epoch": 3.465527461174996, "grad_norm": 0.2843552827835083, "learning_rate": 7.566207351659236e-05, "loss": 1.4295, "step": 71590 }, { "epoch": 3.4660694023405085, "grad_norm": 0.3298152983188629, "learning_rate": 7.564800230266756e-05, "loss": 1.4246, "step": 71600 }, { "epoch": 3.46612359645706, "eval_loss": 2.4160807132720947, "eval_runtime": 21.9823, "eval_samples_per_second": 227.456, "eval_steps_per_second": 1.228, "step": 71601 }, { "epoch": 3.4666113435060204, "grad_norm": 0.41355177760124207, "learning_rate": 7.563392853095786e-05, "loss": 1.4208, "step": 71610 }, { "epoch": 3.4671532846715327, "grad_norm": 0.25345563888549805, "learning_rate": 7.56198522032067e-05, "loss": 1.4262, "step": 71620 }, { "epoch": 3.467695225837045, "grad_norm": 0.2895798981189728, "learning_rate": 7.560577332115777e-05, "loss": 1.4344, "step": 71630 }, { "epoch": 3.4682371670025574, "grad_norm": 0.18971815705299377, "learning_rate": 7.559169188655509e-05, "loss": 1.4359, "step": 71640 }, { "epoch": 3.4687791081680697, "grad_norm": 0.24166151881217957, "learning_rate": 7.557760790114304e-05, "loss": 1.4282, "step": 71650 }, { "epoch": 3.4693210493335815, "grad_norm": 0.18398047983646393, "learning_rate": 7.556352136666624e-05, "loss": 1.4319, "step": 71660 }, { "epoch": 3.469862990499094, "grad_norm": 0.1882873922586441, "learning_rate": 7.554943228486969e-05, "loss": 1.4305, "step": 71670 }, { "epoch": 3.470404931664606, "grad_norm": 0.39805862307548523, "learning_rate": 7.55353406574987e-05, "loss": 1.4383, "step": 71680 }, { "epoch": 3.470838484597016, "eval_loss": 2.422006607055664, "eval_runtime": 21.9837, "eval_samples_per_second": 227.442, "eval_steps_per_second": 1.228, "step": 71688 }, { "epoch": 3.4709468728301185, "grad_norm": 0.4627326726913452, "learning_rate": 7.552124648629887e-05, "loss": 1.423, "step": 71690 }, { "epoch": 3.471488813995631, "grad_norm": 0.20239268243312836, "learning_rate": 7.55071497730161e-05, "loss": 1.4374, "step": 71700 }, { "epoch": 3.4720307551611427, "grad_norm": 0.21879947185516357, "learning_rate": 7.549305051939665e-05, "loss": 1.4308, "step": 71710 }, { "epoch": 3.472572696326655, "grad_norm": 0.2617088854312897, "learning_rate": 7.547894872718709e-05, "loss": 1.4352, "step": 71720 }, { "epoch": 3.4731146374921673, "grad_norm": 0.24820886552333832, "learning_rate": 7.546484439813427e-05, "loss": 1.4237, "step": 71730 }, { "epoch": 3.4736565786576796, "grad_norm": 0.22648629546165466, "learning_rate": 7.545073753398537e-05, "loss": 1.435, "step": 71740 }, { "epoch": 3.474198519823192, "grad_norm": 0.20121285319328308, "learning_rate": 7.543662813648789e-05, "loss": 1.4264, "step": 71750 }, { "epoch": 3.474740460988704, "grad_norm": 0.32857656478881836, "learning_rate": 7.542251620738964e-05, "loss": 1.4422, "step": 71760 }, { "epoch": 3.475282402154216, "grad_norm": 0.1786399483680725, "learning_rate": 7.540840174843876e-05, "loss": 1.4308, "step": 71770 }, { "epoch": 3.475553372736972, "eval_loss": 2.418339490890503, "eval_runtime": 21.9871, "eval_samples_per_second": 227.406, "eval_steps_per_second": 1.228, "step": 71775 }, { "epoch": 3.4758243433197284, "grad_norm": 0.202467679977417, "learning_rate": 7.539428476138367e-05, "loss": 1.4229, "step": 71780 }, { "epoch": 3.4763662844852408, "grad_norm": 0.18343421816825867, "learning_rate": 7.538016524797313e-05, "loss": 1.4268, "step": 71790 }, { "epoch": 3.4769082256507526, "grad_norm": 0.3386618494987488, "learning_rate": 7.53660432099562e-05, "loss": 1.4319, "step": 71800 }, { "epoch": 3.477450166816265, "grad_norm": 0.2889906167984009, "learning_rate": 7.535191864908224e-05, "loss": 1.4279, "step": 71810 }, { "epoch": 3.4779921079817773, "grad_norm": 0.25574788451194763, "learning_rate": 7.533779156710098e-05, "loss": 1.4254, "step": 71820 }, { "epoch": 3.4785340491472896, "grad_norm": 0.282247930765152, "learning_rate": 7.532366196576238e-05, "loss": 1.4315, "step": 71830 }, { "epoch": 3.4790759903128015, "grad_norm": 0.22523202002048492, "learning_rate": 7.530952984681679e-05, "loss": 1.4407, "step": 71840 }, { "epoch": 3.4796179314783138, "grad_norm": 0.2516654431819916, "learning_rate": 7.529539521201481e-05, "loss": 1.4203, "step": 71850 }, { "epoch": 3.480159872643826, "grad_norm": 0.19450673460960388, "learning_rate": 7.528125806310737e-05, "loss": 1.4341, "step": 71860 }, { "epoch": 3.4802682608769286, "eval_loss": 2.420518636703491, "eval_runtime": 21.9826, "eval_samples_per_second": 227.453, "eval_steps_per_second": 1.228, "step": 71862 }, { "epoch": 3.4807018138093384, "grad_norm": 0.17963695526123047, "learning_rate": 7.526711840184576e-05, "loss": 1.4376, "step": 71870 }, { "epoch": 3.4812437549748507, "grad_norm": 0.1823495328426361, "learning_rate": 7.525297622998151e-05, "loss": 1.4332, "step": 71880 }, { "epoch": 3.4817856961403626, "grad_norm": 0.18945930898189545, "learning_rate": 7.523883154926648e-05, "loss": 1.4278, "step": 71890 }, { "epoch": 3.482327637305875, "grad_norm": 0.20653122663497925, "learning_rate": 7.522468436145288e-05, "loss": 1.4371, "step": 71900 }, { "epoch": 3.4828695784713872, "grad_norm": 0.17666640877723694, "learning_rate": 7.521053466829317e-05, "loss": 1.4255, "step": 71910 }, { "epoch": 3.4834115196368995, "grad_norm": 0.21840277314186096, "learning_rate": 7.51963824715402e-05, "loss": 1.4243, "step": 71920 }, { "epoch": 3.483953460802412, "grad_norm": 0.19102028012275696, "learning_rate": 7.518222777294703e-05, "loss": 1.4402, "step": 71930 }, { "epoch": 3.4844954019679237, "grad_norm": 0.17519988119602203, "learning_rate": 7.51680705742671e-05, "loss": 1.4324, "step": 71940 }, { "epoch": 3.4849831490168848, "eval_loss": 2.4200730323791504, "eval_runtime": 21.6386, "eval_samples_per_second": 231.068, "eval_steps_per_second": 1.248, "step": 71949 }, { "epoch": 3.485037343133436, "grad_norm": 0.18477900326251984, "learning_rate": 7.515391087725416e-05, "loss": 1.4264, "step": 71950 }, { "epoch": 3.4855792842989484, "grad_norm": 0.21089692413806915, "learning_rate": 7.513974868366224e-05, "loss": 1.4269, "step": 71960 }, { "epoch": 3.4861212254644607, "grad_norm": 0.34691062569618225, "learning_rate": 7.51255839952457e-05, "loss": 1.4232, "step": 71970 }, { "epoch": 3.4866631666299726, "grad_norm": 0.2588226795196533, "learning_rate": 7.511141681375917e-05, "loss": 1.4332, "step": 71980 }, { "epoch": 3.487205107795485, "grad_norm": 0.16295021772384644, "learning_rate": 7.509724714095766e-05, "loss": 1.4296, "step": 71990 }, { "epoch": 3.487747048960997, "grad_norm": 0.22175998985767365, "learning_rate": 7.508307497859641e-05, "loss": 1.4286, "step": 72000 }, { "epoch": 3.0005419411655123, "grad_norm": 0.28885793685913086, "learning_rate": 7.506890032843104e-05, "loss": 1.4315, "step": 72010 }, { "epoch": 3.0010838823310246, "grad_norm": 0.17862290143966675, "learning_rate": 7.505472319221742e-05, "loss": 1.4237, "step": 72020 }, { "epoch": 3.0016258234965365, "grad_norm": 0.1830783486366272, "learning_rate": 7.504054357171176e-05, "loss": 1.4236, "step": 72030 }, { "epoch": 3.001950988195844, "eval_loss": 2.42079496383667, "eval_runtime": 22.1932, "eval_samples_per_second": 225.294, "eval_steps_per_second": 1.217, "step": 72036 }, { "epoch": 3.002167764662049, "grad_norm": 0.22852027416229248, "learning_rate": 7.502636146867058e-05, "loss": 1.4378, "step": 72040 }, { "epoch": 3.002709705827561, "grad_norm": 0.17318575084209442, "learning_rate": 7.501217688485067e-05, "loss": 1.4238, "step": 72050 }, { "epoch": 3.0032516469930735, "grad_norm": 0.32434844970703125, "learning_rate": 7.499798982200917e-05, "loss": 1.4348, "step": 72060 }, { "epoch": 3.0037935881585853, "grad_norm": 0.3870048522949219, "learning_rate": 7.498380028190353e-05, "loss": 1.4307, "step": 72070 }, { "epoch": 3.0043355293240976, "grad_norm": 0.21093975007534027, "learning_rate": 7.496960826629147e-05, "loss": 1.4365, "step": 72080 }, { "epoch": 3.00487747048961, "grad_norm": 0.1736556887626648, "learning_rate": 7.495541377693103e-05, "loss": 1.4264, "step": 72090 }, { "epoch": 3.0054194116551223, "grad_norm": 0.23631495237350464, "learning_rate": 7.494121681558056e-05, "loss": 1.4222, "step": 72100 }, { "epoch": 3.0059613528206346, "grad_norm": 0.1855953484773636, "learning_rate": 7.492701738399874e-05, "loss": 1.4191, "step": 72110 }, { "epoch": 3.0065032939861465, "grad_norm": 0.16358767449855804, "learning_rate": 7.491281548394454e-05, "loss": 1.4286, "step": 72120 }, { "epoch": 3.0066658763358003, "eval_loss": 2.4213919639587402, "eval_runtime": 22.016, "eval_samples_per_second": 227.108, "eval_steps_per_second": 1.226, "step": 72123 }, { "epoch": 3.007045235151659, "grad_norm": 0.17632584273815155, "learning_rate": 7.489861111717718e-05, "loss": 1.4182, "step": 72130 }, { "epoch": 3.007587176317171, "grad_norm": 0.16463103890419006, "learning_rate": 7.488440428545626e-05, "loss": 1.4179, "step": 72140 }, { "epoch": 3.0081291174826834, "grad_norm": 0.3055421710014343, "learning_rate": 7.487019499054169e-05, "loss": 1.426, "step": 72150 }, { "epoch": 3.0086710586481953, "grad_norm": 0.27146390080451965, "learning_rate": 7.485598323419362e-05, "loss": 1.425, "step": 72160 }, { "epoch": 3.0092129998137076, "grad_norm": 0.2870032787322998, "learning_rate": 7.484176901817256e-05, "loss": 1.4395, "step": 72170 }, { "epoch": 3.00975494097922, "grad_norm": 0.18291185796260834, "learning_rate": 7.482755234423931e-05, "loss": 1.428, "step": 72180 }, { "epoch": 3.0102968821447322, "grad_norm": 0.16777105629444122, "learning_rate": 7.481333321415493e-05, "loss": 1.4148, "step": 72190 }, { "epoch": 3.0108388233102445, "grad_norm": 0.19106002151966095, "learning_rate": 7.479911162968087e-05, "loss": 1.4297, "step": 72200 }, { "epoch": 3.0113807644757564, "grad_norm": 0.22009027004241943, "learning_rate": 7.478488759257882e-05, "loss": 1.4288, "step": 72210 }, { "epoch": 3.0113807644757564, "eval_loss": 2.4199483394622803, "eval_runtime": 22.9392, "eval_samples_per_second": 217.968, "eval_steps_per_second": 1.177, "step": 72210 }, { "epoch": 3.0119227056412687, "grad_norm": 0.20799531042575836, "learning_rate": 7.477066110461078e-05, "loss": 1.4217, "step": 72220 }, { "epoch": 3.012464646806781, "grad_norm": 0.17034943401813507, "learning_rate": 7.475643216753909e-05, "loss": 1.4257, "step": 72230 }, { "epoch": 3.0130065879722934, "grad_norm": 0.16152597963809967, "learning_rate": 7.474220078312636e-05, "loss": 1.4231, "step": 72240 }, { "epoch": 3.0135485291378057, "grad_norm": 0.23870690166950226, "learning_rate": 7.47279669531355e-05, "loss": 1.4311, "step": 72250 }, { "epoch": 3.0140904703033176, "grad_norm": 0.19177548587322235, "learning_rate": 7.471373067932975e-05, "loss": 1.4227, "step": 72260 }, { "epoch": 3.01463241146883, "grad_norm": 0.23941627144813538, "learning_rate": 7.469949196347263e-05, "loss": 1.4267, "step": 72270 }, { "epoch": 3.015174352634342, "grad_norm": 0.1809268444776535, "learning_rate": 7.468525080732798e-05, "loss": 1.421, "step": 72280 }, { "epoch": 3.0157162937998545, "grad_norm": 0.18789643049240112, "learning_rate": 7.46710072126599e-05, "loss": 1.4302, "step": 72290 }, { "epoch": 3.016095652615713, "eval_loss": 2.415761947631836, "eval_runtime": 22.0228, "eval_samples_per_second": 227.038, "eval_steps_per_second": 1.226, "step": 72297 }, { "epoch": 3.0162582349653664, "grad_norm": 0.18077979981899261, "learning_rate": 7.465676118123287e-05, "loss": 1.4176, "step": 72300 }, { "epoch": 3.0168001761308787, "grad_norm": 0.4619799852371216, "learning_rate": 7.46425127148116e-05, "loss": 1.4209, "step": 72310 }, { "epoch": 3.017342117296391, "grad_norm": 0.1716815084218979, "learning_rate": 7.462826181516115e-05, "loss": 1.4313, "step": 72320 }, { "epoch": 3.0178840584619033, "grad_norm": 0.21145406365394592, "learning_rate": 7.461400848404681e-05, "loss": 1.4226, "step": 72330 }, { "epoch": 3.0184259996274156, "grad_norm": 0.19309388101100922, "learning_rate": 7.459975272323427e-05, "loss": 1.4252, "step": 72340 }, { "epoch": 3.0189679407929275, "grad_norm": 0.37424296140670776, "learning_rate": 7.458549453448948e-05, "loss": 1.4245, "step": 72350 }, { "epoch": 3.01950988195844, "grad_norm": 0.36267608404159546, "learning_rate": 7.457123391957863e-05, "loss": 1.4271, "step": 72360 }, { "epoch": 3.020051823123952, "grad_norm": 0.1854252964258194, "learning_rate": 7.455697088026831e-05, "loss": 1.4313, "step": 72370 }, { "epoch": 3.0205937642894645, "grad_norm": 0.2632382810115814, "learning_rate": 7.454270541832532e-05, "loss": 1.4254, "step": 72380 }, { "epoch": 3.020810540755669, "eval_loss": 2.422041416168213, "eval_runtime": 21.9894, "eval_samples_per_second": 227.382, "eval_steps_per_second": 1.228, "step": 72384 }, { "epoch": 3.0211357054549763, "grad_norm": 0.16941377520561218, "learning_rate": 7.452843753551687e-05, "loss": 1.433, "step": 72390 }, { "epoch": 3.0216776466204887, "grad_norm": 0.20150338113307953, "learning_rate": 7.451416723361033e-05, "loss": 1.4228, "step": 72400 }, { "epoch": 3.022219587786001, "grad_norm": 0.20586276054382324, "learning_rate": 7.449989451437347e-05, "loss": 1.4336, "step": 72410 }, { "epoch": 3.0227615289515133, "grad_norm": 0.180009663105011, "learning_rate": 7.448561937957435e-05, "loss": 1.4399, "step": 72420 }, { "epoch": 3.0233034701170256, "grad_norm": 0.17290925979614258, "learning_rate": 7.44713418309813e-05, "loss": 1.4349, "step": 72430 }, { "epoch": 3.0238454112825375, "grad_norm": 0.19423924386501312, "learning_rate": 7.445706187036295e-05, "loss": 1.4175, "step": 72440 }, { "epoch": 3.02438735244805, "grad_norm": 0.19001413881778717, "learning_rate": 7.444277949948826e-05, "loss": 1.4253, "step": 72450 }, { "epoch": 3.024929293613562, "grad_norm": 0.27171728014945984, "learning_rate": 7.442849472012648e-05, "loss": 1.4299, "step": 72460 }, { "epoch": 3.0254712347790744, "grad_norm": 0.24754732847213745, "learning_rate": 7.441420753404709e-05, "loss": 1.4204, "step": 72470 }, { "epoch": 3.0255254288956257, "eval_loss": 2.420412063598633, "eval_runtime": 21.9948, "eval_samples_per_second": 227.326, "eval_steps_per_second": 1.228, "step": 72471 }, { "epoch": 3.0260131759445863, "grad_norm": 0.18741434812545776, "learning_rate": 7.439991794301997e-05, "loss": 1.4214, "step": 72480 }, { "epoch": 3.0265551171100986, "grad_norm": 0.2331148236989975, "learning_rate": 7.438562594881523e-05, "loss": 1.4267, "step": 72490 }, { "epoch": 3.027097058275611, "grad_norm": 0.21943721175193787, "learning_rate": 7.437133155320333e-05, "loss": 1.4202, "step": 72500 }, { "epoch": 3.0276389994411232, "grad_norm": 0.1892043948173523, "learning_rate": 7.435703475795498e-05, "loss": 1.4201, "step": 72510 }, { "epoch": 3.0281809406066356, "grad_norm": 0.2600436210632324, "learning_rate": 7.434273556484119e-05, "loss": 1.4357, "step": 72520 }, { "epoch": 3.0287228817721474, "grad_norm": 0.32084086537361145, "learning_rate": 7.43284339756333e-05, "loss": 1.4274, "step": 72530 }, { "epoch": 3.0292648229376598, "grad_norm": 0.23586052656173706, "learning_rate": 7.431412999210292e-05, "loss": 1.4261, "step": 72540 }, { "epoch": 3.029806764103172, "grad_norm": 0.2831364870071411, "learning_rate": 7.429982361602198e-05, "loss": 1.4191, "step": 72550 }, { "epoch": 3.030240317035582, "eval_loss": 2.41964054107666, "eval_runtime": 21.992, "eval_samples_per_second": 227.355, "eval_steps_per_second": 1.228, "step": 72558 }, { "epoch": 3.0303487052686844, "grad_norm": 0.1781311184167862, "learning_rate": 7.428551484916269e-05, "loss": 1.4297, "step": 72560 }, { "epoch": 3.0308906464341967, "grad_norm": 0.34944701194763184, "learning_rate": 7.427120369329753e-05, "loss": 1.4319, "step": 72570 }, { "epoch": 3.0314325875997086, "grad_norm": 0.17898167669773102, "learning_rate": 7.425689015019932e-05, "loss": 1.4229, "step": 72580 }, { "epoch": 3.031974528765221, "grad_norm": 0.19159488379955292, "learning_rate": 7.42425742216412e-05, "loss": 1.4248, "step": 72590 }, { "epoch": 3.032516469930733, "grad_norm": 0.1828126460313797, "learning_rate": 7.422825590939648e-05, "loss": 1.4298, "step": 72600 }, { "epoch": 3.0330584110962455, "grad_norm": 0.1625342071056366, "learning_rate": 7.421393521523893e-05, "loss": 1.425, "step": 72610 }, { "epoch": 3.0336003522617574, "grad_norm": 0.1856265813112259, "learning_rate": 7.41996121409425e-05, "loss": 1.4231, "step": 72620 }, { "epoch": 3.0341422934272697, "grad_norm": 0.20504432916641235, "learning_rate": 7.418528668828147e-05, "loss": 1.4211, "step": 72630 }, { "epoch": 3.034684234592782, "grad_norm": 0.1928977072238922, "learning_rate": 7.417095885903043e-05, "loss": 1.421, "step": 72640 }, { "epoch": 3.034955205175538, "eval_loss": 2.4224603176116943, "eval_runtime": 21.9942, "eval_samples_per_second": 227.333, "eval_steps_per_second": 1.228, "step": 72645 }, { "epoch": 3.0352261757582943, "grad_norm": 0.26011061668395996, "learning_rate": 7.415662865496421e-05, "loss": 1.4156, "step": 72650 }, { "epoch": 3.0357681169238067, "grad_norm": 0.1762949675321579, "learning_rate": 7.414229607785803e-05, "loss": 1.4256, "step": 72660 }, { "epoch": 3.0363100580893185, "grad_norm": 0.23641474545001984, "learning_rate": 7.412796112948731e-05, "loss": 1.4283, "step": 72670 }, { "epoch": 3.036851999254831, "grad_norm": 0.2524915635585785, "learning_rate": 7.41136238116278e-05, "loss": 1.4353, "step": 72680 }, { "epoch": 3.037393940420343, "grad_norm": 0.16015665233135223, "learning_rate": 7.409928412605557e-05, "loss": 1.4262, "step": 72690 }, { "epoch": 3.0379358815858555, "grad_norm": 0.252564936876297, "learning_rate": 7.408494207454694e-05, "loss": 1.4269, "step": 72700 }, { "epoch": 3.0384778227513674, "grad_norm": 0.2023906707763672, "learning_rate": 7.407059765887854e-05, "loss": 1.4323, "step": 72710 }, { "epoch": 3.0390197639168797, "grad_norm": 0.22321388125419617, "learning_rate": 7.40562508808273e-05, "loss": 1.4271, "step": 72720 }, { "epoch": 3.039561705082392, "grad_norm": 0.26535144448280334, "learning_rate": 7.404190174217044e-05, "loss": 1.4305, "step": 72730 }, { "epoch": 3.0396700933154945, "eval_loss": 2.43088436126709, "eval_runtime": 21.9914, "eval_samples_per_second": 227.362, "eval_steps_per_second": 1.228, "step": 72732 }, { "epoch": 3.0401036462479043, "grad_norm": 0.20327869057655334, "learning_rate": 7.402755024468547e-05, "loss": 1.4258, "step": 72740 }, { "epoch": 3.0406455874134166, "grad_norm": 0.2531057596206665, "learning_rate": 7.401319639015018e-05, "loss": 1.4226, "step": 72750 }, { "epoch": 3.0411875285789285, "grad_norm": 0.2527235746383667, "learning_rate": 7.399884018034265e-05, "loss": 1.4343, "step": 72760 }, { "epoch": 3.041729469744441, "grad_norm": 0.16884733736515045, "learning_rate": 7.398448161704131e-05, "loss": 1.4123, "step": 72770 }, { "epoch": 3.042271410909953, "grad_norm": 0.2351938784122467, "learning_rate": 7.397012070202478e-05, "loss": 1.4316, "step": 72780 }, { "epoch": 3.0428133520754654, "grad_norm": 0.17390334606170654, "learning_rate": 7.395575743707208e-05, "loss": 1.4276, "step": 72790 }, { "epoch": 3.0433552932409773, "grad_norm": 0.23687273263931274, "learning_rate": 7.394139182396245e-05, "loss": 1.4308, "step": 72800 }, { "epoch": 3.0438972344064896, "grad_norm": 0.22024738788604736, "learning_rate": 7.39270238644754e-05, "loss": 1.4276, "step": 72810 }, { "epoch": 3.0443849814554507, "eval_loss": 2.4263041019439697, "eval_runtime": 21.995, "eval_samples_per_second": 227.324, "eval_steps_per_second": 1.228, "step": 72819 }, { "epoch": 3.044439175572002, "grad_norm": 0.37231162190437317, "learning_rate": 7.391265356039084e-05, "loss": 1.4351, "step": 72820 }, { "epoch": 3.0449811167375143, "grad_norm": 0.1920226663351059, "learning_rate": 7.389828091348888e-05, "loss": 1.4319, "step": 72830 }, { "epoch": 3.0455230579030266, "grad_norm": 0.16061918437480927, "learning_rate": 7.388390592554989e-05, "loss": 1.4258, "step": 72840 }, { "epoch": 3.0460649990685384, "grad_norm": 0.20206622779369354, "learning_rate": 7.386952859835466e-05, "loss": 1.4202, "step": 72850 }, { "epoch": 3.0466069402340508, "grad_norm": 0.31317874789237976, "learning_rate": 7.385514893368411e-05, "loss": 1.4229, "step": 72860 }, { "epoch": 3.047148881399563, "grad_norm": 0.17705164849758148, "learning_rate": 7.38407669333196e-05, "loss": 1.4275, "step": 72870 }, { "epoch": 3.0476908225650754, "grad_norm": 0.2397981733083725, "learning_rate": 7.382638259904268e-05, "loss": 1.4225, "step": 72880 }, { "epoch": 3.0482327637305877, "grad_norm": 0.19595515727996826, "learning_rate": 7.38119959326352e-05, "loss": 1.437, "step": 72890 }, { "epoch": 3.0487747048960996, "grad_norm": 0.18026091158390045, "learning_rate": 7.379760693587935e-05, "loss": 1.43, "step": 72900 }, { "epoch": 3.0490998695954072, "eval_loss": 2.430786371231079, "eval_runtime": 22.1488, "eval_samples_per_second": 225.746, "eval_steps_per_second": 1.219, "step": 72906 }, { "epoch": 3.049316646061612, "grad_norm": 0.26389095187187195, "learning_rate": 7.378321561055756e-05, "loss": 1.4251, "step": 72910 }, { "epoch": 3.049858587227124, "grad_norm": 0.33883413672447205, "learning_rate": 7.376882195845256e-05, "loss": 1.4346, "step": 72920 }, { "epoch": 3.0504005283926365, "grad_norm": 0.17455531656742096, "learning_rate": 7.37544259813474e-05, "loss": 1.4272, "step": 72930 }, { "epoch": 3.0509424695581484, "grad_norm": 0.1896173059940338, "learning_rate": 7.374002768102535e-05, "loss": 1.4286, "step": 72940 }, { "epoch": 3.0514844107236607, "grad_norm": 0.1727907508611679, "learning_rate": 7.372562705927006e-05, "loss": 1.4246, "step": 72950 }, { "epoch": 3.052026351889173, "grad_norm": 0.17617569863796234, "learning_rate": 7.371122411786538e-05, "loss": 1.433, "step": 72960 }, { "epoch": 3.0525682930546854, "grad_norm": 0.2908771336078644, "learning_rate": 7.369681885859548e-05, "loss": 1.4263, "step": 72970 }, { "epoch": 3.0531102342201977, "grad_norm": 0.2100307047367096, "learning_rate": 7.368241128324485e-05, "loss": 1.4221, "step": 72980 }, { "epoch": 3.0536521753857095, "grad_norm": 0.24599306285381317, "learning_rate": 7.366800139359822e-05, "loss": 1.4234, "step": 72990 }, { "epoch": 3.0538147577353634, "eval_loss": 2.424680709838867, "eval_runtime": 22.0534, "eval_samples_per_second": 226.723, "eval_steps_per_second": 1.224, "step": 72993 }, { "epoch": 3.054194116551222, "grad_norm": 0.18987296521663666, "learning_rate": 7.365358919144063e-05, "loss": 1.427, "step": 73000 }, { "epoch": 3.054736057716734, "grad_norm": 0.17865373194217682, "learning_rate": 7.363917467855736e-05, "loss": 1.426, "step": 73010 }, { "epoch": 3.0552779988822465, "grad_norm": 0.17104874551296234, "learning_rate": 7.362475785673409e-05, "loss": 1.4333, "step": 73020 }, { "epoch": 3.0558199400477584, "grad_norm": 0.17414012551307678, "learning_rate": 7.361033872775667e-05, "loss": 1.4307, "step": 73030 }, { "epoch": 3.0563618812132707, "grad_norm": 0.2933975160121918, "learning_rate": 7.359591729341128e-05, "loss": 1.4239, "step": 73040 }, { "epoch": 3.056903822378783, "grad_norm": 0.1879478394985199, "learning_rate": 7.358149355548439e-05, "loss": 1.4258, "step": 73050 }, { "epoch": 3.0574457635442953, "grad_norm": 0.18025019764900208, "learning_rate": 7.356706751576274e-05, "loss": 1.421, "step": 73060 }, { "epoch": 3.0579877047098076, "grad_norm": 0.16190853714942932, "learning_rate": 7.355263917603341e-05, "loss": 1.4229, "step": 73070 }, { "epoch": 3.0585296458753195, "grad_norm": 0.19061139225959778, "learning_rate": 7.353820853808365e-05, "loss": 1.4202, "step": 73080 }, { "epoch": 3.0585296458753195, "eval_loss": 2.4280989170074463, "eval_runtime": 21.9869, "eval_samples_per_second": 227.408, "eval_steps_per_second": 1.228, "step": 73080 }, { "epoch": 3.059071587040832, "grad_norm": 0.24955511093139648, "learning_rate": 7.352377560370113e-05, "loss": 1.4388, "step": 73090 }, { "epoch": 3.059613528206344, "grad_norm": 0.24170927703380585, "learning_rate": 7.350934037467371e-05, "loss": 1.4251, "step": 73100 }, { "epoch": 3.0601554693718565, "grad_norm": 0.19894978404045105, "learning_rate": 7.349490285278953e-05, "loss": 1.4229, "step": 73110 }, { "epoch": 3.0606974105373683, "grad_norm": 0.16050221025943756, "learning_rate": 7.348046303983713e-05, "loss": 1.4217, "step": 73120 }, { "epoch": 3.0612393517028806, "grad_norm": 0.20622624456882477, "learning_rate": 7.346602093760516e-05, "loss": 1.4358, "step": 73130 }, { "epoch": 3.061781292868393, "grad_norm": 0.17460785806179047, "learning_rate": 7.345157654788272e-05, "loss": 1.4205, "step": 73140 }, { "epoch": 3.0623232340339053, "grad_norm": 0.2245752215385437, "learning_rate": 7.343712987245908e-05, "loss": 1.4378, "step": 73150 }, { "epoch": 3.0628651751994176, "grad_norm": 0.2324395626783371, "learning_rate": 7.342268091312381e-05, "loss": 1.4331, "step": 73160 }, { "epoch": 3.063244534015276, "eval_loss": 2.4189603328704834, "eval_runtime": 22.2302, "eval_samples_per_second": 224.919, "eval_steps_per_second": 1.215, "step": 73167 }, { "epoch": 3.0634071163649295, "grad_norm": 0.16124655306339264, "learning_rate": 7.340822967166684e-05, "loss": 1.427, "step": 73170 }, { "epoch": 3.0639490575304418, "grad_norm": 0.171858012676239, "learning_rate": 7.339377614987827e-05, "loss": 1.4179, "step": 73180 }, { "epoch": 3.064490998695954, "grad_norm": 0.28253042697906494, "learning_rate": 7.33793203495486e-05, "loss": 1.4261, "step": 73190 }, { "epoch": 3.0650329398614664, "grad_norm": 0.1876271665096283, "learning_rate": 7.336486227246851e-05, "loss": 1.431, "step": 73200 }, { "epoch": 3.0655748810269783, "grad_norm": 0.2576892077922821, "learning_rate": 7.335040192042901e-05, "loss": 1.4216, "step": 73210 }, { "epoch": 3.0661168221924906, "grad_norm": 0.28171801567077637, "learning_rate": 7.333593929522138e-05, "loss": 1.4285, "step": 73220 }, { "epoch": 3.066658763358003, "grad_norm": 0.16934189200401306, "learning_rate": 7.33214743986372e-05, "loss": 1.4277, "step": 73230 }, { "epoch": 3.0672007045235152, "grad_norm": 0.17389722168445587, "learning_rate": 7.330700723246835e-05, "loss": 1.4219, "step": 73240 }, { "epoch": 3.0677426456890275, "grad_norm": 0.18714025616645813, "learning_rate": 7.32925377985069e-05, "loss": 1.4354, "step": 73250 }, { "epoch": 3.067959422155232, "eval_loss": 2.4224860668182373, "eval_runtime": 21.9928, "eval_samples_per_second": 227.347, "eval_steps_per_second": 1.228, "step": 73254 }, { "epoch": 3.0682845868545394, "grad_norm": 0.1830187737941742, "learning_rate": 7.327806609854527e-05, "loss": 1.4227, "step": 73260 }, { "epoch": 3.0688265280200517, "grad_norm": 0.1765843778848648, "learning_rate": 7.326359213437618e-05, "loss": 1.4215, "step": 73270 }, { "epoch": 3.069368469185564, "grad_norm": 0.19436825811862946, "learning_rate": 7.32491159077926e-05, "loss": 1.423, "step": 73280 }, { "epoch": 3.0699104103510764, "grad_norm": 0.31252750754356384, "learning_rate": 7.323463742058776e-05, "loss": 1.4223, "step": 73290 }, { "epoch": 3.0704523515165887, "grad_norm": 0.2948218286037445, "learning_rate": 7.322015667455521e-05, "loss": 1.4295, "step": 73300 }, { "epoch": 3.0709942926821006, "grad_norm": 0.1795574426651001, "learning_rate": 7.320567367148875e-05, "loss": 1.4268, "step": 73310 }, { "epoch": 3.071536233847613, "grad_norm": 0.2362855225801468, "learning_rate": 7.319118841318246e-05, "loss": 1.4224, "step": 73320 }, { "epoch": 3.072078175013125, "grad_norm": 0.22740645706653595, "learning_rate": 7.317670090143076e-05, "loss": 1.4203, "step": 73330 }, { "epoch": 3.0726201161786375, "grad_norm": 0.1676589697599411, "learning_rate": 7.316221113802825e-05, "loss": 1.4286, "step": 73340 }, { "epoch": 3.072674310295189, "eval_loss": 2.4217214584350586, "eval_runtime": 21.9907, "eval_samples_per_second": 227.369, "eval_steps_per_second": 1.228, "step": 73341 }, { "epoch": 3.0731620573441494, "grad_norm": 0.4173310697078705, "learning_rate": 7.314771912476987e-05, "loss": 1.4293, "step": 73350 }, { "epoch": 3.0737039985096617, "grad_norm": 0.198992520570755, "learning_rate": 7.313322486345085e-05, "loss": 1.4195, "step": 73360 }, { "epoch": 3.074245939675174, "grad_norm": 0.19896645843982697, "learning_rate": 7.311872835586665e-05, "loss": 1.4355, "step": 73370 }, { "epoch": 3.0747878808406863, "grad_norm": 0.1793803870677948, "learning_rate": 7.310422960381305e-05, "loss": 1.4228, "step": 73380 }, { "epoch": 3.0753298220061986, "grad_norm": 0.21066676080226898, "learning_rate": 7.308972860908609e-05, "loss": 1.4219, "step": 73390 }, { "epoch": 3.0758717631717105, "grad_norm": 0.18298877775669098, "learning_rate": 7.30752253734821e-05, "loss": 1.4309, "step": 73400 }, { "epoch": 3.076413704337223, "grad_norm": 0.202785462141037, "learning_rate": 7.306071989879762e-05, "loss": 1.423, "step": 73410 }, { "epoch": 3.076955645502735, "grad_norm": 0.17988531291484833, "learning_rate": 7.304621218682961e-05, "loss": 1.4266, "step": 73420 }, { "epoch": 3.077389198435145, "eval_loss": 2.4217171669006348, "eval_runtime": 22.0012, "eval_samples_per_second": 227.26, "eval_steps_per_second": 1.227, "step": 73428 }, { "epoch": 3.0774975866682475, "grad_norm": 0.29091671109199524, "learning_rate": 7.303170223937518e-05, "loss": 1.4327, "step": 73430 }, { "epoch": 3.0780395278337593, "grad_norm": 0.1689477562904358, "learning_rate": 7.301719005823175e-05, "loss": 1.4226, "step": 73440 }, { "epoch": 3.0785814689992717, "grad_norm": 0.21074718236923218, "learning_rate": 7.300267564519703e-05, "loss": 1.4233, "step": 73450 }, { "epoch": 3.079123410164784, "grad_norm": 0.18468187749385834, "learning_rate": 7.298815900206904e-05, "loss": 1.4186, "step": 73460 }, { "epoch": 3.0796653513302963, "grad_norm": 0.16234566271305084, "learning_rate": 7.2973640130646e-05, "loss": 1.42, "step": 73470 }, { "epoch": 3.0802072924958086, "grad_norm": 0.20264677703380585, "learning_rate": 7.295911903272643e-05, "loss": 1.4322, "step": 73480 }, { "epoch": 3.0807492336613205, "grad_norm": 0.20148052275180817, "learning_rate": 7.29445957101092e-05, "loss": 1.42, "step": 73490 }, { "epoch": 3.081291174826833, "grad_norm": 0.1632404625415802, "learning_rate": 7.293007016459333e-05, "loss": 1.4228, "step": 73500 }, { "epoch": 3.081833115992345, "grad_norm": 0.28163808584213257, "learning_rate": 7.291554239797823e-05, "loss": 1.4194, "step": 73510 }, { "epoch": 3.082104086575101, "eval_loss": 2.420088768005371, "eval_runtime": 21.9934, "eval_samples_per_second": 227.341, "eval_steps_per_second": 1.228, "step": 73515 }, { "epoch": 3.0823750571578574, "grad_norm": 0.2194111943244934, "learning_rate": 7.290101241206354e-05, "loss": 1.4236, "step": 73520 }, { "epoch": 3.0829169983233697, "grad_norm": 0.3260360658168793, "learning_rate": 7.288648020864911e-05, "loss": 1.436, "step": 73530 }, { "epoch": 3.0834589394888816, "grad_norm": 0.21551568806171417, "learning_rate": 7.287194578953518e-05, "loss": 1.423, "step": 73540 }, { "epoch": 3.084000880654394, "grad_norm": 0.2473076581954956, "learning_rate": 7.28574091565222e-05, "loss": 1.4256, "step": 73550 }, { "epoch": 3.0845428218199062, "grad_norm": 0.24069726467132568, "learning_rate": 7.28428703114109e-05, "loss": 1.4311, "step": 73560 }, { "epoch": 3.0850847629854186, "grad_norm": 0.24383601546287537, "learning_rate": 7.28283292560023e-05, "loss": 1.4249, "step": 73570 }, { "epoch": 3.0856267041509304, "grad_norm": 0.17306429147720337, "learning_rate": 7.281378599209768e-05, "loss": 1.4223, "step": 73580 }, { "epoch": 3.0861686453164427, "grad_norm": 0.3249399960041046, "learning_rate": 7.279924052149858e-05, "loss": 1.4173, "step": 73590 }, { "epoch": 3.086710586481955, "grad_norm": 0.33424919843673706, "learning_rate": 7.278469284600684e-05, "loss": 1.4187, "step": 73600 }, { "epoch": 3.0868189747150576, "eval_loss": 2.4198853969573975, "eval_runtime": 21.9949, "eval_samples_per_second": 227.326, "eval_steps_per_second": 1.228, "step": 73602 }, { "epoch": 3.0872525276474674, "grad_norm": 0.25549885630607605, "learning_rate": 7.277014296742457e-05, "loss": 1.4192, "step": 73610 }, { "epoch": 3.0877944688129797, "grad_norm": 0.3509141206741333, "learning_rate": 7.275559088755414e-05, "loss": 1.4217, "step": 73620 }, { "epoch": 3.0883364099784916, "grad_norm": 0.1701222062110901, "learning_rate": 7.27410366081982e-05, "loss": 1.4255, "step": 73630 }, { "epoch": 3.088878351144004, "grad_norm": 0.4111487865447998, "learning_rate": 7.272648013115965e-05, "loss": 1.4136, "step": 73640 }, { "epoch": 3.089420292309516, "grad_norm": 0.292900949716568, "learning_rate": 7.271192145824172e-05, "loss": 1.4301, "step": 73650 }, { "epoch": 3.0899622334750285, "grad_norm": 0.24990233778953552, "learning_rate": 7.269736059124785e-05, "loss": 1.4301, "step": 73660 }, { "epoch": 3.0905041746405404, "grad_norm": 0.2537482678890228, "learning_rate": 7.26827975319818e-05, "loss": 1.4215, "step": 73670 }, { "epoch": 3.0910461158060527, "grad_norm": 0.23013080656528473, "learning_rate": 7.266823228224754e-05, "loss": 1.4313, "step": 73680 }, { "epoch": 3.0915338628550137, "eval_loss": 2.418518304824829, "eval_runtime": 21.9932, "eval_samples_per_second": 227.343, "eval_steps_per_second": 1.228, "step": 73689 }, { "epoch": 3.091588056971565, "grad_norm": 0.20117320120334625, "learning_rate": 7.265366484384938e-05, "loss": 1.4179, "step": 73690 }, { "epoch": 3.0921299981370773, "grad_norm": 0.25527241826057434, "learning_rate": 7.263909521859186e-05, "loss": 1.4265, "step": 73700 }, { "epoch": 3.0926719393025897, "grad_norm": 0.1841808259487152, "learning_rate": 7.262452340827981e-05, "loss": 1.4271, "step": 73710 }, { "epoch": 3.0932138804681015, "grad_norm": 0.16479361057281494, "learning_rate": 7.260994941471832e-05, "loss": 1.4119, "step": 73720 }, { "epoch": 3.093755821633614, "grad_norm": 0.23891477286815643, "learning_rate": 7.259537323971276e-05, "loss": 1.424, "step": 73730 }, { "epoch": 3.094297762799126, "grad_norm": 0.22275440394878387, "learning_rate": 7.258079488506874e-05, "loss": 1.4334, "step": 73740 }, { "epoch": 3.0948397039646385, "grad_norm": 0.19591204822063446, "learning_rate": 7.256621435259218e-05, "loss": 1.4379, "step": 73750 }, { "epoch": 3.0953816451301503, "grad_norm": 0.19313859939575195, "learning_rate": 7.255163164408926e-05, "loss": 1.4185, "step": 73760 }, { "epoch": 3.0959235862956627, "grad_norm": 0.23939813673496246, "learning_rate": 7.25370467613664e-05, "loss": 1.4255, "step": 73770 }, { "epoch": 3.0962487509949703, "eval_loss": 2.417570114135742, "eval_runtime": 21.9951, "eval_samples_per_second": 227.323, "eval_steps_per_second": 1.228, "step": 73776 }, { "epoch": 3.096465527461175, "grad_norm": 0.23741766810417175, "learning_rate": 7.252245970623035e-05, "loss": 1.4347, "step": 73780 }, { "epoch": 3.0970074686266873, "grad_norm": 0.17643536627292633, "learning_rate": 7.250787048048804e-05, "loss": 1.4208, "step": 73790 }, { "epoch": 3.0975494097921996, "grad_norm": 0.1725606471300125, "learning_rate": 7.249327908594675e-05, "loss": 1.4172, "step": 73800 }, { "epoch": 3.0980913509577115, "grad_norm": 0.17194822430610657, "learning_rate": 7.247868552441402e-05, "loss": 1.4279, "step": 73810 }, { "epoch": 3.098633292123224, "grad_norm": 0.17184831202030182, "learning_rate": 7.246408979769759e-05, "loss": 1.4215, "step": 73820 }, { "epoch": 3.099175233288736, "grad_norm": 0.15591196715831757, "learning_rate": 7.244949190760553e-05, "loss": 1.413, "step": 73830 }, { "epoch": 3.0997171744542484, "grad_norm": 0.2347261756658554, "learning_rate": 7.243489185594618e-05, "loss": 1.4282, "step": 73840 }, { "epoch": 3.1002591156197603, "grad_norm": 0.17219695448875427, "learning_rate": 7.242028964452812e-05, "loss": 1.4218, "step": 73850 }, { "epoch": 3.1008010567852726, "grad_norm": 0.19845928251743317, "learning_rate": 7.240568527516022e-05, "loss": 1.4226, "step": 73860 }, { "epoch": 3.1009636391349265, "eval_loss": 2.4213979244232178, "eval_runtime": 21.9911, "eval_samples_per_second": 227.365, "eval_steps_per_second": 1.228, "step": 73863 }, { "epoch": 3.101342997950785, "grad_norm": 0.18098603188991547, "learning_rate": 7.239107874965158e-05, "loss": 1.4348, "step": 73870 }, { "epoch": 3.1018849391162973, "grad_norm": 0.19403134286403656, "learning_rate": 7.237647006981162e-05, "loss": 1.4321, "step": 73880 }, { "epoch": 3.1024268802818096, "grad_norm": 0.2447945922613144, "learning_rate": 7.236185923745e-05, "loss": 1.4228, "step": 73890 }, { "epoch": 3.1029688214473214, "grad_norm": 0.20486268401145935, "learning_rate": 7.234724625437661e-05, "loss": 1.4183, "step": 73900 }, { "epoch": 3.1035107626128338, "grad_norm": 0.18675167858600616, "learning_rate": 7.233263112240168e-05, "loss": 1.4293, "step": 73910 }, { "epoch": 3.104052703778346, "grad_norm": 0.3332042396068573, "learning_rate": 7.231801384333567e-05, "loss": 1.4271, "step": 73920 }, { "epoch": 3.1045946449438584, "grad_norm": 0.20681266486644745, "learning_rate": 7.230339441898928e-05, "loss": 1.4215, "step": 73930 }, { "epoch": 3.1051365861093707, "grad_norm": 0.2513537108898163, "learning_rate": 7.228877285117352e-05, "loss": 1.4296, "step": 73940 }, { "epoch": 3.1056785272748826, "grad_norm": 0.17482328414916992, "learning_rate": 7.227414914169961e-05, "loss": 1.4263, "step": 73950 }, { "epoch": 3.1056785272748826, "eval_loss": 2.4194869995117188, "eval_runtime": 21.9853, "eval_samples_per_second": 227.425, "eval_steps_per_second": 1.228, "step": 73950 }, { "epoch": 3.106220468440395, "grad_norm": 0.2170400321483612, "learning_rate": 7.225952329237913e-05, "loss": 1.4152, "step": 73960 }, { "epoch": 3.106762409605907, "grad_norm": 0.22855044901371002, "learning_rate": 7.224489530502384e-05, "loss": 1.4359, "step": 73970 }, { "epoch": 3.1073043507714195, "grad_norm": 0.18862365186214447, "learning_rate": 7.223026518144578e-05, "loss": 1.4132, "step": 73980 }, { "epoch": 3.1078462919369314, "grad_norm": 0.21314261853694916, "learning_rate": 7.221563292345728e-05, "loss": 1.4295, "step": 73990 }, { "epoch": 3.1083882331024437, "grad_norm": 0.3071286976337433, "learning_rate": 7.220099853287091e-05, "loss": 1.4306, "step": 74000 }, { "epoch": 3.108930174267956, "grad_norm": 0.20251207053661346, "learning_rate": 7.218636201149952e-05, "loss": 1.4239, "step": 74010 }, { "epoch": 3.1094721154334684, "grad_norm": 0.18129009008407593, "learning_rate": 7.217172336115622e-05, "loss": 1.4233, "step": 74020 }, { "epoch": 3.1100140565989807, "grad_norm": 0.21674220263957977, "learning_rate": 7.21570825836544e-05, "loss": 1.426, "step": 74030 }, { "epoch": 3.110393415414839, "eval_loss": 2.4165384769439697, "eval_runtime": 21.9918, "eval_samples_per_second": 227.357, "eval_steps_per_second": 1.228, "step": 74037 }, { "epoch": 3.1105559977644925, "grad_norm": 0.23506887257099152, "learning_rate": 7.214243968080766e-05, "loss": 1.4339, "step": 74040 }, { "epoch": 3.111097938930005, "grad_norm": 0.19796548783779144, "learning_rate": 7.212779465442993e-05, "loss": 1.4343, "step": 74050 }, { "epoch": 3.111639880095517, "grad_norm": 0.22716623544692993, "learning_rate": 7.211314750633534e-05, "loss": 1.4137, "step": 74060 }, { "epoch": 3.1121818212610295, "grad_norm": 0.3029038608074188, "learning_rate": 7.209849823833835e-05, "loss": 1.4262, "step": 74070 }, { "epoch": 3.1127237624265414, "grad_norm": 0.175888329744339, "learning_rate": 7.208384685225364e-05, "loss": 1.4147, "step": 74080 }, { "epoch": 3.1132657035920537, "grad_norm": 0.28215131163597107, "learning_rate": 7.206919334989613e-05, "loss": 1.4238, "step": 74090 }, { "epoch": 3.113807644757566, "grad_norm": 0.26978573203086853, "learning_rate": 7.205453773308107e-05, "loss": 1.4235, "step": 74100 }, { "epoch": 3.1143495859230783, "grad_norm": 0.16594207286834717, "learning_rate": 7.20398800036239e-05, "loss": 1.4189, "step": 74110 }, { "epoch": 3.1148915270885906, "grad_norm": 0.22124223411083221, "learning_rate": 7.20252201633404e-05, "loss": 1.4344, "step": 74120 }, { "epoch": 3.1151083035547953, "eval_loss": 2.4167044162750244, "eval_runtime": 21.9938, "eval_samples_per_second": 227.337, "eval_steps_per_second": 1.228, "step": 74124 }, { "epoch": 3.1154334682541025, "grad_norm": 0.16019515693187714, "learning_rate": 7.201055821404653e-05, "loss": 1.4233, "step": 74130 }, { "epoch": 3.115975409419615, "grad_norm": 0.19272254407405853, "learning_rate": 7.199589415755856e-05, "loss": 1.4239, "step": 74140 }, { "epoch": 3.116517350585127, "grad_norm": 0.3636123538017273, "learning_rate": 7.198122799569302e-05, "loss": 1.4233, "step": 74150 }, { "epoch": 3.1170592917506394, "grad_norm": 0.31708648800849915, "learning_rate": 7.196655973026667e-05, "loss": 1.4282, "step": 74160 }, { "epoch": 3.1176012329161518, "grad_norm": 0.19407716393470764, "learning_rate": 7.195188936309657e-05, "loss": 1.4302, "step": 74170 }, { "epoch": 3.1181431740816636, "grad_norm": 0.17568087577819824, "learning_rate": 7.193721689600002e-05, "loss": 1.4249, "step": 74180 }, { "epoch": 3.118685115247176, "grad_norm": 0.36815229058265686, "learning_rate": 7.192254233079455e-05, "loss": 1.4195, "step": 74190 }, { "epoch": 3.1192270564126883, "grad_norm": 0.21112428605556488, "learning_rate": 7.190786566929802e-05, "loss": 1.4215, "step": 74200 }, { "epoch": 3.1197689975782006, "grad_norm": 0.19633443653583527, "learning_rate": 7.189318691332851e-05, "loss": 1.4301, "step": 74210 }, { "epoch": 3.119823191694752, "eval_loss": 2.416978359222412, "eval_runtime": 21.9898, "eval_samples_per_second": 227.378, "eval_steps_per_second": 1.228, "step": 74211 }, { "epoch": 3.1203109387437125, "grad_norm": 0.32902562618255615, "learning_rate": 7.187850606470435e-05, "loss": 1.4162, "step": 74220 }, { "epoch": 3.1208528799092248, "grad_norm": 0.19823166728019714, "learning_rate": 7.186382312524411e-05, "loss": 1.4167, "step": 74230 }, { "epoch": 3.121394821074737, "grad_norm": 0.1761886328458786, "learning_rate": 7.18491380967667e-05, "loss": 1.4312, "step": 74240 }, { "epoch": 3.1219367622402494, "grad_norm": 0.3041189908981323, "learning_rate": 7.18344509810912e-05, "loss": 1.4233, "step": 74250 }, { "epoch": 3.1224787034057617, "grad_norm": 0.39153873920440674, "learning_rate": 7.181976178003701e-05, "loss": 1.4268, "step": 74260 }, { "epoch": 3.1230206445712736, "grad_norm": 0.3814428448677063, "learning_rate": 7.180507049542376e-05, "loss": 1.4169, "step": 74270 }, { "epoch": 3.123562585736786, "grad_norm": 0.29164746403694153, "learning_rate": 7.179037712907131e-05, "loss": 1.4271, "step": 74280 }, { "epoch": 3.1241045269022982, "grad_norm": 0.20582854747772217, "learning_rate": 7.177568168279987e-05, "loss": 1.4168, "step": 74290 }, { "epoch": 3.124538079834708, "eval_loss": 2.4158596992492676, "eval_runtime": 22.0314, "eval_samples_per_second": 226.949, "eval_steps_per_second": 1.226, "step": 74298 }, { "epoch": 3.1246464680678105, "grad_norm": 0.18478041887283325, "learning_rate": 7.17609841584298e-05, "loss": 1.4319, "step": 74300 }, { "epoch": 3.1251884092333224, "grad_norm": 0.2710546851158142, "learning_rate": 7.174628455778178e-05, "loss": 1.4217, "step": 74310 }, { "epoch": 3.1257303503988347, "grad_norm": 0.18582426011562347, "learning_rate": 7.173158288267674e-05, "loss": 1.427, "step": 74320 }, { "epoch": 3.126272291564347, "grad_norm": 0.1683790534734726, "learning_rate": 7.171687913493586e-05, "loss": 1.4333, "step": 74330 }, { "epoch": 3.1268142327298594, "grad_norm": 0.16057170927524567, "learning_rate": 7.170217331638056e-05, "loss": 1.4262, "step": 74340 }, { "epoch": 3.1273561738953717, "grad_norm": 0.19682690501213074, "learning_rate": 7.168746542883254e-05, "loss": 1.4231, "step": 74350 }, { "epoch": 3.1278981150608836, "grad_norm": 0.2511323094367981, "learning_rate": 7.167275547411376e-05, "loss": 1.4138, "step": 74360 }, { "epoch": 3.128440056226396, "grad_norm": 0.2534506320953369, "learning_rate": 7.16580434540464e-05, "loss": 1.4225, "step": 74370 }, { "epoch": 3.128981997391908, "grad_norm": 0.26025325059890747, "learning_rate": 7.164332937045295e-05, "loss": 1.425, "step": 74380 }, { "epoch": 3.129252967974664, "eval_loss": 2.41723370552063, "eval_runtime": 22.0929, "eval_samples_per_second": 226.317, "eval_steps_per_second": 1.222, "step": 74385 }, { "epoch": 3.1295239385574205, "grad_norm": 0.3324603736400604, "learning_rate": 7.162861322515613e-05, "loss": 1.43, "step": 74390 }, { "epoch": 3.1300658797229324, "grad_norm": 0.21369896829128265, "learning_rate": 7.161389501997889e-05, "loss": 1.4211, "step": 74400 }, { "epoch": 3.1306078208884447, "grad_norm": 0.5364189743995667, "learning_rate": 7.159917475674447e-05, "loss": 1.424, "step": 74410 }, { "epoch": 3.131149762053957, "grad_norm": 0.29370662569999695, "learning_rate": 7.158445243727632e-05, "loss": 1.4254, "step": 74420 }, { "epoch": 3.1316917032194693, "grad_norm": 0.4463825821876526, "learning_rate": 7.156972806339823e-05, "loss": 1.4264, "step": 74430 }, { "epoch": 3.1322336443849816, "grad_norm": 0.181520015001297, "learning_rate": 7.155500163693417e-05, "loss": 1.422, "step": 74440 }, { "epoch": 3.1327755855504935, "grad_norm": 0.18724457919597626, "learning_rate": 7.154027315970838e-05, "loss": 1.4197, "step": 74450 }, { "epoch": 3.133317526716006, "grad_norm": 0.19526837766170502, "learning_rate": 7.152554263354535e-05, "loss": 1.4262, "step": 74460 }, { "epoch": 3.133859467881518, "grad_norm": 0.21833248436450958, "learning_rate": 7.151081006026984e-05, "loss": 1.4281, "step": 74470 }, { "epoch": 3.1339678561146207, "eval_loss": 2.4146862030029297, "eval_runtime": 22.0465, "eval_samples_per_second": 226.794, "eval_steps_per_second": 1.225, "step": 74472 }, { "epoch": 3.1344014090470305, "grad_norm": 0.22797681391239166, "learning_rate": 7.149607544170687e-05, "loss": 1.4222, "step": 74480 }, { "epoch": 3.1349433502125423, "grad_norm": 0.16637133061885834, "learning_rate": 7.14813387796817e-05, "loss": 1.4116, "step": 74490 }, { "epoch": 3.1354852913780547, "grad_norm": 0.28121089935302734, "learning_rate": 7.146660007601984e-05, "loss": 1.417, "step": 74500 }, { "epoch": 3.136027232543567, "grad_norm": 0.19828984141349792, "learning_rate": 7.145185933254705e-05, "loss": 1.4322, "step": 74510 }, { "epoch": 3.1365691737090793, "grad_norm": 0.2876355051994324, "learning_rate": 7.143711655108935e-05, "loss": 1.435, "step": 74520 }, { "epoch": 3.1371111148745916, "grad_norm": 0.1621021181344986, "learning_rate": 7.1422371733473e-05, "loss": 1.4288, "step": 74530 }, { "epoch": 3.1376530560401035, "grad_norm": 0.3681042492389679, "learning_rate": 7.140762488152457e-05, "loss": 1.429, "step": 74540 }, { "epoch": 3.138194997205616, "grad_norm": 0.22536900639533997, "learning_rate": 7.139287599707078e-05, "loss": 1.4203, "step": 74550 }, { "epoch": 3.138682744254577, "eval_loss": 2.4159655570983887, "eval_runtime": 22.1157, "eval_samples_per_second": 226.084, "eval_steps_per_second": 1.221, "step": 74559 }, { "epoch": 3.138736938371128, "grad_norm": 0.18401791155338287, "learning_rate": 7.137812508193868e-05, "loss": 1.4321, "step": 74560 }, { "epoch": 3.1392788795366404, "grad_norm": 0.38875266909599304, "learning_rate": 7.136337213795556e-05, "loss": 1.4213, "step": 74570 }, { "epoch": 3.1398208207021527, "grad_norm": 0.28335729241371155, "learning_rate": 7.134861716694894e-05, "loss": 1.4259, "step": 74580 }, { "epoch": 3.1403627618676646, "grad_norm": 0.4349845051765442, "learning_rate": 7.13338601707466e-05, "loss": 1.4315, "step": 74590 }, { "epoch": 3.140904703033177, "grad_norm": 0.2121918946504593, "learning_rate": 7.131910115117659e-05, "loss": 1.4343, "step": 74600 }, { "epoch": 3.1414466441986892, "grad_norm": 0.281886488199234, "learning_rate": 7.130434011006716e-05, "loss": 1.4218, "step": 74610 }, { "epoch": 3.1419885853642016, "grad_norm": 0.1884024739265442, "learning_rate": 7.128957704924689e-05, "loss": 1.4322, "step": 74620 }, { "epoch": 3.1425305265297134, "grad_norm": 0.3853340446949005, "learning_rate": 7.127481197054452e-05, "loss": 1.4274, "step": 74630 }, { "epoch": 3.1430724676952257, "grad_norm": 0.18657207489013672, "learning_rate": 7.12600448757891e-05, "loss": 1.4131, "step": 74640 }, { "epoch": 3.1433976323945334, "eval_loss": 2.421818733215332, "eval_runtime": 22.0955, "eval_samples_per_second": 226.29, "eval_steps_per_second": 1.222, "step": 74646 }, { "epoch": 3.143614408860738, "grad_norm": 0.17503514885902405, "learning_rate": 7.124527576680993e-05, "loss": 1.425, "step": 74650 }, { "epoch": 3.1441563500262504, "grad_norm": 0.17085334658622742, "learning_rate": 7.123050464543652e-05, "loss": 1.4221, "step": 74660 }, { "epoch": 3.1446982911917627, "grad_norm": 0.19825150072574615, "learning_rate": 7.121573151349863e-05, "loss": 1.4195, "step": 74670 }, { "epoch": 3.1452402323572746, "grad_norm": 0.29708579182624817, "learning_rate": 7.120095637282636e-05, "loss": 1.4341, "step": 74680 }, { "epoch": 3.145782173522787, "grad_norm": 0.2100587636232376, "learning_rate": 7.118617922524996e-05, "loss": 1.4208, "step": 74690 }, { "epoch": 3.146324114688299, "grad_norm": 0.2220253199338913, "learning_rate": 7.117140007259993e-05, "loss": 1.4305, "step": 74700 }, { "epoch": 3.1468660558538115, "grad_norm": 0.17661844193935394, "learning_rate": 7.115661891670706e-05, "loss": 1.4279, "step": 74710 }, { "epoch": 3.1474079970193234, "grad_norm": 0.19139379262924194, "learning_rate": 7.11418357594024e-05, "loss": 1.4318, "step": 74720 }, { "epoch": 3.1479499381848357, "grad_norm": 0.17980216443538666, "learning_rate": 7.11270506025172e-05, "loss": 1.4247, "step": 74730 }, { "epoch": 3.1481125205344895, "eval_loss": 2.416665554046631, "eval_runtime": 22.0909, "eval_samples_per_second": 226.337, "eval_steps_per_second": 1.222, "step": 74733 }, { "epoch": 3.148491879350348, "grad_norm": 0.24544402956962585, "learning_rate": 7.111226344788298e-05, "loss": 1.4119, "step": 74740 }, { "epoch": 3.1490338205158603, "grad_norm": 0.26411178708076477, "learning_rate": 7.109747429733154e-05, "loss": 1.4213, "step": 74750 }, { "epoch": 3.1495757616813727, "grad_norm": 0.2932426333427429, "learning_rate": 7.108268315269485e-05, "loss": 1.4356, "step": 74760 }, { "epoch": 3.1501177028468845, "grad_norm": 0.35350051522254944, "learning_rate": 7.106789001580521e-05, "loss": 1.4253, "step": 74770 }, { "epoch": 3.150659644012397, "grad_norm": 0.1805257648229599, "learning_rate": 7.10530948884951e-05, "loss": 1.4194, "step": 74780 }, { "epoch": 3.151201585177909, "grad_norm": 0.20616401731967926, "learning_rate": 7.10382977725973e-05, "loss": 1.4175, "step": 74790 }, { "epoch": 3.1517435263434215, "grad_norm": 0.16186383366584778, "learning_rate": 7.10234986699448e-05, "loss": 1.4185, "step": 74800 }, { "epoch": 3.152285467508934, "grad_norm": 0.31783926486968994, "learning_rate": 7.100869758237085e-05, "loss": 1.4254, "step": 74810 }, { "epoch": 3.1528274086744457, "grad_norm": 0.20256149768829346, "learning_rate": 7.099389451170897e-05, "loss": 1.4193, "step": 74820 }, { "epoch": 3.1528274086744457, "eval_loss": 2.4210216999053955, "eval_runtime": 22.0823, "eval_samples_per_second": 226.426, "eval_steps_per_second": 1.223, "step": 74820 }, { "epoch": 3.153369349839958, "grad_norm": 0.169488787651062, "learning_rate": 7.097908945979284e-05, "loss": 1.4263, "step": 74830 }, { "epoch": 3.1539112910054703, "grad_norm": 0.1810005009174347, "learning_rate": 7.09642824284565e-05, "loss": 1.4223, "step": 74840 }, { "epoch": 3.1544532321709826, "grad_norm": 0.21549201011657715, "learning_rate": 7.094947341953417e-05, "loss": 1.4189, "step": 74850 }, { "epoch": 3.1549951733364945, "grad_norm": 0.274050772190094, "learning_rate": 7.09346624348603e-05, "loss": 1.4256, "step": 74860 }, { "epoch": 3.155537114502007, "grad_norm": 0.21615564823150635, "learning_rate": 7.091984947626964e-05, "loss": 1.4145, "step": 74870 }, { "epoch": 3.156079055667519, "grad_norm": 0.17899556457996368, "learning_rate": 7.090503454559715e-05, "loss": 1.4263, "step": 74880 }, { "epoch": 3.1566209968330314, "grad_norm": 0.2249787449836731, "learning_rate": 7.089021764467803e-05, "loss": 1.4118, "step": 74890 }, { "epoch": 3.1571629379985433, "grad_norm": 0.17908982932567596, "learning_rate": 7.087539877534772e-05, "loss": 1.4206, "step": 74900 }, { "epoch": 3.1575422968144022, "eval_loss": 2.4155755043029785, "eval_runtime": 21.9967, "eval_samples_per_second": 227.307, "eval_steps_per_second": 1.227, "step": 74907 }, { "epoch": 3.1577048791640556, "grad_norm": 0.17990335822105408, "learning_rate": 7.086057793944195e-05, "loss": 1.4283, "step": 74910 }, { "epoch": 3.158246820329568, "grad_norm": 0.19940659403800964, "learning_rate": 7.084575513879664e-05, "loss": 1.4323, "step": 74920 }, { "epoch": 3.1587887614950803, "grad_norm": 0.1748953014612198, "learning_rate": 7.083093037524798e-05, "loss": 1.4191, "step": 74930 }, { "epoch": 3.1593307026605926, "grad_norm": 0.16563956439495087, "learning_rate": 7.08161036506324e-05, "loss": 1.4231, "step": 74940 }, { "epoch": 3.1598726438261044, "grad_norm": 0.21732689440250397, "learning_rate": 7.080127496678655e-05, "loss": 1.4266, "step": 74950 }, { "epoch": 3.1604145849916168, "grad_norm": 0.17851050198078156, "learning_rate": 7.078644432554738e-05, "loss": 1.4195, "step": 74960 }, { "epoch": 3.160956526157129, "grad_norm": 0.2197248935699463, "learning_rate": 7.077161172875202e-05, "loss": 1.4252, "step": 74970 }, { "epoch": 3.1614984673226414, "grad_norm": 0.2679365575313568, "learning_rate": 7.075677717823787e-05, "loss": 1.4283, "step": 74980 }, { "epoch": 3.1620404084881537, "grad_norm": 0.18566496670246124, "learning_rate": 7.074194067584256e-05, "loss": 1.4232, "step": 74990 }, { "epoch": 3.1622571849543584, "eval_loss": 2.4319992065429688, "eval_runtime": 21.9934, "eval_samples_per_second": 227.341, "eval_steps_per_second": 1.228, "step": 74994 }, { "epoch": 3.1625823496536656, "grad_norm": 0.28876206278800964, "learning_rate": 7.072710222340399e-05, "loss": 1.4202, "step": 75000 }, { "epoch": 3.163124290819178, "grad_norm": 0.22490562498569489, "learning_rate": 7.07122618227603e-05, "loss": 1.424, "step": 75010 }, { "epoch": 3.16366623198469, "grad_norm": 0.17399635910987854, "learning_rate": 7.069741947574981e-05, "loss": 1.4218, "step": 75020 }, { "epoch": 3.1642081731502025, "grad_norm": 0.2350047528743744, "learning_rate": 7.068257518421116e-05, "loss": 1.4214, "step": 75030 }, { "epoch": 3.1647501143157144, "grad_norm": 0.5656566619873047, "learning_rate": 7.066772894998316e-05, "loss": 1.4225, "step": 75040 }, { "epoch": 3.1652920554812267, "grad_norm": 0.22501088678836823, "learning_rate": 7.065288077490493e-05, "loss": 1.4295, "step": 75050 }, { "epoch": 3.165833996646739, "grad_norm": 0.2622585594654083, "learning_rate": 7.06380306608158e-05, "loss": 1.411, "step": 75060 }, { "epoch": 3.1663759378122514, "grad_norm": 0.2026064693927765, "learning_rate": 7.062317860955529e-05, "loss": 1.4129, "step": 75070 }, { "epoch": 3.1669178789777637, "grad_norm": 0.16883380711078644, "learning_rate": 7.060832462296329e-05, "loss": 1.4127, "step": 75080 }, { "epoch": 3.1669720730943145, "eval_loss": 2.4277634620666504, "eval_runtime": 22.0408, "eval_samples_per_second": 226.852, "eval_steps_per_second": 1.225, "step": 75081 }, { "epoch": 3.1674598201432755, "grad_norm": 0.17658884823322296, "learning_rate": 7.059346870287978e-05, "loss": 1.4141, "step": 75090 }, { "epoch": 3.168001761308788, "grad_norm": 0.16324864327907562, "learning_rate": 7.057861085114506e-05, "loss": 1.4321, "step": 75100 }, { "epoch": 3.1685437024743, "grad_norm": 0.17185302078723907, "learning_rate": 7.056375106959967e-05, "loss": 1.4238, "step": 75110 }, { "epoch": 3.1690856436398125, "grad_norm": 0.23366904258728027, "learning_rate": 7.054888936008437e-05, "loss": 1.4175, "step": 75120 }, { "epoch": 3.1696275848053244, "grad_norm": 0.23290032148361206, "learning_rate": 7.053402572444017e-05, "loss": 1.4307, "step": 75130 }, { "epoch": 3.1701695259708367, "grad_norm": 0.19138121604919434, "learning_rate": 7.05191601645083e-05, "loss": 1.4317, "step": 75140 }, { "epoch": 3.170711467136349, "grad_norm": 0.1849122941493988, "learning_rate": 7.050429268213023e-05, "loss": 1.431, "step": 75150 }, { "epoch": 3.1712534083018613, "grad_norm": 0.2830689549446106, "learning_rate": 7.04894232791477e-05, "loss": 1.413, "step": 75160 }, { "epoch": 3.171686961234271, "eval_loss": 2.426955223083496, "eval_runtime": 22.0447, "eval_samples_per_second": 226.812, "eval_steps_per_second": 1.225, "step": 75168 }, { "epoch": 3.1717953494673736, "grad_norm": 0.21067143976688385, "learning_rate": 7.047455195740268e-05, "loss": 1.4298, "step": 75170 }, { "epoch": 3.1723372906328855, "grad_norm": 0.2845075726509094, "learning_rate": 7.045967871873734e-05, "loss": 1.4164, "step": 75180 }, { "epoch": 3.172879231798398, "grad_norm": 0.2222658097743988, "learning_rate": 7.044480356499412e-05, "loss": 1.4213, "step": 75190 }, { "epoch": 3.17342117296391, "grad_norm": 0.2342739850282669, "learning_rate": 7.042992649801568e-05, "loss": 1.425, "step": 75200 }, { "epoch": 3.1739631141294224, "grad_norm": 0.16035151481628418, "learning_rate": 7.041504751964494e-05, "loss": 1.4337, "step": 75210 }, { "epoch": 3.1745050552949348, "grad_norm": 0.1617584526538849, "learning_rate": 7.040016663172505e-05, "loss": 1.4214, "step": 75220 }, { "epoch": 3.1750469964604466, "grad_norm": 0.18865042924880981, "learning_rate": 7.038528383609936e-05, "loss": 1.4262, "step": 75230 }, { "epoch": 3.175588937625959, "grad_norm": 0.15919503569602966, "learning_rate": 7.037039913461152e-05, "loss": 1.4259, "step": 75240 }, { "epoch": 3.1761308787914713, "grad_norm": 0.18104073405265808, "learning_rate": 7.035551252910535e-05, "loss": 1.4228, "step": 75250 }, { "epoch": 3.176401849374227, "eval_loss": 2.4324724674224854, "eval_runtime": 22.1726, "eval_samples_per_second": 225.504, "eval_steps_per_second": 1.218, "step": 75255 }, { "epoch": 3.1766728199569836, "grad_norm": 0.24971145391464233, "learning_rate": 7.034062402142497e-05, "loss": 1.4224, "step": 75260 }, { "epoch": 3.1772147611224955, "grad_norm": 0.22338011860847473, "learning_rate": 7.032573361341469e-05, "loss": 1.4086, "step": 75270 }, { "epoch": 3.1777567022880078, "grad_norm": 0.2989422082901001, "learning_rate": 7.031084130691905e-05, "loss": 1.4188, "step": 75280 }, { "epoch": 3.17829864345352, "grad_norm": 0.38781970739364624, "learning_rate": 7.029594710378288e-05, "loss": 1.4263, "step": 75290 }, { "epoch": 3.1788405846190324, "grad_norm": 0.27515822649002075, "learning_rate": 7.028105100585118e-05, "loss": 1.4258, "step": 75300 }, { "epoch": 3.1793825257845447, "grad_norm": 0.24076929688453674, "learning_rate": 7.026615301496923e-05, "loss": 1.4264, "step": 75310 }, { "epoch": 3.1799244669500566, "grad_norm": 0.1977168768644333, "learning_rate": 7.025125313298253e-05, "loss": 1.4267, "step": 75320 }, { "epoch": 3.180466408115569, "grad_norm": 0.20575913786888123, "learning_rate": 7.02363513617368e-05, "loss": 1.4178, "step": 75330 }, { "epoch": 3.1810083492810812, "grad_norm": 0.25656020641326904, "learning_rate": 7.022144770307801e-05, "loss": 1.4224, "step": 75340 }, { "epoch": 3.181116737514184, "eval_loss": 2.4162216186523438, "eval_runtime": 21.9936, "eval_samples_per_second": 227.339, "eval_steps_per_second": 1.228, "step": 75342 }, { "epoch": 3.1815502904465935, "grad_norm": 0.20052428543567657, "learning_rate": 7.020654215885236e-05, "loss": 1.4175, "step": 75350 }, { "epoch": 3.1820922316121054, "grad_norm": 0.2730656564235687, "learning_rate": 7.01916347309063e-05, "loss": 1.427, "step": 75360 }, { "epoch": 3.1826341727776177, "grad_norm": 0.19795359671115875, "learning_rate": 7.017672542108648e-05, "loss": 1.4233, "step": 75370 }, { "epoch": 3.18317611394313, "grad_norm": 0.25802189111709595, "learning_rate": 7.016181423123981e-05, "loss": 1.4181, "step": 75380 }, { "epoch": 3.1837180551086424, "grad_norm": 0.2566634714603424, "learning_rate": 7.01469011632134e-05, "loss": 1.4277, "step": 75390 }, { "epoch": 3.1842599962741547, "grad_norm": 0.18765607476234436, "learning_rate": 7.013198621885465e-05, "loss": 1.4108, "step": 75400 }, { "epoch": 3.1848019374396666, "grad_norm": 0.2734335660934448, "learning_rate": 7.011706940001113e-05, "loss": 1.4238, "step": 75410 }, { "epoch": 3.185343878605179, "grad_norm": 0.2739078402519226, "learning_rate": 7.010215070853071e-05, "loss": 1.4204, "step": 75420 }, { "epoch": 3.18583162565414, "eval_loss": 2.4123969078063965, "eval_runtime": 21.9921, "eval_samples_per_second": 227.354, "eval_steps_per_second": 1.228, "step": 75429 }, { "epoch": 3.185885819770691, "grad_norm": 0.18177048861980438, "learning_rate": 7.008723014626142e-05, "loss": 1.4138, "step": 75430 }, { "epoch": 3.1864277609362035, "grad_norm": 0.2982783615589142, "learning_rate": 7.007230771505155e-05, "loss": 1.4217, "step": 75440 }, { "epoch": 3.186969702101716, "grad_norm": 0.24831917881965637, "learning_rate": 7.005738341674964e-05, "loss": 1.4239, "step": 75450 }, { "epoch": 3.1875116432672277, "grad_norm": 0.20293238759040833, "learning_rate": 7.004245725320445e-05, "loss": 1.4171, "step": 75460 }, { "epoch": 3.18805358443274, "grad_norm": 0.27422091364860535, "learning_rate": 7.002752922626496e-05, "loss": 1.4212, "step": 75470 }, { "epoch": 3.1885955255982523, "grad_norm": 0.19502641260623932, "learning_rate": 7.001259933778041e-05, "loss": 1.4212, "step": 75480 }, { "epoch": 3.1891374667637646, "grad_norm": 0.1884526014328003, "learning_rate": 6.999766758960019e-05, "loss": 1.4176, "step": 75490 }, { "epoch": 3.1896794079292765, "grad_norm": 0.1656298190355301, "learning_rate": 6.998273398357406e-05, "loss": 1.429, "step": 75500 }, { "epoch": 3.190221349094789, "grad_norm": 0.17676104605197906, "learning_rate": 6.996779852155189e-05, "loss": 1.423, "step": 75510 }, { "epoch": 3.190546513794096, "eval_loss": 2.412719488143921, "eval_runtime": 21.9959, "eval_samples_per_second": 227.315, "eval_steps_per_second": 1.228, "step": 75516 }, { "epoch": 3.190763290260301, "grad_norm": 0.3444202244281769, "learning_rate": 6.99528612053838e-05, "loss": 1.4231, "step": 75520 }, { "epoch": 3.1913052314258135, "grad_norm": 0.20383748412132263, "learning_rate": 6.99379220369202e-05, "loss": 1.4305, "step": 75530 }, { "epoch": 3.1918471725913253, "grad_norm": 0.2849511206150055, "learning_rate": 6.992298101801167e-05, "loss": 1.4264, "step": 75540 }, { "epoch": 3.1923891137568376, "grad_norm": 0.20015236735343933, "learning_rate": 6.990803815050903e-05, "loss": 1.4288, "step": 75550 }, { "epoch": 3.19293105492235, "grad_norm": 0.19224634766578674, "learning_rate": 6.989309343626337e-05, "loss": 1.4244, "step": 75560 }, { "epoch": 3.1934729960878623, "grad_norm": 0.16140466928482056, "learning_rate": 6.987814687712592e-05, "loss": 1.4274, "step": 75570 }, { "epoch": 3.1940149372533746, "grad_norm": 0.1749192178249359, "learning_rate": 6.986319847494826e-05, "loss": 1.4347, "step": 75580 }, { "epoch": 3.1945568784188865, "grad_norm": 0.16757184267044067, "learning_rate": 6.98482482315821e-05, "loss": 1.419, "step": 75590 }, { "epoch": 3.195098819584399, "grad_norm": 0.15887753665447235, "learning_rate": 6.98332961488794e-05, "loss": 1.4307, "step": 75600 }, { "epoch": 3.1952614019340526, "eval_loss": 2.424544095993042, "eval_runtime": 21.9927, "eval_samples_per_second": 227.348, "eval_steps_per_second": 1.228, "step": 75603 }, { "epoch": 3.195640760749911, "grad_norm": 0.3752150237560272, "learning_rate": 6.98183422286924e-05, "loss": 1.421, "step": 75610 }, { "epoch": 3.1961827019154234, "grad_norm": 0.18618738651275635, "learning_rate": 6.980338647287347e-05, "loss": 1.4293, "step": 75620 }, { "epoch": 3.1967246430809357, "grad_norm": 0.19021612405776978, "learning_rate": 6.97884288832753e-05, "loss": 1.4218, "step": 75630 }, { "epoch": 3.1972665842464476, "grad_norm": 0.1741076409816742, "learning_rate": 6.977346946175078e-05, "loss": 1.4211, "step": 75640 }, { "epoch": 3.19780852541196, "grad_norm": 0.1816461980342865, "learning_rate": 6.975850821015298e-05, "loss": 1.4124, "step": 75650 }, { "epoch": 3.1983504665774722, "grad_norm": 0.22760295867919922, "learning_rate": 6.974354513033528e-05, "loss": 1.4251, "step": 75660 }, { "epoch": 3.1988924077429846, "grad_norm": 0.255193293094635, "learning_rate": 6.972858022415119e-05, "loss": 1.4269, "step": 75670 }, { "epoch": 3.1994343489084964, "grad_norm": 0.1877882033586502, "learning_rate": 6.971361349345455e-05, "loss": 1.4093, "step": 75680 }, { "epoch": 3.1999762900740087, "grad_norm": 0.17674243450164795, "learning_rate": 6.969864494009934e-05, "loss": 1.4263, "step": 75690 }, { "epoch": 3.1999762900740087, "eval_loss": 2.420743942260742, "eval_runtime": 21.9872, "eval_samples_per_second": 227.405, "eval_steps_per_second": 1.228, "step": 75690 }, { "epoch": 3.200518231239521, "grad_norm": 0.1850854456424713, "learning_rate": 6.968367456593983e-05, "loss": 1.4203, "step": 75700 }, { "epoch": 3.2010601724050334, "grad_norm": 0.25063949823379517, "learning_rate": 6.966870237283045e-05, "loss": 1.4245, "step": 75710 }, { "epoch": 3.2016021135705457, "grad_norm": 0.23362627625465393, "learning_rate": 6.96537283626259e-05, "loss": 1.4203, "step": 75720 }, { "epoch": 3.2021440547360576, "grad_norm": 0.17846883833408356, "learning_rate": 6.96387525371811e-05, "loss": 1.4251, "step": 75730 }, { "epoch": 3.20268599590157, "grad_norm": 0.2213391810655594, "learning_rate": 6.96237748983512e-05, "loss": 1.4155, "step": 75740 }, { "epoch": 3.203227937067082, "grad_norm": 0.1812654733657837, "learning_rate": 6.960879544799156e-05, "loss": 1.4178, "step": 75750 }, { "epoch": 3.2037698782325945, "grad_norm": 0.18986418843269348, "learning_rate": 6.959381418795775e-05, "loss": 1.4217, "step": 75760 }, { "epoch": 3.2043118193981064, "grad_norm": 0.15823641419410706, "learning_rate": 6.957883112010563e-05, "loss": 1.4217, "step": 75770 }, { "epoch": 3.2046911782139653, "eval_loss": 2.4173123836517334, "eval_runtime": 21.9872, "eval_samples_per_second": 227.405, "eval_steps_per_second": 1.228, "step": 75777 }, { "epoch": 3.2048537605636187, "grad_norm": 0.25696510076522827, "learning_rate": 6.956384624629117e-05, "loss": 1.4149, "step": 75780 }, { "epoch": 3.205395701729131, "grad_norm": 0.18557502329349518, "learning_rate": 6.954885956837067e-05, "loss": 1.4202, "step": 75790 }, { "epoch": 3.2059376428946433, "grad_norm": 0.2925853133201599, "learning_rate": 6.953387108820064e-05, "loss": 1.4157, "step": 75800 }, { "epoch": 3.2064795840601557, "grad_norm": 0.17210768163204193, "learning_rate": 6.951888080763772e-05, "loss": 1.4136, "step": 75810 }, { "epoch": 3.2070215252256675, "grad_norm": 0.17783254384994507, "learning_rate": 6.950388872853891e-05, "loss": 1.4264, "step": 75820 }, { "epoch": 3.20756346639118, "grad_norm": 0.1816641092300415, "learning_rate": 6.948889485276132e-05, "loss": 1.4121, "step": 75830 }, { "epoch": 3.208105407556692, "grad_norm": 0.25515398383140564, "learning_rate": 6.947389918216234e-05, "loss": 1.4282, "step": 75840 }, { "epoch": 3.2086473487222045, "grad_norm": 0.16730128228664398, "learning_rate": 6.94589017185996e-05, "loss": 1.4233, "step": 75850 }, { "epoch": 3.209189289887717, "grad_norm": 0.2866290509700775, "learning_rate": 6.944390246393085e-05, "loss": 1.4215, "step": 75860 }, { "epoch": 3.2094060663539214, "eval_loss": 2.418275833129883, "eval_runtime": 21.9933, "eval_samples_per_second": 227.342, "eval_steps_per_second": 1.228, "step": 75864 }, { "epoch": 3.2097312310532287, "grad_norm": 0.21577335894107819, "learning_rate": 6.942890142001418e-05, "loss": 1.4179, "step": 75870 }, { "epoch": 3.210273172218741, "grad_norm": 0.20668751001358032, "learning_rate": 6.941389858870785e-05, "loss": 1.4216, "step": 75880 }, { "epoch": 3.2108151133842533, "grad_norm": 0.23815388977527618, "learning_rate": 6.939889397187034e-05, "loss": 1.4147, "step": 75890 }, { "epoch": 3.2113570545497656, "grad_norm": 0.18798255920410156, "learning_rate": 6.938388757136036e-05, "loss": 1.4145, "step": 75900 }, { "epoch": 3.2118989957152775, "grad_norm": 0.24447955191135406, "learning_rate": 6.936887938903684e-05, "loss": 1.43, "step": 75910 }, { "epoch": 3.21244093688079, "grad_norm": 0.288022518157959, "learning_rate": 6.935386942675892e-05, "loss": 1.4171, "step": 75920 }, { "epoch": 3.212982878046302, "grad_norm": 0.22881217300891876, "learning_rate": 6.933885768638598e-05, "loss": 1.4228, "step": 75930 }, { "epoch": 3.2135248192118144, "grad_norm": 0.20432133972644806, "learning_rate": 6.932384416977759e-05, "loss": 1.4242, "step": 75940 }, { "epoch": 3.2140667603773263, "grad_norm": 0.21099141240119934, "learning_rate": 6.930882887879359e-05, "loss": 1.4182, "step": 75950 }, { "epoch": 3.2141209544938776, "eval_loss": 2.428762674331665, "eval_runtime": 21.9941, "eval_samples_per_second": 227.334, "eval_steps_per_second": 1.228, "step": 75951 }, { "epoch": 3.2146087015428386, "grad_norm": 0.19293814897537231, "learning_rate": 6.929381181529399e-05, "loss": 1.4111, "step": 75960 }, { "epoch": 3.215150642708351, "grad_norm": 0.209365114569664, "learning_rate": 6.927879298113901e-05, "loss": 1.4083, "step": 75970 }, { "epoch": 3.2156925838738633, "grad_norm": 0.20382948219776154, "learning_rate": 6.926377237818917e-05, "loss": 1.4229, "step": 75980 }, { "epoch": 3.2162345250393756, "grad_norm": 0.18336911499500275, "learning_rate": 6.924875000830513e-05, "loss": 1.4244, "step": 75990 }, { "epoch": 3.2167764662048874, "grad_norm": 0.18866483867168427, "learning_rate": 6.92337258733478e-05, "loss": 1.4241, "step": 76000 }, { "epoch": 3.2173184073703998, "grad_norm": 0.21788087487220764, "learning_rate": 6.921869997517834e-05, "loss": 1.4319, "step": 76010 }, { "epoch": 3.217860348535912, "grad_norm": 0.1925356537103653, "learning_rate": 6.9203672315658e-05, "loss": 1.4204, "step": 76020 }, { "epoch": 3.2184022897014244, "grad_norm": 0.24953441321849823, "learning_rate": 6.918864289664845e-05, "loss": 1.4209, "step": 76030 }, { "epoch": 3.218835842633834, "eval_loss": 2.4290771484375, "eval_runtime": 21.9942, "eval_samples_per_second": 227.333, "eval_steps_per_second": 1.228, "step": 76038 }, { "epoch": 3.2189442308669367, "grad_norm": 0.18474015593528748, "learning_rate": 6.917361172001139e-05, "loss": 1.4223, "step": 76040 }, { "epoch": 3.2194861720324486, "grad_norm": 0.27110907435417175, "learning_rate": 6.915857878760885e-05, "loss": 1.4256, "step": 76050 }, { "epoch": 3.220028113197961, "grad_norm": 0.19559454917907715, "learning_rate": 6.914354410130305e-05, "loss": 1.4299, "step": 76060 }, { "epoch": 3.220570054363473, "grad_norm": 0.16140873730182648, "learning_rate": 6.912850766295641e-05, "loss": 1.4068, "step": 76070 }, { "epoch": 3.2211119955289855, "grad_norm": 0.17691193521022797, "learning_rate": 6.911346947443157e-05, "loss": 1.4216, "step": 76080 }, { "epoch": 3.2216539366944974, "grad_norm": 0.173243448138237, "learning_rate": 6.909842953759142e-05, "loss": 1.4172, "step": 76090 }, { "epoch": 3.2221958778600097, "grad_norm": 0.3001802861690521, "learning_rate": 6.908338785429901e-05, "loss": 1.4205, "step": 76100 }, { "epoch": 3.222737819025522, "grad_norm": 0.21718403697013855, "learning_rate": 6.906834442641768e-05, "loss": 1.4201, "step": 76110 }, { "epoch": 3.2232797601910343, "grad_norm": 0.19174130260944366, "learning_rate": 6.905329925581089e-05, "loss": 1.4161, "step": 76120 }, { "epoch": 3.2235507307737903, "eval_loss": 2.4190409183502197, "eval_runtime": 21.9914, "eval_samples_per_second": 227.361, "eval_steps_per_second": 1.228, "step": 76125 }, { "epoch": 3.2238217013565467, "grad_norm": 0.28363704681396484, "learning_rate": 6.903825234434243e-05, "loss": 1.4222, "step": 76130 }, { "epoch": 3.2243636425220585, "grad_norm": 0.21950308978557587, "learning_rate": 6.902320369387621e-05, "loss": 1.4202, "step": 76140 }, { "epoch": 3.224905583687571, "grad_norm": 0.22058185935020447, "learning_rate": 6.90081533062764e-05, "loss": 1.4147, "step": 76150 }, { "epoch": 3.225447524853083, "grad_norm": 0.354383260011673, "learning_rate": 6.899310118340736e-05, "loss": 1.4167, "step": 76160 }, { "epoch": 3.2259894660185955, "grad_norm": 0.3099483251571655, "learning_rate": 6.897804732713371e-05, "loss": 1.4339, "step": 76170 }, { "epoch": 3.2265314071841074, "grad_norm": 0.33063191175460815, "learning_rate": 6.896299173932023e-05, "loss": 1.4213, "step": 76180 }, { "epoch": 3.2270733483496197, "grad_norm": 0.17252486944198608, "learning_rate": 6.894793442183199e-05, "loss": 1.4295, "step": 76190 }, { "epoch": 3.227615289515132, "grad_norm": 0.20971451699733734, "learning_rate": 6.893287537653417e-05, "loss": 1.4133, "step": 76200 }, { "epoch": 3.2281572306806443, "grad_norm": 0.16854190826416016, "learning_rate": 6.891781460529223e-05, "loss": 1.4208, "step": 76210 }, { "epoch": 3.228265618913747, "eval_loss": 2.4187822341918945, "eval_runtime": 21.9892, "eval_samples_per_second": 227.385, "eval_steps_per_second": 1.228, "step": 76212 }, { "epoch": 3.2286991718461566, "grad_norm": 0.1864662617444992, "learning_rate": 6.890275210997185e-05, "loss": 1.4251, "step": 76220 }, { "epoch": 3.2292411130116685, "grad_norm": 0.15556700527668, "learning_rate": 6.888768789243889e-05, "loss": 1.4134, "step": 76230 }, { "epoch": 3.229783054177181, "grad_norm": 0.17359033226966858, "learning_rate": 6.887262195455946e-05, "loss": 1.4146, "step": 76240 }, { "epoch": 3.230324995342693, "grad_norm": 0.29231011867523193, "learning_rate": 6.885755429819985e-05, "loss": 1.4274, "step": 76250 }, { "epoch": 3.2308669365082054, "grad_norm": 0.16642163693904877, "learning_rate": 6.884248492522656e-05, "loss": 1.4075, "step": 76260 }, { "epoch": 3.2314088776737178, "grad_norm": 0.4484625458717346, "learning_rate": 6.882741383750635e-05, "loss": 1.4122, "step": 76270 }, { "epoch": 3.2319508188392296, "grad_norm": 0.14720787107944489, "learning_rate": 6.881234103690616e-05, "loss": 1.4155, "step": 76280 }, { "epoch": 3.232492760004742, "grad_norm": 0.19290123879909515, "learning_rate": 6.87972665252931e-05, "loss": 1.4226, "step": 76290 }, { "epoch": 3.232980507053703, "eval_loss": 2.4077162742614746, "eval_runtime": 21.993, "eval_samples_per_second": 227.345, "eval_steps_per_second": 1.228, "step": 76299 }, { "epoch": 3.2330347011702543, "grad_norm": 0.18437008559703827, "learning_rate": 6.878219030453459e-05, "loss": 1.4208, "step": 76300 }, { "epoch": 3.2335766423357666, "grad_norm": 0.1884869635105133, "learning_rate": 6.876711237649816e-05, "loss": 1.416, "step": 76310 }, { "epoch": 3.2341185835012785, "grad_norm": 0.19494092464447021, "learning_rate": 6.875203274305163e-05, "loss": 1.4133, "step": 76320 }, { "epoch": 3.2346605246667908, "grad_norm": 0.16780586540699005, "learning_rate": 6.8736951406063e-05, "loss": 1.4222, "step": 76330 }, { "epoch": 3.235202465832303, "grad_norm": 0.24331122636795044, "learning_rate": 6.872186836740046e-05, "loss": 1.4253, "step": 76340 }, { "epoch": 3.2357444069978154, "grad_norm": 0.2462978959083557, "learning_rate": 6.870678362893243e-05, "loss": 1.4215, "step": 76350 }, { "epoch": 3.2362863481633277, "grad_norm": 0.3127591907978058, "learning_rate": 6.869169719252756e-05, "loss": 1.4199, "step": 76360 }, { "epoch": 3.2368282893288396, "grad_norm": 0.19799236953258514, "learning_rate": 6.867660906005467e-05, "loss": 1.4101, "step": 76370 }, { "epoch": 3.237370230494352, "grad_norm": 0.4551452696323395, "learning_rate": 6.866151923338286e-05, "loss": 1.4229, "step": 76380 }, { "epoch": 3.237695395193659, "eval_loss": 2.4275617599487305, "eval_runtime": 21.99, "eval_samples_per_second": 227.376, "eval_steps_per_second": 1.228, "step": 76386 }, { "epoch": 3.2379121716598642, "grad_norm": 0.19475868344306946, "learning_rate": 6.864642771438136e-05, "loss": 1.4142, "step": 76390 }, { "epoch": 3.2384541128253765, "grad_norm": 0.5060222148895264, "learning_rate": 6.863133450491961e-05, "loss": 1.4153, "step": 76400 }, { "epoch": 3.2389960539908884, "grad_norm": 0.2045438289642334, "learning_rate": 6.861623960686734e-05, "loss": 1.413, "step": 76410 }, { "epoch": 3.2395379951564007, "grad_norm": 0.24009686708450317, "learning_rate": 6.860114302209443e-05, "loss": 1.4246, "step": 76420 }, { "epoch": 3.240079936321913, "grad_norm": 0.1824415624141693, "learning_rate": 6.858604475247097e-05, "loss": 1.4248, "step": 76430 }, { "epoch": 3.2406218774874254, "grad_norm": 0.19345873594284058, "learning_rate": 6.857094479986726e-05, "loss": 1.4301, "step": 76440 }, { "epoch": 3.2411638186529377, "grad_norm": 0.17767898738384247, "learning_rate": 6.855584316615384e-05, "loss": 1.4193, "step": 76450 }, { "epoch": 3.2417057598184496, "grad_norm": 0.24474495649337769, "learning_rate": 6.854073985320141e-05, "loss": 1.4279, "step": 76460 }, { "epoch": 3.242247700983962, "grad_norm": 0.2586842179298401, "learning_rate": 6.852563486288093e-05, "loss": 1.4149, "step": 76470 }, { "epoch": 3.2424102833336157, "eval_loss": 2.4281370639801025, "eval_runtime": 21.9931, "eval_samples_per_second": 227.344, "eval_steps_per_second": 1.228, "step": 76473 }, { "epoch": 3.242789642149474, "grad_norm": 0.20308630168437958, "learning_rate": 6.85105281970635e-05, "loss": 1.4204, "step": 76480 }, { "epoch": 3.2433315833149865, "grad_norm": 0.17210350930690765, "learning_rate": 6.849541985762053e-05, "loss": 1.4308, "step": 76490 }, { "epoch": 3.243873524480499, "grad_norm": 0.351959228515625, "learning_rate": 6.848030984642351e-05, "loss": 1.4203, "step": 76500 }, { "epoch": 3.2444154656460107, "grad_norm": 0.22208164632320404, "learning_rate": 6.846519816534423e-05, "loss": 1.4174, "step": 76510 }, { "epoch": 3.244957406811523, "grad_norm": 0.3489255905151367, "learning_rate": 6.845008481625468e-05, "loss": 1.4243, "step": 76520 }, { "epoch": 3.2454993479770353, "grad_norm": 0.20681335031986237, "learning_rate": 6.843496980102702e-05, "loss": 1.4338, "step": 76530 }, { "epoch": 3.2460412891425476, "grad_norm": 0.20921902358531952, "learning_rate": 6.841985312153362e-05, "loss": 1.4329, "step": 76540 }, { "epoch": 3.2465832303080595, "grad_norm": 0.2758723199367523, "learning_rate": 6.840473477964707e-05, "loss": 1.4197, "step": 76550 }, { "epoch": 3.247125171473572, "grad_norm": 0.19785663485527039, "learning_rate": 6.838961477724018e-05, "loss": 1.4155, "step": 76560 }, { "epoch": 3.247125171473572, "eval_loss": 2.4455416202545166, "eval_runtime": 21.9858, "eval_samples_per_second": 227.419, "eval_steps_per_second": 1.228, "step": 76560 }, { "epoch": 3.247667112639084, "grad_norm": 0.16675598919391632, "learning_rate": 6.837449311618595e-05, "loss": 1.4148, "step": 76570 }, { "epoch": 3.2482090538045965, "grad_norm": 0.2686634957790375, "learning_rate": 6.835936979835757e-05, "loss": 1.4247, "step": 76580 }, { "epoch": 3.2487509949701083, "grad_norm": 0.16704584658145905, "learning_rate": 6.834424482562846e-05, "loss": 1.412, "step": 76590 }, { "epoch": 3.2492929361356206, "grad_norm": 0.2601518929004669, "learning_rate": 6.832911819987224e-05, "loss": 1.4192, "step": 76600 }, { "epoch": 3.249834877301133, "grad_norm": 0.21344710886478424, "learning_rate": 6.831398992296273e-05, "loss": 1.4149, "step": 76610 }, { "epoch": 3.2503768184666453, "grad_norm": 0.267600953578949, "learning_rate": 6.829885999677395e-05, "loss": 1.4142, "step": 76620 }, { "epoch": 3.2509187596321576, "grad_norm": 0.1773858517408371, "learning_rate": 6.828372842318013e-05, "loss": 1.4171, "step": 76630 }, { "epoch": 3.2514607007976695, "grad_norm": 0.1974133402109146, "learning_rate": 6.826859520405572e-05, "loss": 1.4088, "step": 76640 }, { "epoch": 3.2518400596135284, "eval_loss": 2.4362905025482178, "eval_runtime": 21.9908, "eval_samples_per_second": 227.368, "eval_steps_per_second": 1.228, "step": 76647 }, { "epoch": 3.252002641963182, "grad_norm": 0.3224498927593231, "learning_rate": 6.825346034127531e-05, "loss": 1.417, "step": 76650 }, { "epoch": 3.252544583128694, "grad_norm": 0.32483381032943726, "learning_rate": 6.823832383671379e-05, "loss": 1.4267, "step": 76660 }, { "epoch": 3.2530865242942064, "grad_norm": 0.1879141479730606, "learning_rate": 6.82231856922462e-05, "loss": 1.4178, "step": 76670 }, { "epoch": 3.2536284654597187, "grad_norm": 0.21414776146411896, "learning_rate": 6.820804590974776e-05, "loss": 1.4175, "step": 76680 }, { "epoch": 3.2541704066252306, "grad_norm": 0.19383347034454346, "learning_rate": 6.819290449109395e-05, "loss": 1.4178, "step": 76690 }, { "epoch": 3.254712347790743, "grad_norm": 0.17373976111412048, "learning_rate": 6.81777614381604e-05, "loss": 1.4182, "step": 76700 }, { "epoch": 3.2552542889562552, "grad_norm": 0.18311308324337006, "learning_rate": 6.816261675282297e-05, "loss": 1.4162, "step": 76710 }, { "epoch": 3.2557962301217676, "grad_norm": 0.1700367033481598, "learning_rate": 6.814747043695772e-05, "loss": 1.4163, "step": 76720 }, { "epoch": 3.25633817128728, "grad_norm": 0.4533218443393707, "learning_rate": 6.813232249244093e-05, "loss": 1.4266, "step": 76730 }, { "epoch": 3.2565549477534845, "eval_loss": 2.439307689666748, "eval_runtime": 21.9896, "eval_samples_per_second": 227.38, "eval_steps_per_second": 1.228, "step": 76734 }, { "epoch": 3.2568801124527917, "grad_norm": 0.2232131063938141, "learning_rate": 6.811717292114904e-05, "loss": 1.4208, "step": 76740 }, { "epoch": 3.257422053618304, "grad_norm": 0.24600377678871155, "learning_rate": 6.810202172495873e-05, "loss": 1.4128, "step": 76750 }, { "epoch": 3.2579639947838164, "grad_norm": 0.1848914921283722, "learning_rate": 6.808686890574683e-05, "loss": 1.4298, "step": 76760 }, { "epoch": 3.2585059359493287, "grad_norm": 0.2184385061264038, "learning_rate": 6.807171446539042e-05, "loss": 1.407, "step": 76770 }, { "epoch": 3.2590478771148406, "grad_norm": 0.18162274360656738, "learning_rate": 6.80565584057668e-05, "loss": 1.4158, "step": 76780 }, { "epoch": 3.259589818280353, "grad_norm": 0.18968087434768677, "learning_rate": 6.804140072875338e-05, "loss": 1.416, "step": 76790 }, { "epoch": 3.260131759445865, "grad_norm": 0.19787006080150604, "learning_rate": 6.802624143622789e-05, "loss": 1.4232, "step": 76800 }, { "epoch": 3.2606737006113775, "grad_norm": 0.19424383342266083, "learning_rate": 6.801108053006815e-05, "loss": 1.4144, "step": 76810 }, { "epoch": 3.2612156417768894, "grad_norm": 0.23365655541419983, "learning_rate": 6.799591801215223e-05, "loss": 1.4212, "step": 76820 }, { "epoch": 3.2612698358934407, "eval_loss": 2.4371750354766846, "eval_runtime": 21.9951, "eval_samples_per_second": 227.324, "eval_steps_per_second": 1.228, "step": 76821 }, { "epoch": 3.2617575829424017, "grad_norm": 0.20863786339759827, "learning_rate": 6.798075388435845e-05, "loss": 1.4189, "step": 76830 }, { "epoch": 3.262299524107914, "grad_norm": 0.18672865629196167, "learning_rate": 6.79655881485652e-05, "loss": 1.4129, "step": 76840 }, { "epoch": 3.2628414652734263, "grad_norm": 0.3191605508327484, "learning_rate": 6.795042080665117e-05, "loss": 1.4204, "step": 76850 }, { "epoch": 3.2633834064389386, "grad_norm": 0.1636675000190735, "learning_rate": 6.793525186049527e-05, "loss": 1.422, "step": 76860 }, { "epoch": 3.2639253476044505, "grad_norm": 0.2027965933084488, "learning_rate": 6.792008131197651e-05, "loss": 1.4215, "step": 76870 }, { "epoch": 3.264467288769963, "grad_norm": 0.1797472983598709, "learning_rate": 6.790490916297419e-05, "loss": 1.4197, "step": 76880 }, { "epoch": 3.265009229935475, "grad_norm": 0.2633003294467926, "learning_rate": 6.788973541536772e-05, "loss": 1.418, "step": 76890 }, { "epoch": 3.2655511711009875, "grad_norm": 0.30197060108184814, "learning_rate": 6.78745600710368e-05, "loss": 1.4259, "step": 76900 }, { "epoch": 3.2659847240333972, "eval_loss": 2.4383397102355957, "eval_runtime": 21.995, "eval_samples_per_second": 227.325, "eval_steps_per_second": 1.228, "step": 76908 }, { "epoch": 3.2660931122665, "grad_norm": 0.21342779695987701, "learning_rate": 6.785938313186128e-05, "loss": 1.4201, "step": 76910 }, { "epoch": 3.2666350534320117, "grad_norm": 0.18133488297462463, "learning_rate": 6.784420459972122e-05, "loss": 1.4201, "step": 76920 }, { "epoch": 3.267176994597524, "grad_norm": 0.2161795049905777, "learning_rate": 6.782902447649684e-05, "loss": 1.4147, "step": 76930 }, { "epoch": 3.2677189357630363, "grad_norm": 0.22516366839408875, "learning_rate": 6.78138427640686e-05, "loss": 1.4197, "step": 76940 }, { "epoch": 3.2682608769285486, "grad_norm": 0.20277120172977448, "learning_rate": 6.779865946431716e-05, "loss": 1.4132, "step": 76950 }, { "epoch": 3.2688028180940605, "grad_norm": 0.2785748243331909, "learning_rate": 6.778347457912335e-05, "loss": 1.4275, "step": 76960 }, { "epoch": 3.269344759259573, "grad_norm": 0.2544669210910797, "learning_rate": 6.776828811036821e-05, "loss": 1.4225, "step": 76970 }, { "epoch": 3.269886700425085, "grad_norm": 0.16335433721542358, "learning_rate": 6.775310005993297e-05, "loss": 1.4102, "step": 76980 }, { "epoch": 3.2704286415905974, "grad_norm": 0.249395951628685, "learning_rate": 6.773791042969907e-05, "loss": 1.4151, "step": 76990 }, { "epoch": 3.2706996121733534, "eval_loss": 2.4316024780273438, "eval_runtime": 21.9962, "eval_samples_per_second": 227.312, "eval_steps_per_second": 1.227, "step": 76995 }, { "epoch": 3.2709705827561093, "grad_norm": 0.17765724658966064, "learning_rate": 6.772271922154814e-05, "loss": 1.4182, "step": 77000 }, { "epoch": 3.2715125239216216, "grad_norm": 0.2077697366476059, "learning_rate": 6.770752643736196e-05, "loss": 1.4264, "step": 77010 }, { "epoch": 3.272054465087134, "grad_norm": 0.18246911466121674, "learning_rate": 6.769233207902261e-05, "loss": 1.4242, "step": 77020 }, { "epoch": 3.2725964062526463, "grad_norm": 0.3925098478794098, "learning_rate": 6.767713614841223e-05, "loss": 1.4246, "step": 77030 }, { "epoch": 3.2731383474181586, "grad_norm": 0.21297501027584076, "learning_rate": 6.766193864741327e-05, "loss": 1.4159, "step": 77040 }, { "epoch": 3.2736802885836704, "grad_norm": 0.19493433833122253, "learning_rate": 6.764673957790834e-05, "loss": 1.4209, "step": 77050 }, { "epoch": 3.2742222297491828, "grad_norm": 0.26771730184555054, "learning_rate": 6.763153894178022e-05, "loss": 1.4178, "step": 77060 }, { "epoch": 3.274764170914695, "grad_norm": 0.2222745716571808, "learning_rate": 6.761633674091187e-05, "loss": 1.4288, "step": 77070 }, { "epoch": 3.2753061120802074, "grad_norm": 0.27417925000190735, "learning_rate": 6.760113297718653e-05, "loss": 1.4272, "step": 77080 }, { "epoch": 3.2754145003133095, "eval_loss": 2.4333410263061523, "eval_runtime": 21.988, "eval_samples_per_second": 227.397, "eval_steps_per_second": 1.228, "step": 77082 }, { "epoch": 3.2758480532457197, "grad_norm": 0.4127950966358185, "learning_rate": 6.758592765248752e-05, "loss": 1.4098, "step": 77090 }, { "epoch": 3.2763899944112316, "grad_norm": 0.2694561779499054, "learning_rate": 6.757072076869845e-05, "loss": 1.4175, "step": 77100 }, { "epoch": 3.276931935576744, "grad_norm": 0.4011683166027069, "learning_rate": 6.755551232770306e-05, "loss": 1.4237, "step": 77110 }, { "epoch": 3.277473876742256, "grad_norm": 0.20086224377155304, "learning_rate": 6.754030233138533e-05, "loss": 1.4171, "step": 77120 }, { "epoch": 3.2780158179077685, "grad_norm": 0.2316787838935852, "learning_rate": 6.752509078162938e-05, "loss": 1.4164, "step": 77130 }, { "epoch": 3.278557759073281, "grad_norm": 0.259022980928421, "learning_rate": 6.750987768031954e-05, "loss": 1.4197, "step": 77140 }, { "epoch": 3.2790997002387927, "grad_norm": 0.42053425312042236, "learning_rate": 6.749466302934042e-05, "loss": 1.4367, "step": 77150 }, { "epoch": 3.279641641404305, "grad_norm": 0.1725689321756363, "learning_rate": 6.747944683057666e-05, "loss": 1.4162, "step": 77160 }, { "epoch": 3.280129388453266, "eval_loss": 2.433736801147461, "eval_runtime": 21.9879, "eval_samples_per_second": 227.397, "eval_steps_per_second": 1.228, "step": 77169 }, { "epoch": 3.2801835825698173, "grad_norm": 0.183704674243927, "learning_rate": 6.746422908591318e-05, "loss": 1.413, "step": 77170 }, { "epoch": 3.2807255237353297, "grad_norm": 0.2713957130908966, "learning_rate": 6.744900979723515e-05, "loss": 1.4109, "step": 77180 }, { "epoch": 3.2812674649008415, "grad_norm": 0.18868058919906616, "learning_rate": 6.743378896642781e-05, "loss": 1.4193, "step": 77190 }, { "epoch": 3.281809406066354, "grad_norm": 0.22179383039474487, "learning_rate": 6.741856659537669e-05, "loss": 1.4177, "step": 77200 }, { "epoch": 3.282351347231866, "grad_norm": 0.18624810874462128, "learning_rate": 6.740334268596746e-05, "loss": 1.4239, "step": 77210 }, { "epoch": 3.2828932883973785, "grad_norm": 0.2454560250043869, "learning_rate": 6.738811724008598e-05, "loss": 1.425, "step": 77220 }, { "epoch": 3.2834352295628904, "grad_norm": 0.26571282744407654, "learning_rate": 6.737289025961835e-05, "loss": 1.4224, "step": 77230 }, { "epoch": 3.2839771707284027, "grad_norm": 0.19458989799022675, "learning_rate": 6.735766174645075e-05, "loss": 1.4143, "step": 77240 }, { "epoch": 3.284519111893915, "grad_norm": 0.2220762073993683, "learning_rate": 6.734243170246968e-05, "loss": 1.4226, "step": 77250 }, { "epoch": 3.284844276593222, "eval_loss": 2.4338366985321045, "eval_runtime": 22.2711, "eval_samples_per_second": 224.507, "eval_steps_per_second": 1.212, "step": 77256 }, { "epoch": 3.2850610530594273, "grad_norm": 0.1980799287557602, "learning_rate": 6.732720012956175e-05, "loss": 1.4256, "step": 77260 }, { "epoch": 3.2856029942249396, "grad_norm": 0.2740095555782318, "learning_rate": 6.731196702961381e-05, "loss": 1.4159, "step": 77270 }, { "epoch": 3.2861449353904515, "grad_norm": 0.16144628822803497, "learning_rate": 6.729673240451283e-05, "loss": 1.4139, "step": 77280 }, { "epoch": 3.286686876555964, "grad_norm": 0.20206600427627563, "learning_rate": 6.728149625614602e-05, "loss": 1.4197, "step": 77290 }, { "epoch": 3.287228817721476, "grad_norm": 0.2360743135213852, "learning_rate": 6.726625858640078e-05, "loss": 1.422, "step": 77300 }, { "epoch": 3.2877707588869884, "grad_norm": 0.2006218433380127, "learning_rate": 6.72510193971647e-05, "loss": 1.4153, "step": 77310 }, { "epoch": 3.2883127000525008, "grad_norm": 0.2214326560497284, "learning_rate": 6.72357786903255e-05, "loss": 1.4235, "step": 77320 }, { "epoch": 3.2888546412180126, "grad_norm": 0.183137446641922, "learning_rate": 6.722053646777116e-05, "loss": 1.419, "step": 77330 }, { "epoch": 3.289396582383525, "grad_norm": 0.18729977309703827, "learning_rate": 6.720529273138983e-05, "loss": 1.4225, "step": 77340 }, { "epoch": 3.2895591647331788, "eval_loss": 2.433459758758545, "eval_runtime": 21.993, "eval_samples_per_second": 227.345, "eval_steps_per_second": 1.228, "step": 77343 }, { "epoch": 3.2899385235490373, "grad_norm": 0.19643135368824005, "learning_rate": 6.719004748306982e-05, "loss": 1.4253, "step": 77350 }, { "epoch": 3.2904804647145496, "grad_norm": 0.21529223024845123, "learning_rate": 6.717480072469967e-05, "loss": 1.4184, "step": 77360 }, { "epoch": 3.2910224058800615, "grad_norm": 0.1587986797094345, "learning_rate": 6.715955245816804e-05, "loss": 1.4235, "step": 77370 }, { "epoch": 3.2915643470455738, "grad_norm": 0.23578478395938873, "learning_rate": 6.714430268536384e-05, "loss": 1.4172, "step": 77380 }, { "epoch": 3.292106288211086, "grad_norm": 0.33754247426986694, "learning_rate": 6.712905140817616e-05, "loss": 1.4207, "step": 77390 }, { "epoch": 3.2926482293765984, "grad_norm": 0.20548036694526672, "learning_rate": 6.711379862849426e-05, "loss": 1.4004, "step": 77400 }, { "epoch": 3.2931901705421103, "grad_norm": 0.1700463443994522, "learning_rate": 6.709854434820757e-05, "loss": 1.419, "step": 77410 }, { "epoch": 3.2937321117076226, "grad_norm": 0.381428062915802, "learning_rate": 6.708328856920574e-05, "loss": 1.4164, "step": 77420 }, { "epoch": 3.294274052873135, "grad_norm": 0.19036360085010529, "learning_rate": 6.706803129337856e-05, "loss": 1.4059, "step": 77430 }, { "epoch": 3.294274052873135, "eval_loss": 2.4337708950042725, "eval_runtime": 22.2936, "eval_samples_per_second": 224.279, "eval_steps_per_second": 1.211, "step": 77430 }, { "epoch": 3.2948159940386472, "grad_norm": 0.2967754900455475, "learning_rate": 6.705277252261608e-05, "loss": 1.4183, "step": 77440 }, { "epoch": 3.2953579352041595, "grad_norm": 0.18681344389915466, "learning_rate": 6.703751225880847e-05, "loss": 1.4228, "step": 77450 }, { "epoch": 3.2958998763696714, "grad_norm": 0.15400539338588715, "learning_rate": 6.70222505038461e-05, "loss": 1.4108, "step": 77460 }, { "epoch": 3.2964418175351837, "grad_norm": 0.16726654767990112, "learning_rate": 6.700698725961952e-05, "loss": 1.4053, "step": 77470 }, { "epoch": 3.296983758700696, "grad_norm": 0.2558479309082031, "learning_rate": 6.699172252801948e-05, "loss": 1.4146, "step": 77480 }, { "epoch": 3.2975256998662084, "grad_norm": 0.16007401049137115, "learning_rate": 6.697645631093694e-05, "loss": 1.4116, "step": 77490 }, { "epoch": 3.2980676410317207, "grad_norm": 0.23393070697784424, "learning_rate": 6.696118861026297e-05, "loss": 1.4125, "step": 77500 }, { "epoch": 3.2986095821972325, "grad_norm": 0.18870221078395844, "learning_rate": 6.69459194278889e-05, "loss": 1.4135, "step": 77510 }, { "epoch": 3.298988941013091, "eval_loss": 2.4385015964508057, "eval_runtime": 21.9917, "eval_samples_per_second": 227.358, "eval_steps_per_second": 1.228, "step": 77517 }, { "epoch": 3.299151523362745, "grad_norm": 0.2689937651157379, "learning_rate": 6.69306487657062e-05, "loss": 1.4227, "step": 77520 }, { "epoch": 3.299693464528257, "grad_norm": 0.28837496042251587, "learning_rate": 6.691537662560651e-05, "loss": 1.4243, "step": 77530 }, { "epoch": 3.3002354056937695, "grad_norm": 0.18641532957553864, "learning_rate": 6.69001030094817e-05, "loss": 1.4236, "step": 77540 }, { "epoch": 3.300777346859282, "grad_norm": 0.18917597830295563, "learning_rate": 6.68848279192238e-05, "loss": 1.4141, "step": 77550 }, { "epoch": 3.3013192880247937, "grad_norm": 0.201051265001297, "learning_rate": 6.6869551356725e-05, "loss": 1.422, "step": 77560 }, { "epoch": 3.301861229190306, "grad_norm": 0.18273517489433289, "learning_rate": 6.685427332387774e-05, "loss": 1.4275, "step": 77570 }, { "epoch": 3.3024031703558183, "grad_norm": 0.2597062587738037, "learning_rate": 6.683899382257454e-05, "loss": 1.426, "step": 77580 }, { "epoch": 3.3029451115213306, "grad_norm": 0.33226585388183594, "learning_rate": 6.682371285470819e-05, "loss": 1.4262, "step": 77590 }, { "epoch": 3.3034870526868425, "grad_norm": 0.22054164111614227, "learning_rate": 6.680843042217165e-05, "loss": 1.4183, "step": 77600 }, { "epoch": 3.3037038291530476, "eval_loss": 2.4387526512145996, "eval_runtime": 21.991, "eval_samples_per_second": 227.366, "eval_steps_per_second": 1.228, "step": 77604 }, { "epoch": 3.304028993852355, "grad_norm": 0.19982264935970306, "learning_rate": 6.679314652685798e-05, "loss": 1.4194, "step": 77610 }, { "epoch": 3.304570935017867, "grad_norm": 0.3592228293418884, "learning_rate": 6.677786117066054e-05, "loss": 1.4139, "step": 77620 }, { "epoch": 3.3051128761833795, "grad_norm": 0.27388885617256165, "learning_rate": 6.676257435547279e-05, "loss": 1.4126, "step": 77630 }, { "epoch": 3.3056548173488913, "grad_norm": 0.165022611618042, "learning_rate": 6.674728608318839e-05, "loss": 1.426, "step": 77640 }, { "epoch": 3.3061967585144036, "grad_norm": 0.28118664026260376, "learning_rate": 6.67319963557012e-05, "loss": 1.4054, "step": 77650 }, { "epoch": 3.306738699679916, "grad_norm": 0.16331221163272858, "learning_rate": 6.671670517490525e-05, "loss": 1.419, "step": 77660 }, { "epoch": 3.3072806408454283, "grad_norm": 0.16730548441410065, "learning_rate": 6.67014125426947e-05, "loss": 1.4121, "step": 77670 }, { "epoch": 3.3078225820109406, "grad_norm": 0.22583581507205963, "learning_rate": 6.668611846096397e-05, "loss": 1.4194, "step": 77680 }, { "epoch": 3.3083645231764525, "grad_norm": 0.195210799574852, "learning_rate": 6.667082293160766e-05, "loss": 1.4119, "step": 77690 }, { "epoch": 3.3084187172930037, "eval_loss": 2.4286701679229736, "eval_runtime": 21.9887, "eval_samples_per_second": 227.39, "eval_steps_per_second": 1.228, "step": 77691 }, { "epoch": 3.308906464341965, "grad_norm": 0.18108178675174713, "learning_rate": 6.665552595652043e-05, "loss": 1.4121, "step": 77700 }, { "epoch": 3.309448405507477, "grad_norm": 0.2247098833322525, "learning_rate": 6.664022753759728e-05, "loss": 1.4101, "step": 77710 }, { "epoch": 3.3099903466729894, "grad_norm": 0.3786323666572571, "learning_rate": 6.662492767673325e-05, "loss": 1.4119, "step": 77720 }, { "epoch": 3.3105322878385017, "grad_norm": 0.19558481872081757, "learning_rate": 6.660962637582366e-05, "loss": 1.4074, "step": 77730 }, { "epoch": 3.3110742290040136, "grad_norm": 0.2246645838022232, "learning_rate": 6.659432363676397e-05, "loss": 1.4203, "step": 77740 }, { "epoch": 3.311616170169526, "grad_norm": 0.18004010617733002, "learning_rate": 6.65790194614498e-05, "loss": 1.4246, "step": 77750 }, { "epoch": 3.3121581113350382, "grad_norm": 0.19459094107151031, "learning_rate": 6.656371385177697e-05, "loss": 1.4134, "step": 77760 }, { "epoch": 3.3127000525005506, "grad_norm": 0.16068406403064728, "learning_rate": 6.654840680964148e-05, "loss": 1.4124, "step": 77770 }, { "epoch": 3.3131336054329603, "eval_loss": 2.4155197143554688, "eval_runtime": 21.9967, "eval_samples_per_second": 227.307, "eval_steps_per_second": 1.227, "step": 77778 }, { "epoch": 3.313241993666063, "grad_norm": 0.19042399525642395, "learning_rate": 6.653309833693947e-05, "loss": 1.4129, "step": 77780 }, { "epoch": 3.3137839348315747, "grad_norm": 0.17300648987293243, "learning_rate": 6.651778843556734e-05, "loss": 1.4206, "step": 77790 }, { "epoch": 3.314325875997087, "grad_norm": 0.23803412914276123, "learning_rate": 6.650247710742156e-05, "loss": 1.4187, "step": 77800 }, { "epoch": 3.3148678171625994, "grad_norm": 0.26673057675361633, "learning_rate": 6.648716435439887e-05, "loss": 1.4223, "step": 77810 }, { "epoch": 3.3154097583281117, "grad_norm": 0.25232914090156555, "learning_rate": 6.647185017839612e-05, "loss": 1.4211, "step": 77820 }, { "epoch": 3.3159516994936236, "grad_norm": 0.22854040563106537, "learning_rate": 6.645653458131037e-05, "loss": 1.4249, "step": 77830 }, { "epoch": 3.316493640659136, "grad_norm": 0.1900588870048523, "learning_rate": 6.644121756503888e-05, "loss": 1.4148, "step": 77840 }, { "epoch": 3.317035581824648, "grad_norm": 0.23278219997882843, "learning_rate": 6.642589913147902e-05, "loss": 1.4233, "step": 77850 }, { "epoch": 3.3175775229901605, "grad_norm": 0.2536775767803192, "learning_rate": 6.641057928252837e-05, "loss": 1.412, "step": 77860 }, { "epoch": 3.3178484935729164, "eval_loss": 2.4168598651885986, "eval_runtime": 21.9945, "eval_samples_per_second": 227.329, "eval_steps_per_second": 1.228, "step": 77865 }, { "epoch": 3.3181194641556724, "grad_norm": 0.16754235327243805, "learning_rate": 6.63952580200847e-05, "loss": 1.4147, "step": 77870 }, { "epoch": 3.3186614053211847, "grad_norm": 0.1937492936849594, "learning_rate": 6.637993534604595e-05, "loss": 1.4192, "step": 77880 }, { "epoch": 3.319203346486697, "grad_norm": 0.17745104432106018, "learning_rate": 6.636461126231022e-05, "loss": 1.4204, "step": 77890 }, { "epoch": 3.3197452876522093, "grad_norm": 0.21982987225055695, "learning_rate": 6.634928577077577e-05, "loss": 1.41, "step": 77900 }, { "epoch": 3.3202872288177216, "grad_norm": 0.2929491102695465, "learning_rate": 6.633395887334108e-05, "loss": 1.4172, "step": 77910 }, { "epoch": 3.3208291699832335, "grad_norm": 0.17290639877319336, "learning_rate": 6.631863057190479e-05, "loss": 1.4262, "step": 77920 }, { "epoch": 3.321371111148746, "grad_norm": 0.18496091663837433, "learning_rate": 6.630330086836565e-05, "loss": 1.4092, "step": 77930 }, { "epoch": 3.321913052314258, "grad_norm": 0.3094109892845154, "learning_rate": 6.62879697646227e-05, "loss": 1.422, "step": 77940 }, { "epoch": 3.3224549934797705, "grad_norm": 0.21480301022529602, "learning_rate": 6.627263726257506e-05, "loss": 1.4147, "step": 77950 }, { "epoch": 3.3225633817128726, "eval_loss": 2.4209840297698975, "eval_runtime": 21.9901, "eval_samples_per_second": 227.376, "eval_steps_per_second": 1.228, "step": 77952 }, { "epoch": 3.322996934645283, "grad_norm": 0.19507268071174622, "learning_rate": 6.625730336412204e-05, "loss": 1.4109, "step": 77960 }, { "epoch": 3.3235388758107947, "grad_norm": 0.21125446259975433, "learning_rate": 6.624196807116317e-05, "loss": 1.4112, "step": 77970 }, { "epoch": 3.324080816976307, "grad_norm": 0.2264462113380432, "learning_rate": 6.62266313855981e-05, "loss": 1.4279, "step": 77980 }, { "epoch": 3.3246227581418193, "grad_norm": 0.2783096432685852, "learning_rate": 6.621129330932668e-05, "loss": 1.4212, "step": 77990 }, { "epoch": 3.3251646993073316, "grad_norm": 0.17773371934890747, "learning_rate": 6.61959538442489e-05, "loss": 1.4149, "step": 78000 }, { "epoch": 3.3257066404728435, "grad_norm": 0.1886042207479477, "learning_rate": 6.618061299226497e-05, "loss": 1.4196, "step": 78010 }, { "epoch": 3.326248581638356, "grad_norm": 0.2692718505859375, "learning_rate": 6.616527075527527e-05, "loss": 1.4199, "step": 78020 }, { "epoch": 3.326790522803868, "grad_norm": 0.1580006331205368, "learning_rate": 6.61499271351803e-05, "loss": 1.4211, "step": 78030 }, { "epoch": 3.327278269852829, "eval_loss": 2.4329586029052734, "eval_runtime": 21.9893, "eval_samples_per_second": 227.384, "eval_steps_per_second": 1.228, "step": 78039 }, { "epoch": 3.3273324639693804, "grad_norm": 0.18620441854000092, "learning_rate": 6.613458213388073e-05, "loss": 1.4193, "step": 78040 }, { "epoch": 3.3278744051348923, "grad_norm": 0.20754049718379974, "learning_rate": 6.61192357532775e-05, "loss": 1.4075, "step": 78050 }, { "epoch": 3.3284163463004046, "grad_norm": 0.19851942360401154, "learning_rate": 6.61038879952716e-05, "loss": 1.4155, "step": 78060 }, { "epoch": 3.328958287465917, "grad_norm": 0.1629553735256195, "learning_rate": 6.608853886176426e-05, "loss": 1.4237, "step": 78070 }, { "epoch": 3.3295002286314292, "grad_norm": 0.15505240857601166, "learning_rate": 6.607318835465689e-05, "loss": 1.4255, "step": 78080 }, { "epoch": 3.3300421697969416, "grad_norm": 0.3402341902256012, "learning_rate": 6.6057836475851e-05, "loss": 1.4163, "step": 78090 }, { "epoch": 3.3305841109624534, "grad_norm": 0.1934230774641037, "learning_rate": 6.604248322724835e-05, "loss": 1.4173, "step": 78100 }, { "epoch": 3.3311260521279658, "grad_norm": 0.17497047781944275, "learning_rate": 6.602712861075082e-05, "loss": 1.4166, "step": 78110 }, { "epoch": 3.331667993293478, "grad_norm": 0.19935786724090576, "learning_rate": 6.601177262826046e-05, "loss": 1.4221, "step": 78120 }, { "epoch": 3.3319931579927853, "eval_loss": 2.4348928928375244, "eval_runtime": 21.9935, "eval_samples_per_second": 227.34, "eval_steps_per_second": 1.228, "step": 78126 }, { "epoch": 3.3322099344589904, "grad_norm": 0.17210763692855835, "learning_rate": 6.599641528167952e-05, "loss": 1.4111, "step": 78130 }, { "epoch": 3.3327518756245027, "grad_norm": 0.175918847322464, "learning_rate": 6.598105657291041e-05, "loss": 1.4183, "step": 78140 }, { "epoch": 3.3332938167900146, "grad_norm": 0.1733727604150772, "learning_rate": 6.596569650385567e-05, "loss": 1.4179, "step": 78150 }, { "epoch": 3.333835757955527, "grad_norm": 0.16511158645153046, "learning_rate": 6.595033507641806e-05, "loss": 1.415, "step": 78160 }, { "epoch": 3.334377699121039, "grad_norm": 0.3177291750907898, "learning_rate": 6.593497229250048e-05, "loss": 1.4095, "step": 78170 }, { "epoch": 3.3349196402865515, "grad_norm": 0.172699436545372, "learning_rate": 6.591960815400601e-05, "loss": 1.4035, "step": 78180 }, { "epoch": 3.335461581452064, "grad_norm": 0.28057149052619934, "learning_rate": 6.590424266283791e-05, "loss": 1.4162, "step": 78190 }, { "epoch": 3.3360035226175757, "grad_norm": 0.23236146569252014, "learning_rate": 6.588887582089955e-05, "loss": 1.417, "step": 78200 }, { "epoch": 3.336545463783088, "grad_norm": 0.2135874629020691, "learning_rate": 6.587350763009452e-05, "loss": 1.4192, "step": 78210 }, { "epoch": 3.336708046132742, "eval_loss": 2.4323973655700684, "eval_runtime": 21.9907, "eval_samples_per_second": 227.369, "eval_steps_per_second": 1.228, "step": 78213 }, { "epoch": 3.3370874049486003, "grad_norm": 0.4779936671257019, "learning_rate": 6.585813809232659e-05, "loss": 1.415, "step": 78220 }, { "epoch": 3.3376293461141127, "grad_norm": 0.22560039162635803, "learning_rate": 6.584276720949964e-05, "loss": 1.4238, "step": 78230 }, { "epoch": 3.3381712872796245, "grad_norm": 0.20752812922000885, "learning_rate": 6.582739498351778e-05, "loss": 1.4179, "step": 78240 }, { "epoch": 3.338713228445137, "grad_norm": 0.16067805886268616, "learning_rate": 6.581202141628522e-05, "loss": 1.4103, "step": 78250 }, { "epoch": 3.339255169610649, "grad_norm": 0.18570862710475922, "learning_rate": 6.579664650970638e-05, "loss": 1.4131, "step": 78260 }, { "epoch": 3.3397971107761615, "grad_norm": 0.2738288938999176, "learning_rate": 6.578127026568587e-05, "loss": 1.4215, "step": 78270 }, { "epoch": 3.3403390519416734, "grad_norm": 0.26719287037849426, "learning_rate": 6.57658926861284e-05, "loss": 1.4188, "step": 78280 }, { "epoch": 3.3408809931071857, "grad_norm": 0.23562686145305634, "learning_rate": 6.575051377293888e-05, "loss": 1.4081, "step": 78290 }, { "epoch": 3.341422934272698, "grad_norm": 0.17355681955814362, "learning_rate": 6.573513352802239e-05, "loss": 1.4196, "step": 78300 }, { "epoch": 3.341422934272698, "eval_loss": 2.430081605911255, "eval_runtime": 21.985, "eval_samples_per_second": 227.428, "eval_steps_per_second": 1.228, "step": 78300 }, { "epoch": 3.3419648754382103, "grad_norm": 0.19452211260795593, "learning_rate": 6.571975195328416e-05, "loss": 1.4239, "step": 78310 }, { "epoch": 3.3425068166037226, "grad_norm": 0.20565609633922577, "learning_rate": 6.570436905062961e-05, "loss": 1.4145, "step": 78320 }, { "epoch": 3.3430487577692345, "grad_norm": 0.163151815533638, "learning_rate": 6.568898482196428e-05, "loss": 1.4153, "step": 78330 }, { "epoch": 3.343590698934747, "grad_norm": 0.17055930197238922, "learning_rate": 6.567359926919394e-05, "loss": 1.4203, "step": 78340 }, { "epoch": 3.344132640100259, "grad_norm": 0.17260503768920898, "learning_rate": 6.565821239422444e-05, "loss": 1.4126, "step": 78350 }, { "epoch": 3.3446745812657714, "grad_norm": 0.19056035578250885, "learning_rate": 6.564282419896187e-05, "loss": 1.4296, "step": 78360 }, { "epoch": 3.3452165224312838, "grad_norm": 0.1595456898212433, "learning_rate": 6.562743468531244e-05, "loss": 1.4219, "step": 78370 }, { "epoch": 3.3457584635967956, "grad_norm": 0.16452792286872864, "learning_rate": 6.561204385518257e-05, "loss": 1.4136, "step": 78380 }, { "epoch": 3.346137822412654, "eval_loss": 2.4075536727905273, "eval_runtime": 21.9892, "eval_samples_per_second": 227.384, "eval_steps_per_second": 1.228, "step": 78387 }, { "epoch": 3.346300404762308, "grad_norm": 0.3267938196659088, "learning_rate": 6.559665171047876e-05, "loss": 1.4088, "step": 78390 }, { "epoch": 3.3468423459278203, "grad_norm": 0.24453267455101013, "learning_rate": 6.558125825310772e-05, "loss": 1.4215, "step": 78400 }, { "epoch": 3.3473842870933326, "grad_norm": 0.20705412328243256, "learning_rate": 6.556586348497637e-05, "loss": 1.4251, "step": 78410 }, { "epoch": 3.347926228258845, "grad_norm": 0.2045924812555313, "learning_rate": 6.555046740799173e-05, "loss": 1.4172, "step": 78420 }, { "epoch": 3.3484681694243568, "grad_norm": 0.1746375858783722, "learning_rate": 6.553507002406099e-05, "loss": 1.4219, "step": 78430 }, { "epoch": 3.349010110589869, "grad_norm": 0.23190824687480927, "learning_rate": 6.551967133509149e-05, "loss": 1.4169, "step": 78440 }, { "epoch": 3.3495520517553814, "grad_norm": 0.21592742204666138, "learning_rate": 6.550427134299079e-05, "loss": 1.415, "step": 78450 }, { "epoch": 3.3500939929208937, "grad_norm": 0.18433372676372528, "learning_rate": 6.548887004966658e-05, "loss": 1.415, "step": 78460 }, { "epoch": 3.3506359340864056, "grad_norm": 0.1842111349105835, "learning_rate": 6.547346745702666e-05, "loss": 1.4273, "step": 78470 }, { "epoch": 3.3508527105526107, "eval_loss": 2.4247753620147705, "eval_runtime": 21.9917, "eval_samples_per_second": 227.358, "eval_steps_per_second": 1.228, "step": 78474 }, { "epoch": 3.351177875251918, "grad_norm": 0.18961256742477417, "learning_rate": 6.545806356697908e-05, "loss": 1.4107, "step": 78480 }, { "epoch": 3.35171981641743, "grad_norm": 0.15932081639766693, "learning_rate": 6.544265838143197e-05, "loss": 1.4176, "step": 78490 }, { "epoch": 3.3522617575829425, "grad_norm": 0.1957116425037384, "learning_rate": 6.54272519022937e-05, "loss": 1.4084, "step": 78500 }, { "epoch": 3.3528036987484544, "grad_norm": 0.2836982011795044, "learning_rate": 6.541184413147273e-05, "loss": 1.4135, "step": 78510 }, { "epoch": 3.3533456399139667, "grad_norm": 0.19204333424568176, "learning_rate": 6.53964350708777e-05, "loss": 1.4181, "step": 78520 }, { "epoch": 3.353887581079479, "grad_norm": 0.173109233379364, "learning_rate": 6.538102472241745e-05, "loss": 1.4161, "step": 78530 }, { "epoch": 3.3544295222449914, "grad_norm": 0.24930110573768616, "learning_rate": 6.536561308800091e-05, "loss": 1.4201, "step": 78540 }, { "epoch": 3.3549714634105037, "grad_norm": 0.1809731125831604, "learning_rate": 6.535020016953723e-05, "loss": 1.4107, "step": 78550 }, { "epoch": 3.3555134045760155, "grad_norm": 0.18697808682918549, "learning_rate": 6.53347859689357e-05, "loss": 1.415, "step": 78560 }, { "epoch": 3.355567598692567, "eval_loss": 2.433493137359619, "eval_runtime": 21.9938, "eval_samples_per_second": 227.337, "eval_steps_per_second": 1.228, "step": 78561 }, { "epoch": 3.356055345741528, "grad_norm": 0.24766592681407928, "learning_rate": 6.531937048810573e-05, "loss": 1.4146, "step": 78570 }, { "epoch": 3.35659728690704, "grad_norm": 0.16486966609954834, "learning_rate": 6.530395372895697e-05, "loss": 1.4123, "step": 78580 }, { "epoch": 3.3571392280725525, "grad_norm": 0.20207463204860687, "learning_rate": 6.528853569339913e-05, "loss": 1.4197, "step": 78590 }, { "epoch": 3.357681169238065, "grad_norm": 0.17322023212909698, "learning_rate": 6.527311638334218e-05, "loss": 1.4166, "step": 78600 }, { "epoch": 3.3582231104035767, "grad_norm": 0.18780970573425293, "learning_rate": 6.525769580069617e-05, "loss": 1.4212, "step": 78610 }, { "epoch": 3.358765051569089, "grad_norm": 0.2877364456653595, "learning_rate": 6.524227394737135e-05, "loss": 1.4195, "step": 78620 }, { "epoch": 3.3593069927346013, "grad_norm": 0.2212965041399002, "learning_rate": 6.522685082527807e-05, "loss": 1.4172, "step": 78630 }, { "epoch": 3.3598489339001136, "grad_norm": 0.2948903441429138, "learning_rate": 6.521142643632692e-05, "loss": 1.4171, "step": 78640 }, { "epoch": 3.3602824868325234, "eval_loss": 2.4318501949310303, "eval_runtime": 24.8893, "eval_samples_per_second": 200.89, "eval_steps_per_second": 1.085, "step": 78648 }, { "epoch": 3.3603908750656255, "grad_norm": 0.1802278459072113, "learning_rate": 6.51960007824286e-05, "loss": 1.4057, "step": 78650 }, { "epoch": 3.360932816231138, "grad_norm": 0.2904506027698517, "learning_rate": 6.518057386549398e-05, "loss": 1.4219, "step": 78660 }, { "epoch": 3.36147475739665, "grad_norm": 0.3286694586277008, "learning_rate": 6.516514568743407e-05, "loss": 1.4142, "step": 78670 }, { "epoch": 3.3620166985621625, "grad_norm": 0.1851622760295868, "learning_rate": 6.514971625016004e-05, "loss": 1.4056, "step": 78680 }, { "epoch": 3.3625586397276743, "grad_norm": 0.1889377385377884, "learning_rate": 6.513428555558321e-05, "loss": 1.4122, "step": 78690 }, { "epoch": 3.3631005808931866, "grad_norm": 0.15249784290790558, "learning_rate": 6.51188536056151e-05, "loss": 1.4211, "step": 78700 }, { "epoch": 3.363642522058699, "grad_norm": 0.3704091012477875, "learning_rate": 6.510342040216733e-05, "loss": 1.4221, "step": 78710 }, { "epoch": 3.3641844632242113, "grad_norm": 0.2278876155614853, "learning_rate": 6.508798594715172e-05, "loss": 1.4273, "step": 78720 }, { "epoch": 3.3647264043897236, "grad_norm": 0.21338103711605072, "learning_rate": 6.507255024248019e-05, "loss": 1.4185, "step": 78730 }, { "epoch": 3.3649973749724795, "eval_loss": 2.418487548828125, "eval_runtime": 21.9902, "eval_samples_per_second": 227.374, "eval_steps_per_second": 1.228, "step": 78735 }, { "epoch": 3.3652683455552355, "grad_norm": 0.3524332642555237, "learning_rate": 6.505711329006488e-05, "loss": 1.4218, "step": 78740 }, { "epoch": 3.365810286720748, "grad_norm": 0.1828557401895523, "learning_rate": 6.504167509181804e-05, "loss": 1.4172, "step": 78750 }, { "epoch": 3.36635222788626, "grad_norm": 0.21107490360736847, "learning_rate": 6.502623564965206e-05, "loss": 1.4145, "step": 78760 }, { "epoch": 3.3668941690517724, "grad_norm": 0.26644033193588257, "learning_rate": 6.501079496547957e-05, "loss": 1.4113, "step": 78770 }, { "epoch": 3.3674361102172847, "grad_norm": 0.15477254986763, "learning_rate": 6.499535304121324e-05, "loss": 1.4176, "step": 78780 }, { "epoch": 3.3679780513827966, "grad_norm": 0.1923697292804718, "learning_rate": 6.497990987876598e-05, "loss": 1.4192, "step": 78790 }, { "epoch": 3.368519992548309, "grad_norm": 0.18154162168502808, "learning_rate": 6.496446548005082e-05, "loss": 1.4184, "step": 78800 }, { "epoch": 3.3690619337138212, "grad_norm": 0.24058213829994202, "learning_rate": 6.494901984698093e-05, "loss": 1.4109, "step": 78810 }, { "epoch": 3.3696038748793335, "grad_norm": 0.1790485382080078, "learning_rate": 6.493357298146965e-05, "loss": 1.4162, "step": 78820 }, { "epoch": 3.3697122631124357, "eval_loss": 2.420318126678467, "eval_runtime": 21.9925, "eval_samples_per_second": 227.35, "eval_steps_per_second": 1.228, "step": 78822 }, { "epoch": 3.370145816044846, "grad_norm": 0.1824759691953659, "learning_rate": 6.491812488543049e-05, "loss": 1.4181, "step": 78830 }, { "epoch": 3.3706877572103577, "grad_norm": 0.27409785985946655, "learning_rate": 6.490267556077706e-05, "loss": 1.4215, "step": 78840 }, { "epoch": 3.37122969837587, "grad_norm": 0.17618605494499207, "learning_rate": 6.488722500942321e-05, "loss": 1.4215, "step": 78850 }, { "epoch": 3.3717716395413824, "grad_norm": 0.21089158952236176, "learning_rate": 6.487177323328282e-05, "loss": 1.4105, "step": 78860 }, { "epoch": 3.3723135807068947, "grad_norm": 0.1864428073167801, "learning_rate": 6.485632023427003e-05, "loss": 1.4122, "step": 78870 }, { "epoch": 3.3728555218724066, "grad_norm": 0.21700777113437653, "learning_rate": 6.484086601429907e-05, "loss": 1.4169, "step": 78880 }, { "epoch": 3.373397463037919, "grad_norm": 0.1815451681613922, "learning_rate": 6.482541057528437e-05, "loss": 1.428, "step": 78890 }, { "epoch": 3.373939404203431, "grad_norm": 0.23216386139392853, "learning_rate": 6.480995391914046e-05, "loss": 1.4166, "step": 78900 }, { "epoch": 3.3744271512523922, "eval_loss": 2.411614418029785, "eval_runtime": 21.9934, "eval_samples_per_second": 227.341, "eval_steps_per_second": 1.228, "step": 78909 }, { "epoch": 3.3744813453689435, "grad_norm": 0.1651027947664261, "learning_rate": 6.479449604778206e-05, "loss": 1.4183, "step": 78910 }, { "epoch": 3.3750232865344554, "grad_norm": 0.20378755033016205, "learning_rate": 6.477903696312398e-05, "loss": 1.4167, "step": 78920 }, { "epoch": 3.3755652276999677, "grad_norm": 0.19732166826725006, "learning_rate": 6.476357666708129e-05, "loss": 1.4083, "step": 78930 }, { "epoch": 3.37610716886548, "grad_norm": 0.23895850777626038, "learning_rate": 6.47481151615691e-05, "loss": 1.4173, "step": 78940 }, { "epoch": 3.3766491100309923, "grad_norm": 0.15536242723464966, "learning_rate": 6.473265244850273e-05, "loss": 1.4142, "step": 78950 }, { "epoch": 3.3771910511965046, "grad_norm": 0.23873640596866608, "learning_rate": 6.471718852979762e-05, "loss": 1.4163, "step": 78960 }, { "epoch": 3.3777329923620165, "grad_norm": 0.3809368908405304, "learning_rate": 6.47017234073694e-05, "loss": 1.404, "step": 78970 }, { "epoch": 3.378274933527529, "grad_norm": 0.19484516978263855, "learning_rate": 6.468625708313378e-05, "loss": 1.4137, "step": 78980 }, { "epoch": 3.378816874693041, "grad_norm": 0.23201866447925568, "learning_rate": 6.46707895590067e-05, "loss": 1.4246, "step": 78990 }, { "epoch": 3.3791420393923484, "eval_loss": 2.4239916801452637, "eval_runtime": 21.9918, "eval_samples_per_second": 227.358, "eval_steps_per_second": 1.228, "step": 78996 }, { "epoch": 3.3793588158585535, "grad_norm": 0.17255160212516785, "learning_rate": 6.46553208369042e-05, "loss": 1.4182, "step": 79000 }, { "epoch": 3.379900757024066, "grad_norm": 0.148577019572258, "learning_rate": 6.463985091874248e-05, "loss": 1.4222, "step": 79010 }, { "epoch": 3.3804426981895777, "grad_norm": 0.17819197475910187, "learning_rate": 6.462437980643786e-05, "loss": 1.4207, "step": 79020 }, { "epoch": 3.38098463935509, "grad_norm": 0.18190248310565948, "learning_rate": 6.460890750190686e-05, "loss": 1.4113, "step": 79030 }, { "epoch": 3.3815265805206023, "grad_norm": 0.17424237728118896, "learning_rate": 6.459343400706612e-05, "loss": 1.4333, "step": 79040 }, { "epoch": 3.3820685216861146, "grad_norm": 0.16150598227977753, "learning_rate": 6.457795932383242e-05, "loss": 1.4171, "step": 79050 }, { "epoch": 3.3826104628516265, "grad_norm": 0.1738124042749405, "learning_rate": 6.456248345412272e-05, "loss": 1.4096, "step": 79060 }, { "epoch": 3.383152404017139, "grad_norm": 0.21694537997245789, "learning_rate": 6.454700639985408e-05, "loss": 1.4149, "step": 79070 }, { "epoch": 3.383694345182651, "grad_norm": 0.170954167842865, "learning_rate": 6.453152816294373e-05, "loss": 1.4234, "step": 79080 }, { "epoch": 3.383856927532305, "eval_loss": 2.4151031970977783, "eval_runtime": 21.9982, "eval_samples_per_second": 227.291, "eval_steps_per_second": 1.227, "step": 79083 }, { "epoch": 3.3842362863481634, "grad_norm": 0.2635592818260193, "learning_rate": 6.451604874530905e-05, "loss": 1.4245, "step": 79090 }, { "epoch": 3.3847782275136753, "grad_norm": 0.2918653190135956, "learning_rate": 6.450056814886756e-05, "loss": 1.4131, "step": 79100 }, { "epoch": 3.3853201686791876, "grad_norm": 0.3705879747867584, "learning_rate": 6.448508637553695e-05, "loss": 1.427, "step": 79110 }, { "epoch": 3.3858621098447, "grad_norm": 0.18804314732551575, "learning_rate": 6.446960342723503e-05, "loss": 1.4223, "step": 79120 }, { "epoch": 3.3864040510102122, "grad_norm": 0.15354295074939728, "learning_rate": 6.445411930587971e-05, "loss": 1.4193, "step": 79130 }, { "epoch": 3.3869459921757246, "grad_norm": 0.2109123021364212, "learning_rate": 6.443863401338917e-05, "loss": 1.4122, "step": 79140 }, { "epoch": 3.3874879333412364, "grad_norm": 0.2178303450345993, "learning_rate": 6.442314755168162e-05, "loss": 1.4152, "step": 79150 }, { "epoch": 3.3880298745067488, "grad_norm": 0.24825222790241241, "learning_rate": 6.440765992267546e-05, "loss": 1.4112, "step": 79160 }, { "epoch": 3.388571815672261, "grad_norm": 0.18957504630088806, "learning_rate": 6.439217112828924e-05, "loss": 1.4217, "step": 79170 }, { "epoch": 3.388571815672261, "eval_loss": 2.423619270324707, "eval_runtime": 21.99, "eval_samples_per_second": 227.376, "eval_steps_per_second": 1.228, "step": 79170 }, { "epoch": 3.3891137568377734, "grad_norm": 0.16275407373905182, "learning_rate": 6.437668117044162e-05, "loss": 1.4085, "step": 79180 }, { "epoch": 3.3896556980032857, "grad_norm": 0.26474809646606445, "learning_rate": 6.436119005105145e-05, "loss": 1.4169, "step": 79190 }, { "epoch": 3.3901976391687976, "grad_norm": 0.3433353900909424, "learning_rate": 6.434569777203772e-05, "loss": 1.4049, "step": 79200 }, { "epoch": 3.39073958033431, "grad_norm": 0.31058382987976074, "learning_rate": 6.433020433531951e-05, "loss": 1.4222, "step": 79210 }, { "epoch": 3.391281521499822, "grad_norm": 0.1867647022008896, "learning_rate": 6.43147097428161e-05, "loss": 1.4368, "step": 79220 }, { "epoch": 3.3918234626653345, "grad_norm": 0.38746219873428345, "learning_rate": 6.429921399644687e-05, "loss": 1.4072, "step": 79230 }, { "epoch": 3.392365403830847, "grad_norm": 0.21169978380203247, "learning_rate": 6.42837170981314e-05, "loss": 1.4154, "step": 79240 }, { "epoch": 3.3929073449963587, "grad_norm": 0.17615120112895966, "learning_rate": 6.426821904978936e-05, "loss": 1.4117, "step": 79250 }, { "epoch": 3.393286703812217, "eval_loss": 2.415933609008789, "eval_runtime": 21.9936, "eval_samples_per_second": 227.339, "eval_steps_per_second": 1.228, "step": 79257 }, { "epoch": 3.393449286161871, "grad_norm": 0.2107549011707306, "learning_rate": 6.425271985334059e-05, "loss": 1.4119, "step": 79260 }, { "epoch": 3.3939912273273833, "grad_norm": 0.30386942625045776, "learning_rate": 6.423721951070502e-05, "loss": 1.408, "step": 79270 }, { "epoch": 3.3945331684928957, "grad_norm": 0.17193304002285004, "learning_rate": 6.422171802380283e-05, "loss": 1.4055, "step": 79280 }, { "epoch": 3.3950751096584075, "grad_norm": 0.30357927083969116, "learning_rate": 6.420621539455426e-05, "loss": 1.411, "step": 79290 }, { "epoch": 3.39561705082392, "grad_norm": 0.20240160822868347, "learning_rate": 6.419071162487969e-05, "loss": 1.4088, "step": 79300 }, { "epoch": 3.396158991989432, "grad_norm": 0.18283316493034363, "learning_rate": 6.417520671669964e-05, "loss": 1.4094, "step": 79310 }, { "epoch": 3.3967009331549445, "grad_norm": 0.16190268099308014, "learning_rate": 6.415970067193483e-05, "loss": 1.4148, "step": 79320 }, { "epoch": 3.3972428743204564, "grad_norm": 0.22923493385314941, "learning_rate": 6.414419349250608e-05, "loss": 1.409, "step": 79330 }, { "epoch": 3.3977848154859687, "grad_norm": 0.3721964359283447, "learning_rate": 6.412868518033432e-05, "loss": 1.4198, "step": 79340 }, { "epoch": 3.3980015919521738, "eval_loss": 2.421030044555664, "eval_runtime": 21.9926, "eval_samples_per_second": 227.349, "eval_steps_per_second": 1.228, "step": 79344 }, { "epoch": 3.398326756651481, "grad_norm": 0.392916738986969, "learning_rate": 6.41131757373407e-05, "loss": 1.4077, "step": 79350 }, { "epoch": 3.3988686978169933, "grad_norm": 0.5418851375579834, "learning_rate": 6.409766516544642e-05, "loss": 1.4154, "step": 79360 }, { "epoch": 3.3994106389825056, "grad_norm": 0.20894989371299744, "learning_rate": 6.408215346657287e-05, "loss": 1.4091, "step": 79370 }, { "epoch": 3.3999525801480175, "grad_norm": 0.2258988916873932, "learning_rate": 6.406664064264159e-05, "loss": 1.4104, "step": 79380 }, { "epoch": 3.40049452131353, "grad_norm": 0.1968545764684677, "learning_rate": 6.405112669557424e-05, "loss": 1.4136, "step": 79390 }, { "epoch": 3.401036462479042, "grad_norm": 0.2986617684364319, "learning_rate": 6.403561162729257e-05, "loss": 1.4032, "step": 79400 }, { "epoch": 3.4015784036445544, "grad_norm": 0.16921205818653107, "learning_rate": 6.402009543971861e-05, "loss": 1.4121, "step": 79410 }, { "epoch": 3.4021203448100668, "grad_norm": 0.21860603988170624, "learning_rate": 6.400457813477435e-05, "loss": 1.418, "step": 79420 }, { "epoch": 3.4026622859755786, "grad_norm": 0.15775620937347412, "learning_rate": 6.398905971438207e-05, "loss": 1.4144, "step": 79430 }, { "epoch": 3.40271648009213, "eval_loss": 2.416477680206299, "eval_runtime": 21.9934, "eval_samples_per_second": 227.341, "eval_steps_per_second": 1.228, "step": 79431 }, { "epoch": 3.403204227141091, "grad_norm": 0.16970393061637878, "learning_rate": 6.39735401804641e-05, "loss": 1.4145, "step": 79440 }, { "epoch": 3.4037461683066033, "grad_norm": 0.21157051622867584, "learning_rate": 6.395801953494292e-05, "loss": 1.4297, "step": 79450 }, { "epoch": 3.4042881094721156, "grad_norm": 0.24515606462955475, "learning_rate": 6.394249777974118e-05, "loss": 1.4207, "step": 79460 }, { "epoch": 3.404830050637628, "grad_norm": 0.20401237905025482, "learning_rate": 6.392697491678163e-05, "loss": 1.4104, "step": 79470 }, { "epoch": 3.4053719918031398, "grad_norm": 0.2022889405488968, "learning_rate": 6.391145094798718e-05, "loss": 1.4191, "step": 79480 }, { "epoch": 3.405913932968652, "grad_norm": 0.2641688287258148, "learning_rate": 6.389592587528089e-05, "loss": 1.4175, "step": 79490 }, { "epoch": 3.4064558741341644, "grad_norm": 0.30329614877700806, "learning_rate": 6.388039970058591e-05, "loss": 1.4097, "step": 79500 }, { "epoch": 3.4069978152996767, "grad_norm": 0.19015826284885406, "learning_rate": 6.386487242582559e-05, "loss": 1.4183, "step": 79510 }, { "epoch": 3.4074313682320865, "eval_loss": 2.4201693534851074, "eval_runtime": 22.0994, "eval_samples_per_second": 226.25, "eval_steps_per_second": 1.222, "step": 79518 }, { "epoch": 3.4075397564651886, "grad_norm": 0.2884151041507721, "learning_rate": 6.384934405292335e-05, "loss": 1.4081, "step": 79520 }, { "epoch": 3.408081697630701, "grad_norm": 0.2025626003742218, "learning_rate": 6.383381458380279e-05, "loss": 1.408, "step": 79530 }, { "epoch": 3.408623638796213, "grad_norm": 0.20512153208255768, "learning_rate": 6.381828402038763e-05, "loss": 1.414, "step": 79540 }, { "epoch": 3.4091655799617255, "grad_norm": 0.23173965513706207, "learning_rate": 6.380275236460174e-05, "loss": 1.4234, "step": 79550 }, { "epoch": 3.4097075211272374, "grad_norm": 0.21376845240592957, "learning_rate": 6.378721961836908e-05, "loss": 1.4114, "step": 79560 }, { "epoch": 3.4102494622927497, "grad_norm": 0.25625619292259216, "learning_rate": 6.377168578361383e-05, "loss": 1.4096, "step": 79570 }, { "epoch": 3.410791403458262, "grad_norm": 0.1973353773355484, "learning_rate": 6.375615086226022e-05, "loss": 1.4196, "step": 79580 }, { "epoch": 3.4113333446237744, "grad_norm": 0.21326379477977753, "learning_rate": 6.374061485623266e-05, "loss": 1.4115, "step": 79590 }, { "epoch": 3.4118752857892867, "grad_norm": 0.16970664262771606, "learning_rate": 6.372507776745567e-05, "loss": 1.4198, "step": 79600 }, { "epoch": 3.4121462563720426, "eval_loss": 2.4215962886810303, "eval_runtime": 21.994, "eval_samples_per_second": 227.335, "eval_steps_per_second": 1.228, "step": 79605 }, { "epoch": 3.4124172269547985, "grad_norm": 0.15813182294368744, "learning_rate": 6.370953959785393e-05, "loss": 1.4056, "step": 79610 }, { "epoch": 3.412959168120311, "grad_norm": 0.3504032492637634, "learning_rate": 6.369400034935224e-05, "loss": 1.4224, "step": 79620 }, { "epoch": 3.413501109285823, "grad_norm": 0.4499680995941162, "learning_rate": 6.367846002387552e-05, "loss": 1.4056, "step": 79630 }, { "epoch": 3.4140430504513355, "grad_norm": 0.2306382656097412, "learning_rate": 6.366291862334887e-05, "loss": 1.416, "step": 79640 }, { "epoch": 3.414584991616848, "grad_norm": 0.1756628006696701, "learning_rate": 6.364737614969747e-05, "loss": 1.4121, "step": 79650 }, { "epoch": 3.4151269327823597, "grad_norm": 0.20633266866207123, "learning_rate": 6.363183260484665e-05, "loss": 1.4095, "step": 79660 }, { "epoch": 3.415668873947872, "grad_norm": 0.17313161492347717, "learning_rate": 6.361628799072187e-05, "loss": 1.4148, "step": 79670 }, { "epoch": 3.4162108151133843, "grad_norm": 0.16287872195243835, "learning_rate": 6.360074230924877e-05, "loss": 1.399, "step": 79680 }, { "epoch": 3.4167527562788966, "grad_norm": 0.18536217510700226, "learning_rate": 6.358519556235302e-05, "loss": 1.4155, "step": 79690 }, { "epoch": 3.4168611445119987, "eval_loss": 2.429548501968384, "eval_runtime": 21.9964, "eval_samples_per_second": 227.31, "eval_steps_per_second": 1.227, "step": 79692 }, { "epoch": 3.4172946974444085, "grad_norm": 0.3025163412094116, "learning_rate": 6.356964775196055e-05, "loss": 1.4113, "step": 79700 }, { "epoch": 3.417836638609921, "grad_norm": 0.30600860714912415, "learning_rate": 6.35540988799973e-05, "loss": 1.4125, "step": 79710 }, { "epoch": 3.418378579775433, "grad_norm": 0.2034313678741455, "learning_rate": 6.353854894838942e-05, "loss": 1.4084, "step": 79720 }, { "epoch": 3.4189205209409455, "grad_norm": 0.16151131689548492, "learning_rate": 6.352299795906317e-05, "loss": 1.4079, "step": 79730 }, { "epoch": 3.4194624621064573, "grad_norm": 0.3311507999897003, "learning_rate": 6.350744591394494e-05, "loss": 1.4156, "step": 79740 }, { "epoch": 3.4200044032719696, "grad_norm": 0.21057330071926117, "learning_rate": 6.349189281496124e-05, "loss": 1.4149, "step": 79750 }, { "epoch": 3.420546344437482, "grad_norm": 0.2549402117729187, "learning_rate": 6.347633866403873e-05, "loss": 1.4086, "step": 79760 }, { "epoch": 3.4210882856029943, "grad_norm": 0.2768961787223816, "learning_rate": 6.346078346310417e-05, "loss": 1.4118, "step": 79770 }, { "epoch": 3.4215760326519553, "eval_loss": 2.4365317821502686, "eval_runtime": 21.997, "eval_samples_per_second": 227.304, "eval_steps_per_second": 1.227, "step": 79779 }, { "epoch": 3.4216302267685066, "grad_norm": 0.16737370193004608, "learning_rate": 6.344522721408453e-05, "loss": 1.416, "step": 79780 }, { "epoch": 3.4221721679340185, "grad_norm": 0.1741742491722107, "learning_rate": 6.342966991890677e-05, "loss": 1.4094, "step": 79790 }, { "epoch": 3.4227141090995308, "grad_norm": 0.19367846846580505, "learning_rate": 6.341411157949812e-05, "loss": 1.4087, "step": 79800 }, { "epoch": 3.423256050265043, "grad_norm": 0.17661771178245544, "learning_rate": 6.339855219778586e-05, "loss": 1.4114, "step": 79810 }, { "epoch": 3.4237979914305554, "grad_norm": 0.1794527918100357, "learning_rate": 6.338299177569739e-05, "loss": 1.4256, "step": 79820 }, { "epoch": 3.4243399325960677, "grad_norm": 0.16882766783237457, "learning_rate": 6.336743031516031e-05, "loss": 1.4138, "step": 79830 }, { "epoch": 3.4248818737615796, "grad_norm": 0.2601432204246521, "learning_rate": 6.33518678181023e-05, "loss": 1.4253, "step": 79840 }, { "epoch": 3.425423814927092, "grad_norm": 0.15983355045318604, "learning_rate": 6.333630428645116e-05, "loss": 1.4191, "step": 79850 }, { "epoch": 3.4259657560926042, "grad_norm": 0.2166626751422882, "learning_rate": 6.332073972213482e-05, "loss": 1.4176, "step": 79860 }, { "epoch": 3.4262909207919114, "eval_loss": 2.441222906112671, "eval_runtime": 21.995, "eval_samples_per_second": 227.324, "eval_steps_per_second": 1.228, "step": 79866 }, { "epoch": 3.4265076972581165, "grad_norm": 0.27949678897857666, "learning_rate": 6.330517412708138e-05, "loss": 1.4055, "step": 79870 }, { "epoch": 3.427049638423629, "grad_norm": 0.23983129858970642, "learning_rate": 6.328960750321903e-05, "loss": 1.4154, "step": 79880 }, { "epoch": 3.4275915795891407, "grad_norm": 0.18861855566501617, "learning_rate": 6.32740398524761e-05, "loss": 1.4094, "step": 79890 }, { "epoch": 3.428133520754653, "grad_norm": 0.2114916443824768, "learning_rate": 6.325847117678102e-05, "loss": 1.4138, "step": 79900 }, { "epoch": 3.4286754619201654, "grad_norm": 0.22669847309589386, "learning_rate": 6.32429014780624e-05, "loss": 1.4116, "step": 79910 }, { "epoch": 3.4292174030856777, "grad_norm": 0.18799784779548645, "learning_rate": 6.322733075824891e-05, "loss": 1.4199, "step": 79920 }, { "epoch": 3.4297593442511896, "grad_norm": 0.2161915898323059, "learning_rate": 6.321175901926941e-05, "loss": 1.4138, "step": 79930 }, { "epoch": 3.430301285416702, "grad_norm": 0.17232954502105713, "learning_rate": 6.319618626305288e-05, "loss": 1.4154, "step": 79940 }, { "epoch": 3.430843226582214, "grad_norm": 0.20791089534759521, "learning_rate": 6.318061249152835e-05, "loss": 1.4172, "step": 79950 }, { "epoch": 3.431005808931868, "eval_loss": 2.4264652729034424, "eval_runtime": 21.9927, "eval_samples_per_second": 227.349, "eval_steps_per_second": 1.228, "step": 79953 }, { "epoch": 3.4313851677477265, "grad_norm": 0.22153067588806152, "learning_rate": 6.316503770662508e-05, "loss": 1.4078, "step": 79960 }, { "epoch": 3.4319271089132384, "grad_norm": 0.17650219798088074, "learning_rate": 6.314946191027238e-05, "loss": 1.4162, "step": 79970 }, { "epoch": 3.4324690500787507, "grad_norm": 0.24956724047660828, "learning_rate": 6.313388510439972e-05, "loss": 1.4138, "step": 79980 }, { "epoch": 3.433010991244263, "grad_norm": 0.17035385966300964, "learning_rate": 6.311830729093669e-05, "loss": 1.4043, "step": 79990 }, { "epoch": 3.4335529324097753, "grad_norm": 0.16698023676872253, "learning_rate": 6.3102728471813e-05, "loss": 1.4169, "step": 80000 }, { "epoch": 3.4340948735752876, "grad_norm": 0.19518424570560455, "learning_rate": 6.308714864895847e-05, "loss": 1.4078, "step": 80010 }, { "epoch": 3.4346368147407995, "grad_norm": 0.27307751774787903, "learning_rate": 6.30715678243031e-05, "loss": 1.4151, "step": 80020 }, { "epoch": 3.435178755906312, "grad_norm": 0.2107393890619278, "learning_rate": 6.305598599977694e-05, "loss": 1.418, "step": 80030 }, { "epoch": 3.435720697071824, "grad_norm": 0.20705653727054596, "learning_rate": 6.304040317731022e-05, "loss": 1.4223, "step": 80040 }, { "epoch": 3.435720697071824, "eval_loss": 2.445282459259033, "eval_runtime": 21.9862, "eval_samples_per_second": 227.416, "eval_steps_per_second": 1.228, "step": 80040 }, { "epoch": 3.4362626382373365, "grad_norm": 0.1651361584663391, "learning_rate": 6.302481935883325e-05, "loss": 1.4108, "step": 80050 }, { "epoch": 3.436804579402849, "grad_norm": 0.16168753802776337, "learning_rate": 6.300923454627649e-05, "loss": 1.4176, "step": 80060 }, { "epoch": 3.4373465205683607, "grad_norm": 0.18073947727680206, "learning_rate": 6.299364874157054e-05, "loss": 1.3954, "step": 80070 }, { "epoch": 3.437888461733873, "grad_norm": 0.2578413784503937, "learning_rate": 6.297806194664609e-05, "loss": 1.4147, "step": 80080 }, { "epoch": 3.4384304028993853, "grad_norm": 0.16976307332515717, "learning_rate": 6.296247416343396e-05, "loss": 1.4134, "step": 80090 }, { "epoch": 3.4389723440648976, "grad_norm": 0.16870634257793427, "learning_rate": 6.29468853938651e-05, "loss": 1.4163, "step": 80100 }, { "epoch": 3.43951428523041, "grad_norm": 0.23185555636882782, "learning_rate": 6.293129563987057e-05, "loss": 1.4076, "step": 80110 }, { "epoch": 3.440056226395922, "grad_norm": 0.15721189975738525, "learning_rate": 6.291570490338159e-05, "loss": 1.412, "step": 80120 }, { "epoch": 3.4404355852117803, "eval_loss": 2.428380250930786, "eval_runtime": 21.9892, "eval_samples_per_second": 227.384, "eval_steps_per_second": 1.228, "step": 80127 }, { "epoch": 3.440598167561434, "grad_norm": 0.2671518623828888, "learning_rate": 6.290011318632945e-05, "loss": 1.4068, "step": 80130 }, { "epoch": 3.4411401087269464, "grad_norm": 0.1867525279521942, "learning_rate": 6.288452049064558e-05, "loss": 1.4163, "step": 80140 }, { "epoch": 3.4416820498924587, "grad_norm": 0.17865586280822754, "learning_rate": 6.286892681826154e-05, "loss": 1.409, "step": 80150 }, { "epoch": 3.4422239910579706, "grad_norm": 0.19090549647808075, "learning_rate": 6.285333217110901e-05, "loss": 1.4124, "step": 80160 }, { "epoch": 3.442765932223483, "grad_norm": 0.17862895131111145, "learning_rate": 6.283773655111979e-05, "loss": 1.4123, "step": 80170 }, { "epoch": 3.4433078733889952, "grad_norm": 0.17863786220550537, "learning_rate": 6.28221399602258e-05, "loss": 1.4109, "step": 80180 }, { "epoch": 3.4438498145545076, "grad_norm": 0.18190784752368927, "learning_rate": 6.280654240035906e-05, "loss": 1.4188, "step": 80190 }, { "epoch": 3.4443917557200194, "grad_norm": 0.21105967462062836, "learning_rate": 6.279094387345173e-05, "loss": 1.4069, "step": 80200 }, { "epoch": 3.4449336968855317, "grad_norm": 0.24444468319416046, "learning_rate": 6.277534438143612e-05, "loss": 1.4072, "step": 80210 }, { "epoch": 3.445150473351737, "eval_loss": 2.4381296634674072, "eval_runtime": 21.9948, "eval_samples_per_second": 227.326, "eval_steps_per_second": 1.228, "step": 80214 }, { "epoch": 3.445475638051044, "grad_norm": 0.16191159188747406, "learning_rate": 6.27597439262446e-05, "loss": 1.4072, "step": 80220 }, { "epoch": 3.4460175792165564, "grad_norm": 0.19114407896995544, "learning_rate": 6.27441425098097e-05, "loss": 1.4165, "step": 80230 }, { "epoch": 3.4465595203820687, "grad_norm": 0.18071117997169495, "learning_rate": 6.272854013406403e-05, "loss": 1.4204, "step": 80240 }, { "epoch": 3.4471014615475806, "grad_norm": 0.2497524917125702, "learning_rate": 6.271293680094037e-05, "loss": 1.4179, "step": 80250 }, { "epoch": 3.447643402713093, "grad_norm": 0.17332278192043304, "learning_rate": 6.269733251237159e-05, "loss": 1.4071, "step": 80260 }, { "epoch": 3.448185343878605, "grad_norm": 0.24143235385417938, "learning_rate": 6.268172727029065e-05, "loss": 1.4087, "step": 80270 }, { "epoch": 3.4487272850441175, "grad_norm": 0.18241146206855774, "learning_rate": 6.266612107663072e-05, "loss": 1.4128, "step": 80280 }, { "epoch": 3.44926922620963, "grad_norm": 0.386482834815979, "learning_rate": 6.265051393332498e-05, "loss": 1.398, "step": 80290 }, { "epoch": 3.4498111673751417, "grad_norm": 0.22998222708702087, "learning_rate": 6.26349058423068e-05, "loss": 1.4189, "step": 80300 }, { "epoch": 3.449865361491693, "eval_loss": 2.4301164150238037, "eval_runtime": 21.9925, "eval_samples_per_second": 227.35, "eval_steps_per_second": 1.228, "step": 80301 }, { "epoch": 3.450353108540654, "grad_norm": 0.37004634737968445, "learning_rate": 6.261929680550963e-05, "loss": 1.4088, "step": 80310 }, { "epoch": 3.4508950497061663, "grad_norm": 0.19532030820846558, "learning_rate": 6.260368682486704e-05, "loss": 1.4071, "step": 80320 }, { "epoch": 3.4514369908716787, "grad_norm": 0.1866229772567749, "learning_rate": 6.258807590231275e-05, "loss": 1.4214, "step": 80330 }, { "epoch": 3.4519789320371905, "grad_norm": 0.18280602991580963, "learning_rate": 6.257246403978056e-05, "loss": 1.4061, "step": 80340 }, { "epoch": 3.452520873202703, "grad_norm": 0.2126053422689438, "learning_rate": 6.255685123920437e-05, "loss": 1.4077, "step": 80350 }, { "epoch": 3.453062814368215, "grad_norm": 0.1738966852426529, "learning_rate": 6.25412375025183e-05, "loss": 1.4177, "step": 80360 }, { "epoch": 3.4536047555337275, "grad_norm": 0.23334649205207825, "learning_rate": 6.252562283165645e-05, "loss": 1.412, "step": 80370 }, { "epoch": 3.4541466966992393, "grad_norm": 0.20518024265766144, "learning_rate": 6.251000722855312e-05, "loss": 1.4076, "step": 80380 }, { "epoch": 3.4545802496316496, "eval_loss": 2.419971227645874, "eval_runtime": 22.0015, "eval_samples_per_second": 227.257, "eval_steps_per_second": 1.227, "step": 80388 }, { "epoch": 3.4546886378647517, "grad_norm": 0.23325951397418976, "learning_rate": 6.249439069514269e-05, "loss": 1.4109, "step": 80390 }, { "epoch": 3.455230579030264, "grad_norm": 0.20998698472976685, "learning_rate": 6.247877323335967e-05, "loss": 1.4102, "step": 80400 }, { "epoch": 3.4557725201957763, "grad_norm": 0.17500971257686615, "learning_rate": 6.246315484513873e-05, "loss": 1.4089, "step": 80410 }, { "epoch": 3.4563144613612886, "grad_norm": 0.19114546477794647, "learning_rate": 6.244753553241455e-05, "loss": 1.4072, "step": 80420 }, { "epoch": 3.4568564025268005, "grad_norm": 0.28155219554901123, "learning_rate": 6.243191529712199e-05, "loss": 1.4249, "step": 80430 }, { "epoch": 3.457398343692313, "grad_norm": 0.18383963406085968, "learning_rate": 6.241629414119603e-05, "loss": 1.4128, "step": 80440 }, { "epoch": 3.457940284857825, "grad_norm": 0.2947709262371063, "learning_rate": 6.240067206657177e-05, "loss": 1.4164, "step": 80450 }, { "epoch": 3.4584822260233374, "grad_norm": 0.26993468403816223, "learning_rate": 6.238504907518437e-05, "loss": 1.4158, "step": 80460 }, { "epoch": 3.4590241671888498, "grad_norm": 0.1699138730764389, "learning_rate": 6.236942516896915e-05, "loss": 1.4134, "step": 80470 }, { "epoch": 3.4592951377716057, "eval_loss": 2.409397840499878, "eval_runtime": 21.9932, "eval_samples_per_second": 227.343, "eval_steps_per_second": 1.228, "step": 80475 }, { "epoch": 3.4595661083543616, "grad_norm": 0.1866215318441391, "learning_rate": 6.235380034986154e-05, "loss": 1.4173, "step": 80480 }, { "epoch": 3.460108049519874, "grad_norm": 0.1942782998085022, "learning_rate": 6.233817461979707e-05, "loss": 1.4136, "step": 80490 }, { "epoch": 3.4606499906853863, "grad_norm": 0.23292642831802368, "learning_rate": 6.23225479807114e-05, "loss": 1.4177, "step": 80500 }, { "epoch": 3.4611919318508986, "grad_norm": 0.19163425266742706, "learning_rate": 6.230692043454027e-05, "loss": 1.4147, "step": 80510 }, { "epoch": 3.461733873016411, "grad_norm": 0.18934902548789978, "learning_rate": 6.229129198321955e-05, "loss": 1.4062, "step": 80520 }, { "epoch": 3.4622758141819228, "grad_norm": 0.16462522745132446, "learning_rate": 6.227566262868523e-05, "loss": 1.419, "step": 80530 }, { "epoch": 3.462817755347435, "grad_norm": 0.16511207818984985, "learning_rate": 6.226003237287343e-05, "loss": 1.4138, "step": 80540 }, { "epoch": 3.4633596965129474, "grad_norm": 0.23615549504756927, "learning_rate": 6.224440121772034e-05, "loss": 1.4121, "step": 80550 }, { "epoch": 3.4639016376784597, "grad_norm": 0.19872863590717316, "learning_rate": 6.222876916516225e-05, "loss": 1.4056, "step": 80560 }, { "epoch": 3.464010025911562, "eval_loss": 2.413600444793701, "eval_runtime": 21.9886, "eval_samples_per_second": 227.391, "eval_steps_per_second": 1.228, "step": 80562 }, { "epoch": 3.4644435788439716, "grad_norm": 0.2045847475528717, "learning_rate": 6.221313621713566e-05, "loss": 1.4146, "step": 80570 }, { "epoch": 3.464985520009484, "grad_norm": 0.3229454755783081, "learning_rate": 6.219750237557704e-05, "loss": 1.4039, "step": 80580 }, { "epoch": 3.465527461174996, "grad_norm": 0.20860926806926727, "learning_rate": 6.218186764242308e-05, "loss": 1.414, "step": 80590 }, { "epoch": 3.4660694023405085, "grad_norm": 0.22892901301383972, "learning_rate": 6.216623201961054e-05, "loss": 1.4056, "step": 80600 }, { "epoch": 3.4666113435060204, "grad_norm": 0.20201945304870605, "learning_rate": 6.215059550907632e-05, "loss": 1.4123, "step": 80610 }, { "epoch": 3.4671532846715327, "grad_norm": 0.21173055469989777, "learning_rate": 6.213495811275733e-05, "loss": 1.4081, "step": 80620 }, { "epoch": 3.467695225837045, "grad_norm": 0.20124772191047668, "learning_rate": 6.211931983259073e-05, "loss": 1.4108, "step": 80630 }, { "epoch": 3.4682371670025574, "grad_norm": 0.2871626019477844, "learning_rate": 6.210368067051368e-05, "loss": 1.4115, "step": 80640 }, { "epoch": 3.4687249140515184, "eval_loss": 2.4178643226623535, "eval_runtime": 21.9935, "eval_samples_per_second": 227.34, "eval_steps_per_second": 1.228, "step": 80649 }, { "epoch": 3.4687791081680697, "grad_norm": 0.17180606722831726, "learning_rate": 6.208804062846353e-05, "loss": 1.4096, "step": 80650 }, { "epoch": 3.4693210493335815, "grad_norm": 0.250835657119751, "learning_rate": 6.207239970837767e-05, "loss": 1.4025, "step": 80660 }, { "epoch": 3.469862990499094, "grad_norm": 0.20965339243412018, "learning_rate": 6.205675791219365e-05, "loss": 1.423, "step": 80670 }, { "epoch": 3.470404931664606, "grad_norm": 0.1662900745868683, "learning_rate": 6.204111524184907e-05, "loss": 1.406, "step": 80680 }, { "epoch": 3.4709468728301185, "grad_norm": 0.16801480948925018, "learning_rate": 6.202547169928173e-05, "loss": 1.4126, "step": 80690 }, { "epoch": 3.471488813995631, "grad_norm": 0.1683754026889801, "learning_rate": 6.200982728642945e-05, "loss": 1.4136, "step": 80700 }, { "epoch": 3.4720307551611427, "grad_norm": 0.17398451268672943, "learning_rate": 6.19941820052302e-05, "loss": 1.396, "step": 80710 }, { "epoch": 3.472572696326655, "grad_norm": 0.25240159034729004, "learning_rate": 6.197853585762204e-05, "loss": 1.4273, "step": 80720 }, { "epoch": 3.4731146374921673, "grad_norm": 0.2683483064174652, "learning_rate": 6.196288884554315e-05, "loss": 1.4114, "step": 80730 }, { "epoch": 3.4734398021914745, "eval_loss": 2.4308483600616455, "eval_runtime": 21.9921, "eval_samples_per_second": 227.355, "eval_steps_per_second": 1.228, "step": 80736 }, { "epoch": 3.4736565786576796, "grad_norm": 0.1892116814851761, "learning_rate": 6.194724097093184e-05, "loss": 1.4226, "step": 80740 }, { "epoch": 3.474198519823192, "grad_norm": 0.22284260392189026, "learning_rate": 6.193159223572647e-05, "loss": 1.4117, "step": 80750 }, { "epoch": 3.474740460988704, "grad_norm": 0.32861995697021484, "learning_rate": 6.191594264186556e-05, "loss": 1.4114, "step": 80760 }, { "epoch": 3.475282402154216, "grad_norm": 0.16212435066699982, "learning_rate": 6.190029219128769e-05, "loss": 1.4031, "step": 80770 }, { "epoch": 3.4758243433197284, "grad_norm": 0.16907572746276855, "learning_rate": 6.188464088593157e-05, "loss": 1.4082, "step": 80780 }, { "epoch": 3.4763662844852408, "grad_norm": 0.16995957493782043, "learning_rate": 6.186898872773605e-05, "loss": 1.4242, "step": 80790 }, { "epoch": 3.4769082256507526, "grad_norm": 0.21154280006885529, "learning_rate": 6.185333571864001e-05, "loss": 1.4095, "step": 80800 }, { "epoch": 3.477450166816265, "grad_norm": 0.22190575301647186, "learning_rate": 6.183768186058252e-05, "loss": 1.4173, "step": 80810 }, { "epoch": 3.4779921079817773, "grad_norm": 0.1600603610277176, "learning_rate": 6.182202715550266e-05, "loss": 1.4161, "step": 80820 }, { "epoch": 3.478154690331431, "eval_loss": 2.4302024841308594, "eval_runtime": 21.996, "eval_samples_per_second": 227.314, "eval_steps_per_second": 1.227, "step": 80823 }, { "epoch": 3.4785340491472896, "grad_norm": 0.15820740163326263, "learning_rate": 6.18063716053397e-05, "loss": 1.4155, "step": 80830 }, { "epoch": 3.4790759903128015, "grad_norm": 0.20332497358322144, "learning_rate": 6.179071521203299e-05, "loss": 1.4101, "step": 80840 }, { "epoch": 3.4796179314783138, "grad_norm": 0.23405461013317108, "learning_rate": 6.177505797752195e-05, "loss": 1.4245, "step": 80850 }, { "epoch": 3.480159872643826, "grad_norm": 0.36314642429351807, "learning_rate": 6.175939990374613e-05, "loss": 1.415, "step": 80860 }, { "epoch": 3.4807018138093384, "grad_norm": 0.3759239614009857, "learning_rate": 6.174374099264522e-05, "loss": 1.4123, "step": 80870 }, { "epoch": 3.4812437549748507, "grad_norm": 0.18687903881072998, "learning_rate": 6.172808124615895e-05, "loss": 1.4108, "step": 80880 }, { "epoch": 3.4817856961403626, "grad_norm": 0.27162638306617737, "learning_rate": 6.171242066622718e-05, "loss": 1.4069, "step": 80890 }, { "epoch": 3.482327637305875, "grad_norm": 0.16891661286354065, "learning_rate": 6.16967592547899e-05, "loss": 1.4246, "step": 80900 }, { "epoch": 3.4828695784713872, "grad_norm": 0.24053658545017242, "learning_rate": 6.168109701378712e-05, "loss": 1.4098, "step": 80910 }, { "epoch": 3.4828695784713872, "eval_loss": 2.433274269104004, "eval_runtime": 22.0653, "eval_samples_per_second": 226.601, "eval_steps_per_second": 1.224, "step": 80910 }, { "epoch": 3.4834115196368995, "grad_norm": 0.16287986934185028, "learning_rate": 6.166543394515906e-05, "loss": 1.4179, "step": 80920 }, { "epoch": 3.483953460802412, "grad_norm": 0.2310248166322708, "learning_rate": 6.1649770050846e-05, "loss": 1.4039, "step": 80930 }, { "epoch": 3.4844954019679237, "grad_norm": 0.1746436357498169, "learning_rate": 6.16341053327883e-05, "loss": 1.4139, "step": 80940 }, { "epoch": 3.485037343133436, "grad_norm": 0.17406900227069855, "learning_rate": 6.16184397929264e-05, "loss": 1.4117, "step": 80950 }, { "epoch": 3.4855792842989484, "grad_norm": 0.17554210126399994, "learning_rate": 6.160277343320095e-05, "loss": 1.4133, "step": 80960 }, { "epoch": 3.4861212254644607, "grad_norm": 0.1602802872657776, "learning_rate": 6.158710625555257e-05, "loss": 1.4056, "step": 80970 }, { "epoch": 3.4866631666299726, "grad_norm": 0.19458161294460297, "learning_rate": 6.157143826192207e-05, "loss": 1.4136, "step": 80980 }, { "epoch": 3.487205107795485, "grad_norm": 0.23610615730285645, "learning_rate": 6.155576945425032e-05, "loss": 1.414, "step": 80990 }, { "epoch": 3.4875844666113434, "eval_loss": 2.429168701171875, "eval_runtime": 22.2262, "eval_samples_per_second": 224.96, "eval_steps_per_second": 1.215, "step": 80997 }, { "epoch": 3.487747048960997, "grad_norm": 0.18065819144248962, "learning_rate": 6.154009983447834e-05, "loss": 1.4076, "step": 81000 }, { "epoch": 4.000541941165512, "grad_norm": 0.2180778533220291, "learning_rate": 6.152442940454717e-05, "loss": 1.3974, "step": 81010 }, { "epoch": 4.001083882331025, "grad_norm": 0.20014463365077972, "learning_rate": 6.1508758166398e-05, "loss": 1.4054, "step": 81020 }, { "epoch": 4.0016258234965365, "grad_norm": 0.1851683109998703, "learning_rate": 6.149308612197213e-05, "loss": 1.4141, "step": 81030 }, { "epoch": 4.002167764662049, "grad_norm": 0.1777936816215515, "learning_rate": 6.147741327321095e-05, "loss": 1.4072, "step": 81040 }, { "epoch": 4.002709705827561, "grad_norm": 0.17153799533843994, "learning_rate": 6.146173962205594e-05, "loss": 1.4019, "step": 81050 }, { "epoch": 4.003251646993073, "grad_norm": 0.18811866641044617, "learning_rate": 6.144606517044869e-05, "loss": 1.4112, "step": 81060 }, { "epoch": 4.003793588158586, "grad_norm": 0.23102541267871857, "learning_rate": 6.143038992033083e-05, "loss": 1.4195, "step": 81070 }, { "epoch": 4.004335529324098, "grad_norm": 0.2683873176574707, "learning_rate": 6.141471387364423e-05, "loss": 1.3947, "step": 81080 }, { "epoch": 4.004552305790303, "eval_loss": 2.4371650218963623, "eval_runtime": 22.4585, "eval_samples_per_second": 222.633, "eval_steps_per_second": 1.202, "step": 81084 }, { "epoch": 4.00487747048961, "grad_norm": 0.15493124723434448, "learning_rate": 6.139903703233069e-05, "loss": 1.4126, "step": 81090 }, { "epoch": 4.005419411655122, "grad_norm": 0.17667770385742188, "learning_rate": 6.138335939833225e-05, "loss": 1.403, "step": 81100 }, { "epoch": 4.005961352820634, "grad_norm": 0.1961461454629898, "learning_rate": 6.136768097359096e-05, "loss": 1.4091, "step": 81110 }, { "epoch": 4.006503293986147, "grad_norm": 0.15184424817562103, "learning_rate": 6.135200176004897e-05, "loss": 1.4149, "step": 81120 }, { "epoch": 4.007045235151659, "grad_norm": 0.18130755424499512, "learning_rate": 6.133632175964861e-05, "loss": 1.4037, "step": 81130 }, { "epoch": 4.007587176317171, "grad_norm": 0.1635865420103073, "learning_rate": 6.132064097433222e-05, "loss": 1.4111, "step": 81140 }, { "epoch": 4.008129117482683, "grad_norm": 0.22441507875919342, "learning_rate": 6.130495940604225e-05, "loss": 1.4135, "step": 81150 }, { "epoch": 4.008671058648195, "grad_norm": 0.30155861377716064, "learning_rate": 6.128927705672129e-05, "loss": 1.4027, "step": 81160 }, { "epoch": 4.009212999813708, "grad_norm": 0.3162151873111725, "learning_rate": 6.127359392831198e-05, "loss": 1.4073, "step": 81170 }, { "epoch": 4.009267193930259, "eval_loss": 2.4355599880218506, "eval_runtime": 21.9746, "eval_samples_per_second": 227.536, "eval_steps_per_second": 1.229, "step": 81171 }, { "epoch": 4.00975494097922, "grad_norm": 0.22945579886436462, "learning_rate": 6.12579100227571e-05, "loss": 1.4077, "step": 81180 }, { "epoch": 4.010296882144732, "grad_norm": 0.25422438979148865, "learning_rate": 6.124222534199952e-05, "loss": 1.4, "step": 81190 }, { "epoch": 4.0108388233102445, "grad_norm": 0.21404403448104858, "learning_rate": 6.122653988798214e-05, "loss": 1.4092, "step": 81200 }, { "epoch": 4.011380764475756, "grad_norm": 0.17690123617649078, "learning_rate": 6.121085366264802e-05, "loss": 1.4084, "step": 81210 }, { "epoch": 4.011922705641269, "grad_norm": 0.18206678330898285, "learning_rate": 6.119516666794034e-05, "loss": 1.4086, "step": 81220 }, { "epoch": 4.012464646806781, "grad_norm": 0.20577572286128998, "learning_rate": 6.11794789058023e-05, "loss": 1.4019, "step": 81230 }, { "epoch": 4.013006587972293, "grad_norm": 0.16592557728290558, "learning_rate": 6.116379037817724e-05, "loss": 1.4091, "step": 81240 }, { "epoch": 4.013548529137806, "grad_norm": 0.1908210813999176, "learning_rate": 6.114810108700857e-05, "loss": 1.4164, "step": 81250 }, { "epoch": 4.013982082070215, "eval_loss": 2.4322612285614014, "eval_runtime": 21.9747, "eval_samples_per_second": 227.534, "eval_steps_per_second": 1.229, "step": 81258 }, { "epoch": 4.014090470303318, "grad_norm": 0.1619943529367447, "learning_rate": 6.113241103423986e-05, "loss": 1.4089, "step": 81260 }, { "epoch": 4.01463241146883, "grad_norm": 0.17974944412708282, "learning_rate": 6.11167202218147e-05, "loss": 1.4134, "step": 81270 }, { "epoch": 4.015174352634342, "grad_norm": 0.1860845535993576, "learning_rate": 6.110102865167677e-05, "loss": 1.4062, "step": 81280 }, { "epoch": 4.015716293799854, "grad_norm": 0.16427412629127502, "learning_rate": 6.108533632576992e-05, "loss": 1.3899, "step": 81290 }, { "epoch": 4.016258234965367, "grad_norm": 0.17551249265670776, "learning_rate": 6.106964324603801e-05, "loss": 1.4103, "step": 81300 }, { "epoch": 4.016800176130879, "grad_norm": 0.20582075417041779, "learning_rate": 6.105394941442503e-05, "loss": 1.4072, "step": 81310 }, { "epoch": 4.017342117296391, "grad_norm": 0.1872161328792572, "learning_rate": 6.10382548328751e-05, "loss": 1.4127, "step": 81320 }, { "epoch": 4.017884058461903, "grad_norm": 0.18607717752456665, "learning_rate": 6.102255950333239e-05, "loss": 1.4179, "step": 81330 }, { "epoch": 4.018425999627415, "grad_norm": 0.16646206378936768, "learning_rate": 6.100686342774115e-05, "loss": 1.4067, "step": 81340 }, { "epoch": 4.018696970210171, "eval_loss": 2.431658983230591, "eval_runtime": 21.9738, "eval_samples_per_second": 227.544, "eval_steps_per_second": 1.229, "step": 81345 }, { "epoch": 4.018967940792928, "grad_norm": 0.33718639612197876, "learning_rate": 6.0991166608045745e-05, "loss": 1.4178, "step": 81350 }, { "epoch": 4.01950988195844, "grad_norm": 0.32710033655166626, "learning_rate": 6.097546904619061e-05, "loss": 1.4108, "step": 81360 }, { "epoch": 4.020051823123952, "grad_norm": 0.21620182693004608, "learning_rate": 6.095977074412034e-05, "loss": 1.4062, "step": 81370 }, { "epoch": 4.0205937642894645, "grad_norm": 0.2620053291320801, "learning_rate": 6.0944071703779536e-05, "loss": 1.4072, "step": 81380 }, { "epoch": 4.021135705454976, "grad_norm": 0.16901114583015442, "learning_rate": 6.092837192711294e-05, "loss": 1.4053, "step": 81390 }, { "epoch": 4.021677646620489, "grad_norm": 0.2617866098880768, "learning_rate": 6.091267141606537e-05, "loss": 1.4131, "step": 81400 }, { "epoch": 4.022219587786001, "grad_norm": 0.218822181224823, "learning_rate": 6.089697017258171e-05, "loss": 1.4063, "step": 81410 }, { "epoch": 4.022761528951513, "grad_norm": 0.167058527469635, "learning_rate": 6.088126819860701e-05, "loss": 1.4009, "step": 81420 }, { "epoch": 4.023303470117026, "grad_norm": 0.1802741438150406, "learning_rate": 6.086556549608634e-05, "loss": 1.4077, "step": 81430 }, { "epoch": 4.023411858350128, "eval_loss": 2.4219465255737305, "eval_runtime": 21.9753, "eval_samples_per_second": 227.528, "eval_steps_per_second": 1.229, "step": 81432 }, { "epoch": 4.0238454112825375, "grad_norm": 0.1666201651096344, "learning_rate": 6.084986206696487e-05, "loss": 1.4031, "step": 81440 }, { "epoch": 4.02438735244805, "grad_norm": 0.19865860044956207, "learning_rate": 6.08341579131879e-05, "loss": 1.3978, "step": 81450 }, { "epoch": 4.024929293613562, "grad_norm": 0.1948530673980713, "learning_rate": 6.081845303670077e-05, "loss": 1.4148, "step": 81460 }, { "epoch": 4.025471234779074, "grad_norm": 0.1879536211490631, "learning_rate": 6.0802747439448935e-05, "loss": 1.411, "step": 81470 }, { "epoch": 4.026013175944587, "grad_norm": 0.20297576487064362, "learning_rate": 6.078704112337795e-05, "loss": 1.406, "step": 81480 }, { "epoch": 4.026555117110099, "grad_norm": 0.17799319326877594, "learning_rate": 6.077133409043342e-05, "loss": 1.4034, "step": 81490 }, { "epoch": 4.027097058275611, "grad_norm": 0.23970834910869598, "learning_rate": 6.075562634256109e-05, "loss": 1.4028, "step": 81500 }, { "epoch": 4.027638999441123, "grad_norm": 0.28241363167762756, "learning_rate": 6.073991788170675e-05, "loss": 1.4104, "step": 81510 }, { "epoch": 4.028126746490084, "eval_loss": 2.4167044162750244, "eval_runtime": 21.9669, "eval_samples_per_second": 227.615, "eval_steps_per_second": 1.229, "step": 81519 }, { "epoch": 4.028180940606635, "grad_norm": 0.22441983222961426, "learning_rate": 6.072420870981631e-05, "loss": 1.4092, "step": 81520 }, { "epoch": 4.028722881772148, "grad_norm": 0.2237686812877655, "learning_rate": 6.070849882883576e-05, "loss": 1.4094, "step": 81530 }, { "epoch": 4.02926482293766, "grad_norm": 0.17908205091953278, "learning_rate": 6.069278824071114e-05, "loss": 1.4032, "step": 81540 }, { "epoch": 4.029806764103172, "grad_norm": 0.18530701100826263, "learning_rate": 6.0677076947388635e-05, "loss": 1.4038, "step": 81550 }, { "epoch": 4.030348705268684, "grad_norm": 0.17356233298778534, "learning_rate": 6.066136495081448e-05, "loss": 1.4042, "step": 81560 }, { "epoch": 4.030890646434196, "grad_norm": 0.1672087162733078, "learning_rate": 6.0645652252935005e-05, "loss": 1.408, "step": 81570 }, { "epoch": 4.031432587599709, "grad_norm": 0.24931927025318146, "learning_rate": 6.0629938855696655e-05, "loss": 1.4034, "step": 81580 }, { "epoch": 4.031974528765221, "grad_norm": 0.297122061252594, "learning_rate": 6.061422476104592e-05, "loss": 1.4126, "step": 81590 }, { "epoch": 4.032516469930733, "grad_norm": 0.1879911869764328, "learning_rate": 6.0598509970929396e-05, "loss": 1.4148, "step": 81600 }, { "epoch": 4.03284163463004, "eval_loss": 2.4190409183502197, "eval_runtime": 21.9863, "eval_samples_per_second": 227.414, "eval_steps_per_second": 1.228, "step": 81606 }, { "epoch": 4.0330584110962455, "grad_norm": 0.16557785868644714, "learning_rate": 6.058279448729376e-05, "loss": 1.3992, "step": 81610 }, { "epoch": 4.033600352261757, "grad_norm": 0.21484564244747162, "learning_rate": 6.056707831208579e-05, "loss": 1.4228, "step": 81620 }, { "epoch": 4.03414229342727, "grad_norm": 0.21425080299377441, "learning_rate": 6.055136144725232e-05, "loss": 1.4058, "step": 81630 }, { "epoch": 4.034684234592782, "grad_norm": 0.16896581649780273, "learning_rate": 6.0535643894740304e-05, "loss": 1.4159, "step": 81640 }, { "epoch": 4.035226175758294, "grad_norm": 0.20105527341365814, "learning_rate": 6.0519925656496746e-05, "loss": 1.4198, "step": 81650 }, { "epoch": 4.035768116923807, "grad_norm": 0.18627329170703888, "learning_rate": 6.050420673446876e-05, "loss": 1.3985, "step": 81660 }, { "epoch": 4.0363100580893185, "grad_norm": 0.1602090746164322, "learning_rate": 6.048848713060354e-05, "loss": 1.4063, "step": 81670 }, { "epoch": 4.036851999254831, "grad_norm": 0.27577728033065796, "learning_rate": 6.0472766846848384e-05, "loss": 1.4078, "step": 81680 }, { "epoch": 4.037393940420343, "grad_norm": 0.24706071615219116, "learning_rate": 6.045704588515062e-05, "loss": 1.4088, "step": 81690 }, { "epoch": 4.0375565227699965, "eval_loss": 2.4332613945007324, "eval_runtime": 21.9896, "eval_samples_per_second": 227.38, "eval_steps_per_second": 1.228, "step": 81693 }, { "epoch": 4.037935881585855, "grad_norm": 0.18013040721416473, "learning_rate": 6.0441324247457686e-05, "loss": 1.4039, "step": 81700 }, { "epoch": 4.038477822751368, "grad_norm": 0.2291117012500763, "learning_rate": 6.042560193571714e-05, "loss": 1.4175, "step": 81710 }, { "epoch": 4.03901976391688, "grad_norm": 0.24475926160812378, "learning_rate": 6.04098789518766e-05, "loss": 1.4029, "step": 81720 }, { "epoch": 4.039561705082392, "grad_norm": 0.31280967593193054, "learning_rate": 6.039415529788372e-05, "loss": 1.4068, "step": 81730 }, { "epoch": 4.040103646247904, "grad_norm": 0.274687796831131, "learning_rate": 6.037843097568631e-05, "loss": 1.4094, "step": 81740 }, { "epoch": 4.040645587413416, "grad_norm": 0.1743050217628479, "learning_rate": 6.036270598723222e-05, "loss": 1.4009, "step": 81750 }, { "epoch": 4.041187528578929, "grad_norm": 0.17927388846874237, "learning_rate": 6.0346980334469386e-05, "loss": 1.4118, "step": 81760 }, { "epoch": 4.041729469744441, "grad_norm": 0.2120426744222641, "learning_rate": 6.0331254019345864e-05, "loss": 1.3993, "step": 81770 }, { "epoch": 4.042271410909953, "grad_norm": 0.2474319189786911, "learning_rate": 6.0315527043809726e-05, "loss": 1.4069, "step": 81780 }, { "epoch": 4.042271410909953, "eval_loss": 2.4317309856414795, "eval_runtime": 21.9711, "eval_samples_per_second": 227.572, "eval_steps_per_second": 1.229, "step": 81780 }, { "epoch": 4.042813352075465, "grad_norm": 0.22434458136558533, "learning_rate": 6.0299799409809175e-05, "loss": 1.4062, "step": 81790 }, { "epoch": 4.043355293240977, "grad_norm": 0.16757360100746155, "learning_rate": 6.028407111929248e-05, "loss": 1.4082, "step": 81800 }, { "epoch": 4.04389723440649, "grad_norm": 0.23129674792289734, "learning_rate": 6.0268342174207994e-05, "loss": 1.3953, "step": 81810 }, { "epoch": 4.044439175572002, "grad_norm": 0.21264296770095825, "learning_rate": 6.025261257650416e-05, "loss": 1.4025, "step": 81820 }, { "epoch": 4.044981116737514, "grad_norm": 0.20318235456943512, "learning_rate": 6.0236882328129495e-05, "loss": 1.4144, "step": 81830 }, { "epoch": 4.045523057903027, "grad_norm": 0.15029390156269073, "learning_rate": 6.022115143103256e-05, "loss": 1.4105, "step": 81840 }, { "epoch": 4.0460649990685384, "grad_norm": 0.23690800368785858, "learning_rate": 6.020541988716207e-05, "loss": 1.4113, "step": 81850 }, { "epoch": 4.046606940234051, "grad_norm": 0.16659775376319885, "learning_rate": 6.018968769846676e-05, "loss": 1.4208, "step": 81860 }, { "epoch": 4.04698629904991, "eval_loss": 2.436213731765747, "eval_runtime": 21.9755, "eval_samples_per_second": 227.526, "eval_steps_per_second": 1.229, "step": 81867 }, { "epoch": 4.047148881399563, "grad_norm": 0.25907376408576965, "learning_rate": 6.017395486689547e-05, "loss": 1.4063, "step": 81870 }, { "epoch": 4.047690822565075, "grad_norm": 0.15979789197444916, "learning_rate": 6.015822139439712e-05, "loss": 1.4152, "step": 81880 }, { "epoch": 4.048232763730588, "grad_norm": 0.1618310660123825, "learning_rate": 6.014248728292068e-05, "loss": 1.4019, "step": 81890 }, { "epoch": 4.0487747048961, "grad_norm": 0.22831520438194275, "learning_rate": 6.0126752534415255e-05, "loss": 1.4113, "step": 81900 }, { "epoch": 4.049316646061612, "grad_norm": 0.24901092052459717, "learning_rate": 6.011101715082997e-05, "loss": 1.4011, "step": 81910 }, { "epoch": 4.049858587227124, "grad_norm": 0.18291537463665009, "learning_rate": 6.009528113411409e-05, "loss": 1.4053, "step": 81920 }, { "epoch": 4.050400528392636, "grad_norm": 0.3508703410625458, "learning_rate": 6.007954448621691e-05, "loss": 1.4052, "step": 81930 }, { "epoch": 4.050942469558149, "grad_norm": 0.22802935540676117, "learning_rate": 6.0063807209087784e-05, "loss": 1.4107, "step": 81940 }, { "epoch": 4.051484410723661, "grad_norm": 0.3096991777420044, "learning_rate": 6.004806930467621e-05, "loss": 1.4167, "step": 81950 }, { "epoch": 4.051701187189866, "eval_loss": 2.434500217437744, "eval_runtime": 21.9752, "eval_samples_per_second": 227.529, "eval_steps_per_second": 1.229, "step": 81954 }, { "epoch": 4.052026351889173, "grad_norm": 0.2774633765220642, "learning_rate": 6.0032330774931754e-05, "loss": 1.4139, "step": 81960 }, { "epoch": 4.052568293054685, "grad_norm": 0.16032350063323975, "learning_rate": 6.0016591621803986e-05, "loss": 1.4121, "step": 81970 }, { "epoch": 4.053110234220197, "grad_norm": 0.26607418060302734, "learning_rate": 6.000085184724265e-05, "loss": 1.4051, "step": 81980 }, { "epoch": 4.05365217538571, "grad_norm": 0.26877331733703613, "learning_rate": 5.998511145319748e-05, "loss": 1.4075, "step": 81990 }, { "epoch": 4.054194116551222, "grad_norm": 0.18772627413272858, "learning_rate": 5.996937044161835e-05, "loss": 1.41, "step": 82000 }, { "epoch": 4.054736057716734, "grad_norm": 0.17956435680389404, "learning_rate": 5.9953628814455184e-05, "loss": 1.4223, "step": 82010 }, { "epoch": 4.0552779988822465, "grad_norm": 0.38991832733154297, "learning_rate": 5.9937886573657986e-05, "loss": 1.4082, "step": 82020 }, { "epoch": 4.055819940047758, "grad_norm": 0.20310132205486298, "learning_rate": 5.9922143721176846e-05, "loss": 1.4081, "step": 82030 }, { "epoch": 4.056361881213271, "grad_norm": 0.3149273693561554, "learning_rate": 5.990640025896189e-05, "loss": 1.4048, "step": 82040 }, { "epoch": 4.056416075329822, "eval_loss": 2.435964822769165, "eval_runtime": 21.972, "eval_samples_per_second": 227.562, "eval_steps_per_second": 1.229, "step": 82041 }, { "epoch": 4.056903822378783, "grad_norm": 0.16741731762886047, "learning_rate": 5.989065618896339e-05, "loss": 1.4019, "step": 82050 }, { "epoch": 4.057445763544295, "grad_norm": 0.19802165031433105, "learning_rate": 5.987491151313164e-05, "loss": 1.4076, "step": 82060 }, { "epoch": 4.057987704709808, "grad_norm": 0.17228524386882782, "learning_rate": 5.9859166233417016e-05, "loss": 1.4031, "step": 82070 }, { "epoch": 4.0585296458753195, "grad_norm": 0.21210512518882751, "learning_rate": 5.984342035176996e-05, "loss": 1.4075, "step": 82080 }, { "epoch": 4.059071587040832, "grad_norm": 0.2215586155653, "learning_rate": 5.982767387014102e-05, "loss": 1.3966, "step": 82090 }, { "epoch": 4.059613528206344, "grad_norm": 0.19042372703552246, "learning_rate": 5.98119267904808e-05, "loss": 1.4053, "step": 82100 }, { "epoch": 4.060155469371856, "grad_norm": 0.2051682025194168, "learning_rate": 5.979617911473999e-05, "loss": 1.4144, "step": 82110 }, { "epoch": 4.060697410537369, "grad_norm": 0.20389145612716675, "learning_rate": 5.978043084486934e-05, "loss": 1.4136, "step": 82120 }, { "epoch": 4.061130963469778, "eval_loss": 2.4284167289733887, "eval_runtime": 21.9746, "eval_samples_per_second": 227.536, "eval_steps_per_second": 1.229, "step": 82128 }, { "epoch": 4.061239351702881, "grad_norm": 0.18237143754959106, "learning_rate": 5.9764681982819656e-05, "loss": 1.4024, "step": 82130 }, { "epoch": 4.061781292868393, "grad_norm": 0.17889219522476196, "learning_rate": 5.974893253054186e-05, "loss": 1.4037, "step": 82140 }, { "epoch": 4.062323234033905, "grad_norm": 0.23920495808124542, "learning_rate": 5.973318248998693e-05, "loss": 1.4037, "step": 82150 }, { "epoch": 4.062865175199417, "grad_norm": 0.18914079666137695, "learning_rate": 5.971743186310589e-05, "loss": 1.41, "step": 82160 }, { "epoch": 4.06340711636493, "grad_norm": 0.21431829035282135, "learning_rate": 5.970168065184987e-05, "loss": 1.4089, "step": 82170 }, { "epoch": 4.063949057530442, "grad_norm": 0.16956184804439545, "learning_rate": 5.968592885817007e-05, "loss": 1.409, "step": 82180 }, { "epoch": 4.064490998695954, "grad_norm": 0.23388327658176422, "learning_rate": 5.967017648401775e-05, "loss": 1.4165, "step": 82190 }, { "epoch": 4.065032939861466, "grad_norm": 0.22132398188114166, "learning_rate": 5.965442353134424e-05, "loss": 1.3967, "step": 82200 }, { "epoch": 4.065574881026978, "grad_norm": 0.32104483246803284, "learning_rate": 5.963867000210094e-05, "loss": 1.4003, "step": 82210 }, { "epoch": 4.065845851609734, "eval_loss": 2.434609889984131, "eval_runtime": 21.9734, "eval_samples_per_second": 227.548, "eval_steps_per_second": 1.229, "step": 82215 }, { "epoch": 4.066116822192491, "grad_norm": 0.27653661370277405, "learning_rate": 5.962291589823935e-05, "loss": 1.4099, "step": 82220 }, { "epoch": 4.066658763358003, "grad_norm": 0.18210026621818542, "learning_rate": 5.960716122171102e-05, "loss": 1.4126, "step": 82230 }, { "epoch": 4.067200704523515, "grad_norm": 0.20089492201805115, "learning_rate": 5.959140597446753e-05, "loss": 1.4042, "step": 82240 }, { "epoch": 4.0677426456890275, "grad_norm": 0.20778271555900574, "learning_rate": 5.957565015846063e-05, "loss": 1.4037, "step": 82250 }, { "epoch": 4.068284586854539, "grad_norm": 0.31842777132987976, "learning_rate": 5.955989377564203e-05, "loss": 1.4034, "step": 82260 }, { "epoch": 4.068826528020052, "grad_norm": 0.1979900300502777, "learning_rate": 5.95441368279636e-05, "loss": 1.391, "step": 82270 }, { "epoch": 4.069368469185564, "grad_norm": 0.1853083223104477, "learning_rate": 5.952837931737723e-05, "loss": 1.417, "step": 82280 }, { "epoch": 4.069910410351076, "grad_norm": 0.18772120773792267, "learning_rate": 5.951262124583489e-05, "loss": 1.4061, "step": 82290 }, { "epoch": 4.070452351516589, "grad_norm": 0.207114115357399, "learning_rate": 5.9496862615288615e-05, "loss": 1.4112, "step": 82300 }, { "epoch": 4.070560739749691, "eval_loss": 2.4208648204803467, "eval_runtime": 21.9732, "eval_samples_per_second": 227.549, "eval_steps_per_second": 1.229, "step": 82302 }, { "epoch": 4.070994292682101, "grad_norm": 0.21243199706077576, "learning_rate": 5.948110342769054e-05, "loss": 1.4098, "step": 82310 }, { "epoch": 4.071536233847613, "grad_norm": 0.18174535036087036, "learning_rate": 5.946534368499281e-05, "loss": 1.3921, "step": 82320 }, { "epoch": 4.072078175013125, "grad_norm": 0.195078507065773, "learning_rate": 5.944958338914769e-05, "loss": 1.3993, "step": 82330 }, { "epoch": 4.072620116178637, "grad_norm": 0.1927531659603119, "learning_rate": 5.943382254210751e-05, "loss": 1.4068, "step": 82340 }, { "epoch": 4.07316205734415, "grad_norm": 0.3011392652988434, "learning_rate": 5.941806114582464e-05, "loss": 1.4216, "step": 82350 }, { "epoch": 4.073703998509662, "grad_norm": 0.22039276361465454, "learning_rate": 5.940229920225154e-05, "loss": 1.4045, "step": 82360 }, { "epoch": 4.0742459396751745, "grad_norm": 0.1818740963935852, "learning_rate": 5.9386536713340726e-05, "loss": 1.4074, "step": 82370 }, { "epoch": 4.074787880840686, "grad_norm": 0.1722990870475769, "learning_rate": 5.9370773681044776e-05, "loss": 1.4028, "step": 82380 }, { "epoch": 4.075275627889647, "eval_loss": 2.426165819168091, "eval_runtime": 21.9727, "eval_samples_per_second": 227.555, "eval_steps_per_second": 1.229, "step": 82389 }, { "epoch": 4.075329822006198, "grad_norm": 0.17619258165359497, "learning_rate": 5.935501010731637e-05, "loss": 1.4043, "step": 82390 }, { "epoch": 4.075871763171711, "grad_norm": 0.1646595150232315, "learning_rate": 5.93392459941082e-05, "loss": 1.4092, "step": 82400 }, { "epoch": 4.076413704337223, "grad_norm": 0.2252381443977356, "learning_rate": 5.932348134337311e-05, "loss": 1.4022, "step": 82410 }, { "epoch": 4.076955645502735, "grad_norm": 0.23596051335334778, "learning_rate": 5.9307716157063895e-05, "loss": 1.4146, "step": 82420 }, { "epoch": 4.0774975866682475, "grad_norm": 0.20619599521160126, "learning_rate": 5.9291950437133515e-05, "loss": 1.3984, "step": 82430 }, { "epoch": 4.078039527833759, "grad_norm": 0.2613559663295746, "learning_rate": 5.927618418553495e-05, "loss": 1.4036, "step": 82440 }, { "epoch": 4.078581468999272, "grad_norm": 0.26218506693840027, "learning_rate": 5.9260417404221245e-05, "loss": 1.4058, "step": 82450 }, { "epoch": 4.079123410164784, "grad_norm": 0.17391842603683472, "learning_rate": 5.924465009514554e-05, "loss": 1.4002, "step": 82460 }, { "epoch": 4.079665351330296, "grad_norm": 0.2079029232263565, "learning_rate": 5.9228882260261e-05, "loss": 1.4079, "step": 82470 }, { "epoch": 4.0799905160296035, "eval_loss": 2.4345686435699463, "eval_runtime": 21.9671, "eval_samples_per_second": 227.614, "eval_steps_per_second": 1.229, "step": 82476 }, { "epoch": 4.080207292495809, "grad_norm": 0.16714847087860107, "learning_rate": 5.9213113901520875e-05, "loss": 1.4065, "step": 82480 }, { "epoch": 4.0807492336613205, "grad_norm": 0.19766834378242493, "learning_rate": 5.9197345020878515e-05, "loss": 1.4094, "step": 82490 }, { "epoch": 4.081291174826833, "grad_norm": 0.26485520601272583, "learning_rate": 5.918157562028726e-05, "loss": 1.4029, "step": 82500 }, { "epoch": 4.081833115992345, "grad_norm": 0.25532785058021545, "learning_rate": 5.9165805701700595e-05, "loss": 1.3951, "step": 82510 }, { "epoch": 4.082375057157857, "grad_norm": 0.28087395429611206, "learning_rate": 5.915003526707198e-05, "loss": 1.4069, "step": 82520 }, { "epoch": 4.08291699832337, "grad_norm": 0.306506872177124, "learning_rate": 5.9134264318355025e-05, "loss": 1.3996, "step": 82530 }, { "epoch": 4.083458939488882, "grad_norm": 0.2110404223203659, "learning_rate": 5.911849285750335e-05, "loss": 1.418, "step": 82540 }, { "epoch": 4.084000880654394, "grad_norm": 0.22511455416679382, "learning_rate": 5.910272088647067e-05, "loss": 1.4133, "step": 82550 }, { "epoch": 4.084542821819906, "grad_norm": 0.17213839292526245, "learning_rate": 5.9086948407210727e-05, "loss": 1.4103, "step": 82560 }, { "epoch": 4.08470540416956, "eval_loss": 2.431107521057129, "eval_runtime": 21.9786, "eval_samples_per_second": 227.494, "eval_steps_per_second": 1.228, "step": 82563 }, { "epoch": 4.085084762985418, "grad_norm": 0.26532599329948425, "learning_rate": 5.907117542167737e-05, "loss": 1.4042, "step": 82570 }, { "epoch": 4.085626704150931, "grad_norm": 0.16096952557563782, "learning_rate": 5.905540193182446e-05, "loss": 1.4038, "step": 82580 }, { "epoch": 4.086168645316443, "grad_norm": 0.1794336587190628, "learning_rate": 5.903962793960599e-05, "loss": 1.4073, "step": 82590 }, { "epoch": 4.086710586481955, "grad_norm": 0.17377042770385742, "learning_rate": 5.902385344697594e-05, "loss": 1.3989, "step": 82600 }, { "epoch": 4.087252527647467, "grad_norm": 0.16201172769069672, "learning_rate": 5.9008078455888394e-05, "loss": 1.3959, "step": 82610 }, { "epoch": 4.087794468812979, "grad_norm": 0.27610713243484497, "learning_rate": 5.899230296829748e-05, "loss": 1.3948, "step": 82620 }, { "epoch": 4.088336409978492, "grad_norm": 0.25443559885025024, "learning_rate": 5.897652698615741e-05, "loss": 1.396, "step": 82630 }, { "epoch": 4.088878351144004, "grad_norm": 0.24116799235343933, "learning_rate": 5.896075051142246e-05, "loss": 1.404, "step": 82640 }, { "epoch": 4.089420292309516, "grad_norm": 0.20483553409576416, "learning_rate": 5.894497354604692e-05, "loss": 1.405, "step": 82650 }, { "epoch": 4.089420292309516, "eval_loss": 2.4280037879943848, "eval_runtime": 21.9762, "eval_samples_per_second": 227.519, "eval_steps_per_second": 1.229, "step": 82650 }, { "epoch": 4.0899622334750285, "grad_norm": 0.26821398735046387, "learning_rate": 5.892919609198517e-05, "loss": 1.4138, "step": 82660 }, { "epoch": 4.09050417464054, "grad_norm": 0.1844187080860138, "learning_rate": 5.891341815119168e-05, "loss": 1.4063, "step": 82670 }, { "epoch": 4.091046115806053, "grad_norm": 0.3293144702911377, "learning_rate": 5.8897639725620956e-05, "loss": 1.4092, "step": 82680 }, { "epoch": 4.091588056971565, "grad_norm": 0.20657339692115784, "learning_rate": 5.888186081722752e-05, "loss": 1.3962, "step": 82690 }, { "epoch": 4.092129998137077, "grad_norm": 0.16567468643188477, "learning_rate": 5.8866081427966036e-05, "loss": 1.4108, "step": 82700 }, { "epoch": 4.09267193930259, "grad_norm": 0.18625999987125397, "learning_rate": 5.885030155979116e-05, "loss": 1.4004, "step": 82710 }, { "epoch": 4.0932138804681015, "grad_norm": 0.16223831474781036, "learning_rate": 5.8834521214657635e-05, "loss": 1.4017, "step": 82720 }, { "epoch": 4.093755821633614, "grad_norm": 0.1961243897676468, "learning_rate": 5.881874039452029e-05, "loss": 1.4039, "step": 82730 }, { "epoch": 4.094135180449473, "eval_loss": 2.4336957931518555, "eval_runtime": 21.6769, "eval_samples_per_second": 230.66, "eval_steps_per_second": 1.246, "step": 82737 }, { "epoch": 4.094297762799126, "grad_norm": 0.20036591589450836, "learning_rate": 5.880295910133394e-05, "loss": 1.4055, "step": 82740 }, { "epoch": 4.094839703964638, "grad_norm": 0.20665618777275085, "learning_rate": 5.8787177337053555e-05, "loss": 1.4043, "step": 82750 }, { "epoch": 4.095381645130151, "grad_norm": 0.18086576461791992, "learning_rate": 5.8771395103634065e-05, "loss": 1.41, "step": 82760 }, { "epoch": 4.095923586295663, "grad_norm": 0.15253938734531403, "learning_rate": 5.8755612403030524e-05, "loss": 1.4057, "step": 82770 }, { "epoch": 4.096465527461175, "grad_norm": 0.17018210887908936, "learning_rate": 5.873982923719804e-05, "loss": 1.4059, "step": 82780 }, { "epoch": 4.097007468626687, "grad_norm": 0.1573159396648407, "learning_rate": 5.872404560809173e-05, "loss": 1.4166, "step": 82790 }, { "epoch": 4.097549409792199, "grad_norm": 0.2046526074409485, "learning_rate": 5.870826151766683e-05, "loss": 1.4067, "step": 82800 }, { "epoch": 4.098091350957712, "grad_norm": 0.1698278784751892, "learning_rate": 5.869247696787857e-05, "loss": 1.4116, "step": 82810 }, { "epoch": 4.098633292123224, "grad_norm": 0.24143236875534058, "learning_rate": 5.867669196068231e-05, "loss": 1.3992, "step": 82820 }, { "epoch": 4.098850068589429, "eval_loss": 2.4264564514160156, "eval_runtime": 21.9736, "eval_samples_per_second": 227.546, "eval_steps_per_second": 1.229, "step": 82824 }, { "epoch": 4.099175233288736, "grad_norm": 0.16382378339767456, "learning_rate": 5.866090649803342e-05, "loss": 1.4064, "step": 82830 }, { "epoch": 4.099717174454248, "grad_norm": 0.18778187036514282, "learning_rate": 5.864512058188733e-05, "loss": 1.398, "step": 82840 }, { "epoch": 4.10025911561976, "grad_norm": 0.1958286166191101, "learning_rate": 5.862933421419952e-05, "loss": 1.4076, "step": 82850 }, { "epoch": 4.100801056785273, "grad_norm": 0.18923796713352203, "learning_rate": 5.861354739692553e-05, "loss": 1.4033, "step": 82860 }, { "epoch": 4.101342997950785, "grad_norm": 0.155537948012352, "learning_rate": 5.859776013202098e-05, "loss": 1.4019, "step": 82870 }, { "epoch": 4.101884939116297, "grad_norm": 0.2815153896808624, "learning_rate": 5.858197242144155e-05, "loss": 1.4076, "step": 82880 }, { "epoch": 4.10242688028181, "grad_norm": 0.20242029428482056, "learning_rate": 5.856618426714291e-05, "loss": 1.3973, "step": 82890 }, { "epoch": 4.1029688214473214, "grad_norm": 0.18118534982204437, "learning_rate": 5.855039567108084e-05, "loss": 1.4034, "step": 82900 }, { "epoch": 4.103510762612834, "grad_norm": 0.18179315328598022, "learning_rate": 5.853460663521117e-05, "loss": 1.3943, "step": 82910 }, { "epoch": 4.103564956729385, "eval_loss": 2.43251633644104, "eval_runtime": 21.9701, "eval_samples_per_second": 227.582, "eval_steps_per_second": 1.229, "step": 82911 }, { "epoch": 4.104052703778346, "grad_norm": 0.3180215358734131, "learning_rate": 5.851881716148979e-05, "loss": 1.3989, "step": 82920 }, { "epoch": 4.104594644943858, "grad_norm": 0.22346888482570648, "learning_rate": 5.850302725187261e-05, "loss": 1.4041, "step": 82930 }, { "epoch": 4.105136586109371, "grad_norm": 0.1704285740852356, "learning_rate": 5.8487236908315635e-05, "loss": 1.4034, "step": 82940 }, { "epoch": 4.105678527274883, "grad_norm": 0.1817091554403305, "learning_rate": 5.8471446132774864e-05, "loss": 1.402, "step": 82950 }, { "epoch": 4.106220468440395, "grad_norm": 0.16778716444969177, "learning_rate": 5.8455654927206436e-05, "loss": 1.399, "step": 82960 }, { "epoch": 4.106762409605907, "grad_norm": 0.20462718605995178, "learning_rate": 5.8439863293566476e-05, "loss": 1.4, "step": 82970 }, { "epoch": 4.107304350771419, "grad_norm": 0.16164131462574005, "learning_rate": 5.842407123381118e-05, "loss": 1.4022, "step": 82980 }, { "epoch": 4.107846291936932, "grad_norm": 0.17389550805091858, "learning_rate": 5.8408278749896816e-05, "loss": 1.4129, "step": 82990 }, { "epoch": 4.108279844869341, "eval_loss": 2.4385242462158203, "eval_runtime": 21.9887, "eval_samples_per_second": 227.39, "eval_steps_per_second": 1.228, "step": 82998 }, { "epoch": 4.108388233102444, "grad_norm": 0.3119494616985321, "learning_rate": 5.8392485843779676e-05, "loss": 1.41, "step": 83000 }, { "epoch": 4.108930174267956, "grad_norm": 0.21673525869846344, "learning_rate": 5.83766925174161e-05, "loss": 1.4092, "step": 83010 }, { "epoch": 4.109472115433468, "grad_norm": 0.17000888288021088, "learning_rate": 5.836089877276254e-05, "loss": 1.4118, "step": 83020 }, { "epoch": 4.11001405659898, "grad_norm": 0.1890566647052765, "learning_rate": 5.83451046117754e-05, "loss": 1.4017, "step": 83030 }, { "epoch": 4.110555997764493, "grad_norm": 0.21135641634464264, "learning_rate": 5.832931003641127e-05, "loss": 1.393, "step": 83040 }, { "epoch": 4.111097938930005, "grad_norm": 0.15873472392559052, "learning_rate": 5.8313515048626634e-05, "loss": 1.4016, "step": 83050 }, { "epoch": 4.111639880095517, "grad_norm": 0.24823595583438873, "learning_rate": 5.8297719650378136e-05, "loss": 1.41, "step": 83060 }, { "epoch": 4.1121818212610295, "grad_norm": 0.18484194576740265, "learning_rate": 5.828192384362245e-05, "loss": 1.4021, "step": 83070 }, { "epoch": 4.112723762426541, "grad_norm": 0.26047736406326294, "learning_rate": 5.826612763031632e-05, "loss": 1.4074, "step": 83080 }, { "epoch": 4.112994733009297, "eval_loss": 2.4378349781036377, "eval_runtime": 21.9749, "eval_samples_per_second": 227.532, "eval_steps_per_second": 1.229, "step": 83085 }, { "epoch": 4.113265703592054, "grad_norm": 0.38496696949005127, "learning_rate": 5.825033101241644e-05, "loss": 1.4136, "step": 83090 }, { "epoch": 4.113807644757566, "grad_norm": 0.19482018053531647, "learning_rate": 5.823453399187967e-05, "loss": 1.4096, "step": 83100 }, { "epoch": 4.114349585923078, "grad_norm": 0.17476175725460052, "learning_rate": 5.821873657066288e-05, "loss": 1.4118, "step": 83110 }, { "epoch": 4.114891527088591, "grad_norm": 0.19466517865657806, "learning_rate": 5.820293875072298e-05, "loss": 1.4088, "step": 83120 }, { "epoch": 4.1154334682541025, "grad_norm": 0.20809796452522278, "learning_rate": 5.818714053401695e-05, "loss": 1.4084, "step": 83130 }, { "epoch": 4.115975409419615, "grad_norm": 0.16254140436649323, "learning_rate": 5.8171341922501755e-05, "loss": 1.4097, "step": 83140 }, { "epoch": 4.116517350585127, "grad_norm": 0.15851789712905884, "learning_rate": 5.8155542918134496e-05, "loss": 1.4123, "step": 83150 }, { "epoch": 4.117059291750639, "grad_norm": 0.29275140166282654, "learning_rate": 5.8139743522872306e-05, "loss": 1.4053, "step": 83160 }, { "epoch": 4.117601232916152, "grad_norm": 0.2825930714607239, "learning_rate": 5.812394373867229e-05, "loss": 1.4093, "step": 83170 }, { "epoch": 4.117709621149254, "eval_loss": 2.4347915649414062, "eval_runtime": 21.9741, "eval_samples_per_second": 227.54, "eval_steps_per_second": 1.229, "step": 83172 }, { "epoch": 4.118143174081664, "grad_norm": 0.24042131006717682, "learning_rate": 5.81081435674917e-05, "loss": 1.4135, "step": 83180 }, { "epoch": 4.118685115247176, "grad_norm": 0.22210003435611725, "learning_rate": 5.809234301128779e-05, "loss": 1.4164, "step": 83190 }, { "epoch": 4.119227056412688, "grad_norm": 0.15581659972667694, "learning_rate": 5.807654207201784e-05, "loss": 1.3988, "step": 83200 }, { "epoch": 4.1197689975782, "grad_norm": 0.19380082190036774, "learning_rate": 5.8060740751639217e-05, "loss": 1.4111, "step": 83210 }, { "epoch": 4.120310938743713, "grad_norm": 0.1942562609910965, "learning_rate": 5.8044939052109315e-05, "loss": 1.4082, "step": 83220 }, { "epoch": 4.120852879909225, "grad_norm": 0.25676843523979187, "learning_rate": 5.802913697538559e-05, "loss": 1.4086, "step": 83230 }, { "epoch": 4.121394821074737, "grad_norm": 0.21797262132167816, "learning_rate": 5.8013334523425536e-05, "loss": 1.408, "step": 83240 }, { "epoch": 4.121936762240249, "grad_norm": 0.18944242596626282, "learning_rate": 5.799753169818666e-05, "loss": 1.4046, "step": 83250 }, { "epoch": 4.1224245092892104, "eval_loss": 2.430614709854126, "eval_runtime": 21.9688, "eval_samples_per_second": 227.596, "eval_steps_per_second": 1.229, "step": 83259 }, { "epoch": 4.122478703405761, "grad_norm": 0.24879737198352814, "learning_rate": 5.798172850162658e-05, "loss": 1.4057, "step": 83260 }, { "epoch": 4.123020644571274, "grad_norm": 0.16706986725330353, "learning_rate": 5.796592493570292e-05, "loss": 1.4101, "step": 83270 }, { "epoch": 4.123562585736786, "grad_norm": 0.16859695315361023, "learning_rate": 5.795012100237337e-05, "loss": 1.3993, "step": 83280 }, { "epoch": 4.124104526902298, "grad_norm": 0.17874988913536072, "learning_rate": 5.7934316703595625e-05, "loss": 1.4068, "step": 83290 }, { "epoch": 4.1246464680678105, "grad_norm": 0.2080361247062683, "learning_rate": 5.791851204132745e-05, "loss": 1.4041, "step": 83300 }, { "epoch": 4.125188409233322, "grad_norm": 0.20870813727378845, "learning_rate": 5.79027070175267e-05, "loss": 1.4055, "step": 83310 }, { "epoch": 4.125730350398835, "grad_norm": 0.17612333595752716, "learning_rate": 5.7886901634151205e-05, "loss": 1.4071, "step": 83320 }, { "epoch": 4.126272291564347, "grad_norm": 0.18745100498199463, "learning_rate": 5.787109589315887e-05, "loss": 1.4132, "step": 83330 }, { "epoch": 4.126814232729859, "grad_norm": 0.2577305734157562, "learning_rate": 5.7855289796507627e-05, "loss": 1.4039, "step": 83340 }, { "epoch": 4.127139397429167, "eval_loss": 2.428053855895996, "eval_runtime": 21.9759, "eval_samples_per_second": 227.521, "eval_steps_per_second": 1.229, "step": 83346 }, { "epoch": 4.127356173895372, "grad_norm": 0.1890375018119812, "learning_rate": 5.783948334615548e-05, "loss": 1.3949, "step": 83350 }, { "epoch": 4.1278981150608836, "grad_norm": 0.34618306159973145, "learning_rate": 5.782367654406047e-05, "loss": 1.4095, "step": 83360 }, { "epoch": 4.128440056226396, "grad_norm": 0.19756010174751282, "learning_rate": 5.780786939218068e-05, "loss": 1.4154, "step": 83370 }, { "epoch": 4.128981997391908, "grad_norm": 0.2689013183116913, "learning_rate": 5.77920618924742e-05, "loss": 1.4067, "step": 83380 }, { "epoch": 4.12952393855742, "grad_norm": 0.17390170693397522, "learning_rate": 5.777625404689924e-05, "loss": 1.4065, "step": 83390 }, { "epoch": 4.130065879722933, "grad_norm": 0.2956944704055786, "learning_rate": 5.7760445857413946e-05, "loss": 1.4047, "step": 83400 }, { "epoch": 4.130607820888445, "grad_norm": 0.2633947432041168, "learning_rate": 5.774463732597662e-05, "loss": 1.4067, "step": 83410 }, { "epoch": 4.131149762053957, "grad_norm": 0.17089660465717316, "learning_rate": 5.772882845454556e-05, "loss": 1.4171, "step": 83420 }, { "epoch": 4.131691703219469, "grad_norm": 0.18730990588665009, "learning_rate": 5.771301924507905e-05, "loss": 1.4067, "step": 83430 }, { "epoch": 4.131854285569123, "eval_loss": 2.4326930046081543, "eval_runtime": 21.9721, "eval_samples_per_second": 227.562, "eval_steps_per_second": 1.229, "step": 83433 }, { "epoch": 4.132233644384981, "grad_norm": 0.16217251121997833, "learning_rate": 5.7697209699535514e-05, "loss": 1.4039, "step": 83440 }, { "epoch": 4.132775585550494, "grad_norm": 0.16589990258216858, "learning_rate": 5.768139981987334e-05, "loss": 1.3967, "step": 83450 }, { "epoch": 4.133317526716006, "grad_norm": 0.1889846920967102, "learning_rate": 5.7665589608051e-05, "loss": 1.4068, "step": 83460 }, { "epoch": 4.133859467881518, "grad_norm": 0.18602542579174042, "learning_rate": 5.7649779066026985e-05, "loss": 1.4024, "step": 83470 }, { "epoch": 4.1344014090470305, "grad_norm": 0.24643942713737488, "learning_rate": 5.7633968195759855e-05, "loss": 1.4069, "step": 83480 }, { "epoch": 4.134943350212542, "grad_norm": 0.18482206761837006, "learning_rate": 5.7618156999208186e-05, "loss": 1.4186, "step": 83490 }, { "epoch": 4.135485291378055, "grad_norm": 0.1943228840827942, "learning_rate": 5.7602345478330577e-05, "loss": 1.4047, "step": 83500 }, { "epoch": 4.136027232543567, "grad_norm": 0.2460983842611313, "learning_rate": 5.7586533635085705e-05, "loss": 1.4117, "step": 83510 }, { "epoch": 4.136569173709079, "grad_norm": 0.22592556476593018, "learning_rate": 5.75707214714323e-05, "loss": 1.407, "step": 83520 }, { "epoch": 4.136569173709079, "eval_loss": 2.431594133377075, "eval_runtime": 21.9667, "eval_samples_per_second": 227.617, "eval_steps_per_second": 1.229, "step": 83520 }, { "epoch": 4.137111114874592, "grad_norm": 0.22698943316936493, "learning_rate": 5.7554908989329084e-05, "loss": 1.409, "step": 83530 }, { "epoch": 4.1376530560401035, "grad_norm": 0.1876407265663147, "learning_rate": 5.7539096190734806e-05, "loss": 1.4023, "step": 83540 }, { "epoch": 4.138194997205616, "grad_norm": 0.17315153777599335, "learning_rate": 5.752328307760833e-05, "loss": 1.3965, "step": 83550 }, { "epoch": 4.138736938371128, "grad_norm": 0.1867133527994156, "learning_rate": 5.7507469651908486e-05, "loss": 1.414, "step": 83560 }, { "epoch": 4.13927887953664, "grad_norm": 0.2372438907623291, "learning_rate": 5.74916559155942e-05, "loss": 1.41, "step": 83570 }, { "epoch": 4.139820820702153, "grad_norm": 0.16649603843688965, "learning_rate": 5.7475841870624416e-05, "loss": 1.3959, "step": 83580 }, { "epoch": 4.140362761867665, "grad_norm": 0.2065044343471527, "learning_rate": 5.746002751895807e-05, "loss": 1.4078, "step": 83590 }, { "epoch": 4.140904703033177, "grad_norm": 0.20945148169994354, "learning_rate": 5.744421286255418e-05, "loss": 1.3964, "step": 83600 }, { "epoch": 4.141284061849036, "eval_loss": 2.4306061267852783, "eval_runtime": 21.9679, "eval_samples_per_second": 227.605, "eval_steps_per_second": 1.229, "step": 83607 }, { "epoch": 4.141446644198689, "grad_norm": 0.21159766614437103, "learning_rate": 5.7428397903371845e-05, "loss": 1.4085, "step": 83610 }, { "epoch": 4.141988585364201, "grad_norm": 0.1735406070947647, "learning_rate": 5.741258264337009e-05, "loss": 1.4062, "step": 83620 }, { "epoch": 4.142530526529714, "grad_norm": 0.21971061825752258, "learning_rate": 5.739676708450809e-05, "loss": 1.4088, "step": 83630 }, { "epoch": 4.143072467695226, "grad_norm": 0.1740388572216034, "learning_rate": 5.7380951228744985e-05, "loss": 1.4059, "step": 83640 }, { "epoch": 4.1436144088607385, "grad_norm": 0.21105538308620453, "learning_rate": 5.736513507803999e-05, "loss": 1.4035, "step": 83650 }, { "epoch": 4.14415635002625, "grad_norm": 0.21012961864471436, "learning_rate": 5.7349318634352325e-05, "loss": 1.4016, "step": 83660 }, { "epoch": 4.144698291191762, "grad_norm": 0.21430379152297974, "learning_rate": 5.7333501899641254e-05, "loss": 1.399, "step": 83670 }, { "epoch": 4.145240232357275, "grad_norm": 0.16836793720722198, "learning_rate": 5.731768487586611e-05, "loss": 1.3952, "step": 83680 }, { "epoch": 4.145782173522787, "grad_norm": 0.22241011261940002, "learning_rate": 5.730186756498622e-05, "loss": 1.4089, "step": 83690 }, { "epoch": 4.145998949988992, "eval_loss": 2.433333396911621, "eval_runtime": 21.9717, "eval_samples_per_second": 227.566, "eval_steps_per_second": 1.229, "step": 83694 }, { "epoch": 4.146324114688299, "grad_norm": 0.34341859817504883, "learning_rate": 5.728604996896096e-05, "loss": 1.4097, "step": 83700 }, { "epoch": 4.1468660558538115, "grad_norm": 0.4072984755039215, "learning_rate": 5.7270232089749775e-05, "loss": 1.398, "step": 83710 }, { "epoch": 4.147407997019323, "grad_norm": 0.21958887577056885, "learning_rate": 5.725441392931209e-05, "loss": 1.4012, "step": 83720 }, { "epoch": 4.147949938184836, "grad_norm": 0.18960371613502502, "learning_rate": 5.7238595489607396e-05, "loss": 1.4089, "step": 83730 }, { "epoch": 4.148491879350348, "grad_norm": 0.16692198812961578, "learning_rate": 5.722277677259521e-05, "loss": 1.4043, "step": 83740 }, { "epoch": 4.14903382051586, "grad_norm": 0.1643580198287964, "learning_rate": 5.720695778023508e-05, "loss": 1.395, "step": 83750 }, { "epoch": 4.149575761681373, "grad_norm": 0.23333659768104553, "learning_rate": 5.7191138514486605e-05, "loss": 1.4017, "step": 83760 }, { "epoch": 4.1501177028468845, "grad_norm": 0.19941531121730804, "learning_rate": 5.7175318977309424e-05, "loss": 1.4022, "step": 83770 }, { "epoch": 4.150659644012397, "grad_norm": 0.19911114871501923, "learning_rate": 5.7159499170663144e-05, "loss": 1.4112, "step": 83780 }, { "epoch": 4.150713838128948, "eval_loss": 2.4322268962860107, "eval_runtime": 21.9745, "eval_samples_per_second": 227.537, "eval_steps_per_second": 1.229, "step": 83781 }, { "epoch": 4.151201585177909, "grad_norm": 0.18389646708965302, "learning_rate": 5.7143679096507494e-05, "loss": 1.402, "step": 83790 }, { "epoch": 4.151743526343421, "grad_norm": 0.1758451610803604, "learning_rate": 5.712785875680218e-05, "loss": 1.4039, "step": 83800 }, { "epoch": 4.152285467508934, "grad_norm": 0.16958458721637726, "learning_rate": 5.711203815350696e-05, "loss": 1.4069, "step": 83810 }, { "epoch": 4.152827408674446, "grad_norm": 0.16103295981884003, "learning_rate": 5.709621728858164e-05, "loss": 1.404, "step": 83820 }, { "epoch": 4.153369349839958, "grad_norm": 0.17550979554653168, "learning_rate": 5.7080396163986004e-05, "loss": 1.402, "step": 83830 }, { "epoch": 4.15391129100547, "grad_norm": 0.2050134539604187, "learning_rate": 5.706457478167992e-05, "loss": 1.4078, "step": 83840 }, { "epoch": 4.154453232170982, "grad_norm": 0.1583109349012375, "learning_rate": 5.704875314362331e-05, "loss": 1.414, "step": 83850 }, { "epoch": 4.154995173336495, "grad_norm": 0.20555706322193146, "learning_rate": 5.703293125177602e-05, "loss": 1.4025, "step": 83860 }, { "epoch": 4.155428726268904, "eval_loss": 2.4321482181549072, "eval_runtime": 21.9695, "eval_samples_per_second": 227.588, "eval_steps_per_second": 1.229, "step": 83868 }, { "epoch": 4.155537114502007, "grad_norm": 0.2936631739139557, "learning_rate": 5.701710910809805e-05, "loss": 1.4003, "step": 83870 }, { "epoch": 4.156079055667519, "grad_norm": 0.24286337196826935, "learning_rate": 5.700128671454935e-05, "loss": 1.4065, "step": 83880 }, { "epoch": 4.156620996833031, "grad_norm": 0.3643859624862671, "learning_rate": 5.6985464073089944e-05, "loss": 1.3969, "step": 83890 }, { "epoch": 4.157162937998543, "grad_norm": 0.3168624937534332, "learning_rate": 5.696964118567988e-05, "loss": 1.3999, "step": 83900 }, { "epoch": 4.157704879164056, "grad_norm": 0.24165990948677063, "learning_rate": 5.6953818054279206e-05, "loss": 1.3952, "step": 83910 }, { "epoch": 4.158246820329568, "grad_norm": 0.16725140810012817, "learning_rate": 5.693799468084804e-05, "loss": 1.4162, "step": 83920 }, { "epoch": 4.15878876149508, "grad_norm": 0.2385065108537674, "learning_rate": 5.6922171067346495e-05, "loss": 1.3966, "step": 83930 }, { "epoch": 4.159330702660593, "grad_norm": 0.23973079025745392, "learning_rate": 5.690634721573475e-05, "loss": 1.409, "step": 83940 }, { "epoch": 4.159872643826104, "grad_norm": 0.18828116357326508, "learning_rate": 5.6890523127972993e-05, "loss": 1.3998, "step": 83950 }, { "epoch": 4.16014361440886, "eval_loss": 2.4352917671203613, "eval_runtime": 21.9698, "eval_samples_per_second": 227.585, "eval_steps_per_second": 1.229, "step": 83955 }, { "epoch": 4.160414584991617, "grad_norm": 0.1964593529701233, "learning_rate": 5.687469880602143e-05, "loss": 1.4101, "step": 83960 }, { "epoch": 4.160956526157129, "grad_norm": 0.2196066826581955, "learning_rate": 5.685887425184033e-05, "loss": 1.4071, "step": 83970 }, { "epoch": 4.161498467322641, "grad_norm": 0.16051217913627625, "learning_rate": 5.684304946738995e-05, "loss": 1.411, "step": 83980 }, { "epoch": 4.162040408488154, "grad_norm": 0.23995092511177063, "learning_rate": 5.68272244546306e-05, "loss": 1.4054, "step": 83990 }, { "epoch": 4.162582349653666, "grad_norm": 0.35630932450294495, "learning_rate": 5.6811399215522634e-05, "loss": 1.4047, "step": 84000 }, { "epoch": 4.163124290819178, "grad_norm": 0.26559507846832275, "learning_rate": 5.6795573752026386e-05, "loss": 1.4162, "step": 84010 }, { "epoch": 4.16366623198469, "grad_norm": 0.28465500473976135, "learning_rate": 5.6779748066102254e-05, "loss": 1.4168, "step": 84020 }, { "epoch": 4.164208173150202, "grad_norm": 0.3914095163345337, "learning_rate": 5.676392215971066e-05, "loss": 1.4054, "step": 84030 }, { "epoch": 4.164750114315715, "grad_norm": 0.2624354958534241, "learning_rate": 5.6748096034812047e-05, "loss": 1.4156, "step": 84040 }, { "epoch": 4.164858502548817, "eval_loss": 2.431173324584961, "eval_runtime": 21.9748, "eval_samples_per_second": 227.534, "eval_steps_per_second": 1.229, "step": 84042 }, { "epoch": 4.165292055481227, "grad_norm": 0.25440993905067444, "learning_rate": 5.6732269693366894e-05, "loss": 1.4029, "step": 84050 }, { "epoch": 4.1658339966467395, "grad_norm": 0.1714615821838379, "learning_rate": 5.6716443137335695e-05, "loss": 1.3996, "step": 84060 }, { "epoch": 4.166375937812251, "grad_norm": 0.3408913016319275, "learning_rate": 5.670061636867896e-05, "loss": 1.4094, "step": 84070 }, { "epoch": 4.166917878977763, "grad_norm": 0.25224050879478455, "learning_rate": 5.6684789389357254e-05, "loss": 1.4019, "step": 84080 }, { "epoch": 4.167459820143276, "grad_norm": 0.18869130313396454, "learning_rate": 5.666896220133116e-05, "loss": 1.398, "step": 84090 }, { "epoch": 4.168001761308788, "grad_norm": 0.23559992015361786, "learning_rate": 5.665313480656127e-05, "loss": 1.3896, "step": 84100 }, { "epoch": 4.1685437024743, "grad_norm": 0.33458632230758667, "learning_rate": 5.663730720700822e-05, "loss": 1.4115, "step": 84110 }, { "epoch": 4.1690856436398125, "grad_norm": 0.24740028381347656, "learning_rate": 5.662147940463265e-05, "loss": 1.4121, "step": 84120 }, { "epoch": 4.1695733906887735, "eval_loss": 2.432617664337158, "eval_runtime": 21.9739, "eval_samples_per_second": 227.543, "eval_steps_per_second": 1.229, "step": 84129 }, { "epoch": 4.169627584805324, "grad_norm": 0.15262192487716675, "learning_rate": 5.6605651401395265e-05, "loss": 1.3928, "step": 84130 }, { "epoch": 4.170169525970837, "grad_norm": 0.16818031668663025, "learning_rate": 5.658982319925675e-05, "loss": 1.4084, "step": 84140 }, { "epoch": 4.170711467136349, "grad_norm": 0.16727863252162933, "learning_rate": 5.6573994800177844e-05, "loss": 1.3928, "step": 84150 }, { "epoch": 4.171253408301861, "grad_norm": 0.21401916444301605, "learning_rate": 5.655816620611929e-05, "loss": 1.3994, "step": 84160 }, { "epoch": 4.171795349467374, "grad_norm": 0.21131601929664612, "learning_rate": 5.6542337419041866e-05, "loss": 1.4001, "step": 84170 }, { "epoch": 4.1723372906328855, "grad_norm": 0.17307646572589874, "learning_rate": 5.652650844090639e-05, "loss": 1.3984, "step": 84180 }, { "epoch": 4.172879231798398, "grad_norm": 0.16799761354923248, "learning_rate": 5.651067927367367e-05, "loss": 1.405, "step": 84190 }, { "epoch": 4.17342117296391, "grad_norm": 0.20099224150180817, "learning_rate": 5.649484991930456e-05, "loss": 1.4075, "step": 84200 }, { "epoch": 4.173963114129422, "grad_norm": 0.28969597816467285, "learning_rate": 5.647902037975994e-05, "loss": 1.4005, "step": 84210 }, { "epoch": 4.17428827882873, "eval_loss": 2.434582471847534, "eval_runtime": 21.9717, "eval_samples_per_second": 227.565, "eval_steps_per_second": 1.229, "step": 84216 }, { "epoch": 4.174505055294935, "grad_norm": 0.272678017616272, "learning_rate": 5.646319065700068e-05, "loss": 1.4166, "step": 84220 }, { "epoch": 4.175046996460447, "grad_norm": 0.1885080486536026, "learning_rate": 5.644736075298772e-05, "loss": 1.4171, "step": 84230 }, { "epoch": 4.175588937625959, "grad_norm": 0.1917075365781784, "learning_rate": 5.643153066968201e-05, "loss": 1.3912, "step": 84240 }, { "epoch": 4.176130878791471, "grad_norm": 0.1733640879392624, "learning_rate": 5.641570040904448e-05, "loss": 1.4086, "step": 84250 }, { "epoch": 4.176672819956983, "grad_norm": 0.21975889801979065, "learning_rate": 5.6399869973036136e-05, "loss": 1.4118, "step": 84260 }, { "epoch": 4.177214761122496, "grad_norm": 0.1629042774438858, "learning_rate": 5.638403936361797e-05, "loss": 1.4043, "step": 84270 }, { "epoch": 4.177756702288008, "grad_norm": 0.16598139703273773, "learning_rate": 5.6368208582751026e-05, "loss": 1.413, "step": 84280 }, { "epoch": 4.17829864345352, "grad_norm": 0.16502542793750763, "learning_rate": 5.6352377632396347e-05, "loss": 1.404, "step": 84290 }, { "epoch": 4.178840584619032, "grad_norm": 0.16018883883953094, "learning_rate": 5.633654651451499e-05, "loss": 1.3957, "step": 84300 }, { "epoch": 4.179003166968686, "eval_loss": 2.4341461658477783, "eval_runtime": 21.9742, "eval_samples_per_second": 227.539, "eval_steps_per_second": 1.229, "step": 84303 }, { "epoch": 4.179382525784544, "grad_norm": 0.1671213060617447, "learning_rate": 5.632071523106805e-05, "loss": 1.4043, "step": 84310 }, { "epoch": 4.179924466950057, "grad_norm": 0.35982993245124817, "learning_rate": 5.630488378401665e-05, "loss": 1.3999, "step": 84320 }, { "epoch": 4.180466408115569, "grad_norm": 0.3010498881340027, "learning_rate": 5.62890521753219e-05, "loss": 1.4013, "step": 84330 }, { "epoch": 4.181008349281081, "grad_norm": 0.27373287081718445, "learning_rate": 5.627322040694497e-05, "loss": 1.3974, "step": 84340 }, { "epoch": 4.1815502904465935, "grad_norm": 0.21938002109527588, "learning_rate": 5.6257388480847026e-05, "loss": 1.4011, "step": 84350 }, { "epoch": 4.182092231612105, "grad_norm": 0.184707373380661, "learning_rate": 5.6241556398989246e-05, "loss": 1.3999, "step": 84360 }, { "epoch": 4.182634172777618, "grad_norm": 0.15879768133163452, "learning_rate": 5.622572416333286e-05, "loss": 1.4061, "step": 84370 }, { "epoch": 4.18317611394313, "grad_norm": 0.16178719699382782, "learning_rate": 5.620989177583908e-05, "loss": 1.4106, "step": 84380 }, { "epoch": 4.183718055108642, "grad_norm": 0.18774659931659698, "learning_rate": 5.619405923846916e-05, "loss": 1.4041, "step": 84390 }, { "epoch": 4.183718055108642, "eval_loss": 2.4340009689331055, "eval_runtime": 21.9661, "eval_samples_per_second": 227.623, "eval_steps_per_second": 1.229, "step": 84390 }, { "epoch": 4.184259996274155, "grad_norm": 0.29684072732925415, "learning_rate": 5.617822655318438e-05, "loss": 1.4182, "step": 84400 }, { "epoch": 4.1848019374396666, "grad_norm": 0.24184374511241913, "learning_rate": 5.616239372194599e-05, "loss": 1.406, "step": 84410 }, { "epoch": 4.185343878605179, "grad_norm": 0.17345279455184937, "learning_rate": 5.614656074671532e-05, "loss": 1.4158, "step": 84420 }, { "epoch": 4.185885819770691, "grad_norm": 0.18434324860572815, "learning_rate": 5.613072762945369e-05, "loss": 1.415, "step": 84430 }, { "epoch": 4.186427760936203, "grad_norm": 0.16675621271133423, "learning_rate": 5.6114894372122415e-05, "loss": 1.4093, "step": 84440 }, { "epoch": 4.186969702101716, "grad_norm": 0.18245235085487366, "learning_rate": 5.6099060976682883e-05, "loss": 1.4074, "step": 84450 }, { "epoch": 4.187511643267228, "grad_norm": 0.18735003471374512, "learning_rate": 5.608322744509644e-05, "loss": 1.403, "step": 84460 }, { "epoch": 4.1880535844327405, "grad_norm": 0.22122140228748322, "learning_rate": 5.606739377932447e-05, "loss": 1.3978, "step": 84470 }, { "epoch": 4.188432943248599, "eval_loss": 2.4321837425231934, "eval_runtime": 21.9752, "eval_samples_per_second": 227.529, "eval_steps_per_second": 1.229, "step": 84477 }, { "epoch": 4.188595525598252, "grad_norm": 0.19624446332454681, "learning_rate": 5.605155998132843e-05, "loss": 1.4006, "step": 84480 }, { "epoch": 4.189137466763764, "grad_norm": 0.2357572615146637, "learning_rate": 5.603572605306967e-05, "loss": 1.3976, "step": 84490 }, { "epoch": 4.189679407929277, "grad_norm": 0.1909378468990326, "learning_rate": 5.6019891996509676e-05, "loss": 1.3962, "step": 84500 }, { "epoch": 4.190221349094789, "grad_norm": 0.2646555006504059, "learning_rate": 5.600405781360989e-05, "loss": 1.4053, "step": 84510 }, { "epoch": 4.190763290260301, "grad_norm": 0.16853564977645874, "learning_rate": 5.598822350633177e-05, "loss": 1.4133, "step": 84520 }, { "epoch": 4.1913052314258135, "grad_norm": 0.1681780219078064, "learning_rate": 5.5972389076636833e-05, "loss": 1.4132, "step": 84530 }, { "epoch": 4.191847172591325, "grad_norm": 0.15586897730827332, "learning_rate": 5.595655452648655e-05, "loss": 1.4015, "step": 84540 }, { "epoch": 4.192389113756838, "grad_norm": 0.1730291098356247, "learning_rate": 5.594071985784244e-05, "loss": 1.3961, "step": 84550 }, { "epoch": 4.19293105492235, "grad_norm": 0.16029633581638336, "learning_rate": 5.5924885072666045e-05, "loss": 1.4024, "step": 84560 }, { "epoch": 4.193147831388555, "eval_loss": 2.4340431690216064, "eval_runtime": 21.9997, "eval_samples_per_second": 227.276, "eval_steps_per_second": 1.227, "step": 84564 }, { "epoch": 4.193472996087862, "grad_norm": 0.2048177421092987, "learning_rate": 5.5909050172918896e-05, "loss": 1.403, "step": 84570 }, { "epoch": 4.194014937253375, "grad_norm": 0.1830647587776184, "learning_rate": 5.589321516056256e-05, "loss": 1.4039, "step": 84580 }, { "epoch": 4.1945568784188865, "grad_norm": 0.18727242946624756, "learning_rate": 5.587738003755861e-05, "loss": 1.4054, "step": 84590 }, { "epoch": 4.195098819584399, "grad_norm": 0.2537229657173157, "learning_rate": 5.5861544805868624e-05, "loss": 1.4062, "step": 84600 }, { "epoch": 4.195640760749911, "grad_norm": 0.23978114128112793, "learning_rate": 5.584570946745422e-05, "loss": 1.4057, "step": 84610 }, { "epoch": 4.196182701915423, "grad_norm": 0.19209878146648407, "learning_rate": 5.5829874024276995e-05, "loss": 1.409, "step": 84620 }, { "epoch": 4.196724643080936, "grad_norm": 0.22839245200157166, "learning_rate": 5.581403847829857e-05, "loss": 1.4135, "step": 84630 }, { "epoch": 4.197266584246448, "grad_norm": 0.1937345266342163, "learning_rate": 5.5798202831480605e-05, "loss": 1.3996, "step": 84640 }, { "epoch": 4.19780852541196, "grad_norm": 0.23591110110282898, "learning_rate": 5.5782367085784725e-05, "loss": 1.4079, "step": 84650 }, { "epoch": 4.197862719528511, "eval_loss": 2.4296658039093018, "eval_runtime": 22.048, "eval_samples_per_second": 226.778, "eval_steps_per_second": 1.225, "step": 84651 }, { "epoch": 4.198350466577472, "grad_norm": 0.17365294694900513, "learning_rate": 5.5766531243172616e-05, "loss": 1.3987, "step": 84660 }, { "epoch": 4.198892407742984, "grad_norm": 0.20040321350097656, "learning_rate": 5.575069530560594e-05, "loss": 1.4116, "step": 84670 }, { "epoch": 4.199434348908497, "grad_norm": 0.1579209417104721, "learning_rate": 5.573485927504639e-05, "loss": 1.4013, "step": 84680 }, { "epoch": 4.199976290074009, "grad_norm": 0.2494509518146515, "learning_rate": 5.5719023153455674e-05, "loss": 1.4098, "step": 84690 }, { "epoch": 4.200518231239521, "grad_norm": 0.18554720282554626, "learning_rate": 5.5703186942795484e-05, "loss": 1.4108, "step": 84700 }, { "epoch": 4.201060172405033, "grad_norm": 0.1679755598306656, "learning_rate": 5.5687350645027544e-05, "loss": 1.4001, "step": 84710 }, { "epoch": 4.201602113570545, "grad_norm": 0.17022711038589478, "learning_rate": 5.56715142621136e-05, "loss": 1.3902, "step": 84720 }, { "epoch": 4.202144054736058, "grad_norm": 0.17805786430835724, "learning_rate": 5.5655677796015374e-05, "loss": 1.4025, "step": 84730 }, { "epoch": 4.202577607668467, "eval_loss": 2.4276254177093506, "eval_runtime": 22.04, "eval_samples_per_second": 226.86, "eval_steps_per_second": 1.225, "step": 84738 }, { "epoch": 4.20268599590157, "grad_norm": 0.18491090834140778, "learning_rate": 5.563984124869463e-05, "loss": 1.4003, "step": 84740 }, { "epoch": 4.203227937067082, "grad_norm": 0.1828235238790512, "learning_rate": 5.5624004622113125e-05, "loss": 1.4063, "step": 84750 }, { "epoch": 4.2037698782325945, "grad_norm": 0.160418301820755, "learning_rate": 5.560816791823264e-05, "loss": 1.4013, "step": 84760 }, { "epoch": 4.204311819398106, "grad_norm": 0.17551931738853455, "learning_rate": 5.559233113901498e-05, "loss": 1.4134, "step": 84770 }, { "epoch": 4.204853760563619, "grad_norm": 0.18492551147937775, "learning_rate": 5.557649428642189e-05, "loss": 1.4118, "step": 84780 }, { "epoch": 4.205395701729131, "grad_norm": 0.1690473109483719, "learning_rate": 5.556065736241518e-05, "loss": 1.4027, "step": 84790 }, { "epoch": 4.205937642894643, "grad_norm": 0.15582244098186493, "learning_rate": 5.5544820368956674e-05, "loss": 1.3997, "step": 84800 }, { "epoch": 4.206479584060156, "grad_norm": 0.21982191503047943, "learning_rate": 5.55289833080082e-05, "loss": 1.3989, "step": 84810 }, { "epoch": 4.2070215252256675, "grad_norm": 0.1695873886346817, "learning_rate": 5.551314618153156e-05, "loss": 1.4016, "step": 84820 }, { "epoch": 4.2072924958084235, "eval_loss": 2.426410436630249, "eval_runtime": 22.0191, "eval_samples_per_second": 227.076, "eval_steps_per_second": 1.226, "step": 84825 }, { "epoch": 4.20756346639118, "grad_norm": 0.15506160259246826, "learning_rate": 5.5497308991488606e-05, "loss": 1.4008, "step": 84830 }, { "epoch": 4.208105407556692, "grad_norm": 0.21147039532661438, "learning_rate": 5.548147173984116e-05, "loss": 1.3972, "step": 84840 }, { "epoch": 4.208647348722204, "grad_norm": 0.3284474313259125, "learning_rate": 5.546563442855108e-05, "loss": 1.405, "step": 84850 }, { "epoch": 4.209189289887717, "grad_norm": 0.2625653147697449, "learning_rate": 5.544979705958025e-05, "loss": 1.4091, "step": 84860 }, { "epoch": 4.209731231053229, "grad_norm": 0.24892371892929077, "learning_rate": 5.543395963489051e-05, "loss": 1.4066, "step": 84870 }, { "epoch": 4.210273172218741, "grad_norm": 0.26100948452949524, "learning_rate": 5.541812215644373e-05, "loss": 1.4048, "step": 84880 }, { "epoch": 4.210815113384253, "grad_norm": 0.18004588782787323, "learning_rate": 5.5402284626201794e-05, "loss": 1.3915, "step": 84890 }, { "epoch": 4.211357054549765, "grad_norm": 0.21767373383045197, "learning_rate": 5.538644704612658e-05, "loss": 1.4005, "step": 84900 }, { "epoch": 4.211898995715278, "grad_norm": 0.18447516858577728, "learning_rate": 5.537060941817999e-05, "loss": 1.4034, "step": 84910 }, { "epoch": 4.2120073839483805, "eval_loss": 2.4239137172698975, "eval_runtime": 21.9695, "eval_samples_per_second": 227.588, "eval_steps_per_second": 1.229, "step": 84912 }, { "epoch": 4.21244093688079, "grad_norm": 0.1832926869392395, "learning_rate": 5.535477174432391e-05, "loss": 1.4034, "step": 84920 }, { "epoch": 4.212982878046302, "grad_norm": 0.1812276393175125, "learning_rate": 5.533893402652025e-05, "loss": 1.392, "step": 84930 }, { "epoch": 4.213524819211814, "grad_norm": 0.18913184106349945, "learning_rate": 5.5323096266730914e-05, "loss": 1.4024, "step": 84940 }, { "epoch": 4.214066760377326, "grad_norm": 0.22599507868289948, "learning_rate": 5.530725846691781e-05, "loss": 1.4047, "step": 84950 }, { "epoch": 4.214608701542839, "grad_norm": 0.1804644763469696, "learning_rate": 5.529142062904286e-05, "loss": 1.4025, "step": 84960 }, { "epoch": 4.215150642708351, "grad_norm": 0.19616399705410004, "learning_rate": 5.527558275506799e-05, "loss": 1.3948, "step": 84970 }, { "epoch": 4.215692583873863, "grad_norm": 0.22985543310642242, "learning_rate": 5.5259744846955145e-05, "loss": 1.3937, "step": 84980 }, { "epoch": 4.216234525039376, "grad_norm": 0.20359660685062408, "learning_rate": 5.52439069066662e-05, "loss": 1.3941, "step": 84990 }, { "epoch": 4.216722272088337, "eval_loss": 2.4288182258605957, "eval_runtime": 21.9691, "eval_samples_per_second": 227.592, "eval_steps_per_second": 1.229, "step": 84999 }, { "epoch": 4.216776466204887, "grad_norm": 0.18419791758060455, "learning_rate": 5.5228068936163134e-05, "loss": 1.4054, "step": 85000 }, { "epoch": 4.2173184073704, "grad_norm": 0.17802853882312775, "learning_rate": 5.52122309374079e-05, "loss": 1.4009, "step": 85010 }, { "epoch": 4.217860348535912, "grad_norm": 0.16491647064685822, "learning_rate": 5.519639291236241e-05, "loss": 1.4079, "step": 85020 }, { "epoch": 4.218402289701424, "grad_norm": 0.26249879598617554, "learning_rate": 5.518055486298862e-05, "loss": 1.4104, "step": 85030 }, { "epoch": 4.218944230866937, "grad_norm": 0.16674381494522095, "learning_rate": 5.516471679124846e-05, "loss": 1.4072, "step": 85040 }, { "epoch": 4.219486172032449, "grad_norm": 0.1718483567237854, "learning_rate": 5.514887869910391e-05, "loss": 1.4052, "step": 85050 }, { "epoch": 4.220028113197961, "grad_norm": 0.18503984808921814, "learning_rate": 5.5133040588516925e-05, "loss": 1.3951, "step": 85060 }, { "epoch": 4.220570054363473, "grad_norm": 0.25337323546409607, "learning_rate": 5.511720246144945e-05, "loss": 1.399, "step": 85070 }, { "epoch": 4.221111995528985, "grad_norm": 0.17536914348602295, "learning_rate": 5.510136431986342e-05, "loss": 1.401, "step": 85080 }, { "epoch": 4.221437160228293, "eval_loss": 2.4300763607025146, "eval_runtime": 21.9731, "eval_samples_per_second": 227.551, "eval_steps_per_second": 1.229, "step": 85086 }, { "epoch": 4.221653936694498, "grad_norm": 0.25776609778404236, "learning_rate": 5.5085526165720835e-05, "loss": 1.4017, "step": 85090 }, { "epoch": 4.22219587786001, "grad_norm": 0.23579788208007812, "learning_rate": 5.5069688000983635e-05, "loss": 1.3972, "step": 85100 }, { "epoch": 4.222737819025522, "grad_norm": 0.23605017364025116, "learning_rate": 5.505384982761379e-05, "loss": 1.4034, "step": 85110 }, { "epoch": 4.223279760191034, "grad_norm": 0.23162227869033813, "learning_rate": 5.503801164757327e-05, "loss": 1.403, "step": 85120 }, { "epoch": 4.223821701356546, "grad_norm": 0.21863441169261932, "learning_rate": 5.502217346282401e-05, "loss": 1.4017, "step": 85130 }, { "epoch": 4.224363642522059, "grad_norm": 0.1740538477897644, "learning_rate": 5.500633527532802e-05, "loss": 1.4107, "step": 85140 }, { "epoch": 4.224905583687571, "grad_norm": 0.1974310427904129, "learning_rate": 5.499049708704723e-05, "loss": 1.4013, "step": 85150 }, { "epoch": 4.225447524853083, "grad_norm": 0.24513813853263855, "learning_rate": 5.497465889994362e-05, "loss": 1.4011, "step": 85160 }, { "epoch": 4.2259894660185955, "grad_norm": 0.28687775135040283, "learning_rate": 5.495882071597915e-05, "loss": 1.4033, "step": 85170 }, { "epoch": 4.226152048368249, "eval_loss": 2.429248332977295, "eval_runtime": 21.97, "eval_samples_per_second": 227.583, "eval_steps_per_second": 1.229, "step": 85173 }, { "epoch": 4.226531407184107, "grad_norm": 0.17238466441631317, "learning_rate": 5.4942982537115826e-05, "loss": 1.4076, "step": 85180 }, { "epoch": 4.22707334834962, "grad_norm": 0.1770007312297821, "learning_rate": 5.4927144365315544e-05, "loss": 1.4096, "step": 85190 }, { "epoch": 4.227615289515132, "grad_norm": 0.1884445995092392, "learning_rate": 5.49113062025403e-05, "loss": 1.3935, "step": 85200 }, { "epoch": 4.228157230680644, "grad_norm": 0.17585532367229462, "learning_rate": 5.489546805075209e-05, "loss": 1.4014, "step": 85210 }, { "epoch": 4.228699171846157, "grad_norm": 0.15574230253696442, "learning_rate": 5.487962991191281e-05, "loss": 1.4228, "step": 85220 }, { "epoch": 4.2292411130116685, "grad_norm": 0.1755153387784958, "learning_rate": 5.486379178798446e-05, "loss": 1.4031, "step": 85230 }, { "epoch": 4.229783054177181, "grad_norm": 0.22659145295619965, "learning_rate": 5.484795368092901e-05, "loss": 1.4086, "step": 85240 }, { "epoch": 4.230324995342693, "grad_norm": 0.2872506082057953, "learning_rate": 5.483211559270838e-05, "loss": 1.4091, "step": 85250 }, { "epoch": 4.230866936508205, "grad_norm": 0.21537162363529205, "learning_rate": 5.4816277525284544e-05, "loss": 1.4028, "step": 85260 }, { "epoch": 4.230866936508205, "eval_loss": 2.42655086517334, "eval_runtime": 21.9673, "eval_samples_per_second": 227.611, "eval_steps_per_second": 1.229, "step": 85260 }, { "epoch": 4.231408877673718, "grad_norm": 0.17939117550849915, "learning_rate": 5.480043948061947e-05, "loss": 1.4065, "step": 85270 }, { "epoch": 4.23195081883923, "grad_norm": 0.5240494012832642, "learning_rate": 5.4784601460675064e-05, "loss": 1.4144, "step": 85280 }, { "epoch": 4.232492760004742, "grad_norm": 0.18183523416519165, "learning_rate": 5.47687634674133e-05, "loss": 1.3938, "step": 85290 }, { "epoch": 4.233034701170254, "grad_norm": 0.16505871713161469, "learning_rate": 5.4752925502796105e-05, "loss": 1.3924, "step": 85300 }, { "epoch": 4.233576642335766, "grad_norm": 0.20003126561641693, "learning_rate": 5.473708756878545e-05, "loss": 1.4059, "step": 85310 }, { "epoch": 4.234118583501279, "grad_norm": 0.21324165165424347, "learning_rate": 5.472124966734322e-05, "loss": 1.4002, "step": 85320 }, { "epoch": 4.234660524666791, "grad_norm": 0.32130008935928345, "learning_rate": 5.4705411800431386e-05, "loss": 1.4071, "step": 85330 }, { "epoch": 4.2352024658323035, "grad_norm": 0.18662355840206146, "learning_rate": 5.468957397001186e-05, "loss": 1.3962, "step": 85340 }, { "epoch": 4.235581824648162, "eval_loss": 2.428666830062866, "eval_runtime": 21.9762, "eval_samples_per_second": 227.519, "eval_steps_per_second": 1.229, "step": 85347 }, { "epoch": 4.235744406997815, "grad_norm": 0.271916002035141, "learning_rate": 5.467373617804655e-05, "loss": 1.4044, "step": 85350 }, { "epoch": 4.236286348163327, "grad_norm": 0.27290457487106323, "learning_rate": 5.465789842649739e-05, "loss": 1.3982, "step": 85360 }, { "epoch": 4.23682828932884, "grad_norm": 0.25049006938934326, "learning_rate": 5.4642060717326305e-05, "loss": 1.4102, "step": 85370 }, { "epoch": 4.237370230494352, "grad_norm": 0.17757458984851837, "learning_rate": 5.462622305249517e-05, "loss": 1.4, "step": 85380 }, { "epoch": 4.237912171659864, "grad_norm": 0.16723614931106567, "learning_rate": 5.461038543396589e-05, "loss": 1.4082, "step": 85390 }, { "epoch": 4.2384541128253765, "grad_norm": 0.14943601191043854, "learning_rate": 5.4594547863700396e-05, "loss": 1.4074, "step": 85400 }, { "epoch": 4.238996053990888, "grad_norm": 0.18632861971855164, "learning_rate": 5.457871034366053e-05, "loss": 1.4057, "step": 85410 }, { "epoch": 4.239537995156401, "grad_norm": 0.16720369458198547, "learning_rate": 5.456287287580821e-05, "loss": 1.3995, "step": 85420 }, { "epoch": 4.240079936321913, "grad_norm": 0.23148633539676666, "learning_rate": 5.45470354621053e-05, "loss": 1.3956, "step": 85430 }, { "epoch": 4.240296712788118, "eval_loss": 2.4289402961730957, "eval_runtime": 21.9722, "eval_samples_per_second": 227.561, "eval_steps_per_second": 1.229, "step": 85434 }, { "epoch": 4.240621877487425, "grad_norm": 0.3228211998939514, "learning_rate": 5.453119810451366e-05, "loss": 1.3988, "step": 85440 }, { "epoch": 4.241163818652938, "grad_norm": 0.28075844049453735, "learning_rate": 5.451536080499518e-05, "loss": 1.3984, "step": 85450 }, { "epoch": 4.2417057598184496, "grad_norm": 0.23300626873970032, "learning_rate": 5.449952356551168e-05, "loss": 1.3963, "step": 85460 }, { "epoch": 4.242247700983962, "grad_norm": 0.2622208595275879, "learning_rate": 5.448368638802506e-05, "loss": 1.3987, "step": 85470 }, { "epoch": 4.242789642149474, "grad_norm": 0.15895061194896698, "learning_rate": 5.4467849274497094e-05, "loss": 1.4072, "step": 85480 }, { "epoch": 4.243331583314986, "grad_norm": 0.17281389236450195, "learning_rate": 5.4452012226889646e-05, "loss": 1.4066, "step": 85490 }, { "epoch": 4.243873524480499, "grad_norm": 0.2625819444656372, "learning_rate": 5.443617524716458e-05, "loss": 1.4131, "step": 85500 }, { "epoch": 4.244415465646011, "grad_norm": 0.20588402450084686, "learning_rate": 5.442033833728365e-05, "loss": 1.4123, "step": 85510 }, { "epoch": 4.2449574068115234, "grad_norm": 0.22198638319969177, "learning_rate": 5.440450149920869e-05, "loss": 1.4101, "step": 85520 }, { "epoch": 4.245011600928074, "eval_loss": 2.4276344776153564, "eval_runtime": 21.9945, "eval_samples_per_second": 227.33, "eval_steps_per_second": 1.228, "step": 85521 }, { "epoch": 4.245499347977035, "grad_norm": 0.16215014457702637, "learning_rate": 5.438866473490152e-05, "loss": 1.3984, "step": 85530 }, { "epoch": 4.246041289142547, "grad_norm": 0.1525203287601471, "learning_rate": 5.4372828046323885e-05, "loss": 1.3939, "step": 85540 }, { "epoch": 4.24658323030806, "grad_norm": 0.16212084889411926, "learning_rate": 5.43569914354376e-05, "loss": 1.4041, "step": 85550 }, { "epoch": 4.247125171473572, "grad_norm": 0.17924129962921143, "learning_rate": 5.434115490420443e-05, "loss": 1.3977, "step": 85560 }, { "epoch": 4.247667112639084, "grad_norm": 0.22136323153972626, "learning_rate": 5.432531845458612e-05, "loss": 1.4001, "step": 85570 }, { "epoch": 4.2482090538045965, "grad_norm": 0.26103144884109497, "learning_rate": 5.4309482088544416e-05, "loss": 1.405, "step": 85580 }, { "epoch": 4.248750994970108, "grad_norm": 0.23051413893699646, "learning_rate": 5.429364580804111e-05, "loss": 1.4005, "step": 85590 }, { "epoch": 4.249292936135621, "grad_norm": 0.19414301216602325, "learning_rate": 5.427780961503787e-05, "loss": 1.4039, "step": 85600 }, { "epoch": 4.24972648906803, "eval_loss": 2.4262514114379883, "eval_runtime": 21.973, "eval_samples_per_second": 227.551, "eval_steps_per_second": 1.229, "step": 85608 }, { "epoch": 4.249834877301133, "grad_norm": 0.1806284487247467, "learning_rate": 5.426197351149645e-05, "loss": 1.4023, "step": 85610 }, { "epoch": 4.250376818466645, "grad_norm": 0.1894899308681488, "learning_rate": 5.424613749937852e-05, "loss": 1.3989, "step": 85620 }, { "epoch": 4.250918759632158, "grad_norm": 0.1757279932498932, "learning_rate": 5.423030158064584e-05, "loss": 1.4181, "step": 85630 }, { "epoch": 4.2514607007976695, "grad_norm": 0.345478892326355, "learning_rate": 5.421446575726004e-05, "loss": 1.4023, "step": 85640 }, { "epoch": 4.252002641963182, "grad_norm": 0.36112889647483826, "learning_rate": 5.419863003118281e-05, "loss": 1.4054, "step": 85650 }, { "epoch": 4.252544583128694, "grad_norm": 0.1516331434249878, "learning_rate": 5.418279440437582e-05, "loss": 1.4045, "step": 85660 }, { "epoch": 4.253086524294206, "grad_norm": 0.23912928998470306, "learning_rate": 5.416695887880071e-05, "loss": 1.4001, "step": 85670 }, { "epoch": 4.253628465459719, "grad_norm": 0.27274298667907715, "learning_rate": 5.41511234564191e-05, "loss": 1.4137, "step": 85680 }, { "epoch": 4.254170406625231, "grad_norm": 0.25264525413513184, "learning_rate": 5.4135288139192664e-05, "loss": 1.4097, "step": 85690 }, { "epoch": 4.2544413772079865, "eval_loss": 2.4301393032073975, "eval_runtime": 21.9696, "eval_samples_per_second": 227.587, "eval_steps_per_second": 1.229, "step": 85695 }, { "epoch": 4.254712347790743, "grad_norm": 0.1786414533853531, "learning_rate": 5.411945292908296e-05, "loss": 1.4005, "step": 85700 }, { "epoch": 4.255254288956255, "grad_norm": 0.17254769802093506, "learning_rate": 5.41036178280516e-05, "loss": 1.4008, "step": 85710 }, { "epoch": 4.255796230121767, "grad_norm": 0.31122511625289917, "learning_rate": 5.408778283806018e-05, "loss": 1.4009, "step": 85720 }, { "epoch": 4.25633817128728, "grad_norm": 0.2594967484474182, "learning_rate": 5.407194796107027e-05, "loss": 1.4123, "step": 85730 }, { "epoch": 4.256880112452792, "grad_norm": 0.26264312863349915, "learning_rate": 5.405611319904339e-05, "loss": 1.39, "step": 85740 }, { "epoch": 4.2574220536183045, "grad_norm": 0.18427279591560364, "learning_rate": 5.404027855394113e-05, "loss": 1.3978, "step": 85750 }, { "epoch": 4.257963994783816, "grad_norm": 0.2185283601284027, "learning_rate": 5.4024444027725006e-05, "loss": 1.4082, "step": 85760 }, { "epoch": 4.258505935949328, "grad_norm": 0.16532769799232483, "learning_rate": 5.40086096223565e-05, "loss": 1.4063, "step": 85770 }, { "epoch": 4.259047877114841, "grad_norm": 0.17376570403575897, "learning_rate": 5.399277533979712e-05, "loss": 1.4046, "step": 85780 }, { "epoch": 4.259156265347944, "eval_loss": 2.4298863410949707, "eval_runtime": 21.973, "eval_samples_per_second": 227.552, "eval_steps_per_second": 1.229, "step": 85782 }, { "epoch": 4.259589818280353, "grad_norm": 0.17817267775535583, "learning_rate": 5.397694118200839e-05, "loss": 1.4041, "step": 85790 }, { "epoch": 4.260131759445865, "grad_norm": 0.21518279612064362, "learning_rate": 5.396110715095173e-05, "loss": 1.3957, "step": 85800 }, { "epoch": 4.2606737006113775, "grad_norm": 0.1868053376674652, "learning_rate": 5.3945273248588604e-05, "loss": 1.3996, "step": 85810 }, { "epoch": 4.261215641776889, "grad_norm": 0.16047891974449158, "learning_rate": 5.392943947688045e-05, "loss": 1.4125, "step": 85820 }, { "epoch": 4.261757582942402, "grad_norm": 0.16372744739055634, "learning_rate": 5.3913605837788686e-05, "loss": 1.399, "step": 85830 }, { "epoch": 4.262299524107914, "grad_norm": 0.15229913592338562, "learning_rate": 5.3897772333274696e-05, "loss": 1.4013, "step": 85840 }, { "epoch": 4.262841465273426, "grad_norm": 0.2408658266067505, "learning_rate": 5.3881938965299916e-05, "loss": 1.4027, "step": 85850 }, { "epoch": 4.263383406438939, "grad_norm": 0.1828664392232895, "learning_rate": 5.386610573582567e-05, "loss": 1.398, "step": 85860 }, { "epoch": 4.2638711534879, "eval_loss": 2.43064022064209, "eval_runtime": 21.9716, "eval_samples_per_second": 227.567, "eval_steps_per_second": 1.229, "step": 85869 }, { "epoch": 4.2639253476044505, "grad_norm": 0.24171289801597595, "learning_rate": 5.385027264681332e-05, "loss": 1.4029, "step": 85870 }, { "epoch": 4.264467288769963, "grad_norm": 0.19249555468559265, "learning_rate": 5.38344397002242e-05, "loss": 1.4006, "step": 85880 }, { "epoch": 4.265009229935475, "grad_norm": 0.1631373018026352, "learning_rate": 5.381860689801963e-05, "loss": 1.4004, "step": 85890 }, { "epoch": 4.265551171100987, "grad_norm": 0.2803032398223877, "learning_rate": 5.38027742421609e-05, "loss": 1.4041, "step": 85900 }, { "epoch": 4.2660931122665, "grad_norm": 0.1907922625541687, "learning_rate": 5.378694173460932e-05, "loss": 1.4021, "step": 85910 }, { "epoch": 4.266635053432012, "grad_norm": 0.182045578956604, "learning_rate": 5.377110937732612e-05, "loss": 1.4019, "step": 85920 }, { "epoch": 4.267176994597524, "grad_norm": 0.17628154158592224, "learning_rate": 5.3755277172272556e-05, "loss": 1.3939, "step": 85930 }, { "epoch": 4.267718935763036, "grad_norm": 0.18110013008117676, "learning_rate": 5.3739445121409846e-05, "loss": 1.4047, "step": 85940 }, { "epoch": 4.268260876928548, "grad_norm": 0.2522640824317932, "learning_rate": 5.372361322669922e-05, "loss": 1.4082, "step": 85950 }, { "epoch": 4.268586041627856, "eval_loss": 2.4356658458709717, "eval_runtime": 21.9696, "eval_samples_per_second": 227.588, "eval_steps_per_second": 1.229, "step": 85956 }, { "epoch": 4.268802818094061, "grad_norm": 0.17792746424674988, "learning_rate": 5.370778149010184e-05, "loss": 1.3923, "step": 85960 }, { "epoch": 4.269344759259573, "grad_norm": 0.20900298655033112, "learning_rate": 5.3691949913578865e-05, "loss": 1.3991, "step": 85970 }, { "epoch": 4.269886700425085, "grad_norm": 0.18769441545009613, "learning_rate": 5.3676118499091466e-05, "loss": 1.4003, "step": 85980 }, { "epoch": 4.270428641590597, "grad_norm": 0.20186538994312286, "learning_rate": 5.366028724860076e-05, "loss": 1.3982, "step": 85990 }, { "epoch": 4.270970582756109, "grad_norm": 0.15831774473190308, "learning_rate": 5.3644456164067847e-05, "loss": 1.4019, "step": 86000 }, { "epoch": 4.271512523921622, "grad_norm": 0.20776121318340302, "learning_rate": 5.362862524745382e-05, "loss": 1.3977, "step": 86010 }, { "epoch": 4.272054465087134, "grad_norm": 0.1834535449743271, "learning_rate": 5.3612794500719745e-05, "loss": 1.4064, "step": 86020 }, { "epoch": 4.272596406252646, "grad_norm": 0.18609929084777832, "learning_rate": 5.3596963925826657e-05, "loss": 1.4038, "step": 86030 }, { "epoch": 4.273138347418159, "grad_norm": 0.19254687428474426, "learning_rate": 5.3581133524735614e-05, "loss": 1.3849, "step": 86040 }, { "epoch": 4.273300929767812, "eval_loss": 2.434671640396118, "eval_runtime": 21.6596, "eval_samples_per_second": 230.845, "eval_steps_per_second": 1.247, "step": 86043 }, { "epoch": 4.27368028858367, "grad_norm": 0.1640702337026596, "learning_rate": 5.356530329940757e-05, "loss": 1.4098, "step": 86050 }, { "epoch": 4.274222229749183, "grad_norm": 0.20824840664863586, "learning_rate": 5.3549473251803514e-05, "loss": 1.4025, "step": 86060 }, { "epoch": 4.274764170914695, "grad_norm": 0.18921475112438202, "learning_rate": 5.353364338388442e-05, "loss": 1.399, "step": 86070 }, { "epoch": 4.275306112080207, "grad_norm": 0.27768686413764954, "learning_rate": 5.351781369761124e-05, "loss": 1.407, "step": 86080 }, { "epoch": 4.27584805324572, "grad_norm": 0.2548810839653015, "learning_rate": 5.350198419494484e-05, "loss": 1.4034, "step": 86090 }, { "epoch": 4.276389994411232, "grad_norm": 0.1735634207725525, "learning_rate": 5.348615487784614e-05, "loss": 1.4001, "step": 86100 }, { "epoch": 4.276931935576744, "grad_norm": 0.16922864317893982, "learning_rate": 5.3470325748276004e-05, "loss": 1.3943, "step": 86110 }, { "epoch": 4.277473876742256, "grad_norm": 0.20345307886600494, "learning_rate": 5.3454496808195264e-05, "loss": 1.41, "step": 86120 }, { "epoch": 4.278015817907768, "grad_norm": 0.20915158092975616, "learning_rate": 5.343866805956476e-05, "loss": 1.4036, "step": 86130 }, { "epoch": 4.278015817907768, "eval_loss": 2.4313769340515137, "eval_runtime": 21.964, "eval_samples_per_second": 227.645, "eval_steps_per_second": 1.229, "step": 86130 }, { "epoch": 4.278557759073281, "grad_norm": 0.2022494673728943, "learning_rate": 5.342283950434529e-05, "loss": 1.3995, "step": 86140 }, { "epoch": 4.279099700238793, "grad_norm": 0.1643056720495224, "learning_rate": 5.3407011144497596e-05, "loss": 1.4006, "step": 86150 }, { "epoch": 4.2796416414043055, "grad_norm": 0.23833966255187988, "learning_rate": 5.339118298198245e-05, "loss": 1.3962, "step": 86160 }, { "epoch": 4.280183582569817, "grad_norm": 0.22995562851428986, "learning_rate": 5.337535501876057e-05, "loss": 1.4049, "step": 86170 }, { "epoch": 4.280725523735329, "grad_norm": 0.29235386848449707, "learning_rate": 5.335952725679265e-05, "loss": 1.3905, "step": 86180 }, { "epoch": 4.281267464900842, "grad_norm": 0.3634209632873535, "learning_rate": 5.334369969803937e-05, "loss": 1.4187, "step": 86190 }, { "epoch": 4.281809406066354, "grad_norm": 0.2359488159418106, "learning_rate": 5.33278723444614e-05, "loss": 1.3963, "step": 86200 }, { "epoch": 4.282351347231867, "grad_norm": 0.19134867191314697, "learning_rate": 5.331204519801933e-05, "loss": 1.4091, "step": 86210 }, { "epoch": 4.282730706047725, "eval_loss": 2.434406280517578, "eval_runtime": 21.9736, "eval_samples_per_second": 227.546, "eval_steps_per_second": 1.229, "step": 86217 }, { "epoch": 4.2828932883973785, "grad_norm": 0.22415363788604736, "learning_rate": 5.329621826067377e-05, "loss": 1.4002, "step": 86220 }, { "epoch": 4.28343522956289, "grad_norm": 0.22894415259361267, "learning_rate": 5.32803915343853e-05, "loss": 1.4073, "step": 86230 }, { "epoch": 4.283977170728403, "grad_norm": 0.21755249798297882, "learning_rate": 5.3264565021114474e-05, "loss": 1.3983, "step": 86240 }, { "epoch": 4.284519111893915, "grad_norm": 0.22237862646579742, "learning_rate": 5.324873872282179e-05, "loss": 1.41, "step": 86250 }, { "epoch": 4.285061053059427, "grad_norm": 0.2396155744791031, "learning_rate": 5.323291264146774e-05, "loss": 1.3943, "step": 86260 }, { "epoch": 4.28560299422494, "grad_norm": 0.1976693719625473, "learning_rate": 5.321708677901282e-05, "loss": 1.4141, "step": 86270 }, { "epoch": 4.2861449353904515, "grad_norm": 0.23698921501636505, "learning_rate": 5.320126113741742e-05, "loss": 1.3918, "step": 86280 }, { "epoch": 4.286686876555964, "grad_norm": 0.2332429438829422, "learning_rate": 5.3185435718641993e-05, "loss": 1.3988, "step": 86290 }, { "epoch": 4.287228817721476, "grad_norm": 0.18696242570877075, "learning_rate": 5.3169610524646916e-05, "loss": 1.4141, "step": 86300 }, { "epoch": 4.287445594187681, "eval_loss": 2.429657459259033, "eval_runtime": 21.9725, "eval_samples_per_second": 227.558, "eval_steps_per_second": 1.229, "step": 86304 }, { "epoch": 4.287770758886988, "grad_norm": 0.15365861356258392, "learning_rate": 5.315378555739254e-05, "loss": 1.4071, "step": 86310 }, { "epoch": 4.288312700052501, "grad_norm": 0.16417507827281952, "learning_rate": 5.313796081883918e-05, "loss": 1.3956, "step": 86320 }, { "epoch": 4.288854641218013, "grad_norm": 0.19014696776866913, "learning_rate": 5.3122136310947194e-05, "loss": 1.4021, "step": 86330 }, { "epoch": 4.289396582383525, "grad_norm": 0.20190609991550446, "learning_rate": 5.3106312035676766e-05, "loss": 1.3925, "step": 86340 }, { "epoch": 4.289938523549037, "grad_norm": 0.16114814579486847, "learning_rate": 5.3090487994988184e-05, "loss": 1.3964, "step": 86350 }, { "epoch": 4.290480464714549, "grad_norm": 0.3137826919555664, "learning_rate": 5.307466419084166e-05, "loss": 1.4145, "step": 86360 }, { "epoch": 4.291022405880062, "grad_norm": 0.23774784803390503, "learning_rate": 5.3058840625197394e-05, "loss": 1.4024, "step": 86370 }, { "epoch": 4.291564347045574, "grad_norm": 0.23315760493278503, "learning_rate": 5.304301730001552e-05, "loss": 1.3947, "step": 86380 }, { "epoch": 4.292106288211086, "grad_norm": 0.34956422448158264, "learning_rate": 5.302719421725615e-05, "loss": 1.3983, "step": 86390 }, { "epoch": 4.292160482327637, "eval_loss": 2.433197498321533, "eval_runtime": 21.9709, "eval_samples_per_second": 227.574, "eval_steps_per_second": 1.229, "step": 86391 }, { "epoch": 4.292648229376598, "grad_norm": 0.17017148435115814, "learning_rate": 5.30113713788794e-05, "loss": 1.3998, "step": 86400 }, { "epoch": 4.29319017054211, "grad_norm": 0.17241883277893066, "learning_rate": 5.2995548786845336e-05, "loss": 1.3969, "step": 86410 }, { "epoch": 4.293732111707623, "grad_norm": 0.19140473008155823, "learning_rate": 5.2979726443113985e-05, "loss": 1.3943, "step": 86420 }, { "epoch": 4.294274052873135, "grad_norm": 0.18566745519638062, "learning_rate": 5.296390434964537e-05, "loss": 1.4065, "step": 86430 }, { "epoch": 4.294815994038647, "grad_norm": 0.19149713218212128, "learning_rate": 5.294808250839942e-05, "loss": 1.3848, "step": 86440 }, { "epoch": 4.2953579352041595, "grad_norm": 0.2127964198589325, "learning_rate": 5.2932260921336106e-05, "loss": 1.4022, "step": 86450 }, { "epoch": 4.295899876369671, "grad_norm": 0.17330385744571686, "learning_rate": 5.291643959041533e-05, "loss": 1.4034, "step": 86460 }, { "epoch": 4.296441817535184, "grad_norm": 0.21223121881484985, "learning_rate": 5.290061851759698e-05, "loss": 1.4036, "step": 86470 }, { "epoch": 4.2968753704675935, "eval_loss": 2.4309399127960205, "eval_runtime": 21.9752, "eval_samples_per_second": 227.529, "eval_steps_per_second": 1.229, "step": 86478 }, { "epoch": 4.296983758700696, "grad_norm": 0.2610706388950348, "learning_rate": 5.288479770484088e-05, "loss": 1.4027, "step": 86480 }, { "epoch": 4.297525699866208, "grad_norm": 0.159620001912117, "learning_rate": 5.2868977154106866e-05, "loss": 1.4052, "step": 86490 }, { "epoch": 4.298067641031721, "grad_norm": 0.2618952691555023, "learning_rate": 5.28531568673547e-05, "loss": 1.4027, "step": 86500 }, { "epoch": 4.2986095821972325, "grad_norm": 0.2610114812850952, "learning_rate": 5.283733684654415e-05, "loss": 1.4044, "step": 86510 }, { "epoch": 4.299151523362745, "grad_norm": 0.22828315198421478, "learning_rate": 5.282151709363492e-05, "loss": 1.4014, "step": 86520 }, { "epoch": 4.299693464528257, "grad_norm": 0.1908428966999054, "learning_rate": 5.2805697610586716e-05, "loss": 1.4008, "step": 86530 }, { "epoch": 4.300235405693769, "grad_norm": 0.21566815674304962, "learning_rate": 5.278987839935913e-05, "loss": 1.4012, "step": 86540 }, { "epoch": 4.300777346859282, "grad_norm": 0.23819507658481598, "learning_rate": 5.277405946191183e-05, "loss": 1.4001, "step": 86550 }, { "epoch": 4.301319288024794, "grad_norm": 0.19423484802246094, "learning_rate": 5.275824080020439e-05, "loss": 1.4052, "step": 86560 }, { "epoch": 4.30159025860755, "eval_loss": 2.4245753288269043, "eval_runtime": 21.9738, "eval_samples_per_second": 227.543, "eval_steps_per_second": 1.229, "step": 86565 }, { "epoch": 4.3018612291903064, "grad_norm": 0.19236032664775848, "learning_rate": 5.2742422416196325e-05, "loss": 1.3929, "step": 86570 }, { "epoch": 4.302403170355818, "grad_norm": 0.1677989512681961, "learning_rate": 5.272660431184717e-05, "loss": 1.3995, "step": 86580 }, { "epoch": 4.30294511152133, "grad_norm": 0.19151169061660767, "learning_rate": 5.2710786489116416e-05, "loss": 1.4002, "step": 86590 }, { "epoch": 4.303487052686843, "grad_norm": 0.1588166505098343, "learning_rate": 5.2694968949963485e-05, "loss": 1.4059, "step": 86600 }, { "epoch": 4.304028993852355, "grad_norm": 0.1966225802898407, "learning_rate": 5.267915169634779e-05, "loss": 1.4075, "step": 86610 }, { "epoch": 4.304570935017868, "grad_norm": 0.16342392563819885, "learning_rate": 5.266333473022873e-05, "loss": 1.4083, "step": 86620 }, { "epoch": 4.3051128761833795, "grad_norm": 0.1621856838464737, "learning_rate": 5.26475180535656e-05, "loss": 1.3987, "step": 86630 }, { "epoch": 4.305654817348891, "grad_norm": 0.17513424158096313, "learning_rate": 5.2631701668317726e-05, "loss": 1.4009, "step": 86640 }, { "epoch": 4.306196758514404, "grad_norm": 0.20303316414356232, "learning_rate": 5.261588557644437e-05, "loss": 1.3863, "step": 86650 }, { "epoch": 4.306305146747507, "eval_loss": 2.4262466430664062, "eval_runtime": 21.9658, "eval_samples_per_second": 227.627, "eval_steps_per_second": 1.229, "step": 86652 }, { "epoch": 4.306738699679916, "grad_norm": 0.1640668660402298, "learning_rate": 5.260006977990474e-05, "loss": 1.4091, "step": 86660 }, { "epoch": 4.307280640845428, "grad_norm": 0.16020847856998444, "learning_rate": 5.258425428065805e-05, "loss": 1.4089, "step": 86670 }, { "epoch": 4.307822582010941, "grad_norm": 0.20218387246131897, "learning_rate": 5.256843908066346e-05, "loss": 1.3933, "step": 86680 }, { "epoch": 4.3083645231764525, "grad_norm": 0.2258821278810501, "learning_rate": 5.2552624181880086e-05, "loss": 1.406, "step": 86690 }, { "epoch": 4.308906464341965, "grad_norm": 0.2921973168849945, "learning_rate": 5.253680958626699e-05, "loss": 1.3927, "step": 86700 }, { "epoch": 4.309448405507477, "grad_norm": 0.29356056451797485, "learning_rate": 5.252099529578323e-05, "loss": 1.398, "step": 86710 }, { "epoch": 4.309990346672989, "grad_norm": 0.16703596711158752, "learning_rate": 5.250518131238783e-05, "loss": 1.4071, "step": 86720 }, { "epoch": 4.310532287838502, "grad_norm": 0.1876375526189804, "learning_rate": 5.248936763803972e-05, "loss": 1.4027, "step": 86730 }, { "epoch": 4.311020034887463, "eval_loss": 2.4287726879119873, "eval_runtime": 21.9767, "eval_samples_per_second": 227.513, "eval_steps_per_second": 1.229, "step": 86739 }, { "epoch": 4.311074229004014, "grad_norm": 0.20849046111106873, "learning_rate": 5.2473554274697846e-05, "loss": 1.3907, "step": 86740 }, { "epoch": 4.311616170169526, "grad_norm": 0.24068142473697662, "learning_rate": 5.24577412243211e-05, "loss": 1.3999, "step": 86750 }, { "epoch": 4.312158111335038, "grad_norm": 0.3270260691642761, "learning_rate": 5.244192848886834e-05, "loss": 1.4041, "step": 86760 }, { "epoch": 4.31270005250055, "grad_norm": 0.2979717552661896, "learning_rate": 5.242611607029836e-05, "loss": 1.4006, "step": 86770 }, { "epoch": 4.313241993666063, "grad_norm": 0.16698554158210754, "learning_rate": 5.241030397056996e-05, "loss": 1.4058, "step": 86780 }, { "epoch": 4.313783934831575, "grad_norm": 0.16404001414775848, "learning_rate": 5.2394492191641854e-05, "loss": 1.4078, "step": 86790 }, { "epoch": 4.314325875997087, "grad_norm": 0.16933251917362213, "learning_rate": 5.237868073547274e-05, "loss": 1.3915, "step": 86800 }, { "epoch": 4.314867817162599, "grad_norm": 0.18435190618038177, "learning_rate": 5.2362869604021304e-05, "loss": 1.3983, "step": 86810 }, { "epoch": 4.315409758328111, "grad_norm": 0.1849733144044876, "learning_rate": 5.2347058799246104e-05, "loss": 1.3964, "step": 86820 }, { "epoch": 4.315734923027419, "eval_loss": 2.4305026531219482, "eval_runtime": 21.9708, "eval_samples_per_second": 227.574, "eval_steps_per_second": 1.229, "step": 86826 }, { "epoch": 4.315951699493624, "grad_norm": 0.2581421732902527, "learning_rate": 5.233124832310574e-05, "loss": 1.3981, "step": 86830 }, { "epoch": 4.316493640659136, "grad_norm": 0.19301174581050873, "learning_rate": 5.2315438177558754e-05, "loss": 1.3909, "step": 86840 }, { "epoch": 4.317035581824648, "grad_norm": 0.3168368637561798, "learning_rate": 5.229962836456364e-05, "loss": 1.4017, "step": 86850 }, { "epoch": 4.3175775229901605, "grad_norm": 0.30351701378822327, "learning_rate": 5.2283818886078827e-05, "loss": 1.3929, "step": 86860 }, { "epoch": 4.318119464155672, "grad_norm": 0.2167241871356964, "learning_rate": 5.226800974406274e-05, "loss": 1.4048, "step": 86870 }, { "epoch": 4.318661405321185, "grad_norm": 0.24982495605945587, "learning_rate": 5.225220094047375e-05, "loss": 1.4042, "step": 86880 }, { "epoch": 4.319203346486697, "grad_norm": 0.15455639362335205, "learning_rate": 5.2236392477270165e-05, "loss": 1.4019, "step": 86890 }, { "epoch": 4.319745287652209, "grad_norm": 0.20189005136489868, "learning_rate": 5.22205843564103e-05, "loss": 1.3887, "step": 86900 }, { "epoch": 4.320287228817722, "grad_norm": 0.17087921500205994, "learning_rate": 5.2204776579852375e-05, "loss": 1.403, "step": 86910 }, { "epoch": 4.320449811167375, "eval_loss": 2.429542303085327, "eval_runtime": 21.9702, "eval_samples_per_second": 227.581, "eval_steps_per_second": 1.229, "step": 86913 }, { "epoch": 4.3208291699832335, "grad_norm": 0.19037020206451416, "learning_rate": 5.218896914955459e-05, "loss": 1.3954, "step": 86920 }, { "epoch": 4.321371111148746, "grad_norm": 0.29596278071403503, "learning_rate": 5.217316206747509e-05, "loss": 1.3994, "step": 86930 }, { "epoch": 4.321913052314258, "grad_norm": 0.2806200385093689, "learning_rate": 5.2157355335572024e-05, "loss": 1.4008, "step": 86940 }, { "epoch": 4.32245499347977, "grad_norm": 0.19318336248397827, "learning_rate": 5.214154895580342e-05, "loss": 1.4017, "step": 86950 }, { "epoch": 4.322996934645283, "grad_norm": 0.266025185585022, "learning_rate": 5.2125742930127316e-05, "loss": 1.3989, "step": 86960 }, { "epoch": 4.323538875810795, "grad_norm": 0.20118410885334015, "learning_rate": 5.210993726050173e-05, "loss": 1.4009, "step": 86970 }, { "epoch": 4.324080816976307, "grad_norm": 0.3275969922542572, "learning_rate": 5.209413194888453e-05, "loss": 1.3998, "step": 86980 }, { "epoch": 4.324622758141819, "grad_norm": 0.22696785628795624, "learning_rate": 5.2078326997233665e-05, "loss": 1.4047, "step": 86990 }, { "epoch": 4.325164699307331, "grad_norm": 0.1600492149591446, "learning_rate": 5.206252240750695e-05, "loss": 1.392, "step": 87000 }, { "epoch": 4.325164699307331, "eval_loss": 2.4212775230407715, "eval_runtime": 21.9652, "eval_samples_per_second": 227.633, "eval_steps_per_second": 1.229, "step": 87000 }, { "epoch": 4.325706640472844, "grad_norm": 0.17092597484588623, "learning_rate": 5.204671818166225e-05, "loss": 1.3931, "step": 87010 }, { "epoch": 4.326248581638356, "grad_norm": 0.3140299320220947, "learning_rate": 5.203091432165724e-05, "loss": 1.3986, "step": 87020 }, { "epoch": 4.3267905228038686, "grad_norm": 0.2115614265203476, "learning_rate": 5.201511082944968e-05, "loss": 1.4102, "step": 87030 }, { "epoch": 4.32733246396938, "grad_norm": 0.15958638489246368, "learning_rate": 5.199930770699725e-05, "loss": 1.4012, "step": 87040 }, { "epoch": 4.327874405134892, "grad_norm": 0.15782247483730316, "learning_rate": 5.198350495625753e-05, "loss": 1.4026, "step": 87050 }, { "epoch": 4.328416346300405, "grad_norm": 0.32305723428726196, "learning_rate": 5.1967702579188125e-05, "loss": 1.3988, "step": 87060 }, { "epoch": 4.328958287465917, "grad_norm": 0.23694272339344025, "learning_rate": 5.1951900577746584e-05, "loss": 1.4041, "step": 87070 }, { "epoch": 4.329500228631429, "grad_norm": 0.21021704375743866, "learning_rate": 5.1936098953890335e-05, "loss": 1.3879, "step": 87080 }, { "epoch": 4.329879587447287, "eval_loss": 2.4319331645965576, "eval_runtime": 21.9684, "eval_samples_per_second": 227.6, "eval_steps_per_second": 1.229, "step": 87087 }, { "epoch": 4.330042169796942, "grad_norm": 0.2358713448047638, "learning_rate": 5.192029770957685e-05, "loss": 1.3971, "step": 87090 }, { "epoch": 4.330584110962453, "grad_norm": 0.19418524205684662, "learning_rate": 5.1904496846763536e-05, "loss": 1.4053, "step": 87100 }, { "epoch": 4.331126052127966, "grad_norm": 0.21557055413722992, "learning_rate": 5.1888696367407696e-05, "loss": 1.408, "step": 87110 }, { "epoch": 4.331667993293478, "grad_norm": 0.15955239534378052, "learning_rate": 5.1872896273466645e-05, "loss": 1.4037, "step": 87120 }, { "epoch": 4.33220993445899, "grad_norm": 0.24472442269325256, "learning_rate": 5.1857096566897614e-05, "loss": 1.3886, "step": 87130 }, { "epoch": 4.332751875624503, "grad_norm": 0.24180065095424652, "learning_rate": 5.184129724965784e-05, "loss": 1.3925, "step": 87140 }, { "epoch": 4.333293816790015, "grad_norm": 0.3072583079338074, "learning_rate": 5.1825498323704423e-05, "loss": 1.4073, "step": 87150 }, { "epoch": 4.333835757955527, "grad_norm": 0.36256688833236694, "learning_rate": 5.1809699790994505e-05, "loss": 1.4036, "step": 87160 }, { "epoch": 4.334377699121039, "grad_norm": 0.3161357641220093, "learning_rate": 5.179390165348514e-05, "loss": 1.3945, "step": 87170 }, { "epoch": 4.334594475587244, "eval_loss": 2.4241182804107666, "eval_runtime": 21.9706, "eval_samples_per_second": 227.577, "eval_steps_per_second": 1.229, "step": 87174 }, { "epoch": 4.334919640286551, "grad_norm": 0.1909865289926529, "learning_rate": 5.177810391313329e-05, "loss": 1.4059, "step": 87180 }, { "epoch": 4.335461581452064, "grad_norm": 0.2220858484506607, "learning_rate": 5.176230657189596e-05, "loss": 1.4005, "step": 87190 }, { "epoch": 4.336003522617576, "grad_norm": 0.23918281495571136, "learning_rate": 5.1746509631730035e-05, "loss": 1.3936, "step": 87200 }, { "epoch": 4.336545463783088, "grad_norm": 0.25223907828330994, "learning_rate": 5.173071309459236e-05, "loss": 1.4003, "step": 87210 }, { "epoch": 4.3370874049486, "grad_norm": 0.1692775934934616, "learning_rate": 5.171491696243976e-05, "loss": 1.4004, "step": 87220 }, { "epoch": 4.337629346114112, "grad_norm": 0.22520475089550018, "learning_rate": 5.1699121237228984e-05, "loss": 1.4074, "step": 87230 }, { "epoch": 4.338171287279625, "grad_norm": 0.23879382014274597, "learning_rate": 5.168332592091673e-05, "loss": 1.3965, "step": 87240 }, { "epoch": 4.338713228445137, "grad_norm": 0.18485189974308014, "learning_rate": 5.166753101545967e-05, "loss": 1.4024, "step": 87250 }, { "epoch": 4.339255169610649, "grad_norm": 0.2033102959394455, "learning_rate": 5.165173652281441e-05, "loss": 1.4067, "step": 87260 }, { "epoch": 4.3393093637272, "eval_loss": 2.4278345108032227, "eval_runtime": 21.973, "eval_samples_per_second": 227.552, "eval_steps_per_second": 1.229, "step": 87261 }, { "epoch": 4.3397971107761615, "grad_norm": 0.16319087147712708, "learning_rate": 5.163594244493748e-05, "loss": 1.4038, "step": 87270 }, { "epoch": 4.340339051941673, "grad_norm": 0.18227989971637726, "learning_rate": 5.1620148783785385e-05, "loss": 1.4148, "step": 87280 }, { "epoch": 4.340880993107186, "grad_norm": 0.17856183648109436, "learning_rate": 5.1604355541314586e-05, "loss": 1.4062, "step": 87290 }, { "epoch": 4.341422934272698, "grad_norm": 0.21139036118984222, "learning_rate": 5.15885627194815e-05, "loss": 1.399, "step": 87300 }, { "epoch": 4.34196487543821, "grad_norm": 0.3597468435764313, "learning_rate": 5.157277032024245e-05, "loss": 1.3997, "step": 87310 }, { "epoch": 4.342506816603723, "grad_norm": 0.18101923167705536, "learning_rate": 5.155697834555372e-05, "loss": 1.3952, "step": 87320 }, { "epoch": 4.3430487577692345, "grad_norm": 0.20467126369476318, "learning_rate": 5.154118679737158e-05, "loss": 1.4021, "step": 87330 }, { "epoch": 4.343590698934747, "grad_norm": 0.2586832046508789, "learning_rate": 5.152539567765219e-05, "loss": 1.4073, "step": 87340 }, { "epoch": 4.344024251867157, "eval_loss": 2.429697036743164, "eval_runtime": 21.9753, "eval_samples_per_second": 227.528, "eval_steps_per_second": 1.229, "step": 87348 }, { "epoch": 4.344132640100259, "grad_norm": 0.20841944217681885, "learning_rate": 5.150960498835169e-05, "loss": 1.3959, "step": 87350 }, { "epoch": 4.344674581265771, "grad_norm": 0.15651927888393402, "learning_rate": 5.149381473142621e-05, "loss": 1.4009, "step": 87360 }, { "epoch": 4.345216522431284, "grad_norm": 0.22933441400527954, "learning_rate": 5.147802490883169e-05, "loss": 1.3964, "step": 87370 }, { "epoch": 4.345758463596796, "grad_norm": 0.2737453281879425, "learning_rate": 5.146223552252416e-05, "loss": 1.3928, "step": 87380 }, { "epoch": 4.346300404762308, "grad_norm": 0.22717493772506714, "learning_rate": 5.144644657445957e-05, "loss": 1.3937, "step": 87390 }, { "epoch": 4.34684234592782, "grad_norm": 0.16106519103050232, "learning_rate": 5.143065806659373e-05, "loss": 1.4036, "step": 87400 }, { "epoch": 4.347384287093332, "grad_norm": 0.2053665816783905, "learning_rate": 5.141487000088245e-05, "loss": 1.4002, "step": 87410 }, { "epoch": 4.347926228258845, "grad_norm": 0.14804090559482574, "learning_rate": 5.139908237928155e-05, "loss": 1.3886, "step": 87420 }, { "epoch": 4.348468169424357, "grad_norm": 0.2843748927116394, "learning_rate": 5.138329520374666e-05, "loss": 1.4034, "step": 87430 }, { "epoch": 4.348739140007113, "eval_loss": 2.4298503398895264, "eval_runtime": 21.9707, "eval_samples_per_second": 227.576, "eval_steps_per_second": 1.229, "step": 87435 }, { "epoch": 4.3490101105898695, "grad_norm": 0.15882396697998047, "learning_rate": 5.1367508476233474e-05, "loss": 1.3942, "step": 87440 }, { "epoch": 4.349552051755381, "grad_norm": 0.23637013137340546, "learning_rate": 5.135172219869755e-05, "loss": 1.4045, "step": 87450 }, { "epoch": 4.350093992920893, "grad_norm": 0.20774583518505096, "learning_rate": 5.1335936373094475e-05, "loss": 1.3984, "step": 87460 }, { "epoch": 4.350635934086406, "grad_norm": 0.21993571519851685, "learning_rate": 5.132015100137966e-05, "loss": 1.4071, "step": 87470 }, { "epoch": 4.351177875251918, "grad_norm": 0.16270707547664642, "learning_rate": 5.130436608550856e-05, "loss": 1.3982, "step": 87480 }, { "epoch": 4.35171981641743, "grad_norm": 0.16741152107715607, "learning_rate": 5.128858162743658e-05, "loss": 1.3972, "step": 87490 }, { "epoch": 4.3522617575829425, "grad_norm": 0.19168831408023834, "learning_rate": 5.1272797629118976e-05, "loss": 1.4038, "step": 87500 }, { "epoch": 4.352803698748454, "grad_norm": 0.18986192345619202, "learning_rate": 5.1257014092511e-05, "loss": 1.4015, "step": 87510 }, { "epoch": 4.353345639913967, "grad_norm": 0.15182600915431976, "learning_rate": 5.12412310195679e-05, "loss": 1.3979, "step": 87520 }, { "epoch": 4.35345402814707, "eval_loss": 2.431755542755127, "eval_runtime": 21.9949, "eval_samples_per_second": 227.325, "eval_steps_per_second": 1.228, "step": 87522 }, { "epoch": 4.353887581079479, "grad_norm": 0.23063354194164276, "learning_rate": 5.122544841224476e-05, "loss": 1.3934, "step": 87530 }, { "epoch": 4.354429522244991, "grad_norm": 0.2470499873161316, "learning_rate": 5.120966627249669e-05, "loss": 1.4009, "step": 87540 }, { "epoch": 4.354971463410504, "grad_norm": 0.15349188446998596, "learning_rate": 5.119388460227872e-05, "loss": 1.3953, "step": 87550 }, { "epoch": 4.3555134045760155, "grad_norm": 0.2129431664943695, "learning_rate": 5.1178103403545794e-05, "loss": 1.3964, "step": 87560 }, { "epoch": 4.356055345741528, "grad_norm": 0.16350790858268738, "learning_rate": 5.116232267825279e-05, "loss": 1.3981, "step": 87570 }, { "epoch": 4.35659728690704, "grad_norm": 0.19389735162258148, "learning_rate": 5.114654242835465e-05, "loss": 1.4019, "step": 87580 }, { "epoch": 4.357139228072552, "grad_norm": 0.2006743848323822, "learning_rate": 5.113076265580606e-05, "loss": 1.3975, "step": 87590 }, { "epoch": 4.357681169238065, "grad_norm": 0.1707877218723297, "learning_rate": 5.111498336256181e-05, "loss": 1.403, "step": 87600 }, { "epoch": 4.358168916287026, "eval_loss": 2.4279067516326904, "eval_runtime": 21.9698, "eval_samples_per_second": 227.585, "eval_steps_per_second": 1.229, "step": 87609 }, { "epoch": 4.358223110403577, "grad_norm": 0.2142249345779419, "learning_rate": 5.109920455057655e-05, "loss": 1.403, "step": 87610 }, { "epoch": 4.358765051569089, "grad_norm": 0.1643165647983551, "learning_rate": 5.10834262218049e-05, "loss": 1.3975, "step": 87620 }, { "epoch": 4.359306992734601, "grad_norm": 0.20058096945285797, "learning_rate": 5.106764837820141e-05, "loss": 1.4049, "step": 87630 }, { "epoch": 4.359848933900113, "grad_norm": 0.17463991045951843, "learning_rate": 5.1051871021720546e-05, "loss": 1.4036, "step": 87640 }, { "epoch": 4.360390875065626, "grad_norm": 0.20207591354846954, "learning_rate": 5.103609415431678e-05, "loss": 1.4025, "step": 87650 }, { "epoch": 4.360932816231138, "grad_norm": 0.1925676167011261, "learning_rate": 5.102031777794446e-05, "loss": 1.3931, "step": 87660 }, { "epoch": 4.36147475739665, "grad_norm": 0.2542779743671417, "learning_rate": 5.100454189455787e-05, "loss": 1.3996, "step": 87670 }, { "epoch": 4.3620166985621625, "grad_norm": 0.21328918635845184, "learning_rate": 5.0988766506111316e-05, "loss": 1.398, "step": 87680 }, { "epoch": 4.362558639727674, "grad_norm": 0.23086091876029968, "learning_rate": 5.097299161455893e-05, "loss": 1.4014, "step": 87690 }, { "epoch": 4.362883804426982, "eval_loss": 2.4321718215942383, "eval_runtime": 21.9729, "eval_samples_per_second": 227.553, "eval_steps_per_second": 1.229, "step": 87696 }, { "epoch": 4.363100580893187, "grad_norm": 0.15713347494602203, "learning_rate": 5.095721722185487e-05, "loss": 1.3932, "step": 87700 }, { "epoch": 4.363642522058699, "grad_norm": 0.16743972897529602, "learning_rate": 5.0941443329953185e-05, "loss": 1.399, "step": 87710 }, { "epoch": 4.364184463224211, "grad_norm": 0.15407240390777588, "learning_rate": 5.0925669940807885e-05, "loss": 1.4004, "step": 87720 }, { "epoch": 4.364726404389724, "grad_norm": 0.1776094287633896, "learning_rate": 5.090989705637289e-05, "loss": 1.3983, "step": 87730 }, { "epoch": 4.3652683455552355, "grad_norm": 0.16441775858402252, "learning_rate": 5.0894124678602116e-05, "loss": 1.3959, "step": 87740 }, { "epoch": 4.365810286720748, "grad_norm": 0.21521979570388794, "learning_rate": 5.0878352809449336e-05, "loss": 1.3909, "step": 87750 }, { "epoch": 4.36635222788626, "grad_norm": 0.1725737303495407, "learning_rate": 5.086258145086831e-05, "loss": 1.3974, "step": 87760 }, { "epoch": 4.366894169051772, "grad_norm": 0.1918652206659317, "learning_rate": 5.084681060481271e-05, "loss": 1.3925, "step": 87770 }, { "epoch": 4.367436110217285, "grad_norm": 0.16911828517913818, "learning_rate": 5.083104027323623e-05, "loss": 1.391, "step": 87780 }, { "epoch": 4.367598692566938, "eval_loss": 2.4334969520568848, "eval_runtime": 21.9702, "eval_samples_per_second": 227.581, "eval_steps_per_second": 1.229, "step": 87783 }, { "epoch": 4.367978051382797, "grad_norm": 0.16556398570537567, "learning_rate": 5.081527045809236e-05, "loss": 1.3964, "step": 87790 }, { "epoch": 4.368519992548309, "grad_norm": 0.17277011275291443, "learning_rate": 5.0799501161334606e-05, "loss": 1.3978, "step": 87800 }, { "epoch": 4.369061933713821, "grad_norm": 0.17669574916362762, "learning_rate": 5.078373238491642e-05, "loss": 1.4088, "step": 87810 }, { "epoch": 4.369603874879333, "grad_norm": 0.19774076342582703, "learning_rate": 5.076796413079116e-05, "loss": 1.4003, "step": 87820 }, { "epoch": 4.370145816044846, "grad_norm": 0.22697632014751434, "learning_rate": 5.075219640091212e-05, "loss": 1.3944, "step": 87830 }, { "epoch": 4.370687757210358, "grad_norm": 0.19095872342586517, "learning_rate": 5.0736429197232574e-05, "loss": 1.3905, "step": 87840 }, { "epoch": 4.3712296983758705, "grad_norm": 0.2845691740512848, "learning_rate": 5.072066252170565e-05, "loss": 1.4046, "step": 87850 }, { "epoch": 4.371771639541382, "grad_norm": 0.20539964735507965, "learning_rate": 5.070489637628447e-05, "loss": 1.3944, "step": 87860 }, { "epoch": 4.372313580706894, "grad_norm": 0.22066408395767212, "learning_rate": 5.06891307629221e-05, "loss": 1.3861, "step": 87870 }, { "epoch": 4.372313580706894, "eval_loss": 2.430873394012451, "eval_runtime": 22.02, "eval_samples_per_second": 227.067, "eval_steps_per_second": 1.226, "step": 87870 }, { "epoch": 4.372855521872407, "grad_norm": 0.25216028094291687, "learning_rate": 5.067336568357147e-05, "loss": 1.3944, "step": 87880 }, { "epoch": 4.373397463037919, "grad_norm": 0.16091406345367432, "learning_rate": 5.065760114018553e-05, "loss": 1.4024, "step": 87890 }, { "epoch": 4.373939404203432, "grad_norm": 0.19635522365570068, "learning_rate": 5.0641837134717095e-05, "loss": 1.409, "step": 87900 }, { "epoch": 4.3744813453689435, "grad_norm": 0.24364635348320007, "learning_rate": 5.062607366911897e-05, "loss": 1.3902, "step": 87910 }, { "epoch": 4.375023286534455, "grad_norm": 0.24988219141960144, "learning_rate": 5.0610310745343836e-05, "loss": 1.4047, "step": 87920 }, { "epoch": 4.375565227699968, "grad_norm": 0.18812812864780426, "learning_rate": 5.0594548365344354e-05, "loss": 1.3898, "step": 87930 }, { "epoch": 4.37610716886548, "grad_norm": 0.3591182231903076, "learning_rate": 5.057878653107311e-05, "loss": 1.4025, "step": 87940 }, { "epoch": 4.376649110030992, "grad_norm": 0.23308515548706055, "learning_rate": 5.0563025244482574e-05, "loss": 1.4003, "step": 87950 }, { "epoch": 4.37702846884685, "eval_loss": 2.4251065254211426, "eval_runtime": 21.9772, "eval_samples_per_second": 227.509, "eval_steps_per_second": 1.229, "step": 87957 }, { "epoch": 4.377191051196505, "grad_norm": 0.17095594108104706, "learning_rate": 5.054726450752521e-05, "loss": 1.4007, "step": 87960 }, { "epoch": 4.3777329923620165, "grad_norm": 0.19432257115840912, "learning_rate": 5.05315043221534e-05, "loss": 1.4043, "step": 87970 }, { "epoch": 4.378274933527529, "grad_norm": 0.18219472467899323, "learning_rate": 5.051574469031942e-05, "loss": 1.4022, "step": 87980 }, { "epoch": 4.378816874693041, "grad_norm": 0.17361074686050415, "learning_rate": 5.049998561397552e-05, "loss": 1.3977, "step": 87990 }, { "epoch": 4.379358815858553, "grad_norm": 0.21151654422283173, "learning_rate": 5.0484227095073865e-05, "loss": 1.3887, "step": 88000 }, { "epoch": 4.379900757024066, "grad_norm": 0.16382147371768951, "learning_rate": 5.0468469135566554e-05, "loss": 1.3901, "step": 88010 }, { "epoch": 4.380442698189578, "grad_norm": 0.1616712063550949, "learning_rate": 5.045271173740562e-05, "loss": 1.4068, "step": 88020 }, { "epoch": 4.38098463935509, "grad_norm": 0.17013293504714966, "learning_rate": 5.043695490254302e-05, "loss": 1.3819, "step": 88030 }, { "epoch": 4.381526580520602, "grad_norm": 0.1871418058872223, "learning_rate": 5.0421198632930624e-05, "loss": 1.4056, "step": 88040 }, { "epoch": 4.381743356986807, "eval_loss": 2.4317269325256348, "eval_runtime": 21.9684, "eval_samples_per_second": 227.6, "eval_steps_per_second": 1.229, "step": 88044 }, { "epoch": 4.382068521686114, "grad_norm": 0.2265293151140213, "learning_rate": 5.0405442930520253e-05, "loss": 1.3987, "step": 88050 }, { "epoch": 4.382610462851627, "grad_norm": 0.26006123423576355, "learning_rate": 5.0389687797263664e-05, "loss": 1.397, "step": 88060 }, { "epoch": 4.383152404017139, "grad_norm": 0.19830361008644104, "learning_rate": 5.037393323511256e-05, "loss": 1.421, "step": 88070 }, { "epoch": 4.383694345182651, "grad_norm": 0.2708401083946228, "learning_rate": 5.03581792460185e-05, "loss": 1.3956, "step": 88080 }, { "epoch": 4.384236286348163, "grad_norm": 0.18328110873699188, "learning_rate": 5.034242583193305e-05, "loss": 1.3926, "step": 88090 }, { "epoch": 4.384778227513675, "grad_norm": 0.19660696387290955, "learning_rate": 5.032667299480767e-05, "loss": 1.386, "step": 88100 }, { "epoch": 4.385320168679188, "grad_norm": 0.21113744378089905, "learning_rate": 5.0310920736593737e-05, "loss": 1.4048, "step": 88110 }, { "epoch": 4.3858621098447, "grad_norm": 0.22649426758289337, "learning_rate": 5.029516905924258e-05, "loss": 1.3993, "step": 88120 }, { "epoch": 4.386404051010212, "grad_norm": 0.16807417571544647, "learning_rate": 5.02794179647055e-05, "loss": 1.3882, "step": 88130 }, { "epoch": 4.3864582451267635, "eval_loss": 2.411639451980591, "eval_runtime": 21.9652, "eval_samples_per_second": 227.633, "eval_steps_per_second": 1.229, "step": 88131 }, { "epoch": 4.386945992175725, "grad_norm": 0.18633247911930084, "learning_rate": 5.026366745493359e-05, "loss": 1.4067, "step": 88140 }, { "epoch": 4.387487933341236, "grad_norm": 0.16578739881515503, "learning_rate": 5.0247917531877985e-05, "loss": 1.3962, "step": 88150 }, { "epoch": 4.388029874506749, "grad_norm": 0.23297452926635742, "learning_rate": 5.0232168197489746e-05, "loss": 1.4044, "step": 88160 }, { "epoch": 4.388571815672261, "grad_norm": 0.24161213636398315, "learning_rate": 5.02164194537198e-05, "loss": 1.4081, "step": 88170 }, { "epoch": 4.389113756837773, "grad_norm": 0.20169183611869812, "learning_rate": 5.020067130251904e-05, "loss": 1.3996, "step": 88180 }, { "epoch": 4.389655698003286, "grad_norm": 0.21233811974525452, "learning_rate": 5.01849237458383e-05, "loss": 1.4046, "step": 88190 }, { "epoch": 4.390197639168798, "grad_norm": 0.15420041978359222, "learning_rate": 5.016917678562828e-05, "loss": 1.4007, "step": 88200 }, { "epoch": 4.39073958033431, "grad_norm": 0.1760631501674652, "learning_rate": 5.0153430423839676e-05, "loss": 1.3998, "step": 88210 }, { "epoch": 4.39117313326672, "eval_loss": 2.422487258911133, "eval_runtime": 21.9702, "eval_samples_per_second": 227.581, "eval_steps_per_second": 1.229, "step": 88218 }, { "epoch": 4.391281521499822, "grad_norm": 0.18712304532527924, "learning_rate": 5.013768466242307e-05, "loss": 1.403, "step": 88220 }, { "epoch": 4.391823462665334, "grad_norm": 0.1596558541059494, "learning_rate": 5.0121939503328985e-05, "loss": 1.3987, "step": 88230 }, { "epoch": 4.392365403830847, "grad_norm": 0.1593562513589859, "learning_rate": 5.010619494850785e-05, "loss": 1.4045, "step": 88240 }, { "epoch": 4.392907344996359, "grad_norm": 0.18083150684833527, "learning_rate": 5.0090450999910035e-05, "loss": 1.3972, "step": 88250 }, { "epoch": 4.3934492861618715, "grad_norm": 0.16882987320423126, "learning_rate": 5.007470765948584e-05, "loss": 1.4041, "step": 88260 }, { "epoch": 4.393991227327383, "grad_norm": 0.17636427283287048, "learning_rate": 5.005896492918547e-05, "loss": 1.3923, "step": 88270 }, { "epoch": 4.394533168492895, "grad_norm": 0.16823971271514893, "learning_rate": 5.004322281095907e-05, "loss": 1.395, "step": 88280 }, { "epoch": 4.395075109658408, "grad_norm": 0.1983443945646286, "learning_rate": 5.002748130675672e-05, "loss": 1.3915, "step": 88290 }, { "epoch": 4.39561705082392, "grad_norm": 0.16008202731609344, "learning_rate": 5.001174041852839e-05, "loss": 1.3959, "step": 88300 }, { "epoch": 4.395888021406676, "eval_loss": 2.420740842819214, "eval_runtime": 21.9717, "eval_samples_per_second": 227.565, "eval_steps_per_second": 1.229, "step": 88305 }, { "epoch": 4.396158991989433, "grad_norm": 0.1860068142414093, "learning_rate": 4.9996000148224e-05, "loss": 1.3967, "step": 88310 }, { "epoch": 4.3967009331549445, "grad_norm": 0.2595568001270294, "learning_rate": 4.99802604977934e-05, "loss": 1.3915, "step": 88320 }, { "epoch": 4.397242874320456, "grad_norm": 0.16008210182189941, "learning_rate": 4.996452146918632e-05, "loss": 1.396, "step": 88330 }, { "epoch": 4.397784815485969, "grad_norm": 0.16884075105190277, "learning_rate": 4.994878306435245e-05, "loss": 1.3935, "step": 88340 }, { "epoch": 4.398326756651481, "grad_norm": 0.17851825058460236, "learning_rate": 4.993304528524143e-05, "loss": 1.3962, "step": 88350 }, { "epoch": 4.398868697816993, "grad_norm": 0.16581206023693085, "learning_rate": 4.9917308133802745e-05, "loss": 1.399, "step": 88360 }, { "epoch": 4.399410638982506, "grad_norm": 0.2509933412075043, "learning_rate": 4.9901571611985855e-05, "loss": 1.3986, "step": 88370 }, { "epoch": 4.3999525801480175, "grad_norm": 0.1800008863210678, "learning_rate": 4.988583572174015e-05, "loss": 1.4032, "step": 88380 }, { "epoch": 4.40049452131353, "grad_norm": 0.19821348786354065, "learning_rate": 4.987010046501491e-05, "loss": 1.3933, "step": 88390 }, { "epoch": 4.400602909546633, "eval_loss": 2.4244353771209717, "eval_runtime": 22.0054, "eval_samples_per_second": 227.217, "eval_steps_per_second": 1.227, "step": 88392 }, { "epoch": 4.401036462479042, "grad_norm": 0.18530848622322083, "learning_rate": 4.9854365843759354e-05, "loss": 1.405, "step": 88400 }, { "epoch": 4.401578403644554, "grad_norm": 0.1724652349948883, "learning_rate": 4.983863185992261e-05, "loss": 1.3891, "step": 88410 }, { "epoch": 4.402120344810067, "grad_norm": 0.2382229119539261, "learning_rate": 4.982289851545376e-05, "loss": 1.3989, "step": 88420 }, { "epoch": 4.402662285975579, "grad_norm": 0.22667750716209412, "learning_rate": 4.980716581230176e-05, "loss": 1.3992, "step": 88430 }, { "epoch": 4.403204227141091, "grad_norm": 0.2111603170633316, "learning_rate": 4.9791433752415494e-05, "loss": 1.3973, "step": 88440 }, { "epoch": 4.403746168306603, "grad_norm": 0.16338708996772766, "learning_rate": 4.977570233774382e-05, "loss": 1.4002, "step": 88450 }, { "epoch": 4.404288109472115, "grad_norm": 0.27522462606430054, "learning_rate": 4.9759971570235454e-05, "loss": 1.4011, "step": 88460 }, { "epoch": 4.404830050637628, "grad_norm": 0.2958241105079651, "learning_rate": 4.9744241451839056e-05, "loss": 1.4013, "step": 88470 }, { "epoch": 4.405317797686589, "eval_loss": 2.416937828063965, "eval_runtime": 22.0681, "eval_samples_per_second": 226.571, "eval_steps_per_second": 1.223, "step": 88479 }, { "epoch": 4.40537199180314, "grad_norm": 0.1971891224384308, "learning_rate": 4.9728511984503223e-05, "loss": 1.3898, "step": 88480 }, { "epoch": 4.405913932968652, "grad_norm": 0.18956254422664642, "learning_rate": 4.971278317017642e-05, "loss": 1.3957, "step": 88490 }, { "epoch": 4.406455874134164, "grad_norm": 0.2962309420108795, "learning_rate": 4.969705501080709e-05, "loss": 1.3972, "step": 88500 }, { "epoch": 4.406997815299676, "grad_norm": 0.2366226315498352, "learning_rate": 4.968132750834358e-05, "loss": 1.4011, "step": 88510 }, { "epoch": 4.407539756465189, "grad_norm": 0.25303101539611816, "learning_rate": 4.9665600664734104e-05, "loss": 1.3946, "step": 88520 }, { "epoch": 4.408081697630701, "grad_norm": 0.1666572242975235, "learning_rate": 4.964987448192686e-05, "loss": 1.3924, "step": 88530 }, { "epoch": 4.408623638796213, "grad_norm": 0.21928876638412476, "learning_rate": 4.9634148961869945e-05, "loss": 1.3978, "step": 88540 }, { "epoch": 4.4091655799617255, "grad_norm": 0.15686646103858948, "learning_rate": 4.961842410651135e-05, "loss": 1.3989, "step": 88550 }, { "epoch": 4.409707521127237, "grad_norm": 0.2927147448062897, "learning_rate": 4.960269991779902e-05, "loss": 1.3994, "step": 88560 }, { "epoch": 4.410032685826545, "eval_loss": 2.4300734996795654, "eval_runtime": 22.0915, "eval_samples_per_second": 226.331, "eval_steps_per_second": 1.222, "step": 88566 }, { "epoch": 4.41024946229275, "grad_norm": 0.16612303256988525, "learning_rate": 4.958697639768078e-05, "loss": 1.3973, "step": 88570 }, { "epoch": 4.410791403458262, "grad_norm": 0.2341911494731903, "learning_rate": 4.957125354810444e-05, "loss": 1.399, "step": 88580 }, { "epoch": 4.411333344623774, "grad_norm": 0.2245527058839798, "learning_rate": 4.9555531371017604e-05, "loss": 1.4014, "step": 88590 }, { "epoch": 4.411875285789287, "grad_norm": 0.25589320063591003, "learning_rate": 4.9539809868367906e-05, "loss": 1.395, "step": 88600 }, { "epoch": 4.4124172269547985, "grad_norm": 0.19590523838996887, "learning_rate": 4.952408904210288e-05, "loss": 1.3888, "step": 88610 }, { "epoch": 4.412959168120311, "grad_norm": 0.1683030128479004, "learning_rate": 4.950836889416991e-05, "loss": 1.3929, "step": 88620 }, { "epoch": 4.413501109285823, "grad_norm": 0.20308978855609894, "learning_rate": 4.949264942651637e-05, "loss": 1.4006, "step": 88630 }, { "epoch": 4.414043050451335, "grad_norm": 0.167439803481102, "learning_rate": 4.9476930641089506e-05, "loss": 1.3978, "step": 88640 }, { "epoch": 4.414584991616848, "grad_norm": 0.2057618796825409, "learning_rate": 4.9461212539836486e-05, "loss": 1.3981, "step": 88650 }, { "epoch": 4.414747573966501, "eval_loss": 2.4309983253479004, "eval_runtime": 21.9716, "eval_samples_per_second": 227.566, "eval_steps_per_second": 1.229, "step": 88653 }, { "epoch": 4.41512693278236, "grad_norm": 0.17505063116550446, "learning_rate": 4.944549512470441e-05, "loss": 1.3983, "step": 88660 }, { "epoch": 4.415668873947872, "grad_norm": 0.16853992640972137, "learning_rate": 4.942977839764028e-05, "loss": 1.3883, "step": 88670 }, { "epoch": 4.416210815113384, "grad_norm": 0.3285401165485382, "learning_rate": 4.941406236059104e-05, "loss": 1.3933, "step": 88680 }, { "epoch": 4.416752756278896, "grad_norm": 0.22175738215446472, "learning_rate": 4.9398347015503474e-05, "loss": 1.3954, "step": 88690 }, { "epoch": 4.417294697444409, "grad_norm": 0.17914403975009918, "learning_rate": 4.938263236432438e-05, "loss": 1.3919, "step": 88700 }, { "epoch": 4.417836638609921, "grad_norm": 0.2186654806137085, "learning_rate": 4.936691840900041e-05, "loss": 1.3968, "step": 88710 }, { "epoch": 4.418378579775434, "grad_norm": 0.2118908166885376, "learning_rate": 4.935120515147811e-05, "loss": 1.3969, "step": 88720 }, { "epoch": 4.4189205209409455, "grad_norm": 0.2079256772994995, "learning_rate": 4.9335492593704e-05, "loss": 1.3966, "step": 88730 }, { "epoch": 4.419462462106457, "grad_norm": 0.17602074146270752, "learning_rate": 4.931978073762448e-05, "loss": 1.404, "step": 88740 }, { "epoch": 4.419462462106457, "eval_loss": 2.4278478622436523, "eval_runtime": 21.9718, "eval_samples_per_second": 227.565, "eval_steps_per_second": 1.229, "step": 88740 }, { "epoch": 4.42000440327197, "grad_norm": 0.16428543627262115, "learning_rate": 4.930406958518584e-05, "loss": 1.415, "step": 88750 }, { "epoch": 4.420546344437482, "grad_norm": 0.18856200575828552, "learning_rate": 4.928835913833435e-05, "loss": 1.4063, "step": 88760 }, { "epoch": 4.421088285602994, "grad_norm": 0.22589999437332153, "learning_rate": 4.9272649399016134e-05, "loss": 1.4013, "step": 88770 }, { "epoch": 4.421630226768507, "grad_norm": 0.17148156464099884, "learning_rate": 4.925694036917723e-05, "loss": 1.3959, "step": 88780 }, { "epoch": 4.4221721679340185, "grad_norm": 0.19456231594085693, "learning_rate": 4.924123205076362e-05, "loss": 1.3995, "step": 88790 }, { "epoch": 4.422714109099531, "grad_norm": 0.18404072523117065, "learning_rate": 4.92255244457212e-05, "loss": 1.3852, "step": 88800 }, { "epoch": 4.423256050265043, "grad_norm": 0.16325946152210236, "learning_rate": 4.920981755599573e-05, "loss": 1.399, "step": 88810 }, { "epoch": 4.423797991430555, "grad_norm": 0.16499677300453186, "learning_rate": 4.9194111383532914e-05, "loss": 1.3996, "step": 88820 }, { "epoch": 4.4241773502464135, "eval_loss": 2.4304182529449463, "eval_runtime": 21.9655, "eval_samples_per_second": 227.63, "eval_steps_per_second": 1.229, "step": 88827 }, { "epoch": 4.424339932596068, "grad_norm": 0.20651055872440338, "learning_rate": 4.917840593027838e-05, "loss": 1.3967, "step": 88830 }, { "epoch": 4.42488187376158, "grad_norm": 0.16554446518421173, "learning_rate": 4.9162701198177655e-05, "loss": 1.3991, "step": 88840 }, { "epoch": 4.425423814927092, "grad_norm": 0.19531118869781494, "learning_rate": 4.914699718917615e-05, "loss": 1.3937, "step": 88850 }, { "epoch": 4.425965756092604, "grad_norm": 0.23367935419082642, "learning_rate": 4.913129390521922e-05, "loss": 1.3936, "step": 88860 }, { "epoch": 4.426507697258116, "grad_norm": 0.2578764855861664, "learning_rate": 4.911559134825213e-05, "loss": 1.3988, "step": 88870 }, { "epoch": 4.427049638423629, "grad_norm": 0.2369229793548584, "learning_rate": 4.9099889520220034e-05, "loss": 1.3805, "step": 88880 }, { "epoch": 4.427591579589141, "grad_norm": 0.20222489535808563, "learning_rate": 4.9084188423068e-05, "loss": 1.3985, "step": 88890 }, { "epoch": 4.428133520754653, "grad_norm": 0.1944282501935959, "learning_rate": 4.9068488058741044e-05, "loss": 1.3978, "step": 88900 }, { "epoch": 4.428675461920165, "grad_norm": 0.2579196095466614, "learning_rate": 4.905278842918402e-05, "loss": 1.4046, "step": 88910 }, { "epoch": 4.4288922383863705, "eval_loss": 2.4295578002929688, "eval_runtime": 21.969, "eval_samples_per_second": 227.593, "eval_steps_per_second": 1.229, "step": 88914 }, { "epoch": 4.429217403085677, "grad_norm": 0.21468320488929749, "learning_rate": 4.903708953634174e-05, "loss": 1.3975, "step": 88920 }, { "epoch": 4.42975934425119, "grad_norm": 0.35410991311073303, "learning_rate": 4.902139138215893e-05, "loss": 1.3927, "step": 88930 }, { "epoch": 4.430301285416702, "grad_norm": 0.3901987373828888, "learning_rate": 4.900569396858019e-05, "loss": 1.4055, "step": 88940 }, { "epoch": 4.430843226582214, "grad_norm": 0.20195499062538147, "learning_rate": 4.898999729755006e-05, "loss": 1.3868, "step": 88950 }, { "epoch": 4.4313851677477265, "grad_norm": 0.28525593876838684, "learning_rate": 4.8974301371012986e-05, "loss": 1.4099, "step": 88960 }, { "epoch": 4.431927108913238, "grad_norm": 0.17980363965034485, "learning_rate": 4.895860619091327e-05, "loss": 1.3877, "step": 88970 }, { "epoch": 4.432469050078751, "grad_norm": 0.18058229982852936, "learning_rate": 4.8942911759195196e-05, "loss": 1.3908, "step": 88980 }, { "epoch": 4.433010991244263, "grad_norm": 0.17202942073345184, "learning_rate": 4.892721807780293e-05, "loss": 1.399, "step": 88990 }, { "epoch": 4.433552932409775, "grad_norm": 0.24401873350143433, "learning_rate": 4.891152514868052e-05, "loss": 1.386, "step": 89000 }, { "epoch": 4.433607126526327, "eval_loss": 2.4285366535186768, "eval_runtime": 21.9665, "eval_samples_per_second": 227.619, "eval_steps_per_second": 1.229, "step": 89001 }, { "epoch": 4.434094873575288, "grad_norm": 0.1815049946308136, "learning_rate": 4.889583297377194e-05, "loss": 1.398, "step": 89010 }, { "epoch": 4.4346368147407995, "grad_norm": 0.2082160860300064, "learning_rate": 4.8880141555021055e-05, "loss": 1.4029, "step": 89020 }, { "epoch": 4.435178755906312, "grad_norm": 0.20171265304088593, "learning_rate": 4.8864450894371683e-05, "loss": 1.397, "step": 89030 }, { "epoch": 4.435720697071824, "grad_norm": 0.2109455019235611, "learning_rate": 4.8848760993767497e-05, "loss": 1.3919, "step": 89040 }, { "epoch": 4.436262638237336, "grad_norm": 0.2744022011756897, "learning_rate": 4.883307185515207e-05, "loss": 1.4098, "step": 89050 }, { "epoch": 4.436804579402849, "grad_norm": 0.1699230968952179, "learning_rate": 4.881738348046897e-05, "loss": 1.3943, "step": 89060 }, { "epoch": 4.437346520568361, "grad_norm": 0.16242730617523193, "learning_rate": 4.880169587166151e-05, "loss": 1.4005, "step": 89070 }, { "epoch": 4.437888461733873, "grad_norm": 0.14713962376117706, "learning_rate": 4.8786009030673084e-05, "loss": 1.3952, "step": 89080 }, { "epoch": 4.438322014666283, "eval_loss": 2.425018548965454, "eval_runtime": 21.9685, "eval_samples_per_second": 227.599, "eval_steps_per_second": 1.229, "step": 89088 }, { "epoch": 4.438430402899385, "grad_norm": 0.1697177290916443, "learning_rate": 4.877032295944689e-05, "loss": 1.4059, "step": 89090 }, { "epoch": 4.438972344064897, "grad_norm": 0.18118594586849213, "learning_rate": 4.875463765992603e-05, "loss": 1.3885, "step": 89100 }, { "epoch": 4.43951428523041, "grad_norm": 0.3551163375377655, "learning_rate": 4.8738953134053535e-05, "loss": 1.3941, "step": 89110 }, { "epoch": 4.440056226395922, "grad_norm": 0.18483349680900574, "learning_rate": 4.872326938377235e-05, "loss": 1.3873, "step": 89120 }, { "epoch": 4.4405981675614346, "grad_norm": 0.22286327183246613, "learning_rate": 4.870758641102531e-05, "loss": 1.3998, "step": 89130 }, { "epoch": 4.441140108726946, "grad_norm": 0.2854524850845337, "learning_rate": 4.8691904217755126e-05, "loss": 1.3987, "step": 89140 }, { "epoch": 4.441682049892458, "grad_norm": 0.37562283873558044, "learning_rate": 4.867622280590447e-05, "loss": 1.3821, "step": 89150 }, { "epoch": 4.442223991057971, "grad_norm": 0.20074434578418732, "learning_rate": 4.866054217741589e-05, "loss": 1.3955, "step": 89160 }, { "epoch": 4.442765932223483, "grad_norm": 0.1584572046995163, "learning_rate": 4.86448623342318e-05, "loss": 1.3925, "step": 89170 }, { "epoch": 4.443036902806239, "eval_loss": 2.4282519817352295, "eval_runtime": 21.9667, "eval_samples_per_second": 227.618, "eval_steps_per_second": 1.229, "step": 89175 }, { "epoch": 4.443307873388995, "grad_norm": 0.18110856413841248, "learning_rate": 4.8629183278294584e-05, "loss": 1.4015, "step": 89180 }, { "epoch": 4.443849814554508, "grad_norm": 0.1557188332080841, "learning_rate": 4.86135050115465e-05, "loss": 1.4022, "step": 89190 }, { "epoch": 4.444391755720019, "grad_norm": 0.15835806727409363, "learning_rate": 4.859782753592968e-05, "loss": 1.3943, "step": 89200 }, { "epoch": 4.444933696885532, "grad_norm": 0.24644139409065247, "learning_rate": 4.858215085338617e-05, "loss": 1.4002, "step": 89210 }, { "epoch": 4.445475638051044, "grad_norm": 0.3007756471633911, "learning_rate": 4.856647496585797e-05, "loss": 1.3948, "step": 89220 }, { "epoch": 4.446017579216556, "grad_norm": 0.15734322369098663, "learning_rate": 4.8550799875286904e-05, "loss": 1.4099, "step": 89230 }, { "epoch": 4.446559520382069, "grad_norm": 0.19473432004451752, "learning_rate": 4.853512558361475e-05, "loss": 1.4047, "step": 89240 }, { "epoch": 4.447101461547581, "grad_norm": 0.17770782113075256, "learning_rate": 4.8519452092783193e-05, "loss": 1.3881, "step": 89250 }, { "epoch": 4.447643402713093, "grad_norm": 0.19613268971443176, "learning_rate": 4.850377940473375e-05, "loss": 1.4063, "step": 89260 }, { "epoch": 4.447751790946195, "eval_loss": 2.4215874671936035, "eval_runtime": 21.964, "eval_samples_per_second": 227.645, "eval_steps_per_second": 1.229, "step": 89262 }, { "epoch": 4.448185343878605, "grad_norm": 0.23467159271240234, "learning_rate": 4.848810752140791e-05, "loss": 1.3918, "step": 89270 }, { "epoch": 4.448727285044117, "grad_norm": 0.1882312148809433, "learning_rate": 4.847243644474707e-05, "loss": 1.3882, "step": 89280 }, { "epoch": 4.44926922620963, "grad_norm": 0.16581347584724426, "learning_rate": 4.8456766176692435e-05, "loss": 1.3909, "step": 89290 }, { "epoch": 4.449811167375142, "grad_norm": 0.2854909300804138, "learning_rate": 4.84410967191852e-05, "loss": 1.4066, "step": 89300 }, { "epoch": 4.4503531085406545, "grad_norm": 0.28027182817459106, "learning_rate": 4.842542807416644e-05, "loss": 1.4093, "step": 89310 }, { "epoch": 4.450895049706166, "grad_norm": 0.2496391236782074, "learning_rate": 4.840976024357709e-05, "loss": 1.3993, "step": 89320 }, { "epoch": 4.451436990871678, "grad_norm": 0.17088313400745392, "learning_rate": 4.839409322935804e-05, "loss": 1.3956, "step": 89330 }, { "epoch": 4.451978932037191, "grad_norm": 0.20805992186069489, "learning_rate": 4.837842703345003e-05, "loss": 1.3964, "step": 89340 }, { "epoch": 4.452466679086152, "eval_loss": 2.4300498962402344, "eval_runtime": 21.9632, "eval_samples_per_second": 227.654, "eval_steps_per_second": 1.229, "step": 89349 }, { "epoch": 4.452520873202703, "grad_norm": 0.22051380574703217, "learning_rate": 4.8362761657793756e-05, "loss": 1.3976, "step": 89350 }, { "epoch": 4.453062814368215, "grad_norm": 0.20481519401073456, "learning_rate": 4.834709710432972e-05, "loss": 1.3972, "step": 89360 }, { "epoch": 4.4536047555337275, "grad_norm": 0.17479705810546875, "learning_rate": 4.8331433374998426e-05, "loss": 1.4061, "step": 89370 }, { "epoch": 4.454146696699239, "grad_norm": 0.16444361209869385, "learning_rate": 4.831577047174023e-05, "loss": 1.3974, "step": 89380 }, { "epoch": 4.454688637864752, "grad_norm": 0.18008925020694733, "learning_rate": 4.830010839649535e-05, "loss": 1.4009, "step": 89390 }, { "epoch": 4.455230579030264, "grad_norm": 0.17722876369953156, "learning_rate": 4.828444715120395e-05, "loss": 1.397, "step": 89400 }, { "epoch": 4.455772520195776, "grad_norm": 0.21487845480442047, "learning_rate": 4.826878673780609e-05, "loss": 1.398, "step": 89410 }, { "epoch": 4.456314461361289, "grad_norm": 0.16381223499774933, "learning_rate": 4.8253127158241693e-05, "loss": 1.4073, "step": 89420 }, { "epoch": 4.4568564025268005, "grad_norm": 0.20386900007724762, "learning_rate": 4.823746841445062e-05, "loss": 1.3958, "step": 89430 }, { "epoch": 4.457181567226108, "eval_loss": 2.4307987689971924, "eval_runtime": 21.974, "eval_samples_per_second": 227.542, "eval_steps_per_second": 1.229, "step": 89436 }, { "epoch": 4.457398343692313, "grad_norm": 0.2596319019794464, "learning_rate": 4.82218105083726e-05, "loss": 1.4007, "step": 89440 }, { "epoch": 4.457940284857825, "grad_norm": 0.1717374473810196, "learning_rate": 4.820615344194728e-05, "loss": 1.3932, "step": 89450 }, { "epoch": 4.458482226023337, "grad_norm": 0.16155289113521576, "learning_rate": 4.819049721711415e-05, "loss": 1.3987, "step": 89460 }, { "epoch": 4.45902416718885, "grad_norm": 0.19415195286273956, "learning_rate": 4.8174841835812665e-05, "loss": 1.3963, "step": 89470 }, { "epoch": 4.459566108354362, "grad_norm": 0.20656591653823853, "learning_rate": 4.815918729998217e-05, "loss": 1.3838, "step": 89480 }, { "epoch": 4.460108049519874, "grad_norm": 0.33386117219924927, "learning_rate": 4.814353361156182e-05, "loss": 1.3849, "step": 89490 }, { "epoch": 4.460649990685386, "grad_norm": 0.21893806755542755, "learning_rate": 4.8127880772490764e-05, "loss": 1.3819, "step": 89500 }, { "epoch": 4.461191931850898, "grad_norm": 0.22136323153972626, "learning_rate": 4.811222878470801e-05, "loss": 1.3927, "step": 89510 }, { "epoch": 4.461733873016411, "grad_norm": 0.40589383244514465, "learning_rate": 4.809657765015245e-05, "loss": 1.4007, "step": 89520 }, { "epoch": 4.461896455366064, "eval_loss": 2.4328644275665283, "eval_runtime": 21.9657, "eval_samples_per_second": 227.627, "eval_steps_per_second": 1.229, "step": 89523 }, { "epoch": 4.462275814181923, "grad_norm": 0.2112714648246765, "learning_rate": 4.808092737076287e-05, "loss": 1.3999, "step": 89530 }, { "epoch": 4.4628177553474355, "grad_norm": 0.22988109290599823, "learning_rate": 4.8065277948477996e-05, "loss": 1.4017, "step": 89540 }, { "epoch": 4.463359696512947, "grad_norm": 0.2842319905757904, "learning_rate": 4.804962938523636e-05, "loss": 1.3933, "step": 89550 }, { "epoch": 4.463901637678459, "grad_norm": 0.16226890683174133, "learning_rate": 4.803398168297645e-05, "loss": 1.4046, "step": 89560 }, { "epoch": 4.464443578843972, "grad_norm": 0.18163561820983887, "learning_rate": 4.801833484363667e-05, "loss": 1.3999, "step": 89570 }, { "epoch": 4.464985520009484, "grad_norm": 0.1625688374042511, "learning_rate": 4.8002688869155246e-05, "loss": 1.3945, "step": 89580 }, { "epoch": 4.465527461174997, "grad_norm": 0.15731951594352722, "learning_rate": 4.798704376147034e-05, "loss": 1.3903, "step": 89590 }, { "epoch": 4.4660694023405085, "grad_norm": 0.2153928130865097, "learning_rate": 4.797139952252e-05, "loss": 1.4023, "step": 89600 }, { "epoch": 4.46661134350602, "grad_norm": 0.2910442650318146, "learning_rate": 4.795575615424219e-05, "loss": 1.3847, "step": 89610 }, { "epoch": 4.46661134350602, "eval_loss": 2.4339542388916016, "eval_runtime": 21.954, "eval_samples_per_second": 227.749, "eval_steps_per_second": 1.23, "step": 89610 }, { "epoch": 4.467153284671533, "grad_norm": 0.17365294694900513, "learning_rate": 4.794011365857472e-05, "loss": 1.3971, "step": 89620 }, { "epoch": 4.467695225837045, "grad_norm": 0.3318266272544861, "learning_rate": 4.7924472037455304e-05, "loss": 1.3983, "step": 89630 }, { "epoch": 4.468237167002557, "grad_norm": 0.26000094413757324, "learning_rate": 4.790883129282161e-05, "loss": 1.3969, "step": 89640 }, { "epoch": 4.46877910816807, "grad_norm": 0.1824985295534134, "learning_rate": 4.789319142661107e-05, "loss": 1.3838, "step": 89650 }, { "epoch": 4.4693210493335815, "grad_norm": 0.1758066862821579, "learning_rate": 4.787755244076112e-05, "loss": 1.3934, "step": 89660 }, { "epoch": 4.469862990499094, "grad_norm": 0.17066720128059387, "learning_rate": 4.786191433720909e-05, "loss": 1.4048, "step": 89670 }, { "epoch": 4.470404931664606, "grad_norm": 0.1667661815881729, "learning_rate": 4.784627711789209e-05, "loss": 1.3922, "step": 89680 }, { "epoch": 4.470946872830118, "grad_norm": 0.1883508414030075, "learning_rate": 4.7830640784747225e-05, "loss": 1.3911, "step": 89690 }, { "epoch": 4.4713262316459765, "eval_loss": 2.428875207901001, "eval_runtime": 21.9652, "eval_samples_per_second": 227.633, "eval_steps_per_second": 1.229, "step": 89697 }, { "epoch": 4.471488813995631, "grad_norm": 0.21949131786823273, "learning_rate": 4.781500533971147e-05, "loss": 1.3919, "step": 89700 }, { "epoch": 4.472030755161143, "grad_norm": 0.19700855016708374, "learning_rate": 4.779937078472164e-05, "loss": 1.4053, "step": 89710 }, { "epoch": 4.472572696326655, "grad_norm": 0.17879433929920197, "learning_rate": 4.7783737121714504e-05, "loss": 1.3874, "step": 89720 }, { "epoch": 4.473114637492167, "grad_norm": 0.16354680061340332, "learning_rate": 4.77681043526267e-05, "loss": 1.4007, "step": 89730 }, { "epoch": 4.473656578657679, "grad_norm": 0.23582887649536133, "learning_rate": 4.7752472479394706e-05, "loss": 1.4049, "step": 89740 }, { "epoch": 4.474198519823192, "grad_norm": 0.1751241385936737, "learning_rate": 4.7736841503954956e-05, "loss": 1.3926, "step": 89750 }, { "epoch": 4.474740460988704, "grad_norm": 0.2157001942396164, "learning_rate": 4.772121142824374e-05, "loss": 1.3967, "step": 89760 }, { "epoch": 4.475282402154216, "grad_norm": 0.2224288284778595, "learning_rate": 4.770558225419728e-05, "loss": 1.4, "step": 89770 }, { "epoch": 4.4758243433197284, "grad_norm": 0.1734488606452942, "learning_rate": 4.7689953983751604e-05, "loss": 1.3875, "step": 89780 }, { "epoch": 4.4760411197859336, "eval_loss": 2.4124958515167236, "eval_runtime": 21.9708, "eval_samples_per_second": 227.574, "eval_steps_per_second": 1.229, "step": 89784 }, { "epoch": 4.47636628448524, "grad_norm": 0.17843768000602722, "learning_rate": 4.767432661884269e-05, "loss": 1.3945, "step": 89790 }, { "epoch": 4.476908225650753, "grad_norm": 0.165896475315094, "learning_rate": 4.76587001614064e-05, "loss": 1.4107, "step": 89800 }, { "epoch": 4.477450166816265, "grad_norm": 0.19189293682575226, "learning_rate": 4.7643074613378455e-05, "loss": 1.3998, "step": 89810 }, { "epoch": 4.477992107981777, "grad_norm": 0.26775965094566345, "learning_rate": 4.7627449976694486e-05, "loss": 1.3911, "step": 89820 }, { "epoch": 4.47853404914729, "grad_norm": 0.2671341896057129, "learning_rate": 4.761182625329001e-05, "loss": 1.4019, "step": 89830 }, { "epoch": 4.4790759903128015, "grad_norm": 0.16249962151050568, "learning_rate": 4.7596203445100416e-05, "loss": 1.3971, "step": 89840 }, { "epoch": 4.479617931478314, "grad_norm": 0.273316353559494, "learning_rate": 4.7580581554060986e-05, "loss": 1.39, "step": 89850 }, { "epoch": 4.480159872643826, "grad_norm": 0.19733589887619019, "learning_rate": 4.7564960582106933e-05, "loss": 1.4028, "step": 89860 }, { "epoch": 4.480701813809338, "grad_norm": 0.17823106050491333, "learning_rate": 4.754934053117325e-05, "loss": 1.4006, "step": 89870 }, { "epoch": 4.48075600792589, "eval_loss": 2.4114816188812256, "eval_runtime": 21.9671, "eval_samples_per_second": 227.613, "eval_steps_per_second": 1.229, "step": 89871 }, { "epoch": 4.481243754974851, "grad_norm": 0.18662744760513306, "learning_rate": 4.753372140319492e-05, "loss": 1.3969, "step": 89880 }, { "epoch": 4.481785696140363, "grad_norm": 0.23681077361106873, "learning_rate": 4.751810320010678e-05, "loss": 1.4017, "step": 89890 }, { "epoch": 4.482327637305875, "grad_norm": 0.3316483795642853, "learning_rate": 4.750248592384352e-05, "loss": 1.3945, "step": 89900 }, { "epoch": 4.482869578471387, "grad_norm": 0.261080801486969, "learning_rate": 4.748686957633975e-05, "loss": 1.3913, "step": 89910 }, { "epoch": 4.483411519636899, "grad_norm": 0.1946885585784912, "learning_rate": 4.747125415952998e-05, "loss": 1.3939, "step": 89920 }, { "epoch": 4.483953460802412, "grad_norm": 0.2138025015592575, "learning_rate": 4.745563967534855e-05, "loss": 1.3982, "step": 89930 }, { "epoch": 4.484495401967924, "grad_norm": 0.2956826984882355, "learning_rate": 4.744002612572972e-05, "loss": 1.3999, "step": 89940 }, { "epoch": 4.4850373431334365, "grad_norm": 0.30752211809158325, "learning_rate": 4.742441351260761e-05, "loss": 1.3892, "step": 89950 }, { "epoch": 4.485470896065846, "eval_loss": 2.4178147315979004, "eval_runtime": 21.9672, "eval_samples_per_second": 227.613, "eval_steps_per_second": 1.229, "step": 89958 }, { "epoch": 4.485579284298948, "grad_norm": 0.2102116495370865, "learning_rate": 4.7408801837916306e-05, "loss": 1.4062, "step": 89960 }, { "epoch": 4.48612122546446, "grad_norm": 0.3934997320175171, "learning_rate": 4.7393191103589643e-05, "loss": 1.3999, "step": 89970 }, { "epoch": 4.486663166629973, "grad_norm": 0.2734357416629791, "learning_rate": 4.737758131156144e-05, "loss": 1.3997, "step": 89980 }, { "epoch": 4.487205107795485, "grad_norm": 0.1576622873544693, "learning_rate": 4.736197246376538e-05, "loss": 1.3969, "step": 89990 }, { "epoch": 4.487747048960998, "grad_norm": 0.18201720714569092, "learning_rate": 4.7346364562135e-05, "loss": 1.3923, "step": 90000 } ], "logging_steps": 10, "max_steps": 92260, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.8386602891165565e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }