diff --git "a/checkpoint-90000/trainer_state.json" "b/checkpoint-90000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-90000/trainer_state.json" @@ -0,0 +1,71305 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.487747048960998, + "eval_steps": 87, + "global_step": 90000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000541941165512219, + "grad_norm": 18.66890525817871, + "learning_rate": 3.3333333333333335e-07, + "loss": 7.3063, + "step": 10 + }, + { + "epoch": 0.001083882331024438, + "grad_norm": 14.986076354980469, + "learning_rate": 6.666666666666667e-07, + "loss": 6.602, + "step": 20 + }, + { + "epoch": 0.0016258234965366573, + "grad_norm": 7.8340582847595215, + "learning_rate": 1.0000000000000002e-06, + "loss": 5.4589, + "step": 30 + }, + { + "epoch": 0.002167764662048876, + "grad_norm": 3.6684656143188477, + "learning_rate": 1.3333333333333334e-06, + "loss": 4.4819, + "step": 40 + }, + { + "epoch": 0.0027097058275610954, + "grad_norm": 1.7751961946487427, + "learning_rate": 1.6666666666666667e-06, + "loss": 3.8516, + "step": 50 + }, + { + "epoch": 0.0032516469930733145, + "grad_norm": 0.8667094707489014, + "learning_rate": 2.0000000000000003e-06, + "loss": 3.471, + "step": 60 + }, + { + "epoch": 0.0037935881585855337, + "grad_norm": 0.48009157180786133, + "learning_rate": 2.3333333333333336e-06, + "loss": 3.2603, + "step": 70 + }, + { + "epoch": 0.004335529324097752, + "grad_norm": 0.31772586703300476, + "learning_rate": 2.666666666666667e-06, + "loss": 3.1565, + "step": 80 + }, + { + "epoch": 0.004714888139956306, + "eval_loss": 3.0095248222351074, + "eval_runtime": 22.3757, + "eval_samples_per_second": 223.456, + "eval_steps_per_second": 1.207, + "step": 87 + }, + { + "epoch": 0.0048774704896099716, + "grad_norm": 0.22845230996608734, + "learning_rate": 3e-06, + "loss": 3.0993, + "step": 90 + }, + { + "epoch": 0.005419411655122191, + "grad_norm": 0.1711948961019516, + "learning_rate": 3.3333333333333333e-06, + "loss": 3.067, + "step": 100 + }, + { + "epoch": 0.00596135282063441, + "grad_norm": 0.24801534414291382, + "learning_rate": 3.666666666666667e-06, + "loss": 3.0429, + "step": 110 + }, + { + "epoch": 0.006503293986146629, + "grad_norm": 0.1762855052947998, + "learning_rate": 4.000000000000001e-06, + "loss": 3.0244, + "step": 120 + }, + { + "epoch": 0.007045235151658848, + "grad_norm": 0.20275895297527313, + "learning_rate": 4.333333333333334e-06, + "loss": 3.0101, + "step": 130 + }, + { + "epoch": 0.007587176317171067, + "grad_norm": 0.20807678997516632, + "learning_rate": 4.666666666666667e-06, + "loss": 2.9962, + "step": 140 + }, + { + "epoch": 0.008129117482683286, + "grad_norm": 0.21902145445346832, + "learning_rate": 5e-06, + "loss": 2.9879, + "step": 150 + }, + { + "epoch": 0.008671058648195505, + "grad_norm": 0.2640894055366516, + "learning_rate": 5.333333333333334e-06, + "loss": 2.9807, + "step": 160 + }, + { + "epoch": 0.009212999813707724, + "grad_norm": 0.2247193157672882, + "learning_rate": 5.666666666666667e-06, + "loss": 2.9764, + "step": 170 + }, + { + "epoch": 0.009429776279912611, + "eval_loss": 2.909708023071289, + "eval_runtime": 21.9944, + "eval_samples_per_second": 227.331, + "eval_steps_per_second": 1.228, + "step": 174 + }, + { + "epoch": 0.009754940979219943, + "grad_norm": 0.1678290069103241, + "learning_rate": 6e-06, + "loss": 2.9725, + "step": 180 + }, + { + "epoch": 0.010296882144732162, + "grad_norm": 0.16639265418052673, + "learning_rate": 6.333333333333334e-06, + "loss": 2.9687, + "step": 190 + }, + { + "epoch": 0.010838823310244381, + "grad_norm": 0.16535356640815735, + "learning_rate": 6.666666666666667e-06, + "loss": 2.968, + "step": 200 + }, + { + "epoch": 0.0113807644757566, + "grad_norm": 0.12626643478870392, + "learning_rate": 7.000000000000001e-06, + "loss": 2.9656, + "step": 210 + }, + { + "epoch": 0.01192270564126882, + "grad_norm": 0.16154739260673523, + "learning_rate": 7.333333333333334e-06, + "loss": 2.9658, + "step": 220 + }, + { + "epoch": 0.012464646806781039, + "grad_norm": 0.10952357947826385, + "learning_rate": 7.666666666666667e-06, + "loss": 2.9626, + "step": 230 + }, + { + "epoch": 0.013006587972293258, + "grad_norm": 0.13569523394107819, + "learning_rate": 8.000000000000001e-06, + "loss": 2.9611, + "step": 240 + }, + { + "epoch": 0.013548529137805477, + "grad_norm": 0.18444740772247314, + "learning_rate": 8.333333333333334e-06, + "loss": 2.9599, + "step": 250 + }, + { + "epoch": 0.014090470303317696, + "grad_norm": 0.1738782823085785, + "learning_rate": 8.666666666666668e-06, + "loss": 2.9588, + "step": 260 + }, + { + "epoch": 0.014144664419868919, + "eval_loss": 2.9037859439849854, + "eval_runtime": 21.9954, + "eval_samples_per_second": 227.321, + "eval_steps_per_second": 1.228, + "step": 261 + }, + { + "epoch": 0.014632411468829916, + "grad_norm": 0.1825179159641266, + "learning_rate": 9e-06, + "loss": 2.9562, + "step": 270 + }, + { + "epoch": 0.015174352634342135, + "grad_norm": 0.1394873708486557, + "learning_rate": 9.333333333333334e-06, + "loss": 2.9553, + "step": 280 + }, + { + "epoch": 0.015716293799854352, + "grad_norm": 0.11560714244842529, + "learning_rate": 9.666666666666667e-06, + "loss": 2.9563, + "step": 290 + }, + { + "epoch": 0.01625823496536657, + "grad_norm": 0.16794171929359436, + "learning_rate": 1e-05, + "loss": 2.9527, + "step": 300 + }, + { + "epoch": 0.01680017613087879, + "grad_norm": 0.11284048855304718, + "learning_rate": 1.0333333333333333e-05, + "loss": 2.9529, + "step": 310 + }, + { + "epoch": 0.01734211729639101, + "grad_norm": 0.12263070046901703, + "learning_rate": 1.0666666666666667e-05, + "loss": 2.9521, + "step": 320 + }, + { + "epoch": 0.01788405846190323, + "grad_norm": 0.22584529221057892, + "learning_rate": 1.1000000000000001e-05, + "loss": 2.9527, + "step": 330 + }, + { + "epoch": 0.018425999627415448, + "grad_norm": 0.15804457664489746, + "learning_rate": 1.1333333333333334e-05, + "loss": 2.9509, + "step": 340 + }, + { + "epoch": 0.018859552559825223, + "eval_loss": 2.8973894119262695, + "eval_runtime": 21.9951, + "eval_samples_per_second": 227.324, + "eval_steps_per_second": 1.228, + "step": 348 + }, + { + "epoch": 0.018967940792927667, + "grad_norm": 0.20428232848644257, + "learning_rate": 1.1666666666666668e-05, + "loss": 2.9489, + "step": 350 + }, + { + "epoch": 0.019509881958439886, + "grad_norm": 0.18693846464157104, + "learning_rate": 1.2e-05, + "loss": 2.95, + "step": 360 + }, + { + "epoch": 0.020051823123952105, + "grad_norm": 0.113795205950737, + "learning_rate": 1.2333333333333334e-05, + "loss": 2.9469, + "step": 370 + }, + { + "epoch": 0.020593764289464325, + "grad_norm": 0.18272531032562256, + "learning_rate": 1.2666666666666668e-05, + "loss": 2.9474, + "step": 380 + }, + { + "epoch": 0.021135705454976544, + "grad_norm": 0.14928747713565826, + "learning_rate": 1.3000000000000001e-05, + "loss": 2.9478, + "step": 390 + }, + { + "epoch": 0.021677646620488763, + "grad_norm": 0.28558146953582764, + "learning_rate": 1.3333333333333333e-05, + "loss": 2.9453, + "step": 400 + }, + { + "epoch": 0.022219587786000982, + "grad_norm": 0.2676473557949066, + "learning_rate": 1.3666666666666666e-05, + "loss": 2.9447, + "step": 410 + }, + { + "epoch": 0.0227615289515132, + "grad_norm": 0.18305936455726624, + "learning_rate": 1.4000000000000001e-05, + "loss": 2.9433, + "step": 420 + }, + { + "epoch": 0.02330347011702542, + "grad_norm": 0.21853986382484436, + "learning_rate": 1.4333333333333334e-05, + "loss": 2.9438, + "step": 430 + }, + { + "epoch": 0.02357444069978153, + "eval_loss": 2.8930819034576416, + "eval_runtime": 21.9945, + "eval_samples_per_second": 227.33, + "eval_steps_per_second": 1.228, + "step": 435 + }, + { + "epoch": 0.02384541128253764, + "grad_norm": 0.218394935131073, + "learning_rate": 1.4666666666666668e-05, + "loss": 2.942, + "step": 440 + }, + { + "epoch": 0.02438735244804986, + "grad_norm": 0.15290504693984985, + "learning_rate": 1.5e-05, + "loss": 2.9412, + "step": 450 + }, + { + "epoch": 0.024929293613562078, + "grad_norm": 0.2329382598400116, + "learning_rate": 1.5333333333333334e-05, + "loss": 2.9393, + "step": 460 + }, + { + "epoch": 0.025471234779074297, + "grad_norm": 0.2563980519771576, + "learning_rate": 1.5666666666666667e-05, + "loss": 2.9404, + "step": 470 + }, + { + "epoch": 0.026013175944586516, + "grad_norm": 0.18090900778770447, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.9384, + "step": 480 + }, + { + "epoch": 0.026555117110098735, + "grad_norm": 0.10313646495342255, + "learning_rate": 1.6333333333333335e-05, + "loss": 2.9383, + "step": 490 + }, + { + "epoch": 0.027097058275610954, + "grad_norm": 0.127348393201828, + "learning_rate": 1.6666666666666667e-05, + "loss": 2.9374, + "step": 500 + }, + { + "epoch": 0.027638999441123174, + "grad_norm": 0.39012327790260315, + "learning_rate": 1.7000000000000003e-05, + "loss": 2.9369, + "step": 510 + }, + { + "epoch": 0.028180940606635393, + "grad_norm": 0.26286575198173523, + "learning_rate": 1.7333333333333336e-05, + "loss": 2.9365, + "step": 520 + }, + { + "epoch": 0.028289328839737837, + "eval_loss": 2.885981798171997, + "eval_runtime": 35.0266, + "eval_samples_per_second": 142.749, + "eval_steps_per_second": 0.771, + "step": 522 + }, + { + "epoch": 0.028722881772147612, + "grad_norm": 0.34882956743240356, + "learning_rate": 1.7666666666666668e-05, + "loss": 2.9374, + "step": 530 + }, + { + "epoch": 0.02926482293765983, + "grad_norm": 0.2220120131969452, + "learning_rate": 1.8e-05, + "loss": 2.934, + "step": 540 + }, + { + "epoch": 0.02980676410317205, + "grad_norm": 0.11112108826637268, + "learning_rate": 1.8333333333333333e-05, + "loss": 2.9332, + "step": 550 + }, + { + "epoch": 0.03034870526868427, + "grad_norm": 0.1723136603832245, + "learning_rate": 1.866666666666667e-05, + "loss": 2.9332, + "step": 560 + }, + { + "epoch": 0.03089064643419649, + "grad_norm": 0.1271258443593979, + "learning_rate": 1.9e-05, + "loss": 2.9326, + "step": 570 + }, + { + "epoch": 0.031432587599708704, + "grad_norm": 0.2465275079011917, + "learning_rate": 1.9333333333333333e-05, + "loss": 2.9304, + "step": 580 + }, + { + "epoch": 0.03197452876522092, + "grad_norm": 0.24456460773944855, + "learning_rate": 1.9666666666666666e-05, + "loss": 2.9304, + "step": 590 + }, + { + "epoch": 0.03251646993073314, + "grad_norm": 0.18503615260124207, + "learning_rate": 2e-05, + "loss": 2.9272, + "step": 600 + }, + { + "epoch": 0.03300421697969414, + "eval_loss": 2.8818912506103516, + "eval_runtime": 21.9944, + "eval_samples_per_second": 227.331, + "eval_steps_per_second": 1.228, + "step": 609 + }, + { + "epoch": 0.03305841109624536, + "grad_norm": 0.18611077964305878, + "learning_rate": 2.0333333333333334e-05, + "loss": 2.9287, + "step": 610 + }, + { + "epoch": 0.03360035226175758, + "grad_norm": 0.2303771823644638, + "learning_rate": 2.0666666666666666e-05, + "loss": 2.9272, + "step": 620 + }, + { + "epoch": 0.0341422934272698, + "grad_norm": 0.12826542556285858, + "learning_rate": 2.1e-05, + "loss": 2.926, + "step": 630 + }, + { + "epoch": 0.03468423459278202, + "grad_norm": 0.18119564652442932, + "learning_rate": 2.1333333333333335e-05, + "loss": 2.9259, + "step": 640 + }, + { + "epoch": 0.03522617575829424, + "grad_norm": 0.17282642424106598, + "learning_rate": 2.1666666666666667e-05, + "loss": 2.9264, + "step": 650 + }, + { + "epoch": 0.03576811692380646, + "grad_norm": 0.18431398272514343, + "learning_rate": 2.2000000000000003e-05, + "loss": 2.9243, + "step": 660 + }, + { + "epoch": 0.03631005808931868, + "grad_norm": 0.2258967012166977, + "learning_rate": 2.2333333333333335e-05, + "loss": 2.9244, + "step": 670 + }, + { + "epoch": 0.036851999254830896, + "grad_norm": 0.14141803979873657, + "learning_rate": 2.2666666666666668e-05, + "loss": 2.9226, + "step": 680 + }, + { + "epoch": 0.037393940420343115, + "grad_norm": 0.3774568736553192, + "learning_rate": 2.3000000000000003e-05, + "loss": 2.9226, + "step": 690 + }, + { + "epoch": 0.037719105119650445, + "eval_loss": 2.8792285919189453, + "eval_runtime": 21.9939, + "eval_samples_per_second": 227.336, + "eval_steps_per_second": 1.228, + "step": 696 + }, + { + "epoch": 0.037935881585855334, + "grad_norm": 0.2437097430229187, + "learning_rate": 2.3333333333333336e-05, + "loss": 2.9214, + "step": 700 + }, + { + "epoch": 0.03847782275136755, + "grad_norm": 0.18445032835006714, + "learning_rate": 2.3666666666666668e-05, + "loss": 2.9201, + "step": 710 + }, + { + "epoch": 0.03901976391687977, + "grad_norm": 0.12405350804328918, + "learning_rate": 2.4e-05, + "loss": 2.919, + "step": 720 + }, + { + "epoch": 0.03956170508239199, + "grad_norm": 0.14779530465602875, + "learning_rate": 2.4333333333333336e-05, + "loss": 2.919, + "step": 730 + }, + { + "epoch": 0.04010364624790421, + "grad_norm": 0.16767142713069916, + "learning_rate": 2.466666666666667e-05, + "loss": 2.9153, + "step": 740 + }, + { + "epoch": 0.04064558741341643, + "grad_norm": 0.22561559081077576, + "learning_rate": 2.5e-05, + "loss": 2.916, + "step": 750 + }, + { + "epoch": 0.04118752857892865, + "grad_norm": 0.1435348242521286, + "learning_rate": 2.5333333333333337e-05, + "loss": 2.9134, + "step": 760 + }, + { + "epoch": 0.04172946974444087, + "grad_norm": 0.25675374269485474, + "learning_rate": 2.5666666666666666e-05, + "loss": 2.9133, + "step": 770 + }, + { + "epoch": 0.04227141090995309, + "grad_norm": 0.22575390338897705, + "learning_rate": 2.6000000000000002e-05, + "loss": 2.9124, + "step": 780 + }, + { + "epoch": 0.042433993259606756, + "eval_loss": 2.874786853790283, + "eval_runtime": 21.9951, + "eval_samples_per_second": 227.324, + "eval_steps_per_second": 1.228, + "step": 783 + }, + { + "epoch": 0.04281335207546531, + "grad_norm": 0.17684337496757507, + "learning_rate": 2.633333333333333e-05, + "loss": 2.9118, + "step": 790 + }, + { + "epoch": 0.043355293240977526, + "grad_norm": 0.27737951278686523, + "learning_rate": 2.6666666666666667e-05, + "loss": 2.9098, + "step": 800 + }, + { + "epoch": 0.043897234406489745, + "grad_norm": 0.28385818004608154, + "learning_rate": 2.7000000000000002e-05, + "loss": 2.9101, + "step": 810 + }, + { + "epoch": 0.044439175572001964, + "grad_norm": 0.21399657428264618, + "learning_rate": 2.733333333333333e-05, + "loss": 2.909, + "step": 820 + }, + { + "epoch": 0.04498111673751418, + "grad_norm": 0.20526191592216492, + "learning_rate": 2.7666666666666667e-05, + "loss": 2.906, + "step": 830 + }, + { + "epoch": 0.0455230579030264, + "grad_norm": 0.3085150420665741, + "learning_rate": 2.8000000000000003e-05, + "loss": 2.9058, + "step": 840 + }, + { + "epoch": 0.04606499906853862, + "grad_norm": 0.18636156618595123, + "learning_rate": 2.8333333333333335e-05, + "loss": 2.9055, + "step": 850 + }, + { + "epoch": 0.04660694023405084, + "grad_norm": 0.23989151418209076, + "learning_rate": 2.8666666666666668e-05, + "loss": 2.9046, + "step": 860 + }, + { + "epoch": 0.04714888139956306, + "grad_norm": 0.23799440264701843, + "learning_rate": 2.9e-05, + "loss": 2.9056, + "step": 870 + }, + { + "epoch": 0.04714888139956306, + "eval_loss": 2.871436595916748, + "eval_runtime": 21.8935, + "eval_samples_per_second": 228.379, + "eval_steps_per_second": 1.233, + "step": 870 + }, + { + "epoch": 0.04769082256507528, + "grad_norm": 0.1619342863559723, + "learning_rate": 2.9333333333333336e-05, + "loss": 2.9041, + "step": 880 + }, + { + "epoch": 0.0482327637305875, + "grad_norm": 0.28320956230163574, + "learning_rate": 2.9666666666666672e-05, + "loss": 2.9025, + "step": 890 + }, + { + "epoch": 0.04877470489609972, + "grad_norm": 0.18892525136470795, + "learning_rate": 3e-05, + "loss": 2.8997, + "step": 900 + }, + { + "epoch": 0.049316646061611936, + "grad_norm": 0.29564598202705383, + "learning_rate": 3.0333333333333337e-05, + "loss": 2.9001, + "step": 910 + }, + { + "epoch": 0.049858587227124156, + "grad_norm": 0.24769143760204315, + "learning_rate": 3.066666666666667e-05, + "loss": 2.8987, + "step": 920 + }, + { + "epoch": 0.050400528392636375, + "grad_norm": 0.21904461085796356, + "learning_rate": 3.1e-05, + "loss": 2.8968, + "step": 930 + }, + { + "epoch": 0.050942469558148594, + "grad_norm": 0.29892289638519287, + "learning_rate": 3.1333333333333334e-05, + "loss": 2.8972, + "step": 940 + }, + { + "epoch": 0.05148441072366081, + "grad_norm": 0.20083844661712646, + "learning_rate": 3.1666666666666666e-05, + "loss": 2.8951, + "step": 950 + }, + { + "epoch": 0.051863769539519364, + "eval_loss": 2.865743398666382, + "eval_runtime": 21.7153, + "eval_samples_per_second": 230.253, + "eval_steps_per_second": 1.243, + "step": 957 + }, + { + "epoch": 0.05202635188917303, + "grad_norm": 0.26588037610054016, + "learning_rate": 3.2000000000000005e-05, + "loss": 2.8932, + "step": 960 + }, + { + "epoch": 0.05256829305468525, + "grad_norm": 0.23635846376419067, + "learning_rate": 3.233333333333333e-05, + "loss": 2.8911, + "step": 970 + }, + { + "epoch": 0.05311023422019747, + "grad_norm": 0.34888023138046265, + "learning_rate": 3.266666666666667e-05, + "loss": 2.891, + "step": 980 + }, + { + "epoch": 0.05365217538570969, + "grad_norm": 0.34451714158058167, + "learning_rate": 3.3e-05, + "loss": 2.8907, + "step": 990 + }, + { + "epoch": 0.05419411655122191, + "grad_norm": 0.2428581416606903, + "learning_rate": 3.3333333333333335e-05, + "loss": 2.8909, + "step": 1000 + }, + { + "epoch": 0.05473605771673413, + "grad_norm": 0.17025752365589142, + "learning_rate": 3.366666666666667e-05, + "loss": 2.8891, + "step": 1010 + }, + { + "epoch": 0.05527799888224635, + "grad_norm": 0.23046477138996124, + "learning_rate": 3.4000000000000007e-05, + "loss": 2.8887, + "step": 1020 + }, + { + "epoch": 0.055819940047758566, + "grad_norm": 0.28877273201942444, + "learning_rate": 3.433333333333333e-05, + "loss": 2.8855, + "step": 1030 + }, + { + "epoch": 0.056361881213270786, + "grad_norm": 0.16221983730793, + "learning_rate": 3.466666666666667e-05, + "loss": 2.8857, + "step": 1040 + }, + { + "epoch": 0.056578657679475675, + "eval_loss": 2.8612656593322754, + "eval_runtime": 22.0002, + "eval_samples_per_second": 227.27, + "eval_steps_per_second": 1.227, + "step": 1044 + }, + { + "epoch": 0.056903822378783005, + "grad_norm": 0.10896836966276169, + "learning_rate": 3.5e-05, + "loss": 2.8861, + "step": 1050 + }, + { + "epoch": 0.057445763544295224, + "grad_norm": 0.2890329360961914, + "learning_rate": 3.5333333333333336e-05, + "loss": 2.8826, + "step": 1060 + }, + { + "epoch": 0.05798770470980744, + "grad_norm": 0.4240993857383728, + "learning_rate": 3.566666666666667e-05, + "loss": 2.885, + "step": 1070 + }, + { + "epoch": 0.05852964587531966, + "grad_norm": 0.33681583404541016, + "learning_rate": 3.6e-05, + "loss": 2.8837, + "step": 1080 + }, + { + "epoch": 0.05907158704083188, + "grad_norm": 0.23618997633457184, + "learning_rate": 3.633333333333333e-05, + "loss": 2.8846, + "step": 1090 + }, + { + "epoch": 0.0596135282063441, + "grad_norm": 0.1710646152496338, + "learning_rate": 3.6666666666666666e-05, + "loss": 2.881, + "step": 1100 + }, + { + "epoch": 0.06015546937185632, + "grad_norm": 0.2047516405582428, + "learning_rate": 3.7e-05, + "loss": 2.8773, + "step": 1110 + }, + { + "epoch": 0.06069741053736854, + "grad_norm": 0.20800012350082397, + "learning_rate": 3.733333333333334e-05, + "loss": 2.8787, + "step": 1120 + }, + { + "epoch": 0.06123935170288076, + "grad_norm": 0.2583804428577423, + "learning_rate": 3.766666666666667e-05, + "loss": 2.876, + "step": 1130 + }, + { + "epoch": 0.06129354581943198, + "eval_loss": 2.860222578048706, + "eval_runtime": 26.0987, + "eval_samples_per_second": 191.581, + "eval_steps_per_second": 1.035, + "step": 1131 + }, + { + "epoch": 0.06178129286839298, + "grad_norm": 0.1588805764913559, + "learning_rate": 3.8e-05, + "loss": 2.8741, + "step": 1140 + }, + { + "epoch": 0.062323234033905196, + "grad_norm": 0.22128504514694214, + "learning_rate": 3.8333333333333334e-05, + "loss": 2.8781, + "step": 1150 + }, + { + "epoch": 0.06286517519941741, + "grad_norm": 0.2544479966163635, + "learning_rate": 3.866666666666667e-05, + "loss": 2.8762, + "step": 1160 + }, + { + "epoch": 0.06340711636492963, + "grad_norm": 0.3215446472167969, + "learning_rate": 3.9000000000000006e-05, + "loss": 2.8758, + "step": 1170 + }, + { + "epoch": 0.06394905753044185, + "grad_norm": 0.20564351975917816, + "learning_rate": 3.933333333333333e-05, + "loss": 2.8725, + "step": 1180 + }, + { + "epoch": 0.06449099869595407, + "grad_norm": 0.24028970301151276, + "learning_rate": 3.966666666666667e-05, + "loss": 2.8728, + "step": 1190 + }, + { + "epoch": 0.06503293986146629, + "grad_norm": 0.21962830424308777, + "learning_rate": 4e-05, + "loss": 2.8702, + "step": 1200 + }, + { + "epoch": 0.0655748810269785, + "grad_norm": 0.281931072473526, + "learning_rate": 4.0333333333333336e-05, + "loss": 2.869, + "step": 1210 + }, + { + "epoch": 0.06600843395938828, + "eval_loss": 2.8557112216949463, + "eval_runtime": 21.9913, + "eval_samples_per_second": 227.363, + "eval_steps_per_second": 1.228, + "step": 1218 + }, + { + "epoch": 0.06611682219249072, + "grad_norm": 0.16926707327365875, + "learning_rate": 4.066666666666667e-05, + "loss": 2.8687, + "step": 1220 + }, + { + "epoch": 0.06665876335800294, + "grad_norm": 0.3461899757385254, + "learning_rate": 4.1e-05, + "loss": 2.8666, + "step": 1230 + }, + { + "epoch": 0.06720070452351516, + "grad_norm": 0.324008047580719, + "learning_rate": 4.133333333333333e-05, + "loss": 2.8691, + "step": 1240 + }, + { + "epoch": 0.06774264568902738, + "grad_norm": 0.2589833438396454, + "learning_rate": 4.166666666666667e-05, + "loss": 2.8676, + "step": 1250 + }, + { + "epoch": 0.0682845868545396, + "grad_norm": 0.21630503237247467, + "learning_rate": 4.2e-05, + "loss": 2.8649, + "step": 1260 + }, + { + "epoch": 0.06882652802005182, + "grad_norm": 0.19127613306045532, + "learning_rate": 4.233333333333334e-05, + "loss": 2.8643, + "step": 1270 + }, + { + "epoch": 0.06936846918556404, + "grad_norm": 0.2493632286787033, + "learning_rate": 4.266666666666667e-05, + "loss": 2.8633, + "step": 1280 + }, + { + "epoch": 0.06991041035107626, + "grad_norm": 0.26412254571914673, + "learning_rate": 4.3e-05, + "loss": 2.8644, + "step": 1290 + }, + { + "epoch": 0.07045235151658848, + "grad_norm": 0.27580294013023376, + "learning_rate": 4.3333333333333334e-05, + "loss": 2.8618, + "step": 1300 + }, + { + "epoch": 0.0707233220993446, + "eval_loss": 2.8502135276794434, + "eval_runtime": 21.9938, + "eval_samples_per_second": 227.337, + "eval_steps_per_second": 1.228, + "step": 1305 + }, + { + "epoch": 0.0709942926821007, + "grad_norm": 0.37026703357696533, + "learning_rate": 4.3666666666666666e-05, + "loss": 2.8585, + "step": 1310 + }, + { + "epoch": 0.07153623384761292, + "grad_norm": 0.3927730917930603, + "learning_rate": 4.4000000000000006e-05, + "loss": 2.8594, + "step": 1320 + }, + { + "epoch": 0.07207817501312513, + "grad_norm": 0.21110251545906067, + "learning_rate": 4.433333333333334e-05, + "loss": 2.858, + "step": 1330 + }, + { + "epoch": 0.07262011617863735, + "grad_norm": 0.29518288373947144, + "learning_rate": 4.466666666666667e-05, + "loss": 2.8537, + "step": 1340 + }, + { + "epoch": 0.07316205734414957, + "grad_norm": 0.27836179733276367, + "learning_rate": 4.5e-05, + "loss": 2.8552, + "step": 1350 + }, + { + "epoch": 0.07370399850966179, + "grad_norm": 0.39191293716430664, + "learning_rate": 4.5333333333333335e-05, + "loss": 2.8549, + "step": 1360 + }, + { + "epoch": 0.07424593967517401, + "grad_norm": 0.1891399621963501, + "learning_rate": 4.566666666666667e-05, + "loss": 2.8531, + "step": 1370 + }, + { + "epoch": 0.07478788084068623, + "grad_norm": 0.3418962061405182, + "learning_rate": 4.600000000000001e-05, + "loss": 2.8513, + "step": 1380 + }, + { + "epoch": 0.07532982200619845, + "grad_norm": 0.2758435308933258, + "learning_rate": 4.633333333333333e-05, + "loss": 2.8523, + "step": 1390 + }, + { + "epoch": 0.07543821023930089, + "eval_loss": 2.8491737842559814, + "eval_runtime": 21.9942, + "eval_samples_per_second": 227.333, + "eval_steps_per_second": 1.228, + "step": 1392 + }, + { + "epoch": 0.07587176317171067, + "grad_norm": 0.5267038941383362, + "learning_rate": 4.666666666666667e-05, + "loss": 2.8527, + "step": 1400 + }, + { + "epoch": 0.07641370433722289, + "grad_norm": 0.31036752462387085, + "learning_rate": 4.7e-05, + "loss": 2.8491, + "step": 1410 + }, + { + "epoch": 0.0769556455027351, + "grad_norm": 0.2229401171207428, + "learning_rate": 4.7333333333333336e-05, + "loss": 2.849, + "step": 1420 + }, + { + "epoch": 0.07749758666824733, + "grad_norm": 0.40294215083122253, + "learning_rate": 4.766666666666667e-05, + "loss": 2.847, + "step": 1430 + }, + { + "epoch": 0.07803952783375954, + "grad_norm": 0.25234729051589966, + "learning_rate": 4.8e-05, + "loss": 2.8497, + "step": 1440 + }, + { + "epoch": 0.07858146899927176, + "grad_norm": 0.2245229333639145, + "learning_rate": 4.8333333333333334e-05, + "loss": 2.8429, + "step": 1450 + }, + { + "epoch": 0.07912341016478398, + "grad_norm": 0.37367942929267883, + "learning_rate": 4.866666666666667e-05, + "loss": 2.8461, + "step": 1460 + }, + { + "epoch": 0.0796653513302962, + "grad_norm": 0.36259761452674866, + "learning_rate": 4.9e-05, + "loss": 2.8394, + "step": 1470 + }, + { + "epoch": 0.0801530983792572, + "eval_loss": 2.847864866256714, + "eval_runtime": 21.994, + "eval_samples_per_second": 227.334, + "eval_steps_per_second": 1.228, + "step": 1479 + }, + { + "epoch": 0.08020729249580842, + "grad_norm": 0.3754774034023285, + "learning_rate": 4.933333333333334e-05, + "loss": 2.8415, + "step": 1480 + }, + { + "epoch": 0.08074923366132064, + "grad_norm": 0.5675404071807861, + "learning_rate": 4.966666666666667e-05, + "loss": 2.8442, + "step": 1490 + }, + { + "epoch": 0.08129117482683286, + "grad_norm": 0.23455142974853516, + "learning_rate": 5e-05, + "loss": 2.8377, + "step": 1500 + }, + { + "epoch": 0.08183311599234508, + "grad_norm": 0.34516745805740356, + "learning_rate": 5.0333333333333335e-05, + "loss": 2.8378, + "step": 1510 + }, + { + "epoch": 0.0823750571578573, + "grad_norm": 0.26785463094711304, + "learning_rate": 5.0666666666666674e-05, + "loss": 2.8371, + "step": 1520 + }, + { + "epoch": 0.08291699832336952, + "grad_norm": 0.39150965213775635, + "learning_rate": 5.1000000000000006e-05, + "loss": 2.8366, + "step": 1530 + }, + { + "epoch": 0.08345893948888174, + "grad_norm": 0.322078138589859, + "learning_rate": 5.133333333333333e-05, + "loss": 2.8332, + "step": 1540 + }, + { + "epoch": 0.08400088065439396, + "grad_norm": 0.34766703844070435, + "learning_rate": 5.166666666666667e-05, + "loss": 2.8366, + "step": 1550 + }, + { + "epoch": 0.08454282181990617, + "grad_norm": 0.32657214999198914, + "learning_rate": 5.2000000000000004e-05, + "loss": 2.8305, + "step": 1560 + }, + { + "epoch": 0.08486798651921351, + "eval_loss": 2.8430960178375244, + "eval_runtime": 21.9941, + "eval_samples_per_second": 227.334, + "eval_steps_per_second": 1.228, + "step": 1566 + }, + { + "epoch": 0.0850847629854184, + "grad_norm": 0.5440207719802856, + "learning_rate": 5.2333333333333336e-05, + "loss": 2.8311, + "step": 1570 + }, + { + "epoch": 0.08562670415093061, + "grad_norm": 0.43800434470176697, + "learning_rate": 5.266666666666666e-05, + "loss": 2.834, + "step": 1580 + }, + { + "epoch": 0.08616864531644283, + "grad_norm": 0.28220343589782715, + "learning_rate": 5.300000000000001e-05, + "loss": 2.8264, + "step": 1590 + }, + { + "epoch": 0.08671058648195505, + "grad_norm": 0.5123216509819031, + "learning_rate": 5.333333333333333e-05, + "loss": 2.8261, + "step": 1600 + }, + { + "epoch": 0.08725252764746727, + "grad_norm": 0.3581772744655609, + "learning_rate": 5.3666666666666666e-05, + "loss": 2.8278, + "step": 1610 + }, + { + "epoch": 0.08779446881297949, + "grad_norm": 0.39921578764915466, + "learning_rate": 5.4000000000000005e-05, + "loss": 2.8234, + "step": 1620 + }, + { + "epoch": 0.08833640997849171, + "grad_norm": 0.36354538798332214, + "learning_rate": 5.433333333333334e-05, + "loss": 2.8229, + "step": 1630 + }, + { + "epoch": 0.08887835114400393, + "grad_norm": 0.30660825967788696, + "learning_rate": 5.466666666666666e-05, + "loss": 2.8225, + "step": 1640 + }, + { + "epoch": 0.08942029230951615, + "grad_norm": 0.30880966782569885, + "learning_rate": 5.500000000000001e-05, + "loss": 2.8206, + "step": 1650 + }, + { + "epoch": 0.08958287465916981, + "eval_loss": 2.8423779010772705, + "eval_runtime": 21.999, + "eval_samples_per_second": 227.283, + "eval_steps_per_second": 1.227, + "step": 1653 + }, + { + "epoch": 0.08996223347502837, + "grad_norm": 0.2573600113391876, + "learning_rate": 5.5333333333333334e-05, + "loss": 2.8159, + "step": 1660 + }, + { + "epoch": 0.09050417464054059, + "grad_norm": 0.30160731077194214, + "learning_rate": 5.566666666666667e-05, + "loss": 2.8184, + "step": 1670 + }, + { + "epoch": 0.0910461158060528, + "grad_norm": 0.2629443109035492, + "learning_rate": 5.6000000000000006e-05, + "loss": 2.8192, + "step": 1680 + }, + { + "epoch": 0.09158805697156502, + "grad_norm": 0.23509126901626587, + "learning_rate": 5.633333333333334e-05, + "loss": 2.8148, + "step": 1690 + }, + { + "epoch": 0.09212999813707724, + "grad_norm": 0.42312270402908325, + "learning_rate": 5.666666666666667e-05, + "loss": 2.813, + "step": 1700 + }, + { + "epoch": 0.09267193930258946, + "grad_norm": 0.46546509861946106, + "learning_rate": 5.6999999999999996e-05, + "loss": 2.8086, + "step": 1710 + }, + { + "epoch": 0.09321388046810168, + "grad_norm": 0.23622998595237732, + "learning_rate": 5.7333333333333336e-05, + "loss": 2.8111, + "step": 1720 + }, + { + "epoch": 0.0937558216336139, + "grad_norm": 0.607647180557251, + "learning_rate": 5.766666666666667e-05, + "loss": 2.8102, + "step": 1730 + }, + { + "epoch": 0.09429776279912612, + "grad_norm": 0.40518462657928467, + "learning_rate": 5.8e-05, + "loss": 2.8072, + "step": 1740 + }, + { + "epoch": 0.09429776279912612, + "eval_loss": 2.834738254547119, + "eval_runtime": 21.6631, + "eval_samples_per_second": 230.807, + "eval_steps_per_second": 1.246, + "step": 1740 + }, + { + "epoch": 0.09483970396463834, + "grad_norm": 0.2947821021080017, + "learning_rate": 5.833333333333334e-05, + "loss": 2.807, + "step": 1750 + }, + { + "epoch": 0.09538164513015056, + "grad_norm": 0.304720938205719, + "learning_rate": 5.866666666666667e-05, + "loss": 2.8086, + "step": 1760 + }, + { + "epoch": 0.09592358629566278, + "grad_norm": 0.37076836824417114, + "learning_rate": 5.9e-05, + "loss": 2.8035, + "step": 1770 + }, + { + "epoch": 0.096465527461175, + "grad_norm": 0.4996398389339447, + "learning_rate": 5.9333333333333343e-05, + "loss": 2.804, + "step": 1780 + }, + { + "epoch": 0.09700746862668722, + "grad_norm": 0.37258633971214294, + "learning_rate": 5.966666666666667e-05, + "loss": 2.8074, + "step": 1790 + }, + { + "epoch": 0.09754940979219943, + "grad_norm": 0.2905193567276001, + "learning_rate": 6e-05, + "loss": 2.8006, + "step": 1800 + }, + { + "epoch": 0.09809135095771165, + "grad_norm": 0.27609148621559143, + "learning_rate": 6.033333333333334e-05, + "loss": 2.7965, + "step": 1810 + }, + { + "epoch": 0.09863329212322387, + "grad_norm": 0.3159330189228058, + "learning_rate": 6.066666666666667e-05, + "loss": 2.7977, + "step": 1820 + }, + { + "epoch": 0.09901265093908243, + "eval_loss": 2.8351926803588867, + "eval_runtime": 21.9919, + "eval_samples_per_second": 227.356, + "eval_steps_per_second": 1.228, + "step": 1827 + }, + { + "epoch": 0.09917523328873609, + "grad_norm": 0.3412172794342041, + "learning_rate": 6.1e-05, + "loss": 2.7945, + "step": 1830 + }, + { + "epoch": 0.09971717445424831, + "grad_norm": 0.2713095247745514, + "learning_rate": 6.133333333333334e-05, + "loss": 2.7904, + "step": 1840 + }, + { + "epoch": 0.10025911561976053, + "grad_norm": 0.3899465799331665, + "learning_rate": 6.166666666666667e-05, + "loss": 2.7907, + "step": 1850 + }, + { + "epoch": 0.10080105678527275, + "grad_norm": 0.2539375424385071, + "learning_rate": 6.2e-05, + "loss": 2.7935, + "step": 1860 + }, + { + "epoch": 0.10134299795078497, + "grad_norm": 0.6003013253211975, + "learning_rate": 6.233333333333334e-05, + "loss": 2.7854, + "step": 1870 + }, + { + "epoch": 0.10188493911629719, + "grad_norm": 0.410022497177124, + "learning_rate": 6.266666666666667e-05, + "loss": 2.7879, + "step": 1880 + }, + { + "epoch": 0.10242688028180941, + "grad_norm": 0.24878162145614624, + "learning_rate": 6.3e-05, + "loss": 2.7848, + "step": 1890 + }, + { + "epoch": 0.10296882144732163, + "grad_norm": 0.5053821802139282, + "learning_rate": 6.333333333333333e-05, + "loss": 2.7876, + "step": 1900 + }, + { + "epoch": 0.10351076261283385, + "grad_norm": 0.43480825424194336, + "learning_rate": 6.366666666666668e-05, + "loss": 2.7826, + "step": 1910 + }, + { + "epoch": 0.10372753907903873, + "eval_loss": 2.837707996368408, + "eval_runtime": 22.0381, + "eval_samples_per_second": 226.88, + "eval_steps_per_second": 1.225, + "step": 1914 + }, + { + "epoch": 0.10405270377834606, + "grad_norm": 0.3400084674358368, + "learning_rate": 6.400000000000001e-05, + "loss": 2.7832, + "step": 1920 + }, + { + "epoch": 0.10459464494385828, + "grad_norm": 0.2867070436477661, + "learning_rate": 6.433333333333333e-05, + "loss": 2.7821, + "step": 1930 + }, + { + "epoch": 0.1051365861093705, + "grad_norm": 0.3295210003852844, + "learning_rate": 6.466666666666666e-05, + "loss": 2.776, + "step": 1940 + }, + { + "epoch": 0.10567852727488272, + "grad_norm": 0.29053670167922974, + "learning_rate": 6.500000000000001e-05, + "loss": 2.7776, + "step": 1950 + }, + { + "epoch": 0.10622046844039494, + "grad_norm": 0.4241558313369751, + "learning_rate": 6.533333333333334e-05, + "loss": 2.7788, + "step": 1960 + }, + { + "epoch": 0.10676240960590716, + "grad_norm": 0.6300092339515686, + "learning_rate": 6.566666666666666e-05, + "loss": 2.7763, + "step": 1970 + }, + { + "epoch": 0.10730435077141938, + "grad_norm": 0.29351305961608887, + "learning_rate": 6.6e-05, + "loss": 2.7713, + "step": 1980 + }, + { + "epoch": 0.1078462919369316, + "grad_norm": 0.5574642419815063, + "learning_rate": 6.633333333333334e-05, + "loss": 2.7693, + "step": 1990 + }, + { + "epoch": 0.10838823310244382, + "grad_norm": 0.34177178144454956, + "learning_rate": 6.666666666666667e-05, + "loss": 2.7661, + "step": 2000 + }, + { + "epoch": 0.10844242721899504, + "eval_loss": 2.8290762901306152, + "eval_runtime": 21.9941, + "eval_samples_per_second": 227.334, + "eval_steps_per_second": 1.228, + "step": 2001 + }, + { + "epoch": 0.10893017426795604, + "grad_norm": 0.5755301117897034, + "learning_rate": 6.7e-05, + "loss": 2.7709, + "step": 2010 + }, + { + "epoch": 0.10947211543346826, + "grad_norm": 0.4249889850616455, + "learning_rate": 6.733333333333333e-05, + "loss": 2.7636, + "step": 2020 + }, + { + "epoch": 0.11001405659898048, + "grad_norm": 0.3331117033958435, + "learning_rate": 6.766666666666667e-05, + "loss": 2.768, + "step": 2030 + }, + { + "epoch": 0.1105559977644927, + "grad_norm": 0.6131373643875122, + "learning_rate": 6.800000000000001e-05, + "loss": 2.7659, + "step": 2040 + }, + { + "epoch": 0.11109793893000491, + "grad_norm": 0.36327242851257324, + "learning_rate": 6.833333333333333e-05, + "loss": 2.7612, + "step": 2050 + }, + { + "epoch": 0.11163988009551713, + "grad_norm": 0.4649696946144104, + "learning_rate": 6.866666666666666e-05, + "loss": 2.7571, + "step": 2060 + }, + { + "epoch": 0.11218182126102935, + "grad_norm": 0.4964756965637207, + "learning_rate": 6.9e-05, + "loss": 2.7529, + "step": 2070 + }, + { + "epoch": 0.11272376242654157, + "grad_norm": 0.261391818523407, + "learning_rate": 6.933333333333334e-05, + "loss": 2.7493, + "step": 2080 + }, + { + "epoch": 0.11315731535895135, + "eval_loss": 2.822695732116699, + "eval_runtime": 21.997, + "eval_samples_per_second": 227.304, + "eval_steps_per_second": 1.227, + "step": 2088 + }, + { + "epoch": 0.11326570359205379, + "grad_norm": 0.4503042995929718, + "learning_rate": 6.966666666666668e-05, + "loss": 2.7474, + "step": 2090 + }, + { + "epoch": 0.11380764475756601, + "grad_norm": 0.5234399437904358, + "learning_rate": 7e-05, + "loss": 2.7477, + "step": 2100 + }, + { + "epoch": 0.11434958592307823, + "grad_norm": 0.4587494432926178, + "learning_rate": 7.033333333333334e-05, + "loss": 2.7441, + "step": 2110 + }, + { + "epoch": 0.11489152708859045, + "grad_norm": 0.3193627893924713, + "learning_rate": 7.066666666666667e-05, + "loss": 2.7529, + "step": 2120 + }, + { + "epoch": 0.11543346825410267, + "grad_norm": 0.60213702917099, + "learning_rate": 7.1e-05, + "loss": 2.7452, + "step": 2130 + }, + { + "epoch": 0.11597540941961489, + "grad_norm": 0.495370477437973, + "learning_rate": 7.133333333333334e-05, + "loss": 2.7431, + "step": 2140 + }, + { + "epoch": 0.1165173505851271, + "grad_norm": 0.46439045667648315, + "learning_rate": 7.166666666666667e-05, + "loss": 2.7463, + "step": 2150 + }, + { + "epoch": 0.11705929175063932, + "grad_norm": 0.4808029234409332, + "learning_rate": 7.2e-05, + "loss": 2.7399, + "step": 2160 + }, + { + "epoch": 0.11760123291615154, + "grad_norm": 0.40961867570877075, + "learning_rate": 7.233333333333335e-05, + "loss": 2.7369, + "step": 2170 + }, + { + "epoch": 0.11787220349890765, + "eval_loss": 2.823870897293091, + "eval_runtime": 21.9942, + "eval_samples_per_second": 227.333, + "eval_steps_per_second": 1.228, + "step": 2175 + }, + { + "epoch": 0.11814317408166376, + "grad_norm": 0.4309645891189575, + "learning_rate": 7.266666666666667e-05, + "loss": 2.7333, + "step": 2180 + }, + { + "epoch": 0.11868511524717598, + "grad_norm": 0.46611249446868896, + "learning_rate": 7.3e-05, + "loss": 2.7364, + "step": 2190 + }, + { + "epoch": 0.1192270564126882, + "grad_norm": 0.3425257205963135, + "learning_rate": 7.333333333333333e-05, + "loss": 2.7344, + "step": 2200 + }, + { + "epoch": 0.11976899757820042, + "grad_norm": 0.6439690589904785, + "learning_rate": 7.366666666666668e-05, + "loss": 2.7389, + "step": 2210 + }, + { + "epoch": 0.12031093874371264, + "grad_norm": 0.9094191193580627, + "learning_rate": 7.4e-05, + "loss": 2.727, + "step": 2220 + }, + { + "epoch": 0.12085287990922486, + "grad_norm": 0.4872890114784241, + "learning_rate": 7.433333333333333e-05, + "loss": 2.7286, + "step": 2230 + }, + { + "epoch": 0.12139482107473708, + "grad_norm": 0.4804534614086151, + "learning_rate": 7.466666666666667e-05, + "loss": 2.7233, + "step": 2240 + }, + { + "epoch": 0.1219367622402493, + "grad_norm": 0.330377995967865, + "learning_rate": 7.500000000000001e-05, + "loss": 2.7211, + "step": 2250 + }, + { + "epoch": 0.12247870340576152, + "grad_norm": 0.9292433857917786, + "learning_rate": 7.533333333333334e-05, + "loss": 2.7224, + "step": 2260 + }, + { + "epoch": 0.12258709163886396, + "eval_loss": 2.818483352661133, + "eval_runtime": 21.9931, + "eval_samples_per_second": 227.344, + "eval_steps_per_second": 1.228, + "step": 2262 + }, + { + "epoch": 0.12302064457127374, + "grad_norm": 0.4465082287788391, + "learning_rate": 7.566666666666667e-05, + "loss": 2.7214, + "step": 2270 + }, + { + "epoch": 0.12356258573678595, + "grad_norm": 0.300624817609787, + "learning_rate": 7.6e-05, + "loss": 2.722, + "step": 2280 + }, + { + "epoch": 0.12410452690229817, + "grad_norm": 0.41431039571762085, + "learning_rate": 7.633333333333334e-05, + "loss": 2.7135, + "step": 2290 + }, + { + "epoch": 0.12464646806781039, + "grad_norm": 0.3542834520339966, + "learning_rate": 7.666666666666667e-05, + "loss": 2.7118, + "step": 2300 + }, + { + "epoch": 0.1251884092333226, + "grad_norm": 0.42152753472328186, + "learning_rate": 7.7e-05, + "loss": 2.7193, + "step": 2310 + }, + { + "epoch": 0.12573035039883482, + "grad_norm": 0.4453124701976776, + "learning_rate": 7.733333333333333e-05, + "loss": 2.7154, + "step": 2320 + }, + { + "epoch": 0.12627229156434705, + "grad_norm": 0.6320910453796387, + "learning_rate": 7.766666666666667e-05, + "loss": 2.7098, + "step": 2330 + }, + { + "epoch": 0.12681423272985926, + "grad_norm": 0.3247159421443939, + "learning_rate": 7.800000000000001e-05, + "loss": 2.7067, + "step": 2340 + }, + { + "epoch": 0.12730197977882027, + "eval_loss": 2.813124179840088, + "eval_runtime": 21.9951, + "eval_samples_per_second": 227.323, + "eval_steps_per_second": 1.228, + "step": 2349 + }, + { + "epoch": 0.1273561738953715, + "grad_norm": 0.34726181626319885, + "learning_rate": 7.833333333333333e-05, + "loss": 2.7054, + "step": 2350 + }, + { + "epoch": 0.1278981150608837, + "grad_norm": 0.5048098564147949, + "learning_rate": 7.866666666666666e-05, + "loss": 2.7063, + "step": 2360 + }, + { + "epoch": 0.12844005622639593, + "grad_norm": 0.4321523606777191, + "learning_rate": 7.900000000000001e-05, + "loss": 2.7011, + "step": 2370 + }, + { + "epoch": 0.12898199739190813, + "grad_norm": 0.2796650826931, + "learning_rate": 7.933333333333334e-05, + "loss": 2.6997, + "step": 2380 + }, + { + "epoch": 0.12952393855742036, + "grad_norm": 0.5107274651527405, + "learning_rate": 7.966666666666666e-05, + "loss": 2.6933, + "step": 2390 + }, + { + "epoch": 0.13006587972293257, + "grad_norm": 0.6369110345840454, + "learning_rate": 8e-05, + "loss": 2.6989, + "step": 2400 + }, + { + "epoch": 0.1306078208884448, + "grad_norm": 0.6331022381782532, + "learning_rate": 8.033333333333334e-05, + "loss": 2.6923, + "step": 2410 + }, + { + "epoch": 0.131149762053957, + "grad_norm": 0.36579135060310364, + "learning_rate": 8.066666666666667e-05, + "loss": 2.6889, + "step": 2420 + }, + { + "epoch": 0.13169170321946924, + "grad_norm": 0.9732735753059387, + "learning_rate": 8.1e-05, + "loss": 2.6948, + "step": 2430 + }, + { + "epoch": 0.13201686791877656, + "eval_loss": 2.808922290802002, + "eval_runtime": 21.9927, + "eval_samples_per_second": 227.348, + "eval_steps_per_second": 1.228, + "step": 2436 + }, + { + "epoch": 0.13223364438498145, + "grad_norm": 0.5418347120285034, + "learning_rate": 8.133333333333334e-05, + "loss": 2.6871, + "step": 2440 + }, + { + "epoch": 0.13277558555049368, + "grad_norm": 0.43290242552757263, + "learning_rate": 8.166666666666667e-05, + "loss": 2.6827, + "step": 2450 + }, + { + "epoch": 0.13331752671600589, + "grad_norm": 0.6338348984718323, + "learning_rate": 8.2e-05, + "loss": 2.6821, + "step": 2460 + }, + { + "epoch": 0.13385946788151812, + "grad_norm": 0.604308545589447, + "learning_rate": 8.233333333333333e-05, + "loss": 2.6774, + "step": 2470 + }, + { + "epoch": 0.13440140904703032, + "grad_norm": 0.49863043427467346, + "learning_rate": 8.266666666666667e-05, + "loss": 2.6781, + "step": 2480 + }, + { + "epoch": 0.13494335021254256, + "grad_norm": 0.9325143098831177, + "learning_rate": 8.3e-05, + "loss": 2.6793, + "step": 2490 + }, + { + "epoch": 0.13548529137805476, + "grad_norm": 0.5320800542831421, + "learning_rate": 8.333333333333334e-05, + "loss": 2.6797, + "step": 2500 + }, + { + "epoch": 0.136027232543567, + "grad_norm": 0.3204202353954315, + "learning_rate": 8.366666666666668e-05, + "loss": 2.6757, + "step": 2510 + }, + { + "epoch": 0.1365691737090792, + "grad_norm": 0.2703516483306885, + "learning_rate": 8.4e-05, + "loss": 2.6667, + "step": 2520 + }, + { + "epoch": 0.13673175605873286, + "eval_loss": 2.8014180660247803, + "eval_runtime": 21.9979, + "eval_samples_per_second": 227.295, + "eval_steps_per_second": 1.227, + "step": 2523 + }, + { + "epoch": 0.13711111487459143, + "grad_norm": 0.4166392385959625, + "learning_rate": 8.433333333333334e-05, + "loss": 2.672, + "step": 2530 + }, + { + "epoch": 0.13765305604010364, + "grad_norm": 0.6374923586845398, + "learning_rate": 8.466666666666667e-05, + "loss": 2.666, + "step": 2540 + }, + { + "epoch": 0.13819499720561587, + "grad_norm": 0.41108188033103943, + "learning_rate": 8.5e-05, + "loss": 2.6661, + "step": 2550 + }, + { + "epoch": 0.13873693837112808, + "grad_norm": 0.6000506281852722, + "learning_rate": 8.533333333333334e-05, + "loss": 2.6677, + "step": 2560 + }, + { + "epoch": 0.1392788795366403, + "grad_norm": 0.3874584436416626, + "learning_rate": 8.566666666666667e-05, + "loss": 2.6664, + "step": 2570 + }, + { + "epoch": 0.13982082070215252, + "grad_norm": 1.0246257781982422, + "learning_rate": 8.6e-05, + "loss": 2.6662, + "step": 2580 + }, + { + "epoch": 0.14036276186766475, + "grad_norm": 0.6086533069610596, + "learning_rate": 8.633333333333334e-05, + "loss": 2.6615, + "step": 2590 + }, + { + "epoch": 0.14090470303317695, + "grad_norm": 0.2854156792163849, + "learning_rate": 8.666666666666667e-05, + "loss": 2.6512, + "step": 2600 + }, + { + "epoch": 0.1414466441986892, + "grad_norm": 0.6454458236694336, + "learning_rate": 8.7e-05, + "loss": 2.6608, + "step": 2610 + }, + { + "epoch": 0.1414466441986892, + "eval_loss": 2.8015899658203125, + "eval_runtime": 21.9849, + "eval_samples_per_second": 227.429, + "eval_steps_per_second": 1.228, + "step": 2610 + }, + { + "epoch": 0.1419885853642014, + "grad_norm": 0.553676426410675, + "learning_rate": 8.733333333333333e-05, + "loss": 2.6523, + "step": 2620 + }, + { + "epoch": 0.14253052652971362, + "grad_norm": 0.6649917364120483, + "learning_rate": 8.766666666666668e-05, + "loss": 2.6496, + "step": 2630 + }, + { + "epoch": 0.14307246769522583, + "grad_norm": 0.3380162715911865, + "learning_rate": 8.800000000000001e-05, + "loss": 2.6456, + "step": 2640 + }, + { + "epoch": 0.14361440886073806, + "grad_norm": 0.4091057777404785, + "learning_rate": 8.833333333333333e-05, + "loss": 2.6397, + "step": 2650 + }, + { + "epoch": 0.14415635002625027, + "grad_norm": 0.5197025537490845, + "learning_rate": 8.866666666666668e-05, + "loss": 2.6446, + "step": 2660 + }, + { + "epoch": 0.1446982911917625, + "grad_norm": 0.3578343093395233, + "learning_rate": 8.900000000000001e-05, + "loss": 2.6438, + "step": 2670 + }, + { + "epoch": 0.1452402323572747, + "grad_norm": 0.7891026735305786, + "learning_rate": 8.933333333333334e-05, + "loss": 2.6442, + "step": 2680 + }, + { + "epoch": 0.14578217352278694, + "grad_norm": 0.49294915795326233, + "learning_rate": 8.966666666666666e-05, + "loss": 2.6435, + "step": 2690 + }, + { + "epoch": 0.14616153233864548, + "eval_loss": 2.7972233295440674, + "eval_runtime": 21.9971, + "eval_samples_per_second": 227.303, + "eval_steps_per_second": 1.227, + "step": 2697 + }, + { + "epoch": 0.14632411468829915, + "grad_norm": 0.5119202136993408, + "learning_rate": 9e-05, + "loss": 2.6435, + "step": 2700 + }, + { + "epoch": 0.14686605585381138, + "grad_norm": 0.5359740853309631, + "learning_rate": 9.033333333333334e-05, + "loss": 2.6324, + "step": 2710 + }, + { + "epoch": 0.14740799701932358, + "grad_norm": 0.6155476570129395, + "learning_rate": 9.066666666666667e-05, + "loss": 2.6331, + "step": 2720 + }, + { + "epoch": 0.14794993818483582, + "grad_norm": 0.9409157633781433, + "learning_rate": 9.1e-05, + "loss": 2.6363, + "step": 2730 + }, + { + "epoch": 0.14849187935034802, + "grad_norm": 0.3532879650592804, + "learning_rate": 9.133333333333334e-05, + "loss": 2.6232, + "step": 2740 + }, + { + "epoch": 0.14903382051586025, + "grad_norm": 0.38057300448417664, + "learning_rate": 9.166666666666667e-05, + "loss": 2.6234, + "step": 2750 + }, + { + "epoch": 0.14957576168137246, + "grad_norm": 0.6639463901519775, + "learning_rate": 9.200000000000001e-05, + "loss": 2.6245, + "step": 2760 + }, + { + "epoch": 0.1501177028468847, + "grad_norm": 0.6744067668914795, + "learning_rate": 9.233333333333333e-05, + "loss": 2.6265, + "step": 2770 + }, + { + "epoch": 0.1506596440123969, + "grad_norm": 0.5296260714530945, + "learning_rate": 9.266666666666666e-05, + "loss": 2.6255, + "step": 2780 + }, + { + "epoch": 0.15087642047860178, + "eval_loss": 2.7909934520721436, + "eval_runtime": 21.9988, + "eval_samples_per_second": 227.285, + "eval_steps_per_second": 1.227, + "step": 2784 + }, + { + "epoch": 0.15120158517790913, + "grad_norm": 1.1150288581848145, + "learning_rate": 9.300000000000001e-05, + "loss": 2.6247, + "step": 2790 + }, + { + "epoch": 0.15174352634342134, + "grad_norm": 0.3209221661090851, + "learning_rate": 9.333333333333334e-05, + "loss": 2.6256, + "step": 2800 + }, + { + "epoch": 0.15228546750893357, + "grad_norm": 0.7962296605110168, + "learning_rate": 9.366666666666668e-05, + "loss": 2.6113, + "step": 2810 + }, + { + "epoch": 0.15282740867444577, + "grad_norm": 0.345284640789032, + "learning_rate": 9.4e-05, + "loss": 2.6234, + "step": 2820 + }, + { + "epoch": 0.153369349839958, + "grad_norm": 0.3219437003135681, + "learning_rate": 9.433333333333334e-05, + "loss": 2.619, + "step": 2830 + }, + { + "epoch": 0.1539112910054702, + "grad_norm": 0.3087829351425171, + "learning_rate": 9.466666666666667e-05, + "loss": 2.605, + "step": 2840 + }, + { + "epoch": 0.15445323217098245, + "grad_norm": 0.3032431900501251, + "learning_rate": 9.5e-05, + "loss": 2.6083, + "step": 2850 + }, + { + "epoch": 0.15499517333649465, + "grad_norm": 0.3768031597137451, + "learning_rate": 9.533333333333334e-05, + "loss": 2.6076, + "step": 2860 + }, + { + "epoch": 0.15553711450200688, + "grad_norm": 0.37252676486968994, + "learning_rate": 9.566666666666667e-05, + "loss": 2.6045, + "step": 2870 + }, + { + "epoch": 0.1555913086185581, + "eval_loss": 2.7903635501861572, + "eval_runtime": 21.9938, + "eval_samples_per_second": 227.337, + "eval_steps_per_second": 1.228, + "step": 2871 + }, + { + "epoch": 0.1560790556675191, + "grad_norm": 0.9097828269004822, + "learning_rate": 9.6e-05, + "loss": 2.6135, + "step": 2880 + }, + { + "epoch": 0.15662099683303132, + "grad_norm": 0.5291810631752014, + "learning_rate": 9.633333333333335e-05, + "loss": 2.5981, + "step": 2890 + }, + { + "epoch": 0.15716293799854353, + "grad_norm": 0.7748283743858337, + "learning_rate": 9.666666666666667e-05, + "loss": 2.5933, + "step": 2900 + }, + { + "epoch": 0.15770487916405576, + "grad_norm": 0.4185350239276886, + "learning_rate": 9.7e-05, + "loss": 2.5929, + "step": 2910 + }, + { + "epoch": 0.15824682032956797, + "grad_norm": 0.32910650968551636, + "learning_rate": 9.733333333333335e-05, + "loss": 2.5889, + "step": 2920 + }, + { + "epoch": 0.1587887614950802, + "grad_norm": 0.32221078872680664, + "learning_rate": 9.766666666666668e-05, + "loss": 2.5938, + "step": 2930 + }, + { + "epoch": 0.1593307026605924, + "grad_norm": 0.6913098096847534, + "learning_rate": 9.8e-05, + "loss": 2.5899, + "step": 2940 + }, + { + "epoch": 0.15987264382610464, + "grad_norm": 1.0831592082977295, + "learning_rate": 9.833333333333333e-05, + "loss": 2.5864, + "step": 2950 + }, + { + "epoch": 0.1603061967585144, + "eval_loss": 2.782402276992798, + "eval_runtime": 21.9947, + "eval_samples_per_second": 227.328, + "eval_steps_per_second": 1.228, + "step": 2958 + }, + { + "epoch": 0.16041458499161684, + "grad_norm": 0.6254389882087708, + "learning_rate": 9.866666666666668e-05, + "loss": 2.5797, + "step": 2960 + }, + { + "epoch": 0.16095652615712908, + "grad_norm": 0.34385034441947937, + "learning_rate": 9.900000000000001e-05, + "loss": 2.5846, + "step": 2970 + }, + { + "epoch": 0.16149846732264128, + "grad_norm": 0.7574624419212341, + "learning_rate": 9.933333333333334e-05, + "loss": 2.5805, + "step": 2980 + }, + { + "epoch": 0.16204040848815351, + "grad_norm": 0.36409491300582886, + "learning_rate": 9.966666666666667e-05, + "loss": 2.5852, + "step": 2990 + }, + { + "epoch": 0.16258234965366572, + "grad_norm": 0.7688263654708862, + "learning_rate": 0.0001, + "loss": 2.585, + "step": 3000 + }, + { + "epoch": 0.16312429081917795, + "grad_norm": 0.9283438920974731, + "learning_rate": 9.99999069936256e-05, + "loss": 2.582, + "step": 3010 + }, + { + "epoch": 0.16366623198469016, + "grad_norm": 0.5236392617225647, + "learning_rate": 9.999962797488683e-05, + "loss": 2.5818, + "step": 3020 + }, + { + "epoch": 0.1642081731502024, + "grad_norm": 0.4604909121990204, + "learning_rate": 9.999916294493705e-05, + "loss": 2.574, + "step": 3030 + }, + { + "epoch": 0.1647501143157146, + "grad_norm": 0.5703723430633545, + "learning_rate": 9.999851190569852e-05, + "loss": 2.5638, + "step": 3040 + }, + { + "epoch": 0.1650210848984707, + "eval_loss": 2.777949571609497, + "eval_runtime": 21.9996, + "eval_samples_per_second": 227.277, + "eval_steps_per_second": 1.227, + "step": 3045 + }, + { + "epoch": 0.16529205548122683, + "grad_norm": 1.0797618627548218, + "learning_rate": 9.99976748598624e-05, + "loss": 2.5687, + "step": 3050 + }, + { + "epoch": 0.16583399664673903, + "grad_norm": 0.7626991271972656, + "learning_rate": 9.999665181088869e-05, + "loss": 2.5726, + "step": 3060 + }, + { + "epoch": 0.16637593781225127, + "grad_norm": 0.46589240431785583, + "learning_rate": 9.999544276300629e-05, + "loss": 2.5625, + "step": 3070 + }, + { + "epoch": 0.16691787897776347, + "grad_norm": 0.4888695478439331, + "learning_rate": 9.999404772121297e-05, + "loss": 2.5547, + "step": 3080 + }, + { + "epoch": 0.1674598201432757, + "grad_norm": 1.3562276363372803, + "learning_rate": 9.999246669127524e-05, + "loss": 2.5644, + "step": 3090 + }, + { + "epoch": 0.1680017613087879, + "grad_norm": 0.3566310405731201, + "learning_rate": 9.999069967972854e-05, + "loss": 2.5584, + "step": 3100 + }, + { + "epoch": 0.16854370247430014, + "grad_norm": 0.7351515293121338, + "learning_rate": 9.998874669387696e-05, + "loss": 2.5589, + "step": 3110 + }, + { + "epoch": 0.16908564363981235, + "grad_norm": 0.5617663264274597, + "learning_rate": 9.998660774179343e-05, + "loss": 2.566, + "step": 3120 + }, + { + "epoch": 0.16962758480532458, + "grad_norm": 0.7777019143104553, + "learning_rate": 9.998428283231952e-05, + "loss": 2.5553, + "step": 3130 + }, + { + "epoch": 0.16973597303842702, + "eval_loss": 2.773970127105713, + "eval_runtime": 21.9948, + "eval_samples_per_second": 227.326, + "eval_steps_per_second": 1.228, + "step": 3132 + }, + { + "epoch": 0.1701695259708368, + "grad_norm": 0.8600534796714783, + "learning_rate": 9.998177197506557e-05, + "loss": 2.5529, + "step": 3140 + }, + { + "epoch": 0.17071146713634902, + "grad_norm": 0.3683708906173706, + "learning_rate": 9.997907518041047e-05, + "loss": 2.5552, + "step": 3150 + }, + { + "epoch": 0.17125340830186123, + "grad_norm": 0.5638754367828369, + "learning_rate": 9.997619245950172e-05, + "loss": 2.5465, + "step": 3160 + }, + { + "epoch": 0.17179534946737346, + "grad_norm": 0.4635675251483917, + "learning_rate": 9.997312382425543e-05, + "loss": 2.5419, + "step": 3170 + }, + { + "epoch": 0.17233729063288566, + "grad_norm": 0.48697274923324585, + "learning_rate": 9.99698692873561e-05, + "loss": 2.5409, + "step": 3180 + }, + { + "epoch": 0.1728792317983979, + "grad_norm": 0.48942264914512634, + "learning_rate": 9.99664288622568e-05, + "loss": 2.5347, + "step": 3190 + }, + { + "epoch": 0.1734211729639101, + "grad_norm": 0.3773297369480133, + "learning_rate": 9.996280256317887e-05, + "loss": 2.5432, + "step": 3200 + }, + { + "epoch": 0.17396311412942234, + "grad_norm": 0.8151489496231079, + "learning_rate": 9.995899040511207e-05, + "loss": 2.5359, + "step": 3210 + }, + { + "epoch": 0.17445086117838332, + "eval_loss": 2.7794876098632812, + "eval_runtime": 21.6952, + "eval_samples_per_second": 230.465, + "eval_steps_per_second": 1.245, + "step": 3219 + }, + { + "epoch": 0.17450505529493454, + "grad_norm": 0.6645236015319824, + "learning_rate": 9.995499240381441e-05, + "loss": 2.5412, + "step": 3220 + }, + { + "epoch": 0.17504699646044677, + "grad_norm": 0.31391623616218567, + "learning_rate": 9.995080857581208e-05, + "loss": 2.5403, + "step": 3230 + }, + { + "epoch": 0.17558893762595898, + "grad_norm": 0.6431202292442322, + "learning_rate": 9.994643893839943e-05, + "loss": 2.5346, + "step": 3240 + }, + { + "epoch": 0.1761308787914712, + "grad_norm": 0.6171109676361084, + "learning_rate": 9.994188350963887e-05, + "loss": 2.5362, + "step": 3250 + }, + { + "epoch": 0.17667281995698342, + "grad_norm": 0.3944205343723297, + "learning_rate": 9.993714230836076e-05, + "loss": 2.5302, + "step": 3260 + }, + { + "epoch": 0.17721476112249565, + "grad_norm": 0.462694376707077, + "learning_rate": 9.993221535416346e-05, + "loss": 2.5301, + "step": 3270 + }, + { + "epoch": 0.17775670228800786, + "grad_norm": 0.39359036087989807, + "learning_rate": 9.992710266741307e-05, + "loss": 2.5236, + "step": 3280 + }, + { + "epoch": 0.1782986434535201, + "grad_norm": 0.6656551361083984, + "learning_rate": 9.99218042692435e-05, + "loss": 2.5282, + "step": 3290 + }, + { + "epoch": 0.1788405846190323, + "grad_norm": 0.36592236161231995, + "learning_rate": 9.991632018155627e-05, + "loss": 2.5288, + "step": 3300 + }, + { + "epoch": 0.17916574931833962, + "eval_loss": 2.763190746307373, + "eval_runtime": 21.9963, + "eval_samples_per_second": 227.311, + "eval_steps_per_second": 1.227, + "step": 3306 + }, + { + "epoch": 0.17938252578454453, + "grad_norm": 1.2581602334976196, + "learning_rate": 9.991065042702054e-05, + "loss": 2.5198, + "step": 3310 + }, + { + "epoch": 0.17992446695005673, + "grad_norm": 0.9863691926002502, + "learning_rate": 9.990479502907287e-05, + "loss": 2.5248, + "step": 3320 + }, + { + "epoch": 0.18046640811556897, + "grad_norm": 0.538944661617279, + "learning_rate": 9.989875401191725e-05, + "loss": 2.5226, + "step": 3330 + }, + { + "epoch": 0.18100834928108117, + "grad_norm": 0.8944976925849915, + "learning_rate": 9.989252740052489e-05, + "loss": 2.512, + "step": 3340 + }, + { + "epoch": 0.1815502904465934, + "grad_norm": 0.5679588913917542, + "learning_rate": 9.988611522063423e-05, + "loss": 2.5097, + "step": 3350 + }, + { + "epoch": 0.1820922316121056, + "grad_norm": 0.40262141823768616, + "learning_rate": 9.987951749875081e-05, + "loss": 2.509, + "step": 3360 + }, + { + "epoch": 0.18263417277761784, + "grad_norm": 0.3973683714866638, + "learning_rate": 9.987273426214702e-05, + "loss": 2.5156, + "step": 3370 + }, + { + "epoch": 0.18317611394313005, + "grad_norm": 0.4501461982727051, + "learning_rate": 9.98657655388622e-05, + "loss": 2.5193, + "step": 3380 + }, + { + "epoch": 0.18371805510864228, + "grad_norm": 0.3253689110279083, + "learning_rate": 9.985861135770237e-05, + "loss": 2.5033, + "step": 3390 + }, + { + "epoch": 0.18388063745829594, + "eval_loss": 2.767366647720337, + "eval_runtime": 22.0, + "eval_samples_per_second": 227.273, + "eval_steps_per_second": 1.227, + "step": 3393 + }, + { + "epoch": 0.18425999627415449, + "grad_norm": 0.737877368927002, + "learning_rate": 9.985127174824017e-05, + "loss": 2.5085, + "step": 3400 + }, + { + "epoch": 0.18480193743966672, + "grad_norm": 1.0722659826278687, + "learning_rate": 9.984374674081472e-05, + "loss": 2.4993, + "step": 3410 + }, + { + "epoch": 0.18534387860517892, + "grad_norm": 0.7790729403495789, + "learning_rate": 9.983603636653154e-05, + "loss": 2.4982, + "step": 3420 + }, + { + "epoch": 0.18588581977069116, + "grad_norm": 0.3918309211730957, + "learning_rate": 9.982814065726233e-05, + "loss": 2.4979, + "step": 3430 + }, + { + "epoch": 0.18642776093620336, + "grad_norm": 0.31229421496391296, + "learning_rate": 9.982005964564495e-05, + "loss": 2.4927, + "step": 3440 + }, + { + "epoch": 0.1869697021017156, + "grad_norm": 0.4041496813297272, + "learning_rate": 9.981179336508322e-05, + "loss": 2.4964, + "step": 3450 + }, + { + "epoch": 0.1875116432672278, + "grad_norm": 0.3821089267730713, + "learning_rate": 9.980334184974672e-05, + "loss": 2.4913, + "step": 3460 + }, + { + "epoch": 0.18805358443274003, + "grad_norm": 0.4665864408016205, + "learning_rate": 9.979470513457084e-05, + "loss": 2.487, + "step": 3470 + }, + { + "epoch": 0.18859552559825224, + "grad_norm": 0.509825587272644, + "learning_rate": 9.978588325525639e-05, + "loss": 2.4932, + "step": 3480 + }, + { + "epoch": 0.18859552559825224, + "eval_loss": 2.7670769691467285, + "eval_runtime": 21.9697, + "eval_samples_per_second": 227.587, + "eval_steps_per_second": 1.229, + "step": 3480 + }, + { + "epoch": 0.18913746676376447, + "grad_norm": 1.2350674867630005, + "learning_rate": 9.977687624826966e-05, + "loss": 2.4936, + "step": 3490 + }, + { + "epoch": 0.18967940792927668, + "grad_norm": 0.7966161370277405, + "learning_rate": 9.97676841508422e-05, + "loss": 2.4861, + "step": 3500 + }, + { + "epoch": 0.1902213490947889, + "grad_norm": 0.3995961844921112, + "learning_rate": 9.975830700097056e-05, + "loss": 2.488, + "step": 3510 + }, + { + "epoch": 0.19076329026030112, + "grad_norm": 0.3042275309562683, + "learning_rate": 9.974874483741632e-05, + "loss": 2.4736, + "step": 3520 + }, + { + "epoch": 0.19130523142581332, + "grad_norm": 0.31050169467926025, + "learning_rate": 9.973899769970578e-05, + "loss": 2.4825, + "step": 3530 + }, + { + "epoch": 0.19184717259132555, + "grad_norm": 0.323839008808136, + "learning_rate": 9.972906562812986e-05, + "loss": 2.4788, + "step": 3540 + }, + { + "epoch": 0.19238911375683776, + "grad_norm": 0.8257955312728882, + "learning_rate": 9.971894866374397e-05, + "loss": 2.4857, + "step": 3550 + }, + { + "epoch": 0.19293105492235, + "grad_norm": 0.547508955001831, + "learning_rate": 9.970864684836776e-05, + "loss": 2.4832, + "step": 3560 + }, + { + "epoch": 0.19331041373820854, + "eval_loss": 2.761841297149658, + "eval_runtime": 21.9898, + "eval_samples_per_second": 227.379, + "eval_steps_per_second": 1.228, + "step": 3567 + }, + { + "epoch": 0.1934729960878622, + "grad_norm": 0.7686718106269836, + "learning_rate": 9.969816022458495e-05, + "loss": 2.4779, + "step": 3570 + }, + { + "epoch": 0.19401493725337443, + "grad_norm": 0.3649592995643616, + "learning_rate": 9.968748883574324e-05, + "loss": 2.4733, + "step": 3580 + }, + { + "epoch": 0.19455687841888664, + "grad_norm": 0.7720659375190735, + "learning_rate": 9.967663272595408e-05, + "loss": 2.465, + "step": 3590 + }, + { + "epoch": 0.19509881958439887, + "grad_norm": 0.47627437114715576, + "learning_rate": 9.966559194009244e-05, + "loss": 2.4662, + "step": 3600 + }, + { + "epoch": 0.19564076074991107, + "grad_norm": 0.3539658486843109, + "learning_rate": 9.965436652379671e-05, + "loss": 2.4693, + "step": 3610 + }, + { + "epoch": 0.1961827019154233, + "grad_norm": 1.2727680206298828, + "learning_rate": 9.964295652346844e-05, + "loss": 2.465, + "step": 3620 + }, + { + "epoch": 0.1967246430809355, + "grad_norm": 0.34611645340919495, + "learning_rate": 9.963136198627224e-05, + "loss": 2.4634, + "step": 3630 + }, + { + "epoch": 0.19726658424644775, + "grad_norm": 0.7994367480278015, + "learning_rate": 9.961958296013543e-05, + "loss": 2.4702, + "step": 3640 + }, + { + "epoch": 0.19780852541195995, + "grad_norm": 1.1131157875061035, + "learning_rate": 9.960761949374802e-05, + "loss": 2.4639, + "step": 3650 + }, + { + "epoch": 0.19802530187816486, + "eval_loss": 2.7547664642333984, + "eval_runtime": 21.9926, + "eval_samples_per_second": 227.35, + "eval_steps_per_second": 1.228, + "step": 3654 + }, + { + "epoch": 0.19835046657747218, + "grad_norm": 1.1642026901245117, + "learning_rate": 9.959547163656238e-05, + "loss": 2.4701, + "step": 3660 + }, + { + "epoch": 0.1988924077429844, + "grad_norm": 0.38460174202919006, + "learning_rate": 9.958313943879311e-05, + "loss": 2.4701, + "step": 3670 + }, + { + "epoch": 0.19943434890849662, + "grad_norm": 0.4307605028152466, + "learning_rate": 9.957062295141675e-05, + "loss": 2.46, + "step": 3680 + }, + { + "epoch": 0.19997629007400883, + "grad_norm": 0.952154815196991, + "learning_rate": 9.955792222617171e-05, + "loss": 2.4643, + "step": 3690 + }, + { + "epoch": 0.20051823123952106, + "grad_norm": 0.3698415756225586, + "learning_rate": 9.95450373155579e-05, + "loss": 2.4557, + "step": 3700 + }, + { + "epoch": 0.20106017240503327, + "grad_norm": 0.3291034996509552, + "learning_rate": 9.953196827283659e-05, + "loss": 2.4598, + "step": 3710 + }, + { + "epoch": 0.2016021135705455, + "grad_norm": 0.31008443236351013, + "learning_rate": 9.95187151520302e-05, + "loss": 2.4511, + "step": 3720 + }, + { + "epoch": 0.2021440547360577, + "grad_norm": 0.5721209645271301, + "learning_rate": 9.950527800792205e-05, + "loss": 2.4536, + "step": 3730 + }, + { + "epoch": 0.20268599590156994, + "grad_norm": 0.2794472575187683, + "learning_rate": 9.949165689605615e-05, + "loss": 2.4425, + "step": 3740 + }, + { + "epoch": 0.20274019001812116, + "eval_loss": 2.7546393871307373, + "eval_runtime": 22.0026, + "eval_samples_per_second": 227.246, + "eval_steps_per_second": 1.227, + "step": 3741 + }, + { + "epoch": 0.20322793706708214, + "grad_norm": 0.844109058380127, + "learning_rate": 9.947785187273695e-05, + "loss": 2.4386, + "step": 3750 + }, + { + "epoch": 0.20376987823259438, + "grad_norm": 0.3052925765514374, + "learning_rate": 9.946386299502911e-05, + "loss": 2.448, + "step": 3760 + }, + { + "epoch": 0.20431181939810658, + "grad_norm": 0.5775868892669678, + "learning_rate": 9.94496903207573e-05, + "loss": 2.4449, + "step": 3770 + }, + { + "epoch": 0.20485376056361881, + "grad_norm": 0.7839900255203247, + "learning_rate": 9.943533390850595e-05, + "loss": 2.4545, + "step": 3780 + }, + { + "epoch": 0.20539570172913102, + "grad_norm": 0.4013843834400177, + "learning_rate": 9.942079381761889e-05, + "loss": 2.4563, + "step": 3790 + }, + { + "epoch": 0.20593764289464325, + "grad_norm": 0.3987264633178711, + "learning_rate": 9.940607010819937e-05, + "loss": 2.4431, + "step": 3800 + }, + { + "epoch": 0.20647958406015546, + "grad_norm": 0.3003256022930145, + "learning_rate": 9.93911628411095e-05, + "loss": 2.4424, + "step": 3810 + }, + { + "epoch": 0.2070215252256677, + "grad_norm": 0.6472198367118835, + "learning_rate": 9.937607207797024e-05, + "loss": 2.4379, + "step": 3820 + }, + { + "epoch": 0.20745507815807745, + "eval_loss": 2.7585620880126953, + "eval_runtime": 21.9936, + "eval_samples_per_second": 227.339, + "eval_steps_per_second": 1.228, + "step": 3828 + }, + { + "epoch": 0.2075634663911799, + "grad_norm": 0.4794847071170807, + "learning_rate": 9.9360797881161e-05, + "loss": 2.4339, + "step": 3830 + }, + { + "epoch": 0.20810540755669213, + "grad_norm": 0.3864356577396393, + "learning_rate": 9.934534031381946e-05, + "loss": 2.4437, + "step": 3840 + }, + { + "epoch": 0.20864734872220433, + "grad_norm": 0.4814181923866272, + "learning_rate": 9.932969943984126e-05, + "loss": 2.4322, + "step": 3850 + }, + { + "epoch": 0.20918928988771657, + "grad_norm": 0.6743614077568054, + "learning_rate": 9.931387532387981e-05, + "loss": 2.4356, + "step": 3860 + }, + { + "epoch": 0.20973123105322877, + "grad_norm": 0.6256967186927795, + "learning_rate": 9.929786803134592e-05, + "loss": 2.4571, + "step": 3870 + }, + { + "epoch": 0.210273172218741, + "grad_norm": 0.45648303627967834, + "learning_rate": 9.928167762840761e-05, + "loss": 2.432, + "step": 3880 + }, + { + "epoch": 0.2108151133842532, + "grad_norm": 0.4223450720310211, + "learning_rate": 9.926530418198978e-05, + "loss": 2.4279, + "step": 3890 + }, + { + "epoch": 0.21135705454976544, + "grad_norm": 0.40104198455810547, + "learning_rate": 9.924874775977402e-05, + "loss": 2.4261, + "step": 3900 + }, + { + "epoch": 0.21189899571527765, + "grad_norm": 0.7316650748252869, + "learning_rate": 9.923200843019818e-05, + "loss": 2.4293, + "step": 3910 + }, + { + "epoch": 0.21216996629803378, + "eval_loss": 2.747528553009033, + "eval_runtime": 22.0, + "eval_samples_per_second": 227.273, + "eval_steps_per_second": 1.227, + "step": 3915 + }, + { + "epoch": 0.21244093688078988, + "grad_norm": 0.39364123344421387, + "learning_rate": 9.921508626245628e-05, + "loss": 2.4287, + "step": 3920 + }, + { + "epoch": 0.2129828780463021, + "grad_norm": 0.5997620224952698, + "learning_rate": 9.919798132649803e-05, + "loss": 2.4304, + "step": 3930 + }, + { + "epoch": 0.21352481921181432, + "grad_norm": 0.6328970193862915, + "learning_rate": 9.91806936930287e-05, + "loss": 2.4236, + "step": 3940 + }, + { + "epoch": 0.21406676037732653, + "grad_norm": 0.8760083913803101, + "learning_rate": 9.916322343350875e-05, + "loss": 2.4235, + "step": 3950 + }, + { + "epoch": 0.21460870154283876, + "grad_norm": 0.9582383036613464, + "learning_rate": 9.914557062015352e-05, + "loss": 2.4171, + "step": 3960 + }, + { + "epoch": 0.21515064270835096, + "grad_norm": 0.7223474383354187, + "learning_rate": 9.912773532593297e-05, + "loss": 2.4194, + "step": 3970 + }, + { + "epoch": 0.2156925838738632, + "grad_norm": 0.7395852208137512, + "learning_rate": 9.910971762457138e-05, + "loss": 2.412, + "step": 3980 + }, + { + "epoch": 0.2162345250393754, + "grad_norm": 0.352475643157959, + "learning_rate": 9.909151759054702e-05, + "loss": 2.4086, + "step": 3990 + }, + { + "epoch": 0.21677646620488764, + "grad_norm": 0.43535661697387695, + "learning_rate": 9.907313529909185e-05, + "loss": 2.4128, + "step": 4000 + }, + { + "epoch": 0.21688485443799008, + "eval_loss": 2.742093563079834, + "eval_runtime": 21.9979, + "eval_samples_per_second": 227.294, + "eval_steps_per_second": 1.227, + "step": 4002 + }, + { + "epoch": 0.21731840737039984, + "grad_norm": 0.4158700406551361, + "learning_rate": 9.905457082619124e-05, + "loss": 2.4054, + "step": 4010 + }, + { + "epoch": 0.21786034853591207, + "grad_norm": 0.6706676483154297, + "learning_rate": 9.903582424858355e-05, + "loss": 2.3992, + "step": 4020 + }, + { + "epoch": 0.21840228970142428, + "grad_norm": 1.0080608129501343, + "learning_rate": 9.901689564375998e-05, + "loss": 2.4159, + "step": 4030 + }, + { + "epoch": 0.2189442308669365, + "grad_norm": 0.8006541728973389, + "learning_rate": 9.899778508996412e-05, + "loss": 2.413, + "step": 4040 + }, + { + "epoch": 0.21948617203244872, + "grad_norm": 0.7094182968139648, + "learning_rate": 9.89784926661917e-05, + "loss": 2.4095, + "step": 4050 + }, + { + "epoch": 0.22002811319796095, + "grad_norm": 0.5873300433158875, + "learning_rate": 9.895901845219013e-05, + "loss": 2.4033, + "step": 4060 + }, + { + "epoch": 0.22057005436347316, + "grad_norm": 0.65219646692276, + "learning_rate": 9.893936252845842e-05, + "loss": 2.3991, + "step": 4070 + }, + { + "epoch": 0.2211119955289854, + "grad_norm": 0.5918698310852051, + "learning_rate": 9.891952497624662e-05, + "loss": 2.4142, + "step": 4080 + }, + { + "epoch": 0.22159974257794637, + "eval_loss": 2.7375481128692627, + "eval_runtime": 21.9899, + "eval_samples_per_second": 227.377, + "eval_steps_per_second": 1.228, + "step": 4089 + }, + { + "epoch": 0.2216539366944976, + "grad_norm": 1.1681138277053833, + "learning_rate": 9.889950587755549e-05, + "loss": 2.4009, + "step": 4090 + }, + { + "epoch": 0.22219587786000983, + "grad_norm": 0.3698543310165405, + "learning_rate": 9.88793053151364e-05, + "loss": 2.398, + "step": 4100 + }, + { + "epoch": 0.22273781902552203, + "grad_norm": 0.3500048816204071, + "learning_rate": 9.885892337249069e-05, + "loss": 2.3997, + "step": 4110 + }, + { + "epoch": 0.22327976019103427, + "grad_norm": 0.9475021958351135, + "learning_rate": 9.88383601338695e-05, + "loss": 2.4023, + "step": 4120 + }, + { + "epoch": 0.22382170135654647, + "grad_norm": 0.37473389506340027, + "learning_rate": 9.881761568427335e-05, + "loss": 2.3969, + "step": 4130 + }, + { + "epoch": 0.2243636425220587, + "grad_norm": 0.46967896819114685, + "learning_rate": 9.879669010945189e-05, + "loss": 2.3995, + "step": 4140 + }, + { + "epoch": 0.2249055836875709, + "grad_norm": 0.40588507056236267, + "learning_rate": 9.877558349590341e-05, + "loss": 2.3983, + "step": 4150 + }, + { + "epoch": 0.22544752485308314, + "grad_norm": 0.32905760407447815, + "learning_rate": 9.875429593087454e-05, + "loss": 2.3965, + "step": 4160 + }, + { + "epoch": 0.22598946601859535, + "grad_norm": 0.7571531534194946, + "learning_rate": 9.873282750235993e-05, + "loss": 2.3858, + "step": 4170 + }, + { + "epoch": 0.2263146307179027, + "eval_loss": 2.7395927906036377, + "eval_runtime": 21.9951, + "eval_samples_per_second": 227.323, + "eval_steps_per_second": 1.228, + "step": 4176 + }, + { + "epoch": 0.22653140718410758, + "grad_norm": 0.4100936949253082, + "learning_rate": 9.871117829910181e-05, + "loss": 2.3913, + "step": 4180 + }, + { + "epoch": 0.22707334834961979, + "grad_norm": 1.1051249504089355, + "learning_rate": 9.868934841058972e-05, + "loss": 2.385, + "step": 4190 + }, + { + "epoch": 0.22761528951513202, + "grad_norm": 0.4184013903141022, + "learning_rate": 9.866733792706003e-05, + "loss": 2.3901, + "step": 4200 + }, + { + "epoch": 0.22815723068064422, + "grad_norm": 0.8634352087974548, + "learning_rate": 9.864514693949563e-05, + "loss": 2.382, + "step": 4210 + }, + { + "epoch": 0.22869917184615646, + "grad_norm": 0.42947426438331604, + "learning_rate": 9.86227755396256e-05, + "loss": 2.3847, + "step": 4220 + }, + { + "epoch": 0.22924111301166866, + "grad_norm": 0.7337902784347534, + "learning_rate": 9.860022381992467e-05, + "loss": 2.3877, + "step": 4230 + }, + { + "epoch": 0.2297830541771809, + "grad_norm": 0.33461204171180725, + "learning_rate": 9.857749187361308e-05, + "loss": 2.3771, + "step": 4240 + }, + { + "epoch": 0.2303249953426931, + "grad_norm": 0.4798398017883301, + "learning_rate": 9.85545797946559e-05, + "loss": 2.3745, + "step": 4250 + }, + { + "epoch": 0.23086693650820533, + "grad_norm": 0.7971112728118896, + "learning_rate": 9.853148767776293e-05, + "loss": 2.3856, + "step": 4260 + }, + { + "epoch": 0.231029518857859, + "eval_loss": 2.7393031120300293, + "eval_runtime": 21.9959, + "eval_samples_per_second": 227.316, + "eval_steps_per_second": 1.228, + "step": 4263 + }, + { + "epoch": 0.23140887767371754, + "grad_norm": 0.5765118598937988, + "learning_rate": 9.85082156183881e-05, + "loss": 2.3818, + "step": 4270 + }, + { + "epoch": 0.23195081883922977, + "grad_norm": 0.9162944555282593, + "learning_rate": 9.848476371272922e-05, + "loss": 2.3756, + "step": 4280 + }, + { + "epoch": 0.23249276000474198, + "grad_norm": 0.757483959197998, + "learning_rate": 9.846113205772746e-05, + "loss": 2.3754, + "step": 4290 + }, + { + "epoch": 0.2330347011702542, + "grad_norm": 0.5934547185897827, + "learning_rate": 9.8437320751067e-05, + "loss": 2.3688, + "step": 4300 + }, + { + "epoch": 0.23357664233576642, + "grad_norm": 0.540972888469696, + "learning_rate": 9.841332989117469e-05, + "loss": 2.3713, + "step": 4310 + }, + { + "epoch": 0.23411858350127865, + "grad_norm": 0.41171032190322876, + "learning_rate": 9.838915957721953e-05, + "loss": 2.3675, + "step": 4320 + }, + { + "epoch": 0.23466052466679085, + "grad_norm": 0.5488543510437012, + "learning_rate": 9.83648099091123e-05, + "loss": 2.369, + "step": 4330 + }, + { + "epoch": 0.2352024658323031, + "grad_norm": 0.4907141923904419, + "learning_rate": 9.834028098750525e-05, + "loss": 2.3694, + "step": 4340 + }, + { + "epoch": 0.2357444069978153, + "grad_norm": 0.3236653804779053, + "learning_rate": 9.83155729137915e-05, + "loss": 2.3596, + "step": 4350 + }, + { + "epoch": 0.2357444069978153, + "eval_loss": 2.7318739891052246, + "eval_runtime": 21.9575, + "eval_samples_per_second": 227.713, + "eval_steps_per_second": 1.23, + "step": 4350 + }, + { + "epoch": 0.23628634816332753, + "grad_norm": 0.29620563983917236, + "learning_rate": 9.82906857901048e-05, + "loss": 2.3661, + "step": 4360 + }, + { + "epoch": 0.23682828932883973, + "grad_norm": 0.9057065844535828, + "learning_rate": 9.826561971931891e-05, + "loss": 2.366, + "step": 4370 + }, + { + "epoch": 0.23737023049435196, + "grad_norm": 0.9793679118156433, + "learning_rate": 9.824037480504741e-05, + "loss": 2.3657, + "step": 4380 + }, + { + "epoch": 0.23791217165986417, + "grad_norm": 0.6782917976379395, + "learning_rate": 9.821495115164309e-05, + "loss": 2.3674, + "step": 4390 + }, + { + "epoch": 0.2384541128253764, + "grad_norm": 0.3981853425502777, + "learning_rate": 9.818934886419756e-05, + "loss": 2.3678, + "step": 4400 + }, + { + "epoch": 0.2389960539908886, + "grad_norm": 0.5394397974014282, + "learning_rate": 9.816356804854089e-05, + "loss": 2.3544, + "step": 4410 + }, + { + "epoch": 0.23953799515640084, + "grad_norm": 0.45772433280944824, + "learning_rate": 9.813760881124107e-05, + "loss": 2.3546, + "step": 4420 + }, + { + "epoch": 0.24007993632191305, + "grad_norm": 0.5437049269676208, + "learning_rate": 9.811147125960364e-05, + "loss": 2.3592, + "step": 4430 + }, + { + "epoch": 0.24045929513777162, + "eval_loss": 2.728144407272339, + "eval_runtime": 21.9944, + "eval_samples_per_second": 227.331, + "eval_steps_per_second": 1.228, + "step": 4437 + }, + { + "epoch": 0.24062187748742528, + "grad_norm": 0.8073667883872986, + "learning_rate": 9.808515550167124e-05, + "loss": 2.3523, + "step": 4440 + }, + { + "epoch": 0.24116381865293748, + "grad_norm": 0.40337878465652466, + "learning_rate": 9.805866164622311e-05, + "loss": 2.3574, + "step": 4450 + }, + { + "epoch": 0.24170575981844972, + "grad_norm": 0.7153369188308716, + "learning_rate": 9.803198980277467e-05, + "loss": 2.3557, + "step": 4460 + }, + { + "epoch": 0.24224770098396192, + "grad_norm": 0.37222856283187866, + "learning_rate": 9.800514008157711e-05, + "loss": 2.3553, + "step": 4470 + }, + { + "epoch": 0.24278964214947416, + "grad_norm": 0.3131774067878723, + "learning_rate": 9.79781125936169e-05, + "loss": 2.3562, + "step": 4480 + }, + { + "epoch": 0.24333158331498636, + "grad_norm": 0.5932138562202454, + "learning_rate": 9.79509074506153e-05, + "loss": 2.3422, + "step": 4490 + }, + { + "epoch": 0.2438735244804986, + "grad_norm": 0.6757357716560364, + "learning_rate": 9.792352476502796e-05, + "loss": 2.349, + "step": 4500 + }, + { + "epoch": 0.2444154656460108, + "grad_norm": 0.9189159870147705, + "learning_rate": 9.789596465004437e-05, + "loss": 2.341, + "step": 4510 + }, + { + "epoch": 0.24495740681152303, + "grad_norm": 0.40294119715690613, + "learning_rate": 9.786822721958751e-05, + "loss": 2.3526, + "step": 4520 + }, + { + "epoch": 0.24517418327772791, + "eval_loss": 2.7286596298217773, + "eval_runtime": 21.9954, + "eval_samples_per_second": 227.32, + "eval_steps_per_second": 1.228, + "step": 4524 + }, + { + "epoch": 0.24549934797703524, + "grad_norm": 0.4049138128757477, + "learning_rate": 9.784031258831325e-05, + "loss": 2.3427, + "step": 4530 + }, + { + "epoch": 0.24604128914254747, + "grad_norm": 0.5206155776977539, + "learning_rate": 9.781222087161003e-05, + "loss": 2.347, + "step": 4540 + }, + { + "epoch": 0.24658323030805968, + "grad_norm": 0.3420925736427307, + "learning_rate": 9.77839521855982e-05, + "loss": 2.3421, + "step": 4550 + }, + { + "epoch": 0.2471251714735719, + "grad_norm": 1.2839010953903198, + "learning_rate": 9.775550664712966e-05, + "loss": 2.3492, + "step": 4560 + }, + { + "epoch": 0.2476671126390841, + "grad_norm": 0.6513694524765015, + "learning_rate": 9.772688437378738e-05, + "loss": 2.3417, + "step": 4570 + }, + { + "epoch": 0.24820905380459635, + "grad_norm": 0.7786572575569153, + "learning_rate": 9.769808548388488e-05, + "loss": 2.333, + "step": 4580 + }, + { + "epoch": 0.24875099497010855, + "grad_norm": 0.8283511996269226, + "learning_rate": 9.766911009646569e-05, + "loss": 2.3358, + "step": 4590 + }, + { + "epoch": 0.24929293613562079, + "grad_norm": 0.5858640074729919, + "learning_rate": 9.763995833130299e-05, + "loss": 2.3424, + "step": 4600 + }, + { + "epoch": 0.249834877301133, + "grad_norm": 0.41306570172309875, + "learning_rate": 9.761063030889898e-05, + "loss": 2.3338, + "step": 4610 + }, + { + "epoch": 0.2498890714176842, + "eval_loss": 2.723004102706909, + "eval_runtime": 21.9968, + "eval_samples_per_second": 227.306, + "eval_steps_per_second": 1.227, + "step": 4611 + }, + { + "epoch": 0.2503768184666452, + "grad_norm": 0.32257717847824097, + "learning_rate": 9.758112615048448e-05, + "loss": 2.3332, + "step": 4620 + }, + { + "epoch": 0.25091875963215743, + "grad_norm": 0.31163015961647034, + "learning_rate": 9.755144597801837e-05, + "loss": 2.3306, + "step": 4630 + }, + { + "epoch": 0.25146070079766963, + "grad_norm": 0.6164253950119019, + "learning_rate": 9.752158991418708e-05, + "loss": 2.3288, + "step": 4640 + }, + { + "epoch": 0.2520026419631819, + "grad_norm": 0.6337904930114746, + "learning_rate": 9.749155808240415e-05, + "loss": 2.3271, + "step": 4650 + }, + { + "epoch": 0.2525445831286941, + "grad_norm": 0.3386767506599426, + "learning_rate": 9.746135060680966e-05, + "loss": 2.3342, + "step": 4660 + }, + { + "epoch": 0.2530865242942063, + "grad_norm": 0.9337306022644043, + "learning_rate": 9.743096761226972e-05, + "loss": 2.3299, + "step": 4670 + }, + { + "epoch": 0.2536284654597185, + "grad_norm": 0.31421270966529846, + "learning_rate": 9.7400409224376e-05, + "loss": 2.3302, + "step": 4680 + }, + { + "epoch": 0.25417040662523077, + "grad_norm": 0.33126044273376465, + "learning_rate": 9.736967556944516e-05, + "loss": 2.325, + "step": 4690 + }, + { + "epoch": 0.25460395955764054, + "eval_loss": 2.722799301147461, + "eval_runtime": 25.1071, + "eval_samples_per_second": 199.147, + "eval_steps_per_second": 1.075, + "step": 4698 + }, + { + "epoch": 0.254712347790743, + "grad_norm": 0.3403181731700897, + "learning_rate": 9.733876677451833e-05, + "loss": 2.319, + "step": 4700 + }, + { + "epoch": 0.2552542889562552, + "grad_norm": 0.4288688004016876, + "learning_rate": 9.730768296736064e-05, + "loss": 2.322, + "step": 4710 + }, + { + "epoch": 0.2557962301217674, + "grad_norm": 1.1494555473327637, + "learning_rate": 9.727642427646061e-05, + "loss": 2.319, + "step": 4720 + }, + { + "epoch": 0.25633817128727965, + "grad_norm": 1.3199281692504883, + "learning_rate": 9.72449908310297e-05, + "loss": 2.3299, + "step": 4730 + }, + { + "epoch": 0.25688011245279185, + "grad_norm": 1.220360279083252, + "learning_rate": 9.721338276100172e-05, + "loss": 2.3441, + "step": 4740 + }, + { + "epoch": 0.25742205361830406, + "grad_norm": 1.106679916381836, + "learning_rate": 9.718160019703232e-05, + "loss": 2.3379, + "step": 4750 + }, + { + "epoch": 0.25796399478381626, + "grad_norm": 1.3295619487762451, + "learning_rate": 9.714964327049842e-05, + "loss": 2.3501, + "step": 4760 + }, + { + "epoch": 0.2585059359493285, + "grad_norm": 0.6279563307762146, + "learning_rate": 9.711751211349773e-05, + "loss": 2.3166, + "step": 4770 + }, + { + "epoch": 0.25904787711484073, + "grad_norm": 0.3395664095878601, + "learning_rate": 9.70852068588481e-05, + "loss": 2.3087, + "step": 4780 + }, + { + "epoch": 0.25931884769759683, + "eval_loss": 2.721379518508911, + "eval_runtime": 21.9931, + "eval_samples_per_second": 227.344, + "eval_steps_per_second": 1.228, + "step": 4785 + }, + { + "epoch": 0.25958981828035294, + "grad_norm": 0.32632923126220703, + "learning_rate": 9.705272764008709e-05, + "loss": 2.3064, + "step": 4790 + }, + { + "epoch": 0.26013175944586514, + "grad_norm": 0.3286370038986206, + "learning_rate": 9.702007459147134e-05, + "loss": 2.3013, + "step": 4800 + }, + { + "epoch": 0.2606737006113774, + "grad_norm": 0.6520362496376038, + "learning_rate": 9.698724784797604e-05, + "loss": 2.3196, + "step": 4810 + }, + { + "epoch": 0.2612156417768896, + "grad_norm": 0.6383447051048279, + "learning_rate": 9.695424754529434e-05, + "loss": 2.3109, + "step": 4820 + }, + { + "epoch": 0.2617575829424018, + "grad_norm": 0.3725563585758209, + "learning_rate": 9.692107381983684e-05, + "loss": 2.3072, + "step": 4830 + }, + { + "epoch": 0.262299524107914, + "grad_norm": 0.5179880857467651, + "learning_rate": 9.688772680873103e-05, + "loss": 2.3065, + "step": 4840 + }, + { + "epoch": 0.2628414652734263, + "grad_norm": 0.5016129612922668, + "learning_rate": 9.685420664982067e-05, + "loss": 2.3088, + "step": 4850 + }, + { + "epoch": 0.2633834064389385, + "grad_norm": 0.5394784212112427, + "learning_rate": 9.682051348166523e-05, + "loss": 2.2979, + "step": 4860 + }, + { + "epoch": 0.2639253476044507, + "grad_norm": 0.5578838586807251, + "learning_rate": 9.678664744353935e-05, + "loss": 2.3034, + "step": 4870 + }, + { + "epoch": 0.26403373583755313, + "eval_loss": 2.723337173461914, + "eval_runtime": 21.9924, + "eval_samples_per_second": 227.351, + "eval_steps_per_second": 1.228, + "step": 4872 + }, + { + "epoch": 0.2644672887699629, + "grad_norm": 0.34530109167099, + "learning_rate": 9.675260867543224e-05, + "loss": 2.3096, + "step": 4880 + }, + { + "epoch": 0.26500922993547515, + "grad_norm": 0.7636291980743408, + "learning_rate": 9.671839731804716e-05, + "loss": 2.307, + "step": 4890 + }, + { + "epoch": 0.26555117110098736, + "grad_norm": 0.3167097568511963, + "learning_rate": 9.66840135128007e-05, + "loss": 2.3068, + "step": 4900 + }, + { + "epoch": 0.26609311226649957, + "grad_norm": 0.45626381039619446, + "learning_rate": 9.664945740182235e-05, + "loss": 2.2948, + "step": 4910 + }, + { + "epoch": 0.26663505343201177, + "grad_norm": 0.695152759552002, + "learning_rate": 9.661472912795383e-05, + "loss": 2.2975, + "step": 4920 + }, + { + "epoch": 0.26717699459752403, + "grad_norm": 0.30375921726226807, + "learning_rate": 9.65798288347485e-05, + "loss": 2.3041, + "step": 4930 + }, + { + "epoch": 0.26771893576303624, + "grad_norm": 0.8904628753662109, + "learning_rate": 9.654475666647078e-05, + "loss": 2.2957, + "step": 4940 + }, + { + "epoch": 0.26826087692854844, + "grad_norm": 1.199233889579773, + "learning_rate": 9.650951276809561e-05, + "loss": 2.2994, + "step": 4950 + }, + { + "epoch": 0.2687486239775094, + "eval_loss": 2.7227141857147217, + "eval_runtime": 21.9965, + "eval_samples_per_second": 227.309, + "eval_steps_per_second": 1.227, + "step": 4959 + }, + { + "epoch": 0.26880281809406065, + "grad_norm": 2.123131513595581, + "learning_rate": 9.647409728530772e-05, + "loss": 2.3143, + "step": 4960 + }, + { + "epoch": 0.2693447592595729, + "grad_norm": 0.7237464189529419, + "learning_rate": 9.643851036450115e-05, + "loss": 2.3183, + "step": 4970 + }, + { + "epoch": 0.2698867004250851, + "grad_norm": 0.6150104403495789, + "learning_rate": 9.640275215277858e-05, + "loss": 2.3048, + "step": 4980 + }, + { + "epoch": 0.2704286415905973, + "grad_norm": 0.3302861154079437, + "learning_rate": 9.636682279795076e-05, + "loss": 2.2903, + "step": 4990 + }, + { + "epoch": 0.2709705827561095, + "grad_norm": 0.7235733270645142, + "learning_rate": 9.633072244853587e-05, + "loss": 2.2807, + "step": 5000 + }, + { + "epoch": 0.2715125239216218, + "grad_norm": 0.36814624071121216, + "learning_rate": 9.629445125375891e-05, + "loss": 2.2886, + "step": 5010 + }, + { + "epoch": 0.272054465087134, + "grad_norm": 0.5725374221801758, + "learning_rate": 9.625800936355108e-05, + "loss": 2.2996, + "step": 5020 + }, + { + "epoch": 0.2725964062526462, + "grad_norm": 0.45517677068710327, + "learning_rate": 9.62213969285492e-05, + "loss": 2.2876, + "step": 5030 + }, + { + "epoch": 0.2731383474181584, + "grad_norm": 0.5474647879600525, + "learning_rate": 9.618461410009503e-05, + "loss": 2.2838, + "step": 5040 + }, + { + "epoch": 0.2734635121174657, + "eval_loss": 2.713071346282959, + "eval_runtime": 21.9919, + "eval_samples_per_second": 227.356, + "eval_steps_per_second": 1.228, + "step": 5046 + }, + { + "epoch": 0.27368028858367066, + "grad_norm": 0.3035077452659607, + "learning_rate": 9.614766103023473e-05, + "loss": 2.2759, + "step": 5050 + }, + { + "epoch": 0.27422222974918287, + "grad_norm": 0.6675074100494385, + "learning_rate": 9.611053787171804e-05, + "loss": 2.2889, + "step": 5060 + }, + { + "epoch": 0.27476417091469507, + "grad_norm": 0.5895007252693176, + "learning_rate": 9.607324477799793e-05, + "loss": 2.2786, + "step": 5070 + }, + { + "epoch": 0.2753061120802073, + "grad_norm": 0.781353771686554, + "learning_rate": 9.603578190322974e-05, + "loss": 2.2852, + "step": 5080 + }, + { + "epoch": 0.27584805324571954, + "grad_norm": 0.3827020823955536, + "learning_rate": 9.599814940227062e-05, + "loss": 2.2813, + "step": 5090 + }, + { + "epoch": 0.27638999441123174, + "grad_norm": 0.6647722125053406, + "learning_rate": 9.59603474306789e-05, + "loss": 2.279, + "step": 5100 + }, + { + "epoch": 0.27693193557674395, + "grad_norm": 0.3594939112663269, + "learning_rate": 9.592237614471346e-05, + "loss": 2.2697, + "step": 5110 + }, + { + "epoch": 0.27747387674225615, + "grad_norm": 0.3628252446651459, + "learning_rate": 9.588423570133301e-05, + "loss": 2.2752, + "step": 5120 + }, + { + "epoch": 0.2780158179077684, + "grad_norm": 0.322070449590683, + "learning_rate": 9.584592625819555e-05, + "loss": 2.2714, + "step": 5130 + }, + { + "epoch": 0.2781784002574221, + "eval_loss": 2.711329698562622, + "eval_runtime": 22.0018, + "eval_samples_per_second": 227.255, + "eval_steps_per_second": 1.227, + "step": 5133 + }, + { + "epoch": 0.2785577590732806, + "grad_norm": 0.4778302311897278, + "learning_rate": 9.580744797365761e-05, + "loss": 2.2721, + "step": 5140 + }, + { + "epoch": 0.2790997002387928, + "grad_norm": 0.44883444905281067, + "learning_rate": 9.57688010067737e-05, + "loss": 2.2807, + "step": 5150 + }, + { + "epoch": 0.27964164140430503, + "grad_norm": 0.9123862981796265, + "learning_rate": 9.572998551729552e-05, + "loss": 2.2729, + "step": 5160 + }, + { + "epoch": 0.2801835825698173, + "grad_norm": 2.519432544708252, + "learning_rate": 9.569100166567143e-05, + "loss": 2.2818, + "step": 5170 + }, + { + "epoch": 0.2807255237353295, + "grad_norm": 1.1521965265274048, + "learning_rate": 9.565184961304577e-05, + "loss": 2.2977, + "step": 5180 + }, + { + "epoch": 0.2812674649008417, + "grad_norm": 1.3088760375976562, + "learning_rate": 9.561252952125808e-05, + "loss": 2.2886, + "step": 5190 + }, + { + "epoch": 0.2818094060663539, + "grad_norm": 0.45803970098495483, + "learning_rate": 9.557304155284256e-05, + "loss": 2.2879, + "step": 5200 + }, + { + "epoch": 0.28235134723186617, + "grad_norm": 0.4502812623977661, + "learning_rate": 9.553338587102732e-05, + "loss": 2.2774, + "step": 5210 + }, + { + "epoch": 0.2828932883973784, + "grad_norm": 0.3390588164329529, + "learning_rate": 9.549356263973376e-05, + "loss": 2.2749, + "step": 5220 + }, + { + "epoch": 0.2828932883973784, + "eval_loss": 2.7108614444732666, + "eval_runtime": 21.8696, + "eval_samples_per_second": 228.628, + "eval_steps_per_second": 1.235, + "step": 5220 + }, + { + "epoch": 0.2834352295628906, + "grad_norm": 0.31600069999694824, + "learning_rate": 9.545357202357584e-05, + "loss": 2.2686, + "step": 5230 + }, + { + "epoch": 0.2839771707284028, + "grad_norm": 0.6420486569404602, + "learning_rate": 9.541341418785944e-05, + "loss": 2.2669, + "step": 5240 + }, + { + "epoch": 0.28451911189391504, + "grad_norm": 0.6971787810325623, + "learning_rate": 9.537308929858167e-05, + "loss": 2.2553, + "step": 5250 + }, + { + "epoch": 0.28506105305942725, + "grad_norm": 0.35600554943084717, + "learning_rate": 9.533259752243015e-05, + "loss": 2.2607, + "step": 5260 + }, + { + "epoch": 0.28560299422493945, + "grad_norm": 0.4857500195503235, + "learning_rate": 9.529193902678236e-05, + "loss": 2.2604, + "step": 5270 + }, + { + "epoch": 0.28614493539045166, + "grad_norm": 0.41848480701446533, + "learning_rate": 9.525111397970495e-05, + "loss": 2.2627, + "step": 5280 + }, + { + "epoch": 0.28668687655596387, + "grad_norm": 0.39947494864463806, + "learning_rate": 9.521012254995298e-05, + "loss": 2.259, + "step": 5290 + }, + { + "epoch": 0.2872288177214761, + "grad_norm": 0.6807723045349121, + "learning_rate": 9.516896490696936e-05, + "loss": 2.2634, + "step": 5300 + }, + { + "epoch": 0.28760817653733467, + "eval_loss": 2.706552267074585, + "eval_runtime": 21.9925, + "eval_samples_per_second": 227.35, + "eval_steps_per_second": 1.228, + "step": 5307 + }, + { + "epoch": 0.28777075888698833, + "grad_norm": 0.7820791602134705, + "learning_rate": 9.512764122088394e-05, + "loss": 2.2567, + "step": 5310 + }, + { + "epoch": 0.28831270005250054, + "grad_norm": 0.6624732613563538, + "learning_rate": 9.508615166251305e-05, + "loss": 2.263, + "step": 5320 + }, + { + "epoch": 0.28885464121801274, + "grad_norm": 0.3670085072517395, + "learning_rate": 9.504449640335858e-05, + "loss": 2.2528, + "step": 5330 + }, + { + "epoch": 0.289396582383525, + "grad_norm": 0.36448076367378235, + "learning_rate": 9.500267561560746e-05, + "loss": 2.2564, + "step": 5340 + }, + { + "epoch": 0.2899385235490372, + "grad_norm": 0.3871496021747589, + "learning_rate": 9.496068947213073e-05, + "loss": 2.2561, + "step": 5350 + }, + { + "epoch": 0.2904804647145494, + "grad_norm": 0.5003486275672913, + "learning_rate": 9.491853814648305e-05, + "loss": 2.2539, + "step": 5360 + }, + { + "epoch": 0.2910224058800616, + "grad_norm": 1.1620166301727295, + "learning_rate": 9.487622181290183e-05, + "loss": 2.2517, + "step": 5370 + }, + { + "epoch": 0.2915643470455739, + "grad_norm": 0.725284218788147, + "learning_rate": 9.483374064630656e-05, + "loss": 2.2456, + "step": 5380 + }, + { + "epoch": 0.2921062882110861, + "grad_norm": 0.541685938835144, + "learning_rate": 9.479109482229812e-05, + "loss": 2.2553, + "step": 5390 + }, + { + "epoch": 0.29232306467729097, + "eval_loss": 2.7066264152526855, + "eval_runtime": 21.9975, + "eval_samples_per_second": 227.298, + "eval_steps_per_second": 1.227, + "step": 5394 + }, + { + "epoch": 0.2926482293765983, + "grad_norm": 0.40974506735801697, + "learning_rate": 9.474828451715798e-05, + "loss": 2.2516, + "step": 5400 + }, + { + "epoch": 0.2931901705421105, + "grad_norm": 0.5276529788970947, + "learning_rate": 9.470530990784752e-05, + "loss": 2.2554, + "step": 5410 + }, + { + "epoch": 0.29373211170762276, + "grad_norm": 0.9658313393592834, + "learning_rate": 9.466217117200735e-05, + "loss": 2.2501, + "step": 5420 + }, + { + "epoch": 0.29427405287313496, + "grad_norm": 0.6076865196228027, + "learning_rate": 9.461886848795642e-05, + "loss": 2.2438, + "step": 5430 + }, + { + "epoch": 0.29481599403864717, + "grad_norm": 0.5501825213432312, + "learning_rate": 9.457540203469142e-05, + "loss": 2.2419, + "step": 5440 + }, + { + "epoch": 0.29535793520415937, + "grad_norm": 0.41925522685050964, + "learning_rate": 9.453177199188603e-05, + "loss": 2.2496, + "step": 5450 + }, + { + "epoch": 0.29589987636967163, + "grad_norm": 0.4038546681404114, + "learning_rate": 9.448797853989013e-05, + "loss": 2.255, + "step": 5460 + }, + { + "epoch": 0.29644181753518384, + "grad_norm": 0.3127026855945587, + "learning_rate": 9.444402185972901e-05, + "loss": 2.2396, + "step": 5470 + }, + { + "epoch": 0.29698375870069604, + "grad_norm": 0.6538156867027283, + "learning_rate": 9.439990213310277e-05, + "loss": 2.2377, + "step": 5480 + }, + { + "epoch": 0.29703795281724726, + "eval_loss": 2.7067112922668457, + "eval_runtime": 21.9955, + "eval_samples_per_second": 227.32, + "eval_steps_per_second": 1.228, + "step": 5481 + }, + { + "epoch": 0.29752569986620825, + "grad_norm": 0.6304920315742493, + "learning_rate": 9.435561954238548e-05, + "loss": 2.2417, + "step": 5490 + }, + { + "epoch": 0.2980676410317205, + "grad_norm": 0.5496434569358826, + "learning_rate": 9.431117427062434e-05, + "loss": 2.2353, + "step": 5500 + }, + { + "epoch": 0.2986095821972327, + "grad_norm": 0.7714524865150452, + "learning_rate": 9.426656650153909e-05, + "loss": 2.2292, + "step": 5510 + }, + { + "epoch": 0.2991515233627449, + "grad_norm": 0.7220959663391113, + "learning_rate": 9.422179641952113e-05, + "loss": 2.2506, + "step": 5520 + }, + { + "epoch": 0.2996934645282571, + "grad_norm": 0.5006898045539856, + "learning_rate": 9.417686420963283e-05, + "loss": 2.2222, + "step": 5530 + }, + { + "epoch": 0.3002354056937694, + "grad_norm": 0.3837469220161438, + "learning_rate": 9.413177005760672e-05, + "loss": 2.2429, + "step": 5540 + }, + { + "epoch": 0.3007773468592816, + "grad_norm": 0.34782886505126953, + "learning_rate": 9.408651414984472e-05, + "loss": 2.2423, + "step": 5550 + }, + { + "epoch": 0.3013192880247938, + "grad_norm": 0.35922521352767944, + "learning_rate": 9.404109667341746e-05, + "loss": 2.2349, + "step": 5560 + }, + { + "epoch": 0.30175284095720356, + "eval_loss": 2.698547124862671, + "eval_runtime": 21.9987, + "eval_samples_per_second": 227.287, + "eval_steps_per_second": 1.227, + "step": 5568 + }, + { + "epoch": 0.301861229190306, + "grad_norm": 0.4572945237159729, + "learning_rate": 9.399551781606329e-05, + "loss": 2.2244, + "step": 5570 + }, + { + "epoch": 0.30240317035581826, + "grad_norm": 0.3920208513736725, + "learning_rate": 9.394977776618779e-05, + "loss": 2.2299, + "step": 5580 + }, + { + "epoch": 0.30294511152133047, + "grad_norm": 0.9155771136283875, + "learning_rate": 9.390387671286279e-05, + "loss": 2.2424, + "step": 5590 + }, + { + "epoch": 0.3034870526868427, + "grad_norm": 0.34311428666114807, + "learning_rate": 9.38578148458256e-05, + "loss": 2.2273, + "step": 5600 + }, + { + "epoch": 0.3040289938523549, + "grad_norm": 0.39582574367523193, + "learning_rate": 9.381159235547839e-05, + "loss": 2.2342, + "step": 5610 + }, + { + "epoch": 0.30457093501786714, + "grad_norm": 0.37435415387153625, + "learning_rate": 9.376520943288716e-05, + "loss": 2.2312, + "step": 5620 + }, + { + "epoch": 0.30511287618337934, + "grad_norm": 0.5571390986442566, + "learning_rate": 9.371866626978118e-05, + "loss": 2.2193, + "step": 5630 + }, + { + "epoch": 0.30565481734889155, + "grad_norm": 0.348417192697525, + "learning_rate": 9.367196305855199e-05, + "loss": 2.24, + "step": 5640 + }, + { + "epoch": 0.30619675851440376, + "grad_norm": 0.9660398364067078, + "learning_rate": 9.362509999225281e-05, + "loss": 2.2291, + "step": 5650 + }, + { + "epoch": 0.3064677290971599, + "eval_loss": 2.700347900390625, + "eval_runtime": 21.9973, + "eval_samples_per_second": 227.301, + "eval_steps_per_second": 1.227, + "step": 5655 + }, + { + "epoch": 0.306738699679916, + "grad_norm": 0.6863952875137329, + "learning_rate": 9.357807726459754e-05, + "loss": 2.2291, + "step": 5660 + }, + { + "epoch": 0.3072806408454282, + "grad_norm": 0.4206200838088989, + "learning_rate": 9.353089506996016e-05, + "loss": 2.2224, + "step": 5670 + }, + { + "epoch": 0.3078225820109404, + "grad_norm": 0.9796401262283325, + "learning_rate": 9.348355360337374e-05, + "loss": 2.2199, + "step": 5680 + }, + { + "epoch": 0.30836452317645263, + "grad_norm": 0.8994840979576111, + "learning_rate": 9.343605306052977e-05, + "loss": 2.2317, + "step": 5690 + }, + { + "epoch": 0.3089064643419649, + "grad_norm": 0.6637131571769714, + "learning_rate": 9.338839363777728e-05, + "loss": 2.2211, + "step": 5700 + }, + { + "epoch": 0.3094484055074771, + "grad_norm": 0.6918546557426453, + "learning_rate": 9.334057553212204e-05, + "loss": 2.2204, + "step": 5710 + }, + { + "epoch": 0.3099903466729893, + "grad_norm": 0.3313464820384979, + "learning_rate": 9.32925989412258e-05, + "loss": 2.2202, + "step": 5720 + }, + { + "epoch": 0.3105322878385015, + "grad_norm": 0.4739852249622345, + "learning_rate": 9.324446406340537e-05, + "loss": 2.2164, + "step": 5730 + }, + { + "epoch": 0.31107422900401377, + "grad_norm": 1.1123601198196411, + "learning_rate": 9.319617109763188e-05, + "loss": 2.2114, + "step": 5740 + }, + { + "epoch": 0.3111826172371162, + "eval_loss": 2.697807550430298, + "eval_runtime": 29.7475, + "eval_samples_per_second": 168.081, + "eval_steps_per_second": 0.908, + "step": 5742 + }, + { + "epoch": 0.311616170169526, + "grad_norm": 0.33748936653137207, + "learning_rate": 9.314772024352995e-05, + "loss": 2.2157, + "step": 5750 + }, + { + "epoch": 0.3121581113350382, + "grad_norm": 0.35050588846206665, + "learning_rate": 9.309911170137682e-05, + "loss": 2.2109, + "step": 5760 + }, + { + "epoch": 0.3127000525005504, + "grad_norm": 0.5154600739479065, + "learning_rate": 9.30503456721016e-05, + "loss": 2.2026, + "step": 5770 + }, + { + "epoch": 0.31324199366606265, + "grad_norm": 0.5823956727981567, + "learning_rate": 9.300142235728432e-05, + "loss": 2.2115, + "step": 5780 + }, + { + "epoch": 0.31378393483157485, + "grad_norm": 0.7447882890701294, + "learning_rate": 9.295234195915523e-05, + "loss": 2.2098, + "step": 5790 + }, + { + "epoch": 0.31432587599708706, + "grad_norm": 0.6827070116996765, + "learning_rate": 9.290310468059389e-05, + "loss": 2.2105, + "step": 5800 + }, + { + "epoch": 0.31486781716259926, + "grad_norm": 0.42079001665115356, + "learning_rate": 9.285371072512831e-05, + "loss": 2.2102, + "step": 5810 + }, + { + "epoch": 0.3154097583281115, + "grad_norm": 0.7562916874885559, + "learning_rate": 9.280416029693419e-05, + "loss": 2.2039, + "step": 5820 + }, + { + "epoch": 0.3158975053770725, + "eval_loss": 2.691206455230713, + "eval_runtime": 48.3032, + "eval_samples_per_second": 103.513, + "eval_steps_per_second": 0.559, + "step": 5829 + }, + { + "epoch": 0.3159516994936237, + "grad_norm": 0.6581352949142456, + "learning_rate": 9.275445360083398e-05, + "loss": 2.2065, + "step": 5830 + }, + { + "epoch": 0.31649364065913593, + "grad_norm": 0.8172852993011475, + "learning_rate": 9.270459084229612e-05, + "loss": 2.2078, + "step": 5840 + }, + { + "epoch": 0.31703558182464814, + "grad_norm": 0.8475619554519653, + "learning_rate": 9.265457222743414e-05, + "loss": 2.2049, + "step": 5850 + }, + { + "epoch": 0.3175775229901604, + "grad_norm": 0.37048637866973877, + "learning_rate": 9.260439796300582e-05, + "loss": 2.2131, + "step": 5860 + }, + { + "epoch": 0.3181194641556726, + "grad_norm": 0.4438531994819641, + "learning_rate": 9.255406825641233e-05, + "loss": 2.1972, + "step": 5870 + }, + { + "epoch": 0.3186614053211848, + "grad_norm": 0.7300620079040527, + "learning_rate": 9.250358331569737e-05, + "loss": 2.1957, + "step": 5880 + }, + { + "epoch": 0.319203346486697, + "grad_norm": 0.42140352725982666, + "learning_rate": 9.245294334954636e-05, + "loss": 2.2043, + "step": 5890 + }, + { + "epoch": 0.3197452876522093, + "grad_norm": 1.2300440073013306, + "learning_rate": 9.24021485672855e-05, + "loss": 2.2077, + "step": 5900 + }, + { + "epoch": 0.3202872288177215, + "grad_norm": 0.9198090434074402, + "learning_rate": 9.23511991788809e-05, + "loss": 2.1974, + "step": 5910 + }, + { + "epoch": 0.3206123935170288, + "eval_loss": 2.7107741832733154, + "eval_runtime": 52.6522, + "eval_samples_per_second": 94.963, + "eval_steps_per_second": 0.513, + "step": 5916 + }, + { + "epoch": 0.3208291699832337, + "grad_norm": 0.9036116600036621, + "learning_rate": 9.230009539493787e-05, + "loss": 2.2209, + "step": 5920 + }, + { + "epoch": 0.3213711111487459, + "grad_norm": 0.5668349266052246, + "learning_rate": 9.224883742669982e-05, + "loss": 2.2166, + "step": 5930 + }, + { + "epoch": 0.32191305231425815, + "grad_norm": 0.5860267281532288, + "learning_rate": 9.219742548604756e-05, + "loss": 2.1962, + "step": 5940 + }, + { + "epoch": 0.32245499347977036, + "grad_norm": 0.3655729591846466, + "learning_rate": 9.214585978549832e-05, + "loss": 2.2041, + "step": 5950 + }, + { + "epoch": 0.32299693464528256, + "grad_norm": 0.34334370493888855, + "learning_rate": 9.209414053820495e-05, + "loss": 2.198, + "step": 5960 + }, + { + "epoch": 0.32353887581079477, + "grad_norm": 0.39599186182022095, + "learning_rate": 9.2042267957955e-05, + "loss": 2.1826, + "step": 5970 + }, + { + "epoch": 0.32408081697630703, + "grad_norm": 0.46203556656837463, + "learning_rate": 9.199024225916982e-05, + "loss": 2.1947, + "step": 5980 + }, + { + "epoch": 0.32462275814181923, + "grad_norm": 0.43823203444480896, + "learning_rate": 9.193806365690371e-05, + "loss": 2.1897, + "step": 5990 + }, + { + "epoch": 0.32516469930733144, + "grad_norm": 0.3208545744419098, + "learning_rate": 9.1885732366843e-05, + "loss": 2.1952, + "step": 6000 + }, + { + "epoch": 0.00016258234965366573, + "eval_loss": 2.6887643337249756, + "eval_runtime": 22.0393, + "eval_samples_per_second": 226.867, + "eval_steps_per_second": 1.225, + "step": 6003 + }, + { + "epoch": 0.000541941165512219, + "grad_norm": 0.34268251061439514, + "learning_rate": 9.183324860530519e-05, + "loss": 2.1881, + "step": 6010 + }, + { + "epoch": 0.001083882331024438, + "grad_norm": 0.6927510499954224, + "learning_rate": 9.178061258923802e-05, + "loss": 2.1964, + "step": 6020 + }, + { + "epoch": 0.0016258234965366573, + "grad_norm": 0.7226220965385437, + "learning_rate": 9.172782453621862e-05, + "loss": 2.1923, + "step": 6030 + }, + { + "epoch": 0.002167764662048876, + "grad_norm": 0.6258693337440491, + "learning_rate": 9.167488466445255e-05, + "loss": 2.1977, + "step": 6040 + }, + { + "epoch": 0.0027097058275610954, + "grad_norm": 0.9858429431915283, + "learning_rate": 9.162179319277296e-05, + "loss": 2.1936, + "step": 6050 + }, + { + "epoch": 0.0032516469930733145, + "grad_norm": 0.6434946656227112, + "learning_rate": 9.156855034063963e-05, + "loss": 2.2008, + "step": 6060 + }, + { + "epoch": 0.0037935881585855337, + "grad_norm": 0.49262088537216187, + "learning_rate": 9.151515632813808e-05, + "loss": 2.1885, + "step": 6070 + }, + { + "epoch": 0.004335529324097752, + "grad_norm": 0.5512810945510864, + "learning_rate": 9.14616113759787e-05, + "loss": 2.1764, + "step": 6080 + }, + { + "epoch": 0.0048774704896099716, + "grad_norm": 0.38034671545028687, + "learning_rate": 9.140791570549583e-05, + "loss": 2.1774, + "step": 6090 + }, + { + "epoch": 0.0048774704896099716, + "eval_loss": 2.6898508071899414, + "eval_runtime": 21.9661, + "eval_samples_per_second": 227.624, + "eval_steps_per_second": 1.229, + "step": 6090 + }, + { + "epoch": 0.005419411655122191, + "grad_norm": 0.9372425079345703, + "learning_rate": 9.135406953864675e-05, + "loss": 2.1785, + "step": 6100 + }, + { + "epoch": 0.00596135282063441, + "grad_norm": 0.36292022466659546, + "learning_rate": 9.130007309801089e-05, + "loss": 2.1849, + "step": 6110 + }, + { + "epoch": 0.006503293986146629, + "grad_norm": 0.47751471400260925, + "learning_rate": 9.12459266067888e-05, + "loss": 2.1846, + "step": 6120 + }, + { + "epoch": 0.007045235151658848, + "grad_norm": 0.5181723237037659, + "learning_rate": 9.119163028880136e-05, + "loss": 2.1772, + "step": 6130 + }, + { + "epoch": 0.007587176317171067, + "grad_norm": 0.641890823841095, + "learning_rate": 9.113718436848873e-05, + "loss": 2.1719, + "step": 6140 + }, + { + "epoch": 0.008129117482683286, + "grad_norm": 0.6258209347724915, + "learning_rate": 9.108258907090944e-05, + "loss": 2.1779, + "step": 6150 + }, + { + "epoch": 0.008671058648195505, + "grad_norm": 0.41677314043045044, + "learning_rate": 9.102784462173954e-05, + "loss": 2.1814, + "step": 6160 + }, + { + "epoch": 0.009212999813707724, + "grad_norm": 0.4094899594783783, + "learning_rate": 9.097295124727161e-05, + "loss": 2.169, + "step": 6170 + }, + { + "epoch": 0.009592358629566278, + "eval_loss": 2.683100461959839, + "eval_runtime": 22.9235, + "eval_samples_per_second": 218.117, + "eval_steps_per_second": 1.178, + "step": 6177 + }, + { + "epoch": 0.009754940979219943, + "grad_norm": 0.6764087080955505, + "learning_rate": 9.091790917441381e-05, + "loss": 2.1747, + "step": 6180 + }, + { + "epoch": 0.010296882144732162, + "grad_norm": 0.6722964644432068, + "learning_rate": 9.086271863068893e-05, + "loss": 2.1805, + "step": 6190 + }, + { + "epoch": 0.010838823310244381, + "grad_norm": 0.5901824235916138, + "learning_rate": 9.080737984423358e-05, + "loss": 2.1904, + "step": 6200 + }, + { + "epoch": 0.0113807644757566, + "grad_norm": 0.4733932614326477, + "learning_rate": 9.075189304379703e-05, + "loss": 2.1834, + "step": 6210 + }, + { + "epoch": 0.01192270564126882, + "grad_norm": 0.6376433968544006, + "learning_rate": 9.06962584587405e-05, + "loss": 2.1765, + "step": 6220 + }, + { + "epoch": 0.012464646806781039, + "grad_norm": 0.559138298034668, + "learning_rate": 9.0640476319036e-05, + "loss": 2.1715, + "step": 6230 + }, + { + "epoch": 0.013006587972293258, + "grad_norm": 0.7046493887901306, + "learning_rate": 9.05845468552655e-05, + "loss": 2.1645, + "step": 6240 + }, + { + "epoch": 0.013548529137805477, + "grad_norm": 1.1869386434555054, + "learning_rate": 9.052847029861999e-05, + "loss": 2.1667, + "step": 6250 + }, + { + "epoch": 0.014090470303317696, + "grad_norm": 0.5609129071235657, + "learning_rate": 9.047224688089845e-05, + "loss": 2.1764, + "step": 6260 + }, + { + "epoch": 0.014307246769522584, + "eval_loss": 2.6828575134277344, + "eval_runtime": 21.9786, + "eval_samples_per_second": 227.494, + "eval_steps_per_second": 1.228, + "step": 6264 + }, + { + "epoch": 0.014632411468829916, + "grad_norm": 0.3735623359680176, + "learning_rate": 9.041587683450695e-05, + "loss": 2.159, + "step": 6270 + }, + { + "epoch": 0.015174352634342135, + "grad_norm": 0.5112239718437195, + "learning_rate": 9.035936039245761e-05, + "loss": 2.1625, + "step": 6280 + }, + { + "epoch": 0.015716293799854352, + "grad_norm": 0.7340002059936523, + "learning_rate": 9.030269778836777e-05, + "loss": 2.1693, + "step": 6290 + }, + { + "epoch": 0.01625823496536657, + "grad_norm": 0.5309348106384277, + "learning_rate": 9.024588925645889e-05, + "loss": 2.1689, + "step": 6300 + }, + { + "epoch": 0.01680017613087879, + "grad_norm": 0.33672529458999634, + "learning_rate": 9.018893503155569e-05, + "loss": 2.1676, + "step": 6310 + }, + { + "epoch": 0.01734211729639101, + "grad_norm": 0.43856242299079895, + "learning_rate": 9.013183534908508e-05, + "loss": 2.1618, + "step": 6320 + }, + { + "epoch": 0.01788405846190323, + "grad_norm": 0.385219931602478, + "learning_rate": 9.007459044507528e-05, + "loss": 2.1699, + "step": 6330 + }, + { + "epoch": 0.018425999627415448, + "grad_norm": 0.5022485256195068, + "learning_rate": 9.001720055615476e-05, + "loss": 2.1621, + "step": 6340 + }, + { + "epoch": 0.018967940792927667, + "grad_norm": 0.38395243883132935, + "learning_rate": 8.995966591955132e-05, + "loss": 2.1586, + "step": 6350 + }, + { + "epoch": 0.01902213490947889, + "eval_loss": 2.6814119815826416, + "eval_runtime": 23.8855, + "eval_samples_per_second": 209.332, + "eval_steps_per_second": 1.13, + "step": 6351 + }, + { + "epoch": 0.019509881958439886, + "grad_norm": 0.3883703649044037, + "learning_rate": 8.990198677309109e-05, + "loss": 2.1581, + "step": 6360 + }, + { + "epoch": 0.020051823123952105, + "grad_norm": 0.3477310538291931, + "learning_rate": 8.984416335519754e-05, + "loss": 2.1587, + "step": 6370 + }, + { + "epoch": 0.020593764289464325, + "grad_norm": 0.35369783639907837, + "learning_rate": 8.978619590489055e-05, + "loss": 2.164, + "step": 6380 + }, + { + "epoch": 0.021135705454976544, + "grad_norm": 0.6048836708068848, + "learning_rate": 8.972808466178529e-05, + "loss": 2.1553, + "step": 6390 + }, + { + "epoch": 0.021677646620488763, + "grad_norm": 0.5932040214538574, + "learning_rate": 8.966982986609141e-05, + "loss": 2.1536, + "step": 6400 + }, + { + "epoch": 0.022219587786000982, + "grad_norm": 0.6168770790100098, + "learning_rate": 8.961143175861187e-05, + "loss": 2.1562, + "step": 6410 + }, + { + "epoch": 0.0227615289515132, + "grad_norm": 0.41919735074043274, + "learning_rate": 8.955289058074207e-05, + "loss": 2.1598, + "step": 6420 + }, + { + "epoch": 0.02330347011702542, + "grad_norm": 0.6106112003326416, + "learning_rate": 8.94942065744688e-05, + "loss": 2.1558, + "step": 6430 + }, + { + "epoch": 0.023737023049435195, + "eval_loss": 2.683075189590454, + "eval_runtime": 21.975, + "eval_samples_per_second": 227.532, + "eval_steps_per_second": 1.229, + "step": 6438 + }, + { + "epoch": 0.02384541128253764, + "grad_norm": 0.7993687987327576, + "learning_rate": 8.943537998236922e-05, + "loss": 2.1442, + "step": 6440 + }, + { + "epoch": 0.02438735244804986, + "grad_norm": 0.6228429079055786, + "learning_rate": 8.937641104760994e-05, + "loss": 2.1479, + "step": 6450 + }, + { + "epoch": 0.024929293613562078, + "grad_norm": 0.47318562865257263, + "learning_rate": 8.931730001394591e-05, + "loss": 2.1512, + "step": 6460 + }, + { + "epoch": 0.025471234779074297, + "grad_norm": 0.3833015263080597, + "learning_rate": 8.92580471257195e-05, + "loss": 2.1565, + "step": 6470 + }, + { + "epoch": 0.026013175944586516, + "grad_norm": 0.357759565114975, + "learning_rate": 8.919865262785941e-05, + "loss": 2.15, + "step": 6480 + }, + { + "epoch": 0.026555117110098735, + "grad_norm": 0.478135347366333, + "learning_rate": 8.913911676587976e-05, + "loss": 2.1399, + "step": 6490 + }, + { + "epoch": 0.027097058275610954, + "grad_norm": 0.3849744498729706, + "learning_rate": 8.907943978587896e-05, + "loss": 2.1505, + "step": 6500 + }, + { + "epoch": 0.027638999441123174, + "grad_norm": 0.7612195611000061, + "learning_rate": 8.901962193453875e-05, + "loss": 2.1517, + "step": 6510 + }, + { + "epoch": 0.028180940606635393, + "grad_norm": 0.39389902353286743, + "learning_rate": 8.895966345912322e-05, + "loss": 2.1512, + "step": 6520 + }, + { + "epoch": 0.028451911189391502, + "eval_loss": 2.684783935546875, + "eval_runtime": 21.9732, + "eval_samples_per_second": 227.55, + "eval_steps_per_second": 1.229, + "step": 6525 + }, + { + "epoch": 0.028722881772147612, + "grad_norm": 0.3195110559463501, + "learning_rate": 8.889956460747773e-05, + "loss": 2.1501, + "step": 6530 + }, + { + "epoch": 0.02926482293765983, + "grad_norm": 0.34488433599472046, + "learning_rate": 8.883932562802787e-05, + "loss": 2.1392, + "step": 6540 + }, + { + "epoch": 0.02980676410317205, + "grad_norm": 0.49418726563453674, + "learning_rate": 8.877894676977848e-05, + "loss": 2.148, + "step": 6550 + }, + { + "epoch": 0.03034870526868427, + "grad_norm": 0.48197141289711, + "learning_rate": 8.871842828231265e-05, + "loss": 2.1441, + "step": 6560 + }, + { + "epoch": 0.03089064643419649, + "grad_norm": 0.5270874500274658, + "learning_rate": 8.865777041579057e-05, + "loss": 2.1395, + "step": 6570 + }, + { + "epoch": 0.031432587599708704, + "grad_norm": 0.42525890469551086, + "learning_rate": 8.859697342094864e-05, + "loss": 2.1438, + "step": 6580 + }, + { + "epoch": 0.03197452876522092, + "grad_norm": 0.3241468071937561, + "learning_rate": 8.85360375490983e-05, + "loss": 2.1313, + "step": 6590 + }, + { + "epoch": 0.03251646993073314, + "grad_norm": 0.4242691695690155, + "learning_rate": 8.84749630521251e-05, + "loss": 2.1338, + "step": 6600 + }, + { + "epoch": 0.03305841109624536, + "grad_norm": 0.3246161937713623, + "learning_rate": 8.84137501824876e-05, + "loss": 2.1438, + "step": 6610 + }, + { + "epoch": 0.03316679932934781, + "eval_loss": 2.6783671379089355, + "eval_runtime": 21.9713, + "eval_samples_per_second": 227.57, + "eval_steps_per_second": 1.229, + "step": 6612 + }, + { + "epoch": 0.03360035226175758, + "grad_norm": 0.37965840101242065, + "learning_rate": 8.835239919321632e-05, + "loss": 2.1324, + "step": 6620 + }, + { + "epoch": 0.0341422934272698, + "grad_norm": 0.6797935962677002, + "learning_rate": 8.829091033791274e-05, + "loss": 2.1389, + "step": 6630 + }, + { + "epoch": 0.03468423459278202, + "grad_norm": 0.7013537287712097, + "learning_rate": 8.822928387074821e-05, + "loss": 2.1369, + "step": 6640 + }, + { + "epoch": 0.03522617575829424, + "grad_norm": 0.344716340303421, + "learning_rate": 8.816752004646294e-05, + "loss": 2.1294, + "step": 6650 + }, + { + "epoch": 0.03576811692380646, + "grad_norm": 0.33730098605155945, + "learning_rate": 8.810561912036489e-05, + "loss": 2.148, + "step": 6660 + }, + { + "epoch": 0.03631005808931868, + "grad_norm": 0.41755759716033936, + "learning_rate": 8.804358134832874e-05, + "loss": 2.133, + "step": 6670 + }, + { + "epoch": 0.036851999254830896, + "grad_norm": 1.0376006364822388, + "learning_rate": 8.798140698679489e-05, + "loss": 2.1425, + "step": 6680 + }, + { + "epoch": 0.037393940420343115, + "grad_norm": 0.673877477645874, + "learning_rate": 8.791909629276827e-05, + "loss": 2.1426, + "step": 6690 + }, + { + "epoch": 0.037881687469304114, + "eval_loss": 2.6814136505126953, + "eval_runtime": 21.9797, + "eval_samples_per_second": 227.482, + "eval_steps_per_second": 1.228, + "step": 6699 + }, + { + "epoch": 0.037935881585855334, + "grad_norm": 0.5003832578659058, + "learning_rate": 8.785664952381746e-05, + "loss": 2.1438, + "step": 6700 + }, + { + "epoch": 0.03847782275136755, + "grad_norm": 0.4365040957927704, + "learning_rate": 8.77940669380734e-05, + "loss": 2.1297, + "step": 6710 + }, + { + "epoch": 0.03901976391687977, + "grad_norm": 0.3096308708190918, + "learning_rate": 8.773134879422856e-05, + "loss": 2.134, + "step": 6720 + }, + { + "epoch": 0.03956170508239199, + "grad_norm": 0.9517742395401001, + "learning_rate": 8.766849535153568e-05, + "loss": 2.1274, + "step": 6730 + }, + { + "epoch": 0.04010364624790421, + "grad_norm": 0.4467058777809143, + "learning_rate": 8.760550686980681e-05, + "loss": 2.1337, + "step": 6740 + }, + { + "epoch": 0.04064558741341643, + "grad_norm": 0.619213879108429, + "learning_rate": 8.754238360941218e-05, + "loss": 2.1252, + "step": 6750 + }, + { + "epoch": 0.04118752857892865, + "grad_norm": 0.5627840161323547, + "learning_rate": 8.747912583127913e-05, + "loss": 2.1229, + "step": 6760 + }, + { + "epoch": 0.04172946974444087, + "grad_norm": 0.34763067960739136, + "learning_rate": 8.741573379689109e-05, + "loss": 2.1238, + "step": 6770 + }, + { + "epoch": 0.04227141090995309, + "grad_norm": 0.45356321334838867, + "learning_rate": 8.735220776828641e-05, + "loss": 2.1258, + "step": 6780 + }, + { + "epoch": 0.04259657560926042, + "eval_loss": 2.6720757484436035, + "eval_runtime": 25.5872, + "eval_samples_per_second": 195.41, + "eval_steps_per_second": 1.055, + "step": 6786 + }, + { + "epoch": 0.04281335207546531, + "grad_norm": 0.3579098880290985, + "learning_rate": 8.728854800805733e-05, + "loss": 2.122, + "step": 6790 + }, + { + "epoch": 0.043355293240977526, + "grad_norm": 0.49912065267562866, + "learning_rate": 8.722475477934894e-05, + "loss": 2.1271, + "step": 6800 + }, + { + "epoch": 0.043897234406489745, + "grad_norm": 0.5217333436012268, + "learning_rate": 8.716082834585797e-05, + "loss": 2.1158, + "step": 6810 + }, + { + "epoch": 0.044439175572001964, + "grad_norm": 0.6708068251609802, + "learning_rate": 8.709676897183176e-05, + "loss": 2.1309, + "step": 6820 + }, + { + "epoch": 0.04498111673751418, + "grad_norm": 0.4707426130771637, + "learning_rate": 8.703257692206724e-05, + "loss": 2.1155, + "step": 6830 + }, + { + "epoch": 0.0455230579030264, + "grad_norm": 0.5237321257591248, + "learning_rate": 8.696825246190972e-05, + "loss": 2.1245, + "step": 6840 + }, + { + "epoch": 0.04606499906853862, + "grad_norm": 0.6378241181373596, + "learning_rate": 8.690379585725186e-05, + "loss": 2.118, + "step": 6850 + }, + { + "epoch": 0.04660694023405084, + "grad_norm": 0.33660611510276794, + "learning_rate": 8.683920737453254e-05, + "loss": 2.128, + "step": 6860 + }, + { + "epoch": 0.04714888139956306, + "grad_norm": 0.2975222170352936, + "learning_rate": 8.677448728073583e-05, + "loss": 2.1156, + "step": 6870 + }, + { + "epoch": 0.04731146374921673, + "eval_loss": 2.673781394958496, + "eval_runtime": 22.7337, + "eval_samples_per_second": 219.938, + "eval_steps_per_second": 1.188, + "step": 6873 + }, + { + "epoch": 0.04769082256507528, + "grad_norm": 0.34510669112205505, + "learning_rate": 8.670963584338975e-05, + "loss": 2.1145, + "step": 6880 + }, + { + "epoch": 0.0482327637305875, + "grad_norm": 0.3134111166000366, + "learning_rate": 8.664465333056526e-05, + "loss": 2.1195, + "step": 6890 + }, + { + "epoch": 0.04877470489609972, + "grad_norm": 0.37668564915657043, + "learning_rate": 8.657954001087521e-05, + "loss": 2.1195, + "step": 6900 + }, + { + "epoch": 0.049316646061611936, + "grad_norm": 0.8455840945243835, + "learning_rate": 8.651429615347309e-05, + "loss": 2.1116, + "step": 6910 + }, + { + "epoch": 0.049858587227124156, + "grad_norm": 1.0009831190109253, + "learning_rate": 8.644892202805195e-05, + "loss": 2.1119, + "step": 6920 + }, + { + "epoch": 0.050400528392636375, + "grad_norm": 0.9167200326919556, + "learning_rate": 8.638341790484341e-05, + "loss": 2.1142, + "step": 6930 + }, + { + "epoch": 0.050942469558148594, + "grad_norm": 0.6210402250289917, + "learning_rate": 8.631778405461638e-05, + "loss": 2.1147, + "step": 6940 + }, + { + "epoch": 0.05148441072366081, + "grad_norm": 0.6486452221870422, + "learning_rate": 8.625202074867607e-05, + "loss": 2.107, + "step": 6950 + }, + { + "epoch": 0.05202635188917303, + "grad_norm": 0.2966337502002716, + "learning_rate": 8.618612825886272e-05, + "loss": 2.0978, + "step": 6960 + }, + { + "epoch": 0.05202635188917303, + "eval_loss": 2.667224168777466, + "eval_runtime": 21.9683, + "eval_samples_per_second": 227.601, + "eval_steps_per_second": 1.229, + "step": 6960 + }, + { + "epoch": 0.05256829305468525, + "grad_norm": 1.5529571771621704, + "learning_rate": 8.612010685755066e-05, + "loss": 2.1092, + "step": 6970 + }, + { + "epoch": 0.05311023422019747, + "grad_norm": 0.6471717357635498, + "learning_rate": 8.605395681764706e-05, + "loss": 2.1193, + "step": 6980 + }, + { + "epoch": 0.05365217538570969, + "grad_norm": 0.5466669797897339, + "learning_rate": 8.598767841259078e-05, + "loss": 2.1173, + "step": 6990 + }, + { + "epoch": 0.05419411655122191, + "grad_norm": 0.3719353973865509, + "learning_rate": 8.592127191635138e-05, + "loss": 2.1186, + "step": 7000 + }, + { + "epoch": 0.05473605771673413, + "grad_norm": 0.6376820206642151, + "learning_rate": 8.585473760342786e-05, + "loss": 2.1094, + "step": 7010 + }, + { + "epoch": 0.05527799888224635, + "grad_norm": 0.36836645007133484, + "learning_rate": 8.578807574884756e-05, + "loss": 2.1028, + "step": 7020 + }, + { + "epoch": 0.055819940047758566, + "grad_norm": 0.318914532661438, + "learning_rate": 8.572128662816498e-05, + "loss": 2.107, + "step": 7030 + }, + { + "epoch": 0.056361881213270786, + "grad_norm": 0.5992175340652466, + "learning_rate": 8.56543705174608e-05, + "loss": 2.1014, + "step": 7040 + }, + { + "epoch": 0.056741240029129336, + "eval_loss": 2.667701244354248, + "eval_runtime": 21.977, + "eval_samples_per_second": 227.51, + "eval_steps_per_second": 1.229, + "step": 7047 + }, + { + "epoch": 0.056903822378783005, + "grad_norm": 0.3516386151313782, + "learning_rate": 8.558732769334055e-05, + "loss": 2.1166, + "step": 7050 + }, + { + "epoch": 0.057445763544295224, + "grad_norm": 1.2428702116012573, + "learning_rate": 8.552015843293358e-05, + "loss": 2.1083, + "step": 7060 + }, + { + "epoch": 0.05798770470980744, + "grad_norm": 0.812322735786438, + "learning_rate": 8.545286301389183e-05, + "loss": 2.1043, + "step": 7070 + }, + { + "epoch": 0.05852964587531966, + "grad_norm": 0.5382115840911865, + "learning_rate": 8.538544171438879e-05, + "loss": 2.0998, + "step": 7080 + }, + { + "epoch": 0.05907158704083188, + "grad_norm": 0.7883318662643433, + "learning_rate": 8.531789481311824e-05, + "loss": 2.1118, + "step": 7090 + }, + { + "epoch": 0.0596135282063441, + "grad_norm": 0.3522588312625885, + "learning_rate": 8.525022258929319e-05, + "loss": 2.1094, + "step": 7100 + }, + { + "epoch": 0.06015546937185632, + "grad_norm": 0.5178406238555908, + "learning_rate": 8.518242532264468e-05, + "loss": 2.0969, + "step": 7110 + }, + { + "epoch": 0.06069741053736854, + "grad_norm": 0.6928080916404724, + "learning_rate": 8.511450329342061e-05, + "loss": 2.0937, + "step": 7120 + }, + { + "epoch": 0.06123935170288076, + "grad_norm": 0.33726730942726135, + "learning_rate": 8.504645678238462e-05, + "loss": 2.0893, + "step": 7130 + }, + { + "epoch": 0.06145612816908565, + "eval_loss": 2.6699295043945312, + "eval_runtime": 21.9715, + "eval_samples_per_second": 227.567, + "eval_steps_per_second": 1.229, + "step": 7134 + }, + { + "epoch": 0.06178129286839298, + "grad_norm": 0.5048802495002747, + "learning_rate": 8.497828607081488e-05, + "loss": 2.1111, + "step": 7140 + }, + { + "epoch": 0.062323234033905196, + "grad_norm": 0.3473956286907196, + "learning_rate": 8.490999144050299e-05, + "loss": 2.1016, + "step": 7150 + }, + { + "epoch": 0.06286517519941741, + "grad_norm": 0.3585287928581238, + "learning_rate": 8.484157317375276e-05, + "loss": 2.091, + "step": 7160 + }, + { + "epoch": 0.06340711636492963, + "grad_norm": 0.929985761642456, + "learning_rate": 8.47730315533791e-05, + "loss": 2.103, + "step": 7170 + }, + { + "epoch": 0.06394905753044185, + "grad_norm": 0.5229865312576294, + "learning_rate": 8.470436686270678e-05, + "loss": 2.09, + "step": 7180 + }, + { + "epoch": 0.06449099869595407, + "grad_norm": 0.6161647439002991, + "learning_rate": 8.463557938556928e-05, + "loss": 2.0994, + "step": 7190 + }, + { + "epoch": 0.06503293986146629, + "grad_norm": 0.4682093858718872, + "learning_rate": 8.456666940630772e-05, + "loss": 2.094, + "step": 7200 + }, + { + "epoch": 0.0655748810269785, + "grad_norm": 0.35867783427238464, + "learning_rate": 8.449763720976947e-05, + "loss": 2.0879, + "step": 7210 + }, + { + "epoch": 0.06611682219249072, + "grad_norm": 0.44877171516418457, + "learning_rate": 8.442848308130723e-05, + "loss": 2.0922, + "step": 7220 + }, + { + "epoch": 0.06617101630904194, + "eval_loss": 2.6712543964385986, + "eval_runtime": 21.9738, + "eval_samples_per_second": 227.544, + "eval_steps_per_second": 1.229, + "step": 7221 + }, + { + "epoch": 0.06665876335800294, + "grad_norm": 0.4711715877056122, + "learning_rate": 8.435920730677762e-05, + "loss": 2.0889, + "step": 7230 + }, + { + "epoch": 0.06720070452351516, + "grad_norm": 0.5644111037254333, + "learning_rate": 8.428981017254012e-05, + "loss": 2.0885, + "step": 7240 + }, + { + "epoch": 0.06774264568902738, + "grad_norm": 0.6288381814956665, + "learning_rate": 8.42202919654559e-05, + "loss": 2.0922, + "step": 7250 + }, + { + "epoch": 0.0682845868545396, + "grad_norm": 0.39220157265663147, + "learning_rate": 8.41506529728866e-05, + "loss": 2.0756, + "step": 7260 + }, + { + "epoch": 0.06882652802005182, + "grad_norm": 0.4057351052761078, + "learning_rate": 8.408089348269307e-05, + "loss": 2.0918, + "step": 7270 + }, + { + "epoch": 0.06936846918556404, + "grad_norm": 0.4261212944984436, + "learning_rate": 8.401101378323434e-05, + "loss": 2.0806, + "step": 7280 + }, + { + "epoch": 0.06991041035107626, + "grad_norm": 0.6397058367729187, + "learning_rate": 8.394101416336627e-05, + "loss": 2.0885, + "step": 7290 + }, + { + "epoch": 0.07045235151658848, + "grad_norm": 0.7682284712791443, + "learning_rate": 8.387089491244048e-05, + "loss": 2.0817, + "step": 7300 + }, + { + "epoch": 0.07088590444899825, + "eval_loss": 2.657864809036255, + "eval_runtime": 21.9777, + "eval_samples_per_second": 227.503, + "eval_steps_per_second": 1.229, + "step": 7308 + }, + { + "epoch": 0.0709942926821007, + "grad_norm": 0.4054642617702484, + "learning_rate": 8.380065632030305e-05, + "loss": 2.093, + "step": 7310 + }, + { + "epoch": 0.07153623384761292, + "grad_norm": 0.42616334557533264, + "learning_rate": 8.37302986772934e-05, + "loss": 2.0864, + "step": 7320 + }, + { + "epoch": 0.07207817501312513, + "grad_norm": 0.5650444030761719, + "learning_rate": 8.365982227424306e-05, + "loss": 2.0884, + "step": 7330 + }, + { + "epoch": 0.07262011617863735, + "grad_norm": 0.9685772061347961, + "learning_rate": 8.358922740247447e-05, + "loss": 2.0863, + "step": 7340 + }, + { + "epoch": 0.07316205734414957, + "grad_norm": 0.8171983957290649, + "learning_rate": 8.351851435379974e-05, + "loss": 2.0896, + "step": 7350 + }, + { + "epoch": 0.07370399850966179, + "grad_norm": 0.6826300024986267, + "learning_rate": 8.34476834205195e-05, + "loss": 2.0914, + "step": 7360 + }, + { + "epoch": 0.07424593967517401, + "grad_norm": 0.8995702266693115, + "learning_rate": 8.337673489542172e-05, + "loss": 2.0719, + "step": 7370 + }, + { + "epoch": 0.07478788084068623, + "grad_norm": 0.3203958570957184, + "learning_rate": 8.330566907178038e-05, + "loss": 2.0861, + "step": 7380 + }, + { + "epoch": 0.07532982200619845, + "grad_norm": 0.3850855827331543, + "learning_rate": 8.323448624335435e-05, + "loss": 2.0916, + "step": 7390 + }, + { + "epoch": 0.07560079258895457, + "eval_loss": 2.671057939529419, + "eval_runtime": 21.9786, + "eval_samples_per_second": 227.494, + "eval_steps_per_second": 1.228, + "step": 7395 + }, + { + "epoch": 0.07587176317171067, + "grad_norm": 1.0707188844680786, + "learning_rate": 8.316318670438614e-05, + "loss": 2.0833, + "step": 7400 + }, + { + "epoch": 0.07641370433722289, + "grad_norm": 0.36094823479652405, + "learning_rate": 8.309177074960073e-05, + "loss": 2.087, + "step": 7410 + }, + { + "epoch": 0.0769556455027351, + "grad_norm": 0.34225502610206604, + "learning_rate": 8.30202386742043e-05, + "loss": 2.0898, + "step": 7420 + }, + { + "epoch": 0.07749758666824733, + "grad_norm": 0.3515205681324005, + "learning_rate": 8.294859077388301e-05, + "loss": 2.0787, + "step": 7430 + }, + { + "epoch": 0.07803952783375954, + "grad_norm": 0.4942980408668518, + "learning_rate": 8.287682734480182e-05, + "loss": 2.0727, + "step": 7440 + }, + { + "epoch": 0.07858146899927176, + "grad_norm": 0.5919073224067688, + "learning_rate": 8.280494868360325e-05, + "loss": 2.085, + "step": 7450 + }, + { + "epoch": 0.07912341016478398, + "grad_norm": 0.5980909466743469, + "learning_rate": 8.27329550874061e-05, + "loss": 2.0768, + "step": 7460 + }, + { + "epoch": 0.0796653513302962, + "grad_norm": 0.4768329858779907, + "learning_rate": 8.266084685380434e-05, + "loss": 2.0824, + "step": 7470 + }, + { + "epoch": 0.08020729249580842, + "grad_norm": 0.6061816215515137, + "learning_rate": 8.258862428086572e-05, + "loss": 2.0784, + "step": 7480 + }, + { + "epoch": 0.08031568072891086, + "eval_loss": 2.6704938411712646, + "eval_runtime": 21.9746, + "eval_samples_per_second": 227.536, + "eval_steps_per_second": 1.229, + "step": 7482 + }, + { + "epoch": 0.08074923366132064, + "grad_norm": 0.4925580322742462, + "learning_rate": 8.251628766713068e-05, + "loss": 2.0779, + "step": 7490 + }, + { + "epoch": 0.08129117482683286, + "grad_norm": 0.38380166888237, + "learning_rate": 8.244383731161109e-05, + "loss": 2.0689, + "step": 7500 + }, + { + "epoch": 0.08183311599234508, + "grad_norm": 0.7923119068145752, + "learning_rate": 8.237127351378889e-05, + "loss": 2.0795, + "step": 7510 + }, + { + "epoch": 0.0823750571578573, + "grad_norm": 0.6250379085540771, + "learning_rate": 8.229859657361504e-05, + "loss": 2.0803, + "step": 7520 + }, + { + "epoch": 0.08291699832336952, + "grad_norm": 0.32212239503860474, + "learning_rate": 8.222580679150813e-05, + "loss": 2.0658, + "step": 7530 + }, + { + "epoch": 0.08345893948888174, + "grad_norm": 0.35104089975357056, + "learning_rate": 8.215290446835322e-05, + "loss": 2.0689, + "step": 7540 + }, + { + "epoch": 0.08400088065439396, + "grad_norm": 0.6346544623374939, + "learning_rate": 8.207988990550055e-05, + "loss": 2.0708, + "step": 7550 + }, + { + "epoch": 0.08454282181990617, + "grad_norm": 0.3987348675727844, + "learning_rate": 8.200676340476437e-05, + "loss": 2.0642, + "step": 7560 + }, + { + "epoch": 0.08503056886886717, + "eval_loss": 2.6780905723571777, + "eval_runtime": 21.979, + "eval_samples_per_second": 227.49, + "eval_steps_per_second": 1.228, + "step": 7569 + }, + { + "epoch": 0.0850847629854184, + "grad_norm": 0.9453224539756775, + "learning_rate": 8.193352526842159e-05, + "loss": 2.0647, + "step": 7570 + }, + { + "epoch": 0.08562670415093061, + "grad_norm": 0.3680925667285919, + "learning_rate": 8.186017579921055e-05, + "loss": 2.0611, + "step": 7580 + }, + { + "epoch": 0.08616864531644283, + "grad_norm": 0.8765574097633362, + "learning_rate": 8.178671530032988e-05, + "loss": 2.0722, + "step": 7590 + }, + { + "epoch": 0.08671058648195505, + "grad_norm": 0.674678385257721, + "learning_rate": 8.171314407543708e-05, + "loss": 2.0738, + "step": 7600 + }, + { + "epoch": 0.08725252764746727, + "grad_norm": 0.582229495048523, + "learning_rate": 8.163946242864744e-05, + "loss": 2.0725, + "step": 7610 + }, + { + "epoch": 0.08779446881297949, + "grad_norm": 0.4998132288455963, + "learning_rate": 8.15656706645326e-05, + "loss": 2.0749, + "step": 7620 + }, + { + "epoch": 0.08833640997849171, + "grad_norm": 0.38911086320877075, + "learning_rate": 8.149176908811947e-05, + "loss": 2.0665, + "step": 7630 + }, + { + "epoch": 0.08887835114400393, + "grad_norm": 0.7126851081848145, + "learning_rate": 8.141775800488877e-05, + "loss": 2.0599, + "step": 7640 + }, + { + "epoch": 0.08942029230951615, + "grad_norm": 0.6182229518890381, + "learning_rate": 8.134363772077399e-05, + "loss": 2.0657, + "step": 7650 + }, + { + "epoch": 0.08974545700882348, + "eval_loss": 2.663647174835205, + "eval_runtime": 21.9735, + "eval_samples_per_second": 227.547, + "eval_steps_per_second": 1.229, + "step": 7656 + }, + { + "epoch": 0.08996223347502837, + "grad_norm": 0.5466020107269287, + "learning_rate": 8.126940854215997e-05, + "loss": 2.0764, + "step": 7660 + }, + { + "epoch": 0.09050417464054059, + "grad_norm": 0.7600970268249512, + "learning_rate": 8.119507077588165e-05, + "loss": 2.0711, + "step": 7670 + }, + { + "epoch": 0.0910461158060528, + "grad_norm": 0.3224121034145355, + "learning_rate": 8.11206247292229e-05, + "loss": 2.0595, + "step": 7680 + }, + { + "epoch": 0.09158805697156502, + "grad_norm": 0.3413337171077728, + "learning_rate": 8.10460707099151e-05, + "loss": 2.0586, + "step": 7690 + }, + { + "epoch": 0.09212999813707724, + "grad_norm": 0.337230384349823, + "learning_rate": 8.097140902613601e-05, + "loss": 2.0525, + "step": 7700 + }, + { + "epoch": 0.09267193930258946, + "grad_norm": 1.0965609550476074, + "learning_rate": 8.089663998650839e-05, + "loss": 2.0618, + "step": 7710 + }, + { + "epoch": 0.09321388046810168, + "grad_norm": 0.7565444707870483, + "learning_rate": 8.082176390009878e-05, + "loss": 2.0523, + "step": 7720 + }, + { + "epoch": 0.0937558216336139, + "grad_norm": 0.49668484926223755, + "learning_rate": 8.074678107641623e-05, + "loss": 2.0595, + "step": 7730 + }, + { + "epoch": 0.09429776279912612, + "grad_norm": 0.9314787983894348, + "learning_rate": 8.067169182541099e-05, + "loss": 2.0604, + "step": 7740 + }, + { + "epoch": 0.09446034514877978, + "eval_loss": 2.659668445587158, + "eval_runtime": 21.9747, + "eval_samples_per_second": 227.535, + "eval_steps_per_second": 1.229, + "step": 7743 + }, + { + "epoch": 0.09483970396463834, + "grad_norm": 0.6480481624603271, + "learning_rate": 8.059649645747325e-05, + "loss": 2.0626, + "step": 7750 + }, + { + "epoch": 0.09538164513015056, + "grad_norm": 0.9633429050445557, + "learning_rate": 8.052119528343181e-05, + "loss": 2.0628, + "step": 7760 + }, + { + "epoch": 0.09592358629566278, + "grad_norm": 0.5564477443695068, + "learning_rate": 8.044578861455286e-05, + "loss": 2.0621, + "step": 7770 + }, + { + "epoch": 0.096465527461175, + "grad_norm": 0.613134503364563, + "learning_rate": 8.037027676253866e-05, + "loss": 2.059, + "step": 7780 + }, + { + "epoch": 0.09700746862668722, + "grad_norm": 0.7734301686286926, + "learning_rate": 8.029466003952628e-05, + "loss": 2.062, + "step": 7790 + }, + { + "epoch": 0.09754940979219943, + "grad_norm": 0.31820887327194214, + "learning_rate": 8.021893875808625e-05, + "loss": 2.0541, + "step": 7800 + }, + { + "epoch": 0.09809135095771165, + "grad_norm": 0.3358112573623657, + "learning_rate": 8.014311323122131e-05, + "loss": 2.0547, + "step": 7810 + }, + { + "epoch": 0.09863329212322387, + "grad_norm": 0.5778986811637878, + "learning_rate": 8.006718377236514e-05, + "loss": 2.057, + "step": 7820 + }, + { + "epoch": 0.09917523328873609, + "grad_norm": 0.318333238363266, + "learning_rate": 7.999115069538099e-05, + "loss": 2.0503, + "step": 7830 + }, + { + "epoch": 0.09917523328873609, + "eval_loss": 2.6534852981567383, + "eval_runtime": 21.973, + "eval_samples_per_second": 227.552, + "eval_steps_per_second": 1.229, + "step": 7830 + }, + { + "epoch": 0.09971717445424831, + "grad_norm": 0.5033796429634094, + "learning_rate": 7.991501431456047e-05, + "loss": 2.0504, + "step": 7840 + }, + { + "epoch": 0.10025911561976053, + "grad_norm": 0.39477086067199707, + "learning_rate": 7.983877494462219e-05, + "loss": 2.0601, + "step": 7850 + }, + { + "epoch": 0.10080105678527275, + "grad_norm": 0.37355226278305054, + "learning_rate": 7.976243290071045e-05, + "loss": 2.0529, + "step": 7860 + }, + { + "epoch": 0.10134299795078497, + "grad_norm": 0.6517744064331055, + "learning_rate": 7.968598849839404e-05, + "loss": 2.0555, + "step": 7870 + }, + { + "epoch": 0.10188493911629719, + "grad_norm": 0.3515459895133972, + "learning_rate": 7.960944205366478e-05, + "loss": 2.0515, + "step": 7880 + }, + { + "epoch": 0.10242688028180941, + "grad_norm": 0.590758740901947, + "learning_rate": 7.953279388293634e-05, + "loss": 2.0536, + "step": 7890 + }, + { + "epoch": 0.10296882144732163, + "grad_norm": 0.3137708008289337, + "learning_rate": 7.945604430304289e-05, + "loss": 2.0495, + "step": 7900 + }, + { + "epoch": 0.10351076261283385, + "grad_norm": 0.5091531872749329, + "learning_rate": 7.93791936312377e-05, + "loss": 2.0461, + "step": 7910 + }, + { + "epoch": 0.1038901214286924, + "eval_loss": 2.6576335430145264, + "eval_runtime": 21.9728, + "eval_samples_per_second": 227.554, + "eval_steps_per_second": 1.229, + "step": 7917 + }, + { + "epoch": 0.10405270377834606, + "grad_norm": 0.33678263425827026, + "learning_rate": 7.930224218519207e-05, + "loss": 2.0508, + "step": 7920 + }, + { + "epoch": 0.10459464494385828, + "grad_norm": 0.36633098125457764, + "learning_rate": 7.922519028299376e-05, + "loss": 2.0451, + "step": 7930 + }, + { + "epoch": 0.1051365861093705, + "grad_norm": 0.531390905380249, + "learning_rate": 7.914803824314579e-05, + "loss": 2.0512, + "step": 7940 + }, + { + "epoch": 0.10567852727488272, + "grad_norm": 0.6033332347869873, + "learning_rate": 7.907078638456506e-05, + "loss": 2.0421, + "step": 7950 + }, + { + "epoch": 0.10622046844039494, + "grad_norm": 0.7366685271263123, + "learning_rate": 7.899343502658123e-05, + "loss": 2.0543, + "step": 7960 + }, + { + "epoch": 0.10676240960590716, + "grad_norm": 0.5616899132728577, + "learning_rate": 7.891598448893508e-05, + "loss": 2.0458, + "step": 7970 + }, + { + "epoch": 0.10730435077141938, + "grad_norm": 0.7124751806259155, + "learning_rate": 7.88384350917775e-05, + "loss": 2.0412, + "step": 7980 + }, + { + "epoch": 0.1078462919369316, + "grad_norm": 0.5059435963630676, + "learning_rate": 7.876078715566794e-05, + "loss": 2.0452, + "step": 7990 + }, + { + "epoch": 0.10838823310244382, + "grad_norm": 0.297770619392395, + "learning_rate": 7.868304100157318e-05, + "loss": 2.0514, + "step": 8000 + }, + { + "epoch": 0.1086050095686487, + "eval_loss": 2.6553268432617188, + "eval_runtime": 21.9655, + "eval_samples_per_second": 227.63, + "eval_steps_per_second": 1.229, + "step": 8004 + }, + { + "epoch": 0.10893017426795604, + "grad_norm": 0.890073299407959, + "learning_rate": 7.860519695086608e-05, + "loss": 2.0421, + "step": 8010 + }, + { + "epoch": 0.10947211543346826, + "grad_norm": 0.9192163348197937, + "learning_rate": 7.852725532532405e-05, + "loss": 2.0366, + "step": 8020 + }, + { + "epoch": 0.11001405659898048, + "grad_norm": 0.3507506847381592, + "learning_rate": 7.84492164471279e-05, + "loss": 2.0436, + "step": 8030 + }, + { + "epoch": 0.1105559977644927, + "grad_norm": 0.315594881772995, + "learning_rate": 7.837108063886046e-05, + "loss": 2.0421, + "step": 8040 + }, + { + "epoch": 0.11109793893000491, + "grad_norm": 0.6328078508377075, + "learning_rate": 7.829284822350516e-05, + "loss": 2.0414, + "step": 8050 + }, + { + "epoch": 0.11163988009551713, + "grad_norm": 0.6946396231651306, + "learning_rate": 7.821451952444487e-05, + "loss": 2.0334, + "step": 8060 + }, + { + "epoch": 0.11218182126102935, + "grad_norm": 0.3898144066333771, + "learning_rate": 7.813609486546036e-05, + "loss": 2.0386, + "step": 8070 + }, + { + "epoch": 0.11272376242654157, + "grad_norm": 0.8166786432266235, + "learning_rate": 7.805757457072913e-05, + "loss": 2.0515, + "step": 8080 + }, + { + "epoch": 0.11326570359205379, + "grad_norm": 0.36218130588531494, + "learning_rate": 7.797895896482396e-05, + "loss": 2.0395, + "step": 8090 + }, + { + "epoch": 0.11331989770860501, + "eval_loss": 2.6553170680999756, + "eval_runtime": 21.9791, + "eval_samples_per_second": 227.489, + "eval_steps_per_second": 1.228, + "step": 8091 + }, + { + "epoch": 0.11380764475756601, + "grad_norm": 0.34464749693870544, + "learning_rate": 7.790024837271165e-05, + "loss": 2.0413, + "step": 8100 + }, + { + "epoch": 0.11434958592307823, + "grad_norm": 0.46647629141807556, + "learning_rate": 7.782144311975158e-05, + "loss": 2.047, + "step": 8110 + }, + { + "epoch": 0.11489152708859045, + "grad_norm": 0.30626147985458374, + "learning_rate": 7.77425435316945e-05, + "loss": 2.0349, + "step": 8120 + }, + { + "epoch": 0.11543346825410267, + "grad_norm": 0.3106520473957062, + "learning_rate": 7.7663549934681e-05, + "loss": 2.031, + "step": 8130 + }, + { + "epoch": 0.11597540941961489, + "grad_norm": 0.6385337710380554, + "learning_rate": 7.758446265524038e-05, + "loss": 2.026, + "step": 8140 + }, + { + "epoch": 0.1165173505851271, + "grad_norm": 0.7750070095062256, + "learning_rate": 7.750528202028912e-05, + "loss": 2.0243, + "step": 8150 + }, + { + "epoch": 0.11705929175063932, + "grad_norm": 0.5177359580993652, + "learning_rate": 7.74260083571296e-05, + "loss": 2.0276, + "step": 8160 + }, + { + "epoch": 0.11760123291615154, + "grad_norm": 0.9061049818992615, + "learning_rate": 7.734664199344876e-05, + "loss": 2.0382, + "step": 8170 + }, + { + "epoch": 0.11803478584856132, + "eval_loss": 2.6514930725097656, + "eval_runtime": 21.9691, + "eval_samples_per_second": 227.592, + "eval_steps_per_second": 1.229, + "step": 8178 + }, + { + "epoch": 0.11814317408166376, + "grad_norm": 0.8364623188972473, + "learning_rate": 7.726718325731671e-05, + "loss": 2.0361, + "step": 8180 + }, + { + "epoch": 0.11868511524717598, + "grad_norm": 0.8486116528511047, + "learning_rate": 7.718763247718542e-05, + "loss": 2.0466, + "step": 8190 + }, + { + "epoch": 0.1192270564126882, + "grad_norm": 0.9383118152618408, + "learning_rate": 7.710798998188731e-05, + "loss": 2.0303, + "step": 8200 + }, + { + "epoch": 0.11976899757820042, + "grad_norm": 0.6073142886161804, + "learning_rate": 7.702825610063393e-05, + "loss": 2.0357, + "step": 8210 + }, + { + "epoch": 0.12031093874371264, + "grad_norm": 0.46166083216667175, + "learning_rate": 7.694843116301458e-05, + "loss": 2.0319, + "step": 8220 + }, + { + "epoch": 0.12085287990922486, + "grad_norm": 0.5513178110122681, + "learning_rate": 7.686851549899494e-05, + "loss": 2.0273, + "step": 8230 + }, + { + "epoch": 0.12139482107473708, + "grad_norm": 0.4025670289993286, + "learning_rate": 7.678850943891573e-05, + "loss": 2.0338, + "step": 8240 + }, + { + "epoch": 0.1219367622402493, + "grad_norm": 0.7250736355781555, + "learning_rate": 7.670841331349134e-05, + "loss": 2.0223, + "step": 8250 + }, + { + "epoch": 0.12247870340576152, + "grad_norm": 0.5293325185775757, + "learning_rate": 7.662822745380845e-05, + "loss": 2.0363, + "step": 8260 + }, + { + "epoch": 0.12274967398851762, + "eval_loss": 2.6460092067718506, + "eval_runtime": 21.9726, + "eval_samples_per_second": 227.556, + "eval_steps_per_second": 1.229, + "step": 8265 + }, + { + "epoch": 0.12302064457127374, + "grad_norm": 0.4530751407146454, + "learning_rate": 7.654795219132465e-05, + "loss": 2.0366, + "step": 8270 + }, + { + "epoch": 0.12356258573678595, + "grad_norm": 0.47878649830818176, + "learning_rate": 7.64675878578671e-05, + "loss": 2.0289, + "step": 8280 + }, + { + "epoch": 0.12410452690229817, + "grad_norm": 0.5691394209861755, + "learning_rate": 7.638713478563116e-05, + "loss": 2.023, + "step": 8290 + }, + { + "epoch": 0.12464646806781039, + "grad_norm": 0.5451269149780273, + "learning_rate": 7.630659330717899e-05, + "loss": 2.0293, + "step": 8300 + }, + { + "epoch": 0.1251884092333226, + "grad_norm": 0.2928202450275421, + "learning_rate": 7.622596375543815e-05, + "loss": 2.0301, + "step": 8310 + }, + { + "epoch": 0.12573035039883482, + "grad_norm": 0.4254739582538605, + "learning_rate": 7.614524646370034e-05, + "loss": 2.0301, + "step": 8320 + }, + { + "epoch": 0.12627229156434705, + "grad_norm": 0.40174198150634766, + "learning_rate": 7.606444176561989e-05, + "loss": 2.0225, + "step": 8330 + }, + { + "epoch": 0.12681423272985926, + "grad_norm": 0.32534322142601013, + "learning_rate": 7.598354999521243e-05, + "loss": 2.023, + "step": 8340 + }, + { + "epoch": 0.1273561738953715, + "grad_norm": 0.6066051125526428, + "learning_rate": 7.590257148685352e-05, + "loss": 2.0251, + "step": 8350 + }, + { + "epoch": 0.12746456212847393, + "eval_loss": 2.6490955352783203, + "eval_runtime": 21.9681, + "eval_samples_per_second": 227.602, + "eval_steps_per_second": 1.229, + "step": 8352 + }, + { + "epoch": 0.1278981150608837, + "grad_norm": 0.6870770454406738, + "learning_rate": 7.582150657527732e-05, + "loss": 2.021, + "step": 8360 + }, + { + "epoch": 0.12844005622639593, + "grad_norm": 0.6110761761665344, + "learning_rate": 7.574035559557506e-05, + "loss": 2.0273, + "step": 8370 + }, + { + "epoch": 0.12898199739190813, + "grad_norm": 0.32752725481987, + "learning_rate": 7.565911888319375e-05, + "loss": 2.0185, + "step": 8380 + }, + { + "epoch": 0.12952393855742036, + "grad_norm": 0.5558075904846191, + "learning_rate": 7.557779677393486e-05, + "loss": 2.011, + "step": 8390 + }, + { + "epoch": 0.13006587972293257, + "grad_norm": 0.46736517548561096, + "learning_rate": 7.549638960395283e-05, + "loss": 2.0255, + "step": 8400 + }, + { + "epoch": 0.1306078208884448, + "grad_norm": 0.35495907068252563, + "learning_rate": 7.541489770975365e-05, + "loss": 2.0289, + "step": 8410 + }, + { + "epoch": 0.131149762053957, + "grad_norm": 0.6544122099876404, + "learning_rate": 7.533332142819358e-05, + "loss": 2.0241, + "step": 8420 + }, + { + "epoch": 0.13169170321946924, + "grad_norm": 0.4280942380428314, + "learning_rate": 7.52516610964777e-05, + "loss": 2.0245, + "step": 8430 + }, + { + "epoch": 0.13217945026843023, + "eval_loss": 2.6533358097076416, + "eval_runtime": 21.9742, + "eval_samples_per_second": 227.54, + "eval_steps_per_second": 1.229, + "step": 8439 + }, + { + "epoch": 0.13223364438498145, + "grad_norm": 0.4303838610649109, + "learning_rate": 7.516991705215853e-05, + "loss": 2.0144, + "step": 8440 + }, + { + "epoch": 0.13277558555049368, + "grad_norm": 0.7431203722953796, + "learning_rate": 7.508808963313461e-05, + "loss": 2.0236, + "step": 8450 + }, + { + "epoch": 0.13331752671600589, + "grad_norm": 0.6149893403053284, + "learning_rate": 7.500617917764908e-05, + "loss": 2.0145, + "step": 8460 + }, + { + "epoch": 0.13385946788151812, + "grad_norm": 0.3794841468334198, + "learning_rate": 7.492418602428841e-05, + "loss": 2.0113, + "step": 8470 + }, + { + "epoch": 0.13440140904703032, + "grad_norm": 0.4046865403652191, + "learning_rate": 7.484211051198085e-05, + "loss": 2.0092, + "step": 8480 + }, + { + "epoch": 0.13494335021254256, + "grad_norm": 0.4416908025741577, + "learning_rate": 7.475995297999507e-05, + "loss": 2.0194, + "step": 8490 + }, + { + "epoch": 0.13548529137805476, + "grad_norm": 0.4587673246860504, + "learning_rate": 7.467771376793887e-05, + "loss": 2.0126, + "step": 8500 + }, + { + "epoch": 0.136027232543567, + "grad_norm": 0.4561171233654022, + "learning_rate": 7.459539321575758e-05, + "loss": 2.0158, + "step": 8510 + }, + { + "epoch": 0.1365691737090792, + "grad_norm": 0.4048430323600769, + "learning_rate": 7.451299166373283e-05, + "loss": 2.0094, + "step": 8520 + }, + { + "epoch": 0.13689433840838655, + "eval_loss": 2.6495518684387207, + "eval_runtime": 21.9712, + "eval_samples_per_second": 227.571, + "eval_steps_per_second": 1.229, + "step": 8526 + }, + { + "epoch": 0.13711111487459143, + "grad_norm": 0.9034155011177063, + "learning_rate": 7.443050945248102e-05, + "loss": 2.0219, + "step": 8530 + }, + { + "epoch": 0.13765305604010364, + "grad_norm": 0.7499250173568726, + "learning_rate": 7.434794692295202e-05, + "loss": 2.0136, + "step": 8540 + }, + { + "epoch": 0.13819499720561587, + "grad_norm": 0.6840431094169617, + "learning_rate": 7.426530441642766e-05, + "loss": 2.0084, + "step": 8550 + }, + { + "epoch": 0.13873693837112808, + "grad_norm": 0.44849923253059387, + "learning_rate": 7.418258227452038e-05, + "loss": 2.0216, + "step": 8560 + }, + { + "epoch": 0.1392788795366403, + "grad_norm": 0.3296073079109192, + "learning_rate": 7.40997808391718e-05, + "loss": 2.0037, + "step": 8570 + }, + { + "epoch": 0.13982082070215252, + "grad_norm": 0.34490448236465454, + "learning_rate": 7.401690045265133e-05, + "loss": 2.0104, + "step": 8580 + }, + { + "epoch": 0.14036276186766475, + "grad_norm": 0.3491741418838501, + "learning_rate": 7.39339414575547e-05, + "loss": 2.0163, + "step": 8590 + }, + { + "epoch": 0.14090470303317695, + "grad_norm": 0.37607693672180176, + "learning_rate": 7.385090419680259e-05, + "loss": 2.0061, + "step": 8600 + }, + { + "epoch": 0.1414466441986892, + "grad_norm": 0.6395640969276428, + "learning_rate": 7.37677890136392e-05, + "loss": 2.0192, + "step": 8610 + }, + { + "epoch": 0.14160922654834285, + "eval_loss": 2.6450469493865967, + "eval_runtime": 21.9781, + "eval_samples_per_second": 227.499, + "eval_steps_per_second": 1.228, + "step": 8613 + }, + { + "epoch": 0.1419885853642014, + "grad_norm": 0.44370248913764954, + "learning_rate": 7.368459625163084e-05, + "loss": 1.9993, + "step": 8620 + }, + { + "epoch": 0.14253052652971362, + "grad_norm": 0.8615736961364746, + "learning_rate": 7.360132625466452e-05, + "loss": 1.9997, + "step": 8630 + }, + { + "epoch": 0.14307246769522583, + "grad_norm": 0.45963114500045776, + "learning_rate": 7.351797936694645e-05, + "loss": 2.0108, + "step": 8640 + }, + { + "epoch": 0.14361440886073806, + "grad_norm": 0.34752896428108215, + "learning_rate": 7.34345559330007e-05, + "loss": 2.0089, + "step": 8650 + }, + { + "epoch": 0.14415635002625027, + "grad_norm": 0.36175432801246643, + "learning_rate": 7.33510562976678e-05, + "loss": 2.0071, + "step": 8660 + }, + { + "epoch": 0.1446982911917625, + "grad_norm": 0.32326793670654297, + "learning_rate": 7.326748080610324e-05, + "loss": 1.9989, + "step": 8670 + }, + { + "epoch": 0.1452402323572747, + "grad_norm": 1.1309995651245117, + "learning_rate": 7.318382980377603e-05, + "loss": 2.0054, + "step": 8680 + }, + { + "epoch": 0.14578217352278694, + "grad_norm": 0.31617847084999084, + "learning_rate": 7.310010363646736e-05, + "loss": 2.0057, + "step": 8690 + }, + { + "epoch": 0.14632411468829915, + "grad_norm": 0.34660130739212036, + "learning_rate": 7.301630265026908e-05, + "loss": 2.0027, + "step": 8700 + }, + { + "epoch": 0.14632411468829915, + "eval_loss": 2.6555795669555664, + "eval_runtime": 21.9664, + "eval_samples_per_second": 227.62, + "eval_steps_per_second": 1.229, + "step": 8700 + }, + { + "epoch": 0.14686605585381138, + "grad_norm": 0.421053409576416, + "learning_rate": 7.293242719158241e-05, + "loss": 2.0116, + "step": 8710 + }, + { + "epoch": 0.14740799701932358, + "grad_norm": 0.5713244080543518, + "learning_rate": 7.284847760711628e-05, + "loss": 2.0094, + "step": 8720 + }, + { + "epoch": 0.14794993818483582, + "grad_norm": 0.5165607929229736, + "learning_rate": 7.27644542438861e-05, + "loss": 2.0045, + "step": 8730 + }, + { + "epoch": 0.14849187935034802, + "grad_norm": 0.7992691993713379, + "learning_rate": 7.268035744921225e-05, + "loss": 1.9992, + "step": 8740 + }, + { + "epoch": 0.14903382051586025, + "grad_norm": 0.42167598009109497, + "learning_rate": 7.259618757071866e-05, + "loss": 1.9992, + "step": 8750 + }, + { + "epoch": 0.14957576168137246, + "grad_norm": 0.4957919418811798, + "learning_rate": 7.251194495633132e-05, + "loss": 2.0026, + "step": 8760 + }, + { + "epoch": 0.1501177028468847, + "grad_norm": 0.512251079082489, + "learning_rate": 7.24276299542769e-05, + "loss": 1.9965, + "step": 8770 + }, + { + "epoch": 0.1506596440123969, + "grad_norm": 0.30542322993278503, + "learning_rate": 7.234324291308129e-05, + "loss": 1.9981, + "step": 8780 + }, + { + "epoch": 0.15103900282825547, + "eval_loss": 2.644069194793701, + "eval_runtime": 21.9682, + "eval_samples_per_second": 227.602, + "eval_steps_per_second": 1.229, + "step": 8787 + }, + { + "epoch": 0.15120158517790913, + "grad_norm": 0.7472847104072571, + "learning_rate": 7.225878418156819e-05, + "loss": 2.0041, + "step": 8790 + }, + { + "epoch": 0.15174352634342134, + "grad_norm": 0.3889388144016266, + "learning_rate": 7.217425410885759e-05, + "loss": 1.9959, + "step": 8800 + }, + { + "epoch": 0.15228546750893357, + "grad_norm": 0.40479448437690735, + "learning_rate": 7.208965304436444e-05, + "loss": 2.0035, + "step": 8810 + }, + { + "epoch": 0.15282740867444577, + "grad_norm": 0.5486807227134705, + "learning_rate": 7.200498133779706e-05, + "loss": 2.0039, + "step": 8820 + }, + { + "epoch": 0.153369349839958, + "grad_norm": 0.3424724042415619, + "learning_rate": 7.192023933915586e-05, + "loss": 2.0085, + "step": 8830 + }, + { + "epoch": 0.1539112910054702, + "grad_norm": 0.3559320867061615, + "learning_rate": 7.18354273987318e-05, + "loss": 1.9998, + "step": 8840 + }, + { + "epoch": 0.15445323217098245, + "grad_norm": 0.4217950105667114, + "learning_rate": 7.175054586710486e-05, + "loss": 2.0115, + "step": 8850 + }, + { + "epoch": 0.15499517333649465, + "grad_norm": 0.8350193500518799, + "learning_rate": 7.166559509514283e-05, + "loss": 2.0065, + "step": 8860 + }, + { + "epoch": 0.15553711450200688, + "grad_norm": 0.43232548236846924, + "learning_rate": 7.158057543399957e-05, + "loss": 2.0002, + "step": 8870 + }, + { + "epoch": 0.15575389096821177, + "eval_loss": 2.642789602279663, + "eval_runtime": 21.9675, + "eval_samples_per_second": 227.609, + "eval_steps_per_second": 1.229, + "step": 8874 + }, + { + "epoch": 0.1560790556675191, + "grad_norm": 0.8233237862586975, + "learning_rate": 7.149548723511377e-05, + "loss": 2.004, + "step": 8880 + }, + { + "epoch": 0.15662099683303132, + "grad_norm": 0.7288250923156738, + "learning_rate": 7.141033085020747e-05, + "loss": 1.997, + "step": 8890 + }, + { + "epoch": 0.15716293799854353, + "grad_norm": 0.6390490531921387, + "learning_rate": 7.132510663128448e-05, + "loss": 2.0031, + "step": 8900 + }, + { + "epoch": 0.15770487916405576, + "grad_norm": 0.41019633412361145, + "learning_rate": 7.123981493062907e-05, + "loss": 1.9946, + "step": 8910 + }, + { + "epoch": 0.15824682032956797, + "grad_norm": 0.541032075881958, + "learning_rate": 7.115445610080444e-05, + "loss": 1.9958, + "step": 8920 + }, + { + "epoch": 0.1587887614950802, + "grad_norm": 0.5177111029624939, + "learning_rate": 7.106903049465123e-05, + "loss": 1.9975, + "step": 8930 + }, + { + "epoch": 0.1593307026605924, + "grad_norm": 0.3048165738582611, + "learning_rate": 7.098353846528619e-05, + "loss": 1.9908, + "step": 8940 + }, + { + "epoch": 0.15987264382610464, + "grad_norm": 0.33911651372909546, + "learning_rate": 7.089798036610058e-05, + "loss": 2.002, + "step": 8950 + }, + { + "epoch": 0.16041458499161684, + "grad_norm": 0.9156370162963867, + "learning_rate": 7.081235655075878e-05, + "loss": 1.9849, + "step": 8960 + }, + { + "epoch": 0.16046877910816806, + "eval_loss": 2.637845754623413, + "eval_runtime": 21.9743, + "eval_samples_per_second": 227.538, + "eval_steps_per_second": 1.229, + "step": 8961 + }, + { + "epoch": 0.16095652615712908, + "grad_norm": 0.5576390027999878, + "learning_rate": 7.072666737319683e-05, + "loss": 1.9829, + "step": 8970 + }, + { + "epoch": 0.16149846732264128, + "grad_norm": 0.36364230513572693, + "learning_rate": 7.064091318762089e-05, + "loss": 1.9983, + "step": 8980 + }, + { + "epoch": 0.16204040848815351, + "grad_norm": 0.7000935077667236, + "learning_rate": 7.055509434850597e-05, + "loss": 1.9852, + "step": 8990 + }, + { + "epoch": 0.16258234965366572, + "grad_norm": 0.6612693667411804, + "learning_rate": 7.046921121059417e-05, + "loss": 1.9829, + "step": 9000 + }, + { + "epoch": 0.16312429081917795, + "grad_norm": 0.6157468557357788, + "learning_rate": 7.038326412889353e-05, + "loss": 1.9901, + "step": 9010 + }, + { + "epoch": 0.16366623198469016, + "grad_norm": 0.33146920800209045, + "learning_rate": 7.029725345867628e-05, + "loss": 1.9893, + "step": 9020 + }, + { + "epoch": 0.1642081731502024, + "grad_norm": 0.39117079973220825, + "learning_rate": 7.02111795554776e-05, + "loss": 1.9896, + "step": 9030 + }, + { + "epoch": 0.1647501143157146, + "grad_norm": 0.4249345660209656, + "learning_rate": 7.0125042775094e-05, + "loss": 1.9823, + "step": 9040 + }, + { + "epoch": 0.1651836672481244, + "eval_loss": 2.6504335403442383, + "eval_runtime": 21.9717, + "eval_samples_per_second": 227.566, + "eval_steps_per_second": 1.229, + "step": 9048 + }, + { + "epoch": 0.16529205548122683, + "grad_norm": 1.197481632232666, + "learning_rate": 7.003884347358187e-05, + "loss": 1.9937, + "step": 9050 + }, + { + "epoch": 0.16583399664673903, + "grad_norm": 0.7394921183586121, + "learning_rate": 6.995258200725613e-05, + "loss": 1.9839, + "step": 9060 + }, + { + "epoch": 0.16637593781225127, + "grad_norm": 0.6299876570701599, + "learning_rate": 6.986625873268857e-05, + "loss": 1.9854, + "step": 9070 + }, + { + "epoch": 0.16691787897776347, + "grad_norm": 0.7766085863113403, + "learning_rate": 6.977987400670654e-05, + "loss": 1.9938, + "step": 9080 + }, + { + "epoch": 0.1674598201432757, + "grad_norm": 0.40683385729789734, + "learning_rate": 6.969342818639138e-05, + "loss": 1.9844, + "step": 9090 + }, + { + "epoch": 0.1680017613087879, + "grad_norm": 0.33413490653038025, + "learning_rate": 6.960692162907695e-05, + "loss": 1.9802, + "step": 9100 + }, + { + "epoch": 0.16854370247430014, + "grad_norm": 0.3267943859100342, + "learning_rate": 6.952035469234823e-05, + "loss": 1.9908, + "step": 9110 + }, + { + "epoch": 0.16908564363981235, + "grad_norm": 0.3775721788406372, + "learning_rate": 6.943372773403972e-05, + "loss": 1.9816, + "step": 9120 + }, + { + "epoch": 0.16962758480532458, + "grad_norm": 0.42679643630981445, + "learning_rate": 6.934704111223407e-05, + "loss": 1.9814, + "step": 9130 + }, + { + "epoch": 0.16989855538808069, + "eval_loss": 2.642576217651367, + "eval_runtime": 21.9695, + "eval_samples_per_second": 227.588, + "eval_steps_per_second": 1.229, + "step": 9135 + }, + { + "epoch": 0.1701695259708368, + "grad_norm": 0.542743444442749, + "learning_rate": 6.926029518526054e-05, + "loss": 1.995, + "step": 9140 + }, + { + "epoch": 0.17071146713634902, + "grad_norm": 0.298638254404068, + "learning_rate": 6.917349031169353e-05, + "loss": 1.9888, + "step": 9150 + }, + { + "epoch": 0.17125340830186123, + "grad_norm": 0.6379884481430054, + "learning_rate": 6.908662685035111e-05, + "loss": 1.9863, + "step": 9160 + }, + { + "epoch": 0.17179534946737346, + "grad_norm": 0.3423231542110443, + "learning_rate": 6.899970516029355e-05, + "loss": 1.9858, + "step": 9170 + }, + { + "epoch": 0.17233729063288566, + "grad_norm": 0.41656017303466797, + "learning_rate": 6.891272560082177e-05, + "loss": 1.9928, + "step": 9180 + }, + { + "epoch": 0.1728792317983979, + "grad_norm": 0.41668909788131714, + "learning_rate": 6.882568853147594e-05, + "loss": 1.9793, + "step": 9190 + }, + { + "epoch": 0.1734211729639101, + "grad_norm": 0.45455989241600037, + "learning_rate": 6.873859431203393e-05, + "loss": 1.9811, + "step": 9200 + }, + { + "epoch": 0.17396311412942234, + "grad_norm": 0.421617716550827, + "learning_rate": 6.865144330250984e-05, + "loss": 1.9803, + "step": 9210 + }, + { + "epoch": 0.17450505529493454, + "grad_norm": 0.33842185139656067, + "learning_rate": 6.856423586315258e-05, + "loss": 1.9708, + "step": 9220 + }, + { + "epoch": 0.17461344352803698, + "eval_loss": 2.636101245880127, + "eval_runtime": 21.9746, + "eval_samples_per_second": 227.535, + "eval_steps_per_second": 1.229, + "step": 9222 + }, + { + "epoch": 0.17504699646044677, + "grad_norm": 0.4086952805519104, + "learning_rate": 6.847697235444422e-05, + "loss": 1.9812, + "step": 9230 + }, + { + "epoch": 0.17558893762595898, + "grad_norm": 0.573617696762085, + "learning_rate": 6.83896531370987e-05, + "loss": 1.9738, + "step": 9240 + }, + { + "epoch": 0.1761308787914712, + "grad_norm": 0.5594168901443481, + "learning_rate": 6.830227857206014e-05, + "loss": 1.9763, + "step": 9250 + }, + { + "epoch": 0.17667281995698342, + "grad_norm": 0.820530116558075, + "learning_rate": 6.821484902050152e-05, + "loss": 1.9754, + "step": 9260 + }, + { + "epoch": 0.17721476112249565, + "grad_norm": 0.33057236671447754, + "learning_rate": 6.81273648438231e-05, + "loss": 1.9775, + "step": 9270 + }, + { + "epoch": 0.17775670228800786, + "grad_norm": 0.4354369640350342, + "learning_rate": 6.803982640365092e-05, + "loss": 1.9864, + "step": 9280 + }, + { + "epoch": 0.1782986434535201, + "grad_norm": 0.3154381811618805, + "learning_rate": 6.795223406183532e-05, + "loss": 1.9862, + "step": 9290 + }, + { + "epoch": 0.1788405846190323, + "grad_norm": 0.31054070591926575, + "learning_rate": 6.78645881804495e-05, + "loss": 1.9838, + "step": 9300 + }, + { + "epoch": 0.1793283316679933, + "eval_loss": 2.6388654708862305, + "eval_runtime": 21.9681, + "eval_samples_per_second": 227.603, + "eval_steps_per_second": 1.229, + "step": 9309 + }, + { + "epoch": 0.17938252578454453, + "grad_norm": 0.3508462607860565, + "learning_rate": 6.777688912178787e-05, + "loss": 1.9744, + "step": 9310 + }, + { + "epoch": 0.17992446695005673, + "grad_norm": 0.34929049015045166, + "learning_rate": 6.768913724836477e-05, + "loss": 1.9707, + "step": 9320 + }, + { + "epoch": 0.18046640811556897, + "grad_norm": 0.32265743613243103, + "learning_rate": 6.760133292291277e-05, + "loss": 1.9676, + "step": 9330 + }, + { + "epoch": 0.18100834928108117, + "grad_norm": 0.32434341311454773, + "learning_rate": 6.751347650838134e-05, + "loss": 1.96, + "step": 9340 + }, + { + "epoch": 0.1815502904465934, + "grad_norm": 0.5159777998924255, + "learning_rate": 6.742556836793518e-05, + "loss": 1.9691, + "step": 9350 + }, + { + "epoch": 0.1820922316121056, + "grad_norm": 0.30936720967292786, + "learning_rate": 6.733760886495284e-05, + "loss": 1.9696, + "step": 9360 + }, + { + "epoch": 0.18263417277761784, + "grad_norm": 0.3295222222805023, + "learning_rate": 6.724959836302518e-05, + "loss": 1.9852, + "step": 9370 + }, + { + "epoch": 0.18317611394313005, + "grad_norm": 0.631144106388092, + "learning_rate": 6.716153722595392e-05, + "loss": 1.9825, + "step": 9380 + }, + { + "epoch": 0.18371805510864228, + "grad_norm": 0.4007103741168976, + "learning_rate": 6.707342581775e-05, + "loss": 1.9684, + "step": 9390 + }, + { + "epoch": 0.1840432198079496, + "eval_loss": 2.635845422744751, + "eval_runtime": 21.9618, + "eval_samples_per_second": 227.668, + "eval_steps_per_second": 1.229, + "step": 9396 + }, + { + "epoch": 0.18425999627415449, + "grad_norm": 0.7023712396621704, + "learning_rate": 6.69852645026322e-05, + "loss": 1.973, + "step": 9400 + }, + { + "epoch": 0.18480193743966672, + "grad_norm": 0.6182842254638672, + "learning_rate": 6.689705364502562e-05, + "loss": 1.9714, + "step": 9410 + }, + { + "epoch": 0.18534387860517892, + "grad_norm": 0.5204935669898987, + "learning_rate": 6.680879360956012e-05, + "loss": 1.9647, + "step": 9420 + }, + { + "epoch": 0.18588581977069116, + "grad_norm": 0.3946668803691864, + "learning_rate": 6.672048476106886e-05, + "loss": 1.9776, + "step": 9430 + }, + { + "epoch": 0.18642776093620336, + "grad_norm": 0.44481825828552246, + "learning_rate": 6.663212746458676e-05, + "loss": 1.9718, + "step": 9440 + }, + { + "epoch": 0.1869697021017156, + "grad_norm": 0.4806019365787506, + "learning_rate": 6.6543722085349e-05, + "loss": 1.9699, + "step": 9450 + }, + { + "epoch": 0.1875116432672278, + "grad_norm": 0.3834541141986847, + "learning_rate": 6.645526898878955e-05, + "loss": 1.9617, + "step": 9460 + }, + { + "epoch": 0.18805358443274003, + "grad_norm": 0.587719738483429, + "learning_rate": 6.636676854053958e-05, + "loss": 1.9663, + "step": 9470 + }, + { + "epoch": 0.18859552559825224, + "grad_norm": 0.48372459411621094, + "learning_rate": 6.627822110642603e-05, + "loss": 1.9715, + "step": 9480 + }, + { + "epoch": 0.1887581079479059, + "eval_loss": 2.629991292953491, + "eval_runtime": 21.9683, + "eval_samples_per_second": 227.601, + "eval_steps_per_second": 1.229, + "step": 9483 + }, + { + "epoch": 0.18913746676376447, + "grad_norm": 0.368070513010025, + "learning_rate": 6.618962705247003e-05, + "loss": 1.9678, + "step": 9490 + }, + { + "epoch": 0.18967940792927668, + "grad_norm": 0.3025422692298889, + "learning_rate": 6.610098674488546e-05, + "loss": 1.9647, + "step": 9500 + }, + { + "epoch": 0.1902213490947889, + "grad_norm": 0.4809657037258148, + "learning_rate": 6.601230055007734e-05, + "loss": 1.9742, + "step": 9510 + }, + { + "epoch": 0.19076329026030112, + "grad_norm": 0.3071550726890564, + "learning_rate": 6.592356883464043e-05, + "loss": 1.9588, + "step": 9520 + }, + { + "epoch": 0.19130523142581332, + "grad_norm": 0.3039422035217285, + "learning_rate": 6.583479196535763e-05, + "loss": 1.961, + "step": 9530 + }, + { + "epoch": 0.19184717259132555, + "grad_norm": 1.0146269798278809, + "learning_rate": 6.574597030919844e-05, + "loss": 1.9722, + "step": 9540 + }, + { + "epoch": 0.19238911375683776, + "grad_norm": 0.3183740973472595, + "learning_rate": 6.565710423331757e-05, + "loss": 1.9659, + "step": 9550 + }, + { + "epoch": 0.19293105492235, + "grad_norm": 0.373046338558197, + "learning_rate": 6.556819410505331e-05, + "loss": 1.9615, + "step": 9560 + }, + { + "epoch": 0.1934729960878622, + "grad_norm": 0.36389511823654175, + "learning_rate": 6.547924029192603e-05, + "loss": 1.9639, + "step": 9570 + }, + { + "epoch": 0.1934729960878622, + "eval_loss": 2.6214120388031006, + "eval_runtime": 21.9741, + "eval_samples_per_second": 227.54, + "eval_steps_per_second": 1.229, + "step": 9570 + }, + { + "epoch": 0.19401493725337443, + "grad_norm": 0.3653751015663147, + "learning_rate": 6.539024316163671e-05, + "loss": 1.9665, + "step": 9580 + }, + { + "epoch": 0.19455687841888664, + "grad_norm": 0.4899474084377289, + "learning_rate": 6.530120308206536e-05, + "loss": 1.968, + "step": 9590 + }, + { + "epoch": 0.19509881958439887, + "grad_norm": 0.3284156322479248, + "learning_rate": 6.521212042126951e-05, + "loss": 1.9563, + "step": 9600 + }, + { + "epoch": 0.19564076074991107, + "grad_norm": 0.6163231134414673, + "learning_rate": 6.512299554748281e-05, + "loss": 1.9764, + "step": 9610 + }, + { + "epoch": 0.1961827019154233, + "grad_norm": 0.5178772807121277, + "learning_rate": 6.503382882911322e-05, + "loss": 1.97, + "step": 9620 + }, + { + "epoch": 0.1967246430809355, + "grad_norm": 0.33963051438331604, + "learning_rate": 6.494462063474181e-05, + "loss": 1.9612, + "step": 9630 + }, + { + "epoch": 0.19726658424644775, + "grad_norm": 0.2811043858528137, + "learning_rate": 6.485537133312107e-05, + "loss": 1.9682, + "step": 9640 + }, + { + "epoch": 0.19780852541195995, + "grad_norm": 0.3501632511615753, + "learning_rate": 6.476608129317342e-05, + "loss": 1.9547, + "step": 9650 + }, + { + "epoch": 0.19818788422781852, + "eval_loss": 2.6299521923065186, + "eval_runtime": 21.9699, + "eval_samples_per_second": 227.584, + "eval_steps_per_second": 1.229, + "step": 9657 + }, + { + "epoch": 0.19835046657747218, + "grad_norm": 0.3678341209888458, + "learning_rate": 6.46767508839896e-05, + "loss": 1.9613, + "step": 9660 + }, + { + "epoch": 0.1988924077429844, + "grad_norm": 0.62614905834198, + "learning_rate": 6.458738047482731e-05, + "loss": 1.9626, + "step": 9670 + }, + { + "epoch": 0.19943434890849662, + "grad_norm": 0.7425987124443054, + "learning_rate": 6.449797043510954e-05, + "loss": 1.9591, + "step": 9680 + }, + { + "epoch": 0.19997629007400883, + "grad_norm": 0.3964003324508667, + "learning_rate": 6.440852113442314e-05, + "loss": 1.9638, + "step": 9690 + }, + { + "epoch": 0.20051823123952106, + "grad_norm": 0.4931100308895111, + "learning_rate": 6.431903294251721e-05, + "loss": 1.9487, + "step": 9700 + }, + { + "epoch": 0.20106017240503327, + "grad_norm": 0.31457212567329407, + "learning_rate": 6.422950622930164e-05, + "loss": 1.9618, + "step": 9710 + }, + { + "epoch": 0.2016021135705455, + "grad_norm": 0.44526612758636475, + "learning_rate": 6.413994136484553e-05, + "loss": 1.9584, + "step": 9720 + }, + { + "epoch": 0.2021440547360577, + "grad_norm": 0.3651030957698822, + "learning_rate": 6.405033871937572e-05, + "loss": 1.9614, + "step": 9730 + }, + { + "epoch": 0.20268599590156994, + "grad_norm": 0.639445960521698, + "learning_rate": 6.396069866327519e-05, + "loss": 1.964, + "step": 9740 + }, + { + "epoch": 0.20290277236777482, + "eval_loss": 2.6391215324401855, + "eval_runtime": 21.9615, + "eval_samples_per_second": 227.671, + "eval_steps_per_second": 1.229, + "step": 9744 + }, + { + "epoch": 0.20322793706708214, + "grad_norm": 0.45781099796295166, + "learning_rate": 6.387102156708152e-05, + "loss": 1.9654, + "step": 9750 + }, + { + "epoch": 0.20376987823259438, + "grad_norm": 0.3018638789653778, + "learning_rate": 6.37813078014855e-05, + "loss": 1.9603, + "step": 9760 + }, + { + "epoch": 0.20431181939810658, + "grad_norm": 0.2993408739566803, + "learning_rate": 6.369155773732945e-05, + "loss": 1.9519, + "step": 9770 + }, + { + "epoch": 0.20485376056361881, + "grad_norm": 0.35188788175582886, + "learning_rate": 6.360177174560567e-05, + "loss": 1.945, + "step": 9780 + }, + { + "epoch": 0.20539570172913102, + "grad_norm": 0.5056437849998474, + "learning_rate": 6.351195019745508e-05, + "loss": 1.9569, + "step": 9790 + }, + { + "epoch": 0.20593764289464325, + "grad_norm": 0.4438895583152771, + "learning_rate": 6.342209346416553e-05, + "loss": 1.9503, + "step": 9800 + }, + { + "epoch": 0.20647958406015546, + "grad_norm": 0.8153665661811829, + "learning_rate": 6.333220191717026e-05, + "loss": 1.9529, + "step": 9810 + }, + { + "epoch": 0.2070215252256677, + "grad_norm": 0.47668522596359253, + "learning_rate": 6.324227592804651e-05, + "loss": 1.9515, + "step": 9820 + }, + { + "epoch": 0.2075634663911799, + "grad_norm": 0.3650936782360077, + "learning_rate": 6.315231586851382e-05, + "loss": 1.9473, + "step": 9830 + }, + { + "epoch": 0.20761766050773112, + "eval_loss": 2.6374318599700928, + "eval_runtime": 21.9702, + "eval_samples_per_second": 227.581, + "eval_steps_per_second": 1.229, + "step": 9831 + }, + { + "epoch": 0.20810540755669213, + "grad_norm": 0.4178442060947418, + "learning_rate": 6.306232211043262e-05, + "loss": 1.9593, + "step": 9840 + }, + { + "epoch": 0.20864734872220433, + "grad_norm": 0.39025014638900757, + "learning_rate": 6.297229502580257e-05, + "loss": 1.957, + "step": 9850 + }, + { + "epoch": 0.20918928988771657, + "grad_norm": 0.573856770992279, + "learning_rate": 6.288223498676114e-05, + "loss": 1.9455, + "step": 9860 + }, + { + "epoch": 0.20973123105322877, + "grad_norm": 0.32634246349334717, + "learning_rate": 6.279214236558201e-05, + "loss": 1.9464, + "step": 9870 + }, + { + "epoch": 0.210273172218741, + "grad_norm": 0.48771846294403076, + "learning_rate": 6.270201753467351e-05, + "loss": 1.9471, + "step": 9880 + }, + { + "epoch": 0.2108151133842532, + "grad_norm": 0.3284076154232025, + "learning_rate": 6.261186086657722e-05, + "loss": 1.9512, + "step": 9890 + }, + { + "epoch": 0.21135705454976544, + "grad_norm": 0.4014468789100647, + "learning_rate": 6.252167273396614e-05, + "loss": 1.9596, + "step": 9900 + }, + { + "epoch": 0.21189899571527765, + "grad_norm": 0.2978026568889618, + "learning_rate": 6.24314535096435e-05, + "loss": 1.95, + "step": 9910 + }, + { + "epoch": 0.21233254864768744, + "eval_loss": 2.6269800662994385, + "eval_runtime": 21.9722, + "eval_samples_per_second": 227.561, + "eval_steps_per_second": 1.229, + "step": 9918 + }, + { + "epoch": 0.21244093688078988, + "grad_norm": 0.36181050539016724, + "learning_rate": 6.234120356654096e-05, + "loss": 1.9543, + "step": 9920 + }, + { + "epoch": 0.2129828780463021, + "grad_norm": 0.7839191555976868, + "learning_rate": 6.225092327771723e-05, + "loss": 1.9515, + "step": 9930 + }, + { + "epoch": 0.21352481921181432, + "grad_norm": 1.2086142301559448, + "learning_rate": 6.216061301635633e-05, + "loss": 1.9502, + "step": 9940 + }, + { + "epoch": 0.21406676037732653, + "grad_norm": 0.30762261152267456, + "learning_rate": 6.207027315576635e-05, + "loss": 1.9565, + "step": 9950 + }, + { + "epoch": 0.21460870154283876, + "grad_norm": 0.31394830346107483, + "learning_rate": 6.197990406937757e-05, + "loss": 1.9466, + "step": 9960 + }, + { + "epoch": 0.21515064270835096, + "grad_norm": 0.3644161820411682, + "learning_rate": 6.188950613074122e-05, + "loss": 1.954, + "step": 9970 + }, + { + "epoch": 0.2156925838738632, + "grad_norm": 0.34890973567962646, + "learning_rate": 6.179907971352766e-05, + "loss": 1.936, + "step": 9980 + }, + { + "epoch": 0.2162345250393754, + "grad_norm": 0.4451602101325989, + "learning_rate": 6.170862519152505e-05, + "loss": 1.9447, + "step": 9990 + }, + { + "epoch": 0.21677646620488764, + "grad_norm": 0.33436113595962524, + "learning_rate": 6.161814293863772e-05, + "loss": 1.9497, + "step": 10000 + }, + { + "epoch": 0.21704743678764374, + "eval_loss": 2.6255741119384766, + "eval_runtime": 21.9675, + "eval_samples_per_second": 227.609, + "eval_steps_per_second": 1.229, + "step": 10005 + }, + { + "epoch": 0.21731840737039984, + "grad_norm": 0.6699984669685364, + "learning_rate": 6.152763332888464e-05, + "loss": 1.9456, + "step": 10010 + }, + { + "epoch": 0.21786034853591207, + "grad_norm": 0.3605382740497589, + "learning_rate": 6.143709673639778e-05, + "loss": 1.9341, + "step": 10020 + }, + { + "epoch": 0.21840228970142428, + "grad_norm": 0.35184648633003235, + "learning_rate": 6.134653353542074e-05, + "loss": 1.9538, + "step": 10030 + }, + { + "epoch": 0.2189442308669365, + "grad_norm": 0.3720100522041321, + "learning_rate": 6.125594410030706e-05, + "loss": 1.9441, + "step": 10040 + }, + { + "epoch": 0.21948617203244872, + "grad_norm": 0.5039480924606323, + "learning_rate": 6.116532880551876e-05, + "loss": 1.9407, + "step": 10050 + }, + { + "epoch": 0.22002811319796095, + "grad_norm": 0.3840113878250122, + "learning_rate": 6.107468802562472e-05, + "loss": 1.9444, + "step": 10060 + }, + { + "epoch": 0.22057005436347316, + "grad_norm": 0.5373072028160095, + "learning_rate": 6.098402213529916e-05, + "loss": 1.948, + "step": 10070 + }, + { + "epoch": 0.2211119955289854, + "grad_norm": 0.2886420786380768, + "learning_rate": 6.089333150932014e-05, + "loss": 1.9425, + "step": 10080 + }, + { + "epoch": 0.2216539366944976, + "grad_norm": 0.2997380495071411, + "learning_rate": 6.0802616522567914e-05, + "loss": 1.9477, + "step": 10090 + }, + { + "epoch": 0.22176232492760004, + "eval_loss": 2.6249098777770996, + "eval_runtime": 21.9708, + "eval_samples_per_second": 227.574, + "eval_steps_per_second": 1.229, + "step": 10092 + }, + { + "epoch": 0.22219587786000983, + "grad_norm": 0.3256387412548065, + "learning_rate": 6.0711877550023474e-05, + "loss": 1.9371, + "step": 10100 + }, + { + "epoch": 0.22273781902552203, + "grad_norm": 0.37883371114730835, + "learning_rate": 6.062111496676694e-05, + "loss": 1.9464, + "step": 10110 + }, + { + "epoch": 0.22327976019103427, + "grad_norm": 0.42293113470077515, + "learning_rate": 6.053032914797605e-05, + "loss": 1.9389, + "step": 10120 + }, + { + "epoch": 0.22382170135654647, + "grad_norm": 0.9902483820915222, + "learning_rate": 6.043952046892457e-05, + "loss": 1.9459, + "step": 10130 + }, + { + "epoch": 0.2243636425220587, + "grad_norm": 0.28801223635673523, + "learning_rate": 6.034868930498076e-05, + "loss": 1.9296, + "step": 10140 + }, + { + "epoch": 0.2249055836875709, + "grad_norm": 0.4709838926792145, + "learning_rate": 6.025783603160583e-05, + "loss": 1.9471, + "step": 10150 + }, + { + "epoch": 0.22544752485308314, + "grad_norm": 0.5633691549301147, + "learning_rate": 6.016696102435241e-05, + "loss": 1.9371, + "step": 10160 + }, + { + "epoch": 0.22598946601859535, + "grad_norm": 0.6367583870887756, + "learning_rate": 6.0076064658862884e-05, + "loss": 1.9431, + "step": 10170 + }, + { + "epoch": 0.22647721306755636, + "eval_loss": 2.6244375705718994, + "eval_runtime": 21.9646, + "eval_samples_per_second": 227.639, + "eval_steps_per_second": 1.229, + "step": 10179 + }, + { + "epoch": 0.22653140718410758, + "grad_norm": 0.6641575694084167, + "learning_rate": 5.998514731086805e-05, + "loss": 1.9408, + "step": 10180 + }, + { + "epoch": 0.22707334834961979, + "grad_norm": 0.45785051584243774, + "learning_rate": 5.9894209356185314e-05, + "loss": 1.9407, + "step": 10190 + }, + { + "epoch": 0.22761528951513202, + "grad_norm": 0.473785936832428, + "learning_rate": 5.980325117071736e-05, + "loss": 1.9439, + "step": 10200 + }, + { + "epoch": 0.22815723068064422, + "grad_norm": 0.6695595979690552, + "learning_rate": 5.971227313045043e-05, + "loss": 1.945, + "step": 10210 + }, + { + "epoch": 0.22869917184615646, + "grad_norm": 0.502714991569519, + "learning_rate": 5.9621275611452874e-05, + "loss": 1.9456, + "step": 10220 + }, + { + "epoch": 0.22924111301166866, + "grad_norm": 0.6983989477157593, + "learning_rate": 5.9530258989873555e-05, + "loss": 1.9517, + "step": 10230 + }, + { + "epoch": 0.2297830541771809, + "grad_norm": 0.3308335840702057, + "learning_rate": 5.943922364194029e-05, + "loss": 1.9387, + "step": 10240 + }, + { + "epoch": 0.2303249953426931, + "grad_norm": 0.2856954038143158, + "learning_rate": 5.93481699439583e-05, + "loss": 1.9409, + "step": 10250 + }, + { + "epoch": 0.23086693650820533, + "grad_norm": 0.3123374879360199, + "learning_rate": 5.925709827230868e-05, + "loss": 1.933, + "step": 10260 + }, + { + "epoch": 0.23119210120751266, + "eval_loss": 2.6269350051879883, + "eval_runtime": 21.9652, + "eval_samples_per_second": 227.633, + "eval_steps_per_second": 1.229, + "step": 10266 + }, + { + "epoch": 0.23140887767371754, + "grad_norm": 0.3088303506374359, + "learning_rate": 5.91660090034468e-05, + "loss": 1.9344, + "step": 10270 + }, + { + "epoch": 0.23195081883922977, + "grad_norm": 0.3397095501422882, + "learning_rate": 5.907490251390079e-05, + "loss": 1.9421, + "step": 10280 + }, + { + "epoch": 0.23249276000474198, + "grad_norm": 0.40521690249443054, + "learning_rate": 5.898377918026993e-05, + "loss": 1.9369, + "step": 10290 + }, + { + "epoch": 0.2330347011702542, + "grad_norm": 0.3461083769798279, + "learning_rate": 5.889263937922315e-05, + "loss": 1.937, + "step": 10300 + }, + { + "epoch": 0.23357664233576642, + "grad_norm": 0.47172901034355164, + "learning_rate": 5.8801483487497476e-05, + "loss": 1.9437, + "step": 10310 + }, + { + "epoch": 0.23411858350127865, + "grad_norm": 0.4379764795303345, + "learning_rate": 5.87103118818964e-05, + "loss": 1.92, + "step": 10320 + }, + { + "epoch": 0.23466052466679085, + "grad_norm": 0.7908567190170288, + "learning_rate": 5.861912493928837e-05, + "loss": 1.9329, + "step": 10330 + }, + { + "epoch": 0.2352024658323031, + "grad_norm": 0.5137506723403931, + "learning_rate": 5.852792303660528e-05, + "loss": 1.9414, + "step": 10340 + }, + { + "epoch": 0.2357444069978153, + "grad_norm": 0.31868940591812134, + "learning_rate": 5.8436706550840805e-05, + "loss": 1.9387, + "step": 10350 + }, + { + "epoch": 0.23590698934746895, + "eval_loss": 2.6262524127960205, + "eval_runtime": 21.9674, + "eval_samples_per_second": 227.61, + "eval_steps_per_second": 1.229, + "step": 10353 + }, + { + "epoch": 0.23628634816332753, + "grad_norm": 0.3951703608036041, + "learning_rate": 5.834547585904898e-05, + "loss": 1.9313, + "step": 10360 + }, + { + "epoch": 0.23682828932883973, + "grad_norm": 0.3555956780910492, + "learning_rate": 5.8254231338342446e-05, + "loss": 1.9298, + "step": 10370 + }, + { + "epoch": 0.23737023049435196, + "grad_norm": 0.3569830358028412, + "learning_rate": 5.8162973365891106e-05, + "loss": 1.9423, + "step": 10380 + }, + { + "epoch": 0.23791217165986417, + "grad_norm": 0.34016862511634827, + "learning_rate": 5.807170231892042e-05, + "loss": 1.9239, + "step": 10390 + }, + { + "epoch": 0.2384541128253764, + "grad_norm": 0.32383885979652405, + "learning_rate": 5.7980418574709924e-05, + "loss": 1.9378, + "step": 10400 + }, + { + "epoch": 0.2389960539908886, + "grad_norm": 0.42014604806900024, + "learning_rate": 5.788912251059162e-05, + "loss": 1.9331, + "step": 10410 + }, + { + "epoch": 0.23953799515640084, + "grad_norm": 0.3834390640258789, + "learning_rate": 5.7797814503948414e-05, + "loss": 1.9344, + "step": 10420 + }, + { + "epoch": 0.24007993632191305, + "grad_norm": 0.33072102069854736, + "learning_rate": 5.770649493221262e-05, + "loss": 1.9281, + "step": 10430 + }, + { + "epoch": 0.24062187748742528, + "grad_norm": 0.41425782442092896, + "learning_rate": 5.7615164172864346e-05, + "loss": 1.9247, + "step": 10440 + }, + { + "epoch": 0.24062187748742528, + "eval_loss": 2.6209261417388916, + "eval_runtime": 21.9653, + "eval_samples_per_second": 227.632, + "eval_steps_per_second": 1.229, + "step": 10440 + }, + { + "epoch": 0.24116381865293748, + "grad_norm": 0.4126664400100708, + "learning_rate": 5.7523822603429924e-05, + "loss": 1.9254, + "step": 10450 + }, + { + "epoch": 0.24170575981844972, + "grad_norm": 0.4077990651130676, + "learning_rate": 5.7432470601480394e-05, + "loss": 1.9298, + "step": 10460 + }, + { + "epoch": 0.24224770098396192, + "grad_norm": 0.33790072798728943, + "learning_rate": 5.7341108544629894e-05, + "loss": 1.9214, + "step": 10470 + }, + { + "epoch": 0.24278964214947416, + "grad_norm": 0.5326333045959473, + "learning_rate": 5.724973681053417e-05, + "loss": 1.9277, + "step": 10480 + }, + { + "epoch": 0.24333158331498636, + "grad_norm": 0.8307605981826782, + "learning_rate": 5.715835577688894e-05, + "loss": 1.9399, + "step": 10490 + }, + { + "epoch": 0.2438735244804986, + "grad_norm": 0.29457995295524597, + "learning_rate": 5.706696582142834e-05, + "loss": 1.9274, + "step": 10500 + }, + { + "epoch": 0.2444154656460108, + "grad_norm": 0.47820374369621277, + "learning_rate": 5.697556732192343e-05, + "loss": 1.9332, + "step": 10510 + }, + { + "epoch": 0.24495740681152303, + "grad_norm": 0.615271270275116, + "learning_rate": 5.688416065618057e-05, + "loss": 1.9358, + "step": 10520 + }, + { + "epoch": 0.24533676562738158, + "eval_loss": 2.623511552810669, + "eval_runtime": 21.9658, + "eval_samples_per_second": 227.627, + "eval_steps_per_second": 1.229, + "step": 10527 + }, + { + "epoch": 0.24549934797703524, + "grad_norm": 0.3703593909740448, + "learning_rate": 5.679274620203986e-05, + "loss": 1.9345, + "step": 10530 + }, + { + "epoch": 0.24604128914254747, + "grad_norm": 0.3069552779197693, + "learning_rate": 5.670132433737363e-05, + "loss": 1.9229, + "step": 10540 + }, + { + "epoch": 0.24658323030805968, + "grad_norm": 0.30190664529800415, + "learning_rate": 5.660989544008479e-05, + "loss": 1.9288, + "step": 10550 + }, + { + "epoch": 0.2471251714735719, + "grad_norm": 0.5308563113212585, + "learning_rate": 5.651845988810538e-05, + "loss": 1.9246, + "step": 10560 + }, + { + "epoch": 0.2476671126390841, + "grad_norm": 0.33339574933052063, + "learning_rate": 5.642701805939491e-05, + "loss": 1.9239, + "step": 10570 + }, + { + "epoch": 0.24820905380459635, + "grad_norm": 0.6166539788246155, + "learning_rate": 5.633557033193884e-05, + "loss": 1.9311, + "step": 10580 + }, + { + "epoch": 0.24875099497010855, + "grad_norm": 0.3371501863002777, + "learning_rate": 5.624411708374703e-05, + "loss": 1.9148, + "step": 10590 + }, + { + "epoch": 0.24929293613562079, + "grad_norm": 0.5741251111030579, + "learning_rate": 5.615265869285212e-05, + "loss": 1.936, + "step": 10600 + }, + { + "epoch": 0.249834877301133, + "grad_norm": 0.8390110731124878, + "learning_rate": 5.606119553730808e-05, + "loss": 1.9235, + "step": 10610 + }, + { + "epoch": 0.2500516537673379, + "eval_loss": 2.6314127445220947, + "eval_runtime": 21.9728, + "eval_samples_per_second": 227.554, + "eval_steps_per_second": 1.229, + "step": 10614 + }, + { + "epoch": 0.2503768184666452, + "grad_norm": 0.31572863459587097, + "learning_rate": 5.596972799518849e-05, + "loss": 1.9226, + "step": 10620 + }, + { + "epoch": 0.25091875963215743, + "grad_norm": 0.3893384635448456, + "learning_rate": 5.5878256444585134e-05, + "loss": 1.9379, + "step": 10630 + }, + { + "epoch": 0.25146070079766963, + "grad_norm": 0.4373924136161804, + "learning_rate": 5.578678126360632e-05, + "loss": 1.9296, + "step": 10640 + }, + { + "epoch": 0.2520026419631819, + "grad_norm": 0.2836342751979828, + "learning_rate": 5.5695302830375374e-05, + "loss": 1.9231, + "step": 10650 + }, + { + "epoch": 0.2525445831286941, + "grad_norm": 0.3092157244682312, + "learning_rate": 5.5603821523029084e-05, + "loss": 1.9191, + "step": 10660 + }, + { + "epoch": 0.2530865242942063, + "grad_norm": 0.6483246088027954, + "learning_rate": 5.551233771971611e-05, + "loss": 1.9202, + "step": 10670 + }, + { + "epoch": 0.2536284654597185, + "grad_norm": 0.34343624114990234, + "learning_rate": 5.54208517985954e-05, + "loss": 1.9297, + "step": 10680 + }, + { + "epoch": 0.25417040662523077, + "grad_norm": 0.29993903636932373, + "learning_rate": 5.532936413783469e-05, + "loss": 1.9228, + "step": 10690 + }, + { + "epoch": 0.254712347790743, + "grad_norm": 0.4231652319431305, + "learning_rate": 5.5237875115608905e-05, + "loss": 1.94, + "step": 10700 + }, + { + "epoch": 0.2547665419072942, + "eval_loss": 2.6222281455993652, + "eval_runtime": 21.972, + "eval_samples_per_second": 227.563, + "eval_steps_per_second": 1.229, + "step": 10701 + }, + { + "epoch": 0.2552542889562552, + "grad_norm": 0.31495311856269836, + "learning_rate": 5.51463851100986e-05, + "loss": 1.9252, + "step": 10710 + }, + { + "epoch": 0.2557962301217674, + "grad_norm": 0.5191118717193604, + "learning_rate": 5.5054894499488353e-05, + "loss": 1.9118, + "step": 10720 + }, + { + "epoch": 0.25633817128727965, + "grad_norm": 0.6403262615203857, + "learning_rate": 5.496340366196527e-05, + "loss": 1.9308, + "step": 10730 + }, + { + "epoch": 0.25688011245279185, + "grad_norm": 0.29355281591415405, + "learning_rate": 5.4871912975717444e-05, + "loss": 1.925, + "step": 10740 + }, + { + "epoch": 0.25742205361830406, + "grad_norm": 0.4204306900501251, + "learning_rate": 5.478042281893225e-05, + "loss": 1.92, + "step": 10750 + }, + { + "epoch": 0.25796399478381626, + "grad_norm": 0.5830504894256592, + "learning_rate": 5.468893356979498e-05, + "loss": 1.9133, + "step": 10760 + }, + { + "epoch": 0.2585059359493285, + "grad_norm": 0.2985590994358063, + "learning_rate": 5.459744560648707e-05, + "loss": 1.9221, + "step": 10770 + }, + { + "epoch": 0.25904787711484073, + "grad_norm": 0.29512789845466614, + "learning_rate": 5.4505959307184675e-05, + "loss": 1.9203, + "step": 10780 + }, + { + "epoch": 0.2594814300472505, + "eval_loss": 2.625079870223999, + "eval_runtime": 21.9676, + "eval_samples_per_second": 227.608, + "eval_steps_per_second": 1.229, + "step": 10788 + }, + { + "epoch": 0.25958981828035294, + "grad_norm": 0.3882138133049011, + "learning_rate": 5.441447505005714e-05, + "loss": 1.9178, + "step": 10790 + }, + { + "epoch": 0.26013175944586514, + "grad_norm": 0.5985729098320007, + "learning_rate": 5.432299321326526e-05, + "loss": 1.9214, + "step": 10800 + }, + { + "epoch": 0.2606737006113774, + "grad_norm": 0.5359235405921936, + "learning_rate": 5.423151417495991e-05, + "loss": 1.9182, + "step": 10810 + }, + { + "epoch": 0.2612156417768896, + "grad_norm": 0.2866165041923523, + "learning_rate": 5.4140038313280364e-05, + "loss": 1.9205, + "step": 10820 + }, + { + "epoch": 0.2617575829424018, + "grad_norm": 0.7025465965270996, + "learning_rate": 5.404856600635273e-05, + "loss": 1.9221, + "step": 10830 + }, + { + "epoch": 0.262299524107914, + "grad_norm": 0.48262158036231995, + "learning_rate": 5.39570976322885e-05, + "loss": 1.9164, + "step": 10840 + }, + { + "epoch": 0.2628414652734263, + "grad_norm": 0.331333190202713, + "learning_rate": 5.386563356918286e-05, + "loss": 1.9203, + "step": 10850 + }, + { + "epoch": 0.2633834064389385, + "grad_norm": 0.559256374835968, + "learning_rate": 5.3774174195113145e-05, + "loss": 1.919, + "step": 10860 + }, + { + "epoch": 0.2639253476044507, + "grad_norm": 0.4558927118778229, + "learning_rate": 5.368271988813741e-05, + "loss": 1.9186, + "step": 10870 + }, + { + "epoch": 0.2641963181872068, + "eval_loss": 2.6204073429107666, + "eval_runtime": 21.9646, + "eval_samples_per_second": 227.639, + "eval_steps_per_second": 1.229, + "step": 10875 + }, + { + "epoch": 0.2644672887699629, + "grad_norm": 0.32578545808792114, + "learning_rate": 5.3591271026292645e-05, + "loss": 1.9178, + "step": 10880 + }, + { + "epoch": 0.26500922993547515, + "grad_norm": 0.5844838619232178, + "learning_rate": 5.349982798759341e-05, + "loss": 1.9306, + "step": 10890 + }, + { + "epoch": 0.26555117110098736, + "grad_norm": 0.37786585092544556, + "learning_rate": 5.340839115003019e-05, + "loss": 1.9236, + "step": 10900 + }, + { + "epoch": 0.26609311226649957, + "grad_norm": 0.33021968603134155, + "learning_rate": 5.331696089156776e-05, + "loss": 1.9097, + "step": 10910 + }, + { + "epoch": 0.26663505343201177, + "grad_norm": 0.3873339891433716, + "learning_rate": 5.322553759014383e-05, + "loss": 1.9097, + "step": 10920 + }, + { + "epoch": 0.26717699459752403, + "grad_norm": 0.32818225026130676, + "learning_rate": 5.313412162366723e-05, + "loss": 1.9136, + "step": 10930 + }, + { + "epoch": 0.26771893576303624, + "grad_norm": 0.6563804149627686, + "learning_rate": 5.304271337001652e-05, + "loss": 1.9173, + "step": 10940 + }, + { + "epoch": 0.26826087692854844, + "grad_norm": 0.3008669912815094, + "learning_rate": 5.295131320703841e-05, + "loss": 1.9104, + "step": 10950 + }, + { + "epoch": 0.26880281809406065, + "grad_norm": 0.3501867353916168, + "learning_rate": 5.2859921512546104e-05, + "loss": 1.9232, + "step": 10960 + }, + { + "epoch": 0.2689112063271631, + "eval_loss": 2.611741542816162, + "eval_runtime": 21.9709, + "eval_samples_per_second": 227.574, + "eval_steps_per_second": 1.229, + "step": 10962 + }, + { + "epoch": 0.2693447592595729, + "grad_norm": 0.2955268621444702, + "learning_rate": 5.276853866431787e-05, + "loss": 1.906, + "step": 10970 + }, + { + "epoch": 0.2698867004250851, + "grad_norm": 0.44501546025276184, + "learning_rate": 5.267716504009533e-05, + "loss": 1.9085, + "step": 10980 + }, + { + "epoch": 0.2704286415905973, + "grad_norm": 0.3195658326148987, + "learning_rate": 5.258580101758203e-05, + "loss": 1.9186, + "step": 10990 + }, + { + "epoch": 0.2709705827561095, + "grad_norm": 0.5696297883987427, + "learning_rate": 5.2494446974441837e-05, + "loss": 1.9061, + "step": 11000 + }, + { + "epoch": 0.2715125239216218, + "grad_norm": 0.29219719767570496, + "learning_rate": 5.2403103288297314e-05, + "loss": 1.9114, + "step": 11010 + }, + { + "epoch": 0.272054465087134, + "grad_norm": 0.3206324279308319, + "learning_rate": 5.231177033672824e-05, + "loss": 1.9041, + "step": 11020 + }, + { + "epoch": 0.2725964062526462, + "grad_norm": 0.33854442834854126, + "learning_rate": 5.222044849727005e-05, + "loss": 1.9057, + "step": 11030 + }, + { + "epoch": 0.2731383474181584, + "grad_norm": 0.2936360538005829, + "learning_rate": 5.212913814741219e-05, + "loss": 1.918, + "step": 11040 + }, + { + "epoch": 0.27362609446711944, + "eval_loss": 2.61767840385437, + "eval_runtime": 21.6394, + "eval_samples_per_second": 231.06, + "eval_steps_per_second": 1.248, + "step": 11049 + }, + { + "epoch": 0.27368028858367066, + "grad_norm": 0.34562787413597107, + "learning_rate": 5.203783966459665e-05, + "loss": 1.9068, + "step": 11050 + }, + { + "epoch": 0.27422222974918287, + "grad_norm": 0.5706035494804382, + "learning_rate": 5.1946553426216394e-05, + "loss": 1.9123, + "step": 11060 + }, + { + "epoch": 0.27476417091469507, + "grad_norm": 0.5709580183029175, + "learning_rate": 5.1855279809613675e-05, + "loss": 1.9214, + "step": 11070 + }, + { + "epoch": 0.2753061120802073, + "grad_norm": 0.3960080146789551, + "learning_rate": 5.1764019192078686e-05, + "loss": 1.913, + "step": 11080 + }, + { + "epoch": 0.27584805324571954, + "grad_norm": 0.4222663342952728, + "learning_rate": 5.16727719508478e-05, + "loss": 1.9182, + "step": 11090 + }, + { + "epoch": 0.27638999441123174, + "grad_norm": 0.6161875128746033, + "learning_rate": 5.158153846310214e-05, + "loss": 1.9089, + "step": 11100 + }, + { + "epoch": 0.27693193557674395, + "grad_norm": 0.5677464008331299, + "learning_rate": 5.149031910596599e-05, + "loss": 1.8998, + "step": 11110 + }, + { + "epoch": 0.27747387674225615, + "grad_norm": 0.3052445352077484, + "learning_rate": 5.139911425650518e-05, + "loss": 1.9103, + "step": 11120 + }, + { + "epoch": 0.2780158179077684, + "grad_norm": 0.2828851044178009, + "learning_rate": 5.1307924291725583e-05, + "loss": 1.907, + "step": 11130 + }, + { + "epoch": 0.27834098260707574, + "eval_loss": 2.613119125366211, + "eval_runtime": 21.9757, + "eval_samples_per_second": 227.524, + "eval_steps_per_second": 1.229, + "step": 11136 + }, + { + "epoch": 0.2785577590732806, + "grad_norm": 0.33760419487953186, + "learning_rate": 5.121674958857159e-05, + "loss": 1.9194, + "step": 11140 + }, + { + "epoch": 0.2790997002387928, + "grad_norm": 0.3404898941516876, + "learning_rate": 5.112559052392444e-05, + "loss": 1.9043, + "step": 11150 + }, + { + "epoch": 0.27964164140430503, + "grad_norm": 0.3572676181793213, + "learning_rate": 5.103444747460079e-05, + "loss": 1.9091, + "step": 11160 + }, + { + "epoch": 0.2801835825698173, + "grad_norm": 0.3610747456550598, + "learning_rate": 5.0943320817351034e-05, + "loss": 1.9049, + "step": 11170 + }, + { + "epoch": 0.2807255237353295, + "grad_norm": 0.46232640743255615, + "learning_rate": 5.085221092885785e-05, + "loss": 1.9233, + "step": 11180 + }, + { + "epoch": 0.2812674649008417, + "grad_norm": 0.27908554673194885, + "learning_rate": 5.076111818573459e-05, + "loss": 1.9064, + "step": 11190 + }, + { + "epoch": 0.2818094060663539, + "grad_norm": 0.6692981719970703, + "learning_rate": 5.0670042964523745e-05, + "loss": 1.9067, + "step": 11200 + }, + { + "epoch": 0.28235134723186617, + "grad_norm": 0.3264489769935608, + "learning_rate": 5.057898564169534e-05, + "loss": 1.8999, + "step": 11210 + }, + { + "epoch": 0.2828932883973784, + "grad_norm": 0.5410310626029968, + "learning_rate": 5.048794659364546e-05, + "loss": 1.9098, + "step": 11220 + }, + { + "epoch": 0.28305587074703203, + "eval_loss": 2.6135261058807373, + "eval_runtime": 21.9689, + "eval_samples_per_second": 227.594, + "eval_steps_per_second": 1.229, + "step": 11223 + }, + { + "epoch": 0.2834352295628906, + "grad_norm": 0.6051361560821533, + "learning_rate": 5.0396926196694626e-05, + "loss": 1.9124, + "step": 11230 + }, + { + "epoch": 0.2839771707284028, + "grad_norm": 0.5583413243293762, + "learning_rate": 5.030592482708626e-05, + "loss": 1.9025, + "step": 11240 + }, + { + "epoch": 0.28451911189391504, + "grad_norm": 0.6230108737945557, + "learning_rate": 5.021494286098514e-05, + "loss": 1.9113, + "step": 11250 + }, + { + "epoch": 0.28506105305942725, + "grad_norm": 0.538650631904602, + "learning_rate": 5.0123980674475824e-05, + "loss": 1.9034, + "step": 11260 + }, + { + "epoch": 0.28560299422493945, + "grad_norm": 0.6079558730125427, + "learning_rate": 5.003303864356115e-05, + "loss": 1.9154, + "step": 11270 + }, + { + "epoch": 0.28614493539045166, + "grad_norm": 0.5191447138786316, + "learning_rate": 4.994211714416058e-05, + "loss": 1.8997, + "step": 11280 + }, + { + "epoch": 0.28668687655596387, + "grad_norm": 0.2797117233276367, + "learning_rate": 4.9851216552108745e-05, + "loss": 1.8978, + "step": 11290 + }, + { + "epoch": 0.2872288177214761, + "grad_norm": 0.29481279850006104, + "learning_rate": 4.976033724315385e-05, + "loss": 1.9152, + "step": 11300 + }, + { + "epoch": 0.28777075888698833, + "grad_norm": 0.390680193901062, + "learning_rate": 4.966947959295612e-05, + "loss": 1.9112, + "step": 11310 + }, + { + "epoch": 0.28777075888698833, + "eval_loss": 2.617300033569336, + "eval_runtime": 21.9665, + "eval_samples_per_second": 227.62, + "eval_steps_per_second": 1.229, + "step": 11310 + }, + { + "epoch": 0.28831270005250054, + "grad_norm": 0.6555575132369995, + "learning_rate": 4.957864397708625e-05, + "loss": 1.8965, + "step": 11320 + }, + { + "epoch": 0.28885464121801274, + "grad_norm": 0.3799804449081421, + "learning_rate": 4.948783077102385e-05, + "loss": 1.9039, + "step": 11330 + }, + { + "epoch": 0.289396582383525, + "grad_norm": 0.2934130132198334, + "learning_rate": 4.9397040350155876e-05, + "loss": 1.9078, + "step": 11340 + }, + { + "epoch": 0.2899385235490372, + "grad_norm": 0.2916678190231323, + "learning_rate": 4.930627308977517e-05, + "loss": 1.892, + "step": 11350 + }, + { + "epoch": 0.2904804647145494, + "grad_norm": 0.30668988823890686, + "learning_rate": 4.921552936507876e-05, + "loss": 1.9098, + "step": 11360 + }, + { + "epoch": 0.2910224058800616, + "grad_norm": 0.4994325637817383, + "learning_rate": 4.912480955116642e-05, + "loss": 1.9045, + "step": 11370 + }, + { + "epoch": 0.2915643470455739, + "grad_norm": 1.1822820901870728, + "learning_rate": 4.9034114023039076e-05, + "loss": 1.8957, + "step": 11380 + }, + { + "epoch": 0.2921062882110861, + "grad_norm": 0.3069334626197815, + "learning_rate": 4.894344315559729e-05, + "loss": 1.9096, + "step": 11390 + }, + { + "epoch": 0.29248564702694463, + "eval_loss": 2.6101443767547607, + "eval_runtime": 21.9727, + "eval_samples_per_second": 227.556, + "eval_steps_per_second": 1.229, + "step": 11397 + }, + { + "epoch": 0.2926482293765983, + "grad_norm": 0.3290071487426758, + "learning_rate": 4.885279732363967e-05, + "loss": 1.9007, + "step": 11400 + }, + { + "epoch": 0.2931901705421105, + "grad_norm": 0.27592140436172485, + "learning_rate": 4.876217690186131e-05, + "loss": 1.8886, + "step": 11410 + }, + { + "epoch": 0.29373211170762276, + "grad_norm": 0.5586705207824707, + "learning_rate": 4.867158226485231e-05, + "loss": 1.8874, + "step": 11420 + }, + { + "epoch": 0.29427405287313496, + "grad_norm": 0.5295722484588623, + "learning_rate": 4.858101378709616e-05, + "loss": 1.8926, + "step": 11430 + }, + { + "epoch": 0.29481599403864717, + "grad_norm": 0.3275451064109802, + "learning_rate": 4.8490471842968267e-05, + "loss": 1.908, + "step": 11440 + }, + { + "epoch": 0.29535793520415937, + "grad_norm": 0.2957296371459961, + "learning_rate": 4.8399956806734234e-05, + "loss": 1.899, + "step": 11450 + }, + { + "epoch": 0.29589987636967163, + "grad_norm": 0.30543988943099976, + "learning_rate": 4.830946905254861e-05, + "loss": 1.9042, + "step": 11460 + }, + { + "epoch": 0.29644181753518384, + "grad_norm": 0.5068356394767761, + "learning_rate": 4.821900895445302e-05, + "loss": 1.8957, + "step": 11470 + }, + { + "epoch": 0.29698375870069604, + "grad_norm": 0.3116234838962555, + "learning_rate": 4.812857688637486e-05, + "loss": 1.9, + "step": 11480 + }, + { + "epoch": 0.2972005351669009, + "eval_loss": 2.6111230850219727, + "eval_runtime": 21.9729, + "eval_samples_per_second": 227.553, + "eval_steps_per_second": 1.229, + "step": 11484 + }, + { + "epoch": 0.29752569986620825, + "grad_norm": 0.31971922516822815, + "learning_rate": 4.8038173222125645e-05, + "loss": 1.9074, + "step": 11490 + }, + { + "epoch": 0.2980676410317205, + "grad_norm": 0.3503149747848511, + "learning_rate": 4.7947798335399416e-05, + "loss": 1.899, + "step": 11500 + }, + { + "epoch": 0.2986095821972327, + "grad_norm": 0.30250006914138794, + "learning_rate": 4.7857452599771354e-05, + "loss": 1.9063, + "step": 11510 + }, + { + "epoch": 0.2991515233627449, + "grad_norm": 0.7795620560646057, + "learning_rate": 4.77671363886961e-05, + "loss": 1.9005, + "step": 11520 + }, + { + "epoch": 0.2996934645282571, + "grad_norm": 0.5038977265357971, + "learning_rate": 4.7676850075506185e-05, + "loss": 1.8991, + "step": 11530 + }, + { + "epoch": 0.3002354056937694, + "grad_norm": 0.4416648745536804, + "learning_rate": 4.758659403341069e-05, + "loss": 1.893, + "step": 11540 + }, + { + "epoch": 0.3007773468592816, + "grad_norm": 0.3159092366695404, + "learning_rate": 4.749636863549346e-05, + "loss": 1.9063, + "step": 11550 + }, + { + "epoch": 0.3013192880247938, + "grad_norm": 0.33458906412124634, + "learning_rate": 4.740617425471168e-05, + "loss": 1.9001, + "step": 11560 + }, + { + "epoch": 0.301861229190306, + "grad_norm": 0.5140174627304077, + "learning_rate": 4.731601126389438e-05, + "loss": 1.899, + "step": 11570 + }, + { + "epoch": 0.3019154233068572, + "eval_loss": 2.6166553497314453, + "eval_runtime": 21.9725, + "eval_samples_per_second": 227.558, + "eval_steps_per_second": 1.229, + "step": 11571 + }, + { + "epoch": 0.30240317035581826, + "grad_norm": 0.570547878742218, + "learning_rate": 4.722588003574077e-05, + "loss": 1.895, + "step": 11580 + }, + { + "epoch": 0.30294511152133047, + "grad_norm": 0.44531741738319397, + "learning_rate": 4.7135780942818817e-05, + "loss": 1.9075, + "step": 11590 + }, + { + "epoch": 0.3034870526868427, + "grad_norm": 0.47997337579727173, + "learning_rate": 4.704571435756363e-05, + "loss": 1.8995, + "step": 11600 + }, + { + "epoch": 0.3040289938523549, + "grad_norm": 0.47242090106010437, + "learning_rate": 4.6955680652275916e-05, + "loss": 1.8903, + "step": 11610 + }, + { + "epoch": 0.30457093501786714, + "grad_norm": 0.28598716855049133, + "learning_rate": 4.6865680199120545e-05, + "loss": 1.8922, + "step": 11620 + }, + { + "epoch": 0.30511287618337934, + "grad_norm": 0.44576406478881836, + "learning_rate": 4.677571337012484e-05, + "loss": 1.8952, + "step": 11630 + }, + { + "epoch": 0.30565481734889155, + "grad_norm": 0.3293391764163971, + "learning_rate": 4.668578053717721e-05, + "loss": 1.8935, + "step": 11640 + }, + { + "epoch": 0.30619675851440376, + "grad_norm": 0.31350746750831604, + "learning_rate": 4.65958820720255e-05, + "loss": 1.8906, + "step": 11650 + }, + { + "epoch": 0.3066303114468136, + "eval_loss": 2.620225191116333, + "eval_runtime": 21.9691, + "eval_samples_per_second": 227.592, + "eval_steps_per_second": 1.229, + "step": 11658 + }, + { + "epoch": 0.306738699679916, + "grad_norm": 0.3374342918395996, + "learning_rate": 4.650601834627549e-05, + "loss": 1.8892, + "step": 11660 + }, + { + "epoch": 0.3072806408454282, + "grad_norm": 0.365682452917099, + "learning_rate": 4.641618973138942e-05, + "loss": 1.892, + "step": 11670 + }, + { + "epoch": 0.3078225820109404, + "grad_norm": 0.3332476019859314, + "learning_rate": 4.6326396598684296e-05, + "loss": 1.8927, + "step": 11680 + }, + { + "epoch": 0.30836452317645263, + "grad_norm": 1.3207186460494995, + "learning_rate": 4.6236639319330524e-05, + "loss": 1.8873, + "step": 11690 + }, + { + "epoch": 0.3089064643419649, + "grad_norm": 0.3541896641254425, + "learning_rate": 4.614691826435028e-05, + "loss": 1.8908, + "step": 11700 + }, + { + "epoch": 0.3094484055074771, + "grad_norm": 0.486902117729187, + "learning_rate": 4.605723380461603e-05, + "loss": 1.8982, + "step": 11710 + }, + { + "epoch": 0.3099903466729893, + "grad_norm": 0.33585986495018005, + "learning_rate": 4.596758631084892e-05, + "loss": 1.901, + "step": 11720 + }, + { + "epoch": 0.3105322878385015, + "grad_norm": 0.37316057085990906, + "learning_rate": 4.587797615361735e-05, + "loss": 1.8892, + "step": 11730 + }, + { + "epoch": 0.31107422900401377, + "grad_norm": 0.4041992723941803, + "learning_rate": 4.578840370333534e-05, + "loss": 1.8928, + "step": 11740 + }, + { + "epoch": 0.31134519958676987, + "eval_loss": 2.6146914958953857, + "eval_runtime": 21.9662, + "eval_samples_per_second": 227.622, + "eval_steps_per_second": 1.229, + "step": 11745 + }, + { + "epoch": 0.311616170169526, + "grad_norm": 0.2976398169994354, + "learning_rate": 4.569886933026107e-05, + "loss": 1.8848, + "step": 11750 + }, + { + "epoch": 0.3121581113350382, + "grad_norm": 0.3247699737548828, + "learning_rate": 4.5609373404495316e-05, + "loss": 1.8998, + "step": 11760 + }, + { + "epoch": 0.3127000525005504, + "grad_norm": 0.29927968978881836, + "learning_rate": 4.55199162959799e-05, + "loss": 1.8883, + "step": 11770 + }, + { + "epoch": 0.31324199366606265, + "grad_norm": 0.301215797662735, + "learning_rate": 4.543049837449626e-05, + "loss": 1.8916, + "step": 11780 + }, + { + "epoch": 0.31378393483157485, + "grad_norm": 0.3488280773162842, + "learning_rate": 4.534112000966377e-05, + "loss": 1.9006, + "step": 11790 + }, + { + "epoch": 0.31432587599708706, + "grad_norm": 0.36820992827415466, + "learning_rate": 4.5251781570938324e-05, + "loss": 1.8888, + "step": 11800 + }, + { + "epoch": 0.31486781716259926, + "grad_norm": 0.33846035599708557, + "learning_rate": 4.51624834276108e-05, + "loss": 1.8845, + "step": 11810 + }, + { + "epoch": 0.3154097583281115, + "grad_norm": 0.3961305320262909, + "learning_rate": 4.5073225948805476e-05, + "loss": 1.8928, + "step": 11820 + }, + { + "epoch": 0.3159516994936237, + "grad_norm": 0.29425087571144104, + "learning_rate": 4.498400950347855e-05, + "loss": 1.8951, + "step": 11830 + }, + { + "epoch": 0.31606008772672617, + "eval_loss": 2.6105449199676514, + "eval_runtime": 22.294, + "eval_samples_per_second": 224.276, + "eval_steps_per_second": 1.211, + "step": 11832 + }, + { + "epoch": 0.31649364065913593, + "grad_norm": 0.32833293080329895, + "learning_rate": 4.4894834460416626e-05, + "loss": 1.8865, + "step": 11840 + }, + { + "epoch": 0.31703558182464814, + "grad_norm": 0.274554580450058, + "learning_rate": 4.480570118823511e-05, + "loss": 1.8974, + "step": 11850 + }, + { + "epoch": 0.3175775229901604, + "grad_norm": 0.3351888656616211, + "learning_rate": 4.471661005537682e-05, + "loss": 1.8834, + "step": 11860 + }, + { + "epoch": 0.3181194641556726, + "grad_norm": 0.6972358226776123, + "learning_rate": 4.462756143011031e-05, + "loss": 1.8822, + "step": 11870 + }, + { + "epoch": 0.3186614053211848, + "grad_norm": 0.36904025077819824, + "learning_rate": 4.453855568052847e-05, + "loss": 1.8972, + "step": 11880 + }, + { + "epoch": 0.319203346486697, + "grad_norm": 0.27459755539894104, + "learning_rate": 4.444959317454696e-05, + "loss": 1.8854, + "step": 11890 + }, + { + "epoch": 0.3197452876522093, + "grad_norm": 0.3487244248390198, + "learning_rate": 4.436067427990266e-05, + "loss": 1.8857, + "step": 11900 + }, + { + "epoch": 0.3202872288177215, + "grad_norm": 0.3203682005405426, + "learning_rate": 4.42717993641522e-05, + "loss": 1.8869, + "step": 11910 + }, + { + "epoch": 0.32077497586668247, + "eval_loss": 2.613398313522339, + "eval_runtime": 21.9703, + "eval_samples_per_second": 227.58, + "eval_steps_per_second": 1.229, + "step": 11919 + }, + { + "epoch": 0.3208291699832337, + "grad_norm": 0.30035504698753357, + "learning_rate": 4.418296879467041e-05, + "loss": 1.8942, + "step": 11920 + }, + { + "epoch": 0.3213711111487459, + "grad_norm": 0.38128897547721863, + "learning_rate": 4.409418293864881e-05, + "loss": 1.8833, + "step": 11930 + }, + { + "epoch": 0.32191305231425815, + "grad_norm": 0.31847649812698364, + "learning_rate": 4.400544216309409e-05, + "loss": 1.8946, + "step": 11940 + }, + { + "epoch": 0.32245499347977036, + "grad_norm": 0.6311419606208801, + "learning_rate": 4.3916746834826604e-05, + "loss": 1.887, + "step": 11950 + }, + { + "epoch": 0.32299693464528256, + "grad_norm": 0.5365369319915771, + "learning_rate": 4.3828097320478825e-05, + "loss": 1.881, + "step": 11960 + }, + { + "epoch": 0.32353887581079477, + "grad_norm": 0.3272137939929962, + "learning_rate": 4.3739493986493864e-05, + "loss": 1.8887, + "step": 11970 + }, + { + "epoch": 0.32408081697630703, + "grad_norm": 0.3870546519756317, + "learning_rate": 4.3650937199123934e-05, + "loss": 1.8752, + "step": 11980 + }, + { + "epoch": 0.32462275814181923, + "grad_norm": 0.47655603289604187, + "learning_rate": 4.356242732442887e-05, + "loss": 1.8959, + "step": 11990 + }, + { + "epoch": 0.32516469930733144, + "grad_norm": 0.5217764377593994, + "learning_rate": 4.3473964728274516e-05, + "loss": 1.8831, + "step": 12000 + }, + { + "epoch": 0.32548986400663876, + "eval_loss": 2.6064906120300293, + "eval_runtime": 21.97, + "eval_samples_per_second": 227.583, + "eval_steps_per_second": 1.229, + "step": 12006 + }, + { + "epoch": 0.32570664047284364, + "grad_norm": 0.4419178068637848, + "learning_rate": 4.338554977633138e-05, + "loss": 1.8843, + "step": 12010 + }, + { + "epoch": 0.3262485816383559, + "grad_norm": 0.38523489236831665, + "learning_rate": 4.3297182834072944e-05, + "loss": 1.8829, + "step": 12020 + }, + { + "epoch": 0.3267905228038681, + "grad_norm": 0.537399411201477, + "learning_rate": 4.3208864266774294e-05, + "loss": 1.8814, + "step": 12030 + }, + { + "epoch": 0.3273324639693803, + "grad_norm": 0.3097943365573883, + "learning_rate": 4.312059443951051e-05, + "loss": 1.8777, + "step": 12040 + }, + { + "epoch": 0.3278744051348925, + "grad_norm": 0.3149316608905792, + "learning_rate": 4.303237371715524e-05, + "loss": 1.8874, + "step": 12050 + }, + { + "epoch": 0.3284163463004048, + "grad_norm": 0.35441088676452637, + "learning_rate": 4.2944202464379125e-05, + "loss": 1.8776, + "step": 12060 + }, + { + "epoch": 0.328958287465917, + "grad_norm": 0.34289848804473877, + "learning_rate": 4.2856081045648285e-05, + "loss": 1.8946, + "step": 12070 + }, + { + "epoch": 0.3295002286314292, + "grad_norm": 0.3778747320175171, + "learning_rate": 4.276800982522293e-05, + "loss": 1.8779, + "step": 12080 + }, + { + "epoch": 0.3300421697969414, + "grad_norm": 0.4332473874092102, + "learning_rate": 4.26799891671557e-05, + "loss": 1.8827, + "step": 12090 + }, + { + "epoch": 0.33020475214659506, + "eval_loss": 2.6110384464263916, + "eval_runtime": 21.9727, + "eval_samples_per_second": 227.555, + "eval_steps_per_second": 1.229, + "step": 12093 + }, + { + "epoch": 0.33058411096245366, + "grad_norm": 0.334953248500824, + "learning_rate": 4.2592019435290266e-05, + "loss": 1.8724, + "step": 12100 + }, + { + "epoch": 0.33112605212796586, + "grad_norm": 0.36350664496421814, + "learning_rate": 4.2504100993259774e-05, + "loss": 1.8744, + "step": 12110 + }, + { + "epoch": 0.33166799329347807, + "grad_norm": 0.44730937480926514, + "learning_rate": 4.241623420448533e-05, + "loss": 1.8722, + "step": 12120 + }, + { + "epoch": 0.3322099344589903, + "grad_norm": 0.6003024578094482, + "learning_rate": 4.2328419432174605e-05, + "loss": 1.8838, + "step": 12130 + }, + { + "epoch": 0.33275187562450254, + "grad_norm": 0.27411770820617676, + "learning_rate": 4.224065703932016e-05, + "loss": 1.8716, + "step": 12140 + }, + { + "epoch": 0.33329381679001474, + "grad_norm": 0.3935556411743164, + "learning_rate": 4.215294738869808e-05, + "loss": 1.8874, + "step": 12150 + }, + { + "epoch": 0.33383575795552695, + "grad_norm": 0.32209470868110657, + "learning_rate": 4.206529084286649e-05, + "loss": 1.877, + "step": 12160 + }, + { + "epoch": 0.33437769912103915, + "grad_norm": 0.5299956202507019, + "learning_rate": 4.197768776416387e-05, + "loss": 1.8806, + "step": 12170 + }, + { + "epoch": 0.3349196402865514, + "grad_norm": 0.5041821002960205, + "learning_rate": 4.1890138514707835e-05, + "loss": 1.8764, + "step": 12180 + }, + { + "epoch": 0.3349196402865514, + "eval_loss": 2.6070642471313477, + "eval_runtime": 21.9587, + "eval_samples_per_second": 227.7, + "eval_steps_per_second": 1.23, + "step": 12180 + }, + { + "epoch": 0.3354615814520636, + "grad_norm": 0.4416036307811737, + "learning_rate": 4.180264345639339e-05, + "loss": 1.8713, + "step": 12190 + }, + { + "epoch": 0.3360035226175758, + "grad_norm": 0.2827119827270508, + "learning_rate": 4.171520295089153e-05, + "loss": 1.8777, + "step": 12200 + }, + { + "epoch": 0.33654546378308803, + "grad_norm": 0.374304860830307, + "learning_rate": 4.1627817359647846e-05, + "loss": 1.8803, + "step": 12210 + }, + { + "epoch": 0.3370874049486003, + "grad_norm": 0.27908772230148315, + "learning_rate": 4.1540487043880824e-05, + "loss": 1.8862, + "step": 12220 + }, + { + "epoch": 0.3376293461141125, + "grad_norm": 0.41203275322914124, + "learning_rate": 4.145321236458053e-05, + "loss": 1.8731, + "step": 12230 + }, + { + "epoch": 0.3381712872796247, + "grad_norm": 0.332907497882843, + "learning_rate": 4.136599368250704e-05, + "loss": 1.8844, + "step": 12240 + }, + { + "epoch": 0.3387132284451369, + "grad_norm": 0.302048921585083, + "learning_rate": 4.1278831358188915e-05, + "loss": 1.8878, + "step": 12250 + }, + { + "epoch": 0.33925516961064917, + "grad_norm": 0.40147966146469116, + "learning_rate": 4.119172575192185e-05, + "loss": 1.876, + "step": 12260 + }, + { + "epoch": 0.3396345284265077, + "eval_loss": 2.6118130683898926, + "eval_runtime": 21.9708, + "eval_samples_per_second": 227.575, + "eval_steps_per_second": 1.229, + "step": 12267 + }, + { + "epoch": 0.33979711077616137, + "grad_norm": 0.2979309856891632, + "learning_rate": 4.110467722376697e-05, + "loss": 1.8913, + "step": 12270 + }, + { + "epoch": 0.3403390519416736, + "grad_norm": 0.3392654061317444, + "learning_rate": 4.1017686133549524e-05, + "loss": 1.8759, + "step": 12280 + }, + { + "epoch": 0.3408809931071858, + "grad_norm": 0.44583526253700256, + "learning_rate": 4.093075284085738e-05, + "loss": 1.8815, + "step": 12290 + }, + { + "epoch": 0.34142293427269804, + "grad_norm": 0.3814590573310852, + "learning_rate": 4.084387770503939e-05, + "loss": 1.8834, + "step": 12300 + }, + { + "epoch": 0.34196487543821025, + "grad_norm": 0.2939944565296173, + "learning_rate": 4.0757061085204084e-05, + "loss": 1.8648, + "step": 12310 + }, + { + "epoch": 0.34250681660372245, + "grad_norm": 0.41222232580184937, + "learning_rate": 4.0670303340218085e-05, + "loss": 1.8683, + "step": 12320 + }, + { + "epoch": 0.34304875776923466, + "grad_norm": 0.307704359292984, + "learning_rate": 4.058360482870464e-05, + "loss": 1.8837, + "step": 12330 + }, + { + "epoch": 0.3435906989347469, + "grad_norm": 0.41536006331443787, + "learning_rate": 4.049696590904218e-05, + "loss": 1.8883, + "step": 12340 + }, + { + "epoch": 0.3441326401002591, + "grad_norm": 0.29493269324302673, + "learning_rate": 4.0410386939362774e-05, + "loss": 1.8749, + "step": 12350 + }, + { + "epoch": 0.344349416566464, + "eval_loss": 2.604987144470215, + "eval_runtime": 21.9698, + "eval_samples_per_second": 227.585, + "eval_steps_per_second": 1.229, + "step": 12354 + }, + { + "epoch": 0.34467458126577133, + "grad_norm": 0.3915794789791107, + "learning_rate": 4.032386827755069e-05, + "loss": 1.8712, + "step": 12360 + }, + { + "epoch": 0.34521652243128353, + "grad_norm": 0.3644906282424927, + "learning_rate": 4.0237410281240915e-05, + "loss": 1.8795, + "step": 12370 + }, + { + "epoch": 0.3457584635967958, + "grad_norm": 0.29800209403038025, + "learning_rate": 4.015101330781764e-05, + "loss": 1.8704, + "step": 12380 + }, + { + "epoch": 0.346300404762308, + "grad_norm": 0.3081452250480652, + "learning_rate": 4.0064677714412856e-05, + "loss": 1.8776, + "step": 12390 + }, + { + "epoch": 0.3468423459278202, + "grad_norm": 0.3264562487602234, + "learning_rate": 3.997840385790481e-05, + "loss": 1.8819, + "step": 12400 + }, + { + "epoch": 0.3473842870933324, + "grad_norm": 0.30599161982536316, + "learning_rate": 3.989219209491652e-05, + "loss": 1.8903, + "step": 12410 + }, + { + "epoch": 0.34792622825884467, + "grad_norm": 0.31660717725753784, + "learning_rate": 3.98060427818144e-05, + "loss": 1.8748, + "step": 12420 + }, + { + "epoch": 0.3484681694243569, + "grad_norm": 0.41350027918815613, + "learning_rate": 3.971995627470668e-05, + "loss": 1.8775, + "step": 12430 + }, + { + "epoch": 0.3490101105898691, + "grad_norm": 0.43306195735931396, + "learning_rate": 3.963393292944195e-05, + "loss": 1.8683, + "step": 12440 + }, + { + "epoch": 0.3490643047064203, + "eval_loss": 2.6103334426879883, + "eval_runtime": 21.967, + "eval_samples_per_second": 227.614, + "eval_steps_per_second": 1.229, + "step": 12441 + }, + { + "epoch": 0.3495520517553813, + "grad_norm": 0.3944796323776245, + "learning_rate": 3.954797310160777e-05, + "loss": 1.8695, + "step": 12450 + }, + { + "epoch": 0.35009399292089355, + "grad_norm": 0.3327912986278534, + "learning_rate": 3.946207714652911e-05, + "loss": 1.8703, + "step": 12460 + }, + { + "epoch": 0.35063593408640575, + "grad_norm": 0.38429713249206543, + "learning_rate": 3.937624541926689e-05, + "loss": 1.8715, + "step": 12470 + }, + { + "epoch": 0.35117787525191796, + "grad_norm": 0.6065689325332642, + "learning_rate": 3.9290478274616605e-05, + "loss": 1.8717, + "step": 12480 + }, + { + "epoch": 0.35171981641743016, + "grad_norm": 0.31308409571647644, + "learning_rate": 3.920477606710673e-05, + "loss": 1.8792, + "step": 12490 + }, + { + "epoch": 0.3522617575829424, + "grad_norm": 0.5742292404174805, + "learning_rate": 3.911913915099734e-05, + "loss": 1.8788, + "step": 12500 + }, + { + "epoch": 0.35280369874845463, + "grad_norm": 0.4109736979007721, + "learning_rate": 3.903356788027863e-05, + "loss": 1.8666, + "step": 12510 + }, + { + "epoch": 0.35334563991396684, + "grad_norm": 0.6786653399467468, + "learning_rate": 3.894806260866941e-05, + "loss": 1.8743, + "step": 12520 + }, + { + "epoch": 0.3537791928463766, + "eval_loss": 2.6084859371185303, + "eval_runtime": 21.9673, + "eval_samples_per_second": 227.611, + "eval_steps_per_second": 1.229, + "step": 12528 + }, + { + "epoch": 0.35388758107947904, + "grad_norm": 0.2924114465713501, + "learning_rate": 3.886262368961571e-05, + "loss": 1.8715, + "step": 12530 + }, + { + "epoch": 0.3544295222449913, + "grad_norm": 0.36380696296691895, + "learning_rate": 3.877725147628925e-05, + "loss": 1.8659, + "step": 12540 + }, + { + "epoch": 0.3549714634105035, + "grad_norm": 0.3204795718193054, + "learning_rate": 3.869194632158603e-05, + "loss": 1.8751, + "step": 12550 + }, + { + "epoch": 0.3555134045760157, + "grad_norm": 0.5478801131248474, + "learning_rate": 3.8606708578124875e-05, + "loss": 1.8603, + "step": 12560 + }, + { + "epoch": 0.3560553457415279, + "grad_norm": 0.5209506154060364, + "learning_rate": 3.852153859824593e-05, + "loss": 1.868, + "step": 12570 + }, + { + "epoch": 0.3565972869070402, + "grad_norm": 0.43026360869407654, + "learning_rate": 3.8436436734009243e-05, + "loss": 1.8737, + "step": 12580 + }, + { + "epoch": 0.3571392280725524, + "grad_norm": 0.3033788800239563, + "learning_rate": 3.83514033371933e-05, + "loss": 1.874, + "step": 12590 + }, + { + "epoch": 0.3576811692380646, + "grad_norm": 0.460555762052536, + "learning_rate": 3.8266438759293555e-05, + "loss": 1.8764, + "step": 12600 + }, + { + "epoch": 0.3582231104035768, + "grad_norm": 0.335553377866745, + "learning_rate": 3.818154335152101e-05, + "loss": 1.8718, + "step": 12610 + }, + { + "epoch": 0.3584940809863329, + "eval_loss": 2.607743740081787, + "eval_runtime": 21.9725, + "eval_samples_per_second": 227.557, + "eval_steps_per_second": 1.229, + "step": 12615 + }, + { + "epoch": 0.35876505156908906, + "grad_norm": 0.2863267958164215, + "learning_rate": 3.8096717464800735e-05, + "loss": 1.8626, + "step": 12620 + }, + { + "epoch": 0.35930699273460126, + "grad_norm": 0.386030375957489, + "learning_rate": 3.8011961449770403e-05, + "loss": 1.8737, + "step": 12630 + }, + { + "epoch": 0.35984893390011347, + "grad_norm": 0.5605409145355225, + "learning_rate": 3.7927275656778936e-05, + "loss": 1.8686, + "step": 12640 + }, + { + "epoch": 0.36039087506562567, + "grad_norm": 0.32051485776901245, + "learning_rate": 3.7842660435884916e-05, + "loss": 1.8766, + "step": 12650 + }, + { + "epoch": 0.36093281623113793, + "grad_norm": 0.31499314308166504, + "learning_rate": 3.775811613685518e-05, + "loss": 1.867, + "step": 12660 + }, + { + "epoch": 0.36147475739665014, + "grad_norm": 0.280678927898407, + "learning_rate": 3.767364310916353e-05, + "loss": 1.8651, + "step": 12670 + }, + { + "epoch": 0.36201669856216234, + "grad_norm": 0.26214084029197693, + "learning_rate": 3.7589241701989005e-05, + "loss": 1.8702, + "step": 12680 + }, + { + "epoch": 0.36255863972767455, + "grad_norm": 0.31816792488098145, + "learning_rate": 3.750491226421473e-05, + "loss": 1.8672, + "step": 12690 + }, + { + "epoch": 0.3631005808931868, + "grad_norm": 0.3030293881893158, + "learning_rate": 3.7420655144426256e-05, + "loss": 1.8554, + "step": 12700 + }, + { + "epoch": 0.36320896912628925, + "eval_loss": 2.6091229915618896, + "eval_runtime": 21.9746, + "eval_samples_per_second": 227.536, + "eval_steps_per_second": 1.229, + "step": 12702 + }, + { + "epoch": 0.363642522058699, + "grad_norm": 0.4357650578022003, + "learning_rate": 3.733647069091016e-05, + "loss": 1.8616, + "step": 12710 + }, + { + "epoch": 0.3641844632242112, + "grad_norm": 0.3486103415489197, + "learning_rate": 3.725235925165278e-05, + "loss": 1.8744, + "step": 12720 + }, + { + "epoch": 0.3647264043897234, + "grad_norm": 0.35597971081733704, + "learning_rate": 3.716832117433853e-05, + "loss": 1.8693, + "step": 12730 + }, + { + "epoch": 0.3652683455552357, + "grad_norm": 0.6880526542663574, + "learning_rate": 3.7084356806348566e-05, + "loss": 1.8561, + "step": 12740 + }, + { + "epoch": 0.3658102867207479, + "grad_norm": 0.36243849992752075, + "learning_rate": 3.7000466494759445e-05, + "loss": 1.869, + "step": 12750 + }, + { + "epoch": 0.3663522278862601, + "grad_norm": 0.35954001545906067, + "learning_rate": 3.691665058634153e-05, + "loss": 1.8669, + "step": 12760 + }, + { + "epoch": 0.3668941690517723, + "grad_norm": 0.2994548976421356, + "learning_rate": 3.683290942755767e-05, + "loss": 1.8587, + "step": 12770 + }, + { + "epoch": 0.36743611021728456, + "grad_norm": 0.5059475302696228, + "learning_rate": 3.674924336456173e-05, + "loss": 1.8584, + "step": 12780 + }, + { + "epoch": 0.36792385726624555, + "eval_loss": 2.6061477661132812, + "eval_runtime": 21.9728, + "eval_samples_per_second": 227.554, + "eval_steps_per_second": 1.229, + "step": 12789 + }, + { + "epoch": 0.36797805138279677, + "grad_norm": 0.4944458305835724, + "learning_rate": 3.6665652743197075e-05, + "loss": 1.8711, + "step": 12790 + }, + { + "epoch": 0.36851999254830897, + "grad_norm": 0.3134331703186035, + "learning_rate": 3.658213790899537e-05, + "loss": 1.8633, + "step": 12800 + }, + { + "epoch": 0.3690619337138212, + "grad_norm": 0.35285812616348267, + "learning_rate": 3.649869920717487e-05, + "loss": 1.8843, + "step": 12810 + }, + { + "epoch": 0.36960387487933344, + "grad_norm": 0.3203577399253845, + "learning_rate": 3.641533698263917e-05, + "loss": 1.859, + "step": 12820 + }, + { + "epoch": 0.37014581604484564, + "grad_norm": 0.35432031750679016, + "learning_rate": 3.6332051579975815e-05, + "loss": 1.8704, + "step": 12830 + }, + { + "epoch": 0.37068775721035785, + "grad_norm": 0.3153700828552246, + "learning_rate": 3.624884334345465e-05, + "loss": 1.8622, + "step": 12840 + }, + { + "epoch": 0.37122969837587005, + "grad_norm": 0.7621405124664307, + "learning_rate": 3.616571261702669e-05, + "loss": 1.8665, + "step": 12850 + }, + { + "epoch": 0.3717716395413823, + "grad_norm": 0.5706172585487366, + "learning_rate": 3.6082659744322464e-05, + "loss": 1.8578, + "step": 12860 + }, + { + "epoch": 0.3723135807068945, + "grad_norm": 0.6530439853668213, + "learning_rate": 3.59996850686507e-05, + "loss": 1.8579, + "step": 12870 + }, + { + "epoch": 0.37263874540620184, + "eval_loss": 2.6071505546569824, + "eval_runtime": 21.9724, + "eval_samples_per_second": 227.558, + "eval_steps_per_second": 1.229, + "step": 12876 + }, + { + "epoch": 0.3728555218724067, + "grad_norm": 0.49953022599220276, + "learning_rate": 3.591678893299693e-05, + "loss": 1.8671, + "step": 12880 + }, + { + "epoch": 0.37339746303791893, + "grad_norm": 0.37402573227882385, + "learning_rate": 3.583397168002196e-05, + "loss": 1.8639, + "step": 12890 + }, + { + "epoch": 0.3739394042034312, + "grad_norm": 0.281125545501709, + "learning_rate": 3.575123365206057e-05, + "loss": 1.8736, + "step": 12900 + }, + { + "epoch": 0.3744813453689434, + "grad_norm": 0.29488998651504517, + "learning_rate": 3.566857519112008e-05, + "loss": 1.8704, + "step": 12910 + }, + { + "epoch": 0.3750232865344556, + "grad_norm": 0.29372894763946533, + "learning_rate": 3.558599663887886e-05, + "loss": 1.868, + "step": 12920 + }, + { + "epoch": 0.3755652276999678, + "grad_norm": 0.27853238582611084, + "learning_rate": 3.550349833668499e-05, + "loss": 1.8644, + "step": 12930 + }, + { + "epoch": 0.37610716886548007, + "grad_norm": 0.2777658998966217, + "learning_rate": 3.542108062555483e-05, + "loss": 1.8621, + "step": 12940 + }, + { + "epoch": 0.3766491100309923, + "grad_norm": 0.3306451737880707, + "learning_rate": 3.5338743846171574e-05, + "loss": 1.8612, + "step": 12950 + }, + { + "epoch": 0.3771910511965045, + "grad_norm": 0.4671363830566406, + "learning_rate": 3.525648833888393e-05, + "loss": 1.857, + "step": 12960 + }, + { + "epoch": 0.37735363354615814, + "eval_loss": 2.6051692962646484, + "eval_runtime": 21.9698, + "eval_samples_per_second": 227.585, + "eval_steps_per_second": 1.229, + "step": 12963 + }, + { + "epoch": 0.3777329923620167, + "grad_norm": 0.3648098111152649, + "learning_rate": 3.5174314443704634e-05, + "loss": 1.8643, + "step": 12970 + }, + { + "epoch": 0.37827493352752894, + "grad_norm": 0.4720825254917145, + "learning_rate": 3.5092222500309066e-05, + "loss": 1.8606, + "step": 12980 + }, + { + "epoch": 0.37881687469304115, + "grad_norm": 0.3330930173397064, + "learning_rate": 3.501021284803384e-05, + "loss": 1.8627, + "step": 12990 + }, + { + "epoch": 0.37935881585855336, + "grad_norm": 0.45893749594688416, + "learning_rate": 3.492828582587541e-05, + "loss": 1.8577, + "step": 13000 + }, + { + "epoch": 0.37990075702406556, + "grad_norm": 0.2954985201358795, + "learning_rate": 3.4846441772488706e-05, + "loss": 1.8512, + "step": 13010 + }, + { + "epoch": 0.3804426981895778, + "grad_norm": 0.2998946011066437, + "learning_rate": 3.476468102618564e-05, + "loss": 1.8632, + "step": 13020 + }, + { + "epoch": 0.38098463935509, + "grad_norm": 0.3708723187446594, + "learning_rate": 3.4683003924933823e-05, + "loss": 1.8694, + "step": 13030 + }, + { + "epoch": 0.38152658052060223, + "grad_norm": 0.48674148321151733, + "learning_rate": 3.4601410806355055e-05, + "loss": 1.864, + "step": 13040 + }, + { + "epoch": 0.38206852168611444, + "grad_norm": 0.35994434356689453, + "learning_rate": 3.4519902007724026e-05, + "loss": 1.8574, + "step": 13050 + }, + { + "epoch": 0.38206852168611444, + "eval_loss": 2.6076242923736572, + "eval_runtime": 21.9664, + "eval_samples_per_second": 227.621, + "eval_steps_per_second": 1.229, + "step": 13050 + }, + { + "epoch": 0.38261046285162664, + "grad_norm": 0.30608904361724854, + "learning_rate": 3.443847786596682e-05, + "loss": 1.8665, + "step": 13060 + }, + { + "epoch": 0.3831524040171389, + "grad_norm": 0.2905879616737366, + "learning_rate": 3.435713871765969e-05, + "loss": 1.8563, + "step": 13070 + }, + { + "epoch": 0.3836943451826511, + "grad_norm": 0.34675753116607666, + "learning_rate": 3.427588489902748e-05, + "loss": 1.8582, + "step": 13080 + }, + { + "epoch": 0.3842362863481633, + "grad_norm": 0.2855149209499359, + "learning_rate": 3.419471674594226e-05, + "loss": 1.8635, + "step": 13090 + }, + { + "epoch": 0.3847782275136755, + "grad_norm": 0.3102046847343445, + "learning_rate": 3.4113634593922126e-05, + "loss": 1.8736, + "step": 13100 + }, + { + "epoch": 0.3853201686791878, + "grad_norm": 0.34414398670196533, + "learning_rate": 3.4032638778129576e-05, + "loss": 1.8504, + "step": 13110 + }, + { + "epoch": 0.3858621098447, + "grad_norm": 0.34666258096694946, + "learning_rate": 3.395172963337029e-05, + "loss": 1.8708, + "step": 13120 + }, + { + "epoch": 0.3864040510102122, + "grad_norm": 0.2988355755805969, + "learning_rate": 3.387090749409167e-05, + "loss": 1.8635, + "step": 13130 + }, + { + "epoch": 0.38678340982607073, + "eval_loss": 2.610823631286621, + "eval_runtime": 21.9709, + "eval_samples_per_second": 227.574, + "eval_steps_per_second": 1.229, + "step": 13137 + }, + { + "epoch": 0.3869459921757244, + "grad_norm": 0.2701932489871979, + "learning_rate": 3.3790172694381385e-05, + "loss": 1.8619, + "step": 13140 + }, + { + "epoch": 0.38748793334123666, + "grad_norm": 0.31789588928222656, + "learning_rate": 3.370952556796621e-05, + "loss": 1.8527, + "step": 13150 + }, + { + "epoch": 0.38802987450674886, + "grad_norm": 0.3052840232849121, + "learning_rate": 3.362896644821042e-05, + "loss": 1.8559, + "step": 13160 + }, + { + "epoch": 0.38857181567226107, + "grad_norm": 0.2687411606311798, + "learning_rate": 3.3548495668114536e-05, + "loss": 1.8587, + "step": 13170 + }, + { + "epoch": 0.3891137568377733, + "grad_norm": 0.5188722014427185, + "learning_rate": 3.346811356031394e-05, + "loss": 1.8567, + "step": 13180 + }, + { + "epoch": 0.38965569800328553, + "grad_norm": 0.33732667565345764, + "learning_rate": 3.3387820457077403e-05, + "loss": 1.8539, + "step": 13190 + }, + { + "epoch": 0.39019763916879774, + "grad_norm": 0.31062641739845276, + "learning_rate": 3.3307616690305875e-05, + "loss": 1.8569, + "step": 13200 + }, + { + "epoch": 0.39073958033430994, + "grad_norm": 0.30725353956222534, + "learning_rate": 3.322750259153096e-05, + "loss": 1.8489, + "step": 13210 + }, + { + "epoch": 0.39128152149982215, + "grad_norm": 0.3953590989112854, + "learning_rate": 3.314747849191362e-05, + "loss": 1.8627, + "step": 13220 + }, + { + "epoch": 0.3914982979660271, + "eval_loss": 2.6103906631469727, + "eval_runtime": 21.9715, + "eval_samples_per_second": 227.567, + "eval_steps_per_second": 1.229, + "step": 13224 + }, + { + "epoch": 0.3918234626653344, + "grad_norm": 0.284915953874588, + "learning_rate": 3.306754472224285e-05, + "loss": 1.8606, + "step": 13230 + }, + { + "epoch": 0.3923654038308466, + "grad_norm": 0.5332658886909485, + "learning_rate": 3.2987701612934174e-05, + "loss": 1.8526, + "step": 13240 + }, + { + "epoch": 0.3929073449963588, + "grad_norm": 0.330483078956604, + "learning_rate": 3.290794949402837e-05, + "loss": 1.8603, + "step": 13250 + }, + { + "epoch": 0.393449286161871, + "grad_norm": 0.4462834298610687, + "learning_rate": 3.282828869519019e-05, + "loss": 1.8646, + "step": 13260 + }, + { + "epoch": 0.3939912273273833, + "grad_norm": 0.3231438100337982, + "learning_rate": 3.2748719545706776e-05, + "loss": 1.8528, + "step": 13270 + }, + { + "epoch": 0.3945331684928955, + "grad_norm": 0.2627573311328888, + "learning_rate": 3.266924237448655e-05, + "loss": 1.857, + "step": 13280 + }, + { + "epoch": 0.3950751096584077, + "grad_norm": 0.2972419261932373, + "learning_rate": 3.2589857510057634e-05, + "loss": 1.8477, + "step": 13290 + }, + { + "epoch": 0.3956170508239199, + "grad_norm": 0.3084324598312378, + "learning_rate": 3.251056528056658e-05, + "loss": 1.8504, + "step": 13300 + }, + { + "epoch": 0.39615899198943216, + "grad_norm": 0.3726852536201477, + "learning_rate": 3.2431366013777156e-05, + "loss": 1.8487, + "step": 13310 + }, + { + "epoch": 0.3962131861059834, + "eval_loss": 2.607210397720337, + "eval_runtime": 21.9736, + "eval_samples_per_second": 227.546, + "eval_steps_per_second": 1.229, + "step": 13311 + }, + { + "epoch": 0.39670093315494437, + "grad_norm": 0.3420845568180084, + "learning_rate": 3.235226003706872e-05, + "loss": 1.8553, + "step": 13320 + }, + { + "epoch": 0.3972428743204566, + "grad_norm": 0.3318046033382416, + "learning_rate": 3.227324767743507e-05, + "loss": 1.8592, + "step": 13330 + }, + { + "epoch": 0.3977848154859688, + "grad_norm": 0.2706853449344635, + "learning_rate": 3.2194329261482985e-05, + "loss": 1.8574, + "step": 13340 + }, + { + "epoch": 0.39832675665148104, + "grad_norm": 0.27932029962539673, + "learning_rate": 3.211550511543095e-05, + "loss": 1.8565, + "step": 13350 + }, + { + "epoch": 0.39886869781699325, + "grad_norm": 0.29047316312789917, + "learning_rate": 3.203677556510779e-05, + "loss": 1.859, + "step": 13360 + }, + { + "epoch": 0.39941063898250545, + "grad_norm": 0.3240620791912079, + "learning_rate": 3.195814093595127e-05, + "loss": 1.8506, + "step": 13370 + }, + { + "epoch": 0.39995258014801766, + "grad_norm": 0.2864803969860077, + "learning_rate": 3.18796015530068e-05, + "loss": 1.8585, + "step": 13380 + }, + { + "epoch": 0.4004945213135299, + "grad_norm": 0.28760528564453125, + "learning_rate": 3.180115774092609e-05, + "loss": 1.857, + "step": 13390 + }, + { + "epoch": 0.4009280742459397, + "eval_loss": 2.60837984085083, + "eval_runtime": 21.9751, + "eval_samples_per_second": 227.53, + "eval_steps_per_second": 1.229, + "step": 13398 + }, + { + "epoch": 0.4010364624790421, + "grad_norm": 0.47491455078125, + "learning_rate": 3.172280982396577e-05, + "loss": 1.8524, + "step": 13400 + }, + { + "epoch": 0.4015784036445543, + "grad_norm": 0.7039004564285278, + "learning_rate": 3.164455812598609e-05, + "loss": 1.8448, + "step": 13410 + }, + { + "epoch": 0.40212034481006653, + "grad_norm": 0.5090378522872925, + "learning_rate": 3.15664029704496e-05, + "loss": 1.8595, + "step": 13420 + }, + { + "epoch": 0.4026622859755788, + "grad_norm": 0.3467673361301422, + "learning_rate": 3.148834468041973e-05, + "loss": 1.8602, + "step": 13430 + }, + { + "epoch": 0.403204227141091, + "grad_norm": 0.3230932056903839, + "learning_rate": 3.141038357855953e-05, + "loss": 1.8451, + "step": 13440 + }, + { + "epoch": 0.4037461683066032, + "grad_norm": 0.3669589161872864, + "learning_rate": 3.133251998713032e-05, + "loss": 1.8655, + "step": 13450 + }, + { + "epoch": 0.4042881094721154, + "grad_norm": 0.29886960983276367, + "learning_rate": 3.1254754227990294e-05, + "loss": 1.8554, + "step": 13460 + }, + { + "epoch": 0.40483005063762767, + "grad_norm": 0.3277972638607025, + "learning_rate": 3.1177086622593345e-05, + "loss": 1.8566, + "step": 13470 + }, + { + "epoch": 0.4053719918031399, + "grad_norm": 0.2972913980484009, + "learning_rate": 3.109951749198755e-05, + "loss": 1.8554, + "step": 13480 + }, + { + "epoch": 0.405642962385896, + "eval_loss": 2.604696273803711, + "eval_runtime": 22.1189, + "eval_samples_per_second": 226.051, + "eval_steps_per_second": 1.221, + "step": 13485 + }, + { + "epoch": 0.4059139329686521, + "grad_norm": 0.41642656922340393, + "learning_rate": 3.102204715681397e-05, + "loss": 1.8526, + "step": 13490 + }, + { + "epoch": 0.4064558741341643, + "grad_norm": 0.3331157863140106, + "learning_rate": 3.0944675937305254e-05, + "loss": 1.8621, + "step": 13500 + }, + { + "epoch": 0.40699781529967655, + "grad_norm": 0.2806148827075958, + "learning_rate": 3.086740415328436e-05, + "loss": 1.8597, + "step": 13510 + }, + { + "epoch": 0.40753975646518875, + "grad_norm": 0.328106552362442, + "learning_rate": 3.0790232124163256e-05, + "loss": 1.8543, + "step": 13520 + }, + { + "epoch": 0.40808169763070096, + "grad_norm": 0.27871695160865784, + "learning_rate": 3.0713160168941494e-05, + "loss": 1.8546, + "step": 13530 + }, + { + "epoch": 0.40862363879621316, + "grad_norm": 0.3508320748806, + "learning_rate": 3.0636188606205e-05, + "loss": 1.8454, + "step": 13540 + }, + { + "epoch": 0.4091655799617254, + "grad_norm": 0.29558315873146057, + "learning_rate": 3.0559317754124706e-05, + "loss": 1.8595, + "step": 13550 + }, + { + "epoch": 0.40970752112723763, + "grad_norm": 0.31710633635520935, + "learning_rate": 3.048254793045524e-05, + "loss": 1.8416, + "step": 13560 + }, + { + "epoch": 0.41024946229274983, + "grad_norm": 0.31882941722869873, + "learning_rate": 3.040587945253362e-05, + "loss": 1.8512, + "step": 13570 + }, + { + "epoch": 0.4103578505258523, + "eval_loss": 2.610496759414673, + "eval_runtime": 21.9645, + "eval_samples_per_second": 227.64, + "eval_steps_per_second": 1.229, + "step": 13572 + }, + { + "epoch": 0.41079140345826204, + "grad_norm": 0.28335651755332947, + "learning_rate": 3.032931263727796e-05, + "loss": 1.852, + "step": 13580 + }, + { + "epoch": 0.4113333446237743, + "grad_norm": 0.40349459648132324, + "learning_rate": 3.0252847801186135e-05, + "loss": 1.8441, + "step": 13590 + }, + { + "epoch": 0.4118752857892865, + "grad_norm": 0.2962387204170227, + "learning_rate": 3.0176485260334398e-05, + "loss": 1.8472, + "step": 13600 + }, + { + "epoch": 0.4124172269547987, + "grad_norm": 0.27455824613571167, + "learning_rate": 3.0100225330376282e-05, + "loss": 1.848, + "step": 13610 + }, + { + "epoch": 0.4129591681203109, + "grad_norm": 0.524045467376709, + "learning_rate": 3.0024068326541056e-05, + "loss": 1.8442, + "step": 13620 + }, + { + "epoch": 0.4135011092858232, + "grad_norm": 0.28335145115852356, + "learning_rate": 2.994801456363263e-05, + "loss": 1.8395, + "step": 13630 + }, + { + "epoch": 0.4140430504513354, + "grad_norm": 0.2653671205043793, + "learning_rate": 2.987206435602809e-05, + "loss": 1.8421, + "step": 13640 + }, + { + "epoch": 0.4145849916168476, + "grad_norm": 0.37382206320762634, + "learning_rate": 2.979621801767643e-05, + "loss": 1.847, + "step": 13650 + }, + { + "epoch": 0.41507273866580857, + "eval_loss": 2.604285717010498, + "eval_runtime": 21.965, + "eval_samples_per_second": 227.635, + "eval_steps_per_second": 1.229, + "step": 13659 + }, + { + "epoch": 0.4151269327823598, + "grad_norm": 0.2956138849258423, + "learning_rate": 2.972047586209739e-05, + "loss": 1.8429, + "step": 13660 + }, + { + "epoch": 0.41566887394787205, + "grad_norm": 0.3020017147064209, + "learning_rate": 2.9644838202379988e-05, + "loss": 1.8395, + "step": 13670 + }, + { + "epoch": 0.41621081511338426, + "grad_norm": 0.36557555198669434, + "learning_rate": 2.956930535118129e-05, + "loss": 1.8314, + "step": 13680 + }, + { + "epoch": 0.41675275627889646, + "grad_norm": 0.2863655686378479, + "learning_rate": 2.9493877620725208e-05, + "loss": 1.8437, + "step": 13690 + }, + { + "epoch": 0.41729469744440867, + "grad_norm": 0.30269670486450195, + "learning_rate": 2.9418555322800983e-05, + "loss": 1.8538, + "step": 13700 + }, + { + "epoch": 0.41783663860992093, + "grad_norm": 0.3189536929130554, + "learning_rate": 2.9343338768762175e-05, + "loss": 1.832, + "step": 13710 + }, + { + "epoch": 0.41837857977543313, + "grad_norm": 0.3305031657218933, + "learning_rate": 2.9268228269525178e-05, + "loss": 1.8469, + "step": 13720 + }, + { + "epoch": 0.41892052094094534, + "grad_norm": 0.30250081419944763, + "learning_rate": 2.9193224135567965e-05, + "loss": 1.8563, + "step": 13730 + }, + { + "epoch": 0.41946246210645755, + "grad_norm": 0.2988894581794739, + "learning_rate": 2.9118326676928938e-05, + "loss": 1.8563, + "step": 13740 + }, + { + "epoch": 0.4197876268057649, + "eval_loss": 2.606349229812622, + "eval_runtime": 21.9714, + "eval_samples_per_second": 227.568, + "eval_steps_per_second": 1.229, + "step": 13746 + }, + { + "epoch": 0.4200044032719698, + "grad_norm": 0.32497337460517883, + "learning_rate": 2.904353620320542e-05, + "loss": 1.8507, + "step": 13750 + }, + { + "epoch": 0.420546344437482, + "grad_norm": 0.4472458064556122, + "learning_rate": 2.8968853023552555e-05, + "loss": 1.8343, + "step": 13760 + }, + { + "epoch": 0.4210882856029942, + "grad_norm": 0.4618549942970276, + "learning_rate": 2.8894277446682028e-05, + "loss": 1.8497, + "step": 13770 + }, + { + "epoch": 0.4216302267685064, + "grad_norm": 0.3211345970630646, + "learning_rate": 2.8819809780860625e-05, + "loss": 1.8459, + "step": 13780 + }, + { + "epoch": 0.4221721679340187, + "grad_norm": 0.2894679307937622, + "learning_rate": 2.87454503339092e-05, + "loss": 1.8519, + "step": 13790 + }, + { + "epoch": 0.4227141090995309, + "grad_norm": 0.2763760983943939, + "learning_rate": 2.867119941320114e-05, + "loss": 1.8481, + "step": 13800 + }, + { + "epoch": 0.4232560502650431, + "grad_norm": 0.318680077791214, + "learning_rate": 2.859705732566129e-05, + "loss": 1.8437, + "step": 13810 + }, + { + "epoch": 0.4237979914305553, + "grad_norm": 0.39189639687538147, + "learning_rate": 2.852302437776465e-05, + "loss": 1.8552, + "step": 13820 + }, + { + "epoch": 0.42433993259606756, + "grad_norm": 0.45934727787971497, + "learning_rate": 2.844910087553503e-05, + "loss": 1.8426, + "step": 13830 + }, + { + "epoch": 0.4245025149457212, + "eval_loss": 2.608975648880005, + "eval_runtime": 21.9733, + "eval_samples_per_second": 227.548, + "eval_steps_per_second": 1.229, + "step": 13833 + }, + { + "epoch": 0.42488187376157976, + "grad_norm": 0.2927630543708801, + "learning_rate": 2.8375287124543835e-05, + "loss": 1.855, + "step": 13840 + }, + { + "epoch": 0.42542381492709197, + "grad_norm": 0.27992621064186096, + "learning_rate": 2.830158342990884e-05, + "loss": 1.845, + "step": 13850 + }, + { + "epoch": 0.4259657560926042, + "grad_norm": 0.33510464429855347, + "learning_rate": 2.8227990096292827e-05, + "loss": 1.8371, + "step": 13860 + }, + { + "epoch": 0.42650769725811644, + "grad_norm": 0.4821888208389282, + "learning_rate": 2.8154507427902467e-05, + "loss": 1.8491, + "step": 13870 + }, + { + "epoch": 0.42704963842362864, + "grad_norm": 0.29919731616973877, + "learning_rate": 2.808113572848692e-05, + "loss": 1.8429, + "step": 13880 + }, + { + "epoch": 0.42759157958914085, + "grad_norm": 0.31553396582603455, + "learning_rate": 2.8007875301336662e-05, + "loss": 1.8436, + "step": 13890 + }, + { + "epoch": 0.42813352075465305, + "grad_norm": 0.30432793498039246, + "learning_rate": 2.7934726449282213e-05, + "loss": 1.8395, + "step": 13900 + }, + { + "epoch": 0.4286754619201653, + "grad_norm": 0.3555530905723572, + "learning_rate": 2.7861689474692898e-05, + "loss": 1.842, + "step": 13910 + }, + { + "epoch": 0.4292174030856775, + "grad_norm": 0.32901936769485474, + "learning_rate": 2.7788764679475538e-05, + "loss": 1.8336, + "step": 13920 + }, + { + "epoch": 0.4292174030856775, + "eval_loss": 2.611562728881836, + "eval_runtime": 21.9692, + "eval_samples_per_second": 227.591, + "eval_steps_per_second": 1.229, + "step": 13920 + }, + { + "epoch": 0.4297593442511897, + "grad_norm": 0.5692172646522522, + "learning_rate": 2.7715952365073324e-05, + "loss": 1.8485, + "step": 13930 + }, + { + "epoch": 0.43030128541670193, + "grad_norm": 0.28073015809059143, + "learning_rate": 2.7643252832464423e-05, + "loss": 1.8312, + "step": 13940 + }, + { + "epoch": 0.4308432265822142, + "grad_norm": 0.4343844950199127, + "learning_rate": 2.7570666382160843e-05, + "loss": 1.8512, + "step": 13950 + }, + { + "epoch": 0.4313851677477264, + "grad_norm": 0.26866263151168823, + "learning_rate": 2.7498193314207137e-05, + "loss": 1.8367, + "step": 13960 + }, + { + "epoch": 0.4319271089132386, + "grad_norm": 0.5220692753791809, + "learning_rate": 2.742583392817918e-05, + "loss": 1.8317, + "step": 13970 + }, + { + "epoch": 0.4324690500787508, + "grad_norm": 0.3568936288356781, + "learning_rate": 2.7353588523182943e-05, + "loss": 1.8356, + "step": 13980 + }, + { + "epoch": 0.43301099124426307, + "grad_norm": 0.28348490595817566, + "learning_rate": 2.7281457397853237e-05, + "loss": 1.8416, + "step": 13990 + }, + { + "epoch": 0.43355293240977527, + "grad_norm": 0.32432541251182556, + "learning_rate": 2.720944085035248e-05, + "loss": 1.8346, + "step": 14000 + }, + { + "epoch": 0.4339322912256338, + "eval_loss": 2.6060945987701416, + "eval_runtime": 21.9781, + "eval_samples_per_second": 227.499, + "eval_steps_per_second": 1.228, + "step": 14007 + }, + { + "epoch": 0.4340948735752875, + "grad_norm": 0.30824974179267883, + "learning_rate": 2.7137539178369464e-05, + "loss": 1.8423, + "step": 14010 + }, + { + "epoch": 0.4346368147407997, + "grad_norm": 0.387539803981781, + "learning_rate": 2.7065752679118128e-05, + "loss": 1.8532, + "step": 14020 + }, + { + "epoch": 0.43517875590631194, + "grad_norm": 0.3984427750110626, + "learning_rate": 2.6994081649336366e-05, + "loss": 1.8505, + "step": 14030 + }, + { + "epoch": 0.43572069707182415, + "grad_norm": 0.39461550116539, + "learning_rate": 2.6922526385284737e-05, + "loss": 1.8425, + "step": 14040 + }, + { + "epoch": 0.43626263823733635, + "grad_norm": 0.26579055190086365, + "learning_rate": 2.685108718274525e-05, + "loss": 1.8379, + "step": 14050 + }, + { + "epoch": 0.43680457940284856, + "grad_norm": 0.3582479655742645, + "learning_rate": 2.6779764337020195e-05, + "loss": 1.8338, + "step": 14060 + }, + { + "epoch": 0.4373465205683608, + "grad_norm": 0.4719529151916504, + "learning_rate": 2.6708558142930862e-05, + "loss": 1.8382, + "step": 14070 + }, + { + "epoch": 0.437888461733873, + "grad_norm": 0.3371265232563019, + "learning_rate": 2.6637468894816366e-05, + "loss": 1.844, + "step": 14080 + }, + { + "epoch": 0.43843040289938523, + "grad_norm": 0.27364712953567505, + "learning_rate": 2.656649688653242e-05, + "loss": 1.8391, + "step": 14090 + }, + { + "epoch": 0.4386471793655901, + "eval_loss": 2.606126308441162, + "eval_runtime": 21.9732, + "eval_samples_per_second": 227.549, + "eval_steps_per_second": 1.229, + "step": 14094 + }, + { + "epoch": 0.43897234406489744, + "grad_norm": 0.3573056757450104, + "learning_rate": 2.6495642411450082e-05, + "loss": 1.8407, + "step": 14100 + }, + { + "epoch": 0.4395142852304097, + "grad_norm": 0.26993247866630554, + "learning_rate": 2.64249057624546e-05, + "loss": 1.8321, + "step": 14110 + }, + { + "epoch": 0.4400562263959219, + "grad_norm": 0.48967689275741577, + "learning_rate": 2.6354287231944154e-05, + "loss": 1.8435, + "step": 14120 + }, + { + "epoch": 0.4405981675614341, + "grad_norm": 0.3284403383731842, + "learning_rate": 2.6283787111828666e-05, + "loss": 1.8477, + "step": 14130 + }, + { + "epoch": 0.4411401087269463, + "grad_norm": 0.33071354031562805, + "learning_rate": 2.6213405693528638e-05, + "loss": 1.8433, + "step": 14140 + }, + { + "epoch": 0.4416820498924586, + "grad_norm": 0.2832528352737427, + "learning_rate": 2.6143143267973846e-05, + "loss": 1.846, + "step": 14150 + }, + { + "epoch": 0.4422239910579708, + "grad_norm": 0.3607860207557678, + "learning_rate": 2.6073000125602236e-05, + "loss": 1.8488, + "step": 14160 + }, + { + "epoch": 0.442765932223483, + "grad_norm": 0.26745522022247314, + "learning_rate": 2.600297655635866e-05, + "loss": 1.8377, + "step": 14170 + }, + { + "epoch": 0.4433078733889952, + "grad_norm": 0.28432849049568176, + "learning_rate": 2.5933072849693706e-05, + "loss": 1.8432, + "step": 14180 + }, + { + "epoch": 0.4433620675055464, + "eval_loss": 2.6039960384368896, + "eval_runtime": 21.9648, + "eval_samples_per_second": 227.637, + "eval_steps_per_second": 1.229, + "step": 14181 + }, + { + "epoch": 0.44384981455450745, + "grad_norm": 0.274472177028656, + "learning_rate": 2.5863289294562497e-05, + "loss": 1.8296, + "step": 14190 + }, + { + "epoch": 0.44439175572001965, + "grad_norm": 0.30612727999687195, + "learning_rate": 2.5793626179423514e-05, + "loss": 1.8461, + "step": 14200 + }, + { + "epoch": 0.44493369688553186, + "grad_norm": 0.2670454978942871, + "learning_rate": 2.5724083792237363e-05, + "loss": 1.8464, + "step": 14210 + }, + { + "epoch": 0.44547563805104406, + "grad_norm": 0.3071325719356537, + "learning_rate": 2.5654662420465613e-05, + "loss": 1.8413, + "step": 14220 + }, + { + "epoch": 0.4460175792165563, + "grad_norm": 0.35536590218544006, + "learning_rate": 2.5585362351069586e-05, + "loss": 1.8479, + "step": 14230 + }, + { + "epoch": 0.44655952038206853, + "grad_norm": 0.3007301390171051, + "learning_rate": 2.5516183870509212e-05, + "loss": 1.8459, + "step": 14240 + }, + { + "epoch": 0.44710146154758074, + "grad_norm": 0.3410910665988922, + "learning_rate": 2.544712726474182e-05, + "loss": 1.8482, + "step": 14250 + }, + { + "epoch": 0.44764340271309294, + "grad_norm": 0.3722754418849945, + "learning_rate": 2.5378192819220954e-05, + "loss": 1.8384, + "step": 14260 + }, + { + "epoch": 0.44807695564550276, + "eval_loss": 2.605301856994629, + "eval_runtime": 22.0831, + "eval_samples_per_second": 226.417, + "eval_steps_per_second": 1.223, + "step": 14268 + }, + { + "epoch": 0.4481853438786052, + "grad_norm": 0.27585405111312866, + "learning_rate": 2.5309380818895133e-05, + "loss": 1.8416, + "step": 14270 + }, + { + "epoch": 0.4487272850441174, + "grad_norm": 0.2805611193180084, + "learning_rate": 2.524069154820684e-05, + "loss": 1.8404, + "step": 14280 + }, + { + "epoch": 0.4492692262096296, + "grad_norm": 0.4039418697357178, + "learning_rate": 2.5172125291091147e-05, + "loss": 1.8429, + "step": 14290 + }, + { + "epoch": 0.4498111673751418, + "grad_norm": 0.27978357672691345, + "learning_rate": 2.510368233097472e-05, + "loss": 1.8414, + "step": 14300 + }, + { + "epoch": 0.4503531085406541, + "grad_norm": 0.2985621690750122, + "learning_rate": 2.5035362950774504e-05, + "loss": 1.8439, + "step": 14310 + }, + { + "epoch": 0.4508950497061663, + "grad_norm": 0.28868043422698975, + "learning_rate": 2.496716743289659e-05, + "loss": 1.8448, + "step": 14320 + }, + { + "epoch": 0.4514369908716785, + "grad_norm": 0.3035377860069275, + "learning_rate": 2.4899096059235144e-05, + "loss": 1.8442, + "step": 14330 + }, + { + "epoch": 0.4519789320371907, + "grad_norm": 0.29208043217658997, + "learning_rate": 2.4831149111171117e-05, + "loss": 1.8354, + "step": 14340 + }, + { + "epoch": 0.45252087320270296, + "grad_norm": 0.36014124751091003, + "learning_rate": 2.476332686957113e-05, + "loss": 1.8325, + "step": 14350 + }, + { + "epoch": 0.45279184378545906, + "eval_loss": 2.5989575386047363, + "eval_runtime": 21.9695, + "eval_samples_per_second": 227.588, + "eval_steps_per_second": 1.229, + "step": 14355 + }, + { + "epoch": 0.45306281436821516, + "grad_norm": 0.4234754741191864, + "learning_rate": 2.4695629614786373e-05, + "loss": 1.8427, + "step": 14360 + }, + { + "epoch": 0.45360475553372737, + "grad_norm": 0.2819061577320099, + "learning_rate": 2.462805762665128e-05, + "loss": 1.8368, + "step": 14370 + }, + { + "epoch": 0.45414669669923957, + "grad_norm": 0.4023802876472473, + "learning_rate": 2.4560611184482604e-05, + "loss": 1.8363, + "step": 14380 + }, + { + "epoch": 0.45468863786475183, + "grad_norm": 0.4065036475658417, + "learning_rate": 2.4493290567078052e-05, + "loss": 1.8344, + "step": 14390 + }, + { + "epoch": 0.45523057903026404, + "grad_norm": 0.3405419886112213, + "learning_rate": 2.442609605271524e-05, + "loss": 1.8358, + "step": 14400 + }, + { + "epoch": 0.45577252019577624, + "grad_norm": 0.3849346339702606, + "learning_rate": 2.4359027919150578e-05, + "loss": 1.8321, + "step": 14410 + }, + { + "epoch": 0.45631446136128845, + "grad_norm": 0.27683576941490173, + "learning_rate": 2.4292086443617964e-05, + "loss": 1.8423, + "step": 14420 + }, + { + "epoch": 0.4568564025268007, + "grad_norm": 0.30656400322914124, + "learning_rate": 2.4225271902827808e-05, + "loss": 1.8341, + "step": 14430 + }, + { + "epoch": 0.4573983436923129, + "grad_norm": 0.46740400791168213, + "learning_rate": 2.4158584572965827e-05, + "loss": 1.8442, + "step": 14440 + }, + { + "epoch": 0.45750673192541536, + "eval_loss": 2.6014182567596436, + "eval_runtime": 21.98, + "eval_samples_per_second": 227.479, + "eval_steps_per_second": 1.228, + "step": 14442 + }, + { + "epoch": 0.4579402848578251, + "grad_norm": 0.4761998951435089, + "learning_rate": 2.4092024729691855e-05, + "loss": 1.8453, + "step": 14450 + }, + { + "epoch": 0.4584822260233373, + "grad_norm": 0.3128609359264374, + "learning_rate": 2.4025592648138807e-05, + "loss": 1.832, + "step": 14460 + }, + { + "epoch": 0.4590241671888496, + "grad_norm": 0.31973832845687866, + "learning_rate": 2.3959288602911398e-05, + "loss": 1.8325, + "step": 14470 + }, + { + "epoch": 0.4595661083543618, + "grad_norm": 0.2859000265598297, + "learning_rate": 2.3893112868085134e-05, + "loss": 1.8248, + "step": 14480 + }, + { + "epoch": 0.460108049519874, + "grad_norm": 0.2959967255592346, + "learning_rate": 2.382706571720516e-05, + "loss": 1.8338, + "step": 14490 + }, + { + "epoch": 0.4606499906853862, + "grad_norm": 0.26421812176704407, + "learning_rate": 2.376114742328507e-05, + "loss": 1.8357, + "step": 14500 + }, + { + "epoch": 0.46119193185089846, + "grad_norm": 0.29020607471466064, + "learning_rate": 2.3695358258805813e-05, + "loss": 1.8357, + "step": 14510 + }, + { + "epoch": 0.46173387301641067, + "grad_norm": 0.5411327481269836, + "learning_rate": 2.3629698495714577e-05, + "loss": 1.8333, + "step": 14520 + }, + { + "epoch": 0.46222162006537165, + "eval_loss": 2.5997419357299805, + "eval_runtime": 21.9696, + "eval_samples_per_second": 227.587, + "eval_steps_per_second": 1.229, + "step": 14529 + }, + { + "epoch": 0.4622758141819229, + "grad_norm": 0.4464700222015381, + "learning_rate": 2.356416840542364e-05, + "loss": 1.8359, + "step": 14530 + }, + { + "epoch": 0.4628177553474351, + "grad_norm": 0.37165167927742004, + "learning_rate": 2.3498768258809296e-05, + "loss": 1.8324, + "step": 14540 + }, + { + "epoch": 0.46335969651294734, + "grad_norm": 0.3330148458480835, + "learning_rate": 2.343349832621067e-05, + "loss": 1.8297, + "step": 14550 + }, + { + "epoch": 0.46390163767845954, + "grad_norm": 0.5441980361938477, + "learning_rate": 2.3368358877428643e-05, + "loss": 1.8247, + "step": 14560 + }, + { + "epoch": 0.46444357884397175, + "grad_norm": 0.2869478464126587, + "learning_rate": 2.3303350181724716e-05, + "loss": 1.8367, + "step": 14570 + }, + { + "epoch": 0.46498552000948395, + "grad_norm": 0.30734243988990784, + "learning_rate": 2.3238472507819923e-05, + "loss": 1.8368, + "step": 14580 + }, + { + "epoch": 0.4655274611749962, + "grad_norm": 0.27206000685691833, + "learning_rate": 2.3173726123893675e-05, + "loss": 1.8442, + "step": 14590 + }, + { + "epoch": 0.4660694023405084, + "grad_norm": 0.34638717770576477, + "learning_rate": 2.3109111297582744e-05, + "loss": 1.8407, + "step": 14600 + }, + { + "epoch": 0.4666113435060206, + "grad_norm": 0.29559314250946045, + "learning_rate": 2.3044628295980027e-05, + "loss": 1.8357, + "step": 14610 + }, + { + "epoch": 0.46693650820532795, + "eval_loss": 2.6009130477905273, + "eval_runtime": 21.9702, + "eval_samples_per_second": 227.581, + "eval_steps_per_second": 1.229, + "step": 14616 + }, + { + "epoch": 0.46715328467153283, + "grad_norm": 0.31058135628700256, + "learning_rate": 2.2980277385633533e-05, + "loss": 1.8294, + "step": 14620 + }, + { + "epoch": 0.4676952258370451, + "grad_norm": 0.27876853942871094, + "learning_rate": 2.291605883254525e-05, + "loss": 1.8333, + "step": 14630 + }, + { + "epoch": 0.4682371670025573, + "grad_norm": 0.3003843128681183, + "learning_rate": 2.2851972902170053e-05, + "loss": 1.8333, + "step": 14640 + }, + { + "epoch": 0.4687791081680695, + "grad_norm": 0.4129401743412018, + "learning_rate": 2.2788019859414646e-05, + "loss": 1.8185, + "step": 14650 + }, + { + "epoch": 0.4693210493335817, + "grad_norm": 0.2741876542568207, + "learning_rate": 2.2724199968636357e-05, + "loss": 1.8215, + "step": 14660 + }, + { + "epoch": 0.46986299049909397, + "grad_norm": 0.34926167130470276, + "learning_rate": 2.266051349364216e-05, + "loss": 1.8269, + "step": 14670 + }, + { + "epoch": 0.4704049316646062, + "grad_norm": 0.3961677849292755, + "learning_rate": 2.2596960697687518e-05, + "loss": 1.8275, + "step": 14680 + }, + { + "epoch": 0.4709468728301184, + "grad_norm": 0.3206305503845215, + "learning_rate": 2.2533541843475344e-05, + "loss": 1.836, + "step": 14690 + }, + { + "epoch": 0.4714888139956306, + "grad_norm": 0.349286824464798, + "learning_rate": 2.247025719315484e-05, + "loss": 1.8297, + "step": 14700 + }, + { + "epoch": 0.47165139634528425, + "eval_loss": 2.6037356853485107, + "eval_runtime": 22.0801, + "eval_samples_per_second": 226.448, + "eval_steps_per_second": 1.223, + "step": 14703 + }, + { + "epoch": 0.47203075516114285, + "grad_norm": 0.3154524862766266, + "learning_rate": 2.240710700832052e-05, + "loss": 1.8323, + "step": 14710 + }, + { + "epoch": 0.47257269632665505, + "grad_norm": 0.30774760246276855, + "learning_rate": 2.2344091550011033e-05, + "loss": 1.8456, + "step": 14720 + }, + { + "epoch": 0.47311463749216726, + "grad_norm": 0.3301653265953064, + "learning_rate": 2.228121107870812e-05, + "loss": 1.8384, + "step": 14730 + }, + { + "epoch": 0.47365657865767946, + "grad_norm": 0.30092665553092957, + "learning_rate": 2.2218465854335535e-05, + "loss": 1.8373, + "step": 14740 + }, + { + "epoch": 0.4741985198231917, + "grad_norm": 0.28236475586891174, + "learning_rate": 2.215585613625798e-05, + "loss": 1.8184, + "step": 14750 + }, + { + "epoch": 0.4747404609887039, + "grad_norm": 0.28573015332221985, + "learning_rate": 2.209338218328006e-05, + "loss": 1.841, + "step": 14760 + }, + { + "epoch": 0.47528240215421613, + "grad_norm": 0.3064768314361572, + "learning_rate": 2.2031044253645117e-05, + "loss": 1.8275, + "step": 14770 + }, + { + "epoch": 0.47582434331972834, + "grad_norm": 0.2896689772605896, + "learning_rate": 2.1968842605034262e-05, + "loss": 1.8326, + "step": 14780 + }, + { + "epoch": 0.4763662844852406, + "grad_norm": 0.28397008776664734, + "learning_rate": 2.190677749456526e-05, + "loss": 1.8242, + "step": 14790 + }, + { + "epoch": 0.4763662844852406, + "eval_loss": 2.599595785140991, + "eval_runtime": 22.3123, + "eval_samples_per_second": 224.092, + "eval_steps_per_second": 1.21, + "step": 14790 + }, + { + "epoch": 0.4769082256507528, + "grad_norm": 0.7356740236282349, + "learning_rate": 2.1844849178791486e-05, + "loss": 1.8301, + "step": 14800 + }, + { + "epoch": 0.477450166816265, + "grad_norm": 0.40485429763793945, + "learning_rate": 2.1783057913700865e-05, + "loss": 1.825, + "step": 14810 + }, + { + "epoch": 0.4779921079817772, + "grad_norm": 0.2894582748413086, + "learning_rate": 2.17214039547148e-05, + "loss": 1.8295, + "step": 14820 + }, + { + "epoch": 0.4785340491472894, + "grad_norm": 0.28514474630355835, + "learning_rate": 2.1659887556687102e-05, + "loss": 1.8327, + "step": 14830 + }, + { + "epoch": 0.4790759903128017, + "grad_norm": 0.3327902555465698, + "learning_rate": 2.1598508973903004e-05, + "loss": 1.8393, + "step": 14840 + }, + { + "epoch": 0.4796179314783139, + "grad_norm": 0.28141072392463684, + "learning_rate": 2.1537268460078018e-05, + "loss": 1.8279, + "step": 14850 + }, + { + "epoch": 0.4801598726438261, + "grad_norm": 0.5223854184150696, + "learning_rate": 2.147616626835694e-05, + "loss": 1.8246, + "step": 14860 + }, + { + "epoch": 0.4807018138093383, + "grad_norm": 0.28995490074157715, + "learning_rate": 2.141520265131284e-05, + "loss": 1.8196, + "step": 14870 + }, + { + "epoch": 0.4810811726251969, + "eval_loss": 2.6000826358795166, + "eval_runtime": 21.9715, + "eval_samples_per_second": 227.568, + "eval_steps_per_second": 1.229, + "step": 14877 + }, + { + "epoch": 0.48124375497485056, + "grad_norm": 0.3166589140892029, + "learning_rate": 2.1354377860945925e-05, + "loss": 1.829, + "step": 14880 + }, + { + "epoch": 0.48178569614036276, + "grad_norm": 0.304440975189209, + "learning_rate": 2.1293692148682553e-05, + "loss": 1.8353, + "step": 14890 + }, + { + "epoch": 0.48232763730587497, + "grad_norm": 0.3413919508457184, + "learning_rate": 2.1233145765374202e-05, + "loss": 1.8222, + "step": 14900 + }, + { + "epoch": 0.4828695784713872, + "grad_norm": 0.31630587577819824, + "learning_rate": 2.1172738961296396e-05, + "loss": 1.8281, + "step": 14910 + }, + { + "epoch": 0.48341151963689943, + "grad_norm": 0.3274076581001282, + "learning_rate": 2.1112471986147723e-05, + "loss": 1.8278, + "step": 14920 + }, + { + "epoch": 0.48395346080241164, + "grad_norm": 0.3373514711856842, + "learning_rate": 2.1052345089048765e-05, + "loss": 1.818, + "step": 14930 + }, + { + "epoch": 0.48449540196792384, + "grad_norm": 0.43474531173706055, + "learning_rate": 2.0992358518541025e-05, + "loss": 1.8377, + "step": 14940 + }, + { + "epoch": 0.48503734313343605, + "grad_norm": 0.28481611609458923, + "learning_rate": 2.093251252258602e-05, + "loss": 1.8339, + "step": 14950 + }, + { + "epoch": 0.4855792842989483, + "grad_norm": 0.38067948818206787, + "learning_rate": 2.0872807348564134e-05, + "loss": 1.8251, + "step": 14960 + }, + { + "epoch": 0.4857960607651532, + "eval_loss": 2.606252431869507, + "eval_runtime": 21.966, + "eval_samples_per_second": 227.624, + "eval_steps_per_second": 1.229, + "step": 14964 + }, + { + "epoch": 0.4861212254644605, + "grad_norm": 0.30156803131103516, + "learning_rate": 2.0813243243273694e-05, + "loss": 1.832, + "step": 14970 + }, + { + "epoch": 0.4866631666299727, + "grad_norm": 0.2849683463573456, + "learning_rate": 2.075382045292987e-05, + "loss": 1.8327, + "step": 14980 + }, + { + "epoch": 0.4872051077954849, + "grad_norm": 0.2862844169139862, + "learning_rate": 2.0694539223163674e-05, + "loss": 1.8226, + "step": 14990 + }, + { + "epoch": 0.4877470489609972, + "grad_norm": 0.3448987603187561, + "learning_rate": 2.0635399799021005e-05, + "loss": 1.827, + "step": 15000 + }, + { + "epoch": 0.000541941165512219, + "grad_norm": 0.3310023248195648, + "learning_rate": 2.0576402424961567e-05, + "loss": 1.821, + "step": 15010 + }, + { + "epoch": 0.001083882331024438, + "grad_norm": 0.2753497064113617, + "learning_rate": 2.0517547344857874e-05, + "loss": 1.8207, + "step": 15020 + }, + { + "epoch": 0.0016258234965366573, + "grad_norm": 0.2612669765949249, + "learning_rate": 2.04588348019943e-05, + "loss": 1.8235, + "step": 15030 + }, + { + "epoch": 0.002167764662048876, + "grad_norm": 0.2987896502017975, + "learning_rate": 2.0400265039065938e-05, + "loss": 1.8197, + "step": 15040 + }, + { + "epoch": 0.0027097058275610954, + "grad_norm": 0.27388012409210205, + "learning_rate": 2.0341838298177776e-05, + "loss": 1.8277, + "step": 15050 + }, + { + "epoch": 0.002763899944112317, + "eval_loss": 2.6046245098114014, + "eval_runtime": 23.5773, + "eval_samples_per_second": 212.068, + "eval_steps_per_second": 1.145, + "step": 15051 + }, + { + "epoch": 0.0032516469930733145, + "grad_norm": 0.3995117247104645, + "learning_rate": 2.0283554820843547e-05, + "loss": 1.8286, + "step": 15060 + }, + { + "epoch": 0.0037935881585855337, + "grad_norm": 0.2957726716995239, + "learning_rate": 2.022541484798479e-05, + "loss": 1.8287, + "step": 15070 + }, + { + "epoch": 0.004335529324097752, + "grad_norm": 0.2851552963256836, + "learning_rate": 2.0167418619929908e-05, + "loss": 1.8285, + "step": 15080 + }, + { + "epoch": 0.0048774704896099716, + "grad_norm": 0.3845987021923065, + "learning_rate": 2.010956637641303e-05, + "loss": 1.8201, + "step": 15090 + }, + { + "epoch": 0.005419411655122191, + "grad_norm": 0.2890310287475586, + "learning_rate": 2.0051858356573155e-05, + "loss": 1.8243, + "step": 15100 + }, + { + "epoch": 0.00596135282063441, + "grad_norm": 0.27605000138282776, + "learning_rate": 1.9994294798953134e-05, + "loss": 1.8272, + "step": 15110 + }, + { + "epoch": 0.006503293986146629, + "grad_norm": 0.39410722255706787, + "learning_rate": 1.9936875941498646e-05, + "loss": 1.8233, + "step": 15120 + }, + { + "epoch": 0.007045235151658848, + "grad_norm": 0.37865981459617615, + "learning_rate": 1.9879602021557226e-05, + "loss": 1.8267, + "step": 15130 + }, + { + "epoch": 0.007478788084068624, + "eval_loss": 2.604631185531616, + "eval_runtime": 23.8764, + "eval_samples_per_second": 209.412, + "eval_steps_per_second": 1.131, + "step": 15138 + }, + { + "epoch": 0.007587176317171067, + "grad_norm": 0.28183528780937195, + "learning_rate": 1.98224732758773e-05, + "loss": 1.8224, + "step": 15140 + }, + { + "epoch": 0.008129117482683286, + "grad_norm": 0.2929738759994507, + "learning_rate": 1.97654899406072e-05, + "loss": 1.8245, + "step": 15150 + }, + { + "epoch": 0.008671058648195505, + "grad_norm": 0.28582409024238586, + "learning_rate": 1.9708652251294206e-05, + "loss": 1.8136, + "step": 15160 + }, + { + "epoch": 0.009212999813707724, + "grad_norm": 0.2901137173175812, + "learning_rate": 1.9651960442883528e-05, + "loss": 1.8254, + "step": 15170 + }, + { + "epoch": 0.009754940979219943, + "grad_norm": 0.26637405157089233, + "learning_rate": 1.9595414749717363e-05, + "loss": 1.8244, + "step": 15180 + }, + { + "epoch": 0.010296882144732162, + "grad_norm": 0.2925315201282501, + "learning_rate": 1.9539015405533935e-05, + "loss": 1.8162, + "step": 15190 + }, + { + "epoch": 0.010838823310244381, + "grad_norm": 0.3221004009246826, + "learning_rate": 1.9482762643466504e-05, + "loss": 1.8219, + "step": 15200 + }, + { + "epoch": 0.0113807644757566, + "grad_norm": 0.3065180480480194, + "learning_rate": 1.9426656696042424e-05, + "loss": 1.8196, + "step": 15210 + }, + { + "epoch": 0.01192270564126882, + "grad_norm": 0.3092059791088104, + "learning_rate": 1.9370697795182187e-05, + "loss": 1.8217, + "step": 15220 + }, + { + "epoch": 0.01219367622402493, + "eval_loss": 2.6005666255950928, + "eval_runtime": 22.3369, + "eval_samples_per_second": 223.845, + "eval_steps_per_second": 1.209, + "step": 15225 + }, + { + "epoch": 0.012464646806781039, + "grad_norm": 0.3684033155441284, + "learning_rate": 1.9314886172198426e-05, + "loss": 1.8316, + "step": 15230 + }, + { + "epoch": 0.013006587972293258, + "grad_norm": 0.3613276779651642, + "learning_rate": 1.9259222057794996e-05, + "loss": 1.8332, + "step": 15240 + }, + { + "epoch": 0.013548529137805477, + "grad_norm": 0.26722803711891174, + "learning_rate": 1.9203705682066007e-05, + "loss": 1.8249, + "step": 15250 + }, + { + "epoch": 0.014090470303317696, + "grad_norm": 0.30705708265304565, + "learning_rate": 1.9148337274494862e-05, + "loss": 1.8228, + "step": 15260 + }, + { + "epoch": 0.014632411468829916, + "grad_norm": 0.32335802912712097, + "learning_rate": 1.9093117063953354e-05, + "loss": 1.8176, + "step": 15270 + }, + { + "epoch": 0.015174352634342135, + "grad_norm": 0.33310064673423767, + "learning_rate": 1.9038045278700653e-05, + "loss": 1.824, + "step": 15280 + }, + { + "epoch": 0.015716293799854352, + "grad_norm": 0.27640849351882935, + "learning_rate": 1.8983122146382406e-05, + "loss": 1.8284, + "step": 15290 + }, + { + "epoch": 0.01625823496536657, + "grad_norm": 0.27379941940307617, + "learning_rate": 1.892834789402979e-05, + "loss": 1.8175, + "step": 15300 + }, + { + "epoch": 0.01680017613087879, + "grad_norm": 0.32370367646217346, + "learning_rate": 1.8873722748058552e-05, + "loss": 1.8272, + "step": 15310 + }, + { + "epoch": 0.016908564363981235, + "eval_loss": 2.6025352478027344, + "eval_runtime": 25.0337, + "eval_samples_per_second": 199.731, + "eval_steps_per_second": 1.079, + "step": 15312 + }, + { + "epoch": 0.01734211729639101, + "grad_norm": 0.3098716139793396, + "learning_rate": 1.8819246934268125e-05, + "loss": 1.8247, + "step": 15320 + }, + { + "epoch": 0.01788405846190323, + "grad_norm": 0.2977130115032196, + "learning_rate": 1.8764920677840632e-05, + "loss": 1.8229, + "step": 15330 + }, + { + "epoch": 0.018425999627415448, + "grad_norm": 0.28470513224601746, + "learning_rate": 1.871074420333999e-05, + "loss": 1.817, + "step": 15340 + }, + { + "epoch": 0.018967940792927667, + "grad_norm": 0.2921582758426666, + "learning_rate": 1.8656717734710975e-05, + "loss": 1.8392, + "step": 15350 + }, + { + "epoch": 0.019509881958439886, + "grad_norm": 0.30813974142074585, + "learning_rate": 1.8602841495278294e-05, + "loss": 1.8216, + "step": 15360 + }, + { + "epoch": 0.020051823123952105, + "grad_norm": 0.27432745695114136, + "learning_rate": 1.8549115707745656e-05, + "loss": 1.8257, + "step": 15370 + }, + { + "epoch": 0.020593764289464325, + "grad_norm": 0.2718896269798279, + "learning_rate": 1.84955405941949e-05, + "loss": 1.8297, + "step": 15380 + }, + { + "epoch": 0.021135705454976544, + "grad_norm": 0.2647911310195923, + "learning_rate": 1.8442116376084985e-05, + "loss": 1.8202, + "step": 15390 + }, + { + "epoch": 0.021623452503937542, + "eval_loss": 2.602659225463867, + "eval_runtime": 26.4511, + "eval_samples_per_second": 189.028, + "eval_steps_per_second": 1.021, + "step": 15399 + }, + { + "epoch": 0.021677646620488763, + "grad_norm": 0.3035239279270172, + "learning_rate": 1.8388843274251156e-05, + "loss": 1.825, + "step": 15400 + }, + { + "epoch": 0.022219587786000982, + "grad_norm": 0.31842127442359924, + "learning_rate": 1.8335721508903987e-05, + "loss": 1.8171, + "step": 15410 + }, + { + "epoch": 0.0227615289515132, + "grad_norm": 0.3104911744594574, + "learning_rate": 1.8282751299628486e-05, + "loss": 1.8218, + "step": 15420 + }, + { + "epoch": 0.02330347011702542, + "grad_norm": 0.3664672076702118, + "learning_rate": 1.822993286538321e-05, + "loss": 1.8127, + "step": 15430 + }, + { + "epoch": 0.02384541128253764, + "grad_norm": 0.2692658007144928, + "learning_rate": 1.8177266424499313e-05, + "loss": 1.8264, + "step": 15440 + }, + { + "epoch": 0.02438735244804986, + "grad_norm": 0.3049471378326416, + "learning_rate": 1.812475219467966e-05, + "loss": 1.8147, + "step": 15450 + }, + { + "epoch": 0.024929293613562078, + "grad_norm": 0.32675981521606445, + "learning_rate": 1.8072390392997954e-05, + "loss": 1.8286, + "step": 15460 + }, + { + "epoch": 0.025471234779074297, + "grad_norm": 0.2658865451812744, + "learning_rate": 1.8020181235897797e-05, + "loss": 1.8234, + "step": 15470 + }, + { + "epoch": 0.026013175944586516, + "grad_norm": 0.28163376450538635, + "learning_rate": 1.796812493919185e-05, + "loss": 1.8217, + "step": 15480 + }, + { + "epoch": 0.026338340643893846, + "eval_loss": 2.600201368331909, + "eval_runtime": 22.1652, + "eval_samples_per_second": 225.579, + "eval_steps_per_second": 1.218, + "step": 15486 + }, + { + "epoch": 0.026555117110098735, + "grad_norm": 0.32032302021980286, + "learning_rate": 1.791622171806088e-05, + "loss": 1.8177, + "step": 15490 + }, + { + "epoch": 0.027097058275610954, + "grad_norm": 0.28977230191230774, + "learning_rate": 1.78644717870529e-05, + "loss": 1.8137, + "step": 15500 + }, + { + "epoch": 0.027638999441123174, + "grad_norm": 0.332872599363327, + "learning_rate": 1.781287536008229e-05, + "loss": 1.8214, + "step": 15510 + }, + { + "epoch": 0.028180940606635393, + "grad_norm": 0.27174341678619385, + "learning_rate": 1.77614326504289e-05, + "loss": 1.8277, + "step": 15520 + }, + { + "epoch": 0.028722881772147612, + "grad_norm": 0.27996736764907837, + "learning_rate": 1.771014387073715e-05, + "loss": 1.8163, + "step": 15530 + }, + { + "epoch": 0.02926482293765983, + "grad_norm": 0.26476001739501953, + "learning_rate": 1.765900923301523e-05, + "loss": 1.8268, + "step": 15540 + }, + { + "epoch": 0.02980676410317205, + "grad_norm": 0.3071003556251526, + "learning_rate": 1.760802894863412e-05, + "loss": 1.8231, + "step": 15550 + }, + { + "epoch": 0.03034870526868427, + "grad_norm": 0.31128978729248047, + "learning_rate": 1.7557203228326737e-05, + "loss": 1.8213, + "step": 15560 + }, + { + "epoch": 0.03089064643419649, + "grad_norm": 0.27923986315727234, + "learning_rate": 1.7506532282187166e-05, + "loss": 1.8192, + "step": 15570 + }, + { + "epoch": 0.031053228783850154, + "eval_loss": 2.6021640300750732, + "eval_runtime": 22.1843, + "eval_samples_per_second": 225.384, + "eval_steps_per_second": 1.217, + "step": 15573 + }, + { + "epoch": 0.031432587599708704, + "grad_norm": 0.324139267206192, + "learning_rate": 1.7456016319669642e-05, + "loss": 1.827, + "step": 15580 + }, + { + "epoch": 0.03197452876522092, + "grad_norm": 0.27704742550849915, + "learning_rate": 1.7405655549587816e-05, + "loss": 1.8165, + "step": 15590 + }, + { + "epoch": 0.03251646993073314, + "grad_norm": 0.3016372323036194, + "learning_rate": 1.7355450180113808e-05, + "loss": 1.8272, + "step": 15600 + }, + { + "epoch": 0.03305841109624536, + "grad_norm": 0.2791915833950043, + "learning_rate": 1.730540041877733e-05, + "loss": 1.8166, + "step": 15610 + }, + { + "epoch": 0.03360035226175758, + "grad_norm": 0.40646281838417053, + "learning_rate": 1.7255506472464936e-05, + "loss": 1.828, + "step": 15620 + }, + { + "epoch": 0.0341422934272698, + "grad_norm": 0.32504037022590637, + "learning_rate": 1.7205768547419077e-05, + "loss": 1.8265, + "step": 15630 + }, + { + "epoch": 0.03468423459278202, + "grad_norm": 0.2760820984840393, + "learning_rate": 1.7156186849237244e-05, + "loss": 1.8341, + "step": 15640 + }, + { + "epoch": 0.03522617575829424, + "grad_norm": 0.2752606272697449, + "learning_rate": 1.7106761582871205e-05, + "loss": 1.8145, + "step": 15650 + }, + { + "epoch": 0.03576811692380646, + "grad_norm": 0.3181304633617401, + "learning_rate": 1.7057492952626025e-05, + "loss": 1.8322, + "step": 15660 + }, + { + "epoch": 0.03576811692380646, + "eval_loss": 2.6028239727020264, + "eval_runtime": 21.9861, + "eval_samples_per_second": 227.417, + "eval_steps_per_second": 1.228, + "step": 15660 + }, + { + "epoch": 0.03631005808931868, + "grad_norm": 0.2869409918785095, + "learning_rate": 1.7008381162159358e-05, + "loss": 1.8151, + "step": 15670 + }, + { + "epoch": 0.036851999254830896, + "grad_norm": 0.30195239186286926, + "learning_rate": 1.6959426414480516e-05, + "loss": 1.8268, + "step": 15680 + }, + { + "epoch": 0.037393940420343115, + "grad_norm": 0.2631888687610626, + "learning_rate": 1.6910628911949644e-05, + "loss": 1.8166, + "step": 15690 + }, + { + "epoch": 0.037935881585855334, + "grad_norm": 0.29582157731056213, + "learning_rate": 1.6861988856276946e-05, + "loss": 1.8089, + "step": 15700 + }, + { + "epoch": 0.03847782275136755, + "grad_norm": 0.27693384885787964, + "learning_rate": 1.6813506448521727e-05, + "loss": 1.812, + "step": 15710 + }, + { + "epoch": 0.03901976391687977, + "grad_norm": 0.282247930765152, + "learning_rate": 1.6765181889091675e-05, + "loss": 1.819, + "step": 15720 + }, + { + "epoch": 0.03956170508239199, + "grad_norm": 0.3580980896949768, + "learning_rate": 1.671701537774202e-05, + "loss": 1.8174, + "step": 15730 + }, + { + "epoch": 0.04010364624790421, + "grad_norm": 0.2820577621459961, + "learning_rate": 1.666900711357463e-05, + "loss": 1.8327, + "step": 15740 + }, + { + "epoch": 0.04048300506376277, + "eval_loss": 2.6001157760620117, + "eval_runtime": 25.7544, + "eval_samples_per_second": 194.141, + "eval_steps_per_second": 1.048, + "step": 15747 + }, + { + "epoch": 0.04064558741341643, + "grad_norm": 0.3680673837661743, + "learning_rate": 1.6621157295037298e-05, + "loss": 1.8087, + "step": 15750 + }, + { + "epoch": 0.04118752857892865, + "grad_norm": 0.3296014964580536, + "learning_rate": 1.65734661199228e-05, + "loss": 1.8183, + "step": 15760 + }, + { + "epoch": 0.04172946974444087, + "grad_norm": 0.29352930188179016, + "learning_rate": 1.652593378536816e-05, + "loss": 1.8075, + "step": 15770 + }, + { + "epoch": 0.04227141090995309, + "grad_norm": 0.36422020196914673, + "learning_rate": 1.6478560487853866e-05, + "loss": 1.8151, + "step": 15780 + }, + { + "epoch": 0.04281335207546531, + "grad_norm": 0.30587121844291687, + "learning_rate": 1.6431346423202945e-05, + "loss": 1.8174, + "step": 15790 + }, + { + "epoch": 0.043355293240977526, + "grad_norm": 0.29669710993766785, + "learning_rate": 1.6384291786580247e-05, + "loss": 1.8186, + "step": 15800 + }, + { + "epoch": 0.043897234406489745, + "grad_norm": 0.3331429362297058, + "learning_rate": 1.633739677249159e-05, + "loss": 1.8162, + "step": 15810 + }, + { + "epoch": 0.044439175572001964, + "grad_norm": 0.3026489019393921, + "learning_rate": 1.6290661574782995e-05, + "loss": 1.8103, + "step": 15820 + }, + { + "epoch": 0.04498111673751418, + "grad_norm": 0.29594072699546814, + "learning_rate": 1.624408638663985e-05, + "loss": 1.8055, + "step": 15830 + }, + { + "epoch": 0.04519789320371907, + "eval_loss": 2.599421977996826, + "eval_runtime": 23.9096, + "eval_samples_per_second": 209.121, + "eval_steps_per_second": 1.129, + "step": 15834 + }, + { + "epoch": 0.0455230579030264, + "grad_norm": 0.286182165145874, + "learning_rate": 1.619767140058614e-05, + "loss": 1.8213, + "step": 15840 + }, + { + "epoch": 0.04606499906853862, + "grad_norm": 0.3165307343006134, + "learning_rate": 1.6151416808483603e-05, + "loss": 1.816, + "step": 15850 + }, + { + "epoch": 0.04660694023405084, + "grad_norm": 0.3465806245803833, + "learning_rate": 1.6105322801531005e-05, + "loss": 1.8199, + "step": 15860 + }, + { + "epoch": 0.04714888139956306, + "grad_norm": 0.31671446561813354, + "learning_rate": 1.605938957026329e-05, + "loss": 1.8164, + "step": 15870 + }, + { + "epoch": 0.04769082256507528, + "grad_norm": 0.2874152362346649, + "learning_rate": 1.6013617304550827e-05, + "loss": 1.8194, + "step": 15880 + }, + { + "epoch": 0.0482327637305875, + "grad_norm": 0.28032568097114563, + "learning_rate": 1.5968006193598626e-05, + "loss": 1.8211, + "step": 15890 + }, + { + "epoch": 0.04877470489609972, + "grad_norm": 0.3768242597579956, + "learning_rate": 1.5922556425945532e-05, + "loss": 1.8175, + "step": 15900 + }, + { + "epoch": 0.049316646061611936, + "grad_norm": 0.32226991653442383, + "learning_rate": 1.5877268189463456e-05, + "loss": 1.8246, + "step": 15910 + }, + { + "epoch": 0.049858587227124156, + "grad_norm": 0.29384303092956543, + "learning_rate": 1.583214167135661e-05, + "loss": 1.8125, + "step": 15920 + }, + { + "epoch": 0.049912781343675376, + "eval_loss": 2.5980074405670166, + "eval_runtime": 24.6083, + "eval_samples_per_second": 203.184, + "eval_steps_per_second": 1.097, + "step": 15921 + }, + { + "epoch": 0.050400528392636375, + "grad_norm": 0.30090948939323425, + "learning_rate": 1.57871770581607e-05, + "loss": 1.8066, + "step": 15930 + }, + { + "epoch": 0.050942469558148594, + "grad_norm": 0.28371649980545044, + "learning_rate": 1.5742374535742233e-05, + "loss": 1.8208, + "step": 15940 + }, + { + "epoch": 0.05148441072366081, + "grad_norm": 0.2772349417209625, + "learning_rate": 1.569773428929765e-05, + "loss": 1.8237, + "step": 15950 + }, + { + "epoch": 0.05202635188917303, + "grad_norm": 0.28294837474823, + "learning_rate": 1.5653256503352603e-05, + "loss": 1.8141, + "step": 15960 + }, + { + "epoch": 0.05256829305468525, + "grad_norm": 0.264106810092926, + "learning_rate": 1.5608941361761224e-05, + "loss": 1.8101, + "step": 15970 + }, + { + "epoch": 0.05311023422019747, + "grad_norm": 0.2991582155227661, + "learning_rate": 1.5564789047705296e-05, + "loss": 1.8162, + "step": 15980 + }, + { + "epoch": 0.05365217538570969, + "grad_norm": 0.2891395688056946, + "learning_rate": 1.5520799743693585e-05, + "loss": 1.8405, + "step": 15990 + }, + { + "epoch": 0.05419411655122191, + "grad_norm": 0.29393982887268066, + "learning_rate": 1.5476973631561003e-05, + "loss": 1.8179, + "step": 16000 + }, + { + "epoch": 0.05462766948363168, + "eval_loss": 2.5983400344848633, + "eval_runtime": 24.2547, + "eval_samples_per_second": 206.145, + "eval_steps_per_second": 1.113, + "step": 16008 + }, + { + "epoch": 0.05473605771673413, + "grad_norm": 0.2763077914714813, + "learning_rate": 1.5433310892467897e-05, + "loss": 1.8226, + "step": 16010 + }, + { + "epoch": 0.05527799888224635, + "grad_norm": 0.27592843770980835, + "learning_rate": 1.5389811706899284e-05, + "loss": 1.8173, + "step": 16020 + }, + { + "epoch": 0.055819940047758566, + "grad_norm": 0.31375107169151306, + "learning_rate": 1.5346476254664132e-05, + "loss": 1.8118, + "step": 16030 + }, + { + "epoch": 0.056361881213270786, + "grad_norm": 0.3110925853252411, + "learning_rate": 1.5303304714894568e-05, + "loss": 1.8171, + "step": 16040 + }, + { + "epoch": 0.056903822378783005, + "grad_norm": 0.2970430254936218, + "learning_rate": 1.526029726604521e-05, + "loss": 1.8095, + "step": 16050 + }, + { + "epoch": 0.057445763544295224, + "grad_norm": 0.27963101863861084, + "learning_rate": 1.5217454085892358e-05, + "loss": 1.8202, + "step": 16060 + }, + { + "epoch": 0.05798770470980744, + "grad_norm": 0.31473496556282043, + "learning_rate": 1.5174775351533289e-05, + "loss": 1.831, + "step": 16070 + }, + { + "epoch": 0.05852964587531966, + "grad_norm": 0.2634471654891968, + "learning_rate": 1.5132261239385533e-05, + "loss": 1.8102, + "step": 16080 + }, + { + "epoch": 0.05907158704083188, + "grad_norm": 0.36205658316612244, + "learning_rate": 1.5089911925186124e-05, + "loss": 1.814, + "step": 16090 + }, + { + "epoch": 0.05934255762358799, + "eval_loss": 2.5985541343688965, + "eval_runtime": 28.1025, + "eval_samples_per_second": 177.92, + "eval_steps_per_second": 0.961, + "step": 16095 + }, + { + "epoch": 0.0596135282063441, + "grad_norm": 0.5245100855827332, + "learning_rate": 1.5047727583990898e-05, + "loss": 1.8204, + "step": 16100 + }, + { + "epoch": 0.06015546937185632, + "grad_norm": 0.316148966550827, + "learning_rate": 1.5005708390173756e-05, + "loss": 1.8179, + "step": 16110 + }, + { + "epoch": 0.06069741053736854, + "grad_norm": 0.29313069581985474, + "learning_rate": 1.4963854517425934e-05, + "loss": 1.8179, + "step": 16120 + }, + { + "epoch": 0.06123935170288076, + "grad_norm": 0.26670292019844055, + "learning_rate": 1.4922166138755289e-05, + "loss": 1.8124, + "step": 16130 + }, + { + "epoch": 0.06178129286839298, + "grad_norm": 0.3401847183704376, + "learning_rate": 1.4880643426485602e-05, + "loss": 1.8163, + "step": 16140 + }, + { + "epoch": 0.062323234033905196, + "grad_norm": 0.31569597125053406, + "learning_rate": 1.4839286552255843e-05, + "loss": 1.8118, + "step": 16150 + }, + { + "epoch": 0.06286517519941741, + "grad_norm": 0.2905227839946747, + "learning_rate": 1.4798095687019482e-05, + "loss": 1.8281, + "step": 16160 + }, + { + "epoch": 0.06340711636492963, + "grad_norm": 0.27679443359375, + "learning_rate": 1.4757071001043765e-05, + "loss": 1.8123, + "step": 16170 + }, + { + "epoch": 0.06394905753044185, + "grad_norm": 0.2783887982368469, + "learning_rate": 1.4716212663909016e-05, + "loss": 1.806, + "step": 16180 + }, + { + "epoch": 0.0640574457635443, + "eval_loss": 2.598423957824707, + "eval_runtime": 22.218, + "eval_samples_per_second": 225.043, + "eval_steps_per_second": 1.215, + "step": 16182 + }, + { + "epoch": 0.06449099869595407, + "grad_norm": 0.2765551507472992, + "learning_rate": 1.4675520844507932e-05, + "loss": 1.8033, + "step": 16190 + }, + { + "epoch": 0.06503293986146629, + "grad_norm": 0.2806571424007416, + "learning_rate": 1.4634995711044892e-05, + "loss": 1.8129, + "step": 16200 + }, + { + "epoch": 0.0655748810269785, + "grad_norm": 0.3304462134838104, + "learning_rate": 1.4594637431035272e-05, + "loss": 1.8113, + "step": 16210 + }, + { + "epoch": 0.06611682219249072, + "grad_norm": 0.29869329929351807, + "learning_rate": 1.4554446171304736e-05, + "loss": 1.8068, + "step": 16220 + }, + { + "epoch": 0.06665876335800294, + "grad_norm": 0.2936442196369171, + "learning_rate": 1.451442209798852e-05, + "loss": 1.8089, + "step": 16230 + }, + { + "epoch": 0.06720070452351516, + "grad_norm": 0.2957615256309509, + "learning_rate": 1.4474565376530819e-05, + "loss": 1.7961, + "step": 16240 + }, + { + "epoch": 0.06774264568902738, + "grad_norm": 0.2832295298576355, + "learning_rate": 1.4434876171684023e-05, + "loss": 1.814, + "step": 16250 + }, + { + "epoch": 0.0682845868545396, + "grad_norm": 0.2817908525466919, + "learning_rate": 1.4395354647508113e-05, + "loss": 1.8179, + "step": 16260 + }, + { + "epoch": 0.0687723339035006, + "eval_loss": 2.5987160205841064, + "eval_runtime": 31.5857, + "eval_samples_per_second": 158.299, + "eval_steps_per_second": 0.855, + "step": 16269 + }, + { + "epoch": 0.06882652802005182, + "grad_norm": 0.27101555466651917, + "learning_rate": 1.4356000967369915e-05, + "loss": 1.8071, + "step": 16270 + }, + { + "epoch": 0.06936846918556404, + "grad_norm": 0.29149556159973145, + "learning_rate": 1.4316815293942438e-05, + "loss": 1.8095, + "step": 16280 + }, + { + "epoch": 0.06991041035107626, + "grad_norm": 0.3395109176635742, + "learning_rate": 1.4277797789204242e-05, + "loss": 1.8077, + "step": 16290 + }, + { + "epoch": 0.07045235151658848, + "grad_norm": 0.2934078276157379, + "learning_rate": 1.4238948614438738e-05, + "loss": 1.8234, + "step": 16300 + }, + { + "epoch": 0.0709942926821007, + "grad_norm": 0.40953195095062256, + "learning_rate": 1.4200267930233512e-05, + "loss": 1.8193, + "step": 16310 + }, + { + "epoch": 0.07153623384761292, + "grad_norm": 0.30236881971359253, + "learning_rate": 1.4161755896479711e-05, + "loss": 1.804, + "step": 16320 + }, + { + "epoch": 0.07207817501312513, + "grad_norm": 0.3711455762386322, + "learning_rate": 1.4123412672371278e-05, + "loss": 1.801, + "step": 16330 + }, + { + "epoch": 0.07262011617863735, + "grad_norm": 0.33262383937835693, + "learning_rate": 1.4085238416404434e-05, + "loss": 1.8189, + "step": 16340 + }, + { + "epoch": 0.07316205734414957, + "grad_norm": 0.2707761228084564, + "learning_rate": 1.4047233286376901e-05, + "loss": 1.8234, + "step": 16350 + }, + { + "epoch": 0.07348722204345691, + "eval_loss": 2.5961382389068604, + "eval_runtime": 23.0319, + "eval_samples_per_second": 217.09, + "eval_steps_per_second": 1.172, + "step": 16356 + }, + { + "epoch": 0.07370399850966179, + "grad_norm": 0.277217835187912, + "learning_rate": 1.4009397439387317e-05, + "loss": 1.8101, + "step": 16360 + }, + { + "epoch": 0.07424593967517401, + "grad_norm": 0.324625700712204, + "learning_rate": 1.3971731031834595e-05, + "loss": 1.8177, + "step": 16370 + }, + { + "epoch": 0.07478788084068623, + "grad_norm": 0.27004820108413696, + "learning_rate": 1.3934234219417198e-05, + "loss": 1.8239, + "step": 16380 + }, + { + "epoch": 0.07532982200619845, + "grad_norm": 0.31389129161834717, + "learning_rate": 1.3896907157132564e-05, + "loss": 1.8044, + "step": 16390 + }, + { + "epoch": 0.07587176317171067, + "grad_norm": 0.334043025970459, + "learning_rate": 1.3859749999276485e-05, + "loss": 1.8114, + "step": 16400 + }, + { + "epoch": 0.07641370433722289, + "grad_norm": 0.28280240297317505, + "learning_rate": 1.382276289944239e-05, + "loss": 1.81, + "step": 16410 + }, + { + "epoch": 0.0769556455027351, + "grad_norm": 0.2850959002971649, + "learning_rate": 1.3785946010520792e-05, + "loss": 1.8064, + "step": 16420 + }, + { + "epoch": 0.07749758666824733, + "grad_norm": 0.33790093660354614, + "learning_rate": 1.3749299484698571e-05, + "loss": 1.8038, + "step": 16430 + }, + { + "epoch": 0.07803952783375954, + "grad_norm": 0.3782959282398224, + "learning_rate": 1.3712823473458422e-05, + "loss": 1.814, + "step": 16440 + }, + { + "epoch": 0.0782021101834132, + "eval_loss": 2.5989608764648438, + "eval_runtime": 23.6056, + "eval_samples_per_second": 211.814, + "eval_steps_per_second": 1.144, + "step": 16443 + }, + { + "epoch": 0.07858146899927176, + "grad_norm": 0.34481021761894226, + "learning_rate": 1.36765181275782e-05, + "loss": 1.8129, + "step": 16450 + }, + { + "epoch": 0.07912341016478398, + "grad_norm": 0.2746882438659668, + "learning_rate": 1.3640383597130288e-05, + "loss": 1.8063, + "step": 16460 + }, + { + "epoch": 0.0796653513302962, + "grad_norm": 0.38598158955574036, + "learning_rate": 1.3604420031480968e-05, + "loss": 1.8072, + "step": 16470 + }, + { + "epoch": 0.08020729249580842, + "grad_norm": 0.267660915851593, + "learning_rate": 1.356862757928985e-05, + "loss": 1.7979, + "step": 16480 + }, + { + "epoch": 0.08074923366132064, + "grad_norm": 0.33238664269447327, + "learning_rate": 1.3533006388509187e-05, + "loss": 1.8071, + "step": 16490 + }, + { + "epoch": 0.08129117482683286, + "grad_norm": 0.28862419724464417, + "learning_rate": 1.3497556606383354e-05, + "loss": 1.8117, + "step": 16500 + }, + { + "epoch": 0.08183311599234508, + "grad_norm": 0.36335626244544983, + "learning_rate": 1.3462278379448148e-05, + "loss": 1.8215, + "step": 16510 + }, + { + "epoch": 0.0823750571578573, + "grad_norm": 0.31064873933792114, + "learning_rate": 1.3427171853530232e-05, + "loss": 1.803, + "step": 16520 + }, + { + "epoch": 0.08291699832336952, + "grad_norm": 0.3698948919773102, + "learning_rate": 1.3392237173746527e-05, + "loss": 1.8072, + "step": 16530 + }, + { + "epoch": 0.08291699832336952, + "eval_loss": 2.5981791019439697, + "eval_runtime": 21.9635, + "eval_samples_per_second": 227.65, + "eval_steps_per_second": 1.229, + "step": 16530 + }, + { + "epoch": 0.08345893948888174, + "grad_norm": 0.272014319896698, + "learning_rate": 1.3357474484503613e-05, + "loss": 1.8132, + "step": 16540 + }, + { + "epoch": 0.08400088065439396, + "grad_norm": 0.2674700617790222, + "learning_rate": 1.3322883929497105e-05, + "loss": 1.8112, + "step": 16550 + }, + { + "epoch": 0.08454282181990617, + "grad_norm": 0.3143172264099121, + "learning_rate": 1.3288465651711113e-05, + "loss": 1.8035, + "step": 16560 + }, + { + "epoch": 0.0850847629854184, + "grad_norm": 0.2932548224925995, + "learning_rate": 1.3254219793417596e-05, + "loss": 1.8099, + "step": 16570 + }, + { + "epoch": 0.08562670415093061, + "grad_norm": 0.28410300612449646, + "learning_rate": 1.3220146496175808e-05, + "loss": 1.8086, + "step": 16580 + }, + { + "epoch": 0.08616864531644283, + "grad_norm": 0.2889862358570099, + "learning_rate": 1.3186245900831692e-05, + "loss": 1.8116, + "step": 16590 + }, + { + "epoch": 0.08671058648195505, + "grad_norm": 0.26894381642341614, + "learning_rate": 1.3152518147517309e-05, + "loss": 1.8031, + "step": 16600 + }, + { + "epoch": 0.08725252764746727, + "grad_norm": 0.265250027179718, + "learning_rate": 1.3118963375650279e-05, + "loss": 1.8048, + "step": 16610 + }, + { + "epoch": 0.08763188646332583, + "eval_loss": 2.597559690475464, + "eval_runtime": 27.4632, + "eval_samples_per_second": 182.062, + "eval_steps_per_second": 0.983, + "step": 16617 + }, + { + "epoch": 0.08779446881297949, + "grad_norm": 0.27796122431755066, + "learning_rate": 1.3085581723933146e-05, + "loss": 1.8049, + "step": 16620 + }, + { + "epoch": 0.08833640997849171, + "grad_norm": 0.3169678747653961, + "learning_rate": 1.3052373330352873e-05, + "loss": 1.8094, + "step": 16630 + }, + { + "epoch": 0.08887835114400393, + "grad_norm": 0.3725588321685791, + "learning_rate": 1.3019338332180223e-05, + "loss": 1.8171, + "step": 16640 + }, + { + "epoch": 0.08942029230951615, + "grad_norm": 0.3112504780292511, + "learning_rate": 1.2986476865969215e-05, + "loss": 1.8162, + "step": 16650 + }, + { + "epoch": 0.08996223347502837, + "grad_norm": 0.33968761563301086, + "learning_rate": 1.2953789067556545e-05, + "loss": 1.8147, + "step": 16660 + }, + { + "epoch": 0.09050417464054059, + "grad_norm": 0.3981771469116211, + "learning_rate": 1.2921275072061061e-05, + "loss": 1.8085, + "step": 16670 + }, + { + "epoch": 0.0910461158060528, + "grad_norm": 0.35043928027153015, + "learning_rate": 1.2888935013883141e-05, + "loss": 1.8226, + "step": 16680 + }, + { + "epoch": 0.09158805697156502, + "grad_norm": 0.26825150847435, + "learning_rate": 1.2856769026704188e-05, + "loss": 1.8066, + "step": 16690 + }, + { + "epoch": 0.09212999813707724, + "grad_norm": 0.27858951687812805, + "learning_rate": 1.2824777243486063e-05, + "loss": 1.8188, + "step": 16700 + }, + { + "epoch": 0.09234677460328213, + "eval_loss": 2.594632863998413, + "eval_runtime": 22.2955, + "eval_samples_per_second": 224.26, + "eval_steps_per_second": 1.211, + "step": 16704 + }, + { + "epoch": 0.09267193930258946, + "grad_norm": 0.27273324131965637, + "learning_rate": 1.2792959796470527e-05, + "loss": 1.804, + "step": 16710 + }, + { + "epoch": 0.09321388046810168, + "grad_norm": 0.3499986529350281, + "learning_rate": 1.2761316817178725e-05, + "loss": 1.8051, + "step": 16720 + }, + { + "epoch": 0.0937558216336139, + "grad_norm": 0.26582643389701843, + "learning_rate": 1.2729848436410596e-05, + "loss": 1.8208, + "step": 16730 + }, + { + "epoch": 0.09429776279912612, + "grad_norm": 0.3342505693435669, + "learning_rate": 1.2698554784244355e-05, + "loss": 1.8129, + "step": 16740 + }, + { + "epoch": 0.09483970396463834, + "grad_norm": 0.28335967659950256, + "learning_rate": 1.2667435990035972e-05, + "loss": 1.8078, + "step": 16750 + }, + { + "epoch": 0.09538164513015056, + "grad_norm": 0.2727366089820862, + "learning_rate": 1.2636492182418594e-05, + "loss": 1.8062, + "step": 16760 + }, + { + "epoch": 0.09592358629566278, + "grad_norm": 0.31147339940071106, + "learning_rate": 1.2605723489302087e-05, + "loss": 1.8107, + "step": 16770 + }, + { + "epoch": 0.096465527461175, + "grad_norm": 0.2873106002807617, + "learning_rate": 1.2575130037872418e-05, + "loss": 1.8043, + "step": 16780 + }, + { + "epoch": 0.09700746862668722, + "grad_norm": 0.32366591691970825, + "learning_rate": 1.2544711954591192e-05, + "loss": 1.8257, + "step": 16790 + }, + { + "epoch": 0.09706166274323844, + "eval_loss": 2.5952253341674805, + "eval_runtime": 27.6353, + "eval_samples_per_second": 180.928, + "eval_steps_per_second": 0.977, + "step": 16791 + }, + { + "epoch": 0.09754940979219943, + "grad_norm": 0.2702518105506897, + "learning_rate": 1.2514469365195094e-05, + "loss": 1.8183, + "step": 16800 + }, + { + "epoch": 0.09809135095771165, + "grad_norm": 0.28163909912109375, + "learning_rate": 1.2484402394695408e-05, + "loss": 1.8045, + "step": 16810 + }, + { + "epoch": 0.09863329212322387, + "grad_norm": 0.2960811257362366, + "learning_rate": 1.2454511167377447e-05, + "loss": 1.8087, + "step": 16820 + }, + { + "epoch": 0.09917523328873609, + "grad_norm": 0.28118273615837097, + "learning_rate": 1.2424795806800102e-05, + "loss": 1.812, + "step": 16830 + }, + { + "epoch": 0.09971717445424831, + "grad_norm": 0.30942973494529724, + "learning_rate": 1.2395256435795278e-05, + "loss": 1.8082, + "step": 16840 + }, + { + "epoch": 0.10025911561976053, + "grad_norm": 0.33988916873931885, + "learning_rate": 1.2365893176467404e-05, + "loss": 1.812, + "step": 16850 + }, + { + "epoch": 0.10080105678527275, + "grad_norm": 0.3102359473705292, + "learning_rate": 1.2336706150192946e-05, + "loss": 1.8144, + "step": 16860 + }, + { + "epoch": 0.10134299795078497, + "grad_norm": 0.3185904920101166, + "learning_rate": 1.2307695477619872e-05, + "loss": 1.805, + "step": 16870 + }, + { + "epoch": 0.10177655088319475, + "eval_loss": 2.5944910049438477, + "eval_runtime": 22.1385, + "eval_samples_per_second": 225.85, + "eval_steps_per_second": 1.22, + "step": 16878 + }, + { + "epoch": 0.10188493911629719, + "grad_norm": 0.28458863496780396, + "learning_rate": 1.2278861278667187e-05, + "loss": 1.795, + "step": 16880 + }, + { + "epoch": 0.10242688028180941, + "grad_norm": 0.31006863713264465, + "learning_rate": 1.2250203672524424e-05, + "loss": 1.7943, + "step": 16890 + }, + { + "epoch": 0.10296882144732163, + "grad_norm": 0.28029975295066833, + "learning_rate": 1.2221722777651119e-05, + "loss": 1.8163, + "step": 16900 + }, + { + "epoch": 0.10351076261283385, + "grad_norm": 0.27603140473365784, + "learning_rate": 1.219341871177639e-05, + "loss": 1.814, + "step": 16910 + }, + { + "epoch": 0.10405270377834606, + "grad_norm": 0.2757927179336548, + "learning_rate": 1.2165291591898383e-05, + "loss": 1.8138, + "step": 16920 + }, + { + "epoch": 0.10459464494385828, + "grad_norm": 0.2880472242832184, + "learning_rate": 1.2137341534283842e-05, + "loss": 1.8064, + "step": 16930 + }, + { + "epoch": 0.1051365861093705, + "grad_norm": 0.2712019979953766, + "learning_rate": 1.2109568654467594e-05, + "loss": 1.8005, + "step": 16940 + }, + { + "epoch": 0.10567852727488272, + "grad_norm": 0.2719828188419342, + "learning_rate": 1.2081973067252051e-05, + "loss": 1.8146, + "step": 16950 + }, + { + "epoch": 0.10622046844039494, + "grad_norm": 0.3635210394859314, + "learning_rate": 1.205455488670681e-05, + "loss": 1.8189, + "step": 16960 + }, + { + "epoch": 0.10649143902315104, + "eval_loss": 2.596562385559082, + "eval_runtime": 28.8968, + "eval_samples_per_second": 173.029, + "eval_steps_per_second": 0.934, + "step": 16965 + }, + { + "epoch": 0.10676240960590716, + "grad_norm": 0.2864342927932739, + "learning_rate": 1.2027314226168121e-05, + "loss": 1.8097, + "step": 16970 + }, + { + "epoch": 0.10730435077141938, + "grad_norm": 0.3084823489189148, + "learning_rate": 1.2000251198238424e-05, + "loss": 1.8093, + "step": 16980 + }, + { + "epoch": 0.1078462919369316, + "grad_norm": 0.3098483383655548, + "learning_rate": 1.1973365914785909e-05, + "loss": 1.8137, + "step": 16990 + }, + { + "epoch": 0.10838823310244382, + "grad_norm": 0.30631518363952637, + "learning_rate": 1.1946658486944022e-05, + "loss": 1.8042, + "step": 17000 + }, + { + "epoch": 0.10893017426795604, + "grad_norm": 0.33063915371894836, + "learning_rate": 1.1920129025111034e-05, + "loss": 1.8086, + "step": 17010 + }, + { + "epoch": 0.10947211543346826, + "grad_norm": 0.2645542025566101, + "learning_rate": 1.1893777638949576e-05, + "loss": 1.8219, + "step": 17020 + }, + { + "epoch": 0.11001405659898048, + "grad_norm": 0.27368003129959106, + "learning_rate": 1.1867604437386164e-05, + "loss": 1.8101, + "step": 17030 + }, + { + "epoch": 0.1105559977644927, + "grad_norm": 0.2701696455478668, + "learning_rate": 1.1841609528610796e-05, + "loss": 1.8174, + "step": 17040 + }, + { + "epoch": 0.11109793893000491, + "grad_norm": 0.28612881898880005, + "learning_rate": 1.1815793020076448e-05, + "loss": 1.8077, + "step": 17050 + }, + { + "epoch": 0.11120632716310735, + "eval_loss": 2.595874071121216, + "eval_runtime": 29.3622, + "eval_samples_per_second": 170.287, + "eval_steps_per_second": 0.92, + "step": 17052 + }, + { + "epoch": 0.11163988009551713, + "grad_norm": 0.33459609746932983, + "learning_rate": 1.1790155018498666e-05, + "loss": 1.7929, + "step": 17060 + }, + { + "epoch": 0.11218182126102935, + "grad_norm": 0.27914193272590637, + "learning_rate": 1.176469562985513e-05, + "loss": 1.8074, + "step": 17070 + }, + { + "epoch": 0.11272376242654157, + "grad_norm": 0.26985886693000793, + "learning_rate": 1.1739414959385191e-05, + "loss": 1.8024, + "step": 17080 + }, + { + "epoch": 0.11326570359205379, + "grad_norm": 0.4287962019443512, + "learning_rate": 1.1714313111589445e-05, + "loss": 1.7996, + "step": 17090 + }, + { + "epoch": 0.11380764475756601, + "grad_norm": 0.29455164074897766, + "learning_rate": 1.1689390190229312e-05, + "loss": 1.8117, + "step": 17100 + }, + { + "epoch": 0.11434958592307823, + "grad_norm": 0.26276010274887085, + "learning_rate": 1.1664646298326582e-05, + "loss": 1.8055, + "step": 17110 + }, + { + "epoch": 0.11489152708859045, + "grad_norm": 0.2993430495262146, + "learning_rate": 1.1640081538163036e-05, + "loss": 1.8146, + "step": 17120 + }, + { + "epoch": 0.11543346825410267, + "grad_norm": 0.3164224922657013, + "learning_rate": 1.1615696011279965e-05, + "loss": 1.8064, + "step": 17130 + }, + { + "epoch": 0.11592121530306367, + "eval_loss": 2.596324920654297, + "eval_runtime": 25.6516, + "eval_samples_per_second": 194.919, + "eval_steps_per_second": 1.053, + "step": 17139 + }, + { + "epoch": 0.11597540941961489, + "grad_norm": 0.3307249844074249, + "learning_rate": 1.1591489818477795e-05, + "loss": 1.8039, + "step": 17140 + }, + { + "epoch": 0.1165173505851271, + "grad_norm": 0.29287073016166687, + "learning_rate": 1.1567463059815642e-05, + "loss": 1.8227, + "step": 17150 + }, + { + "epoch": 0.11705929175063932, + "grad_norm": 0.282429039478302, + "learning_rate": 1.1543615834610914e-05, + "loss": 1.8041, + "step": 17160 + }, + { + "epoch": 0.11760123291615154, + "grad_norm": 0.2842390835285187, + "learning_rate": 1.1519948241438899e-05, + "loss": 1.8088, + "step": 17170 + }, + { + "epoch": 0.11814317408166376, + "grad_norm": 0.2744342088699341, + "learning_rate": 1.149646037813236e-05, + "loss": 1.8148, + "step": 17180 + }, + { + "epoch": 0.11868511524717598, + "grad_norm": 0.28901031613349915, + "learning_rate": 1.1473152341781114e-05, + "loss": 1.8048, + "step": 17190 + }, + { + "epoch": 0.1192270564126882, + "grad_norm": 0.2894619107246399, + "learning_rate": 1.1450024228731648e-05, + "loss": 1.8114, + "step": 17200 + }, + { + "epoch": 0.11976899757820042, + "grad_norm": 0.2956114709377289, + "learning_rate": 1.1427076134586722e-05, + "loss": 1.8023, + "step": 17210 + }, + { + "epoch": 0.12031093874371264, + "grad_norm": 0.28053778409957886, + "learning_rate": 1.1404308154204944e-05, + "loss": 1.8051, + "step": 17220 + }, + { + "epoch": 0.12063610344301996, + "eval_loss": 2.5927584171295166, + "eval_runtime": 23.5866, + "eval_samples_per_second": 211.984, + "eval_steps_per_second": 1.145, + "step": 17226 + }, + { + "epoch": 0.12085287990922486, + "grad_norm": 0.3586195707321167, + "learning_rate": 1.1381720381700439e-05, + "loss": 1.8011, + "step": 17230 + }, + { + "epoch": 0.12139482107473708, + "grad_norm": 0.2799775004386902, + "learning_rate": 1.1359312910442378e-05, + "loss": 1.8082, + "step": 17240 + }, + { + "epoch": 0.1219367622402493, + "grad_norm": 0.2748625874519348, + "learning_rate": 1.1337085833054672e-05, + "loss": 1.7877, + "step": 17250 + }, + { + "epoch": 0.12247870340576152, + "grad_norm": 0.27648496627807617, + "learning_rate": 1.131503924141553e-05, + "loss": 1.7975, + "step": 17260 + }, + { + "epoch": 0.12302064457127374, + "grad_norm": 0.27565595507621765, + "learning_rate": 1.1293173226657108e-05, + "loss": 1.8062, + "step": 17270 + }, + { + "epoch": 0.12356258573678595, + "grad_norm": 0.2683922052383423, + "learning_rate": 1.1271487879165137e-05, + "loss": 1.804, + "step": 17280 + }, + { + "epoch": 0.12410452690229817, + "grad_norm": 0.27080050110816956, + "learning_rate": 1.1249983288578525e-05, + "loss": 1.8092, + "step": 17290 + }, + { + "epoch": 0.12464646806781039, + "grad_norm": 0.3873322308063507, + "learning_rate": 1.1228659543789005e-05, + "loss": 1.8062, + "step": 17300 + }, + { + "epoch": 0.1251884092333226, + "grad_norm": 0.3005862534046173, + "learning_rate": 1.1207516732940766e-05, + "loss": 1.8053, + "step": 17310 + }, + { + "epoch": 0.12535099158297627, + "eval_loss": 2.597273826599121, + "eval_runtime": 26.7305, + "eval_samples_per_second": 187.052, + "eval_steps_per_second": 1.01, + "step": 17313 + }, + { + "epoch": 0.12573035039883482, + "grad_norm": 0.27348408102989197, + "learning_rate": 1.118655494343008e-05, + "loss": 1.8239, + "step": 17320 + }, + { + "epoch": 0.12627229156434705, + "grad_norm": 0.3401775360107422, + "learning_rate": 1.1165774261904948e-05, + "loss": 1.7989, + "step": 17330 + }, + { + "epoch": 0.12681423272985926, + "grad_norm": 0.2691876292228699, + "learning_rate": 1.114517477426475e-05, + "loss": 1.8121, + "step": 17340 + }, + { + "epoch": 0.1273561738953715, + "grad_norm": 0.2959340214729309, + "learning_rate": 1.1124756565659861e-05, + "loss": 1.7961, + "step": 17350 + }, + { + "epoch": 0.1278981150608837, + "grad_norm": 0.28132373094558716, + "learning_rate": 1.1104519720491333e-05, + "loss": 1.8025, + "step": 17360 + }, + { + "epoch": 0.12844005622639593, + "grad_norm": 0.322965145111084, + "learning_rate": 1.108446432241052e-05, + "loss": 1.811, + "step": 17370 + }, + { + "epoch": 0.12898199739190813, + "grad_norm": 0.28159090876579285, + "learning_rate": 1.106459045431875e-05, + "loss": 1.806, + "step": 17380 + }, + { + "epoch": 0.12952393855742036, + "grad_norm": 0.25992992520332336, + "learning_rate": 1.1044898198366984e-05, + "loss": 1.8067, + "step": 17390 + }, + { + "epoch": 0.13006587972293257, + "grad_norm": 0.3366539180278778, + "learning_rate": 1.1025387635955451e-05, + "loss": 1.8015, + "step": 17400 + }, + { + "epoch": 0.13006587972293257, + "eval_loss": 2.593247890472412, + "eval_runtime": 21.9708, + "eval_samples_per_second": 227.575, + "eval_steps_per_second": 1.229, + "step": 17400 + }, + { + "epoch": 0.1306078208884448, + "grad_norm": 0.2966313660144806, + "learning_rate": 1.1006058847733338e-05, + "loss": 1.8048, + "step": 17410 + }, + { + "epoch": 0.131149762053957, + "grad_norm": 0.27395039796829224, + "learning_rate": 1.098691191359845e-05, + "loss": 1.8117, + "step": 17420 + }, + { + "epoch": 0.13169170321946924, + "grad_norm": 0.331516295671463, + "learning_rate": 1.096794691269686e-05, + "loss": 1.805, + "step": 17430 + }, + { + "epoch": 0.13223364438498145, + "grad_norm": 0.30107611417770386, + "learning_rate": 1.0949163923422624e-05, + "loss": 1.8032, + "step": 17440 + }, + { + "epoch": 0.13277558555049368, + "grad_norm": 0.2857365310192108, + "learning_rate": 1.0930563023417417e-05, + "loss": 1.8035, + "step": 17450 + }, + { + "epoch": 0.13331752671600589, + "grad_norm": 0.2834301292896271, + "learning_rate": 1.0912144289570228e-05, + "loss": 1.7969, + "step": 17460 + }, + { + "epoch": 0.13385946788151812, + "grad_norm": 0.275591641664505, + "learning_rate": 1.089390779801704e-05, + "loss": 1.7935, + "step": 17470 + }, + { + "epoch": 0.13440140904703032, + "grad_norm": 0.27701419591903687, + "learning_rate": 1.0875853624140517e-05, + "loss": 1.8203, + "step": 17480 + }, + { + "epoch": 0.1347807678628889, + "eval_loss": 2.59383487701416, + "eval_runtime": 26.2522, + "eval_samples_per_second": 190.46, + "eval_steps_per_second": 1.028, + "step": 17487 + }, + { + "epoch": 0.13494335021254256, + "grad_norm": 0.3042493164539337, + "learning_rate": 1.0857981842569686e-05, + "loss": 1.7945, + "step": 17490 + }, + { + "epoch": 0.13548529137805476, + "grad_norm": 0.3667808473110199, + "learning_rate": 1.0840292527179657e-05, + "loss": 1.8075, + "step": 17500 + }, + { + "epoch": 0.136027232543567, + "grad_norm": 0.28393447399139404, + "learning_rate": 1.0822785751091268e-05, + "loss": 1.7968, + "step": 17510 + }, + { + "epoch": 0.1365691737090792, + "grad_norm": 0.2786491811275482, + "learning_rate": 1.0805461586670825e-05, + "loss": 1.8161, + "step": 17520 + }, + { + "epoch": 0.13711111487459143, + "grad_norm": 0.27336838841438293, + "learning_rate": 1.0788320105529776e-05, + "loss": 1.7943, + "step": 17530 + }, + { + "epoch": 0.13765305604010364, + "grad_norm": 0.2629176676273346, + "learning_rate": 1.0771361378524431e-05, + "loss": 1.8007, + "step": 17540 + }, + { + "epoch": 0.13819499720561587, + "grad_norm": 0.26536351442337036, + "learning_rate": 1.0754585475755682e-05, + "loss": 1.7939, + "step": 17550 + }, + { + "epoch": 0.13873693837112808, + "grad_norm": 0.30199551582336426, + "learning_rate": 1.0737992466568667e-05, + "loss": 1.7971, + "step": 17560 + }, + { + "epoch": 0.1392788795366403, + "grad_norm": 0.2917165160179138, + "learning_rate": 1.072158241955252e-05, + "loss": 1.8053, + "step": 17570 + }, + { + "epoch": 0.1394956560028452, + "eval_loss": 2.5910274982452393, + "eval_runtime": 27.7651, + "eval_samples_per_second": 180.082, + "eval_steps_per_second": 0.972, + "step": 17574 + }, + { + "epoch": 0.13982082070215252, + "grad_norm": 0.3207840025424957, + "learning_rate": 1.0705355402540095e-05, + "loss": 1.8075, + "step": 17580 + }, + { + "epoch": 0.14036276186766475, + "grad_norm": 0.2685806155204773, + "learning_rate": 1.0689311482607655e-05, + "loss": 1.8048, + "step": 17590 + }, + { + "epoch": 0.14090470303317695, + "grad_norm": 0.2922206521034241, + "learning_rate": 1.0673450726074611e-05, + "loss": 1.7965, + "step": 17600 + }, + { + "epoch": 0.1414466441986892, + "grad_norm": 0.3959106504917145, + "learning_rate": 1.0657773198503267e-05, + "loss": 1.8087, + "step": 17610 + }, + { + "epoch": 0.1419885853642014, + "grad_norm": 0.33581146597862244, + "learning_rate": 1.0642278964698486e-05, + "loss": 1.8066, + "step": 17620 + }, + { + "epoch": 0.14253052652971362, + "grad_norm": 0.2724241614341736, + "learning_rate": 1.0626968088707508e-05, + "loss": 1.8005, + "step": 17630 + }, + { + "epoch": 0.14307246769522583, + "grad_norm": 0.2701176404953003, + "learning_rate": 1.0611840633819626e-05, + "loss": 1.8039, + "step": 17640 + }, + { + "epoch": 0.14361440886073806, + "grad_norm": 0.2956571877002716, + "learning_rate": 1.0596896662565925e-05, + "loss": 1.7962, + "step": 17650 + }, + { + "epoch": 0.14415635002625027, + "grad_norm": 0.29787176847457886, + "learning_rate": 1.0582136236719073e-05, + "loss": 1.8094, + "step": 17660 + }, + { + "epoch": 0.1442105441428015, + "eval_loss": 2.5948381423950195, + "eval_runtime": 22.6082, + "eval_samples_per_second": 221.158, + "eval_steps_per_second": 1.194, + "step": 17661 + }, + { + "epoch": 0.1446982911917625, + "grad_norm": 0.2762181758880615, + "learning_rate": 1.0567559417292992e-05, + "loss": 1.8069, + "step": 17670 + }, + { + "epoch": 0.1452402323572747, + "grad_norm": 0.2777063548564911, + "learning_rate": 1.0553166264542675e-05, + "loss": 1.8085, + "step": 17680 + }, + { + "epoch": 0.14578217352278694, + "grad_norm": 0.2686801552772522, + "learning_rate": 1.05389568379639e-05, + "loss": 1.7995, + "step": 17690 + }, + { + "epoch": 0.14632411468829915, + "grad_norm": 0.3426153361797333, + "learning_rate": 1.0524931196292987e-05, + "loss": 1.8079, + "step": 17700 + }, + { + "epoch": 0.14686605585381138, + "grad_norm": 0.30656588077545166, + "learning_rate": 1.0511089397506558e-05, + "loss": 1.8001, + "step": 17710 + }, + { + "epoch": 0.14740799701932358, + "grad_norm": 0.26832064986228943, + "learning_rate": 1.0497431498821308e-05, + "loss": 1.8022, + "step": 17720 + }, + { + "epoch": 0.14794993818483582, + "grad_norm": 0.31563130021095276, + "learning_rate": 1.0483957556693748e-05, + "loss": 1.7997, + "step": 17730 + }, + { + "epoch": 0.14849187935034802, + "grad_norm": 0.2889915406703949, + "learning_rate": 1.0470667626820001e-05, + "loss": 1.7999, + "step": 17740 + }, + { + "epoch": 0.1489254322827578, + "eval_loss": 2.596463918685913, + "eval_runtime": 22.7533, + "eval_samples_per_second": 219.749, + "eval_steps_per_second": 1.187, + "step": 17748 + }, + { + "epoch": 0.14903382051586025, + "grad_norm": 0.2632191777229309, + "learning_rate": 1.0457561764135531e-05, + "loss": 1.7958, + "step": 17750 + }, + { + "epoch": 0.14957576168137246, + "grad_norm": 0.3954158425331116, + "learning_rate": 1.0444640022814952e-05, + "loss": 1.8029, + "step": 17760 + }, + { + "epoch": 0.1501177028468847, + "grad_norm": 0.32601505517959595, + "learning_rate": 1.0431902456271798e-05, + "loss": 1.8048, + "step": 17770 + }, + { + "epoch": 0.1506596440123969, + "grad_norm": 0.3632460832595825, + "learning_rate": 1.0419349117158271e-05, + "loss": 1.7984, + "step": 17780 + }, + { + "epoch": 0.15120158517790913, + "grad_norm": 0.2885991036891937, + "learning_rate": 1.040698005736507e-05, + "loss": 1.7987, + "step": 17790 + }, + { + "epoch": 0.15174352634342134, + "grad_norm": 0.3226932883262634, + "learning_rate": 1.0394795328021151e-05, + "loss": 1.8114, + "step": 17800 + }, + { + "epoch": 0.15228546750893357, + "grad_norm": 0.2789970934391022, + "learning_rate": 1.038279497949351e-05, + "loss": 1.7995, + "step": 17810 + }, + { + "epoch": 0.15282740867444577, + "grad_norm": 0.2944728136062622, + "learning_rate": 1.0370979061386985e-05, + "loss": 1.8049, + "step": 17820 + }, + { + "epoch": 0.153369349839958, + "grad_norm": 0.28693804144859314, + "learning_rate": 1.0359347622544055e-05, + "loss": 1.79, + "step": 17830 + }, + { + "epoch": 0.1536403204227141, + "eval_loss": 2.597463369369507, + "eval_runtime": 23.0476, + "eval_samples_per_second": 216.942, + "eval_steps_per_second": 1.171, + "step": 17835 + }, + { + "epoch": 0.1539112910054702, + "grad_norm": 0.2763684391975403, + "learning_rate": 1.0347900711044624e-05, + "loss": 1.81, + "step": 17840 + }, + { + "epoch": 0.15445323217098245, + "grad_norm": 0.2787637412548065, + "learning_rate": 1.0336638374205845e-05, + "loss": 1.8125, + "step": 17850 + }, + { + "epoch": 0.15499517333649465, + "grad_norm": 0.2788551449775696, + "learning_rate": 1.0325560658581893e-05, + "loss": 1.816, + "step": 17860 + }, + { + "epoch": 0.15553711450200688, + "grad_norm": 0.2845081686973572, + "learning_rate": 1.0314667609963802e-05, + "loss": 1.7967, + "step": 17870 + }, + { + "epoch": 0.1560790556675191, + "grad_norm": 0.37821102142333984, + "learning_rate": 1.0303959273379258e-05, + "loss": 1.7942, + "step": 17880 + }, + { + "epoch": 0.15662099683303132, + "grad_norm": 0.2836483418941498, + "learning_rate": 1.0293435693092411e-05, + "loss": 1.814, + "step": 17890 + }, + { + "epoch": 0.15716293799854353, + "grad_norm": 0.2846856415271759, + "learning_rate": 1.0283096912603723e-05, + "loss": 1.8124, + "step": 17900 + }, + { + "epoch": 0.15770487916405576, + "grad_norm": 0.2603934705257416, + "learning_rate": 1.027294297464974e-05, + "loss": 1.8014, + "step": 17910 + }, + { + "epoch": 0.15824682032956797, + "grad_norm": 0.34290361404418945, + "learning_rate": 1.0262973921202948e-05, + "loss": 1.7978, + "step": 17920 + }, + { + "epoch": 0.1583552085626704, + "eval_loss": 2.5988216400146484, + "eval_runtime": 25.8064, + "eval_samples_per_second": 193.75, + "eval_steps_per_second": 1.046, + "step": 17922 + }, + { + "epoch": 0.1587887614950802, + "grad_norm": 0.28911086916923523, + "learning_rate": 1.0253189793471598e-05, + "loss": 1.8057, + "step": 17930 + }, + { + "epoch": 0.1593307026605924, + "grad_norm": 0.297972708940506, + "learning_rate": 1.0243590631899517e-05, + "loss": 1.8037, + "step": 17940 + }, + { + "epoch": 0.15987264382610464, + "grad_norm": 0.27581244707107544, + "learning_rate": 1.023417647616596e-05, + "loss": 1.8009, + "step": 17950 + }, + { + "epoch": 0.16041458499161684, + "grad_norm": 0.2763090133666992, + "learning_rate": 1.0224947365185444e-05, + "loss": 1.8058, + "step": 17960 + }, + { + "epoch": 0.16095652615712908, + "grad_norm": 0.285412073135376, + "learning_rate": 1.0215903337107571e-05, + "loss": 1.8179, + "step": 17970 + }, + { + "epoch": 0.16149846732264128, + "grad_norm": 0.31604015827178955, + "learning_rate": 1.0207044429316883e-05, + "loss": 1.8074, + "step": 17980 + }, + { + "epoch": 0.16204040848815351, + "grad_norm": 0.2986457645893097, + "learning_rate": 1.0198370678432713e-05, + "loss": 1.7948, + "step": 17990 + }, + { + "epoch": 0.16258234965366572, + "grad_norm": 0.28448864817619324, + "learning_rate": 1.0189882120309019e-05, + "loss": 1.794, + "step": 18000 + }, + { + "epoch": 0.16307009670262673, + "eval_loss": 2.5937376022338867, + "eval_runtime": 24.5121, + "eval_samples_per_second": 203.981, + "eval_steps_per_second": 1.101, + "step": 18009 + }, + { + "epoch": 0.16312429081917795, + "grad_norm": 0.3744131922721863, + "learning_rate": 1.0181578790034248e-05, + "loss": 1.7963, + "step": 18010 + }, + { + "epoch": 0.16366623198469016, + "grad_norm": 0.35869526863098145, + "learning_rate": 1.0173460721931186e-05, + "loss": 1.7989, + "step": 18020 + }, + { + "epoch": 0.1642081731502024, + "grad_norm": 0.35815170407295227, + "learning_rate": 1.0165527949556814e-05, + "loss": 1.8044, + "step": 18030 + }, + { + "epoch": 0.1647501143157146, + "grad_norm": 0.3258884847164154, + "learning_rate": 1.015778050570217e-05, + "loss": 1.7979, + "step": 18040 + }, + { + "epoch": 0.16529205548122683, + "grad_norm": 0.2814440429210663, + "learning_rate": 1.0150218422392213e-05, + "loss": 1.7945, + "step": 18050 + }, + { + "epoch": 0.16583399664673903, + "grad_norm": 0.2600628733634949, + "learning_rate": 1.0142841730885705e-05, + "loss": 1.8, + "step": 18060 + }, + { + "epoch": 0.16637593781225127, + "grad_norm": 0.2741418182849884, + "learning_rate": 1.0135650461675054e-05, + "loss": 1.7987, + "step": 18070 + }, + { + "epoch": 0.16691787897776347, + "grad_norm": 0.286240816116333, + "learning_rate": 1.0128644644486213e-05, + "loss": 1.8017, + "step": 18080 + }, + { + "epoch": 0.1674598201432757, + "grad_norm": 0.2839854061603546, + "learning_rate": 1.0121824308278545e-05, + "loss": 1.793, + "step": 18090 + }, + { + "epoch": 0.16778498484258303, + "eval_loss": 2.5955307483673096, + "eval_runtime": 23.2564, + "eval_samples_per_second": 214.994, + "eval_steps_per_second": 1.161, + "step": 18096 + }, + { + "epoch": 0.1680017613087879, + "grad_norm": 0.2708841860294342, + "learning_rate": 1.0115189481244706e-05, + "loss": 1.7934, + "step": 18100 + }, + { + "epoch": 0.16854370247430014, + "grad_norm": 0.32778841257095337, + "learning_rate": 1.0108740190810519e-05, + "loss": 1.8111, + "step": 18110 + }, + { + "epoch": 0.16908564363981235, + "grad_norm": 0.28490176796913147, + "learning_rate": 1.0102476463634891e-05, + "loss": 1.8004, + "step": 18120 + }, + { + "epoch": 0.16962758480532458, + "grad_norm": 0.28157317638397217, + "learning_rate": 1.0096398325609658e-05, + "loss": 1.8033, + "step": 18130 + }, + { + "epoch": 0.1701695259708368, + "grad_norm": 0.29462599754333496, + "learning_rate": 1.0090505801859521e-05, + "loss": 1.8047, + "step": 18140 + }, + { + "epoch": 0.17071146713634902, + "grad_norm": 0.3192684054374695, + "learning_rate": 1.0084798916741909e-05, + "loss": 1.8049, + "step": 18150 + }, + { + "epoch": 0.17125340830186123, + "grad_norm": 0.2903258204460144, + "learning_rate": 1.0079277693846895e-05, + "loss": 1.7911, + "step": 18160 + }, + { + "epoch": 0.17179534946737346, + "grad_norm": 0.26754647493362427, + "learning_rate": 1.0073942155997108e-05, + "loss": 1.7938, + "step": 18170 + }, + { + "epoch": 0.17233729063288566, + "grad_norm": 0.26055288314819336, + "learning_rate": 1.0068792325247608e-05, + "loss": 1.8061, + "step": 18180 + }, + { + "epoch": 0.17249987298253933, + "eval_loss": 2.5973074436187744, + "eval_runtime": 25.1042, + "eval_samples_per_second": 199.169, + "eval_steps_per_second": 1.076, + "step": 18183 + }, + { + "epoch": 0.1728792317983979, + "grad_norm": 0.3081805109977722, + "learning_rate": 1.0063828222885822e-05, + "loss": 1.81, + "step": 18190 + }, + { + "epoch": 0.1734211729639101, + "grad_norm": 0.30028587579727173, + "learning_rate": 1.0059049869431447e-05, + "loss": 1.7991, + "step": 18200 + }, + { + "epoch": 0.17396311412942234, + "grad_norm": 0.29933491349220276, + "learning_rate": 1.0054457284636363e-05, + "loss": 1.7965, + "step": 18210 + }, + { + "epoch": 0.17450505529493454, + "grad_norm": 0.2657584547996521, + "learning_rate": 1.0050050487484561e-05, + "loss": 1.7991, + "step": 18220 + }, + { + "epoch": 0.17504699646044677, + "grad_norm": 0.28480470180511475, + "learning_rate": 1.004582949619204e-05, + "loss": 1.7998, + "step": 18230 + }, + { + "epoch": 0.17558893762595898, + "grad_norm": 0.3122563660144806, + "learning_rate": 1.004179432820677e-05, + "loss": 1.8084, + "step": 18240 + }, + { + "epoch": 0.1761308787914712, + "grad_norm": 0.2768636643886566, + "learning_rate": 1.0037945000208584e-05, + "loss": 1.8065, + "step": 18250 + }, + { + "epoch": 0.17667281995698342, + "grad_norm": 0.2690199017524719, + "learning_rate": 1.0034281528109125e-05, + "loss": 1.8072, + "step": 18260 + }, + { + "epoch": 0.17721476112249565, + "grad_norm": 0.27862584590911865, + "learning_rate": 1.0030803927051793e-05, + "loss": 1.7843, + "step": 18270 + }, + { + "epoch": 0.17721476112249565, + "eval_loss": 2.597975730895996, + "eval_runtime": 21.9736, + "eval_samples_per_second": 227.545, + "eval_steps_per_second": 1.229, + "step": 18270 + }, + { + "epoch": 0.17775670228800786, + "grad_norm": 0.3340103328227997, + "learning_rate": 1.0027512211411644e-05, + "loss": 1.7988, + "step": 18280 + }, + { + "epoch": 0.1782986434535201, + "grad_norm": 0.33370035886764526, + "learning_rate": 1.0024406394795373e-05, + "loss": 1.8053, + "step": 18290 + }, + { + "epoch": 0.1788405846190323, + "grad_norm": 0.2686094343662262, + "learning_rate": 1.0021486490041246e-05, + "loss": 1.7932, + "step": 18300 + }, + { + "epoch": 0.17938252578454453, + "grad_norm": 0.27461278438568115, + "learning_rate": 1.0018752509219017e-05, + "loss": 1.8024, + "step": 18310 + }, + { + "epoch": 0.17992446695005673, + "grad_norm": 0.26994603872299194, + "learning_rate": 1.0016204463629918e-05, + "loss": 1.794, + "step": 18320 + }, + { + "epoch": 0.18046640811556897, + "grad_norm": 0.28709709644317627, + "learning_rate": 1.0013842363806594e-05, + "loss": 1.7986, + "step": 18330 + }, + { + "epoch": 0.18100834928108117, + "grad_norm": 0.3366316556930542, + "learning_rate": 1.0011666219513065e-05, + "loss": 1.794, + "step": 18340 + }, + { + "epoch": 0.1815502904465934, + "grad_norm": 0.3002452552318573, + "learning_rate": 1.0009676039744664e-05, + "loss": 1.8091, + "step": 18350 + }, + { + "epoch": 0.18192964926245195, + "eval_loss": 2.598811149597168, + "eval_runtime": 26.2348, + "eval_samples_per_second": 190.586, + "eval_steps_per_second": 1.029, + "step": 18357 + }, + { + "epoch": 0.1820922316121056, + "grad_norm": 0.276909738779068, + "learning_rate": 1.0007871832728043e-05, + "loss": 1.7919, + "step": 18360 + }, + { + "epoch": 0.18263417277761784, + "grad_norm": 0.2699563503265381, + "learning_rate": 1.0006253605921098e-05, + "loss": 1.7975, + "step": 18370 + }, + { + "epoch": 0.18317611394313005, + "grad_norm": 0.30202600359916687, + "learning_rate": 1.0004821366012962e-05, + "loss": 1.7966, + "step": 18380 + }, + { + "epoch": 0.18371805510864228, + "grad_norm": 0.2897915244102478, + "learning_rate": 1.0003575118923961e-05, + "loss": 1.8013, + "step": 18390 + }, + { + "epoch": 0.18425999627415449, + "grad_norm": 0.2939542233943939, + "learning_rate": 1.0002514869805599e-05, + "loss": 1.8084, + "step": 18400 + }, + { + "epoch": 0.18480193743966672, + "grad_norm": 0.2901172637939453, + "learning_rate": 1.0001640623040548e-05, + "loss": 1.8072, + "step": 18410 + }, + { + "epoch": 0.18534387860517892, + "grad_norm": 0.4139680564403534, + "learning_rate": 1.0000952382242605e-05, + "loss": 1.798, + "step": 18420 + }, + { + "epoch": 0.18588581977069116, + "grad_norm": 0.2750343978404999, + "learning_rate": 1.000045015025669e-05, + "loss": 1.7949, + "step": 18430 + }, + { + "epoch": 0.18642776093620336, + "grad_norm": 0.28022781014442444, + "learning_rate": 1.0000133929158849e-05, + "loss": 1.8095, + "step": 18440 + }, + { + "epoch": 0.18664453740240824, + "eval_loss": 2.5955660343170166, + "eval_runtime": 23.3631, + "eval_samples_per_second": 214.012, + "eval_steps_per_second": 1.156, + "step": 18444 + }, + { + "epoch": 0.1869697021017156, + "grad_norm": 0.26084405183792114, + "learning_rate": 1.000000372025621e-05, + "loss": 1.815, + "step": 18450 + }, + { + "epoch": 1.0004335529324098, + "grad_norm": 7.159980297088623, + "learning_rate": 9.350101516641391e-05, + "loss": 1.9598, + "step": 18460 + }, + { + "epoch": 1.000975494097922, + "grad_norm": 6.029938697814941, + "learning_rate": 9.349281376788221e-05, + "loss": 2.0867, + "step": 18470 + }, + { + "epoch": 1.0015174352634342, + "grad_norm": 4.406125068664551, + "learning_rate": 9.348460760102786e-05, + "loss": 2.0069, + "step": 18480 + }, + { + "epoch": 1.0020593764289465, + "grad_norm": 4.879397392272949, + "learning_rate": 9.347639666686739e-05, + "loss": 2.0173, + "step": 18490 + }, + { + "epoch": 1.0026013175944586, + "grad_norm": 3.420335292816162, + "learning_rate": 9.346818096641794e-05, + "loss": 1.9867, + "step": 18500 + }, + { + "epoch": 1.003143258759971, + "grad_norm": 1.2602205276489258, + "learning_rate": 9.345996050069725e-05, + "loss": 1.9279, + "step": 18510 + }, + { + "epoch": 1.003685199925483, + "grad_norm": 3.904205560684204, + "learning_rate": 9.345173527072363e-05, + "loss": 1.9165, + "step": 18520 + }, + { + "epoch": 1.0042271410909953, + "grad_norm": 1.9543260335922241, + "learning_rate": 9.344350527751599e-05, + "loss": 1.9421, + "step": 18530 + }, + { + "epoch": 1.0042813352075466, + "eval_loss": 2.611534595489502, + "eval_runtime": 59.2714, + "eval_samples_per_second": 84.358, + "eval_steps_per_second": 0.456, + "step": 18531 + }, + { + "epoch": 1.0047690822565076, + "grad_norm": 1.9394625425338745, + "learning_rate": 9.343527052209384e-05, + "loss": 1.8943, + "step": 18540 + }, + { + "epoch": 1.0053110234220197, + "grad_norm": 1.3751671314239502, + "learning_rate": 9.342703100547722e-05, + "loss": 1.8667, + "step": 18550 + }, + { + "epoch": 1.005852964587532, + "grad_norm": 1.4693511724472046, + "learning_rate": 9.341878672868683e-05, + "loss": 1.8609, + "step": 18560 + }, + { + "epoch": 1.0063949057530441, + "grad_norm": 1.3482840061187744, + "learning_rate": 9.341053769274396e-05, + "loss": 1.8552, + "step": 18570 + }, + { + "epoch": 1.0069368469185564, + "grad_norm": 1.061438798904419, + "learning_rate": 9.340228389867041e-05, + "loss": 1.8599, + "step": 18580 + }, + { + "epoch": 1.0074787880840685, + "grad_norm": 0.778508722782135, + "learning_rate": 9.339402534748868e-05, + "loss": 1.8528, + "step": 18590 + }, + { + "epoch": 1.0080207292495809, + "grad_norm": 0.43868595361709595, + "learning_rate": 9.338576204022178e-05, + "loss": 1.8457, + "step": 18600 + }, + { + "epoch": 1.008562670415093, + "grad_norm": 0.531323254108429, + "learning_rate": 9.337749397789332e-05, + "loss": 1.8489, + "step": 18610 + }, + { + "epoch": 1.008996223347503, + "eval_loss": 2.614398717880249, + "eval_runtime": 21.9914, + "eval_samples_per_second": 227.361, + "eval_steps_per_second": 1.228, + "step": 18618 + }, + { + "epoch": 1.0091046115806053, + "grad_norm": 0.6396978497505188, + "learning_rate": 9.336922116152755e-05, + "loss": 1.844, + "step": 18620 + }, + { + "epoch": 1.0096465527461176, + "grad_norm": 0.44661280512809753, + "learning_rate": 9.336094359214923e-05, + "loss": 1.8426, + "step": 18630 + }, + { + "epoch": 1.0101884939116297, + "grad_norm": 0.55703204870224, + "learning_rate": 9.335266127078376e-05, + "loss": 1.8513, + "step": 18640 + }, + { + "epoch": 1.010730435077142, + "grad_norm": 0.8314570784568787, + "learning_rate": 9.334437419845713e-05, + "loss": 1.8503, + "step": 18650 + }, + { + "epoch": 1.011272376242654, + "grad_norm": 0.5193946957588196, + "learning_rate": 9.333608237619589e-05, + "loss": 1.8619, + "step": 18660 + }, + { + "epoch": 1.0118143174081664, + "grad_norm": 0.3355676829814911, + "learning_rate": 9.33277858050272e-05, + "loss": 1.8414, + "step": 18670 + }, + { + "epoch": 1.0123562585736785, + "grad_norm": 0.4459117650985718, + "learning_rate": 9.331948448597882e-05, + "loss": 1.8392, + "step": 18680 + }, + { + "epoch": 1.0128981997391908, + "grad_norm": 0.7514176368713379, + "learning_rate": 9.331117842007906e-05, + "loss": 1.842, + "step": 18690 + }, + { + "epoch": 1.0134401409047031, + "grad_norm": 0.4279676377773285, + "learning_rate": 9.330286760835686e-05, + "loss": 1.8378, + "step": 18700 + }, + { + "epoch": 1.013711111487459, + "eval_loss": 2.6156516075134277, + "eval_runtime": 21.9911, + "eval_samples_per_second": 227.364, + "eval_steps_per_second": 1.228, + "step": 18705 + }, + { + "epoch": 1.0139820820702152, + "grad_norm": 0.43919897079467773, + "learning_rate": 9.32945520518417e-05, + "loss": 1.8378, + "step": 18710 + }, + { + "epoch": 1.0145240232357275, + "grad_norm": 0.4081036150455475, + "learning_rate": 9.32862317515637e-05, + "loss": 1.8381, + "step": 18720 + }, + { + "epoch": 1.0150659644012396, + "grad_norm": 1.0370370149612427, + "learning_rate": 9.327790670855352e-05, + "loss": 1.8482, + "step": 18730 + }, + { + "epoch": 1.015607905566752, + "grad_norm": 0.717692494392395, + "learning_rate": 9.326957692384247e-05, + "loss": 1.8335, + "step": 18740 + }, + { + "epoch": 1.016149846732264, + "grad_norm": 0.5903939604759216, + "learning_rate": 9.326124239846237e-05, + "loss": 1.8393, + "step": 18750 + }, + { + "epoch": 1.0166917878977764, + "grad_norm": 0.5651223063468933, + "learning_rate": 9.325290313344565e-05, + "loss": 1.8397, + "step": 18760 + }, + { + "epoch": 1.0172337290632885, + "grad_norm": 0.7575017809867859, + "learning_rate": 9.32445591298254e-05, + "loss": 1.8482, + "step": 18770 + }, + { + "epoch": 1.0177756702288008, + "grad_norm": 1.311630129814148, + "learning_rate": 9.32362103886352e-05, + "loss": 1.844, + "step": 18780 + }, + { + "epoch": 1.018317611394313, + "grad_norm": 1.2291172742843628, + "learning_rate": 9.322785691090925e-05, + "loss": 1.8475, + "step": 18790 + }, + { + "epoch": 1.0184259996274154, + "eval_loss": 2.618657350540161, + "eval_runtime": 21.9861, + "eval_samples_per_second": 227.417, + "eval_steps_per_second": 1.228, + "step": 18792 + }, + { + "epoch": 1.0188595525598252, + "grad_norm": 0.6886726021766663, + "learning_rate": 9.321949869768236e-05, + "loss": 1.8358, + "step": 18800 + }, + { + "epoch": 1.0194014937253375, + "grad_norm": 0.8392327427864075, + "learning_rate": 9.32111357499899e-05, + "loss": 1.8462, + "step": 18810 + }, + { + "epoch": 1.0199434348908496, + "grad_norm": 0.35495704412460327, + "learning_rate": 9.320276806886784e-05, + "loss": 1.834, + "step": 18820 + }, + { + "epoch": 1.020485376056362, + "grad_norm": 0.45716392993927, + "learning_rate": 9.319439565535274e-05, + "loss": 1.8272, + "step": 18830 + }, + { + "epoch": 1.021027317221874, + "grad_norm": 0.31467896699905396, + "learning_rate": 9.318601851048172e-05, + "loss": 1.8398, + "step": 18840 + }, + { + "epoch": 1.0215692583873863, + "grad_norm": 0.7633432149887085, + "learning_rate": 9.31776366352925e-05, + "loss": 1.839, + "step": 18850 + }, + { + "epoch": 1.0221111995528986, + "grad_norm": 0.40369927883148193, + "learning_rate": 9.316925003082344e-05, + "loss": 1.8429, + "step": 18860 + }, + { + "epoch": 1.0226531407184107, + "grad_norm": 0.7707934975624084, + "learning_rate": 9.316085869811338e-05, + "loss": 1.8314, + "step": 18870 + }, + { + "epoch": 1.0231408877673718, + "eval_loss": 2.6104660034179688, + "eval_runtime": 21.9934, + "eval_samples_per_second": 227.341, + "eval_steps_per_second": 1.228, + "step": 18879 + }, + { + "epoch": 1.023195081883923, + "grad_norm": 0.791067361831665, + "learning_rate": 9.315246263820181e-05, + "loss": 1.8436, + "step": 18880 + }, + { + "epoch": 1.0237370230494351, + "grad_norm": 0.3280937671661377, + "learning_rate": 9.314406185212883e-05, + "loss": 1.8302, + "step": 18890 + }, + { + "epoch": 1.0242789642149475, + "grad_norm": 0.4388424754142761, + "learning_rate": 9.313565634093507e-05, + "loss": 1.8393, + "step": 18900 + }, + { + "epoch": 1.0248209053804596, + "grad_norm": 0.4343729019165039, + "learning_rate": 9.312724610566177e-05, + "loss": 1.8377, + "step": 18910 + }, + { + "epoch": 1.0253628465459719, + "grad_norm": 0.7564290761947632, + "learning_rate": 9.311883114735077e-05, + "loss": 1.8383, + "step": 18920 + }, + { + "epoch": 1.025904787711484, + "grad_norm": 0.3451434075832367, + "learning_rate": 9.311041146704443e-05, + "loss": 1.8312, + "step": 18930 + }, + { + "epoch": 1.0264467288769963, + "grad_norm": 0.44154074788093567, + "learning_rate": 9.31019870657858e-05, + "loss": 1.832, + "step": 18940 + }, + { + "epoch": 1.0269886700425086, + "grad_norm": 0.5464621782302856, + "learning_rate": 9.309355794461843e-05, + "loss": 1.832, + "step": 18950 + }, + { + "epoch": 1.0275306112080207, + "grad_norm": 0.737061619758606, + "learning_rate": 9.308512410458648e-05, + "loss": 1.8379, + "step": 18960 + }, + { + "epoch": 1.0278557759073281, + "eval_loss": 2.598432779312134, + "eval_runtime": 21.9905, + "eval_samples_per_second": 227.371, + "eval_steps_per_second": 1.228, + "step": 18966 + }, + { + "epoch": 1.028072552373533, + "grad_norm": 0.5460038185119629, + "learning_rate": 9.307668554673471e-05, + "loss": 1.8363, + "step": 18970 + }, + { + "epoch": 1.028614493539045, + "grad_norm": 0.365409791469574, + "learning_rate": 9.306824227210845e-05, + "loss": 1.8447, + "step": 18980 + }, + { + "epoch": 1.0291564347045574, + "grad_norm": 0.37528592348098755, + "learning_rate": 9.305979428175361e-05, + "loss": 1.8298, + "step": 18990 + }, + { + "epoch": 1.0296983758700695, + "grad_norm": 0.37354302406311035, + "learning_rate": 9.30513415767167e-05, + "loss": 1.8439, + "step": 19000 + }, + { + "epoch": 1.0302403170355818, + "grad_norm": 0.4735754430294037, + "learning_rate": 9.304288415804479e-05, + "loss": 1.8267, + "step": 19010 + }, + { + "epoch": 1.0307822582010941, + "grad_norm": 0.30185240507125854, + "learning_rate": 9.303442202678555e-05, + "loss": 1.8362, + "step": 19020 + }, + { + "epoch": 1.0313241993666062, + "grad_norm": 0.3827216625213623, + "learning_rate": 9.302595518398723e-05, + "loss": 1.8311, + "step": 19030 + }, + { + "epoch": 1.0318661405321186, + "grad_norm": 0.3216041624546051, + "learning_rate": 9.301748363069872e-05, + "loss": 1.8422, + "step": 19040 + }, + { + "epoch": 1.0324080816976307, + "grad_norm": 0.34851735830307007, + "learning_rate": 9.300900736796934e-05, + "loss": 1.8369, + "step": 19050 + }, + { + "epoch": 1.0325706640472845, + "eval_loss": 2.6064109802246094, + "eval_runtime": 21.9894, + "eval_samples_per_second": 227.383, + "eval_steps_per_second": 1.228, + "step": 19053 + }, + { + "epoch": 1.032950022863143, + "grad_norm": 0.6215195059776306, + "learning_rate": 9.300052639684918e-05, + "loss": 1.834, + "step": 19060 + }, + { + "epoch": 1.033491964028655, + "grad_norm": 0.6960977911949158, + "learning_rate": 9.299204071838878e-05, + "loss": 1.8224, + "step": 19070 + }, + { + "epoch": 1.0340339051941674, + "grad_norm": 0.680069088935852, + "learning_rate": 9.298355033363931e-05, + "loss": 1.8277, + "step": 19080 + }, + { + "epoch": 1.0345758463596795, + "grad_norm": 0.8072498440742493, + "learning_rate": 9.297505524365254e-05, + "loss": 1.8475, + "step": 19090 + }, + { + "epoch": 1.0351177875251918, + "grad_norm": 0.547581672668457, + "learning_rate": 9.296655544948081e-05, + "loss": 1.8269, + "step": 19100 + }, + { + "epoch": 1.035659728690704, + "grad_norm": 0.33486223220825195, + "learning_rate": 9.295805095217701e-05, + "loss": 1.8352, + "step": 19110 + }, + { + "epoch": 1.0362016698562162, + "grad_norm": 0.5130099654197693, + "learning_rate": 9.294954175279466e-05, + "loss": 1.8332, + "step": 19120 + }, + { + "epoch": 1.0367436110217285, + "grad_norm": 0.5673821568489075, + "learning_rate": 9.294102785238785e-05, + "loss": 1.8172, + "step": 19130 + }, + { + "epoch": 1.0372855521872406, + "grad_norm": 0.5935150980949402, + "learning_rate": 9.293250925201122e-05, + "loss": 1.83, + "step": 19140 + }, + { + "epoch": 1.0372855521872406, + "eval_loss": 2.619783878326416, + "eval_runtime": 21.976, + "eval_samples_per_second": 227.521, + "eval_steps_per_second": 1.229, + "step": 19140 + }, + { + "epoch": 1.037827493352753, + "grad_norm": 0.8336159586906433, + "learning_rate": 9.292398595272003e-05, + "loss": 1.8289, + "step": 19150 + }, + { + "epoch": 1.038369434518265, + "grad_norm": 0.7801002264022827, + "learning_rate": 9.291545795557011e-05, + "loss": 1.8325, + "step": 19160 + }, + { + "epoch": 1.0389113756837773, + "grad_norm": 0.3641831576824188, + "learning_rate": 9.290692526161787e-05, + "loss": 1.8289, + "step": 19170 + }, + { + "epoch": 1.0394533168492897, + "grad_norm": 0.34061098098754883, + "learning_rate": 9.289838787192032e-05, + "loss": 1.8234, + "step": 19180 + }, + { + "epoch": 1.0399952580148017, + "grad_norm": 0.31556975841522217, + "learning_rate": 9.288984578753502e-05, + "loss": 1.8252, + "step": 19190 + }, + { + "epoch": 1.040537199180314, + "grad_norm": 0.3090665638446808, + "learning_rate": 9.28812990095201e-05, + "loss": 1.8194, + "step": 19200 + }, + { + "epoch": 1.0410791403458262, + "grad_norm": 0.29080694913864136, + "learning_rate": 9.287274753893435e-05, + "loss": 1.8342, + "step": 19210 + }, + { + "epoch": 1.0416210815113385, + "grad_norm": 0.5005635619163513, + "learning_rate": 9.286419137683704e-05, + "loss": 1.8401, + "step": 19220 + }, + { + "epoch": 1.042000440327197, + "eval_loss": 2.6065304279327393, + "eval_runtime": 21.9863, + "eval_samples_per_second": 227.414, + "eval_steps_per_second": 1.228, + "step": 19227 + }, + { + "epoch": 1.0421630226768506, + "grad_norm": 0.6064873933792114, + "learning_rate": 9.28556305242881e-05, + "loss": 1.822, + "step": 19230 + }, + { + "epoch": 1.0427049638423629, + "grad_norm": 0.30239003896713257, + "learning_rate": 9.284706498234798e-05, + "loss": 1.8317, + "step": 19240 + }, + { + "epoch": 1.043246905007875, + "grad_norm": 0.45904672145843506, + "learning_rate": 9.28384947520778e-05, + "loss": 1.8207, + "step": 19250 + }, + { + "epoch": 1.0437888461733873, + "grad_norm": 0.4452885389328003, + "learning_rate": 9.282991983453917e-05, + "loss": 1.832, + "step": 19260 + }, + { + "epoch": 1.0443307873388996, + "grad_norm": 0.5682956576347351, + "learning_rate": 9.28213402307943e-05, + "loss": 1.8331, + "step": 19270 + }, + { + "epoch": 1.0448727285044117, + "grad_norm": 0.886755108833313, + "learning_rate": 9.281275594190599e-05, + "loss": 1.8377, + "step": 19280 + }, + { + "epoch": 1.045414669669924, + "grad_norm": 0.9180141687393188, + "learning_rate": 9.280416696893766e-05, + "loss": 1.8277, + "step": 19290 + }, + { + "epoch": 1.0459566108354361, + "grad_norm": 0.3015201687812805, + "learning_rate": 9.279557331295325e-05, + "loss": 1.8243, + "step": 19300 + }, + { + "epoch": 1.0464985520009484, + "grad_norm": 0.36977553367614746, + "learning_rate": 9.278697497501731e-05, + "loss": 1.8161, + "step": 19310 + }, + { + "epoch": 1.0467153284671533, + "eval_loss": 2.6029727458953857, + "eval_runtime": 21.9909, + "eval_samples_per_second": 227.367, + "eval_steps_per_second": 1.228, + "step": 19314 + }, + { + "epoch": 1.0470404931664605, + "grad_norm": 0.7295284867286682, + "learning_rate": 9.277837195619496e-05, + "loss": 1.8169, + "step": 19320 + }, + { + "epoch": 1.0475824343319728, + "grad_norm": 0.3548785448074341, + "learning_rate": 9.276976425755192e-05, + "loss": 1.8258, + "step": 19330 + }, + { + "epoch": 1.0481243754974852, + "grad_norm": 0.581323504447937, + "learning_rate": 9.276115188015445e-05, + "loss": 1.8386, + "step": 19340 + }, + { + "epoch": 1.0486663166629973, + "grad_norm": 0.33570852875709534, + "learning_rate": 9.275253482506945e-05, + "loss": 1.8224, + "step": 19350 + }, + { + "epoch": 1.0492082578285096, + "grad_norm": 0.3555627167224884, + "learning_rate": 9.274391309336432e-05, + "loss": 1.8235, + "step": 19360 + }, + { + "epoch": 1.0497501989940217, + "grad_norm": 0.31758037209510803, + "learning_rate": 9.27352866861071e-05, + "loss": 1.8283, + "step": 19370 + }, + { + "epoch": 1.050292140159534, + "grad_norm": 0.3359634280204773, + "learning_rate": 9.272665560436642e-05, + "loss": 1.8207, + "step": 19380 + }, + { + "epoch": 1.050834081325046, + "grad_norm": 0.2850840389728546, + "learning_rate": 9.271801984921142e-05, + "loss": 1.8291, + "step": 19390 + }, + { + "epoch": 1.0513760224905584, + "grad_norm": 0.29905441403388977, + "learning_rate": 9.270937942171189e-05, + "loss": 1.8188, + "step": 19400 + }, + { + "epoch": 1.0514302166071097, + "eval_loss": 2.616773843765259, + "eval_runtime": 22.0168, + "eval_samples_per_second": 227.1, + "eval_steps_per_second": 1.226, + "step": 19401 + }, + { + "epoch": 1.0519179636560705, + "grad_norm": 0.3919777274131775, + "learning_rate": 9.270073432293814e-05, + "loss": 1.8129, + "step": 19410 + }, + { + "epoch": 1.0524599048215828, + "grad_norm": 0.37216314673423767, + "learning_rate": 9.26920845539611e-05, + "loss": 1.8217, + "step": 19420 + }, + { + "epoch": 1.0530018459870951, + "grad_norm": 0.3143322765827179, + "learning_rate": 9.268343011585227e-05, + "loss": 1.8239, + "step": 19430 + }, + { + "epoch": 1.0535437871526072, + "grad_norm": 0.6800436973571777, + "learning_rate": 9.267477100968372e-05, + "loss": 1.8202, + "step": 19440 + }, + { + "epoch": 1.0540857283181195, + "grad_norm": 0.35689041018486023, + "learning_rate": 9.266610723652813e-05, + "loss": 1.82, + "step": 19450 + }, + { + "epoch": 1.0546276694836316, + "grad_norm": 0.29921287298202515, + "learning_rate": 9.265743879745867e-05, + "loss": 1.8124, + "step": 19460 + }, + { + "epoch": 1.055169610649144, + "grad_norm": 0.47158485651016235, + "learning_rate": 9.264876569354921e-05, + "loss": 1.8158, + "step": 19470 + }, + { + "epoch": 1.055711551814656, + "grad_norm": 0.32789716124534607, + "learning_rate": 9.26400879258741e-05, + "loss": 1.8313, + "step": 19480 + }, + { + "epoch": 1.0561451047470658, + "eval_loss": 2.605008840560913, + "eval_runtime": 21.9889, + "eval_samples_per_second": 227.388, + "eval_steps_per_second": 1.228, + "step": 19488 + }, + { + "epoch": 1.0562534929801684, + "grad_norm": 0.3259328007698059, + "learning_rate": 9.263140549550832e-05, + "loss": 1.8178, + "step": 19490 + }, + { + "epoch": 1.0567954341456804, + "grad_norm": 0.27712419629096985, + "learning_rate": 9.26227184035274e-05, + "loss": 1.8237, + "step": 19500 + }, + { + "epoch": 1.0573373753111928, + "grad_norm": 0.7220262289047241, + "learning_rate": 9.261402665100746e-05, + "loss": 1.8208, + "step": 19510 + }, + { + "epoch": 1.057879316476705, + "grad_norm": 0.4974667429924011, + "learning_rate": 9.260533023902522e-05, + "loss": 1.8214, + "step": 19520 + }, + { + "epoch": 1.0584212576422172, + "grad_norm": 0.4250205457210541, + "learning_rate": 9.259662916865792e-05, + "loss": 1.8159, + "step": 19530 + }, + { + "epoch": 1.0589631988077295, + "grad_norm": 0.4620699882507324, + "learning_rate": 9.258792344098344e-05, + "loss": 1.806, + "step": 19540 + }, + { + "epoch": 1.0595051399732416, + "grad_norm": 0.36918604373931885, + "learning_rate": 9.257921305708018e-05, + "loss": 1.8078, + "step": 19550 + }, + { + "epoch": 1.060047081138754, + "grad_norm": 0.3109738528728485, + "learning_rate": 9.257049801802716e-05, + "loss": 1.8172, + "step": 19560 + }, + { + "epoch": 1.060589022304266, + "grad_norm": 0.5203297138214111, + "learning_rate": 9.256177832490398e-05, + "loss": 1.816, + "step": 19570 + }, + { + "epoch": 1.0608599928870222, + "eval_loss": 2.603193759918213, + "eval_runtime": 21.9951, + "eval_samples_per_second": 227.323, + "eval_steps_per_second": 1.228, + "step": 19575 + }, + { + "epoch": 1.0611309634697783, + "grad_norm": 0.29931849241256714, + "learning_rate": 9.255305397879076e-05, + "loss": 1.8155, + "step": 19580 + }, + { + "epoch": 1.0616729046352906, + "grad_norm": 0.3699570298194885, + "learning_rate": 9.254432498076826e-05, + "loss": 1.8154, + "step": 19590 + }, + { + "epoch": 1.0622148458008027, + "grad_norm": 0.2735179364681244, + "learning_rate": 9.253559133191779e-05, + "loss": 1.8209, + "step": 19600 + }, + { + "epoch": 1.062756786966315, + "grad_norm": 0.31726008653640747, + "learning_rate": 9.252685303332123e-05, + "loss": 1.8278, + "step": 19610 + }, + { + "epoch": 1.0632987281318271, + "grad_norm": 0.6221383213996887, + "learning_rate": 9.251811008606102e-05, + "loss": 1.825, + "step": 19620 + }, + { + "epoch": 1.0638406692973394, + "grad_norm": 0.8196062445640564, + "learning_rate": 9.250936249122023e-05, + "loss": 1.8196, + "step": 19630 + }, + { + "epoch": 1.0643826104628515, + "grad_norm": 0.2812905013561249, + "learning_rate": 9.250061024988246e-05, + "loss": 1.8123, + "step": 19640 + }, + { + "epoch": 1.0649245516283639, + "grad_norm": 0.4467778205871582, + "learning_rate": 9.249185336313191e-05, + "loss": 1.8098, + "step": 19650 + }, + { + "epoch": 1.0654664927938762, + "grad_norm": 0.36650586128234863, + "learning_rate": 9.248309183205334e-05, + "loss": 1.8245, + "step": 19660 + }, + { + "epoch": 1.0655748810269785, + "eval_loss": 2.5961737632751465, + "eval_runtime": 21.9861, + "eval_samples_per_second": 227.416, + "eval_steps_per_second": 1.228, + "step": 19662 + }, + { + "epoch": 1.0660084339593883, + "grad_norm": 0.351523220539093, + "learning_rate": 9.247432565773209e-05, + "loss": 1.8098, + "step": 19670 + }, + { + "epoch": 1.0665503751249006, + "grad_norm": 0.4025458097457886, + "learning_rate": 9.246555484125407e-05, + "loss": 1.8208, + "step": 19680 + }, + { + "epoch": 1.0670923162904127, + "grad_norm": 0.5277726650238037, + "learning_rate": 9.245677938370578e-05, + "loss": 1.8235, + "step": 19690 + }, + { + "epoch": 1.067634257455925, + "grad_norm": 0.3863029181957245, + "learning_rate": 9.244799928617427e-05, + "loss": 1.8074, + "step": 19700 + }, + { + "epoch": 1.068176198621437, + "grad_norm": 0.4278218746185303, + "learning_rate": 9.24392145497472e-05, + "loss": 1.8179, + "step": 19710 + }, + { + "epoch": 1.0687181397869494, + "grad_norm": 0.30992093682289124, + "learning_rate": 9.243042517551277e-05, + "loss": 1.8083, + "step": 19720 + }, + { + "epoch": 1.0692600809524615, + "grad_norm": 0.92547607421875, + "learning_rate": 9.242163116455979e-05, + "loss": 1.8194, + "step": 19730 + }, + { + "epoch": 1.0698020221179738, + "grad_norm": 0.297558069229126, + "learning_rate": 9.241283251797759e-05, + "loss": 1.8245, + "step": 19740 + }, + { + "epoch": 1.0702897691669349, + "eval_loss": 2.59582781791687, + "eval_runtime": 21.9957, + "eval_samples_per_second": 227.317, + "eval_steps_per_second": 1.228, + "step": 19749 + }, + { + "epoch": 1.0703439632834861, + "grad_norm": 0.34201860427856445, + "learning_rate": 9.240402923685613e-05, + "loss": 1.8202, + "step": 19750 + }, + { + "epoch": 1.0708859044489982, + "grad_norm": 0.5197858810424805, + "learning_rate": 9.239522132228594e-05, + "loss": 1.8104, + "step": 19760 + }, + { + "epoch": 1.0714278456145105, + "grad_norm": 0.5215007662773132, + "learning_rate": 9.238640877535809e-05, + "loss": 1.8128, + "step": 19770 + }, + { + "epoch": 1.0719697867800226, + "grad_norm": 0.3624344766139984, + "learning_rate": 9.237759159716421e-05, + "loss": 1.8203, + "step": 19780 + }, + { + "epoch": 1.072511727945535, + "grad_norm": 0.3879830241203308, + "learning_rate": 9.236876978879657e-05, + "loss": 1.8119, + "step": 19790 + }, + { + "epoch": 1.073053669111047, + "grad_norm": 0.3035085201263428, + "learning_rate": 9.235994335134798e-05, + "loss": 1.8118, + "step": 19800 + }, + { + "epoch": 1.0735956102765594, + "grad_norm": 1.0123307704925537, + "learning_rate": 9.23511122859118e-05, + "loss": 1.8119, + "step": 19810 + }, + { + "epoch": 1.0741375514420715, + "grad_norm": 0.46915149688720703, + "learning_rate": 9.234227659358197e-05, + "loss": 1.8163, + "step": 19820 + }, + { + "epoch": 1.0746794926075838, + "grad_norm": 0.48051413893699646, + "learning_rate": 9.233343627545307e-05, + "loss": 1.8178, + "step": 19830 + }, + { + "epoch": 1.0750046573068912, + "eval_loss": 2.6102840900421143, + "eval_runtime": 21.9902, + "eval_samples_per_second": 227.374, + "eval_steps_per_second": 1.228, + "step": 19836 + }, + { + "epoch": 1.075221433773096, + "grad_norm": 0.7908685207366943, + "learning_rate": 9.232459133262016e-05, + "loss": 1.8174, + "step": 19840 + }, + { + "epoch": 1.0757633749386082, + "grad_norm": 0.4490140974521637, + "learning_rate": 9.231574176617893e-05, + "loss": 1.821, + "step": 19850 + }, + { + "epoch": 1.0763053161041205, + "grad_norm": 0.6310442686080933, + "learning_rate": 9.230688757722562e-05, + "loss": 1.8002, + "step": 19860 + }, + { + "epoch": 1.0768472572696326, + "grad_norm": 0.47709372639656067, + "learning_rate": 9.229802876685702e-05, + "loss": 1.817, + "step": 19870 + }, + { + "epoch": 1.077389198435145, + "grad_norm": 0.3235110640525818, + "learning_rate": 9.228916533617057e-05, + "loss": 1.8022, + "step": 19880 + }, + { + "epoch": 1.077931139600657, + "grad_norm": 0.2803106904029846, + "learning_rate": 9.228029728626421e-05, + "loss": 1.8075, + "step": 19890 + }, + { + "epoch": 1.0784730807661693, + "grad_norm": 0.290698766708374, + "learning_rate": 9.227142461823648e-05, + "loss": 1.8064, + "step": 19900 + }, + { + "epoch": 1.0790150219316816, + "grad_norm": 0.7563320398330688, + "learning_rate": 9.226254733318648e-05, + "loss": 1.8079, + "step": 19910 + }, + { + "epoch": 1.0795569630971937, + "grad_norm": 0.4314441382884979, + "learning_rate": 9.22536654322139e-05, + "loss": 1.7955, + "step": 19920 + }, + { + "epoch": 1.0797195454468473, + "eval_loss": 2.596864938735962, + "eval_runtime": 21.9923, + "eval_samples_per_second": 227.352, + "eval_steps_per_second": 1.228, + "step": 19923 + }, + { + "epoch": 1.080098904262706, + "grad_norm": 0.30512261390686035, + "learning_rate": 9.224477891641897e-05, + "loss": 1.8072, + "step": 19930 + }, + { + "epoch": 1.0806408454282181, + "grad_norm": 0.5094988346099854, + "learning_rate": 9.223588778690255e-05, + "loss": 1.8176, + "step": 19940 + }, + { + "epoch": 1.0811827865937305, + "grad_norm": 0.47923043370246887, + "learning_rate": 9.222699204476599e-05, + "loss": 1.8203, + "step": 19950 + }, + { + "epoch": 1.0817247277592426, + "grad_norm": 0.3491983711719513, + "learning_rate": 9.221809169111129e-05, + "loss": 1.8097, + "step": 19960 + }, + { + "epoch": 1.0822666689247549, + "grad_norm": 0.5780982971191406, + "learning_rate": 9.220918672704099e-05, + "loss": 1.8065, + "step": 19970 + }, + { + "epoch": 1.0828086100902672, + "grad_norm": 0.37517765164375305, + "learning_rate": 9.220027715365817e-05, + "loss": 1.8102, + "step": 19980 + }, + { + "epoch": 1.0833505512557793, + "grad_norm": 0.582490086555481, + "learning_rate": 9.219136297206652e-05, + "loss": 1.8215, + "step": 19990 + }, + { + "epoch": 1.0838924924212916, + "grad_norm": 0.5589390397071838, + "learning_rate": 9.218244418337028e-05, + "loss": 1.8206, + "step": 20000 + }, + { + "epoch": 1.0844344335868037, + "grad_norm": 0.5232866406440735, + "learning_rate": 9.217352078867431e-05, + "loss": 1.8069, + "step": 20010 + }, + { + "epoch": 1.0844344335868037, + "eval_loss": 2.6093404293060303, + "eval_runtime": 21.9876, + "eval_samples_per_second": 227.401, + "eval_steps_per_second": 1.228, + "step": 20010 + }, + { + "epoch": 1.084976374752316, + "grad_norm": 0.3265039026737213, + "learning_rate": 9.216459278908398e-05, + "loss": 1.7983, + "step": 20020 + }, + { + "epoch": 1.085518315917828, + "grad_norm": 0.3922523558139801, + "learning_rate": 9.215566018570523e-05, + "loss": 1.7976, + "step": 20030 + }, + { + "epoch": 1.0860602570833404, + "grad_norm": 0.35444027185440063, + "learning_rate": 9.214672297964461e-05, + "loss": 1.7977, + "step": 20040 + }, + { + "epoch": 1.0866021982488525, + "grad_norm": 0.5408755540847778, + "learning_rate": 9.213778117200926e-05, + "loss": 1.8022, + "step": 20050 + }, + { + "epoch": 1.0871441394143648, + "grad_norm": 0.5063872337341309, + "learning_rate": 9.212883476390677e-05, + "loss": 1.8088, + "step": 20060 + }, + { + "epoch": 1.0876860805798771, + "grad_norm": 0.39630988240242004, + "learning_rate": 9.211988375644543e-05, + "loss": 1.7955, + "step": 20070 + }, + { + "epoch": 1.0882280217453892, + "grad_norm": 0.3347756564617157, + "learning_rate": 9.211092815073407e-05, + "loss": 1.7925, + "step": 20080 + }, + { + "epoch": 1.0887699629109016, + "grad_norm": 0.3801024854183197, + "learning_rate": 9.210196794788203e-05, + "loss": 1.8096, + "step": 20090 + }, + { + "epoch": 1.08914932172676, + "eval_loss": 2.6098368167877197, + "eval_runtime": 21.9921, + "eval_samples_per_second": 227.354, + "eval_steps_per_second": 1.228, + "step": 20097 + }, + { + "epoch": 1.0893119040764137, + "grad_norm": 0.4123273491859436, + "learning_rate": 9.20930031489993e-05, + "loss": 1.7988, + "step": 20100 + }, + { + "epoch": 1.089853845241926, + "grad_norm": 0.6030639410018921, + "learning_rate": 9.208403375519637e-05, + "loss": 1.7927, + "step": 20110 + }, + { + "epoch": 1.090395786407438, + "grad_norm": 0.3275871276855469, + "learning_rate": 9.207505976758434e-05, + "loss": 1.7937, + "step": 20120 + }, + { + "epoch": 1.0909377275729504, + "grad_norm": 0.4877399206161499, + "learning_rate": 9.206608118727488e-05, + "loss": 1.811, + "step": 20130 + }, + { + "epoch": 1.0914796687384625, + "grad_norm": 0.2985506057739258, + "learning_rate": 9.20570980153802e-05, + "loss": 1.812, + "step": 20140 + }, + { + "epoch": 1.0920216099039748, + "grad_norm": 0.29198411107063293, + "learning_rate": 9.204811025301311e-05, + "loss": 1.803, + "step": 20150 + }, + { + "epoch": 1.092563551069487, + "grad_norm": 0.3116692006587982, + "learning_rate": 9.203911790128696e-05, + "loss": 1.8113, + "step": 20160 + }, + { + "epoch": 1.0931054922349992, + "grad_norm": 0.3624420464038849, + "learning_rate": 9.20301209613157e-05, + "loss": 1.794, + "step": 20170 + }, + { + "epoch": 1.0936474334005115, + "grad_norm": 0.5469610095024109, + "learning_rate": 9.202111943421381e-05, + "loss": 1.8038, + "step": 20180 + }, + { + "epoch": 1.0938642098667164, + "eval_loss": 2.607426404953003, + "eval_runtime": 21.9928, + "eval_samples_per_second": 227.347, + "eval_steps_per_second": 1.228, + "step": 20184 + }, + { + "epoch": 1.0941893745660236, + "grad_norm": 0.5207812786102295, + "learning_rate": 9.201211332109639e-05, + "loss": 1.7874, + "step": 20190 + }, + { + "epoch": 1.094731315731536, + "grad_norm": 0.3748219311237335, + "learning_rate": 9.200310262307905e-05, + "loss": 1.7988, + "step": 20200 + }, + { + "epoch": 1.095273256897048, + "grad_norm": 0.34070441126823425, + "learning_rate": 9.199408734127801e-05, + "loss": 1.8057, + "step": 20210 + }, + { + "epoch": 1.0958151980625603, + "grad_norm": 0.32609522342681885, + "learning_rate": 9.198506747681005e-05, + "loss": 1.8109, + "step": 20220 + }, + { + "epoch": 1.0963571392280727, + "grad_norm": 0.4092683792114258, + "learning_rate": 9.19760430307925e-05, + "loss": 1.7954, + "step": 20230 + }, + { + "epoch": 1.0968990803935847, + "grad_norm": 0.5064677000045776, + "learning_rate": 9.196701400434327e-05, + "loss": 1.8074, + "step": 20240 + }, + { + "epoch": 1.097441021559097, + "grad_norm": 0.3224587142467499, + "learning_rate": 9.195798039858084e-05, + "loss": 1.7973, + "step": 20250 + }, + { + "epoch": 1.0979829627246092, + "grad_norm": 0.5034688711166382, + "learning_rate": 9.194894221462427e-05, + "loss": 1.7994, + "step": 20260 + }, + { + "epoch": 1.0985249038901215, + "grad_norm": 0.3772341310977936, + "learning_rate": 9.193989945359314e-05, + "loss": 1.7817, + "step": 20270 + }, + { + "epoch": 1.0985790980066727, + "eval_loss": 2.6105637550354004, + "eval_runtime": 21.9937, + "eval_samples_per_second": 227.338, + "eval_steps_per_second": 1.228, + "step": 20271 + }, + { + "epoch": 1.0990668450556336, + "grad_norm": 0.8809638619422913, + "learning_rate": 9.193085211660764e-05, + "loss": 1.8056, + "step": 20280 + }, + { + "epoch": 1.0996087862211459, + "grad_norm": 0.6711058020591736, + "learning_rate": 9.192180020478852e-05, + "loss": 1.7951, + "step": 20290 + }, + { + "epoch": 1.1001507273866582, + "grad_norm": 0.8903740048408508, + "learning_rate": 9.19127437192571e-05, + "loss": 1.8076, + "step": 20300 + }, + { + "epoch": 1.1006926685521703, + "grad_norm": 0.8000921010971069, + "learning_rate": 9.190368266113524e-05, + "loss": 1.811, + "step": 20310 + }, + { + "epoch": 1.1012346097176826, + "grad_norm": 0.4838610589504242, + "learning_rate": 9.189461703154538e-05, + "loss": 1.809, + "step": 20320 + }, + { + "epoch": 1.1017765508831947, + "grad_norm": 0.5514103174209595, + "learning_rate": 9.188554683161056e-05, + "loss": 1.7987, + "step": 20330 + }, + { + "epoch": 1.102318492048707, + "grad_norm": 0.2846459448337555, + "learning_rate": 9.187647206245434e-05, + "loss": 1.8004, + "step": 20340 + }, + { + "epoch": 1.1028604332142191, + "grad_norm": 0.45695552229881287, + "learning_rate": 9.186739272520085e-05, + "loss": 1.7999, + "step": 20350 + }, + { + "epoch": 1.1032939861466289, + "eval_loss": 2.5855958461761475, + "eval_runtime": 21.9945, + "eval_samples_per_second": 227.33, + "eval_steps_per_second": 1.228, + "step": 20358 + }, + { + "epoch": 1.1034023743797314, + "grad_norm": 0.3189384937286377, + "learning_rate": 9.185830882097482e-05, + "loss": 1.8023, + "step": 20360 + }, + { + "epoch": 1.1039443155452435, + "grad_norm": 0.5680913925170898, + "learning_rate": 9.184922035090151e-05, + "loss": 1.8008, + "step": 20370 + }, + { + "epoch": 1.1044862567107558, + "grad_norm": 0.31937944889068604, + "learning_rate": 9.184012731610676e-05, + "loss": 1.7983, + "step": 20380 + }, + { + "epoch": 1.1050281978762682, + "grad_norm": 0.9807746410369873, + "learning_rate": 9.1831029717717e-05, + "loss": 1.7981, + "step": 20390 + }, + { + "epoch": 1.1055701390417803, + "grad_norm": 1.0870219469070435, + "learning_rate": 9.182192755685917e-05, + "loss": 1.7879, + "step": 20400 + }, + { + "epoch": 1.1061120802072926, + "grad_norm": 0.45273005962371826, + "learning_rate": 9.181282083466082e-05, + "loss": 1.7992, + "step": 20410 + }, + { + "epoch": 1.1066540213728047, + "grad_norm": 0.7691323757171631, + "learning_rate": 9.180370955225006e-05, + "loss": 1.7918, + "step": 20420 + }, + { + "epoch": 1.107195962538317, + "grad_norm": 0.5909658670425415, + "learning_rate": 9.179459371075554e-05, + "loss": 1.802, + "step": 20430 + }, + { + "epoch": 1.107737903703829, + "grad_norm": 0.3207308053970337, + "learning_rate": 9.178547331130649e-05, + "loss": 1.7934, + "step": 20440 + }, + { + "epoch": 1.1080088742865852, + "eval_loss": 2.5942413806915283, + "eval_runtime": 21.9938, + "eval_samples_per_second": 227.337, + "eval_steps_per_second": 1.228, + "step": 20445 + }, + { + "epoch": 1.1082798448693414, + "grad_norm": 0.3061155378818512, + "learning_rate": 9.177634835503272e-05, + "loss": 1.7894, + "step": 20450 + }, + { + "epoch": 1.1088217860348535, + "grad_norm": 0.35638898611068726, + "learning_rate": 9.176721884306459e-05, + "loss": 1.7893, + "step": 20460 + }, + { + "epoch": 1.1093637272003658, + "grad_norm": 0.351608008146286, + "learning_rate": 9.1758084776533e-05, + "loss": 1.7951, + "step": 20470 + }, + { + "epoch": 1.1099056683658781, + "grad_norm": 0.3291257321834564, + "learning_rate": 9.174894615656948e-05, + "loss": 1.7911, + "step": 20480 + }, + { + "epoch": 1.1104476095313902, + "grad_norm": 0.2714458703994751, + "learning_rate": 9.173980298430604e-05, + "loss": 1.795, + "step": 20490 + }, + { + "epoch": 1.1109895506969025, + "grad_norm": 0.3953942358493805, + "learning_rate": 9.173065526087531e-05, + "loss": 1.7912, + "step": 20500 + }, + { + "epoch": 1.1115314918624146, + "grad_norm": 0.3956896662712097, + "learning_rate": 9.17215029874105e-05, + "loss": 1.7911, + "step": 20510 + }, + { + "epoch": 1.112073433027927, + "grad_norm": 0.3793238401412964, + "learning_rate": 9.17123461650453e-05, + "loss": 1.7972, + "step": 20520 + }, + { + "epoch": 1.112615374193439, + "grad_norm": 0.4785431921482086, + "learning_rate": 9.170318479491406e-05, + "loss": 1.7961, + "step": 20530 + }, + { + "epoch": 1.1127237624265416, + "eval_loss": 2.584057331085205, + "eval_runtime": 21.9901, + "eval_samples_per_second": 227.375, + "eval_steps_per_second": 1.228, + "step": 20532 + }, + { + "epoch": 1.1131573153589513, + "grad_norm": 0.4006010890007019, + "learning_rate": 9.169401887815164e-05, + "loss": 1.7861, + "step": 20540 + }, + { + "epoch": 1.1136992565244637, + "grad_norm": 0.2974483072757721, + "learning_rate": 9.168484841589346e-05, + "loss": 1.7885, + "step": 20550 + }, + { + "epoch": 1.1142411976899758, + "grad_norm": 0.7173396348953247, + "learning_rate": 9.167567340927552e-05, + "loss": 1.8022, + "step": 20560 + }, + { + "epoch": 1.114783138855488, + "grad_norm": 0.46824324131011963, + "learning_rate": 9.166649385943441e-05, + "loss": 1.7895, + "step": 20570 + }, + { + "epoch": 1.1153250800210002, + "grad_norm": 0.6654873490333557, + "learning_rate": 9.165730976750722e-05, + "loss": 1.7898, + "step": 20580 + }, + { + "epoch": 1.1158670211865125, + "grad_norm": 0.27924737334251404, + "learning_rate": 9.164812113463165e-05, + "loss": 1.783, + "step": 20590 + }, + { + "epoch": 1.1164089623520246, + "grad_norm": 0.5301341414451599, + "learning_rate": 9.163892796194594e-05, + "loss": 1.7854, + "step": 20600 + }, + { + "epoch": 1.116950903517537, + "grad_norm": 0.357898086309433, + "learning_rate": 9.162973025058891e-05, + "loss": 1.7834, + "step": 20610 + }, + { + "epoch": 1.117438650566498, + "eval_loss": 2.5919032096862793, + "eval_runtime": 21.993, + "eval_samples_per_second": 227.345, + "eval_steps_per_second": 1.228, + "step": 20619 + }, + { + "epoch": 1.1174928446830492, + "grad_norm": 0.3866696059703827, + "learning_rate": 9.162052800169992e-05, + "loss": 1.7898, + "step": 20620 + }, + { + "epoch": 1.1180347858485613, + "grad_norm": 0.30722272396087646, + "learning_rate": 9.161132121641892e-05, + "loss": 1.7837, + "step": 20630 + }, + { + "epoch": 1.1185767270140736, + "grad_norm": 0.36760374903678894, + "learning_rate": 9.160210989588639e-05, + "loss": 1.7846, + "step": 20640 + }, + { + "epoch": 1.1191186681795857, + "grad_norm": 0.48524531722068787, + "learning_rate": 9.15928940412434e-05, + "loss": 1.7822, + "step": 20650 + }, + { + "epoch": 1.119660609345098, + "grad_norm": 0.41821709275245667, + "learning_rate": 9.158367365363157e-05, + "loss": 1.7902, + "step": 20660 + }, + { + "epoch": 1.1202025505106101, + "grad_norm": 0.5472697615623474, + "learning_rate": 9.157444873419307e-05, + "loss": 1.7975, + "step": 20670 + }, + { + "epoch": 1.1207444916761224, + "grad_norm": 0.3065122663974762, + "learning_rate": 9.156521928407066e-05, + "loss": 1.7851, + "step": 20680 + }, + { + "epoch": 1.1212864328416345, + "grad_norm": 0.5732535719871521, + "learning_rate": 9.155598530440763e-05, + "loss": 1.7849, + "step": 20690 + }, + { + "epoch": 1.1218283740071469, + "grad_norm": 0.3151445686817169, + "learning_rate": 9.154674679634786e-05, + "loss": 1.7931, + "step": 20700 + }, + { + "epoch": 1.122153538706454, + "eval_loss": 2.5944461822509766, + "eval_runtime": 21.9928, + "eval_samples_per_second": 227.348, + "eval_steps_per_second": 1.228, + "step": 20706 + }, + { + "epoch": 1.1223703151726592, + "grad_norm": 0.3001176714897156, + "learning_rate": 9.153750376103575e-05, + "loss": 1.7924, + "step": 20710 + }, + { + "epoch": 1.1229122563381713, + "grad_norm": 0.2985573709011078, + "learning_rate": 9.152825619961633e-05, + "loss": 1.7838, + "step": 20720 + }, + { + "epoch": 1.1234541975036836, + "grad_norm": 0.3202672600746155, + "learning_rate": 9.151900411323509e-05, + "loss": 1.7899, + "step": 20730 + }, + { + "epoch": 1.1239961386691957, + "grad_norm": 0.3624779284000397, + "learning_rate": 9.150974750303818e-05, + "loss": 1.779, + "step": 20740 + }, + { + "epoch": 1.124538079834708, + "grad_norm": 0.47234293818473816, + "learning_rate": 9.150048637017226e-05, + "loss": 1.7977, + "step": 20750 + }, + { + "epoch": 1.12508002100022, + "grad_norm": 0.4014720618724823, + "learning_rate": 9.149122071578457e-05, + "loss": 1.7887, + "step": 20760 + }, + { + "epoch": 1.1256219621657324, + "grad_norm": 0.8474875688552856, + "learning_rate": 9.148195054102289e-05, + "loss": 1.7889, + "step": 20770 + }, + { + "epoch": 1.1261639033312445, + "grad_norm": 0.3223975598812103, + "learning_rate": 9.147267584703554e-05, + "loss": 1.7931, + "step": 20780 + }, + { + "epoch": 1.1267058444967568, + "grad_norm": 0.5686293840408325, + "learning_rate": 9.146339663497148e-05, + "loss": 1.7829, + "step": 20790 + }, + { + "epoch": 1.1268684268464104, + "eval_loss": 2.596271276473999, + "eval_runtime": 21.9897, + "eval_samples_per_second": 227.38, + "eval_steps_per_second": 1.228, + "step": 20793 + }, + { + "epoch": 1.1272477856622691, + "grad_norm": 0.4797728359699249, + "learning_rate": 9.145411290598013e-05, + "loss": 1.7919, + "step": 20800 + }, + { + "epoch": 1.1277897268277812, + "grad_norm": 0.33035770058631897, + "learning_rate": 9.144482466121157e-05, + "loss": 1.7981, + "step": 20810 + }, + { + "epoch": 1.1283316679932935, + "grad_norm": 0.6441662311553955, + "learning_rate": 9.143553190181633e-05, + "loss": 1.7875, + "step": 20820 + }, + { + "epoch": 1.1288736091588056, + "grad_norm": 0.5361613035202026, + "learning_rate": 9.14262346289456e-05, + "loss": 1.7806, + "step": 20830 + }, + { + "epoch": 1.129415550324318, + "grad_norm": 0.32189682126045227, + "learning_rate": 9.141693284375106e-05, + "loss": 1.7967, + "step": 20840 + }, + { + "epoch": 1.12995749148983, + "grad_norm": 0.28092676401138306, + "learning_rate": 9.140762654738499e-05, + "loss": 1.7902, + "step": 20850 + }, + { + "epoch": 1.1304994326553424, + "grad_norm": 0.29651352763175964, + "learning_rate": 9.139831574100022e-05, + "loss": 1.7905, + "step": 20860 + }, + { + "epoch": 1.1310413738208545, + "grad_norm": 0.3914618492126465, + "learning_rate": 9.138900042575012e-05, + "loss": 1.7863, + "step": 20870 + }, + { + "epoch": 1.1315833149863668, + "grad_norm": 0.33878007531166077, + "learning_rate": 9.137968060278863e-05, + "loss": 1.7872, + "step": 20880 + }, + { + "epoch": 1.1315833149863668, + "eval_loss": 2.5968291759490967, + "eval_runtime": 21.9881, + "eval_samples_per_second": 227.396, + "eval_steps_per_second": 1.228, + "step": 20880 + }, + { + "epoch": 1.132125256151879, + "grad_norm": 0.584201455116272, + "learning_rate": 9.137035627327026e-05, + "loss": 1.7768, + "step": 20890 + }, + { + "epoch": 1.1326671973173912, + "grad_norm": 0.3373468816280365, + "learning_rate": 9.136102743835004e-05, + "loss": 1.7857, + "step": 20900 + }, + { + "epoch": 1.1332091384829035, + "grad_norm": 0.2604641020298004, + "learning_rate": 9.13516940991836e-05, + "loss": 1.7847, + "step": 20910 + }, + { + "epoch": 1.1337510796484156, + "grad_norm": 0.28360849618911743, + "learning_rate": 9.134235625692714e-05, + "loss": 1.7903, + "step": 20920 + }, + { + "epoch": 1.134293020813928, + "grad_norm": 0.4277685284614563, + "learning_rate": 9.133301391273736e-05, + "loss": 1.7813, + "step": 20930 + }, + { + "epoch": 1.1348349619794402, + "grad_norm": 0.6790282726287842, + "learning_rate": 9.132366706777155e-05, + "loss": 1.792, + "step": 20940 + }, + { + "epoch": 1.1353769031449523, + "grad_norm": 0.4232507050037384, + "learning_rate": 9.13143157231876e-05, + "loss": 1.7832, + "step": 20950 + }, + { + "epoch": 1.1359188443104646, + "grad_norm": 0.5046867728233337, + "learning_rate": 9.130495988014384e-05, + "loss": 1.7758, + "step": 20960 + }, + { + "epoch": 1.1362982031263231, + "eval_loss": 2.590829372406006, + "eval_runtime": 21.9886, + "eval_samples_per_second": 227.391, + "eval_steps_per_second": 1.228, + "step": 20967 + }, + { + "epoch": 1.1364607854759767, + "grad_norm": 0.3382951021194458, + "learning_rate": 9.129559953979928e-05, + "loss": 1.7858, + "step": 20970 + }, + { + "epoch": 1.137002726641489, + "grad_norm": 0.33747395873069763, + "learning_rate": 9.128623470331343e-05, + "loss": 1.7828, + "step": 20980 + }, + { + "epoch": 1.1375446678070011, + "grad_norm": 0.2978461682796478, + "learning_rate": 9.127686537184636e-05, + "loss": 1.7839, + "step": 20990 + }, + { + "epoch": 1.1380866089725135, + "grad_norm": 0.5556627511978149, + "learning_rate": 9.126749154655872e-05, + "loss": 1.7826, + "step": 21000 + }, + { + "epoch": 1.1386285501380256, + "grad_norm": 0.3020962178707123, + "learning_rate": 9.125811322861168e-05, + "loss": 1.7779, + "step": 21010 + }, + { + "epoch": 1.1391704913035379, + "grad_norm": 0.5155685544013977, + "learning_rate": 9.1248730419167e-05, + "loss": 1.7912, + "step": 21020 + }, + { + "epoch": 1.1397124324690502, + "grad_norm": 1.0742664337158203, + "learning_rate": 9.123934311938696e-05, + "loss": 1.7943, + "step": 21030 + }, + { + "epoch": 1.1402543736345623, + "grad_norm": 0.6324855089187622, + "learning_rate": 9.122995133043442e-05, + "loss": 1.7778, + "step": 21040 + }, + { + "epoch": 1.1407963148000746, + "grad_norm": 0.33419859409332275, + "learning_rate": 9.122055505347283e-05, + "loss": 1.7839, + "step": 21050 + }, + { + "epoch": 1.1410130912662795, + "eval_loss": 2.5992467403411865, + "eval_runtime": 21.9929, + "eval_samples_per_second": 227.346, + "eval_steps_per_second": 1.228, + "step": 21054 + }, + { + "epoch": 1.1413382559655867, + "grad_norm": 0.30402234196662903, + "learning_rate": 9.12111542896661e-05, + "loss": 1.7818, + "step": 21060 + }, + { + "epoch": 1.141880197131099, + "grad_norm": 0.28443703055381775, + "learning_rate": 9.120174904017882e-05, + "loss": 1.7698, + "step": 21070 + }, + { + "epoch": 1.142422138296611, + "grad_norm": 0.29675841331481934, + "learning_rate": 9.119233930617603e-05, + "loss": 1.7764, + "step": 21080 + }, + { + "epoch": 1.1429640794621234, + "grad_norm": 0.6041000485420227, + "learning_rate": 9.118292508882338e-05, + "loss": 1.7697, + "step": 21090 + }, + { + "epoch": 1.1435060206276355, + "grad_norm": 0.3296681046485901, + "learning_rate": 9.117350638928706e-05, + "loss": 1.7884, + "step": 21100 + }, + { + "epoch": 1.1440479617931478, + "grad_norm": 0.5075874924659729, + "learning_rate": 9.11640832087338e-05, + "loss": 1.7874, + "step": 21110 + }, + { + "epoch": 1.1445899029586601, + "grad_norm": 0.5272268056869507, + "learning_rate": 9.115465554833095e-05, + "loss": 1.7755, + "step": 21120 + }, + { + "epoch": 1.1451318441241722, + "grad_norm": 0.6501821875572205, + "learning_rate": 9.114522340924631e-05, + "loss": 1.7839, + "step": 21130 + }, + { + "epoch": 1.1456737852896846, + "grad_norm": 0.2923777103424072, + "learning_rate": 9.113578679264835e-05, + "loss": 1.7769, + "step": 21140 + }, + { + "epoch": 1.1457279794062356, + "eval_loss": 2.5842106342315674, + "eval_runtime": 22.0002, + "eval_samples_per_second": 227.27, + "eval_steps_per_second": 1.227, + "step": 21141 + }, + { + "epoch": 1.1462157264551966, + "grad_norm": 0.3021109402179718, + "learning_rate": 9.1126345699706e-05, + "loss": 1.7834, + "step": 21150 + }, + { + "epoch": 1.146757667620709, + "grad_norm": 0.30015039443969727, + "learning_rate": 9.111690013158877e-05, + "loss": 1.7723, + "step": 21160 + }, + { + "epoch": 1.147299608786221, + "grad_norm": 0.7381042838096619, + "learning_rate": 9.110745008946678e-05, + "loss": 1.7824, + "step": 21170 + }, + { + "epoch": 1.1478415499517334, + "grad_norm": 0.39448803663253784, + "learning_rate": 9.109799557451062e-05, + "loss": 1.7873, + "step": 21180 + }, + { + "epoch": 1.1483834911172455, + "grad_norm": 0.2745918035507202, + "learning_rate": 9.108853658789149e-05, + "loss": 1.7832, + "step": 21190 + }, + { + "epoch": 1.1489254322827578, + "grad_norm": 0.3459675908088684, + "learning_rate": 9.107907313078115e-05, + "loss": 1.7695, + "step": 21200 + }, + { + "epoch": 1.14946737344827, + "grad_norm": 0.30421921610832214, + "learning_rate": 9.106960520435183e-05, + "loss": 1.7735, + "step": 21210 + }, + { + "epoch": 1.1500093146137822, + "grad_norm": 0.2987232804298401, + "learning_rate": 9.106013280977645e-05, + "loss": 1.7777, + "step": 21220 + }, + { + "epoch": 1.150442867546192, + "eval_loss": 2.5747172832489014, + "eval_runtime": 21.9936, + "eval_samples_per_second": 227.339, + "eval_steps_per_second": 1.228, + "step": 21228 + }, + { + "epoch": 1.1505512557792945, + "grad_norm": 0.29645800590515137, + "learning_rate": 9.105065594822838e-05, + "loss": 1.7794, + "step": 21230 + }, + { + "epoch": 1.1510931969448066, + "grad_norm": 0.3672481179237366, + "learning_rate": 9.104117462088154e-05, + "loss": 1.7751, + "step": 21240 + }, + { + "epoch": 1.151635138110319, + "grad_norm": 0.27135440707206726, + "learning_rate": 9.103168882891047e-05, + "loss": 1.7841, + "step": 21250 + }, + { + "epoch": 1.1521770792758312, + "grad_norm": 0.5033676624298096, + "learning_rate": 9.102219857349024e-05, + "loss": 1.776, + "step": 21260 + }, + { + "epoch": 1.1527190204413433, + "grad_norm": 0.4527705907821655, + "learning_rate": 9.101270385579643e-05, + "loss": 1.776, + "step": 21270 + }, + { + "epoch": 1.1532609616068557, + "grad_norm": 0.4339083433151245, + "learning_rate": 9.100320467700521e-05, + "loss": 1.7749, + "step": 21280 + }, + { + "epoch": 1.1538029027723677, + "grad_norm": 0.28354117274284363, + "learning_rate": 9.099370103829332e-05, + "loss": 1.7802, + "step": 21290 + }, + { + "epoch": 1.15434484393788, + "grad_norm": 0.2814236283302307, + "learning_rate": 9.098419294083801e-05, + "loss": 1.7651, + "step": 21300 + }, + { + "epoch": 1.1548867851033922, + "grad_norm": 0.32581958174705505, + "learning_rate": 9.09746803858171e-05, + "loss": 1.7867, + "step": 21310 + }, + { + "epoch": 1.1551577556861483, + "eval_loss": 2.5878617763519287, + "eval_runtime": 21.9939, + "eval_samples_per_second": 227.336, + "eval_steps_per_second": 1.228, + "step": 21315 + }, + { + "epoch": 1.1554287262689045, + "grad_norm": 0.742839515209198, + "learning_rate": 9.096516337440898e-05, + "loss": 1.7793, + "step": 21320 + }, + { + "epoch": 1.1559706674344166, + "grad_norm": 0.27201151847839355, + "learning_rate": 9.095564190779257e-05, + "loss": 1.7819, + "step": 21330 + }, + { + "epoch": 1.1565126085999289, + "grad_norm": 0.2983904778957367, + "learning_rate": 9.094611598714733e-05, + "loss": 1.7799, + "step": 21340 + }, + { + "epoch": 1.1570545497654412, + "grad_norm": 0.2776441276073456, + "learning_rate": 9.09365856136533e-05, + "loss": 1.7716, + "step": 21350 + }, + { + "epoch": 1.1575964909309533, + "grad_norm": 0.27620989084243774, + "learning_rate": 9.092705078849108e-05, + "loss": 1.7662, + "step": 21360 + }, + { + "epoch": 1.1581384320964656, + "grad_norm": 0.4133933484554291, + "learning_rate": 9.091751151284178e-05, + "loss": 1.7788, + "step": 21370 + }, + { + "epoch": 1.1586803732619777, + "grad_norm": 0.587293267250061, + "learning_rate": 9.090796778788709e-05, + "loss": 1.7711, + "step": 21380 + }, + { + "epoch": 1.15922231442749, + "grad_norm": 0.3694206178188324, + "learning_rate": 9.089841961480927e-05, + "loss": 1.7688, + "step": 21390 + }, + { + "epoch": 1.1597642555930021, + "grad_norm": 0.5002974271774292, + "learning_rate": 9.088886699479105e-05, + "loss": 1.7743, + "step": 21400 + }, + { + "epoch": 1.1598726438261047, + "eval_loss": 2.5802183151245117, + "eval_runtime": 21.9908, + "eval_samples_per_second": 227.368, + "eval_steps_per_second": 1.228, + "step": 21402 + }, + { + "epoch": 1.1603061967585144, + "grad_norm": 0.4875272810459137, + "learning_rate": 9.087930992901581e-05, + "loss": 1.7803, + "step": 21410 + }, + { + "epoch": 1.1608481379240265, + "grad_norm": 0.33327093720436096, + "learning_rate": 9.086974841866743e-05, + "loss": 1.7743, + "step": 21420 + }, + { + "epoch": 1.1613900790895388, + "grad_norm": 0.6208361983299255, + "learning_rate": 9.086018246493037e-05, + "loss": 1.7694, + "step": 21430 + }, + { + "epoch": 1.1619320202550512, + "grad_norm": 0.4120174050331116, + "learning_rate": 9.085061206898957e-05, + "loss": 1.7802, + "step": 21440 + }, + { + "epoch": 1.1624739614205633, + "grad_norm": 0.3111993074417114, + "learning_rate": 9.08410372320306e-05, + "loss": 1.7863, + "step": 21450 + }, + { + "epoch": 1.1630159025860756, + "grad_norm": 0.2776833772659302, + "learning_rate": 9.083145795523955e-05, + "loss": 1.7794, + "step": 21460 + }, + { + "epoch": 1.1635578437515877, + "grad_norm": 0.7861410975456238, + "learning_rate": 9.082187423980304e-05, + "loss": 1.7772, + "step": 21470 + }, + { + "epoch": 1.1640997849171, + "grad_norm": 0.3405260741710663, + "learning_rate": 9.081228608690828e-05, + "loss": 1.7832, + "step": 21480 + }, + { + "epoch": 1.164587531966061, + "eval_loss": 2.5907578468322754, + "eval_runtime": 21.9898, + "eval_samples_per_second": 227.378, + "eval_steps_per_second": 1.228, + "step": 21489 + }, + { + "epoch": 1.164641726082612, + "grad_norm": 0.28428077697753906, + "learning_rate": 9.080269349774301e-05, + "loss": 1.7735, + "step": 21490 + }, + { + "epoch": 1.1651836672481244, + "grad_norm": 0.28834155201911926, + "learning_rate": 9.079309647349549e-05, + "loss": 1.7815, + "step": 21500 + }, + { + "epoch": 1.1657256084136365, + "grad_norm": 0.35341453552246094, + "learning_rate": 9.078349501535461e-05, + "loss": 1.7785, + "step": 21510 + }, + { + "epoch": 1.1662675495791488, + "grad_norm": 0.5031281113624573, + "learning_rate": 9.077388912450969e-05, + "loss": 1.7797, + "step": 21520 + }, + { + "epoch": 1.1668094907446611, + "grad_norm": 0.439062237739563, + "learning_rate": 9.076427880215072e-05, + "loss": 1.7657, + "step": 21530 + }, + { + "epoch": 1.1673514319101732, + "grad_norm": 0.356479287147522, + "learning_rate": 9.075466404946814e-05, + "loss": 1.7763, + "step": 21540 + }, + { + "epoch": 1.1678933730756855, + "grad_norm": 0.3379020094871521, + "learning_rate": 9.074504486765304e-05, + "loss": 1.7652, + "step": 21550 + }, + { + "epoch": 1.1684353142411976, + "grad_norm": 0.3034111559391022, + "learning_rate": 9.073542125789695e-05, + "loss": 1.7658, + "step": 21560 + }, + { + "epoch": 1.16897725540671, + "grad_norm": 0.4461290240287781, + "learning_rate": 9.072579322139202e-05, + "loss": 1.7678, + "step": 21570 + }, + { + "epoch": 1.1693024201060171, + "eval_loss": 2.5929572582244873, + "eval_runtime": 21.9943, + "eval_samples_per_second": 227.332, + "eval_steps_per_second": 1.228, + "step": 21576 + }, + { + "epoch": 1.1695191965722223, + "grad_norm": 0.25855252146720886, + "learning_rate": 9.071616075933095e-05, + "loss": 1.7704, + "step": 21580 + }, + { + "epoch": 1.1700611377377343, + "grad_norm": 0.27765700221061707, + "learning_rate": 9.070652387290695e-05, + "loss": 1.7575, + "step": 21590 + }, + { + "epoch": 1.1706030789032467, + "grad_norm": 0.26034823060035706, + "learning_rate": 9.069688256331377e-05, + "loss": 1.7625, + "step": 21600 + }, + { + "epoch": 1.1711450200687588, + "grad_norm": 1.0148969888687134, + "learning_rate": 9.068723683174578e-05, + "loss": 1.7771, + "step": 21610 + }, + { + "epoch": 1.171686961234271, + "grad_norm": 0.3433249592781067, + "learning_rate": 9.067758667939782e-05, + "loss": 1.769, + "step": 21620 + }, + { + "epoch": 1.1722289023997832, + "grad_norm": 0.3450257480144501, + "learning_rate": 9.066793210746533e-05, + "loss": 1.7761, + "step": 21630 + }, + { + "epoch": 1.1727708435652955, + "grad_norm": 0.29265645146369934, + "learning_rate": 9.065827311714426e-05, + "loss": 1.7633, + "step": 21640 + }, + { + "epoch": 1.1733127847308076, + "grad_norm": 0.29442882537841797, + "learning_rate": 9.064860970963112e-05, + "loss": 1.7706, + "step": 21650 + }, + { + "epoch": 1.17385472589632, + "grad_norm": 0.4089371860027313, + "learning_rate": 9.063894188612298e-05, + "loss": 1.7673, + "step": 21660 + }, + { + "epoch": 1.1740173082459735, + "eval_loss": 2.585028648376465, + "eval_runtime": 21.9891, + "eval_samples_per_second": 227.386, + "eval_steps_per_second": 1.228, + "step": 21663 + }, + { + "epoch": 1.1743966670618322, + "grad_norm": 0.5681344866752625, + "learning_rate": 9.062926964781746e-05, + "loss": 1.7797, + "step": 21670 + }, + { + "epoch": 1.1749386082273443, + "grad_norm": 0.5952972769737244, + "learning_rate": 9.061959299591269e-05, + "loss": 1.7781, + "step": 21680 + }, + { + "epoch": 1.1754805493928566, + "grad_norm": 0.30679190158843994, + "learning_rate": 9.060991193160739e-05, + "loss": 1.7685, + "step": 21690 + }, + { + "epoch": 1.1760224905583687, + "grad_norm": 0.37840238213539124, + "learning_rate": 9.06002264561008e-05, + "loss": 1.7613, + "step": 21700 + }, + { + "epoch": 1.176564431723881, + "grad_norm": 0.6132546663284302, + "learning_rate": 9.059053657059272e-05, + "loss": 1.7842, + "step": 21710 + }, + { + "epoch": 1.1771063728893931, + "grad_norm": 0.37106719613075256, + "learning_rate": 9.058084227628351e-05, + "loss": 1.763, + "step": 21720 + }, + { + "epoch": 1.1776483140549054, + "grad_norm": 0.2650188207626343, + "learning_rate": 9.057114357437401e-05, + "loss": 1.7582, + "step": 21730 + }, + { + "epoch": 1.1781902552204175, + "grad_norm": 0.4426226019859314, + "learning_rate": 9.056144046606568e-05, + "loss": 1.7725, + "step": 21740 + }, + { + "epoch": 1.1787321963859299, + "grad_norm": 0.39338555932044983, + "learning_rate": 9.05517329525605e-05, + "loss": 1.7712, + "step": 21750 + }, + { + "epoch": 1.1787321963859299, + "eval_loss": 2.59128999710083, + "eval_runtime": 21.9834, + "eval_samples_per_second": 227.444, + "eval_steps_per_second": 1.228, + "step": 21750 + }, + { + "epoch": 1.1792741375514422, + "grad_norm": 0.3727186918258667, + "learning_rate": 9.054202103506098e-05, + "loss": 1.7642, + "step": 21760 + }, + { + "epoch": 1.1798160787169543, + "grad_norm": 0.4082069993019104, + "learning_rate": 9.053230471477023e-05, + "loss": 1.7736, + "step": 21770 + }, + { + "epoch": 1.1803580198824666, + "grad_norm": 0.29831764101982117, + "learning_rate": 9.052258399289182e-05, + "loss": 1.7592, + "step": 21780 + }, + { + "epoch": 1.1808999610479787, + "grad_norm": 0.4205126166343689, + "learning_rate": 9.051285887062993e-05, + "loss": 1.7677, + "step": 21790 + }, + { + "epoch": 1.181441902213491, + "grad_norm": 0.38999685645103455, + "learning_rate": 9.050312934918926e-05, + "loss": 1.7681, + "step": 21800 + }, + { + "epoch": 1.181983843379003, + "grad_norm": 0.34131819009780884, + "learning_rate": 9.049339542977507e-05, + "loss": 1.7568, + "step": 21810 + }, + { + "epoch": 1.1825257845445154, + "grad_norm": 0.5073003768920898, + "learning_rate": 9.048365711359317e-05, + "loss": 1.7684, + "step": 21820 + }, + { + "epoch": 1.1830677257100275, + "grad_norm": 0.47667044401168823, + "learning_rate": 9.047391440184985e-05, + "loss": 1.7634, + "step": 21830 + }, + { + "epoch": 1.1834470845258862, + "eval_loss": 2.5854649543762207, + "eval_runtime": 21.9926, + "eval_samples_per_second": 227.349, + "eval_steps_per_second": 1.228, + "step": 21837 + }, + { + "epoch": 1.1836096668755398, + "grad_norm": 0.8095766305923462, + "learning_rate": 9.046416729575205e-05, + "loss": 1.7581, + "step": 21840 + }, + { + "epoch": 1.1841516080410521, + "grad_norm": 0.3507765531539917, + "learning_rate": 9.045441579650717e-05, + "loss": 1.7646, + "step": 21850 + }, + { + "epoch": 1.1846935492065642, + "grad_norm": 0.29202473163604736, + "learning_rate": 9.044465990532318e-05, + "loss": 1.7758, + "step": 21860 + }, + { + "epoch": 1.1852354903720765, + "grad_norm": 0.265829861164093, + "learning_rate": 9.043489962340861e-05, + "loss": 1.7639, + "step": 21870 + }, + { + "epoch": 1.1857774315375886, + "grad_norm": 0.2809160649776459, + "learning_rate": 9.042513495197252e-05, + "loss": 1.7516, + "step": 21880 + }, + { + "epoch": 1.186319372703101, + "grad_norm": 0.3107336163520813, + "learning_rate": 9.04153658922245e-05, + "loss": 1.7647, + "step": 21890 + }, + { + "epoch": 1.186861313868613, + "grad_norm": 0.5341174006462097, + "learning_rate": 9.040559244537473e-05, + "loss": 1.7625, + "step": 21900 + }, + { + "epoch": 1.1874032550341254, + "grad_norm": 0.25601211190223694, + "learning_rate": 9.039581461263388e-05, + "loss": 1.7634, + "step": 21910 + }, + { + "epoch": 1.1879451961996375, + "grad_norm": 0.6476098299026489, + "learning_rate": 9.038603239521318e-05, + "loss": 1.7637, + "step": 21920 + }, + { + "epoch": 1.1881619726658426, + "eval_loss": 2.5872724056243896, + "eval_runtime": 21.9906, + "eval_samples_per_second": 227.37, + "eval_steps_per_second": 1.228, + "step": 21924 + }, + { + "epoch": 1.1884871373651498, + "grad_norm": 0.30727824568748474, + "learning_rate": 9.037624579432442e-05, + "loss": 1.7608, + "step": 21930 + }, + { + "epoch": 1.189029078530662, + "grad_norm": 0.256548672914505, + "learning_rate": 9.036645481117992e-05, + "loss": 1.7675, + "step": 21940 + }, + { + "epoch": 1.1895710196961742, + "grad_norm": 0.3580315113067627, + "learning_rate": 9.035665944699254e-05, + "loss": 1.7716, + "step": 21950 + }, + { + "epoch": 1.1901129608616865, + "grad_norm": 0.2569289207458496, + "learning_rate": 9.034685970297571e-05, + "loss": 1.7627, + "step": 21960 + }, + { + "epoch": 1.1906549020271986, + "grad_norm": 0.3389548361301422, + "learning_rate": 9.033705558034335e-05, + "loss": 1.7617, + "step": 21970 + }, + { + "epoch": 1.191196843192711, + "grad_norm": 0.35166749358177185, + "learning_rate": 9.032724708030995e-05, + "loss": 1.7634, + "step": 21980 + }, + { + "epoch": 1.1917387843582232, + "grad_norm": 0.47258156538009644, + "learning_rate": 9.031743420409058e-05, + "loss": 1.7606, + "step": 21990 + }, + { + "epoch": 1.1922807255237353, + "grad_norm": 0.3378153443336487, + "learning_rate": 9.030761695290077e-05, + "loss": 1.7602, + "step": 22000 + }, + { + "epoch": 1.1928226666892476, + "grad_norm": 0.5784263610839844, + "learning_rate": 9.029779532795668e-05, + "loss": 1.7585, + "step": 22010 + }, + { + "epoch": 1.1928768608057987, + "eval_loss": 2.6090340614318848, + "eval_runtime": 21.9879, + "eval_samples_per_second": 227.398, + "eval_steps_per_second": 1.228, + "step": 22011 + }, + { + "epoch": 1.1933646078547597, + "grad_norm": 0.5027700066566467, + "learning_rate": 9.028796933047495e-05, + "loss": 1.7604, + "step": 22020 + }, + { + "epoch": 1.193906549020272, + "grad_norm": 0.3202950060367584, + "learning_rate": 9.027813896167278e-05, + "loss": 1.7642, + "step": 22030 + }, + { + "epoch": 1.1944484901857841, + "grad_norm": 0.3492630422115326, + "learning_rate": 9.026830422276792e-05, + "loss": 1.7612, + "step": 22040 + }, + { + "epoch": 1.1949904313512965, + "grad_norm": 0.6746916770935059, + "learning_rate": 9.025846511497864e-05, + "loss": 1.7562, + "step": 22050 + }, + { + "epoch": 1.1955323725168086, + "grad_norm": 0.42550233006477356, + "learning_rate": 9.02486216395238e-05, + "loss": 1.7618, + "step": 22060 + }, + { + "epoch": 1.1960743136823209, + "grad_norm": 0.4293383061885834, + "learning_rate": 9.023877379762274e-05, + "loss": 1.763, + "step": 22070 + }, + { + "epoch": 1.1966162548478332, + "grad_norm": 0.3266971707344055, + "learning_rate": 9.022892159049537e-05, + "loss": 1.7669, + "step": 22080 + }, + { + "epoch": 1.1971581960133453, + "grad_norm": 0.99581378698349, + "learning_rate": 9.021906501936213e-05, + "loss": 1.7648, + "step": 22090 + }, + { + "epoch": 1.197591748945755, + "eval_loss": 2.58486270904541, + "eval_runtime": 21.9939, + "eval_samples_per_second": 227.336, + "eval_steps_per_second": 1.228, + "step": 22098 + }, + { + "epoch": 1.1977001371788576, + "grad_norm": 0.5003217458724976, + "learning_rate": 9.020920408544404e-05, + "loss": 1.7659, + "step": 22100 + }, + { + "epoch": 1.1982420783443697, + "grad_norm": 0.5149915218353271, + "learning_rate": 9.019933878996259e-05, + "loss": 1.7496, + "step": 22110 + }, + { + "epoch": 1.198784019509882, + "grad_norm": 0.3475677967071533, + "learning_rate": 9.018946913413989e-05, + "loss": 1.7642, + "step": 22120 + }, + { + "epoch": 1.199325960675394, + "grad_norm": 0.3139243423938751, + "learning_rate": 9.017959511919853e-05, + "loss": 1.7556, + "step": 22130 + }, + { + "epoch": 1.1998679018409064, + "grad_norm": 0.32470259070396423, + "learning_rate": 9.016971674636165e-05, + "loss": 1.7616, + "step": 22140 + }, + { + "epoch": 1.2004098430064185, + "grad_norm": 0.6316733360290527, + "learning_rate": 9.015983401685296e-05, + "loss": 1.7633, + "step": 22150 + }, + { + "epoch": 1.2009517841719308, + "grad_norm": 0.6149379014968872, + "learning_rate": 9.014994693189667e-05, + "loss": 1.7599, + "step": 22160 + }, + { + "epoch": 1.2014937253374431, + "grad_norm": 0.4740849435329437, + "learning_rate": 9.014005549271757e-05, + "loss": 1.7587, + "step": 22170 + }, + { + "epoch": 1.2020356665029552, + "grad_norm": 0.4013853371143341, + "learning_rate": 9.013015970054096e-05, + "loss": 1.7523, + "step": 22180 + }, + { + "epoch": 1.2023066370857114, + "eval_loss": 2.5968024730682373, + "eval_runtime": 21.9857, + "eval_samples_per_second": 227.42, + "eval_steps_per_second": 1.228, + "step": 22185 + }, + { + "epoch": 1.2025776076684676, + "grad_norm": 0.5497053861618042, + "learning_rate": 9.012025955659269e-05, + "loss": 1.7546, + "step": 22190 + }, + { + "epoch": 1.2031195488339796, + "grad_norm": 0.3100571632385254, + "learning_rate": 9.011035506209912e-05, + "loss": 1.7637, + "step": 22200 + }, + { + "epoch": 1.203661489999492, + "grad_norm": 0.298776239156723, + "learning_rate": 9.010044621828722e-05, + "loss": 1.7663, + "step": 22210 + }, + { + "epoch": 1.204203431165004, + "grad_norm": 0.36760690808296204, + "learning_rate": 9.009053302638444e-05, + "loss": 1.7575, + "step": 22220 + }, + { + "epoch": 1.2047453723305164, + "grad_norm": 0.2956767678260803, + "learning_rate": 9.008061548761876e-05, + "loss": 1.7624, + "step": 22230 + }, + { + "epoch": 1.2052873134960285, + "grad_norm": 0.5647885203361511, + "learning_rate": 9.007069360321873e-05, + "loss": 1.7409, + "step": 22240 + }, + { + "epoch": 1.2058292546615408, + "grad_norm": 0.26180022954940796, + "learning_rate": 9.006076737441347e-05, + "loss": 1.7539, + "step": 22250 + }, + { + "epoch": 1.206371195827053, + "grad_norm": 0.29279986023902893, + "learning_rate": 9.005083680243254e-05, + "loss": 1.7551, + "step": 22260 + }, + { + "epoch": 1.2069131369925652, + "grad_norm": 0.3438403904438019, + "learning_rate": 9.004090188850612e-05, + "loss": 1.7585, + "step": 22270 + }, + { + "epoch": 1.2070215252256677, + "eval_loss": 2.595259666442871, + "eval_runtime": 21.992, + "eval_samples_per_second": 227.356, + "eval_steps_per_second": 1.228, + "step": 22272 + }, + { + "epoch": 1.2074550781580775, + "grad_norm": 0.46787571907043457, + "learning_rate": 9.003096263386492e-05, + "loss": 1.7648, + "step": 22280 + }, + { + "epoch": 1.2079970193235896, + "grad_norm": 0.5319798588752747, + "learning_rate": 9.002101903974016e-05, + "loss": 1.7574, + "step": 22290 + }, + { + "epoch": 1.208538960489102, + "grad_norm": 0.45188024640083313, + "learning_rate": 9.00110711073636e-05, + "loss": 1.7565, + "step": 22300 + }, + { + "epoch": 1.2090809016546142, + "grad_norm": 0.4394822120666504, + "learning_rate": 9.000111883796756e-05, + "loss": 1.7594, + "step": 22310 + }, + { + "epoch": 1.2096228428201263, + "grad_norm": 0.290712833404541, + "learning_rate": 8.999116223278486e-05, + "loss": 1.7592, + "step": 22320 + }, + { + "epoch": 1.2101647839856386, + "grad_norm": 0.6723989248275757, + "learning_rate": 8.998120129304892e-05, + "loss": 1.752, + "step": 22330 + }, + { + "epoch": 1.2107067251511507, + "grad_norm": 0.3058725595474243, + "learning_rate": 8.997123601999364e-05, + "loss": 1.7487, + "step": 22340 + }, + { + "epoch": 1.211248666316663, + "grad_norm": 0.25992143154144287, + "learning_rate": 8.996126641485345e-05, + "loss": 1.7464, + "step": 22350 + }, + { + "epoch": 1.211736413365624, + "eval_loss": 2.6008663177490234, + "eval_runtime": 21.9912, + "eval_samples_per_second": 227.363, + "eval_steps_per_second": 1.228, + "step": 22359 + }, + { + "epoch": 1.2117906074821752, + "grad_norm": 0.30580416321754456, + "learning_rate": 8.995129247886339e-05, + "loss": 1.7577, + "step": 22360 + }, + { + "epoch": 1.2123325486476875, + "grad_norm": 0.2877110540866852, + "learning_rate": 8.994131421325893e-05, + "loss": 1.7556, + "step": 22370 + }, + { + "epoch": 1.2128744898131996, + "grad_norm": 0.390491247177124, + "learning_rate": 8.993133161927618e-05, + "loss": 1.7553, + "step": 22380 + }, + { + "epoch": 1.2134164309787119, + "grad_norm": 0.3429940342903137, + "learning_rate": 8.992134469815173e-05, + "loss": 1.7507, + "step": 22390 + }, + { + "epoch": 1.2139583721442242, + "grad_norm": 0.29374656081199646, + "learning_rate": 8.99113534511227e-05, + "loss": 1.7548, + "step": 22400 + }, + { + "epoch": 1.2145003133097363, + "grad_norm": 0.37793880701065063, + "learning_rate": 8.99013578794268e-05, + "loss": 1.7504, + "step": 22410 + }, + { + "epoch": 1.2150422544752486, + "grad_norm": 0.312223345041275, + "learning_rate": 8.989135798430218e-05, + "loss": 1.7373, + "step": 22420 + }, + { + "epoch": 1.2155841956407607, + "grad_norm": 0.355259507894516, + "learning_rate": 8.988135376698764e-05, + "loss": 1.7562, + "step": 22430 + }, + { + "epoch": 1.216126136806273, + "grad_norm": 0.5126851797103882, + "learning_rate": 8.987134522872242e-05, + "loss": 1.7551, + "step": 22440 + }, + { + "epoch": 1.2164513015055802, + "eval_loss": 2.5885868072509766, + "eval_runtime": 21.9949, + "eval_samples_per_second": 227.325, + "eval_steps_per_second": 1.228, + "step": 22446 + }, + { + "epoch": 1.2166680779717851, + "grad_norm": 0.3533921241760254, + "learning_rate": 8.986133237074636e-05, + "loss": 1.7599, + "step": 22450 + }, + { + "epoch": 1.2172100191372974, + "grad_norm": 0.3277350962162018, + "learning_rate": 8.98513151942998e-05, + "loss": 1.7425, + "step": 22460 + }, + { + "epoch": 1.2177519603028095, + "grad_norm": 0.45636269450187683, + "learning_rate": 8.984129370062362e-05, + "loss": 1.7538, + "step": 22470 + }, + { + "epoch": 1.2182939014683218, + "grad_norm": 0.7295756340026855, + "learning_rate": 8.983126789095925e-05, + "loss": 1.7601, + "step": 22480 + }, + { + "epoch": 1.2188358426338342, + "grad_norm": 0.5607370734214783, + "learning_rate": 8.982123776654862e-05, + "loss": 1.7585, + "step": 22490 + }, + { + "epoch": 1.2193777837993462, + "grad_norm": 0.29513418674468994, + "learning_rate": 8.981120332863423e-05, + "loss": 1.7626, + "step": 22500 + }, + { + "epoch": 1.2199197249648586, + "grad_norm": 0.2580159306526184, + "learning_rate": 8.980116457845911e-05, + "loss": 1.756, + "step": 22510 + }, + { + "epoch": 1.2204616661303707, + "grad_norm": 0.31364554166793823, + "learning_rate": 8.979112151726684e-05, + "loss": 1.7503, + "step": 22520 + }, + { + "epoch": 1.221003607295883, + "grad_norm": 0.8860167860984802, + "learning_rate": 8.978107414630146e-05, + "loss": 1.7357, + "step": 22530 + }, + { + "epoch": 1.2211661896455366, + "eval_loss": 2.586221933364868, + "eval_runtime": 21.9909, + "eval_samples_per_second": 227.367, + "eval_steps_per_second": 1.228, + "step": 22533 + }, + { + "epoch": 1.221545548461395, + "grad_norm": 0.550439178943634, + "learning_rate": 8.977102246680762e-05, + "loss": 1.7482, + "step": 22540 + }, + { + "epoch": 1.2220874896269074, + "grad_norm": 0.8875564932823181, + "learning_rate": 8.976096648003048e-05, + "loss": 1.7466, + "step": 22550 + }, + { + "epoch": 1.2226294307924195, + "grad_norm": 0.5859338045120239, + "learning_rate": 8.975090618721573e-05, + "loss": 1.7587, + "step": 22560 + }, + { + "epoch": 1.2231713719579318, + "grad_norm": 0.27543067932128906, + "learning_rate": 8.97408415896096e-05, + "loss": 1.7534, + "step": 22570 + }, + { + "epoch": 1.2237133131234441, + "grad_norm": 0.3466237485408783, + "learning_rate": 8.973077268845884e-05, + "loss": 1.7601, + "step": 22580 + }, + { + "epoch": 1.2242552542889562, + "grad_norm": 0.2768082916736603, + "learning_rate": 8.972069948501074e-05, + "loss": 1.7504, + "step": 22590 + }, + { + "epoch": 1.2247971954544685, + "grad_norm": 0.45373255014419556, + "learning_rate": 8.971062198051315e-05, + "loss": 1.7479, + "step": 22600 + }, + { + "epoch": 1.2253391366199806, + "grad_norm": 0.6204401850700378, + "learning_rate": 8.970054017621437e-05, + "loss": 1.7478, + "step": 22610 + }, + { + "epoch": 1.225881077785493, + "grad_norm": 0.3263530135154724, + "learning_rate": 8.969045407336336e-05, + "loss": 1.7486, + "step": 22620 + }, + { + "epoch": 1.225881077785493, + "eval_loss": 2.593569040298462, + "eval_runtime": 21.9692, + "eval_samples_per_second": 227.591, + "eval_steps_per_second": 1.229, + "step": 22620 + }, + { + "epoch": 1.2264230189510053, + "grad_norm": 0.48239004611968994, + "learning_rate": 8.968036367320952e-05, + "loss": 1.7489, + "step": 22630 + }, + { + "epoch": 1.2269649601165173, + "grad_norm": 0.3203703761100769, + "learning_rate": 8.967026897700277e-05, + "loss": 1.7437, + "step": 22640 + }, + { + "epoch": 1.2275069012820297, + "grad_norm": 0.6340882182121277, + "learning_rate": 8.966016998599362e-05, + "loss": 1.7371, + "step": 22650 + }, + { + "epoch": 1.2280488424475418, + "grad_norm": 0.30104759335517883, + "learning_rate": 8.96500667014331e-05, + "loss": 1.7637, + "step": 22660 + }, + { + "epoch": 1.228590783613054, + "grad_norm": 0.2778589129447937, + "learning_rate": 8.963995912457275e-05, + "loss": 1.7441, + "step": 22670 + }, + { + "epoch": 1.2291327247785662, + "grad_norm": 0.3374054431915283, + "learning_rate": 8.962984725666465e-05, + "loss": 1.7528, + "step": 22680 + }, + { + "epoch": 1.2296746659440785, + "grad_norm": 0.25601983070373535, + "learning_rate": 8.961973109896144e-05, + "loss": 1.7444, + "step": 22690 + }, + { + "epoch": 1.2302166071095906, + "grad_norm": 0.2800886929035187, + "learning_rate": 8.960961065271622e-05, + "loss": 1.7444, + "step": 22700 + }, + { + "epoch": 1.2305959659254493, + "eval_loss": 2.58610463142395, + "eval_runtime": 21.9907, + "eval_samples_per_second": 227.368, + "eval_steps_per_second": 1.228, + "step": 22707 + }, + { + "epoch": 1.230758548275103, + "grad_norm": 0.4286305904388428, + "learning_rate": 8.95994859191827e-05, + "loss": 1.7398, + "step": 22710 + }, + { + "epoch": 1.2313004894406152, + "grad_norm": 0.5830126404762268, + "learning_rate": 8.95893568996151e-05, + "loss": 1.7458, + "step": 22720 + }, + { + "epoch": 1.2318424306061273, + "grad_norm": 0.30274108052253723, + "learning_rate": 8.957922359526812e-05, + "loss": 1.7443, + "step": 22730 + }, + { + "epoch": 1.2323843717716396, + "grad_norm": 0.5309381484985352, + "learning_rate": 8.956908600739707e-05, + "loss": 1.7477, + "step": 22740 + }, + { + "epoch": 1.2329263129371517, + "grad_norm": 0.2998747229576111, + "learning_rate": 8.95589441372577e-05, + "loss": 1.7574, + "step": 22750 + }, + { + "epoch": 1.233468254102664, + "grad_norm": 0.3685966432094574, + "learning_rate": 8.954879798610637e-05, + "loss": 1.74, + "step": 22760 + }, + { + "epoch": 1.2340101952681761, + "grad_norm": 0.6951805949211121, + "learning_rate": 8.953864755519995e-05, + "loss": 1.7444, + "step": 22770 + }, + { + "epoch": 1.2345521364336884, + "grad_norm": 0.3382275700569153, + "learning_rate": 8.952849284579585e-05, + "loss": 1.7352, + "step": 22780 + }, + { + "epoch": 1.2350940775992005, + "grad_norm": 0.26488932967185974, + "learning_rate": 8.951833385915193e-05, + "loss": 1.7461, + "step": 22790 + }, + { + "epoch": 1.2353108540654056, + "eval_loss": 2.575146436691284, + "eval_runtime": 21.9928, + "eval_samples_per_second": 227.347, + "eval_steps_per_second": 1.228, + "step": 22794 + }, + { + "epoch": 1.2356360187647129, + "grad_norm": 0.46747535467147827, + "learning_rate": 8.950817059652669e-05, + "loss": 1.7437, + "step": 22800 + }, + { + "epoch": 1.2361779599302252, + "grad_norm": 0.29368457198143005, + "learning_rate": 8.949800305917909e-05, + "loss": 1.7458, + "step": 22810 + }, + { + "epoch": 1.2367199010957373, + "grad_norm": 0.3844955265522003, + "learning_rate": 8.948783124836866e-05, + "loss": 1.7403, + "step": 22820 + }, + { + "epoch": 1.2372618422612496, + "grad_norm": 0.6568012237548828, + "learning_rate": 8.94776551653554e-05, + "loss": 1.7358, + "step": 22830 + }, + { + "epoch": 1.2378037834267617, + "grad_norm": 0.34444165229797363, + "learning_rate": 8.946747481139992e-05, + "loss": 1.7468, + "step": 22840 + }, + { + "epoch": 1.238345724592274, + "grad_norm": 0.44042837619781494, + "learning_rate": 8.945729018776331e-05, + "loss": 1.7455, + "step": 22850 + }, + { + "epoch": 1.238887665757786, + "grad_norm": 0.32380804419517517, + "learning_rate": 8.944710129570719e-05, + "loss": 1.7432, + "step": 22860 + }, + { + "epoch": 1.2394296069232984, + "grad_norm": 0.608935534954071, + "learning_rate": 8.943690813649369e-05, + "loss": 1.7578, + "step": 22870 + }, + { + "epoch": 1.2399715480888105, + "grad_norm": 0.5230217576026917, + "learning_rate": 8.942671071138554e-05, + "loss": 1.7458, + "step": 22880 + }, + { + "epoch": 1.2400257422053618, + "eval_loss": 2.581331253051758, + "eval_runtime": 21.9957, + "eval_samples_per_second": 227.317, + "eval_steps_per_second": 1.228, + "step": 22881 + }, + { + "epoch": 1.2405134892543228, + "grad_norm": 0.5061838030815125, + "learning_rate": 8.941650902164595e-05, + "loss": 1.7455, + "step": 22890 + }, + { + "epoch": 1.2410554304198351, + "grad_norm": 0.405821293592453, + "learning_rate": 8.940630306853861e-05, + "loss": 1.7405, + "step": 22900 + }, + { + "epoch": 1.2415973715853472, + "grad_norm": 0.45560845732688904, + "learning_rate": 8.939609285332785e-05, + "loss": 1.7459, + "step": 22910 + }, + { + "epoch": 1.2421393127508595, + "grad_norm": 0.5665388703346252, + "learning_rate": 8.938587837727842e-05, + "loss": 1.7552, + "step": 22920 + }, + { + "epoch": 1.2426812539163716, + "grad_norm": 0.37847816944122314, + "learning_rate": 8.937565964165569e-05, + "loss": 1.7422, + "step": 22930 + }, + { + "epoch": 1.243223195081884, + "grad_norm": 0.4069490432739258, + "learning_rate": 8.936543664772546e-05, + "loss": 1.7435, + "step": 22940 + }, + { + "epoch": 1.2437651362473963, + "grad_norm": 0.3240896761417389, + "learning_rate": 8.935520939675414e-05, + "loss": 1.7512, + "step": 22950 + }, + { + "epoch": 1.2443070774129084, + "grad_norm": 0.4123472273349762, + "learning_rate": 8.934497789000865e-05, + "loss": 1.7425, + "step": 22960 + }, + { + "epoch": 1.2447406303453181, + "eval_loss": 2.5666730403900146, + "eval_runtime": 21.9906, + "eval_samples_per_second": 227.37, + "eval_steps_per_second": 1.228, + "step": 22968 + }, + { + "epoch": 1.2448490185784207, + "grad_norm": 0.2759721577167511, + "learning_rate": 8.933474212875642e-05, + "loss": 1.736, + "step": 22970 + }, + { + "epoch": 1.2453909597439328, + "grad_norm": 0.438054621219635, + "learning_rate": 8.932450211426537e-05, + "loss": 1.7408, + "step": 22980 + }, + { + "epoch": 1.245932900909445, + "grad_norm": 0.3276619613170624, + "learning_rate": 8.931425784780405e-05, + "loss": 1.748, + "step": 22990 + }, + { + "epoch": 1.2464748420749572, + "grad_norm": 0.27382031083106995, + "learning_rate": 8.930400933064144e-05, + "loss": 1.7394, + "step": 23000 + }, + { + "epoch": 1.2470167832404695, + "grad_norm": 0.24581308662891388, + "learning_rate": 8.929375656404707e-05, + "loss": 1.7361, + "step": 23010 + }, + { + "epoch": 1.2475587244059816, + "grad_norm": 0.2953229546546936, + "learning_rate": 8.928349954929103e-05, + "loss": 1.7587, + "step": 23020 + }, + { + "epoch": 1.248100665571494, + "grad_norm": 0.48332518339157104, + "learning_rate": 8.927323828764393e-05, + "loss": 1.7334, + "step": 23030 + }, + { + "epoch": 1.2486426067370062, + "grad_norm": 0.8373629450798035, + "learning_rate": 8.926297278037685e-05, + "loss": 1.7477, + "step": 23040 + }, + { + "epoch": 1.2491845479025183, + "grad_norm": 0.2879384458065033, + "learning_rate": 8.925270302876146e-05, + "loss": 1.7373, + "step": 23050 + }, + { + "epoch": 1.2494555184852745, + "eval_loss": 2.5645227432250977, + "eval_runtime": 21.9888, + "eval_samples_per_second": 227.388, + "eval_steps_per_second": 1.228, + "step": 23055 + }, + { + "epoch": 1.2497264890680306, + "grad_norm": 0.2931196689605713, + "learning_rate": 8.924242903406993e-05, + "loss": 1.747, + "step": 23060 + }, + { + "epoch": 1.2502684302335427, + "grad_norm": 0.5951820015907288, + "learning_rate": 8.923215079757496e-05, + "loss": 1.7395, + "step": 23070 + }, + { + "epoch": 1.250810371399055, + "grad_norm": 0.30423420667648315, + "learning_rate": 8.922186832054977e-05, + "loss": 1.7422, + "step": 23080 + }, + { + "epoch": 1.2513523125645671, + "grad_norm": 0.3447738289833069, + "learning_rate": 8.92115816042681e-05, + "loss": 1.7358, + "step": 23090 + }, + { + "epoch": 1.2518942537300795, + "grad_norm": 0.3407246470451355, + "learning_rate": 8.920129065000424e-05, + "loss": 1.7486, + "step": 23100 + }, + { + "epoch": 1.2524361948955915, + "grad_norm": 0.33367615938186646, + "learning_rate": 8.919099545903299e-05, + "loss": 1.7355, + "step": 23110 + }, + { + "epoch": 1.2529781360611039, + "grad_norm": 0.32768481969833374, + "learning_rate": 8.918069603262965e-05, + "loss": 1.7406, + "step": 23120 + }, + { + "epoch": 1.2535200772266162, + "grad_norm": 0.31800198554992676, + "learning_rate": 8.91703923720701e-05, + "loss": 1.7452, + "step": 23130 + }, + { + "epoch": 1.2540620183921283, + "grad_norm": 0.5581735372543335, + "learning_rate": 8.916008447863068e-05, + "loss": 1.743, + "step": 23140 + }, + { + "epoch": 1.2541704066252308, + "eval_loss": 2.5848233699798584, + "eval_runtime": 21.9922, + "eval_samples_per_second": 227.353, + "eval_steps_per_second": 1.228, + "step": 23142 + }, + { + "epoch": 1.2546039595576406, + "grad_norm": 0.5674453377723694, + "learning_rate": 8.914977235358831e-05, + "loss": 1.7393, + "step": 23150 + }, + { + "epoch": 1.2551459007231527, + "grad_norm": 0.3421599864959717, + "learning_rate": 8.913945599822043e-05, + "loss": 1.7402, + "step": 23160 + }, + { + "epoch": 1.255687841888665, + "grad_norm": 0.31315505504608154, + "learning_rate": 8.912913541380492e-05, + "loss": 1.7363, + "step": 23170 + }, + { + "epoch": 1.2562297830541773, + "grad_norm": 0.30993425846099854, + "learning_rate": 8.91188106016203e-05, + "loss": 1.7405, + "step": 23180 + }, + { + "epoch": 1.2567717242196894, + "grad_norm": 0.2763616144657135, + "learning_rate": 8.910848156294555e-05, + "loss": 1.7464, + "step": 23190 + }, + { + "epoch": 1.2573136653852015, + "grad_norm": 0.3239862322807312, + "learning_rate": 8.90981482990602e-05, + "loss": 1.7422, + "step": 23200 + }, + { + "epoch": 1.2578556065507138, + "grad_norm": 0.3442568778991699, + "learning_rate": 8.908781081124427e-05, + "loss": 1.7433, + "step": 23210 + }, + { + "epoch": 1.2583975477162261, + "grad_norm": 0.2833685278892517, + "learning_rate": 8.907746910077834e-05, + "loss": 1.7337, + "step": 23220 + }, + { + "epoch": 1.2588852947651872, + "eval_loss": 2.577096939086914, + "eval_runtime": 21.9824, + "eval_samples_per_second": 227.454, + "eval_steps_per_second": 1.228, + "step": 23229 + }, + { + "epoch": 1.2589394888817382, + "grad_norm": 0.2630921006202698, + "learning_rate": 8.906712316894346e-05, + "loss": 1.7534, + "step": 23230 + }, + { + "epoch": 1.2594814300472505, + "grad_norm": 0.5170078277587891, + "learning_rate": 8.90567730170213e-05, + "loss": 1.7483, + "step": 23240 + }, + { + "epoch": 1.2600233712127626, + "grad_norm": 0.3840513527393341, + "learning_rate": 8.904641864629394e-05, + "loss": 1.7339, + "step": 23250 + }, + { + "epoch": 1.260565312378275, + "grad_norm": 0.29101109504699707, + "learning_rate": 8.903606005804406e-05, + "loss": 1.7207, + "step": 23260 + }, + { + "epoch": 1.2611072535437873, + "grad_norm": 0.34224191308021545, + "learning_rate": 8.902569725355482e-05, + "loss": 1.7333, + "step": 23270 + }, + { + "epoch": 1.2616491947092994, + "grad_norm": 0.8239234089851379, + "learning_rate": 8.901533023410994e-05, + "loss": 1.7417, + "step": 23280 + }, + { + "epoch": 1.2621911358748115, + "grad_norm": 0.7893701791763306, + "learning_rate": 8.900495900099362e-05, + "loss": 1.7398, + "step": 23290 + }, + { + "epoch": 1.2627330770403238, + "grad_norm": 0.31907394528388977, + "learning_rate": 8.899458355549061e-05, + "loss": 1.739, + "step": 23300 + }, + { + "epoch": 1.263275018205836, + "grad_norm": 0.26405349373817444, + "learning_rate": 8.898420389888619e-05, + "loss": 1.747, + "step": 23310 + }, + { + "epoch": 1.2636001829051433, + "eval_loss": 2.575762987136841, + "eval_runtime": 21.9898, + "eval_samples_per_second": 227.378, + "eval_steps_per_second": 1.228, + "step": 23316 + }, + { + "epoch": 1.2638169593713482, + "grad_norm": 0.2704724669456482, + "learning_rate": 8.897382003246614e-05, + "loss": 1.7478, + "step": 23320 + }, + { + "epoch": 1.2643589005368605, + "grad_norm": 0.28200745582580566, + "learning_rate": 8.896343195751677e-05, + "loss": 1.7414, + "step": 23330 + }, + { + "epoch": 1.2649008417023726, + "grad_norm": 0.367416113615036, + "learning_rate": 8.895303967532489e-05, + "loss": 1.7448, + "step": 23340 + }, + { + "epoch": 1.265442782867885, + "grad_norm": 0.41314128041267395, + "learning_rate": 8.894264318717786e-05, + "loss": 1.7301, + "step": 23350 + }, + { + "epoch": 1.2659847240333972, + "grad_norm": 0.346835196018219, + "learning_rate": 8.893224249436357e-05, + "loss": 1.7309, + "step": 23360 + }, + { + "epoch": 1.2665266651989093, + "grad_norm": 0.37419700622558594, + "learning_rate": 8.892183759817039e-05, + "loss": 1.7363, + "step": 23370 + }, + { + "epoch": 1.2670686063644216, + "grad_norm": 0.5215455293655396, + "learning_rate": 8.891142849988725e-05, + "loss": 1.7392, + "step": 23380 + }, + { + "epoch": 1.2676105475299337, + "grad_norm": 0.32529744505882263, + "learning_rate": 8.890101520080357e-05, + "loss": 1.7349, + "step": 23390 + }, + { + "epoch": 1.268152488695446, + "grad_norm": 0.5487125515937805, + "learning_rate": 8.889059770220931e-05, + "loss": 1.734, + "step": 23400 + }, + { + "epoch": 1.2683150710450997, + "eval_loss": 2.5602195262908936, + "eval_runtime": 21.9906, + "eval_samples_per_second": 227.37, + "eval_steps_per_second": 1.228, + "step": 23403 + }, + { + "epoch": 1.2686944298609582, + "grad_norm": 0.465263694524765, + "learning_rate": 8.888017600539493e-05, + "loss": 1.7317, + "step": 23410 + }, + { + "epoch": 1.2692363710264705, + "grad_norm": 0.3183075487613678, + "learning_rate": 8.886975011165146e-05, + "loss": 1.7385, + "step": 23420 + }, + { + "epoch": 1.2697783121919826, + "grad_norm": 0.5280638933181763, + "learning_rate": 8.885932002227039e-05, + "loss": 1.732, + "step": 23430 + }, + { + "epoch": 1.2703202533574949, + "grad_norm": 0.3372764587402344, + "learning_rate": 8.884888573854375e-05, + "loss": 1.7314, + "step": 23440 + }, + { + "epoch": 1.2708621945230072, + "grad_norm": 0.374603807926178, + "learning_rate": 8.883844726176412e-05, + "loss": 1.731, + "step": 23450 + }, + { + "epoch": 1.2714041356885193, + "grad_norm": 0.3078954219818115, + "learning_rate": 8.882800459322453e-05, + "loss": 1.7377, + "step": 23460 + }, + { + "epoch": 1.2719460768540316, + "grad_norm": 0.2972983419895172, + "learning_rate": 8.881755773421863e-05, + "loss": 1.734, + "step": 23470 + }, + { + "epoch": 1.2724880180195437, + "grad_norm": 0.30043935775756836, + "learning_rate": 8.880710668604047e-05, + "loss": 1.7393, + "step": 23480 + }, + { + "epoch": 1.273029959185056, + "grad_norm": 0.4024638533592224, + "learning_rate": 8.879665144998473e-05, + "loss": 1.738, + "step": 23490 + }, + { + "epoch": 1.273029959185056, + "eval_loss": 2.574789047241211, + "eval_runtime": 21.9656, + "eval_samples_per_second": 227.628, + "eval_steps_per_second": 1.229, + "step": 23490 + }, + { + "epoch": 1.273571900350568, + "grad_norm": 0.33888739347457886, + "learning_rate": 8.878619202734653e-05, + "loss": 1.7342, + "step": 23500 + }, + { + "epoch": 1.2741138415160804, + "grad_norm": 0.40043386816978455, + "learning_rate": 8.877572841942153e-05, + "loss": 1.7383, + "step": 23510 + }, + { + "epoch": 1.2746557826815925, + "grad_norm": 0.46869125962257385, + "learning_rate": 8.876526062750597e-05, + "loss": 1.7412, + "step": 23520 + }, + { + "epoch": 1.2751977238471048, + "grad_norm": 0.41995322704315186, + "learning_rate": 8.875478865289649e-05, + "loss": 1.7332, + "step": 23530 + }, + { + "epoch": 1.2757396650126172, + "grad_norm": 0.28642240166664124, + "learning_rate": 8.874431249689033e-05, + "loss": 1.7313, + "step": 23540 + }, + { + "epoch": 1.2762816061781292, + "grad_norm": 0.5792128443717957, + "learning_rate": 8.873383216078527e-05, + "loss": 1.7324, + "step": 23550 + }, + { + "epoch": 1.2768235473436416, + "grad_norm": 0.6733444929122925, + "learning_rate": 8.872334764587952e-05, + "loss": 1.7357, + "step": 23560 + }, + { + "epoch": 1.2773654885091537, + "grad_norm": 0.27996379137039185, + "learning_rate": 8.87128589534719e-05, + "loss": 1.723, + "step": 23570 + }, + { + "epoch": 1.2777448473250121, + "eval_loss": 2.5722544193267822, + "eval_runtime": 21.9913, + "eval_samples_per_second": 227.362, + "eval_steps_per_second": 1.228, + "step": 23577 + }, + { + "epoch": 1.277907429674666, + "grad_norm": 0.29412564635276794, + "learning_rate": 8.870236608486165e-05, + "loss": 1.7301, + "step": 23580 + }, + { + "epoch": 1.2784493708401783, + "grad_norm": 0.2967008352279663, + "learning_rate": 8.869186904134862e-05, + "loss": 1.7411, + "step": 23590 + }, + { + "epoch": 1.2789913120056904, + "grad_norm": 1.056755542755127, + "learning_rate": 8.868136782423314e-05, + "loss": 1.735, + "step": 23600 + }, + { + "epoch": 1.2795332531712025, + "grad_norm": 0.6758991479873657, + "learning_rate": 8.867086243481603e-05, + "loss": 1.7431, + "step": 23610 + }, + { + "epoch": 1.2800751943367148, + "grad_norm": 0.33271247148513794, + "learning_rate": 8.866035287439868e-05, + "loss": 1.7338, + "step": 23620 + }, + { + "epoch": 1.280617135502227, + "grad_norm": 0.40084099769592285, + "learning_rate": 8.864983914428293e-05, + "loss": 1.7385, + "step": 23630 + }, + { + "epoch": 1.2811590766677392, + "grad_norm": 0.3270736038684845, + "learning_rate": 8.863932124577123e-05, + "loss": 1.716, + "step": 23640 + }, + { + "epoch": 1.2817010178332515, + "grad_norm": 0.3113623559474945, + "learning_rate": 8.862879918016643e-05, + "loss": 1.7371, + "step": 23650 + }, + { + "epoch": 1.2822429589987636, + "grad_norm": 0.6056118011474609, + "learning_rate": 8.861827294877201e-05, + "loss": 1.7375, + "step": 23660 + }, + { + "epoch": 1.2824597354649687, + "eval_loss": 2.576855182647705, + "eval_runtime": 21.9914, + "eval_samples_per_second": 227.361, + "eval_steps_per_second": 1.228, + "step": 23664 + }, + { + "epoch": 1.282784900164276, + "grad_norm": 0.3462965488433838, + "learning_rate": 8.86077425528919e-05, + "loss": 1.7308, + "step": 23670 + }, + { + "epoch": 1.2833268413297882, + "grad_norm": 0.2960757315158844, + "learning_rate": 8.859720799383054e-05, + "loss": 1.7263, + "step": 23680 + }, + { + "epoch": 1.2838687824953003, + "grad_norm": 0.29280513525009155, + "learning_rate": 8.858666927289292e-05, + "loss": 1.7369, + "step": 23690 + }, + { + "epoch": 1.2844107236608124, + "grad_norm": 0.602401614189148, + "learning_rate": 8.857612639138451e-05, + "loss": 1.7398, + "step": 23700 + }, + { + "epoch": 1.2849526648263248, + "grad_norm": 0.4261922240257263, + "learning_rate": 8.856557935061137e-05, + "loss": 1.7272, + "step": 23710 + }, + { + "epoch": 1.285494605991837, + "grad_norm": 0.32072654366493225, + "learning_rate": 8.855502815187996e-05, + "loss": 1.7529, + "step": 23720 + }, + { + "epoch": 1.2860365471573492, + "grad_norm": 0.45794838666915894, + "learning_rate": 8.854447279649737e-05, + "loss": 1.7268, + "step": 23730 + }, + { + "epoch": 1.2865784883228615, + "grad_norm": 0.8380472660064697, + "learning_rate": 8.85339132857711e-05, + "loss": 1.732, + "step": 23740 + }, + { + "epoch": 1.2871204294883736, + "grad_norm": 0.25734448432922363, + "learning_rate": 8.852334962100926e-05, + "loss": 1.7341, + "step": 23750 + }, + { + "epoch": 1.2871746236049249, + "eval_loss": 2.560034990310669, + "eval_runtime": 21.9846, + "eval_samples_per_second": 227.432, + "eval_steps_per_second": 1.228, + "step": 23751 + }, + { + "epoch": 1.287662370653886, + "grad_norm": 0.32123035192489624, + "learning_rate": 8.851278180352041e-05, + "loss": 1.7288, + "step": 23760 + }, + { + "epoch": 1.2882043118193982, + "grad_norm": 0.41775065660476685, + "learning_rate": 8.850220983461365e-05, + "loss": 1.7305, + "step": 23770 + }, + { + "epoch": 1.2887462529849103, + "grad_norm": 0.3736642599105835, + "learning_rate": 8.849163371559858e-05, + "loss": 1.7276, + "step": 23780 + }, + { + "epoch": 1.2892881941504226, + "grad_norm": 0.3616468906402588, + "learning_rate": 8.848105344778532e-05, + "loss": 1.731, + "step": 23790 + }, + { + "epoch": 1.2898301353159347, + "grad_norm": 0.39535510540008545, + "learning_rate": 8.847046903248453e-05, + "loss": 1.7199, + "step": 23800 + }, + { + "epoch": 1.290372076481447, + "grad_norm": 0.27410778403282166, + "learning_rate": 8.845988047100736e-05, + "loss": 1.729, + "step": 23810 + }, + { + "epoch": 1.2909140176469591, + "grad_norm": 0.2933545708656311, + "learning_rate": 8.844928776466547e-05, + "loss": 1.7338, + "step": 23820 + }, + { + "epoch": 1.2914559588124714, + "grad_norm": 0.44660070538520813, + "learning_rate": 8.843869091477102e-05, + "loss": 1.7316, + "step": 23830 + }, + { + "epoch": 1.2918895117448812, + "eval_loss": 2.552804946899414, + "eval_runtime": 21.9898, + "eval_samples_per_second": 227.378, + "eval_steps_per_second": 1.228, + "step": 23838 + }, + { + "epoch": 1.2919978999779835, + "grad_norm": 0.3844490945339203, + "learning_rate": 8.842808992263672e-05, + "loss": 1.7361, + "step": 23840 + }, + { + "epoch": 1.2925398411434958, + "grad_norm": 0.5550277829170227, + "learning_rate": 8.841748478957577e-05, + "loss": 1.7385, + "step": 23850 + }, + { + "epoch": 1.2930817823090082, + "grad_norm": 0.32363656163215637, + "learning_rate": 8.840687551690189e-05, + "loss": 1.7243, + "step": 23860 + }, + { + "epoch": 1.2936237234745203, + "grad_norm": 0.4406464993953705, + "learning_rate": 8.839626210592931e-05, + "loss": 1.7437, + "step": 23870 + }, + { + "epoch": 1.2941656646400326, + "grad_norm": 0.3115149140357971, + "learning_rate": 8.838564455797275e-05, + "loss": 1.7447, + "step": 23880 + }, + { + "epoch": 1.2947076058055447, + "grad_norm": 0.27961403131484985, + "learning_rate": 8.837502287434752e-05, + "loss": 1.7287, + "step": 23890 + }, + { + "epoch": 1.295249546971057, + "grad_norm": 0.49664953351020813, + "learning_rate": 8.836439705636935e-05, + "loss": 1.7304, + "step": 23900 + }, + { + "epoch": 1.2957914881365693, + "grad_norm": 0.6754089593887329, + "learning_rate": 8.835376710535451e-05, + "loss": 1.7334, + "step": 23910 + }, + { + "epoch": 1.2963334293020814, + "grad_norm": 0.2822556793689728, + "learning_rate": 8.834313302261982e-05, + "loss": 1.7226, + "step": 23920 + }, + { + "epoch": 1.2966043998848376, + "eval_loss": 2.5520572662353516, + "eval_runtime": 21.9902, + "eval_samples_per_second": 227.373, + "eval_steps_per_second": 1.228, + "step": 23925 + }, + { + "epoch": 1.2968753704675935, + "grad_norm": 0.41642260551452637, + "learning_rate": 8.833249480948257e-05, + "loss": 1.7223, + "step": 23930 + }, + { + "epoch": 1.2974173116331058, + "grad_norm": 0.6077197790145874, + "learning_rate": 8.832185246726057e-05, + "loss": 1.7211, + "step": 23940 + }, + { + "epoch": 1.2979592527986181, + "grad_norm": 0.7251920700073242, + "learning_rate": 8.831120599727217e-05, + "loss": 1.7233, + "step": 23950 + }, + { + "epoch": 1.2985011939641302, + "grad_norm": 0.7429440021514893, + "learning_rate": 8.83005554008362e-05, + "loss": 1.727, + "step": 23960 + }, + { + "epoch": 1.2990431351296425, + "grad_norm": 0.33626994490623474, + "learning_rate": 8.828990067927199e-05, + "loss": 1.7237, + "step": 23970 + }, + { + "epoch": 1.2995850762951546, + "grad_norm": 0.4045674502849579, + "learning_rate": 8.827924183389941e-05, + "loss": 1.7197, + "step": 23980 + }, + { + "epoch": 1.300127017460667, + "grad_norm": 0.43031617999076843, + "learning_rate": 8.826857886603885e-05, + "loss": 1.7181, + "step": 23990 + }, + { + "epoch": 1.3006689586261793, + "grad_norm": 0.4407658874988556, + "learning_rate": 8.825791177701116e-05, + "loss": 1.7297, + "step": 24000 + }, + { + "epoch": 1.3012108997916914, + "grad_norm": 0.4655308425426483, + "learning_rate": 8.824724056813775e-05, + "loss": 1.7271, + "step": 24010 + }, + { + "epoch": 1.3013192880247937, + "eval_loss": 2.550110340118408, + "eval_runtime": 21.9873, + "eval_samples_per_second": 227.404, + "eval_steps_per_second": 1.228, + "step": 24012 + }, + { + "epoch": 1.3017528409572034, + "grad_norm": 0.47410663962364197, + "learning_rate": 8.823656524074054e-05, + "loss": 1.7271, + "step": 24020 + }, + { + "epoch": 1.3022947821227158, + "grad_norm": 0.397776335477829, + "learning_rate": 8.822588579614192e-05, + "loss": 1.7268, + "step": 24030 + }, + { + "epoch": 1.302836723288228, + "grad_norm": 0.4908923804759979, + "learning_rate": 8.821520223566483e-05, + "loss": 1.7312, + "step": 24040 + }, + { + "epoch": 1.3033786644537402, + "grad_norm": 0.475315660238266, + "learning_rate": 8.820451456063268e-05, + "loss": 1.7358, + "step": 24050 + }, + { + "epoch": 1.3039206056192525, + "grad_norm": 0.3519163131713867, + "learning_rate": 8.819382277236943e-05, + "loss": 1.7313, + "step": 24060 + }, + { + "epoch": 1.3044625467847646, + "grad_norm": 0.6225571632385254, + "learning_rate": 8.818312687219953e-05, + "loss": 1.7269, + "step": 24070 + }, + { + "epoch": 1.305004487950277, + "grad_norm": 0.3213563859462738, + "learning_rate": 8.817242686144793e-05, + "loss": 1.7273, + "step": 24080 + }, + { + "epoch": 1.3055464291157892, + "grad_norm": 0.3261583149433136, + "learning_rate": 8.816172274144013e-05, + "loss": 1.7208, + "step": 24090 + }, + { + "epoch": 1.30603417616475, + "eval_loss": 2.560994863510132, + "eval_runtime": 21.9922, + "eval_samples_per_second": 227.353, + "eval_steps_per_second": 1.228, + "step": 24099 + }, + { + "epoch": 1.3060883702813013, + "grad_norm": 0.35049372911453247, + "learning_rate": 8.815101451350207e-05, + "loss": 1.7158, + "step": 24100 + }, + { + "epoch": 1.3066303114468136, + "grad_norm": 0.253828227519989, + "learning_rate": 8.814030217896026e-05, + "loss": 1.7282, + "step": 24110 + }, + { + "epoch": 1.3071722526123257, + "grad_norm": 0.32618460059165955, + "learning_rate": 8.81295857391417e-05, + "loss": 1.7182, + "step": 24120 + }, + { + "epoch": 1.307714193777838, + "grad_norm": 0.2585597634315491, + "learning_rate": 8.811886519537391e-05, + "loss": 1.7322, + "step": 24130 + }, + { + "epoch": 1.3082561349433501, + "grad_norm": 0.29455363750457764, + "learning_rate": 8.810814054898488e-05, + "loss": 1.71, + "step": 24140 + }, + { + "epoch": 1.3087980761088625, + "grad_norm": 0.32605740427970886, + "learning_rate": 8.809741180130313e-05, + "loss": 1.7158, + "step": 24150 + }, + { + "epoch": 1.3093400172743745, + "grad_norm": 0.422584593296051, + "learning_rate": 8.808667895365771e-05, + "loss": 1.717, + "step": 24160 + }, + { + "epoch": 1.3098819584398869, + "grad_norm": 0.3378809988498688, + "learning_rate": 8.807594200737815e-05, + "loss": 1.7302, + "step": 24170 + }, + { + "epoch": 1.3104238996053992, + "grad_norm": 0.4901425242424011, + "learning_rate": 8.806520096379448e-05, + "loss": 1.7179, + "step": 24180 + }, + { + "epoch": 1.3107490643047064, + "eval_loss": 2.562967300415039, + "eval_runtime": 21.9941, + "eval_samples_per_second": 227.334, + "eval_steps_per_second": 1.228, + "step": 24186 + }, + { + "epoch": 1.3109658407709113, + "grad_norm": 0.39655086398124695, + "learning_rate": 8.805445582423728e-05, + "loss": 1.7215, + "step": 24190 + }, + { + "epoch": 1.3115077819364236, + "grad_norm": 0.3672724962234497, + "learning_rate": 8.804370659003762e-05, + "loss": 1.7214, + "step": 24200 + }, + { + "epoch": 1.3120497231019357, + "grad_norm": 0.4074071943759918, + "learning_rate": 8.803295326252701e-05, + "loss": 1.7172, + "step": 24210 + }, + { + "epoch": 1.312591664267448, + "grad_norm": 0.4320417046546936, + "learning_rate": 8.802219584303758e-05, + "loss": 1.7183, + "step": 24220 + }, + { + "epoch": 1.3131336054329603, + "grad_norm": 0.5554030537605286, + "learning_rate": 8.80114343329019e-05, + "loss": 1.7136, + "step": 24230 + }, + { + "epoch": 1.3136755465984724, + "grad_norm": 0.3394809663295746, + "learning_rate": 8.800066873345306e-05, + "loss": 1.7206, + "step": 24240 + }, + { + "epoch": 1.3142174877639845, + "grad_norm": 0.2953517436981201, + "learning_rate": 8.798989904602465e-05, + "loss": 1.7286, + "step": 24250 + }, + { + "epoch": 1.3147594289294968, + "grad_norm": 0.3113909959793091, + "learning_rate": 8.797912527195078e-05, + "loss": 1.7155, + "step": 24260 + }, + { + "epoch": 1.3153013700950091, + "grad_norm": 0.26612791419029236, + "learning_rate": 8.796834741256605e-05, + "loss": 1.7277, + "step": 24270 + }, + { + "epoch": 1.3154639524446627, + "eval_loss": 2.5562336444854736, + "eval_runtime": 21.9909, + "eval_samples_per_second": 227.367, + "eval_steps_per_second": 1.228, + "step": 24273 + }, + { + "epoch": 1.3158433112605212, + "grad_norm": 0.3873152434825897, + "learning_rate": 8.795756546920556e-05, + "loss": 1.7147, + "step": 24280 + }, + { + "epoch": 1.3163852524260335, + "grad_norm": 0.30190661549568176, + "learning_rate": 8.794677944320497e-05, + "loss": 1.7135, + "step": 24290 + }, + { + "epoch": 1.3169271935915456, + "grad_norm": 0.6682907342910767, + "learning_rate": 8.793598933590036e-05, + "loss": 1.729, + "step": 24300 + }, + { + "epoch": 1.317469134757058, + "grad_norm": 0.280906617641449, + "learning_rate": 8.79251951486284e-05, + "loss": 1.7061, + "step": 24310 + }, + { + "epoch": 1.3180110759225703, + "grad_norm": 0.5581380724906921, + "learning_rate": 8.79143968827262e-05, + "loss": 1.7343, + "step": 24320 + }, + { + "epoch": 1.3185530170880824, + "grad_norm": 0.4470410645008087, + "learning_rate": 8.790359453953145e-05, + "loss": 1.7278, + "step": 24330 + }, + { + "epoch": 1.3190949582535945, + "grad_norm": 0.4987182021141052, + "learning_rate": 8.789278812038222e-05, + "loss": 1.7266, + "step": 24340 + }, + { + "epoch": 1.3196368994191068, + "grad_norm": 1.2388598918914795, + "learning_rate": 8.788197762661723e-05, + "loss": 1.7193, + "step": 24350 + }, + { + "epoch": 1.320178840584619, + "grad_norm": 0.8786284327507019, + "learning_rate": 8.78711630595756e-05, + "loss": 1.7217, + "step": 24360 + }, + { + "epoch": 1.320178840584619, + "eval_loss": 2.568453073501587, + "eval_runtime": 21.9891, + "eval_samples_per_second": 227.386, + "eval_steps_per_second": 1.228, + "step": 24360 + }, + { + "epoch": 1.3207207817501312, + "grad_norm": 0.41298526525497437, + "learning_rate": 8.7860344420597e-05, + "loss": 1.7191, + "step": 24370 + }, + { + "epoch": 1.3212627229156435, + "grad_norm": 0.27922841906547546, + "learning_rate": 8.78495217110216e-05, + "loss": 1.7214, + "step": 24380 + }, + { + "epoch": 1.3218046640811556, + "grad_norm": 0.3357425034046173, + "learning_rate": 8.783869493219008e-05, + "loss": 1.7165, + "step": 24390 + }, + { + "epoch": 1.322346605246668, + "grad_norm": 0.6415514945983887, + "learning_rate": 8.782786408544358e-05, + "loss": 1.7259, + "step": 24400 + }, + { + "epoch": 1.3228885464121802, + "grad_norm": 0.5723456144332886, + "learning_rate": 8.78170291721238e-05, + "loss": 1.7221, + "step": 24410 + }, + { + "epoch": 1.3234304875776923, + "grad_norm": 0.27751269936561584, + "learning_rate": 8.780619019357295e-05, + "loss": 1.7241, + "step": 24420 + }, + { + "epoch": 1.3239724287432046, + "grad_norm": 0.5434432029724121, + "learning_rate": 8.779534715113368e-05, + "loss": 1.7284, + "step": 24430 + }, + { + "epoch": 1.3245143699087167, + "grad_norm": 0.3848007917404175, + "learning_rate": 8.778450004614918e-05, + "loss": 1.7164, + "step": 24440 + }, + { + "epoch": 1.3248937287245752, + "eval_loss": 2.563822031021118, + "eval_runtime": 21.9907, + "eval_samples_per_second": 227.369, + "eval_steps_per_second": 1.228, + "step": 24447 + }, + { + "epoch": 1.325056311074229, + "grad_norm": 0.3684322237968445, + "learning_rate": 8.777364887996315e-05, + "loss": 1.7197, + "step": 24450 + }, + { + "epoch": 1.3255982522397411, + "grad_norm": 0.3419678807258606, + "learning_rate": 8.77627936539198e-05, + "loss": 1.7221, + "step": 24460 + }, + { + "epoch": 1.3261401934052535, + "grad_norm": 0.2709847688674927, + "learning_rate": 8.77519343693638e-05, + "loss": 1.7159, + "step": 24470 + }, + { + "epoch": 1.3266821345707656, + "grad_norm": 0.29516398906707764, + "learning_rate": 8.774107102764038e-05, + "loss": 1.72, + "step": 24480 + }, + { + "epoch": 1.3272240757362779, + "grad_norm": 0.29573437571525574, + "learning_rate": 8.773020363009521e-05, + "loss": 1.7187, + "step": 24490 + }, + { + "epoch": 1.3277660169017902, + "grad_norm": 0.44336891174316406, + "learning_rate": 8.771933217807453e-05, + "loss": 1.7106, + "step": 24500 + }, + { + "epoch": 1.3283079580673023, + "grad_norm": 0.3083667755126953, + "learning_rate": 8.770845667292503e-05, + "loss": 1.7229, + "step": 24510 + }, + { + "epoch": 1.3288498992328146, + "grad_norm": 0.4367120862007141, + "learning_rate": 8.769757711599391e-05, + "loss": 1.7231, + "step": 24520 + }, + { + "epoch": 1.3293918403983267, + "grad_norm": 0.40371039509773254, + "learning_rate": 8.768669350862892e-05, + "loss": 1.7188, + "step": 24530 + }, + { + "epoch": 1.3296086168645316, + "eval_loss": 2.572169542312622, + "eval_runtime": 21.9901, + "eval_samples_per_second": 227.375, + "eval_steps_per_second": 1.228, + "step": 24534 + }, + { + "epoch": 1.329933781563839, + "grad_norm": 0.7102428674697876, + "learning_rate": 8.767580585217823e-05, + "loss": 1.7157, + "step": 24540 + }, + { + "epoch": 1.3304757227293513, + "grad_norm": 0.35714074969291687, + "learning_rate": 8.766491414799057e-05, + "loss": 1.7066, + "step": 24550 + }, + { + "epoch": 1.3310176638948634, + "grad_norm": 0.29054415225982666, + "learning_rate": 8.765401839741517e-05, + "loss": 1.7183, + "step": 24560 + }, + { + "epoch": 1.3315596050603755, + "grad_norm": 0.26257362961769104, + "learning_rate": 8.764311860180175e-05, + "loss": 1.7163, + "step": 24570 + }, + { + "epoch": 1.3321015462258878, + "grad_norm": 0.3010044991970062, + "learning_rate": 8.763221476250051e-05, + "loss": 1.7215, + "step": 24580 + }, + { + "epoch": 1.3326434873914001, + "grad_norm": 0.33944013714790344, + "learning_rate": 8.762130688086219e-05, + "loss": 1.72, + "step": 24590 + }, + { + "epoch": 1.3331854285569122, + "grad_norm": 0.5436154007911682, + "learning_rate": 8.761039495823799e-05, + "loss": 1.7121, + "step": 24600 + }, + { + "epoch": 1.3337273697224246, + "grad_norm": 0.4238379895687103, + "learning_rate": 8.759947899597964e-05, + "loss": 1.7195, + "step": 24610 + }, + { + "epoch": 1.3342693108879367, + "grad_norm": 0.2585737705230713, + "learning_rate": 8.758855899543939e-05, + "loss": 1.7015, + "step": 24620 + }, + { + "epoch": 1.334323505004488, + "eval_loss": 2.5573627948760986, + "eval_runtime": 21.9891, + "eval_samples_per_second": 227.386, + "eval_steps_per_second": 1.228, + "step": 24621 + }, + { + "epoch": 1.334811252053449, + "grad_norm": 0.35536953806877136, + "learning_rate": 8.75776349579699e-05, + "loss": 1.7164, + "step": 24630 + }, + { + "epoch": 1.3353531932189613, + "grad_norm": 0.4927321672439575, + "learning_rate": 8.756670688492445e-05, + "loss": 1.7181, + "step": 24640 + }, + { + "epoch": 1.3358951343844734, + "grad_norm": 0.3061029613018036, + "learning_rate": 8.755577477765674e-05, + "loss": 1.7136, + "step": 24650 + }, + { + "epoch": 1.3364370755499855, + "grad_norm": 0.27562111616134644, + "learning_rate": 8.7544838637521e-05, + "loss": 1.7222, + "step": 24660 + }, + { + "epoch": 1.3369790167154978, + "grad_norm": 0.46605604887008667, + "learning_rate": 8.753389846587194e-05, + "loss": 1.7085, + "step": 24670 + }, + { + "epoch": 1.33752095788101, + "grad_norm": 0.47294819355010986, + "learning_rate": 8.752295426406479e-05, + "loss": 1.711, + "step": 24680 + }, + { + "epoch": 1.3380628990465222, + "grad_norm": 0.42514827847480774, + "learning_rate": 8.751200603345524e-05, + "loss": 1.7081, + "step": 24690 + }, + { + "epoch": 1.3386048402120345, + "grad_norm": 0.46494096517562866, + "learning_rate": 8.750105377539957e-05, + "loss": 1.6989, + "step": 24700 + }, + { + "epoch": 1.3390383931444443, + "eval_loss": 2.5702195167541504, + "eval_runtime": 21.9879, + "eval_samples_per_second": 227.398, + "eval_steps_per_second": 1.228, + "step": 24708 + }, + { + "epoch": 1.3391467813775466, + "grad_norm": 0.36518365144729614, + "learning_rate": 8.749009749125445e-05, + "loss": 1.7115, + "step": 24710 + }, + { + "epoch": 1.339688722543059, + "grad_norm": 0.32633692026138306, + "learning_rate": 8.747913718237712e-05, + "loss": 1.7166, + "step": 24720 + }, + { + "epoch": 1.3402306637085712, + "grad_norm": 0.370088130235672, + "learning_rate": 8.746817285012527e-05, + "loss": 1.714, + "step": 24730 + }, + { + "epoch": 1.3407726048740833, + "grad_norm": 0.25269201397895813, + "learning_rate": 8.745720449585714e-05, + "loss": 1.7212, + "step": 24740 + }, + { + "epoch": 1.3413145460395957, + "grad_norm": 0.3450757563114166, + "learning_rate": 8.744623212093142e-05, + "loss": 1.7241, + "step": 24750 + }, + { + "epoch": 1.3418564872051078, + "grad_norm": 0.26472437381744385, + "learning_rate": 8.743525572670734e-05, + "loss": 1.7087, + "step": 24760 + }, + { + "epoch": 1.34239842837062, + "grad_norm": 0.5389896035194397, + "learning_rate": 8.74242753145446e-05, + "loss": 1.7157, + "step": 24770 + }, + { + "epoch": 1.3429403695361322, + "grad_norm": 0.6463091969490051, + "learning_rate": 8.74132908858034e-05, + "loss": 1.7056, + "step": 24780 + }, + { + "epoch": 1.3434823107016445, + "grad_norm": 0.40934550762176514, + "learning_rate": 8.740230244184448e-05, + "loss": 1.7137, + "step": 24790 + }, + { + "epoch": 1.3437532812844006, + "eval_loss": 2.5633254051208496, + "eval_runtime": 21.9886, + "eval_samples_per_second": 227.391, + "eval_steps_per_second": 1.228, + "step": 24795 + }, + { + "epoch": 1.3440242518671566, + "grad_norm": 0.4404658377170563, + "learning_rate": 8.739130998402898e-05, + "loss": 1.7126, + "step": 24800 + }, + { + "epoch": 1.3445661930326689, + "grad_norm": 0.37851405143737793, + "learning_rate": 8.738031351371863e-05, + "loss": 1.7127, + "step": 24810 + }, + { + "epoch": 1.3451081341981812, + "grad_norm": 0.30251404643058777, + "learning_rate": 8.736931303227563e-05, + "loss": 1.7293, + "step": 24820 + }, + { + "epoch": 1.3456500753636933, + "grad_norm": 0.5295489430427551, + "learning_rate": 8.735830854106267e-05, + "loss": 1.7092, + "step": 24830 + }, + { + "epoch": 1.3461920165292056, + "grad_norm": 0.54742431640625, + "learning_rate": 8.734730004144292e-05, + "loss": 1.7201, + "step": 24840 + }, + { + "epoch": 1.3467339576947177, + "grad_norm": 0.6740279793739319, + "learning_rate": 8.733628753478009e-05, + "loss": 1.7187, + "step": 24850 + }, + { + "epoch": 1.34727589886023, + "grad_norm": 0.4922647476196289, + "learning_rate": 8.732527102243835e-05, + "loss": 1.7185, + "step": 24860 + }, + { + "epoch": 1.3478178400257423, + "grad_norm": 0.34301790595054626, + "learning_rate": 8.731425050578238e-05, + "loss": 1.709, + "step": 24870 + }, + { + "epoch": 1.3483597811912544, + "grad_norm": 0.6136478185653687, + "learning_rate": 8.730322598617734e-05, + "loss": 1.7129, + "step": 24880 + }, + { + "epoch": 1.3484681694243568, + "eval_loss": 2.5651490688323975, + "eval_runtime": 21.9924, + "eval_samples_per_second": 227.351, + "eval_steps_per_second": 1.228, + "step": 24882 + }, + { + "epoch": 1.3489017223567665, + "grad_norm": 0.44811177253723145, + "learning_rate": 8.729219746498892e-05, + "loss": 1.7175, + "step": 24890 + }, + { + "epoch": 1.3494436635222788, + "grad_norm": 0.35493898391723633, + "learning_rate": 8.728116494358325e-05, + "loss": 1.7221, + "step": 24900 + }, + { + "epoch": 1.3499856046877912, + "grad_norm": 0.3304137885570526, + "learning_rate": 8.727012842332706e-05, + "loss": 1.7252, + "step": 24910 + }, + { + "epoch": 1.3505275458533033, + "grad_norm": 0.2520520091056824, + "learning_rate": 8.725908790558746e-05, + "loss": 1.7111, + "step": 24920 + }, + { + "epoch": 1.3510694870188156, + "grad_norm": 0.48782554268836975, + "learning_rate": 8.724804339173208e-05, + "loss": 1.7112, + "step": 24930 + }, + { + "epoch": 1.3516114281843277, + "grad_norm": 0.3623749017715454, + "learning_rate": 8.72369948831291e-05, + "loss": 1.7057, + "step": 24940 + }, + { + "epoch": 1.35215336934984, + "grad_norm": 0.31124410033226013, + "learning_rate": 8.722594238114716e-05, + "loss": 1.702, + "step": 24950 + }, + { + "epoch": 1.3526953105153523, + "grad_norm": 0.3612881600856781, + "learning_rate": 8.721488588715539e-05, + "loss": 1.7136, + "step": 24960 + }, + { + "epoch": 1.3531830575643131, + "eval_loss": 2.5710861682891846, + "eval_runtime": 21.9923, + "eval_samples_per_second": 227.352, + "eval_steps_per_second": 1.228, + "step": 24969 + }, + { + "epoch": 1.3532372516808644, + "grad_norm": 0.3691853880882263, + "learning_rate": 8.720382540252341e-05, + "loss": 1.7128, + "step": 24970 + }, + { + "epoch": 1.3537791928463765, + "grad_norm": 0.33288514614105225, + "learning_rate": 8.719276092862137e-05, + "loss": 1.6998, + "step": 24980 + }, + { + "epoch": 1.3543211340118888, + "grad_norm": 0.28774330019950867, + "learning_rate": 8.718169246681986e-05, + "loss": 1.7154, + "step": 24990 + }, + { + "epoch": 1.3548630751774011, + "grad_norm": 0.44046953320503235, + "learning_rate": 8.717062001849001e-05, + "loss": 1.6982, + "step": 25000 + }, + { + "epoch": 1.3554050163429132, + "grad_norm": 0.40893176198005676, + "learning_rate": 8.715954358500342e-05, + "loss": 1.7179, + "step": 25010 + }, + { + "epoch": 1.3559469575084255, + "grad_norm": 0.46221449971199036, + "learning_rate": 8.714846316773218e-05, + "loss": 1.7108, + "step": 25020 + }, + { + "epoch": 1.3564888986739376, + "grad_norm": 0.4240012466907501, + "learning_rate": 8.713737876804891e-05, + "loss": 1.7114, + "step": 25030 + }, + { + "epoch": 1.35703083983945, + "grad_norm": 0.39423850178718567, + "learning_rate": 8.71262903873267e-05, + "loss": 1.7035, + "step": 25040 + }, + { + "epoch": 1.3575727810049623, + "grad_norm": 0.26944607496261597, + "learning_rate": 8.711519802693911e-05, + "loss": 1.7109, + "step": 25050 + }, + { + "epoch": 1.3578979457042695, + "eval_loss": 2.564357280731201, + "eval_runtime": 21.9966, + "eval_samples_per_second": 227.308, + "eval_steps_per_second": 1.227, + "step": 25056 + }, + { + "epoch": 1.3581147221704744, + "grad_norm": 0.353392630815506, + "learning_rate": 8.710410168826021e-05, + "loss": 1.7087, + "step": 25060 + }, + { + "epoch": 1.3586566633359867, + "grad_norm": 0.41630539298057556, + "learning_rate": 8.709300137266456e-05, + "loss": 1.7018, + "step": 25070 + }, + { + "epoch": 1.3591986045014988, + "grad_norm": 0.3117140233516693, + "learning_rate": 8.708189708152727e-05, + "loss": 1.7139, + "step": 25080 + }, + { + "epoch": 1.359740545667011, + "grad_norm": 0.522264301776886, + "learning_rate": 8.707078881622381e-05, + "loss": 1.7087, + "step": 25090 + }, + { + "epoch": 1.3602824868325232, + "grad_norm": 0.2718927562236786, + "learning_rate": 8.70596765781303e-05, + "loss": 1.702, + "step": 25100 + }, + { + "epoch": 1.3608244279980355, + "grad_norm": 0.5048019289970398, + "learning_rate": 8.704856036862325e-05, + "loss": 1.7174, + "step": 25110 + }, + { + "epoch": 1.3613663691635476, + "grad_norm": 0.29533931612968445, + "learning_rate": 8.703744018907966e-05, + "loss": 1.7028, + "step": 25120 + }, + { + "epoch": 1.36190831032906, + "grad_norm": 0.6198928356170654, + "learning_rate": 8.702631604087709e-05, + "loss": 1.7008, + "step": 25130 + }, + { + "epoch": 1.3624502514945722, + "grad_norm": 0.3209061026573181, + "learning_rate": 8.701518792539351e-05, + "loss": 1.7072, + "step": 25140 + }, + { + "epoch": 1.3626128338442258, + "eval_loss": 2.5585379600524902, + "eval_runtime": 21.9846, + "eval_samples_per_second": 227.432, + "eval_steps_per_second": 1.228, + "step": 25143 + }, + { + "epoch": 1.3629921926600843, + "grad_norm": 0.2755369246006012, + "learning_rate": 8.700405584400747e-05, + "loss": 1.7093, + "step": 25150 + }, + { + "epoch": 1.3635341338255966, + "grad_norm": 0.24208010733127594, + "learning_rate": 8.699291979809794e-05, + "loss": 1.7136, + "step": 25160 + }, + { + "epoch": 1.3640760749911087, + "grad_norm": 0.44155067205429077, + "learning_rate": 8.69817797890444e-05, + "loss": 1.6991, + "step": 25170 + }, + { + "epoch": 1.364618016156621, + "grad_norm": 0.24571721255779266, + "learning_rate": 8.697063581822682e-05, + "loss": 1.6956, + "step": 25180 + }, + { + "epoch": 1.3651599573221331, + "grad_norm": 0.2612999975681305, + "learning_rate": 8.695948788702568e-05, + "loss": 1.7069, + "step": 25190 + }, + { + "epoch": 1.3657018984876454, + "grad_norm": 0.47256600856781006, + "learning_rate": 8.694833599682194e-05, + "loss": 1.6981, + "step": 25200 + }, + { + "epoch": 1.3662438396531575, + "grad_norm": 0.3326663374900818, + "learning_rate": 8.693718014899705e-05, + "loss": 1.7145, + "step": 25210 + }, + { + "epoch": 1.3667857808186699, + "grad_norm": 0.6608891487121582, + "learning_rate": 8.692602034493292e-05, + "loss": 1.6981, + "step": 25220 + }, + { + "epoch": 1.3673277219841822, + "grad_norm": 0.3640691041946411, + "learning_rate": 8.691485658601203e-05, + "loss": 1.7105, + "step": 25230 + }, + { + "epoch": 1.3673277219841822, + "eval_loss": 2.5621073246002197, + "eval_runtime": 21.9283, + "eval_samples_per_second": 228.016, + "eval_steps_per_second": 1.231, + "step": 25230 + }, + { + "epoch": 1.3678696631496943, + "grad_norm": 0.26905468106269836, + "learning_rate": 8.690368887361724e-05, + "loss": 1.7063, + "step": 25240 + }, + { + "epoch": 1.3684116043152066, + "grad_norm": 0.33112001419067383, + "learning_rate": 8.689251720913199e-05, + "loss": 1.7043, + "step": 25250 + }, + { + "epoch": 1.3689535454807187, + "grad_norm": 0.6896092891693115, + "learning_rate": 8.688134159394017e-05, + "loss": 1.7025, + "step": 25260 + }, + { + "epoch": 1.369495486646231, + "grad_norm": 0.26553961634635925, + "learning_rate": 8.687016202942617e-05, + "loss": 1.7046, + "step": 25270 + }, + { + "epoch": 1.3700374278117433, + "grad_norm": 0.6590327024459839, + "learning_rate": 8.685897851697486e-05, + "loss": 1.6894, + "step": 25280 + }, + { + "epoch": 1.3705793689772554, + "grad_norm": 0.264779657125473, + "learning_rate": 8.684779105797164e-05, + "loss": 1.7135, + "step": 25290 + }, + { + "epoch": 1.3711213101427675, + "grad_norm": 0.29008379578590393, + "learning_rate": 8.68365996538023e-05, + "loss": 1.7019, + "step": 25300 + }, + { + "epoch": 1.3716632513082798, + "grad_norm": 0.28447601199150085, + "learning_rate": 8.682540430585322e-05, + "loss": 1.7155, + "step": 25310 + }, + { + "epoch": 1.3720426101241383, + "eval_loss": 2.554466962814331, + "eval_runtime": 21.9935, + "eval_samples_per_second": 227.34, + "eval_steps_per_second": 1.228, + "step": 25317 + }, + { + "epoch": 1.3722051924737921, + "grad_norm": 0.4446844160556793, + "learning_rate": 8.681420501551123e-05, + "loss": 1.6982, + "step": 25320 + }, + { + "epoch": 1.3727471336393042, + "grad_norm": 0.3506196439266205, + "learning_rate": 8.680300178416364e-05, + "loss": 1.6931, + "step": 25330 + }, + { + "epoch": 1.3732890748048165, + "grad_norm": 0.28431805968284607, + "learning_rate": 8.679179461319828e-05, + "loss": 1.7094, + "step": 25340 + }, + { + "epoch": 1.3738310159703286, + "grad_norm": 0.3544039726257324, + "learning_rate": 8.678058350400342e-05, + "loss": 1.6993, + "step": 25350 + }, + { + "epoch": 1.374372957135841, + "grad_norm": 0.4839727580547333, + "learning_rate": 8.676936845796785e-05, + "loss": 1.712, + "step": 25360 + }, + { + "epoch": 1.3749148983013533, + "grad_norm": 0.34992125630378723, + "learning_rate": 8.675814947648085e-05, + "loss": 1.7031, + "step": 25370 + }, + { + "epoch": 1.3754568394668654, + "grad_norm": 0.37806737422943115, + "learning_rate": 8.674692656093216e-05, + "loss": 1.7053, + "step": 25380 + }, + { + "epoch": 1.3759987806323775, + "grad_norm": 0.24221035838127136, + "learning_rate": 8.673569971271206e-05, + "loss": 1.702, + "step": 25390 + }, + { + "epoch": 1.3765407217978898, + "grad_norm": 0.5913215279579163, + "learning_rate": 8.672446893321125e-05, + "loss": 1.6931, + "step": 25400 + }, + { + "epoch": 1.3767574982640947, + "eval_loss": 2.552272081375122, + "eval_runtime": 21.9916, + "eval_samples_per_second": 227.36, + "eval_steps_per_second": 1.228, + "step": 25404 + }, + { + "epoch": 1.377082662963402, + "grad_norm": 0.29546642303466797, + "learning_rate": 8.671323422382097e-05, + "loss": 1.7001, + "step": 25410 + }, + { + "epoch": 1.3776246041289142, + "grad_norm": 0.48717227578163147, + "learning_rate": 8.67019955859329e-05, + "loss": 1.7133, + "step": 25420 + }, + { + "epoch": 1.3781665452944265, + "grad_norm": 0.3169041872024536, + "learning_rate": 8.669075302093928e-05, + "loss": 1.6972, + "step": 25430 + }, + { + "epoch": 1.3787084864599386, + "grad_norm": 0.36612847447395325, + "learning_rate": 8.667950653023274e-05, + "loss": 1.6897, + "step": 25440 + }, + { + "epoch": 1.379250427625451, + "grad_norm": 0.2583743929862976, + "learning_rate": 8.666825611520648e-05, + "loss": 1.7026, + "step": 25450 + }, + { + "epoch": 1.3797923687909632, + "grad_norm": 0.34147560596466064, + "learning_rate": 8.665700177725415e-05, + "loss": 1.6969, + "step": 25460 + }, + { + "epoch": 1.3803343099564753, + "grad_norm": 0.43001192808151245, + "learning_rate": 8.664574351776988e-05, + "loss": 1.708, + "step": 25470 + }, + { + "epoch": 1.3808762511219876, + "grad_norm": 0.41845470666885376, + "learning_rate": 8.663448133814826e-05, + "loss": 1.7032, + "step": 25480 + }, + { + "epoch": 1.3814181922874997, + "grad_norm": 0.3702855706214905, + "learning_rate": 8.662321523978448e-05, + "loss": 1.6965, + "step": 25490 + }, + { + "epoch": 1.381472386404051, + "eval_loss": 2.5554749965667725, + "eval_runtime": 21.99, + "eval_samples_per_second": 227.376, + "eval_steps_per_second": 1.228, + "step": 25491 + }, + { + "epoch": 1.381960133453012, + "grad_norm": 0.4040187895298004, + "learning_rate": 8.661194522407408e-05, + "loss": 1.7025, + "step": 25500 + }, + { + "epoch": 1.3825020746185241, + "grad_norm": 0.5149408578872681, + "learning_rate": 8.660067129241315e-05, + "loss": 1.698, + "step": 25510 + }, + { + "epoch": 1.3830440157840365, + "grad_norm": 0.328133761882782, + "learning_rate": 8.658939344619824e-05, + "loss": 1.7068, + "step": 25520 + }, + { + "epoch": 1.3835859569495486, + "grad_norm": 0.34496381878852844, + "learning_rate": 8.657811168682644e-05, + "loss": 1.7131, + "step": 25530 + }, + { + "epoch": 1.3841278981150609, + "grad_norm": 0.29763221740722656, + "learning_rate": 8.656682601569524e-05, + "loss": 1.6992, + "step": 25540 + }, + { + "epoch": 1.3846698392805732, + "grad_norm": 0.3687681257724762, + "learning_rate": 8.655553643420268e-05, + "loss": 1.7021, + "step": 25550 + }, + { + "epoch": 1.3852117804460853, + "grad_norm": 0.36310508847236633, + "learning_rate": 8.654424294374729e-05, + "loss": 1.6958, + "step": 25560 + }, + { + "epoch": 1.3857537216115976, + "grad_norm": 0.7248652577400208, + "learning_rate": 8.653294554572802e-05, + "loss": 1.7009, + "step": 25570 + }, + { + "epoch": 1.3861872745440074, + "eval_loss": 2.5556063652038574, + "eval_runtime": 21.9953, + "eval_samples_per_second": 227.321, + "eval_steps_per_second": 1.228, + "step": 25578 + }, + { + "epoch": 1.3862956627771097, + "grad_norm": 0.2832872271537781, + "learning_rate": 8.652164424154438e-05, + "loss": 1.6906, + "step": 25580 + }, + { + "epoch": 1.386837603942622, + "grad_norm": 0.6250040531158447, + "learning_rate": 8.651033903259629e-05, + "loss": 1.6885, + "step": 25590 + }, + { + "epoch": 1.3873795451081343, + "grad_norm": 0.36408352851867676, + "learning_rate": 8.649902992028419e-05, + "loss": 1.697, + "step": 25600 + }, + { + "epoch": 1.3879214862736464, + "grad_norm": 0.344611257314682, + "learning_rate": 8.648771690600905e-05, + "loss": 1.7002, + "step": 25610 + }, + { + "epoch": 1.3884634274391585, + "grad_norm": 0.5186106562614441, + "learning_rate": 8.647639999117221e-05, + "loss": 1.6938, + "step": 25620 + }, + { + "epoch": 1.3890053686046708, + "grad_norm": 0.3214333951473236, + "learning_rate": 8.646507917717562e-05, + "loss": 1.6959, + "step": 25630 + }, + { + "epoch": 1.3895473097701831, + "grad_norm": 0.31978529691696167, + "learning_rate": 8.645375446542162e-05, + "loss": 1.6994, + "step": 25640 + }, + { + "epoch": 1.3900892509356952, + "grad_norm": 0.3708828389644623, + "learning_rate": 8.644242585731309e-05, + "loss": 1.7054, + "step": 25650 + }, + { + "epoch": 1.3906311921012076, + "grad_norm": 0.2623041272163391, + "learning_rate": 8.643109335425335e-05, + "loss": 1.6969, + "step": 25660 + }, + { + "epoch": 1.3909021626839637, + "eval_loss": 2.5598154067993164, + "eval_runtime": 21.9908, + "eval_samples_per_second": 227.368, + "eval_steps_per_second": 1.228, + "step": 25665 + }, + { + "epoch": 1.3911731332667197, + "grad_norm": 0.23545025289058685, + "learning_rate": 8.641975695764621e-05, + "loss": 1.6934, + "step": 25670 + }, + { + "epoch": 1.391715074432232, + "grad_norm": 0.31226447224617004, + "learning_rate": 8.640841666889599e-05, + "loss": 1.705, + "step": 25680 + }, + { + "epoch": 1.3922570155977443, + "grad_norm": 0.25354868173599243, + "learning_rate": 8.639707248940748e-05, + "loss": 1.6927, + "step": 25690 + }, + { + "epoch": 1.3927989567632564, + "grad_norm": 0.2758951783180237, + "learning_rate": 8.638572442058596e-05, + "loss": 1.6954, + "step": 25700 + }, + { + "epoch": 1.3933408979287685, + "grad_norm": 0.2809247672557831, + "learning_rate": 8.637437246383714e-05, + "loss": 1.691, + "step": 25710 + }, + { + "epoch": 1.3938828390942808, + "grad_norm": 0.27451246976852417, + "learning_rate": 8.636301662056727e-05, + "loss": 1.6959, + "step": 25720 + }, + { + "epoch": 1.394424780259793, + "grad_norm": 0.5022933483123779, + "learning_rate": 8.635165689218309e-05, + "loss": 1.6908, + "step": 25730 + }, + { + "epoch": 1.3949667214253052, + "grad_norm": 0.23696884512901306, + "learning_rate": 8.634029328009174e-05, + "loss": 1.6908, + "step": 25740 + }, + { + "epoch": 1.3955086625908175, + "grad_norm": 0.2913939356803894, + "learning_rate": 8.632892578570094e-05, + "loss": 1.6963, + "step": 25750 + }, + { + "epoch": 1.3956170508239198, + "eval_loss": 2.5667457580566406, + "eval_runtime": 21.9931, + "eval_samples_per_second": 227.344, + "eval_steps_per_second": 1.228, + "step": 25752 + }, + { + "epoch": 1.3960506037563296, + "grad_norm": 0.28175458312034607, + "learning_rate": 8.631755441041884e-05, + "loss": 1.7013, + "step": 25760 + }, + { + "epoch": 1.396592544921842, + "grad_norm": 0.4736407399177551, + "learning_rate": 8.630617915565405e-05, + "loss": 1.7113, + "step": 25770 + }, + { + "epoch": 1.3971344860873542, + "grad_norm": 0.33866068720817566, + "learning_rate": 8.629480002281572e-05, + "loss": 1.6952, + "step": 25780 + }, + { + "epoch": 1.3976764272528663, + "grad_norm": 0.2977088689804077, + "learning_rate": 8.628341701331341e-05, + "loss": 1.7036, + "step": 25790 + }, + { + "epoch": 1.3982183684183787, + "grad_norm": 0.26200008392333984, + "learning_rate": 8.627203012855723e-05, + "loss": 1.708, + "step": 25800 + }, + { + "epoch": 1.3987603095838907, + "grad_norm": 0.6198749542236328, + "learning_rate": 8.626063936995774e-05, + "loss": 1.6924, + "step": 25810 + }, + { + "epoch": 1.399302250749403, + "grad_norm": 0.6624993681907654, + "learning_rate": 8.624924473892595e-05, + "loss": 1.6978, + "step": 25820 + }, + { + "epoch": 1.3998441919149152, + "grad_norm": 0.32168132066726685, + "learning_rate": 8.62378462368734e-05, + "loss": 1.7015, + "step": 25830 + }, + { + "epoch": 1.4003319389638762, + "eval_loss": 2.560290575027466, + "eval_runtime": 21.9932, + "eval_samples_per_second": 227.343, + "eval_steps_per_second": 1.228, + "step": 25839 + }, + { + "epoch": 1.4003861330804275, + "grad_norm": 0.39742761850357056, + "learning_rate": 8.622644386521207e-05, + "loss": 1.6899, + "step": 25840 + }, + { + "epoch": 1.4009280742459396, + "grad_norm": 0.4070427417755127, + "learning_rate": 8.621503762535445e-05, + "loss": 1.6905, + "step": 25850 + }, + { + "epoch": 1.4014700154114519, + "grad_norm": 0.33487892150878906, + "learning_rate": 8.620362751871349e-05, + "loss": 1.6978, + "step": 25860 + }, + { + "epoch": 1.4020119565769642, + "grad_norm": 0.2614098787307739, + "learning_rate": 8.619221354670264e-05, + "loss": 1.682, + "step": 25870 + }, + { + "epoch": 1.4025538977424763, + "grad_norm": 0.3438650369644165, + "learning_rate": 8.618079571073578e-05, + "loss": 1.6924, + "step": 25880 + }, + { + "epoch": 1.4030958389079886, + "grad_norm": 0.3044370710849762, + "learning_rate": 8.616937401222734e-05, + "loss": 1.6948, + "step": 25890 + }, + { + "epoch": 1.4036377800735007, + "grad_norm": 0.3689815402030945, + "learning_rate": 8.615794845259215e-05, + "loss": 1.7012, + "step": 25900 + }, + { + "epoch": 1.404179721239013, + "grad_norm": 0.2958444356918335, + "learning_rate": 8.61465190332456e-05, + "loss": 1.6935, + "step": 25910 + }, + { + "epoch": 1.4047216624045253, + "grad_norm": 0.4378800392150879, + "learning_rate": 8.613508575560348e-05, + "loss": 1.6843, + "step": 25920 + }, + { + "epoch": 1.4050468271038326, + "eval_loss": 2.5524935722351074, + "eval_runtime": 21.9905, + "eval_samples_per_second": 227.371, + "eval_steps_per_second": 1.228, + "step": 25926 + }, + { + "epoch": 1.4052636035700374, + "grad_norm": 0.2674407362937927, + "learning_rate": 8.612364862108211e-05, + "loss": 1.6896, + "step": 25930 + }, + { + "epoch": 1.4058055447355495, + "grad_norm": 0.37812310457229614, + "learning_rate": 8.611220763109829e-05, + "loss": 1.6967, + "step": 25940 + }, + { + "epoch": 1.4063474859010618, + "grad_norm": 0.25375431776046753, + "learning_rate": 8.610076278706926e-05, + "loss": 1.6858, + "step": 25950 + }, + { + "epoch": 1.4068894270665742, + "grad_norm": 0.30164968967437744, + "learning_rate": 8.608931409041273e-05, + "loss": 1.695, + "step": 25960 + }, + { + "epoch": 1.4074313682320863, + "grad_norm": 0.3200394809246063, + "learning_rate": 8.607786154254699e-05, + "loss": 1.6881, + "step": 25970 + }, + { + "epoch": 1.4079733093975986, + "grad_norm": 0.3104444742202759, + "learning_rate": 8.606640514489067e-05, + "loss": 1.6972, + "step": 25980 + }, + { + "epoch": 1.4085152505631107, + "grad_norm": 0.3415814936161041, + "learning_rate": 8.605494489886297e-05, + "loss": 1.6989, + "step": 25990 + }, + { + "epoch": 1.409057191728623, + "grad_norm": 0.3201243281364441, + "learning_rate": 8.604348080588351e-05, + "loss": 1.6955, + "step": 26000 + }, + { + "epoch": 1.4095991328941353, + "grad_norm": 0.2451564520597458, + "learning_rate": 8.603201286737243e-05, + "loss": 1.6988, + "step": 26010 + }, + { + "epoch": 1.409761715243789, + "eval_loss": 2.557467222213745, + "eval_runtime": 21.9947, + "eval_samples_per_second": 227.328, + "eval_steps_per_second": 1.228, + "step": 26013 + }, + { + "epoch": 1.4101410740596474, + "grad_norm": 0.31296998262405396, + "learning_rate": 8.602054108475032e-05, + "loss": 1.6994, + "step": 26020 + }, + { + "epoch": 1.4106830152251595, + "grad_norm": 0.6278992891311646, + "learning_rate": 8.600906545943827e-05, + "loss": 1.7028, + "step": 26030 + }, + { + "epoch": 1.4112249563906718, + "grad_norm": 0.3286595642566681, + "learning_rate": 8.59975859928578e-05, + "loss": 1.7018, + "step": 26040 + }, + { + "epoch": 1.4117668975561841, + "grad_norm": 0.26818498969078064, + "learning_rate": 8.598610268643097e-05, + "loss": 1.686, + "step": 26050 + }, + { + "epoch": 1.4123088387216962, + "grad_norm": 0.3351867198944092, + "learning_rate": 8.597461554158025e-05, + "loss": 1.6841, + "step": 26060 + }, + { + "epoch": 1.4128507798872085, + "grad_norm": 0.43873295187950134, + "learning_rate": 8.596312455972866e-05, + "loss": 1.7027, + "step": 26070 + }, + { + "epoch": 1.4133927210527206, + "grad_norm": 0.3023853600025177, + "learning_rate": 8.595162974229963e-05, + "loss": 1.6949, + "step": 26080 + }, + { + "epoch": 1.413934662218233, + "grad_norm": 0.30039265751838684, + "learning_rate": 8.594013109071705e-05, + "loss": 1.6887, + "step": 26090 + }, + { + "epoch": 1.4144766033837453, + "grad_norm": 0.32320213317871094, + "learning_rate": 8.59286286064054e-05, + "loss": 1.6913, + "step": 26100 + }, + { + "epoch": 1.4144766033837453, + "eval_loss": 2.557053327560425, + "eval_runtime": 21.9348, + "eval_samples_per_second": 227.948, + "eval_steps_per_second": 1.231, + "step": 26100 + }, + { + "epoch": 1.4150185445492574, + "grad_norm": 0.2611085772514343, + "learning_rate": 8.591712229078949e-05, + "loss": 1.6902, + "step": 26110 + }, + { + "epoch": 1.4155604857147697, + "grad_norm": 0.6301568150520325, + "learning_rate": 8.59056121452947e-05, + "loss": 1.6871, + "step": 26120 + }, + { + "epoch": 1.4161024268802818, + "grad_norm": 0.29995056986808777, + "learning_rate": 8.589409817134685e-05, + "loss": 1.6999, + "step": 26130 + }, + { + "epoch": 1.416644368045794, + "grad_norm": 0.30796995759010315, + "learning_rate": 8.588258037037227e-05, + "loss": 1.6911, + "step": 26140 + }, + { + "epoch": 1.4171863092113062, + "grad_norm": 0.28592872619628906, + "learning_rate": 8.58710587437977e-05, + "loss": 1.6882, + "step": 26150 + }, + { + "epoch": 1.4177282503768185, + "grad_norm": 0.264588862657547, + "learning_rate": 8.58595332930504e-05, + "loss": 1.6967, + "step": 26160 + }, + { + "epoch": 1.4182701915423306, + "grad_norm": 0.3666386306285858, + "learning_rate": 8.58480040195581e-05, + "loss": 1.682, + "step": 26170 + }, + { + "epoch": 1.418812132707843, + "grad_norm": 0.38330572843551636, + "learning_rate": 8.5836470924749e-05, + "loss": 1.6829, + "step": 26180 + }, + { + "epoch": 1.4191914915237014, + "eval_loss": 2.5586886405944824, + "eval_runtime": 21.9928, + "eval_samples_per_second": 227.348, + "eval_steps_per_second": 1.228, + "step": 26187 + }, + { + "epoch": 1.4193540738733552, + "grad_norm": 0.32240378856658936, + "learning_rate": 8.582493401005175e-05, + "loss": 1.6968, + "step": 26190 + }, + { + "epoch": 1.4198960150388673, + "grad_norm": 0.3624284267425537, + "learning_rate": 8.581339327689549e-05, + "loss": 1.7, + "step": 26200 + }, + { + "epoch": 1.4204379562043796, + "grad_norm": 0.3740536868572235, + "learning_rate": 8.580184872670987e-05, + "loss": 1.6942, + "step": 26210 + }, + { + "epoch": 1.4209798973698917, + "grad_norm": 0.5000831484794617, + "learning_rate": 8.579030036092495e-05, + "loss": 1.6869, + "step": 26220 + }, + { + "epoch": 1.421521838535404, + "grad_norm": 0.4658829867839813, + "learning_rate": 8.57787481809713e-05, + "loss": 1.6873, + "step": 26230 + }, + { + "epoch": 1.4220637797009164, + "grad_norm": 0.3881395161151886, + "learning_rate": 8.576719218827996e-05, + "loss": 1.6912, + "step": 26240 + }, + { + "epoch": 1.4226057208664284, + "grad_norm": 0.6568459868431091, + "learning_rate": 8.575563238428241e-05, + "loss": 1.6982, + "step": 26250 + }, + { + "epoch": 1.4231476620319405, + "grad_norm": 0.2987072765827179, + "learning_rate": 8.574406877041067e-05, + "loss": 1.6951, + "step": 26260 + }, + { + "epoch": 1.4236896031974529, + "grad_norm": 0.3211478590965271, + "learning_rate": 8.573250134809715e-05, + "loss": 1.7003, + "step": 26270 + }, + { + "epoch": 1.4239063796636577, + "eval_loss": 2.5507450103759766, + "eval_runtime": 21.9925, + "eval_samples_per_second": 227.35, + "eval_steps_per_second": 1.228, + "step": 26274 + }, + { + "epoch": 1.4242315443629652, + "grad_norm": 0.327347993850708, + "learning_rate": 8.57209301187748e-05, + "loss": 1.6815, + "step": 26280 + }, + { + "epoch": 1.4247734855284773, + "grad_norm": 0.3275546133518219, + "learning_rate": 8.5709355083877e-05, + "loss": 1.6945, + "step": 26290 + }, + { + "epoch": 1.4253154266939896, + "grad_norm": 0.36542415618896484, + "learning_rate": 8.569777624483763e-05, + "loss": 1.695, + "step": 26300 + }, + { + "epoch": 1.4258573678595017, + "grad_norm": 0.52955561876297, + "learning_rate": 8.568619360309102e-05, + "loss": 1.6802, + "step": 26310 + }, + { + "epoch": 1.426399309025014, + "grad_norm": 0.9618374109268188, + "learning_rate": 8.567460716007195e-05, + "loss": 1.6978, + "step": 26320 + }, + { + "epoch": 1.4269412501905263, + "grad_norm": 0.7096937298774719, + "learning_rate": 8.566301691721574e-05, + "loss": 1.6956, + "step": 26330 + }, + { + "epoch": 1.4274831913560384, + "grad_norm": 0.2932894825935364, + "learning_rate": 8.56514228759581e-05, + "loss": 1.6919, + "step": 26340 + }, + { + "epoch": 1.4280251325215505, + "grad_norm": 0.38948243856430054, + "learning_rate": 8.563982503773527e-05, + "loss": 1.6941, + "step": 26350 + }, + { + "epoch": 1.4285670736870628, + "grad_norm": 0.2520905137062073, + "learning_rate": 8.562822340398395e-05, + "loss": 1.6801, + "step": 26360 + }, + { + "epoch": 1.428621267803614, + "eval_loss": 2.5581541061401367, + "eval_runtime": 21.991, + "eval_samples_per_second": 227.366, + "eval_steps_per_second": 1.228, + "step": 26361 + }, + { + "epoch": 1.4291090148525751, + "grad_norm": 0.26686540246009827, + "learning_rate": 8.56166179761413e-05, + "loss": 1.6883, + "step": 26370 + }, + { + "epoch": 1.4296509560180872, + "grad_norm": 0.45527172088623047, + "learning_rate": 8.560500875564491e-05, + "loss": 1.6885, + "step": 26380 + }, + { + "epoch": 1.4301928971835995, + "grad_norm": 0.33151310682296753, + "learning_rate": 8.559339574393294e-05, + "loss": 1.6901, + "step": 26390 + }, + { + "epoch": 1.4307348383491116, + "grad_norm": 0.4582711458206177, + "learning_rate": 8.558177894244392e-05, + "loss": 1.6876, + "step": 26400 + }, + { + "epoch": 1.431276779514624, + "grad_norm": 0.23842747509479523, + "learning_rate": 8.557015835261688e-05, + "loss": 1.7054, + "step": 26410 + }, + { + "epoch": 1.4318187206801363, + "grad_norm": 0.4057098627090454, + "learning_rate": 8.555853397589136e-05, + "loss": 1.7013, + "step": 26420 + }, + { + "epoch": 1.4323606618456484, + "grad_norm": 0.4992932081222534, + "learning_rate": 8.554690581370732e-05, + "loss": 1.6931, + "step": 26430 + }, + { + "epoch": 1.4329026030111607, + "grad_norm": 0.4780980944633484, + "learning_rate": 8.553527386750521e-05, + "loss": 1.6896, + "step": 26440 + }, + { + "epoch": 1.4333361559435704, + "eval_loss": 2.556183099746704, + "eval_runtime": 21.9936, + "eval_samples_per_second": 227.339, + "eval_steps_per_second": 1.228, + "step": 26448 + }, + { + "epoch": 1.4334445441766728, + "grad_norm": 0.5446142554283142, + "learning_rate": 8.552363813872594e-05, + "loss": 1.6788, + "step": 26450 + }, + { + "epoch": 1.433986485342185, + "grad_norm": 0.48127421736717224, + "learning_rate": 8.551199862881091e-05, + "loss": 1.6734, + "step": 26460 + }, + { + "epoch": 1.4345284265076972, + "grad_norm": 0.30122366547584534, + "learning_rate": 8.550035533920194e-05, + "loss": 1.6897, + "step": 26470 + }, + { + "epoch": 1.4350703676732095, + "grad_norm": 0.36052316427230835, + "learning_rate": 8.548870827134139e-05, + "loss": 1.6886, + "step": 26480 + }, + { + "epoch": 1.4356123088387216, + "grad_norm": 0.47297677397727966, + "learning_rate": 8.547705742667203e-05, + "loss": 1.6943, + "step": 26490 + }, + { + "epoch": 1.436154250004234, + "grad_norm": 0.26182791590690613, + "learning_rate": 8.54654028066371e-05, + "loss": 1.6938, + "step": 26500 + }, + { + "epoch": 1.4366961911697462, + "grad_norm": 0.3834492564201355, + "learning_rate": 8.545374441268033e-05, + "loss": 1.6822, + "step": 26510 + }, + { + "epoch": 1.4372381323352583, + "grad_norm": 0.29216572642326355, + "learning_rate": 8.544208224624592e-05, + "loss": 1.6838, + "step": 26520 + }, + { + "epoch": 1.4377800735007706, + "grad_norm": 0.2463953047990799, + "learning_rate": 8.543041630877856e-05, + "loss": 1.6817, + "step": 26530 + }, + { + "epoch": 1.4380510440835268, + "eval_loss": 2.5639803409576416, + "eval_runtime": 21.9922, + "eval_samples_per_second": 227.353, + "eval_steps_per_second": 1.228, + "step": 26535 + }, + { + "epoch": 1.4383220146662827, + "grad_norm": 0.6637621521949768, + "learning_rate": 8.54187466017233e-05, + "loss": 1.6862, + "step": 26540 + }, + { + "epoch": 1.438863955831795, + "grad_norm": 0.33043143153190613, + "learning_rate": 8.540707312652581e-05, + "loss": 1.704, + "step": 26550 + }, + { + "epoch": 1.4394058969973074, + "grad_norm": 0.36580994725227356, + "learning_rate": 8.53953958846321e-05, + "loss": 1.6747, + "step": 26560 + }, + { + "epoch": 1.4399478381628195, + "grad_norm": 0.5655596256256104, + "learning_rate": 8.538371487748871e-05, + "loss": 1.688, + "step": 26570 + }, + { + "epoch": 1.4404897793283316, + "grad_norm": 0.41566041111946106, + "learning_rate": 8.537203010654266e-05, + "loss": 1.6908, + "step": 26580 + }, + { + "epoch": 1.4410317204938439, + "grad_norm": 0.3370110094547272, + "learning_rate": 8.536034157324135e-05, + "loss": 1.6906, + "step": 26590 + }, + { + "epoch": 1.4415736616593562, + "grad_norm": 0.4993593692779541, + "learning_rate": 8.534864927903274e-05, + "loss": 1.6829, + "step": 26600 + }, + { + "epoch": 1.4421156028248683, + "grad_norm": 0.28498584032058716, + "learning_rate": 8.533695322536523e-05, + "loss": 1.6933, + "step": 26610 + }, + { + "epoch": 1.4426575439903806, + "grad_norm": 0.7398360371589661, + "learning_rate": 8.532525341368767e-05, + "loss": 1.686, + "step": 26620 + }, + { + "epoch": 1.442765932223483, + "eval_loss": 2.5482335090637207, + "eval_runtime": 21.988, + "eval_samples_per_second": 227.397, + "eval_steps_per_second": 1.228, + "step": 26622 + }, + { + "epoch": 1.4431994851558927, + "grad_norm": 0.40166711807250977, + "learning_rate": 8.531354984544936e-05, + "loss": 1.6862, + "step": 26630 + }, + { + "epoch": 1.443741426321405, + "grad_norm": 0.43468061089515686, + "learning_rate": 8.530184252210012e-05, + "loss": 1.6952, + "step": 26640 + }, + { + "epoch": 1.4442833674869173, + "grad_norm": 0.39255911111831665, + "learning_rate": 8.529013144509017e-05, + "loss": 1.6928, + "step": 26650 + }, + { + "epoch": 1.4448253086524294, + "grad_norm": 0.3616562783718109, + "learning_rate": 8.527841661587024e-05, + "loss": 1.6716, + "step": 26660 + }, + { + "epoch": 1.4453672498179415, + "grad_norm": 0.5667160153388977, + "learning_rate": 8.526669803589153e-05, + "loss": 1.6842, + "step": 26670 + }, + { + "epoch": 1.4459091909834538, + "grad_norm": 0.6671094298362732, + "learning_rate": 8.525497570660567e-05, + "loss": 1.6756, + "step": 26680 + }, + { + "epoch": 1.4464511321489661, + "grad_norm": 0.4949670732021332, + "learning_rate": 8.524324962946477e-05, + "loss": 1.6857, + "step": 26690 + }, + { + "epoch": 1.4469930733144782, + "grad_norm": 0.2640099823474884, + "learning_rate": 8.52315198059214e-05, + "loss": 1.6939, + "step": 26700 + }, + { + "epoch": 1.4474808203634393, + "eval_loss": 2.541543960571289, + "eval_runtime": 21.9948, + "eval_samples_per_second": 227.326, + "eval_steps_per_second": 1.228, + "step": 26709 + }, + { + "epoch": 1.4475350144799906, + "grad_norm": 0.24292974174022675, + "learning_rate": 8.521978623742861e-05, + "loss": 1.6733, + "step": 26710 + }, + { + "epoch": 1.4480769556455027, + "grad_norm": 0.40531477332115173, + "learning_rate": 8.520804892543993e-05, + "loss": 1.6769, + "step": 26720 + }, + { + "epoch": 1.448618896811015, + "grad_norm": 0.37671342492103577, + "learning_rate": 8.519630787140926e-05, + "loss": 1.6894, + "step": 26730 + }, + { + "epoch": 1.4491608379765273, + "grad_norm": 0.3520098328590393, + "learning_rate": 8.51845630767911e-05, + "loss": 1.6852, + "step": 26740 + }, + { + "epoch": 1.4497027791420394, + "grad_norm": 0.3429430425167084, + "learning_rate": 8.517281454304031e-05, + "loss": 1.6855, + "step": 26750 + }, + { + "epoch": 1.4502447203075517, + "grad_norm": 0.22939632833003998, + "learning_rate": 8.516106227161223e-05, + "loss": 1.6898, + "step": 26760 + }, + { + "epoch": 1.4507866614730638, + "grad_norm": 0.2420610934495926, + "learning_rate": 8.514930626396273e-05, + "loss": 1.6847, + "step": 26770 + }, + { + "epoch": 1.451328602638576, + "grad_norm": 0.3113659620285034, + "learning_rate": 8.513754652154805e-05, + "loss": 1.6867, + "step": 26780 + }, + { + "epoch": 1.4518705438040882, + "grad_norm": 0.34163153171539307, + "learning_rate": 8.512578304582495e-05, + "loss": 1.695, + "step": 26790 + }, + { + "epoch": 1.4521957085033956, + "eval_loss": 2.5400216579437256, + "eval_runtime": 21.9954, + "eval_samples_per_second": 227.32, + "eval_steps_per_second": 1.228, + "step": 26796 + }, + { + "epoch": 1.4524124849696005, + "grad_norm": 0.26067569851875305, + "learning_rate": 8.511401583825066e-05, + "loss": 1.6843, + "step": 26800 + }, + { + "epoch": 1.4529544261351126, + "grad_norm": 0.34139981865882874, + "learning_rate": 8.51022449002828e-05, + "loss": 1.68, + "step": 26810 + }, + { + "epoch": 1.453496367300625, + "grad_norm": 0.40918228030204773, + "learning_rate": 8.509047023337954e-05, + "loss": 1.6758, + "step": 26820 + }, + { + "epoch": 1.4540383084661372, + "grad_norm": 0.33097583055496216, + "learning_rate": 8.507869183899947e-05, + "loss": 1.6781, + "step": 26830 + }, + { + "epoch": 1.4545802496316493, + "grad_norm": 0.5380516648292542, + "learning_rate": 8.506690971860167e-05, + "loss": 1.6753, + "step": 26840 + }, + { + "epoch": 1.4551221907971617, + "grad_norm": 0.3190267086029053, + "learning_rate": 8.50551238736456e-05, + "loss": 1.6841, + "step": 26850 + }, + { + "epoch": 1.4556641319626737, + "grad_norm": 0.40389493107795715, + "learning_rate": 8.504333430559127e-05, + "loss": 1.687, + "step": 26860 + }, + { + "epoch": 1.456206073128186, + "grad_norm": 0.3580215573310852, + "learning_rate": 8.503154101589915e-05, + "loss": 1.6818, + "step": 26870 + }, + { + "epoch": 1.4567480142936984, + "grad_norm": 0.2765572667121887, + "learning_rate": 8.501974400603009e-05, + "loss": 1.6859, + "step": 26880 + }, + { + "epoch": 1.456910596643352, + "eval_loss": 2.5574100017547607, + "eval_runtime": 21.9907, + "eval_samples_per_second": 227.368, + "eval_steps_per_second": 1.228, + "step": 26883 + }, + { + "epoch": 1.4572899554592105, + "grad_norm": 0.2352159172296524, + "learning_rate": 8.500794327744547e-05, + "loss": 1.679, + "step": 26890 + }, + { + "epoch": 1.4578318966247226, + "grad_norm": 0.241908460855484, + "learning_rate": 8.499613883160712e-05, + "loss": 1.6749, + "step": 26900 + }, + { + "epoch": 1.4583738377902349, + "grad_norm": 0.46047091484069824, + "learning_rate": 8.498433066997733e-05, + "loss": 1.6867, + "step": 26910 + }, + { + "epoch": 1.4589157789557472, + "grad_norm": 0.4493439793586731, + "learning_rate": 8.497251879401884e-05, + "loss": 1.6741, + "step": 26920 + }, + { + "epoch": 1.4594577201212593, + "grad_norm": 0.26538506150245667, + "learning_rate": 8.496070320519484e-05, + "loss": 1.6759, + "step": 26930 + }, + { + "epoch": 1.4599996612867716, + "grad_norm": 0.3610468804836273, + "learning_rate": 8.494888390496901e-05, + "loss": 1.6859, + "step": 26940 + }, + { + "epoch": 1.4605416024522837, + "grad_norm": 0.3829333782196045, + "learning_rate": 8.493706089480546e-05, + "loss": 1.6795, + "step": 26950 + }, + { + "epoch": 1.461083543617796, + "grad_norm": 0.26342180371284485, + "learning_rate": 8.492523417616878e-05, + "loss": 1.6823, + "step": 26960 + }, + { + "epoch": 1.4616254847833083, + "grad_norm": 0.3344675302505493, + "learning_rate": 8.491340375052403e-05, + "loss": 1.6862, + "step": 26970 + }, + { + "epoch": 1.4616254847833083, + "eval_loss": 2.554945230484009, + "eval_runtime": 21.9254, + "eval_samples_per_second": 228.046, + "eval_steps_per_second": 1.231, + "step": 26970 + }, + { + "epoch": 1.4621674259488204, + "grad_norm": 0.28914472460746765, + "learning_rate": 8.490156961933666e-05, + "loss": 1.6751, + "step": 26980 + }, + { + "epoch": 1.4627093671143325, + "grad_norm": 0.46620985865592957, + "learning_rate": 8.488973178407269e-05, + "loss": 1.689, + "step": 26990 + }, + { + "epoch": 1.4632513082798448, + "grad_norm": 0.2889445722103119, + "learning_rate": 8.487789024619852e-05, + "loss": 1.6843, + "step": 27000 + }, + { + "epoch": 1.0005419411655123, + "grad_norm": 0.23382456600666046, + "learning_rate": 8.486604500718102e-05, + "loss": 1.6714, + "step": 27010 + }, + { + "epoch": 1.0010838823310244, + "grad_norm": 0.554624617099762, + "learning_rate": 8.485419606848751e-05, + "loss": 1.675, + "step": 27020 + }, + { + "epoch": 1.0016258234965367, + "grad_norm": 0.48615115880966187, + "learning_rate": 8.484234343158583e-05, + "loss": 1.6822, + "step": 27030 + }, + { + "epoch": 1.0021677646620488, + "grad_norm": 0.348832368850708, + "learning_rate": 8.48304870979442e-05, + "loss": 1.6722, + "step": 27040 + }, + { + "epoch": 1.0027097058275611, + "grad_norm": 0.5944726467132568, + "learning_rate": 8.481862706903132e-05, + "loss": 1.6743, + "step": 27050 + }, + { + "epoch": 1.0030890646434196, + "eval_loss": 2.5551207065582275, + "eval_runtime": 22.1704, + "eval_samples_per_second": 225.526, + "eval_steps_per_second": 1.218, + "step": 27057 + }, + { + "epoch": 1.0032516469930732, + "grad_norm": 0.34727635979652405, + "learning_rate": 8.48067633463164e-05, + "loss": 1.6697, + "step": 27060 + }, + { + "epoch": 1.0037935881585855, + "grad_norm": 0.37410902976989746, + "learning_rate": 8.479489593126904e-05, + "loss": 1.666, + "step": 27070 + }, + { + "epoch": 1.0043355293240976, + "grad_norm": 0.3332069516181946, + "learning_rate": 8.478302482535933e-05, + "loss": 1.6891, + "step": 27080 + }, + { + "epoch": 1.00487747048961, + "grad_norm": 0.2845593988895416, + "learning_rate": 8.477115003005781e-05, + "loss": 1.6745, + "step": 27090 + }, + { + "epoch": 1.0054194116551223, + "grad_norm": 0.367365300655365, + "learning_rate": 8.475927154683547e-05, + "loss": 1.6853, + "step": 27100 + }, + { + "epoch": 1.0059613528206344, + "grad_norm": 0.2617562711238861, + "learning_rate": 8.474738937716378e-05, + "loss": 1.6763, + "step": 27110 + }, + { + "epoch": 1.0065032939861467, + "grad_norm": 0.4524533748626709, + "learning_rate": 8.473550352251465e-05, + "loss": 1.6789, + "step": 27120 + }, + { + "epoch": 1.0070452351516588, + "grad_norm": 0.39219167828559875, + "learning_rate": 8.472361398436043e-05, + "loss": 1.6915, + "step": 27130 + }, + { + "epoch": 1.007587176317171, + "grad_norm": 0.47083818912506104, + "learning_rate": 8.471172076417397e-05, + "loss": 1.6807, + "step": 27140 + }, + { + "epoch": 1.007803952783376, + "eval_loss": 2.5521044731140137, + "eval_runtime": 21.9818, + "eval_samples_per_second": 227.461, + "eval_steps_per_second": 1.228, + "step": 27144 + }, + { + "epoch": 1.0081291174826832, + "grad_norm": 0.33303186297416687, + "learning_rate": 8.469982386342851e-05, + "loss": 1.6765, + "step": 27150 + }, + { + "epoch": 1.0086710586481955, + "grad_norm": 0.28193533420562744, + "learning_rate": 8.468792328359783e-05, + "loss": 1.674, + "step": 27160 + }, + { + "epoch": 1.0092129998137078, + "grad_norm": 0.35703226923942566, + "learning_rate": 8.467601902615613e-05, + "loss": 1.6827, + "step": 27170 + }, + { + "epoch": 1.00975494097922, + "grad_norm": 0.36111465096473694, + "learning_rate": 8.4664111092578e-05, + "loss": 1.6728, + "step": 27180 + }, + { + "epoch": 1.0102968821447322, + "grad_norm": 0.28821465373039246, + "learning_rate": 8.465219948433858e-05, + "loss": 1.6828, + "step": 27190 + }, + { + "epoch": 1.0108388233102443, + "grad_norm": 0.3027511537075043, + "learning_rate": 8.464028420291344e-05, + "loss": 1.6751, + "step": 27200 + }, + { + "epoch": 1.0113807644757566, + "grad_norm": 0.28520819544792175, + "learning_rate": 8.462836524977856e-05, + "loss": 1.6664, + "step": 27210 + }, + { + "epoch": 1.0119227056412687, + "grad_norm": 0.4790101647377014, + "learning_rate": 8.461644262641044e-05, + "loss": 1.6729, + "step": 27220 + }, + { + "epoch": 1.012464646806781, + "grad_norm": 0.43564823269844055, + "learning_rate": 8.460451633428599e-05, + "loss": 1.6749, + "step": 27230 + }, + { + "epoch": 1.0125188409233323, + "eval_loss": 2.557983636856079, + "eval_runtime": 21.9815, + "eval_samples_per_second": 227.464, + "eval_steps_per_second": 1.228, + "step": 27231 + }, + { + "epoch": 1.0130065879722931, + "grad_norm": 0.24843677878379822, + "learning_rate": 8.459258637488259e-05, + "loss": 1.6682, + "step": 27240 + }, + { + "epoch": 1.0135485291378055, + "grad_norm": 0.2693169414997101, + "learning_rate": 8.458065274967806e-05, + "loss": 1.6843, + "step": 27250 + }, + { + "epoch": 1.0140904703033178, + "grad_norm": 0.2424820065498352, + "learning_rate": 8.45687154601507e-05, + "loss": 1.6775, + "step": 27260 + }, + { + "epoch": 1.0146324114688299, + "grad_norm": 0.37894406914711, + "learning_rate": 8.455677450777922e-05, + "loss": 1.6716, + "step": 27270 + }, + { + "epoch": 1.0151743526343422, + "grad_norm": 0.771432638168335, + "learning_rate": 8.454482989404286e-05, + "loss": 1.6872, + "step": 27280 + }, + { + "epoch": 1.0157162937998543, + "grad_norm": 0.28843340277671814, + "learning_rate": 8.453288162042124e-05, + "loss": 1.681, + "step": 27290 + }, + { + "epoch": 1.0162582349653666, + "grad_norm": 0.31491175293922424, + "learning_rate": 8.452092968839446e-05, + "loss": 1.6717, + "step": 27300 + }, + { + "epoch": 1.0168001761308787, + "grad_norm": 0.38847339153289795, + "learning_rate": 8.450897409944309e-05, + "loss": 1.6786, + "step": 27310 + }, + { + "epoch": 1.0172337290632885, + "eval_loss": 2.552642345428467, + "eval_runtime": 21.9929, + "eval_samples_per_second": 227.346, + "eval_steps_per_second": 1.228, + "step": 27318 + }, + { + "epoch": 1.017342117296391, + "grad_norm": 0.6366086602210999, + "learning_rate": 8.44970148550481e-05, + "loss": 1.6791, + "step": 27320 + }, + { + "epoch": 1.0178840584619033, + "grad_norm": 0.5009103417396545, + "learning_rate": 8.448505195669099e-05, + "loss": 1.6762, + "step": 27330 + }, + { + "epoch": 1.0184259996274154, + "grad_norm": 0.31960320472717285, + "learning_rate": 8.447308540585364e-05, + "loss": 1.676, + "step": 27340 + }, + { + "epoch": 1.0189679407929277, + "grad_norm": 0.5080009698867798, + "learning_rate": 8.446111520401845e-05, + "loss": 1.6768, + "step": 27350 + }, + { + "epoch": 1.0195098819584398, + "grad_norm": 0.44143038988113403, + "learning_rate": 8.44491413526682e-05, + "loss": 1.6816, + "step": 27360 + }, + { + "epoch": 1.0200518231239522, + "grad_norm": 0.3140455186367035, + "learning_rate": 8.443716385328618e-05, + "loss": 1.6696, + "step": 27370 + }, + { + "epoch": 1.0205937642894642, + "grad_norm": 0.5069706439971924, + "learning_rate": 8.442518270735611e-05, + "loss": 1.6752, + "step": 27380 + }, + { + "epoch": 1.0211357054549766, + "grad_norm": 1.0147614479064941, + "learning_rate": 8.441319791636215e-05, + "loss": 1.6684, + "step": 27390 + }, + { + "epoch": 1.0216776466204887, + "grad_norm": 0.2649192214012146, + "learning_rate": 8.440120948178895e-05, + "loss": 1.6823, + "step": 27400 + }, + { + "epoch": 1.0219486172032448, + "eval_loss": 2.5535383224487305, + "eval_runtime": 21.989, + "eval_samples_per_second": 227.386, + "eval_steps_per_second": 1.228, + "step": 27405 + }, + { + "epoch": 1.022219587786001, + "grad_norm": 0.2985130250453949, + "learning_rate": 8.438921740512153e-05, + "loss": 1.6679, + "step": 27410 + }, + { + "epoch": 1.0227615289515133, + "grad_norm": 0.3355660140514374, + "learning_rate": 8.437722168784548e-05, + "loss": 1.6648, + "step": 27420 + }, + { + "epoch": 1.0233034701170254, + "grad_norm": 0.30278173089027405, + "learning_rate": 8.436522233144675e-05, + "loss": 1.6676, + "step": 27430 + }, + { + "epoch": 1.0238454112825377, + "grad_norm": 0.24776387214660645, + "learning_rate": 8.435321933741177e-05, + "loss": 1.6718, + "step": 27440 + }, + { + "epoch": 1.0243873524480498, + "grad_norm": 0.25117209553718567, + "learning_rate": 8.434121270722741e-05, + "loss": 1.6791, + "step": 27450 + }, + { + "epoch": 1.024929293613562, + "grad_norm": 0.23679675161838531, + "learning_rate": 8.4329202442381e-05, + "loss": 1.6793, + "step": 27460 + }, + { + "epoch": 1.0254712347790742, + "grad_norm": 0.4516010582447052, + "learning_rate": 8.431718854436034e-05, + "loss": 1.6673, + "step": 27470 + }, + { + "epoch": 1.0260131759445865, + "grad_norm": 0.2465139925479889, + "learning_rate": 8.430517101465364e-05, + "loss": 1.6743, + "step": 27480 + }, + { + "epoch": 1.0265551171100988, + "grad_norm": 0.3209734857082367, + "learning_rate": 8.429314985474958e-05, + "loss": 1.67, + "step": 27490 + }, + { + "epoch": 1.0266635053432012, + "eval_loss": 2.5473275184631348, + "eval_runtime": 21.9881, + "eval_samples_per_second": 227.396, + "eval_steps_per_second": 1.228, + "step": 27492 + }, + { + "epoch": 1.027097058275611, + "grad_norm": 0.2319590449333191, + "learning_rate": 8.42811250661373e-05, + "loss": 1.6803, + "step": 27500 + }, + { + "epoch": 1.0276389994411232, + "grad_norm": 0.3213670253753662, + "learning_rate": 8.426909665030637e-05, + "loss": 1.6656, + "step": 27510 + }, + { + "epoch": 1.0281809406066353, + "grad_norm": 0.29917410016059875, + "learning_rate": 8.425706460874682e-05, + "loss": 1.6738, + "step": 27520 + }, + { + "epoch": 1.0287228817721477, + "grad_norm": 0.24403968453407288, + "learning_rate": 8.424502894294913e-05, + "loss": 1.6695, + "step": 27530 + }, + { + "epoch": 1.0292648229376598, + "grad_norm": 0.3891294002532959, + "learning_rate": 8.423298965440423e-05, + "loss": 1.6619, + "step": 27540 + }, + { + "epoch": 1.029806764103172, + "grad_norm": 0.4318540692329407, + "learning_rate": 8.422094674460348e-05, + "loss": 1.6751, + "step": 27550 + }, + { + "epoch": 1.0303487052686842, + "grad_norm": 0.3217497766017914, + "learning_rate": 8.420890021503872e-05, + "loss": 1.6667, + "step": 27560 + }, + { + "epoch": 1.0308906464341965, + "grad_norm": 0.31174400448799133, + "learning_rate": 8.419685006720221e-05, + "loss": 1.6779, + "step": 27570 + }, + { + "epoch": 1.0313783934831575, + "eval_loss": 2.553981304168701, + "eval_runtime": 22.0351, + "eval_samples_per_second": 226.91, + "eval_steps_per_second": 1.225, + "step": 27579 + }, + { + "epoch": 1.0314325875997088, + "grad_norm": 0.25709420442581177, + "learning_rate": 8.418479630258666e-05, + "loss": 1.681, + "step": 27580 + }, + { + "epoch": 1.0319745287652209, + "grad_norm": 0.2779182195663452, + "learning_rate": 8.417273892268527e-05, + "loss": 1.6807, + "step": 27590 + }, + { + "epoch": 1.0325164699307332, + "grad_norm": 0.41596314311027527, + "learning_rate": 8.416067792899162e-05, + "loss": 1.6761, + "step": 27600 + }, + { + "epoch": 1.0330584110962453, + "grad_norm": 0.2902078628540039, + "learning_rate": 8.41486133229998e-05, + "loss": 1.6773, + "step": 27610 + }, + { + "epoch": 1.0336003522617576, + "grad_norm": 0.2825920879840851, + "learning_rate": 8.413654510620432e-05, + "loss": 1.6655, + "step": 27620 + }, + { + "epoch": 1.0341422934272697, + "grad_norm": 0.3873823285102844, + "learning_rate": 8.412447328010012e-05, + "loss": 1.6638, + "step": 27630 + }, + { + "epoch": 1.034684234592782, + "grad_norm": 1.103782057762146, + "learning_rate": 8.411239784618263e-05, + "loss": 1.6825, + "step": 27640 + }, + { + "epoch": 1.0352261757582943, + "grad_norm": 0.632357120513916, + "learning_rate": 8.410031880594768e-05, + "loss": 1.6624, + "step": 27650 + }, + { + "epoch": 1.0357681169238064, + "grad_norm": 0.3010210394859314, + "learning_rate": 8.408823616089157e-05, + "loss": 1.6669, + "step": 27660 + }, + { + "epoch": 1.0360932816231139, + "eval_loss": 2.547511100769043, + "eval_runtime": 21.993, + "eval_samples_per_second": 227.345, + "eval_steps_per_second": 1.228, + "step": 27666 + }, + { + "epoch": 1.0363100580893188, + "grad_norm": 0.5328647494316101, + "learning_rate": 8.407614991251106e-05, + "loss": 1.6691, + "step": 27670 + }, + { + "epoch": 1.0368519992548308, + "grad_norm": 0.328873872756958, + "learning_rate": 8.406406006230333e-05, + "loss": 1.6752, + "step": 27680 + }, + { + "epoch": 1.0373939404203432, + "grad_norm": 0.5136923789978027, + "learning_rate": 8.405196661176604e-05, + "loss": 1.6724, + "step": 27690 + }, + { + "epoch": 1.0379358815858553, + "grad_norm": 0.2645358443260193, + "learning_rate": 8.403986956239724e-05, + "loss": 1.67, + "step": 27700 + }, + { + "epoch": 1.0384778227513676, + "grad_norm": 0.31197217106819153, + "learning_rate": 8.402776891569547e-05, + "loss": 1.6693, + "step": 27710 + }, + { + "epoch": 1.0390197639168797, + "grad_norm": 0.3356460928916931, + "learning_rate": 8.401566467315973e-05, + "loss": 1.668, + "step": 27720 + }, + { + "epoch": 1.039561705082392, + "grad_norm": 0.3273829221725464, + "learning_rate": 8.400355683628941e-05, + "loss": 1.6722, + "step": 27730 + }, + { + "epoch": 1.0401036462479043, + "grad_norm": 0.26531314849853516, + "learning_rate": 8.399144540658441e-05, + "loss": 1.6708, + "step": 27740 + }, + { + "epoch": 1.0406455874134164, + "grad_norm": 0.2471926361322403, + "learning_rate": 8.397933038554502e-05, + "loss": 1.6663, + "step": 27750 + }, + { + "epoch": 1.04080816976307, + "eval_loss": 2.5492753982543945, + "eval_runtime": 21.9881, + "eval_samples_per_second": 227.396, + "eval_steps_per_second": 1.228, + "step": 27753 + }, + { + "epoch": 1.0411875285789287, + "grad_norm": 0.35514238476753235, + "learning_rate": 8.396721177467197e-05, + "loss": 1.6734, + "step": 27760 + }, + { + "epoch": 1.0417294697444408, + "grad_norm": 0.33591434359550476, + "learning_rate": 8.39550895754665e-05, + "loss": 1.6738, + "step": 27770 + }, + { + "epoch": 1.0422714109099531, + "grad_norm": 0.2625053822994232, + "learning_rate": 8.394296378943027e-05, + "loss": 1.6687, + "step": 27780 + }, + { + "epoch": 1.0428133520754652, + "grad_norm": 0.2579818665981293, + "learning_rate": 8.39308344180653e-05, + "loss": 1.6667, + "step": 27790 + }, + { + "epoch": 1.0433552932409775, + "grad_norm": 0.25735151767730713, + "learning_rate": 8.39187014628742e-05, + "loss": 1.6705, + "step": 27800 + }, + { + "epoch": 1.0438972344064898, + "grad_norm": 0.259867399930954, + "learning_rate": 8.390656492535992e-05, + "loss": 1.6795, + "step": 27810 + }, + { + "epoch": 1.044439175572002, + "grad_norm": 0.26979291439056396, + "learning_rate": 8.389442480702585e-05, + "loss": 1.6748, + "step": 27820 + }, + { + "epoch": 1.0449811167375143, + "grad_norm": 0.3980228006839752, + "learning_rate": 8.38822811093759e-05, + "loss": 1.6651, + "step": 27830 + }, + { + "epoch": 1.0455230579030264, + "grad_norm": 0.39179670810699463, + "learning_rate": 8.387013383391435e-05, + "loss": 1.6701, + "step": 27840 + }, + { + "epoch": 1.0455230579030264, + "eval_loss": 2.563586950302124, + "eval_runtime": 21.9807, + "eval_samples_per_second": 227.472, + "eval_steps_per_second": 1.228, + "step": 27840 + }, + { + "epoch": 1.0460649990685387, + "grad_norm": 0.5658113956451416, + "learning_rate": 8.385798298214596e-05, + "loss": 1.672, + "step": 27850 + }, + { + "epoch": 1.0466069402340508, + "grad_norm": 0.5975947380065918, + "learning_rate": 8.384582855557594e-05, + "loss": 1.6522, + "step": 27860 + }, + { + "epoch": 1.047148881399563, + "grad_norm": 0.28900066018104553, + "learning_rate": 8.383367055570991e-05, + "loss": 1.6596, + "step": 27870 + }, + { + "epoch": 1.0476908225650752, + "grad_norm": 0.38573452830314636, + "learning_rate": 8.382150898405396e-05, + "loss": 1.6625, + "step": 27880 + }, + { + "epoch": 1.0482327637305875, + "grad_norm": 0.2802741229534149, + "learning_rate": 8.380934384211458e-05, + "loss": 1.6651, + "step": 27890 + }, + { + "epoch": 1.0487747048960998, + "grad_norm": 0.31623005867004395, + "learning_rate": 8.37971751313988e-05, + "loss": 1.6644, + "step": 27900 + }, + { + "epoch": 1.049316646061612, + "grad_norm": 0.2634221911430359, + "learning_rate": 8.378500285341397e-05, + "loss": 1.6651, + "step": 27910 + }, + { + "epoch": 1.0498585872271242, + "grad_norm": 0.2507796287536621, + "learning_rate": 8.377282700966795e-05, + "loss": 1.6699, + "step": 27920 + }, + { + "epoch": 1.0502379460429827, + "eval_loss": 2.5567116737365723, + "eval_runtime": 21.9902, + "eval_samples_per_second": 227.374, + "eval_steps_per_second": 1.228, + "step": 27927 + }, + { + "epoch": 1.0504005283926363, + "grad_norm": 0.38170045614242554, + "learning_rate": 8.376064760166907e-05, + "loss": 1.674, + "step": 27930 + }, + { + "epoch": 1.0509424695581486, + "grad_norm": 0.31428414583206177, + "learning_rate": 8.3748464630926e-05, + "loss": 1.6715, + "step": 27940 + }, + { + "epoch": 1.0514844107236607, + "grad_norm": 0.30629944801330566, + "learning_rate": 8.373627809894796e-05, + "loss": 1.6685, + "step": 27950 + }, + { + "epoch": 1.052026351889173, + "grad_norm": 0.4108351171016693, + "learning_rate": 8.372408800724454e-05, + "loss": 1.6599, + "step": 27960 + }, + { + "epoch": 1.0525682930546854, + "grad_norm": 0.4083997309207916, + "learning_rate": 8.371189435732581e-05, + "loss": 1.6629, + "step": 27970 + }, + { + "epoch": 1.0531102342201974, + "grad_norm": 0.5288533568382263, + "learning_rate": 8.369969715070226e-05, + "loss": 1.6598, + "step": 27980 + }, + { + "epoch": 1.0536521753857098, + "grad_norm": 0.35696470737457275, + "learning_rate": 8.368749638888484e-05, + "loss": 1.6646, + "step": 27990 + }, + { + "epoch": 1.0541941165512219, + "grad_norm": 0.3221840262413025, + "learning_rate": 8.36752920733849e-05, + "loss": 1.6577, + "step": 28000 + }, + { + "epoch": 1.0547360577167342, + "grad_norm": 0.35849282145500183, + "learning_rate": 8.366308420571429e-05, + "loss": 1.6604, + "step": 28010 + }, + { + "epoch": 1.054952834182939, + "eval_loss": 2.547173500061035, + "eval_runtime": 21.9831, + "eval_samples_per_second": 227.447, + "eval_steps_per_second": 1.228, + "step": 28014 + }, + { + "epoch": 1.0552779988822463, + "grad_norm": 0.2584594488143921, + "learning_rate": 8.365087278738524e-05, + "loss": 1.6827, + "step": 28020 + }, + { + "epoch": 1.0558199400477586, + "grad_norm": 0.2509235441684723, + "learning_rate": 8.363865781991046e-05, + "loss": 1.6627, + "step": 28030 + }, + { + "epoch": 1.0563618812132707, + "grad_norm": 0.36739498376846313, + "learning_rate": 8.36264393048031e-05, + "loss": 1.666, + "step": 28040 + }, + { + "epoch": 1.056903822378783, + "grad_norm": 0.41700538992881775, + "learning_rate": 8.36142172435767e-05, + "loss": 1.6799, + "step": 28050 + }, + { + "epoch": 1.0574457635442953, + "grad_norm": 0.31023573875427246, + "learning_rate": 8.360199163774531e-05, + "loss": 1.6701, + "step": 28060 + }, + { + "epoch": 1.0579877047098074, + "grad_norm": 0.23943783342838287, + "learning_rate": 8.358976248882337e-05, + "loss": 1.656, + "step": 28070 + }, + { + "epoch": 1.0585296458753197, + "grad_norm": 0.8542354702949524, + "learning_rate": 8.357752979832578e-05, + "loss": 1.6668, + "step": 28080 + }, + { + "epoch": 1.0590715870408318, + "grad_norm": 0.4590936601161957, + "learning_rate": 8.356529356776786e-05, + "loss": 1.6735, + "step": 28090 + }, + { + "epoch": 1.0596135282063441, + "grad_norm": 0.61350017786026, + "learning_rate": 8.355305379866541e-05, + "loss": 1.667, + "step": 28100 + }, + { + "epoch": 1.0596677223228954, + "eval_loss": 2.5593972206115723, + "eval_runtime": 21.9806, + "eval_samples_per_second": 227.474, + "eval_steps_per_second": 1.228, + "step": 28101 + }, + { + "epoch": 1.0601554693718562, + "grad_norm": 0.7067118883132935, + "learning_rate": 8.354081049253462e-05, + "loss": 1.6765, + "step": 28110 + }, + { + "epoch": 1.0606974105373685, + "grad_norm": 0.27008959650993347, + "learning_rate": 8.35285636508921e-05, + "loss": 1.6628, + "step": 28120 + }, + { + "epoch": 1.0612393517028809, + "grad_norm": 0.2994672358036041, + "learning_rate": 8.351631327525501e-05, + "loss": 1.6551, + "step": 28130 + }, + { + "epoch": 1.061781292868393, + "grad_norm": 0.3203209340572357, + "learning_rate": 8.350405936714083e-05, + "loss": 1.6561, + "step": 28140 + }, + { + "epoch": 1.0623232340339053, + "grad_norm": 0.3424260914325714, + "learning_rate": 8.349180192806753e-05, + "loss": 1.67, + "step": 28150 + }, + { + "epoch": 1.0628651751994174, + "grad_norm": 0.2762835621833801, + "learning_rate": 8.34795409595535e-05, + "loss": 1.6691, + "step": 28160 + }, + { + "epoch": 1.0634071163649297, + "grad_norm": 0.3313772976398468, + "learning_rate": 8.346727646311758e-05, + "loss": 1.6552, + "step": 28170 + }, + { + "epoch": 1.0639490575304418, + "grad_norm": 0.2399122714996338, + "learning_rate": 8.345500844027905e-05, + "loss": 1.6666, + "step": 28180 + }, + { + "epoch": 1.0643826104628515, + "eval_loss": 2.549626350402832, + "eval_runtime": 21.9895, + "eval_samples_per_second": 227.381, + "eval_steps_per_second": 1.228, + "step": 28188 + }, + { + "epoch": 1.064490998695954, + "grad_norm": 0.4699346721172333, + "learning_rate": 8.344273689255761e-05, + "loss": 1.668, + "step": 28190 + }, + { + "epoch": 1.0650329398614662, + "grad_norm": 0.25492924451828003, + "learning_rate": 8.343046182147342e-05, + "loss": 1.6632, + "step": 28200 + }, + { + "epoch": 1.0655748810269785, + "grad_norm": 0.4417177438735962, + "learning_rate": 8.341818322854705e-05, + "loss": 1.6678, + "step": 28210 + }, + { + "epoch": 1.0661168221924908, + "grad_norm": 0.28891265392303467, + "learning_rate": 8.340590111529952e-05, + "loss": 1.6606, + "step": 28220 + }, + { + "epoch": 1.066658763358003, + "grad_norm": 0.29153016209602356, + "learning_rate": 8.339361548325226e-05, + "loss": 1.6658, + "step": 28230 + }, + { + "epoch": 1.0672007045235152, + "grad_norm": 0.2998339533805847, + "learning_rate": 8.338132633392723e-05, + "loss": 1.6612, + "step": 28240 + }, + { + "epoch": 1.0677426456890273, + "grad_norm": 0.7401821613311768, + "learning_rate": 8.336903366884668e-05, + "loss": 1.6711, + "step": 28250 + }, + { + "epoch": 1.0682845868545396, + "grad_norm": 0.3623445928096771, + "learning_rate": 8.335673748953342e-05, + "loss": 1.6542, + "step": 28260 + }, + { + "epoch": 1.0688265280200517, + "grad_norm": 0.48223739862442017, + "learning_rate": 8.334443779751063e-05, + "loss": 1.6577, + "step": 28270 + }, + { + "epoch": 1.069097498602808, + "eval_loss": 2.5425007343292236, + "eval_runtime": 21.9868, + "eval_samples_per_second": 227.409, + "eval_steps_per_second": 1.228, + "step": 28275 + }, + { + "epoch": 1.069368469185564, + "grad_norm": 0.4789433479309082, + "learning_rate": 8.333213459430195e-05, + "loss": 1.6562, + "step": 28280 + }, + { + "epoch": 1.0699104103510764, + "grad_norm": 0.5614416003227234, + "learning_rate": 8.331982788143142e-05, + "loss": 1.6689, + "step": 28290 + }, + { + "epoch": 1.0704523515165885, + "grad_norm": 0.4019344449043274, + "learning_rate": 8.330751766042358e-05, + "loss": 1.6564, + "step": 28300 + }, + { + "epoch": 1.0709942926821008, + "grad_norm": 0.3532206118106842, + "learning_rate": 8.329520393280335e-05, + "loss": 1.6645, + "step": 28310 + }, + { + "epoch": 1.0715362338476129, + "grad_norm": 0.3050895631313324, + "learning_rate": 8.32828867000961e-05, + "loss": 1.6581, + "step": 28320 + }, + { + "epoch": 1.0720781750131252, + "grad_norm": 0.4680069386959076, + "learning_rate": 8.327056596382766e-05, + "loss": 1.6648, + "step": 28330 + }, + { + "epoch": 1.0726201161786373, + "grad_norm": 0.5191404819488525, + "learning_rate": 8.325824172552422e-05, + "loss": 1.6579, + "step": 28340 + }, + { + "epoch": 1.0731620573441496, + "grad_norm": 0.30772343277931213, + "learning_rate": 8.324591398671249e-05, + "loss": 1.6623, + "step": 28350 + }, + { + "epoch": 1.0737039985096617, + "grad_norm": 0.351131796836853, + "learning_rate": 8.323358274891956e-05, + "loss": 1.6632, + "step": 28360 + }, + { + "epoch": 1.0738123867427642, + "eval_loss": 2.5413732528686523, + "eval_runtime": 21.9839, + "eval_samples_per_second": 227.439, + "eval_steps_per_second": 1.228, + "step": 28362 + }, + { + "epoch": 1.074245939675174, + "grad_norm": 0.42983412742614746, + "learning_rate": 8.322124801367298e-05, + "loss": 1.652, + "step": 28370 + }, + { + "epoch": 1.0747878808406863, + "grad_norm": 0.2206849902868271, + "learning_rate": 8.320890978250073e-05, + "loss": 1.6608, + "step": 28380 + }, + { + "epoch": 1.0753298220061984, + "grad_norm": 0.2790736258029938, + "learning_rate": 8.31965680569312e-05, + "loss": 1.6622, + "step": 28390 + }, + { + "epoch": 1.0758717631717107, + "grad_norm": 0.44509637355804443, + "learning_rate": 8.318422283849323e-05, + "loss": 1.6611, + "step": 28400 + }, + { + "epoch": 1.0764137043372228, + "grad_norm": 0.24993574619293213, + "learning_rate": 8.317187412871611e-05, + "loss": 1.6573, + "step": 28410 + }, + { + "epoch": 1.0769556455027351, + "grad_norm": 0.325591504573822, + "learning_rate": 8.315952192912952e-05, + "loss": 1.6501, + "step": 28420 + }, + { + "epoch": 1.0774975866682472, + "grad_norm": 0.2710714638233185, + "learning_rate": 8.314716624126362e-05, + "loss": 1.6609, + "step": 28430 + }, + { + "epoch": 1.0780395278337596, + "grad_norm": 0.5176997780799866, + "learning_rate": 8.313480706664898e-05, + "loss": 1.6707, + "step": 28440 + }, + { + "epoch": 1.0785272748827206, + "eval_loss": 2.5304479598999023, + "eval_runtime": 21.9868, + "eval_samples_per_second": 227.409, + "eval_steps_per_second": 1.228, + "step": 28449 + }, + { + "epoch": 1.0785814689992717, + "grad_norm": 0.5030739307403564, + "learning_rate": 8.312244440681657e-05, + "loss": 1.6555, + "step": 28450 + }, + { + "epoch": 1.079123410164784, + "grad_norm": 0.44651323556900024, + "learning_rate": 8.311007826329786e-05, + "loss": 1.6696, + "step": 28460 + }, + { + "epoch": 1.0796653513302963, + "grad_norm": 0.6648367047309875, + "learning_rate": 8.309770863762468e-05, + "loss": 1.6589, + "step": 28470 + }, + { + "epoch": 1.0802072924958084, + "grad_norm": 0.24871893227100372, + "learning_rate": 8.308533553132935e-05, + "loss": 1.6527, + "step": 28480 + }, + { + "epoch": 1.0807492336613207, + "grad_norm": 0.4465760588645935, + "learning_rate": 8.307295894594456e-05, + "loss": 1.6585, + "step": 28490 + }, + { + "epoch": 1.0812911748268328, + "grad_norm": 0.313731849193573, + "learning_rate": 8.306057888300352e-05, + "loss": 1.6579, + "step": 28500 + }, + { + "epoch": 1.081833115992345, + "grad_norm": 0.2809997797012329, + "learning_rate": 8.304819534403981e-05, + "loss": 1.6533, + "step": 28510 + }, + { + "epoch": 1.0823750571578572, + "grad_norm": 0.24525237083435059, + "learning_rate": 8.303580833058742e-05, + "loss": 1.6607, + "step": 28520 + }, + { + "epoch": 1.0829169983233695, + "grad_norm": 0.2652153968811035, + "learning_rate": 8.302341784418081e-05, + "loss": 1.6687, + "step": 28530 + }, + { + "epoch": 1.083242163022677, + "eval_loss": 2.5320417881011963, + "eval_runtime": 21.9915, + "eval_samples_per_second": 227.361, + "eval_steps_per_second": 1.228, + "step": 28536 + }, + { + "epoch": 1.0834589394888818, + "grad_norm": 0.24303193390369415, + "learning_rate": 8.301102388635487e-05, + "loss": 1.6608, + "step": 28540 + }, + { + "epoch": 1.084000880654394, + "grad_norm": 0.26834824681282043, + "learning_rate": 8.299862645864491e-05, + "loss": 1.6616, + "step": 28550 + }, + { + "epoch": 1.0845428218199062, + "grad_norm": 0.2935827374458313, + "learning_rate": 8.298622556258664e-05, + "loss": 1.6588, + "step": 28560 + }, + { + "epoch": 1.0850847629854183, + "grad_norm": 0.27826014161109924, + "learning_rate": 8.297382119971627e-05, + "loss": 1.6731, + "step": 28570 + }, + { + "epoch": 1.0856267041509307, + "grad_norm": 0.35834673047065735, + "learning_rate": 8.296141337157038e-05, + "loss": 1.657, + "step": 28580 + }, + { + "epoch": 1.0861686453164427, + "grad_norm": 0.2979893386363983, + "learning_rate": 8.294900207968597e-05, + "loss": 1.6546, + "step": 28590 + }, + { + "epoch": 1.086710586481955, + "grad_norm": 0.2854683995246887, + "learning_rate": 8.293658732560056e-05, + "loss": 1.6656, + "step": 28600 + }, + { + "epoch": 1.0872525276474674, + "grad_norm": 0.37426939606666565, + "learning_rate": 8.292416911085198e-05, + "loss": 1.6513, + "step": 28610 + }, + { + "epoch": 1.0877944688129795, + "grad_norm": 0.34777402877807617, + "learning_rate": 8.291174743697857e-05, + "loss": 1.6653, + "step": 28620 + }, + { + "epoch": 1.087957051162633, + "eval_loss": 2.53008770942688, + "eval_runtime": 21.9855, + "eval_samples_per_second": 227.422, + "eval_steps_per_second": 1.228, + "step": 28623 + }, + { + "epoch": 1.0883364099784918, + "grad_norm": 0.28725841641426086, + "learning_rate": 8.289932230551907e-05, + "loss": 1.651, + "step": 28630 + }, + { + "epoch": 1.0888783511440039, + "grad_norm": 0.28935521841049194, + "learning_rate": 8.288689371801265e-05, + "loss": 1.6616, + "step": 28640 + }, + { + "epoch": 1.0894202923095162, + "grad_norm": 0.30465996265411377, + "learning_rate": 8.287446167599891e-05, + "loss": 1.6554, + "step": 28650 + }, + { + "epoch": 1.0899622334750283, + "grad_norm": 0.5029714703559875, + "learning_rate": 8.286202618101788e-05, + "loss": 1.668, + "step": 28660 + }, + { + "epoch": 1.0905041746405406, + "grad_norm": 0.2679192125797272, + "learning_rate": 8.284958723461001e-05, + "loss": 1.661, + "step": 28670 + }, + { + "epoch": 1.0910461158060527, + "grad_norm": 0.3121162950992584, + "learning_rate": 8.283714483831617e-05, + "loss": 1.6601, + "step": 28680 + }, + { + "epoch": 1.091588056971565, + "grad_norm": 0.2876189947128296, + "learning_rate": 8.28246989936777e-05, + "loss": 1.65, + "step": 28690 + }, + { + "epoch": 1.0921299981370773, + "grad_norm": 0.30062171816825867, + "learning_rate": 8.281224970223632e-05, + "loss": 1.6531, + "step": 28700 + }, + { + "epoch": 1.0926719393025894, + "grad_norm": 0.44612184166908264, + "learning_rate": 8.27997969655342e-05, + "loss": 1.6509, + "step": 28710 + }, + { + "epoch": 1.0926719393025894, + "eval_loss": 2.5466320514678955, + "eval_runtime": 21.9837, + "eval_samples_per_second": 227.441, + "eval_steps_per_second": 1.228, + "step": 28710 + }, + { + "epoch": 1.0932138804681018, + "grad_norm": 0.2449900060892105, + "learning_rate": 8.27873407851139e-05, + "loss": 1.6529, + "step": 28720 + }, + { + "epoch": 1.0937558216336138, + "grad_norm": 0.4357602894306183, + "learning_rate": 8.27748811625185e-05, + "loss": 1.6595, + "step": 28730 + }, + { + "epoch": 1.0942977627991262, + "grad_norm": 0.34103602170944214, + "learning_rate": 8.276241809929137e-05, + "loss": 1.6567, + "step": 28740 + }, + { + "epoch": 1.0948397039646383, + "grad_norm": 0.3430241346359253, + "learning_rate": 8.274995159697646e-05, + "loss": 1.6582, + "step": 28750 + }, + { + "epoch": 1.0953816451301506, + "grad_norm": 0.40874654054641724, + "learning_rate": 8.273748165711799e-05, + "loss": 1.6566, + "step": 28760 + }, + { + "epoch": 1.0959235862956627, + "grad_norm": 0.26068803668022156, + "learning_rate": 8.272500828126075e-05, + "loss": 1.6484, + "step": 28770 + }, + { + "epoch": 1.096465527461175, + "grad_norm": 0.3570903241634369, + "learning_rate": 8.271253147094981e-05, + "loss": 1.6562, + "step": 28780 + }, + { + "epoch": 1.0970074686266873, + "grad_norm": 0.2552138864994049, + "learning_rate": 8.270005122773083e-05, + "loss": 1.657, + "step": 28790 + }, + { + "epoch": 1.0973868274425458, + "eval_loss": 2.531879186630249, + "eval_runtime": 21.9807, + "eval_samples_per_second": 227.473, + "eval_steps_per_second": 1.228, + "step": 28797 + }, + { + "epoch": 1.0975494097921994, + "grad_norm": 0.374850869178772, + "learning_rate": 8.268756755314973e-05, + "loss": 1.6501, + "step": 28800 + }, + { + "epoch": 1.0980913509577117, + "grad_norm": 0.2875063419342041, + "learning_rate": 8.2675080448753e-05, + "loss": 1.6515, + "step": 28810 + }, + { + "epoch": 1.0986332921232238, + "grad_norm": 0.2932276725769043, + "learning_rate": 8.266258991608743e-05, + "loss": 1.6496, + "step": 28820 + }, + { + "epoch": 1.0991752332887361, + "grad_norm": 0.28365346789360046, + "learning_rate": 8.265009595670034e-05, + "loss": 1.6466, + "step": 28830 + }, + { + "epoch": 1.0997171744542482, + "grad_norm": 0.32967546582221985, + "learning_rate": 8.263759857213939e-05, + "loss": 1.6494, + "step": 28840 + }, + { + "epoch": 1.1002591156197605, + "grad_norm": 0.5435857772827148, + "learning_rate": 8.262509776395274e-05, + "loss": 1.661, + "step": 28850 + }, + { + "epoch": 1.1008010567852728, + "grad_norm": 0.26168474555015564, + "learning_rate": 8.26125935336889e-05, + "loss": 1.6467, + "step": 28860 + }, + { + "epoch": 1.101342997950785, + "grad_norm": 0.5927068591117859, + "learning_rate": 8.260008588289687e-05, + "loss": 1.6555, + "step": 28870 + }, + { + "epoch": 1.1018849391162973, + "grad_norm": 0.28580188751220703, + "learning_rate": 8.258757481312601e-05, + "loss": 1.6536, + "step": 28880 + }, + { + "epoch": 1.1021017155825021, + "eval_loss": 2.5326905250549316, + "eval_runtime": 21.9872, + "eval_samples_per_second": 227.405, + "eval_steps_per_second": 1.228, + "step": 28884 + }, + { + "epoch": 1.1024268802818094, + "grad_norm": 0.23324058949947357, + "learning_rate": 8.257506032592617e-05, + "loss": 1.6618, + "step": 28890 + }, + { + "epoch": 1.1029688214473217, + "grad_norm": 0.37625429034233093, + "learning_rate": 8.256254242284755e-05, + "loss": 1.6533, + "step": 28900 + }, + { + "epoch": 1.1035107626128338, + "grad_norm": 0.2690766453742981, + "learning_rate": 8.255002110544087e-05, + "loss": 1.6568, + "step": 28910 + }, + { + "epoch": 1.104052703778346, + "grad_norm": 0.3211461007595062, + "learning_rate": 8.253749637525717e-05, + "loss": 1.6532, + "step": 28920 + }, + { + "epoch": 1.1045946449438584, + "grad_norm": 0.30158957839012146, + "learning_rate": 8.252496823384798e-05, + "loss": 1.6473, + "step": 28930 + }, + { + "epoch": 1.1051365861093705, + "grad_norm": 0.26189902424812317, + "learning_rate": 8.251243668276524e-05, + "loss": 1.6586, + "step": 28940 + }, + { + "epoch": 1.1056785272748828, + "grad_norm": 0.3414219915866852, + "learning_rate": 8.249990172356128e-05, + "loss": 1.6553, + "step": 28950 + }, + { + "epoch": 1.106220468440395, + "grad_norm": 0.2272077202796936, + "learning_rate": 8.248736335778888e-05, + "loss": 1.662, + "step": 28960 + }, + { + "epoch": 1.1067624096059072, + "grad_norm": 0.3499011695384979, + "learning_rate": 8.247482158700126e-05, + "loss": 1.6621, + "step": 28970 + }, + { + "epoch": 1.1068166037224585, + "eval_loss": 2.527860403060913, + "eval_runtime": 21.9887, + "eval_samples_per_second": 227.389, + "eval_steps_per_second": 1.228, + "step": 28971 + }, + { + "epoch": 1.1073043507714193, + "grad_norm": 0.44124558568000793, + "learning_rate": 8.246227641275199e-05, + "loss": 1.6541, + "step": 28980 + }, + { + "epoch": 1.1078462919369316, + "grad_norm": 0.807167649269104, + "learning_rate": 8.244972783659518e-05, + "loss": 1.6499, + "step": 28990 + }, + { + "epoch": 1.1083882331024437, + "grad_norm": 0.5441895723342896, + "learning_rate": 8.243717586008525e-05, + "loss": 1.6535, + "step": 29000 + }, + { + "epoch": 1.108930174267956, + "grad_norm": 0.4006600081920624, + "learning_rate": 8.24246204847771e-05, + "loss": 1.6554, + "step": 29010 + }, + { + "epoch": 1.1094721154334684, + "grad_norm": 0.26644378900527954, + "learning_rate": 8.241206171222602e-05, + "loss": 1.6445, + "step": 29020 + }, + { + "epoch": 1.1100140565989804, + "grad_norm": 0.3243505358695984, + "learning_rate": 8.239949954398777e-05, + "loss": 1.6578, + "step": 29030 + }, + { + "epoch": 1.1105559977644928, + "grad_norm": 0.39492732286453247, + "learning_rate": 8.238693398161844e-05, + "loss": 1.6574, + "step": 29040 + }, + { + "epoch": 1.1110979389300049, + "grad_norm": 0.36204707622528076, + "learning_rate": 8.237436502667467e-05, + "loss": 1.647, + "step": 29050 + }, + { + "epoch": 1.1115314918624146, + "eval_loss": 2.537062644958496, + "eval_runtime": 21.9899, + "eval_samples_per_second": 227.377, + "eval_steps_per_second": 1.228, + "step": 29058 + }, + { + "epoch": 1.1116398800955172, + "grad_norm": 0.4409829080104828, + "learning_rate": 8.236179268071337e-05, + "loss": 1.6599, + "step": 29060 + }, + { + "epoch": 1.1121818212610293, + "grad_norm": 0.4123667776584625, + "learning_rate": 8.2349216945292e-05, + "loss": 1.6573, + "step": 29070 + }, + { + "epoch": 1.1127237624265416, + "grad_norm": 0.29760655760765076, + "learning_rate": 8.233663782196837e-05, + "loss": 1.6542, + "step": 29080 + }, + { + "epoch": 1.1132657035920537, + "grad_norm": 0.34215226769447327, + "learning_rate": 8.232405531230074e-05, + "loss": 1.6521, + "step": 29090 + }, + { + "epoch": 1.113807644757566, + "grad_norm": 0.6061283349990845, + "learning_rate": 8.231146941784776e-05, + "loss": 1.6578, + "step": 29100 + }, + { + "epoch": 1.1143495859230783, + "grad_norm": 0.5324410200119019, + "learning_rate": 8.229888014016853e-05, + "loss": 1.6522, + "step": 29110 + }, + { + "epoch": 1.1148915270885904, + "grad_norm": 0.3367980420589447, + "learning_rate": 8.228628748082256e-05, + "loss": 1.6521, + "step": 29120 + }, + { + "epoch": 1.1154334682541027, + "grad_norm": 0.34855327010154724, + "learning_rate": 8.227369144136974e-05, + "loss": 1.6478, + "step": 29130 + }, + { + "epoch": 1.1159754094196148, + "grad_norm": 0.5364739298820496, + "learning_rate": 8.226109202337043e-05, + "loss": 1.6525, + "step": 29140 + }, + { + "epoch": 1.116246380002371, + "eval_loss": 2.545180320739746, + "eval_runtime": 21.9866, + "eval_samples_per_second": 227.411, + "eval_steps_per_second": 1.228, + "step": 29145 + }, + { + "epoch": 1.1165173505851271, + "grad_norm": 0.36383602023124695, + "learning_rate": 8.224848922838541e-05, + "loss": 1.66, + "step": 29150 + }, + { + "epoch": 1.1170592917506392, + "grad_norm": 0.5313844680786133, + "learning_rate": 8.223588305797585e-05, + "loss": 1.6447, + "step": 29160 + }, + { + "epoch": 1.1176012329161515, + "grad_norm": 0.2722112536430359, + "learning_rate": 8.222327351370332e-05, + "loss": 1.6562, + "step": 29170 + }, + { + "epoch": 1.1181431740816639, + "grad_norm": 0.39539259672164917, + "learning_rate": 8.221066059712988e-05, + "loss": 1.6494, + "step": 29180 + }, + { + "epoch": 1.118685115247176, + "grad_norm": 0.4637277126312256, + "learning_rate": 8.219804430981794e-05, + "loss": 1.6445, + "step": 29190 + }, + { + "epoch": 1.1192270564126883, + "grad_norm": 0.3050798177719116, + "learning_rate": 8.218542465333035e-05, + "loss": 1.6477, + "step": 29200 + }, + { + "epoch": 1.1197689975782004, + "grad_norm": 0.3634278476238251, + "learning_rate": 8.217280162923036e-05, + "loss": 1.662, + "step": 29210 + }, + { + "epoch": 1.1203109387437127, + "grad_norm": 0.2769427001476288, + "learning_rate": 8.216017523908172e-05, + "loss": 1.655, + "step": 29220 + }, + { + "epoch": 1.1208528799092248, + "grad_norm": 0.2443336397409439, + "learning_rate": 8.214754548444846e-05, + "loss": 1.6374, + "step": 29230 + }, + { + "epoch": 1.1209612681423273, + "eval_loss": 2.541795015335083, + "eval_runtime": 21.9864, + "eval_samples_per_second": 227.413, + "eval_steps_per_second": 1.228, + "step": 29232 + }, + { + "epoch": 1.121394821074737, + "grad_norm": 0.2958986163139343, + "learning_rate": 8.213491236689514e-05, + "loss": 1.6575, + "step": 29240 + }, + { + "epoch": 1.1219367622402494, + "grad_norm": 0.2536433935165405, + "learning_rate": 8.212227588798667e-05, + "loss": 1.6449, + "step": 29250 + }, + { + "epoch": 1.1224787034057615, + "grad_norm": 0.2808546721935272, + "learning_rate": 8.210963604928842e-05, + "loss": 1.664, + "step": 29260 + }, + { + "epoch": 1.1230206445712738, + "grad_norm": 0.24417255818843842, + "learning_rate": 8.209699285236618e-05, + "loss": 1.6619, + "step": 29270 + }, + { + "epoch": 1.123562585736786, + "grad_norm": 0.22852517664432526, + "learning_rate": 8.208434629878607e-05, + "loss": 1.6587, + "step": 29280 + }, + { + "epoch": 1.1241045269022982, + "grad_norm": 0.3301936388015747, + "learning_rate": 8.207169639011474e-05, + "loss": 1.6574, + "step": 29290 + }, + { + "epoch": 1.1246464680678103, + "grad_norm": 0.3570113480091095, + "learning_rate": 8.205904312791921e-05, + "loss": 1.648, + "step": 29300 + }, + { + "epoch": 1.1251884092333226, + "grad_norm": 0.4051576256752014, + "learning_rate": 8.20463865137669e-05, + "loss": 1.6463, + "step": 29310 + }, + { + "epoch": 1.1256761562822837, + "eval_loss": 2.5262796878814697, + "eval_runtime": 21.9856, + "eval_samples_per_second": 227.421, + "eval_steps_per_second": 1.228, + "step": 29319 + }, + { + "epoch": 1.1257303503988347, + "grad_norm": 0.3884764611721039, + "learning_rate": 8.203372654922563e-05, + "loss": 1.6512, + "step": 29320 + }, + { + "epoch": 1.126272291564347, + "grad_norm": 0.7175701260566711, + "learning_rate": 8.20210632358637e-05, + "loss": 1.6392, + "step": 29330 + }, + { + "epoch": 1.1268142327298594, + "grad_norm": 0.44228026270866394, + "learning_rate": 8.200839657524976e-05, + "loss": 1.6573, + "step": 29340 + }, + { + "epoch": 1.1273561738953715, + "grad_norm": 0.25877845287323, + "learning_rate": 8.199572656895291e-05, + "loss": 1.6494, + "step": 29350 + }, + { + "epoch": 1.1278981150608838, + "grad_norm": 0.42708685994148254, + "learning_rate": 8.198305321854267e-05, + "loss": 1.6489, + "step": 29360 + }, + { + "epoch": 1.1284400562263959, + "grad_norm": 0.6025623083114624, + "learning_rate": 8.197037652558895e-05, + "loss": 1.6507, + "step": 29370 + }, + { + "epoch": 1.1289819973919082, + "grad_norm": 0.4976705312728882, + "learning_rate": 8.195769649166205e-05, + "loss": 1.6528, + "step": 29380 + }, + { + "epoch": 1.1295239385574203, + "grad_norm": 0.27502158284187317, + "learning_rate": 8.194501311833277e-05, + "loss": 1.659, + "step": 29390 + }, + { + "epoch": 1.1300658797229326, + "grad_norm": 0.26740071177482605, + "learning_rate": 8.193232640717223e-05, + "loss": 1.6435, + "step": 29400 + }, + { + "epoch": 1.13039104442224, + "eval_loss": 2.532335042953491, + "eval_runtime": 21.9808, + "eval_samples_per_second": 227.471, + "eval_steps_per_second": 1.228, + "step": 29406 + }, + { + "epoch": 1.1306078208884447, + "grad_norm": 0.21525554358959198, + "learning_rate": 8.191963635975204e-05, + "loss": 1.651, + "step": 29410 + }, + { + "epoch": 1.131149762053957, + "grad_norm": 0.3878322243690491, + "learning_rate": 8.190694297764417e-05, + "loss": 1.6369, + "step": 29420 + }, + { + "epoch": 1.1316917032194693, + "grad_norm": 0.2476329803466797, + "learning_rate": 8.189424626242102e-05, + "loss": 1.6483, + "step": 29430 + }, + { + "epoch": 1.1322336443849814, + "grad_norm": 0.40698474645614624, + "learning_rate": 8.18815462156554e-05, + "loss": 1.6495, + "step": 29440 + }, + { + "epoch": 1.1327755855504937, + "grad_norm": 0.2684831917285919, + "learning_rate": 8.186884283892056e-05, + "loss": 1.6424, + "step": 29450 + }, + { + "epoch": 1.1333175267160058, + "grad_norm": 0.46887609362602234, + "learning_rate": 8.185613613379011e-05, + "loss": 1.6439, + "step": 29460 + }, + { + "epoch": 1.1338594678815181, + "grad_norm": 0.34955888986587524, + "learning_rate": 8.184342610183812e-05, + "loss": 1.6492, + "step": 29470 + }, + { + "epoch": 1.1344014090470302, + "grad_norm": 0.3082946240901947, + "learning_rate": 8.183071274463903e-05, + "loss": 1.6533, + "step": 29480 + }, + { + "epoch": 1.1349433502125426, + "grad_norm": 0.35093745589256287, + "learning_rate": 8.181799606376773e-05, + "loss": 1.6564, + "step": 29490 + }, + { + "epoch": 1.1351059325621962, + "eval_loss": 2.538316249847412, + "eval_runtime": 21.9822, + "eval_samples_per_second": 227.456, + "eval_steps_per_second": 1.228, + "step": 29493 + }, + { + "epoch": 1.1354852913780547, + "grad_norm": 0.26975366473197937, + "learning_rate": 8.180527606079953e-05, + "loss": 1.6604, + "step": 29500 + }, + { + "epoch": 1.136027232543567, + "grad_norm": 0.2648986577987671, + "learning_rate": 8.17925527373101e-05, + "loss": 1.6453, + "step": 29510 + }, + { + "epoch": 1.1365691737090793, + "grad_norm": 0.2807252109050751, + "learning_rate": 8.177982609487556e-05, + "loss": 1.634, + "step": 29520 + }, + { + "epoch": 1.1371111148745914, + "grad_norm": 0.36864936351776123, + "learning_rate": 8.176709613507243e-05, + "loss": 1.6542, + "step": 29530 + }, + { + "epoch": 1.1376530560401037, + "grad_norm": 0.3327690362930298, + "learning_rate": 8.175436285947764e-05, + "loss": 1.6508, + "step": 29540 + }, + { + "epoch": 1.1381949972056158, + "grad_norm": 0.44101572036743164, + "learning_rate": 8.174162626966853e-05, + "loss": 1.6418, + "step": 29550 + }, + { + "epoch": 1.138736938371128, + "grad_norm": 0.24692606925964355, + "learning_rate": 8.172888636722288e-05, + "loss": 1.6477, + "step": 29560 + }, + { + "epoch": 1.1392788795366404, + "grad_norm": 0.41825541853904724, + "learning_rate": 8.171614315371881e-05, + "loss": 1.6572, + "step": 29570 + }, + { + "epoch": 1.1398208207021525, + "grad_norm": 0.3095550835132599, + "learning_rate": 8.170339663073492e-05, + "loss": 1.6457, + "step": 29580 + }, + { + "epoch": 1.1398208207021525, + "eval_loss": 2.5319855213165283, + "eval_runtime": 21.9838, + "eval_samples_per_second": 227.44, + "eval_steps_per_second": 1.228, + "step": 29580 + }, + { + "epoch": 1.1403627618676648, + "grad_norm": 0.5016186833381653, + "learning_rate": 8.16906467998502e-05, + "loss": 1.6549, + "step": 29590 + }, + { + "epoch": 1.140904703033177, + "grad_norm": 0.4376733899116516, + "learning_rate": 8.1677893662644e-05, + "loss": 1.6502, + "step": 29600 + }, + { + "epoch": 1.1414466441986892, + "grad_norm": 0.285548597574234, + "learning_rate": 8.16651372206962e-05, + "loss": 1.6387, + "step": 29610 + }, + { + "epoch": 1.1419885853642013, + "grad_norm": 0.22487492859363556, + "learning_rate": 8.165237747558694e-05, + "loss": 1.6547, + "step": 29620 + }, + { + "epoch": 1.1425305265297137, + "grad_norm": 0.37438520789146423, + "learning_rate": 8.16396144288969e-05, + "loss": 1.6615, + "step": 29630 + }, + { + "epoch": 1.1430724676952257, + "grad_norm": 0.2509768009185791, + "learning_rate": 8.162684808220708e-05, + "loss": 1.6561, + "step": 29640 + }, + { + "epoch": 1.143614408860738, + "grad_norm": 0.32760903239250183, + "learning_rate": 8.161407843709889e-05, + "loss": 1.6438, + "step": 29650 + }, + { + "epoch": 1.1441563500262504, + "grad_norm": 0.29007697105407715, + "learning_rate": 8.160130549515423e-05, + "loss": 1.6537, + "step": 29660 + }, + { + "epoch": 1.1445357088421089, + "eval_loss": 2.5295541286468506, + "eval_runtime": 21.9895, + "eval_samples_per_second": 227.382, + "eval_steps_per_second": 1.228, + "step": 29667 + }, + { + "epoch": 1.1446982911917625, + "grad_norm": 0.3338105380535126, + "learning_rate": 8.158852925795534e-05, + "loss": 1.6412, + "step": 29670 + }, + { + "epoch": 1.1452402323572748, + "grad_norm": 0.2998930811882019, + "learning_rate": 8.157574972708488e-05, + "loss": 1.6408, + "step": 29680 + }, + { + "epoch": 1.1457821735227869, + "grad_norm": 0.2588035464286804, + "learning_rate": 8.156296690412593e-05, + "loss": 1.6518, + "step": 29690 + }, + { + "epoch": 1.1463241146882992, + "grad_norm": 0.24682050943374634, + "learning_rate": 8.155018079066193e-05, + "loss": 1.6513, + "step": 29700 + }, + { + "epoch": 1.1468660558538113, + "grad_norm": 0.34920454025268555, + "learning_rate": 8.153739138827684e-05, + "loss": 1.6486, + "step": 29710 + }, + { + "epoch": 1.1474079970193236, + "grad_norm": 0.24906253814697266, + "learning_rate": 8.15245986985549e-05, + "loss": 1.6509, + "step": 29720 + }, + { + "epoch": 1.1479499381848357, + "grad_norm": 0.3515869677066803, + "learning_rate": 8.151180272308085e-05, + "loss": 1.6521, + "step": 29730 + }, + { + "epoch": 1.148491879350348, + "grad_norm": 0.2527877688407898, + "learning_rate": 8.149900346343975e-05, + "loss": 1.6504, + "step": 29740 + }, + { + "epoch": 1.1490338205158603, + "grad_norm": 0.29912281036376953, + "learning_rate": 8.148620092121718e-05, + "loss": 1.6499, + "step": 29750 + }, + { + "epoch": 1.1492505969820652, + "eval_loss": 2.5310330390930176, + "eval_runtime": 21.9865, + "eval_samples_per_second": 227.412, + "eval_steps_per_second": 1.228, + "step": 29754 + }, + { + "epoch": 1.1495757616813724, + "grad_norm": 0.34561511874198914, + "learning_rate": 8.147339509799902e-05, + "loss": 1.6445, + "step": 29760 + }, + { + "epoch": 1.1501177028468847, + "grad_norm": 0.2445221096277237, + "learning_rate": 8.146058599537162e-05, + "loss": 1.6496, + "step": 29770 + }, + { + "epoch": 1.1506596440123968, + "grad_norm": 0.2948252856731415, + "learning_rate": 8.144777361492168e-05, + "loss": 1.6419, + "step": 29780 + }, + { + "epoch": 1.1512015851779092, + "grad_norm": 0.297454833984375, + "learning_rate": 8.14349579582364e-05, + "loss": 1.6407, + "step": 29790 + }, + { + "epoch": 1.1517435263434213, + "grad_norm": 0.2834935188293457, + "learning_rate": 8.142213902690329e-05, + "loss": 1.6598, + "step": 29800 + }, + { + "epoch": 1.1522854675089336, + "grad_norm": 0.4210936427116394, + "learning_rate": 8.140931682251029e-05, + "loss": 1.6401, + "step": 29810 + }, + { + "epoch": 1.1528274086744457, + "grad_norm": 0.33858510851860046, + "learning_rate": 8.13964913466458e-05, + "loss": 1.646, + "step": 29820 + }, + { + "epoch": 1.153369349839958, + "grad_norm": 0.3140000104904175, + "learning_rate": 8.138366260089855e-05, + "loss": 1.6428, + "step": 29830 + }, + { + "epoch": 1.1539112910054703, + "grad_norm": 0.24602891504764557, + "learning_rate": 8.137083058685774e-05, + "loss": 1.6408, + "step": 29840 + }, + { + "epoch": 1.1539654851220214, + "eval_loss": 2.540917158126831, + "eval_runtime": 21.9857, + "eval_samples_per_second": 227.421, + "eval_steps_per_second": 1.228, + "step": 29841 + }, + { + "epoch": 1.1544532321709824, + "grad_norm": 0.23379047214984894, + "learning_rate": 8.135799530611292e-05, + "loss": 1.6488, + "step": 29850 + }, + { + "epoch": 1.1549951733364947, + "grad_norm": 0.3018862307071686, + "learning_rate": 8.134515676025407e-05, + "loss": 1.6542, + "step": 29860 + }, + { + "epoch": 1.1555371145020068, + "grad_norm": 0.24227862060070038, + "learning_rate": 8.133231495087159e-05, + "loss": 1.6456, + "step": 29870 + }, + { + "epoch": 1.1560790556675191, + "grad_norm": 0.39517369866371155, + "learning_rate": 8.131946987955627e-05, + "loss": 1.6424, + "step": 29880 + }, + { + "epoch": 1.1566209968330314, + "grad_norm": 0.3524185121059418, + "learning_rate": 8.130662154789926e-05, + "loss": 1.6485, + "step": 29890 + }, + { + "epoch": 1.1571629379985435, + "grad_norm": 0.43569809198379517, + "learning_rate": 8.12937699574922e-05, + "loss": 1.6421, + "step": 29900 + }, + { + "epoch": 1.1577048791640558, + "grad_norm": 0.3304058909416199, + "learning_rate": 8.128091510992705e-05, + "loss": 1.652, + "step": 29910 + }, + { + "epoch": 1.158246820329568, + "grad_norm": 0.4421805143356323, + "learning_rate": 8.126805700679628e-05, + "loss": 1.6417, + "step": 29920 + }, + { + "epoch": 1.1586803732619777, + "eval_loss": 2.530564785003662, + "eval_runtime": 21.9874, + "eval_samples_per_second": 227.403, + "eval_steps_per_second": 1.228, + "step": 29928 + }, + { + "epoch": 1.1587887614950803, + "grad_norm": 0.3330616056919098, + "learning_rate": 8.125519564969263e-05, + "loss": 1.6476, + "step": 29930 + }, + { + "epoch": 1.1593307026605923, + "grad_norm": 0.36129215359687805, + "learning_rate": 8.124233104020932e-05, + "loss": 1.6414, + "step": 29940 + }, + { + "epoch": 1.1598726438261047, + "grad_norm": 0.24318821728229523, + "learning_rate": 8.122946317993999e-05, + "loss": 1.6465, + "step": 29950 + }, + { + "epoch": 1.1604145849916168, + "grad_norm": 0.2985212206840515, + "learning_rate": 8.121659207047864e-05, + "loss": 1.6386, + "step": 29960 + }, + { + "epoch": 1.160956526157129, + "grad_norm": 0.33473873138427734, + "learning_rate": 8.120371771341968e-05, + "loss": 1.6402, + "step": 29970 + }, + { + "epoch": 1.1614984673226414, + "grad_norm": 0.2525531053543091, + "learning_rate": 8.119084011035794e-05, + "loss": 1.6486, + "step": 29980 + }, + { + "epoch": 1.1620404084881535, + "grad_norm": 0.36281898617744446, + "learning_rate": 8.117795926288862e-05, + "loss": 1.6468, + "step": 29990 + }, + { + "epoch": 1.1625823496536658, + "grad_norm": 0.355827659368515, + "learning_rate": 8.116507517260737e-05, + "loss": 1.6445, + "step": 30000 + }, + { + "epoch": 1.163124290819178, + "grad_norm": 0.38084909319877625, + "learning_rate": 8.11521878411102e-05, + "loss": 1.6413, + "step": 30010 + }, + { + "epoch": 1.163395261401934, + "eval_loss": 2.527758836746216, + "eval_runtime": 21.9849, + "eval_samples_per_second": 227.428, + "eval_steps_per_second": 1.228, + "step": 30015 + }, + { + "epoch": 1.1636662319846902, + "grad_norm": 0.2793338894844055, + "learning_rate": 8.113929726999354e-05, + "loss": 1.6487, + "step": 30020 + }, + { + "epoch": 1.1642081731502023, + "grad_norm": 0.31127527356147766, + "learning_rate": 8.112640346085424e-05, + "loss": 1.6481, + "step": 30030 + }, + { + "epoch": 1.1647501143157146, + "grad_norm": 0.26151424646377563, + "learning_rate": 8.11135064152895e-05, + "loss": 1.6467, + "step": 30040 + }, + { + "epoch": 1.1652920554812267, + "grad_norm": 0.3043818771839142, + "learning_rate": 8.110060613489693e-05, + "loss": 1.6401, + "step": 30050 + }, + { + "epoch": 1.165833996646739, + "grad_norm": 0.26495102047920227, + "learning_rate": 8.108770262127463e-05, + "loss": 1.6412, + "step": 30060 + }, + { + "epoch": 1.1663759378122514, + "grad_norm": 0.31619176268577576, + "learning_rate": 8.107479587602097e-05, + "loss": 1.6383, + "step": 30070 + }, + { + "epoch": 1.1669178789777634, + "grad_norm": 0.22748631238937378, + "learning_rate": 8.106188590073481e-05, + "loss": 1.637, + "step": 30080 + }, + { + "epoch": 1.1674598201432758, + "grad_norm": 0.4346810579299927, + "learning_rate": 8.104897269701538e-05, + "loss": 1.647, + "step": 30090 + }, + { + "epoch": 1.1680017613087879, + "grad_norm": 0.2576606571674347, + "learning_rate": 8.103605626646229e-05, + "loss": 1.6389, + "step": 30100 + }, + { + "epoch": 1.1681101495418904, + "eval_loss": 2.51259708404541, + "eval_runtime": 21.9879, + "eval_samples_per_second": 227.398, + "eval_steps_per_second": 1.228, + "step": 30102 + }, + { + "epoch": 1.1685437024743002, + "grad_norm": 0.31433582305908203, + "learning_rate": 8.10231366106756e-05, + "loss": 1.6312, + "step": 30110 + }, + { + "epoch": 1.1690856436398123, + "grad_norm": 0.5255588889122009, + "learning_rate": 8.101021373125573e-05, + "loss": 1.634, + "step": 30120 + }, + { + "epoch": 1.1696275848053246, + "grad_norm": 0.23931747674942017, + "learning_rate": 8.099728762980349e-05, + "loss": 1.6314, + "step": 30130 + }, + { + "epoch": 1.1701695259708367, + "grad_norm": 0.2556639015674591, + "learning_rate": 8.098435830792013e-05, + "loss": 1.6428, + "step": 30140 + }, + { + "epoch": 1.170711467136349, + "grad_norm": 0.4357036352157593, + "learning_rate": 8.097142576720728e-05, + "loss": 1.6342, + "step": 30150 + }, + { + "epoch": 1.1712534083018613, + "grad_norm": 0.37833094596862793, + "learning_rate": 8.095849000926696e-05, + "loss": 1.6384, + "step": 30160 + }, + { + "epoch": 1.1717953494673734, + "grad_norm": 0.2795352041721344, + "learning_rate": 8.094555103570161e-05, + "loss": 1.6411, + "step": 30170 + }, + { + "epoch": 1.1723372906328857, + "grad_norm": 0.3068492114543915, + "learning_rate": 8.093260884811403e-05, + "loss": 1.6403, + "step": 30180 + }, + { + "epoch": 1.1728250376818468, + "eval_loss": 2.527621269226074, + "eval_runtime": 21.9849, + "eval_samples_per_second": 227.429, + "eval_steps_per_second": 1.228, + "step": 30189 + }, + { + "epoch": 1.1728792317983978, + "grad_norm": 0.6460037231445312, + "learning_rate": 8.091966344810746e-05, + "loss": 1.6299, + "step": 30190 + }, + { + "epoch": 1.1734211729639101, + "grad_norm": 0.5416850447654724, + "learning_rate": 8.090671483728553e-05, + "loss": 1.6407, + "step": 30200 + }, + { + "epoch": 1.1739631141294224, + "grad_norm": 0.23775923252105713, + "learning_rate": 8.08937630172522e-05, + "loss": 1.6432, + "step": 30210 + }, + { + "epoch": 1.1745050552949345, + "grad_norm": 0.26255470514297485, + "learning_rate": 8.088080798961196e-05, + "loss": 1.6431, + "step": 30220 + }, + { + "epoch": 1.1750469964604469, + "grad_norm": 0.2868313491344452, + "learning_rate": 8.086784975596959e-05, + "loss": 1.638, + "step": 30230 + }, + { + "epoch": 1.175588937625959, + "grad_norm": 0.3102096617221832, + "learning_rate": 8.085488831793029e-05, + "loss": 1.6405, + "step": 30240 + }, + { + "epoch": 1.1761308787914713, + "grad_norm": 0.3971962630748749, + "learning_rate": 8.084192367709967e-05, + "loss": 1.6356, + "step": 30250 + }, + { + "epoch": 1.1766728199569834, + "grad_norm": 0.39245739579200745, + "learning_rate": 8.082895583508374e-05, + "loss": 1.6448, + "step": 30260 + }, + { + "epoch": 1.1772147611224957, + "grad_norm": 0.31180277466773987, + "learning_rate": 8.081598479348892e-05, + "loss": 1.6339, + "step": 30270 + }, + { + "epoch": 1.177539925821803, + "eval_loss": 2.5259082317352295, + "eval_runtime": 21.9866, + "eval_samples_per_second": 227.411, + "eval_steps_per_second": 1.228, + "step": 30276 + }, + { + "epoch": 1.1777567022880078, + "grad_norm": 0.3723970353603363, + "learning_rate": 8.080301055392196e-05, + "loss": 1.6444, + "step": 30280 + }, + { + "epoch": 1.17829864345352, + "grad_norm": 0.24248439073562622, + "learning_rate": 8.079003311799008e-05, + "loss": 1.6399, + "step": 30290 + }, + { + "epoch": 1.1788405846190324, + "grad_norm": 0.545952320098877, + "learning_rate": 8.077705248730089e-05, + "loss": 1.6446, + "step": 30300 + }, + { + "epoch": 1.1793825257845445, + "grad_norm": 0.27861401438713074, + "learning_rate": 8.076406866346233e-05, + "loss": 1.6314, + "step": 30310 + }, + { + "epoch": 1.1799244669500568, + "grad_norm": 0.3368987441062927, + "learning_rate": 8.075108164808281e-05, + "loss": 1.6445, + "step": 30320 + }, + { + "epoch": 1.180466408115569, + "grad_norm": 0.5238324999809265, + "learning_rate": 8.073809144277109e-05, + "loss": 1.6504, + "step": 30330 + }, + { + "epoch": 1.1810083492810812, + "grad_norm": 0.4157116413116455, + "learning_rate": 8.072509804913634e-05, + "loss": 1.6359, + "step": 30340 + }, + { + "epoch": 1.1815502904465933, + "grad_norm": 0.4345547556877136, + "learning_rate": 8.071210146878813e-05, + "loss": 1.6455, + "step": 30350 + }, + { + "epoch": 1.1820922316121056, + "grad_norm": 0.3230676054954529, + "learning_rate": 8.069910170333643e-05, + "loss": 1.6376, + "step": 30360 + }, + { + "epoch": 1.1822548139617592, + "eval_loss": 2.5333669185638428, + "eval_runtime": 21.9869, + "eval_samples_per_second": 227.408, + "eval_steps_per_second": 1.228, + "step": 30363 + }, + { + "epoch": 1.1826341727776177, + "grad_norm": 0.3938463032245636, + "learning_rate": 8.068609875439159e-05, + "loss": 1.6364, + "step": 30370 + }, + { + "epoch": 1.18317611394313, + "grad_norm": 0.29343992471694946, + "learning_rate": 8.067309262356435e-05, + "loss": 1.6359, + "step": 30380 + }, + { + "epoch": 1.1837180551086424, + "grad_norm": 0.25108593702316284, + "learning_rate": 8.066008331246586e-05, + "loss": 1.6415, + "step": 30390 + }, + { + "epoch": 1.1842599962741545, + "grad_norm": 0.6940186619758606, + "learning_rate": 8.064707082270765e-05, + "loss": 1.6353, + "step": 30400 + }, + { + "epoch": 1.1848019374396668, + "grad_norm": 0.33529943227767944, + "learning_rate": 8.063405515590166e-05, + "loss": 1.6275, + "step": 30410 + }, + { + "epoch": 1.1853438786051789, + "grad_norm": 0.3404565453529358, + "learning_rate": 8.06210363136602e-05, + "loss": 1.638, + "step": 30420 + }, + { + "epoch": 1.1858858197706912, + "grad_norm": 0.4092163145542145, + "learning_rate": 8.0608014297596e-05, + "loss": 1.6319, + "step": 30430 + }, + { + "epoch": 1.1864277609362033, + "grad_norm": 0.5055906772613525, + "learning_rate": 8.059498910932216e-05, + "loss": 1.6342, + "step": 30440 + }, + { + "epoch": 1.1869697021017156, + "grad_norm": 0.38912099599838257, + "learning_rate": 8.05819607504522e-05, + "loss": 1.6364, + "step": 30450 + }, + { + "epoch": 1.1869697021017156, + "eval_loss": 2.5219056606292725, + "eval_runtime": 21.9805, + "eval_samples_per_second": 227.474, + "eval_steps_per_second": 1.228, + "step": 30450 + }, + { + "epoch": 1.1875116432672277, + "grad_norm": 0.47896572947502136, + "learning_rate": 8.056892922260001e-05, + "loss": 1.6426, + "step": 30460 + }, + { + "epoch": 1.18805358443274, + "grad_norm": 0.2856700122356415, + "learning_rate": 8.055589452737988e-05, + "loss": 1.6501, + "step": 30470 + }, + { + "epoch": 1.1885955255982523, + "grad_norm": 0.2293747514486313, + "learning_rate": 8.054285666640649e-05, + "loss": 1.6401, + "step": 30480 + }, + { + "epoch": 1.1891374667637644, + "grad_norm": 0.46321722865104675, + "learning_rate": 8.052981564129489e-05, + "loss": 1.6478, + "step": 30490 + }, + { + "epoch": 1.1896794079292767, + "grad_norm": 0.39135417342185974, + "learning_rate": 8.051677145366058e-05, + "loss": 1.6481, + "step": 30500 + }, + { + "epoch": 1.1902213490947888, + "grad_norm": 0.4103468954563141, + "learning_rate": 8.050372410511941e-05, + "loss": 1.6359, + "step": 30510 + }, + { + "epoch": 1.1907632902603011, + "grad_norm": 0.2855057418346405, + "learning_rate": 8.049067359728763e-05, + "loss": 1.6489, + "step": 30520 + }, + { + "epoch": 1.1913052314258132, + "grad_norm": 0.3194611966609955, + "learning_rate": 8.047761993178186e-05, + "loss": 1.6374, + "step": 30530 + }, + { + "epoch": 1.191684590241672, + "eval_loss": 2.5290892124176025, + "eval_runtime": 21.9827, + "eval_samples_per_second": 227.452, + "eval_steps_per_second": 1.228, + "step": 30537 + }, + { + "epoch": 1.1918471725913256, + "grad_norm": 0.47105491161346436, + "learning_rate": 8.046456311021916e-05, + "loss": 1.6399, + "step": 30540 + }, + { + "epoch": 1.1923891137568376, + "grad_norm": 0.3898428976535797, + "learning_rate": 8.045150313421693e-05, + "loss": 1.6325, + "step": 30550 + }, + { + "epoch": 1.19293105492235, + "grad_norm": 0.23930542171001434, + "learning_rate": 8.0438440005393e-05, + "loss": 1.6297, + "step": 30560 + }, + { + "epoch": 1.1934729960878623, + "grad_norm": 0.31366342306137085, + "learning_rate": 8.042537372536556e-05, + "loss": 1.634, + "step": 30570 + }, + { + "epoch": 1.1940149372533744, + "grad_norm": 0.22141672670841217, + "learning_rate": 8.041230429575319e-05, + "loss": 1.6477, + "step": 30580 + }, + { + "epoch": 1.1945568784188867, + "grad_norm": 0.5529452562332153, + "learning_rate": 8.039923171817492e-05, + "loss": 1.6397, + "step": 30590 + }, + { + "epoch": 1.1950988195843988, + "grad_norm": 0.4657527208328247, + "learning_rate": 8.038615599425008e-05, + "loss": 1.6408, + "step": 30600 + }, + { + "epoch": 1.195640760749911, + "grad_norm": 0.4929318130016327, + "learning_rate": 8.037307712559847e-05, + "loss": 1.6377, + "step": 30610 + }, + { + "epoch": 1.1961827019154234, + "grad_norm": 0.38976770639419556, + "learning_rate": 8.03599951138402e-05, + "loss": 1.6393, + "step": 30620 + }, + { + "epoch": 1.196399478381628, + "eval_loss": 2.534869432449341, + "eval_runtime": 21.9803, + "eval_samples_per_second": 227.476, + "eval_steps_per_second": 1.228, + "step": 30624 + }, + { + "epoch": 1.1967246430809355, + "grad_norm": 0.2525932788848877, + "learning_rate": 8.034690996059584e-05, + "loss": 1.6411, + "step": 30630 + }, + { + "epoch": 1.1972665842464478, + "grad_norm": 0.2530060112476349, + "learning_rate": 8.033382166748633e-05, + "loss": 1.6505, + "step": 30640 + }, + { + "epoch": 1.19780852541196, + "grad_norm": 0.46789705753326416, + "learning_rate": 8.032073023613299e-05, + "loss": 1.6235, + "step": 30650 + }, + { + "epoch": 1.1983504665774722, + "grad_norm": 0.547118604183197, + "learning_rate": 8.03076356681575e-05, + "loss": 1.6362, + "step": 30660 + }, + { + "epoch": 1.1988924077429843, + "grad_norm": 0.33731144666671753, + "learning_rate": 8.029453796518198e-05, + "loss": 1.6387, + "step": 30670 + }, + { + "epoch": 1.1994343489084967, + "grad_norm": 0.22524616122245789, + "learning_rate": 8.028143712882893e-05, + "loss": 1.6338, + "step": 30680 + }, + { + "epoch": 1.1999762900740087, + "grad_norm": 0.46151962876319885, + "learning_rate": 8.02683331607212e-05, + "loss": 1.6475, + "step": 30690 + }, + { + "epoch": 1.200518231239521, + "grad_norm": 0.313071072101593, + "learning_rate": 8.025522606248206e-05, + "loss": 1.6416, + "step": 30700 + }, + { + "epoch": 1.2010601724050334, + "grad_norm": 0.480162113904953, + "learning_rate": 8.024211583573516e-05, + "loss": 1.6333, + "step": 30710 + }, + { + "epoch": 1.2011143665215844, + "eval_loss": 2.5296213626861572, + "eval_runtime": 21.9849, + "eval_samples_per_second": 227.429, + "eval_steps_per_second": 1.228, + "step": 30711 + }, + { + "epoch": 1.2016021135705455, + "grad_norm": 0.3885667622089386, + "learning_rate": 8.022900248210455e-05, + "loss": 1.6343, + "step": 30720 + }, + { + "epoch": 1.2021440547360578, + "grad_norm": 0.34139159321784973, + "learning_rate": 8.021588600321465e-05, + "loss": 1.6332, + "step": 30730 + }, + { + "epoch": 1.2026859959015699, + "grad_norm": 0.2892656922340393, + "learning_rate": 8.020276640069025e-05, + "loss": 1.6362, + "step": 30740 + }, + { + "epoch": 1.2032279370670822, + "grad_norm": 0.4062507748603821, + "learning_rate": 8.018964367615659e-05, + "loss": 1.6388, + "step": 30750 + }, + { + "epoch": 1.2037698782325943, + "grad_norm": 0.3714730739593506, + "learning_rate": 8.017651783123922e-05, + "loss": 1.63, + "step": 30760 + }, + { + "epoch": 1.2043118193981066, + "grad_norm": 0.5153228044509888, + "learning_rate": 8.016338886756412e-05, + "loss": 1.6387, + "step": 30770 + }, + { + "epoch": 1.2048537605636187, + "grad_norm": 0.27551186084747314, + "learning_rate": 8.015025678675767e-05, + "loss": 1.634, + "step": 30780 + }, + { + "epoch": 1.205395701729131, + "grad_norm": 0.8653702139854431, + "learning_rate": 8.013712159044658e-05, + "loss": 1.6362, + "step": 30790 + }, + { + "epoch": 1.2058292546615408, + "eval_loss": 2.5252997875213623, + "eval_runtime": 21.986, + "eval_samples_per_second": 227.418, + "eval_steps_per_second": 1.228, + "step": 30798 + }, + { + "epoch": 1.2059376428946433, + "grad_norm": 0.8155677914619446, + "learning_rate": 8.012398328025804e-05, + "loss": 1.6397, + "step": 30800 + }, + { + "epoch": 1.2064795840601554, + "grad_norm": 0.49000170826911926, + "learning_rate": 8.01108418578195e-05, + "loss": 1.6407, + "step": 30810 + }, + { + "epoch": 1.2070215252256677, + "grad_norm": 0.37893688678741455, + "learning_rate": 8.009769732475889e-05, + "loss": 1.6368, + "step": 30820 + }, + { + "epoch": 1.2075634663911798, + "grad_norm": 0.3986130356788635, + "learning_rate": 8.008454968270452e-05, + "loss": 1.634, + "step": 30830 + }, + { + "epoch": 1.2081054075566922, + "grad_norm": 0.33178648352622986, + "learning_rate": 8.0071398933285e-05, + "loss": 1.6315, + "step": 30840 + }, + { + "epoch": 1.2086473487222043, + "grad_norm": 0.3417208790779114, + "learning_rate": 8.005824507812947e-05, + "loss": 1.6246, + "step": 30850 + }, + { + "epoch": 1.2091892898877166, + "grad_norm": 0.4329122006893158, + "learning_rate": 8.004508811886732e-05, + "loss": 1.6451, + "step": 30860 + }, + { + "epoch": 1.2097312310532287, + "grad_norm": 0.4540995657444, + "learning_rate": 8.003192805712839e-05, + "loss": 1.6188, + "step": 30870 + }, + { + "epoch": 1.210273172218741, + "grad_norm": 0.41036659479141235, + "learning_rate": 8.001876489454289e-05, + "loss": 1.6392, + "step": 30880 + }, + { + "epoch": 1.2105441428014971, + "eval_loss": 2.5213656425476074, + "eval_runtime": 21.9838, + "eval_samples_per_second": 227.441, + "eval_steps_per_second": 1.228, + "step": 30885 + }, + { + "epoch": 1.2108151133842533, + "grad_norm": 0.2853230834007263, + "learning_rate": 8.00055986327414e-05, + "loss": 1.6335, + "step": 30890 + }, + { + "epoch": 1.2113570545497654, + "grad_norm": 0.23493582010269165, + "learning_rate": 7.999242927335493e-05, + "loss": 1.6369, + "step": 30900 + }, + { + "epoch": 1.2118989957152777, + "grad_norm": 0.4317961037158966, + "learning_rate": 7.997925681801484e-05, + "loss": 1.6346, + "step": 30910 + }, + { + "epoch": 1.2124409368807898, + "grad_norm": 0.29279986023902893, + "learning_rate": 7.996608126835285e-05, + "loss": 1.6326, + "step": 30920 + }, + { + "epoch": 1.2129828780463021, + "grad_norm": 0.3215675950050354, + "learning_rate": 7.995290262600109e-05, + "loss": 1.6445, + "step": 30930 + }, + { + "epoch": 1.2135248192118144, + "grad_norm": 0.6567270755767822, + "learning_rate": 7.99397208925921e-05, + "loss": 1.6359, + "step": 30940 + }, + { + "epoch": 1.2140667603773265, + "grad_norm": 0.5213804841041565, + "learning_rate": 7.992653606975877e-05, + "loss": 1.6262, + "step": 30950 + }, + { + "epoch": 1.2146087015428388, + "grad_norm": 0.3358005881309509, + "learning_rate": 7.991334815913437e-05, + "loss": 1.6371, + "step": 30960 + }, + { + "epoch": 1.215150642708351, + "grad_norm": 0.26169344782829285, + "learning_rate": 7.990015716235255e-05, + "loss": 1.6329, + "step": 30970 + }, + { + "epoch": 1.2152590309414535, + "eval_loss": 2.521756172180176, + "eval_runtime": 21.9857, + "eval_samples_per_second": 227.421, + "eval_steps_per_second": 1.228, + "step": 30972 + }, + { + "epoch": 1.2156925838738633, + "grad_norm": 0.3503156304359436, + "learning_rate": 7.988696308104738e-05, + "loss": 1.6396, + "step": 30980 + }, + { + "epoch": 1.2162345250393753, + "grad_norm": 0.4474554657936096, + "learning_rate": 7.987376591685325e-05, + "loss": 1.6303, + "step": 30990 + }, + { + "epoch": 1.2167764662048877, + "grad_norm": 0.25015437602996826, + "learning_rate": 7.986056567140502e-05, + "loss": 1.6229, + "step": 31000 + }, + { + "epoch": 1.2173184073703998, + "grad_norm": 0.40038561820983887, + "learning_rate": 7.984736234633784e-05, + "loss": 1.6299, + "step": 31010 + }, + { + "epoch": 1.217860348535912, + "grad_norm": 0.3585070073604584, + "learning_rate": 7.983415594328729e-05, + "loss": 1.644, + "step": 31020 + }, + { + "epoch": 1.2184022897014244, + "grad_norm": 0.2341560274362564, + "learning_rate": 7.98209464638893e-05, + "loss": 1.6389, + "step": 31030 + }, + { + "epoch": 1.2189442308669365, + "grad_norm": 0.32264870405197144, + "learning_rate": 7.980773390978024e-05, + "loss": 1.635, + "step": 31040 + }, + { + "epoch": 1.2194861720324488, + "grad_norm": 0.46334415674209595, + "learning_rate": 7.979451828259681e-05, + "loss": 1.6315, + "step": 31050 + }, + { + "epoch": 1.2199739190814096, + "eval_loss": 2.524176597595215, + "eval_runtime": 21.9898, + "eval_samples_per_second": 227.379, + "eval_steps_per_second": 1.228, + "step": 31059 + }, + { + "epoch": 1.220028113197961, + "grad_norm": 0.2632630467414856, + "learning_rate": 7.978129958397612e-05, + "loss": 1.6317, + "step": 31060 + }, + { + "epoch": 1.2205700543634732, + "grad_norm": 0.5674657225608826, + "learning_rate": 7.97680778155556e-05, + "loss": 1.6294, + "step": 31070 + }, + { + "epoch": 1.2211119955289853, + "grad_norm": 0.4556896686553955, + "learning_rate": 7.975485297897312e-05, + "loss": 1.633, + "step": 31080 + }, + { + "epoch": 1.2216539366944976, + "grad_norm": 0.362857848405838, + "learning_rate": 7.974162507586696e-05, + "loss": 1.632, + "step": 31090 + }, + { + "epoch": 1.2221958778600097, + "grad_norm": 0.3741185963153839, + "learning_rate": 7.972839410787568e-05, + "loss": 1.6396, + "step": 31100 + }, + { + "epoch": 1.222737819025522, + "grad_norm": 0.5633114576339722, + "learning_rate": 7.971516007663831e-05, + "loss": 1.6354, + "step": 31110 + }, + { + "epoch": 1.2232797601910343, + "grad_norm": 0.3228178322315216, + "learning_rate": 7.970192298379421e-05, + "loss": 1.6383, + "step": 31120 + }, + { + "epoch": 1.2238217013565464, + "grad_norm": 0.39119747281074524, + "learning_rate": 7.968868283098314e-05, + "loss": 1.6271, + "step": 31130 + }, + { + "epoch": 1.2243636425220588, + "grad_norm": 0.28056901693344116, + "learning_rate": 7.967543961984522e-05, + "loss": 1.6356, + "step": 31140 + }, + { + "epoch": 1.224688807221366, + "eval_loss": 2.5236127376556396, + "eval_runtime": 21.9859, + "eval_samples_per_second": 227.418, + "eval_steps_per_second": 1.228, + "step": 31146 + }, + { + "epoch": 1.2249055836875709, + "grad_norm": 0.4328171908855438, + "learning_rate": 7.966219335202097e-05, + "loss": 1.6377, + "step": 31150 + }, + { + "epoch": 1.2254475248530832, + "grad_norm": 0.3190264105796814, + "learning_rate": 7.964894402915127e-05, + "loss": 1.6327, + "step": 31160 + }, + { + "epoch": 1.2259894660185953, + "grad_norm": 0.38925376534461975, + "learning_rate": 7.963569165287743e-05, + "loss": 1.6092, + "step": 31170 + }, + { + "epoch": 1.2265314071841076, + "grad_norm": 0.2989320755004883, + "learning_rate": 7.962243622484104e-05, + "loss": 1.6305, + "step": 31180 + }, + { + "epoch": 1.2270733483496197, + "grad_norm": 0.21814024448394775, + "learning_rate": 7.960917774668415e-05, + "loss": 1.6351, + "step": 31190 + }, + { + "epoch": 1.227615289515132, + "grad_norm": 0.23537899553775787, + "learning_rate": 7.95959162200492e-05, + "loss": 1.6261, + "step": 31200 + }, + { + "epoch": 1.2281572306806443, + "grad_norm": 0.3734976649284363, + "learning_rate": 7.958265164657889e-05, + "loss": 1.6297, + "step": 31210 + }, + { + "epoch": 1.2286991718461564, + "grad_norm": 0.3379276990890503, + "learning_rate": 7.956938402791644e-05, + "loss": 1.622, + "step": 31220 + }, + { + "epoch": 1.2292411130116687, + "grad_norm": 0.3800952434539795, + "learning_rate": 7.955611336570537e-05, + "loss": 1.6307, + "step": 31230 + }, + { + "epoch": 1.2294036953613223, + "eval_loss": 2.5284194946289062, + "eval_runtime": 21.9883, + "eval_samples_per_second": 227.393, + "eval_steps_per_second": 1.228, + "step": 31233 + }, + { + "epoch": 1.2297830541771808, + "grad_norm": 0.36053764820098877, + "learning_rate": 7.954283966158957e-05, + "loss": 1.6258, + "step": 31240 + }, + { + "epoch": 1.2303249953426931, + "grad_norm": 0.24756401777267456, + "learning_rate": 7.952956291721335e-05, + "loss": 1.6383, + "step": 31250 + }, + { + "epoch": 1.2308669365082054, + "grad_norm": 0.24404959380626678, + "learning_rate": 7.951628313422139e-05, + "loss": 1.6229, + "step": 31260 + }, + { + "epoch": 1.2314088776737175, + "grad_norm": 0.2366773635149002, + "learning_rate": 7.95030003142587e-05, + "loss": 1.6288, + "step": 31270 + }, + { + "epoch": 1.2319508188392299, + "grad_norm": 0.6960218548774719, + "learning_rate": 7.948971445897072e-05, + "loss": 1.6308, + "step": 31280 + }, + { + "epoch": 1.232492760004742, + "grad_norm": 0.24987642467021942, + "learning_rate": 7.947642557000324e-05, + "loss": 1.6346, + "step": 31290 + }, + { + "epoch": 1.2330347011702543, + "grad_norm": 0.3360191583633423, + "learning_rate": 7.946313364900242e-05, + "loss": 1.6323, + "step": 31300 + }, + { + "epoch": 1.2335766423357664, + "grad_norm": 0.3646528124809265, + "learning_rate": 7.944983869761481e-05, + "loss": 1.6235, + "step": 31310 + }, + { + "epoch": 1.2341185835012787, + "grad_norm": 0.29639899730682373, + "learning_rate": 7.943654071748734e-05, + "loss": 1.6291, + "step": 31320 + }, + { + "epoch": 1.2341185835012787, + "eval_loss": 2.529906749725342, + "eval_runtime": 21.9492, + "eval_samples_per_second": 227.799, + "eval_steps_per_second": 1.23, + "step": 31320 + }, + { + "epoch": 1.2346605246667908, + "grad_norm": 0.2218153476715088, + "learning_rate": 7.942323971026729e-05, + "loss": 1.6226, + "step": 31330 + }, + { + "epoch": 1.235202465832303, + "grad_norm": 0.25773563981056213, + "learning_rate": 7.940993567760235e-05, + "loss": 1.6332, + "step": 31340 + }, + { + "epoch": 1.2357444069978154, + "grad_norm": 0.26830193400382996, + "learning_rate": 7.939662862114053e-05, + "loss": 1.6398, + "step": 31350 + }, + { + "epoch": 1.2362863481633275, + "grad_norm": 0.3047340214252472, + "learning_rate": 7.938331854253031e-05, + "loss": 1.6377, + "step": 31360 + }, + { + "epoch": 1.2368282893288398, + "grad_norm": 0.39592602849006653, + "learning_rate": 7.937000544342042e-05, + "loss": 1.6235, + "step": 31370 + }, + { + "epoch": 1.237370230494352, + "grad_norm": 0.39517736434936523, + "learning_rate": 7.935668932546009e-05, + "loss": 1.639, + "step": 31380 + }, + { + "epoch": 1.2379121716598642, + "grad_norm": 0.3887443542480469, + "learning_rate": 7.934337019029881e-05, + "loss": 1.6261, + "step": 31390 + }, + { + "epoch": 1.2384541128253763, + "grad_norm": 0.5671885013580322, + "learning_rate": 7.933004803958654e-05, + "loss": 1.6313, + "step": 31400 + }, + { + "epoch": 1.238833471641235, + "eval_loss": 2.5245509147644043, + "eval_runtime": 21.9815, + "eval_samples_per_second": 227.464, + "eval_steps_per_second": 1.228, + "step": 31407 + }, + { + "epoch": 1.2389960539908886, + "grad_norm": 0.3325177729129791, + "learning_rate": 7.931672287497353e-05, + "loss": 1.6346, + "step": 31410 + }, + { + "epoch": 1.2395379951564007, + "grad_norm": 0.36562344431877136, + "learning_rate": 7.930339469811045e-05, + "loss": 1.6241, + "step": 31420 + }, + { + "epoch": 1.240079936321913, + "grad_norm": 0.2752797603607178, + "learning_rate": 7.929006351064838e-05, + "loss": 1.6228, + "step": 31430 + }, + { + "epoch": 1.2406218774874254, + "grad_norm": 0.27838945388793945, + "learning_rate": 7.927672931423869e-05, + "loss": 1.6249, + "step": 31440 + }, + { + "epoch": 1.2411638186529375, + "grad_norm": 0.40533244609832764, + "learning_rate": 7.926339211053316e-05, + "loss": 1.6243, + "step": 31450 + }, + { + "epoch": 1.2417057598184498, + "grad_norm": 0.47712355852127075, + "learning_rate": 7.925005190118397e-05, + "loss": 1.6253, + "step": 31460 + }, + { + "epoch": 1.2422477009839619, + "grad_norm": 0.3724726140499115, + "learning_rate": 7.923670868784364e-05, + "loss": 1.6356, + "step": 31470 + }, + { + "epoch": 1.2427896421494742, + "grad_norm": 0.26425930857658386, + "learning_rate": 7.922336247216505e-05, + "loss": 1.6291, + "step": 31480 + }, + { + "epoch": 1.2433315833149863, + "grad_norm": 0.2903880476951599, + "learning_rate": 7.92100132558015e-05, + "loss": 1.6299, + "step": 31490 + }, + { + "epoch": 1.2435483597811912, + "eval_loss": 2.520765781402588, + "eval_runtime": 21.9829, + "eval_samples_per_second": 227.449, + "eval_steps_per_second": 1.228, + "step": 31494 + }, + { + "epoch": 1.2438735244804986, + "grad_norm": 0.3055003881454468, + "learning_rate": 7.91966610404066e-05, + "loss": 1.6376, + "step": 31500 + }, + { + "epoch": 1.2444154656460107, + "grad_norm": 0.37320762872695923, + "learning_rate": 7.918330582763438e-05, + "loss": 1.6298, + "step": 31510 + }, + { + "epoch": 1.244957406811523, + "grad_norm": 0.2776867747306824, + "learning_rate": 7.916994761913923e-05, + "loss": 1.6266, + "step": 31520 + }, + { + "epoch": 1.2454993479770353, + "grad_norm": 0.2319355010986328, + "learning_rate": 7.91565864165759e-05, + "loss": 1.6252, + "step": 31530 + }, + { + "epoch": 1.2460412891425474, + "grad_norm": 0.5147307515144348, + "learning_rate": 7.914322222159953e-05, + "loss": 1.6261, + "step": 31540 + }, + { + "epoch": 1.2465832303080597, + "grad_norm": 0.31287848949432373, + "learning_rate": 7.912985503586562e-05, + "loss": 1.6304, + "step": 31550 + }, + { + "epoch": 1.2471251714735718, + "grad_norm": 0.2941262423992157, + "learning_rate": 7.911648486103002e-05, + "loss": 1.6325, + "step": 31560 + }, + { + "epoch": 1.2476671126390841, + "grad_norm": 0.2733207941055298, + "learning_rate": 7.910311169874898e-05, + "loss": 1.6273, + "step": 31570 + }, + { + "epoch": 1.2482090538045965, + "grad_norm": 0.40679219365119934, + "learning_rate": 7.908973555067911e-05, + "loss": 1.6264, + "step": 31580 + }, + { + "epoch": 1.2482632479211475, + "eval_loss": 2.5255589485168457, + "eval_runtime": 21.9824, + "eval_samples_per_second": 227.454, + "eval_steps_per_second": 1.228, + "step": 31581 + }, + { + "epoch": 1.2487509949701086, + "grad_norm": 0.27531698346138, + "learning_rate": 7.907635641847739e-05, + "loss": 1.6209, + "step": 31590 + }, + { + "epoch": 1.2492929361356209, + "grad_norm": 0.26964595913887024, + "learning_rate": 7.906297430380114e-05, + "loss": 1.6253, + "step": 31600 + }, + { + "epoch": 1.249834877301133, + "grad_norm": 0.25738173723220825, + "learning_rate": 7.904958920830813e-05, + "loss": 1.6205, + "step": 31610 + }, + { + "epoch": 1.2503768184666453, + "grad_norm": 0.2674032151699066, + "learning_rate": 7.903620113365644e-05, + "loss": 1.6191, + "step": 31620 + }, + { + "epoch": 1.2509187596321574, + "grad_norm": 0.42425525188446045, + "learning_rate": 7.902281008150447e-05, + "loss": 1.6345, + "step": 31630 + }, + { + "epoch": 1.2514607007976697, + "grad_norm": 0.4274544417858124, + "learning_rate": 7.90094160535111e-05, + "loss": 1.6372, + "step": 31640 + }, + { + "epoch": 1.2520026419631818, + "grad_norm": 0.36170271039009094, + "learning_rate": 7.89960190513355e-05, + "loss": 1.6288, + "step": 31650 + }, + { + "epoch": 1.252544583128694, + "grad_norm": 0.35101965069770813, + "learning_rate": 7.898261907663725e-05, + "loss": 1.6218, + "step": 31660 + }, + { + "epoch": 1.2529781360611039, + "eval_loss": 2.5185720920562744, + "eval_runtime": 21.9807, + "eval_samples_per_second": 227.472, + "eval_steps_per_second": 1.228, + "step": 31668 + }, + { + "epoch": 1.2530865242942064, + "grad_norm": 0.2534119188785553, + "learning_rate": 7.896921613107626e-05, + "loss": 1.6279, + "step": 31670 + }, + { + "epoch": 1.2536284654597185, + "grad_norm": 0.4205189049243927, + "learning_rate": 7.895581021631286e-05, + "loss": 1.623, + "step": 31680 + }, + { + "epoch": 1.2541704066252308, + "grad_norm": 0.29453709721565247, + "learning_rate": 7.894240133400767e-05, + "loss": 1.6259, + "step": 31690 + }, + { + "epoch": 1.254712347790743, + "grad_norm": 0.3749118149280548, + "learning_rate": 7.892898948582177e-05, + "loss": 1.6186, + "step": 31700 + }, + { + "epoch": 1.2552542889562552, + "grad_norm": 0.3719863295555115, + "learning_rate": 7.891557467341653e-05, + "loss": 1.6233, + "step": 31710 + }, + { + "epoch": 1.2557962301217673, + "grad_norm": 0.2719629406929016, + "learning_rate": 7.890215689845374e-05, + "loss": 1.642, + "step": 31720 + }, + { + "epoch": 1.2563381712872796, + "grad_norm": 0.29956310987472534, + "learning_rate": 7.888873616259552e-05, + "loss": 1.6394, + "step": 31730 + }, + { + "epoch": 1.2568801124527917, + "grad_norm": 0.6171798706054688, + "learning_rate": 7.887531246750438e-05, + "loss": 1.6224, + "step": 31740 + }, + { + "epoch": 1.257422053618304, + "grad_norm": 0.42021244764328003, + "learning_rate": 7.886188581484318e-05, + "loss": 1.6289, + "step": 31750 + }, + { + "epoch": 1.2576930242010602, + "eval_loss": 2.518956422805786, + "eval_runtime": 21.9853, + "eval_samples_per_second": 227.425, + "eval_steps_per_second": 1.228, + "step": 31755 + }, + { + "epoch": 1.2579639947838164, + "grad_norm": 0.3666936159133911, + "learning_rate": 7.884845620627518e-05, + "loss": 1.6277, + "step": 31760 + }, + { + "epoch": 1.2585059359493285, + "grad_norm": 0.3201642334461212, + "learning_rate": 7.883502364346396e-05, + "loss": 1.6311, + "step": 31770 + }, + { + "epoch": 1.2590478771148408, + "grad_norm": 0.2944391369819641, + "learning_rate": 7.882158812807349e-05, + "loss": 1.6259, + "step": 31780 + }, + { + "epoch": 1.2595898182803529, + "grad_norm": 0.2937580943107605, + "learning_rate": 7.88081496617681e-05, + "loss": 1.626, + "step": 31790 + }, + { + "epoch": 1.2601317594458652, + "grad_norm": 0.3028720021247864, + "learning_rate": 7.87947082462125e-05, + "loss": 1.6325, + "step": 31800 + }, + { + "epoch": 1.2606737006113775, + "grad_norm": 0.2833790183067322, + "learning_rate": 7.878126388307173e-05, + "loss": 1.6247, + "step": 31810 + }, + { + "epoch": 1.2612156417768896, + "grad_norm": 0.264164000749588, + "learning_rate": 7.876781657401125e-05, + "loss": 1.627, + "step": 31820 + }, + { + "epoch": 1.2617575829424017, + "grad_norm": 0.3081631064414978, + "learning_rate": 7.875436632069687e-05, + "loss": 1.6225, + "step": 31830 + }, + { + "epoch": 1.262299524107914, + "grad_norm": 0.4150921404361725, + "learning_rate": 7.874091312479468e-05, + "loss": 1.6336, + "step": 31840 + }, + { + "epoch": 1.2624079123410166, + "eval_loss": 2.5215582847595215, + "eval_runtime": 21.9862, + "eval_samples_per_second": 227.416, + "eval_steps_per_second": 1.228, + "step": 31842 + }, + { + "epoch": 1.2628414652734263, + "grad_norm": 0.689495861530304, + "learning_rate": 7.872745698797128e-05, + "loss": 1.6334, + "step": 31850 + }, + { + "epoch": 1.2633834064389384, + "grad_norm": 0.501124918460846, + "learning_rate": 7.87139979118935e-05, + "loss": 1.6275, + "step": 31860 + }, + { + "epoch": 1.2639253476044507, + "grad_norm": 0.2936415672302246, + "learning_rate": 7.870053589822863e-05, + "loss": 1.6368, + "step": 31870 + }, + { + "epoch": 1.2644672887699628, + "grad_norm": 0.3334285616874695, + "learning_rate": 7.868707094864427e-05, + "loss": 1.6255, + "step": 31880 + }, + { + "epoch": 1.2650092299354752, + "grad_norm": 0.2995965778827667, + "learning_rate": 7.867360306480839e-05, + "loss": 1.6214, + "step": 31890 + }, + { + "epoch": 1.2655511711009875, + "grad_norm": 0.3712153732776642, + "learning_rate": 7.866013224838933e-05, + "loss": 1.634, + "step": 31900 + }, + { + "epoch": 1.2660931122664996, + "grad_norm": 0.30742502212524414, + "learning_rate": 7.864665850105583e-05, + "loss": 1.6193, + "step": 31910 + }, + { + "epoch": 1.2666350534320117, + "grad_norm": 0.27019500732421875, + "learning_rate": 7.863318182447693e-05, + "loss": 1.627, + "step": 31920 + }, + { + "epoch": 1.2671228004809727, + "eval_loss": 2.5232045650482178, + "eval_runtime": 21.9899, + "eval_samples_per_second": 227.377, + "eval_steps_per_second": 1.228, + "step": 31929 + }, + { + "epoch": 1.267176994597524, + "grad_norm": 0.3056955337524414, + "learning_rate": 7.861970222032207e-05, + "loss": 1.6083, + "step": 31930 + }, + { + "epoch": 1.2677189357630363, + "grad_norm": 0.28661999106407166, + "learning_rate": 7.860621969026106e-05, + "loss": 1.6225, + "step": 31940 + }, + { + "epoch": 1.2682608769285484, + "grad_norm": 0.26536825299263, + "learning_rate": 7.859273423596403e-05, + "loss": 1.6166, + "step": 31950 + }, + { + "epoch": 1.2688028180940607, + "grad_norm": 0.5294485092163086, + "learning_rate": 7.85792458591015e-05, + "loss": 1.6225, + "step": 31960 + }, + { + "epoch": 1.2693447592595728, + "grad_norm": 0.3292939364910126, + "learning_rate": 7.85657545613444e-05, + "loss": 1.6334, + "step": 31970 + }, + { + "epoch": 1.2698867004250851, + "grad_norm": 0.23607225716114044, + "learning_rate": 7.85522603443639e-05, + "loss": 1.6311, + "step": 31980 + }, + { + "epoch": 1.2704286415905974, + "grad_norm": 0.28158169984817505, + "learning_rate": 7.853876320983165e-05, + "loss": 1.622, + "step": 31990 + }, + { + "epoch": 1.2709705827561095, + "grad_norm": 0.2377581000328064, + "learning_rate": 7.852526315941961e-05, + "loss": 1.623, + "step": 32000 + }, + { + "epoch": 1.2715125239216218, + "grad_norm": 0.514994740486145, + "learning_rate": 7.851176019480012e-05, + "loss": 1.6112, + "step": 32010 + }, + { + "epoch": 1.271837688620929, + "eval_loss": 2.515641689300537, + "eval_runtime": 21.988, + "eval_samples_per_second": 227.397, + "eval_steps_per_second": 1.228, + "step": 32016 + }, + { + "epoch": 1.272054465087134, + "grad_norm": 0.3797670900821686, + "learning_rate": 7.849825431764585e-05, + "loss": 1.6232, + "step": 32020 + }, + { + "epoch": 1.2725964062526463, + "grad_norm": 0.44352027773857117, + "learning_rate": 7.848474552962984e-05, + "loss": 1.6295, + "step": 32030 + }, + { + "epoch": 1.2731383474181583, + "grad_norm": 0.2417132705450058, + "learning_rate": 7.847123383242552e-05, + "loss": 1.6249, + "step": 32040 + }, + { + "epoch": 1.2736802885836707, + "grad_norm": 0.4180681109428406, + "learning_rate": 7.845771922770667e-05, + "loss": 1.6065, + "step": 32050 + }, + { + "epoch": 1.2742222297491828, + "grad_norm": 0.29426735639572144, + "learning_rate": 7.84442017171474e-05, + "loss": 1.6243, + "step": 32060 + }, + { + "epoch": 1.274764170914695, + "grad_norm": 0.37015318870544434, + "learning_rate": 7.84306813024222e-05, + "loss": 1.6223, + "step": 32070 + }, + { + "epoch": 1.2753061120802074, + "grad_norm": 0.2594332993030548, + "learning_rate": 7.841715798520592e-05, + "loss": 1.6248, + "step": 32080 + }, + { + "epoch": 1.2758480532457195, + "grad_norm": 0.23385967314243317, + "learning_rate": 7.840363176717377e-05, + "loss": 1.6098, + "step": 32090 + }, + { + "epoch": 1.2763899944112318, + "grad_norm": 0.38536781072616577, + "learning_rate": 7.839010265000136e-05, + "loss": 1.6224, + "step": 32100 + }, + { + "epoch": 1.2765525767608854, + "eval_loss": 2.518597364425659, + "eval_runtime": 21.9837, + "eval_samples_per_second": 227.441, + "eval_steps_per_second": 1.228, + "step": 32103 + }, + { + "epoch": 1.276931935576744, + "grad_norm": 0.2687205374240875, + "learning_rate": 7.837657063536456e-05, + "loss": 1.6277, + "step": 32110 + }, + { + "epoch": 1.2774738767422562, + "grad_norm": 0.29153600335121155, + "learning_rate": 7.83630357249397e-05, + "loss": 1.6221, + "step": 32120 + }, + { + "epoch": 1.2780158179077685, + "grad_norm": 0.42591148614883423, + "learning_rate": 7.834949792040337e-05, + "loss": 1.6185, + "step": 32130 + }, + { + "epoch": 1.2785577590732806, + "grad_norm": 0.5398927927017212, + "learning_rate": 7.833595722343263e-05, + "loss": 1.6177, + "step": 32140 + }, + { + "epoch": 1.2790997002387927, + "grad_norm": 0.3706096410751343, + "learning_rate": 7.832241363570482e-05, + "loss": 1.6266, + "step": 32150 + }, + { + "epoch": 1.279641641404305, + "grad_norm": 0.34234559535980225, + "learning_rate": 7.830886715889766e-05, + "loss": 1.6242, + "step": 32160 + }, + { + "epoch": 1.2801835825698173, + "grad_norm": 0.288562148809433, + "learning_rate": 7.829531779468925e-05, + "loss": 1.6247, + "step": 32170 + }, + { + "epoch": 1.2807255237353294, + "grad_norm": 0.5007647275924683, + "learning_rate": 7.8281765544758e-05, + "loss": 1.6259, + "step": 32180 + }, + { + "epoch": 1.2812674649008418, + "grad_norm": 0.2426513284444809, + "learning_rate": 7.826821041078271e-05, + "loss": 1.6284, + "step": 32190 + }, + { + "epoch": 1.2812674649008418, + "eval_loss": 2.516087293624878, + "eval_runtime": 21.9703, + "eval_samples_per_second": 227.58, + "eval_steps_per_second": 1.229, + "step": 32190 + }, + { + "epoch": 1.2818094060663539, + "grad_norm": 0.2416008710861206, + "learning_rate": 7.825465239444255e-05, + "loss": 1.6283, + "step": 32200 + }, + { + "epoch": 1.2823513472318662, + "grad_norm": 0.3502400517463684, + "learning_rate": 7.824109149741701e-05, + "loss": 1.6253, + "step": 32210 + }, + { + "epoch": 1.2828932883973785, + "grad_norm": 0.2979862689971924, + "learning_rate": 7.822752772138594e-05, + "loss": 1.6324, + "step": 32220 + }, + { + "epoch": 1.2834352295628906, + "grad_norm": 0.355049192905426, + "learning_rate": 7.821396106802958e-05, + "loss": 1.6229, + "step": 32230 + }, + { + "epoch": 1.2839771707284027, + "grad_norm": 0.29975321888923645, + "learning_rate": 7.820039153902852e-05, + "loss": 1.6195, + "step": 32240 + }, + { + "epoch": 1.284519111893915, + "grad_norm": 0.3482322692871094, + "learning_rate": 7.81868191360637e-05, + "loss": 1.6228, + "step": 32250 + }, + { + "epoch": 1.2850610530594273, + "grad_norm": 0.20890609920024872, + "learning_rate": 7.817324386081637e-05, + "loss": 1.6212, + "step": 32260 + }, + { + "epoch": 1.2856029942249394, + "grad_norm": 0.35245707631111145, + "learning_rate": 7.81596657149682e-05, + "loss": 1.613, + "step": 32270 + }, + { + "epoch": 1.285982353040798, + "eval_loss": 2.5217950344085693, + "eval_runtime": 21.9904, + "eval_samples_per_second": 227.372, + "eval_steps_per_second": 1.228, + "step": 32277 + }, + { + "epoch": 1.2861449353904517, + "grad_norm": 0.3934645354747772, + "learning_rate": 7.814608470020118e-05, + "loss": 1.6187, + "step": 32280 + }, + { + "epoch": 1.2866868765559638, + "grad_norm": 0.2991849482059479, + "learning_rate": 7.81325008181977e-05, + "loss": 1.6208, + "step": 32290 + }, + { + "epoch": 1.2872288177214761, + "grad_norm": 0.31971707940101624, + "learning_rate": 7.811891407064044e-05, + "loss": 1.6314, + "step": 32300 + }, + { + "epoch": 1.2877707588869884, + "grad_norm": 0.32109859585762024, + "learning_rate": 7.810532445921248e-05, + "loss": 1.6099, + "step": 32310 + }, + { + "epoch": 1.2883127000525005, + "grad_norm": 0.4225994348526001, + "learning_rate": 7.809173198559724e-05, + "loss": 1.6177, + "step": 32320 + }, + { + "epoch": 1.2888546412180126, + "grad_norm": 0.2910853326320648, + "learning_rate": 7.807813665147847e-05, + "loss": 1.6291, + "step": 32330 + }, + { + "epoch": 1.289396582383525, + "grad_norm": 0.3097500205039978, + "learning_rate": 7.806453845854036e-05, + "loss": 1.6157, + "step": 32340 + }, + { + "epoch": 1.2899385235490373, + "grad_norm": 0.2302774041891098, + "learning_rate": 7.805093740846736e-05, + "loss": 1.635, + "step": 32350 + }, + { + "epoch": 1.2904804647145494, + "grad_norm": 0.49598991870880127, + "learning_rate": 7.80373335029443e-05, + "loss": 1.6259, + "step": 32360 + }, + { + "epoch": 1.2906972411807542, + "eval_loss": 2.5177066326141357, + "eval_runtime": 21.9888, + "eval_samples_per_second": 227.389, + "eval_steps_per_second": 1.228, + "step": 32364 + }, + { + "epoch": 1.2910224058800617, + "grad_norm": 0.32611334323883057, + "learning_rate": 7.802372674365639e-05, + "loss": 1.6188, + "step": 32370 + }, + { + "epoch": 1.2915643470455738, + "grad_norm": 0.31275832653045654, + "learning_rate": 7.801011713228915e-05, + "loss": 1.6243, + "step": 32380 + }, + { + "epoch": 1.292106288211086, + "grad_norm": 0.31753793358802795, + "learning_rate": 7.799650467052853e-05, + "loss": 1.6231, + "step": 32390 + }, + { + "epoch": 1.2926482293765984, + "grad_norm": 0.25154951214790344, + "learning_rate": 7.798288936006073e-05, + "loss": 1.6303, + "step": 32400 + }, + { + "epoch": 1.2931901705421105, + "grad_norm": 0.264728844165802, + "learning_rate": 7.796927120257237e-05, + "loss": 1.6155, + "step": 32410 + }, + { + "epoch": 1.2937321117076228, + "grad_norm": 0.27522847056388855, + "learning_rate": 7.795565019975045e-05, + "loss": 1.6155, + "step": 32420 + }, + { + "epoch": 1.294274052873135, + "grad_norm": 0.3371419906616211, + "learning_rate": 7.794202635328222e-05, + "loss": 1.614, + "step": 32430 + }, + { + "epoch": 1.2948159940386472, + "grad_norm": 0.25964683294296265, + "learning_rate": 7.792839966485537e-05, + "loss": 1.6086, + "step": 32440 + }, + { + "epoch": 1.2953579352041593, + "grad_norm": 0.22096186876296997, + "learning_rate": 7.791477013615794e-05, + "loss": 1.608, + "step": 32450 + }, + { + "epoch": 1.2954121293207106, + "eval_loss": 2.525634288787842, + "eval_runtime": 21.9845, + "eval_samples_per_second": 227.433, + "eval_steps_per_second": 1.228, + "step": 32451 + }, + { + "epoch": 1.2958998763696716, + "grad_norm": 0.4005647301673889, + "learning_rate": 7.790113776887825e-05, + "loss": 1.6187, + "step": 32460 + }, + { + "epoch": 1.2964418175351837, + "grad_norm": 0.24895216524600983, + "learning_rate": 7.788750256470506e-05, + "loss": 1.6125, + "step": 32470 + }, + { + "epoch": 1.296983758700696, + "grad_norm": 0.2178104668855667, + "learning_rate": 7.787386452532739e-05, + "loss": 1.6205, + "step": 32480 + }, + { + "epoch": 1.2975256998662084, + "grad_norm": 0.30151644349098206, + "learning_rate": 7.78602236524347e-05, + "loss": 1.6093, + "step": 32490 + }, + { + "epoch": 1.2980676410317205, + "grad_norm": 0.3658519983291626, + "learning_rate": 7.784657994771676e-05, + "loss": 1.6158, + "step": 32500 + }, + { + "epoch": 1.2986095821972328, + "grad_norm": 0.40358707308769226, + "learning_rate": 7.783293341286368e-05, + "loss": 1.6271, + "step": 32510 + }, + { + "epoch": 1.2991515233627449, + "grad_norm": 0.4737723469734192, + "learning_rate": 7.781928404956594e-05, + "loss": 1.6102, + "step": 32520 + }, + { + "epoch": 1.2996934645282572, + "grad_norm": 0.3582918345928192, + "learning_rate": 7.780563185951437e-05, + "loss": 1.6199, + "step": 32530 + }, + { + "epoch": 1.300127017460667, + "eval_loss": 2.526118040084839, + "eval_runtime": 21.9907, + "eval_samples_per_second": 227.369, + "eval_steps_per_second": 1.228, + "step": 32538 + }, + { + "epoch": 1.3002354056937695, + "grad_norm": 0.282720148563385, + "learning_rate": 7.779197684440014e-05, + "loss": 1.6162, + "step": 32540 + }, + { + "epoch": 1.3007773468592816, + "grad_norm": 0.3246510624885559, + "learning_rate": 7.777831900591477e-05, + "loss": 1.6151, + "step": 32550 + }, + { + "epoch": 1.3013192880247937, + "grad_norm": 0.311619371175766, + "learning_rate": 7.776465834575013e-05, + "loss": 1.6199, + "step": 32560 + }, + { + "epoch": 1.301861229190306, + "grad_norm": 0.4281388223171234, + "learning_rate": 7.775099486559845e-05, + "loss": 1.6132, + "step": 32570 + }, + { + "epoch": 1.3024031703558183, + "grad_norm": 0.39166897535324097, + "learning_rate": 7.773732856715229e-05, + "loss": 1.6158, + "step": 32580 + }, + { + "epoch": 1.3029451115213304, + "grad_norm": 0.26505813002586365, + "learning_rate": 7.772365945210459e-05, + "loss": 1.6237, + "step": 32590 + }, + { + "epoch": 1.3034870526868427, + "grad_norm": 0.4248243570327759, + "learning_rate": 7.770998752214863e-05, + "loss": 1.6218, + "step": 32600 + }, + { + "epoch": 1.3040289938523548, + "grad_norm": 0.21477878093719482, + "learning_rate": 7.769631277897801e-05, + "loss": 1.6209, + "step": 32610 + }, + { + "epoch": 1.3045709350178671, + "grad_norm": 0.29866376519203186, + "learning_rate": 7.768263522428667e-05, + "loss": 1.6122, + "step": 32620 + }, + { + "epoch": 1.3048419056006233, + "eval_loss": 2.5290870666503906, + "eval_runtime": 21.9868, + "eval_samples_per_second": 227.409, + "eval_steps_per_second": 1.228, + "step": 32625 + }, + { + "epoch": 1.3051128761833795, + "grad_norm": 0.2718803882598877, + "learning_rate": 7.766895485976899e-05, + "loss": 1.6174, + "step": 32630 + }, + { + "epoch": 1.3056548173488915, + "grad_norm": 0.22970491647720337, + "learning_rate": 7.765527168711958e-05, + "loss": 1.6211, + "step": 32640 + }, + { + "epoch": 1.3061967585144036, + "grad_norm": 0.36656203866004944, + "learning_rate": 7.764158570803348e-05, + "loss": 1.6204, + "step": 32650 + }, + { + "epoch": 1.306738699679916, + "grad_norm": 0.31001171469688416, + "learning_rate": 7.762789692420604e-05, + "loss": 1.6245, + "step": 32660 + }, + { + "epoch": 1.3072806408454283, + "grad_norm": 0.3693181872367859, + "learning_rate": 7.761420533733297e-05, + "loss": 1.616, + "step": 32670 + }, + { + "epoch": 1.3078225820109404, + "grad_norm": 0.3442032039165497, + "learning_rate": 7.760051094911032e-05, + "loss": 1.6168, + "step": 32680 + }, + { + "epoch": 1.3083645231764527, + "grad_norm": 0.26000893115997314, + "learning_rate": 7.75868137612345e-05, + "loss": 1.6162, + "step": 32690 + }, + { + "epoch": 1.3089064643419648, + "grad_norm": 0.27503514289855957, + "learning_rate": 7.757311377540226e-05, + "loss": 1.6104, + "step": 32700 + }, + { + "epoch": 1.309448405507477, + "grad_norm": 0.395526647567749, + "learning_rate": 7.75594109933107e-05, + "loss": 1.6291, + "step": 32710 + }, + { + "epoch": 1.3095567937405796, + "eval_loss": 2.5185840129852295, + "eval_runtime": 21.986, + "eval_samples_per_second": 227.418, + "eval_steps_per_second": 1.228, + "step": 32712 + }, + { + "epoch": 1.3099903466729894, + "grad_norm": 0.31528306007385254, + "learning_rate": 7.75457054166572e-05, + "loss": 1.6118, + "step": 32720 + }, + { + "epoch": 1.3105322878385015, + "grad_norm": 0.3324233889579773, + "learning_rate": 7.753199704713963e-05, + "loss": 1.6176, + "step": 32730 + }, + { + "epoch": 1.3110742290040138, + "grad_norm": 0.2883225679397583, + "learning_rate": 7.75182858864561e-05, + "loss": 1.6269, + "step": 32740 + }, + { + "epoch": 1.311616170169526, + "grad_norm": 0.4456532597541809, + "learning_rate": 7.750457193630507e-05, + "loss": 1.6139, + "step": 32750 + }, + { + "epoch": 1.3121581113350382, + "grad_norm": 0.26752784848213196, + "learning_rate": 7.749085519838537e-05, + "loss": 1.6183, + "step": 32760 + }, + { + "epoch": 1.3127000525005503, + "grad_norm": 0.46080008149147034, + "learning_rate": 7.747713567439617e-05, + "loss": 1.6097, + "step": 32770 + }, + { + "epoch": 1.3132419936660626, + "grad_norm": 0.2891246974468231, + "learning_rate": 7.746341336603698e-05, + "loss": 1.6118, + "step": 32780 + }, + { + "epoch": 1.3137839348315747, + "grad_norm": 0.4009568989276886, + "learning_rate": 7.744968827500769e-05, + "loss": 1.63, + "step": 32790 + }, + { + "epoch": 1.3142716818805358, + "eval_loss": 2.504746913909912, + "eval_runtime": 21.9893, + "eval_samples_per_second": 227.384, + "eval_steps_per_second": 1.228, + "step": 32799 + }, + { + "epoch": 1.314325875997087, + "grad_norm": 0.31230422854423523, + "learning_rate": 7.743596040300848e-05, + "loss": 1.6139, + "step": 32800 + }, + { + "epoch": 1.3148678171625994, + "grad_norm": 0.4773191809654236, + "learning_rate": 7.742222975173991e-05, + "loss": 1.619, + "step": 32810 + }, + { + "epoch": 1.3154097583281115, + "grad_norm": 0.36002716422080994, + "learning_rate": 7.740849632290284e-05, + "loss": 1.6178, + "step": 32820 + }, + { + "epoch": 1.3159516994936238, + "grad_norm": 0.22423742711544037, + "learning_rate": 7.739476011819854e-05, + "loss": 1.6269, + "step": 32830 + }, + { + "epoch": 1.3164936406591359, + "grad_norm": 0.43875738978385925, + "learning_rate": 7.73810211393286e-05, + "loss": 1.6173, + "step": 32840 + }, + { + "epoch": 1.3170355818246482, + "grad_norm": 0.24483536183834076, + "learning_rate": 7.736727938799492e-05, + "loss": 1.6307, + "step": 32850 + }, + { + "epoch": 1.3175775229901605, + "grad_norm": 0.24819006025791168, + "learning_rate": 7.73535348658998e-05, + "loss": 1.61, + "step": 32860 + }, + { + "epoch": 1.3181194641556726, + "grad_norm": 0.2164914309978485, + "learning_rate": 7.733978757474579e-05, + "loss": 1.6216, + "step": 32870 + }, + { + "epoch": 1.3186614053211847, + "grad_norm": 0.2493513524532318, + "learning_rate": 7.732603751623591e-05, + "loss": 1.614, + "step": 32880 + }, + { + "epoch": 1.3189865700204921, + "eval_loss": 2.5233335494995117, + "eval_runtime": 21.9884, + "eval_samples_per_second": 227.392, + "eval_steps_per_second": 1.228, + "step": 32886 + }, + { + "epoch": 1.319203346486697, + "grad_norm": 0.2988925278186798, + "learning_rate": 7.731228469207342e-05, + "loss": 1.6109, + "step": 32890 + }, + { + "epoch": 1.3197452876522093, + "grad_norm": 0.3509455621242523, + "learning_rate": 7.7298529103962e-05, + "loss": 1.6183, + "step": 32900 + }, + { + "epoch": 1.3202872288177214, + "grad_norm": 0.33325713872909546, + "learning_rate": 7.728477075360558e-05, + "loss": 1.6207, + "step": 32910 + }, + { + "epoch": 1.3208291699832337, + "grad_norm": 0.22428090870380402, + "learning_rate": 7.72710096427085e-05, + "loss": 1.6048, + "step": 32920 + }, + { + "epoch": 1.3213711111487458, + "grad_norm": 0.4815099835395813, + "learning_rate": 7.725724577297547e-05, + "loss": 1.6148, + "step": 32930 + }, + { + "epoch": 1.3219130523142582, + "grad_norm": 0.2657027542591095, + "learning_rate": 7.724347914611142e-05, + "loss": 1.6154, + "step": 32940 + }, + { + "epoch": 1.3224549934797705, + "grad_norm": 0.4155304729938507, + "learning_rate": 7.722970976382179e-05, + "loss": 1.622, + "step": 32950 + }, + { + "epoch": 1.3229969346452826, + "grad_norm": 0.49975696206092834, + "learning_rate": 7.721593762781221e-05, + "loss": 1.6162, + "step": 32960 + }, + { + "epoch": 1.3235388758107947, + "grad_norm": 0.376020222902298, + "learning_rate": 7.720216273978872e-05, + "loss": 1.6212, + "step": 32970 + }, + { + "epoch": 1.3237014581604485, + "eval_loss": 2.526895523071289, + "eval_runtime": 21.9876, + "eval_samples_per_second": 227.401, + "eval_steps_per_second": 1.228, + "step": 32973 + }, + { + "epoch": 1.324080816976307, + "grad_norm": 0.22721335291862488, + "learning_rate": 7.71883851014577e-05, + "loss": 1.6171, + "step": 32980 + }, + { + "epoch": 1.3246227581418193, + "grad_norm": 0.2434106022119522, + "learning_rate": 7.717460471452588e-05, + "loss": 1.6035, + "step": 32990 + }, + { + "epoch": 1.3251646993073314, + "grad_norm": 0.4650270938873291, + "learning_rate": 7.716082158070031e-05, + "loss": 1.6103, + "step": 33000 + }, + { + "epoch": 1.3257066404728437, + "grad_norm": 0.2861434817314148, + "learning_rate": 7.714703570168835e-05, + "loss": 1.6158, + "step": 33010 + }, + { + "epoch": 1.3262485816383558, + "grad_norm": 0.7163834571838379, + "learning_rate": 7.713324707919777e-05, + "loss": 1.6211, + "step": 33020 + }, + { + "epoch": 1.326790522803868, + "grad_norm": 0.685391902923584, + "learning_rate": 7.711945571493663e-05, + "loss": 1.624, + "step": 33030 + }, + { + "epoch": 1.3273324639693804, + "grad_norm": 0.33094045519828796, + "learning_rate": 7.710566161061337e-05, + "loss": 1.6251, + "step": 33040 + }, + { + "epoch": 1.3278744051348925, + "grad_norm": 0.2844617962837219, + "learning_rate": 7.70918647679367e-05, + "loss": 1.6184, + "step": 33050 + }, + { + "epoch": 1.3284163463004048, + "grad_norm": 0.3345138430595398, + "learning_rate": 7.707806518861575e-05, + "loss": 1.6111, + "step": 33060 + }, + { + "epoch": 1.3284163463004048, + "eval_loss": 2.527109384536743, + "eval_runtime": 21.9601, + "eval_samples_per_second": 227.686, + "eval_steps_per_second": 1.23, + "step": 33060 + }, + { + "epoch": 1.328958287465917, + "grad_norm": 0.48074135184288025, + "learning_rate": 7.706426287435991e-05, + "loss": 1.611, + "step": 33070 + }, + { + "epoch": 1.3295002286314292, + "grad_norm": 0.31359565258026123, + "learning_rate": 7.7050457826879e-05, + "loss": 1.6273, + "step": 33080 + }, + { + "epoch": 1.3300421697969413, + "grad_norm": 0.23901283740997314, + "learning_rate": 7.703665004788312e-05, + "loss": 1.6191, + "step": 33090 + }, + { + "epoch": 1.3305841109624537, + "grad_norm": 0.5294537544250488, + "learning_rate": 7.70228395390827e-05, + "loss": 1.6146, + "step": 33100 + }, + { + "epoch": 1.3311260521279658, + "grad_norm": 0.2701181173324585, + "learning_rate": 7.700902630218852e-05, + "loss": 1.6156, + "step": 33110 + }, + { + "epoch": 1.331667993293478, + "grad_norm": 0.25721871852874756, + "learning_rate": 7.699521033891171e-05, + "loss": 1.6191, + "step": 33120 + }, + { + "epoch": 1.3322099344589904, + "grad_norm": 0.35411280393600464, + "learning_rate": 7.698139165096375e-05, + "loss": 1.6178, + "step": 33130 + }, + { + "epoch": 1.3327518756245025, + "grad_norm": 0.4553796052932739, + "learning_rate": 7.696757024005642e-05, + "loss": 1.6116, + "step": 33140 + }, + { + "epoch": 1.3331312344403612, + "eval_loss": 2.523940086364746, + "eval_runtime": 21.9819, + "eval_samples_per_second": 227.46, + "eval_steps_per_second": 1.228, + "step": 33147 + }, + { + "epoch": 1.3332938167900148, + "grad_norm": 0.22905197739601135, + "learning_rate": 7.695374610790187e-05, + "loss": 1.6081, + "step": 33150 + }, + { + "epoch": 1.333835757955527, + "grad_norm": 0.3004699945449829, + "learning_rate": 7.693991925621256e-05, + "loss": 1.6163, + "step": 33160 + }, + { + "epoch": 1.3343776991210392, + "grad_norm": 0.3088211119174957, + "learning_rate": 7.69260896867013e-05, + "loss": 1.6104, + "step": 33170 + }, + { + "epoch": 1.3349196402865515, + "grad_norm": 0.2766474187374115, + "learning_rate": 7.691225740108126e-05, + "loss": 1.6134, + "step": 33180 + }, + { + "epoch": 1.3354615814520636, + "grad_norm": 0.4962151050567627, + "learning_rate": 7.68984224010659e-05, + "loss": 1.6066, + "step": 33190 + }, + { + "epoch": 1.3360035226175757, + "grad_norm": 0.3378833830356598, + "learning_rate": 7.688458468836903e-05, + "loss": 1.6132, + "step": 33200 + }, + { + "epoch": 1.336545463783088, + "grad_norm": 0.34599658846855164, + "learning_rate": 7.687074426470484e-05, + "loss": 1.6161, + "step": 33210 + }, + { + "epoch": 1.3370874049486003, + "grad_norm": 0.3025360703468323, + "learning_rate": 7.68569011317878e-05, + "loss": 1.6168, + "step": 33220 + }, + { + "epoch": 1.3376293461141124, + "grad_norm": 0.32311034202575684, + "learning_rate": 7.684305529133273e-05, + "loss": 1.6177, + "step": 33230 + }, + { + "epoch": 1.3378461225803173, + "eval_loss": 2.5135555267333984, + "eval_runtime": 21.987, + "eval_samples_per_second": 227.407, + "eval_steps_per_second": 1.228, + "step": 33234 + }, + { + "epoch": 1.3381712872796248, + "grad_norm": 0.25993430614471436, + "learning_rate": 7.682920674505481e-05, + "loss": 1.612, + "step": 33240 + }, + { + "epoch": 1.3387132284451368, + "grad_norm": 0.35539132356643677, + "learning_rate": 7.681535549466954e-05, + "loss": 1.6175, + "step": 33250 + }, + { + "epoch": 1.3392551696106492, + "grad_norm": 0.24199128150939941, + "learning_rate": 7.680150154189275e-05, + "loss": 1.6121, + "step": 33260 + }, + { + "epoch": 1.3397971107761615, + "grad_norm": 0.4272856116294861, + "learning_rate": 7.678764488844059e-05, + "loss": 1.6128, + "step": 33270 + }, + { + "epoch": 1.3403390519416736, + "grad_norm": 0.23739628493785858, + "learning_rate": 7.677378553602958e-05, + "loss": 1.6096, + "step": 33280 + }, + { + "epoch": 1.3408809931071857, + "grad_norm": 0.2996535897254944, + "learning_rate": 7.675992348637654e-05, + "loss": 1.6071, + "step": 33290 + }, + { + "epoch": 1.341422934272698, + "grad_norm": 0.38013362884521484, + "learning_rate": 7.674605874119865e-05, + "loss": 1.6145, + "step": 33300 + }, + { + "epoch": 1.3419648754382103, + "grad_norm": 0.5391820073127747, + "learning_rate": 7.673219130221342e-05, + "loss": 1.6162, + "step": 33310 + }, + { + "epoch": 1.3425068166037224, + "grad_norm": 0.5446195006370544, + "learning_rate": 7.671832117113868e-05, + "loss": 1.6039, + "step": 33320 + }, + { + "epoch": 1.3425610107202737, + "eval_loss": 2.505586624145508, + "eval_runtime": 21.9853, + "eval_samples_per_second": 227.425, + "eval_steps_per_second": 1.228, + "step": 33321 + }, + { + "epoch": 1.3430487577692347, + "grad_norm": 0.3738638460636139, + "learning_rate": 7.670444834969262e-05, + "loss": 1.6136, + "step": 33330 + }, + { + "epoch": 1.3435906989347468, + "grad_norm": 0.2111603319644928, + "learning_rate": 7.669057283959371e-05, + "loss": 1.6126, + "step": 33340 + }, + { + "epoch": 1.3441326401002591, + "grad_norm": 0.27275893092155457, + "learning_rate": 7.667669464256081e-05, + "loss": 1.6204, + "step": 33350 + }, + { + "epoch": 1.3446745812657714, + "grad_norm": 0.3274075984954834, + "learning_rate": 7.66628137603131e-05, + "loss": 1.6052, + "step": 33360 + }, + { + "epoch": 1.3452165224312835, + "grad_norm": 0.22435316443443298, + "learning_rate": 7.664893019457007e-05, + "loss": 1.6097, + "step": 33370 + }, + { + "epoch": 1.3457584635967959, + "grad_norm": 0.41898712515830994, + "learning_rate": 7.663504394705155e-05, + "loss": 1.6118, + "step": 33380 + }, + { + "epoch": 1.346300404762308, + "grad_norm": 0.2666551172733307, + "learning_rate": 7.662115501947772e-05, + "loss": 1.5982, + "step": 33390 + }, + { + "epoch": 1.3468423459278203, + "grad_norm": 0.4487745463848114, + "learning_rate": 7.660726341356908e-05, + "loss": 1.6143, + "step": 33400 + }, + { + "epoch": 1.34727589886023, + "eval_loss": 2.506096363067627, + "eval_runtime": 21.9887, + "eval_samples_per_second": 227.39, + "eval_steps_per_second": 1.228, + "step": 33408 + }, + { + "epoch": 1.3473842870933324, + "grad_norm": 0.21905028820037842, + "learning_rate": 7.659336913104645e-05, + "loss": 1.6089, + "step": 33410 + }, + { + "epoch": 1.3479262282588447, + "grad_norm": 0.21832507848739624, + "learning_rate": 7.657947217363099e-05, + "loss": 1.6071, + "step": 33420 + }, + { + "epoch": 1.3484681694243568, + "grad_norm": 0.26740700006484985, + "learning_rate": 7.656557254304423e-05, + "loss": 1.6158, + "step": 33430 + }, + { + "epoch": 1.349010110589869, + "grad_norm": 0.5225000977516174, + "learning_rate": 7.655167024100798e-05, + "loss": 1.6123, + "step": 33440 + }, + { + "epoch": 1.3495520517553814, + "grad_norm": 0.3178088068962097, + "learning_rate": 7.653776526924435e-05, + "loss": 1.6182, + "step": 33450 + }, + { + "epoch": 1.3500939929208935, + "grad_norm": 0.21684478223323822, + "learning_rate": 7.65238576294759e-05, + "loss": 1.6098, + "step": 33460 + }, + { + "epoch": 1.3506359340864058, + "grad_norm": 0.5009759664535522, + "learning_rate": 7.650994732342539e-05, + "loss": 1.6073, + "step": 33470 + }, + { + "epoch": 1.351177875251918, + "grad_norm": 0.5376147031784058, + "learning_rate": 7.649603435281601e-05, + "loss": 1.609, + "step": 33480 + }, + { + "epoch": 1.3517198164174302, + "grad_norm": 0.49498310685157776, + "learning_rate": 7.648211871937121e-05, + "loss": 1.6146, + "step": 33490 + }, + { + "epoch": 1.3519907870001862, + "eval_loss": 2.51250958442688, + "eval_runtime": 21.983, + "eval_samples_per_second": 227.448, + "eval_steps_per_second": 1.228, + "step": 33495 + }, + { + "epoch": 1.3522617575829425, + "grad_norm": 0.4050725996494293, + "learning_rate": 7.64682004248148e-05, + "loss": 1.5954, + "step": 33500 + }, + { + "epoch": 1.3528036987484546, + "grad_norm": 0.2994046211242676, + "learning_rate": 7.645427947087096e-05, + "loss": 1.6119, + "step": 33510 + }, + { + "epoch": 1.3533456399139667, + "grad_norm": 0.2522560656070709, + "learning_rate": 7.64403558592641e-05, + "loss": 1.613, + "step": 33520 + }, + { + "epoch": 1.353887581079479, + "grad_norm": 0.20905892550945282, + "learning_rate": 7.642642959171905e-05, + "loss": 1.5925, + "step": 33530 + }, + { + "epoch": 1.3544295222449914, + "grad_norm": 0.259552925825119, + "learning_rate": 7.641250066996092e-05, + "loss": 1.6032, + "step": 33540 + }, + { + "epoch": 1.3549714634105035, + "grad_norm": 0.2907361686229706, + "learning_rate": 7.639856909571517e-05, + "loss": 1.6111, + "step": 33550 + }, + { + "epoch": 1.3555134045760158, + "grad_norm": 0.2905162274837494, + "learning_rate": 7.638463487070759e-05, + "loss": 1.6076, + "step": 33560 + }, + { + "epoch": 1.3560553457415279, + "grad_norm": 0.3089943826198578, + "learning_rate": 7.637069799666428e-05, + "loss": 1.6011, + "step": 33570 + }, + { + "epoch": 1.3565972869070402, + "grad_norm": 0.34541210532188416, + "learning_rate": 7.635675847531169e-05, + "loss": 1.6062, + "step": 33580 + }, + { + "epoch": 1.3567056751401427, + "eval_loss": 2.5141220092773438, + "eval_runtime": 21.9917, + "eval_samples_per_second": 227.359, + "eval_steps_per_second": 1.228, + "step": 33582 + }, + { + "epoch": 1.3571392280725525, + "grad_norm": 0.3026711642742157, + "learning_rate": 7.634281630837656e-05, + "loss": 1.6096, + "step": 33590 + }, + { + "epoch": 1.3576811692380646, + "grad_norm": 0.37327638268470764, + "learning_rate": 7.632887149758604e-05, + "loss": 1.6083, + "step": 33600 + }, + { + "epoch": 1.3582231104035767, + "grad_norm": 0.24328859150409698, + "learning_rate": 7.63149240446675e-05, + "loss": 1.6124, + "step": 33610 + }, + { + "epoch": 1.358765051569089, + "grad_norm": 0.26186954975128174, + "learning_rate": 7.630097395134873e-05, + "loss": 1.5939, + "step": 33620 + }, + { + "epoch": 1.3593069927346013, + "grad_norm": 0.31656497716903687, + "learning_rate": 7.628702121935776e-05, + "loss": 1.6096, + "step": 33630 + }, + { + "epoch": 1.3598489339001134, + "grad_norm": 0.3277546763420105, + "learning_rate": 7.627306585042302e-05, + "loss": 1.6049, + "step": 33640 + }, + { + "epoch": 1.3603908750656257, + "grad_norm": 0.25422918796539307, + "learning_rate": 7.625910784627326e-05, + "loss": 1.6149, + "step": 33650 + }, + { + "epoch": 1.3609328162311378, + "grad_norm": 0.22749896347522736, + "learning_rate": 7.62451472086375e-05, + "loss": 1.6047, + "step": 33660 + }, + { + "epoch": 1.3614205632800989, + "eval_loss": 2.5274205207824707, + "eval_runtime": 21.9881, + "eval_samples_per_second": 227.396, + "eval_steps_per_second": 1.228, + "step": 33669 + }, + { + "epoch": 1.3614747573966501, + "grad_norm": 0.3570140302181244, + "learning_rate": 7.623118393924515e-05, + "loss": 1.6078, + "step": 33670 + }, + { + "epoch": 1.3620166985621625, + "grad_norm": 0.583695113658905, + "learning_rate": 7.62172180398259e-05, + "loss": 1.6051, + "step": 33680 + }, + { + "epoch": 1.3625586397276745, + "grad_norm": 0.2449539452791214, + "learning_rate": 7.620324951210981e-05, + "loss": 1.6079, + "step": 33690 + }, + { + "epoch": 1.3631005808931869, + "grad_norm": 0.33772215247154236, + "learning_rate": 7.618927835782724e-05, + "loss": 1.6193, + "step": 33700 + }, + { + "epoch": 1.363642522058699, + "grad_norm": 0.28392404317855835, + "learning_rate": 7.617530457870883e-05, + "loss": 1.6043, + "step": 33710 + }, + { + "epoch": 1.3641844632242113, + "grad_norm": 0.34906795620918274, + "learning_rate": 7.616132817648565e-05, + "loss": 1.6015, + "step": 33720 + }, + { + "epoch": 1.3647264043897234, + "grad_norm": 0.23826880753040314, + "learning_rate": 7.614734915288899e-05, + "loss": 1.6132, + "step": 33730 + }, + { + "epoch": 1.3652683455552357, + "grad_norm": 0.4068436026573181, + "learning_rate": 7.613336750965053e-05, + "loss": 1.6078, + "step": 33740 + }, + { + "epoch": 1.3658102867207478, + "grad_norm": 0.45196202397346497, + "learning_rate": 7.611938324850226e-05, + "loss": 1.5997, + "step": 33750 + }, + { + "epoch": 1.3661354514200552, + "eval_loss": 2.5162079334259033, + "eval_runtime": 21.9869, + "eval_samples_per_second": 227.408, + "eval_steps_per_second": 1.228, + "step": 33756 + }, + { + "epoch": 1.36635222788626, + "grad_norm": 0.2863083779811859, + "learning_rate": 7.610539637117647e-05, + "loss": 1.6013, + "step": 33760 + }, + { + "epoch": 1.3668941690517724, + "grad_norm": 0.3467727303504944, + "learning_rate": 7.609140687940585e-05, + "loss": 1.6113, + "step": 33770 + }, + { + "epoch": 1.3674361102172845, + "grad_norm": 0.8201264142990112, + "learning_rate": 7.607741477492327e-05, + "loss": 1.6092, + "step": 33780 + }, + { + "epoch": 1.3679780513827968, + "grad_norm": 0.3005310893058777, + "learning_rate": 7.606342005946207e-05, + "loss": 1.6119, + "step": 33790 + }, + { + "epoch": 1.368519992548309, + "grad_norm": 0.31185653805732727, + "learning_rate": 7.604942273475585e-05, + "loss": 1.6117, + "step": 33800 + }, + { + "epoch": 1.3690619337138212, + "grad_norm": 0.4523196816444397, + "learning_rate": 7.603542280253853e-05, + "loss": 1.6103, + "step": 33810 + }, + { + "epoch": 1.3696038748793335, + "grad_norm": 0.2638733386993408, + "learning_rate": 7.602142026454435e-05, + "loss": 1.6264, + "step": 33820 + }, + { + "epoch": 1.3701458160448456, + "grad_norm": 0.25433018803596497, + "learning_rate": 7.60074151225079e-05, + "loss": 1.6129, + "step": 33830 + }, + { + "epoch": 1.3706877572103577, + "grad_norm": 0.22881822288036346, + "learning_rate": 7.599340737816406e-05, + "loss": 1.5969, + "step": 33840 + }, + { + "epoch": 1.3708503395600116, + "eval_loss": 2.5215718746185303, + "eval_runtime": 21.9866, + "eval_samples_per_second": 227.411, + "eval_steps_per_second": 1.228, + "step": 33843 + }, + { + "epoch": 1.37122969837587, + "grad_norm": 0.47086840867996216, + "learning_rate": 7.597939703324807e-05, + "loss": 1.6004, + "step": 33850 + }, + { + "epoch": 1.3717716395413824, + "grad_norm": 0.25351575016975403, + "learning_rate": 7.596538408949546e-05, + "loss": 1.6025, + "step": 33860 + }, + { + "epoch": 1.3723135807068945, + "grad_norm": 0.47730979323387146, + "learning_rate": 7.595136854864208e-05, + "loss": 1.6075, + "step": 33870 + }, + { + "epoch": 1.3728555218724068, + "grad_norm": 0.22560712695121765, + "learning_rate": 7.593735041242414e-05, + "loss": 1.6045, + "step": 33880 + }, + { + "epoch": 1.3733974630379189, + "grad_norm": 0.3164345920085907, + "learning_rate": 7.592332968257812e-05, + "loss": 1.612, + "step": 33890 + }, + { + "epoch": 1.3739394042034312, + "grad_norm": 0.5792235136032104, + "learning_rate": 7.590930636084087e-05, + "loss": 1.6049, + "step": 33900 + }, + { + "epoch": 1.3744813453689435, + "grad_norm": 0.2980400025844574, + "learning_rate": 7.589528044894951e-05, + "loss": 1.6049, + "step": 33910 + }, + { + "epoch": 1.3750232865344556, + "grad_norm": 0.2572561800479889, + "learning_rate": 7.588125194864154e-05, + "loss": 1.5972, + "step": 33920 + }, + { + "epoch": 1.3755652276999677, + "grad_norm": 0.3515624701976776, + "learning_rate": 7.586722086165471e-05, + "loss": 1.6086, + "step": 33930 + }, + { + "epoch": 1.3755652276999677, + "eval_loss": 2.5238847732543945, + "eval_runtime": 21.9722, + "eval_samples_per_second": 227.56, + "eval_steps_per_second": 1.229, + "step": 33930 + }, + { + "epoch": 1.37610716886548, + "grad_norm": 0.22956761717796326, + "learning_rate": 7.585318718972719e-05, + "loss": 1.5948, + "step": 33940 + }, + { + "epoch": 1.3766491100309923, + "grad_norm": 0.2527923583984375, + "learning_rate": 7.583915093459736e-05, + "loss": 1.5908, + "step": 33950 + }, + { + "epoch": 1.3771910511965044, + "grad_norm": 0.3912854790687561, + "learning_rate": 7.5825112098004e-05, + "loss": 1.5966, + "step": 33960 + }, + { + "epoch": 1.3777329923620167, + "grad_norm": 0.2859535813331604, + "learning_rate": 7.581107068168615e-05, + "loss": 1.6188, + "step": 33970 + }, + { + "epoch": 1.3782749335275288, + "grad_norm": 0.29767870903015137, + "learning_rate": 7.579702668738323e-05, + "loss": 1.604, + "step": 33980 + }, + { + "epoch": 1.3788168746930412, + "grad_norm": 0.3692917227745056, + "learning_rate": 7.578298011683493e-05, + "loss": 1.6166, + "step": 33990 + }, + { + "epoch": 1.3793588158585535, + "grad_norm": 0.40221714973449707, + "learning_rate": 7.576893097178128e-05, + "loss": 1.6078, + "step": 34000 + }, + { + "epoch": 1.3799007570240656, + "grad_norm": 0.4148884415626526, + "learning_rate": 7.575487925396264e-05, + "loss": 1.6028, + "step": 34010 + }, + { + "epoch": 1.380280115839924, + "eval_loss": 2.53521466255188, + "eval_runtime": 21.9888, + "eval_samples_per_second": 227.389, + "eval_steps_per_second": 1.228, + "step": 34017 + }, + { + "epoch": 1.3804426981895779, + "grad_norm": 0.5252642035484314, + "learning_rate": 7.574082496511966e-05, + "loss": 1.6146, + "step": 34020 + }, + { + "epoch": 1.38098463935509, + "grad_norm": 0.4250527322292328, + "learning_rate": 7.572676810699333e-05, + "loss": 1.6074, + "step": 34030 + }, + { + "epoch": 1.3815265805206023, + "grad_norm": 0.31851446628570557, + "learning_rate": 7.571270868132496e-05, + "loss": 1.5978, + "step": 34040 + }, + { + "epoch": 1.3820685216861144, + "grad_norm": 0.22338920831680298, + "learning_rate": 7.569864668985617e-05, + "loss": 1.5961, + "step": 34050 + }, + { + "epoch": 1.3826104628516267, + "grad_norm": 0.2865997552871704, + "learning_rate": 7.568458213432888e-05, + "loss": 1.6073, + "step": 34060 + }, + { + "epoch": 1.3831524040171388, + "grad_norm": 0.3791174590587616, + "learning_rate": 7.567051501648536e-05, + "loss": 1.6018, + "step": 34070 + }, + { + "epoch": 1.383694345182651, + "grad_norm": 0.3320346474647522, + "learning_rate": 7.565644533806818e-05, + "loss": 1.5995, + "step": 34080 + }, + { + "epoch": 1.3842362863481634, + "grad_norm": 0.2221497744321823, + "learning_rate": 7.564237310082024e-05, + "loss": 1.5972, + "step": 34090 + }, + { + "epoch": 1.3847782275136755, + "grad_norm": 0.27021268010139465, + "learning_rate": 7.562829830648474e-05, + "loss": 1.6121, + "step": 34100 + }, + { + "epoch": 1.3849950039798804, + "eval_loss": 2.520078420639038, + "eval_runtime": 21.9871, + "eval_samples_per_second": 227.407, + "eval_steps_per_second": 1.228, + "step": 34104 + }, + { + "epoch": 1.3853201686791878, + "grad_norm": 0.35636231303215027, + "learning_rate": 7.56142209568052e-05, + "loss": 1.6015, + "step": 34110 + }, + { + "epoch": 1.3858621098447, + "grad_norm": 0.45814478397369385, + "learning_rate": 7.560014105352546e-05, + "loss": 1.6131, + "step": 34120 + }, + { + "epoch": 1.3864040510102122, + "grad_norm": 0.6515801548957825, + "learning_rate": 7.55860585983897e-05, + "loss": 1.6189, + "step": 34130 + }, + { + "epoch": 1.3869459921757243, + "grad_norm": 0.32424890995025635, + "learning_rate": 7.557197359314237e-05, + "loss": 1.6162, + "step": 34140 + }, + { + "epoch": 1.3874879333412367, + "grad_norm": 0.24668356776237488, + "learning_rate": 7.555788603952825e-05, + "loss": 1.607, + "step": 34150 + }, + { + "epoch": 1.3880298745067488, + "grad_norm": 0.2475864738225937, + "learning_rate": 7.554379593929248e-05, + "loss": 1.6014, + "step": 34160 + }, + { + "epoch": 1.388571815672261, + "grad_norm": 0.2331823706626892, + "learning_rate": 7.552970329418045e-05, + "loss": 1.6068, + "step": 34170 + }, + { + "epoch": 1.3891137568377734, + "grad_norm": 0.2390824258327484, + "learning_rate": 7.551560810593792e-05, + "loss": 1.5906, + "step": 34180 + }, + { + "epoch": 1.3896556980032855, + "grad_norm": 0.26939457654953003, + "learning_rate": 7.550151037631092e-05, + "loss": 1.6275, + "step": 34190 + }, + { + "epoch": 1.3897098921198368, + "eval_loss": 2.518341064453125, + "eval_runtime": 21.9824, + "eval_samples_per_second": 227.455, + "eval_steps_per_second": 1.228, + "step": 34191 + }, + { + "epoch": 1.3901976391687978, + "grad_norm": 0.262103796005249, + "learning_rate": 7.548741010704583e-05, + "loss": 1.5979, + "step": 34200 + }, + { + "epoch": 1.3907395803343099, + "grad_norm": 0.34173309803009033, + "learning_rate": 7.547330729988931e-05, + "loss": 1.5959, + "step": 34210 + }, + { + "epoch": 1.3912815214998222, + "grad_norm": 0.22995412349700928, + "learning_rate": 7.545920195658837e-05, + "loss": 1.605, + "step": 34220 + }, + { + "epoch": 1.3918234626653345, + "grad_norm": 0.2825547158718109, + "learning_rate": 7.544509407889033e-05, + "loss": 1.5997, + "step": 34230 + }, + { + "epoch": 1.3923654038308466, + "grad_norm": 0.3113856613636017, + "learning_rate": 7.54309836685428e-05, + "loss": 1.6061, + "step": 34240 + }, + { + "epoch": 1.3929073449963587, + "grad_norm": 0.32774263620376587, + "learning_rate": 7.54168707272937e-05, + "loss": 1.5979, + "step": 34250 + }, + { + "epoch": 1.393449286161871, + "grad_norm": 0.3357396721839905, + "learning_rate": 7.540275525689131e-05, + "loss": 1.6033, + "step": 34260 + }, + { + "epoch": 1.3939912273273833, + "grad_norm": 0.35256245732307434, + "learning_rate": 7.538863725908416e-05, + "loss": 1.6, + "step": 34270 + }, + { + "epoch": 1.394424780259793, + "eval_loss": 2.519773244857788, + "eval_runtime": 21.9811, + "eval_samples_per_second": 227.468, + "eval_steps_per_second": 1.228, + "step": 34278 + }, + { + "epoch": 1.3945331684928954, + "grad_norm": 0.31579533219337463, + "learning_rate": 7.537451673562116e-05, + "loss": 1.5972, + "step": 34280 + }, + { + "epoch": 1.3950751096584078, + "grad_norm": 0.3092842996120453, + "learning_rate": 7.536039368825147e-05, + "loss": 1.5975, + "step": 34290 + }, + { + "epoch": 1.3956170508239198, + "grad_norm": 0.4527590572834015, + "learning_rate": 7.534626811872463e-05, + "loss": 1.6089, + "step": 34300 + }, + { + "epoch": 1.3961589919894322, + "grad_norm": 0.30094006657600403, + "learning_rate": 7.53321400287904e-05, + "loss": 1.6083, + "step": 34310 + }, + { + "epoch": 1.3967009331549445, + "grad_norm": 0.3692034184932709, + "learning_rate": 7.531800942019895e-05, + "loss": 1.6021, + "step": 34320 + }, + { + "epoch": 1.3972428743204566, + "grad_norm": 0.37879860401153564, + "learning_rate": 7.530387629470072e-05, + "loss": 1.6015, + "step": 34330 + }, + { + "epoch": 1.3977848154859687, + "grad_norm": 0.5758041739463806, + "learning_rate": 7.528974065404644e-05, + "loss": 1.598, + "step": 34340 + }, + { + "epoch": 1.398326756651481, + "grad_norm": 0.29847633838653564, + "learning_rate": 7.527560249998716e-05, + "loss": 1.6055, + "step": 34350 + }, + { + "epoch": 1.3988686978169933, + "grad_norm": 0.22990530729293823, + "learning_rate": 7.526146183427428e-05, + "loss": 1.5928, + "step": 34360 + }, + { + "epoch": 1.3991396683997492, + "eval_loss": 2.5169806480407715, + "eval_runtime": 21.9875, + "eval_samples_per_second": 227.402, + "eval_steps_per_second": 1.228, + "step": 34365 + }, + { + "epoch": 1.3994106389825054, + "grad_norm": 0.20360521972179413, + "learning_rate": 7.524731865865947e-05, + "loss": 1.5949, + "step": 34370 + }, + { + "epoch": 1.3999525801480177, + "grad_norm": 0.24859194457530975, + "learning_rate": 7.523317297489473e-05, + "loss": 1.6024, + "step": 34380 + }, + { + "epoch": 1.4004945213135298, + "grad_norm": 0.33308878540992737, + "learning_rate": 7.521902478473238e-05, + "loss": 1.598, + "step": 34390 + }, + { + "epoch": 1.4010364624790421, + "grad_norm": 0.27835598587989807, + "learning_rate": 7.520487408992501e-05, + "loss": 1.5965, + "step": 34400 + }, + { + "epoch": 1.4015784036445544, + "grad_norm": 0.36223334074020386, + "learning_rate": 7.519072089222557e-05, + "loss": 1.6055, + "step": 34410 + }, + { + "epoch": 1.4021203448100665, + "grad_norm": 0.3996673822402954, + "learning_rate": 7.517656519338728e-05, + "loss": 1.6096, + "step": 34420 + }, + { + "epoch": 1.4026622859755788, + "grad_norm": 0.25303345918655396, + "learning_rate": 7.516240699516367e-05, + "loss": 1.614, + "step": 34430 + }, + { + "epoch": 1.403204227141091, + "grad_norm": 0.3061133325099945, + "learning_rate": 7.514824629930865e-05, + "loss": 1.6032, + "step": 34440 + }, + { + "epoch": 1.4037461683066033, + "grad_norm": 0.3153916895389557, + "learning_rate": 7.513408310757632e-05, + "loss": 1.5952, + "step": 34450 + }, + { + "epoch": 1.4038545565397056, + "eval_loss": 2.5186572074890137, + "eval_runtime": 21.9793, + "eval_samples_per_second": 227.487, + "eval_steps_per_second": 1.228, + "step": 34452 + }, + { + "epoch": 1.4042881094721154, + "grad_norm": 0.24946312606334686, + "learning_rate": 7.51199174217212e-05, + "loss": 1.6027, + "step": 34460 + }, + { + "epoch": 1.4048300506376277, + "grad_norm": 0.23080390691757202, + "learning_rate": 7.510574924349807e-05, + "loss": 1.604, + "step": 34470 + }, + { + "epoch": 1.4053719918031398, + "grad_norm": 0.36294880509376526, + "learning_rate": 7.509157857466202e-05, + "loss": 1.608, + "step": 34480 + }, + { + "epoch": 1.405913932968652, + "grad_norm": 0.3913600742816925, + "learning_rate": 7.507740541696844e-05, + "loss": 1.6031, + "step": 34490 + }, + { + "epoch": 1.4064558741341644, + "grad_norm": 0.36812445521354675, + "learning_rate": 7.506322977217305e-05, + "loss": 1.596, + "step": 34500 + }, + { + "epoch": 1.4069978152996765, + "grad_norm": 0.5203189253807068, + "learning_rate": 7.504905164203184e-05, + "loss": 1.6107, + "step": 34510 + }, + { + "epoch": 1.4075397564651888, + "grad_norm": 0.2861349880695343, + "learning_rate": 7.503487102830116e-05, + "loss": 1.6116, + "step": 34520 + }, + { + "epoch": 1.408081697630701, + "grad_norm": 0.3074442446231842, + "learning_rate": 7.502068793273765e-05, + "loss": 1.602, + "step": 34530 + }, + { + "epoch": 1.408569444679662, + "eval_loss": 2.5188817977905273, + "eval_runtime": 21.988, + "eval_samples_per_second": 227.396, + "eval_steps_per_second": 1.228, + "step": 34539 + }, + { + "epoch": 1.4086236387962132, + "grad_norm": 0.3228742778301239, + "learning_rate": 7.500650235709826e-05, + "loss": 1.5973, + "step": 34540 + }, + { + "epoch": 1.4091655799617255, + "grad_norm": 0.3475671708583832, + "learning_rate": 7.49923143031402e-05, + "loss": 1.5987, + "step": 34550 + }, + { + "epoch": 1.4097075211272376, + "grad_norm": 0.3168829083442688, + "learning_rate": 7.497812377262102e-05, + "loss": 1.6001, + "step": 34560 + }, + { + "epoch": 1.4102494622927497, + "grad_norm": 0.2580108046531677, + "learning_rate": 7.496393076729862e-05, + "loss": 1.6122, + "step": 34570 + }, + { + "epoch": 1.410791403458262, + "grad_norm": 0.28347161412239075, + "learning_rate": 7.494973528893117e-05, + "loss": 1.609, + "step": 34580 + }, + { + "epoch": 1.4113333446237744, + "grad_norm": 0.25245901942253113, + "learning_rate": 7.49355373392771e-05, + "loss": 1.6036, + "step": 34590 + }, + { + "epoch": 1.4118752857892864, + "grad_norm": 0.29171431064605713, + "learning_rate": 7.492133692009524e-05, + "loss": 1.5972, + "step": 34600 + }, + { + "epoch": 1.4124172269547988, + "grad_norm": 0.3764426112174988, + "learning_rate": 7.490713403314462e-05, + "loss": 1.6124, + "step": 34610 + }, + { + "epoch": 1.4129591681203109, + "grad_norm": 0.2615674138069153, + "learning_rate": 7.489292868018469e-05, + "loss": 1.6089, + "step": 34620 + }, + { + "epoch": 1.4132843328196183, + "eval_loss": 2.516463279724121, + "eval_runtime": 21.9878, + "eval_samples_per_second": 227.399, + "eval_steps_per_second": 1.228, + "step": 34626 + }, + { + "epoch": 1.4135011092858232, + "grad_norm": 0.2315002828836441, + "learning_rate": 7.487872086297513e-05, + "loss": 1.6043, + "step": 34630 + }, + { + "epoch": 1.4140430504513355, + "grad_norm": 0.25207579135894775, + "learning_rate": 7.48645105832759e-05, + "loss": 1.5948, + "step": 34640 + }, + { + "epoch": 1.4145849916168476, + "grad_norm": 0.29729408025741577, + "learning_rate": 7.485029784284737e-05, + "loss": 1.6036, + "step": 34650 + }, + { + "epoch": 1.4151269327823597, + "grad_norm": 0.2224244922399521, + "learning_rate": 7.483608264345011e-05, + "loss": 1.592, + "step": 34660 + }, + { + "epoch": 1.415668873947872, + "grad_norm": 0.3714025616645813, + "learning_rate": 7.482186498684504e-05, + "loss": 1.5983, + "step": 34670 + }, + { + "epoch": 1.4162108151133843, + "grad_norm": 0.29220449924468994, + "learning_rate": 7.480764487479342e-05, + "loss": 1.5968, + "step": 34680 + }, + { + "epoch": 1.4167527562788964, + "grad_norm": 0.2586628198623657, + "learning_rate": 7.479342230905674e-05, + "loss": 1.5986, + "step": 34690 + }, + { + "epoch": 1.4172946974444087, + "grad_norm": 0.2448311746120453, + "learning_rate": 7.477919729139684e-05, + "loss": 1.5924, + "step": 34700 + }, + { + "epoch": 1.4178366386099208, + "grad_norm": 0.43788665533065796, + "learning_rate": 7.476496982357585e-05, + "loss": 1.6003, + "step": 34710 + }, + { + "epoch": 1.4179992209595746, + "eval_loss": 2.5183908939361572, + "eval_runtime": 21.9844, + "eval_samples_per_second": 227.434, + "eval_steps_per_second": 1.228, + "step": 34713 + }, + { + "epoch": 1.4183785797754331, + "grad_norm": 0.3235478103160858, + "learning_rate": 7.47507399073562e-05, + "loss": 1.6102, + "step": 34720 + }, + { + "epoch": 1.4189205209409455, + "grad_norm": 0.2464754730463028, + "learning_rate": 7.473650754450066e-05, + "loss": 1.5979, + "step": 34730 + }, + { + "epoch": 1.4194624621064575, + "grad_norm": 0.4333379566669464, + "learning_rate": 7.472227273677225e-05, + "loss": 1.6088, + "step": 34740 + }, + { + "epoch": 1.4200044032719699, + "grad_norm": 0.2818497121334076, + "learning_rate": 7.47080354859343e-05, + "loss": 1.6013, + "step": 34750 + }, + { + "epoch": 1.420546344437482, + "grad_norm": 0.2427944540977478, + "learning_rate": 7.469379579375049e-05, + "loss": 1.5914, + "step": 34760 + }, + { + "epoch": 1.4210882856029943, + "grad_norm": 0.29250580072402954, + "learning_rate": 7.467955366198474e-05, + "loss": 1.6074, + "step": 34770 + }, + { + "epoch": 1.4216302267685064, + "grad_norm": 0.26771166920661926, + "learning_rate": 7.466530909240135e-05, + "loss": 1.6108, + "step": 34780 + }, + { + "epoch": 1.4221721679340187, + "grad_norm": 0.29733744263648987, + "learning_rate": 7.465106208676485e-05, + "loss": 1.5965, + "step": 34790 + }, + { + "epoch": 1.4227141090995308, + "grad_norm": 0.2804502546787262, + "learning_rate": 7.463681264684007e-05, + "loss": 1.5984, + "step": 34800 + }, + { + "epoch": 1.4227141090995308, + "eval_loss": 2.5124473571777344, + "eval_runtime": 21.9639, + "eval_samples_per_second": 227.646, + "eval_steps_per_second": 1.229, + "step": 34800 + }, + { + "epoch": 1.423256050265043, + "grad_norm": 0.46871018409729004, + "learning_rate": 7.46225607743922e-05, + "loss": 1.5917, + "step": 34810 + }, + { + "epoch": 1.4237979914305554, + "grad_norm": 0.26123303174972534, + "learning_rate": 7.46083064711867e-05, + "loss": 1.5963, + "step": 34820 + }, + { + "epoch": 1.4243399325960675, + "grad_norm": 0.22404921054840088, + "learning_rate": 7.459404973898932e-05, + "loss": 1.6005, + "step": 34830 + }, + { + "epoch": 1.4248818737615798, + "grad_norm": 0.26554566621780396, + "learning_rate": 7.457979057956615e-05, + "loss": 1.6021, + "step": 34840 + }, + { + "epoch": 1.425423814927092, + "grad_norm": 0.2421078085899353, + "learning_rate": 7.45655289946835e-05, + "loss": 1.6013, + "step": 34850 + }, + { + "epoch": 1.4259657560926042, + "grad_norm": 0.22835715115070343, + "learning_rate": 7.455126498610807e-05, + "loss": 1.5989, + "step": 34860 + }, + { + "epoch": 1.4265076972581165, + "grad_norm": 0.37547022104263306, + "learning_rate": 7.453699855560683e-05, + "loss": 1.6129, + "step": 34870 + }, + { + "epoch": 1.4270496384236286, + "grad_norm": 0.4508250057697296, + "learning_rate": 7.452272970494702e-05, + "loss": 1.5987, + "step": 34880 + }, + { + "epoch": 1.4274289972394871, + "eval_loss": 2.5165185928344727, + "eval_runtime": 22.3018, + "eval_samples_per_second": 224.198, + "eval_steps_per_second": 1.211, + "step": 34887 + }, + { + "epoch": 1.4275915795891407, + "grad_norm": 0.28928378224372864, + "learning_rate": 7.450845843589622e-05, + "loss": 1.5965, + "step": 34890 + }, + { + "epoch": 1.428133520754653, + "grad_norm": 0.24763907492160797, + "learning_rate": 7.449418475022228e-05, + "loss": 1.5987, + "step": 34900 + }, + { + "epoch": 1.4286754619201654, + "grad_norm": 0.28781482577323914, + "learning_rate": 7.447990864969336e-05, + "loss": 1.59, + "step": 34910 + }, + { + "epoch": 1.4292174030856775, + "grad_norm": 0.4051133990287781, + "learning_rate": 7.446563013607795e-05, + "loss": 1.5962, + "step": 34920 + }, + { + "epoch": 1.4297593442511898, + "grad_norm": 0.472084641456604, + "learning_rate": 7.445134921114477e-05, + "loss": 1.5906, + "step": 34930 + }, + { + "epoch": 1.4303012854167019, + "grad_norm": 0.495980441570282, + "learning_rate": 7.443706587666292e-05, + "loss": 1.6111, + "step": 34940 + }, + { + "epoch": 1.4308432265822142, + "grad_norm": 0.41576093435287476, + "learning_rate": 7.442278013440173e-05, + "loss": 1.5951, + "step": 34950 + }, + { + "epoch": 1.4313851677477265, + "grad_norm": 0.48290589451789856, + "learning_rate": 7.440849198613084e-05, + "loss": 1.5979, + "step": 34960 + }, + { + "epoch": 1.4319271089132386, + "grad_norm": 0.3737516701221466, + "learning_rate": 7.439420143362025e-05, + "loss": 1.6024, + "step": 34970 + }, + { + "epoch": 1.4321438853794435, + "eval_loss": 2.513838768005371, + "eval_runtime": 21.9922, + "eval_samples_per_second": 227.353, + "eval_steps_per_second": 1.228, + "step": 34974 + }, + { + "epoch": 1.4324690500787507, + "grad_norm": 0.22067669034004211, + "learning_rate": 7.437990847864018e-05, + "loss": 1.6073, + "step": 34980 + }, + { + "epoch": 1.433010991244263, + "grad_norm": 0.35069945454597473, + "learning_rate": 7.436561312296118e-05, + "loss": 1.5947, + "step": 34990 + }, + { + "epoch": 1.4335529324097753, + "grad_norm": 0.3185942471027374, + "learning_rate": 7.435131536835412e-05, + "loss": 1.5965, + "step": 35000 + }, + { + "epoch": 1.4340948735752874, + "grad_norm": 0.3867363929748535, + "learning_rate": 7.433701521659012e-05, + "loss": 1.6054, + "step": 35010 + }, + { + "epoch": 1.4346368147407997, + "grad_norm": 0.2255895733833313, + "learning_rate": 7.432271266944063e-05, + "loss": 1.5981, + "step": 35020 + }, + { + "epoch": 1.4351787559063118, + "grad_norm": 0.5799124836921692, + "learning_rate": 7.43084077286774e-05, + "loss": 1.5975, + "step": 35030 + }, + { + "epoch": 1.4357206970718241, + "grad_norm": 0.32704293727874756, + "learning_rate": 7.429410039607241e-05, + "loss": 1.598, + "step": 35040 + }, + { + "epoch": 1.4362626382373365, + "grad_norm": 0.2843274772167206, + "learning_rate": 7.427979067339805e-05, + "loss": 1.6012, + "step": 35050 + }, + { + "epoch": 1.4368045794028486, + "grad_norm": 0.21767373383045197, + "learning_rate": 7.426547856242692e-05, + "loss": 1.5984, + "step": 35060 + }, + { + "epoch": 1.4368587735193998, + "eval_loss": 2.5202910900115967, + "eval_runtime": 21.9854, + "eval_samples_per_second": 227.423, + "eval_steps_per_second": 1.228, + "step": 35061 + }, + { + "epoch": 1.4373465205683609, + "grad_norm": 0.3761364221572876, + "learning_rate": 7.425116406493196e-05, + "loss": 1.6041, + "step": 35070 + }, + { + "epoch": 1.437888461733873, + "grad_norm": 0.3286113142967224, + "learning_rate": 7.423684718268637e-05, + "loss": 1.5982, + "step": 35080 + }, + { + "epoch": 1.4384304028993853, + "grad_norm": 0.3064782917499542, + "learning_rate": 7.422252791746369e-05, + "loss": 1.5936, + "step": 35090 + }, + { + "epoch": 1.4389723440648974, + "grad_norm": 0.2235429883003235, + "learning_rate": 7.420820627103768e-05, + "loss": 1.5973, + "step": 35100 + }, + { + "epoch": 1.4395142852304097, + "grad_norm": 0.33427155017852783, + "learning_rate": 7.419388224518247e-05, + "loss": 1.5904, + "step": 35110 + }, + { + "epoch": 1.4400562263959218, + "grad_norm": 0.26824381947517395, + "learning_rate": 7.417955584167246e-05, + "loss": 1.6038, + "step": 35120 + }, + { + "epoch": 1.440598167561434, + "grad_norm": 0.3559771776199341, + "learning_rate": 7.416522706228235e-05, + "loss": 1.5985, + "step": 35130 + }, + { + "epoch": 1.4411401087269464, + "grad_norm": 0.43836888670921326, + "learning_rate": 7.415089590878713e-05, + "loss": 1.6008, + "step": 35140 + }, + { + "epoch": 1.4415736616593562, + "eval_loss": 2.522282838821411, + "eval_runtime": 21.9813, + "eval_samples_per_second": 227.466, + "eval_steps_per_second": 1.228, + "step": 35148 + }, + { + "epoch": 1.4416820498924585, + "grad_norm": 0.2667721211910248, + "learning_rate": 7.413656238296205e-05, + "loss": 1.5959, + "step": 35150 + }, + { + "epoch": 1.4422239910579708, + "grad_norm": 0.34478846192359924, + "learning_rate": 7.412222648658271e-05, + "loss": 1.5947, + "step": 35160 + }, + { + "epoch": 1.442765932223483, + "grad_norm": 0.2894307076931, + "learning_rate": 7.410788822142497e-05, + "loss": 1.5881, + "step": 35170 + }, + { + "epoch": 1.4433078733889952, + "grad_norm": 0.44409897923469543, + "learning_rate": 7.409354758926501e-05, + "loss": 1.5902, + "step": 35180 + }, + { + "epoch": 1.4438498145545076, + "grad_norm": 0.3519600033760071, + "learning_rate": 7.407920459187925e-05, + "loss": 1.6024, + "step": 35190 + }, + { + "epoch": 1.4443917557200197, + "grad_norm": 0.2908567786216736, + "learning_rate": 7.406485923104447e-05, + "loss": 1.5973, + "step": 35200 + }, + { + "epoch": 1.4449336968855317, + "grad_norm": 0.33565568923950195, + "learning_rate": 7.405051150853771e-05, + "loss": 1.5992, + "step": 35210 + }, + { + "epoch": 1.445475638051044, + "grad_norm": 0.3194481134414673, + "learning_rate": 7.403616142613627e-05, + "loss": 1.5931, + "step": 35220 + }, + { + "epoch": 1.4460175792165564, + "grad_norm": 0.25139880180358887, + "learning_rate": 7.402180898561781e-05, + "loss": 1.599, + "step": 35230 + }, + { + "epoch": 1.4462885497993123, + "eval_loss": 2.5243048667907715, + "eval_runtime": 21.9879, + "eval_samples_per_second": 227.398, + "eval_steps_per_second": 1.228, + "step": 35235 + }, + { + "epoch": 1.4465595203820685, + "grad_norm": 0.4910086989402771, + "learning_rate": 7.400745418876024e-05, + "loss": 1.5972, + "step": 35240 + }, + { + "epoch": 1.4471014615475808, + "grad_norm": 0.2239486426115036, + "learning_rate": 7.399309703734178e-05, + "loss": 1.5989, + "step": 35250 + }, + { + "epoch": 1.4476434027130929, + "grad_norm": 0.3110259175300598, + "learning_rate": 7.39787375331409e-05, + "loss": 1.6041, + "step": 35260 + }, + { + "epoch": 1.4481853438786052, + "grad_norm": 0.394132137298584, + "learning_rate": 7.396437567793642e-05, + "loss": 1.5946, + "step": 35270 + }, + { + "epoch": 1.4487272850441175, + "grad_norm": 0.3766900599002838, + "learning_rate": 7.39500114735074e-05, + "loss": 1.5982, + "step": 35280 + }, + { + "epoch": 1.4492692262096296, + "grad_norm": 0.37917500734329224, + "learning_rate": 7.393564492163326e-05, + "loss": 1.599, + "step": 35290 + }, + { + "epoch": 1.4498111673751417, + "grad_norm": 0.30892854928970337, + "learning_rate": 7.392127602409361e-05, + "loss": 1.5915, + "step": 35300 + }, + { + "epoch": 1.450353108540654, + "grad_norm": 0.23008306324481964, + "learning_rate": 7.390690478266844e-05, + "loss": 1.6001, + "step": 35310 + }, + { + "epoch": 1.4508950497061663, + "grad_norm": 0.23139609396457672, + "learning_rate": 7.389253119913801e-05, + "loss": 1.5941, + "step": 35320 + }, + { + "epoch": 1.4510034379392687, + "eval_loss": 2.5347366333007812, + "eval_runtime": 22.3055, + "eval_samples_per_second": 224.16, + "eval_steps_per_second": 1.21, + "step": 35322 + }, + { + "epoch": 1.4514369908716784, + "grad_norm": 0.3306918740272522, + "learning_rate": 7.387815527528283e-05, + "loss": 1.6064, + "step": 35330 + }, + { + "epoch": 1.4519789320371908, + "grad_norm": 0.3398854732513428, + "learning_rate": 7.386377701288372e-05, + "loss": 1.5957, + "step": 35340 + }, + { + "epoch": 1.4525208732027028, + "grad_norm": 0.4003507196903229, + "learning_rate": 7.384939641372184e-05, + "loss": 1.5979, + "step": 35350 + }, + { + "epoch": 1.4530628143682152, + "grad_norm": 0.23540987074375153, + "learning_rate": 7.383501347957854e-05, + "loss": 1.5936, + "step": 35360 + }, + { + "epoch": 1.4536047555337275, + "grad_norm": 0.23208670318126678, + "learning_rate": 7.382062821223555e-05, + "loss": 1.5963, + "step": 35370 + }, + { + "epoch": 1.4541466966992396, + "grad_norm": 0.2580191195011139, + "learning_rate": 7.380624061347485e-05, + "loss": 1.5969, + "step": 35380 + }, + { + "epoch": 1.4546886378647519, + "grad_norm": 0.4060412645339966, + "learning_rate": 7.37918506850787e-05, + "loss": 1.599, + "step": 35390 + }, + { + "epoch": 1.455230579030264, + "grad_norm": 0.2602030634880066, + "learning_rate": 7.377745842882971e-05, + "loss": 1.5982, + "step": 35400 + }, + { + "epoch": 1.455718326079225, + "eval_loss": 2.522488832473755, + "eval_runtime": 21.9903, + "eval_samples_per_second": 227.373, + "eval_steps_per_second": 1.228, + "step": 35409 + }, + { + "epoch": 1.4557725201957763, + "grad_norm": 0.31275516748428345, + "learning_rate": 7.376306384651066e-05, + "loss": 1.5937, + "step": 35410 + }, + { + "epoch": 1.4563144613612884, + "grad_norm": 0.34258827567100525, + "learning_rate": 7.374866693990474e-05, + "loss": 1.5911, + "step": 35420 + }, + { + "epoch": 1.4568564025268007, + "grad_norm": 0.2919900417327881, + "learning_rate": 7.373426771079534e-05, + "loss": 1.5827, + "step": 35430 + }, + { + "epoch": 1.4573983436923128, + "grad_norm": 0.4928779900074005, + "learning_rate": 7.37198661609662e-05, + "loss": 1.5888, + "step": 35440 + }, + { + "epoch": 1.4579402848578251, + "grad_norm": 0.25346246361732483, + "learning_rate": 7.370546229220133e-05, + "loss": 1.5951, + "step": 35450 + }, + { + "epoch": 1.4584822260233374, + "grad_norm": 0.21547643840312958, + "learning_rate": 7.369105610628498e-05, + "loss": 1.5987, + "step": 35460 + }, + { + "epoch": 1.4590241671888495, + "grad_norm": 0.26982036232948303, + "learning_rate": 7.367664760500177e-05, + "loss": 1.5837, + "step": 35470 + }, + { + "epoch": 1.4595661083543618, + "grad_norm": 0.34236806631088257, + "learning_rate": 7.366223679013652e-05, + "loss": 1.595, + "step": 35480 + }, + { + "epoch": 1.460108049519874, + "grad_norm": 0.41274112462997437, + "learning_rate": 7.364782366347439e-05, + "loss": 1.5924, + "step": 35490 + }, + { + "epoch": 1.4604332142191814, + "eval_loss": 2.516339063644409, + "eval_runtime": 21.9874, + "eval_samples_per_second": 227.403, + "eval_steps_per_second": 1.228, + "step": 35496 + }, + { + "epoch": 1.4606499906853863, + "grad_norm": 0.6666343212127686, + "learning_rate": 7.363340822680085e-05, + "loss": 1.6145, + "step": 35500 + }, + { + "epoch": 1.4611919318508986, + "grad_norm": 0.57722008228302, + "learning_rate": 7.36189904819016e-05, + "loss": 1.5893, + "step": 35510 + }, + { + "epoch": 1.4617338730164107, + "grad_norm": 0.3001037836074829, + "learning_rate": 7.360457043056263e-05, + "loss": 1.5977, + "step": 35520 + }, + { + "epoch": 1.4622758141819228, + "grad_norm": 0.3289230763912201, + "learning_rate": 7.359014807457026e-05, + "loss": 1.5955, + "step": 35530 + }, + { + "epoch": 1.462817755347435, + "grad_norm": 0.26572784781455994, + "learning_rate": 7.357572341571103e-05, + "loss": 1.5957, + "step": 35540 + }, + { + "epoch": 1.4633596965129474, + "grad_norm": 0.3806922435760498, + "learning_rate": 7.356129645577186e-05, + "loss": 1.5925, + "step": 35550 + }, + { + "epoch": 1.4639016376784595, + "grad_norm": 0.31142738461494446, + "learning_rate": 7.354686719653986e-05, + "loss": 1.5968, + "step": 35560 + }, + { + "epoch": 1.4644435788439718, + "grad_norm": 0.26502105593681335, + "learning_rate": 7.353243563980247e-05, + "loss": 1.6025, + "step": 35570 + }, + { + "epoch": 1.464985520009484, + "grad_norm": 0.3371651768684387, + "learning_rate": 7.351800178734741e-05, + "loss": 1.5903, + "step": 35580 + }, + { + "epoch": 1.4651481023591377, + "eval_loss": 2.5116758346557617, + "eval_runtime": 21.9879, + "eval_samples_per_second": 227.398, + "eval_steps_per_second": 1.228, + "step": 35583 + }, + { + "epoch": 1.4655274611749962, + "grad_norm": 0.3568730056285858, + "learning_rate": 7.350356564096269e-05, + "loss": 1.5913, + "step": 35590 + }, + { + "epoch": 1.4660694023405085, + "grad_norm": 0.29186302423477173, + "learning_rate": 7.348912720243661e-05, + "loss": 1.5942, + "step": 35600 + }, + { + "epoch": 1.4666113435060206, + "grad_norm": 0.2645188868045807, + "learning_rate": 7.347468647355771e-05, + "loss": 1.5962, + "step": 35610 + }, + { + "epoch": 1.4671532846715327, + "grad_norm": 0.24721835553646088, + "learning_rate": 7.346024345611485e-05, + "loss": 1.5911, + "step": 35620 + }, + { + "epoch": 1.467695225837045, + "grad_norm": 0.3433522582054138, + "learning_rate": 7.344579815189718e-05, + "loss": 1.5934, + "step": 35630 + }, + { + "epoch": 1.4682371670025574, + "grad_norm": 0.28682151436805725, + "learning_rate": 7.343135056269412e-05, + "loss": 1.5839, + "step": 35640 + }, + { + "epoch": 1.4687791081680694, + "grad_norm": 0.28299185633659363, + "learning_rate": 7.341690069029538e-05, + "loss": 1.5926, + "step": 35650 + }, + { + "epoch": 1.4693210493335818, + "grad_norm": 0.3892780840396881, + "learning_rate": 7.340244853649095e-05, + "loss": 1.589, + "step": 35660 + }, + { + "epoch": 1.4698629904990939, + "grad_norm": 0.296934574842453, + "learning_rate": 7.338799410307107e-05, + "loss": 1.6075, + "step": 35670 + }, + { + "epoch": 1.4698629904990939, + "eval_loss": 2.520735740661621, + "eval_runtime": 21.9779, + "eval_samples_per_second": 227.501, + "eval_steps_per_second": 1.229, + "step": 35670 + }, + { + "epoch": 1.4704049316646062, + "grad_norm": 0.2969922423362732, + "learning_rate": 7.337353739182631e-05, + "loss": 1.5992, + "step": 35680 + }, + { + "epoch": 1.4709468728301185, + "grad_norm": 0.24185840785503387, + "learning_rate": 7.335907840454753e-05, + "loss": 1.5935, + "step": 35690 + }, + { + "epoch": 1.4714888139956306, + "grad_norm": 0.272232323884964, + "learning_rate": 7.334461714302582e-05, + "loss": 1.6053, + "step": 35700 + }, + { + "epoch": 1.472030755161143, + "grad_norm": 0.2702069878578186, + "learning_rate": 7.333015360905257e-05, + "loss": 1.5961, + "step": 35710 + }, + { + "epoch": 1.472572696326655, + "grad_norm": 0.28312546014785767, + "learning_rate": 7.331568780441948e-05, + "loss": 1.5832, + "step": 35720 + }, + { + "epoch": 1.4731146374921673, + "grad_norm": 0.2772080898284912, + "learning_rate": 7.330121973091849e-05, + "loss": 1.6027, + "step": 35730 + }, + { + "epoch": 1.4736565786576794, + "grad_norm": 0.2347124218940735, + "learning_rate": 7.328674939034188e-05, + "loss": 1.5872, + "step": 35740 + }, + { + "epoch": 1.4741985198231917, + "grad_norm": 0.30288901925086975, + "learning_rate": 7.327227678448215e-05, + "loss": 1.5851, + "step": 35750 + }, + { + "epoch": 1.4745778786390502, + "eval_loss": 2.5225229263305664, + "eval_runtime": 21.6393, + "eval_samples_per_second": 231.061, + "eval_steps_per_second": 1.248, + "step": 35757 + }, + { + "epoch": 1.4747404609887038, + "grad_norm": 0.3605928122997284, + "learning_rate": 7.325780191513208e-05, + "loss": 1.5919, + "step": 35760 + }, + { + "epoch": 1.4752824021542161, + "grad_norm": 0.30377334356307983, + "learning_rate": 7.324332478408478e-05, + "loss": 1.5886, + "step": 35770 + }, + { + "epoch": 1.4758243433197284, + "grad_norm": 0.32762688398361206, + "learning_rate": 7.322884539313361e-05, + "loss": 1.5903, + "step": 35780 + }, + { + "epoch": 1.4763662844852405, + "grad_norm": 0.29475733637809753, + "learning_rate": 7.321436374407222e-05, + "loss": 1.59, + "step": 35790 + }, + { + "epoch": 1.4769082256507529, + "grad_norm": 0.2471809685230255, + "learning_rate": 7.319987983869454e-05, + "loss": 1.5876, + "step": 35800 + }, + { + "epoch": 1.477450166816265, + "grad_norm": 0.36135897040367126, + "learning_rate": 7.318539367879475e-05, + "loss": 1.5946, + "step": 35810 + }, + { + "epoch": 1.4779921079817773, + "grad_norm": 0.42719608545303345, + "learning_rate": 7.317090526616733e-05, + "loss": 1.593, + "step": 35820 + }, + { + "epoch": 1.4785340491472894, + "grad_norm": 0.22060419619083405, + "learning_rate": 7.315641460260708e-05, + "loss": 1.5905, + "step": 35830 + }, + { + "epoch": 1.4790759903128017, + "grad_norm": 0.27766963839530945, + "learning_rate": 7.3141921689909e-05, + "loss": 1.5923, + "step": 35840 + }, + { + "epoch": 1.4792927667790066, + "eval_loss": 2.5205538272857666, + "eval_runtime": 21.9888, + "eval_samples_per_second": 227.388, + "eval_steps_per_second": 1.228, + "step": 35844 + }, + { + "epoch": 1.4796179314783138, + "grad_norm": 0.21527595818042755, + "learning_rate": 7.312742652986844e-05, + "loss": 1.5926, + "step": 35850 + }, + { + "epoch": 1.480159872643826, + "grad_norm": 0.2089533805847168, + "learning_rate": 7.311292912428097e-05, + "loss": 1.582, + "step": 35860 + }, + { + "epoch": 1.4807018138093384, + "grad_norm": 0.2418367564678192, + "learning_rate": 7.309842947494248e-05, + "loss": 1.5862, + "step": 35870 + }, + { + "epoch": 1.4812437549748505, + "grad_norm": 0.2674960494041443, + "learning_rate": 7.308392758364912e-05, + "loss": 1.5865, + "step": 35880 + }, + { + "epoch": 1.4817856961403628, + "grad_norm": 0.24217753112316132, + "learning_rate": 7.306942345219733e-05, + "loss": 1.5894, + "step": 35890 + }, + { + "epoch": 1.482327637305875, + "grad_norm": 0.2955038249492645, + "learning_rate": 7.305491708238381e-05, + "loss": 1.5957, + "step": 35900 + }, + { + "epoch": 1.4828695784713872, + "grad_norm": 0.4298487603664398, + "learning_rate": 7.304040847600555e-05, + "loss": 1.5872, + "step": 35910 + }, + { + "epoch": 1.4834115196368995, + "grad_norm": 0.3208651542663574, + "learning_rate": 7.30258976348598e-05, + "loss": 1.5896, + "step": 35920 + }, + { + "epoch": 1.4839534608024116, + "grad_norm": 0.3353649079799652, + "learning_rate": 7.301138456074414e-05, + "loss": 1.5863, + "step": 35930 + }, + { + "epoch": 1.484007654918963, + "eval_loss": 2.5121777057647705, + "eval_runtime": 21.989, + "eval_samples_per_second": 227.386, + "eval_steps_per_second": 1.228, + "step": 35931 + }, + { + "epoch": 1.4844954019679237, + "grad_norm": 0.2195397913455963, + "learning_rate": 7.299686925545633e-05, + "loss": 1.5874, + "step": 35940 + }, + { + "epoch": 1.485037343133436, + "grad_norm": 0.27811160683631897, + "learning_rate": 7.298235172079451e-05, + "loss": 1.5921, + "step": 35950 + }, + { + "epoch": 1.4855792842989484, + "grad_norm": 0.3784521818161011, + "learning_rate": 7.296783195855703e-05, + "loss": 1.5812, + "step": 35960 + }, + { + "epoch": 1.4861212254644605, + "grad_norm": 0.20319710671901703, + "learning_rate": 7.295330997054251e-05, + "loss": 1.5943, + "step": 35970 + }, + { + "epoch": 1.4866631666299728, + "grad_norm": 0.45995742082595825, + "learning_rate": 7.293878575854993e-05, + "loss": 1.5899, + "step": 35980 + }, + { + "epoch": 1.4872051077954849, + "grad_norm": 0.4735308289527893, + "learning_rate": 7.292425932437843e-05, + "loss": 1.5944, + "step": 35990 + }, + { + "epoch": 1.4877470489609972, + "grad_norm": 0.2938894033432007, + "learning_rate": 7.290973066982752e-05, + "loss": 1.5885, + "step": 36000 + }, + { + "epoch": 1.4882889901265095, + "grad_norm": 0.4378818869590759, + "learning_rate": 7.289519979669693e-05, + "loss": 1.5856, + "step": 36010 + }, + { + "epoch": 1.4887225430589193, + "eval_loss": 2.5050690174102783, + "eval_runtime": 21.9834, + "eval_samples_per_second": 227.444, + "eval_steps_per_second": 1.228, + "step": 36018 + }, + { + "epoch": 1.4888309312920216, + "grad_norm": 0.5193374752998352, + "learning_rate": 7.288066670678665e-05, + "loss": 1.6, + "step": 36020 + }, + { + "epoch": 1.4893728724575337, + "grad_norm": 0.3998728394508362, + "learning_rate": 7.286613140189704e-05, + "loss": 1.5935, + "step": 36030 + }, + { + "epoch": 1.489914813623046, + "grad_norm": 0.2564713954925537, + "learning_rate": 7.285159388382864e-05, + "loss": 1.5882, + "step": 36040 + }, + { + "epoch": 1.4904567547885583, + "grad_norm": 0.2355422079563141, + "learning_rate": 7.283705415438228e-05, + "loss": 1.5895, + "step": 36050 + }, + { + "epoch": 1.4909986959540704, + "grad_norm": 0.226087749004364, + "learning_rate": 7.282251221535908e-05, + "loss": 1.5908, + "step": 36060 + }, + { + "epoch": 1.4915406371195827, + "grad_norm": 0.3110443353652954, + "learning_rate": 7.280796806856048e-05, + "loss": 1.5861, + "step": 36070 + }, + { + "epoch": 1.4920825782850948, + "grad_norm": 0.2450808882713318, + "learning_rate": 7.27934217157881e-05, + "loss": 1.5803, + "step": 36080 + }, + { + "epoch": 1.4926245194506071, + "grad_norm": 0.27178075909614563, + "learning_rate": 7.277887315884388e-05, + "loss": 1.5887, + "step": 36090 + }, + { + "epoch": 1.4931664606161195, + "grad_norm": 0.35005462169647217, + "learning_rate": 7.276432239953004e-05, + "loss": 1.5939, + "step": 36100 + }, + { + "epoch": 1.4934374311988754, + "eval_loss": 2.5150177478790283, + "eval_runtime": 21.9887, + "eval_samples_per_second": 227.39, + "eval_steps_per_second": 1.228, + "step": 36105 + }, + { + "epoch": 1.4937084017816316, + "grad_norm": 0.27534955739974976, + "learning_rate": 7.274976943964906e-05, + "loss": 1.5847, + "step": 36110 + }, + { + "epoch": 1.4942503429471439, + "grad_norm": 0.29644301533699036, + "learning_rate": 7.273521428100372e-05, + "loss": 1.5859, + "step": 36120 + }, + { + "epoch": 1.494792284112656, + "grad_norm": 0.37286531925201416, + "learning_rate": 7.272065692539701e-05, + "loss": 1.6031, + "step": 36130 + }, + { + "epoch": 1.4953342252781683, + "grad_norm": 0.3895072340965271, + "learning_rate": 7.270609737463229e-05, + "loss": 1.5959, + "step": 36140 + }, + { + "epoch": 1.4958761664436804, + "grad_norm": 0.3287084996700287, + "learning_rate": 7.269153563051306e-05, + "loss": 1.5838, + "step": 36150 + }, + { + "epoch": 1.4964181076091927, + "grad_norm": 0.4030684530735016, + "learning_rate": 7.267697169484323e-05, + "loss": 1.58, + "step": 36160 + }, + { + "epoch": 1.4969600487747048, + "grad_norm": 0.32397374510765076, + "learning_rate": 7.26624055694269e-05, + "loss": 1.5884, + "step": 36170 + }, + { + "epoch": 1.497501989940217, + "grad_norm": 0.268887996673584, + "learning_rate": 7.264783725606843e-05, + "loss": 1.5928, + "step": 36180 + }, + { + "epoch": 1.4980439311057294, + "grad_norm": 0.32079851627349854, + "learning_rate": 7.263326675657251e-05, + "loss": 1.5925, + "step": 36190 + }, + { + "epoch": 1.4981523193388318, + "eval_loss": 2.512747049331665, + "eval_runtime": 21.9853, + "eval_samples_per_second": 227.425, + "eval_steps_per_second": 1.228, + "step": 36192 + }, + { + "epoch": 1.4985858722712415, + "grad_norm": 0.27530887722969055, + "learning_rate": 7.261869407274406e-05, + "loss": 1.5871, + "step": 36200 + }, + { + "epoch": 1.4991278134367538, + "grad_norm": 0.39330941438674927, + "learning_rate": 7.260411920638828e-05, + "loss": 1.5856, + "step": 36210 + }, + { + "epoch": 1.499669754602266, + "grad_norm": 0.23756785690784454, + "learning_rate": 7.258954215931064e-05, + "loss": 1.5878, + "step": 36220 + }, + { + "epoch": 1.5002116957677782, + "grad_norm": 0.3410539925098419, + "learning_rate": 7.257496293331688e-05, + "loss": 1.6006, + "step": 36230 + }, + { + "epoch": 1.5007536369332906, + "grad_norm": 0.3402417302131653, + "learning_rate": 7.256038153021303e-05, + "loss": 1.5809, + "step": 36240 + }, + { + "epoch": 1.5012955780988027, + "grad_norm": 0.25963348150253296, + "learning_rate": 7.254579795180534e-05, + "loss": 1.58, + "step": 36250 + }, + { + "epoch": 1.5018375192643147, + "grad_norm": 0.37318065762519836, + "learning_rate": 7.253121219990038e-05, + "loss": 1.5912, + "step": 36260 + }, + { + "epoch": 1.502379460429827, + "grad_norm": 0.45932379364967346, + "learning_rate": 7.251662427630496e-05, + "loss": 1.5933, + "step": 36270 + }, + { + "epoch": 1.502867207478788, + "eval_loss": 2.5038368701934814, + "eval_runtime": 21.9923, + "eval_samples_per_second": 227.352, + "eval_steps_per_second": 1.228, + "step": 36279 + }, + { + "epoch": 1.5029214015953394, + "grad_norm": 0.3073781430721283, + "learning_rate": 7.25020341828262e-05, + "loss": 1.5869, + "step": 36280 + }, + { + "epoch": 1.5034633427608515, + "grad_norm": 0.29523077607154846, + "learning_rate": 7.24874419212714e-05, + "loss": 1.5864, + "step": 36290 + }, + { + "epoch": 1.5040052839263638, + "grad_norm": 0.3561224341392517, + "learning_rate": 7.247284749344824e-05, + "loss": 1.5904, + "step": 36300 + }, + { + "epoch": 1.5045472250918759, + "grad_norm": 0.26353198289871216, + "learning_rate": 7.245825090116457e-05, + "loss": 1.5867, + "step": 36310 + }, + { + "epoch": 1.5050891662573882, + "grad_norm": 0.2485145479440689, + "learning_rate": 7.244365214622859e-05, + "loss": 1.5951, + "step": 36320 + }, + { + "epoch": 1.5056311074229005, + "grad_norm": 0.21996235847473145, + "learning_rate": 7.242905123044872e-05, + "loss": 1.5847, + "step": 36330 + }, + { + "epoch": 1.5061730485884126, + "grad_norm": 0.32352975010871887, + "learning_rate": 7.241444815563364e-05, + "loss": 1.5844, + "step": 36340 + }, + { + "epoch": 1.5067149897539247, + "grad_norm": 0.3241994380950928, + "learning_rate": 7.239984292359232e-05, + "loss": 1.5895, + "step": 36350 + }, + { + "epoch": 1.507256930919437, + "grad_norm": 0.7218349575996399, + "learning_rate": 7.238523553613402e-05, + "loss": 1.5877, + "step": 36360 + }, + { + "epoch": 1.5075820956187442, + "eval_loss": 2.5044937133789062, + "eval_runtime": 21.9867, + "eval_samples_per_second": 227.411, + "eval_steps_per_second": 1.228, + "step": 36366 + }, + { + "epoch": 1.5077988720849493, + "grad_norm": 0.5906317830085754, + "learning_rate": 7.237062599506821e-05, + "loss": 1.589, + "step": 36370 + }, + { + "epoch": 1.5083408132504617, + "grad_norm": 0.43497881293296814, + "learning_rate": 7.23560143022047e-05, + "loss": 1.5833, + "step": 36380 + }, + { + "epoch": 1.5088827544159737, + "grad_norm": 0.3339081406593323, + "learning_rate": 7.234140045935345e-05, + "loss": 1.6007, + "step": 36390 + }, + { + "epoch": 1.5094246955814858, + "grad_norm": 0.2763802111148834, + "learning_rate": 7.23267844683248e-05, + "loss": 1.5971, + "step": 36400 + }, + { + "epoch": 1.5099666367469982, + "grad_norm": 0.3865683376789093, + "learning_rate": 7.231216633092934e-05, + "loss": 1.5844, + "step": 36410 + }, + { + "epoch": 1.5105085779125105, + "grad_norm": 0.32255837321281433, + "learning_rate": 7.229754604897786e-05, + "loss": 1.5878, + "step": 36420 + }, + { + "epoch": 1.5110505190780226, + "grad_norm": 0.29356804490089417, + "learning_rate": 7.228292362428148e-05, + "loss": 1.5826, + "step": 36430 + }, + { + "epoch": 1.5115924602435347, + "grad_norm": 0.2429652065038681, + "learning_rate": 7.226829905865156e-05, + "loss": 1.5873, + "step": 36440 + }, + { + "epoch": 1.512134401409047, + "grad_norm": 0.39031389355659485, + "learning_rate": 7.225367235389972e-05, + "loss": 1.5894, + "step": 36450 + }, + { + "epoch": 1.5122969837587008, + "eval_loss": 2.497527837753296, + "eval_runtime": 21.9849, + "eval_samples_per_second": 227.429, + "eval_steps_per_second": 1.228, + "step": 36453 + }, + { + "epoch": 1.5126763425745593, + "grad_norm": 0.27026987075805664, + "learning_rate": 7.223904351183786e-05, + "loss": 1.5877, + "step": 36460 + }, + { + "epoch": 1.5132182837400716, + "grad_norm": 0.26341304183006287, + "learning_rate": 7.222441253427813e-05, + "loss": 1.5977, + "step": 36470 + }, + { + "epoch": 1.5137602249055837, + "grad_norm": 0.2732945382595062, + "learning_rate": 7.220977942303295e-05, + "loss": 1.5945, + "step": 36480 + }, + { + "epoch": 1.5143021660710958, + "grad_norm": 0.29458683729171753, + "learning_rate": 7.219514417991505e-05, + "loss": 1.5861, + "step": 36490 + }, + { + "epoch": 1.5148441072366081, + "grad_norm": 0.5213008522987366, + "learning_rate": 7.218050680673729e-05, + "loss": 1.5707, + "step": 36500 + }, + { + "epoch": 1.5153860484021204, + "grad_norm": 0.4199194014072418, + "learning_rate": 7.216586730531296e-05, + "loss": 1.5796, + "step": 36510 + }, + { + "epoch": 1.5159279895676325, + "grad_norm": 0.23940829932689667, + "learning_rate": 7.215122567745552e-05, + "loss": 1.5849, + "step": 36520 + }, + { + "epoch": 1.5164699307331446, + "grad_norm": 0.2722623944282532, + "learning_rate": 7.213658192497869e-05, + "loss": 1.5892, + "step": 36530 + }, + { + "epoch": 1.517011871898657, + "grad_norm": 0.24977056682109833, + "learning_rate": 7.212193604969652e-05, + "loss": 1.5806, + "step": 36540 + }, + { + "epoch": 1.517011871898657, + "eval_loss": 2.511324644088745, + "eval_runtime": 21.9284, + "eval_samples_per_second": 228.015, + "eval_steps_per_second": 1.231, + "step": 36540 + }, + { + "epoch": 1.5175538130641693, + "grad_norm": 0.4285825788974762, + "learning_rate": 7.21072880534232e-05, + "loss": 1.5742, + "step": 36550 + }, + { + "epoch": 1.5180957542296816, + "grad_norm": 0.72999107837677, + "learning_rate": 7.209263793797335e-05, + "loss": 1.5943, + "step": 36560 + }, + { + "epoch": 1.5186376953951937, + "grad_norm": 0.4707547426223755, + "learning_rate": 7.207798570516172e-05, + "loss": 1.5901, + "step": 36570 + }, + { + "epoch": 1.5191796365607058, + "grad_norm": 0.25756052136421204, + "learning_rate": 7.206333135680336e-05, + "loss": 1.5844, + "step": 36580 + }, + { + "epoch": 1.519721577726218, + "grad_norm": 0.21552801132202148, + "learning_rate": 7.204867489471359e-05, + "loss": 1.5989, + "step": 36590 + }, + { + "epoch": 1.5202635188917304, + "grad_norm": 0.3025147616863251, + "learning_rate": 7.203401632070798e-05, + "loss": 1.5821, + "step": 36600 + }, + { + "epoch": 1.5208054600572425, + "grad_norm": 0.3975871801376343, + "learning_rate": 7.201935563660239e-05, + "loss": 1.5842, + "step": 36610 + }, + { + "epoch": 1.5213474012227548, + "grad_norm": 0.2734328806400299, + "learning_rate": 7.200469284421292e-05, + "loss": 1.5842, + "step": 36620 + }, + { + "epoch": 1.5217267600386133, + "eval_loss": 2.4983890056610107, + "eval_runtime": 21.9821, + "eval_samples_per_second": 227.458, + "eval_steps_per_second": 1.228, + "step": 36627 + }, + { + "epoch": 1.521889342388267, + "grad_norm": 0.23720048367977142, + "learning_rate": 7.199002794535593e-05, + "loss": 1.5873, + "step": 36630 + }, + { + "epoch": 1.5224312835537792, + "grad_norm": 0.26733702421188354, + "learning_rate": 7.197536094184803e-05, + "loss": 1.5868, + "step": 36640 + }, + { + "epoch": 1.5229732247192915, + "grad_norm": 0.3890104293823242, + "learning_rate": 7.196069183550612e-05, + "loss": 1.5894, + "step": 36650 + }, + { + "epoch": 1.5235151658848036, + "grad_norm": 0.23871298134326935, + "learning_rate": 7.194602062814733e-05, + "loss": 1.5899, + "step": 36660 + }, + { + "epoch": 1.5240571070503157, + "grad_norm": 0.20540666580200195, + "learning_rate": 7.19313473215891e-05, + "loss": 1.585, + "step": 36670 + }, + { + "epoch": 1.524599048215828, + "grad_norm": 0.2062886506319046, + "learning_rate": 7.191667191764906e-05, + "loss": 1.5882, + "step": 36680 + }, + { + "epoch": 1.5251409893813404, + "grad_norm": 0.2321225106716156, + "learning_rate": 7.190199441814516e-05, + "loss": 1.5901, + "step": 36690 + }, + { + "epoch": 1.5256829305468527, + "grad_norm": 0.4270740747451782, + "learning_rate": 7.188731482489556e-05, + "loss": 1.5668, + "step": 36700 + }, + { + "epoch": 1.5262248717123648, + "grad_norm": 0.34541916847229004, + "learning_rate": 7.187263313971872e-05, + "loss": 1.5925, + "step": 36710 + }, + { + "epoch": 1.5264416481785696, + "eval_loss": 2.493448495864868, + "eval_runtime": 21.9846, + "eval_samples_per_second": 227.432, + "eval_steps_per_second": 1.228, + "step": 36714 + }, + { + "epoch": 1.5267668128778769, + "grad_norm": 0.44076773524284363, + "learning_rate": 7.185794936443334e-05, + "loss": 1.5788, + "step": 36720 + }, + { + "epoch": 1.5273087540433892, + "grad_norm": 0.2881830334663391, + "learning_rate": 7.184326350085839e-05, + "loss": 1.5871, + "step": 36730 + }, + { + "epoch": 1.5278506952089015, + "grad_norm": 0.4518699645996094, + "learning_rate": 7.182857555081307e-05, + "loss": 1.5941, + "step": 36740 + }, + { + "epoch": 1.5283926363744136, + "grad_norm": 0.48421424627304077, + "learning_rate": 7.181388551611688e-05, + "loss": 1.5865, + "step": 36750 + }, + { + "epoch": 1.5289345775399257, + "grad_norm": 0.2462109625339508, + "learning_rate": 7.179919339858955e-05, + "loss": 1.5774, + "step": 36760 + }, + { + "epoch": 1.529476518705438, + "grad_norm": 0.25756916403770447, + "learning_rate": 7.178449920005108e-05, + "loss": 1.5809, + "step": 36770 + }, + { + "epoch": 1.5300184598709503, + "grad_norm": 0.28613096475601196, + "learning_rate": 7.176980292232173e-05, + "loss": 1.5815, + "step": 36780 + }, + { + "epoch": 1.5305604010364626, + "grad_norm": 0.3218177258968353, + "learning_rate": 7.175510456722198e-05, + "loss": 1.5914, + "step": 36790 + }, + { + "epoch": 1.5311023422019747, + "grad_norm": 0.42091110348701477, + "learning_rate": 7.174040413657262e-05, + "loss": 1.5778, + "step": 36800 + }, + { + "epoch": 1.5311565363185258, + "eval_loss": 2.505159854888916, + "eval_runtime": 21.9856, + "eval_samples_per_second": 227.422, + "eval_steps_per_second": 1.228, + "step": 36801 + }, + { + "epoch": 1.5316442833674868, + "grad_norm": 0.23221740126609802, + "learning_rate": 7.17257016321947e-05, + "loss": 1.5965, + "step": 36810 + }, + { + "epoch": 1.5321862245329991, + "grad_norm": 0.28910213708877563, + "learning_rate": 7.171099705590946e-05, + "loss": 1.5775, + "step": 36820 + }, + { + "epoch": 1.5327281656985114, + "grad_norm": 0.25105226039886475, + "learning_rate": 7.169629040953846e-05, + "loss": 1.5871, + "step": 36830 + }, + { + "epoch": 1.5332701068640235, + "grad_norm": 0.2661377489566803, + "learning_rate": 7.168158169490347e-05, + "loss": 1.5805, + "step": 36840 + }, + { + "epoch": 1.5338120480295356, + "grad_norm": 0.28218016028404236, + "learning_rate": 7.166687091382659e-05, + "loss": 1.5916, + "step": 36850 + }, + { + "epoch": 1.534353989195048, + "grad_norm": 0.3052498400211334, + "learning_rate": 7.165215806813009e-05, + "loss": 1.5885, + "step": 36860 + }, + { + "epoch": 1.5348959303605603, + "grad_norm": 0.2858087718486786, + "learning_rate": 7.163744315963656e-05, + "loss": 1.5752, + "step": 36870 + }, + { + "epoch": 1.5354378715260726, + "grad_norm": 0.2672872245311737, + "learning_rate": 7.162272619016879e-05, + "loss": 1.5828, + "step": 36880 + }, + { + "epoch": 1.5358714244584823, + "eval_loss": 2.495072364807129, + "eval_runtime": 21.9812, + "eval_samples_per_second": 227.467, + "eval_steps_per_second": 1.228, + "step": 36888 + }, + { + "epoch": 1.5359798126915847, + "grad_norm": 0.3383979797363281, + "learning_rate": 7.160800716154987e-05, + "loss": 1.5818, + "step": 36890 + }, + { + "epoch": 1.5365217538570968, + "grad_norm": 0.3658377528190613, + "learning_rate": 7.159328607560312e-05, + "loss": 1.5777, + "step": 36900 + }, + { + "epoch": 1.537063695022609, + "grad_norm": 0.2234833836555481, + "learning_rate": 7.157856293415216e-05, + "loss": 1.5839, + "step": 36910 + }, + { + "epoch": 1.5376056361881214, + "grad_norm": 0.2896660566329956, + "learning_rate": 7.156383773902076e-05, + "loss": 1.5878, + "step": 36920 + }, + { + "epoch": 1.5381475773536335, + "grad_norm": 0.37888798117637634, + "learning_rate": 7.15491104920331e-05, + "loss": 1.5934, + "step": 36930 + }, + { + "epoch": 1.5386895185191458, + "grad_norm": 0.49281513690948486, + "learning_rate": 7.153438119501346e-05, + "loss": 1.5918, + "step": 36940 + }, + { + "epoch": 1.539231459684658, + "grad_norm": 0.5974851846694946, + "learning_rate": 7.151964984978645e-05, + "loss": 1.59, + "step": 36950 + }, + { + "epoch": 1.5397734008501702, + "grad_norm": 0.28145140409469604, + "learning_rate": 7.150491645817695e-05, + "loss": 1.5886, + "step": 36960 + }, + { + "epoch": 1.5403153420156825, + "grad_norm": 0.2434787005186081, + "learning_rate": 7.149018102201007e-05, + "loss": 1.5792, + "step": 36970 + }, + { + "epoch": 1.5405863125984385, + "eval_loss": 2.5023746490478516, + "eval_runtime": 21.9846, + "eval_samples_per_second": 227.432, + "eval_steps_per_second": 1.228, + "step": 36975 + }, + { + "epoch": 1.5408572831811946, + "grad_norm": 0.2465183436870575, + "learning_rate": 7.147544354311114e-05, + "loss": 1.5838, + "step": 36980 + }, + { + "epoch": 1.5413992243467067, + "grad_norm": 0.23497800529003143, + "learning_rate": 7.146070402330578e-05, + "loss": 1.5924, + "step": 36990 + }, + { + "epoch": 1.541941165512219, + "grad_norm": 0.3130892217159271, + "learning_rate": 7.14459624644199e-05, + "loss": 1.5842, + "step": 37000 + }, + { + "epoch": 1.5424831066777314, + "grad_norm": 0.26590847969055176, + "learning_rate": 7.143121886827955e-05, + "loss": 1.5869, + "step": 37010 + }, + { + "epoch": 1.5430250478432437, + "grad_norm": 0.22026236355304718, + "learning_rate": 7.141647323671119e-05, + "loss": 1.5772, + "step": 37020 + }, + { + "epoch": 1.5435669890087558, + "grad_norm": 0.3443725109100342, + "learning_rate": 7.140172557154138e-05, + "loss": 1.5823, + "step": 37030 + }, + { + "epoch": 1.5441089301742679, + "grad_norm": 0.3271442949771881, + "learning_rate": 7.1386975874597e-05, + "loss": 1.5902, + "step": 37040 + }, + { + "epoch": 1.5446508713397802, + "grad_norm": 0.3126424252986908, + "learning_rate": 7.137222414770522e-05, + "loss": 1.5913, + "step": 37050 + }, + { + "epoch": 1.5451928125052925, + "grad_norm": 0.3820621073246002, + "learning_rate": 7.135747039269339e-05, + "loss": 1.5824, + "step": 37060 + }, + { + "epoch": 1.5453012007383948, + "eval_loss": 2.5038020610809326, + "eval_runtime": 21.984, + "eval_samples_per_second": 227.438, + "eval_steps_per_second": 1.228, + "step": 37062 + }, + { + "epoch": 1.5457347536708046, + "grad_norm": 0.29371535778045654, + "learning_rate": 7.134271461138913e-05, + "loss": 1.5818, + "step": 37070 + }, + { + "epoch": 1.5462766948363167, + "grad_norm": 0.3977007567882538, + "learning_rate": 7.132795680562035e-05, + "loss": 1.5777, + "step": 37080 + }, + { + "epoch": 1.546818636001829, + "grad_norm": 0.21876034140586853, + "learning_rate": 7.131319697721515e-05, + "loss": 1.5834, + "step": 37090 + }, + { + "epoch": 1.5473605771673413, + "grad_norm": 0.28337687253952026, + "learning_rate": 7.129843512800196e-05, + "loss": 1.5842, + "step": 37100 + }, + { + "epoch": 1.5479025183328536, + "grad_norm": 0.2559991180896759, + "learning_rate": 7.128367125980938e-05, + "loss": 1.5773, + "step": 37110 + }, + { + "epoch": 1.5484444594983657, + "grad_norm": 0.442965567111969, + "learning_rate": 7.126890537446628e-05, + "loss": 1.5765, + "step": 37120 + }, + { + "epoch": 1.5489864006638778, + "grad_norm": 0.4674750566482544, + "learning_rate": 7.125413747380183e-05, + "loss": 1.5938, + "step": 37130 + }, + { + "epoch": 1.5495283418293901, + "grad_norm": 0.30193501710891724, + "learning_rate": 7.12393675596454e-05, + "loss": 1.585, + "step": 37140 + }, + { + "epoch": 1.5500160888783512, + "eval_loss": 2.501018524169922, + "eval_runtime": 21.9863, + "eval_samples_per_second": 227.415, + "eval_steps_per_second": 1.228, + "step": 37149 + }, + { + "epoch": 1.5500702829949025, + "grad_norm": 0.4235888719558716, + "learning_rate": 7.122459563382663e-05, + "loss": 1.5755, + "step": 37150 + }, + { + "epoch": 1.5506122241604146, + "grad_norm": 0.2661120593547821, + "learning_rate": 7.120982169817538e-05, + "loss": 1.5908, + "step": 37160 + }, + { + "epoch": 1.5511541653259266, + "grad_norm": 0.2776598334312439, + "learning_rate": 7.119504575452179e-05, + "loss": 1.5824, + "step": 37170 + }, + { + "epoch": 1.551696106491439, + "grad_norm": 0.44518810510635376, + "learning_rate": 7.118026780469626e-05, + "loss": 1.5958, + "step": 37180 + }, + { + "epoch": 1.5522380476569513, + "grad_norm": 0.19919413328170776, + "learning_rate": 7.116548785052938e-05, + "loss": 1.5844, + "step": 37190 + }, + { + "epoch": 1.5527799888224636, + "grad_norm": 0.27062350511550903, + "learning_rate": 7.115070589385206e-05, + "loss": 1.5944, + "step": 37200 + }, + { + "epoch": 1.5533219299879757, + "grad_norm": 0.25841081142425537, + "learning_rate": 7.113592193649542e-05, + "loss": 1.5645, + "step": 37210 + }, + { + "epoch": 1.5538638711534878, + "grad_norm": 0.22730769217014313, + "learning_rate": 7.112113598029083e-05, + "loss": 1.5833, + "step": 37220 + }, + { + "epoch": 1.554405812319, + "grad_norm": 0.22557532787322998, + "learning_rate": 7.110634802706988e-05, + "loss": 1.5742, + "step": 37230 + }, + { + "epoch": 1.5547309770183073, + "eval_loss": 2.501530885696411, + "eval_runtime": 21.9894, + "eval_samples_per_second": 227.383, + "eval_steps_per_second": 1.228, + "step": 37236 + }, + { + "epoch": 1.5549477534845124, + "grad_norm": 0.24140417575836182, + "learning_rate": 7.109155807866449e-05, + "loss": 1.5807, + "step": 37240 + }, + { + "epoch": 1.5554896946500245, + "grad_norm": 0.33359408378601074, + "learning_rate": 7.107676613690673e-05, + "loss": 1.5806, + "step": 37250 + }, + { + "epoch": 1.5560316358155368, + "grad_norm": 0.3180777132511139, + "learning_rate": 7.1061972203629e-05, + "loss": 1.577, + "step": 37260 + }, + { + "epoch": 1.556573576981049, + "grad_norm": 0.29410409927368164, + "learning_rate": 7.104717628066387e-05, + "loss": 1.5777, + "step": 37270 + }, + { + "epoch": 1.5571155181465612, + "grad_norm": 0.4459421932697296, + "learning_rate": 7.103237836984421e-05, + "loss": 1.5918, + "step": 37280 + }, + { + "epoch": 1.5576574593120736, + "grad_norm": 0.24809594452381134, + "learning_rate": 7.101757847300312e-05, + "loss": 1.583, + "step": 37290 + }, + { + "epoch": 1.5581994004775856, + "grad_norm": 0.2839214503765106, + "learning_rate": 7.100277659197396e-05, + "loss": 1.5854, + "step": 37300 + }, + { + "epoch": 1.5587413416430977, + "grad_norm": 0.32646796107292175, + "learning_rate": 7.098797272859032e-05, + "loss": 1.5824, + "step": 37310 + }, + { + "epoch": 1.55928328280861, + "grad_norm": 0.3208850920200348, + "learning_rate": 7.097316688468602e-05, + "loss": 1.5793, + "step": 37320 + }, + { + "epoch": 1.5594458651582639, + "eval_loss": 2.494997024536133, + "eval_runtime": 21.9879, + "eval_samples_per_second": 227.397, + "eval_steps_per_second": 1.228, + "step": 37323 + }, + { + "epoch": 1.5598252239741224, + "grad_norm": 0.2501738965511322, + "learning_rate": 7.095835906209515e-05, + "loss": 1.59, + "step": 37330 + }, + { + "epoch": 1.5603671651396347, + "grad_norm": 0.3885467052459717, + "learning_rate": 7.094354926265206e-05, + "loss": 1.5841, + "step": 37340 + }, + { + "epoch": 1.5609091063051468, + "grad_norm": 0.48911768198013306, + "learning_rate": 7.09287374881913e-05, + "loss": 1.5772, + "step": 37350 + }, + { + "epoch": 1.5614510474706589, + "grad_norm": 0.3899608552455902, + "learning_rate": 7.09139237405477e-05, + "loss": 1.5811, + "step": 37360 + }, + { + "epoch": 1.5619929886361712, + "grad_norm": 0.44720959663391113, + "learning_rate": 7.089910802155631e-05, + "loss": 1.5769, + "step": 37370 + }, + { + "epoch": 1.5625349298016835, + "grad_norm": 0.3570062816143036, + "learning_rate": 7.088429033305245e-05, + "loss": 1.5878, + "step": 37380 + }, + { + "epoch": 1.5630768709671956, + "grad_norm": 0.3376300036907196, + "learning_rate": 7.086947067687167e-05, + "loss": 1.5816, + "step": 37390 + }, + { + "epoch": 1.5636188121327077, + "grad_norm": 0.24573005735874176, + "learning_rate": 7.085464905484974e-05, + "loss": 1.58, + "step": 37400 + }, + { + "epoch": 1.56416075329822, + "grad_norm": 0.3362915813922882, + "learning_rate": 7.083982546882275e-05, + "loss": 1.5777, + "step": 37410 + }, + { + "epoch": 1.56416075329822, + "eval_loss": 2.491037368774414, + "eval_runtime": 21.982, + "eval_samples_per_second": 227.459, + "eval_steps_per_second": 1.228, + "step": 37410 + }, + { + "epoch": 1.5647026944637323, + "grad_norm": 0.37921565771102905, + "learning_rate": 7.082499992062693e-05, + "loss": 1.5786, + "step": 37420 + }, + { + "epoch": 1.5652446356292447, + "grad_norm": 0.3702291250228882, + "learning_rate": 7.081017241209883e-05, + "loss": 1.5851, + "step": 37430 + }, + { + "epoch": 1.5657865767947567, + "grad_norm": 0.23378178477287292, + "learning_rate": 7.07953429450752e-05, + "loss": 1.5818, + "step": 37440 + }, + { + "epoch": 1.5663285179602688, + "grad_norm": 0.3024660646915436, + "learning_rate": 7.078051152139308e-05, + "loss": 1.5752, + "step": 37450 + }, + { + "epoch": 1.5668704591257812, + "grad_norm": 0.25815534591674805, + "learning_rate": 7.076567814288971e-05, + "loss": 1.5895, + "step": 37460 + }, + { + "epoch": 1.5674124002912935, + "grad_norm": 0.35706350207328796, + "learning_rate": 7.075084281140256e-05, + "loss": 1.567, + "step": 37470 + }, + { + "epoch": 1.5679543414568056, + "grad_norm": 0.34599047899246216, + "learning_rate": 7.07360055287694e-05, + "loss": 1.5831, + "step": 37480 + }, + { + "epoch": 1.5684962826223177, + "grad_norm": 0.22840653359889984, + "learning_rate": 7.072116629682819e-05, + "loss": 1.5795, + "step": 37490 + }, + { + "epoch": 1.5688756414381764, + "eval_loss": 2.498384714126587, + "eval_runtime": 21.9865, + "eval_samples_per_second": 227.412, + "eval_steps_per_second": 1.228, + "step": 37497 + }, + { + "epoch": 1.56903822378783, + "grad_norm": 0.24497516453266144, + "learning_rate": 7.070632511741717e-05, + "loss": 1.5712, + "step": 37500 + }, + { + "epoch": 1.5695801649533423, + "grad_norm": 0.3624199628829956, + "learning_rate": 7.069148199237476e-05, + "loss": 1.5803, + "step": 37510 + }, + { + "epoch": 2.000325164699307, + "grad_norm": 0.2853184640407562, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.5294, + "step": 37520 + }, + { + "epoch": 2.0008671058648195, + "grad_norm": 0.2135554403066635, + "learning_rate": 5.333333333333333e-07, + "loss": 1.5365, + "step": 37530 + }, + { + "epoch": 2.001409047030332, + "grad_norm": 0.20927861332893372, + "learning_rate": 8.666666666666667e-07, + "loss": 1.5365, + "step": 37540 + }, + { + "epoch": 2.001950988195844, + "grad_norm": 0.17029166221618652, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.5374, + "step": 37550 + }, + { + "epoch": 2.002492929361356, + "grad_norm": 0.16770167648792267, + "learning_rate": 1.5333333333333334e-06, + "loss": 1.5412, + "step": 37560 + }, + { + "epoch": 2.0030348705268683, + "grad_norm": 0.17810305953025818, + "learning_rate": 1.8666666666666669e-06, + "loss": 1.5236, + "step": 37570 + }, + { + "epoch": 2.0035768116923807, + "grad_norm": 0.18917031586170197, + "learning_rate": 2.2e-06, + "loss": 1.5386, + "step": 37580 + }, + { + "epoch": 2.0037935881585853, + "eval_loss": 2.4802141189575195, + "eval_runtime": 22.3304, + "eval_samples_per_second": 223.91, + "eval_steps_per_second": 1.209, + "step": 37584 + }, + { + "epoch": 2.004118752857893, + "grad_norm": 0.1666443794965744, + "learning_rate": 2.5333333333333334e-06, + "loss": 1.5323, + "step": 37590 + }, + { + "epoch": 2.0046606940234053, + "grad_norm": 0.16958613693714142, + "learning_rate": 2.8666666666666666e-06, + "loss": 1.5331, + "step": 37600 + }, + { + "epoch": 2.005202635188917, + "grad_norm": 0.16641965508460999, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.5381, + "step": 37610 + }, + { + "epoch": 2.0057445763544295, + "grad_norm": 0.18020287156105042, + "learning_rate": 3.5333333333333335e-06, + "loss": 1.5293, + "step": 37620 + }, + { + "epoch": 2.006286517519942, + "grad_norm": 0.16953766345977783, + "learning_rate": 3.866666666666667e-06, + "loss": 1.5372, + "step": 37630 + }, + { + "epoch": 2.006828458685454, + "grad_norm": 0.16563622653484344, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.53, + "step": 37640 + }, + { + "epoch": 2.007370399850966, + "grad_norm": 0.1674739271402359, + "learning_rate": 4.533333333333334e-06, + "loss": 1.5224, + "step": 37650 + }, + { + "epoch": 2.0079123410164783, + "grad_norm": 0.17399495840072632, + "learning_rate": 4.866666666666667e-06, + "loss": 1.5198, + "step": 37660 + }, + { + "epoch": 2.0084542821819906, + "grad_norm": 0.166429802775383, + "learning_rate": 5.2e-06, + "loss": 1.5246, + "step": 37670 + }, + { + "epoch": 2.008508476298542, + "eval_loss": 2.476987600326538, + "eval_runtime": 27.0724, + "eval_samples_per_second": 184.69, + "eval_steps_per_second": 0.997, + "step": 37671 + }, + { + "epoch": 2.008996223347503, + "grad_norm": 0.17933695018291473, + "learning_rate": 5.5333333333333334e-06, + "loss": 1.5297, + "step": 37680 + }, + { + "epoch": 2.0095381645130153, + "grad_norm": 0.16821593046188354, + "learning_rate": 5.866666666666667e-06, + "loss": 1.5318, + "step": 37690 + }, + { + "epoch": 2.010080105678527, + "grad_norm": 0.17084164917469025, + "learning_rate": 6.2e-06, + "loss": 1.5276, + "step": 37700 + }, + { + "epoch": 2.0106220468440394, + "grad_norm": 0.17240796983242035, + "learning_rate": 6.533333333333333e-06, + "loss": 1.5238, + "step": 37710 + }, + { + "epoch": 2.0111639880095518, + "grad_norm": 0.1693965643644333, + "learning_rate": 6.866666666666667e-06, + "loss": 1.5267, + "step": 37720 + }, + { + "epoch": 2.011705929175064, + "grad_norm": 0.17528323829174042, + "learning_rate": 7.2e-06, + "loss": 1.5202, + "step": 37730 + }, + { + "epoch": 2.012247870340576, + "grad_norm": 0.16867630183696747, + "learning_rate": 7.533333333333334e-06, + "loss": 1.5173, + "step": 37740 + }, + { + "epoch": 2.0127898115060883, + "grad_norm": 0.17133750021457672, + "learning_rate": 7.866666666666667e-06, + "loss": 1.5128, + "step": 37750 + }, + { + "epoch": 2.013223364438498, + "eval_loss": 2.475126028060913, + "eval_runtime": 21.9812, + "eval_samples_per_second": 227.467, + "eval_steps_per_second": 1.228, + "step": 37758 + }, + { + "epoch": 2.0133317526716006, + "grad_norm": 0.17712301015853882, + "learning_rate": 8.200000000000001e-06, + "loss": 1.5215, + "step": 37760 + }, + { + "epoch": 2.013873693837113, + "grad_norm": 0.19257335364818573, + "learning_rate": 8.533333333333334e-06, + "loss": 1.5109, + "step": 37770 + }, + { + "epoch": 2.014415635002625, + "grad_norm": 0.2007877379655838, + "learning_rate": 8.866666666666668e-06, + "loss": 1.5275, + "step": 37780 + }, + { + "epoch": 2.014957576168137, + "grad_norm": 0.16571199893951416, + "learning_rate": 9.2e-06, + "loss": 1.5206, + "step": 37790 + }, + { + "epoch": 2.0154995173336494, + "grad_norm": 0.17200104892253876, + "learning_rate": 9.533333333333334e-06, + "loss": 1.5205, + "step": 37800 + }, + { + "epoch": 2.0160414584991617, + "grad_norm": 0.1716819703578949, + "learning_rate": 9.866666666666667e-06, + "loss": 1.5087, + "step": 37810 + }, + { + "epoch": 2.016583399664674, + "grad_norm": 0.21321678161621094, + "learning_rate": 1.02e-05, + "loss": 1.5171, + "step": 37820 + }, + { + "epoch": 2.017125340830186, + "grad_norm": 0.16358011960983276, + "learning_rate": 1.0533333333333335e-05, + "loss": 1.5169, + "step": 37830 + }, + { + "epoch": 2.0176672819956982, + "grad_norm": 0.16521596908569336, + "learning_rate": 1.0866666666666667e-05, + "loss": 1.5277, + "step": 37840 + }, + { + "epoch": 2.0179382525784546, + "eval_loss": 2.476113796234131, + "eval_runtime": 21.9762, + "eval_samples_per_second": 227.519, + "eval_steps_per_second": 1.229, + "step": 37845 + }, + { + "epoch": 2.0182092231612105, + "grad_norm": 0.17164713144302368, + "learning_rate": 1.1200000000000001e-05, + "loss": 1.5241, + "step": 37850 + }, + { + "epoch": 2.018751164326723, + "grad_norm": 0.21223784983158112, + "learning_rate": 1.1533333333333334e-05, + "loss": 1.5176, + "step": 37860 + }, + { + "epoch": 2.019293105492235, + "grad_norm": 0.17650015652179718, + "learning_rate": 1.1866666666666668e-05, + "loss": 1.5128, + "step": 37870 + }, + { + "epoch": 2.019835046657747, + "grad_norm": 0.23301972448825836, + "learning_rate": 1.22e-05, + "loss": 1.5302, + "step": 37880 + }, + { + "epoch": 2.0203769878232594, + "grad_norm": 0.16798456013202667, + "learning_rate": 1.2533333333333332e-05, + "loss": 1.5146, + "step": 37890 + }, + { + "epoch": 2.0209189289887717, + "grad_norm": 0.1657019704580307, + "learning_rate": 1.2866666666666668e-05, + "loss": 1.523, + "step": 37900 + }, + { + "epoch": 2.021460870154284, + "grad_norm": 0.1911483108997345, + "learning_rate": 1.32e-05, + "loss": 1.5141, + "step": 37910 + }, + { + "epoch": 2.0220028113197963, + "grad_norm": 0.22926867008209229, + "learning_rate": 1.3533333333333335e-05, + "loss": 1.5156, + "step": 37920 + }, + { + "epoch": 2.022544752485308, + "grad_norm": 0.25321751832962036, + "learning_rate": 1.3866666666666667e-05, + "loss": 1.5193, + "step": 37930 + }, + { + "epoch": 2.0226531407184107, + "eval_loss": 2.470311403274536, + "eval_runtime": 21.9756, + "eval_samples_per_second": 227.525, + "eval_steps_per_second": 1.229, + "step": 37932 + }, + { + "epoch": 2.0230866936508205, + "grad_norm": 0.20465686917304993, + "learning_rate": 1.42e-05, + "loss": 1.514, + "step": 37940 + }, + { + "epoch": 2.023628634816333, + "grad_norm": 0.20215226709842682, + "learning_rate": 1.4533333333333335e-05, + "loss": 1.5079, + "step": 37950 + }, + { + "epoch": 2.024170575981845, + "grad_norm": 0.18972061574459076, + "learning_rate": 1.4866666666666668e-05, + "loss": 1.5137, + "step": 37960 + }, + { + "epoch": 2.024712517147357, + "grad_norm": 0.18428048491477966, + "learning_rate": 1.52e-05, + "loss": 1.5206, + "step": 37970 + }, + { + "epoch": 2.0252544583128693, + "grad_norm": 0.21038727462291718, + "learning_rate": 1.5533333333333333e-05, + "loss": 1.515, + "step": 37980 + }, + { + "epoch": 2.0257963994783816, + "grad_norm": 0.20297273993492126, + "learning_rate": 1.586666666666667e-05, + "loss": 1.5159, + "step": 37990 + }, + { + "epoch": 2.026338340643894, + "grad_norm": 0.16452449560165405, + "learning_rate": 1.62e-05, + "loss": 1.5092, + "step": 38000 + }, + { + "epoch": 2.0268802818094063, + "grad_norm": 0.25832870602607727, + "learning_rate": 1.6533333333333333e-05, + "loss": 1.5097, + "step": 38010 + }, + { + "epoch": 2.027368028858367, + "eval_loss": 2.469764232635498, + "eval_runtime": 21.9777, + "eval_samples_per_second": 227.503, + "eval_steps_per_second": 1.229, + "step": 38019 + }, + { + "epoch": 2.027422222974918, + "grad_norm": 0.17639535665512085, + "learning_rate": 1.6866666666666666e-05, + "loss": 1.52, + "step": 38020 + }, + { + "epoch": 2.0279641641404305, + "grad_norm": 0.2649475336074829, + "learning_rate": 1.7199999999999998e-05, + "loss": 1.5037, + "step": 38030 + }, + { + "epoch": 2.0285061053059428, + "grad_norm": 0.19192355871200562, + "learning_rate": 1.7533333333333334e-05, + "loss": 1.501, + "step": 38040 + }, + { + "epoch": 2.029048046471455, + "grad_norm": 0.19619537889957428, + "learning_rate": 1.7866666666666666e-05, + "loss": 1.5126, + "step": 38050 + }, + { + "epoch": 2.029589987636967, + "grad_norm": 0.1901368945837021, + "learning_rate": 1.8200000000000002e-05, + "loss": 1.525, + "step": 38060 + }, + { + "epoch": 2.0301319288024793, + "grad_norm": 0.1913931667804718, + "learning_rate": 1.8533333333333334e-05, + "loss": 1.5136, + "step": 38070 + }, + { + "epoch": 2.0306738699679916, + "grad_norm": 0.1890285611152649, + "learning_rate": 1.886666666666667e-05, + "loss": 1.5198, + "step": 38080 + }, + { + "epoch": 2.031215811133504, + "grad_norm": 0.19151568412780762, + "learning_rate": 1.9200000000000003e-05, + "loss": 1.5177, + "step": 38090 + }, + { + "epoch": 2.0317577522990162, + "grad_norm": 0.16688157618045807, + "learning_rate": 1.9533333333333335e-05, + "loss": 1.5084, + "step": 38100 + }, + { + "epoch": 2.0320829169983234, + "eval_loss": 2.4747180938720703, + "eval_runtime": 21.9798, + "eval_samples_per_second": 227.481, + "eval_steps_per_second": 1.228, + "step": 38106 + }, + { + "epoch": 2.032299693464528, + "grad_norm": 0.21504206955432892, + "learning_rate": 1.9866666666666667e-05, + "loss": 1.5145, + "step": 38110 + }, + { + "epoch": 2.0328416346300404, + "grad_norm": 0.19307386875152588, + "learning_rate": 2.0200000000000003e-05, + "loss": 1.514, + "step": 38120 + }, + { + "epoch": 2.0333835757955527, + "grad_norm": 0.17073208093643188, + "learning_rate": 2.0533333333333336e-05, + "loss": 1.5158, + "step": 38130 + }, + { + "epoch": 2.033925516961065, + "grad_norm": 0.22306814789772034, + "learning_rate": 2.0866666666666668e-05, + "loss": 1.5055, + "step": 38140 + }, + { + "epoch": 2.034467458126577, + "grad_norm": 0.170424222946167, + "learning_rate": 2.12e-05, + "loss": 1.5103, + "step": 38150 + }, + { + "epoch": 2.0350093992920892, + "grad_norm": 0.18648818135261536, + "learning_rate": 2.1533333333333333e-05, + "loss": 1.5109, + "step": 38160 + }, + { + "epoch": 2.0355513404576016, + "grad_norm": 0.18736661970615387, + "learning_rate": 2.186666666666667e-05, + "loss": 1.507, + "step": 38170 + }, + { + "epoch": 2.036093281623114, + "grad_norm": 0.20201878249645233, + "learning_rate": 2.22e-05, + "loss": 1.5157, + "step": 38180 + }, + { + "epoch": 2.036635222788626, + "grad_norm": 0.19277946650981903, + "learning_rate": 2.2533333333333333e-05, + "loss": 1.5152, + "step": 38190 + }, + { + "epoch": 2.0367978051382796, + "eval_loss": 2.474329710006714, + "eval_runtime": 21.9764, + "eval_samples_per_second": 227.517, + "eval_steps_per_second": 1.229, + "step": 38193 + }, + { + "epoch": 2.037177163954138, + "grad_norm": 0.1907971054315567, + "learning_rate": 2.2866666666666666e-05, + "loss": 1.5019, + "step": 38200 + }, + { + "epoch": 2.0377191051196504, + "grad_norm": 0.2295379936695099, + "learning_rate": 2.32e-05, + "loss": 1.5046, + "step": 38210 + }, + { + "epoch": 2.0382610462851627, + "grad_norm": 0.18477369844913483, + "learning_rate": 2.3533333333333334e-05, + "loss": 1.5057, + "step": 38220 + }, + { + "epoch": 2.038802987450675, + "grad_norm": 0.19044053554534912, + "learning_rate": 2.3866666666666666e-05, + "loss": 1.5217, + "step": 38230 + }, + { + "epoch": 2.0393449286161873, + "grad_norm": 0.18540959060192108, + "learning_rate": 2.4200000000000002e-05, + "loss": 1.52, + "step": 38240 + }, + { + "epoch": 2.039886869781699, + "grad_norm": 0.23876672983169556, + "learning_rate": 2.4533333333333334e-05, + "loss": 1.5129, + "step": 38250 + }, + { + "epoch": 2.0404288109472115, + "grad_norm": 0.19681920111179352, + "learning_rate": 2.486666666666667e-05, + "loss": 1.502, + "step": 38260 + }, + { + "epoch": 2.040970752112724, + "grad_norm": 0.17679736018180847, + "learning_rate": 2.5200000000000003e-05, + "loss": 1.5122, + "step": 38270 + }, + { + "epoch": 2.041512693278236, + "grad_norm": 0.18017658591270447, + "learning_rate": 2.553333333333334e-05, + "loss": 1.5145, + "step": 38280 + }, + { + "epoch": 2.041512693278236, + "eval_loss": 2.4778501987457275, + "eval_runtime": 22.0279, + "eval_samples_per_second": 226.985, + "eval_steps_per_second": 1.226, + "step": 38280 + }, + { + "epoch": 2.042054634443748, + "grad_norm": 0.17411205172538757, + "learning_rate": 2.5866666666666667e-05, + "loss": 1.5064, + "step": 38290 + }, + { + "epoch": 2.0425965756092603, + "grad_norm": 0.18675434589385986, + "learning_rate": 2.6200000000000003e-05, + "loss": 1.5071, + "step": 38300 + }, + { + "epoch": 2.0431385167747727, + "grad_norm": 0.17742466926574707, + "learning_rate": 2.6533333333333332e-05, + "loss": 1.5147, + "step": 38310 + }, + { + "epoch": 2.043680457940285, + "grad_norm": 0.2024219036102295, + "learning_rate": 2.6866666666666668e-05, + "loss": 1.5201, + "step": 38320 + }, + { + "epoch": 2.0442223991057973, + "grad_norm": 0.19301582872867584, + "learning_rate": 2.7200000000000004e-05, + "loss": 1.5131, + "step": 38330 + }, + { + "epoch": 2.044764340271309, + "grad_norm": 0.186081200838089, + "learning_rate": 2.7533333333333333e-05, + "loss": 1.5113, + "step": 38340 + }, + { + "epoch": 2.0453062814368215, + "grad_norm": 0.2549179792404175, + "learning_rate": 2.786666666666667e-05, + "loss": 1.5133, + "step": 38350 + }, + { + "epoch": 2.045848222602334, + "grad_norm": 0.23150603473186493, + "learning_rate": 2.8199999999999998e-05, + "loss": 1.5087, + "step": 38360 + }, + { + "epoch": 2.0462275814181923, + "eval_loss": 2.479970932006836, + "eval_runtime": 21.9756, + "eval_samples_per_second": 227.525, + "eval_steps_per_second": 1.229, + "step": 38367 + }, + { + "epoch": 2.046390163767846, + "grad_norm": 0.199720561504364, + "learning_rate": 2.8533333333333333e-05, + "loss": 1.5038, + "step": 38370 + }, + { + "epoch": 2.046932104933358, + "grad_norm": 0.16712939739227295, + "learning_rate": 2.886666666666667e-05, + "loss": 1.5047, + "step": 38380 + }, + { + "epoch": 2.0474740460988703, + "grad_norm": 0.17325983941555023, + "learning_rate": 2.9199999999999998e-05, + "loss": 1.517, + "step": 38390 + }, + { + "epoch": 2.0480159872643826, + "grad_norm": 0.19841617345809937, + "learning_rate": 2.9533333333333334e-05, + "loss": 1.5113, + "step": 38400 + }, + { + "epoch": 2.048557928429895, + "grad_norm": 0.21291446685791016, + "learning_rate": 2.986666666666667e-05, + "loss": 1.5056, + "step": 38410 + }, + { + "epoch": 2.0490998695954072, + "grad_norm": 0.33324041962623596, + "learning_rate": 3.02e-05, + "loss": 1.4991, + "step": 38420 + }, + { + "epoch": 2.049641810760919, + "grad_norm": 0.19952350854873657, + "learning_rate": 3.0533333333333335e-05, + "loss": 1.5244, + "step": 38430 + }, + { + "epoch": 2.0501837519264314, + "grad_norm": 0.19988487660884857, + "learning_rate": 3.086666666666667e-05, + "loss": 1.509, + "step": 38440 + }, + { + "epoch": 2.0507256930919437, + "grad_norm": 0.3386477530002594, + "learning_rate": 3.12e-05, + "loss": 1.5031, + "step": 38450 + }, + { + "epoch": 2.0509424695581484, + "eval_loss": 2.480910301208496, + "eval_runtime": 21.9771, + "eval_samples_per_second": 227.51, + "eval_steps_per_second": 1.229, + "step": 38454 + }, + { + "epoch": 2.051267634257456, + "grad_norm": 0.23942768573760986, + "learning_rate": 3.153333333333334e-05, + "loss": 1.5143, + "step": 38460 + }, + { + "epoch": 2.051809575422968, + "grad_norm": 0.2345375418663025, + "learning_rate": 3.1866666666666664e-05, + "loss": 1.5095, + "step": 38470 + }, + { + "epoch": 2.0523515165884803, + "grad_norm": 0.23772574961185455, + "learning_rate": 3.2200000000000003e-05, + "loss": 1.5036, + "step": 38480 + }, + { + "epoch": 2.0528934577539926, + "grad_norm": 0.21411636471748352, + "learning_rate": 3.253333333333333e-05, + "loss": 1.5111, + "step": 38490 + }, + { + "epoch": 2.053435398919505, + "grad_norm": 0.17845569550991058, + "learning_rate": 3.286666666666667e-05, + "loss": 1.5151, + "step": 38500 + }, + { + "epoch": 2.053977340085017, + "grad_norm": 0.23491686582565308, + "learning_rate": 3.32e-05, + "loss": 1.5025, + "step": 38510 + }, + { + "epoch": 2.054519281250529, + "grad_norm": 0.2281605452299118, + "learning_rate": 3.353333333333333e-05, + "loss": 1.5121, + "step": 38520 + }, + { + "epoch": 2.0550612224160414, + "grad_norm": 0.19572101533412933, + "learning_rate": 3.3866666666666665e-05, + "loss": 1.5169, + "step": 38530 + }, + { + "epoch": 2.0556031635815537, + "grad_norm": 0.2043548971414566, + "learning_rate": 3.4200000000000005e-05, + "loss": 1.507, + "step": 38540 + }, + { + "epoch": 2.055657357698105, + "eval_loss": 2.4816620349884033, + "eval_runtime": 22.0156, + "eval_samples_per_second": 227.111, + "eval_steps_per_second": 1.226, + "step": 38541 + }, + { + "epoch": 2.056145104747066, + "grad_norm": 0.3473966717720032, + "learning_rate": 3.453333333333334e-05, + "loss": 1.503, + "step": 38550 + }, + { + "epoch": 2.056687045912578, + "grad_norm": 0.17651808261871338, + "learning_rate": 3.486666666666667e-05, + "loss": 1.5191, + "step": 38560 + }, + { + "epoch": 2.05722898707809, + "grad_norm": 0.31023022532463074, + "learning_rate": 3.52e-05, + "loss": 1.5095, + "step": 38570 + }, + { + "epoch": 2.0577709282436025, + "grad_norm": 0.22835835814476013, + "learning_rate": 3.5533333333333334e-05, + "loss": 1.496, + "step": 38580 + }, + { + "epoch": 2.058312869409115, + "grad_norm": 0.28630900382995605, + "learning_rate": 3.586666666666667e-05, + "loss": 1.5129, + "step": 38590 + }, + { + "epoch": 2.058854810574627, + "grad_norm": 0.2917551100254059, + "learning_rate": 3.62e-05, + "loss": 1.5131, + "step": 38600 + }, + { + "epoch": 2.059396751740139, + "grad_norm": 0.23212890326976776, + "learning_rate": 3.653333333333334e-05, + "loss": 1.5075, + "step": 38610 + }, + { + "epoch": 2.0599386929056513, + "grad_norm": 0.3044704496860504, + "learning_rate": 3.6866666666666664e-05, + "loss": 1.5153, + "step": 38620 + }, + { + "epoch": 2.060372245838061, + "eval_loss": 2.466387987136841, + "eval_runtime": 21.9823, + "eval_samples_per_second": 227.456, + "eval_steps_per_second": 1.228, + "step": 38628 + }, + { + "epoch": 2.0604806340711637, + "grad_norm": 0.1840297132730484, + "learning_rate": 3.72e-05, + "loss": 1.5098, + "step": 38630 + }, + { + "epoch": 2.061022575236676, + "grad_norm": 0.20662543177604675, + "learning_rate": 3.7533333333333335e-05, + "loss": 1.5006, + "step": 38640 + }, + { + "epoch": 2.0615645164021883, + "grad_norm": 0.227204829454422, + "learning_rate": 3.786666666666667e-05, + "loss": 1.5052, + "step": 38650 + }, + { + "epoch": 2.0621064575677, + "grad_norm": 0.377437561750412, + "learning_rate": 3.82e-05, + "loss": 1.5002, + "step": 38660 + }, + { + "epoch": 2.0626483987332125, + "grad_norm": 0.3552536964416504, + "learning_rate": 3.853333333333334e-05, + "loss": 1.5104, + "step": 38670 + }, + { + "epoch": 2.063190339898725, + "grad_norm": 0.31461700797080994, + "learning_rate": 3.8866666666666665e-05, + "loss": 1.5048, + "step": 38680 + }, + { + "epoch": 2.063732281064237, + "grad_norm": 0.20412826538085938, + "learning_rate": 3.9200000000000004e-05, + "loss": 1.5058, + "step": 38690 + }, + { + "epoch": 2.064274222229749, + "grad_norm": 0.22955581545829773, + "learning_rate": 3.9533333333333337e-05, + "loss": 1.5037, + "step": 38700 + }, + { + "epoch": 2.0648161633952613, + "grad_norm": 0.2234533727169037, + "learning_rate": 3.986666666666667e-05, + "loss": 1.5108, + "step": 38710 + }, + { + "epoch": 2.0650871339780177, + "eval_loss": 2.4609971046447754, + "eval_runtime": 21.9738, + "eval_samples_per_second": 227.543, + "eval_steps_per_second": 1.229, + "step": 38715 + }, + { + "epoch": 2.0653581045607736, + "grad_norm": 0.18891902267932892, + "learning_rate": 4.02e-05, + "loss": 1.504, + "step": 38720 + }, + { + "epoch": 2.065900045726286, + "grad_norm": 0.21943628787994385, + "learning_rate": 4.0533333333333334e-05, + "loss": 1.5037, + "step": 38730 + }, + { + "epoch": 2.0664419868917983, + "grad_norm": 0.24250586330890656, + "learning_rate": 4.086666666666667e-05, + "loss": 1.5022, + "step": 38740 + }, + { + "epoch": 2.06698392805731, + "grad_norm": 0.2112322300672531, + "learning_rate": 4.12e-05, + "loss": 1.5052, + "step": 38750 + }, + { + "epoch": 2.0675258692228224, + "grad_norm": 0.24614650011062622, + "learning_rate": 4.153333333333334e-05, + "loss": 1.5012, + "step": 38760 + }, + { + "epoch": 2.0680678103883348, + "grad_norm": 0.25870656967163086, + "learning_rate": 4.186666666666667e-05, + "loss": 1.5078, + "step": 38770 + }, + { + "epoch": 2.068609751553847, + "grad_norm": 0.18944492936134338, + "learning_rate": 4.22e-05, + "loss": 1.5, + "step": 38780 + }, + { + "epoch": 2.069151692719359, + "grad_norm": 0.2570621073246002, + "learning_rate": 4.2533333333333335e-05, + "loss": 1.5076, + "step": 38790 + }, + { + "epoch": 2.0696936338848713, + "grad_norm": 0.19097241759300232, + "learning_rate": 4.286666666666667e-05, + "loss": 1.5103, + "step": 38800 + }, + { + "epoch": 2.069802022117974, + "eval_loss": 2.472243309020996, + "eval_runtime": 21.9768, + "eval_samples_per_second": 227.513, + "eval_steps_per_second": 1.229, + "step": 38802 + }, + { + "epoch": 2.0702355750503836, + "grad_norm": 0.2385208159685135, + "learning_rate": 4.32e-05, + "loss": 1.5022, + "step": 38810 + }, + { + "epoch": 2.070777516215896, + "grad_norm": 0.3327140212059021, + "learning_rate": 4.353333333333334e-05, + "loss": 1.5071, + "step": 38820 + }, + { + "epoch": 2.071319457381408, + "grad_norm": 0.36782822012901306, + "learning_rate": 4.3866666666666665e-05, + "loss": 1.5097, + "step": 38830 + }, + { + "epoch": 2.07186139854692, + "grad_norm": 0.35685569047927856, + "learning_rate": 4.4200000000000004e-05, + "loss": 1.5072, + "step": 38840 + }, + { + "epoch": 2.0724033397124324, + "grad_norm": 0.20131441950798035, + "learning_rate": 4.4533333333333336e-05, + "loss": 1.4922, + "step": 38850 + }, + { + "epoch": 2.0729452808779447, + "grad_norm": 0.31305959820747375, + "learning_rate": 4.486666666666667e-05, + "loss": 1.509, + "step": 38860 + }, + { + "epoch": 2.073487222043457, + "grad_norm": 0.2946673333644867, + "learning_rate": 4.52e-05, + "loss": 1.5089, + "step": 38870 + }, + { + "epoch": 2.0740291632089694, + "grad_norm": 0.29778286814689636, + "learning_rate": 4.553333333333333e-05, + "loss": 1.4924, + "step": 38880 + }, + { + "epoch": 2.07451691025793, + "eval_loss": 2.4727067947387695, + "eval_runtime": 21.9813, + "eval_samples_per_second": 227.466, + "eval_steps_per_second": 1.228, + "step": 38889 + }, + { + "epoch": 2.0745711043744812, + "grad_norm": 0.2734017074108124, + "learning_rate": 4.5866666666666666e-05, + "loss": 1.5117, + "step": 38890 + }, + { + "epoch": 2.0751130455399935, + "grad_norm": 0.1901269257068634, + "learning_rate": 4.6200000000000005e-05, + "loss": 1.503, + "step": 38900 + }, + { + "epoch": 2.075654986705506, + "grad_norm": 0.2383534014225006, + "learning_rate": 4.653333333333334e-05, + "loss": 1.4965, + "step": 38910 + }, + { + "epoch": 2.076196927871018, + "grad_norm": 0.27332353591918945, + "learning_rate": 4.686666666666667e-05, + "loss": 1.5121, + "step": 38920 + }, + { + "epoch": 2.07673886903653, + "grad_norm": 0.25885239243507385, + "learning_rate": 4.72e-05, + "loss": 1.5025, + "step": 38930 + }, + { + "epoch": 2.0772808102020424, + "grad_norm": 0.19907046854496002, + "learning_rate": 4.7533333333333334e-05, + "loss": 1.5126, + "step": 38940 + }, + { + "epoch": 2.0778227513675547, + "grad_norm": 0.19673128426074982, + "learning_rate": 4.7866666666666674e-05, + "loss": 1.5149, + "step": 38950 + }, + { + "epoch": 2.078364692533067, + "grad_norm": 0.3096723258495331, + "learning_rate": 4.82e-05, + "loss": 1.509, + "step": 38960 + }, + { + "epoch": 2.0789066336985793, + "grad_norm": 0.22866037487983704, + "learning_rate": 4.853333333333334e-05, + "loss": 1.5085, + "step": 38970 + }, + { + "epoch": 2.0792317983978865, + "eval_loss": 2.4707653522491455, + "eval_runtime": 21.9775, + "eval_samples_per_second": 227.506, + "eval_steps_per_second": 1.229, + "step": 38976 + }, + { + "epoch": 2.079448574864091, + "grad_norm": 0.19826719164848328, + "learning_rate": 4.886666666666667e-05, + "loss": 1.5061, + "step": 38980 + }, + { + "epoch": 2.0799905160296035, + "grad_norm": 0.3995167016983032, + "learning_rate": 4.92e-05, + "loss": 1.4995, + "step": 38990 + }, + { + "epoch": 2.080532457195116, + "grad_norm": 0.3492770791053772, + "learning_rate": 4.9533333333333336e-05, + "loss": 1.5092, + "step": 39000 + }, + { + "epoch": 2.081074398360628, + "grad_norm": 0.248097226023674, + "learning_rate": 4.986666666666667e-05, + "loss": 1.5146, + "step": 39010 + }, + { + "epoch": 2.08161633952614, + "grad_norm": 0.3634330630302429, + "learning_rate": 5.02e-05, + "loss": 1.5122, + "step": 39020 + }, + { + "epoch": 2.0821582806916523, + "grad_norm": 0.30074241757392883, + "learning_rate": 5.053333333333333e-05, + "loss": 1.5094, + "step": 39030 + }, + { + "epoch": 2.0827002218571646, + "grad_norm": 0.19916889071464539, + "learning_rate": 5.086666666666667e-05, + "loss": 1.5129, + "step": 39040 + }, + { + "epoch": 2.083242163022677, + "grad_norm": 0.19030417501926422, + "learning_rate": 5.1200000000000004e-05, + "loss": 1.5048, + "step": 39050 + }, + { + "epoch": 2.0837841041881893, + "grad_norm": 0.2428630143404007, + "learning_rate": 5.153333333333333e-05, + "loss": 1.5089, + "step": 39060 + }, + { + "epoch": 2.0839466865378427, + "eval_loss": 2.4720537662506104, + "eval_runtime": 21.9794, + "eval_samples_per_second": 227.486, + "eval_steps_per_second": 1.228, + "step": 39063 + }, + { + "epoch": 2.084326045353701, + "grad_norm": 0.2563911974430084, + "learning_rate": 5.1866666666666676e-05, + "loss": 1.5078, + "step": 39070 + }, + { + "epoch": 2.0848679865192135, + "grad_norm": 0.20653724670410156, + "learning_rate": 5.22e-05, + "loss": 1.4982, + "step": 39080 + }, + { + "epoch": 2.0854099276847258, + "grad_norm": 0.29985037446022034, + "learning_rate": 5.2533333333333334e-05, + "loss": 1.5188, + "step": 39090 + }, + { + "epoch": 2.085951868850238, + "grad_norm": 0.2325274795293808, + "learning_rate": 5.2866666666666666e-05, + "loss": 1.4888, + "step": 39100 + }, + { + "epoch": 2.08649381001575, + "grad_norm": 0.35616472363471985, + "learning_rate": 5.3200000000000006e-05, + "loss": 1.503, + "step": 39110 + }, + { + "epoch": 2.0870357511812623, + "grad_norm": 0.23784513771533966, + "learning_rate": 5.353333333333334e-05, + "loss": 1.4995, + "step": 39120 + }, + { + "epoch": 2.0875776923467746, + "grad_norm": 0.2512984275817871, + "learning_rate": 5.3866666666666664e-05, + "loss": 1.4988, + "step": 39130 + }, + { + "epoch": 2.088119633512287, + "grad_norm": 0.31452298164367676, + "learning_rate": 5.420000000000001e-05, + "loss": 1.5064, + "step": 39140 + }, + { + "epoch": 2.0886615746777992, + "grad_norm": 0.2067795991897583, + "learning_rate": 5.4533333333333335e-05, + "loss": 1.503, + "step": 39150 + }, + { + "epoch": 2.0886615746777992, + "eval_loss": 2.478317975997925, + "eval_runtime": 21.954, + "eval_samples_per_second": 227.749, + "eval_steps_per_second": 1.23, + "step": 39150 + }, + { + "epoch": 2.089203515843311, + "grad_norm": 0.2181154042482376, + "learning_rate": 5.486666666666667e-05, + "loss": 1.5082, + "step": 39160 + }, + { + "epoch": 2.0897454570088234, + "grad_norm": 0.25218093395233154, + "learning_rate": 5.520000000000001e-05, + "loss": 1.4949, + "step": 39170 + }, + { + "epoch": 2.0902873981743357, + "grad_norm": 0.26879268884658813, + "learning_rate": 5.553333333333334e-05, + "loss": 1.5169, + "step": 39180 + }, + { + "epoch": 2.090829339339848, + "grad_norm": 0.42360979318618774, + "learning_rate": 5.5866666666666665e-05, + "loss": 1.5001, + "step": 39190 + }, + { + "epoch": 2.09137128050536, + "grad_norm": 0.3156111538410187, + "learning_rate": 5.620000000000001e-05, + "loss": 1.4959, + "step": 39200 + }, + { + "epoch": 2.0919132216708722, + "grad_norm": 0.3810129761695862, + "learning_rate": 5.6533333333333336e-05, + "loss": 1.5044, + "step": 39210 + }, + { + "epoch": 2.0924551628363846, + "grad_norm": 0.2980895936489105, + "learning_rate": 5.686666666666667e-05, + "loss": 1.509, + "step": 39220 + }, + { + "epoch": 2.092997104001897, + "grad_norm": 0.2537708878517151, + "learning_rate": 5.72e-05, + "loss": 1.502, + "step": 39230 + }, + { + "epoch": 2.0933764628177554, + "eval_loss": 2.486907482147217, + "eval_runtime": 21.9802, + "eval_samples_per_second": 227.478, + "eval_steps_per_second": 1.228, + "step": 39237 + }, + { + "epoch": 2.093539045167409, + "grad_norm": 0.2361438274383545, + "learning_rate": 5.753333333333334e-05, + "loss": 1.5046, + "step": 39240 + }, + { + "epoch": 2.094080986332921, + "grad_norm": 0.3751475214958191, + "learning_rate": 5.7866666666666666e-05, + "loss": 1.5045, + "step": 39250 + }, + { + "epoch": 2.0946229274984334, + "grad_norm": 0.28046801686286926, + "learning_rate": 5.82e-05, + "loss": 1.5143, + "step": 39260 + }, + { + "epoch": 2.0951648686639457, + "grad_norm": 0.3462556302547455, + "learning_rate": 5.853333333333334e-05, + "loss": 1.5155, + "step": 39270 + }, + { + "epoch": 2.095706809829458, + "grad_norm": 0.4051414728164673, + "learning_rate": 5.886666666666667e-05, + "loss": 1.5008, + "step": 39280 + }, + { + "epoch": 2.0962487509949703, + "grad_norm": 0.348838746547699, + "learning_rate": 5.92e-05, + "loss": 1.5027, + "step": 39290 + }, + { + "epoch": 2.096790692160482, + "grad_norm": 0.26631638407707214, + "learning_rate": 5.953333333333334e-05, + "loss": 1.5169, + "step": 39300 + }, + { + "epoch": 2.0973326333259945, + "grad_norm": 0.20438840985298157, + "learning_rate": 5.9866666666666674e-05, + "loss": 1.5017, + "step": 39310 + }, + { + "epoch": 2.097874574491507, + "grad_norm": 0.23914705216884613, + "learning_rate": 6.02e-05, + "loss": 1.5111, + "step": 39320 + }, + { + "epoch": 2.0980913509577115, + "eval_loss": 2.485321521759033, + "eval_runtime": 21.983, + "eval_samples_per_second": 227.448, + "eval_steps_per_second": 1.228, + "step": 39324 + }, + { + "epoch": 2.098416515657019, + "grad_norm": 0.4190734028816223, + "learning_rate": 6.053333333333333e-05, + "loss": 1.5052, + "step": 39330 + }, + { + "epoch": 2.098958456822531, + "grad_norm": 0.2981734871864319, + "learning_rate": 6.086666666666667e-05, + "loss": 1.5005, + "step": 39340 + }, + { + "epoch": 2.0995003979880433, + "grad_norm": 0.28054559230804443, + "learning_rate": 6.12e-05, + "loss": 1.4988, + "step": 39350 + }, + { + "epoch": 2.1000423391535556, + "grad_norm": 0.4990901052951813, + "learning_rate": 6.153333333333333e-05, + "loss": 1.4869, + "step": 39360 + }, + { + "epoch": 2.100584280319068, + "grad_norm": 0.30956992506980896, + "learning_rate": 6.186666666666668e-05, + "loss": 1.4983, + "step": 39370 + }, + { + "epoch": 2.1011262214845803, + "grad_norm": 0.25511854887008667, + "learning_rate": 6.220000000000001e-05, + "loss": 1.5014, + "step": 39380 + }, + { + "epoch": 2.101668162650092, + "grad_norm": 0.5350297689437866, + "learning_rate": 6.253333333333333e-05, + "loss": 1.4979, + "step": 39390 + }, + { + "epoch": 2.1022101038156045, + "grad_norm": 0.20907965302467346, + "learning_rate": 6.286666666666667e-05, + "loss": 1.5125, + "step": 39400 + }, + { + "epoch": 2.102752044981117, + "grad_norm": 0.2567403018474579, + "learning_rate": 6.32e-05, + "loss": 1.4964, + "step": 39410 + }, + { + "epoch": 2.102806239097668, + "eval_loss": 2.490506887435913, + "eval_runtime": 21.9797, + "eval_samples_per_second": 227.483, + "eval_steps_per_second": 1.228, + "step": 39411 + }, + { + "epoch": 2.103293986146629, + "grad_norm": 0.25373193621635437, + "learning_rate": 6.353333333333334e-05, + "loss": 1.5058, + "step": 39420 + }, + { + "epoch": 2.103835927312141, + "grad_norm": 0.20478834211826324, + "learning_rate": 6.386666666666667e-05, + "loss": 1.4987, + "step": 39430 + }, + { + "epoch": 2.1043778684776533, + "grad_norm": 0.2046637237071991, + "learning_rate": 6.42e-05, + "loss": 1.51, + "step": 39440 + }, + { + "epoch": 2.1049198096431656, + "grad_norm": 0.2921828329563141, + "learning_rate": 6.453333333333333e-05, + "loss": 1.505, + "step": 39450 + }, + { + "epoch": 2.105461750808678, + "grad_norm": 0.22420616447925568, + "learning_rate": 6.486666666666667e-05, + "loss": 1.5004, + "step": 39460 + }, + { + "epoch": 2.1060036919741902, + "grad_norm": 0.39737066626548767, + "learning_rate": 6.52e-05, + "loss": 1.5068, + "step": 39470 + }, + { + "epoch": 2.106545633139702, + "grad_norm": 0.21135340631008148, + "learning_rate": 6.553333333333333e-05, + "loss": 1.4962, + "step": 39480 + }, + { + "epoch": 2.1070875743052144, + "grad_norm": 0.2920219600200653, + "learning_rate": 6.586666666666666e-05, + "loss": 1.5102, + "step": 39490 + }, + { + "epoch": 2.107521127237624, + "eval_loss": 2.477649450302124, + "eval_runtime": 22.0366, + "eval_samples_per_second": 226.895, + "eval_steps_per_second": 1.225, + "step": 39498 + }, + { + "epoch": 2.1076295154707267, + "grad_norm": 0.34153836965560913, + "learning_rate": 6.620000000000001e-05, + "loss": 1.5078, + "step": 39500 + }, + { + "epoch": 2.108171456636239, + "grad_norm": 0.27279841899871826, + "learning_rate": 6.653333333333334e-05, + "loss": 1.505, + "step": 39510 + }, + { + "epoch": 2.108713397801751, + "grad_norm": 0.2870520055294037, + "learning_rate": 6.686666666666666e-05, + "loss": 1.5043, + "step": 39520 + }, + { + "epoch": 2.1092553389672632, + "grad_norm": 0.34095126390457153, + "learning_rate": 6.720000000000001e-05, + "loss": 1.5083, + "step": 39530 + }, + { + "epoch": 2.1097972801327756, + "grad_norm": 0.2083679437637329, + "learning_rate": 6.753333333333334e-05, + "loss": 1.4983, + "step": 39540 + }, + { + "epoch": 2.110339221298288, + "grad_norm": 0.281099796295166, + "learning_rate": 6.786666666666667e-05, + "loss": 1.5089, + "step": 39550 + }, + { + "epoch": 2.1108811624638, + "grad_norm": 0.29480496048927307, + "learning_rate": 6.82e-05, + "loss": 1.5133, + "step": 39560 + }, + { + "epoch": 2.111423103629312, + "grad_norm": 0.38620585203170776, + "learning_rate": 6.853333333333334e-05, + "loss": 1.5034, + "step": 39570 + }, + { + "epoch": 2.1119650447948244, + "grad_norm": 0.41479256749153137, + "learning_rate": 6.886666666666667e-05, + "loss": 1.5107, + "step": 39580 + }, + { + "epoch": 2.1122360153775808, + "eval_loss": 2.4839091300964355, + "eval_runtime": 22.0638, + "eval_samples_per_second": 226.615, + "eval_steps_per_second": 1.224, + "step": 39585 + }, + { + "epoch": 2.1125069859603367, + "grad_norm": 0.3035687804222107, + "learning_rate": 6.92e-05, + "loss": 1.4977, + "step": 39590 + }, + { + "epoch": 2.113048927125849, + "grad_norm": 0.34999310970306396, + "learning_rate": 6.953333333333333e-05, + "loss": 1.5053, + "step": 39600 + }, + { + "epoch": 2.113590868291361, + "grad_norm": 0.27466028928756714, + "learning_rate": 6.986666666666667e-05, + "loss": 1.5103, + "step": 39610 + }, + { + "epoch": 2.114132809456873, + "grad_norm": 0.4131866693496704, + "learning_rate": 7.02e-05, + "loss": 1.4978, + "step": 39620 + }, + { + "epoch": 2.1146747506223855, + "grad_norm": 0.3529013991355896, + "learning_rate": 7.053333333333334e-05, + "loss": 1.5053, + "step": 39630 + }, + { + "epoch": 2.115216691787898, + "grad_norm": 0.30995792150497437, + "learning_rate": 7.086666666666666e-05, + "loss": 1.513, + "step": 39640 + }, + { + "epoch": 2.11575863295341, + "grad_norm": 0.3839752972126007, + "learning_rate": 7.12e-05, + "loss": 1.505, + "step": 39650 + }, + { + "epoch": 2.116300574118922, + "grad_norm": 0.2367662787437439, + "learning_rate": 7.153333333333334e-05, + "loss": 1.5036, + "step": 39660 + }, + { + "epoch": 2.1168425152844343, + "grad_norm": 0.22004197537899017, + "learning_rate": 7.186666666666667e-05, + "loss": 1.5015, + "step": 39670 + }, + { + "epoch": 2.116950903517537, + "eval_loss": 2.4732017517089844, + "eval_runtime": 21.9785, + "eval_samples_per_second": 227.495, + "eval_steps_per_second": 1.228, + "step": 39672 + }, + { + "epoch": 2.1173844564499467, + "grad_norm": 0.2898196280002594, + "learning_rate": 7.22e-05, + "loss": 1.5044, + "step": 39680 + }, + { + "epoch": 2.117926397615459, + "grad_norm": 0.30908989906311035, + "learning_rate": 7.253333333333334e-05, + "loss": 1.5071, + "step": 39690 + }, + { + "epoch": 2.1184683387809713, + "grad_norm": 0.6186519265174866, + "learning_rate": 7.286666666666667e-05, + "loss": 1.522, + "step": 39700 + }, + { + "epoch": 2.119010279946483, + "grad_norm": 0.47445744276046753, + "learning_rate": 7.32e-05, + "loss": 1.4992, + "step": 39710 + }, + { + "epoch": 2.1195522211119955, + "grad_norm": 0.3463587760925293, + "learning_rate": 7.353333333333334e-05, + "loss": 1.512, + "step": 39720 + }, + { + "epoch": 2.120094162277508, + "grad_norm": 0.3156580626964569, + "learning_rate": 7.386666666666667e-05, + "loss": 1.5026, + "step": 39730 + }, + { + "epoch": 2.12063610344302, + "grad_norm": 0.3200650215148926, + "learning_rate": 7.42e-05, + "loss": 1.504, + "step": 39740 + }, + { + "epoch": 2.121178044608532, + "grad_norm": 0.3450500965118408, + "learning_rate": 7.453333333333333e-05, + "loss": 1.4982, + "step": 39750 + }, + { + "epoch": 2.121665791657493, + "eval_loss": 2.47827410697937, + "eval_runtime": 22.0331, + "eval_samples_per_second": 226.931, + "eval_steps_per_second": 1.225, + "step": 39759 + }, + { + "epoch": 2.1217199857740443, + "grad_norm": 0.221024289727211, + "learning_rate": 7.486666666666668e-05, + "loss": 1.5028, + "step": 39760 + }, + { + "epoch": 2.1222619269395566, + "grad_norm": 0.2555961012840271, + "learning_rate": 7.52e-05, + "loss": 1.5123, + "step": 39770 + }, + { + "epoch": 2.122803868105069, + "grad_norm": 0.34122392535209656, + "learning_rate": 7.553333333333333e-05, + "loss": 1.5043, + "step": 39780 + }, + { + "epoch": 2.1233458092705813, + "grad_norm": 0.29069313406944275, + "learning_rate": 7.586666666666668e-05, + "loss": 1.4978, + "step": 39790 + }, + { + "epoch": 2.123887750436093, + "grad_norm": 0.3182983696460724, + "learning_rate": 7.620000000000001e-05, + "loss": 1.5068, + "step": 39800 + }, + { + "epoch": 2.1244296916016054, + "grad_norm": 0.5020242929458618, + "learning_rate": 7.653333333333333e-05, + "loss": 1.497, + "step": 39810 + }, + { + "epoch": 2.1249716327671178, + "grad_norm": 0.27356261014938354, + "learning_rate": 7.686666666666667e-05, + "loss": 1.5029, + "step": 39820 + }, + { + "epoch": 2.12551357393263, + "grad_norm": 0.315500408411026, + "learning_rate": 7.72e-05, + "loss": 1.5193, + "step": 39830 + }, + { + "epoch": 2.126055515098142, + "grad_norm": 0.36083605885505676, + "learning_rate": 7.753333333333334e-05, + "loss": 1.5013, + "step": 39840 + }, + { + "epoch": 2.1263806797974496, + "eval_loss": 2.4653656482696533, + "eval_runtime": 21.9768, + "eval_samples_per_second": 227.513, + "eval_steps_per_second": 1.229, + "step": 39846 + }, + { + "epoch": 2.1265974562636543, + "grad_norm": 0.3310469090938568, + "learning_rate": 7.786666666666667e-05, + "loss": 1.5136, + "step": 39850 + }, + { + "epoch": 2.1271393974291666, + "grad_norm": 0.3014518618583679, + "learning_rate": 7.82e-05, + "loss": 1.5079, + "step": 39860 + }, + { + "epoch": 2.127681338594679, + "grad_norm": 0.434340238571167, + "learning_rate": 7.853333333333334e-05, + "loss": 1.5123, + "step": 39870 + }, + { + "epoch": 2.128223279760191, + "grad_norm": 0.34903663396835327, + "learning_rate": 7.886666666666667e-05, + "loss": 1.5069, + "step": 39880 + }, + { + "epoch": 2.128765220925703, + "grad_norm": 0.2542589604854584, + "learning_rate": 7.920000000000001e-05, + "loss": 1.5123, + "step": 39890 + }, + { + "epoch": 2.1293071620912154, + "grad_norm": 0.26018011569976807, + "learning_rate": 7.953333333333333e-05, + "loss": 1.5098, + "step": 39900 + }, + { + "epoch": 2.1298491032567277, + "grad_norm": 0.19746781885623932, + "learning_rate": 7.986666666666667e-05, + "loss": 1.5074, + "step": 39910 + }, + { + "epoch": 2.13039104442224, + "grad_norm": 0.2585922181606293, + "learning_rate": 8.020000000000001e-05, + "loss": 1.5005, + "step": 39920 + }, + { + "epoch": 2.1309329855877523, + "grad_norm": 0.3514852523803711, + "learning_rate": 8.053333333333334e-05, + "loss": 1.5007, + "step": 39930 + }, + { + "epoch": 2.1310955679374057, + "eval_loss": 2.4711849689483643, + "eval_runtime": 21.9689, + "eval_samples_per_second": 227.594, + "eval_steps_per_second": 1.229, + "step": 39933 + }, + { + "epoch": 2.131474926753264, + "grad_norm": 0.4120597839355469, + "learning_rate": 8.086666666666666e-05, + "loss": 1.5033, + "step": 39940 + }, + { + "epoch": 2.1320168679187765, + "grad_norm": 0.27820542454719543, + "learning_rate": 8.120000000000001e-05, + "loss": 1.5124, + "step": 39950 + }, + { + "epoch": 2.132558809084289, + "grad_norm": 0.33248940110206604, + "learning_rate": 8.153333333333334e-05, + "loss": 1.5004, + "step": 39960 + }, + { + "epoch": 2.133100750249801, + "grad_norm": 0.4663317799568176, + "learning_rate": 8.186666666666667e-05, + "loss": 1.5161, + "step": 39970 + }, + { + "epoch": 2.133642691415313, + "grad_norm": 0.463692843914032, + "learning_rate": 8.22e-05, + "loss": 1.5041, + "step": 39980 + }, + { + "epoch": 2.1341846325808254, + "grad_norm": 0.3125864267349243, + "learning_rate": 8.253333333333334e-05, + "loss": 1.496, + "step": 39990 + }, + { + "epoch": 2.1347265737463377, + "grad_norm": 0.4677470922470093, + "learning_rate": 8.286666666666667e-05, + "loss": 1.5, + "step": 40000 + }, + { + "epoch": 2.13526851491185, + "grad_norm": 0.24888941645622253, + "learning_rate": 8.32e-05, + "loss": 1.5033, + "step": 40010 + }, + { + "epoch": 2.1358104560773623, + "grad_norm": 0.26266810297966003, + "learning_rate": 8.353333333333334e-05, + "loss": 1.5042, + "step": 40020 + }, + { + "epoch": 2.1358104560773623, + "eval_loss": 2.4674057960510254, + "eval_runtime": 21.5786, + "eval_samples_per_second": 231.711, + "eval_steps_per_second": 1.251, + "step": 40020 + }, + { + "epoch": 2.136352397242874, + "grad_norm": 0.36174437403678894, + "learning_rate": 8.386666666666667e-05, + "loss": 1.5109, + "step": 40030 + }, + { + "epoch": 2.1368943384083865, + "grad_norm": 0.2121952325105667, + "learning_rate": 8.42e-05, + "loss": 1.4932, + "step": 40040 + }, + { + "epoch": 2.137436279573899, + "grad_norm": 0.3189486861228943, + "learning_rate": 8.453333333333335e-05, + "loss": 1.51, + "step": 40050 + }, + { + "epoch": 2.137978220739411, + "grad_norm": 0.36640292406082153, + "learning_rate": 8.486666666666668e-05, + "loss": 1.5087, + "step": 40060 + }, + { + "epoch": 2.138520161904923, + "grad_norm": 0.22334055602550507, + "learning_rate": 8.52e-05, + "loss": 1.5019, + "step": 40070 + }, + { + "epoch": 2.1390621030704353, + "grad_norm": 0.4532851576805115, + "learning_rate": 8.553333333333333e-05, + "loss": 1.5119, + "step": 40080 + }, + { + "epoch": 2.1396040442359476, + "grad_norm": 0.30973002314567566, + "learning_rate": 8.586666666666668e-05, + "loss": 1.5028, + "step": 40090 + }, + { + "epoch": 2.14014598540146, + "grad_norm": 0.27160486578941345, + "learning_rate": 8.620000000000001e-05, + "loss": 1.5112, + "step": 40100 + }, + { + "epoch": 2.1405253442173184, + "eval_loss": 2.4754085540771484, + "eval_runtime": 21.9771, + "eval_samples_per_second": 227.51, + "eval_steps_per_second": 1.229, + "step": 40107 + }, + { + "epoch": 2.1406879265669723, + "grad_norm": 0.3385959267616272, + "learning_rate": 8.653333333333333e-05, + "loss": 1.5036, + "step": 40110 + }, + { + "epoch": 2.141229867732484, + "grad_norm": 0.5644890069961548, + "learning_rate": 8.686666666666667e-05, + "loss": 1.5046, + "step": 40120 + }, + { + "epoch": 2.1417718088979965, + "grad_norm": 0.4575164020061493, + "learning_rate": 8.72e-05, + "loss": 1.5059, + "step": 40130 + }, + { + "epoch": 2.1423137500635088, + "grad_norm": 0.656599223613739, + "learning_rate": 8.753333333333334e-05, + "loss": 1.5027, + "step": 40140 + }, + { + "epoch": 2.142855691229021, + "grad_norm": 0.4436124563217163, + "learning_rate": 8.786666666666667e-05, + "loss": 1.5026, + "step": 40150 + }, + { + "epoch": 2.1433976323945334, + "grad_norm": 0.35058170557022095, + "learning_rate": 8.82e-05, + "loss": 1.5074, + "step": 40160 + }, + { + "epoch": 2.1439395735600453, + "grad_norm": 0.2605305016040802, + "learning_rate": 8.853333333333333e-05, + "loss": 1.5077, + "step": 40170 + }, + { + "epoch": 2.1444815147255576, + "grad_norm": 0.3072412312030792, + "learning_rate": 8.886666666666668e-05, + "loss": 1.5117, + "step": 40180 + }, + { + "epoch": 2.14502345589107, + "grad_norm": 0.20943394303321838, + "learning_rate": 8.92e-05, + "loss": 1.5003, + "step": 40190 + }, + { + "epoch": 2.1452402323572746, + "eval_loss": 2.478358507156372, + "eval_runtime": 24.6282, + "eval_samples_per_second": 203.019, + "eval_steps_per_second": 1.096, + "step": 40194 + }, + { + "epoch": 2.1455653970565822, + "grad_norm": 0.4068165719509125, + "learning_rate": 8.953333333333333e-05, + "loss": 1.5108, + "step": 40200 + }, + { + "epoch": 2.146107338222094, + "grad_norm": 0.35715365409851074, + "learning_rate": 8.986666666666666e-05, + "loss": 1.5062, + "step": 40210 + }, + { + "epoch": 2.1466492793876064, + "grad_norm": 0.4496524930000305, + "learning_rate": 9.020000000000001e-05, + "loss": 1.5063, + "step": 40220 + }, + { + "epoch": 2.1471912205531187, + "grad_norm": 0.606382429599762, + "learning_rate": 9.053333333333334e-05, + "loss": 1.5041, + "step": 40230 + }, + { + "epoch": 2.147733161718631, + "grad_norm": 0.3925984799861908, + "learning_rate": 9.086666666666666e-05, + "loss": 1.5084, + "step": 40240 + }, + { + "epoch": 2.148275102884143, + "grad_norm": 0.29136770963668823, + "learning_rate": 9.120000000000001e-05, + "loss": 1.5013, + "step": 40250 + }, + { + "epoch": 2.1488170440496552, + "grad_norm": 0.35544130206108093, + "learning_rate": 9.153333333333334e-05, + "loss": 1.5072, + "step": 40260 + }, + { + "epoch": 2.1493589852151676, + "grad_norm": 0.3096676766872406, + "learning_rate": 9.186666666666667e-05, + "loss": 1.5037, + "step": 40270 + }, + { + "epoch": 2.14990092638068, + "grad_norm": 0.2778821587562561, + "learning_rate": 9.22e-05, + "loss": 1.5112, + "step": 40280 + }, + { + "epoch": 2.149955120497231, + "eval_loss": 2.4636154174804688, + "eval_runtime": 24.7898, + "eval_samples_per_second": 201.695, + "eval_steps_per_second": 1.089, + "step": 40281 + }, + { + "epoch": 2.150442867546192, + "grad_norm": 0.7011622786521912, + "learning_rate": 9.253333333333334e-05, + "loss": 1.5166, + "step": 40290 + }, + { + "epoch": 2.150984808711704, + "grad_norm": 0.5509504079818726, + "learning_rate": 9.286666666666667e-05, + "loss": 1.5078, + "step": 40300 + }, + { + "epoch": 2.1515267498772164, + "grad_norm": 0.28685396909713745, + "learning_rate": 9.320000000000002e-05, + "loss": 1.5159, + "step": 40310 + }, + { + "epoch": 2.1520686910427287, + "grad_norm": 0.3149552345275879, + "learning_rate": 9.353333333333333e-05, + "loss": 1.5044, + "step": 40320 + }, + { + "epoch": 2.152610632208241, + "grad_norm": 0.458866685628891, + "learning_rate": 9.386666666666667e-05, + "loss": 1.4973, + "step": 40330 + }, + { + "epoch": 2.1531525733737533, + "grad_norm": 0.21605221927165985, + "learning_rate": 9.42e-05, + "loss": 1.5009, + "step": 40340 + }, + { + "epoch": 2.153694514539265, + "grad_norm": 0.6214112043380737, + "learning_rate": 9.453333333333335e-05, + "loss": 1.5062, + "step": 40350 + }, + { + "epoch": 2.1542364557047775, + "grad_norm": 0.5758342146873474, + "learning_rate": 9.486666666666666e-05, + "loss": 1.5175, + "step": 40360 + }, + { + "epoch": 2.1546700086371873, + "eval_loss": 2.45977520942688, + "eval_runtime": 21.979, + "eval_samples_per_second": 227.489, + "eval_steps_per_second": 1.228, + "step": 40368 + }, + { + "epoch": 2.15477839687029, + "grad_norm": 0.3867745101451874, + "learning_rate": 9.52e-05, + "loss": 1.513, + "step": 40370 + }, + { + "epoch": 2.155320338035802, + "grad_norm": 0.25635677576065063, + "learning_rate": 9.553333333333334e-05, + "loss": 1.5077, + "step": 40380 + }, + { + "epoch": 2.155862279201314, + "grad_norm": 0.5329334735870361, + "learning_rate": 9.586666666666667e-05, + "loss": 1.5127, + "step": 40390 + }, + { + "epoch": 2.1564042203668263, + "grad_norm": 0.4407355785369873, + "learning_rate": 9.620000000000001e-05, + "loss": 1.5063, + "step": 40400 + }, + { + "epoch": 2.1569461615323386, + "grad_norm": 0.3090643584728241, + "learning_rate": 9.653333333333334e-05, + "loss": 1.5083, + "step": 40410 + }, + { + "epoch": 2.157488102697851, + "grad_norm": 0.487301766872406, + "learning_rate": 9.686666666666667e-05, + "loss": 1.5051, + "step": 40420 + }, + { + "epoch": 2.1580300438633633, + "grad_norm": 0.5774275064468384, + "learning_rate": 9.72e-05, + "loss": 1.5058, + "step": 40430 + }, + { + "epoch": 2.158571985028875, + "grad_norm": 0.30334457755088806, + "learning_rate": 9.753333333333334e-05, + "loss": 1.514, + "step": 40440 + }, + { + "epoch": 2.1591139261943875, + "grad_norm": 0.5308240056037903, + "learning_rate": 9.786666666666667e-05, + "loss": 1.5075, + "step": 40450 + }, + { + "epoch": 2.1593848967771434, + "eval_loss": 2.4678995609283447, + "eval_runtime": 22.396, + "eval_samples_per_second": 223.254, + "eval_steps_per_second": 1.206, + "step": 40455 + }, + { + "epoch": 2.1596558673599, + "grad_norm": 0.5267515182495117, + "learning_rate": 9.82e-05, + "loss": 1.5033, + "step": 40460 + }, + { + "epoch": 2.160197808525412, + "grad_norm": 0.4302433133125305, + "learning_rate": 9.853333333333333e-05, + "loss": 1.5039, + "step": 40470 + }, + { + "epoch": 2.160739749690924, + "grad_norm": 0.2622862160205841, + "learning_rate": 9.886666666666668e-05, + "loss": 1.5074, + "step": 40480 + }, + { + "epoch": 2.1612816908564363, + "grad_norm": 0.34495052695274353, + "learning_rate": 9.92e-05, + "loss": 1.5148, + "step": 40490 + }, + { + "epoch": 2.1618236320219486, + "grad_norm": 0.28749775886535645, + "learning_rate": 9.953333333333333e-05, + "loss": 1.5168, + "step": 40500 + }, + { + "epoch": 2.162365573187461, + "grad_norm": 0.34906288981437683, + "learning_rate": 9.986666666666668e-05, + "loss": 1.5094, + "step": 40510 + }, + { + "epoch": 2.1629075143529732, + "grad_norm": 0.6776122450828552, + "learning_rate": 9.999999899660717e-05, + "loss": 1.5113, + "step": 40520 + }, + { + "epoch": 2.163449455518485, + "grad_norm": 0.3558565080165863, + "learning_rate": 9.99999928647622e-05, + "loss": 1.5072, + "step": 40530 + }, + { + "epoch": 2.1639913966839974, + "grad_norm": 0.2457047551870346, + "learning_rate": 9.999998115851347e-05, + "loss": 1.5147, + "step": 40540 + }, + { + "epoch": 2.1640997849171, + "eval_loss": 2.471486806869507, + "eval_runtime": 21.974, + "eval_samples_per_second": 227.542, + "eval_steps_per_second": 1.229, + "step": 40542 + }, + { + "epoch": 2.1645333378495097, + "grad_norm": 0.520976722240448, + "learning_rate": 9.999996387786247e-05, + "loss": 1.5066, + "step": 40550 + }, + { + "epoch": 2.165075279015022, + "grad_norm": 0.3190830945968628, + "learning_rate": 9.99999410228113e-05, + "loss": 1.5005, + "step": 40560 + }, + { + "epoch": 2.1656172201805344, + "grad_norm": 0.31958481669425964, + "learning_rate": 9.99999125933628e-05, + "loss": 1.5157, + "step": 40570 + }, + { + "epoch": 2.1661591613460462, + "grad_norm": 0.5813402533531189, + "learning_rate": 9.999987858952052e-05, + "loss": 1.5099, + "step": 40580 + }, + { + "epoch": 2.1667011025115586, + "grad_norm": 0.34184807538986206, + "learning_rate": 9.999983901128863e-05, + "loss": 1.5005, + "step": 40590 + }, + { + "epoch": 2.167243043677071, + "grad_norm": 0.3223097622394562, + "learning_rate": 9.999979385867205e-05, + "loss": 1.5134, + "step": 40600 + }, + { + "epoch": 2.167784984842583, + "grad_norm": 0.20804668962955475, + "learning_rate": 9.99997431316764e-05, + "loss": 1.4992, + "step": 40610 + }, + { + "epoch": 2.168326926008095, + "grad_norm": 0.25401726365089417, + "learning_rate": 9.999968683030792e-05, + "loss": 1.5153, + "step": 40620 + }, + { + "epoch": 2.168814673057056, + "eval_loss": 2.468203067779541, + "eval_runtime": 21.9753, + "eval_samples_per_second": 227.529, + "eval_steps_per_second": 1.229, + "step": 40629 + }, + { + "epoch": 2.1688688671736074, + "grad_norm": 0.2348717451095581, + "learning_rate": 9.999962495457362e-05, + "loss": 1.5125, + "step": 40630 + }, + { + "epoch": 2.1694108083391197, + "grad_norm": 0.43220508098602295, + "learning_rate": 9.999955750448114e-05, + "loss": 1.5078, + "step": 40640 + }, + { + "epoch": 2.169952749504632, + "grad_norm": 0.28150442242622375, + "learning_rate": 9.999948448003884e-05, + "loss": 1.5077, + "step": 40650 + }, + { + "epoch": 2.170494690670144, + "grad_norm": 0.24436496198177338, + "learning_rate": 9.999940588125579e-05, + "loss": 1.5158, + "step": 40660 + }, + { + "epoch": 2.171036631835656, + "grad_norm": 0.3555525243282318, + "learning_rate": 9.999932170814168e-05, + "loss": 1.5009, + "step": 40670 + }, + { + "epoch": 2.1715785730011685, + "grad_norm": 0.6536611318588257, + "learning_rate": 9.999923196070698e-05, + "loss": 1.5179, + "step": 40680 + }, + { + "epoch": 2.172120514166681, + "grad_norm": 0.555479884147644, + "learning_rate": 9.99991366389628e-05, + "loss": 1.5034, + "step": 40690 + }, + { + "epoch": 2.172662455332193, + "grad_norm": 0.27514633536338806, + "learning_rate": 9.999903574292093e-05, + "loss": 1.4983, + "step": 40700 + }, + { + "epoch": 2.173204396497705, + "grad_norm": 0.2602129876613617, + "learning_rate": 9.999892927259388e-05, + "loss": 1.5167, + "step": 40710 + }, + { + "epoch": 2.1735295611970127, + "eval_loss": 2.479363441467285, + "eval_runtime": 21.9755, + "eval_samples_per_second": 227.526, + "eval_steps_per_second": 1.229, + "step": 40716 + }, + { + "epoch": 2.1737463376632173, + "grad_norm": 0.3722988963127136, + "learning_rate": 9.999881722799482e-05, + "loss": 1.5071, + "step": 40720 + }, + { + "epoch": 2.1742882788287297, + "grad_norm": 0.22509793937206268, + "learning_rate": 9.999869960913767e-05, + "loss": 1.5103, + "step": 40730 + }, + { + "epoch": 2.174830219994242, + "grad_norm": 0.5048813819885254, + "learning_rate": 9.999857641603697e-05, + "loss": 1.5164, + "step": 40740 + }, + { + "epoch": 2.1753721611597543, + "grad_norm": 0.3289371430873871, + "learning_rate": 9.999844764870799e-05, + "loss": 1.5028, + "step": 40750 + }, + { + "epoch": 2.175914102325266, + "grad_norm": 0.5190215706825256, + "learning_rate": 9.999831330716668e-05, + "loss": 1.5106, + "step": 40760 + }, + { + "epoch": 2.1764560434907785, + "grad_norm": 0.2816135883331299, + "learning_rate": 9.999817339142969e-05, + "loss": 1.5065, + "step": 40770 + }, + { + "epoch": 2.176997984656291, + "grad_norm": 0.7596124410629272, + "learning_rate": 9.999802790151434e-05, + "loss": 1.5103, + "step": 40780 + }, + { + "epoch": 2.177539925821803, + "grad_norm": 0.34131622314453125, + "learning_rate": 9.999787683743863e-05, + "loss": 1.5063, + "step": 40790 + }, + { + "epoch": 2.178081866987315, + "grad_norm": 0.3968164026737213, + "learning_rate": 9.999772019922132e-05, + "loss": 1.5084, + "step": 40800 + }, + { + "epoch": 2.178244449336969, + "eval_loss": 2.4752957820892334, + "eval_runtime": 21.9803, + "eval_samples_per_second": 227.477, + "eval_steps_per_second": 1.228, + "step": 40803 + }, + { + "epoch": 2.1786238081528273, + "grad_norm": 0.28055036067962646, + "learning_rate": 9.999755798688178e-05, + "loss": 1.5242, + "step": 40810 + }, + { + "epoch": 2.1791657493183396, + "grad_norm": 0.35142403841018677, + "learning_rate": 9.999739020044013e-05, + "loss": 1.5095, + "step": 40820 + }, + { + "epoch": 2.179707690483852, + "grad_norm": 0.2656286656856537, + "learning_rate": 9.999721683991714e-05, + "loss": 1.5034, + "step": 40830 + }, + { + "epoch": 2.1802496316493643, + "grad_norm": 0.2500777840614319, + "learning_rate": 9.999703790533428e-05, + "loss": 1.5072, + "step": 40840 + }, + { + "epoch": 2.180791572814876, + "grad_norm": 0.23317931592464447, + "learning_rate": 9.999685339671372e-05, + "loss": 1.5045, + "step": 40850 + }, + { + "epoch": 2.1813335139803884, + "grad_norm": 0.6272380352020264, + "learning_rate": 9.999666331407832e-05, + "loss": 1.506, + "step": 40860 + }, + { + "epoch": 2.1818754551459008, + "grad_norm": 0.28550583124160767, + "learning_rate": 9.999646765745162e-05, + "loss": 1.5038, + "step": 40870 + }, + { + "epoch": 2.182417396311413, + "grad_norm": 0.3291584253311157, + "learning_rate": 9.999626642685788e-05, + "loss": 1.5097, + "step": 40880 + }, + { + "epoch": 2.182959337476925, + "grad_norm": 0.6882104873657227, + "learning_rate": 9.999605962232201e-05, + "loss": 1.4979, + "step": 40890 + }, + { + "epoch": 2.182959337476925, + "eval_loss": 2.482081174850464, + "eval_runtime": 21.9465, + "eval_samples_per_second": 227.827, + "eval_steps_per_second": 1.23, + "step": 40890 + }, + { + "epoch": 2.1835012786424373, + "grad_norm": 0.4152418375015259, + "learning_rate": 9.999584724386959e-05, + "loss": 1.4999, + "step": 40900 + }, + { + "epoch": 2.1840432198079496, + "grad_norm": 0.40763190388679504, + "learning_rate": 9.9995629291527e-05, + "loss": 1.5092, + "step": 40910 + }, + { + "epoch": 2.184585160973462, + "grad_norm": 0.29160091280937195, + "learning_rate": 9.99954057653212e-05, + "loss": 1.5012, + "step": 40920 + }, + { + "epoch": 2.185127102138974, + "grad_norm": 0.40323200821876526, + "learning_rate": 9.999517666527988e-05, + "loss": 1.5059, + "step": 40930 + }, + { + "epoch": 2.185669043304486, + "grad_norm": 0.30431511998176575, + "learning_rate": 9.999494199143142e-05, + "loss": 1.5109, + "step": 40940 + }, + { + "epoch": 2.1862109844699984, + "grad_norm": 0.2627016007900238, + "learning_rate": 9.999470174380489e-05, + "loss": 1.5064, + "step": 40950 + }, + { + "epoch": 2.1867529256355107, + "grad_norm": 0.47459253668785095, + "learning_rate": 9.999445592243008e-05, + "loss": 1.51, + "step": 40960 + }, + { + "epoch": 2.187294866801023, + "grad_norm": 0.352264404296875, + "learning_rate": 9.999420452733739e-05, + "loss": 1.5102, + "step": 40970 + }, + { + "epoch": 2.1876742256168815, + "eval_loss": 2.473921298980713, + "eval_runtime": 21.9773, + "eval_samples_per_second": 227.507, + "eval_steps_per_second": 1.229, + "step": 40977 + }, + { + "epoch": 2.1878368079665353, + "grad_norm": 0.35294270515441895, + "learning_rate": 9.9993947558558e-05, + "loss": 1.5141, + "step": 40980 + }, + { + "epoch": 2.188378749132047, + "grad_norm": 0.4508706033229828, + "learning_rate": 9.999368501612373e-05, + "loss": 1.503, + "step": 40990 + }, + { + "epoch": 2.1889206902975595, + "grad_norm": 0.5512154698371887, + "learning_rate": 9.999341690006711e-05, + "loss": 1.5284, + "step": 41000 + }, + { + "epoch": 2.189462631463072, + "grad_norm": 0.4073977768421173, + "learning_rate": 9.999314321042134e-05, + "loss": 1.4996, + "step": 41010 + }, + { + "epoch": 2.190004572628584, + "grad_norm": 0.458518385887146, + "learning_rate": 9.999286394722031e-05, + "loss": 1.5099, + "step": 41020 + }, + { + "epoch": 2.190546513794096, + "grad_norm": 0.25982314348220825, + "learning_rate": 9.999257911049866e-05, + "loss": 1.5042, + "step": 41030 + }, + { + "epoch": 2.1910884549596084, + "grad_norm": 0.2903514802455902, + "learning_rate": 9.999228870029165e-05, + "loss": 1.5033, + "step": 41040 + }, + { + "epoch": 2.1916303961251207, + "grad_norm": 0.21735897660255432, + "learning_rate": 9.999199271663524e-05, + "loss": 1.5171, + "step": 41050 + }, + { + "epoch": 2.192172337290633, + "grad_norm": 0.39294907450675964, + "learning_rate": 9.999169115956612e-05, + "loss": 1.5007, + "step": 41060 + }, + { + "epoch": 2.1923891137568376, + "eval_loss": 2.489570140838623, + "eval_runtime": 21.9756, + "eval_samples_per_second": 227.525, + "eval_steps_per_second": 1.229, + "step": 41064 + }, + { + "epoch": 2.1927142784561453, + "grad_norm": 0.5847845673561096, + "learning_rate": 9.999138402912161e-05, + "loss": 1.5158, + "step": 41070 + }, + { + "epoch": 2.193256219621657, + "grad_norm": 0.33957940340042114, + "learning_rate": 9.99910713253398e-05, + "loss": 1.5152, + "step": 41080 + }, + { + "epoch": 2.1937981607871695, + "grad_norm": 0.3208034038543701, + "learning_rate": 9.999075304825941e-05, + "loss": 1.509, + "step": 41090 + }, + { + "epoch": 2.194340101952682, + "grad_norm": 0.3303355872631073, + "learning_rate": 9.999042919791985e-05, + "loss": 1.5075, + "step": 41100 + }, + { + "epoch": 2.194882043118194, + "grad_norm": 0.278929203748703, + "learning_rate": 9.999009977436125e-05, + "loss": 1.5121, + "step": 41110 + }, + { + "epoch": 2.195423984283706, + "grad_norm": 0.21701566874980927, + "learning_rate": 9.998976477762442e-05, + "loss": 1.5121, + "step": 41120 + }, + { + "epoch": 2.1959659254492183, + "grad_norm": 0.35072654485702515, + "learning_rate": 9.998942420775086e-05, + "loss": 1.5021, + "step": 41130 + }, + { + "epoch": 2.1965078666147306, + "grad_norm": 0.36139407753944397, + "learning_rate": 9.998907806478275e-05, + "loss": 1.5029, + "step": 41140 + }, + { + "epoch": 2.197049807780243, + "grad_norm": 0.31903785467147827, + "learning_rate": 9.998872634876297e-05, + "loss": 1.506, + "step": 41150 + }, + { + "epoch": 2.1971040018967942, + "eval_loss": 2.4797050952911377, + "eval_runtime": 21.9824, + "eval_samples_per_second": 227.454, + "eval_steps_per_second": 1.228, + "step": 41151 + }, + { + "epoch": 2.1975917489457553, + "grad_norm": 0.6769224405288696, + "learning_rate": 9.99883690597351e-05, + "loss": 1.5126, + "step": 41160 + }, + { + "epoch": 2.198133690111267, + "grad_norm": 0.44375690817832947, + "learning_rate": 9.998800619774339e-05, + "loss": 1.5127, + "step": 41170 + }, + { + "epoch": 2.1986756312767795, + "grad_norm": 0.22975300252437592, + "learning_rate": 9.998763776283277e-05, + "loss": 1.5153, + "step": 41180 + }, + { + "epoch": 2.1992175724422918, + "grad_norm": 0.250010222196579, + "learning_rate": 9.998726375504892e-05, + "loss": 1.5109, + "step": 41190 + }, + { + "epoch": 2.199759513607804, + "grad_norm": 0.2538135349750519, + "learning_rate": 9.998688417443815e-05, + "loss": 1.5127, + "step": 41200 + }, + { + "epoch": 2.2003014547733164, + "grad_norm": 0.25529077649116516, + "learning_rate": 9.998649902104748e-05, + "loss": 1.5056, + "step": 41210 + }, + { + "epoch": 2.2008433959388283, + "grad_norm": 0.3090484142303467, + "learning_rate": 9.998610829492462e-05, + "loss": 1.5212, + "step": 41220 + }, + { + "epoch": 2.2013853371043406, + "grad_norm": 0.2540747821331024, + "learning_rate": 9.998571199611799e-05, + "loss": 1.5252, + "step": 41230 + }, + { + "epoch": 2.2018188900367504, + "eval_loss": 2.49056077003479, + "eval_runtime": 21.9807, + "eval_samples_per_second": 227.473, + "eval_steps_per_second": 1.228, + "step": 41238 + }, + { + "epoch": 2.201927278269853, + "grad_norm": 0.35519057512283325, + "learning_rate": 9.998531012467664e-05, + "loss": 1.5082, + "step": 41240 + }, + { + "epoch": 2.202469219435365, + "grad_norm": 0.24406082928180695, + "learning_rate": 9.998490268065038e-05, + "loss": 1.5048, + "step": 41250 + }, + { + "epoch": 2.203011160600877, + "grad_norm": 0.3896248936653137, + "learning_rate": 9.998448966408971e-05, + "loss": 1.4991, + "step": 41260 + }, + { + "epoch": 2.2035531017663894, + "grad_norm": 0.278812974691391, + "learning_rate": 9.998407107504575e-05, + "loss": 1.5014, + "step": 41270 + }, + { + "epoch": 2.2040950429319017, + "grad_norm": 0.5823280811309814, + "learning_rate": 9.998364691357036e-05, + "loss": 1.5131, + "step": 41280 + }, + { + "epoch": 2.204636984097414, + "grad_norm": 0.3981192111968994, + "learning_rate": 9.998321717971609e-05, + "loss": 1.5001, + "step": 41290 + }, + { + "epoch": 2.205178925262926, + "grad_norm": 0.6286061406135559, + "learning_rate": 9.998278187353616e-05, + "loss": 1.5109, + "step": 41300 + }, + { + "epoch": 2.2057208664284382, + "grad_norm": 0.2553674280643463, + "learning_rate": 9.998234099508454e-05, + "loss": 1.5113, + "step": 41310 + }, + { + "epoch": 2.2062628075939505, + "grad_norm": 0.48403024673461914, + "learning_rate": 9.998189454441579e-05, + "loss": 1.5138, + "step": 41320 + }, + { + "epoch": 2.2065337781767065, + "eval_loss": 2.478606700897217, + "eval_runtime": 21.9762, + "eval_samples_per_second": 227.519, + "eval_steps_per_second": 1.229, + "step": 41325 + }, + { + "epoch": 2.206804748759463, + "grad_norm": 0.3329886794090271, + "learning_rate": 9.998144252158523e-05, + "loss": 1.5122, + "step": 41330 + }, + { + "epoch": 2.207346689924975, + "grad_norm": 0.2454531043767929, + "learning_rate": 9.998098492664888e-05, + "loss": 1.5094, + "step": 41340 + }, + { + "epoch": 2.207888631090487, + "grad_norm": 0.3834342360496521, + "learning_rate": 9.998052175966339e-05, + "loss": 1.5042, + "step": 41350 + }, + { + "epoch": 2.2084305722559994, + "grad_norm": 0.27599623799324036, + "learning_rate": 9.998005302068616e-05, + "loss": 1.5139, + "step": 41360 + }, + { + "epoch": 2.2089725134215117, + "grad_norm": 0.2955540716648102, + "learning_rate": 9.997957870977525e-05, + "loss": 1.5212, + "step": 41370 + }, + { + "epoch": 2.209514454587024, + "grad_norm": 0.25166115164756775, + "learning_rate": 9.99790988269894e-05, + "loss": 1.5021, + "step": 41380 + }, + { + "epoch": 2.2100563957525363, + "grad_norm": 0.3993496298789978, + "learning_rate": 9.997861337238807e-05, + "loss": 1.4986, + "step": 41390 + }, + { + "epoch": 2.210598336918048, + "grad_norm": 0.3200553059577942, + "learning_rate": 9.99781223460314e-05, + "loss": 1.5055, + "step": 41400 + }, + { + "epoch": 2.2111402780835605, + "grad_norm": 0.3932972550392151, + "learning_rate": 9.99776257479802e-05, + "loss": 1.5082, + "step": 41410 + }, + { + "epoch": 2.211248666316663, + "eval_loss": 2.4762344360351562, + "eval_runtime": 21.9757, + "eval_samples_per_second": 227.524, + "eval_steps_per_second": 1.229, + "step": 41412 + }, + { + "epoch": 2.211682219249073, + "grad_norm": 0.22166556119918823, + "learning_rate": 9.997712357829599e-05, + "loss": 1.5199, + "step": 41420 + }, + { + "epoch": 2.212224160414585, + "grad_norm": 0.4338929057121277, + "learning_rate": 9.997661583704098e-05, + "loss": 1.51, + "step": 41430 + }, + { + "epoch": 2.212766101580097, + "grad_norm": 0.2761380672454834, + "learning_rate": 9.99761025242781e-05, + "loss": 1.524, + "step": 41440 + }, + { + "epoch": 2.2133080427456093, + "grad_norm": 0.2140953242778778, + "learning_rate": 9.997558364007087e-05, + "loss": 1.5159, + "step": 41450 + }, + { + "epoch": 2.2138499839111216, + "grad_norm": 0.2510915696620941, + "learning_rate": 9.997505918448364e-05, + "loss": 1.5055, + "step": 41460 + }, + { + "epoch": 2.214391925076634, + "grad_norm": 0.38897550106048584, + "learning_rate": 9.997452915758131e-05, + "loss": 1.4971, + "step": 41470 + }, + { + "epoch": 2.2149338662421463, + "grad_norm": 0.29425960779190063, + "learning_rate": 9.997399355942958e-05, + "loss": 1.5033, + "step": 41480 + }, + { + "epoch": 2.215475807407658, + "grad_norm": 0.43580734729766846, + "learning_rate": 9.997345239009477e-05, + "loss": 1.5049, + "step": 41490 + }, + { + "epoch": 2.215963554456619, + "eval_loss": 2.4717535972595215, + "eval_runtime": 22.0352, + "eval_samples_per_second": 226.909, + "eval_steps_per_second": 1.225, + "step": 41499 + }, + { + "epoch": 2.2160177485731705, + "grad_norm": 0.25963518023490906, + "learning_rate": 9.997290564964395e-05, + "loss": 1.4924, + "step": 41500 + }, + { + "epoch": 2.216559689738683, + "grad_norm": 0.23817171156406403, + "learning_rate": 9.997235333814482e-05, + "loss": 1.497, + "step": 41510 + }, + { + "epoch": 2.217101630904195, + "grad_norm": 0.19800572097301483, + "learning_rate": 9.997179545566582e-05, + "loss": 1.5142, + "step": 41520 + }, + { + "epoch": 2.217643572069707, + "grad_norm": 0.21633438766002655, + "learning_rate": 9.997123200227606e-05, + "loss": 1.5033, + "step": 41530 + }, + { + "epoch": 2.2181855132352193, + "grad_norm": 0.24905604124069214, + "learning_rate": 9.99706629780453e-05, + "loss": 1.4983, + "step": 41540 + }, + { + "epoch": 2.2187274544007316, + "grad_norm": 0.47959622740745544, + "learning_rate": 9.997008838304404e-05, + "loss": 1.5122, + "step": 41550 + }, + { + "epoch": 2.219269395566244, + "grad_norm": 0.5474976301193237, + "learning_rate": 9.996950821734351e-05, + "loss": 1.5153, + "step": 41560 + }, + { + "epoch": 2.2198113367317562, + "grad_norm": 0.38861730694770813, + "learning_rate": 9.996892248101553e-05, + "loss": 1.5109, + "step": 41570 + }, + { + "epoch": 2.220353277897268, + "grad_norm": 0.48361384868621826, + "learning_rate": 9.996833117413266e-05, + "loss": 1.503, + "step": 41580 + }, + { + "epoch": 2.2206784425965758, + "eval_loss": 2.471097946166992, + "eval_runtime": 21.9727, + "eval_samples_per_second": 227.555, + "eval_steps_per_second": 1.229, + "step": 41586 + }, + { + "epoch": 2.2208952190627804, + "grad_norm": 0.24642570316791534, + "learning_rate": 9.996773429676815e-05, + "loss": 1.5177, + "step": 41590 + }, + { + "epoch": 2.2214371602282927, + "grad_norm": 0.2803477942943573, + "learning_rate": 9.996713184899595e-05, + "loss": 1.514, + "step": 41600 + }, + { + "epoch": 2.221979101393805, + "grad_norm": 0.555143415927887, + "learning_rate": 9.996652383089069e-05, + "loss": 1.5011, + "step": 41610 + }, + { + "epoch": 2.2225210425593174, + "grad_norm": 0.2646414041519165, + "learning_rate": 9.996591024252768e-05, + "loss": 1.5026, + "step": 41620 + }, + { + "epoch": 2.2230629837248292, + "grad_norm": 0.5564790964126587, + "learning_rate": 9.996529108398294e-05, + "loss": 1.5104, + "step": 41630 + }, + { + "epoch": 2.2236049248903416, + "grad_norm": 0.5015528798103333, + "learning_rate": 9.996466635533316e-05, + "loss": 1.5151, + "step": 41640 + }, + { + "epoch": 2.224146866055854, + "grad_norm": 0.3904198408126831, + "learning_rate": 9.996403605665572e-05, + "loss": 1.5045, + "step": 41650 + }, + { + "epoch": 2.224688807221366, + "grad_norm": 0.27617716789245605, + "learning_rate": 9.996340018802872e-05, + "loss": 1.5048, + "step": 41660 + }, + { + "epoch": 2.225230748386878, + "grad_norm": 0.2563170790672302, + "learning_rate": 9.99627587495309e-05, + "loss": 1.507, + "step": 41670 + }, + { + "epoch": 2.225393330736532, + "eval_loss": 2.475794553756714, + "eval_runtime": 21.9846, + "eval_samples_per_second": 227.432, + "eval_steps_per_second": 1.228, + "step": 41673 + }, + { + "epoch": 2.2257726895523904, + "grad_norm": 0.20860429108142853, + "learning_rate": 9.996211174124174e-05, + "loss": 1.5048, + "step": 41680 + }, + { + "epoch": 2.2263146307179027, + "grad_norm": 0.2962222397327423, + "learning_rate": 9.996145916324138e-05, + "loss": 1.4984, + "step": 41690 + }, + { + "epoch": 2.226856571883415, + "grad_norm": 0.285791277885437, + "learning_rate": 9.996080101561066e-05, + "loss": 1.5111, + "step": 41700 + }, + { + "epoch": 2.2273985130489273, + "grad_norm": 0.2907065451145172, + "learning_rate": 9.996013729843113e-05, + "loss": 1.5012, + "step": 41710 + }, + { + "epoch": 2.227940454214439, + "grad_norm": 0.3030034303665161, + "learning_rate": 9.995946801178498e-05, + "loss": 1.5042, + "step": 41720 + }, + { + "epoch": 2.2284823953799515, + "grad_norm": 0.3327134847640991, + "learning_rate": 9.995879315575511e-05, + "loss": 1.5058, + "step": 41730 + }, + { + "epoch": 2.229024336545464, + "grad_norm": 0.2516813576221466, + "learning_rate": 9.995811273042515e-05, + "loss": 1.5054, + "step": 41740 + }, + { + "epoch": 2.229566277710976, + "grad_norm": 0.43475955724716187, + "learning_rate": 9.995742673587937e-05, + "loss": 1.5044, + "step": 41750 + }, + { + "epoch": 2.230108218876488, + "grad_norm": 0.4820566773414612, + "learning_rate": 9.995673517220277e-05, + "loss": 1.4987, + "step": 41760 + }, + { + "epoch": 2.230108218876488, + "eval_loss": 2.476296901702881, + "eval_runtime": 21.9717, + "eval_samples_per_second": 227.565, + "eval_steps_per_second": 1.229, + "step": 41760 + }, + { + "epoch": 2.2306501600420003, + "grad_norm": 0.503626823425293, + "learning_rate": 9.995603803948098e-05, + "loss": 1.5076, + "step": 41770 + }, + { + "epoch": 2.2311921012075127, + "grad_norm": 0.3283330500125885, + "learning_rate": 9.995533533780038e-05, + "loss": 1.5055, + "step": 41780 + }, + { + "epoch": 2.231734042373025, + "grad_norm": 0.5820431113243103, + "learning_rate": 9.9954627067248e-05, + "loss": 1.5118, + "step": 41790 + }, + { + "epoch": 2.2322759835385373, + "grad_norm": 0.29778963327407837, + "learning_rate": 9.995391322791162e-05, + "loss": 1.5087, + "step": 41800 + }, + { + "epoch": 2.232817924704049, + "grad_norm": 0.4030333459377289, + "learning_rate": 9.995319381987962e-05, + "loss": 1.4998, + "step": 41810 + }, + { + "epoch": 2.2333598658695615, + "grad_norm": 0.22505435347557068, + "learning_rate": 9.995246884324115e-05, + "loss": 1.5041, + "step": 41820 + }, + { + "epoch": 2.233901807035074, + "grad_norm": 0.4648926258087158, + "learning_rate": 9.995173829808599e-05, + "loss": 1.4911, + "step": 41830 + }, + { + "epoch": 2.234443748200586, + "grad_norm": 0.2858235239982605, + "learning_rate": 9.995100218450467e-05, + "loss": 1.5019, + "step": 41840 + }, + { + "epoch": 2.2348231070164446, + "eval_loss": 2.4754035472869873, + "eval_runtime": 21.9771, + "eval_samples_per_second": 227.51, + "eval_steps_per_second": 1.229, + "step": 41847 + }, + { + "epoch": 2.2349856893660984, + "grad_norm": 0.48889511823654175, + "learning_rate": 9.995026050258835e-05, + "loss": 1.5035, + "step": 41850 + }, + { + "epoch": 2.2355276305316103, + "grad_norm": 0.32360124588012695, + "learning_rate": 9.994951325242891e-05, + "loss": 1.4995, + "step": 41860 + }, + { + "epoch": 2.2360695716971226, + "grad_norm": 0.3390207588672638, + "learning_rate": 9.994876043411891e-05, + "loss": 1.5077, + "step": 41870 + }, + { + "epoch": 2.236611512862635, + "grad_norm": 0.3765084743499756, + "learning_rate": 9.994800204775163e-05, + "loss": 1.5077, + "step": 41880 + }, + { + "epoch": 2.2371534540281472, + "grad_norm": 0.26570597290992737, + "learning_rate": 9.994723809342099e-05, + "loss": 1.5082, + "step": 41890 + }, + { + "epoch": 2.237695395193659, + "grad_norm": 0.2624066472053528, + "learning_rate": 9.994646857122165e-05, + "loss": 1.5153, + "step": 41900 + }, + { + "epoch": 2.2382373363591714, + "grad_norm": 0.5075863003730774, + "learning_rate": 9.994569348124891e-05, + "loss": 1.5177, + "step": 41910 + }, + { + "epoch": 2.2387792775246838, + "grad_norm": 0.2840348184108734, + "learning_rate": 9.99449128235988e-05, + "loss": 1.507, + "step": 41920 + }, + { + "epoch": 2.239321218690196, + "grad_norm": 0.23727837204933167, + "learning_rate": 9.994412659836802e-05, + "loss": 1.5062, + "step": 41930 + }, + { + "epoch": 2.2395379951564007, + "eval_loss": 2.478437662124634, + "eval_runtime": 21.9787, + "eval_samples_per_second": 227.493, + "eval_steps_per_second": 1.228, + "step": 41934 + }, + { + "epoch": 2.239863159855708, + "grad_norm": 0.5313767790794373, + "learning_rate": 9.994333480565397e-05, + "loss": 1.5117, + "step": 41940 + }, + { + "epoch": 2.2404051010212203, + "grad_norm": 0.28299862146377563, + "learning_rate": 9.994253744555473e-05, + "loss": 1.5064, + "step": 41950 + }, + { + "epoch": 2.2409470421867326, + "grad_norm": 0.2567594349384308, + "learning_rate": 9.994173451816906e-05, + "loss": 1.5007, + "step": 41960 + }, + { + "epoch": 2.241488983352245, + "grad_norm": 0.24751020967960358, + "learning_rate": 9.994092602359645e-05, + "loss": 1.5152, + "step": 41970 + }, + { + "epoch": 2.242030924517757, + "grad_norm": 0.5088745951652527, + "learning_rate": 9.994011196193702e-05, + "loss": 1.5042, + "step": 41980 + }, + { + "epoch": 2.242572865683269, + "grad_norm": 0.2354600876569748, + "learning_rate": 9.993929233329164e-05, + "loss": 1.5111, + "step": 41990 + }, + { + "epoch": 2.2431148068487814, + "grad_norm": 0.22222284972667694, + "learning_rate": 9.993846713776183e-05, + "loss": 1.5087, + "step": 42000 + }, + { + "epoch": 2.2436567480142937, + "grad_norm": 0.20365022122859955, + "learning_rate": 9.993763637544983e-05, + "loss": 1.5099, + "step": 42010 + }, + { + "epoch": 2.244198689179806, + "grad_norm": 0.2721654176712036, + "learning_rate": 9.993680004645851e-05, + "loss": 1.5031, + "step": 42020 + }, + { + "epoch": 2.2442528832963573, + "eval_loss": 2.4772350788116455, + "eval_runtime": 21.9795, + "eval_samples_per_second": 227.485, + "eval_steps_per_second": 1.228, + "step": 42021 + }, + { + "epoch": 2.2447406303453183, + "grad_norm": 0.3124418556690216, + "learning_rate": 9.993595815089151e-05, + "loss": 1.503, + "step": 42030 + }, + { + "epoch": 2.24528257151083, + "grad_norm": 0.304231196641922, + "learning_rate": 9.993511068885311e-05, + "loss": 1.4958, + "step": 42040 + }, + { + "epoch": 2.2458245126763425, + "grad_norm": 0.23490315675735474, + "learning_rate": 9.99342576604483e-05, + "loss": 1.5052, + "step": 42050 + }, + { + "epoch": 2.246366453841855, + "grad_norm": 0.5602021217346191, + "learning_rate": 9.99333990657827e-05, + "loss": 1.5007, + "step": 42060 + }, + { + "epoch": 2.246908395007367, + "grad_norm": 0.25727346539497375, + "learning_rate": 9.993253490496272e-05, + "loss": 1.5107, + "step": 42070 + }, + { + "epoch": 2.247450336172879, + "grad_norm": 0.2954048216342926, + "learning_rate": 9.99316651780954e-05, + "loss": 1.5127, + "step": 42080 + }, + { + "epoch": 2.2479922773383914, + "grad_norm": 0.23384593427181244, + "learning_rate": 9.993078988528848e-05, + "loss": 1.5024, + "step": 42090 + }, + { + "epoch": 2.2485342185039037, + "grad_norm": 0.2479962408542633, + "learning_rate": 9.992990902665037e-05, + "loss": 1.5007, + "step": 42100 + }, + { + "epoch": 2.2489677714363134, + "eval_loss": 2.471235990524292, + "eval_runtime": 21.9804, + "eval_samples_per_second": 227.476, + "eval_steps_per_second": 1.228, + "step": 42108 + }, + { + "epoch": 2.249076159669416, + "grad_norm": 0.26295939087867737, + "learning_rate": 9.99290226022902e-05, + "loss": 1.5132, + "step": 42110 + }, + { + "epoch": 2.2496181008349283, + "grad_norm": 0.6523755192756653, + "learning_rate": 9.992813061231775e-05, + "loss": 1.5043, + "step": 42120 + }, + { + "epoch": 2.25016004200044, + "grad_norm": 0.3719354569911957, + "learning_rate": 9.992723305684357e-05, + "loss": 1.5014, + "step": 42130 + }, + { + "epoch": 2.2507019831659525, + "grad_norm": 0.2825568616390228, + "learning_rate": 9.99263299359788e-05, + "loss": 1.5012, + "step": 42140 + }, + { + "epoch": 2.251243924331465, + "grad_norm": 0.4288165867328644, + "learning_rate": 9.992542124983531e-05, + "loss": 1.5053, + "step": 42150 + }, + { + "epoch": 2.251785865496977, + "grad_norm": 0.22682829201221466, + "learning_rate": 9.992450699852571e-05, + "loss": 1.501, + "step": 42160 + }, + { + "epoch": 2.252327806662489, + "grad_norm": 0.409411758184433, + "learning_rate": 9.992358718216321e-05, + "loss": 1.5048, + "step": 42170 + }, + { + "epoch": 2.2528697478280013, + "grad_norm": 0.6447529792785645, + "learning_rate": 9.992266180086178e-05, + "loss": 1.5141, + "step": 42180 + }, + { + "epoch": 2.2534116889935136, + "grad_norm": 0.35520994663238525, + "learning_rate": 9.992173085473603e-05, + "loss": 1.5043, + "step": 42190 + }, + { + "epoch": 2.2536826595762696, + "eval_loss": 2.463776111602783, + "eval_runtime": 21.9731, + "eval_samples_per_second": 227.551, + "eval_steps_per_second": 1.229, + "step": 42195 + }, + { + "epoch": 2.253953630159026, + "grad_norm": 0.24560990929603577, + "learning_rate": 9.992079434390129e-05, + "loss": 1.5081, + "step": 42200 + }, + { + "epoch": 2.2544955713245383, + "grad_norm": 0.41237837076187134, + "learning_rate": 9.991985226847358e-05, + "loss": 1.5058, + "step": 42210 + }, + { + "epoch": 2.25503751249005, + "grad_norm": 0.2179434448480606, + "learning_rate": 9.99189046285696e-05, + "loss": 1.5061, + "step": 42220 + }, + { + "epoch": 2.2555794536555624, + "grad_norm": 0.2224990427494049, + "learning_rate": 9.991795142430672e-05, + "loss": 1.4924, + "step": 42230 + }, + { + "epoch": 2.2561213948210748, + "grad_norm": 0.3237124979496002, + "learning_rate": 9.991699265580304e-05, + "loss": 1.5058, + "step": 42240 + }, + { + "epoch": 2.256663335986587, + "grad_norm": 0.2218666821718216, + "learning_rate": 9.991602832317731e-05, + "loss": 1.4995, + "step": 42250 + }, + { + "epoch": 2.2572052771520994, + "grad_norm": 0.38904523849487305, + "learning_rate": 9.991505842654901e-05, + "loss": 1.503, + "step": 42260 + }, + { + "epoch": 2.2577472183176113, + "grad_norm": 0.341505765914917, + "learning_rate": 9.991408296603827e-05, + "loss": 1.5107, + "step": 42270 + }, + { + "epoch": 2.2582891594831236, + "grad_norm": 0.242193341255188, + "learning_rate": 9.991310194176593e-05, + "loss": 1.495, + "step": 42280 + }, + { + "epoch": 2.258397547716226, + "eval_loss": 2.4719505310058594, + "eval_runtime": 21.9979, + "eval_samples_per_second": 227.294, + "eval_steps_per_second": 1.227, + "step": 42282 + }, + { + "epoch": 2.258831100648636, + "grad_norm": 0.26852932572364807, + "learning_rate": 9.991211535385352e-05, + "loss": 1.504, + "step": 42290 + }, + { + "epoch": 2.259373041814148, + "grad_norm": 0.32731330394744873, + "learning_rate": 9.991112320242322e-05, + "loss": 1.4927, + "step": 42300 + }, + { + "epoch": 2.25991498297966, + "grad_norm": 0.30170223116874695, + "learning_rate": 9.991012548759801e-05, + "loss": 1.5, + "step": 42310 + }, + { + "epoch": 2.2604569241451724, + "grad_norm": 0.2903915345668793, + "learning_rate": 9.990912220950141e-05, + "loss": 1.4963, + "step": 42320 + }, + { + "epoch": 2.2609988653106847, + "grad_norm": 0.2501979470252991, + "learning_rate": 9.990811336825771e-05, + "loss": 1.5035, + "step": 42330 + }, + { + "epoch": 2.261540806476197, + "grad_norm": 0.3588942289352417, + "learning_rate": 9.990709896399191e-05, + "loss": 1.506, + "step": 42340 + }, + { + "epoch": 2.262082747641709, + "grad_norm": 0.2870718240737915, + "learning_rate": 9.990607899682967e-05, + "loss": 1.4929, + "step": 42350 + }, + { + "epoch": 2.2626246888072212, + "grad_norm": 0.2925523817539215, + "learning_rate": 9.990505346689732e-05, + "loss": 1.5084, + "step": 42360 + }, + { + "epoch": 2.2631124358561823, + "eval_loss": 2.4687459468841553, + "eval_runtime": 22.2759, + "eval_samples_per_second": 224.458, + "eval_steps_per_second": 1.212, + "step": 42369 + }, + { + "epoch": 2.2631666299727335, + "grad_norm": 0.28121358156204224, + "learning_rate": 9.99040223743219e-05, + "loss": 1.5021, + "step": 42370 + }, + { + "epoch": 2.263708571138246, + "grad_norm": 0.40164533257484436, + "learning_rate": 9.990298571923114e-05, + "loss": 1.4933, + "step": 42380 + }, + { + "epoch": 2.264250512303758, + "grad_norm": 0.1876879632472992, + "learning_rate": 9.990194350175346e-05, + "loss": 1.5118, + "step": 42390 + }, + { + "epoch": 2.26479245346927, + "grad_norm": 0.3837462067604065, + "learning_rate": 9.990089572201796e-05, + "loss": 1.4987, + "step": 42400 + }, + { + "epoch": 2.2653343946347824, + "grad_norm": 0.3707353472709656, + "learning_rate": 9.989984238015445e-05, + "loss": 1.5162, + "step": 42410 + }, + { + "epoch": 2.2658763358002947, + "grad_norm": 0.5873448252677917, + "learning_rate": 9.98987834762934e-05, + "loss": 1.4921, + "step": 42420 + }, + { + "epoch": 2.266418276965807, + "grad_norm": 0.43937060236930847, + "learning_rate": 9.989771901056598e-05, + "loss": 1.5083, + "step": 42430 + }, + { + "epoch": 2.2669602181313193, + "grad_norm": 0.3484452962875366, + "learning_rate": 9.989664898310405e-05, + "loss": 1.5054, + "step": 42440 + }, + { + "epoch": 2.267502159296831, + "grad_norm": 0.3318488597869873, + "learning_rate": 9.98955733940402e-05, + "loss": 1.4976, + "step": 42450 + }, + { + "epoch": 2.267827323996139, + "eval_loss": 2.4685611724853516, + "eval_runtime": 22.0052, + "eval_samples_per_second": 227.219, + "eval_steps_per_second": 1.227, + "step": 42456 + }, + { + "epoch": 2.2680441004623435, + "grad_norm": 0.3194591999053955, + "learning_rate": 9.989449224350758e-05, + "loss": 1.504, + "step": 42460 + }, + { + "epoch": 2.268586041627856, + "grad_norm": 0.2176416665315628, + "learning_rate": 9.989340553164021e-05, + "loss": 1.5006, + "step": 42470 + }, + { + "epoch": 2.269127982793368, + "grad_norm": 0.2666696608066559, + "learning_rate": 9.989231325857266e-05, + "loss": 1.5008, + "step": 42480 + }, + { + "epoch": 2.2696699239588805, + "grad_norm": 0.5212827324867249, + "learning_rate": 9.989121542444023e-05, + "loss": 1.5013, + "step": 42490 + }, + { + "epoch": 2.2702118651243923, + "grad_norm": 0.33473333716392517, + "learning_rate": 9.989011202937896e-05, + "loss": 1.5156, + "step": 42500 + }, + { + "epoch": 2.2707538062899046, + "grad_norm": 0.46315068006515503, + "learning_rate": 9.988900307352549e-05, + "loss": 1.4865, + "step": 42510 + }, + { + "epoch": 2.271295747455417, + "grad_norm": 0.3576599657535553, + "learning_rate": 9.98878885570172e-05, + "loss": 1.5077, + "step": 42520 + }, + { + "epoch": 2.2718376886209293, + "grad_norm": 0.22602027654647827, + "learning_rate": 9.988676847999218e-05, + "loss": 1.5185, + "step": 42530 + }, + { + "epoch": 2.272379629786441, + "grad_norm": 0.2150840014219284, + "learning_rate": 9.988564284258916e-05, + "loss": 1.5062, + "step": 42540 + }, + { + "epoch": 2.272542212136095, + "eval_loss": 2.4602785110473633, + "eval_runtime": 21.997, + "eval_samples_per_second": 227.304, + "eval_steps_per_second": 1.227, + "step": 42543 + }, + { + "epoch": 2.2729215709519535, + "grad_norm": 0.3397773206233978, + "learning_rate": 9.988451164494757e-05, + "loss": 1.5026, + "step": 42550 + }, + { + "epoch": 2.273463512117466, + "grad_norm": 0.213679701089859, + "learning_rate": 9.988337488720753e-05, + "loss": 1.5014, + "step": 42560 + }, + { + "epoch": 2.274005453282978, + "grad_norm": 0.21069695055484772, + "learning_rate": 9.98822325695099e-05, + "loss": 1.5098, + "step": 42570 + }, + { + "epoch": 2.27454739444849, + "grad_norm": 0.38944995403289795, + "learning_rate": 9.988108469199613e-05, + "loss": 1.496, + "step": 42580 + }, + { + "epoch": 2.2750893356140023, + "grad_norm": 0.36275213956832886, + "learning_rate": 9.987993125480848e-05, + "loss": 1.5009, + "step": 42590 + }, + { + "epoch": 2.2756312767795146, + "grad_norm": 0.27559274435043335, + "learning_rate": 9.987877225808976e-05, + "loss": 1.4994, + "step": 42600 + }, + { + "epoch": 2.276173217945027, + "grad_norm": 0.365401029586792, + "learning_rate": 9.987760770198359e-05, + "loss": 1.5061, + "step": 42610 + }, + { + "epoch": 2.2767151591105392, + "grad_norm": 0.29308363795280457, + "learning_rate": 9.987643758663422e-05, + "loss": 1.4963, + "step": 42620 + }, + { + "epoch": 2.277257100276051, + "grad_norm": 0.43470004200935364, + "learning_rate": 9.98752619121866e-05, + "loss": 1.496, + "step": 42630 + }, + { + "epoch": 2.277257100276051, + "eval_loss": 2.4641952514648438, + "eval_runtime": 22.0028, + "eval_samples_per_second": 227.243, + "eval_steps_per_second": 1.227, + "step": 42630 + }, + { + "epoch": 2.2777990414415634, + "grad_norm": 0.21174076199531555, + "learning_rate": 9.987408067878636e-05, + "loss": 1.5095, + "step": 42640 + }, + { + "epoch": 2.2783409826070757, + "grad_norm": 0.4398151934146881, + "learning_rate": 9.987289388657982e-05, + "loss": 1.4995, + "step": 42650 + }, + { + "epoch": 2.278882923772588, + "grad_norm": 0.41599544882774353, + "learning_rate": 9.987170153571402e-05, + "loss": 1.4949, + "step": 42660 + }, + { + "epoch": 2.2794248649381004, + "grad_norm": 0.34385424852371216, + "learning_rate": 9.987050362633663e-05, + "loss": 1.5031, + "step": 42670 + }, + { + "epoch": 2.2799668061036122, + "grad_norm": 0.242776557803154, + "learning_rate": 9.986930015859607e-05, + "loss": 1.4902, + "step": 42680 + }, + { + "epoch": 2.2805087472691246, + "grad_norm": 0.4597446322441101, + "learning_rate": 9.986809113264142e-05, + "loss": 1.5039, + "step": 42690 + }, + { + "epoch": 2.281050688434637, + "grad_norm": 0.25635120272636414, + "learning_rate": 9.986687654862242e-05, + "loss": 1.5071, + "step": 42700 + }, + { + "epoch": 2.281592629600149, + "grad_norm": 0.23346717655658722, + "learning_rate": 9.986565640668955e-05, + "loss": 1.5025, + "step": 42710 + }, + { + "epoch": 2.2819719884160077, + "eval_loss": 2.4699313640594482, + "eval_runtime": 22.0353, + "eval_samples_per_second": 226.908, + "eval_steps_per_second": 1.225, + "step": 42717 + }, + { + "epoch": 2.282134570765661, + "grad_norm": 0.4288314878940582, + "learning_rate": 9.986443070699393e-05, + "loss": 1.5069, + "step": 42720 + }, + { + "epoch": 2.2826765119311734, + "grad_norm": 0.3048160672187805, + "learning_rate": 9.986319944968745e-05, + "loss": 1.5084, + "step": 42730 + }, + { + "epoch": 2.2832184530966857, + "grad_norm": 0.2685145139694214, + "learning_rate": 9.986196263492259e-05, + "loss": 1.5078, + "step": 42740 + }, + { + "epoch": 2.283760394262198, + "grad_norm": 0.36465343832969666, + "learning_rate": 9.986072026285257e-05, + "loss": 1.4972, + "step": 42750 + }, + { + "epoch": 2.28430233542771, + "grad_norm": 0.27710703015327454, + "learning_rate": 9.985947233363128e-05, + "loss": 1.4929, + "step": 42760 + }, + { + "epoch": 2.284844276593222, + "grad_norm": 0.5926934480667114, + "learning_rate": 9.985821884741333e-05, + "loss": 1.5015, + "step": 42770 + }, + { + "epoch": 2.2853862177587345, + "grad_norm": 0.34573689103126526, + "learning_rate": 9.985695980435398e-05, + "loss": 1.5047, + "step": 42780 + }, + { + "epoch": 2.285928158924247, + "grad_norm": 0.33093443512916565, + "learning_rate": 9.985569520460918e-05, + "loss": 1.4997, + "step": 42790 + }, + { + "epoch": 2.286470100089759, + "grad_norm": 0.5260155200958252, + "learning_rate": 9.985442504833562e-05, + "loss": 1.5034, + "step": 42800 + }, + { + "epoch": 2.286686876555964, + "eval_loss": 2.4743220806121826, + "eval_runtime": 22.0148, + "eval_samples_per_second": 227.12, + "eval_steps_per_second": 1.226, + "step": 42804 + }, + { + "epoch": 2.287012041255271, + "grad_norm": 0.22900910675525665, + "learning_rate": 9.985314933569063e-05, + "loss": 1.5057, + "step": 42810 + }, + { + "epoch": 2.2875539824207833, + "grad_norm": 0.47526514530181885, + "learning_rate": 9.985186806683222e-05, + "loss": 1.4987, + "step": 42820 + }, + { + "epoch": 2.2880959235862957, + "grad_norm": 0.2705422043800354, + "learning_rate": 9.985058124191914e-05, + "loss": 1.5031, + "step": 42830 + }, + { + "epoch": 2.288637864751808, + "grad_norm": 0.21489684283733368, + "learning_rate": 9.984928886111076e-05, + "loss": 1.5001, + "step": 42840 + }, + { + "epoch": 2.2891798059173203, + "grad_norm": 0.44066211581230164, + "learning_rate": 9.98479909245672e-05, + "loss": 1.5006, + "step": 42850 + }, + { + "epoch": 2.289721747082832, + "grad_norm": 0.4437292516231537, + "learning_rate": 9.984668743244921e-05, + "loss": 1.5121, + "step": 42860 + }, + { + "epoch": 2.2902636882483445, + "grad_norm": 0.41992589831352234, + "learning_rate": 9.984537838491833e-05, + "loss": 1.4998, + "step": 42870 + }, + { + "epoch": 2.290805629413857, + "grad_norm": 0.26407718658447266, + "learning_rate": 9.984406378213664e-05, + "loss": 1.5009, + "step": 42880 + }, + { + "epoch": 2.291347570579369, + "grad_norm": 0.35871899127960205, + "learning_rate": 9.984274362426703e-05, + "loss": 1.5003, + "step": 42890 + }, + { + "epoch": 2.2914017646959204, + "eval_loss": 2.4577038288116455, + "eval_runtime": 22.0057, + "eval_samples_per_second": 227.214, + "eval_steps_per_second": 1.227, + "step": 42891 + }, + { + "epoch": 2.2918895117448814, + "grad_norm": 0.38924890756607056, + "learning_rate": 9.984141791147303e-05, + "loss": 1.5098, + "step": 42900 + }, + { + "epoch": 2.2924314529103933, + "grad_norm": 0.46913039684295654, + "learning_rate": 9.984008664391888e-05, + "loss": 1.5006, + "step": 42910 + }, + { + "epoch": 2.2929733940759056, + "grad_norm": 0.6839142441749573, + "learning_rate": 9.983874982176944e-05, + "loss": 1.5011, + "step": 42920 + }, + { + "epoch": 2.293515335241418, + "grad_norm": 0.37098947167396545, + "learning_rate": 9.983740744519037e-05, + "loss": 1.5078, + "step": 42930 + }, + { + "epoch": 2.2940572764069302, + "grad_norm": 0.4213247001171112, + "learning_rate": 9.983605951434792e-05, + "loss": 1.4979, + "step": 42940 + }, + { + "epoch": 2.294599217572442, + "grad_norm": 0.37451714277267456, + "learning_rate": 9.983470602940907e-05, + "loss": 1.5055, + "step": 42950 + }, + { + "epoch": 2.2951411587379544, + "grad_norm": 0.3335314989089966, + "learning_rate": 9.983334699054149e-05, + "loss": 1.4985, + "step": 42960 + }, + { + "epoch": 2.2956830999034668, + "grad_norm": 0.5755605697631836, + "learning_rate": 9.983198239791353e-05, + "loss": 1.5072, + "step": 42970 + }, + { + "epoch": 2.2961166528358765, + "eval_loss": 2.46994686126709, + "eval_runtime": 22.0001, + "eval_samples_per_second": 227.272, + "eval_steps_per_second": 1.227, + "step": 42978 + }, + { + "epoch": 2.296225041068979, + "grad_norm": 0.4055348038673401, + "learning_rate": 9.983061225169423e-05, + "loss": 1.514, + "step": 42980 + }, + { + "epoch": 2.296766982234491, + "grad_norm": 0.41009873151779175, + "learning_rate": 9.982923655205335e-05, + "loss": 1.4959, + "step": 42990 + }, + { + "epoch": 2.2973089234000033, + "grad_norm": 0.30407077074050903, + "learning_rate": 9.982785529916124e-05, + "loss": 1.4958, + "step": 43000 + }, + { + "epoch": 2.2978508645655156, + "grad_norm": 0.3343801498413086, + "learning_rate": 9.982646849318906e-05, + "loss": 1.5003, + "step": 43010 + }, + { + "epoch": 2.298392805731028, + "grad_norm": 0.4572197496891022, + "learning_rate": 9.982507613430856e-05, + "loss": 1.4998, + "step": 43020 + }, + { + "epoch": 2.29893474689654, + "grad_norm": 0.3537229895591736, + "learning_rate": 9.982367822269225e-05, + "loss": 1.5059, + "step": 43030 + }, + { + "epoch": 2.299476688062052, + "grad_norm": 0.4736888110637665, + "learning_rate": 9.982227475851328e-05, + "loss": 1.4988, + "step": 43040 + }, + { + "epoch": 2.3000186292275644, + "grad_norm": 0.32492560148239136, + "learning_rate": 9.982086574194552e-05, + "loss": 1.5023, + "step": 43050 + }, + { + "epoch": 2.3005605703930767, + "grad_norm": 0.19111861288547516, + "learning_rate": 9.98194511731635e-05, + "loss": 1.5075, + "step": 43060 + }, + { + "epoch": 2.3008315409758326, + "eval_loss": 2.4626097679138184, + "eval_runtime": 22.0143, + "eval_samples_per_second": 227.125, + "eval_steps_per_second": 1.226, + "step": 43065 + }, + { + "epoch": 2.301102511558589, + "grad_norm": 0.416725754737854, + "learning_rate": 9.981803105234246e-05, + "loss": 1.5009, + "step": 43070 + }, + { + "epoch": 2.3016444527241013, + "grad_norm": 0.5482866168022156, + "learning_rate": 9.981660537965833e-05, + "loss": 1.5044, + "step": 43080 + }, + { + "epoch": 2.302186393889613, + "grad_norm": 0.22884666919708252, + "learning_rate": 9.981517415528766e-05, + "loss": 1.4965, + "step": 43090 + }, + { + "epoch": 2.3027283350551255, + "grad_norm": 0.3131319284439087, + "learning_rate": 9.98137373794078e-05, + "loss": 1.5089, + "step": 43100 + }, + { + "epoch": 2.303270276220638, + "grad_norm": 0.22370652854442596, + "learning_rate": 9.981229505219673e-05, + "loss": 1.5051, + "step": 43110 + }, + { + "epoch": 2.30381221738615, + "grad_norm": 0.2758335471153259, + "learning_rate": 9.98108471738331e-05, + "loss": 1.5001, + "step": 43120 + }, + { + "epoch": 2.3043541585516625, + "grad_norm": 0.309836208820343, + "learning_rate": 9.980939374449627e-05, + "loss": 1.5022, + "step": 43130 + }, + { + "epoch": 2.3048960997171744, + "grad_norm": 0.2783421576023102, + "learning_rate": 9.980793476436628e-05, + "loss": 1.4996, + "step": 43140 + }, + { + "epoch": 2.3054380408826867, + "grad_norm": 0.26890337467193604, + "learning_rate": 9.980647023362388e-05, + "loss": 1.5025, + "step": 43150 + }, + { + "epoch": 2.305546429115789, + "eval_loss": 2.4663755893707275, + "eval_runtime": 21.9786, + "eval_samples_per_second": 227.494, + "eval_steps_per_second": 1.228, + "step": 43152 + }, + { + "epoch": 2.305979982048199, + "grad_norm": 0.26960644125938416, + "learning_rate": 9.980500015245045e-05, + "loss": 1.5067, + "step": 43160 + }, + { + "epoch": 2.3065219232137113, + "grad_norm": 0.32048168778419495, + "learning_rate": 9.980352452102815e-05, + "loss": 1.4944, + "step": 43170 + }, + { + "epoch": 2.307063864379223, + "grad_norm": 0.36742621660232544, + "learning_rate": 9.980204333953974e-05, + "loss": 1.4997, + "step": 43180 + }, + { + "epoch": 2.3076058055447355, + "grad_norm": 0.38924074172973633, + "learning_rate": 9.980055660816872e-05, + "loss": 1.5018, + "step": 43190 + }, + { + "epoch": 2.308147746710248, + "grad_norm": 0.7526839971542358, + "learning_rate": 9.979906432709925e-05, + "loss": 1.5036, + "step": 43200 + }, + { + "epoch": 2.30868968787576, + "grad_norm": 0.49918413162231445, + "learning_rate": 9.97975664965162e-05, + "loss": 1.5053, + "step": 43210 + }, + { + "epoch": 2.309231629041272, + "grad_norm": 0.21481403708457947, + "learning_rate": 9.979606311660506e-05, + "loss": 1.5091, + "step": 43220 + }, + { + "epoch": 2.3097735702067843, + "grad_norm": 0.26321297883987427, + "learning_rate": 9.979455418755215e-05, + "loss": 1.4983, + "step": 43230 + }, + { + "epoch": 2.3102613172557454, + "eval_loss": 2.472028970718384, + "eval_runtime": 21.9804, + "eval_samples_per_second": 227.476, + "eval_steps_per_second": 1.228, + "step": 43239 + }, + { + "epoch": 2.3103155113722966, + "grad_norm": 0.2163982391357422, + "learning_rate": 9.979303970954434e-05, + "loss": 1.5075, + "step": 43240 + }, + { + "epoch": 2.310857452537809, + "grad_norm": 0.2770186960697174, + "learning_rate": 9.979151968276922e-05, + "loss": 1.5033, + "step": 43250 + }, + { + "epoch": 2.3113993937033213, + "grad_norm": 0.3074730932712555, + "learning_rate": 9.978999410741514e-05, + "loss": 1.4884, + "step": 43260 + }, + { + "epoch": 2.311941334868833, + "grad_norm": 0.30169418454170227, + "learning_rate": 9.978846298367103e-05, + "loss": 1.4956, + "step": 43270 + }, + { + "epoch": 2.3124832760343454, + "grad_norm": 0.23095916211605072, + "learning_rate": 9.978692631172657e-05, + "loss": 1.4884, + "step": 43280 + }, + { + "epoch": 2.3130252171998578, + "grad_norm": 0.28143152594566345, + "learning_rate": 9.978538409177212e-05, + "loss": 1.494, + "step": 43290 + }, + { + "epoch": 2.31356715836537, + "grad_norm": 0.3327721953392029, + "learning_rate": 9.978383632399876e-05, + "loss": 1.5017, + "step": 43300 + }, + { + "epoch": 2.3141090995308824, + "grad_norm": 0.2959439754486084, + "learning_rate": 9.978228300859817e-05, + "loss": 1.4953, + "step": 43310 + }, + { + "epoch": 2.3146510406963943, + "grad_norm": 0.3569033741950989, + "learning_rate": 9.978072414576277e-05, + "loss": 1.509, + "step": 43320 + }, + { + "epoch": 2.314976205395702, + "eval_loss": 2.471214771270752, + "eval_runtime": 21.9774, + "eval_samples_per_second": 227.506, + "eval_steps_per_second": 1.229, + "step": 43326 + }, + { + "epoch": 2.3151929818619066, + "grad_norm": 0.37210530042648315, + "learning_rate": 9.97791597356857e-05, + "loss": 1.4907, + "step": 43330 + }, + { + "epoch": 2.315734923027419, + "grad_norm": 0.22837452590465546, + "learning_rate": 9.977758977856074e-05, + "loss": 1.494, + "step": 43340 + }, + { + "epoch": 2.316276864192931, + "grad_norm": 0.351477712392807, + "learning_rate": 9.977601427458235e-05, + "loss": 1.4953, + "step": 43350 + }, + { + "epoch": 2.316818805358443, + "grad_norm": 0.4676024913787842, + "learning_rate": 9.97744332239457e-05, + "loss": 1.5027, + "step": 43360 + }, + { + "epoch": 2.3173607465239554, + "grad_norm": 0.3546915054321289, + "learning_rate": 9.977284662684668e-05, + "loss": 1.509, + "step": 43370 + }, + { + "epoch": 2.3179026876894677, + "grad_norm": 0.45756760239601135, + "learning_rate": 9.977125448348178e-05, + "loss": 1.4956, + "step": 43380 + }, + { + "epoch": 2.31844462885498, + "grad_norm": 0.44374728202819824, + "learning_rate": 9.976965679404827e-05, + "loss": 1.4851, + "step": 43390 + }, + { + "epoch": 2.318986570020492, + "grad_norm": 0.2877369225025177, + "learning_rate": 9.976805355874404e-05, + "loss": 1.4979, + "step": 43400 + }, + { + "epoch": 2.3195285111860042, + "grad_norm": 0.4321652352809906, + "learning_rate": 9.976644477776768e-05, + "loss": 1.5, + "step": 43410 + }, + { + "epoch": 2.319691093535658, + "eval_loss": 2.4682838916778564, + "eval_runtime": 21.9705, + "eval_samples_per_second": 227.578, + "eval_steps_per_second": 1.229, + "step": 43413 + }, + { + "epoch": 2.3200704523515165, + "grad_norm": 0.3509621024131775, + "learning_rate": 9.976483045131853e-05, + "loss": 1.4899, + "step": 43420 + }, + { + "epoch": 2.320612393517029, + "grad_norm": 0.2721761167049408, + "learning_rate": 9.976321057959651e-05, + "loss": 1.4967, + "step": 43430 + }, + { + "epoch": 2.321154334682541, + "grad_norm": 0.27356627583503723, + "learning_rate": 9.976158516280231e-05, + "loss": 1.5021, + "step": 43440 + }, + { + "epoch": 2.321696275848053, + "grad_norm": 0.30389806628227234, + "learning_rate": 9.975995420113729e-05, + "loss": 1.4943, + "step": 43450 + }, + { + "epoch": 2.3222382170135654, + "grad_norm": 0.35005050897598267, + "learning_rate": 9.975831769480345e-05, + "loss": 1.5019, + "step": 43460 + }, + { + "epoch": 2.3227801581790777, + "grad_norm": 0.19983713328838348, + "learning_rate": 9.975667564400355e-05, + "loss": 1.5049, + "step": 43470 + }, + { + "epoch": 2.32332209934459, + "grad_norm": 0.27769649028778076, + "learning_rate": 9.975502804894097e-05, + "loss": 1.4946, + "step": 43480 + }, + { + "epoch": 2.3238640405101023, + "grad_norm": 0.20963497459888458, + "learning_rate": 9.975337490981984e-05, + "loss": 1.4937, + "step": 43490 + }, + { + "epoch": 2.324405981675614, + "grad_norm": 0.21975292265415192, + "learning_rate": 9.975171622684492e-05, + "loss": 1.4993, + "step": 43500 + }, + { + "epoch": 2.324405981675614, + "eval_loss": 2.477883815765381, + "eval_runtime": 21.9663, + "eval_samples_per_second": 227.622, + "eval_steps_per_second": 1.229, + "step": 43500 + }, + { + "epoch": 2.3249479228411265, + "grad_norm": 0.3812916874885559, + "learning_rate": 9.97500520002217e-05, + "loss": 1.5059, + "step": 43510 + }, + { + "epoch": 2.325489864006639, + "grad_norm": 0.29676496982574463, + "learning_rate": 9.974838223015631e-05, + "loss": 1.4986, + "step": 43520 + }, + { + "epoch": 2.326031805172151, + "grad_norm": 0.3537111282348633, + "learning_rate": 9.97467069168556e-05, + "loss": 1.4903, + "step": 43530 + }, + { + "epoch": 2.3265737463376635, + "grad_norm": 0.2975006699562073, + "learning_rate": 9.974502606052711e-05, + "loss": 1.4985, + "step": 43540 + }, + { + "epoch": 2.3271156875031753, + "grad_norm": 0.42832067608833313, + "learning_rate": 9.974333966137907e-05, + "loss": 1.4866, + "step": 43550 + }, + { + "epoch": 2.3276576286686876, + "grad_norm": 0.29303500056266785, + "learning_rate": 9.974164771962035e-05, + "loss": 1.496, + "step": 43560 + }, + { + "epoch": 2.3281995698342, + "grad_norm": 0.330950528383255, + "learning_rate": 9.973995023546055e-05, + "loss": 1.4929, + "step": 43570 + }, + { + "epoch": 2.3287415109997123, + "grad_norm": 0.20558315515518188, + "learning_rate": 9.973824720911e-05, + "loss": 1.5115, + "step": 43580 + }, + { + "epoch": 2.3291208698155708, + "eval_loss": 2.4667203426361084, + "eval_runtime": 21.9785, + "eval_samples_per_second": 227.495, + "eval_steps_per_second": 1.228, + "step": 43587 + }, + { + "epoch": 2.329283452165224, + "grad_norm": 0.36295872926712036, + "learning_rate": 9.973653864077958e-05, + "loss": 1.4857, + "step": 43590 + }, + { + "epoch": 2.3298253933307365, + "grad_norm": 0.6293714046478271, + "learning_rate": 9.973482453068099e-05, + "loss": 1.497, + "step": 43600 + }, + { + "epoch": 2.3303673344962488, + "grad_norm": 0.4836002290248871, + "learning_rate": 9.973310487902657e-05, + "loss": 1.5107, + "step": 43610 + }, + { + "epoch": 2.330909275661761, + "grad_norm": 0.3348879814147949, + "learning_rate": 9.973137968602932e-05, + "loss": 1.4975, + "step": 43620 + }, + { + "epoch": 2.331451216827273, + "grad_norm": 0.24856191873550415, + "learning_rate": 9.972964895190295e-05, + "loss": 1.4994, + "step": 43630 + }, + { + "epoch": 2.3319931579927853, + "grad_norm": 0.2245061695575714, + "learning_rate": 9.972791267686188e-05, + "loss": 1.494, + "step": 43640 + }, + { + "epoch": 2.3325350991582976, + "grad_norm": 0.2887219786643982, + "learning_rate": 9.972617086112116e-05, + "loss": 1.5022, + "step": 43650 + }, + { + "epoch": 2.33307704032381, + "grad_norm": 0.5362899303436279, + "learning_rate": 9.97244235048966e-05, + "loss": 1.5091, + "step": 43660 + }, + { + "epoch": 2.3336189814893222, + "grad_norm": 0.4345687925815582, + "learning_rate": 9.972267060840461e-05, + "loss": 1.5024, + "step": 43670 + }, + { + "epoch": 2.333835757955527, + "eval_loss": 2.4626216888427734, + "eval_runtime": 21.9768, + "eval_samples_per_second": 227.513, + "eval_steps_per_second": 1.229, + "step": 43674 + }, + { + "epoch": 2.334160922654834, + "grad_norm": 0.42070385813713074, + "learning_rate": 9.972091217186236e-05, + "loss": 1.5046, + "step": 43680 + }, + { + "epoch": 2.3347028638203464, + "grad_norm": 0.5176911950111389, + "learning_rate": 9.971914819548766e-05, + "loss": 1.5029, + "step": 43690 + }, + { + "epoch": 2.3352448049858587, + "grad_norm": 0.3545333743095398, + "learning_rate": 9.971737867949903e-05, + "loss": 1.4942, + "step": 43700 + }, + { + "epoch": 2.335786746151371, + "grad_norm": 0.2512124180793762, + "learning_rate": 9.971560362411569e-05, + "loss": 1.5021, + "step": 43710 + }, + { + "epoch": 2.3363286873168834, + "grad_norm": 0.32333534955978394, + "learning_rate": 9.971382302955748e-05, + "loss": 1.4935, + "step": 43720 + }, + { + "epoch": 2.3368706284823952, + "grad_norm": 0.431264728307724, + "learning_rate": 9.971203689604504e-05, + "loss": 1.5011, + "step": 43730 + }, + { + "epoch": 2.3374125696479076, + "grad_norm": 0.26169353723526, + "learning_rate": 9.971024522379957e-05, + "loss": 1.5027, + "step": 43740 + }, + { + "epoch": 2.33795451081342, + "grad_norm": 0.44165289402008057, + "learning_rate": 9.970844801304303e-05, + "loss": 1.4827, + "step": 43750 + }, + { + "epoch": 2.338496451978932, + "grad_norm": 0.3183143734931946, + "learning_rate": 9.970664526399806e-05, + "loss": 1.4988, + "step": 43760 + }, + { + "epoch": 2.3385506460954835, + "eval_loss": 2.4591445922851562, + "eval_runtime": 21.9816, + "eval_samples_per_second": 227.463, + "eval_steps_per_second": 1.228, + "step": 43761 + }, + { + "epoch": 2.3390383931444445, + "grad_norm": 0.22885079681873322, + "learning_rate": 9.970483697688798e-05, + "loss": 1.4994, + "step": 43770 + }, + { + "epoch": 2.3395803343099564, + "grad_norm": 0.20680442452430725, + "learning_rate": 9.970302315193677e-05, + "loss": 1.4886, + "step": 43780 + }, + { + "epoch": 2.3401222754754687, + "grad_norm": 0.28351494669914246, + "learning_rate": 9.970120378936914e-05, + "loss": 1.4918, + "step": 43790 + }, + { + "epoch": 2.340664216640981, + "grad_norm": 0.3337201774120331, + "learning_rate": 9.969937888941046e-05, + "loss": 1.4897, + "step": 43800 + }, + { + "epoch": 2.3412061578064933, + "grad_norm": 0.37120136618614197, + "learning_rate": 9.969754845228676e-05, + "loss": 1.4988, + "step": 43810 + }, + { + "epoch": 2.341748098972005, + "grad_norm": 0.2082194685935974, + "learning_rate": 9.969571247822486e-05, + "loss": 1.5011, + "step": 43820 + }, + { + "epoch": 2.3422900401375175, + "grad_norm": 0.3116413652896881, + "learning_rate": 9.969387096745211e-05, + "loss": 1.4943, + "step": 43830 + }, + { + "epoch": 2.34283198130303, + "grad_norm": 0.18814930319786072, + "learning_rate": 9.969202392019668e-05, + "loss": 1.5032, + "step": 43840 + }, + { + "epoch": 2.3432655342354396, + "eval_loss": 2.4612669944763184, + "eval_runtime": 21.9782, + "eval_samples_per_second": 227.498, + "eval_steps_per_second": 1.228, + "step": 43848 + }, + { + "epoch": 2.343373922468542, + "grad_norm": 0.19654281437397003, + "learning_rate": 9.969017133668738e-05, + "loss": 1.4942, + "step": 43850 + }, + { + "epoch": 2.343915863634054, + "grad_norm": 0.49275675415992737, + "learning_rate": 9.968831321715365e-05, + "loss": 1.4915, + "step": 43860 + }, + { + "epoch": 2.3444578047995663, + "grad_norm": 0.26238903403282166, + "learning_rate": 9.968644956182572e-05, + "loss": 1.5072, + "step": 43870 + }, + { + "epoch": 2.3449997459650787, + "grad_norm": 0.43797948956489563, + "learning_rate": 9.968458037093442e-05, + "loss": 1.4991, + "step": 43880 + }, + { + "epoch": 2.345541687130591, + "grad_norm": 0.34569740295410156, + "learning_rate": 9.968270564471131e-05, + "loss": 1.5006, + "step": 43890 + }, + { + "epoch": 2.3460836282961033, + "grad_norm": 0.3247714340686798, + "learning_rate": 9.96808253833886e-05, + "loss": 1.5022, + "step": 43900 + }, + { + "epoch": 2.346625569461615, + "grad_norm": 0.3044470250606537, + "learning_rate": 9.967893958719924e-05, + "loss": 1.4978, + "step": 43910 + }, + { + "epoch": 2.3471675106271275, + "grad_norm": 0.26124030351638794, + "learning_rate": 9.967704825637681e-05, + "loss": 1.5, + "step": 43920 + }, + { + "epoch": 2.34770945179264, + "grad_norm": 0.2891956567764282, + "learning_rate": 9.967515139115562e-05, + "loss": 1.5099, + "step": 43930 + }, + { + "epoch": 2.3479804223753957, + "eval_loss": 2.4739420413970947, + "eval_runtime": 21.9776, + "eval_samples_per_second": 227.505, + "eval_steps_per_second": 1.229, + "step": 43935 + }, + { + "epoch": 2.348251392958152, + "grad_norm": 0.3045171797275543, + "learning_rate": 9.967324899177062e-05, + "loss": 1.4922, + "step": 43940 + }, + { + "epoch": 2.3487933341236644, + "grad_norm": 0.26287245750427246, + "learning_rate": 9.96713410584575e-05, + "loss": 1.5083, + "step": 43950 + }, + { + "epoch": 2.3493352752891763, + "grad_norm": 0.37551262974739075, + "learning_rate": 9.96694275914526e-05, + "loss": 1.4992, + "step": 43960 + }, + { + "epoch": 2.3498772164546886, + "grad_norm": 0.4330720901489258, + "learning_rate": 9.966750859099294e-05, + "loss": 1.4883, + "step": 43970 + }, + { + "epoch": 2.350419157620201, + "grad_norm": 0.21787714958190918, + "learning_rate": 9.966558405731624e-05, + "loss": 1.4937, + "step": 43980 + }, + { + "epoch": 2.3509610987857132, + "grad_norm": 0.28836148977279663, + "learning_rate": 9.96636539906609e-05, + "loss": 1.4988, + "step": 43990 + }, + { + "epoch": 2.351503039951225, + "grad_norm": 0.29318341612815857, + "learning_rate": 9.966171839126601e-05, + "loss": 1.5102, + "step": 44000 + }, + { + "epoch": 2.3520449811167374, + "grad_norm": 0.5823042988777161, + "learning_rate": 9.965977725937138e-05, + "loss": 1.4957, + "step": 44010 + }, + { + "epoch": 2.3525869222822497, + "grad_norm": 0.5775142908096313, + "learning_rate": 9.96578305952174e-05, + "loss": 1.4923, + "step": 44020 + }, + { + "epoch": 2.3526953105153523, + "eval_loss": 2.4744277000427246, + "eval_runtime": 21.978, + "eval_samples_per_second": 227.5, + "eval_steps_per_second": 1.228, + "step": 44022 + }, + { + "epoch": 2.353128863447762, + "grad_norm": 0.6233053803443909, + "learning_rate": 9.965587839904527e-05, + "loss": 1.507, + "step": 44030 + }, + { + "epoch": 2.353670804613274, + "grad_norm": 0.5592018365859985, + "learning_rate": 9.965392067109679e-05, + "loss": 1.4945, + "step": 44040 + }, + { + "epoch": 2.3542127457787863, + "grad_norm": 0.3838133215904236, + "learning_rate": 9.965195741161449e-05, + "loss": 1.4825, + "step": 44050 + }, + { + "epoch": 2.3547546869442986, + "grad_norm": 0.2884215712547302, + "learning_rate": 9.964998862084157e-05, + "loss": 1.4858, + "step": 44060 + }, + { + "epoch": 2.355296628109811, + "grad_norm": 0.22418469190597534, + "learning_rate": 9.96480142990219e-05, + "loss": 1.493, + "step": 44070 + }, + { + "epoch": 2.355838569275323, + "grad_norm": 0.34609487652778625, + "learning_rate": 9.964603444640007e-05, + "loss": 1.5015, + "step": 44080 + }, + { + "epoch": 2.356380510440835, + "grad_norm": 0.2725040912628174, + "learning_rate": 9.964404906322132e-05, + "loss": 1.4937, + "step": 44090 + }, + { + "epoch": 2.3569224516063474, + "grad_norm": 0.6061669588088989, + "learning_rate": 9.96420581497316e-05, + "loss": 1.5041, + "step": 44100 + }, + { + "epoch": 2.3574101986553084, + "eval_loss": 2.4810047149658203, + "eval_runtime": 21.9811, + "eval_samples_per_second": 227.468, + "eval_steps_per_second": 1.228, + "step": 44109 + }, + { + "epoch": 2.3574643927718597, + "grad_norm": 0.509492814540863, + "learning_rate": 9.964006170617754e-05, + "loss": 1.5008, + "step": 44110 + }, + { + "epoch": 2.358006333937372, + "grad_norm": 0.4258333146572113, + "learning_rate": 9.963805973280643e-05, + "loss": 1.4977, + "step": 44120 + }, + { + "epoch": 2.3585482751028843, + "grad_norm": 0.27747806906700134, + "learning_rate": 9.96360522298663e-05, + "loss": 1.4924, + "step": 44130 + }, + { + "epoch": 2.359090216268396, + "grad_norm": 0.339024156332016, + "learning_rate": 9.963403919760579e-05, + "loss": 1.494, + "step": 44140 + }, + { + "epoch": 2.3596321574339085, + "grad_norm": 0.3902308940887451, + "learning_rate": 9.963202063627429e-05, + "loss": 1.5009, + "step": 44150 + }, + { + "epoch": 2.360174098599421, + "grad_norm": 0.20376642048358917, + "learning_rate": 9.962999654612185e-05, + "loss": 1.5112, + "step": 44160 + }, + { + "epoch": 2.360716039764933, + "grad_norm": 0.28291308879852295, + "learning_rate": 9.96279669273992e-05, + "loss": 1.5011, + "step": 44170 + }, + { + "epoch": 2.3612579809304455, + "grad_norm": 0.3442187011241913, + "learning_rate": 9.962593178035776e-05, + "loss": 1.5062, + "step": 44180 + }, + { + "epoch": 2.3617999220959573, + "grad_norm": 0.23570683598518372, + "learning_rate": 9.962389110524964e-05, + "loss": 1.4994, + "step": 44190 + }, + { + "epoch": 2.362125086795265, + "eval_loss": 2.45691180229187, + "eval_runtime": 21.9778, + "eval_samples_per_second": 227.503, + "eval_steps_per_second": 1.229, + "step": 44196 + }, + { + "epoch": 2.3623418632614697, + "grad_norm": 0.38584163784980774, + "learning_rate": 9.962184490232763e-05, + "loss": 1.5001, + "step": 44200 + }, + { + "epoch": 2.362883804426982, + "grad_norm": 0.2459559589624405, + "learning_rate": 9.96197931718452e-05, + "loss": 1.4962, + "step": 44210 + }, + { + "epoch": 2.3634257455924943, + "grad_norm": 0.217377707362175, + "learning_rate": 9.96177359140565e-05, + "loss": 1.4944, + "step": 44220 + }, + { + "epoch": 2.363967686758006, + "grad_norm": 0.5001499652862549, + "learning_rate": 9.96156731292164e-05, + "loss": 1.4937, + "step": 44230 + }, + { + "epoch": 2.3645096279235185, + "grad_norm": 0.33417242765426636, + "learning_rate": 9.961360481758043e-05, + "loss": 1.4935, + "step": 44240 + }, + { + "epoch": 2.365051569089031, + "grad_norm": 0.4141455590724945, + "learning_rate": 9.961153097940475e-05, + "loss": 1.5026, + "step": 44250 + }, + { + "epoch": 2.365593510254543, + "grad_norm": 0.6103190779685974, + "learning_rate": 9.960945161494633e-05, + "loss": 1.4783, + "step": 44260 + }, + { + "epoch": 2.366135451420055, + "grad_norm": 0.4958433508872986, + "learning_rate": 9.96073667244627e-05, + "loss": 1.4915, + "step": 44270 + }, + { + "epoch": 2.3666773925855673, + "grad_norm": 0.30558812618255615, + "learning_rate": 9.960527630821217e-05, + "loss": 1.4999, + "step": 44280 + }, + { + "epoch": 2.366839974935221, + "eval_loss": 2.4537668228149414, + "eval_runtime": 21.9779, + "eval_samples_per_second": 227.501, + "eval_steps_per_second": 1.229, + "step": 44283 + }, + { + "epoch": 2.3672193337510796, + "grad_norm": 0.36779430508613586, + "learning_rate": 9.960318036645363e-05, + "loss": 1.4988, + "step": 44290 + }, + { + "epoch": 2.367761274916592, + "grad_norm": 0.2908129096031189, + "learning_rate": 9.960107889944679e-05, + "loss": 1.5003, + "step": 44300 + }, + { + "epoch": 2.3683032160821043, + "grad_norm": 0.5681326389312744, + "learning_rate": 9.959897190745192e-05, + "loss": 1.4934, + "step": 44310 + }, + { + "epoch": 2.368845157247616, + "grad_norm": 0.40029388666152954, + "learning_rate": 9.959685939073002e-05, + "loss": 1.5063, + "step": 44320 + }, + { + "epoch": 2.3693870984131284, + "grad_norm": 0.42044511437416077, + "learning_rate": 9.959474134954283e-05, + "loss": 1.4933, + "step": 44330 + }, + { + "epoch": 2.3699290395786408, + "grad_norm": 0.590247392654419, + "learning_rate": 9.959261778415267e-05, + "loss": 1.4867, + "step": 44340 + }, + { + "epoch": 2.370470980744153, + "grad_norm": 0.2779798209667206, + "learning_rate": 9.959048869482262e-05, + "loss": 1.5081, + "step": 44350 + }, + { + "epoch": 2.3710129219096654, + "grad_norm": 0.48375606536865234, + "learning_rate": 9.958835408181643e-05, + "loss": 1.4951, + "step": 44360 + }, + { + "epoch": 2.3715548630751773, + "grad_norm": 0.251369446516037, + "learning_rate": 9.958621394539854e-05, + "loss": 1.4872, + "step": 44370 + }, + { + "epoch": 2.3715548630751773, + "eval_loss": 2.4517953395843506, + "eval_runtime": 21.9682, + "eval_samples_per_second": 227.602, + "eval_steps_per_second": 1.229, + "step": 44370 + }, + { + "epoch": 2.3720968042406896, + "grad_norm": 0.24449922144412994, + "learning_rate": 9.9584068285834e-05, + "loss": 1.4872, + "step": 44380 + }, + { + "epoch": 2.372638745406202, + "grad_norm": 0.33364132046699524, + "learning_rate": 9.95819171033887e-05, + "loss": 1.4842, + "step": 44390 + }, + { + "epoch": 2.373180686571714, + "grad_norm": 0.30811458826065063, + "learning_rate": 9.957976039832901e-05, + "loss": 1.4935, + "step": 44400 + }, + { + "epoch": 2.373722627737226, + "grad_norm": 0.2445499747991562, + "learning_rate": 9.957759817092218e-05, + "loss": 1.5017, + "step": 44410 + }, + { + "epoch": 2.3742645689027384, + "grad_norm": 0.2053236961364746, + "learning_rate": 9.957543042143601e-05, + "loss": 1.5071, + "step": 44420 + }, + { + "epoch": 2.3748065100682507, + "grad_norm": 0.4515707492828369, + "learning_rate": 9.957325715013905e-05, + "loss": 1.4962, + "step": 44430 + }, + { + "epoch": 2.375348451233763, + "grad_norm": 0.26038962602615356, + "learning_rate": 9.957107835730052e-05, + "loss": 1.4925, + "step": 44440 + }, + { + "epoch": 2.375890392399275, + "grad_norm": 0.3523120880126953, + "learning_rate": 9.95688940431903e-05, + "loss": 1.4928, + "step": 44450 + }, + { + "epoch": 2.376269751215134, + "eval_loss": 2.4564239978790283, + "eval_runtime": 21.9781, + "eval_samples_per_second": 227.499, + "eval_steps_per_second": 1.228, + "step": 44457 + }, + { + "epoch": 2.3764323335647872, + "grad_norm": 0.24596655368804932, + "learning_rate": 9.956670420807899e-05, + "loss": 1.5008, + "step": 44460 + }, + { + "epoch": 2.3769742747302995, + "grad_norm": 0.47213152050971985, + "learning_rate": 9.956450885223785e-05, + "loss": 1.5027, + "step": 44470 + }, + { + "epoch": 2.377516215895812, + "grad_norm": 0.3502696752548218, + "learning_rate": 9.956230797593884e-05, + "loss": 1.4894, + "step": 44480 + }, + { + "epoch": 2.378058157061324, + "grad_norm": 0.31429505348205566, + "learning_rate": 9.95601015794546e-05, + "loss": 1.4878, + "step": 44490 + }, + { + "epoch": 2.378600098226836, + "grad_norm": 0.22514714300632477, + "learning_rate": 9.95578896630584e-05, + "loss": 1.4924, + "step": 44500 + }, + { + "epoch": 2.3791420393923484, + "grad_norm": 0.27507612109184265, + "learning_rate": 9.95556722270243e-05, + "loss": 1.5104, + "step": 44510 + }, + { + "epoch": 2.3796839805578607, + "grad_norm": 0.3192688226699829, + "learning_rate": 9.955344927162698e-05, + "loss": 1.4992, + "step": 44520 + }, + { + "epoch": 2.380225921723373, + "grad_norm": 0.20219798386096954, + "learning_rate": 9.955122079714177e-05, + "loss": 1.51, + "step": 44530 + }, + { + "epoch": 2.3807678628888853, + "grad_norm": 0.21116623282432556, + "learning_rate": 9.954898680384476e-05, + "loss": 1.4943, + "step": 44540 + }, + { + "epoch": 2.38098463935509, + "eval_loss": 2.4590439796447754, + "eval_runtime": 22.3001, + "eval_samples_per_second": 224.215, + "eval_steps_per_second": 1.211, + "step": 44544 + }, + { + "epoch": 2.381309804054397, + "grad_norm": 0.3956089913845062, + "learning_rate": 9.954674729201268e-05, + "loss": 1.4822, + "step": 44550 + }, + { + "epoch": 2.3818517452199095, + "grad_norm": 0.5214465856552124, + "learning_rate": 9.954450226192295e-05, + "loss": 1.4843, + "step": 44560 + }, + { + "epoch": 2.382393686385422, + "grad_norm": 0.4375009536743164, + "learning_rate": 9.954225171385366e-05, + "loss": 1.509, + "step": 44570 + }, + { + "epoch": 2.382935627550934, + "grad_norm": 0.22339291870594025, + "learning_rate": 9.953999564808362e-05, + "loss": 1.4914, + "step": 44580 + }, + { + "epoch": 2.3834775687164464, + "grad_norm": 0.2312263697385788, + "learning_rate": 9.953773406489229e-05, + "loss": 1.5064, + "step": 44590 + }, + { + "epoch": 2.3840195098819583, + "grad_norm": 0.21803376078605652, + "learning_rate": 9.953546696455984e-05, + "loss": 1.5093, + "step": 44600 + }, + { + "epoch": 2.3845614510474706, + "grad_norm": 0.33443108201026917, + "learning_rate": 9.953319434736708e-05, + "loss": 1.4916, + "step": 44610 + }, + { + "epoch": 2.385103392212983, + "grad_norm": 0.30027446150779724, + "learning_rate": 9.953091621359556e-05, + "loss": 1.4997, + "step": 44620 + }, + { + "epoch": 2.3856453333784953, + "grad_norm": 0.32896170020103455, + "learning_rate": 9.952863256352746e-05, + "loss": 1.4882, + "step": 44630 + }, + { + "epoch": 2.3856995274950465, + "eval_loss": 2.4726407527923584, + "eval_runtime": 21.9784, + "eval_samples_per_second": 227.496, + "eval_steps_per_second": 1.228, + "step": 44631 + }, + { + "epoch": 2.386187274544007, + "grad_norm": 0.41596585512161255, + "learning_rate": 9.95263433974457e-05, + "loss": 1.4935, + "step": 44640 + }, + { + "epoch": 2.3867292157095195, + "grad_norm": 0.2066296935081482, + "learning_rate": 9.952404871563383e-05, + "loss": 1.4963, + "step": 44650 + }, + { + "epoch": 2.3872711568750318, + "grad_norm": 0.5098561644554138, + "learning_rate": 9.952174851837609e-05, + "loss": 1.4899, + "step": 44660 + }, + { + "epoch": 2.387813098040544, + "grad_norm": 0.3089795410633087, + "learning_rate": 9.951944280595745e-05, + "loss": 1.4808, + "step": 44670 + }, + { + "epoch": 2.388355039206056, + "grad_norm": 0.3919314742088318, + "learning_rate": 9.951713157866352e-05, + "loss": 1.4988, + "step": 44680 + }, + { + "epoch": 2.3888969803715683, + "grad_norm": 0.6062659025192261, + "learning_rate": 9.95148148367806e-05, + "loss": 1.494, + "step": 44690 + }, + { + "epoch": 2.3894389215370806, + "grad_norm": 0.40539994835853577, + "learning_rate": 9.951249258059569e-05, + "loss": 1.4863, + "step": 44700 + }, + { + "epoch": 2.389980862702593, + "grad_norm": 0.3886335492134094, + "learning_rate": 9.951016481039646e-05, + "loss": 1.4878, + "step": 44710 + }, + { + "epoch": 2.3904144156350027, + "eval_loss": 2.4730005264282227, + "eval_runtime": 21.9758, + "eval_samples_per_second": 227.523, + "eval_steps_per_second": 1.229, + "step": 44718 + }, + { + "epoch": 2.3905228038681052, + "grad_norm": 0.39864251017570496, + "learning_rate": 9.950783152647124e-05, + "loss": 1.4932, + "step": 44720 + }, + { + "epoch": 2.391064745033617, + "grad_norm": 0.21343019604682922, + "learning_rate": 9.950549272910909e-05, + "loss": 1.4889, + "step": 44730 + }, + { + "epoch": 2.3916066861991294, + "grad_norm": 0.33363330364227295, + "learning_rate": 9.950314841859973e-05, + "loss": 1.484, + "step": 44740 + }, + { + "epoch": 2.3921486273646417, + "grad_norm": 0.41146817803382874, + "learning_rate": 9.950079859523354e-05, + "loss": 1.497, + "step": 44750 + }, + { + "epoch": 2.392690568530154, + "grad_norm": 0.27292779088020325, + "learning_rate": 9.949844325930165e-05, + "loss": 1.4942, + "step": 44760 + }, + { + "epoch": 2.3932325096956664, + "grad_norm": 0.2181469202041626, + "learning_rate": 9.94960824110958e-05, + "loss": 1.4911, + "step": 44770 + }, + { + "epoch": 2.3937744508611782, + "grad_norm": 0.21608904004096985, + "learning_rate": 9.94937160509084e-05, + "loss": 1.4981, + "step": 44780 + }, + { + "epoch": 2.3943163920266906, + "grad_norm": 0.2778734266757965, + "learning_rate": 9.949134417903267e-05, + "loss": 1.4898, + "step": 44790 + }, + { + "epoch": 2.394858333192203, + "grad_norm": 0.3089962303638458, + "learning_rate": 9.948896679576238e-05, + "loss": 1.4905, + "step": 44800 + }, + { + "epoch": 2.395129303774959, + "eval_loss": 2.4715840816497803, + "eval_runtime": 21.9797, + "eval_samples_per_second": 227.482, + "eval_steps_per_second": 1.228, + "step": 44805 + }, + { + "epoch": 2.395400274357715, + "grad_norm": 0.3231702148914337, + "learning_rate": 9.948658390139203e-05, + "loss": 1.5066, + "step": 44810 + }, + { + "epoch": 2.3959422155232275, + "grad_norm": 0.40103909373283386, + "learning_rate": 9.94841954962168e-05, + "loss": 1.4987, + "step": 44820 + }, + { + "epoch": 2.3964841566887394, + "grad_norm": 0.40768495202064514, + "learning_rate": 9.948180158053257e-05, + "loss": 1.4982, + "step": 44830 + }, + { + "epoch": 2.3970260978542517, + "grad_norm": 0.23787713050842285, + "learning_rate": 9.947940215463589e-05, + "loss": 1.4838, + "step": 44840 + }, + { + "epoch": 2.397568039019764, + "grad_norm": 0.3806591331958771, + "learning_rate": 9.947699721882396e-05, + "loss": 1.4936, + "step": 44850 + }, + { + "epoch": 2.3981099801852763, + "grad_norm": 0.26466140151023865, + "learning_rate": 9.947458677339473e-05, + "loss": 1.4965, + "step": 44860 + }, + { + "epoch": 2.398651921350788, + "grad_norm": 0.2688174247741699, + "learning_rate": 9.947217081864678e-05, + "loss": 1.494, + "step": 44870 + }, + { + "epoch": 2.3991938625163005, + "grad_norm": 0.5081690549850464, + "learning_rate": 9.94697493548794e-05, + "loss": 1.4951, + "step": 44880 + }, + { + "epoch": 2.399735803681813, + "grad_norm": 0.23584547638893127, + "learning_rate": 9.946732238239251e-05, + "loss": 1.4915, + "step": 44890 + }, + { + "epoch": 2.3998441919149154, + "eval_loss": 2.4753541946411133, + "eval_runtime": 21.9826, + "eval_samples_per_second": 227.453, + "eval_steps_per_second": 1.228, + "step": 44892 + }, + { + "epoch": 2.400277744847325, + "grad_norm": 0.41271650791168213, + "learning_rate": 9.946488990148679e-05, + "loss": 1.4985, + "step": 44900 + }, + { + "epoch": 2.400819686012837, + "grad_norm": 0.24468545615673065, + "learning_rate": 9.946245191246358e-05, + "loss": 1.4974, + "step": 44910 + }, + { + "epoch": 2.4013616271783493, + "grad_norm": 0.3811487853527069, + "learning_rate": 9.946000841562482e-05, + "loss": 1.4896, + "step": 44920 + }, + { + "epoch": 2.4019035683438617, + "grad_norm": 0.4173762798309326, + "learning_rate": 9.945755941127327e-05, + "loss": 1.4897, + "step": 44930 + }, + { + "epoch": 2.402445509509374, + "grad_norm": 0.5052602291107178, + "learning_rate": 9.945510489971228e-05, + "loss": 1.4964, + "step": 44940 + }, + { + "epoch": 2.4029874506748863, + "grad_norm": 0.3579169809818268, + "learning_rate": 9.945264488124589e-05, + "loss": 1.4941, + "step": 44950 + }, + { + "epoch": 2.403529391840398, + "grad_norm": 0.2974871098995209, + "learning_rate": 9.945017935617885e-05, + "loss": 1.5033, + "step": 44960 + }, + { + "epoch": 2.4040713330059105, + "grad_norm": 0.34116071462631226, + "learning_rate": 9.944770832481656e-05, + "loss": 1.4966, + "step": 44970 + }, + { + "epoch": 2.4045590800548715, + "eval_loss": 2.4695467948913574, + "eval_runtime": 21.9753, + "eval_samples_per_second": 227.528, + "eval_steps_per_second": 1.229, + "step": 44979 + }, + { + "epoch": 2.404613274171423, + "grad_norm": 0.2582961916923523, + "learning_rate": 9.944523178746516e-05, + "loss": 1.499, + "step": 44980 + }, + { + "epoch": 2.405155215336935, + "grad_norm": 0.32894718647003174, + "learning_rate": 9.94427497444314e-05, + "loss": 1.4877, + "step": 44990 + }, + { + "epoch": 2.4056971565024474, + "grad_norm": 0.38005682826042175, + "learning_rate": 9.944026219602274e-05, + "loss": 1.4999, + "step": 45000 + }, + { + "epoch": 2.0005419411655123, + "grad_norm": 0.2491222321987152, + "learning_rate": 9.943776914254736e-05, + "loss": 1.5076, + "step": 45010 + }, + { + "epoch": 2.0010838823310246, + "grad_norm": 0.3733494281768799, + "learning_rate": 9.943527058431406e-05, + "loss": 1.517, + "step": 45020 + }, + { + "epoch": 2.0016258234965365, + "grad_norm": 0.33631569147109985, + "learning_rate": 9.943276652163235e-05, + "loss": 1.5062, + "step": 45030 + }, + { + "epoch": 2.002167764662049, + "grad_norm": 0.21265359222888947, + "learning_rate": 9.943025695481244e-05, + "loss": 1.5177, + "step": 45040 + }, + { + "epoch": 2.002709705827561, + "grad_norm": 0.18703655898571014, + "learning_rate": 9.94277418841652e-05, + "loss": 1.5098, + "step": 45050 + }, + { + "epoch": 2.0032516469930735, + "grad_norm": 0.44634345173835754, + "learning_rate": 9.942522131000216e-05, + "loss": 1.5068, + "step": 45060 + }, + { + "epoch": 2.0035768116923807, + "eval_loss": 2.45697283744812, + "eval_runtime": 28.4848, + "eval_samples_per_second": 175.532, + "eval_steps_per_second": 0.948, + "step": 45066 + }, + { + "epoch": 2.0037935881585853, + "grad_norm": 0.22693060338497162, + "learning_rate": 9.942269523263559e-05, + "loss": 1.5085, + "step": 45070 + }, + { + "epoch": 2.0043355293240976, + "grad_norm": 0.22670039534568787, + "learning_rate": 9.942016365237841e-05, + "loss": 1.5155, + "step": 45080 + }, + { + "epoch": 2.00487747048961, + "grad_norm": 0.2845227122306824, + "learning_rate": 9.94176265695442e-05, + "loss": 1.5119, + "step": 45090 + }, + { + "epoch": 2.0054194116551223, + "grad_norm": 0.2120792120695114, + "learning_rate": 9.941508398444725e-05, + "loss": 1.5108, + "step": 45100 + }, + { + "epoch": 2.0059613528206346, + "grad_norm": 0.2501295804977417, + "learning_rate": 9.941253589740255e-05, + "loss": 1.5151, + "step": 45110 + }, + { + "epoch": 2.0065032939861465, + "grad_norm": 0.21968966722488403, + "learning_rate": 9.940998230872569e-05, + "loss": 1.5051, + "step": 45120 + }, + { + "epoch": 2.007045235151659, + "grad_norm": 0.2440696805715561, + "learning_rate": 9.940742321873304e-05, + "loss": 1.5188, + "step": 45130 + }, + { + "epoch": 2.007587176317171, + "grad_norm": 0.26428186893463135, + "learning_rate": 9.940485862774162e-05, + "loss": 1.5097, + "step": 45140 + }, + { + "epoch": 2.0081291174826834, + "grad_norm": 0.2735063135623932, + "learning_rate": 9.940228853606908e-05, + "loss": 1.5153, + "step": 45150 + }, + { + "epoch": 2.008291699832337, + "eval_loss": 2.474893093109131, + "eval_runtime": 22.0407, + "eval_samples_per_second": 226.853, + "eval_steps_per_second": 1.225, + "step": 45153 + }, + { + "epoch": 2.0086710586481953, + "grad_norm": 0.21670082211494446, + "learning_rate": 9.939971294403382e-05, + "loss": 1.5133, + "step": 45160 + }, + { + "epoch": 2.0092129998137076, + "grad_norm": 0.219325989484787, + "learning_rate": 9.939713185195486e-05, + "loss": 1.5115, + "step": 45170 + }, + { + "epoch": 2.00975494097922, + "grad_norm": 0.21777480840682983, + "learning_rate": 9.939454526015199e-05, + "loss": 1.5066, + "step": 45180 + }, + { + "epoch": 2.0102968821447322, + "grad_norm": 0.24728938937187195, + "learning_rate": 9.939195316894558e-05, + "loss": 1.5173, + "step": 45190 + }, + { + "epoch": 2.0108388233102445, + "grad_norm": 0.3391672670841217, + "learning_rate": 9.938935557865676e-05, + "loss": 1.4985, + "step": 45200 + }, + { + "epoch": 2.0113807644757564, + "grad_norm": 0.22963690757751465, + "learning_rate": 9.938675248960726e-05, + "loss": 1.5031, + "step": 45210 + }, + { + "epoch": 2.0119227056412687, + "grad_norm": 0.2681106626987457, + "learning_rate": 9.93841439021196e-05, + "loss": 1.5047, + "step": 45220 + }, + { + "epoch": 2.012464646806781, + "grad_norm": 0.37070515751838684, + "learning_rate": 9.938152981651687e-05, + "loss": 1.5039, + "step": 45230 + }, + { + "epoch": 2.0130065879722934, + "grad_norm": 0.342690110206604, + "learning_rate": 9.937891023312292e-05, + "loss": 1.5114, + "step": 45240 + }, + { + "epoch": 2.0130065879722934, + "eval_loss": 2.4622411727905273, + "eval_runtime": 21.7527, + "eval_samples_per_second": 229.857, + "eval_steps_per_second": 1.241, + "step": 45240 + }, + { + "epoch": 2.0135485291378057, + "grad_norm": 0.4903299808502197, + "learning_rate": 9.937628515226225e-05, + "loss": 1.5174, + "step": 45250 + }, + { + "epoch": 2.0140904703033176, + "grad_norm": 0.2701393961906433, + "learning_rate": 9.937365457426003e-05, + "loss": 1.5235, + "step": 45260 + }, + { + "epoch": 2.01463241146883, + "grad_norm": 0.21151518821716309, + "learning_rate": 9.937101849944213e-05, + "loss": 1.4979, + "step": 45270 + }, + { + "epoch": 2.015174352634342, + "grad_norm": 0.1973683089017868, + "learning_rate": 9.936837692813511e-05, + "loss": 1.5211, + "step": 45280 + }, + { + "epoch": 2.0157162937998545, + "grad_norm": 0.2013273984193802, + "learning_rate": 9.936572986066616e-05, + "loss": 1.5038, + "step": 45290 + }, + { + "epoch": 2.0162582349653664, + "grad_norm": 0.28360608220100403, + "learning_rate": 9.936307729736324e-05, + "loss": 1.5059, + "step": 45300 + }, + { + "epoch": 2.0168001761308787, + "grad_norm": 0.2819204032421112, + "learning_rate": 9.93604192385549e-05, + "loss": 1.5104, + "step": 45310 + }, + { + "epoch": 2.017342117296391, + "grad_norm": 0.20827263593673706, + "learning_rate": 9.935775568457041e-05, + "loss": 1.4944, + "step": 45320 + }, + { + "epoch": 2.0177214761122495, + "eval_loss": 2.4640448093414307, + "eval_runtime": 22.083, + "eval_samples_per_second": 226.419, + "eval_steps_per_second": 1.223, + "step": 45327 + }, + { + "epoch": 2.0178840584619033, + "grad_norm": 0.21158994734287262, + "learning_rate": 9.935508663573973e-05, + "loss": 1.5125, + "step": 45330 + }, + { + "epoch": 2.0184259996274156, + "grad_norm": 0.3482116162776947, + "learning_rate": 9.935241209239349e-05, + "loss": 1.5073, + "step": 45340 + }, + { + "epoch": 2.0189679407929275, + "grad_norm": 0.2513381540775299, + "learning_rate": 9.9349732054863e-05, + "loss": 1.4914, + "step": 45350 + }, + { + "epoch": 2.01950988195844, + "grad_norm": 0.21970421075820923, + "learning_rate": 9.934704652348024e-05, + "loss": 1.511, + "step": 45360 + }, + { + "epoch": 2.020051823123952, + "grad_norm": 0.20519542694091797, + "learning_rate": 9.934435549857793e-05, + "loss": 1.4999, + "step": 45370 + }, + { + "epoch": 2.0205937642894645, + "grad_norm": 0.2705243229866028, + "learning_rate": 9.934165898048934e-05, + "loss": 1.5099, + "step": 45380 + }, + { + "epoch": 2.0211357054549763, + "grad_norm": 0.3955739140510559, + "learning_rate": 9.933895696954857e-05, + "loss": 1.5111, + "step": 45390 + }, + { + "epoch": 2.0216776466204887, + "grad_norm": 0.39523282647132874, + "learning_rate": 9.933624946609031e-05, + "loss": 1.5154, + "step": 45400 + }, + { + "epoch": 2.022219587786001, + "grad_norm": 0.22369325160980225, + "learning_rate": 9.933353647044995e-05, + "loss": 1.5127, + "step": 45410 + }, + { + "epoch": 2.022436364252206, + "eval_loss": 2.4598445892333984, + "eval_runtime": 22.0511, + "eval_samples_per_second": 226.746, + "eval_steps_per_second": 1.224, + "step": 45414 + }, + { + "epoch": 2.0227615289515133, + "grad_norm": 0.29554933309555054, + "learning_rate": 9.933081798296358e-05, + "loss": 1.507, + "step": 45420 + }, + { + "epoch": 2.0233034701170256, + "grad_norm": 0.2695947289466858, + "learning_rate": 9.932809400396793e-05, + "loss": 1.505, + "step": 45430 + }, + { + "epoch": 2.0238454112825375, + "grad_norm": 0.25676560401916504, + "learning_rate": 9.932536453380045e-05, + "loss": 1.5104, + "step": 45440 + }, + { + "epoch": 2.02438735244805, + "grad_norm": 0.21625731885433197, + "learning_rate": 9.932262957279926e-05, + "loss": 1.5112, + "step": 45450 + }, + { + "epoch": 2.024929293613562, + "grad_norm": 0.22126367688179016, + "learning_rate": 9.931988912130313e-05, + "loss": 1.4907, + "step": 45460 + }, + { + "epoch": 2.0254712347790744, + "grad_norm": 0.2600845396518707, + "learning_rate": 9.931714317965158e-05, + "loss": 1.5057, + "step": 45470 + }, + { + "epoch": 2.0260131759445863, + "grad_norm": 0.2164200097322464, + "learning_rate": 9.931439174818472e-05, + "loss": 1.5125, + "step": 45480 + }, + { + "epoch": 2.0265551171100986, + "grad_norm": 0.3730428218841553, + "learning_rate": 9.931163482724341e-05, + "loss": 1.5166, + "step": 45490 + }, + { + "epoch": 2.027097058275611, + "grad_norm": 0.2146274298429489, + "learning_rate": 9.930887241716915e-05, + "loss": 1.5042, + "step": 45500 + }, + { + "epoch": 2.027151252392162, + "eval_loss": 2.4563684463500977, + "eval_runtime": 22.0541, + "eval_samples_per_second": 226.715, + "eval_steps_per_second": 1.224, + "step": 45501 + }, + { + "epoch": 2.0276389994411232, + "grad_norm": 0.2864300310611725, + "learning_rate": 9.930610451830417e-05, + "loss": 1.5001, + "step": 45510 + }, + { + "epoch": 2.0281809406066356, + "grad_norm": 0.37505003809928894, + "learning_rate": 9.93033311309913e-05, + "loss": 1.5074, + "step": 45520 + }, + { + "epoch": 2.0287228817721474, + "grad_norm": 0.3186861276626587, + "learning_rate": 9.93005522555741e-05, + "loss": 1.5096, + "step": 45530 + }, + { + "epoch": 2.0292648229376598, + "grad_norm": 0.31974560022354126, + "learning_rate": 9.929776789239685e-05, + "loss": 1.5028, + "step": 45540 + }, + { + "epoch": 2.029806764103172, + "grad_norm": 0.22991715371608734, + "learning_rate": 9.929497804180442e-05, + "loss": 1.4912, + "step": 45550 + }, + { + "epoch": 2.0303487052686844, + "grad_norm": 0.38136276602745056, + "learning_rate": 9.929218270414243e-05, + "loss": 1.5082, + "step": 45560 + }, + { + "epoch": 2.0308906464341967, + "grad_norm": 0.2910080850124359, + "learning_rate": 9.928938187975714e-05, + "loss": 1.5081, + "step": 45570 + }, + { + "epoch": 2.0314325875997086, + "grad_norm": 0.20356157422065735, + "learning_rate": 9.928657556899551e-05, + "loss": 1.4984, + "step": 45580 + }, + { + "epoch": 2.0318661405321183, + "eval_loss": 2.454059600830078, + "eval_runtime": 22.0277, + "eval_samples_per_second": 226.987, + "eval_steps_per_second": 1.226, + "step": 45588 + }, + { + "epoch": 2.031974528765221, + "grad_norm": 0.32182246446609497, + "learning_rate": 9.928376377220517e-05, + "loss": 1.507, + "step": 45590 + }, + { + "epoch": 2.032516469930733, + "grad_norm": 0.19996988773345947, + "learning_rate": 9.928094648973443e-05, + "loss": 1.5189, + "step": 45600 + }, + { + "epoch": 2.0330584110962455, + "grad_norm": 0.2483881115913391, + "learning_rate": 9.92781237219323e-05, + "loss": 1.5088, + "step": 45610 + }, + { + "epoch": 2.0336003522617574, + "grad_norm": 0.3291274309158325, + "learning_rate": 9.927529546914842e-05, + "loss": 1.5016, + "step": 45620 + }, + { + "epoch": 2.0341422934272697, + "grad_norm": 0.21742436289787292, + "learning_rate": 9.927246173173318e-05, + "loss": 1.507, + "step": 45630 + }, + { + "epoch": 2.034684234592782, + "grad_norm": 0.34620094299316406, + "learning_rate": 9.926962251003758e-05, + "loss": 1.5015, + "step": 45640 + }, + { + "epoch": 2.0352261757582943, + "grad_norm": 0.2289854735136032, + "learning_rate": 9.926677780441335e-05, + "loss": 1.5091, + "step": 45650 + }, + { + "epoch": 2.0357681169238067, + "grad_norm": 0.32256433367729187, + "learning_rate": 9.926392761521286e-05, + "loss": 1.5123, + "step": 45660 + }, + { + "epoch": 2.0363100580893185, + "grad_norm": 0.22222593426704407, + "learning_rate": 9.926107194278921e-05, + "loss": 1.5074, + "step": 45670 + }, + { + "epoch": 2.036581028672075, + "eval_loss": 2.465963125228882, + "eval_runtime": 21.9821, + "eval_samples_per_second": 227.458, + "eval_steps_per_second": 1.228, + "step": 45675 + }, + { + "epoch": 2.036851999254831, + "grad_norm": 0.23388569056987762, + "learning_rate": 9.925821078749612e-05, + "loss": 1.5062, + "step": 45680 + }, + { + "epoch": 2.037393940420343, + "grad_norm": 0.22175799310207367, + "learning_rate": 9.925534414968802e-05, + "loss": 1.4969, + "step": 45690 + }, + { + "epoch": 2.0379358815858555, + "grad_norm": 0.5243973135948181, + "learning_rate": 9.925247202972004e-05, + "loss": 1.5039, + "step": 45700 + }, + { + "epoch": 2.0384778227513674, + "grad_norm": 0.3757253885269165, + "learning_rate": 9.924959442794794e-05, + "loss": 1.5076, + "step": 45710 + }, + { + "epoch": 2.0390197639168797, + "grad_norm": 0.4267645478248596, + "learning_rate": 9.92467113447282e-05, + "loss": 1.5162, + "step": 45720 + }, + { + "epoch": 2.039561705082392, + "grad_norm": 0.19706867635250092, + "learning_rate": 9.924382278041796e-05, + "loss": 1.5014, + "step": 45730 + }, + { + "epoch": 2.0401036462479043, + "grad_norm": 0.3324958086013794, + "learning_rate": 9.924092873537506e-05, + "loss": 1.5019, + "step": 45740 + }, + { + "epoch": 2.0406455874134166, + "grad_norm": 0.2539781630039215, + "learning_rate": 9.923802920995794e-05, + "loss": 1.5161, + "step": 45750 + }, + { + "epoch": 2.0411875285789285, + "grad_norm": 0.4110063314437866, + "learning_rate": 9.923512420452585e-05, + "loss": 1.4998, + "step": 45760 + }, + { + "epoch": 2.041295916812031, + "eval_loss": 2.4641757011413574, + "eval_runtime": 21.9849, + "eval_samples_per_second": 227.429, + "eval_steps_per_second": 1.228, + "step": 45762 + }, + { + "epoch": 2.041729469744441, + "grad_norm": 0.22075022757053375, + "learning_rate": 9.923221371943862e-05, + "loss": 1.4926, + "step": 45770 + }, + { + "epoch": 2.042271410909953, + "grad_norm": 0.3489071726799011, + "learning_rate": 9.92292977550568e-05, + "loss": 1.5091, + "step": 45780 + }, + { + "epoch": 2.0428133520754654, + "grad_norm": 0.364443302154541, + "learning_rate": 9.922637631174158e-05, + "loss": 1.5006, + "step": 45790 + }, + { + "epoch": 2.0433552932409773, + "grad_norm": 0.26170814037323, + "learning_rate": 9.922344938985489e-05, + "loss": 1.5044, + "step": 45800 + }, + { + "epoch": 2.0438972344064896, + "grad_norm": 0.2701292634010315, + "learning_rate": 9.922051698975927e-05, + "loss": 1.5028, + "step": 45810 + }, + { + "epoch": 2.044439175572002, + "grad_norm": 0.5133768320083618, + "learning_rate": 9.921757911181801e-05, + "loss": 1.514, + "step": 45820 + }, + { + "epoch": 2.0449811167375143, + "grad_norm": 0.20944929122924805, + "learning_rate": 9.921463575639503e-05, + "loss": 1.5047, + "step": 45830 + }, + { + "epoch": 2.0455230579030266, + "grad_norm": 0.2594199776649475, + "learning_rate": 9.921168692385491e-05, + "loss": 1.5131, + "step": 45840 + }, + { + "epoch": 2.0460108049519876, + "eval_loss": 2.4567441940307617, + "eval_runtime": 21.9911, + "eval_samples_per_second": 227.365, + "eval_steps_per_second": 1.228, + "step": 45849 + }, + { + "epoch": 2.0460649990685384, + "grad_norm": 0.4213975667953491, + "learning_rate": 9.920873261456297e-05, + "loss": 1.4995, + "step": 45850 + }, + { + "epoch": 2.0466069402340508, + "grad_norm": 0.33704066276550293, + "learning_rate": 9.920577282888515e-05, + "loss": 1.5053, + "step": 45860 + }, + { + "epoch": 2.047148881399563, + "grad_norm": 0.3914563059806824, + "learning_rate": 9.920280756718815e-05, + "loss": 1.508, + "step": 45870 + }, + { + "epoch": 2.0476908225650754, + "grad_norm": 0.22689972817897797, + "learning_rate": 9.919983682983924e-05, + "loss": 1.5135, + "step": 45880 + }, + { + "epoch": 2.0482327637305877, + "grad_norm": 0.24155005812644958, + "learning_rate": 9.919686061720645e-05, + "loss": 1.5056, + "step": 45890 + }, + { + "epoch": 2.0487747048960996, + "grad_norm": 0.22122272849082947, + "learning_rate": 9.919387892965845e-05, + "loss": 1.5075, + "step": 45900 + }, + { + "epoch": 2.049316646061612, + "grad_norm": 0.18150360882282257, + "learning_rate": 9.919089176756458e-05, + "loss": 1.5084, + "step": 45910 + }, + { + "epoch": 2.049858587227124, + "grad_norm": 0.31947770714759827, + "learning_rate": 9.918789913129491e-05, + "loss": 1.5126, + "step": 45920 + }, + { + "epoch": 2.0504005283926365, + "grad_norm": 0.21400408446788788, + "learning_rate": 9.918490102122014e-05, + "loss": 1.5074, + "step": 45930 + }, + { + "epoch": 2.0507256930919437, + "eval_loss": 2.472001552581787, + "eval_runtime": 21.9893, + "eval_samples_per_second": 227.383, + "eval_steps_per_second": 1.228, + "step": 45936 + }, + { + "epoch": 2.0509424695581484, + "grad_norm": 0.4493444263935089, + "learning_rate": 9.918189743771165e-05, + "loss": 1.5229, + "step": 45940 + }, + { + "epoch": 2.0514844107236607, + "grad_norm": 0.28981345891952515, + "learning_rate": 9.917888838114155e-05, + "loss": 1.5128, + "step": 45950 + }, + { + "epoch": 2.052026351889173, + "grad_norm": 0.3488079011440277, + "learning_rate": 9.917587385188255e-05, + "loss": 1.4949, + "step": 45960 + }, + { + "epoch": 2.0525682930546854, + "grad_norm": 0.25213801860809326, + "learning_rate": 9.917285385030808e-05, + "loss": 1.5046, + "step": 45970 + }, + { + "epoch": 2.0531102342201977, + "grad_norm": 0.1905914545059204, + "learning_rate": 9.916982837679226e-05, + "loss": 1.5046, + "step": 45980 + }, + { + "epoch": 2.0536521753857095, + "grad_norm": 0.2938050329685211, + "learning_rate": 9.916679743170986e-05, + "loss": 1.5087, + "step": 45990 + }, + { + "epoch": 2.054194116551222, + "grad_norm": 0.3296414613723755, + "learning_rate": 9.916376101543636e-05, + "loss": 1.5114, + "step": 46000 + }, + { + "epoch": 2.054736057716734, + "grad_norm": 0.32699063420295715, + "learning_rate": 9.916071912834789e-05, + "loss": 1.5049, + "step": 46010 + }, + { + "epoch": 2.0552779988822465, + "grad_norm": 0.28838619589805603, + "learning_rate": 9.915767177082125e-05, + "loss": 1.5045, + "step": 46020 + }, + { + "epoch": 2.0554405812319, + "eval_loss": 2.4637975692749023, + "eval_runtime": 21.9861, + "eval_samples_per_second": 227.417, + "eval_steps_per_second": 1.228, + "step": 46023 + }, + { + "epoch": 2.0558199400477584, + "grad_norm": 0.23613585531711578, + "learning_rate": 9.915461894323395e-05, + "loss": 1.5072, + "step": 46030 + }, + { + "epoch": 2.0563618812132707, + "grad_norm": 0.33887359499931335, + "learning_rate": 9.915156064596414e-05, + "loss": 1.497, + "step": 46040 + }, + { + "epoch": 2.056903822378783, + "grad_norm": 0.24096645414829254, + "learning_rate": 9.914849687939071e-05, + "loss": 1.4923, + "step": 46050 + }, + { + "epoch": 2.0574457635442953, + "grad_norm": 0.29144373536109924, + "learning_rate": 9.914542764389314e-05, + "loss": 1.4964, + "step": 46060 + }, + { + "epoch": 2.0579877047098076, + "grad_norm": 0.34292078018188477, + "learning_rate": 9.914235293985167e-05, + "loss": 1.514, + "step": 46070 + }, + { + "epoch": 2.0585296458753195, + "grad_norm": 0.286112904548645, + "learning_rate": 9.913927276764715e-05, + "loss": 1.5068, + "step": 46080 + }, + { + "epoch": 2.059071587040832, + "grad_norm": 0.2864058017730713, + "learning_rate": 9.913618712766117e-05, + "loss": 1.5067, + "step": 46090 + }, + { + "epoch": 2.059613528206344, + "grad_norm": 0.2555348575115204, + "learning_rate": 9.913309602027593e-05, + "loss": 1.5068, + "step": 46100 + }, + { + "epoch": 2.0601554693718565, + "grad_norm": 0.21725402772426605, + "learning_rate": 9.912999944587437e-05, + "loss": 1.5055, + "step": 46110 + }, + { + "epoch": 2.0601554693718565, + "eval_loss": 2.46505069732666, + "eval_runtime": 22.3497, + "eval_samples_per_second": 223.717, + "eval_steps_per_second": 1.208, + "step": 46110 + }, + { + "epoch": 2.0606974105373683, + "grad_norm": 0.3962418735027313, + "learning_rate": 9.912689740484007e-05, + "loss": 1.5052, + "step": 46120 + }, + { + "epoch": 2.0612393517028806, + "grad_norm": 0.22441263496875763, + "learning_rate": 9.91237898975573e-05, + "loss": 1.5036, + "step": 46130 + }, + { + "epoch": 2.061781292868393, + "grad_norm": 0.3478442430496216, + "learning_rate": 9.9120676924411e-05, + "loss": 1.5032, + "step": 46140 + }, + { + "epoch": 2.0623232340339053, + "grad_norm": 0.41713225841522217, + "learning_rate": 9.911755848578681e-05, + "loss": 1.5111, + "step": 46150 + }, + { + "epoch": 2.0628651751994176, + "grad_norm": 0.20105630159378052, + "learning_rate": 9.9114434582071e-05, + "loss": 1.5153, + "step": 46160 + }, + { + "epoch": 2.0634071163649295, + "grad_norm": 0.48680949211120605, + "learning_rate": 9.911130521365057e-05, + "loss": 1.5127, + "step": 46170 + }, + { + "epoch": 2.0639490575304418, + "grad_norm": 0.2089950144290924, + "learning_rate": 9.910817038091315e-05, + "loss": 1.4976, + "step": 46180 + }, + { + "epoch": 2.064490998695954, + "grad_norm": 0.35045289993286133, + "learning_rate": 9.910503008424709e-05, + "loss": 1.4987, + "step": 46190 + }, + { + "epoch": 2.0648703575118126, + "eval_loss": 2.4648759365081787, + "eval_runtime": 22.0369, + "eval_samples_per_second": 226.892, + "eval_steps_per_second": 1.225, + "step": 46197 + }, + { + "epoch": 2.0650329398614664, + "grad_norm": 0.30531641840934753, + "learning_rate": 9.910188432404139e-05, + "loss": 1.5034, + "step": 46200 + }, + { + "epoch": 2.0655748810269783, + "grad_norm": 0.2192566692829132, + "learning_rate": 9.909873310068571e-05, + "loss": 1.5177, + "step": 46210 + }, + { + "epoch": 2.0661168221924906, + "grad_norm": 0.39566564559936523, + "learning_rate": 9.909557641457047e-05, + "loss": 1.5048, + "step": 46220 + }, + { + "epoch": 2.066658763358003, + "grad_norm": 0.4346673786640167, + "learning_rate": 9.909241426608664e-05, + "loss": 1.4999, + "step": 46230 + }, + { + "epoch": 2.0672007045235152, + "grad_norm": 0.2114131599664688, + "learning_rate": 9.908924665562595e-05, + "loss": 1.5104, + "step": 46240 + }, + { + "epoch": 2.0677426456890275, + "grad_norm": 0.21276456117630005, + "learning_rate": 9.908607358358082e-05, + "loss": 1.5089, + "step": 46250 + }, + { + "epoch": 2.0682845868545394, + "grad_norm": 0.20837821066379547, + "learning_rate": 9.90828950503443e-05, + "loss": 1.508, + "step": 46260 + }, + { + "epoch": 2.0688265280200517, + "grad_norm": 0.22510722279548645, + "learning_rate": 9.907971105631014e-05, + "loss": 1.4973, + "step": 46270 + }, + { + "epoch": 2.069368469185564, + "grad_norm": 0.2420571893453598, + "learning_rate": 9.907652160187272e-05, + "loss": 1.5006, + "step": 46280 + }, + { + "epoch": 2.069585245651769, + "eval_loss": 2.455404043197632, + "eval_runtime": 22.0244, + "eval_samples_per_second": 227.021, + "eval_steps_per_second": 1.226, + "step": 46284 + }, + { + "epoch": 2.0699104103510764, + "grad_norm": 0.21104340255260468, + "learning_rate": 9.907332668742718e-05, + "loss": 1.5047, + "step": 46290 + }, + { + "epoch": 2.0704523515165887, + "grad_norm": 0.3561536967754364, + "learning_rate": 9.907012631336927e-05, + "loss": 1.504, + "step": 46300 + }, + { + "epoch": 2.0709942926821006, + "grad_norm": 0.19159241020679474, + "learning_rate": 9.906692048009546e-05, + "loss": 1.5021, + "step": 46310 + }, + { + "epoch": 2.071536233847613, + "grad_norm": 0.4784151315689087, + "learning_rate": 9.906370918800283e-05, + "loss": 1.4997, + "step": 46320 + }, + { + "epoch": 2.072078175013125, + "grad_norm": 0.2709428369998932, + "learning_rate": 9.906049243748925e-05, + "loss": 1.5038, + "step": 46330 + }, + { + "epoch": 2.0726201161786375, + "grad_norm": 0.3371290862560272, + "learning_rate": 9.905727022895313e-05, + "loss": 1.5185, + "step": 46340 + }, + { + "epoch": 2.0731620573441494, + "grad_norm": 0.3282695710659027, + "learning_rate": 9.905404256279367e-05, + "loss": 1.5043, + "step": 46350 + }, + { + "epoch": 2.0737039985096617, + "grad_norm": 0.22218751907348633, + "learning_rate": 9.905080943941068e-05, + "loss": 1.5047, + "step": 46360 + }, + { + "epoch": 2.074245939675174, + "grad_norm": 0.4120001494884491, + "learning_rate": 9.904757085920466e-05, + "loss": 1.4932, + "step": 46370 + }, + { + "epoch": 2.0743001337917253, + "eval_loss": 2.4604697227478027, + "eval_runtime": 22.0263, + "eval_samples_per_second": 227.001, + "eval_steps_per_second": 1.226, + "step": 46371 + }, + { + "epoch": 2.0747878808406863, + "grad_norm": 0.2726476788520813, + "learning_rate": 9.90443268225768e-05, + "loss": 1.4971, + "step": 46380 + }, + { + "epoch": 2.0753298220061986, + "grad_norm": 0.25668570399284363, + "learning_rate": 9.904107732992896e-05, + "loss": 1.5062, + "step": 46390 + }, + { + "epoch": 2.0758717631717105, + "grad_norm": 0.3078915476799011, + "learning_rate": 9.903782238166364e-05, + "loss": 1.504, + "step": 46400 + }, + { + "epoch": 2.076413704337223, + "grad_norm": 0.23746353387832642, + "learning_rate": 9.903456197818411e-05, + "loss": 1.5084, + "step": 46410 + }, + { + "epoch": 2.076955645502735, + "grad_norm": 0.1999620497226715, + "learning_rate": 9.90312961198942e-05, + "loss": 1.4992, + "step": 46420 + }, + { + "epoch": 2.0774975866682475, + "grad_norm": 0.3216739594936371, + "learning_rate": 9.90280248071985e-05, + "loss": 1.5011, + "step": 46430 + }, + { + "epoch": 2.0780395278337593, + "grad_norm": 0.21200136840343475, + "learning_rate": 9.902474804050224e-05, + "loss": 1.4959, + "step": 46440 + }, + { + "epoch": 2.0785814689992717, + "grad_norm": 0.21161618828773499, + "learning_rate": 9.902146582021133e-05, + "loss": 1.5019, + "step": 46450 + }, + { + "epoch": 2.0790150219316814, + "eval_loss": 2.4619641304016113, + "eval_runtime": 21.9868, + "eval_samples_per_second": 227.409, + "eval_steps_per_second": 1.228, + "step": 46458 + }, + { + "epoch": 2.079123410164784, + "grad_norm": 0.28815415501594543, + "learning_rate": 9.901817814673236e-05, + "loss": 1.4998, + "step": 46460 + }, + { + "epoch": 2.0796653513302963, + "grad_norm": 0.21010233461856842, + "learning_rate": 9.901488502047257e-05, + "loss": 1.5081, + "step": 46470 + }, + { + "epoch": 2.0802072924958086, + "grad_norm": 0.21125809848308563, + "learning_rate": 9.901158644183993e-05, + "loss": 1.5111, + "step": 46480 + }, + { + "epoch": 2.0807492336613205, + "grad_norm": 0.2528863251209259, + "learning_rate": 9.900828241124303e-05, + "loss": 1.4949, + "step": 46490 + }, + { + "epoch": 2.081291174826833, + "grad_norm": 0.5834783911705017, + "learning_rate": 9.900497292909119e-05, + "loss": 1.5182, + "step": 46500 + }, + { + "epoch": 2.081833115992345, + "grad_norm": 0.47388049960136414, + "learning_rate": 9.900165799579434e-05, + "loss": 1.4988, + "step": 46510 + }, + { + "epoch": 2.0823750571578574, + "grad_norm": 0.2034447342157364, + "learning_rate": 9.899833761176312e-05, + "loss": 1.4929, + "step": 46520 + }, + { + "epoch": 2.0829169983233697, + "grad_norm": 0.2005794793367386, + "learning_rate": 9.899501177740889e-05, + "loss": 1.4957, + "step": 46530 + }, + { + "epoch": 2.0834589394888816, + "grad_norm": 0.3143729269504547, + "learning_rate": 9.899168049314358e-05, + "loss": 1.4937, + "step": 46540 + }, + { + "epoch": 2.083729910071638, + "eval_loss": 2.4592928886413574, + "eval_runtime": 21.99, + "eval_samples_per_second": 227.376, + "eval_steps_per_second": 1.228, + "step": 46545 + }, + { + "epoch": 2.084000880654394, + "grad_norm": 0.3189544379711151, + "learning_rate": 9.898834375937991e-05, + "loss": 1.5108, + "step": 46550 + }, + { + "epoch": 2.0845428218199062, + "grad_norm": 0.4922240674495697, + "learning_rate": 9.898500157653118e-05, + "loss": 1.51, + "step": 46560 + }, + { + "epoch": 2.0850847629854186, + "grad_norm": 0.18532593548297882, + "learning_rate": 9.898165394501142e-05, + "loss": 1.5036, + "step": 46570 + }, + { + "epoch": 2.0856267041509304, + "grad_norm": 0.26253679394721985, + "learning_rate": 9.897830086523531e-05, + "loss": 1.5052, + "step": 46580 + }, + { + "epoch": 2.0861686453164427, + "grad_norm": 0.36160239577293396, + "learning_rate": 9.897494233761823e-05, + "loss": 1.5125, + "step": 46590 + }, + { + "epoch": 2.086710586481955, + "grad_norm": 0.25832414627075195, + "learning_rate": 9.897157836257621e-05, + "loss": 1.5106, + "step": 46600 + }, + { + "epoch": 2.0872525276474674, + "grad_norm": 0.32197335362434387, + "learning_rate": 9.896820894052598e-05, + "loss": 1.5029, + "step": 46610 + }, + { + "epoch": 2.0877944688129797, + "grad_norm": 0.18866315484046936, + "learning_rate": 9.896483407188492e-05, + "loss": 1.502, + "step": 46620 + }, + { + "epoch": 2.0883364099784916, + "grad_norm": 0.25076818466186523, + "learning_rate": 9.896145375707106e-05, + "loss": 1.5093, + "step": 46630 + }, + { + "epoch": 2.088444798211594, + "eval_loss": 2.461345911026001, + "eval_runtime": 26.7525, + "eval_samples_per_second": 186.898, + "eval_steps_per_second": 1.009, + "step": 46632 + }, + { + "epoch": 2.088878351144004, + "grad_norm": 0.20240730047225952, + "learning_rate": 9.89580679965032e-05, + "loss": 1.508, + "step": 46640 + }, + { + "epoch": 2.089420292309516, + "grad_norm": 0.2117396742105484, + "learning_rate": 9.895467679060071e-05, + "loss": 1.4989, + "step": 46650 + }, + { + "epoch": 2.0899622334750285, + "grad_norm": 0.3446267545223236, + "learning_rate": 9.89512801397837e-05, + "loss": 1.5041, + "step": 46660 + }, + { + "epoch": 2.0905041746405404, + "grad_norm": 0.22244545817375183, + "learning_rate": 9.89478780444729e-05, + "loss": 1.5003, + "step": 46670 + }, + { + "epoch": 2.0910461158060527, + "grad_norm": 0.18350328505039215, + "learning_rate": 9.894447050508981e-05, + "loss": 1.5068, + "step": 46680 + }, + { + "epoch": 2.091588056971565, + "grad_norm": 0.2232082635164261, + "learning_rate": 9.894105752205648e-05, + "loss": 1.5096, + "step": 46690 + }, + { + "epoch": 2.0921299981370773, + "grad_norm": 0.2229575365781784, + "learning_rate": 9.893763909579571e-05, + "loss": 1.4963, + "step": 46700 + }, + { + "epoch": 2.0926719393025897, + "grad_norm": 0.22202491760253906, + "learning_rate": 9.893421522673098e-05, + "loss": 1.4979, + "step": 46710 + }, + { + "epoch": 2.0931596863515503, + "eval_loss": 2.4592254161834717, + "eval_runtime": 22.1699, + "eval_samples_per_second": 225.531, + "eval_steps_per_second": 1.218, + "step": 46719 + }, + { + "epoch": 2.0932138804681015, + "grad_norm": 0.28266942501068115, + "learning_rate": 9.89307859152864e-05, + "loss": 1.5001, + "step": 46720 + }, + { + "epoch": 2.093755821633614, + "grad_norm": 0.24115784466266632, + "learning_rate": 9.89273511618868e-05, + "loss": 1.501, + "step": 46730 + }, + { + "epoch": 2.094297762799126, + "grad_norm": 0.3385097086429596, + "learning_rate": 9.892391096695766e-05, + "loss": 1.5128, + "step": 46740 + }, + { + "epoch": 2.0948397039646385, + "grad_norm": 0.2886834144592285, + "learning_rate": 9.89204653309251e-05, + "loss": 1.4963, + "step": 46750 + }, + { + "epoch": 2.0953816451301503, + "grad_norm": 0.40250498056411743, + "learning_rate": 9.891701425421599e-05, + "loss": 1.5119, + "step": 46760 + }, + { + "epoch": 2.0959235862956627, + "grad_norm": 0.20079436898231506, + "learning_rate": 9.891355773725783e-05, + "loss": 1.5029, + "step": 46770 + }, + { + "epoch": 2.096465527461175, + "grad_norm": 0.26996469497680664, + "learning_rate": 9.891009578047879e-05, + "loss": 1.4942, + "step": 46780 + }, + { + "epoch": 2.0970074686266873, + "grad_norm": 0.31331729888916016, + "learning_rate": 9.890662838430771e-05, + "loss": 1.5042, + "step": 46790 + }, + { + "epoch": 2.0975494097921996, + "grad_norm": 0.27219855785369873, + "learning_rate": 9.890315554917415e-05, + "loss": 1.5004, + "step": 46800 + }, + { + "epoch": 2.097874574491507, + "eval_loss": 2.4539170265197754, + "eval_runtime": 22.0178, + "eval_samples_per_second": 227.089, + "eval_steps_per_second": 1.226, + "step": 46806 + }, + { + "epoch": 2.0980913509577115, + "grad_norm": 0.2593035399913788, + "learning_rate": 9.889967727550829e-05, + "loss": 1.495, + "step": 46810 + }, + { + "epoch": 2.098633292123224, + "grad_norm": 0.219623863697052, + "learning_rate": 9.889619356374097e-05, + "loss": 1.5117, + "step": 46820 + }, + { + "epoch": 2.099175233288736, + "grad_norm": 0.3427010774612427, + "learning_rate": 9.88927044143038e-05, + "loss": 1.5115, + "step": 46830 + }, + { + "epoch": 2.0997171744542484, + "grad_norm": 0.2844060957431793, + "learning_rate": 9.888920982762895e-05, + "loss": 1.5038, + "step": 46840 + }, + { + "epoch": 2.1002591156197603, + "grad_norm": 0.6262246966362, + "learning_rate": 9.888570980414935e-05, + "loss": 1.4964, + "step": 46850 + }, + { + "epoch": 2.1008010567852726, + "grad_norm": 0.3818177878856659, + "learning_rate": 9.888220434429856e-05, + "loss": 1.499, + "step": 46860 + }, + { + "epoch": 2.101342997950785, + "grad_norm": 0.19796334207057953, + "learning_rate": 9.887869344851081e-05, + "loss": 1.5019, + "step": 46870 + }, + { + "epoch": 2.1018849391162973, + "grad_norm": 0.40566131472587585, + "learning_rate": 9.8875177117221e-05, + "loss": 1.4951, + "step": 46880 + }, + { + "epoch": 2.1024268802818096, + "grad_norm": 0.2360570877790451, + "learning_rate": 9.887165535086473e-05, + "loss": 1.501, + "step": 46890 + }, + { + "epoch": 2.102589462631463, + "eval_loss": 2.4633328914642334, + "eval_runtime": 22.3971, + "eval_samples_per_second": 223.243, + "eval_steps_per_second": 1.206, + "step": 46893 + }, + { + "epoch": 2.1029688214473214, + "grad_norm": 0.41195404529571533, + "learning_rate": 9.88681281498783e-05, + "loss": 1.5036, + "step": 46900 + }, + { + "epoch": 2.1035107626128338, + "grad_norm": 0.30758100748062134, + "learning_rate": 9.886459551469858e-05, + "loss": 1.5054, + "step": 46910 + }, + { + "epoch": 2.104052703778346, + "grad_norm": 0.40162351727485657, + "learning_rate": 9.886105744576322e-05, + "loss": 1.4949, + "step": 46920 + }, + { + "epoch": 2.1045946449438584, + "grad_norm": 0.32191726565361023, + "learning_rate": 9.88575139435105e-05, + "loss": 1.5065, + "step": 46930 + }, + { + "epoch": 2.1051365861093707, + "grad_norm": 0.5494133830070496, + "learning_rate": 9.885396500837934e-05, + "loss": 1.5051, + "step": 46940 + }, + { + "epoch": 2.1056785272748826, + "grad_norm": 0.23451875150203705, + "learning_rate": 9.88504106408094e-05, + "loss": 1.4936, + "step": 46950 + }, + { + "epoch": 2.106220468440395, + "grad_norm": 0.2888700067996979, + "learning_rate": 9.884685084124098e-05, + "loss": 1.4968, + "step": 46960 + }, + { + "epoch": 2.106762409605907, + "grad_norm": 0.24672222137451172, + "learning_rate": 9.884328561011503e-05, + "loss": 1.4998, + "step": 46970 + }, + { + "epoch": 2.1073043507714195, + "grad_norm": 0.2256728857755661, + "learning_rate": 9.883971494787322e-05, + "loss": 1.5046, + "step": 46980 + }, + { + "epoch": 2.1073043507714195, + "eval_loss": 2.471791982650757, + "eval_runtime": 21.9693, + "eval_samples_per_second": 227.591, + "eval_steps_per_second": 1.229, + "step": 46980 + }, + { + "epoch": 2.1078462919369314, + "grad_norm": 0.21322891116142273, + "learning_rate": 9.883613885495785e-05, + "loss": 1.5042, + "step": 46990 + }, + { + "epoch": 2.1083882331024437, + "grad_norm": 0.35615795850753784, + "learning_rate": 9.88325573318119e-05, + "loss": 1.4933, + "step": 47000 + }, + { + "epoch": 2.108930174267956, + "grad_norm": 0.3552355170249939, + "learning_rate": 9.882897037887907e-05, + "loss": 1.5039, + "step": 47010 + }, + { + "epoch": 2.1094721154334684, + "grad_norm": 0.6598255038261414, + "learning_rate": 9.882537799660368e-05, + "loss": 1.4987, + "step": 47020 + }, + { + "epoch": 2.1100140565989807, + "grad_norm": 0.22862312197685242, + "learning_rate": 9.882178018543071e-05, + "loss": 1.5196, + "step": 47030 + }, + { + "epoch": 2.1105559977644925, + "grad_norm": 0.3337612450122833, + "learning_rate": 9.881817694580588e-05, + "loss": 1.5018, + "step": 47040 + }, + { + "epoch": 2.111097938930005, + "grad_norm": 0.24085493385791779, + "learning_rate": 9.881456827817553e-05, + "loss": 1.5019, + "step": 47050 + }, + { + "epoch": 2.111639880095517, + "grad_norm": 0.20780259370803833, + "learning_rate": 9.881095418298668e-05, + "loss": 1.5074, + "step": 47060 + }, + { + "epoch": 2.1120192389113757, + "eval_loss": 2.4535341262817383, + "eval_runtime": 21.9889, + "eval_samples_per_second": 227.388, + "eval_steps_per_second": 1.228, + "step": 47067 + }, + { + "epoch": 2.1121818212610295, + "grad_norm": 0.41703495383262634, + "learning_rate": 9.880733466068704e-05, + "loss": 1.5105, + "step": 47070 + }, + { + "epoch": 2.1127237624265414, + "grad_norm": 0.3521629571914673, + "learning_rate": 9.880370971172497e-05, + "loss": 1.5062, + "step": 47080 + }, + { + "epoch": 2.1132657035920537, + "grad_norm": 0.3287181854248047, + "learning_rate": 9.880007933654953e-05, + "loss": 1.4956, + "step": 47090 + }, + { + "epoch": 2.113807644757566, + "grad_norm": 0.3003343939781189, + "learning_rate": 9.87964435356104e-05, + "loss": 1.5089, + "step": 47100 + }, + { + "epoch": 2.1143495859230783, + "grad_norm": 0.2599339187145233, + "learning_rate": 9.879280230935801e-05, + "loss": 1.4982, + "step": 47110 + }, + { + "epoch": 2.1148915270885906, + "grad_norm": 0.22060526907444, + "learning_rate": 9.878915565824341e-05, + "loss": 1.4989, + "step": 47120 + }, + { + "epoch": 2.1154334682541025, + "grad_norm": 0.28959226608276367, + "learning_rate": 9.878550358271829e-05, + "loss": 1.5042, + "step": 47130 + }, + { + "epoch": 2.115975409419615, + "grad_norm": 0.31336092948913574, + "learning_rate": 9.878184608323509e-05, + "loss": 1.4997, + "step": 47140 + }, + { + "epoch": 2.116517350585127, + "grad_norm": 0.23946624994277954, + "learning_rate": 9.877818316024689e-05, + "loss": 1.5056, + "step": 47150 + }, + { + "epoch": 2.116734127051332, + "eval_loss": 2.464033365249634, + "eval_runtime": 21.9877, + "eval_samples_per_second": 227.4, + "eval_steps_per_second": 1.228, + "step": 47154 + }, + { + "epoch": 2.1170592917506394, + "grad_norm": 0.23651756346225739, + "learning_rate": 9.877451481420742e-05, + "loss": 1.502, + "step": 47160 + }, + { + "epoch": 2.1176012329161518, + "grad_norm": 0.2244928926229477, + "learning_rate": 9.877084104557111e-05, + "loss": 1.5014, + "step": 47170 + }, + { + "epoch": 2.1181431740816636, + "grad_norm": 0.24885980784893036, + "learning_rate": 9.876716185479303e-05, + "loss": 1.5075, + "step": 47180 + }, + { + "epoch": 2.118685115247176, + "grad_norm": 0.22047527134418488, + "learning_rate": 9.876347724232898e-05, + "loss": 1.5042, + "step": 47190 + }, + { + "epoch": 2.1192270564126883, + "grad_norm": 0.2784343659877777, + "learning_rate": 9.875978720863536e-05, + "loss": 1.5095, + "step": 47200 + }, + { + "epoch": 2.1197689975782006, + "grad_norm": 0.41502806544303894, + "learning_rate": 9.875609175416929e-05, + "loss": 1.498, + "step": 47210 + }, + { + "epoch": 2.1203109387437125, + "grad_norm": 0.23642586171627045, + "learning_rate": 9.875239087938853e-05, + "loss": 1.4923, + "step": 47220 + }, + { + "epoch": 2.1208528799092248, + "grad_norm": 0.3005571663379669, + "learning_rate": 9.874868458475155e-05, + "loss": 1.5045, + "step": 47230 + }, + { + "epoch": 2.121394821074737, + "grad_norm": 0.2985764443874359, + "learning_rate": 9.874497287071747e-05, + "loss": 1.5013, + "step": 47240 + }, + { + "epoch": 2.1214490151912884, + "eval_loss": 2.4620718955993652, + "eval_runtime": 22.3871, + "eval_samples_per_second": 223.343, + "eval_steps_per_second": 1.206, + "step": 47241 + }, + { + "epoch": 2.1219367622402494, + "grad_norm": 0.3452117443084717, + "learning_rate": 9.874125573774608e-05, + "loss": 1.5131, + "step": 47250 + }, + { + "epoch": 2.1224787034057617, + "grad_norm": 0.33107510209083557, + "learning_rate": 9.873753318629781e-05, + "loss": 1.4932, + "step": 47260 + }, + { + "epoch": 2.1230206445712736, + "grad_norm": 0.20669598877429962, + "learning_rate": 9.873380521683383e-05, + "loss": 1.5066, + "step": 47270 + }, + { + "epoch": 2.123562585736786, + "grad_norm": 0.2573641538619995, + "learning_rate": 9.873007182981592e-05, + "loss": 1.5003, + "step": 47280 + }, + { + "epoch": 2.1241045269022982, + "grad_norm": 0.2514735460281372, + "learning_rate": 9.87263330257066e-05, + "loss": 1.4996, + "step": 47290 + }, + { + "epoch": 2.1246464680678105, + "grad_norm": 0.7629513740539551, + "learning_rate": 9.872258880496896e-05, + "loss": 1.5085, + "step": 47300 + }, + { + "epoch": 2.1251884092333224, + "grad_norm": 0.2581771910190582, + "learning_rate": 9.871883916806686e-05, + "loss": 1.5017, + "step": 47310 + }, + { + "epoch": 2.1257303503988347, + "grad_norm": 0.22503337264060974, + "learning_rate": 9.871508411546475e-05, + "loss": 1.4997, + "step": 47320 + }, + { + "epoch": 2.1261639033312445, + "eval_loss": 2.4661858081817627, + "eval_runtime": 25.2283, + "eval_samples_per_second": 198.19, + "eval_steps_per_second": 1.07, + "step": 47328 + }, + { + "epoch": 2.126272291564347, + "grad_norm": 0.3019942045211792, + "learning_rate": 9.871132364762782e-05, + "loss": 1.5063, + "step": 47330 + }, + { + "epoch": 2.1268142327298594, + "grad_norm": 0.3621060848236084, + "learning_rate": 9.87075577650219e-05, + "loss": 1.4999, + "step": 47340 + }, + { + "epoch": 2.1273561738953717, + "grad_norm": 0.18868303298950195, + "learning_rate": 9.870378646811347e-05, + "loss": 1.5056, + "step": 47350 + }, + { + "epoch": 2.1278981150608836, + "grad_norm": 0.20302945375442505, + "learning_rate": 9.870000975736974e-05, + "loss": 1.5063, + "step": 47360 + }, + { + "epoch": 2.128440056226396, + "grad_norm": 0.41298550367355347, + "learning_rate": 9.86962276332585e-05, + "loss": 1.5067, + "step": 47370 + }, + { + "epoch": 2.128981997391908, + "grad_norm": 0.25545287132263184, + "learning_rate": 9.869244009624831e-05, + "loss": 1.4994, + "step": 47380 + }, + { + "epoch": 2.1295239385574205, + "grad_norm": 0.33961889147758484, + "learning_rate": 9.868864714680832e-05, + "loss": 1.5066, + "step": 47390 + }, + { + "epoch": 2.1300658797229324, + "grad_norm": 0.29086834192276, + "learning_rate": 9.868484878540842e-05, + "loss": 1.505, + "step": 47400 + }, + { + "epoch": 2.1306078208884447, + "grad_norm": 0.2222570925951004, + "learning_rate": 9.868104501251909e-05, + "loss": 1.4965, + "step": 47410 + }, + { + "epoch": 2.130878791471201, + "eval_loss": 2.467705249786377, + "eval_runtime": 22.0169, + "eval_samples_per_second": 227.099, + "eval_steps_per_second": 1.226, + "step": 47415 + }, + { + "epoch": 2.131149762053957, + "grad_norm": 0.25193503499031067, + "learning_rate": 9.867723582861155e-05, + "loss": 1.5039, + "step": 47420 + }, + { + "epoch": 2.1316917032194693, + "grad_norm": 0.2993335723876953, + "learning_rate": 9.867342123415768e-05, + "loss": 1.5052, + "step": 47430 + }, + { + "epoch": 2.1322336443849816, + "grad_norm": 0.2402217835187912, + "learning_rate": 9.866960122962998e-05, + "loss": 1.4947, + "step": 47440 + }, + { + "epoch": 2.1327755855504935, + "grad_norm": 0.47997650504112244, + "learning_rate": 9.866577581550169e-05, + "loss": 1.5065, + "step": 47450 + }, + { + "epoch": 2.133317526716006, + "grad_norm": 0.3690037429332733, + "learning_rate": 9.866194499224665e-05, + "loss": 1.5024, + "step": 47460 + }, + { + "epoch": 2.133859467881518, + "grad_norm": 0.2865743935108185, + "learning_rate": 9.865810876033946e-05, + "loss": 1.4975, + "step": 47470 + }, + { + "epoch": 2.1344014090470305, + "grad_norm": 0.2986805737018585, + "learning_rate": 9.865426712025527e-05, + "loss": 1.4964, + "step": 47480 + }, + { + "epoch": 2.1349433502125423, + "grad_norm": 0.44685614109039307, + "learning_rate": 9.865042007247001e-05, + "loss": 1.4987, + "step": 47490 + }, + { + "epoch": 2.1354852913780547, + "grad_norm": 0.26483583450317383, + "learning_rate": 9.864656761746021e-05, + "loss": 1.4933, + "step": 47500 + }, + { + "epoch": 2.135593679611157, + "eval_loss": 2.4674875736236572, + "eval_runtime": 22.0484, + "eval_samples_per_second": 226.774, + "eval_steps_per_second": 1.225, + "step": 47502 + }, + { + "epoch": 2.136027232543567, + "grad_norm": 0.2297901213169098, + "learning_rate": 9.864270975570313e-05, + "loss": 1.5029, + "step": 47510 + }, + { + "epoch": 2.1365691737090793, + "grad_norm": 0.251094251871109, + "learning_rate": 9.863884648767662e-05, + "loss": 1.5057, + "step": 47520 + }, + { + "epoch": 2.1371111148745916, + "grad_norm": 0.36560431122779846, + "learning_rate": 9.863497781385928e-05, + "loss": 1.4962, + "step": 47530 + }, + { + "epoch": 2.1376530560401035, + "grad_norm": 0.3538174033164978, + "learning_rate": 9.863110373473033e-05, + "loss": 1.4831, + "step": 47540 + }, + { + "epoch": 2.138194997205616, + "grad_norm": 0.20020203292369843, + "learning_rate": 9.862722425076968e-05, + "loss": 1.5092, + "step": 47550 + }, + { + "epoch": 2.138736938371128, + "grad_norm": 0.3080503046512604, + "learning_rate": 9.862333936245789e-05, + "loss": 1.5111, + "step": 47560 + }, + { + "epoch": 2.1392788795366404, + "grad_norm": 0.3943610191345215, + "learning_rate": 9.861944907027624e-05, + "loss": 1.5024, + "step": 47570 + }, + { + "epoch": 2.1398208207021527, + "grad_norm": 0.25787243247032166, + "learning_rate": 9.861555337470658e-05, + "loss": 1.4955, + "step": 47580 + }, + { + "epoch": 2.1403085677511133, + "eval_loss": 2.4601619243621826, + "eval_runtime": 22.0199, + "eval_samples_per_second": 227.067, + "eval_steps_per_second": 1.226, + "step": 47589 + }, + { + "epoch": 2.1403627618676646, + "grad_norm": 0.2594373822212219, + "learning_rate": 9.861165227623154e-05, + "loss": 1.5006, + "step": 47590 + }, + { + "epoch": 2.140904703033177, + "grad_norm": 0.1998678594827652, + "learning_rate": 9.860774577533438e-05, + "loss": 1.493, + "step": 47600 + }, + { + "epoch": 2.1414466441986892, + "grad_norm": 0.30177435278892517, + "learning_rate": 9.860383387249897e-05, + "loss": 1.5081, + "step": 47610 + }, + { + "epoch": 2.1419885853642016, + "grad_norm": 0.3916855752468109, + "learning_rate": 9.859991656820994e-05, + "loss": 1.4935, + "step": 47620 + }, + { + "epoch": 2.1425305265297134, + "grad_norm": 0.3234284520149231, + "learning_rate": 9.859599386295255e-05, + "loss": 1.5059, + "step": 47630 + }, + { + "epoch": 2.1430724676952257, + "grad_norm": 0.4442438781261444, + "learning_rate": 9.85920657572127e-05, + "loss": 1.5093, + "step": 47640 + }, + { + "epoch": 2.143614408860738, + "grad_norm": 0.194505512714386, + "learning_rate": 9.858813225147702e-05, + "loss": 1.4962, + "step": 47650 + }, + { + "epoch": 2.1441563500262504, + "grad_norm": 0.31255653500556946, + "learning_rate": 9.858419334623273e-05, + "loss": 1.5066, + "step": 47660 + }, + { + "epoch": 2.1446982911917627, + "grad_norm": 0.2588372230529785, + "learning_rate": 9.858024904196782e-05, + "loss": 1.502, + "step": 47670 + }, + { + "epoch": 2.14502345589107, + "eval_loss": 2.4643874168395996, + "eval_runtime": 21.985, + "eval_samples_per_second": 227.428, + "eval_steps_per_second": 1.228, + "step": 47676 + }, + { + "epoch": 2.1452402323572746, + "grad_norm": 0.27704402804374695, + "learning_rate": 9.857629933917084e-05, + "loss": 1.5025, + "step": 47680 + }, + { + "epoch": 2.145782173522787, + "grad_norm": 0.34092339873313904, + "learning_rate": 9.857234423833111e-05, + "loss": 1.4959, + "step": 47690 + }, + { + "epoch": 2.146324114688299, + "grad_norm": 0.37414854764938354, + "learning_rate": 9.856838373993852e-05, + "loss": 1.494, + "step": 47700 + }, + { + "epoch": 2.1468660558538115, + "grad_norm": 0.19338767230510712, + "learning_rate": 9.856441784448373e-05, + "loss": 1.5082, + "step": 47710 + }, + { + "epoch": 2.1474079970193234, + "grad_norm": 0.23796340823173523, + "learning_rate": 9.8560446552458e-05, + "loss": 1.4824, + "step": 47720 + }, + { + "epoch": 2.1479499381848357, + "grad_norm": 0.3311995565891266, + "learning_rate": 9.855646986435325e-05, + "loss": 1.5069, + "step": 47730 + }, + { + "epoch": 2.148491879350348, + "grad_norm": 0.2240724116563797, + "learning_rate": 9.855248778066212e-05, + "loss": 1.4987, + "step": 47740 + }, + { + "epoch": 2.1490338205158603, + "grad_norm": 0.22334522008895874, + "learning_rate": 9.854850030187791e-05, + "loss": 1.4981, + "step": 47750 + }, + { + "epoch": 2.1495757616813727, + "grad_norm": 0.21262875199317932, + "learning_rate": 9.854450742849452e-05, + "loss": 1.5029, + "step": 47760 + }, + { + "epoch": 2.149738344031026, + "eval_loss": 2.4613311290740967, + "eval_runtime": 21.9832, + "eval_samples_per_second": 227.447, + "eval_steps_per_second": 1.228, + "step": 47763 + }, + { + "epoch": 2.1501177028468845, + "grad_norm": 0.2315114289522171, + "learning_rate": 9.854050916100664e-05, + "loss": 1.5089, + "step": 47770 + }, + { + "epoch": 2.150659644012397, + "grad_norm": 0.34692510962486267, + "learning_rate": 9.853650549990949e-05, + "loss": 1.488, + "step": 47780 + }, + { + "epoch": 2.151201585177909, + "grad_norm": 0.2978134751319885, + "learning_rate": 9.853249644569907e-05, + "loss": 1.5069, + "step": 47790 + }, + { + "epoch": 2.1517435263434215, + "grad_norm": 0.2450045347213745, + "learning_rate": 9.852848199887198e-05, + "loss": 1.4962, + "step": 47800 + }, + { + "epoch": 2.152285467508934, + "grad_norm": 0.27408814430236816, + "learning_rate": 9.852446215992552e-05, + "loss": 1.5035, + "step": 47810 + }, + { + "epoch": 2.1528274086744457, + "grad_norm": 0.288369745016098, + "learning_rate": 9.852043692935766e-05, + "loss": 1.5054, + "step": 47820 + }, + { + "epoch": 2.153369349839958, + "grad_norm": 0.33964666724205017, + "learning_rate": 9.851640630766703e-05, + "loss": 1.5055, + "step": 47830 + }, + { + "epoch": 2.1539112910054703, + "grad_norm": 0.3584079444408417, + "learning_rate": 9.851237029535289e-05, + "loss": 1.4909, + "step": 47840 + }, + { + "epoch": 2.1544532321709826, + "grad_norm": 0.3552767336368561, + "learning_rate": 9.850832889291525e-05, + "loss": 1.4932, + "step": 47850 + }, + { + "epoch": 2.1544532321709826, + "eval_loss": 2.4597816467285156, + "eval_runtime": 21.9841, + "eval_samples_per_second": 227.437, + "eval_steps_per_second": 1.228, + "step": 47850 + }, + { + "epoch": 2.1549951733364945, + "grad_norm": 0.24502573907375336, + "learning_rate": 9.850428210085471e-05, + "loss": 1.4988, + "step": 47860 + }, + { + "epoch": 2.155537114502007, + "grad_norm": 0.26566943526268005, + "learning_rate": 9.850022991967258e-05, + "loss": 1.5044, + "step": 47870 + }, + { + "epoch": 2.156079055667519, + "grad_norm": 0.22309494018554688, + "learning_rate": 9.849617234987083e-05, + "loss": 1.5052, + "step": 47880 + }, + { + "epoch": 2.1566209968330314, + "grad_norm": 0.3310026526451111, + "learning_rate": 9.849210939195209e-05, + "loss": 1.5026, + "step": 47890 + }, + { + "epoch": 2.1571629379985433, + "grad_norm": 0.24902848899364471, + "learning_rate": 9.848804104641966e-05, + "loss": 1.5058, + "step": 47900 + }, + { + "epoch": 2.1577048791640556, + "grad_norm": 0.21098916232585907, + "learning_rate": 9.848396731377751e-05, + "loss": 1.5041, + "step": 47910 + }, + { + "epoch": 2.158246820329568, + "grad_norm": 0.23908697068691254, + "learning_rate": 9.84798881945303e-05, + "loss": 1.5018, + "step": 47920 + }, + { + "epoch": 2.1587887614950803, + "grad_norm": 0.41047078371047974, + "learning_rate": 9.847580368918329e-05, + "loss": 1.5021, + "step": 47930 + }, + { + "epoch": 2.1591681203109387, + "eval_loss": 2.4657371044158936, + "eval_runtime": 21.9884, + "eval_samples_per_second": 227.392, + "eval_steps_per_second": 1.228, + "step": 47937 + }, + { + "epoch": 2.1593307026605926, + "grad_norm": 0.4476509988307953, + "learning_rate": 9.847171379824248e-05, + "loss": 1.5102, + "step": 47940 + }, + { + "epoch": 2.1598726438261044, + "grad_norm": 0.43281930685043335, + "learning_rate": 9.846761852221449e-05, + "loss": 1.4999, + "step": 47950 + }, + { + "epoch": 2.1604145849916168, + "grad_norm": 0.20760992169380188, + "learning_rate": 9.846351786160665e-05, + "loss": 1.5029, + "step": 47960 + }, + { + "epoch": 2.160956526157129, + "grad_norm": 0.24444232881069183, + "learning_rate": 9.84594118169269e-05, + "loss": 1.5091, + "step": 47970 + }, + { + "epoch": 2.1614984673226414, + "grad_norm": 0.25241169333457947, + "learning_rate": 9.84553003886839e-05, + "loss": 1.5019, + "step": 47980 + }, + { + "epoch": 2.1620404084881537, + "grad_norm": 0.26885291934013367, + "learning_rate": 9.845118357738696e-05, + "loss": 1.5068, + "step": 47990 + }, + { + "epoch": 2.1625823496536656, + "grad_norm": 0.26971253752708435, + "learning_rate": 9.844706138354603e-05, + "loss": 1.5013, + "step": 48000 + }, + { + "epoch": 2.163124290819178, + "grad_norm": 0.20452235639095306, + "learning_rate": 9.844293380767178e-05, + "loss": 1.4947, + "step": 48010 + }, + { + "epoch": 2.16366623198469, + "grad_norm": 0.19715271890163422, + "learning_rate": 9.843880085027551e-05, + "loss": 1.496, + "step": 48020 + }, + { + "epoch": 2.163883008450895, + "eval_loss": 2.4602131843566895, + "eval_runtime": 21.9834, + "eval_samples_per_second": 227.444, + "eval_steps_per_second": 1.228, + "step": 48024 + }, + { + "epoch": 2.1642081731502025, + "grad_norm": 0.30456554889678955, + "learning_rate": 9.843466251186916e-05, + "loss": 1.5103, + "step": 48030 + }, + { + "epoch": 2.1647501143157144, + "grad_norm": 0.24360065162181854, + "learning_rate": 9.843051879296539e-05, + "loss": 1.5072, + "step": 48040 + }, + { + "epoch": 2.1652920554812267, + "grad_norm": 0.2575078308582306, + "learning_rate": 9.842636969407753e-05, + "loss": 1.5027, + "step": 48050 + }, + { + "epoch": 2.165833996646739, + "grad_norm": 0.1880762130022049, + "learning_rate": 9.842221521571951e-05, + "loss": 1.4936, + "step": 48060 + }, + { + "epoch": 2.1663759378122514, + "grad_norm": 0.24595960974693298, + "learning_rate": 9.841805535840602e-05, + "loss": 1.501, + "step": 48070 + }, + { + "epoch": 2.1669178789777637, + "grad_norm": 0.21159407496452332, + "learning_rate": 9.841389012265231e-05, + "loss": 1.4968, + "step": 48080 + }, + { + "epoch": 2.1674598201432755, + "grad_norm": 0.4702402353286743, + "learning_rate": 9.840971950897437e-05, + "loss": 1.4929, + "step": 48090 + }, + { + "epoch": 2.168001761308788, + "grad_norm": 0.23953235149383545, + "learning_rate": 9.840554351788885e-05, + "loss": 1.4948, + "step": 48100 + }, + { + "epoch": 2.1685437024743, + "grad_norm": 0.21100491285324097, + "learning_rate": 9.840136214991305e-05, + "loss": 1.4927, + "step": 48110 + }, + { + "epoch": 2.1685978965908514, + "eval_loss": 2.4579579830169678, + "eval_runtime": 21.9856, + "eval_samples_per_second": 227.422, + "eval_steps_per_second": 1.228, + "step": 48111 + }, + { + "epoch": 2.1690856436398125, + "grad_norm": 0.2241096794605255, + "learning_rate": 9.839717540556495e-05, + "loss": 1.4958, + "step": 48120 + }, + { + "epoch": 2.1696275848053244, + "grad_norm": 0.19508574903011322, + "learning_rate": 9.839298328536316e-05, + "loss": 1.4886, + "step": 48130 + }, + { + "epoch": 2.1701695259708367, + "grad_norm": 0.3244903087615967, + "learning_rate": 9.838878578982699e-05, + "loss": 1.4895, + "step": 48140 + }, + { + "epoch": 2.170711467136349, + "grad_norm": 0.3189353346824646, + "learning_rate": 9.838458291947641e-05, + "loss": 1.4833, + "step": 48150 + }, + { + "epoch": 2.1712534083018613, + "grad_norm": 0.20821264386177063, + "learning_rate": 9.838037467483207e-05, + "loss": 1.498, + "step": 48160 + }, + { + "epoch": 2.1717953494673736, + "grad_norm": 0.21814101934432983, + "learning_rate": 9.837616105641523e-05, + "loss": 1.4939, + "step": 48170 + }, + { + "epoch": 2.1723372906328855, + "grad_norm": 0.2125815451145172, + "learning_rate": 9.837194206474789e-05, + "loss": 1.4896, + "step": 48180 + }, + { + "epoch": 2.172879231798398, + "grad_norm": 0.20504556596279144, + "learning_rate": 9.83677177003527e-05, + "loss": 1.498, + "step": 48190 + }, + { + "epoch": 2.1733127847308076, + "eval_loss": 2.4657247066497803, + "eval_runtime": 22.1285, + "eval_samples_per_second": 225.953, + "eval_steps_per_second": 1.22, + "step": 48198 + }, + { + "epoch": 2.17342117296391, + "grad_norm": 0.21012401580810547, + "learning_rate": 9.836348796375288e-05, + "loss": 1.4979, + "step": 48200 + }, + { + "epoch": 2.1739631141294224, + "grad_norm": 0.42915037274360657, + "learning_rate": 9.835925285547245e-05, + "loss": 1.4883, + "step": 48210 + }, + { + "epoch": 2.1745050552949348, + "grad_norm": 0.350690633058548, + "learning_rate": 9.835501237603603e-05, + "loss": 1.5056, + "step": 48220 + }, + { + "epoch": 2.1750469964604466, + "grad_norm": 0.23836711049079895, + "learning_rate": 9.835076652596891e-05, + "loss": 1.5043, + "step": 48230 + }, + { + "epoch": 2.175588937625959, + "grad_norm": 0.4392627477645874, + "learning_rate": 9.834651530579703e-05, + "loss": 1.489, + "step": 48240 + }, + { + "epoch": 2.1761308787914713, + "grad_norm": 0.1950349509716034, + "learning_rate": 9.834225871604701e-05, + "loss": 1.4914, + "step": 48250 + }, + { + "epoch": 2.1766728199569836, + "grad_norm": 0.36042729020118713, + "learning_rate": 9.833799675724619e-05, + "loss": 1.5052, + "step": 48260 + }, + { + "epoch": 2.1772147611224955, + "grad_norm": 0.3218774199485779, + "learning_rate": 9.833372942992248e-05, + "loss": 1.4963, + "step": 48270 + }, + { + "epoch": 2.1777567022880078, + "grad_norm": 0.17845819890499115, + "learning_rate": 9.832945673460448e-05, + "loss": 1.494, + "step": 48280 + }, + { + "epoch": 2.178027672870764, + "eval_loss": 2.4591383934020996, + "eval_runtime": 22.2137, + "eval_samples_per_second": 225.086, + "eval_steps_per_second": 1.215, + "step": 48285 + }, + { + "epoch": 2.17829864345352, + "grad_norm": 0.36038216948509216, + "learning_rate": 9.832517867182151e-05, + "loss": 1.4997, + "step": 48290 + }, + { + "epoch": 2.1788405846190324, + "grad_norm": 0.2766604721546173, + "learning_rate": 9.832089524210352e-05, + "loss": 1.4955, + "step": 48300 + }, + { + "epoch": 2.1793825257845447, + "grad_norm": 0.2935325801372528, + "learning_rate": 9.831660644598109e-05, + "loss": 1.5001, + "step": 48310 + }, + { + "epoch": 2.1799244669500566, + "grad_norm": 0.3200201094150543, + "learning_rate": 9.831231228398553e-05, + "loss": 1.4945, + "step": 48320 + }, + { + "epoch": 2.180466408115569, + "grad_norm": 0.27418839931488037, + "learning_rate": 9.830801275664876e-05, + "loss": 1.5014, + "step": 48330 + }, + { + "epoch": 2.1810083492810812, + "grad_norm": 0.26256710290908813, + "learning_rate": 9.830370786450339e-05, + "loss": 1.5014, + "step": 48340 + }, + { + "epoch": 2.1815502904465935, + "grad_norm": 0.4114410877227783, + "learning_rate": 9.82993976080827e-05, + "loss": 1.51, + "step": 48350 + }, + { + "epoch": 2.1820922316121054, + "grad_norm": 0.3471300005912781, + "learning_rate": 9.829508198792062e-05, + "loss": 1.5092, + "step": 48360 + }, + { + "epoch": 2.1826341727776177, + "grad_norm": 0.356444776058197, + "learning_rate": 9.829076100455176e-05, + "loss": 1.4959, + "step": 48370 + }, + { + "epoch": 2.1827425610107203, + "eval_loss": 2.4686012268066406, + "eval_runtime": 22.0175, + "eval_samples_per_second": 227.092, + "eval_steps_per_second": 1.226, + "step": 48372 + }, + { + "epoch": 2.18317611394313, + "grad_norm": 0.2611772418022156, + "learning_rate": 9.828643465851137e-05, + "loss": 1.4966, + "step": 48380 + }, + { + "epoch": 2.1837180551086424, + "grad_norm": 0.43987488746643066, + "learning_rate": 9.828210295033537e-05, + "loss": 1.4965, + "step": 48390 + }, + { + "epoch": 2.1842599962741547, + "grad_norm": 0.24238485097885132, + "learning_rate": 9.82777658805604e-05, + "loss": 1.4908, + "step": 48400 + }, + { + "epoch": 2.1848019374396666, + "grad_norm": 0.2008477747440338, + "learning_rate": 9.827342344972366e-05, + "loss": 1.4946, + "step": 48410 + }, + { + "epoch": 2.185343878605179, + "grad_norm": 0.21971365809440613, + "learning_rate": 9.826907565836311e-05, + "loss": 1.4978, + "step": 48420 + }, + { + "epoch": 2.185885819770691, + "grad_norm": 0.35851284861564636, + "learning_rate": 9.826472250701732e-05, + "loss": 1.4988, + "step": 48430 + }, + { + "epoch": 2.1864277609362035, + "grad_norm": 0.24326647818088531, + "learning_rate": 9.826036399622553e-05, + "loss": 1.4949, + "step": 48440 + }, + { + "epoch": 2.186969702101716, + "grad_norm": 0.23622386157512665, + "learning_rate": 9.82560001265277e-05, + "loss": 1.4959, + "step": 48450 + }, + { + "epoch": 2.1874574491506764, + "eval_loss": 2.4709348678588867, + "eval_runtime": 22.0771, + "eval_samples_per_second": 226.479, + "eval_steps_per_second": 1.223, + "step": 48459 + }, + { + "epoch": 2.1875116432672277, + "grad_norm": 0.22727850079536438, + "learning_rate": 9.825163089846434e-05, + "loss": 1.4841, + "step": 48460 + }, + { + "epoch": 2.18805358443274, + "grad_norm": 0.291652649641037, + "learning_rate": 9.824725631257674e-05, + "loss": 1.4959, + "step": 48470 + }, + { + "epoch": 2.1885955255982523, + "grad_norm": 0.27845892310142517, + "learning_rate": 9.824287636940678e-05, + "loss": 1.4919, + "step": 48480 + }, + { + "epoch": 2.1891374667637646, + "grad_norm": 0.3905391991138458, + "learning_rate": 9.823849106949704e-05, + "loss": 1.4948, + "step": 48490 + }, + { + "epoch": 2.1896794079292765, + "grad_norm": 0.2136894315481186, + "learning_rate": 9.823410041339075e-05, + "loss": 1.4967, + "step": 48500 + }, + { + "epoch": 2.190221349094789, + "grad_norm": 0.3162279725074768, + "learning_rate": 9.82297044016318e-05, + "loss": 1.4927, + "step": 48510 + }, + { + "epoch": 2.190763290260301, + "grad_norm": 0.24722278118133545, + "learning_rate": 9.822530303476476e-05, + "loss": 1.4965, + "step": 48520 + }, + { + "epoch": 2.1913052314258135, + "grad_norm": 0.2130030393600464, + "learning_rate": 9.822089631333484e-05, + "loss": 1.5057, + "step": 48530 + }, + { + "epoch": 2.1918471725913253, + "grad_norm": 0.2408854067325592, + "learning_rate": 9.821648423788794e-05, + "loss": 1.5172, + "step": 48540 + }, + { + "epoch": 2.192172337290633, + "eval_loss": 2.4656124114990234, + "eval_runtime": 22.0104, + "eval_samples_per_second": 227.165, + "eval_steps_per_second": 1.227, + "step": 48546 + }, + { + "epoch": 2.1923891137568376, + "grad_norm": 0.21235163509845734, + "learning_rate": 9.821206680897059e-05, + "loss": 1.4993, + "step": 48550 + }, + { + "epoch": 2.19293105492235, + "grad_norm": 0.365730345249176, + "learning_rate": 9.820764402713002e-05, + "loss": 1.4979, + "step": 48560 + }, + { + "epoch": 2.1934729960878623, + "grad_norm": 0.26309072971343994, + "learning_rate": 9.82032158929141e-05, + "loss": 1.4962, + "step": 48570 + }, + { + "epoch": 2.1940149372533746, + "grad_norm": 0.3084958493709564, + "learning_rate": 9.819878240687136e-05, + "loss": 1.4912, + "step": 48580 + }, + { + "epoch": 2.1945568784188865, + "grad_norm": 0.2697133719921112, + "learning_rate": 9.8194343569551e-05, + "loss": 1.4936, + "step": 48590 + }, + { + "epoch": 2.195098819584399, + "grad_norm": 0.31936559081077576, + "learning_rate": 9.81898993815029e-05, + "loss": 1.4938, + "step": 48600 + }, + { + "epoch": 2.195640760749911, + "grad_norm": 0.2950705289840698, + "learning_rate": 9.818544984327756e-05, + "loss": 1.501, + "step": 48610 + }, + { + "epoch": 2.1961827019154234, + "grad_norm": 0.49501827359199524, + "learning_rate": 9.818099495542619e-05, + "loss": 1.5048, + "step": 48620 + }, + { + "epoch": 2.1967246430809357, + "grad_norm": 0.22042915225028992, + "learning_rate": 9.817653471850066e-05, + "loss": 1.4881, + "step": 48630 + }, + { + "epoch": 2.196887225430589, + "eval_loss": 2.4624252319335938, + "eval_runtime": 22.0712, + "eval_samples_per_second": 226.54, + "eval_steps_per_second": 1.223, + "step": 48633 + }, + { + "epoch": 2.1972665842464476, + "grad_norm": 0.26132088899612427, + "learning_rate": 9.817206913305344e-05, + "loss": 1.497, + "step": 48640 + }, + { + "epoch": 2.19780852541196, + "grad_norm": 0.19975581765174866, + "learning_rate": 9.816759819963773e-05, + "loss": 1.483, + "step": 48650 + }, + { + "epoch": 2.1983504665774722, + "grad_norm": 0.2904106080532074, + "learning_rate": 9.816312191880738e-05, + "loss": 1.4892, + "step": 48660 + }, + { + "epoch": 2.1988924077429846, + "grad_norm": 0.2334350049495697, + "learning_rate": 9.815864029111688e-05, + "loss": 1.4904, + "step": 48670 + }, + { + "epoch": 2.1994343489084964, + "grad_norm": 0.24541063606739044, + "learning_rate": 9.81541533171214e-05, + "loss": 1.4901, + "step": 48680 + }, + { + "epoch": 2.1999762900740087, + "grad_norm": 0.1976650059223175, + "learning_rate": 9.814966099737676e-05, + "loss": 1.479, + "step": 48690 + }, + { + "epoch": 2.200518231239521, + "grad_norm": 0.22315511107444763, + "learning_rate": 9.814516333243945e-05, + "loss": 1.5053, + "step": 48700 + }, + { + "epoch": 2.2010601724050334, + "grad_norm": 0.3041779100894928, + "learning_rate": 9.814066032286664e-05, + "loss": 1.4908, + "step": 48710 + }, + { + "epoch": 2.2016021135705457, + "grad_norm": 0.3959655463695526, + "learning_rate": 9.813615196921611e-05, + "loss": 1.497, + "step": 48720 + }, + { + "epoch": 2.2016021135705457, + "eval_loss": 2.467682361602783, + "eval_runtime": 21.8632, + "eval_samples_per_second": 228.695, + "eval_steps_per_second": 1.235, + "step": 48720 + }, + { + "epoch": 2.2021440547360576, + "grad_norm": 0.2074437290430069, + "learning_rate": 9.813163827204634e-05, + "loss": 1.4933, + "step": 48730 + }, + { + "epoch": 2.20268599590157, + "grad_norm": 0.24793848395347595, + "learning_rate": 9.812711923191651e-05, + "loss": 1.5001, + "step": 48740 + }, + { + "epoch": 2.203227937067082, + "grad_norm": 0.2648943364620209, + "learning_rate": 9.812259484938638e-05, + "loss": 1.4939, + "step": 48750 + }, + { + "epoch": 2.2037698782325945, + "grad_norm": 0.4681974947452545, + "learning_rate": 9.811806512501641e-05, + "loss": 1.4988, + "step": 48760 + }, + { + "epoch": 2.2043118193981064, + "grad_norm": 0.30370041728019714, + "learning_rate": 9.811353005936774e-05, + "loss": 1.4922, + "step": 48770 + }, + { + "epoch": 2.2048537605636187, + "grad_norm": 0.26227137446403503, + "learning_rate": 9.810898965300213e-05, + "loss": 1.4986, + "step": 48780 + }, + { + "epoch": 2.205395701729131, + "grad_norm": 0.3216424286365509, + "learning_rate": 9.810444390648207e-05, + "loss": 1.493, + "step": 48790 + }, + { + "epoch": 2.2059376428946433, + "grad_norm": 0.19285954535007477, + "learning_rate": 9.809989282037062e-05, + "loss": 1.4856, + "step": 48800 + }, + { + "epoch": 2.206317001710502, + "eval_loss": 2.4649438858032227, + "eval_runtime": 22.0562, + "eval_samples_per_second": 226.694, + "eval_steps_per_second": 1.224, + "step": 48807 + }, + { + "epoch": 2.2064795840601557, + "grad_norm": 0.5123102068901062, + "learning_rate": 9.809533639523156e-05, + "loss": 1.494, + "step": 48810 + }, + { + "epoch": 2.2070215252256675, + "grad_norm": 0.23556865751743317, + "learning_rate": 9.809077463162935e-05, + "loss": 1.4987, + "step": 48820 + }, + { + "epoch": 2.20756346639118, + "grad_norm": 0.25429365038871765, + "learning_rate": 9.808620753012906e-05, + "loss": 1.492, + "step": 48830 + }, + { + "epoch": 2.208105407556692, + "grad_norm": 0.2279432713985443, + "learning_rate": 9.808163509129643e-05, + "loss": 1.4876, + "step": 48840 + }, + { + "epoch": 2.2086473487222045, + "grad_norm": 0.23476704955101013, + "learning_rate": 9.807705731569789e-05, + "loss": 1.4923, + "step": 48850 + }, + { + "epoch": 2.209189289887717, + "grad_norm": 0.19318480789661407, + "learning_rate": 9.80724742039005e-05, + "loss": 1.4884, + "step": 48860 + }, + { + "epoch": 2.2097312310532287, + "grad_norm": 0.2585408091545105, + "learning_rate": 9.806788575647204e-05, + "loss": 1.5005, + "step": 48870 + }, + { + "epoch": 2.210273172218741, + "grad_norm": 0.23720701038837433, + "learning_rate": 9.806329197398085e-05, + "loss": 1.4892, + "step": 48880 + }, + { + "epoch": 2.2108151133842533, + "grad_norm": 0.3471413552761078, + "learning_rate": 9.805869285699602e-05, + "loss": 1.4984, + "step": 48890 + }, + { + "epoch": 2.211031889850458, + "eval_loss": 2.4734652042388916, + "eval_runtime": 22.0944, + "eval_samples_per_second": 226.302, + "eval_steps_per_second": 1.222, + "step": 48894 + }, + { + "epoch": 2.2113570545497656, + "grad_norm": 0.4286750555038452, + "learning_rate": 9.805408840608725e-05, + "loss": 1.4975, + "step": 48900 + }, + { + "epoch": 2.2118989957152775, + "grad_norm": 0.4188242256641388, + "learning_rate": 9.804947862182494e-05, + "loss": 1.4967, + "step": 48910 + }, + { + "epoch": 2.21244093688079, + "grad_norm": 0.3151668310165405, + "learning_rate": 9.804486350478013e-05, + "loss": 1.4959, + "step": 48920 + }, + { + "epoch": 2.212982878046302, + "grad_norm": 0.24722276628017426, + "learning_rate": 9.804024305552451e-05, + "loss": 1.486, + "step": 48930 + }, + { + "epoch": 2.2135248192118144, + "grad_norm": 0.33431771397590637, + "learning_rate": 9.803561727463042e-05, + "loss": 1.4907, + "step": 48940 + }, + { + "epoch": 2.2140667603773263, + "grad_norm": 0.20495231449604034, + "learning_rate": 9.803098616267093e-05, + "loss": 1.5058, + "step": 48950 + }, + { + "epoch": 2.2146087015428386, + "grad_norm": 0.2133825570344925, + "learning_rate": 9.802634972021967e-05, + "loss": 1.4913, + "step": 48960 + }, + { + "epoch": 2.215150642708351, + "grad_norm": 0.32575109601020813, + "learning_rate": 9.802170794785103e-05, + "loss": 1.4912, + "step": 48970 + }, + { + "epoch": 2.2156925838738633, + "grad_norm": 0.24184833467006683, + "learning_rate": 9.801706084613997e-05, + "loss": 1.4915, + "step": 48980 + }, + { + "epoch": 2.2157467779904145, + "eval_loss": 2.473459243774414, + "eval_runtime": 22.0863, + "eval_samples_per_second": 226.385, + "eval_steps_per_second": 1.222, + "step": 48981 + }, + { + "epoch": 2.2162345250393756, + "grad_norm": 0.281916081905365, + "learning_rate": 9.801240841566221e-05, + "loss": 1.4939, + "step": 48990 + }, + { + "epoch": 2.2167764662048874, + "grad_norm": 0.24919818341732025, + "learning_rate": 9.8007750656994e-05, + "loss": 1.5073, + "step": 49000 + }, + { + "epoch": 2.2173184073703998, + "grad_norm": 0.3815664052963257, + "learning_rate": 9.800308757071238e-05, + "loss": 1.4974, + "step": 49010 + }, + { + "epoch": 2.217860348535912, + "grad_norm": 0.21346397697925568, + "learning_rate": 9.799841915739496e-05, + "loss": 1.49, + "step": 49020 + }, + { + "epoch": 2.2184022897014244, + "grad_norm": 0.2835428714752197, + "learning_rate": 9.799374541762005e-05, + "loss": 1.4981, + "step": 49030 + }, + { + "epoch": 2.2189442308669367, + "grad_norm": 0.20683638751506805, + "learning_rate": 9.798906635196665e-05, + "loss": 1.5014, + "step": 49040 + }, + { + "epoch": 2.2194861720324486, + "grad_norm": 0.28130945563316345, + "learning_rate": 9.798438196101432e-05, + "loss": 1.4794, + "step": 49050 + }, + { + "epoch": 2.220028113197961, + "grad_norm": 0.6789763569831848, + "learning_rate": 9.797969224534338e-05, + "loss": 1.504, + "step": 49060 + }, + { + "epoch": 2.2204616661303707, + "eval_loss": 2.4563019275665283, + "eval_runtime": 22.0507, + "eval_samples_per_second": 226.75, + "eval_steps_per_second": 1.224, + "step": 49068 + }, + { + "epoch": 2.220570054363473, + "grad_norm": 0.5758925676345825, + "learning_rate": 9.797499720553476e-05, + "loss": 1.4929, + "step": 49070 + }, + { + "epoch": 2.2211119955289855, + "grad_norm": 0.26593759655952454, + "learning_rate": 9.797029684217008e-05, + "loss": 1.4907, + "step": 49080 + }, + { + "epoch": 2.2216539366944974, + "grad_norm": 0.2138567566871643, + "learning_rate": 9.796559115583158e-05, + "loss": 1.4922, + "step": 49090 + }, + { + "epoch": 2.2221958778600097, + "grad_norm": 0.31835222244262695, + "learning_rate": 9.796088014710218e-05, + "loss": 1.4927, + "step": 49100 + }, + { + "epoch": 2.222737819025522, + "grad_norm": 0.19409584999084473, + "learning_rate": 9.795616381656546e-05, + "loss": 1.4954, + "step": 49110 + }, + { + "epoch": 2.2232797601910343, + "grad_norm": 0.273907870054245, + "learning_rate": 9.795144216480566e-05, + "loss": 1.4999, + "step": 49120 + }, + { + "epoch": 2.2238217013565467, + "grad_norm": 0.19406692683696747, + "learning_rate": 9.794671519240768e-05, + "loss": 1.4948, + "step": 49130 + }, + { + "epoch": 2.2243636425220585, + "grad_norm": 0.262638121843338, + "learning_rate": 9.79419828999571e-05, + "loss": 1.4956, + "step": 49140 + }, + { + "epoch": 2.224905583687571, + "grad_norm": 0.26331815123558044, + "learning_rate": 9.79372452880401e-05, + "loss": 1.4954, + "step": 49150 + }, + { + "epoch": 2.2251765542703272, + "eval_loss": 2.458817958831787, + "eval_runtime": 22.1352, + "eval_samples_per_second": 225.885, + "eval_steps_per_second": 1.22, + "step": 49155 + }, + { + "epoch": 2.225447524853083, + "grad_norm": 0.35624048113822937, + "learning_rate": 9.793250235724358e-05, + "loss": 1.4944, + "step": 49160 + }, + { + "epoch": 2.2259894660185955, + "grad_norm": 0.24932223558425903, + "learning_rate": 9.792775410815506e-05, + "loss": 1.4867, + "step": 49170 + }, + { + "epoch": 2.2265314071841074, + "grad_norm": 0.2496650665998459, + "learning_rate": 9.792300054136272e-05, + "loss": 1.4798, + "step": 49180 + }, + { + "epoch": 2.2270733483496197, + "grad_norm": 0.29679879546165466, + "learning_rate": 9.791824165745543e-05, + "loss": 1.493, + "step": 49190 + }, + { + "epoch": 2.227615289515132, + "grad_norm": 0.3611978590488434, + "learning_rate": 9.79134774570227e-05, + "loss": 1.4927, + "step": 49200 + }, + { + "epoch": 2.2281572306806443, + "grad_norm": 0.289468377828598, + "learning_rate": 9.790870794065469e-05, + "loss": 1.5052, + "step": 49210 + }, + { + "epoch": 2.2286991718461566, + "grad_norm": 0.31869956851005554, + "learning_rate": 9.790393310894225e-05, + "loss": 1.5031, + "step": 49220 + }, + { + "epoch": 2.2292411130116685, + "grad_norm": 0.1822601705789566, + "learning_rate": 9.789915296247682e-05, + "loss": 1.4961, + "step": 49230 + }, + { + "epoch": 2.229783054177181, + "grad_norm": 0.19582431018352509, + "learning_rate": 9.789436750185059e-05, + "loss": 1.4834, + "step": 49240 + }, + { + "epoch": 2.2298914424102834, + "eval_loss": 2.4611740112304688, + "eval_runtime": 23.4689, + "eval_samples_per_second": 213.048, + "eval_steps_per_second": 1.15, + "step": 49242 + }, + { + "epoch": 2.230324995342693, + "grad_norm": 0.3001343607902527, + "learning_rate": 9.788957672765634e-05, + "loss": 1.4975, + "step": 49250 + }, + { + "epoch": 2.2308669365082054, + "grad_norm": 0.32307741045951843, + "learning_rate": 9.788478064048753e-05, + "loss": 1.4875, + "step": 49260 + }, + { + "epoch": 2.2314088776737178, + "grad_norm": 0.21440020203590393, + "learning_rate": 9.787997924093829e-05, + "loss": 1.5025, + "step": 49270 + }, + { + "epoch": 2.2319508188392296, + "grad_norm": 0.21042481064796448, + "learning_rate": 9.787517252960335e-05, + "loss": 1.5006, + "step": 49280 + }, + { + "epoch": 2.232492760004742, + "grad_norm": 0.22425493597984314, + "learning_rate": 9.787036050707822e-05, + "loss": 1.4948, + "step": 49290 + }, + { + "epoch": 2.2330347011702543, + "grad_norm": 0.24447576701641083, + "learning_rate": 9.786554317395894e-05, + "loss": 1.4939, + "step": 49300 + }, + { + "epoch": 2.2335766423357666, + "grad_norm": 0.3083951771259308, + "learning_rate": 9.786072053084228e-05, + "loss": 1.4949, + "step": 49310 + }, + { + "epoch": 2.2341185835012785, + "grad_norm": 0.24689245223999023, + "learning_rate": 9.785589257832565e-05, + "loss": 1.489, + "step": 49320 + }, + { + "epoch": 2.2346063305502395, + "eval_loss": 2.45898175239563, + "eval_runtime": 22.1974, + "eval_samples_per_second": 225.252, + "eval_steps_per_second": 1.216, + "step": 49329 + }, + { + "epoch": 2.2346605246667908, + "grad_norm": 0.3437364995479584, + "learning_rate": 9.78510593170071e-05, + "loss": 1.4934, + "step": 49330 + }, + { + "epoch": 2.235202465832303, + "grad_norm": 0.34142521023750305, + "learning_rate": 9.784622074748537e-05, + "loss": 1.4972, + "step": 49340 + }, + { + "epoch": 2.2357444069978154, + "grad_norm": 0.3373830020427704, + "learning_rate": 9.784137687035983e-05, + "loss": 1.4975, + "step": 49350 + }, + { + "epoch": 2.2362863481633277, + "grad_norm": 0.2561146318912506, + "learning_rate": 9.78365276862305e-05, + "loss": 1.4934, + "step": 49360 + }, + { + "epoch": 2.2368282893288396, + "grad_norm": 0.19827355444431305, + "learning_rate": 9.783167319569813e-05, + "loss": 1.4969, + "step": 49370 + }, + { + "epoch": 2.237370230494352, + "grad_norm": 0.2824994921684265, + "learning_rate": 9.782681339936404e-05, + "loss": 1.4843, + "step": 49380 + }, + { + "epoch": 2.2379121716598642, + "grad_norm": 0.21020928025245667, + "learning_rate": 9.782194829783022e-05, + "loss": 1.4916, + "step": 49390 + }, + { + "epoch": 2.2384541128253765, + "grad_norm": 0.30868810415267944, + "learning_rate": 9.781707789169937e-05, + "loss": 1.4969, + "step": 49400 + }, + { + "epoch": 2.2389960539908884, + "grad_norm": 0.41937023401260376, + "learning_rate": 9.781220218157479e-05, + "loss": 1.4893, + "step": 49410 + }, + { + "epoch": 2.239321218690196, + "eval_loss": 2.4543631076812744, + "eval_runtime": 22.0555, + "eval_samples_per_second": 226.7, + "eval_steps_per_second": 1.224, + "step": 49416 + }, + { + "epoch": 2.2395379951564007, + "grad_norm": 0.6382551789283752, + "learning_rate": 9.780732116806052e-05, + "loss": 1.5002, + "step": 49420 + }, + { + "epoch": 2.240079936321913, + "grad_norm": 0.2832179069519043, + "learning_rate": 9.780243485176111e-05, + "loss": 1.4967, + "step": 49430 + }, + { + "epoch": 2.2406218774874254, + "grad_norm": 0.19235193729400635, + "learning_rate": 9.779754323328192e-05, + "loss": 1.4979, + "step": 49440 + }, + { + "epoch": 2.2411638186529377, + "grad_norm": 0.26842355728149414, + "learning_rate": 9.779264631322888e-05, + "loss": 1.4917, + "step": 49450 + }, + { + "epoch": 2.2417057598184496, + "grad_norm": 0.28281205892562866, + "learning_rate": 9.778774409220859e-05, + "loss": 1.4907, + "step": 49460 + }, + { + "epoch": 2.242247700983962, + "grad_norm": 0.3516809344291687, + "learning_rate": 9.778283657082832e-05, + "loss": 1.4887, + "step": 49470 + }, + { + "epoch": 2.242789642149474, + "grad_norm": 0.23774586617946625, + "learning_rate": 9.777792374969603e-05, + "loss": 1.4881, + "step": 49480 + }, + { + "epoch": 2.2433315833149865, + "grad_norm": 0.32771846652030945, + "learning_rate": 9.777300562942027e-05, + "loss": 1.4883, + "step": 49490 + }, + { + "epoch": 2.243873524480499, + "grad_norm": 0.25199854373931885, + "learning_rate": 9.776808221061024e-05, + "loss": 1.4944, + "step": 49500 + }, + { + "epoch": 2.244036106830152, + "eval_loss": 2.4585046768188477, + "eval_runtime": 22.068, + "eval_samples_per_second": 226.572, + "eval_steps_per_second": 1.223, + "step": 49503 + }, + { + "epoch": 2.2444154656460107, + "grad_norm": 0.26867803931236267, + "learning_rate": 9.776315349387589e-05, + "loss": 1.4877, + "step": 49510 + }, + { + "epoch": 2.244957406811523, + "grad_norm": 0.2837635576725006, + "learning_rate": 9.775821947982775e-05, + "loss": 1.4914, + "step": 49520 + }, + { + "epoch": 2.2454993479770353, + "grad_norm": 0.24937689304351807, + "learning_rate": 9.775328016907701e-05, + "loss": 1.4903, + "step": 49530 + }, + { + "epoch": 2.2460412891425476, + "grad_norm": 0.20561334490776062, + "learning_rate": 9.774833556223554e-05, + "loss": 1.4971, + "step": 49540 + }, + { + "epoch": 2.2465832303080595, + "grad_norm": 0.2335055023431778, + "learning_rate": 9.774338565991586e-05, + "loss": 1.4979, + "step": 49550 + }, + { + "epoch": 2.247125171473572, + "grad_norm": 0.3294164836406708, + "learning_rate": 9.773843046273111e-05, + "loss": 1.4996, + "step": 49560 + }, + { + "epoch": 2.247667112639084, + "grad_norm": 0.3552034795284271, + "learning_rate": 9.773346997129518e-05, + "loss": 1.5011, + "step": 49570 + }, + { + "epoch": 2.2482090538045965, + "grad_norm": 0.36698946356773376, + "learning_rate": 9.77285041862225e-05, + "loss": 1.5012, + "step": 49580 + }, + { + "epoch": 2.2487509949701083, + "grad_norm": 0.2022208720445633, + "learning_rate": 9.772353310812824e-05, + "loss": 1.5007, + "step": 49590 + }, + { + "epoch": 2.2487509949701083, + "eval_loss": 2.4587907791137695, + "eval_runtime": 22.0232, + "eval_samples_per_second": 227.033, + "eval_steps_per_second": 1.226, + "step": 49590 + }, + { + "epoch": 2.2492929361356206, + "grad_norm": 0.3783475160598755, + "learning_rate": 9.771855673762818e-05, + "loss": 1.4963, + "step": 49600 + }, + { + "epoch": 2.249834877301133, + "grad_norm": 0.4129343628883362, + "learning_rate": 9.771357507533878e-05, + "loss": 1.4945, + "step": 49610 + }, + { + "epoch": 2.2503768184666453, + "grad_norm": 0.2977985739707947, + "learning_rate": 9.770858812187715e-05, + "loss": 1.4925, + "step": 49620 + }, + { + "epoch": 2.2509187596321576, + "grad_norm": 0.22732873260974884, + "learning_rate": 9.770359587786105e-05, + "loss": 1.4973, + "step": 49630 + }, + { + "epoch": 2.2514607007976695, + "grad_norm": 0.27571314573287964, + "learning_rate": 9.769859834390887e-05, + "loss": 1.4836, + "step": 49640 + }, + { + "epoch": 2.252002641963182, + "grad_norm": 0.19114813208580017, + "learning_rate": 9.769359552063972e-05, + "loss": 1.4889, + "step": 49650 + }, + { + "epoch": 2.252544583128694, + "grad_norm": 0.193264439702034, + "learning_rate": 9.76885874086733e-05, + "loss": 1.5037, + "step": 49660 + }, + { + "epoch": 2.2530865242942064, + "grad_norm": 0.2822969853878021, + "learning_rate": 9.768357400863004e-05, + "loss": 1.4948, + "step": 49670 + }, + { + "epoch": 2.253465883110065, + "eval_loss": 2.4479169845581055, + "eval_runtime": 21.9892, + "eval_samples_per_second": 227.384, + "eval_steps_per_second": 1.228, + "step": 49677 + }, + { + "epoch": 2.2536284654597187, + "grad_norm": 0.2823607325553894, + "learning_rate": 9.767855532113091e-05, + "loss": 1.4953, + "step": 49680 + }, + { + "epoch": 2.2541704066252306, + "grad_norm": 0.29018524289131165, + "learning_rate": 9.767353134679765e-05, + "loss": 1.4909, + "step": 49690 + }, + { + "epoch": 2.254712347790743, + "grad_norm": 0.2224309891462326, + "learning_rate": 9.76685020862526e-05, + "loss": 1.4952, + "step": 49700 + }, + { + "epoch": 2.2552542889562552, + "grad_norm": 0.25766855478286743, + "learning_rate": 9.766346754011875e-05, + "loss": 1.4953, + "step": 49710 + }, + { + "epoch": 2.2557962301217676, + "grad_norm": 0.2575187385082245, + "learning_rate": 9.765842770901979e-05, + "loss": 1.4876, + "step": 49720 + }, + { + "epoch": 2.25633817128728, + "grad_norm": 0.3350069522857666, + "learning_rate": 9.765338259358e-05, + "loss": 1.4925, + "step": 49730 + }, + { + "epoch": 2.2568801124527917, + "grad_norm": 0.21751070022583008, + "learning_rate": 9.764833219442435e-05, + "loss": 1.4812, + "step": 49740 + }, + { + "epoch": 2.257422053618304, + "grad_norm": 0.19443929195404053, + "learning_rate": 9.764327651217847e-05, + "loss": 1.4924, + "step": 49750 + }, + { + "epoch": 2.2579639947838164, + "grad_norm": 0.21549266576766968, + "learning_rate": 9.763821554746864e-05, + "loss": 1.484, + "step": 49760 + }, + { + "epoch": 2.258180771250021, + "eval_loss": 2.451836585998535, + "eval_runtime": 21.9854, + "eval_samples_per_second": 227.423, + "eval_steps_per_second": 1.228, + "step": 49764 + }, + { + "epoch": 2.2585059359493287, + "grad_norm": 0.30445384979248047, + "learning_rate": 9.763314930092178e-05, + "loss": 1.4924, + "step": 49770 + }, + { + "epoch": 2.2590478771148406, + "grad_norm": 0.3584153950214386, + "learning_rate": 9.762807777316548e-05, + "loss": 1.4855, + "step": 49780 + }, + { + "epoch": 2.259589818280353, + "grad_norm": 0.2762848138809204, + "learning_rate": 9.762300096482799e-05, + "loss": 1.5014, + "step": 49790 + }, + { + "epoch": 2.260131759445865, + "grad_norm": 0.22628115117549896, + "learning_rate": 9.761791887653817e-05, + "loss": 1.4932, + "step": 49800 + }, + { + "epoch": 2.2606737006113775, + "grad_norm": 0.3497247099876404, + "learning_rate": 9.761283150892561e-05, + "loss": 1.5017, + "step": 49810 + }, + { + "epoch": 2.2612156417768894, + "grad_norm": 0.40231335163116455, + "learning_rate": 9.760773886262046e-05, + "loss": 1.4924, + "step": 49820 + }, + { + "epoch": 2.2617575829424017, + "grad_norm": 0.2883796989917755, + "learning_rate": 9.760264093825361e-05, + "loss": 1.4829, + "step": 49830 + }, + { + "epoch": 2.262299524107914, + "grad_norm": 0.4345490038394928, + "learning_rate": 9.759753773645658e-05, + "loss": 1.4926, + "step": 49840 + }, + { + "epoch": 2.2628414652734263, + "grad_norm": 0.2081182599067688, + "learning_rate": 9.759242925786151e-05, + "loss": 1.5033, + "step": 49850 + }, + { + "epoch": 2.2628956593899776, + "eval_loss": 2.4485983848571777, + "eval_runtime": 22.0488, + "eval_samples_per_second": 226.769, + "eval_steps_per_second": 1.225, + "step": 49851 + }, + { + "epoch": 2.2633834064389386, + "grad_norm": 0.29944297671318054, + "learning_rate": 9.758731550310122e-05, + "loss": 1.4909, + "step": 49860 + }, + { + "epoch": 2.2639253476044505, + "grad_norm": 0.37277737259864807, + "learning_rate": 9.758219647280919e-05, + "loss": 1.4931, + "step": 49870 + }, + { + "epoch": 2.264467288769963, + "grad_norm": 0.2762742340564728, + "learning_rate": 9.757707216761953e-05, + "loss": 1.4982, + "step": 49880 + }, + { + "epoch": 2.265009229935475, + "grad_norm": 0.22631752490997314, + "learning_rate": 9.7571942588167e-05, + "loss": 1.4871, + "step": 49890 + }, + { + "epoch": 2.2655511711009875, + "grad_norm": 0.22785073518753052, + "learning_rate": 9.756680773508708e-05, + "loss": 1.4866, + "step": 49900 + }, + { + "epoch": 2.2660931122665, + "grad_norm": 0.3055456876754761, + "learning_rate": 9.756166760901581e-05, + "loss": 1.4829, + "step": 49910 + }, + { + "epoch": 2.2666350534320117, + "grad_norm": 0.25249767303466797, + "learning_rate": 9.755652221058993e-05, + "loss": 1.501, + "step": 49920 + }, + { + "epoch": 2.267176994597524, + "grad_norm": 0.25835105776786804, + "learning_rate": 9.755137154044686e-05, + "loss": 1.4893, + "step": 49930 + }, + { + "epoch": 2.2676105475299337, + "eval_loss": 2.449587345123291, + "eval_runtime": 22.013, + "eval_samples_per_second": 227.138, + "eval_steps_per_second": 1.227, + "step": 49938 + }, + { + "epoch": 2.2677189357630363, + "grad_norm": 0.3590300679206848, + "learning_rate": 9.754621559922461e-05, + "loss": 1.4947, + "step": 49940 + }, + { + "epoch": 2.2682608769285486, + "grad_norm": 0.24853801727294922, + "learning_rate": 9.75410543875619e-05, + "loss": 1.4931, + "step": 49950 + }, + { + "epoch": 2.2688028180940605, + "grad_norm": 0.19558899104595184, + "learning_rate": 9.753588790609807e-05, + "loss": 1.4906, + "step": 49960 + }, + { + "epoch": 2.269344759259573, + "grad_norm": 0.2478579878807068, + "learning_rate": 9.75307161554731e-05, + "loss": 1.4903, + "step": 49970 + }, + { + "epoch": 2.269886700425085, + "grad_norm": 0.32022833824157715, + "learning_rate": 9.752553913632768e-05, + "loss": 1.4904, + "step": 49980 + }, + { + "epoch": 2.2704286415905974, + "grad_norm": 0.22632542252540588, + "learning_rate": 9.752035684930309e-05, + "loss": 1.4888, + "step": 49990 + }, + { + "epoch": 2.2709705827561093, + "grad_norm": 0.29843834042549133, + "learning_rate": 9.75151692950413e-05, + "loss": 1.4982, + "step": 50000 + }, + { + "epoch": 2.2715125239216216, + "grad_norm": 0.24948932230472565, + "learning_rate": 9.750997647418492e-05, + "loss": 1.4907, + "step": 50010 + }, + { + "epoch": 2.272054465087134, + "grad_norm": 0.26221540570259094, + "learning_rate": 9.750477838737721e-05, + "loss": 1.4931, + "step": 50020 + }, + { + "epoch": 2.27232543566989, + "eval_loss": 2.4510858058929443, + "eval_runtime": 22.1311, + "eval_samples_per_second": 225.927, + "eval_steps_per_second": 1.22, + "step": 50025 + }, + { + "epoch": 2.2725964062526463, + "grad_norm": 0.4738941192626953, + "learning_rate": 9.74995750352621e-05, + "loss": 1.5008, + "step": 50030 + }, + { + "epoch": 2.2731383474181586, + "grad_norm": 0.29958027601242065, + "learning_rate": 9.749436641848415e-05, + "loss": 1.4866, + "step": 50040 + }, + { + "epoch": 2.2736802885836704, + "grad_norm": 0.21620671451091766, + "learning_rate": 9.748915253768856e-05, + "loss": 1.4908, + "step": 50050 + }, + { + "epoch": 2.2742222297491828, + "grad_norm": 0.21653667092323303, + "learning_rate": 9.748393339352125e-05, + "loss": 1.4871, + "step": 50060 + }, + { + "epoch": 2.274764170914695, + "grad_norm": 0.2310364544391632, + "learning_rate": 9.747870898662871e-05, + "loss": 1.4929, + "step": 50070 + }, + { + "epoch": 2.2753061120802074, + "grad_norm": 0.23432140052318573, + "learning_rate": 9.747347931765812e-05, + "loss": 1.4955, + "step": 50080 + }, + { + "epoch": 2.2758480532457197, + "grad_norm": 0.22089755535125732, + "learning_rate": 9.746824438725731e-05, + "loss": 1.489, + "step": 50090 + }, + { + "epoch": 2.2763899944112316, + "grad_norm": 0.3632870018482208, + "learning_rate": 9.746300419607479e-05, + "loss": 1.4839, + "step": 50100 + }, + { + "epoch": 2.276931935576744, + "grad_norm": 0.18700088560581207, + "learning_rate": 9.745775874475963e-05, + "loss": 1.4952, + "step": 50110 + }, + { + "epoch": 2.2770403238098464, + "eval_loss": 2.4573004245758057, + "eval_runtime": 22.0595, + "eval_samples_per_second": 226.66, + "eval_steps_per_second": 1.224, + "step": 50112 + }, + { + "epoch": 2.277473876742256, + "grad_norm": 0.23492306470870972, + "learning_rate": 9.745250803396166e-05, + "loss": 1.4832, + "step": 50120 + }, + { + "epoch": 2.2780158179077685, + "grad_norm": 0.35017403960227966, + "learning_rate": 9.744725206433131e-05, + "loss": 1.4903, + "step": 50130 + }, + { + "epoch": 2.278557759073281, + "grad_norm": 0.2380804866552353, + "learning_rate": 9.744199083651968e-05, + "loss": 1.492, + "step": 50140 + }, + { + "epoch": 2.2790997002387927, + "grad_norm": 0.2782959043979645, + "learning_rate": 9.743672435117846e-05, + "loss": 1.4848, + "step": 50150 + }, + { + "epoch": 2.279641641404305, + "grad_norm": 0.19856207072734833, + "learning_rate": 9.743145260896009e-05, + "loss": 1.4947, + "step": 50160 + }, + { + "epoch": 2.2801835825698173, + "grad_norm": 0.3704603612422943, + "learning_rate": 9.742617561051758e-05, + "loss": 1.4797, + "step": 50170 + }, + { + "epoch": 2.2807255237353297, + "grad_norm": 0.38659003376960754, + "learning_rate": 9.742089335650462e-05, + "loss": 1.4944, + "step": 50180 + }, + { + "epoch": 2.2812674649008415, + "grad_norm": 0.2731989324092865, + "learning_rate": 9.741560584757559e-05, + "loss": 1.4955, + "step": 50190 + }, + { + "epoch": 2.2817552119498026, + "eval_loss": 2.4447662830352783, + "eval_runtime": 22.0833, + "eval_samples_per_second": 226.415, + "eval_steps_per_second": 1.223, + "step": 50199 + }, + { + "epoch": 2.281809406066354, + "grad_norm": 0.312407523393631, + "learning_rate": 9.741031308438543e-05, + "loss": 1.4969, + "step": 50200 + }, + { + "epoch": 2.282351347231866, + "grad_norm": 0.2855091691017151, + "learning_rate": 9.740501506758983e-05, + "loss": 1.4872, + "step": 50210 + }, + { + "epoch": 2.2828932883973785, + "grad_norm": 0.33906736969947815, + "learning_rate": 9.739971179784508e-05, + "loss": 1.4888, + "step": 50220 + }, + { + "epoch": 2.2834352295628904, + "grad_norm": 0.21002908051013947, + "learning_rate": 9.739440327580809e-05, + "loss": 1.49, + "step": 50230 + }, + { + "epoch": 2.2839771707284027, + "grad_norm": 0.24895620346069336, + "learning_rate": 9.738908950213648e-05, + "loss": 1.4906, + "step": 50240 + }, + { + "epoch": 2.284519111893915, + "grad_norm": 0.2253609001636505, + "learning_rate": 9.738377047748852e-05, + "loss": 1.5001, + "step": 50250 + }, + { + "epoch": 2.2850610530594273, + "grad_norm": 0.30024558305740356, + "learning_rate": 9.737844620252307e-05, + "loss": 1.4887, + "step": 50260 + }, + { + "epoch": 2.2856029942249396, + "grad_norm": 0.3111526370048523, + "learning_rate": 9.737311667789967e-05, + "loss": 1.4861, + "step": 50270 + }, + { + "epoch": 2.2861449353904515, + "grad_norm": 0.22137415409088135, + "learning_rate": 9.736778190427859e-05, + "loss": 1.4972, + "step": 50280 + }, + { + "epoch": 2.286470100089759, + "eval_loss": 2.4479713439941406, + "eval_runtime": 21.9852, + "eval_samples_per_second": 227.425, + "eval_steps_per_second": 1.228, + "step": 50286 + }, + { + "epoch": 2.286686876555964, + "grad_norm": 0.18520957231521606, + "learning_rate": 9.73624418823206e-05, + "loss": 1.4849, + "step": 50290 + }, + { + "epoch": 2.287228817721476, + "grad_norm": 0.30916085839271545, + "learning_rate": 9.735709661268723e-05, + "loss": 1.4852, + "step": 50300 + }, + { + "epoch": 2.2877707588869884, + "grad_norm": 0.5676615238189697, + "learning_rate": 9.735174609604063e-05, + "loss": 1.4893, + "step": 50310 + }, + { + "epoch": 2.2883127000525008, + "grad_norm": 0.40237462520599365, + "learning_rate": 9.734639033304361e-05, + "loss": 1.4901, + "step": 50320 + }, + { + "epoch": 2.2888546412180126, + "grad_norm": 0.24099163711071014, + "learning_rate": 9.734102932435959e-05, + "loss": 1.4842, + "step": 50330 + }, + { + "epoch": 2.289396582383525, + "grad_norm": 0.2044031023979187, + "learning_rate": 9.73356630706527e-05, + "loss": 1.4834, + "step": 50340 + }, + { + "epoch": 2.2899385235490373, + "grad_norm": 0.22731153666973114, + "learning_rate": 9.733029157258765e-05, + "loss": 1.471, + "step": 50350 + }, + { + "epoch": 2.2904804647145496, + "grad_norm": 0.19987347722053528, + "learning_rate": 9.732491483082987e-05, + "loss": 1.4786, + "step": 50360 + }, + { + "epoch": 2.2910224058800615, + "grad_norm": 0.23623400926589966, + "learning_rate": 9.73195328460454e-05, + "loss": 1.4864, + "step": 50370 + }, + { + "epoch": 2.2911849882297153, + "eval_loss": 2.4462499618530273, + "eval_runtime": 22.0809, + "eval_samples_per_second": 226.44, + "eval_steps_per_second": 1.223, + "step": 50373 + }, + { + "epoch": 2.2915643470455738, + "grad_norm": 0.2596568167209625, + "learning_rate": 9.731414561890093e-05, + "loss": 1.4828, + "step": 50380 + }, + { + "epoch": 2.292106288211086, + "grad_norm": 0.22793002426624298, + "learning_rate": 9.730875315006381e-05, + "loss": 1.4874, + "step": 50390 + }, + { + "epoch": 2.2926482293765984, + "grad_norm": 0.34478330612182617, + "learning_rate": 9.730335544020204e-05, + "loss": 1.494, + "step": 50400 + }, + { + "epoch": 2.2931901705421103, + "grad_norm": 0.3564873933792114, + "learning_rate": 9.729795248998425e-05, + "loss": 1.4937, + "step": 50410 + }, + { + "epoch": 2.2937321117076226, + "grad_norm": 0.21651984751224518, + "learning_rate": 9.729254430007976e-05, + "loss": 1.4791, + "step": 50420 + }, + { + "epoch": 2.294274052873135, + "grad_norm": 0.25641965866088867, + "learning_rate": 9.728713087115848e-05, + "loss": 1.4991, + "step": 50430 + }, + { + "epoch": 2.2948159940386472, + "grad_norm": 0.3314838707447052, + "learning_rate": 9.728171220389104e-05, + "loss": 1.5014, + "step": 50440 + }, + { + "epoch": 2.2953579352041595, + "grad_norm": 0.18577730655670166, + "learning_rate": 9.727628829894866e-05, + "loss": 1.4913, + "step": 50450 + }, + { + "epoch": 2.2958998763696714, + "grad_norm": 0.3247391879558563, + "learning_rate": 9.727085915700321e-05, + "loss": 1.4837, + "step": 50460 + }, + { + "epoch": 2.2958998763696714, + "eval_loss": 2.445127010345459, + "eval_runtime": 22.0909, + "eval_samples_per_second": 226.337, + "eval_steps_per_second": 1.222, + "step": 50460 + }, + { + "epoch": 2.2964418175351837, + "grad_norm": 0.2847256660461426, + "learning_rate": 9.726542477872726e-05, + "loss": 1.4776, + "step": 50470 + }, + { + "epoch": 2.296983758700696, + "grad_norm": 0.23234505951404572, + "learning_rate": 9.725998516479399e-05, + "loss": 1.4889, + "step": 50480 + }, + { + "epoch": 2.2975256998662084, + "grad_norm": 0.20872457325458527, + "learning_rate": 9.725454031587725e-05, + "loss": 1.4975, + "step": 50490 + }, + { + "epoch": 2.2980676410317207, + "grad_norm": 0.24152375757694244, + "learning_rate": 9.724909023265147e-05, + "loss": 1.4884, + "step": 50500 + }, + { + "epoch": 2.2986095821972325, + "grad_norm": 0.45855534076690674, + "learning_rate": 9.724363491579185e-05, + "loss": 1.4975, + "step": 50510 + }, + { + "epoch": 2.299151523362745, + "grad_norm": 0.37530308961868286, + "learning_rate": 9.723817436597413e-05, + "loss": 1.4934, + "step": 50520 + }, + { + "epoch": 2.299693464528257, + "grad_norm": 0.3856462240219116, + "learning_rate": 9.723270858387474e-05, + "loss": 1.4977, + "step": 50530 + }, + { + "epoch": 2.3002354056937695, + "grad_norm": 0.21550019085407257, + "learning_rate": 9.722723757017078e-05, + "loss": 1.5, + "step": 50540 + }, + { + "epoch": 2.300614764509628, + "eval_loss": 2.45145320892334, + "eval_runtime": 22.1166, + "eval_samples_per_second": 226.074, + "eval_steps_per_second": 1.221, + "step": 50547 + }, + { + "epoch": 2.300777346859282, + "grad_norm": 0.2878018617630005, + "learning_rate": 9.722176132553995e-05, + "loss": 1.498, + "step": 50550 + }, + { + "epoch": 2.3013192880247937, + "grad_norm": 0.3644629418849945, + "learning_rate": 9.721627985066064e-05, + "loss": 1.4972, + "step": 50560 + }, + { + "epoch": 2.301861229190306, + "grad_norm": 0.22482427954673767, + "learning_rate": 9.721079314621186e-05, + "loss": 1.4867, + "step": 50570 + }, + { + "epoch": 2.3024031703558183, + "grad_norm": 0.3207145631313324, + "learning_rate": 9.72053012128733e-05, + "loss": 1.481, + "step": 50580 + }, + { + "epoch": 2.3029451115213306, + "grad_norm": 0.26092955470085144, + "learning_rate": 9.719980405132527e-05, + "loss": 1.4841, + "step": 50590 + }, + { + "epoch": 2.3034870526868425, + "grad_norm": 0.30404138565063477, + "learning_rate": 9.71943016622487e-05, + "loss": 1.4924, + "step": 50600 + }, + { + "epoch": 2.304028993852355, + "grad_norm": 0.23371928930282593, + "learning_rate": 9.718879404632525e-05, + "loss": 1.4786, + "step": 50610 + }, + { + "epoch": 2.304570935017867, + "grad_norm": 0.2730085551738739, + "learning_rate": 9.718328120423715e-05, + "loss": 1.4862, + "step": 50620 + }, + { + "epoch": 2.3051128761833795, + "grad_norm": 0.19008232653141022, + "learning_rate": 9.717776313666731e-05, + "loss": 1.4935, + "step": 50630 + }, + { + "epoch": 2.305329652649584, + "eval_loss": 2.4463045597076416, + "eval_runtime": 22.0762, + "eval_samples_per_second": 226.488, + "eval_steps_per_second": 1.223, + "step": 50634 + }, + { + "epoch": 2.3056548173488913, + "grad_norm": 0.30709657073020935, + "learning_rate": 9.717223984429931e-05, + "loss": 1.4934, + "step": 50640 + }, + { + "epoch": 2.3061967585144036, + "grad_norm": 0.3597027361392975, + "learning_rate": 9.716671132781731e-05, + "loss": 1.488, + "step": 50650 + }, + { + "epoch": 2.306738699679916, + "grad_norm": 0.2394603192806244, + "learning_rate": 9.71611775879062e-05, + "loss": 1.4865, + "step": 50660 + }, + { + "epoch": 2.3072806408454283, + "grad_norm": 0.2578514814376831, + "learning_rate": 9.715563862525145e-05, + "loss": 1.4828, + "step": 50670 + }, + { + "epoch": 2.3078225820109406, + "grad_norm": 0.2434462159872055, + "learning_rate": 9.715009444053921e-05, + "loss": 1.4906, + "step": 50680 + }, + { + "epoch": 2.3083645231764525, + "grad_norm": 0.3057672381401062, + "learning_rate": 9.714454503445626e-05, + "loss": 1.4889, + "step": 50690 + }, + { + "epoch": 2.308906464341965, + "grad_norm": 0.2835559546947479, + "learning_rate": 9.713899040769004e-05, + "loss": 1.4879, + "step": 50700 + }, + { + "epoch": 2.309448405507477, + "grad_norm": 0.23302173614501953, + "learning_rate": 9.713343056092866e-05, + "loss": 1.4891, + "step": 50710 + }, + { + "epoch": 2.3099903466729894, + "grad_norm": 0.2765030562877655, + "learning_rate": 9.71278654948608e-05, + "loss": 1.4879, + "step": 50720 + }, + { + "epoch": 2.3100445407895407, + "eval_loss": 2.4514873027801514, + "eval_runtime": 22.1411, + "eval_samples_per_second": 225.825, + "eval_steps_per_second": 1.219, + "step": 50721 + }, + { + "epoch": 2.3105322878385017, + "grad_norm": 0.3144146502017975, + "learning_rate": 9.712229521017588e-05, + "loss": 1.4854, + "step": 50730 + }, + { + "epoch": 2.3110742290040136, + "grad_norm": 0.2929839491844177, + "learning_rate": 9.71167197075639e-05, + "loss": 1.4764, + "step": 50740 + }, + { + "epoch": 2.311616170169526, + "grad_norm": 0.38944029808044434, + "learning_rate": 9.711113898771554e-05, + "loss": 1.4945, + "step": 50750 + }, + { + "epoch": 2.3121581113350382, + "grad_norm": 0.28745847940444946, + "learning_rate": 9.71055530513221e-05, + "loss": 1.4949, + "step": 50760 + }, + { + "epoch": 2.3127000525005506, + "grad_norm": 0.26115939021110535, + "learning_rate": 9.709996189907557e-05, + "loss": 1.4907, + "step": 50770 + }, + { + "epoch": 2.313241993666063, + "grad_norm": 0.24977637827396393, + "learning_rate": 9.709436553166853e-05, + "loss": 1.4829, + "step": 50780 + }, + { + "epoch": 2.3137839348315747, + "grad_norm": 0.35427191853523254, + "learning_rate": 9.708876394979424e-05, + "loss": 1.4819, + "step": 50790 + }, + { + "epoch": 2.314325875997087, + "grad_norm": 0.24472922086715698, + "learning_rate": 9.708315715414661e-05, + "loss": 1.4968, + "step": 50800 + }, + { + "epoch": 2.314759428929497, + "eval_loss": 2.440610647201538, + "eval_runtime": 21.9959, + "eval_samples_per_second": 227.315, + "eval_steps_per_second": 1.227, + "step": 50808 + }, + { + "epoch": 2.3148678171625994, + "grad_norm": 0.338789165019989, + "learning_rate": 9.707754514542017e-05, + "loss": 1.4834, + "step": 50810 + }, + { + "epoch": 2.3154097583281117, + "grad_norm": 0.30206480622291565, + "learning_rate": 9.707192792431014e-05, + "loss": 1.4907, + "step": 50820 + }, + { + "epoch": 2.3159516994936236, + "grad_norm": 0.23182201385498047, + "learning_rate": 9.70663054915123e-05, + "loss": 1.4917, + "step": 50830 + }, + { + "epoch": 2.316493640659136, + "grad_norm": 0.3595997989177704, + "learning_rate": 9.70606778477232e-05, + "loss": 1.4962, + "step": 50840 + }, + { + "epoch": 2.317035581824648, + "grad_norm": 0.21878936886787415, + "learning_rate": 9.705504499363993e-05, + "loss": 1.4847, + "step": 50850 + }, + { + "epoch": 2.3175775229901605, + "grad_norm": 0.19247514009475708, + "learning_rate": 9.704940692996027e-05, + "loss": 1.4883, + "step": 50860 + }, + { + "epoch": 2.3181194641556724, + "grad_norm": 0.20777463912963867, + "learning_rate": 9.704376365738262e-05, + "loss": 1.5016, + "step": 50870 + }, + { + "epoch": 2.3186614053211847, + "grad_norm": 0.2756713628768921, + "learning_rate": 9.703811517660609e-05, + "loss": 1.494, + "step": 50880 + }, + { + "epoch": 2.319203346486697, + "grad_norm": 0.3496546745300293, + "learning_rate": 9.703246148833037e-05, + "loss": 1.495, + "step": 50890 + }, + { + "epoch": 2.319474317069453, + "eval_loss": 2.451153039932251, + "eval_runtime": 22.0437, + "eval_samples_per_second": 226.823, + "eval_steps_per_second": 1.225, + "step": 50895 + }, + { + "epoch": 2.3197452876522093, + "grad_norm": 0.28930479288101196, + "learning_rate": 9.702680259325579e-05, + "loss": 1.4995, + "step": 50900 + }, + { + "epoch": 2.3202872288177216, + "grad_norm": 0.1946384757757187, + "learning_rate": 9.702113849208337e-05, + "loss": 1.4856, + "step": 50910 + }, + { + "epoch": 2.3208291699832335, + "grad_norm": 0.23787322640419006, + "learning_rate": 9.701546918551475e-05, + "loss": 1.4841, + "step": 50920 + }, + { + "epoch": 2.321371111148746, + "grad_norm": 0.2119310200214386, + "learning_rate": 9.700979467425222e-05, + "loss": 1.4882, + "step": 50930 + }, + { + "epoch": 2.321913052314258, + "grad_norm": 0.23019666969776154, + "learning_rate": 9.700411495899872e-05, + "loss": 1.4834, + "step": 50940 + }, + { + "epoch": 2.3224549934797705, + "grad_norm": 0.22270162403583527, + "learning_rate": 9.699843004045782e-05, + "loss": 1.4819, + "step": 50950 + }, + { + "epoch": 2.322996934645283, + "grad_norm": 0.3237341642379761, + "learning_rate": 9.699273991933373e-05, + "loss": 1.4903, + "step": 50960 + }, + { + "epoch": 2.3235388758107947, + "grad_norm": 0.26717808842658997, + "learning_rate": 9.698704459633136e-05, + "loss": 1.4825, + "step": 50970 + }, + { + "epoch": 2.324080816976307, + "grad_norm": 0.27529168128967285, + "learning_rate": 9.698134407215618e-05, + "loss": 1.4881, + "step": 50980 + }, + { + "epoch": 2.3241892052094095, + "eval_loss": 2.4506008625030518, + "eval_runtime": 22.0206, + "eval_samples_per_second": 227.06, + "eval_steps_per_second": 1.226, + "step": 50982 + }, + { + "epoch": 2.3246227581418193, + "grad_norm": 0.17906807363033295, + "learning_rate": 9.697563834751436e-05, + "loss": 1.4853, + "step": 50990 + }, + { + "epoch": 2.3251646993073316, + "grad_norm": 0.2530919313430786, + "learning_rate": 9.69699274231127e-05, + "loss": 1.4873, + "step": 51000 + }, + { + "epoch": 2.3257066404728435, + "grad_norm": 0.20605182647705078, + "learning_rate": 9.696421129965865e-05, + "loss": 1.4754, + "step": 51010 + }, + { + "epoch": 2.326248581638356, + "grad_norm": 0.3275115489959717, + "learning_rate": 9.69584899778603e-05, + "loss": 1.4704, + "step": 51020 + }, + { + "epoch": 2.326790522803868, + "grad_norm": 0.4294751286506653, + "learning_rate": 9.695276345842638e-05, + "loss": 1.4987, + "step": 51030 + }, + { + "epoch": 2.3273324639693804, + "grad_norm": 0.22030548751354218, + "learning_rate": 9.694703174206624e-05, + "loss": 1.4899, + "step": 51040 + }, + { + "epoch": 2.3278744051348923, + "grad_norm": 0.263075053691864, + "learning_rate": 9.694129482948994e-05, + "loss": 1.4819, + "step": 51050 + }, + { + "epoch": 2.3284163463004046, + "grad_norm": 0.26459792256355286, + "learning_rate": 9.693555272140813e-05, + "loss": 1.499, + "step": 51060 + }, + { + "epoch": 2.3289040933493657, + "eval_loss": 2.4480111598968506, + "eval_runtime": 22.0391, + "eval_samples_per_second": 226.869, + "eval_steps_per_second": 1.225, + "step": 51069 + }, + { + "epoch": 2.328958287465917, + "grad_norm": 0.19399993121623993, + "learning_rate": 9.692980541853211e-05, + "loss": 1.4926, + "step": 51070 + }, + { + "epoch": 2.3295002286314292, + "grad_norm": 0.2647286057472229, + "learning_rate": 9.692405292157384e-05, + "loss": 1.4846, + "step": 51080 + }, + { + "epoch": 2.3300421697969416, + "grad_norm": 0.2626708149909973, + "learning_rate": 9.69182952312459e-05, + "loss": 1.4875, + "step": 51090 + }, + { + "epoch": 2.3305841109624534, + "grad_norm": 0.3158663213253021, + "learning_rate": 9.691253234826155e-05, + "loss": 1.4902, + "step": 51100 + }, + { + "epoch": 2.3311260521279658, + "grad_norm": 0.18909795582294464, + "learning_rate": 9.690676427333467e-05, + "loss": 1.4898, + "step": 51110 + }, + { + "epoch": 2.331667993293478, + "grad_norm": 0.3357420265674591, + "learning_rate": 9.690099100717974e-05, + "loss": 1.4999, + "step": 51120 + }, + { + "epoch": 2.3322099344589904, + "grad_norm": 0.36199477314949036, + "learning_rate": 9.689521255051198e-05, + "loss": 1.4957, + "step": 51130 + }, + { + "epoch": 2.3327518756245027, + "grad_norm": 0.36792802810668945, + "learning_rate": 9.688942890404718e-05, + "loss": 1.4897, + "step": 51140 + }, + { + "epoch": 2.3332938167900146, + "grad_norm": 0.22224745154380798, + "learning_rate": 9.68836400685018e-05, + "loss": 1.4866, + "step": 51150 + }, + { + "epoch": 2.3336189814893222, + "eval_loss": 2.449270248413086, + "eval_runtime": 22.033, + "eval_samples_per_second": 226.933, + "eval_steps_per_second": 1.225, + "step": 51156 + }, + { + "epoch": 2.333835757955527, + "grad_norm": 0.20259080827236176, + "learning_rate": 9.687784604459292e-05, + "loss": 1.4879, + "step": 51160 + }, + { + "epoch": 2.334377699121039, + "grad_norm": 0.41556867957115173, + "learning_rate": 9.687204683303829e-05, + "loss": 1.4895, + "step": 51170 + }, + { + "epoch": 2.3349196402865515, + "grad_norm": 0.24334754049777985, + "learning_rate": 9.686624243455627e-05, + "loss": 1.4751, + "step": 51180 + }, + { + "epoch": 2.335461581452064, + "grad_norm": 0.20898622274398804, + "learning_rate": 9.686043284986593e-05, + "loss": 1.4863, + "step": 51190 + }, + { + "epoch": 2.3360035226175757, + "grad_norm": 0.22658079862594604, + "learning_rate": 9.685461807968688e-05, + "loss": 1.4837, + "step": 51200 + }, + { + "epoch": 2.336545463783088, + "grad_norm": 0.29090017080307007, + "learning_rate": 9.684879812473948e-05, + "loss": 1.4827, + "step": 51210 + }, + { + "epoch": 2.3370874049486003, + "grad_norm": 0.27824729681015015, + "learning_rate": 9.684297298574465e-05, + "loss": 1.4882, + "step": 51220 + }, + { + "epoch": 2.3376293461141127, + "grad_norm": 0.2605385482311249, + "learning_rate": 9.683714266342398e-05, + "loss": 1.4894, + "step": 51230 + }, + { + "epoch": 2.3381712872796245, + "grad_norm": 0.34890076518058777, + "learning_rate": 9.683130715849973e-05, + "loss": 1.4949, + "step": 51240 + }, + { + "epoch": 2.3383338696292784, + "eval_loss": 2.450643539428711, + "eval_runtime": 22.0782, + "eval_samples_per_second": 226.468, + "eval_steps_per_second": 1.223, + "step": 51243 + }, + { + "epoch": 2.338713228445137, + "grad_norm": 0.2740577459335327, + "learning_rate": 9.682546647169478e-05, + "loss": 1.4953, + "step": 51250 + }, + { + "epoch": 2.339255169610649, + "grad_norm": 0.29385995864868164, + "learning_rate": 9.68196206037326e-05, + "loss": 1.4833, + "step": 51260 + }, + { + "epoch": 2.3397971107761615, + "grad_norm": 0.2904885709285736, + "learning_rate": 9.681376955533739e-05, + "loss": 1.4861, + "step": 51270 + }, + { + "epoch": 2.3403390519416734, + "grad_norm": 0.19600605964660645, + "learning_rate": 9.680791332723396e-05, + "loss": 1.4891, + "step": 51280 + }, + { + "epoch": 2.3408809931071857, + "grad_norm": 0.35659798979759216, + "learning_rate": 9.680205192014772e-05, + "loss": 1.481, + "step": 51290 + }, + { + "epoch": 2.341422934272698, + "grad_norm": 0.20791636407375336, + "learning_rate": 9.679618533480478e-05, + "loss": 1.4846, + "step": 51300 + }, + { + "epoch": 2.3419648754382103, + "grad_norm": 0.36341404914855957, + "learning_rate": 9.679031357193186e-05, + "loss": 1.487, + "step": 51310 + }, + { + "epoch": 2.3425068166037226, + "grad_norm": 0.2511788308620453, + "learning_rate": 9.678443663225633e-05, + "loss": 1.4911, + "step": 51320 + }, + { + "epoch": 2.3430487577692345, + "grad_norm": 0.22106388211250305, + "learning_rate": 9.67785545165062e-05, + "loss": 1.4855, + "step": 51330 + }, + { + "epoch": 2.3430487577692345, + "eval_loss": 2.449833631515503, + "eval_runtime": 22.0541, + "eval_samples_per_second": 226.716, + "eval_steps_per_second": 1.224, + "step": 51330 + }, + { + "epoch": 2.343590698934747, + "grad_norm": 0.2117205709218979, + "learning_rate": 9.677266722541012e-05, + "loss": 1.4836, + "step": 51340 + }, + { + "epoch": 2.344132640100259, + "grad_norm": 0.261532187461853, + "learning_rate": 9.67667747596974e-05, + "loss": 1.4843, + "step": 51350 + }, + { + "epoch": 2.3446745812657714, + "grad_norm": 0.25580722093582153, + "learning_rate": 9.676087712009794e-05, + "loss": 1.4842, + "step": 51360 + }, + { + "epoch": 2.3452165224312838, + "grad_norm": 0.21774953603744507, + "learning_rate": 9.675497430734233e-05, + "loss": 1.4816, + "step": 51370 + }, + { + "epoch": 2.3457584635967956, + "grad_norm": 0.4116745591163635, + "learning_rate": 9.67490663221618e-05, + "loss": 1.4927, + "step": 51380 + }, + { + "epoch": 2.346300404762308, + "grad_norm": 0.35664743185043335, + "learning_rate": 9.674315316528816e-05, + "loss": 1.4871, + "step": 51390 + }, + { + "epoch": 2.3468423459278203, + "grad_norm": 0.25654247403144836, + "learning_rate": 9.673723483745395e-05, + "loss": 1.4728, + "step": 51400 + }, + { + "epoch": 2.3473842870933326, + "grad_norm": 0.32242023944854736, + "learning_rate": 9.673131133939229e-05, + "loss": 1.4841, + "step": 51410 + }, + { + "epoch": 2.347763645909191, + "eval_loss": 2.4569506645202637, + "eval_runtime": 21.6699, + "eval_samples_per_second": 230.735, + "eval_steps_per_second": 1.246, + "step": 51417 + }, + { + "epoch": 2.347926228258845, + "grad_norm": 0.19315354526042938, + "learning_rate": 9.672538267183699e-05, + "loss": 1.4803, + "step": 51420 + }, + { + "epoch": 2.3484681694243568, + "grad_norm": 0.2572130262851715, + "learning_rate": 9.67194488355224e-05, + "loss": 1.4969, + "step": 51430 + }, + { + "epoch": 2.349010110589869, + "grad_norm": 0.5099160671234131, + "learning_rate": 9.671350983118365e-05, + "loss": 1.4876, + "step": 51440 + }, + { + "epoch": 2.3495520517553814, + "grad_norm": 0.2558972239494324, + "learning_rate": 9.670756565955637e-05, + "loss": 1.4961, + "step": 51450 + }, + { + "epoch": 2.3500939929208937, + "grad_norm": 0.5650994181632996, + "learning_rate": 9.670161632137696e-05, + "loss": 1.4858, + "step": 51460 + }, + { + "epoch": 2.3506359340864056, + "grad_norm": 0.45410850644111633, + "learning_rate": 9.669566181738236e-05, + "loss": 1.4921, + "step": 51470 + }, + { + "epoch": 2.351177875251918, + "grad_norm": 0.37703588604927063, + "learning_rate": 9.668970214831019e-05, + "loss": 1.4899, + "step": 51480 + }, + { + "epoch": 2.35171981641743, + "grad_norm": 0.20002073049545288, + "learning_rate": 9.668373731489872e-05, + "loss": 1.4873, + "step": 51490 + }, + { + "epoch": 2.3522617575829425, + "grad_norm": 0.2333013415336609, + "learning_rate": 9.667776731788685e-05, + "loss": 1.4952, + "step": 51500 + }, + { + "epoch": 2.352478534049147, + "eval_loss": 2.4526426792144775, + "eval_runtime": 22.019, + "eval_samples_per_second": 227.077, + "eval_steps_per_second": 1.226, + "step": 51504 + }, + { + "epoch": 2.3528036987484544, + "grad_norm": 0.28557243943214417, + "learning_rate": 9.667179215801411e-05, + "loss": 1.4851, + "step": 51510 + }, + { + "epoch": 2.3533456399139667, + "grad_norm": 0.2601202130317688, + "learning_rate": 9.666581183602069e-05, + "loss": 1.4857, + "step": 51520 + }, + { + "epoch": 2.353887581079479, + "grad_norm": 0.37464073300361633, + "learning_rate": 9.665982635264736e-05, + "loss": 1.491, + "step": 51530 + }, + { + "epoch": 2.3544295222449914, + "grad_norm": 0.3240549862384796, + "learning_rate": 9.665383570863565e-05, + "loss": 1.478, + "step": 51540 + }, + { + "epoch": 2.3549714634105037, + "grad_norm": 0.18953123688697815, + "learning_rate": 9.66478399047276e-05, + "loss": 1.4791, + "step": 51550 + }, + { + "epoch": 2.3555134045760155, + "grad_norm": 0.24826766550540924, + "learning_rate": 9.664183894166595e-05, + "loss": 1.4875, + "step": 51560 + }, + { + "epoch": 2.356055345741528, + "grad_norm": 0.2706245481967926, + "learning_rate": 9.66358328201941e-05, + "loss": 1.4802, + "step": 51570 + }, + { + "epoch": 2.35659728690704, + "grad_norm": 0.3040546178817749, + "learning_rate": 9.662982154105604e-05, + "loss": 1.4826, + "step": 51580 + }, + { + "epoch": 2.3571392280725525, + "grad_norm": 0.28579655289649963, + "learning_rate": 9.66238051049964e-05, + "loss": 1.4826, + "step": 51590 + }, + { + "epoch": 2.3571934221891038, + "eval_loss": 2.4456629753112793, + "eval_runtime": 22.0034, + "eval_samples_per_second": 227.238, + "eval_steps_per_second": 1.227, + "step": 51591 + }, + { + "epoch": 2.357681169238065, + "grad_norm": 0.2639320194721222, + "learning_rate": 9.661778351276052e-05, + "loss": 1.4805, + "step": 51600 + }, + { + "epoch": 2.3582231104035767, + "grad_norm": 0.3113354742527008, + "learning_rate": 9.66117567650943e-05, + "loss": 1.4745, + "step": 51610 + }, + { + "epoch": 2.358765051569089, + "grad_norm": 0.2081727683544159, + "learning_rate": 9.660572486274432e-05, + "loss": 1.4801, + "step": 51620 + }, + { + "epoch": 2.3593069927346013, + "grad_norm": 0.2256453037261963, + "learning_rate": 9.659968780645777e-05, + "loss": 1.4801, + "step": 51630 + }, + { + "epoch": 2.3598489339001136, + "grad_norm": 0.23762857913970947, + "learning_rate": 9.659364559698249e-05, + "loss": 1.4833, + "step": 51640 + }, + { + "epoch": 2.3603908750656255, + "grad_norm": 0.28543922305107117, + "learning_rate": 9.6587598235067e-05, + "loss": 1.4877, + "step": 51650 + }, + { + "epoch": 2.360932816231138, + "grad_norm": 0.32550597190856934, + "learning_rate": 9.658154572146039e-05, + "loss": 1.4831, + "step": 51660 + }, + { + "epoch": 2.36147475739665, + "grad_norm": 0.3355502188205719, + "learning_rate": 9.65754880569124e-05, + "loss": 1.4936, + "step": 51670 + }, + { + "epoch": 2.36190831032906, + "eval_loss": 2.46115779876709, + "eval_runtime": 22.0455, + "eval_samples_per_second": 226.804, + "eval_steps_per_second": 1.225, + "step": 51678 + }, + { + "epoch": 2.3620166985621625, + "grad_norm": 0.3635583519935608, + "learning_rate": 9.65694252421735e-05, + "loss": 1.4941, + "step": 51680 + }, + { + "epoch": 2.3625586397276743, + "grad_norm": 0.2427382469177246, + "learning_rate": 9.656335727799464e-05, + "loss": 1.4888, + "step": 51690 + }, + { + "epoch": 2.3631005808931866, + "grad_norm": 0.178348109126091, + "learning_rate": 9.655728416512754e-05, + "loss": 1.4835, + "step": 51700 + }, + { + "epoch": 2.363642522058699, + "grad_norm": 0.22014687955379486, + "learning_rate": 9.65512059043245e-05, + "loss": 1.485, + "step": 51710 + }, + { + "epoch": 2.3641844632242113, + "grad_norm": 0.21226602792739868, + "learning_rate": 9.654512249633848e-05, + "loss": 1.4826, + "step": 51720 + }, + { + "epoch": 2.3647264043897236, + "grad_norm": 0.28715527057647705, + "learning_rate": 9.653903394192304e-05, + "loss": 1.4772, + "step": 51730 + }, + { + "epoch": 2.3652683455552355, + "grad_norm": 0.24136418104171753, + "learning_rate": 9.653294024183243e-05, + "loss": 1.4856, + "step": 51740 + }, + { + "epoch": 2.365810286720748, + "grad_norm": 0.20212812721729279, + "learning_rate": 9.65268413968215e-05, + "loss": 1.4721, + "step": 51750 + }, + { + "epoch": 2.36635222788626, + "grad_norm": 0.2756584882736206, + "learning_rate": 9.652073740764576e-05, + "loss": 1.4901, + "step": 51760 + }, + { + "epoch": 2.366623198469016, + "eval_loss": 2.4501538276672363, + "eval_runtime": 22.0902, + "eval_samples_per_second": 226.345, + "eval_steps_per_second": 1.222, + "step": 51765 + }, + { + "epoch": 2.3668941690517724, + "grad_norm": 0.27957209944725037, + "learning_rate": 9.651462827506131e-05, + "loss": 1.484, + "step": 51770 + }, + { + "epoch": 2.3674361102172847, + "grad_norm": 0.3119787871837616, + "learning_rate": 9.650851399982495e-05, + "loss": 1.4874, + "step": 51780 + }, + { + "epoch": 2.3679780513827966, + "grad_norm": 0.37608903646469116, + "learning_rate": 9.65023945826941e-05, + "loss": 1.4897, + "step": 51790 + }, + { + "epoch": 2.368519992548309, + "grad_norm": 0.1750231236219406, + "learning_rate": 9.64962700244268e-05, + "loss": 1.4893, + "step": 51800 + }, + { + "epoch": 2.3690619337138212, + "grad_norm": 0.20804855227470398, + "learning_rate": 9.649014032578174e-05, + "loss": 1.4854, + "step": 51810 + }, + { + "epoch": 2.3696038748793335, + "grad_norm": 0.19923941791057587, + "learning_rate": 9.648400548751819e-05, + "loss": 1.4891, + "step": 51820 + }, + { + "epoch": 2.370145816044846, + "grad_norm": 0.3422909379005432, + "learning_rate": 9.647786551039617e-05, + "loss": 1.4859, + "step": 51830 + }, + { + "epoch": 2.3706877572103577, + "grad_norm": 0.2580278813838959, + "learning_rate": 9.647172039517623e-05, + "loss": 1.4886, + "step": 51840 + }, + { + "epoch": 2.37122969837587, + "grad_norm": 0.21753491461277008, + "learning_rate": 9.646557014261966e-05, + "loss": 1.4819, + "step": 51850 + }, + { + "epoch": 2.3713380866089726, + "eval_loss": 2.452852964401245, + "eval_runtime": 22.005, + "eval_samples_per_second": 227.221, + "eval_steps_per_second": 1.227, + "step": 51852 + }, + { + "epoch": 2.3717716395413824, + "grad_norm": 0.3423318862915039, + "learning_rate": 9.645941475348825e-05, + "loss": 1.4947, + "step": 51860 + }, + { + "epoch": 2.3723135807068947, + "grad_norm": 0.28139030933380127, + "learning_rate": 9.645325422854454e-05, + "loss": 1.4803, + "step": 51870 + }, + { + "epoch": 2.3728555218724066, + "grad_norm": 0.21756911277770996, + "learning_rate": 9.644708856855168e-05, + "loss": 1.4883, + "step": 51880 + }, + { + "epoch": 2.373397463037919, + "grad_norm": 0.22105887532234192, + "learning_rate": 9.644091777427344e-05, + "loss": 1.4905, + "step": 51890 + }, + { + "epoch": 2.373939404203431, + "grad_norm": 0.2563340365886688, + "learning_rate": 9.643474184647422e-05, + "loss": 1.4769, + "step": 51900 + }, + { + "epoch": 2.3744813453689435, + "grad_norm": 0.19484004378318787, + "learning_rate": 9.642856078591906e-05, + "loss": 1.4858, + "step": 51910 + }, + { + "epoch": 2.3750232865344554, + "grad_norm": 0.4082275629043579, + "learning_rate": 9.642237459337366e-05, + "loss": 1.4934, + "step": 51920 + }, + { + "epoch": 2.3755652276999677, + "grad_norm": 0.26656705141067505, + "learning_rate": 9.641618326960435e-05, + "loss": 1.4833, + "step": 51930 + }, + { + "epoch": 2.3760529747489287, + "eval_loss": 2.4559152126312256, + "eval_runtime": 22.0417, + "eval_samples_per_second": 226.843, + "eval_steps_per_second": 1.225, + "step": 51939 + }, + { + "epoch": 2.37610716886548, + "grad_norm": 0.38334181904792786, + "learning_rate": 9.640998681537805e-05, + "loss": 1.4832, + "step": 51940 + }, + { + "epoch": 2.3766491100309923, + "grad_norm": 0.34898388385772705, + "learning_rate": 9.640378523146238e-05, + "loss": 1.489, + "step": 51950 + }, + { + "epoch": 2.3771910511965046, + "grad_norm": 0.23095417022705078, + "learning_rate": 9.639757851862553e-05, + "loss": 1.4816, + "step": 51960 + }, + { + "epoch": 2.3777329923620165, + "grad_norm": 0.19512107968330383, + "learning_rate": 9.63913666776364e-05, + "loss": 1.4864, + "step": 51970 + }, + { + "epoch": 2.378274933527529, + "grad_norm": 0.4121658205986023, + "learning_rate": 9.638514970926447e-05, + "loss": 1.4825, + "step": 51980 + }, + { + "epoch": 2.378816874693041, + "grad_norm": 0.3080250024795532, + "learning_rate": 9.637892761427987e-05, + "loss": 1.4921, + "step": 51990 + }, + { + "epoch": 2.3793588158585535, + "grad_norm": 0.25781115889549255, + "learning_rate": 9.637270039345335e-05, + "loss": 1.4752, + "step": 52000 + }, + { + "epoch": 2.379900757024066, + "grad_norm": 0.2879182994365692, + "learning_rate": 9.636646804755635e-05, + "loss": 1.4941, + "step": 52010 + }, + { + "epoch": 2.3804426981895777, + "grad_norm": 0.2381771355867386, + "learning_rate": 9.636023057736088e-05, + "loss": 1.4927, + "step": 52020 + }, + { + "epoch": 2.3807678628888853, + "eval_loss": 2.451416492462158, + "eval_runtime": 22.0461, + "eval_samples_per_second": 226.797, + "eval_steps_per_second": 1.225, + "step": 52026 + }, + { + "epoch": 2.38098463935509, + "grad_norm": 0.33801183104515076, + "learning_rate": 9.63539879836396e-05, + "loss": 1.4843, + "step": 52030 + }, + { + "epoch": 2.3815265805206023, + "grad_norm": 0.19671766459941864, + "learning_rate": 9.634774026716585e-05, + "loss": 1.4758, + "step": 52040 + }, + { + "epoch": 2.3820685216861146, + "grad_norm": 0.17954659461975098, + "learning_rate": 9.634148742871353e-05, + "loss": 1.4805, + "step": 52050 + }, + { + "epoch": 2.3826104628516265, + "grad_norm": 0.31026336550712585, + "learning_rate": 9.633522946905725e-05, + "loss": 1.4882, + "step": 52060 + }, + { + "epoch": 2.383152404017139, + "grad_norm": 0.19763007760047913, + "learning_rate": 9.632896638897219e-05, + "loss": 1.4804, + "step": 52070 + }, + { + "epoch": 2.383694345182651, + "grad_norm": 0.37613585591316223, + "learning_rate": 9.63226981892342e-05, + "loss": 1.4732, + "step": 52080 + }, + { + "epoch": 2.3842362863481634, + "grad_norm": 0.3250291347503662, + "learning_rate": 9.631642487061978e-05, + "loss": 1.4788, + "step": 52090 + }, + { + "epoch": 2.3847782275136753, + "grad_norm": 0.4253854751586914, + "learning_rate": 9.631014643390602e-05, + "loss": 1.4871, + "step": 52100 + }, + { + "epoch": 2.3853201686791876, + "grad_norm": 0.1921018362045288, + "learning_rate": 9.630386287987067e-05, + "loss": 1.4825, + "step": 52110 + }, + { + "epoch": 2.3854827510288414, + "eval_loss": 2.4489946365356445, + "eval_runtime": 21.9884, + "eval_samples_per_second": 227.393, + "eval_steps_per_second": 1.228, + "step": 52113 + }, + { + "epoch": 2.3858621098447, + "grad_norm": 0.2642991840839386, + "learning_rate": 9.629757420929212e-05, + "loss": 1.4773, + "step": 52120 + }, + { + "epoch": 2.3864040510102122, + "grad_norm": 0.20693929493427277, + "learning_rate": 9.629128042294936e-05, + "loss": 1.4714, + "step": 52130 + }, + { + "epoch": 2.3869459921757246, + "grad_norm": 0.2861025631427765, + "learning_rate": 9.628498152162205e-05, + "loss": 1.4816, + "step": 52140 + }, + { + "epoch": 2.3874879333412364, + "grad_norm": 0.1987767517566681, + "learning_rate": 9.627867750609047e-05, + "loss": 1.4794, + "step": 52150 + }, + { + "epoch": 2.3880298745067488, + "grad_norm": 0.19934241473674774, + "learning_rate": 9.627236837713553e-05, + "loss": 1.4802, + "step": 52160 + }, + { + "epoch": 2.388571815672261, + "grad_norm": 0.2977670133113861, + "learning_rate": 9.626605413553881e-05, + "loss": 1.4917, + "step": 52170 + }, + { + "epoch": 2.3891137568377734, + "grad_norm": 0.22710345685482025, + "learning_rate": 9.625973478208243e-05, + "loss": 1.4835, + "step": 52180 + }, + { + "epoch": 2.3896556980032857, + "grad_norm": 0.2273036539554596, + "learning_rate": 9.625341031754925e-05, + "loss": 1.4893, + "step": 52190 + }, + { + "epoch": 2.3901976391687976, + "grad_norm": 0.2654477655887604, + "learning_rate": 9.624708074272272e-05, + "loss": 1.4888, + "step": 52200 + }, + { + "epoch": 2.3901976391687976, + "eval_loss": 2.4439258575439453, + "eval_runtime": 21.9105, + "eval_samples_per_second": 228.201, + "eval_steps_per_second": 1.232, + "step": 52200 + }, + { + "epoch": 2.39073958033431, + "grad_norm": 0.2779911160469055, + "learning_rate": 9.624074605838688e-05, + "loss": 1.4723, + "step": 52210 + }, + { + "epoch": 2.391281521499822, + "grad_norm": 0.20526061952114105, + "learning_rate": 9.623440626532649e-05, + "loss": 1.4886, + "step": 52220 + }, + { + "epoch": 2.3918234626653345, + "grad_norm": 0.23855385184288025, + "learning_rate": 9.622806136432684e-05, + "loss": 1.4846, + "step": 52230 + }, + { + "epoch": 2.392365403830847, + "grad_norm": 0.1878088265657425, + "learning_rate": 9.622171135617397e-05, + "loss": 1.4781, + "step": 52240 + }, + { + "epoch": 2.3929073449963587, + "grad_norm": 0.28239238262176514, + "learning_rate": 9.621535624165446e-05, + "loss": 1.4894, + "step": 52250 + }, + { + "epoch": 2.393449286161871, + "grad_norm": 0.24423588812351227, + "learning_rate": 9.620899602155557e-05, + "loss": 1.4842, + "step": 52260 + }, + { + "epoch": 2.3939912273273833, + "grad_norm": 0.20243407785892487, + "learning_rate": 9.620263069666514e-05, + "loss": 1.4767, + "step": 52270 + }, + { + "epoch": 2.3945331684928957, + "grad_norm": 0.3433877229690552, + "learning_rate": 9.619626026777172e-05, + "loss": 1.4847, + "step": 52280 + }, + { + "epoch": 2.394912527308754, + "eval_loss": 2.4462528228759766, + "eval_runtime": 22.0531, + "eval_samples_per_second": 226.726, + "eval_steps_per_second": 1.224, + "step": 52287 + }, + { + "epoch": 2.3950751096584075, + "grad_norm": 0.2804941236972809, + "learning_rate": 9.618988473566442e-05, + "loss": 1.4887, + "step": 52290 + }, + { + "epoch": 2.39561705082392, + "grad_norm": 0.21407003700733185, + "learning_rate": 9.618350410113304e-05, + "loss": 1.4823, + "step": 52300 + }, + { + "epoch": 2.396158991989432, + "grad_norm": 0.2938377261161804, + "learning_rate": 9.617711836496797e-05, + "loss": 1.489, + "step": 52310 + }, + { + "epoch": 2.3967009331549445, + "grad_norm": 0.19561982154846191, + "learning_rate": 9.617072752796025e-05, + "loss": 1.4788, + "step": 52320 + }, + { + "epoch": 2.3972428743204564, + "grad_norm": 0.2788771688938141, + "learning_rate": 9.616433159090154e-05, + "loss": 1.4788, + "step": 52330 + }, + { + "epoch": 2.3977848154859687, + "grad_norm": 0.24922244250774384, + "learning_rate": 9.615793055458415e-05, + "loss": 1.4827, + "step": 52340 + }, + { + "epoch": 2.398326756651481, + "grad_norm": 0.18937751650810242, + "learning_rate": 9.615152441980104e-05, + "loss": 1.4771, + "step": 52350 + }, + { + "epoch": 2.3988686978169933, + "grad_norm": 0.19057206809520721, + "learning_rate": 9.614511318734572e-05, + "loss": 1.4782, + "step": 52360 + }, + { + "epoch": 2.3994106389825056, + "grad_norm": 0.40637481212615967, + "learning_rate": 9.613869685801242e-05, + "loss": 1.4894, + "step": 52370 + }, + { + "epoch": 2.3996274154487103, + "eval_loss": 2.4485349655151367, + "eval_runtime": 25.128, + "eval_samples_per_second": 198.981, + "eval_steps_per_second": 1.074, + "step": 52374 + }, + { + "epoch": 2.3999525801480175, + "grad_norm": 0.18058116734027863, + "learning_rate": 9.613227543259595e-05, + "loss": 1.4847, + "step": 52380 + }, + { + "epoch": 2.40049452131353, + "grad_norm": 0.2915930151939392, + "learning_rate": 9.61258489118918e-05, + "loss": 1.4771, + "step": 52390 + }, + { + "epoch": 2.401036462479042, + "grad_norm": 0.20396968722343445, + "learning_rate": 9.611941729669602e-05, + "loss": 1.4799, + "step": 52400 + }, + { + "epoch": 2.4015784036445544, + "grad_norm": 0.18857981264591217, + "learning_rate": 9.611298058780536e-05, + "loss": 1.4857, + "step": 52410 + }, + { + "epoch": 2.4021203448100668, + "grad_norm": 0.3981879651546478, + "learning_rate": 9.610653878601715e-05, + "loss": 1.4748, + "step": 52420 + }, + { + "epoch": 2.4026622859755786, + "grad_norm": 0.17841531336307526, + "learning_rate": 9.610009189212938e-05, + "loss": 1.4865, + "step": 52430 + }, + { + "epoch": 2.403204227141091, + "grad_norm": 0.37024664878845215, + "learning_rate": 9.609363990694067e-05, + "loss": 1.4806, + "step": 52440 + }, + { + "epoch": 2.4037461683066033, + "grad_norm": 0.19459642469882965, + "learning_rate": 9.608718283125026e-05, + "loss": 1.4754, + "step": 52450 + }, + { + "epoch": 2.4042881094721156, + "grad_norm": 0.2543799579143524, + "learning_rate": 9.608072066585803e-05, + "loss": 1.4872, + "step": 52460 + }, + { + "epoch": 2.404342303588667, + "eval_loss": 2.44846773147583, + "eval_runtime": 21.9857, + "eval_samples_per_second": 227.42, + "eval_steps_per_second": 1.228, + "step": 52461 + }, + { + "epoch": 2.404830050637628, + "grad_norm": 0.31468695402145386, + "learning_rate": 9.607425341156447e-05, + "loss": 1.4677, + "step": 52470 + }, + { + "epoch": 2.4053719918031398, + "grad_norm": 0.31996336579322815, + "learning_rate": 9.606778106917071e-05, + "loss": 1.4961, + "step": 52480 + }, + { + "epoch": 2.405913932968652, + "grad_norm": 0.22491812705993652, + "learning_rate": 9.606130363947856e-05, + "loss": 1.4923, + "step": 52490 + }, + { + "epoch": 2.4064558741341644, + "grad_norm": 0.18299773335456848, + "learning_rate": 9.605482112329037e-05, + "loss": 1.4828, + "step": 52500 + }, + { + "epoch": 2.4069978152996767, + "grad_norm": 0.17865203320980072, + "learning_rate": 9.604833352140918e-05, + "loss": 1.4884, + "step": 52510 + }, + { + "epoch": 2.4075397564651886, + "grad_norm": 0.31600865721702576, + "learning_rate": 9.604184083463863e-05, + "loss": 1.4814, + "step": 52520 + }, + { + "epoch": 2.408081697630701, + "grad_norm": 0.22451327741146088, + "learning_rate": 9.603534306378305e-05, + "loss": 1.4824, + "step": 52530 + }, + { + "epoch": 2.408623638796213, + "grad_norm": 0.1985584944486618, + "learning_rate": 9.602884020964734e-05, + "loss": 1.4852, + "step": 52540 + }, + { + "epoch": 2.409057191728623, + "eval_loss": 2.45249605178833, + "eval_runtime": 22.1056, + "eval_samples_per_second": 226.187, + "eval_steps_per_second": 1.221, + "step": 52548 + }, + { + "epoch": 2.4091655799617255, + "grad_norm": 0.2913689911365509, + "learning_rate": 9.6022332273037e-05, + "loss": 1.4858, + "step": 52550 + }, + { + "epoch": 2.4097075211272374, + "grad_norm": 0.28544992208480835, + "learning_rate": 9.601581925475825e-05, + "loss": 1.4845, + "step": 52560 + }, + { + "epoch": 2.4102494622927497, + "grad_norm": 0.34636637568473816, + "learning_rate": 9.600930115561791e-05, + "loss": 1.4848, + "step": 52570 + }, + { + "epoch": 2.410791403458262, + "grad_norm": 0.22824904322624207, + "learning_rate": 9.600277797642335e-05, + "loss": 1.4804, + "step": 52580 + }, + { + "epoch": 2.4113333446237744, + "grad_norm": 0.2189302295446396, + "learning_rate": 9.59962497179827e-05, + "loss": 1.4794, + "step": 52590 + }, + { + "epoch": 2.4118752857892867, + "grad_norm": 0.1900554746389389, + "learning_rate": 9.59897163811046e-05, + "loss": 1.4769, + "step": 52600 + }, + { + "epoch": 2.4124172269547985, + "grad_norm": 0.2690756618976593, + "learning_rate": 9.59831779665984e-05, + "loss": 1.4868, + "step": 52610 + }, + { + "epoch": 2.412959168120311, + "grad_norm": 0.22319385409355164, + "learning_rate": 9.597663447527407e-05, + "loss": 1.4726, + "step": 52620 + }, + { + "epoch": 2.413501109285823, + "grad_norm": 0.22841401398181915, + "learning_rate": 9.597008590794211e-05, + "loss": 1.4806, + "step": 52630 + }, + { + "epoch": 2.413772079868579, + "eval_loss": 2.4524683952331543, + "eval_runtime": 22.0707, + "eval_samples_per_second": 226.544, + "eval_steps_per_second": 1.223, + "step": 52635 + }, + { + "epoch": 2.4140430504513355, + "grad_norm": 0.20807218551635742, + "learning_rate": 9.596353226541382e-05, + "loss": 1.4779, + "step": 52640 + }, + { + "epoch": 2.414584991616848, + "grad_norm": 0.4142747223377228, + "learning_rate": 9.595697354850101e-05, + "loss": 1.4986, + "step": 52650 + }, + { + "epoch": 2.4151269327823597, + "grad_norm": 0.21264301240444183, + "learning_rate": 9.59504097580161e-05, + "loss": 1.4877, + "step": 52660 + }, + { + "epoch": 2.415668873947872, + "grad_norm": 0.5480301976203918, + "learning_rate": 9.594384089477224e-05, + "loss": 1.4897, + "step": 52670 + }, + { + "epoch": 2.4162108151133843, + "grad_norm": 0.42489802837371826, + "learning_rate": 9.593726695958313e-05, + "loss": 1.4774, + "step": 52680 + }, + { + "epoch": 2.4167527562788966, + "grad_norm": 0.17096111178398132, + "learning_rate": 9.593068795326312e-05, + "loss": 1.4863, + "step": 52690 + }, + { + "epoch": 2.4172946974444085, + "grad_norm": 0.19619131088256836, + "learning_rate": 9.592410387662717e-05, + "loss": 1.4682, + "step": 52700 + }, + { + "epoch": 2.417836638609921, + "grad_norm": 0.18955209851264954, + "learning_rate": 9.591751473049095e-05, + "loss": 1.4765, + "step": 52710 + }, + { + "epoch": 2.418378579775433, + "grad_norm": 0.1831616461277008, + "learning_rate": 9.591092051567063e-05, + "loss": 1.4783, + "step": 52720 + }, + { + "epoch": 2.4184869680085357, + "eval_loss": 2.446958303451538, + "eval_runtime": 22.1086, + "eval_samples_per_second": 226.156, + "eval_steps_per_second": 1.221, + "step": 52722 + }, + { + "epoch": 2.4189205209409455, + "grad_norm": 0.2192278653383255, + "learning_rate": 9.590432123298307e-05, + "loss": 1.4874, + "step": 52730 + }, + { + "epoch": 2.4194624621064573, + "grad_norm": 0.19387726485729218, + "learning_rate": 9.589771688324582e-05, + "loss": 1.4803, + "step": 52740 + }, + { + "epoch": 2.4200044032719696, + "grad_norm": 0.280454158782959, + "learning_rate": 9.589110746727692e-05, + "loss": 1.4762, + "step": 52750 + }, + { + "epoch": 2.420546344437482, + "grad_norm": 0.18469981849193573, + "learning_rate": 9.588449298589518e-05, + "loss": 1.478, + "step": 52760 + }, + { + "epoch": 2.4210882856029943, + "grad_norm": 0.21699224412441254, + "learning_rate": 9.587787343991996e-05, + "loss": 1.473, + "step": 52770 + }, + { + "epoch": 2.4216302267685066, + "grad_norm": 0.373770147562027, + "learning_rate": 9.587124883017126e-05, + "loss": 1.4781, + "step": 52780 + }, + { + "epoch": 2.4221721679340185, + "grad_norm": 0.305627703666687, + "learning_rate": 9.586461915746968e-05, + "loss": 1.4829, + "step": 52790 + }, + { + "epoch": 2.4227141090995308, + "grad_norm": 0.23528113961219788, + "learning_rate": 9.585798442263651e-05, + "loss": 1.4835, + "step": 52800 + }, + { + "epoch": 2.423201856148492, + "eval_loss": 2.444547653198242, + "eval_runtime": 22.0829, + "eval_samples_per_second": 226.42, + "eval_steps_per_second": 1.223, + "step": 52809 + }, + { + "epoch": 2.423256050265043, + "grad_norm": 0.33448222279548645, + "learning_rate": 9.58513446264936e-05, + "loss": 1.4654, + "step": 52810 + }, + { + "epoch": 2.4237979914305554, + "grad_norm": 0.2976095974445343, + "learning_rate": 9.584469976986349e-05, + "loss": 1.4784, + "step": 52820 + }, + { + "epoch": 2.4243399325960677, + "grad_norm": 0.19375227391719818, + "learning_rate": 9.58380498535693e-05, + "loss": 1.4728, + "step": 52830 + }, + { + "epoch": 2.4248818737615796, + "grad_norm": 0.30406102538108826, + "learning_rate": 9.583139487843479e-05, + "loss": 1.4812, + "step": 52840 + }, + { + "epoch": 2.425423814927092, + "grad_norm": 0.3576555550098419, + "learning_rate": 9.582473484528436e-05, + "loss": 1.4949, + "step": 52850 + }, + { + "epoch": 2.4259657560926042, + "grad_norm": 0.21541939675807953, + "learning_rate": 9.581806975494303e-05, + "loss": 1.4831, + "step": 52860 + }, + { + "epoch": 2.4265076972581165, + "grad_norm": 0.20319156348705292, + "learning_rate": 9.581139960823642e-05, + "loss": 1.471, + "step": 52870 + }, + { + "epoch": 2.427049638423629, + "grad_norm": 0.3167076110839844, + "learning_rate": 9.58047244059908e-05, + "loss": 1.4771, + "step": 52880 + }, + { + "epoch": 2.4275915795891407, + "grad_norm": 0.41747766733169556, + "learning_rate": 9.579804414903311e-05, + "loss": 1.4705, + "step": 52890 + }, + { + "epoch": 2.4279167442884484, + "eval_loss": 2.454636573791504, + "eval_runtime": 21.9836, + "eval_samples_per_second": 227.442, + "eval_steps_per_second": 1.228, + "step": 52896 + }, + { + "epoch": 2.428133520754653, + "grad_norm": 0.37934955954551697, + "learning_rate": 9.579135883819082e-05, + "loss": 1.4835, + "step": 52900 + }, + { + "epoch": 2.4286754619201654, + "grad_norm": 0.1990528702735901, + "learning_rate": 9.578466847429208e-05, + "loss": 1.4811, + "step": 52910 + }, + { + "epoch": 2.4292174030856777, + "grad_norm": 0.1967354118824005, + "learning_rate": 9.57779730581657e-05, + "loss": 1.4862, + "step": 52920 + }, + { + "epoch": 2.4297593442511896, + "grad_norm": 0.22842636704444885, + "learning_rate": 9.577127259064106e-05, + "loss": 1.4771, + "step": 52930 + }, + { + "epoch": 2.430301285416702, + "grad_norm": 0.34596872329711914, + "learning_rate": 9.576456707254817e-05, + "loss": 1.4861, + "step": 52940 + }, + { + "epoch": 2.430843226582214, + "grad_norm": 0.2179035097360611, + "learning_rate": 9.57578565047177e-05, + "loss": 1.4817, + "step": 52950 + }, + { + "epoch": 2.4313851677477265, + "grad_norm": 0.3425596356391907, + "learning_rate": 9.57511408879809e-05, + "loss": 1.4933, + "step": 52960 + }, + { + "epoch": 2.4319271089132384, + "grad_norm": 0.33133429288864136, + "learning_rate": 9.574442022316972e-05, + "loss": 1.4704, + "step": 52970 + }, + { + "epoch": 2.4324690500787507, + "grad_norm": 0.2703513205051422, + "learning_rate": 9.573769451111665e-05, + "loss": 1.4756, + "step": 52980 + }, + { + "epoch": 2.4326316324284045, + "eval_loss": 2.4538090229034424, + "eval_runtime": 22.1163, + "eval_samples_per_second": 226.078, + "eval_steps_per_second": 1.221, + "step": 52983 + }, + { + "epoch": 2.433010991244263, + "grad_norm": 0.2822078764438629, + "learning_rate": 9.573096375265484e-05, + "loss": 1.4883, + "step": 52990 + }, + { + "epoch": 2.4335529324097753, + "grad_norm": 0.2804018259048462, + "learning_rate": 9.572422794861808e-05, + "loss": 1.4807, + "step": 53000 + }, + { + "epoch": 2.4340948735752876, + "grad_norm": 0.41560617089271545, + "learning_rate": 9.571748709984076e-05, + "loss": 1.4775, + "step": 53010 + }, + { + "epoch": 2.4346368147407995, + "grad_norm": 0.43294498324394226, + "learning_rate": 9.571074120715794e-05, + "loss": 1.4794, + "step": 53020 + }, + { + "epoch": 2.435178755906312, + "grad_norm": 0.22881726920604706, + "learning_rate": 9.570399027140523e-05, + "loss": 1.4722, + "step": 53030 + }, + { + "epoch": 2.435720697071824, + "grad_norm": 0.25796639919281006, + "learning_rate": 9.569723429341892e-05, + "loss": 1.4782, + "step": 53040 + }, + { + "epoch": 2.4362626382373365, + "grad_norm": 0.2922258973121643, + "learning_rate": 9.569047327403593e-05, + "loss": 1.4862, + "step": 53050 + }, + { + "epoch": 2.436804579402849, + "grad_norm": 0.2601679265499115, + "learning_rate": 9.568370721409376e-05, + "loss": 1.4967, + "step": 53060 + }, + { + "epoch": 2.4373465205683607, + "grad_norm": 0.2096012830734253, + "learning_rate": 9.567693611443057e-05, + "loss": 1.4771, + "step": 53070 + }, + { + "epoch": 2.4373465205683607, + "eval_loss": 2.448545217514038, + "eval_runtime": 22.001, + "eval_samples_per_second": 227.262, + "eval_steps_per_second": 1.227, + "step": 53070 + }, + { + "epoch": 2.437888461733873, + "grad_norm": 0.18229645490646362, + "learning_rate": 9.567015997588516e-05, + "loss": 1.4764, + "step": 53080 + }, + { + "epoch": 2.4384304028993853, + "grad_norm": 0.20466655492782593, + "learning_rate": 9.566337879929687e-05, + "loss": 1.478, + "step": 53090 + }, + { + "epoch": 2.4389723440648976, + "grad_norm": 0.27803587913513184, + "learning_rate": 9.565659258550576e-05, + "loss": 1.4613, + "step": 53100 + }, + { + "epoch": 2.43951428523041, + "grad_norm": 0.2388664335012436, + "learning_rate": 9.564980133535249e-05, + "loss": 1.4798, + "step": 53110 + }, + { + "epoch": 2.440056226395922, + "grad_norm": 0.20684485137462616, + "learning_rate": 9.564300504967832e-05, + "loss": 1.4779, + "step": 53120 + }, + { + "epoch": 2.440598167561434, + "grad_norm": 0.22506524622440338, + "learning_rate": 9.563620372932513e-05, + "loss": 1.4851, + "step": 53130 + }, + { + "epoch": 2.4411401087269464, + "grad_norm": 0.2067488580942154, + "learning_rate": 9.562939737513544e-05, + "loss": 1.476, + "step": 53140 + }, + { + "epoch": 2.4416820498924587, + "grad_norm": 0.27859237790107727, + "learning_rate": 9.56225859879524e-05, + "loss": 1.4776, + "step": 53150 + }, + { + "epoch": 2.4420614087083172, + "eval_loss": 2.4477126598358154, + "eval_runtime": 22.0385, + "eval_samples_per_second": 226.876, + "eval_steps_per_second": 1.225, + "step": 53157 + }, + { + "epoch": 2.4422239910579706, + "grad_norm": 0.31960025429725647, + "learning_rate": 9.561576956861978e-05, + "loss": 1.4773, + "step": 53160 + }, + { + "epoch": 2.442765932223483, + "grad_norm": 0.23332948982715607, + "learning_rate": 9.560894811798198e-05, + "loss": 1.4791, + "step": 53170 + }, + { + "epoch": 2.4433078733889952, + "grad_norm": 0.20001184940338135, + "learning_rate": 9.560212163688395e-05, + "loss": 1.4825, + "step": 53180 + }, + { + "epoch": 2.4438498145545076, + "grad_norm": 0.2940216064453125, + "learning_rate": 9.559529012617141e-05, + "loss": 1.4872, + "step": 53190 + }, + { + "epoch": 2.4443917557200194, + "grad_norm": 0.25465646386146545, + "learning_rate": 9.558845358669055e-05, + "loss": 1.4896, + "step": 53200 + }, + { + "epoch": 2.4449336968855317, + "grad_norm": 0.2544267177581787, + "learning_rate": 9.55816120192883e-05, + "loss": 1.4844, + "step": 53210 + }, + { + "epoch": 2.445475638051044, + "grad_norm": 0.30805036425590515, + "learning_rate": 9.557476542481212e-05, + "loss": 1.4777, + "step": 53220 + }, + { + "epoch": 2.4460175792165564, + "grad_norm": 0.23450466990470886, + "learning_rate": 9.556791380411017e-05, + "loss": 1.486, + "step": 53230 + }, + { + "epoch": 2.4465595203820687, + "grad_norm": 0.2862785756587982, + "learning_rate": 9.556105715803116e-05, + "loss": 1.4807, + "step": 53240 + }, + { + "epoch": 2.4467762968482734, + "eval_loss": 2.448235273361206, + "eval_runtime": 22.0612, + "eval_samples_per_second": 226.642, + "eval_steps_per_second": 1.224, + "step": 53244 + }, + { + "epoch": 2.4471014615475806, + "grad_norm": 0.23192840814590454, + "learning_rate": 9.555419548742452e-05, + "loss": 1.4753, + "step": 53250 + }, + { + "epoch": 2.447643402713093, + "grad_norm": 0.3470098078250885, + "learning_rate": 9.554732879314019e-05, + "loss": 1.4847, + "step": 53260 + }, + { + "epoch": 2.448185343878605, + "grad_norm": 0.3917534351348877, + "learning_rate": 9.554045707602882e-05, + "loss": 1.481, + "step": 53270 + }, + { + "epoch": 2.4487272850441175, + "grad_norm": 0.36082231998443604, + "learning_rate": 9.553358033694164e-05, + "loss": 1.4753, + "step": 53280 + }, + { + "epoch": 2.44926922620963, + "grad_norm": 0.24314887821674347, + "learning_rate": 9.552669857673049e-05, + "loss": 1.488, + "step": 53290 + }, + { + "epoch": 2.4498111673751417, + "grad_norm": 0.27214857935905457, + "learning_rate": 9.551981179624789e-05, + "loss": 1.4761, + "step": 53300 + }, + { + "epoch": 2.450353108540654, + "grad_norm": 0.22485631704330444, + "learning_rate": 9.55129199963469e-05, + "loss": 1.4849, + "step": 53310 + }, + { + "epoch": 2.4508950497061663, + "grad_norm": 0.2175171673297882, + "learning_rate": 9.55060231778813e-05, + "loss": 1.4772, + "step": 53320 + }, + { + "epoch": 2.4514369908716787, + "grad_norm": 0.5146223306655884, + "learning_rate": 9.549912134170539e-05, + "loss": 1.4912, + "step": 53330 + }, + { + "epoch": 2.45149118498823, + "eval_loss": 2.4793365001678467, + "eval_runtime": 22.2104, + "eval_samples_per_second": 225.12, + "eval_steps_per_second": 1.216, + "step": 53331 + }, + { + "epoch": 2.4519789320371905, + "grad_norm": 0.2823374271392822, + "learning_rate": 9.549221448867415e-05, + "loss": 1.4967, + "step": 53340 + }, + { + "epoch": 2.452520873202703, + "grad_norm": 0.26054081320762634, + "learning_rate": 9.54853026196432e-05, + "loss": 1.4803, + "step": 53350 + }, + { + "epoch": 2.453062814368215, + "grad_norm": 0.27415192127227783, + "learning_rate": 9.547838573546872e-05, + "loss": 1.487, + "step": 53360 + }, + { + "epoch": 2.4536047555337275, + "grad_norm": 0.19018392264842987, + "learning_rate": 9.547146383700756e-05, + "loss": 1.4945, + "step": 53370 + }, + { + "epoch": 2.4541466966992393, + "grad_norm": 0.23459668457508087, + "learning_rate": 9.546453692511715e-05, + "loss": 1.4807, + "step": 53380 + }, + { + "epoch": 2.4546886378647517, + "grad_norm": 0.30195194482803345, + "learning_rate": 9.545760500065562e-05, + "loss": 1.4858, + "step": 53390 + }, + { + "epoch": 2.455230579030264, + "grad_norm": 0.23426523804664612, + "learning_rate": 9.54506680644816e-05, + "loss": 1.4805, + "step": 53400 + }, + { + "epoch": 2.4557725201957763, + "grad_norm": 0.23431915044784546, + "learning_rate": 9.544372611745444e-05, + "loss": 1.4824, + "step": 53410 + }, + { + "epoch": 2.456206073128186, + "eval_loss": 2.459233283996582, + "eval_runtime": 22.0413, + "eval_samples_per_second": 226.847, + "eval_steps_per_second": 1.225, + "step": 53418 + }, + { + "epoch": 2.4563144613612886, + "grad_norm": 0.19480261206626892, + "learning_rate": 9.54367791604341e-05, + "loss": 1.4798, + "step": 53420 + }, + { + "epoch": 2.4568564025268005, + "grad_norm": 0.23288105428218842, + "learning_rate": 9.54298271942811e-05, + "loss": 1.4837, + "step": 53430 + }, + { + "epoch": 2.457398343692313, + "grad_norm": 0.21174949407577515, + "learning_rate": 9.542287021985665e-05, + "loss": 1.4851, + "step": 53440 + }, + { + "epoch": 2.457940284857825, + "grad_norm": 0.4950149953365326, + "learning_rate": 9.541590823802252e-05, + "loss": 1.4755, + "step": 53450 + }, + { + "epoch": 2.4584822260233374, + "grad_norm": 0.2723096013069153, + "learning_rate": 9.540894124964115e-05, + "loss": 1.4802, + "step": 53460 + }, + { + "epoch": 2.4590241671888498, + "grad_norm": 0.17027664184570312, + "learning_rate": 9.54019692555756e-05, + "loss": 1.4746, + "step": 53470 + }, + { + "epoch": 2.4595661083543616, + "grad_norm": 0.30771180987358093, + "learning_rate": 9.539499225668948e-05, + "loss": 1.4835, + "step": 53480 + }, + { + "epoch": 2.460108049519874, + "grad_norm": 0.21357007324695587, + "learning_rate": 9.538801025384709e-05, + "loss": 1.4775, + "step": 53490 + }, + { + "epoch": 2.4606499906853863, + "grad_norm": 0.28452086448669434, + "learning_rate": 9.538102324791336e-05, + "loss": 1.4832, + "step": 53500 + }, + { + "epoch": 2.460920961268142, + "eval_loss": 2.4530394077301025, + "eval_runtime": 22.1982, + "eval_samples_per_second": 225.244, + "eval_steps_per_second": 1.216, + "step": 53505 + }, + { + "epoch": 2.4611919318508986, + "grad_norm": 0.20882080495357513, + "learning_rate": 9.537403123975378e-05, + "loss": 1.4729, + "step": 53510 + }, + { + "epoch": 2.461733873016411, + "grad_norm": 0.2761918306350708, + "learning_rate": 9.536703423023449e-05, + "loss": 1.4739, + "step": 53520 + }, + { + "epoch": 2.4622758141819228, + "grad_norm": 0.1824912279844284, + "learning_rate": 9.536003222022225e-05, + "loss": 1.48, + "step": 53530 + }, + { + "epoch": 2.462817755347435, + "grad_norm": 0.22570142149925232, + "learning_rate": 9.535302521058445e-05, + "loss": 1.4832, + "step": 53540 + }, + { + "epoch": 2.4633596965129474, + "grad_norm": 0.3082813620567322, + "learning_rate": 9.534601320218909e-05, + "loss": 1.4899, + "step": 53550 + }, + { + "epoch": 2.4639016376784597, + "grad_norm": 0.17578811943531036, + "learning_rate": 9.533899619590477e-05, + "loss": 1.4744, + "step": 53560 + }, + { + "epoch": 2.4644435788439716, + "grad_norm": 0.20538325607776642, + "learning_rate": 9.533197419260073e-05, + "loss": 1.4834, + "step": 53570 + }, + { + "epoch": 2.464985520009484, + "grad_norm": 0.2888244688510895, + "learning_rate": 9.532494719314686e-05, + "loss": 1.4833, + "step": 53580 + }, + { + "epoch": 2.465527461174996, + "grad_norm": 0.26800277829170227, + "learning_rate": 9.531791519841359e-05, + "loss": 1.4856, + "step": 53590 + }, + { + "epoch": 2.4656358494080988, + "eval_loss": 2.4533004760742188, + "eval_runtime": 22.8599, + "eval_samples_per_second": 218.724, + "eval_steps_per_second": 1.181, + "step": 53592 + }, + { + "epoch": 2.4660694023405085, + "grad_norm": 0.24594108760356903, + "learning_rate": 9.531087820927201e-05, + "loss": 1.4797, + "step": 53600 + }, + { + "epoch": 2.4666113435060204, + "grad_norm": 0.18168674409389496, + "learning_rate": 9.530383622659386e-05, + "loss": 1.4837, + "step": 53610 + }, + { + "epoch": 2.4671532846715327, + "grad_norm": 0.20581193268299103, + "learning_rate": 9.529678925125148e-05, + "loss": 1.4761, + "step": 53620 + }, + { + "epoch": 2.467695225837045, + "grad_norm": 0.33261099457740784, + "learning_rate": 9.528973728411778e-05, + "loss": 1.4789, + "step": 53630 + }, + { + "epoch": 2.4682371670025574, + "grad_norm": 0.3050486147403717, + "learning_rate": 9.528268032606636e-05, + "loss": 1.4759, + "step": 53640 + }, + { + "epoch": 2.4687791081680697, + "grad_norm": 0.24290646612644196, + "learning_rate": 9.527561837797136e-05, + "loss": 1.4944, + "step": 53650 + }, + { + "epoch": 2.4693210493335815, + "grad_norm": 0.26528191566467285, + "learning_rate": 9.526855144070763e-05, + "loss": 1.4721, + "step": 53660 + }, + { + "epoch": 2.469862990499094, + "grad_norm": 0.1806386113166809, + "learning_rate": 9.52614795151506e-05, + "loss": 1.4691, + "step": 53670 + }, + { + "epoch": 2.470350737548055, + "eval_loss": 2.45682430267334, + "eval_runtime": 21.9989, + "eval_samples_per_second": 227.284, + "eval_steps_per_second": 1.227, + "step": 53679 + }, + { + "epoch": 2.470404931664606, + "grad_norm": 0.27701669931411743, + "learning_rate": 9.525440260217627e-05, + "loss": 1.4868, + "step": 53680 + }, + { + "epoch": 2.4709468728301185, + "grad_norm": 0.23009248077869415, + "learning_rate": 9.52473207026613e-05, + "loss": 1.4797, + "step": 53690 + }, + { + "epoch": 2.471488813995631, + "grad_norm": 0.19565734267234802, + "learning_rate": 9.524023381748298e-05, + "loss": 1.4742, + "step": 53700 + }, + { + "epoch": 2.4720307551611427, + "grad_norm": 0.3481823801994324, + "learning_rate": 9.523314194751921e-05, + "loss": 1.4823, + "step": 53710 + }, + { + "epoch": 2.472572696326655, + "grad_norm": 0.1881093680858612, + "learning_rate": 9.522604509364849e-05, + "loss": 1.4707, + "step": 53720 + }, + { + "epoch": 2.4731146374921673, + "grad_norm": 0.4059028625488281, + "learning_rate": 9.521894325674994e-05, + "loss": 1.4749, + "step": 53730 + }, + { + "epoch": 2.4736565786576796, + "grad_norm": 0.3592798709869385, + "learning_rate": 9.521183643770333e-05, + "loss": 1.477, + "step": 53740 + }, + { + "epoch": 2.474198519823192, + "grad_norm": 0.2864842712879181, + "learning_rate": 9.520472463738899e-05, + "loss": 1.4784, + "step": 53750 + }, + { + "epoch": 2.474740460988704, + "grad_norm": 0.20083869993686676, + "learning_rate": 9.519760785668791e-05, + "loss": 1.4775, + "step": 53760 + }, + { + "epoch": 2.4750656256880115, + "eval_loss": 2.4518940448760986, + "eval_runtime": 33.4746, + "eval_samples_per_second": 149.367, + "eval_steps_per_second": 0.807, + "step": 53766 + }, + { + "epoch": 2.475282402154216, + "grad_norm": 0.25543951988220215, + "learning_rate": 9.519048609648169e-05, + "loss": 1.4624, + "step": 53770 + }, + { + "epoch": 2.4758243433197284, + "grad_norm": 0.2484443336725235, + "learning_rate": 9.518335935765256e-05, + "loss": 1.483, + "step": 53780 + }, + { + "epoch": 2.4763662844852408, + "grad_norm": 0.3281131088733673, + "learning_rate": 9.517622764108331e-05, + "loss": 1.4762, + "step": 53790 + }, + { + "epoch": 2.4769082256507526, + "grad_norm": 0.23684971034526825, + "learning_rate": 9.516909094765741e-05, + "loss": 1.4772, + "step": 53800 + }, + { + "epoch": 2.477450166816265, + "grad_norm": 0.2768543064594269, + "learning_rate": 9.516194927825894e-05, + "loss": 1.4877, + "step": 53810 + }, + { + "epoch": 2.4779921079817773, + "grad_norm": 0.20820960402488708, + "learning_rate": 9.515480263377253e-05, + "loss": 1.4711, + "step": 53820 + }, + { + "epoch": 2.4785340491472896, + "grad_norm": 0.18678176403045654, + "learning_rate": 9.514765101508353e-05, + "loss": 1.4923, + "step": 53830 + }, + { + "epoch": 2.4790759903128015, + "grad_norm": 0.22122722864151, + "learning_rate": 9.514049442307782e-05, + "loss": 1.4837, + "step": 53840 + }, + { + "epoch": 2.4796179314783138, + "grad_norm": 0.3165682256221771, + "learning_rate": 9.513333285864192e-05, + "loss": 1.4856, + "step": 53850 + }, + { + "epoch": 2.4797805138279676, + "eval_loss": 2.4556586742401123, + "eval_runtime": 33.9488, + "eval_samples_per_second": 147.281, + "eval_steps_per_second": 0.795, + "step": 53853 + }, + { + "epoch": 2.480159872643826, + "grad_norm": 0.2486167550086975, + "learning_rate": 9.5126166322663e-05, + "loss": 1.4829, + "step": 53860 + }, + { + "epoch": 2.4807018138093384, + "grad_norm": 0.21120484173297882, + "learning_rate": 9.51189948160288e-05, + "loss": 1.479, + "step": 53870 + }, + { + "epoch": 2.4812437549748507, + "grad_norm": 0.18009105324745178, + "learning_rate": 9.511181833962772e-05, + "loss": 1.4698, + "step": 53880 + }, + { + "epoch": 2.4817856961403626, + "grad_norm": 0.19406986236572266, + "learning_rate": 9.51046368943487e-05, + "loss": 1.484, + "step": 53890 + }, + { + "epoch": 2.482327637305875, + "grad_norm": 0.22672545909881592, + "learning_rate": 9.50974504810814e-05, + "loss": 1.4773, + "step": 53900 + }, + { + "epoch": 2.4828695784713872, + "grad_norm": 0.21448560059070587, + "learning_rate": 9.509025910071602e-05, + "loss": 1.4781, + "step": 53910 + }, + { + "epoch": 2.4834115196368995, + "grad_norm": 0.3304313123226166, + "learning_rate": 9.508306275414339e-05, + "loss": 1.472, + "step": 53920 + }, + { + "epoch": 2.483953460802412, + "grad_norm": 0.29814818501472473, + "learning_rate": 9.507586144225497e-05, + "loss": 1.4737, + "step": 53930 + }, + { + "epoch": 2.4844954019679237, + "grad_norm": 0.20384559035301208, + "learning_rate": 9.506865516594282e-05, + "loss": 1.4748, + "step": 53940 + }, + { + "epoch": 2.4844954019679237, + "eval_loss": 2.4536936283111572, + "eval_runtime": 21.8133, + "eval_samples_per_second": 229.218, + "eval_steps_per_second": 1.238, + "step": 53940 + }, + { + "epoch": 2.485037343133436, + "grad_norm": 0.1892993301153183, + "learning_rate": 9.506144392609965e-05, + "loss": 1.4846, + "step": 53950 + }, + { + "epoch": 2.4855792842989484, + "grad_norm": 0.37276574969291687, + "learning_rate": 9.505422772361872e-05, + "loss": 1.4809, + "step": 53960 + }, + { + "epoch": 2.4861212254644607, + "grad_norm": 0.5390621423721313, + "learning_rate": 9.504700655939396e-05, + "loss": 1.4752, + "step": 53970 + }, + { + "epoch": 2.4866631666299726, + "grad_norm": 0.26775336265563965, + "learning_rate": 9.50397804343199e-05, + "loss": 1.4858, + "step": 53980 + }, + { + "epoch": 2.487205107795485, + "grad_norm": 0.18470676243305206, + "learning_rate": 9.503254934929165e-05, + "loss": 1.4741, + "step": 53990 + }, + { + "epoch": 2.487747048960997, + "grad_norm": 0.24282990396022797, + "learning_rate": 9.502531330520501e-05, + "loss": 1.4781, + "step": 54000 + }, + { + "epoch": 2.4882889901265095, + "grad_norm": 0.19152526557445526, + "learning_rate": 9.501807230295634e-05, + "loss": 1.4714, + "step": 54010 + }, + { + "epoch": 2.4888309312920214, + "grad_norm": 0.24509835243225098, + "learning_rate": 9.50108263434426e-05, + "loss": 1.4673, + "step": 54020 + }, + { + "epoch": 2.4892102901078803, + "eval_loss": 2.4450418949127197, + "eval_runtime": 21.9853, + "eval_samples_per_second": 227.425, + "eval_steps_per_second": 1.228, + "step": 54027 + }, + { + "epoch": 2.4893728724575337, + "grad_norm": 0.20996253192424774, + "learning_rate": 9.500357542756139e-05, + "loss": 1.4696, + "step": 54030 + }, + { + "epoch": 2.489914813623046, + "grad_norm": 0.3065512776374817, + "learning_rate": 9.499631955621097e-05, + "loss": 1.4837, + "step": 54040 + }, + { + "epoch": 2.4904567547885583, + "grad_norm": 0.2927444577217102, + "learning_rate": 9.498905873029012e-05, + "loss": 1.4833, + "step": 54050 + }, + { + "epoch": 2.4909986959540706, + "grad_norm": 0.3556462228298187, + "learning_rate": 9.498179295069827e-05, + "loss": 1.4744, + "step": 54060 + }, + { + "epoch": 2.4915406371195825, + "grad_norm": 0.3211055099964142, + "learning_rate": 9.49745222183355e-05, + "loss": 1.4723, + "step": 54070 + }, + { + "epoch": 2.492082578285095, + "grad_norm": 0.510176956653595, + "learning_rate": 9.496724653410249e-05, + "loss": 1.4706, + "step": 54080 + }, + { + "epoch": 2.492624519450607, + "grad_norm": 0.20977604389190674, + "learning_rate": 9.495996589890048e-05, + "loss": 1.4692, + "step": 54090 + }, + { + "epoch": 2.4931664606161195, + "grad_norm": 0.19488461315631866, + "learning_rate": 9.495268031363138e-05, + "loss": 1.4821, + "step": 54100 + }, + { + "epoch": 2.493708401781632, + "grad_norm": 0.21856850385665894, + "learning_rate": 9.494538977919771e-05, + "loss": 1.4704, + "step": 54110 + }, + { + "epoch": 2.4939251782478364, + "eval_loss": 2.4471306800842285, + "eval_runtime": 21.9864, + "eval_samples_per_second": 227.414, + "eval_steps_per_second": 1.228, + "step": 54114 + }, + { + "epoch": 2.4942503429471437, + "grad_norm": 0.21702884137630463, + "learning_rate": 9.493809429650256e-05, + "loss": 1.4802, + "step": 54120 + }, + { + "epoch": 2.494792284112656, + "grad_norm": 0.17151543498039246, + "learning_rate": 9.49307938664497e-05, + "loss": 1.4798, + "step": 54130 + }, + { + "epoch": 2.4953342252781683, + "grad_norm": 0.20608776807785034, + "learning_rate": 9.492348848994345e-05, + "loss": 1.4736, + "step": 54140 + }, + { + "epoch": 2.4958761664436806, + "grad_norm": 0.29367008805274963, + "learning_rate": 9.491617816788879e-05, + "loss": 1.4799, + "step": 54150 + }, + { + "epoch": 2.496418107609193, + "grad_norm": 0.38167864084243774, + "learning_rate": 9.490886290119128e-05, + "loss": 1.4636, + "step": 54160 + }, + { + "epoch": 2.496960048774705, + "grad_norm": 0.18490886688232422, + "learning_rate": 9.490154269075708e-05, + "loss": 1.4694, + "step": 54170 + }, + { + "epoch": 2.497501989940217, + "grad_norm": 0.44868624210357666, + "learning_rate": 9.489421753749303e-05, + "loss": 1.4783, + "step": 54180 + }, + { + "epoch": 2.4980439311057294, + "grad_norm": 0.34574294090270996, + "learning_rate": 9.488688744230649e-05, + "loss": 1.4775, + "step": 54190 + }, + { + "epoch": 2.4985858722712417, + "grad_norm": 0.34734290838241577, + "learning_rate": 9.487955240610553e-05, + "loss": 1.4725, + "step": 54200 + }, + { + "epoch": 2.498640066387793, + "eval_loss": 2.442676305770874, + "eval_runtime": 21.9936, + "eval_samples_per_second": 227.339, + "eval_steps_per_second": 1.228, + "step": 54201 + }, + { + "epoch": 2.4991278134367536, + "grad_norm": 0.18172703683376312, + "learning_rate": 9.487221242979874e-05, + "loss": 1.4862, + "step": 54210 + }, + { + "epoch": 2.499669754602266, + "grad_norm": 0.4136309027671814, + "learning_rate": 9.486486751429539e-05, + "loss": 1.4723, + "step": 54220 + }, + { + "epoch": 2.5002116957677782, + "grad_norm": 0.2756052315235138, + "learning_rate": 9.485751766050533e-05, + "loss": 1.4846, + "step": 54230 + }, + { + "epoch": 2.5007536369332906, + "grad_norm": 0.34378984570503235, + "learning_rate": 9.485016286933902e-05, + "loss": 1.4631, + "step": 54240 + }, + { + "epoch": 2.5012955780988024, + "grad_norm": 0.2502152919769287, + "learning_rate": 9.484280314170754e-05, + "loss": 1.4684, + "step": 54250 + }, + { + "epoch": 2.5018375192643147, + "grad_norm": 0.20602858066558838, + "learning_rate": 9.483543847852258e-05, + "loss": 1.461, + "step": 54260 + }, + { + "epoch": 2.502379460429827, + "grad_norm": 0.19559136033058167, + "learning_rate": 9.482806888069648e-05, + "loss": 1.4821, + "step": 54270 + }, + { + "epoch": 2.5029214015953394, + "grad_norm": 0.19327175617218018, + "learning_rate": 9.48206943491421e-05, + "loss": 1.4777, + "step": 54280 + }, + { + "epoch": 2.503354954527749, + "eval_loss": 2.4406850337982178, + "eval_runtime": 21.9849, + "eval_samples_per_second": 227.429, + "eval_steps_per_second": 1.228, + "step": 54288 + }, + { + "epoch": 2.5034633427608517, + "grad_norm": 0.19267085194587708, + "learning_rate": 9.4813314884773e-05, + "loss": 1.48, + "step": 54290 + }, + { + "epoch": 2.5040052839263636, + "grad_norm": 0.21109183132648468, + "learning_rate": 9.48059304885033e-05, + "loss": 1.4782, + "step": 54300 + }, + { + "epoch": 2.504547225091876, + "grad_norm": 0.2512279152870178, + "learning_rate": 9.479854116124775e-05, + "loss": 1.4898, + "step": 54310 + }, + { + "epoch": 2.505089166257388, + "grad_norm": 0.24899804592132568, + "learning_rate": 9.479114690392172e-05, + "loss": 1.4642, + "step": 54320 + }, + { + "epoch": 2.5056311074229005, + "grad_norm": 0.20213483273983002, + "learning_rate": 9.478374771744115e-05, + "loss": 1.4686, + "step": 54330 + }, + { + "epoch": 2.506173048588413, + "grad_norm": 0.1799444556236267, + "learning_rate": 9.477634360272265e-05, + "loss": 1.4745, + "step": 54340 + }, + { + "epoch": 2.5067149897539247, + "grad_norm": 0.347083181142807, + "learning_rate": 9.476893456068338e-05, + "loss": 1.48, + "step": 54350 + }, + { + "epoch": 2.507256930919437, + "grad_norm": 0.2862820625305176, + "learning_rate": 9.476152059224117e-05, + "loss": 1.4804, + "step": 54360 + }, + { + "epoch": 2.5077988720849493, + "grad_norm": 0.285040020942688, + "learning_rate": 9.475410169831442e-05, + "loss": 1.4798, + "step": 54370 + }, + { + "epoch": 2.5080698426677053, + "eval_loss": 2.4463353157043457, + "eval_runtime": 22.1709, + "eval_samples_per_second": 225.521, + "eval_steps_per_second": 1.218, + "step": 54375 + }, + { + "epoch": 2.5083408132504617, + "grad_norm": 0.2216855138540268, + "learning_rate": 9.474667787982213e-05, + "loss": 1.4854, + "step": 54380 + }, + { + "epoch": 2.508882754415974, + "grad_norm": 0.307620108127594, + "learning_rate": 9.473924913768396e-05, + "loss": 1.4787, + "step": 54390 + }, + { + "epoch": 2.509424695581486, + "grad_norm": 0.30470430850982666, + "learning_rate": 9.473181547282013e-05, + "loss": 1.4721, + "step": 54400 + }, + { + "epoch": 2.509966636746998, + "grad_norm": 0.35153964161872864, + "learning_rate": 9.47243768861515e-05, + "loss": 1.4701, + "step": 54410 + }, + { + "epoch": 2.5105085779125105, + "grad_norm": 0.24849040806293488, + "learning_rate": 9.471693337859953e-05, + "loss": 1.4609, + "step": 54420 + }, + { + "epoch": 2.5110505190780223, + "grad_norm": 0.3956458270549774, + "learning_rate": 9.470948495108628e-05, + "loss": 1.4819, + "step": 54430 + }, + { + "epoch": 2.5115924602435347, + "grad_norm": 0.21388791501522064, + "learning_rate": 9.470203160453445e-05, + "loss": 1.4825, + "step": 54440 + }, + { + "epoch": 2.512134401409047, + "grad_norm": 0.35142725706100464, + "learning_rate": 9.46945733398673e-05, + "loss": 1.4787, + "step": 54450 + }, + { + "epoch": 2.5126763425745593, + "grad_norm": 0.22184672951698303, + "learning_rate": 9.468711015800874e-05, + "loss": 1.4662, + "step": 54460 + }, + { + "epoch": 2.512784730807662, + "eval_loss": 2.446195363998413, + "eval_runtime": 22.0811, + "eval_samples_per_second": 226.438, + "eval_steps_per_second": 1.223, + "step": 54462 + }, + { + "epoch": 2.5132182837400716, + "grad_norm": 0.27088162302970886, + "learning_rate": 9.46796420598833e-05, + "loss": 1.473, + "step": 54470 + }, + { + "epoch": 2.5137602249055835, + "grad_norm": 0.2419961541891098, + "learning_rate": 9.467216904641606e-05, + "loss": 1.4824, + "step": 54480 + }, + { + "epoch": 2.514302166071096, + "grad_norm": 0.2746829688549042, + "learning_rate": 9.466469111853275e-05, + "loss": 1.4729, + "step": 54490 + }, + { + "epoch": 2.514844107236608, + "grad_norm": 0.2866189479827881, + "learning_rate": 9.465720827715972e-05, + "loss": 1.476, + "step": 54500 + }, + { + "epoch": 2.5153860484021204, + "grad_norm": 0.19014035165309906, + "learning_rate": 9.464972052322388e-05, + "loss": 1.4803, + "step": 54510 + }, + { + "epoch": 2.5159279895676327, + "grad_norm": 0.2546852231025696, + "learning_rate": 9.464222785765284e-05, + "loss": 1.4749, + "step": 54520 + }, + { + "epoch": 2.5164699307331446, + "grad_norm": 0.3189975917339325, + "learning_rate": 9.46347302813747e-05, + "loss": 1.4838, + "step": 54530 + }, + { + "epoch": 2.517011871898657, + "grad_norm": 0.25641825795173645, + "learning_rate": 9.462722779531825e-05, + "loss": 1.4775, + "step": 54540 + }, + { + "epoch": 2.517499618947618, + "eval_loss": 2.4492218494415283, + "eval_runtime": 22.0696, + "eval_samples_per_second": 226.556, + "eval_steps_per_second": 1.223, + "step": 54549 + }, + { + "epoch": 2.5175538130641693, + "grad_norm": 0.36368492245674133, + "learning_rate": 9.461972040041286e-05, + "loss": 1.4834, + "step": 54550 + }, + { + "epoch": 2.5180957542296816, + "grad_norm": 0.41073617339134216, + "learning_rate": 9.461220809758854e-05, + "loss": 1.475, + "step": 54560 + }, + { + "epoch": 2.518637695395194, + "grad_norm": 0.42227840423583984, + "learning_rate": 9.460469088777585e-05, + "loss": 1.4782, + "step": 54570 + }, + { + "epoch": 2.5191796365607058, + "grad_norm": 0.24830390512943268, + "learning_rate": 9.459716877190599e-05, + "loss": 1.4789, + "step": 54580 + }, + { + "epoch": 2.519721577726218, + "grad_norm": 0.2122548371553421, + "learning_rate": 9.458964175091078e-05, + "loss": 1.4714, + "step": 54590 + }, + { + "epoch": 2.5202635188917304, + "grad_norm": 0.1787809282541275, + "learning_rate": 9.458210982572264e-05, + "loss": 1.4732, + "step": 54600 + }, + { + "epoch": 2.5208054600572423, + "grad_norm": 0.19361905753612518, + "learning_rate": 9.457457299727458e-05, + "loss": 1.4837, + "step": 54610 + }, + { + "epoch": 2.521347401222755, + "grad_norm": 0.459349662065506, + "learning_rate": 9.456703126650023e-05, + "loss": 1.4909, + "step": 54620 + }, + { + "epoch": 2.521889342388267, + "grad_norm": 0.31457701325416565, + "learning_rate": 9.455948463433384e-05, + "loss": 1.4623, + "step": 54630 + }, + { + "epoch": 2.5222145070875746, + "eval_loss": 2.4459049701690674, + "eval_runtime": 22.0427, + "eval_samples_per_second": 226.832, + "eval_steps_per_second": 1.225, + "step": 54636 + }, + { + "epoch": 2.522431283553779, + "grad_norm": 0.2660439610481262, + "learning_rate": 9.455193310171022e-05, + "loss": 1.4708, + "step": 54640 + }, + { + "epoch": 2.5229732247192915, + "grad_norm": 0.22614122927188873, + "learning_rate": 9.454437666956486e-05, + "loss": 1.4763, + "step": 54650 + }, + { + "epoch": 2.5235151658848034, + "grad_norm": 0.23727241158485413, + "learning_rate": 9.45368153388338e-05, + "loss": 1.4875, + "step": 54660 + }, + { + "epoch": 2.5240571070503157, + "grad_norm": 0.20123521983623505, + "learning_rate": 9.452924911045372e-05, + "loss": 1.4907, + "step": 54670 + }, + { + "epoch": 2.524599048215828, + "grad_norm": 0.28257882595062256, + "learning_rate": 9.452167798536186e-05, + "loss": 1.4782, + "step": 54680 + }, + { + "epoch": 2.5251409893813404, + "grad_norm": 0.16655373573303223, + "learning_rate": 9.451410196449613e-05, + "loss": 1.4794, + "step": 54690 + }, + { + "epoch": 2.5256829305468527, + "grad_norm": 0.28501424193382263, + "learning_rate": 9.450652104879499e-05, + "loss": 1.4713, + "step": 54700 + }, + { + "epoch": 2.5262248717123645, + "grad_norm": 0.24294120073318481, + "learning_rate": 9.449893523919754e-05, + "loss": 1.4853, + "step": 54710 + }, + { + "epoch": 2.526766812877877, + "grad_norm": 0.5156522393226624, + "learning_rate": 9.44913445366435e-05, + "loss": 1.4686, + "step": 54720 + }, + { + "epoch": 2.5269293952275307, + "eval_loss": 2.447787046432495, + "eval_runtime": 22.1759, + "eval_samples_per_second": 225.47, + "eval_steps_per_second": 1.218, + "step": 54723 + }, + { + "epoch": 2.527308754043389, + "grad_norm": 0.21539968252182007, + "learning_rate": 9.448374894207314e-05, + "loss": 1.4801, + "step": 54730 + }, + { + "epoch": 2.5278506952089015, + "grad_norm": 0.268772155046463, + "learning_rate": 9.447614845642738e-05, + "loss": 1.4644, + "step": 54740 + }, + { + "epoch": 2.528392636374414, + "grad_norm": 0.1783050000667572, + "learning_rate": 9.446854308064774e-05, + "loss": 1.4827, + "step": 54750 + }, + { + "epoch": 2.5289345775399257, + "grad_norm": 0.17679934203624725, + "learning_rate": 9.446093281567635e-05, + "loss": 1.4651, + "step": 54760 + }, + { + "epoch": 2.529476518705438, + "grad_norm": 0.2454293668270111, + "learning_rate": 9.445331766245592e-05, + "loss": 1.4663, + "step": 54770 + }, + { + "epoch": 2.5300184598709503, + "grad_norm": 0.1850767731666565, + "learning_rate": 9.444569762192977e-05, + "loss": 1.4661, + "step": 54780 + }, + { + "epoch": 2.5305604010364626, + "grad_norm": 0.2817839980125427, + "learning_rate": 9.443807269504187e-05, + "loss": 1.4777, + "step": 54790 + }, + { + "epoch": 2.531102342201975, + "grad_norm": 0.2719174921512604, + "learning_rate": 9.443044288273675e-05, + "loss": 1.4745, + "step": 54800 + }, + { + "epoch": 2.531644283367487, + "grad_norm": 0.7066235542297363, + "learning_rate": 9.442280818595955e-05, + "loss": 1.466, + "step": 54810 + }, + { + "epoch": 2.531644283367487, + "eval_loss": 2.447768211364746, + "eval_runtime": 21.9147, + "eval_samples_per_second": 228.157, + "eval_steps_per_second": 1.232, + "step": 54810 + }, + { + "epoch": 2.532186224532999, + "grad_norm": 0.3116154968738556, + "learning_rate": 9.441516860565602e-05, + "loss": 1.4752, + "step": 54820 + }, + { + "epoch": 2.5327281656985114, + "grad_norm": 0.29827791452407837, + "learning_rate": 9.440752414277254e-05, + "loss": 1.4729, + "step": 54830 + }, + { + "epoch": 2.5332701068640233, + "grad_norm": 0.2395966500043869, + "learning_rate": 9.439987479825607e-05, + "loss": 1.4742, + "step": 54840 + }, + { + "epoch": 2.5338120480295356, + "grad_norm": 0.21512676775455475, + "learning_rate": 9.439222057305414e-05, + "loss": 1.4761, + "step": 54850 + }, + { + "epoch": 2.534353989195048, + "grad_norm": 0.19757066667079926, + "learning_rate": 9.438456146811496e-05, + "loss": 1.4736, + "step": 54860 + }, + { + "epoch": 2.5348959303605603, + "grad_norm": 0.2103499174118042, + "learning_rate": 9.43768974843873e-05, + "loss": 1.4746, + "step": 54870 + }, + { + "epoch": 2.5354378715260726, + "grad_norm": 0.19325312972068787, + "learning_rate": 9.436922862282052e-05, + "loss": 1.4561, + "step": 54880 + }, + { + "epoch": 2.5359798126915845, + "grad_norm": 0.2294219732284546, + "learning_rate": 9.436155488436464e-05, + "loss": 1.4783, + "step": 54890 + }, + { + "epoch": 2.5363591715074434, + "eval_loss": 2.4521801471710205, + "eval_runtime": 22.0914, + "eval_samples_per_second": 226.332, + "eval_steps_per_second": 1.222, + "step": 54897 + }, + { + "epoch": 2.5365217538570968, + "grad_norm": 0.40045541524887085, + "learning_rate": 9.435387626997024e-05, + "loss": 1.4763, + "step": 54900 + }, + { + "epoch": 2.537063695022609, + "grad_norm": 0.2468022257089615, + "learning_rate": 9.434619278058848e-05, + "loss": 1.4752, + "step": 54910 + }, + { + "epoch": 2.5376056361881214, + "grad_norm": 0.2977105975151062, + "learning_rate": 9.433850441717122e-05, + "loss": 1.4786, + "step": 54920 + }, + { + "epoch": 2.5381475773536337, + "grad_norm": 0.20533820986747742, + "learning_rate": 9.433081118067078e-05, + "loss": 1.479, + "step": 54930 + }, + { + "epoch": 2.5386895185191456, + "grad_norm": 0.2988840937614441, + "learning_rate": 9.432311307204024e-05, + "loss": 1.4668, + "step": 54940 + }, + { + "epoch": 2.539231459684658, + "grad_norm": 0.24603962898254395, + "learning_rate": 9.431541009223316e-05, + "loss": 1.4821, + "step": 54950 + }, + { + "epoch": 2.5397734008501702, + "grad_norm": 0.1960807591676712, + "learning_rate": 9.430770224220376e-05, + "loss": 1.4664, + "step": 54960 + }, + { + "epoch": 2.5403153420156825, + "grad_norm": 0.20046474039554596, + "learning_rate": 9.429998952290688e-05, + "loss": 1.4732, + "step": 54970 + }, + { + "epoch": 2.540857283181195, + "grad_norm": 0.1977715790271759, + "learning_rate": 9.429227193529791e-05, + "loss": 1.4807, + "step": 54980 + }, + { + "epoch": 2.5410740596473995, + "eval_loss": 2.445849657058716, + "eval_runtime": 21.9883, + "eval_samples_per_second": 227.394, + "eval_steps_per_second": 1.228, + "step": 54984 + }, + { + "epoch": 2.5413992243467067, + "grad_norm": 0.20431387424468994, + "learning_rate": 9.42845494803329e-05, + "loss": 1.4758, + "step": 54990 + }, + { + "epoch": 2.541941165512219, + "grad_norm": 0.3124183416366577, + "learning_rate": 9.427682215896846e-05, + "loss": 1.4776, + "step": 55000 + }, + { + "epoch": 2.5424831066777314, + "grad_norm": 0.17832711338996887, + "learning_rate": 9.426908997216179e-05, + "loss": 1.4732, + "step": 55010 + }, + { + "epoch": 2.5430250478432437, + "grad_norm": 0.2197619527578354, + "learning_rate": 9.426135292087076e-05, + "loss": 1.4652, + "step": 55020 + }, + { + "epoch": 2.543566989008756, + "grad_norm": 0.3124410808086395, + "learning_rate": 9.425361100605378e-05, + "loss": 1.4776, + "step": 55030 + }, + { + "epoch": 2.544108930174268, + "grad_norm": 0.28143149614334106, + "learning_rate": 9.424586422866989e-05, + "loss": 1.4755, + "step": 55040 + }, + { + "epoch": 2.54465087133978, + "grad_norm": 0.3505375385284424, + "learning_rate": 9.423811258967872e-05, + "loss": 1.4826, + "step": 55050 + }, + { + "epoch": 2.5451928125052925, + "grad_norm": 0.30467939376831055, + "learning_rate": 9.423035609004054e-05, + "loss": 1.4683, + "step": 55060 + }, + { + "epoch": 2.5457347536708044, + "grad_norm": 0.2334168255329132, + "learning_rate": 9.422259473071615e-05, + "loss": 1.4676, + "step": 55070 + }, + { + "epoch": 2.545788947787356, + "eval_loss": 2.4590961933135986, + "eval_runtime": 22.0238, + "eval_samples_per_second": 227.027, + "eval_steps_per_second": 1.226, + "step": 55071 + }, + { + "epoch": 2.5462766948363167, + "grad_norm": 0.24799096584320068, + "learning_rate": 9.421482851266702e-05, + "loss": 1.4726, + "step": 55080 + }, + { + "epoch": 2.546818636001829, + "grad_norm": 0.17815372347831726, + "learning_rate": 9.420705743685519e-05, + "loss": 1.4807, + "step": 55090 + }, + { + "epoch": 2.5473605771673413, + "grad_norm": 0.1867280900478363, + "learning_rate": 9.419928150424328e-05, + "loss": 1.479, + "step": 55100 + }, + { + "epoch": 2.5479025183328536, + "grad_norm": 0.18417419493198395, + "learning_rate": 9.419150071579459e-05, + "loss": 1.4774, + "step": 55110 + }, + { + "epoch": 2.5484444594983655, + "grad_norm": 0.24251243472099304, + "learning_rate": 9.418371507247294e-05, + "loss": 1.4771, + "step": 55120 + }, + { + "epoch": 2.548986400663878, + "grad_norm": 0.22498652338981628, + "learning_rate": 9.417592457524278e-05, + "loss": 1.4618, + "step": 55130 + }, + { + "epoch": 2.54952834182939, + "grad_norm": 0.3402760624885559, + "learning_rate": 9.416812922506917e-05, + "loss": 1.4772, + "step": 55140 + }, + { + "epoch": 2.5500702829949025, + "grad_norm": 0.22304527461528778, + "learning_rate": 9.416032902291778e-05, + "loss": 1.4924, + "step": 55150 + }, + { + "epoch": 2.5505038359273122, + "eval_loss": 2.4431533813476562, + "eval_runtime": 22.327, + "eval_samples_per_second": 223.944, + "eval_steps_per_second": 1.209, + "step": 55158 + }, + { + "epoch": 2.5506122241604148, + "grad_norm": 0.17721644043922424, + "learning_rate": 9.415252396975482e-05, + "loss": 1.4695, + "step": 55160 + }, + { + "epoch": 2.5511541653259266, + "grad_norm": 0.2341170608997345, + "learning_rate": 9.414471406654718e-05, + "loss": 1.4676, + "step": 55170 + }, + { + "epoch": 2.551696106491439, + "grad_norm": 0.17687572538852692, + "learning_rate": 9.413689931426232e-05, + "loss": 1.4775, + "step": 55180 + }, + { + "epoch": 2.5522380476569513, + "grad_norm": 0.3460327982902527, + "learning_rate": 9.412907971386828e-05, + "loss": 1.4701, + "step": 55190 + }, + { + "epoch": 2.5527799888224636, + "grad_norm": 0.3097844123840332, + "learning_rate": 9.412125526633373e-05, + "loss": 1.4664, + "step": 55200 + }, + { + "epoch": 2.553321929987976, + "grad_norm": 0.2532496452331543, + "learning_rate": 9.411342597262794e-05, + "loss": 1.4723, + "step": 55210 + }, + { + "epoch": 2.553863871153488, + "grad_norm": 0.18894581496715546, + "learning_rate": 9.410559183372072e-05, + "loss": 1.4768, + "step": 55220 + }, + { + "epoch": 2.554405812319, + "grad_norm": 0.3138901889324188, + "learning_rate": 9.409775285058259e-05, + "loss": 1.4703, + "step": 55230 + }, + { + "epoch": 2.5549477534845124, + "grad_norm": 0.5384250283241272, + "learning_rate": 9.408990902418458e-05, + "loss": 1.4734, + "step": 55240 + }, + { + "epoch": 2.5552187240672684, + "eval_loss": 2.4425833225250244, + "eval_runtime": 22.3536, + "eval_samples_per_second": 223.678, + "eval_steps_per_second": 1.208, + "step": 55245 + }, + { + "epoch": 2.0001083882331026, + "grad_norm": 0.246513232588768, + "learning_rate": 9.408206035549835e-05, + "loss": 1.4546, + "step": 55250 + }, + { + "epoch": 2.000650329398615, + "grad_norm": 0.18351303040981293, + "learning_rate": 9.407420684549616e-05, + "loss": 1.4714, + "step": 55260 + }, + { + "epoch": 2.0011922705641267, + "grad_norm": 0.38743826746940613, + "learning_rate": 9.406634849515087e-05, + "loss": 1.4758, + "step": 55270 + }, + { + "epoch": 2.001734211729639, + "grad_norm": 0.2079382985830307, + "learning_rate": 9.405848530543593e-05, + "loss": 1.4662, + "step": 55280 + }, + { + "epoch": 2.0022761528951514, + "grad_norm": 0.19639058411121368, + "learning_rate": 9.40506172773254e-05, + "loss": 1.4805, + "step": 55290 + }, + { + "epoch": 2.0028180940606637, + "grad_norm": 0.19277872145175934, + "learning_rate": 9.404274441179397e-05, + "loss": 1.4692, + "step": 55300 + }, + { + "epoch": 2.0033600352261756, + "grad_norm": 0.3094417452812195, + "learning_rate": 9.403486670981685e-05, + "loss": 1.4718, + "step": 55310 + }, + { + "epoch": 2.003901976391688, + "grad_norm": 0.2434214949607849, + "learning_rate": 9.402698417236991e-05, + "loss": 1.4711, + "step": 55320 + }, + { + "epoch": 2.0044439175572, + "grad_norm": 0.37863555550575256, + "learning_rate": 9.401909680042962e-05, + "loss": 1.4704, + "step": 55330 + }, + { + "epoch": 2.0045523057903027, + "eval_loss": 2.450153112411499, + "eval_runtime": 62.0654, + "eval_samples_per_second": 80.56, + "eval_steps_per_second": 0.435, + "step": 55332 + }, + { + "epoch": 2.0049858587227125, + "grad_norm": 0.7027290463447571, + "learning_rate": 9.401120459497302e-05, + "loss": 1.4786, + "step": 55340 + }, + { + "epoch": 2.005527799888225, + "grad_norm": 0.42648592591285706, + "learning_rate": 9.400330755697774e-05, + "loss": 1.4755, + "step": 55350 + }, + { + "epoch": 2.0060697410537367, + "grad_norm": 0.22446554899215698, + "learning_rate": 9.399540568742209e-05, + "loss": 1.4791, + "step": 55360 + }, + { + "epoch": 2.006611682219249, + "grad_norm": 0.2071634829044342, + "learning_rate": 9.398749898728487e-05, + "loss": 1.4712, + "step": 55370 + }, + { + "epoch": 2.0071536233847613, + "grad_norm": 0.22032909095287323, + "learning_rate": 9.397958745754554e-05, + "loss": 1.4703, + "step": 55380 + }, + { + "epoch": 2.0076955645502736, + "grad_norm": 0.24659113585948944, + "learning_rate": 9.397167109918416e-05, + "loss": 1.4654, + "step": 55390 + }, + { + "epoch": 2.0082375057157855, + "grad_norm": 0.23570233583450317, + "learning_rate": 9.396374991318135e-05, + "loss": 1.4684, + "step": 55400 + }, + { + "epoch": 2.008779446881298, + "grad_norm": 0.3380047380924225, + "learning_rate": 9.395582390051838e-05, + "loss": 1.4691, + "step": 55410 + }, + { + "epoch": 2.009267193930259, + "eval_loss": 2.4444944858551025, + "eval_runtime": 24.1261, + "eval_samples_per_second": 207.245, + "eval_steps_per_second": 1.119, + "step": 55419 + }, + { + "epoch": 2.00932138804681, + "grad_norm": 0.2984738051891327, + "learning_rate": 9.394789306217707e-05, + "loss": 1.4741, + "step": 55420 + }, + { + "epoch": 2.0098633292123225, + "grad_norm": 0.2363492250442505, + "learning_rate": 9.393995739913985e-05, + "loss": 1.474, + "step": 55430 + }, + { + "epoch": 2.010405270377835, + "grad_norm": 0.18736572563648224, + "learning_rate": 9.393201691238976e-05, + "loss": 1.4761, + "step": 55440 + }, + { + "epoch": 2.0109472115433467, + "grad_norm": 0.27000486850738525, + "learning_rate": 9.392407160291045e-05, + "loss": 1.4624, + "step": 55450 + }, + { + "epoch": 2.011489152708859, + "grad_norm": 0.26010996103286743, + "learning_rate": 9.391612147168617e-05, + "loss": 1.4704, + "step": 55460 + }, + { + "epoch": 2.0120310938743713, + "grad_norm": 0.2281525433063507, + "learning_rate": 9.390816651970168e-05, + "loss": 1.465, + "step": 55470 + }, + { + "epoch": 2.0125730350398836, + "grad_norm": 0.20982374250888824, + "learning_rate": 9.390020674794245e-05, + "loss": 1.4664, + "step": 55480 + }, + { + "epoch": 2.013114976205396, + "grad_norm": 0.20972926914691925, + "learning_rate": 9.389224215739452e-05, + "loss": 1.4597, + "step": 55490 + }, + { + "epoch": 2.013656917370908, + "grad_norm": 0.21081332862377167, + "learning_rate": 9.388427274904447e-05, + "loss": 1.4585, + "step": 55500 + }, + { + "epoch": 2.0139820820702155, + "eval_loss": 2.451312780380249, + "eval_runtime": 21.9802, + "eval_samples_per_second": 227.477, + "eval_steps_per_second": 1.228, + "step": 55506 + }, + { + "epoch": 2.01419885853642, + "grad_norm": 0.22025637328624725, + "learning_rate": 9.387629852387952e-05, + "loss": 1.4756, + "step": 55510 + }, + { + "epoch": 2.0147407997019324, + "grad_norm": 0.18817798793315887, + "learning_rate": 9.38683194828875e-05, + "loss": 1.4721, + "step": 55520 + }, + { + "epoch": 2.0152827408674447, + "grad_norm": 0.30482780933380127, + "learning_rate": 9.386033562705681e-05, + "loss": 1.4679, + "step": 55530 + }, + { + "epoch": 2.0158246820329566, + "grad_norm": 0.20792457461357117, + "learning_rate": 9.385234695737647e-05, + "loss": 1.4554, + "step": 55540 + }, + { + "epoch": 2.016366623198469, + "grad_norm": 0.39416491985321045, + "learning_rate": 9.384435347483606e-05, + "loss": 1.4665, + "step": 55550 + }, + { + "epoch": 2.0169085643639812, + "grad_norm": 0.29534631967544556, + "learning_rate": 9.383635518042579e-05, + "loss": 1.4604, + "step": 55560 + }, + { + "epoch": 2.0174505055294936, + "grad_norm": 0.20709377527236938, + "learning_rate": 9.382835207513644e-05, + "loss": 1.4783, + "step": 55570 + }, + { + "epoch": 2.017992446695006, + "grad_norm": 0.1838233321905136, + "learning_rate": 9.382034415995942e-05, + "loss": 1.4729, + "step": 55580 + }, + { + "epoch": 2.0185343878605178, + "grad_norm": 0.20121820271015167, + "learning_rate": 9.381233143588671e-05, + "loss": 1.469, + "step": 55590 + }, + { + "epoch": 2.0186969702101716, + "eval_loss": 2.4466960430145264, + "eval_runtime": 22.0147, + "eval_samples_per_second": 227.121, + "eval_steps_per_second": 1.226, + "step": 55593 + }, + { + "epoch": 2.01907632902603, + "grad_norm": 0.24472562968730927, + "learning_rate": 9.380431390391089e-05, + "loss": 1.4665, + "step": 55600 + }, + { + "epoch": 2.0196182701915424, + "grad_norm": 0.2527831196784973, + "learning_rate": 9.379629156502513e-05, + "loss": 1.4773, + "step": 55610 + }, + { + "epoch": 2.0201602113570547, + "grad_norm": 0.23660998046398163, + "learning_rate": 9.37882644202232e-05, + "loss": 1.4686, + "step": 55620 + }, + { + "epoch": 2.0207021525225666, + "grad_norm": 0.2622048258781433, + "learning_rate": 9.378023247049949e-05, + "loss": 1.4686, + "step": 55630 + }, + { + "epoch": 2.021244093688079, + "grad_norm": 0.1906578242778778, + "learning_rate": 9.377219571684895e-05, + "loss": 1.4706, + "step": 55640 + }, + { + "epoch": 2.021786034853591, + "grad_norm": 0.2571401000022888, + "learning_rate": 9.376415416026712e-05, + "loss": 1.4626, + "step": 55650 + }, + { + "epoch": 2.0223279760191035, + "grad_norm": 0.37901771068573, + "learning_rate": 9.375610780175017e-05, + "loss": 1.4766, + "step": 55660 + }, + { + "epoch": 2.022869917184616, + "grad_norm": 0.23881062865257263, + "learning_rate": 9.374805664229484e-05, + "loss": 1.4653, + "step": 55670 + }, + { + "epoch": 2.0234118583501277, + "grad_norm": 0.5407462120056152, + "learning_rate": 9.37400006828985e-05, + "loss": 1.4641, + "step": 55680 + }, + { + "epoch": 2.0234118583501277, + "eval_loss": 2.4462928771972656, + "eval_runtime": 21.8259, + "eval_samples_per_second": 229.086, + "eval_steps_per_second": 1.237, + "step": 55680 + }, + { + "epoch": 2.02395379951564, + "grad_norm": 0.24240484833717346, + "learning_rate": 9.373193992455907e-05, + "loss": 1.4575, + "step": 55690 + }, + { + "epoch": 2.0244957406811523, + "grad_norm": 0.3244767487049103, + "learning_rate": 9.372387436827507e-05, + "loss": 1.4706, + "step": 55700 + }, + { + "epoch": 2.0250376818466647, + "grad_norm": 0.22140353918075562, + "learning_rate": 9.371580401504564e-05, + "loss": 1.4656, + "step": 55710 + }, + { + "epoch": 2.0255796230121765, + "grad_norm": 0.21475926041603088, + "learning_rate": 9.370772886587049e-05, + "loss": 1.4706, + "step": 55720 + }, + { + "epoch": 2.026121564177689, + "grad_norm": 0.2393941879272461, + "learning_rate": 9.369964892174995e-05, + "loss": 1.4622, + "step": 55730 + }, + { + "epoch": 2.026663505343201, + "grad_norm": 0.23679640889167786, + "learning_rate": 9.369156418368491e-05, + "loss": 1.4604, + "step": 55740 + }, + { + "epoch": 2.0272054465087135, + "grad_norm": 0.2113851010799408, + "learning_rate": 9.368347465267688e-05, + "loss": 1.4713, + "step": 55750 + }, + { + "epoch": 2.027747387674226, + "grad_norm": 0.25252246856689453, + "learning_rate": 9.367538032972797e-05, + "loss": 1.4595, + "step": 55760 + }, + { + "epoch": 2.0281267464900843, + "eval_loss": 2.453479051589966, + "eval_runtime": 21.9786, + "eval_samples_per_second": 227.494, + "eval_steps_per_second": 1.228, + "step": 55767 + }, + { + "epoch": 2.0282893288397377, + "grad_norm": 0.23864135146141052, + "learning_rate": 9.366728121584084e-05, + "loss": 1.4588, + "step": 55770 + }, + { + "epoch": 2.02883127000525, + "grad_norm": 0.24793609976768494, + "learning_rate": 9.365917731201879e-05, + "loss": 1.4593, + "step": 55780 + }, + { + "epoch": 2.0293732111707623, + "grad_norm": 0.21740341186523438, + "learning_rate": 9.365106861926571e-05, + "loss": 1.4745, + "step": 55790 + }, + { + "epoch": 2.0299151523362746, + "grad_norm": 0.19946753978729248, + "learning_rate": 9.364295513858604e-05, + "loss": 1.4723, + "step": 55800 + }, + { + "epoch": 2.030457093501787, + "grad_norm": 0.2608671486377716, + "learning_rate": 9.363483687098487e-05, + "loss": 1.4651, + "step": 55810 + }, + { + "epoch": 2.030999034667299, + "grad_norm": 0.2773636281490326, + "learning_rate": 9.362671381746784e-05, + "loss": 1.4797, + "step": 55820 + }, + { + "epoch": 2.031540975832811, + "grad_norm": 0.3309532105922699, + "learning_rate": 9.361858597904119e-05, + "loss": 1.4591, + "step": 55830 + }, + { + "epoch": 2.0320829169983234, + "grad_norm": 0.343264102935791, + "learning_rate": 9.36104533567118e-05, + "loss": 1.4657, + "step": 55840 + }, + { + "epoch": 2.0326248581638358, + "grad_norm": 0.21975140273571014, + "learning_rate": 9.360231595148708e-05, + "loss": 1.4684, + "step": 55850 + }, + { + "epoch": 2.0328416346300404, + "eval_loss": 2.453883409500122, + "eval_runtime": 21.9811, + "eval_samples_per_second": 227.468, + "eval_steps_per_second": 1.228, + "step": 55854 + }, + { + "epoch": 2.0331667993293476, + "grad_norm": 0.2824975848197937, + "learning_rate": 9.359417376437503e-05, + "loss": 1.4678, + "step": 55860 + }, + { + "epoch": 2.03370874049486, + "grad_norm": 0.41226598620414734, + "learning_rate": 9.35860267963843e-05, + "loss": 1.4693, + "step": 55870 + }, + { + "epoch": 2.0342506816603723, + "grad_norm": 0.37096548080444336, + "learning_rate": 9.35778750485241e-05, + "loss": 1.461, + "step": 55880 + }, + { + "epoch": 2.0347926228258846, + "grad_norm": 0.3635852038860321, + "learning_rate": 9.356971852180421e-05, + "loss": 1.4598, + "step": 55890 + }, + { + "epoch": 2.035334563991397, + "grad_norm": 0.32016879320144653, + "learning_rate": 9.356155721723506e-05, + "loss": 1.4612, + "step": 55900 + }, + { + "epoch": 2.0358765051569088, + "grad_norm": 0.23647364974021912, + "learning_rate": 9.355339113582761e-05, + "loss": 1.4744, + "step": 55910 + }, + { + "epoch": 2.036418446322421, + "grad_norm": 0.40886321663856506, + "learning_rate": 9.354522027859347e-05, + "loss": 1.4683, + "step": 55920 + }, + { + "epoch": 2.0369603874879334, + "grad_norm": 0.3399202525615692, + "learning_rate": 9.353704464654477e-05, + "loss": 1.4559, + "step": 55930 + }, + { + "epoch": 2.0375023286534457, + "grad_norm": 0.2759416401386261, + "learning_rate": 9.35288642406943e-05, + "loss": 1.4592, + "step": 55940 + }, + { + "epoch": 2.037556522769997, + "eval_loss": 2.454275608062744, + "eval_runtime": 21.9844, + "eval_samples_per_second": 227.434, + "eval_steps_per_second": 1.228, + "step": 55941 + }, + { + "epoch": 2.0380442698189576, + "grad_norm": 0.193961039185524, + "learning_rate": 9.352067906205538e-05, + "loss": 1.4633, + "step": 55950 + }, + { + "epoch": 2.03858621098447, + "grad_norm": 0.23220661282539368, + "learning_rate": 9.3512489111642e-05, + "loss": 1.4639, + "step": 55960 + }, + { + "epoch": 2.039128152149982, + "grad_norm": 0.24919536709785461, + "learning_rate": 9.350429439046867e-05, + "loss": 1.4727, + "step": 55970 + }, + { + "epoch": 2.0396700933154945, + "grad_norm": 0.3676759898662567, + "learning_rate": 9.349609489955053e-05, + "loss": 1.477, + "step": 55980 + }, + { + "epoch": 2.040212034481007, + "grad_norm": 0.5084336996078491, + "learning_rate": 9.348789063990328e-05, + "loss": 1.4609, + "step": 55990 + }, + { + "epoch": 2.0407539756465187, + "grad_norm": 0.33119305968284607, + "learning_rate": 9.347968161254321e-05, + "loss": 1.4632, + "step": 56000 + }, + { + "epoch": 2.041295916812031, + "grad_norm": 0.23832644522190094, + "learning_rate": 9.347146781848726e-05, + "loss": 1.4682, + "step": 56010 + }, + { + "epoch": 2.0418378579775434, + "grad_norm": 0.2528223395347595, + "learning_rate": 9.346324925875293e-05, + "loss": 1.4668, + "step": 56020 + }, + { + "epoch": 2.042271410909953, + "eval_loss": 2.461331844329834, + "eval_runtime": 21.9887, + "eval_samples_per_second": 227.39, + "eval_steps_per_second": 1.228, + "step": 56028 + }, + { + "epoch": 2.0423797991430557, + "grad_norm": 0.30627909302711487, + "learning_rate": 9.345502593435824e-05, + "loss": 1.4561, + "step": 56030 + }, + { + "epoch": 2.0429217403085675, + "grad_norm": 0.275127112865448, + "learning_rate": 9.34467978463219e-05, + "loss": 1.4664, + "step": 56040 + }, + { + "epoch": 2.04346368147408, + "grad_norm": 0.25963136553764343, + "learning_rate": 9.343856499566315e-05, + "loss": 1.471, + "step": 56050 + }, + { + "epoch": 2.044005622639592, + "grad_norm": 0.18673470616340637, + "learning_rate": 9.343032738340187e-05, + "loss": 1.4709, + "step": 56060 + }, + { + "epoch": 2.0445475638051045, + "grad_norm": 0.23496182262897491, + "learning_rate": 9.342208501055849e-05, + "loss": 1.4652, + "step": 56070 + }, + { + "epoch": 2.045089504970617, + "grad_norm": 0.23610053956508636, + "learning_rate": 9.341383787815402e-05, + "loss": 1.4712, + "step": 56080 + }, + { + "epoch": 2.0456314461361287, + "grad_norm": 0.2762158215045929, + "learning_rate": 9.340558598721008e-05, + "loss": 1.4671, + "step": 56090 + }, + { + "epoch": 2.046173387301641, + "grad_norm": 0.23105528950691223, + "learning_rate": 9.339732933874891e-05, + "loss": 1.4625, + "step": 56100 + }, + { + "epoch": 2.0467153284671533, + "grad_norm": 0.4698254466056824, + "learning_rate": 9.338906793379327e-05, + "loss": 1.4586, + "step": 56110 + }, + { + "epoch": 2.0469862990499093, + "eval_loss": 2.449103832244873, + "eval_runtime": 22.0267, + "eval_samples_per_second": 226.997, + "eval_steps_per_second": 1.226, + "step": 56115 + }, + { + "epoch": 2.0472572696326656, + "grad_norm": 0.311443954706192, + "learning_rate": 9.338080177336656e-05, + "loss": 1.4692, + "step": 56120 + }, + { + "epoch": 2.0477992107981775, + "grad_norm": 0.3137112557888031, + "learning_rate": 9.337253085849276e-05, + "loss": 1.4666, + "step": 56130 + }, + { + "epoch": 2.04834115196369, + "grad_norm": 0.27474987506866455, + "learning_rate": 9.336425519019644e-05, + "loss": 1.4654, + "step": 56140 + }, + { + "epoch": 2.048883093129202, + "grad_norm": 0.21653681993484497, + "learning_rate": 9.335597476950275e-05, + "loss": 1.4556, + "step": 56150 + }, + { + "epoch": 2.0494250342947145, + "grad_norm": 0.21602825820446014, + "learning_rate": 9.334768959743742e-05, + "loss": 1.4701, + "step": 56160 + }, + { + "epoch": 2.0499669754602268, + "grad_norm": 0.2874417006969452, + "learning_rate": 9.333939967502681e-05, + "loss": 1.4702, + "step": 56170 + }, + { + "epoch": 2.0505089166257386, + "grad_norm": 0.3574830889701843, + "learning_rate": 9.333110500329781e-05, + "loss": 1.4587, + "step": 56180 + }, + { + "epoch": 2.051050857791251, + "grad_norm": 0.33918556571006775, + "learning_rate": 9.332280558327795e-05, + "loss": 1.4667, + "step": 56190 + }, + { + "epoch": 2.0515927989567633, + "grad_norm": 0.25858092308044434, + "learning_rate": 9.33145014159953e-05, + "loss": 1.4654, + "step": 56200 + }, + { + "epoch": 2.051701187189866, + "eval_loss": 2.4660325050354004, + "eval_runtime": 21.9813, + "eval_samples_per_second": 227.466, + "eval_steps_per_second": 1.228, + "step": 56202 + }, + { + "epoch": 2.0521347401222756, + "grad_norm": 0.24299031496047974, + "learning_rate": 9.330619250247855e-05, + "loss": 1.4677, + "step": 56210 + }, + { + "epoch": 2.052676681287788, + "grad_norm": 0.2615341246128082, + "learning_rate": 9.3297878843757e-05, + "loss": 1.4633, + "step": 56220 + }, + { + "epoch": 2.0532186224533, + "grad_norm": 0.34123480319976807, + "learning_rate": 9.328956044086049e-05, + "loss": 1.4689, + "step": 56230 + }, + { + "epoch": 2.053760563618812, + "grad_norm": 0.3530578315258026, + "learning_rate": 9.328123729481947e-05, + "loss": 1.4615, + "step": 56240 + }, + { + "epoch": 2.0543025047843244, + "grad_norm": 0.18291862308979034, + "learning_rate": 9.327290940666497e-05, + "loss": 1.4621, + "step": 56250 + }, + { + "epoch": 2.0548444459498367, + "grad_norm": 0.19218549132347107, + "learning_rate": 9.326457677742861e-05, + "loss": 1.4736, + "step": 56260 + }, + { + "epoch": 2.0553863871153486, + "grad_norm": 0.26161691546440125, + "learning_rate": 9.325623940814263e-05, + "loss": 1.4674, + "step": 56270 + }, + { + "epoch": 2.055928328280861, + "grad_norm": 0.2795359194278717, + "learning_rate": 9.324789729983979e-05, + "loss": 1.4581, + "step": 56280 + }, + { + "epoch": 2.056416075329822, + "eval_loss": 2.465681314468384, + "eval_runtime": 21.9832, + "eval_samples_per_second": 227.447, + "eval_steps_per_second": 1.228, + "step": 56289 + }, + { + "epoch": 2.0564702694463732, + "grad_norm": 0.19691872596740723, + "learning_rate": 9.323955045355349e-05, + "loss": 1.4692, + "step": 56290 + }, + { + "epoch": 2.0570122106118855, + "grad_norm": 0.33033034205436707, + "learning_rate": 9.323119887031769e-05, + "loss": 1.4676, + "step": 56300 + }, + { + "epoch": 2.057554151777398, + "grad_norm": 0.2493034452199936, + "learning_rate": 9.322284255116696e-05, + "loss": 1.4552, + "step": 56310 + }, + { + "epoch": 2.0580960929429097, + "grad_norm": 0.3580038249492645, + "learning_rate": 9.321448149713645e-05, + "loss": 1.4593, + "step": 56320 + }, + { + "epoch": 2.058638034108422, + "grad_norm": 0.2858411967754364, + "learning_rate": 9.320611570926189e-05, + "loss": 1.4693, + "step": 56330 + }, + { + "epoch": 2.0591799752739344, + "grad_norm": 0.35059741139411926, + "learning_rate": 9.319774518857958e-05, + "loss": 1.468, + "step": 56340 + }, + { + "epoch": 2.0597219164394467, + "grad_norm": 0.30402854084968567, + "learning_rate": 9.318936993612643e-05, + "loss": 1.4626, + "step": 56350 + }, + { + "epoch": 2.0602638576049586, + "grad_norm": 0.33084359765052795, + "learning_rate": 9.318098995293993e-05, + "loss": 1.4748, + "step": 56360 + }, + { + "epoch": 2.060805798770471, + "grad_norm": 0.21346881985664368, + "learning_rate": 9.317260524005817e-05, + "loss": 1.4647, + "step": 56370 + }, + { + "epoch": 2.061130963469778, + "eval_loss": 2.4557716846466064, + "eval_runtime": 21.9864, + "eval_samples_per_second": 227.413, + "eval_steps_per_second": 1.228, + "step": 56376 + }, + { + "epoch": 2.061347739935983, + "grad_norm": 0.32445028424263, + "learning_rate": 9.31642157985198e-05, + "loss": 1.4523, + "step": 56380 + }, + { + "epoch": 2.0618896811014955, + "grad_norm": 0.2711530029773712, + "learning_rate": 9.315582162936407e-05, + "loss": 1.4613, + "step": 56390 + }, + { + "epoch": 2.062431622267008, + "grad_norm": 0.23968590795993805, + "learning_rate": 9.314742273363082e-05, + "loss": 1.4578, + "step": 56400 + }, + { + "epoch": 2.0629735634325197, + "grad_norm": 0.29744964838027954, + "learning_rate": 9.313901911236046e-05, + "loss": 1.4645, + "step": 56410 + }, + { + "epoch": 2.063515504598032, + "grad_norm": 0.23724764585494995, + "learning_rate": 9.313061076659398e-05, + "loss": 1.4636, + "step": 56420 + }, + { + "epoch": 2.0640574457635443, + "grad_norm": 0.18018679320812225, + "learning_rate": 9.312219769737299e-05, + "loss": 1.4586, + "step": 56430 + }, + { + "epoch": 2.0645993869290566, + "grad_norm": 0.25777846574783325, + "learning_rate": 9.311377990573967e-05, + "loss": 1.464, + "step": 56440 + }, + { + "epoch": 2.065141328094569, + "grad_norm": 0.2517983317375183, + "learning_rate": 9.310535739273675e-05, + "loss": 1.4611, + "step": 56450 + }, + { + "epoch": 2.065683269260081, + "grad_norm": 0.32330676913261414, + "learning_rate": 9.30969301594076e-05, + "loss": 1.4483, + "step": 56460 + }, + { + "epoch": 2.0658458516097347, + "eval_loss": 2.4468648433685303, + "eval_runtime": 23.3942, + "eval_samples_per_second": 213.728, + "eval_steps_per_second": 1.154, + "step": 56463 + }, + { + "epoch": 2.066225210425593, + "grad_norm": 0.3357566297054291, + "learning_rate": 9.308849820679614e-05, + "loss": 1.4712, + "step": 56470 + }, + { + "epoch": 2.0667671515911055, + "grad_norm": 0.18558350205421448, + "learning_rate": 9.308006153594692e-05, + "loss": 1.4578, + "step": 56480 + }, + { + "epoch": 2.067309092756618, + "grad_norm": 0.324517160654068, + "learning_rate": 9.307162014790496e-05, + "loss": 1.457, + "step": 56490 + }, + { + "epoch": 2.0678510339221297, + "grad_norm": 0.2188093215227127, + "learning_rate": 9.3063174043716e-05, + "loss": 1.4654, + "step": 56500 + }, + { + "epoch": 2.068392975087642, + "grad_norm": 0.19635216891765594, + "learning_rate": 9.30547232244263e-05, + "loss": 1.4495, + "step": 56510 + }, + { + "epoch": 2.0689349162531543, + "grad_norm": 0.18253515660762787, + "learning_rate": 9.304626769108271e-05, + "loss": 1.4619, + "step": 56520 + }, + { + "epoch": 2.0694768574186666, + "grad_norm": 0.19728650152683258, + "learning_rate": 9.303780744473265e-05, + "loss": 1.4629, + "step": 56530 + }, + { + "epoch": 2.0700187985841785, + "grad_norm": 0.27253544330596924, + "learning_rate": 9.302934248642414e-05, + "loss": 1.4604, + "step": 56540 + }, + { + "epoch": 2.070560739749691, + "grad_norm": 0.22469353675842285, + "learning_rate": 9.30208728172058e-05, + "loss": 1.4624, + "step": 56550 + }, + { + "epoch": 2.070560739749691, + "eval_loss": 2.449016809463501, + "eval_runtime": 21.9605, + "eval_samples_per_second": 227.682, + "eval_steps_per_second": 1.229, + "step": 56550 + }, + { + "epoch": 2.071102680915203, + "grad_norm": 0.4223212003707886, + "learning_rate": 9.301239843812681e-05, + "loss": 1.4569, + "step": 56560 + }, + { + "epoch": 2.0716446220807154, + "grad_norm": 0.1832091063261032, + "learning_rate": 9.30039193502369e-05, + "loss": 1.4669, + "step": 56570 + }, + { + "epoch": 2.0721865632462277, + "grad_norm": 0.19555889070034027, + "learning_rate": 9.29954355545865e-05, + "loss": 1.4459, + "step": 56580 + }, + { + "epoch": 2.0727285044117396, + "grad_norm": 0.25337639451026917, + "learning_rate": 9.29869470522265e-05, + "loss": 1.4591, + "step": 56590 + }, + { + "epoch": 2.073270445577252, + "grad_norm": 0.2375129759311676, + "learning_rate": 9.29784538442084e-05, + "loss": 1.4576, + "step": 56600 + }, + { + "epoch": 2.0738123867427642, + "grad_norm": 0.3068578243255615, + "learning_rate": 9.296995593158433e-05, + "loss": 1.4535, + "step": 56610 + }, + { + "epoch": 2.0743543279082766, + "grad_norm": 0.37019553780555725, + "learning_rate": 9.296145331540696e-05, + "loss": 1.4583, + "step": 56620 + }, + { + "epoch": 2.074896269073789, + "grad_norm": 0.26135745644569397, + "learning_rate": 9.295294599672959e-05, + "loss": 1.463, + "step": 56630 + }, + { + "epoch": 2.0752756278896474, + "eval_loss": 2.45114803314209, + "eval_runtime": 21.9951, + "eval_samples_per_second": 227.323, + "eval_steps_per_second": 1.228, + "step": 56637 + }, + { + "epoch": 2.0754382102393008, + "grad_norm": 0.23497585952281952, + "learning_rate": 9.294443397660603e-05, + "loss": 1.4568, + "step": 56640 + }, + { + "epoch": 2.075980151404813, + "grad_norm": 0.20370684564113617, + "learning_rate": 9.293591725609072e-05, + "loss": 1.4567, + "step": 56650 + }, + { + "epoch": 2.0765220925703254, + "grad_norm": 0.2905084490776062, + "learning_rate": 9.292739583623869e-05, + "loss": 1.4572, + "step": 56660 + }, + { + "epoch": 2.0770640337358377, + "grad_norm": 0.17985448241233826, + "learning_rate": 9.291886971810554e-05, + "loss": 1.465, + "step": 56670 + }, + { + "epoch": 2.0776059749013496, + "grad_norm": 0.2356506884098053, + "learning_rate": 9.291033890274743e-05, + "loss": 1.4714, + "step": 56680 + }, + { + "epoch": 2.078147916066862, + "grad_norm": 0.22777162492275238, + "learning_rate": 9.290180339122111e-05, + "loss": 1.458, + "step": 56690 + }, + { + "epoch": 2.078689857232374, + "grad_norm": 0.17766469717025757, + "learning_rate": 9.289326318458396e-05, + "loss": 1.466, + "step": 56700 + }, + { + "epoch": 2.0792317983978865, + "grad_norm": 0.18408410251140594, + "learning_rate": 9.288471828389387e-05, + "loss": 1.4562, + "step": 56710 + }, + { + "epoch": 2.079773739563399, + "grad_norm": 0.2577512264251709, + "learning_rate": 9.287616869020935e-05, + "loss": 1.4582, + "step": 56720 + }, + { + "epoch": 2.0799905160296035, + "eval_loss": 2.459651231765747, + "eval_runtime": 21.9706, + "eval_samples_per_second": 227.577, + "eval_steps_per_second": 1.229, + "step": 56724 + }, + { + "epoch": 2.0803156807289107, + "grad_norm": 0.19766086339950562, + "learning_rate": 9.286761440458952e-05, + "loss": 1.4558, + "step": 56730 + }, + { + "epoch": 2.080857621894423, + "grad_norm": 0.33140110969543457, + "learning_rate": 9.285905542809401e-05, + "loss": 1.4656, + "step": 56740 + }, + { + "epoch": 2.0813995630599353, + "grad_norm": 0.26236221194267273, + "learning_rate": 9.285049176178308e-05, + "loss": 1.4667, + "step": 56750 + }, + { + "epoch": 2.0819415042254477, + "grad_norm": 0.39471328258514404, + "learning_rate": 9.284192340671756e-05, + "loss": 1.4631, + "step": 56760 + }, + { + "epoch": 2.0824834453909595, + "grad_norm": 0.19031473994255066, + "learning_rate": 9.283335036395886e-05, + "loss": 1.4657, + "step": 56770 + }, + { + "epoch": 2.083025386556472, + "grad_norm": 0.27123594284057617, + "learning_rate": 9.282477263456899e-05, + "loss": 1.4586, + "step": 56780 + }, + { + "epoch": 2.083567327721984, + "grad_norm": 0.4629122018814087, + "learning_rate": 9.281619021961049e-05, + "loss": 1.4637, + "step": 56790 + }, + { + "epoch": 2.0841092688874965, + "grad_norm": 0.2233474850654602, + "learning_rate": 9.280760312014651e-05, + "loss": 1.4612, + "step": 56800 + }, + { + "epoch": 2.084651210053009, + "grad_norm": 0.25030142068862915, + "learning_rate": 9.279901133724082e-05, + "loss": 1.446, + "step": 56810 + }, + { + "epoch": 2.0847054041695596, + "eval_loss": 2.456845998764038, + "eval_runtime": 21.9864, + "eval_samples_per_second": 227.413, + "eval_steps_per_second": 1.228, + "step": 56811 + }, + { + "epoch": 2.0851931512185207, + "grad_norm": 0.2584102153778076, + "learning_rate": 9.279041487195772e-05, + "loss": 1.468, + "step": 56820 + }, + { + "epoch": 2.085735092384033, + "grad_norm": 0.2037636637687683, + "learning_rate": 9.27818137253621e-05, + "loss": 1.457, + "step": 56830 + }, + { + "epoch": 2.0862770335495453, + "grad_norm": 0.2464050054550171, + "learning_rate": 9.27732078985194e-05, + "loss": 1.4455, + "step": 56840 + }, + { + "epoch": 2.0868189747150576, + "grad_norm": 0.24278301000595093, + "learning_rate": 9.27645973924957e-05, + "loss": 1.4556, + "step": 56850 + }, + { + "epoch": 2.08736091588057, + "grad_norm": 0.2788692116737366, + "learning_rate": 9.275598220835765e-05, + "loss": 1.4473, + "step": 56860 + }, + { + "epoch": 2.087902857046082, + "grad_norm": 0.21799428761005402, + "learning_rate": 9.274736234717243e-05, + "loss": 1.4548, + "step": 56870 + }, + { + "epoch": 2.088444798211594, + "grad_norm": 0.2989203929901123, + "learning_rate": 9.273873781000786e-05, + "loss": 1.4623, + "step": 56880 + }, + { + "epoch": 2.0889867393771064, + "grad_norm": 0.21010935306549072, + "learning_rate": 9.273010859793231e-05, + "loss": 1.456, + "step": 56890 + }, + { + "epoch": 2.089420292309516, + "eval_loss": 2.455754041671753, + "eval_runtime": 21.9865, + "eval_samples_per_second": 227.412, + "eval_steps_per_second": 1.228, + "step": 56898 + }, + { + "epoch": 2.0895286805426188, + "grad_norm": 0.18887412548065186, + "learning_rate": 9.272147471201467e-05, + "loss": 1.456, + "step": 56900 + }, + { + "epoch": 2.0900706217081306, + "grad_norm": 0.26745525002479553, + "learning_rate": 9.271283615332455e-05, + "loss": 1.4589, + "step": 56910 + }, + { + "epoch": 2.090612562873643, + "grad_norm": 0.18780305981636047, + "learning_rate": 9.270419292293201e-05, + "loss": 1.4585, + "step": 56920 + }, + { + "epoch": 2.0911545040391553, + "grad_norm": 0.21684403717517853, + "learning_rate": 9.269554502190775e-05, + "loss": 1.4464, + "step": 56930 + }, + { + "epoch": 2.0916964452046676, + "grad_norm": 0.4031422436237335, + "learning_rate": 9.268689245132302e-05, + "loss": 1.4465, + "step": 56940 + }, + { + "epoch": 2.09223838637018, + "grad_norm": 0.31991732120513916, + "learning_rate": 9.267823521224967e-05, + "loss": 1.4656, + "step": 56950 + }, + { + "epoch": 2.0927803275356918, + "grad_norm": 0.29410219192504883, + "learning_rate": 9.266957330576015e-05, + "loss": 1.452, + "step": 56960 + }, + { + "epoch": 2.093322268701204, + "grad_norm": 0.3743274211883545, + "learning_rate": 9.26609067329274e-05, + "loss": 1.4552, + "step": 56970 + }, + { + "epoch": 2.0938642098667164, + "grad_norm": 0.5883728265762329, + "learning_rate": 9.265223549482505e-05, + "loss": 1.4578, + "step": 56980 + }, + { + "epoch": 2.0941351804494723, + "eval_loss": 2.460325002670288, + "eval_runtime": 22.2713, + "eval_samples_per_second": 224.505, + "eval_steps_per_second": 1.212, + "step": 56985 + }, + { + "epoch": 2.0944061510322287, + "grad_norm": 0.3245681822299957, + "learning_rate": 9.264355959252722e-05, + "loss": 1.4606, + "step": 56990 + }, + { + "epoch": 2.0949480921977406, + "grad_norm": 0.255845844745636, + "learning_rate": 9.263487902710869e-05, + "loss": 1.4688, + "step": 57000 + }, + { + "epoch": 2.095490033363253, + "grad_norm": 0.17143049836158752, + "learning_rate": 9.262619379964474e-05, + "loss": 1.4586, + "step": 57010 + }, + { + "epoch": 2.096031974528765, + "grad_norm": 0.2582264244556427, + "learning_rate": 9.261750391121122e-05, + "loss": 1.4485, + "step": 57020 + }, + { + "epoch": 2.0965739156942775, + "grad_norm": 0.3058115541934967, + "learning_rate": 9.260880936288466e-05, + "loss": 1.4608, + "step": 57030 + }, + { + "epoch": 2.09711585685979, + "grad_norm": 0.3971554934978485, + "learning_rate": 9.260011015574207e-05, + "loss": 1.4587, + "step": 57040 + }, + { + "epoch": 2.0976577980253017, + "grad_norm": 0.21815316379070282, + "learning_rate": 9.25914062908611e-05, + "loss": 1.4634, + "step": 57050 + }, + { + "epoch": 2.098199739190814, + "grad_norm": 0.18554462492465973, + "learning_rate": 9.258269776931989e-05, + "loss": 1.4547, + "step": 57060 + }, + { + "epoch": 2.0987416803563264, + "grad_norm": 0.2115841954946518, + "learning_rate": 9.257398459219727e-05, + "loss": 1.4534, + "step": 57070 + }, + { + "epoch": 2.098850068589429, + "eval_loss": 2.4590704441070557, + "eval_runtime": 21.9818, + "eval_samples_per_second": 227.461, + "eval_steps_per_second": 1.228, + "step": 57072 + }, + { + "epoch": 2.0992836215218387, + "grad_norm": 0.3519175350666046, + "learning_rate": 9.256526676057257e-05, + "loss": 1.4481, + "step": 57080 + }, + { + "epoch": 2.0998255626873505, + "grad_norm": 0.3369831144809723, + "learning_rate": 9.25565442755257e-05, + "loss": 1.4422, + "step": 57090 + }, + { + "epoch": 2.100367503852863, + "grad_norm": 0.3334982395172119, + "learning_rate": 9.254781713813719e-05, + "loss": 1.444, + "step": 57100 + }, + { + "epoch": 2.100909445018375, + "grad_norm": 0.3039768636226654, + "learning_rate": 9.253908534948811e-05, + "loss": 1.4585, + "step": 57110 + }, + { + "epoch": 2.1014513861838875, + "grad_norm": 0.3292928636074066, + "learning_rate": 9.253034891066011e-05, + "loss": 1.4428, + "step": 57120 + }, + { + "epoch": 2.1019933273494, + "grad_norm": 0.2547774910926819, + "learning_rate": 9.252160782273544e-05, + "loss": 1.4511, + "step": 57130 + }, + { + "epoch": 2.1025352685149117, + "grad_norm": 0.19933480024337769, + "learning_rate": 9.251286208679688e-05, + "loss": 1.456, + "step": 57140 + }, + { + "epoch": 2.103077209680424, + "grad_norm": 0.20023803412914276, + "learning_rate": 9.250411170392783e-05, + "loss": 1.4615, + "step": 57150 + }, + { + "epoch": 2.103564956729385, + "eval_loss": 2.44368839263916, + "eval_runtime": 21.9837, + "eval_samples_per_second": 227.441, + "eval_steps_per_second": 1.228, + "step": 57159 + }, + { + "epoch": 2.1036191508459363, + "grad_norm": 0.3000018894672394, + "learning_rate": 9.249535667521225e-05, + "loss": 1.4423, + "step": 57160 + }, + { + "epoch": 2.1041610920114486, + "grad_norm": 0.18277384340763092, + "learning_rate": 9.248659700173469e-05, + "loss": 1.4596, + "step": 57170 + }, + { + "epoch": 2.1047030331769605, + "grad_norm": 0.33772820234298706, + "learning_rate": 9.247783268458023e-05, + "loss": 1.4496, + "step": 57180 + }, + { + "epoch": 2.105244974342473, + "grad_norm": 0.2701638340950012, + "learning_rate": 9.246906372483456e-05, + "loss": 1.4604, + "step": 57190 + }, + { + "epoch": 2.105786915507985, + "grad_norm": 0.33614516258239746, + "learning_rate": 9.246029012358398e-05, + "loss": 1.4474, + "step": 57200 + }, + { + "epoch": 2.1063288566734975, + "grad_norm": 0.18986083567142487, + "learning_rate": 9.245151188191526e-05, + "loss": 1.448, + "step": 57210 + }, + { + "epoch": 2.1068707978390098, + "grad_norm": 0.19249024987220764, + "learning_rate": 9.244272900091586e-05, + "loss": 1.4549, + "step": 57220 + }, + { + "epoch": 2.1074127390045216, + "grad_norm": 0.21236182749271393, + "learning_rate": 9.243394148167376e-05, + "loss": 1.4536, + "step": 57230 + }, + { + "epoch": 2.107954680170034, + "grad_norm": 0.37698227167129517, + "learning_rate": 9.242514932527751e-05, + "loss": 1.4607, + "step": 57240 + }, + { + "epoch": 2.108279844869341, + "eval_loss": 2.428705930709839, + "eval_runtime": 21.9743, + "eval_samples_per_second": 227.539, + "eval_steps_per_second": 1.229, + "step": 57246 + }, + { + "epoch": 2.1084966213355463, + "grad_norm": 0.26461127400398254, + "learning_rate": 9.241635253281624e-05, + "loss": 1.4558, + "step": 57250 + }, + { + "epoch": 2.1090385625010586, + "grad_norm": 0.20196294784545898, + "learning_rate": 9.240755110537966e-05, + "loss": 1.4561, + "step": 57260 + }, + { + "epoch": 2.109580503666571, + "grad_norm": 0.40879595279693604, + "learning_rate": 9.239874504405806e-05, + "loss": 1.4488, + "step": 57270 + }, + { + "epoch": 2.1101224448320828, + "grad_norm": 0.2242775410413742, + "learning_rate": 9.238993434994229e-05, + "loss": 1.4513, + "step": 57280 + }, + { + "epoch": 2.110664385997595, + "grad_norm": 0.25892990827560425, + "learning_rate": 9.238111902412379e-05, + "loss": 1.4625, + "step": 57290 + }, + { + "epoch": 2.1112063271631074, + "grad_norm": 0.1938840001821518, + "learning_rate": 9.237229906769456e-05, + "loss": 1.4614, + "step": 57300 + }, + { + "epoch": 2.1117482683286197, + "grad_norm": 0.2417358160018921, + "learning_rate": 9.236347448174717e-05, + "loss": 1.4616, + "step": 57310 + }, + { + "epoch": 2.1122902094941316, + "grad_norm": 0.26493963599205017, + "learning_rate": 9.235464526737477e-05, + "loss": 1.4459, + "step": 57320 + }, + { + "epoch": 2.112832150659644, + "grad_norm": 0.19729004800319672, + "learning_rate": 9.234581142567112e-05, + "loss": 1.4565, + "step": 57330 + }, + { + "epoch": 2.1129947330092977, + "eval_loss": 2.458986520767212, + "eval_runtime": 21.9806, + "eval_samples_per_second": 227.473, + "eval_steps_per_second": 1.228, + "step": 57333 + }, + { + "epoch": 2.1133740918251562, + "grad_norm": 0.2965938448905945, + "learning_rate": 9.233697295773046e-05, + "loss": 1.4564, + "step": 57340 + }, + { + "epoch": 2.1139160329906685, + "grad_norm": 0.281646728515625, + "learning_rate": 9.23281298646477e-05, + "loss": 1.4464, + "step": 57350 + }, + { + "epoch": 2.114457974156181, + "grad_norm": 0.2671820819377899, + "learning_rate": 9.231928214751828e-05, + "loss": 1.4495, + "step": 57360 + }, + { + "epoch": 2.1149999153216927, + "grad_norm": 0.20213744044303894, + "learning_rate": 9.23104298074382e-05, + "loss": 1.4672, + "step": 57370 + }, + { + "epoch": 2.115541856487205, + "grad_norm": 0.3106236755847931, + "learning_rate": 9.230157284550407e-05, + "loss": 1.4501, + "step": 57380 + }, + { + "epoch": 2.1160837976527174, + "grad_norm": 0.26210352778434753, + "learning_rate": 9.229271126281304e-05, + "loss": 1.454, + "step": 57390 + }, + { + "epoch": 2.1166257388182297, + "grad_norm": 0.2264351099729538, + "learning_rate": 9.228384506046286e-05, + "loss": 1.4441, + "step": 57400 + }, + { + "epoch": 2.1171676799837416, + "grad_norm": 0.19489522278308868, + "learning_rate": 9.22749742395518e-05, + "loss": 1.4577, + "step": 57410 + }, + { + "epoch": 2.117709621149254, + "grad_norm": 0.20375080406665802, + "learning_rate": 9.226609880117877e-05, + "loss": 1.4462, + "step": 57420 + }, + { + "epoch": 2.117709621149254, + "eval_loss": 2.4545302391052246, + "eval_runtime": 21.9693, + "eval_samples_per_second": 227.591, + "eval_steps_per_second": 1.229, + "step": 57420 + }, + { + "epoch": 2.118251562314766, + "grad_norm": 0.29413971304893494, + "learning_rate": 9.225721874644324e-05, + "loss": 1.465, + "step": 57430 + }, + { + "epoch": 2.1187935034802785, + "grad_norm": 0.2669055461883545, + "learning_rate": 9.224833407644519e-05, + "loss": 1.454, + "step": 57440 + }, + { + "epoch": 2.119335444645791, + "grad_norm": 0.2955906391143799, + "learning_rate": 9.223944479228522e-05, + "loss": 1.4574, + "step": 57450 + }, + { + "epoch": 2.1198773858113027, + "grad_norm": 0.3815632462501526, + "learning_rate": 9.223055089506451e-05, + "loss": 1.4574, + "step": 57460 + }, + { + "epoch": 2.120419326976815, + "grad_norm": 0.3034198582172394, + "learning_rate": 9.22216523858848e-05, + "loss": 1.4515, + "step": 57470 + }, + { + "epoch": 2.1209612681423273, + "grad_norm": 0.22253744304180145, + "learning_rate": 9.22127492658484e-05, + "loss": 1.4485, + "step": 57480 + }, + { + "epoch": 2.1215032093078396, + "grad_norm": 0.39392632246017456, + "learning_rate": 9.220384153605816e-05, + "loss": 1.4488, + "step": 57490 + }, + { + "epoch": 2.122045150473352, + "grad_norm": 0.19380737841129303, + "learning_rate": 9.219492919761758e-05, + "loss": 1.448, + "step": 57500 + }, + { + "epoch": 2.1224245092892104, + "eval_loss": 2.4559643268585205, + "eval_runtime": 21.9784, + "eval_samples_per_second": 227.496, + "eval_steps_per_second": 1.228, + "step": 57507 + }, + { + "epoch": 2.122587091638864, + "grad_norm": 0.1623711884021759, + "learning_rate": 9.218601225163063e-05, + "loss": 1.4629, + "step": 57510 + }, + { + "epoch": 2.123129032804376, + "grad_norm": 0.27015969157218933, + "learning_rate": 9.217709069920196e-05, + "loss": 1.4466, + "step": 57520 + }, + { + "epoch": 2.1236709739698885, + "grad_norm": 0.20501737296581268, + "learning_rate": 9.216816454143667e-05, + "loss": 1.4536, + "step": 57530 + }, + { + "epoch": 2.124212915135401, + "grad_norm": 0.22715558111667633, + "learning_rate": 9.215923377944055e-05, + "loss": 1.4488, + "step": 57540 + }, + { + "epoch": 2.1247548563009127, + "grad_norm": 0.27195459604263306, + "learning_rate": 9.215029841431984e-05, + "loss": 1.4442, + "step": 57550 + }, + { + "epoch": 2.125296797466425, + "grad_norm": 0.2515755295753479, + "learning_rate": 9.21413584471815e-05, + "loss": 1.4584, + "step": 57560 + }, + { + "epoch": 2.1258387386319373, + "grad_norm": 0.2585810720920563, + "learning_rate": 9.213241387913289e-05, + "loss": 1.4515, + "step": 57570 + }, + { + "epoch": 2.1263806797974496, + "grad_norm": 0.3669833540916443, + "learning_rate": 9.212346471128207e-05, + "loss": 1.4564, + "step": 57580 + }, + { + "epoch": 2.1269226209629615, + "grad_norm": 0.2513576149940491, + "learning_rate": 9.211451094473764e-05, + "loss": 1.4557, + "step": 57590 + }, + { + "epoch": 2.1271393974291666, + "eval_loss": 2.456778049468994, + "eval_runtime": 22.6763, + "eval_samples_per_second": 220.495, + "eval_steps_per_second": 1.191, + "step": 57594 + }, + { + "epoch": 2.127464562128474, + "grad_norm": 0.2454679310321808, + "learning_rate": 9.210555258060871e-05, + "loss": 1.4549, + "step": 57600 + }, + { + "epoch": 2.128006503293986, + "grad_norm": 0.3271390199661255, + "learning_rate": 9.209658962000502e-05, + "loss": 1.4548, + "step": 57610 + }, + { + "epoch": 2.1285484444594984, + "grad_norm": 0.2133026272058487, + "learning_rate": 9.208762206403687e-05, + "loss": 1.4508, + "step": 57620 + }, + { + "epoch": 2.1290903856250107, + "grad_norm": 0.20085935294628143, + "learning_rate": 9.20786499138151e-05, + "loss": 1.4557, + "step": 57630 + }, + { + "epoch": 2.1296323267905226, + "grad_norm": 0.3363330662250519, + "learning_rate": 9.206967317045119e-05, + "loss": 1.4512, + "step": 57640 + }, + { + "epoch": 2.130174267956035, + "grad_norm": 0.2005142867565155, + "learning_rate": 9.20606918350571e-05, + "loss": 1.4578, + "step": 57650 + }, + { + "epoch": 2.1307162091215472, + "grad_norm": 0.32124650478363037, + "learning_rate": 9.205170590874539e-05, + "loss": 1.4407, + "step": 57660 + }, + { + "epoch": 2.1312581502870596, + "grad_norm": 0.406781405210495, + "learning_rate": 9.204271539262922e-05, + "loss": 1.4442, + "step": 57670 + }, + { + "epoch": 2.131800091452572, + "grad_norm": 0.19096383452415466, + "learning_rate": 9.20337202878223e-05, + "loss": 1.4632, + "step": 57680 + }, + { + "epoch": 2.1318542855691227, + "eval_loss": 2.4498584270477295, + "eval_runtime": 21.9755, + "eval_samples_per_second": 227.526, + "eval_steps_per_second": 1.229, + "step": 57681 + }, + { + "epoch": 2.1323420326180837, + "grad_norm": 0.21396368741989136, + "learning_rate": 9.202472059543888e-05, + "loss": 1.4473, + "step": 57690 + }, + { + "epoch": 2.132883973783596, + "grad_norm": 0.33212918043136597, + "learning_rate": 9.201571631659384e-05, + "loss": 1.4547, + "step": 57700 + }, + { + "epoch": 2.1334259149491084, + "grad_norm": 0.3928041458129883, + "learning_rate": 9.200670745240255e-05, + "loss": 1.4552, + "step": 57710 + }, + { + "epoch": 2.1339678561146207, + "grad_norm": 0.2717365622520447, + "learning_rate": 9.199769400398101e-05, + "loss": 1.4431, + "step": 57720 + }, + { + "epoch": 2.1345097972801326, + "grad_norm": 0.19520139694213867, + "learning_rate": 9.198867597244577e-05, + "loss": 1.4426, + "step": 57730 + }, + { + "epoch": 2.135051738445645, + "grad_norm": 0.4766680896282196, + "learning_rate": 9.197965335891394e-05, + "loss": 1.4433, + "step": 57740 + }, + { + "epoch": 2.135593679611157, + "grad_norm": 0.5358844995498657, + "learning_rate": 9.19706261645032e-05, + "loss": 1.4486, + "step": 57750 + }, + { + "epoch": 2.1361356207766695, + "grad_norm": 0.40660426020622253, + "learning_rate": 9.196159439033179e-05, + "loss": 1.4481, + "step": 57760 + }, + { + "epoch": 2.1365691737090793, + "eval_loss": 2.4472365379333496, + "eval_runtime": 21.9775, + "eval_samples_per_second": 227.505, + "eval_steps_per_second": 1.229, + "step": 57768 + }, + { + "epoch": 2.136677561942182, + "grad_norm": 0.21572063863277435, + "learning_rate": 9.195255803751855e-05, + "loss": 1.4457, + "step": 57770 + }, + { + "epoch": 2.1372195031076937, + "grad_norm": 0.29680702090263367, + "learning_rate": 9.194351710718285e-05, + "loss": 1.4545, + "step": 57780 + }, + { + "epoch": 2.137761444273206, + "grad_norm": 0.27061325311660767, + "learning_rate": 9.193447160044465e-05, + "loss": 1.4451, + "step": 57790 + }, + { + "epoch": 2.1383033854387183, + "grad_norm": 0.2027037888765335, + "learning_rate": 9.192542151842447e-05, + "loss": 1.4497, + "step": 57800 + }, + { + "epoch": 2.1388453266042307, + "grad_norm": 0.20895957946777344, + "learning_rate": 9.191636686224336e-05, + "loss": 1.4519, + "step": 57810 + }, + { + "epoch": 2.1393872677697425, + "grad_norm": 0.18710966408252716, + "learning_rate": 9.1907307633023e-05, + "loss": 1.4492, + "step": 57820 + }, + { + "epoch": 2.139929208935255, + "grad_norm": 0.270098477602005, + "learning_rate": 9.189824383188562e-05, + "loss": 1.4581, + "step": 57830 + }, + { + "epoch": 2.140471150100767, + "grad_norm": 0.21558840572834015, + "learning_rate": 9.1889175459954e-05, + "loss": 1.4398, + "step": 57840 + }, + { + "epoch": 2.1410130912662795, + "grad_norm": 0.2560511827468872, + "learning_rate": 9.188010251835147e-05, + "loss": 1.4501, + "step": 57850 + }, + { + "epoch": 2.1412840618490354, + "eval_loss": 2.448354482650757, + "eval_runtime": 21.9774, + "eval_samples_per_second": 227.506, + "eval_steps_per_second": 1.229, + "step": 57855 + }, + { + "epoch": 2.141555032431792, + "grad_norm": 0.3320198357105255, + "learning_rate": 9.187102500820195e-05, + "loss": 1.4466, + "step": 57860 + }, + { + "epoch": 2.1420969735973037, + "grad_norm": 0.37881776690483093, + "learning_rate": 9.186194293062993e-05, + "loss": 1.446, + "step": 57870 + }, + { + "epoch": 2.142638914762816, + "grad_norm": 0.38807186484336853, + "learning_rate": 9.185285628676045e-05, + "loss": 1.4483, + "step": 57880 + }, + { + "epoch": 2.1431808559283283, + "grad_norm": 0.27356836199760437, + "learning_rate": 9.184376507771916e-05, + "loss": 1.4483, + "step": 57890 + }, + { + "epoch": 2.1437227970938406, + "grad_norm": 0.31562650203704834, + "learning_rate": 9.18346693046322e-05, + "loss": 1.4557, + "step": 57900 + }, + { + "epoch": 2.144264738259353, + "grad_norm": 0.3229880630970001, + "learning_rate": 9.182556896862632e-05, + "loss": 1.4511, + "step": 57910 + }, + { + "epoch": 2.144806679424865, + "grad_norm": 0.36681440472602844, + "learning_rate": 9.181646407082885e-05, + "loss": 1.4448, + "step": 57920 + }, + { + "epoch": 2.145348620590377, + "grad_norm": 0.4348771274089813, + "learning_rate": 9.180735461236764e-05, + "loss": 1.4456, + "step": 57930 + }, + { + "epoch": 2.1458905617558894, + "grad_norm": 0.35928380489349365, + "learning_rate": 9.179824059437113e-05, + "loss": 1.448, + "step": 57940 + }, + { + "epoch": 2.145998949988992, + "eval_loss": 2.451070547103882, + "eval_runtime": 21.9847, + "eval_samples_per_second": 227.431, + "eval_steps_per_second": 1.228, + "step": 57942 + }, + { + "epoch": 2.1464325029214018, + "grad_norm": 0.3611794710159302, + "learning_rate": 9.178912201796836e-05, + "loss": 1.4492, + "step": 57950 + }, + { + "epoch": 2.1469744440869136, + "grad_norm": 0.21141478419303894, + "learning_rate": 9.177999888428886e-05, + "loss": 1.4474, + "step": 57960 + }, + { + "epoch": 2.147516385252426, + "grad_norm": 0.2665724456310272, + "learning_rate": 9.17708711944628e-05, + "loss": 1.4562, + "step": 57970 + }, + { + "epoch": 2.1480583264179383, + "grad_norm": 0.3022826015949249, + "learning_rate": 9.176173894962086e-05, + "loss": 1.4337, + "step": 57980 + }, + { + "epoch": 2.1486002675834506, + "grad_norm": 0.27852869033813477, + "learning_rate": 9.175260215089429e-05, + "loss": 1.4481, + "step": 57990 + }, + { + "epoch": 2.149142208748963, + "grad_norm": 0.22829607129096985, + "learning_rate": 9.174346079941492e-05, + "loss": 1.4468, + "step": 58000 + }, + { + "epoch": 2.1496841499144748, + "grad_norm": 0.24403288960456848, + "learning_rate": 9.173431489631517e-05, + "loss": 1.4571, + "step": 58010 + }, + { + "epoch": 2.150226091079987, + "grad_norm": 0.23664097487926483, + "learning_rate": 9.172516444272796e-05, + "loss": 1.4461, + "step": 58020 + }, + { + "epoch": 2.150713838128948, + "eval_loss": 2.4530248641967773, + "eval_runtime": 21.9794, + "eval_samples_per_second": 227.486, + "eval_steps_per_second": 1.228, + "step": 58029 + }, + { + "epoch": 2.1507680322454994, + "grad_norm": 0.19086076319217682, + "learning_rate": 9.171600943978683e-05, + "loss": 1.4514, + "step": 58030 + }, + { + "epoch": 2.1513099734110117, + "grad_norm": 0.22935578227043152, + "learning_rate": 9.170684988862586e-05, + "loss": 1.456, + "step": 58040 + }, + { + "epoch": 2.1518519145765236, + "grad_norm": 0.21930955350399017, + "learning_rate": 9.16976857903797e-05, + "loss": 1.4513, + "step": 58050 + }, + { + "epoch": 2.152393855742036, + "grad_norm": 0.2644178867340088, + "learning_rate": 9.168851714618352e-05, + "loss": 1.4377, + "step": 58060 + }, + { + "epoch": 2.152935796907548, + "grad_norm": 0.3892003297805786, + "learning_rate": 9.167934395717314e-05, + "loss": 1.4409, + "step": 58070 + }, + { + "epoch": 2.1534777380730605, + "grad_norm": 0.40704429149627686, + "learning_rate": 9.167016622448488e-05, + "loss": 1.4479, + "step": 58080 + }, + { + "epoch": 2.154019679238573, + "grad_norm": 0.33170846104621887, + "learning_rate": 9.166098394925562e-05, + "loss": 1.4459, + "step": 58090 + }, + { + "epoch": 2.1545616204040847, + "grad_norm": 0.18887291848659515, + "learning_rate": 9.165179713262286e-05, + "loss": 1.4583, + "step": 58100 + }, + { + "epoch": 2.155103561569597, + "grad_norm": 0.2687866687774658, + "learning_rate": 9.164260577572456e-05, + "loss": 1.4504, + "step": 58110 + }, + { + "epoch": 2.1554287262689042, + "eval_loss": 2.450887680053711, + "eval_runtime": 21.9808, + "eval_samples_per_second": 227.471, + "eval_steps_per_second": 1.228, + "step": 58116 + }, + { + "epoch": 2.1556455027351094, + "grad_norm": 0.21473592519760132, + "learning_rate": 9.163340987969938e-05, + "loss": 1.4491, + "step": 58120 + }, + { + "epoch": 2.1561874439006217, + "grad_norm": 0.1952633261680603, + "learning_rate": 9.162420944568641e-05, + "loss": 1.4505, + "step": 58130 + }, + { + "epoch": 2.156729385066134, + "grad_norm": 0.21670879423618317, + "learning_rate": 9.161500447482539e-05, + "loss": 1.4447, + "step": 58140 + }, + { + "epoch": 2.157271326231646, + "grad_norm": 0.27638787031173706, + "learning_rate": 9.160579496825656e-05, + "loss": 1.4444, + "step": 58150 + }, + { + "epoch": 2.157813267397158, + "grad_norm": 0.2775803804397583, + "learning_rate": 9.15965809271208e-05, + "loss": 1.4442, + "step": 58160 + }, + { + "epoch": 2.1583552085626705, + "grad_norm": 0.2165958732366562, + "learning_rate": 9.158736235255949e-05, + "loss": 1.4533, + "step": 58170 + }, + { + "epoch": 2.158897149728183, + "grad_norm": 0.3519152104854584, + "learning_rate": 9.157813924571455e-05, + "loss": 1.4455, + "step": 58180 + }, + { + "epoch": 2.1594390908936947, + "grad_norm": 0.23166511952877045, + "learning_rate": 9.156891160772854e-05, + "loss": 1.4437, + "step": 58190 + }, + { + "epoch": 2.159981032059207, + "grad_norm": 0.32799333333969116, + "learning_rate": 9.155967943974453e-05, + "loss": 1.4477, + "step": 58200 + }, + { + "epoch": 2.160143614408861, + "eval_loss": 2.4706532955169678, + "eval_runtime": 21.9762, + "eval_samples_per_second": 227.518, + "eval_steps_per_second": 1.229, + "step": 58203 + }, + { + "epoch": 2.1605229732247193, + "grad_norm": 0.26824527978897095, + "learning_rate": 9.155044274290614e-05, + "loss": 1.4423, + "step": 58210 + }, + { + "epoch": 2.1610649143902316, + "grad_norm": 0.3500238358974457, + "learning_rate": 9.15412015183576e-05, + "loss": 1.4509, + "step": 58220 + }, + { + "epoch": 2.1616068555557435, + "grad_norm": 0.38133561611175537, + "learning_rate": 9.153195576724367e-05, + "loss": 1.4519, + "step": 58230 + }, + { + "epoch": 2.162148796721256, + "grad_norm": 0.27178552746772766, + "learning_rate": 9.152270549070964e-05, + "loss": 1.4541, + "step": 58240 + }, + { + "epoch": 2.162690737886768, + "grad_norm": 0.24917040765285492, + "learning_rate": 9.151345068990146e-05, + "loss": 1.449, + "step": 58250 + }, + { + "epoch": 2.1632326790522804, + "grad_norm": 0.39437514543533325, + "learning_rate": 9.150419136596551e-05, + "loss": 1.4438, + "step": 58260 + }, + { + "epoch": 2.1637746202177928, + "grad_norm": 0.346034437417984, + "learning_rate": 9.149492752004882e-05, + "loss": 1.4544, + "step": 58270 + }, + { + "epoch": 2.1643165613833046, + "grad_norm": 0.3629694879055023, + "learning_rate": 9.148565915329896e-05, + "loss": 1.4454, + "step": 58280 + }, + { + "epoch": 2.164858502548817, + "grad_norm": 0.20559094846248627, + "learning_rate": 9.147638626686404e-05, + "loss": 1.4417, + "step": 58290 + }, + { + "epoch": 2.164858502548817, + "eval_loss": 2.461625099182129, + "eval_runtime": 21.9403, + "eval_samples_per_second": 227.891, + "eval_steps_per_second": 1.231, + "step": 58290 + }, + { + "epoch": 2.1654004437143293, + "grad_norm": 0.1901310533285141, + "learning_rate": 9.146710886189276e-05, + "loss": 1.4454, + "step": 58300 + }, + { + "epoch": 2.1659423848798416, + "grad_norm": 0.2069934457540512, + "learning_rate": 9.145782693953435e-05, + "loss": 1.4521, + "step": 58310 + }, + { + "epoch": 2.166484326045354, + "grad_norm": 0.34269529581069946, + "learning_rate": 9.144854050093863e-05, + "loss": 1.4463, + "step": 58320 + }, + { + "epoch": 2.1670262672108658, + "grad_norm": 0.2547290325164795, + "learning_rate": 9.143924954725595e-05, + "loss": 1.4447, + "step": 58330 + }, + { + "epoch": 2.167568208376378, + "grad_norm": 0.2823292016983032, + "learning_rate": 9.142995407963724e-05, + "loss": 1.4402, + "step": 58340 + }, + { + "epoch": 2.1681101495418904, + "grad_norm": 0.3173072934150696, + "learning_rate": 9.1420654099234e-05, + "loss": 1.4404, + "step": 58350 + }, + { + "epoch": 2.1686520907074027, + "grad_norm": 0.23170779645442963, + "learning_rate": 9.141134960719824e-05, + "loss": 1.4542, + "step": 58360 + }, + { + "epoch": 2.1691940318729146, + "grad_norm": 0.33306044340133667, + "learning_rate": 9.140204060468257e-05, + "loss": 1.4494, + "step": 58370 + }, + { + "epoch": 2.1695733906887735, + "eval_loss": 2.4640088081359863, + "eval_runtime": 21.9887, + "eval_samples_per_second": 227.39, + "eval_steps_per_second": 1.228, + "step": 58377 + }, + { + "epoch": 2.169735973038427, + "grad_norm": 0.23135629296302795, + "learning_rate": 9.139272709284015e-05, + "loss": 1.4456, + "step": 58380 + }, + { + "epoch": 2.1702779142039392, + "grad_norm": 0.22279199957847595, + "learning_rate": 9.138340907282472e-05, + "loss": 1.4514, + "step": 58390 + }, + { + "epoch": 2.1708198553694515, + "grad_norm": 0.27086111903190613, + "learning_rate": 9.137408654579051e-05, + "loss": 1.4481, + "step": 58400 + }, + { + "epoch": 2.171361796534964, + "grad_norm": 0.21697980165481567, + "learning_rate": 9.136475951289239e-05, + "loss": 1.4424, + "step": 58410 + }, + { + "epoch": 2.1719037377004757, + "grad_norm": 0.32999521493911743, + "learning_rate": 9.135542797528574e-05, + "loss": 1.4552, + "step": 58420 + }, + { + "epoch": 2.172445678865988, + "grad_norm": 0.3084947466850281, + "learning_rate": 9.134609193412652e-05, + "loss": 1.433, + "step": 58430 + }, + { + "epoch": 2.1729876200315004, + "grad_norm": 0.21755863726139069, + "learning_rate": 9.13367513905712e-05, + "loss": 1.4449, + "step": 58440 + }, + { + "epoch": 2.1735295611970127, + "grad_norm": 0.1901983916759491, + "learning_rate": 9.13274063457769e-05, + "loss": 1.4482, + "step": 58450 + }, + { + "epoch": 2.1740715023625246, + "grad_norm": 0.18798936903476715, + "learning_rate": 9.131805680090122e-05, + "loss": 1.4472, + "step": 58460 + }, + { + "epoch": 2.1742882788287297, + "eval_loss": 2.459465503692627, + "eval_runtime": 21.9819, + "eval_samples_per_second": 227.459, + "eval_steps_per_second": 1.228, + "step": 58464 + }, + { + "epoch": 2.174613443528037, + "grad_norm": 0.2203335165977478, + "learning_rate": 9.130870275710235e-05, + "loss": 1.4538, + "step": 58470 + }, + { + "epoch": 2.175155384693549, + "grad_norm": 0.22482015192508698, + "learning_rate": 9.1299344215539e-05, + "loss": 1.4545, + "step": 58480 + }, + { + "epoch": 2.1756973258590615, + "grad_norm": 0.3685661852359772, + "learning_rate": 9.12899811773705e-05, + "loss": 1.441, + "step": 58490 + }, + { + "epoch": 2.176239267024574, + "grad_norm": 0.27606692910194397, + "learning_rate": 9.128061364375668e-05, + "loss": 1.4412, + "step": 58500 + }, + { + "epoch": 2.1767812081900857, + "grad_norm": 0.2645034193992615, + "learning_rate": 9.127124161585795e-05, + "loss": 1.4463, + "step": 58510 + }, + { + "epoch": 2.177323149355598, + "grad_norm": 0.4667447805404663, + "learning_rate": 9.126186509483529e-05, + "loss": 1.4512, + "step": 58520 + }, + { + "epoch": 2.1778650905211103, + "grad_norm": 0.5272287726402283, + "learning_rate": 9.125248408185021e-05, + "loss": 1.4451, + "step": 58530 + }, + { + "epoch": 2.1784070316866226, + "grad_norm": 0.2895019054412842, + "learning_rate": 9.12430985780648e-05, + "loss": 1.4515, + "step": 58540 + }, + { + "epoch": 2.178948972852135, + "grad_norm": 0.25505638122558594, + "learning_rate": 9.123370858464169e-05, + "loss": 1.4515, + "step": 58550 + }, + { + "epoch": 2.179003166968686, + "eval_loss": 2.462846279144287, + "eval_runtime": 21.9798, + "eval_samples_per_second": 227.481, + "eval_steps_per_second": 1.228, + "step": 58551 + }, + { + "epoch": 2.179490914017647, + "grad_norm": 0.24811875820159912, + "learning_rate": 9.122431410274406e-05, + "loss": 1.4464, + "step": 58560 + }, + { + "epoch": 2.180032855183159, + "grad_norm": 0.22253675758838654, + "learning_rate": 9.12149151335357e-05, + "loss": 1.4418, + "step": 58570 + }, + { + "epoch": 2.1805747963486715, + "grad_norm": 0.3881581127643585, + "learning_rate": 9.120551167818084e-05, + "loss": 1.4429, + "step": 58580 + }, + { + "epoch": 2.181116737514184, + "grad_norm": 0.19040873646736145, + "learning_rate": 9.119610373784442e-05, + "loss": 1.4412, + "step": 58590 + }, + { + "epoch": 2.1816586786796957, + "grad_norm": 0.18105448782444, + "learning_rate": 9.118669131369179e-05, + "loss": 1.4504, + "step": 58600 + }, + { + "epoch": 2.182200619845208, + "grad_norm": 0.23432651162147522, + "learning_rate": 9.117727440688896e-05, + "loss": 1.4373, + "step": 58610 + }, + { + "epoch": 2.1827425610107203, + "grad_norm": 0.22334584593772888, + "learning_rate": 9.116785301860244e-05, + "loss": 1.4414, + "step": 58620 + }, + { + "epoch": 2.1832845021762326, + "grad_norm": 0.3361223638057709, + "learning_rate": 9.115842714999931e-05, + "loss": 1.4394, + "step": 58630 + }, + { + "epoch": 2.1837180551086424, + "eval_loss": 2.464482545852661, + "eval_runtime": 21.9821, + "eval_samples_per_second": 227.458, + "eval_steps_per_second": 1.228, + "step": 58638 + }, + { + "epoch": 2.183826443341745, + "grad_norm": 0.5349717140197754, + "learning_rate": 9.11489968022472e-05, + "loss": 1.4439, + "step": 58640 + }, + { + "epoch": 2.184368384507257, + "grad_norm": 0.5961711406707764, + "learning_rate": 9.113956197651434e-05, + "loss": 1.4424, + "step": 58650 + }, + { + "epoch": 2.184910325672769, + "grad_norm": 0.3221561312675476, + "learning_rate": 9.113012267396943e-05, + "loss": 1.4393, + "step": 58660 + }, + { + "epoch": 2.1854522668382814, + "grad_norm": 0.36589309573173523, + "learning_rate": 9.112067889578178e-05, + "loss": 1.4436, + "step": 58670 + }, + { + "epoch": 2.1859942080037937, + "grad_norm": 0.21223586797714233, + "learning_rate": 9.111123064312125e-05, + "loss": 1.4466, + "step": 58680 + }, + { + "epoch": 2.1865361491693056, + "grad_norm": 0.21100690960884094, + "learning_rate": 9.110177791715824e-05, + "loss": 1.4415, + "step": 58690 + }, + { + "epoch": 2.187078090334818, + "grad_norm": 0.3193996548652649, + "learning_rate": 9.109232071906373e-05, + "loss": 1.4573, + "step": 58700 + }, + { + "epoch": 2.1876200315003302, + "grad_norm": 0.18448546528816223, + "learning_rate": 9.108285905000922e-05, + "loss": 1.4535, + "step": 58710 + }, + { + "epoch": 2.1881619726658426, + "grad_norm": 0.3342556953430176, + "learning_rate": 9.107339291116679e-05, + "loss": 1.4356, + "step": 58720 + }, + { + "epoch": 2.1884329432485985, + "eval_loss": 2.4611220359802246, + "eval_runtime": 21.982, + "eval_samples_per_second": 227.459, + "eval_steps_per_second": 1.228, + "step": 58725 + }, + { + "epoch": 2.188703913831355, + "grad_norm": 0.43229833245277405, + "learning_rate": 9.106392230370906e-05, + "loss": 1.4644, + "step": 58730 + }, + { + "epoch": 2.1892458549968667, + "grad_norm": 0.36450695991516113, + "learning_rate": 9.10544472288092e-05, + "loss": 1.4381, + "step": 58740 + }, + { + "epoch": 2.189787796162379, + "grad_norm": 0.18473106622695923, + "learning_rate": 9.104496768764096e-05, + "loss": 1.4404, + "step": 58750 + }, + { + "epoch": 2.1903297373278914, + "grad_norm": 0.3914441168308258, + "learning_rate": 9.103548368137863e-05, + "loss": 1.4448, + "step": 58760 + }, + { + "epoch": 2.1908716784934037, + "grad_norm": 0.38350018858909607, + "learning_rate": 9.102599521119701e-05, + "loss": 1.4437, + "step": 58770 + }, + { + "epoch": 2.191413619658916, + "grad_norm": 0.1921490877866745, + "learning_rate": 9.101650227827152e-05, + "loss": 1.4433, + "step": 58780 + }, + { + "epoch": 2.191955560824428, + "grad_norm": 0.25018230080604553, + "learning_rate": 9.100700488377809e-05, + "loss": 1.4451, + "step": 58790 + }, + { + "epoch": 2.19249750198994, + "grad_norm": 0.5067526698112488, + "learning_rate": 9.099750302889323e-05, + "loss": 1.4481, + "step": 58800 + }, + { + "epoch": 2.1930394431554525, + "grad_norm": 0.27210572361946106, + "learning_rate": 9.098799671479397e-05, + "loss": 1.4553, + "step": 58810 + }, + { + "epoch": 2.193147831388555, + "eval_loss": 2.46246600151062, + "eval_runtime": 21.9768, + "eval_samples_per_second": 227.512, + "eval_steps_per_second": 1.229, + "step": 58812 + }, + { + "epoch": 2.193581384320965, + "grad_norm": 0.1989605873823166, + "learning_rate": 9.097848594265793e-05, + "loss": 1.4463, + "step": 58820 + }, + { + "epoch": 2.1941233254864767, + "grad_norm": 0.18576236069202423, + "learning_rate": 9.096897071366326e-05, + "loss": 1.4411, + "step": 58830 + }, + { + "epoch": 2.194665266651989, + "grad_norm": 0.2913901209831238, + "learning_rate": 9.095945102898865e-05, + "loss": 1.4448, + "step": 58840 + }, + { + "epoch": 2.1952072078175013, + "grad_norm": 0.5270187854766846, + "learning_rate": 9.094992688981337e-05, + "loss": 1.454, + "step": 58850 + }, + { + "epoch": 2.1957491489830137, + "grad_norm": 0.2984652519226074, + "learning_rate": 9.094039829731721e-05, + "loss": 1.4452, + "step": 58860 + }, + { + "epoch": 2.1962910901485255, + "grad_norm": 0.3765738308429718, + "learning_rate": 9.093086525268059e-05, + "loss": 1.4389, + "step": 58870 + }, + { + "epoch": 2.196833031314038, + "grad_norm": 0.31733348965644836, + "learning_rate": 9.092132775708432e-05, + "loss": 1.4405, + "step": 58880 + }, + { + "epoch": 2.19737497247955, + "grad_norm": 0.19566883146762848, + "learning_rate": 9.091178581170996e-05, + "loss": 1.451, + "step": 58890 + }, + { + "epoch": 2.197862719528511, + "eval_loss": 2.460601568222046, + "eval_runtime": 21.9826, + "eval_samples_per_second": 227.453, + "eval_steps_per_second": 1.228, + "step": 58899 + }, + { + "epoch": 2.1979169136450625, + "grad_norm": 0.2175428867340088, + "learning_rate": 9.090223941773949e-05, + "loss": 1.4446, + "step": 58900 + }, + { + "epoch": 2.198458854810575, + "grad_norm": 0.17789192497730255, + "learning_rate": 9.089268857635546e-05, + "loss": 1.4443, + "step": 58910 + }, + { + "epoch": 2.1990007959760867, + "grad_norm": 0.17181305587291718, + "learning_rate": 9.0883133288741e-05, + "loss": 1.4535, + "step": 58920 + }, + { + "epoch": 2.199542737141599, + "grad_norm": 0.3057771325111389, + "learning_rate": 9.087357355607977e-05, + "loss": 1.4402, + "step": 58930 + }, + { + "epoch": 2.2000846783071113, + "grad_norm": 0.3295128643512726, + "learning_rate": 9.086400937955601e-05, + "loss": 1.4578, + "step": 58940 + }, + { + "epoch": 2.2006266194726236, + "grad_norm": 0.3450293242931366, + "learning_rate": 9.085444076035448e-05, + "loss": 1.451, + "step": 58950 + }, + { + "epoch": 2.201168560638136, + "grad_norm": 0.31713640689849854, + "learning_rate": 9.084486769966047e-05, + "loss": 1.4599, + "step": 58960 + }, + { + "epoch": 2.201710501803648, + "grad_norm": 0.23452381789684296, + "learning_rate": 9.083529019865988e-05, + "loss": 1.4549, + "step": 58970 + }, + { + "epoch": 2.20225244296916, + "grad_norm": 0.19139593839645386, + "learning_rate": 9.082570825853912e-05, + "loss": 1.4405, + "step": 58980 + }, + { + "epoch": 2.2025776076684673, + "eval_loss": 2.4606852531433105, + "eval_runtime": 21.9818, + "eval_samples_per_second": 227.461, + "eval_steps_per_second": 1.228, + "step": 58986 + }, + { + "epoch": 2.2027943841346724, + "grad_norm": 0.20362070202827454, + "learning_rate": 9.081612188048518e-05, + "loss": 1.4418, + "step": 58990 + }, + { + "epoch": 2.2033363253001848, + "grad_norm": 0.24732381105422974, + "learning_rate": 9.080653106568555e-05, + "loss": 1.4289, + "step": 59000 + }, + { + "epoch": 2.2038782664656966, + "grad_norm": 0.2842128872871399, + "learning_rate": 9.07969358153283e-05, + "loss": 1.4486, + "step": 59010 + }, + { + "epoch": 2.204420207631209, + "grad_norm": 0.4157765805721283, + "learning_rate": 9.078733613060205e-05, + "loss": 1.4461, + "step": 59020 + }, + { + "epoch": 2.2049621487967213, + "grad_norm": 0.5429375171661377, + "learning_rate": 9.0777732012696e-05, + "loss": 1.4433, + "step": 59030 + }, + { + "epoch": 2.2055040899622336, + "grad_norm": 0.29770663380622864, + "learning_rate": 9.076812346279982e-05, + "loss": 1.4464, + "step": 59040 + }, + { + "epoch": 2.206046031127746, + "grad_norm": 0.22405195236206055, + "learning_rate": 9.075851048210379e-05, + "loss": 1.4495, + "step": 59050 + }, + { + "epoch": 2.2065879722932578, + "grad_norm": 0.43941786885261536, + "learning_rate": 9.074889307179873e-05, + "loss": 1.4481, + "step": 59060 + }, + { + "epoch": 2.20712991345877, + "grad_norm": 0.20217150449752808, + "learning_rate": 9.073927123307602e-05, + "loss": 1.4482, + "step": 59070 + }, + { + "epoch": 2.207292495808424, + "eval_loss": 2.457340717315674, + "eval_runtime": 21.9825, + "eval_samples_per_second": 227.453, + "eval_steps_per_second": 1.228, + "step": 59073 + }, + { + "epoch": 2.2076718546242824, + "grad_norm": 0.34927546977996826, + "learning_rate": 9.072964496712755e-05, + "loss": 1.4482, + "step": 59080 + }, + { + "epoch": 2.2082137957897947, + "grad_norm": 0.2522357702255249, + "learning_rate": 9.072001427514578e-05, + "loss": 1.4505, + "step": 59090 + }, + { + "epoch": 2.2087557369553066, + "grad_norm": 0.2815626263618469, + "learning_rate": 9.07103791583237e-05, + "loss": 1.4479, + "step": 59100 + }, + { + "epoch": 2.209297678120819, + "grad_norm": 0.22079609334468842, + "learning_rate": 9.070073961785491e-05, + "loss": 1.4494, + "step": 59110 + }, + { + "epoch": 2.209839619286331, + "grad_norm": 0.2886086702346802, + "learning_rate": 9.06910956549335e-05, + "loss": 1.4356, + "step": 59120 + }, + { + "epoch": 2.2103815604518435, + "grad_norm": 0.39057257771492004, + "learning_rate": 9.068144727075409e-05, + "loss": 1.4445, + "step": 59130 + }, + { + "epoch": 2.210923501617356, + "grad_norm": 0.41756001114845276, + "learning_rate": 9.067179446651195e-05, + "loss": 1.4419, + "step": 59140 + }, + { + "epoch": 2.2114654427828677, + "grad_norm": 0.30022770166397095, + "learning_rate": 9.066213724340274e-05, + "loss": 1.4551, + "step": 59150 + }, + { + "epoch": 2.21200738394838, + "grad_norm": 0.20884348452091217, + "learning_rate": 9.065247560262282e-05, + "loss": 1.456, + "step": 59160 + }, + { + "epoch": 2.21200738394838, + "eval_loss": 2.453835964202881, + "eval_runtime": 21.9755, + "eval_samples_per_second": 227.526, + "eval_steps_per_second": 1.229, + "step": 59160 + }, + { + "epoch": 2.2125493251138924, + "grad_norm": 0.3450928330421448, + "learning_rate": 9.0642809545369e-05, + "loss": 1.4548, + "step": 59170 + }, + { + "epoch": 2.2130912662794047, + "grad_norm": 0.3694857358932495, + "learning_rate": 9.063313907283868e-05, + "loss": 1.4562, + "step": 59180 + }, + { + "epoch": 2.213633207444917, + "grad_norm": 0.28221815824508667, + "learning_rate": 9.06234641862298e-05, + "loss": 1.4502, + "step": 59190 + }, + { + "epoch": 2.214175148610429, + "grad_norm": 0.207607701420784, + "learning_rate": 9.061378488674084e-05, + "loss": 1.4369, + "step": 59200 + }, + { + "epoch": 2.214717089775941, + "grad_norm": 0.3098548948764801, + "learning_rate": 9.060410117557083e-05, + "loss": 1.4371, + "step": 59210 + }, + { + "epoch": 2.2152590309414535, + "grad_norm": 0.3138673007488251, + "learning_rate": 9.059441305391932e-05, + "loss": 1.443, + "step": 59220 + }, + { + "epoch": 2.215800972106966, + "grad_norm": 0.3069899380207062, + "learning_rate": 9.058472052298649e-05, + "loss": 1.4424, + "step": 59230 + }, + { + "epoch": 2.2163429132724777, + "grad_norm": 0.4762488305568695, + "learning_rate": 9.057502358397296e-05, + "loss": 1.4312, + "step": 59240 + }, + { + "epoch": 2.2167222720883366, + "eval_loss": 2.4593636989593506, + "eval_runtime": 21.9809, + "eval_samples_per_second": 227.471, + "eval_steps_per_second": 1.228, + "step": 59247 + }, + { + "epoch": 2.21688485443799, + "grad_norm": 0.2531556487083435, + "learning_rate": 9.056532223807995e-05, + "loss": 1.4506, + "step": 59250 + }, + { + "epoch": 2.2174267956035023, + "grad_norm": 0.42446276545524597, + "learning_rate": 9.055561648650924e-05, + "loss": 1.4452, + "step": 59260 + }, + { + "epoch": 2.2179687367690146, + "grad_norm": 0.2571756839752197, + "learning_rate": 9.054590633046313e-05, + "loss": 1.4384, + "step": 59270 + }, + { + "epoch": 2.2185106779345265, + "grad_norm": 0.29887983202934265, + "learning_rate": 9.053619177114446e-05, + "loss": 1.4408, + "step": 59280 + }, + { + "epoch": 2.219052619100039, + "grad_norm": 0.22026404738426208, + "learning_rate": 9.052647280975664e-05, + "loss": 1.4522, + "step": 59290 + }, + { + "epoch": 2.219594560265551, + "grad_norm": 0.20031945407390594, + "learning_rate": 9.051674944750362e-05, + "loss": 1.4536, + "step": 59300 + }, + { + "epoch": 2.2201365014310634, + "grad_norm": 0.2620982527732849, + "learning_rate": 9.050702168558987e-05, + "loss": 1.4432, + "step": 59310 + }, + { + "epoch": 2.2206784425965758, + "grad_norm": 0.35358208417892456, + "learning_rate": 9.049728952522042e-05, + "loss": 1.444, + "step": 59320 + }, + { + "epoch": 2.2212203837620876, + "grad_norm": 0.36926135420799255, + "learning_rate": 9.048755296760087e-05, + "loss": 1.4593, + "step": 59330 + }, + { + "epoch": 2.2214371602282927, + "eval_loss": 2.448256015777588, + "eval_runtime": 21.981, + "eval_samples_per_second": 227.47, + "eval_steps_per_second": 1.228, + "step": 59334 + }, + { + "epoch": 2.2217623249276, + "grad_norm": 0.17719149589538574, + "learning_rate": 9.047781201393731e-05, + "loss": 1.4395, + "step": 59340 + }, + { + "epoch": 2.2223042660931123, + "grad_norm": 0.23782901465892792, + "learning_rate": 9.046806666543645e-05, + "loss": 1.4385, + "step": 59350 + }, + { + "epoch": 2.2228462072586246, + "grad_norm": 0.36395078897476196, + "learning_rate": 9.045831692330546e-05, + "loss": 1.4533, + "step": 59360 + }, + { + "epoch": 2.223388148424137, + "grad_norm": 0.4966432452201843, + "learning_rate": 9.044856278875212e-05, + "loss": 1.4494, + "step": 59370 + }, + { + "epoch": 2.2239300895896488, + "grad_norm": 0.4597684442996979, + "learning_rate": 9.043880426298475e-05, + "loss": 1.4468, + "step": 59380 + }, + { + "epoch": 2.224472030755161, + "grad_norm": 0.2446877360343933, + "learning_rate": 9.042904134721214e-05, + "loss": 1.4415, + "step": 59390 + }, + { + "epoch": 2.2250139719206734, + "grad_norm": 0.2874411940574646, + "learning_rate": 9.041927404264371e-05, + "loss": 1.4472, + "step": 59400 + }, + { + "epoch": 2.2255559130861857, + "grad_norm": 0.5566506385803223, + "learning_rate": 9.040950235048938e-05, + "loss": 1.446, + "step": 59410 + }, + { + "epoch": 2.2260978542516976, + "grad_norm": 0.19569170475006104, + "learning_rate": 9.039972627195965e-05, + "loss": 1.4365, + "step": 59420 + }, + { + "epoch": 2.226152048368249, + "eval_loss": 2.458207607269287, + "eval_runtime": 21.9788, + "eval_samples_per_second": 227.492, + "eval_steps_per_second": 1.228, + "step": 59421 + }, + { + "epoch": 2.22663979541721, + "grad_norm": 0.41606298089027405, + "learning_rate": 9.038994580826549e-05, + "loss": 1.4496, + "step": 59430 + }, + { + "epoch": 2.2271817365827222, + "grad_norm": 0.21181747317314148, + "learning_rate": 9.03801609606185e-05, + "loss": 1.4425, + "step": 59440 + }, + { + "epoch": 2.2277236777482345, + "grad_norm": 0.3672482371330261, + "learning_rate": 9.03703717302308e-05, + "loss": 1.4444, + "step": 59450 + }, + { + "epoch": 2.228265618913747, + "grad_norm": 0.2655334174633026, + "learning_rate": 9.0360578118315e-05, + "loss": 1.4421, + "step": 59460 + }, + { + "epoch": 2.2288075600792587, + "grad_norm": 0.20638707280158997, + "learning_rate": 9.035078012608431e-05, + "loss": 1.4422, + "step": 59470 + }, + { + "epoch": 2.229349501244771, + "grad_norm": 0.5288010239601135, + "learning_rate": 9.034097775475244e-05, + "loss": 1.4441, + "step": 59480 + }, + { + "epoch": 2.2298914424102834, + "grad_norm": 0.37558361887931824, + "learning_rate": 9.03311710055337e-05, + "loss": 1.4386, + "step": 59490 + }, + { + "epoch": 2.2304333835757957, + "grad_norm": 0.30273330211639404, + "learning_rate": 9.032135987964287e-05, + "loss": 1.4426, + "step": 59500 + }, + { + "epoch": 2.2308669365082054, + "eval_loss": 2.4551827907562256, + "eval_runtime": 21.9794, + "eval_samples_per_second": 227.485, + "eval_steps_per_second": 1.228, + "step": 59508 + }, + { + "epoch": 2.2309753247413076, + "grad_norm": 0.34168776869773865, + "learning_rate": 9.031154437829533e-05, + "loss": 1.4485, + "step": 59510 + }, + { + "epoch": 2.23151726590682, + "grad_norm": 0.2830340266227722, + "learning_rate": 9.030172450270699e-05, + "loss": 1.446, + "step": 59520 + }, + { + "epoch": 2.232059207072332, + "grad_norm": 0.18490228056907654, + "learning_rate": 9.029190025409426e-05, + "loss": 1.4459, + "step": 59530 + }, + { + "epoch": 2.2326011482378445, + "grad_norm": 0.32973939180374146, + "learning_rate": 9.028207163367417e-05, + "loss": 1.4491, + "step": 59540 + }, + { + "epoch": 2.233143089403357, + "grad_norm": 0.2619604766368866, + "learning_rate": 9.027223864266423e-05, + "loss": 1.4446, + "step": 59550 + }, + { + "epoch": 2.2336850305688687, + "grad_norm": 0.3706190884113312, + "learning_rate": 9.02624012822825e-05, + "loss": 1.4369, + "step": 59560 + }, + { + "epoch": 2.234226971734381, + "grad_norm": 0.1908445507287979, + "learning_rate": 9.025255955374758e-05, + "loss": 1.4321, + "step": 59570 + }, + { + "epoch": 2.2347689128998933, + "grad_norm": 0.25038012862205505, + "learning_rate": 9.024271345827864e-05, + "loss": 1.4451, + "step": 59580 + }, + { + "epoch": 2.2353108540654056, + "grad_norm": 0.22101671993732452, + "learning_rate": 9.023286299709538e-05, + "loss": 1.4376, + "step": 59590 + }, + { + "epoch": 2.2355818246481616, + "eval_loss": 2.449619770050049, + "eval_runtime": 21.9821, + "eval_samples_per_second": 227.458, + "eval_steps_per_second": 1.228, + "step": 59595 + }, + { + "epoch": 2.235852795230918, + "grad_norm": 0.2644299268722534, + "learning_rate": 9.022300817141799e-05, + "loss": 1.4417, + "step": 59600 + }, + { + "epoch": 2.23639473639643, + "grad_norm": 0.18024158477783203, + "learning_rate": 9.02131489824673e-05, + "loss": 1.4442, + "step": 59610 + }, + { + "epoch": 2.236936677561942, + "grad_norm": 0.2031889110803604, + "learning_rate": 9.020328543146457e-05, + "loss": 1.4512, + "step": 59620 + }, + { + "epoch": 2.2374786187274545, + "grad_norm": 0.2275354415178299, + "learning_rate": 9.01934175196317e-05, + "loss": 1.4478, + "step": 59630 + }, + { + "epoch": 2.2380205598929668, + "grad_norm": 0.1913514882326126, + "learning_rate": 9.018354524819104e-05, + "loss": 1.4532, + "step": 59640 + }, + { + "epoch": 2.2385625010584786, + "grad_norm": 0.17917923629283905, + "learning_rate": 9.017366861836555e-05, + "loss": 1.4534, + "step": 59650 + }, + { + "epoch": 2.239104442223991, + "grad_norm": 0.1910235583782196, + "learning_rate": 9.016378763137872e-05, + "loss": 1.4452, + "step": 59660 + }, + { + "epoch": 2.2396463833895033, + "grad_norm": 0.2699836194515228, + "learning_rate": 9.015390228845454e-05, + "loss": 1.4452, + "step": 59670 + }, + { + "epoch": 2.2401883245550156, + "grad_norm": 0.5121615529060364, + "learning_rate": 9.014401259081754e-05, + "loss": 1.4472, + "step": 59680 + }, + { + "epoch": 2.240296712788118, + "eval_loss": 2.445277452468872, + "eval_runtime": 21.9791, + "eval_samples_per_second": 227.489, + "eval_steps_per_second": 1.228, + "step": 59682 + }, + { + "epoch": 2.240730265720528, + "grad_norm": 0.3202587962150574, + "learning_rate": 9.013411853969286e-05, + "loss": 1.447, + "step": 59690 + }, + { + "epoch": 2.24127220688604, + "grad_norm": 0.29666054248809814, + "learning_rate": 9.012422013630611e-05, + "loss": 1.4478, + "step": 59700 + }, + { + "epoch": 2.241814148051552, + "grad_norm": 0.2042292356491089, + "learning_rate": 9.011431738188348e-05, + "loss": 1.4473, + "step": 59710 + }, + { + "epoch": 2.2423560892170644, + "grad_norm": 0.1927926242351532, + "learning_rate": 9.010441027765165e-05, + "loss": 1.4467, + "step": 59720 + }, + { + "epoch": 2.2428980303825767, + "grad_norm": 0.3930176794528961, + "learning_rate": 9.009449882483788e-05, + "loss": 1.4493, + "step": 59730 + }, + { + "epoch": 2.2434399715480886, + "grad_norm": 0.31526270508766174, + "learning_rate": 9.008458302466995e-05, + "loss": 1.4487, + "step": 59740 + }, + { + "epoch": 2.243981912713601, + "grad_norm": 0.2965898811817169, + "learning_rate": 9.007466287837622e-05, + "loss": 1.4528, + "step": 59750 + }, + { + "epoch": 2.2445238538791132, + "grad_norm": 0.3894706964492798, + "learning_rate": 9.006473838718551e-05, + "loss": 1.443, + "step": 59760 + }, + { + "epoch": 2.2450116009280743, + "eval_loss": 2.4487032890319824, + "eval_runtime": 21.9873, + "eval_samples_per_second": 227.404, + "eval_steps_per_second": 1.228, + "step": 59769 + }, + { + "epoch": 2.2450657950446256, + "grad_norm": 0.2878785729408264, + "learning_rate": 9.005480955232726e-05, + "loss": 1.4379, + "step": 59770 + }, + { + "epoch": 2.245607736210138, + "grad_norm": 0.2061561793088913, + "learning_rate": 9.004487637503139e-05, + "loss": 1.4331, + "step": 59780 + }, + { + "epoch": 2.2461496773756497, + "grad_norm": 0.24609437584877014, + "learning_rate": 9.00349388565284e-05, + "loss": 1.4486, + "step": 59790 + }, + { + "epoch": 2.246691618541162, + "grad_norm": 0.24656355381011963, + "learning_rate": 9.002499699804927e-05, + "loss": 1.442, + "step": 59800 + }, + { + "epoch": 2.2472335597066744, + "grad_norm": 0.23344773054122925, + "learning_rate": 9.001505080082558e-05, + "loss": 1.452, + "step": 59810 + }, + { + "epoch": 2.2477755008721867, + "grad_norm": 0.2287297546863556, + "learning_rate": 9.000510026608942e-05, + "loss": 1.4439, + "step": 59820 + }, + { + "epoch": 2.248317442037699, + "grad_norm": 0.2048652619123459, + "learning_rate": 8.999514539507342e-05, + "loss": 1.4392, + "step": 59830 + }, + { + "epoch": 2.248859383203211, + "grad_norm": 0.2534552812576294, + "learning_rate": 8.998518618901073e-05, + "loss": 1.4449, + "step": 59840 + }, + { + "epoch": 2.249401324368723, + "grad_norm": 0.22183406352996826, + "learning_rate": 8.997522264913508e-05, + "loss": 1.4556, + "step": 59850 + }, + { + "epoch": 2.2497264890680304, + "eval_loss": 2.4434633255004883, + "eval_runtime": 21.9826, + "eval_samples_per_second": 227.452, + "eval_steps_per_second": 1.228, + "step": 59856 + }, + { + "epoch": 2.2499432655342355, + "grad_norm": 0.26273661851882935, + "learning_rate": 8.996525477668068e-05, + "loss": 1.4403, + "step": 59860 + }, + { + "epoch": 2.250485206699748, + "grad_norm": 0.2545994222164154, + "learning_rate": 8.995528257288233e-05, + "loss": 1.4377, + "step": 59870 + }, + { + "epoch": 2.2510271478652597, + "grad_norm": 0.2006346583366394, + "learning_rate": 8.994530603897534e-05, + "loss": 1.4457, + "step": 59880 + }, + { + "epoch": 2.251569089030772, + "grad_norm": 0.17895658314228058, + "learning_rate": 8.993532517619554e-05, + "loss": 1.4424, + "step": 59890 + }, + { + "epoch": 2.2521110301962843, + "grad_norm": 0.2239982634782791, + "learning_rate": 8.992533998577936e-05, + "loss": 1.4329, + "step": 59900 + }, + { + "epoch": 2.2526529713617967, + "grad_norm": 0.2703670263290405, + "learning_rate": 8.991535046896367e-05, + "loss": 1.456, + "step": 59910 + }, + { + "epoch": 2.2531949125273085, + "grad_norm": 0.23329605162143707, + "learning_rate": 8.990535662698596e-05, + "loss": 1.4411, + "step": 59920 + }, + { + "epoch": 2.253736853692821, + "grad_norm": 0.25912684202194214, + "learning_rate": 8.989535846108421e-05, + "loss": 1.446, + "step": 59930 + }, + { + "epoch": 2.254278794858333, + "grad_norm": 0.22521668672561646, + "learning_rate": 8.988535597249696e-05, + "loss": 1.4516, + "step": 59940 + }, + { + "epoch": 2.254441377207987, + "eval_loss": 2.432748556137085, + "eval_runtime": 21.9825, + "eval_samples_per_second": 227.454, + "eval_steps_per_second": 1.228, + "step": 59943 + }, + { + "epoch": 2.2548207360238455, + "grad_norm": 0.20753563940525055, + "learning_rate": 8.987534916246327e-05, + "loss": 1.4364, + "step": 59950 + }, + { + "epoch": 2.255362677189358, + "grad_norm": 0.18667367100715637, + "learning_rate": 8.986533803222272e-05, + "loss": 1.4418, + "step": 59960 + }, + { + "epoch": 2.2559046183548697, + "grad_norm": 0.2076788991689682, + "learning_rate": 8.985532258301549e-05, + "loss": 1.4435, + "step": 59970 + }, + { + "epoch": 2.256446559520382, + "grad_norm": 0.2706129550933838, + "learning_rate": 8.98453028160822e-05, + "loss": 1.4415, + "step": 59980 + }, + { + "epoch": 2.2569885006858943, + "grad_norm": 0.21341146528720856, + "learning_rate": 8.98352787326641e-05, + "loss": 1.4393, + "step": 59990 + }, + { + "epoch": 2.2575304418514066, + "grad_norm": 0.21545018255710602, + "learning_rate": 8.982525033400289e-05, + "loss": 1.4484, + "step": 60000 + }, + { + "epoch": 2.258072383016919, + "grad_norm": 0.23385457694530487, + "learning_rate": 8.98152176213409e-05, + "loss": 1.4451, + "step": 60010 + }, + { + "epoch": 2.258614324182431, + "grad_norm": 0.20415639877319336, + "learning_rate": 8.980518059592088e-05, + "loss": 1.4394, + "step": 60020 + }, + { + "epoch": 2.259156265347943, + "grad_norm": 0.25711020827293396, + "learning_rate": 8.97951392589862e-05, + "loss": 1.4322, + "step": 60030 + }, + { + "epoch": 2.259156265347943, + "eval_loss": 2.4509191513061523, + "eval_runtime": 21.9779, + "eval_samples_per_second": 227.502, + "eval_steps_per_second": 1.229, + "step": 60030 + }, + { + "epoch": 2.2596982065134554, + "grad_norm": 0.3049508333206177, + "learning_rate": 8.978509361178073e-05, + "loss": 1.4373, + "step": 60040 + }, + { + "epoch": 2.2602401476789677, + "grad_norm": 0.2312183976173401, + "learning_rate": 8.97750436555489e-05, + "loss": 1.4361, + "step": 60050 + }, + { + "epoch": 2.26078208884448, + "grad_norm": 0.2977278232574463, + "learning_rate": 8.976498939153562e-05, + "loss": 1.4418, + "step": 60060 + }, + { + "epoch": 2.261324030009992, + "grad_norm": 0.18335352838039398, + "learning_rate": 8.975493082098639e-05, + "loss": 1.4427, + "step": 60070 + }, + { + "epoch": 2.2618659711755043, + "grad_norm": 0.22939863801002502, + "learning_rate": 8.974486794514723e-05, + "loss": 1.4411, + "step": 60080 + }, + { + "epoch": 2.2624079123410166, + "grad_norm": 0.17828980088233948, + "learning_rate": 8.973480076526469e-05, + "loss": 1.4399, + "step": 60090 + }, + { + "epoch": 2.262949853506529, + "grad_norm": 0.2951532304286957, + "learning_rate": 8.97247292825858e-05, + "loss": 1.4469, + "step": 60100 + }, + { + "epoch": 2.2634917946720408, + "grad_norm": 0.2627731263637543, + "learning_rate": 8.971465349835824e-05, + "loss": 1.4371, + "step": 60110 + }, + { + "epoch": 2.2638711534878997, + "eval_loss": 2.4536399841308594, + "eval_runtime": 21.9812, + "eval_samples_per_second": 227.467, + "eval_steps_per_second": 1.228, + "step": 60117 + }, + { + "epoch": 2.264033735837553, + "grad_norm": 0.1896529346704483, + "learning_rate": 8.970457341383011e-05, + "loss": 1.4413, + "step": 60120 + }, + { + "epoch": 2.2645756770030654, + "grad_norm": 0.1765110045671463, + "learning_rate": 8.969448903025008e-05, + "loss": 1.445, + "step": 60130 + }, + { + "epoch": 2.2651176181685777, + "grad_norm": 0.19842223823070526, + "learning_rate": 8.96844003488674e-05, + "loss": 1.4459, + "step": 60140 + }, + { + "epoch": 2.2656595593340896, + "grad_norm": 0.23995903134346008, + "learning_rate": 8.967430737093179e-05, + "loss": 1.4476, + "step": 60150 + }, + { + "epoch": 2.266201500499602, + "grad_norm": 0.3694940209388733, + "learning_rate": 8.966421009769352e-05, + "loss": 1.4348, + "step": 60160 + }, + { + "epoch": 2.266743441665114, + "grad_norm": 0.27862706780433655, + "learning_rate": 8.965410853040338e-05, + "loss": 1.4425, + "step": 60170 + }, + { + "epoch": 2.2672853828306265, + "grad_norm": 0.2391398549079895, + "learning_rate": 8.964400267031274e-05, + "loss": 1.4437, + "step": 60180 + }, + { + "epoch": 2.267827323996139, + "grad_norm": 0.17560423910617828, + "learning_rate": 8.963389251867346e-05, + "loss": 1.4396, + "step": 60190 + }, + { + "epoch": 2.2683692651616507, + "grad_norm": 0.2710578143596649, + "learning_rate": 8.962377807673795e-05, + "loss": 1.4407, + "step": 60200 + }, + { + "epoch": 2.268586041627856, + "eval_loss": 2.433633327484131, + "eval_runtime": 21.9783, + "eval_samples_per_second": 227.497, + "eval_steps_per_second": 1.228, + "step": 60204 + }, + { + "epoch": 2.268911206327163, + "grad_norm": 0.2406301647424698, + "learning_rate": 8.961365934575913e-05, + "loss": 1.4453, + "step": 60210 + }, + { + "epoch": 2.2694531474926753, + "grad_norm": 0.48485299944877625, + "learning_rate": 8.960353632699046e-05, + "loss": 1.4345, + "step": 60220 + }, + { + "epoch": 2.2699950886581877, + "grad_norm": 0.270643025636673, + "learning_rate": 8.959340902168594e-05, + "loss": 1.4527, + "step": 60230 + }, + { + "epoch": 2.2705370298237, + "grad_norm": 0.22174952924251556, + "learning_rate": 8.95832774311001e-05, + "loss": 1.4415, + "step": 60240 + }, + { + "epoch": 2.271078970989212, + "grad_norm": 0.4398956298828125, + "learning_rate": 8.957314155648801e-05, + "loss": 1.4397, + "step": 60250 + }, + { + "epoch": 2.271620912154724, + "grad_norm": 0.2305718958377838, + "learning_rate": 8.956300139910525e-05, + "loss": 1.4484, + "step": 60260 + }, + { + "epoch": 2.2721628533202365, + "grad_norm": 0.2533299922943115, + "learning_rate": 8.955285696020793e-05, + "loss": 1.4524, + "step": 60270 + }, + { + "epoch": 2.272704794485749, + "grad_norm": 0.20475125312805176, + "learning_rate": 8.954270824105268e-05, + "loss": 1.4427, + "step": 60280 + }, + { + "epoch": 2.2732467356512607, + "grad_norm": 0.20677226781845093, + "learning_rate": 8.953255524289671e-05, + "loss": 1.4397, + "step": 60290 + }, + { + "epoch": 2.273300929767812, + "eval_loss": 2.436509132385254, + "eval_runtime": 22.2954, + "eval_samples_per_second": 224.261, + "eval_steps_per_second": 1.211, + "step": 60291 + }, + { + "epoch": 2.273788676816773, + "grad_norm": 0.21641433238983154, + "learning_rate": 8.952239796699774e-05, + "loss": 1.4478, + "step": 60300 + }, + { + "epoch": 2.2743306179822853, + "grad_norm": 0.18868723511695862, + "learning_rate": 8.951223641461398e-05, + "loss": 1.4441, + "step": 60310 + }, + { + "epoch": 2.2748725591477976, + "grad_norm": 0.22199149429798126, + "learning_rate": 8.950207058700422e-05, + "loss": 1.4376, + "step": 60320 + }, + { + "epoch": 2.2754145003133095, + "grad_norm": 0.21569089591503143, + "learning_rate": 8.949190048542773e-05, + "loss": 1.4365, + "step": 60330 + }, + { + "epoch": 2.275956441478822, + "grad_norm": 0.2327711433172226, + "learning_rate": 8.948172611114438e-05, + "loss": 1.4375, + "step": 60340 + }, + { + "epoch": 2.276498382644334, + "grad_norm": 0.25674811005592346, + "learning_rate": 8.947154746541448e-05, + "loss": 1.442, + "step": 60350 + }, + { + "epoch": 2.2770403238098464, + "grad_norm": 0.26655909419059753, + "learning_rate": 8.946136454949895e-05, + "loss": 1.4425, + "step": 60360 + }, + { + "epoch": 2.2775822649753588, + "grad_norm": 0.2611353099346161, + "learning_rate": 8.94511773646592e-05, + "loss": 1.4407, + "step": 60370 + }, + { + "epoch": 2.2780158179077685, + "eval_loss": 2.4322497844696045, + "eval_runtime": 21.9808, + "eval_samples_per_second": 227.471, + "eval_steps_per_second": 1.228, + "step": 60378 + }, + { + "epoch": 2.2781242061408706, + "grad_norm": 0.22814051806926727, + "learning_rate": 8.944098591215717e-05, + "loss": 1.4372, + "step": 60380 + }, + { + "epoch": 2.278666147306383, + "grad_norm": 0.3651120066642761, + "learning_rate": 8.943079019325533e-05, + "loss": 1.4415, + "step": 60390 + }, + { + "epoch": 2.2792080884718953, + "grad_norm": 0.3389890193939209, + "learning_rate": 8.94205902092167e-05, + "loss": 1.4422, + "step": 60400 + }, + { + "epoch": 2.2797500296374076, + "grad_norm": 0.32138633728027344, + "learning_rate": 8.941038596130477e-05, + "loss": 1.428, + "step": 60410 + }, + { + "epoch": 2.28029197080292, + "grad_norm": 0.2286197692155838, + "learning_rate": 8.940017745078363e-05, + "loss": 1.4454, + "step": 60420 + }, + { + "epoch": 2.2808339119684318, + "grad_norm": 0.26349860429763794, + "learning_rate": 8.938996467891786e-05, + "loss": 1.4401, + "step": 60430 + }, + { + "epoch": 2.281375853133944, + "grad_norm": 0.1832720935344696, + "learning_rate": 8.937974764697258e-05, + "loss": 1.451, + "step": 60440 + }, + { + "epoch": 2.2819177942994564, + "grad_norm": 0.18660111725330353, + "learning_rate": 8.936952635621341e-05, + "loss": 1.4453, + "step": 60450 + }, + { + "epoch": 2.2824597354649687, + "grad_norm": 0.22778889536857605, + "learning_rate": 8.935930080790655e-05, + "loss": 1.4451, + "step": 60460 + }, + { + "epoch": 2.2827307060477247, + "eval_loss": 2.4412660598754883, + "eval_runtime": 21.9845, + "eval_samples_per_second": 227.433, + "eval_steps_per_second": 1.228, + "step": 60465 + }, + { + "epoch": 2.283001676630481, + "grad_norm": 0.4049639105796814, + "learning_rate": 8.934907100331865e-05, + "loss": 1.4458, + "step": 60470 + }, + { + "epoch": 2.283543617795993, + "grad_norm": 0.20592905580997467, + "learning_rate": 8.933883694371699e-05, + "loss": 1.441, + "step": 60480 + }, + { + "epoch": 2.2840855589615052, + "grad_norm": 0.5759056806564331, + "learning_rate": 8.932859863036927e-05, + "loss": 1.4342, + "step": 60490 + }, + { + "epoch": 2.2846275001270175, + "grad_norm": 0.3856758177280426, + "learning_rate": 8.931835606454382e-05, + "loss": 1.4307, + "step": 60500 + }, + { + "epoch": 2.28516944129253, + "grad_norm": 0.24121706187725067, + "learning_rate": 8.930810924750939e-05, + "loss": 1.4466, + "step": 60510 + }, + { + "epoch": 2.2857113824580417, + "grad_norm": 0.3475324511528015, + "learning_rate": 8.929785818053534e-05, + "loss": 1.4371, + "step": 60520 + }, + { + "epoch": 2.286253323623554, + "grad_norm": 0.2951948642730713, + "learning_rate": 8.928760286489155e-05, + "loss": 1.4513, + "step": 60530 + }, + { + "epoch": 2.2867952647890664, + "grad_norm": 0.27565380930900574, + "learning_rate": 8.927734330184835e-05, + "loss": 1.4397, + "step": 60540 + }, + { + "epoch": 2.2873372059545787, + "grad_norm": 0.22539940476417542, + "learning_rate": 8.92670794926767e-05, + "loss": 1.4414, + "step": 60550 + }, + { + "epoch": 2.2874455941876812, + "eval_loss": 2.4488258361816406, + "eval_runtime": 21.9792, + "eval_samples_per_second": 227.487, + "eval_steps_per_second": 1.228, + "step": 60552 + }, + { + "epoch": 2.2878791471200906, + "grad_norm": 0.2256643921136856, + "learning_rate": 8.9256811438648e-05, + "loss": 1.4383, + "step": 60560 + }, + { + "epoch": 2.288421088285603, + "grad_norm": 0.23651210963726044, + "learning_rate": 8.924653914103423e-05, + "loss": 1.44, + "step": 60570 + }, + { + "epoch": 2.288963029451115, + "grad_norm": 0.2763255834579468, + "learning_rate": 8.923626260110789e-05, + "loss": 1.4377, + "step": 60580 + }, + { + "epoch": 2.2895049706166275, + "grad_norm": 0.34174492955207825, + "learning_rate": 8.922598182014197e-05, + "loss": 1.4544, + "step": 60590 + }, + { + "epoch": 2.29004691178214, + "grad_norm": 0.26863643527030945, + "learning_rate": 8.921569679941e-05, + "loss": 1.4435, + "step": 60600 + }, + { + "epoch": 2.2905888529476517, + "grad_norm": 0.24815820157527924, + "learning_rate": 8.920540754018608e-05, + "loss": 1.4385, + "step": 60610 + }, + { + "epoch": 2.291130794113164, + "grad_norm": 0.3145918548107147, + "learning_rate": 8.919511404374476e-05, + "loss": 1.446, + "step": 60620 + }, + { + "epoch": 2.2916727352786763, + "grad_norm": 0.2556162178516388, + "learning_rate": 8.918481631136119e-05, + "loss": 1.4419, + "step": 60630 + }, + { + "epoch": 2.2921604823276374, + "eval_loss": 2.4316673278808594, + "eval_runtime": 21.9778, + "eval_samples_per_second": 227.503, + "eval_steps_per_second": 1.229, + "step": 60639 + }, + { + "epoch": 2.2922146764441886, + "grad_norm": 0.32888561487197876, + "learning_rate": 8.9174514344311e-05, + "loss": 1.4446, + "step": 60640 + }, + { + "epoch": 2.292756617609701, + "grad_norm": 0.289276123046875, + "learning_rate": 8.916420814387033e-05, + "loss": 1.4418, + "step": 60650 + }, + { + "epoch": 2.293298558775213, + "grad_norm": 0.4594111740589142, + "learning_rate": 8.915389771131587e-05, + "loss": 1.4414, + "step": 60660 + }, + { + "epoch": 2.293840499940725, + "grad_norm": 0.2204999476671219, + "learning_rate": 8.914358304792486e-05, + "loss": 1.4378, + "step": 60670 + }, + { + "epoch": 2.2943824411062375, + "grad_norm": 0.22972345352172852, + "learning_rate": 8.913326415497501e-05, + "loss": 1.4386, + "step": 60680 + }, + { + "epoch": 2.2949243822717498, + "grad_norm": 0.2072676122188568, + "learning_rate": 8.912294103374459e-05, + "loss": 1.4439, + "step": 60690 + }, + { + "epoch": 2.295466323437262, + "grad_norm": 0.18969523906707764, + "learning_rate": 8.911261368551238e-05, + "loss": 1.4416, + "step": 60700 + }, + { + "epoch": 2.296008264602774, + "grad_norm": 0.24862347543239594, + "learning_rate": 8.910228211155771e-05, + "loss": 1.4567, + "step": 60710 + }, + { + "epoch": 2.2965502057682863, + "grad_norm": 0.3798186779022217, + "learning_rate": 8.909194631316035e-05, + "loss": 1.4435, + "step": 60720 + }, + { + "epoch": 2.2968753704675935, + "eval_loss": 2.450904369354248, + "eval_runtime": 21.9806, + "eval_samples_per_second": 227.474, + "eval_steps_per_second": 1.228, + "step": 60726 + }, + { + "epoch": 2.2970921469337986, + "grad_norm": 0.32553133368492126, + "learning_rate": 8.908160629160073e-05, + "loss": 1.4286, + "step": 60730 + }, + { + "epoch": 2.297634088099311, + "grad_norm": 0.3055340647697449, + "learning_rate": 8.907126204815966e-05, + "loss": 1.4405, + "step": 60740 + }, + { + "epoch": 2.298176029264823, + "grad_norm": 0.28653863072395325, + "learning_rate": 8.906091358411858e-05, + "loss": 1.4405, + "step": 60750 + }, + { + "epoch": 2.298717970430335, + "grad_norm": 0.2887609004974365, + "learning_rate": 8.905056090075942e-05, + "loss": 1.4377, + "step": 60760 + }, + { + "epoch": 2.2992599115958474, + "grad_norm": 0.23055769503116608, + "learning_rate": 8.90402039993646e-05, + "loss": 1.4483, + "step": 60770 + }, + { + "epoch": 2.2998018527613597, + "grad_norm": 0.20625241100788116, + "learning_rate": 8.90298428812171e-05, + "loss": 1.4344, + "step": 60780 + }, + { + "epoch": 2.3003437939268716, + "grad_norm": 0.1865055412054062, + "learning_rate": 8.901947754760039e-05, + "loss": 1.4477, + "step": 60790 + }, + { + "epoch": 2.300885735092384, + "grad_norm": 0.36700624227523804, + "learning_rate": 8.90091079997985e-05, + "loss": 1.4487, + "step": 60800 + }, + { + "epoch": 2.3014276762578962, + "grad_norm": 0.2842310667037964, + "learning_rate": 8.899873423909599e-05, + "loss": 1.4478, + "step": 60810 + }, + { + "epoch": 2.30159025860755, + "eval_loss": 2.4509172439575195, + "eval_runtime": 35.9865, + "eval_samples_per_second": 138.941, + "eval_steps_per_second": 0.75, + "step": 60813 + }, + { + "epoch": 2.3019696174234086, + "grad_norm": 0.24928165972232819, + "learning_rate": 8.898835626677786e-05, + "loss": 1.4322, + "step": 60820 + }, + { + "epoch": 2.302511558588921, + "grad_norm": 0.22808949649333954, + "learning_rate": 8.897797408412973e-05, + "loss": 1.4377, + "step": 60830 + }, + { + "epoch": 2.3030534997544327, + "grad_norm": 0.22761550545692444, + "learning_rate": 8.896758769243769e-05, + "loss": 1.4548, + "step": 60840 + }, + { + "epoch": 2.303595440919945, + "grad_norm": 0.24886593222618103, + "learning_rate": 8.895719709298837e-05, + "loss": 1.4398, + "step": 60850 + }, + { + "epoch": 2.3041373820854574, + "grad_norm": 0.1771778017282486, + "learning_rate": 8.894680228706888e-05, + "loss": 1.4439, + "step": 60860 + }, + { + "epoch": 2.3046793232509697, + "grad_norm": 0.28259819746017456, + "learning_rate": 8.893640327596692e-05, + "loss": 1.4408, + "step": 60870 + }, + { + "epoch": 2.305221264416482, + "grad_norm": 0.23001182079315186, + "learning_rate": 8.892600006097065e-05, + "loss": 1.4418, + "step": 60880 + }, + { + "epoch": 2.305763205581994, + "grad_norm": 0.21615125238895416, + "learning_rate": 8.891559264336879e-05, + "loss": 1.4419, + "step": 60890 + }, + { + "epoch": 2.306305146747506, + "grad_norm": 0.2845569849014282, + "learning_rate": 8.890518102445055e-05, + "loss": 1.4417, + "step": 60900 + }, + { + "epoch": 2.306305146747506, + "eval_loss": 2.452852487564087, + "eval_runtime": 21.8629, + "eval_samples_per_second": 228.698, + "eval_steps_per_second": 1.235, + "step": 60900 + }, + { + "epoch": 2.3068470879130185, + "grad_norm": 0.21112145483493805, + "learning_rate": 8.88947652055057e-05, + "loss": 1.4355, + "step": 60910 + }, + { + "epoch": 2.307389029078531, + "grad_norm": 0.2809900641441345, + "learning_rate": 8.888434518782449e-05, + "loss": 1.441, + "step": 60920 + }, + { + "epoch": 2.3079309702440427, + "grad_norm": 0.19691738486289978, + "learning_rate": 8.88739209726977e-05, + "loss": 1.4439, + "step": 60930 + }, + { + "epoch": 2.308472911409555, + "grad_norm": 0.5515458583831787, + "learning_rate": 8.886349256141665e-05, + "loss": 1.4473, + "step": 60940 + }, + { + "epoch": 2.3090148525750673, + "grad_norm": 0.34075281023979187, + "learning_rate": 8.885305995527317e-05, + "loss": 1.4532, + "step": 60950 + }, + { + "epoch": 2.3095567937405796, + "grad_norm": 0.2703841030597687, + "learning_rate": 8.884262315555958e-05, + "loss": 1.4428, + "step": 60960 + }, + { + "epoch": 2.3100987349060915, + "grad_norm": 0.24917836487293243, + "learning_rate": 8.88321821635688e-05, + "loss": 1.4448, + "step": 60970 + }, + { + "epoch": 2.310640676071604, + "grad_norm": 0.22919951379299164, + "learning_rate": 8.882173698059414e-05, + "loss": 1.4484, + "step": 60980 + }, + { + "epoch": 2.3110200348874628, + "eval_loss": 2.4533286094665527, + "eval_runtime": 21.9805, + "eval_samples_per_second": 227.474, + "eval_steps_per_second": 1.228, + "step": 60987 + }, + { + "epoch": 2.311182617237116, + "grad_norm": 0.2503993511199951, + "learning_rate": 8.881128760792957e-05, + "loss": 1.4377, + "step": 60990 + }, + { + "epoch": 2.3117245584026285, + "grad_norm": 0.2338937520980835, + "learning_rate": 8.880083404686947e-05, + "loss": 1.4339, + "step": 61000 + }, + { + "epoch": 2.312266499568141, + "grad_norm": 0.2059098482131958, + "learning_rate": 8.879037629870878e-05, + "loss": 1.4293, + "step": 61010 + }, + { + "epoch": 2.3128084407336527, + "grad_norm": 0.20788943767547607, + "learning_rate": 8.8779914364743e-05, + "loss": 1.429, + "step": 61020 + }, + { + "epoch": 2.313350381899165, + "grad_norm": 0.2890428900718689, + "learning_rate": 8.87694482462681e-05, + "loss": 1.4375, + "step": 61030 + }, + { + "epoch": 2.3138923230646773, + "grad_norm": 0.235799640417099, + "learning_rate": 8.875897794458053e-05, + "loss": 1.4417, + "step": 61040 + }, + { + "epoch": 2.3144342642301896, + "grad_norm": 0.39794206619262695, + "learning_rate": 8.874850346097736e-05, + "loss": 1.4408, + "step": 61050 + }, + { + "epoch": 2.314976205395702, + "grad_norm": 0.46668577194213867, + "learning_rate": 8.87380247967561e-05, + "loss": 1.4364, + "step": 61060 + }, + { + "epoch": 2.315518146561214, + "grad_norm": 0.21078045666217804, + "learning_rate": 8.872754195321479e-05, + "loss": 1.4361, + "step": 61070 + }, + { + "epoch": 2.315734923027419, + "eval_loss": 2.4492809772491455, + "eval_runtime": 21.9892, + "eval_samples_per_second": 227.384, + "eval_steps_per_second": 1.228, + "step": 61074 + }, + { + "epoch": 2.316060087726726, + "grad_norm": 0.27754735946655273, + "learning_rate": 8.871705493165203e-05, + "loss": 1.4314, + "step": 61080 + }, + { + "epoch": 2.3166020288922384, + "grad_norm": 0.22259338200092316, + "learning_rate": 8.870656373336689e-05, + "loss": 1.4417, + "step": 61090 + }, + { + "epoch": 2.3171439700577507, + "grad_norm": 0.21421173214912415, + "learning_rate": 8.869606835965896e-05, + "loss": 1.4432, + "step": 61100 + }, + { + "epoch": 2.317685911223263, + "grad_norm": 0.2673332691192627, + "learning_rate": 8.868556881182838e-05, + "loss": 1.4422, + "step": 61110 + }, + { + "epoch": 2.318227852388775, + "grad_norm": 0.18308067321777344, + "learning_rate": 8.867506509117578e-05, + "loss": 1.426, + "step": 61120 + }, + { + "epoch": 2.3187697935542873, + "grad_norm": 0.3170804977416992, + "learning_rate": 8.866455719900234e-05, + "loss": 1.4377, + "step": 61130 + }, + { + "epoch": 2.3193117347197996, + "grad_norm": 0.28330197930336, + "learning_rate": 8.865404513660968e-05, + "loss": 1.437, + "step": 61140 + }, + { + "epoch": 2.319853675885312, + "grad_norm": 0.17759016156196594, + "learning_rate": 8.864352890530005e-05, + "loss": 1.439, + "step": 61150 + }, + { + "epoch": 2.3203956170508238, + "grad_norm": 0.33059704303741455, + "learning_rate": 8.863300850637611e-05, + "loss": 1.4346, + "step": 61160 + }, + { + "epoch": 2.320449811167375, + "eval_loss": 2.450498580932617, + "eval_runtime": 21.984, + "eval_samples_per_second": 227.438, + "eval_steps_per_second": 1.228, + "step": 61161 + }, + { + "epoch": 2.320937558216336, + "grad_norm": 0.26688456535339355, + "learning_rate": 8.862248394114111e-05, + "loss": 1.4378, + "step": 61170 + }, + { + "epoch": 2.3214794993818484, + "grad_norm": 0.25356730818748474, + "learning_rate": 8.861195521089875e-05, + "loss": 1.439, + "step": 61180 + }, + { + "epoch": 2.3220214405473607, + "grad_norm": 0.33288466930389404, + "learning_rate": 8.860142231695331e-05, + "loss": 1.4367, + "step": 61190 + }, + { + "epoch": 2.3225633817128726, + "grad_norm": 0.22826845943927765, + "learning_rate": 8.859088526060957e-05, + "loss": 1.4443, + "step": 61200 + }, + { + "epoch": 2.323105322878385, + "grad_norm": 0.214629128575325, + "learning_rate": 8.85803440431728e-05, + "loss": 1.4391, + "step": 61210 + }, + { + "epoch": 2.323647264043897, + "grad_norm": 0.21806223690509796, + "learning_rate": 8.856979866594879e-05, + "loss": 1.4335, + "step": 61220 + }, + { + "epoch": 2.3241892052094095, + "grad_norm": 0.20262044668197632, + "learning_rate": 8.855924913024388e-05, + "loss": 1.4399, + "step": 61230 + }, + { + "epoch": 2.324731146374922, + "grad_norm": 0.267966628074646, + "learning_rate": 8.854869543736487e-05, + "loss": 1.4412, + "step": 61240 + }, + { + "epoch": 2.3251646993073316, + "eval_loss": 2.4549691677093506, + "eval_runtime": 21.9814, + "eval_samples_per_second": 227.465, + "eval_steps_per_second": 1.228, + "step": 61248 + }, + { + "epoch": 2.3252730875404337, + "grad_norm": 0.23619084060192108, + "learning_rate": 8.853813758861915e-05, + "loss": 1.442, + "step": 61250 + }, + { + "epoch": 2.325815028705946, + "grad_norm": 0.4099680185317993, + "learning_rate": 8.852757558531453e-05, + "loss": 1.4374, + "step": 61260 + }, + { + "epoch": 2.3263569698714583, + "grad_norm": 0.20198297500610352, + "learning_rate": 8.851700942875943e-05, + "loss": 1.4314, + "step": 61270 + }, + { + "epoch": 2.3268989110369707, + "grad_norm": 0.22473087906837463, + "learning_rate": 8.850643912026269e-05, + "loss": 1.4323, + "step": 61280 + }, + { + "epoch": 2.327440852202483, + "grad_norm": 0.19374053180217743, + "learning_rate": 8.849586466113376e-05, + "loss": 1.4348, + "step": 61290 + }, + { + "epoch": 2.327982793367995, + "grad_norm": 0.20663167536258698, + "learning_rate": 8.848528605268255e-05, + "loss": 1.4335, + "step": 61300 + }, + { + "epoch": 2.328524734533507, + "grad_norm": 0.2488778531551361, + "learning_rate": 8.847470329621945e-05, + "loss": 1.4479, + "step": 61310 + }, + { + "epoch": 2.3290666756990195, + "grad_norm": 0.181378573179245, + "learning_rate": 8.846411639305546e-05, + "loss": 1.4368, + "step": 61320 + }, + { + "epoch": 2.329608616864532, + "grad_norm": 0.2096365988254547, + "learning_rate": 8.845352534450202e-05, + "loss": 1.4265, + "step": 61330 + }, + { + "epoch": 2.3298795874472877, + "eval_loss": 2.452427625656128, + "eval_runtime": 21.9811, + "eval_samples_per_second": 227.468, + "eval_steps_per_second": 1.228, + "step": 61335 + }, + { + "epoch": 2.3301505580300437, + "grad_norm": 0.3072015345096588, + "learning_rate": 8.84429301518711e-05, + "loss": 1.4511, + "step": 61340 + }, + { + "epoch": 2.330692499195556, + "grad_norm": 0.21584992110729218, + "learning_rate": 8.843233081647519e-05, + "loss": 1.4421, + "step": 61350 + }, + { + "epoch": 2.3312344403610683, + "grad_norm": 0.5617396831512451, + "learning_rate": 8.842172733962727e-05, + "loss": 1.4423, + "step": 61360 + }, + { + "epoch": 2.3317763815265806, + "grad_norm": 0.194200336933136, + "learning_rate": 8.841111972264088e-05, + "loss": 1.4333, + "step": 61370 + }, + { + "epoch": 2.3323183226920925, + "grad_norm": 0.457123339176178, + "learning_rate": 8.840050796683005e-05, + "loss": 1.441, + "step": 61380 + }, + { + "epoch": 2.332860263857605, + "grad_norm": 0.24595646560192108, + "learning_rate": 8.838989207350928e-05, + "loss": 1.4481, + "step": 61390 + }, + { + "epoch": 2.333402205023117, + "grad_norm": 0.22267189621925354, + "learning_rate": 8.837927204399367e-05, + "loss": 1.4456, + "step": 61400 + }, + { + "epoch": 2.3339441461886294, + "grad_norm": 0.18372011184692383, + "learning_rate": 8.836864787959873e-05, + "loss": 1.4442, + "step": 61410 + }, + { + "epoch": 2.3344860873541418, + "grad_norm": 0.18260882794857025, + "learning_rate": 8.835801958164059e-05, + "loss": 1.4382, + "step": 61420 + }, + { + "epoch": 2.334594475587244, + "eval_loss": 2.4512012004852295, + "eval_runtime": 21.9808, + "eval_samples_per_second": 227.471, + "eval_steps_per_second": 1.228, + "step": 61422 + }, + { + "epoch": 2.3350280285196536, + "grad_norm": 0.238543301820755, + "learning_rate": 8.834738715143577e-05, + "loss": 1.4325, + "step": 61430 + }, + { + "epoch": 2.335569969685166, + "grad_norm": 0.47301405668258667, + "learning_rate": 8.833675059030144e-05, + "loss": 1.4422, + "step": 61440 + }, + { + "epoch": 2.3361119108506783, + "grad_norm": 0.23695412278175354, + "learning_rate": 8.832610989955518e-05, + "loss": 1.4361, + "step": 61450 + }, + { + "epoch": 2.3366538520161906, + "grad_norm": 0.23388248682022095, + "learning_rate": 8.831546508051511e-05, + "loss": 1.441, + "step": 61460 + }, + { + "epoch": 2.337195793181703, + "grad_norm": 0.2939755320549011, + "learning_rate": 8.830481613449985e-05, + "loss": 1.4408, + "step": 61470 + }, + { + "epoch": 2.3377377343472148, + "grad_norm": 0.3122168779373169, + "learning_rate": 8.82941630628286e-05, + "loss": 1.4366, + "step": 61480 + }, + { + "epoch": 2.338279675512727, + "grad_norm": 0.4362255036830902, + "learning_rate": 8.828350586682096e-05, + "loss": 1.4309, + "step": 61490 + }, + { + "epoch": 2.3388216166782394, + "grad_norm": 0.3685493469238281, + "learning_rate": 8.82728445477971e-05, + "loss": 1.4369, + "step": 61500 + }, + { + "epoch": 2.3393093637272004, + "eval_loss": 2.454096555709839, + "eval_runtime": 21.9765, + "eval_samples_per_second": 227.516, + "eval_steps_per_second": 1.229, + "step": 61509 + }, + { + "epoch": 2.3393635578437517, + "grad_norm": 0.3012941777706146, + "learning_rate": 8.826217910707774e-05, + "loss": 1.4327, + "step": 61510 + }, + { + "epoch": 2.339905499009264, + "grad_norm": 0.4149247407913208, + "learning_rate": 8.825150954598403e-05, + "loss": 1.4361, + "step": 61520 + }, + { + "epoch": 2.340447440174776, + "grad_norm": 0.17556257545948029, + "learning_rate": 8.824083586583767e-05, + "loss": 1.4273, + "step": 61530 + }, + { + "epoch": 2.3409893813402882, + "grad_norm": 0.42168980836868286, + "learning_rate": 8.823015806796089e-05, + "loss": 1.4306, + "step": 61540 + }, + { + "epoch": 2.3415313225058005, + "grad_norm": 0.29091402888298035, + "learning_rate": 8.82194761536764e-05, + "loss": 1.4425, + "step": 61550 + }, + { + "epoch": 2.342073263671313, + "grad_norm": 0.33587974309921265, + "learning_rate": 8.820879012430742e-05, + "loss": 1.4383, + "step": 61560 + }, + { + "epoch": 2.3426152048368247, + "grad_norm": 0.2704252600669861, + "learning_rate": 8.81980999811777e-05, + "loss": 1.4397, + "step": 61570 + }, + { + "epoch": 2.343157146002337, + "grad_norm": 0.21759971976280212, + "learning_rate": 8.81874057256115e-05, + "loss": 1.4436, + "step": 61580 + }, + { + "epoch": 2.3436990871678494, + "grad_norm": 0.28754401206970215, + "learning_rate": 8.817670735893356e-05, + "loss": 1.4314, + "step": 61590 + }, + { + "epoch": 2.3440242518671566, + "eval_loss": 2.4510679244995117, + "eval_runtime": 21.9791, + "eval_samples_per_second": 227.489, + "eval_steps_per_second": 1.228, + "step": 61596 + }, + { + "epoch": 2.3442410283333617, + "grad_norm": 0.39755532145500183, + "learning_rate": 8.816600488246914e-05, + "loss": 1.441, + "step": 61600 + }, + { + "epoch": 2.3447829694988735, + "grad_norm": 0.27198687195777893, + "learning_rate": 8.815529829754403e-05, + "loss": 1.4492, + "step": 61610 + }, + { + "epoch": 2.345324910664386, + "grad_norm": 0.2784078121185303, + "learning_rate": 8.814458760548452e-05, + "loss": 1.4396, + "step": 61620 + }, + { + "epoch": 2.345866851829898, + "grad_norm": 0.205166295170784, + "learning_rate": 8.81338728076174e-05, + "loss": 1.4402, + "step": 61630 + }, + { + "epoch": 2.3464087929954105, + "grad_norm": 0.23504230380058289, + "learning_rate": 8.812315390526997e-05, + "loss": 1.4417, + "step": 61640 + }, + { + "epoch": 2.346950734160923, + "grad_norm": 0.21687854826450348, + "learning_rate": 8.811243089977002e-05, + "loss": 1.4379, + "step": 61650 + }, + { + "epoch": 2.3474926753264347, + "grad_norm": 0.27661964297294617, + "learning_rate": 8.810170379244591e-05, + "loss": 1.446, + "step": 61660 + }, + { + "epoch": 2.348034616491947, + "grad_norm": 0.23024803400039673, + "learning_rate": 8.809097258462645e-05, + "loss": 1.4403, + "step": 61670 + }, + { + "epoch": 2.3485765576574593, + "grad_norm": 0.4340742230415344, + "learning_rate": 8.808023727764095e-05, + "loss": 1.439, + "step": 61680 + }, + { + "epoch": 2.348739140007113, + "eval_loss": 2.4533681869506836, + "eval_runtime": 21.9799, + "eval_samples_per_second": 227.481, + "eval_steps_per_second": 1.228, + "step": 61683 + }, + { + "epoch": 2.3491184988229716, + "grad_norm": 0.2405279576778412, + "learning_rate": 8.806949787281929e-05, + "loss": 1.444, + "step": 61690 + }, + { + "epoch": 2.349660439988484, + "grad_norm": 0.3794168531894684, + "learning_rate": 8.805875437149182e-05, + "loss": 1.4397, + "step": 61700 + }, + { + "epoch": 2.350202381153996, + "grad_norm": 0.272239625453949, + "learning_rate": 8.804800677498935e-05, + "loss": 1.4343, + "step": 61710 + }, + { + "epoch": 2.350744322319508, + "grad_norm": 0.20025727152824402, + "learning_rate": 8.803725508464332e-05, + "loss": 1.4387, + "step": 61720 + }, + { + "epoch": 2.3512862634850205, + "grad_norm": 0.24284347891807556, + "learning_rate": 8.802649930178553e-05, + "loss": 1.4398, + "step": 61730 + }, + { + "epoch": 2.3518282046505328, + "grad_norm": 0.20169271528720856, + "learning_rate": 8.801573942774842e-05, + "loss": 1.4471, + "step": 61740 + }, + { + "epoch": 2.352370145816045, + "grad_norm": 0.2248934954404831, + "learning_rate": 8.800497546386484e-05, + "loss": 1.4281, + "step": 61750 + }, + { + "epoch": 2.352912086981557, + "grad_norm": 0.2882291376590729, + "learning_rate": 8.79942074114682e-05, + "loss": 1.4445, + "step": 61760 + }, + { + "epoch": 2.3534540281470693, + "grad_norm": 0.544728696346283, + "learning_rate": 8.798343527189238e-05, + "loss": 1.4378, + "step": 61770 + }, + { + "epoch": 2.3534540281470693, + "eval_loss": 2.448328971862793, + "eval_runtime": 21.7136, + "eval_samples_per_second": 230.271, + "eval_steps_per_second": 1.243, + "step": 61770 + }, + { + "epoch": 2.3539959693125816, + "grad_norm": 0.3125400245189667, + "learning_rate": 8.797265904647179e-05, + "loss": 1.4284, + "step": 61780 + }, + { + "epoch": 2.354537910478094, + "grad_norm": 0.3059818744659424, + "learning_rate": 8.796187873654138e-05, + "loss": 1.4296, + "step": 61790 + }, + { + "epoch": 2.355079851643606, + "grad_norm": 0.2530340254306793, + "learning_rate": 8.795109434343652e-05, + "loss": 1.4302, + "step": 61800 + }, + { + "epoch": 2.355621792809118, + "grad_norm": 0.2673911452293396, + "learning_rate": 8.794030586849315e-05, + "loss": 1.4459, + "step": 61810 + }, + { + "epoch": 2.3561637339746304, + "grad_norm": 0.3139052391052246, + "learning_rate": 8.792951331304771e-05, + "loss": 1.4379, + "step": 61820 + }, + { + "epoch": 2.3567056751401427, + "grad_norm": 0.19884108006954193, + "learning_rate": 8.791871667843713e-05, + "loss": 1.4348, + "step": 61830 + }, + { + "epoch": 2.3572476163056546, + "grad_norm": 0.2218913733959198, + "learning_rate": 8.790791596599882e-05, + "loss": 1.4465, + "step": 61840 + }, + { + "epoch": 2.357789557471167, + "grad_norm": 0.18263356387615204, + "learning_rate": 8.789711117707078e-05, + "loss": 1.4378, + "step": 61850 + }, + { + "epoch": 2.3581689162870254, + "eval_loss": 2.4493706226348877, + "eval_runtime": 21.983, + "eval_samples_per_second": 227.448, + "eval_steps_per_second": 1.228, + "step": 61857 + }, + { + "epoch": 2.3583314986366792, + "grad_norm": 0.2500559091567993, + "learning_rate": 8.788630231299142e-05, + "loss": 1.4311, + "step": 61860 + }, + { + "epoch": 2.3588734398021916, + "grad_norm": 0.27540549635887146, + "learning_rate": 8.787548937509971e-05, + "loss": 1.431, + "step": 61870 + }, + { + "epoch": 2.359415380967704, + "grad_norm": 0.23163899779319763, + "learning_rate": 8.786467236473511e-05, + "loss": 1.4425, + "step": 61880 + }, + { + "epoch": 2.3599573221332157, + "grad_norm": 0.19727367162704468, + "learning_rate": 8.785385128323759e-05, + "loss": 1.45, + "step": 61890 + }, + { + "epoch": 2.360499263298728, + "grad_norm": 0.2537407875061035, + "learning_rate": 8.784302613194758e-05, + "loss": 1.4446, + "step": 61900 + }, + { + "epoch": 2.3610412044642404, + "grad_norm": 0.208670973777771, + "learning_rate": 8.783219691220611e-05, + "loss": 1.4393, + "step": 61910 + }, + { + "epoch": 2.3615831456297527, + "grad_norm": 0.18386749923229218, + "learning_rate": 8.782136362535462e-05, + "loss": 1.4468, + "step": 61920 + }, + { + "epoch": 2.362125086795265, + "grad_norm": 0.19878651201725006, + "learning_rate": 8.781052627273512e-05, + "loss": 1.444, + "step": 61930 + }, + { + "epoch": 2.362667027960777, + "grad_norm": 0.27927711606025696, + "learning_rate": 8.779968485569004e-05, + "loss": 1.4331, + "step": 61940 + }, + { + "epoch": 2.362883804426982, + "eval_loss": 2.4520583152770996, + "eval_runtime": 21.9818, + "eval_samples_per_second": 227.461, + "eval_steps_per_second": 1.228, + "step": 61944 + }, + { + "epoch": 2.363208969126289, + "grad_norm": 0.5157132744789124, + "learning_rate": 8.778883937556243e-05, + "loss": 1.4434, + "step": 61950 + }, + { + "epoch": 2.3637509102918015, + "grad_norm": 0.6511501669883728, + "learning_rate": 8.777798983369573e-05, + "loss": 1.4315, + "step": 61960 + }, + { + "epoch": 2.364292851457314, + "grad_norm": 0.21835680305957794, + "learning_rate": 8.776713623143397e-05, + "loss": 1.4358, + "step": 61970 + }, + { + "epoch": 2.3648347926228257, + "grad_norm": 0.1703735738992691, + "learning_rate": 8.775627857012165e-05, + "loss": 1.4404, + "step": 61980 + }, + { + "epoch": 2.365376733788338, + "grad_norm": 0.17746131122112274, + "learning_rate": 8.774541685110373e-05, + "loss": 1.4322, + "step": 61990 + }, + { + "epoch": 2.3659186749538503, + "grad_norm": 0.34184667468070984, + "learning_rate": 8.773455107572574e-05, + "loss": 1.4246, + "step": 62000 + }, + { + "epoch": 2.3664606161193626, + "grad_norm": 0.2798058092594147, + "learning_rate": 8.772368124533369e-05, + "loss": 1.4387, + "step": 62010 + }, + { + "epoch": 2.3670025572848745, + "grad_norm": 0.40121662616729736, + "learning_rate": 8.771280736127407e-05, + "loss": 1.4394, + "step": 62020 + }, + { + "epoch": 2.367544498450387, + "grad_norm": 0.26767289638519287, + "learning_rate": 8.770192942489392e-05, + "loss": 1.4371, + "step": 62030 + }, + { + "epoch": 2.367598692566938, + "eval_loss": 2.441128969192505, + "eval_runtime": 21.982, + "eval_samples_per_second": 227.459, + "eval_steps_per_second": 1.228, + "step": 62031 + }, + { + "epoch": 2.368086439615899, + "grad_norm": 0.32896319031715393, + "learning_rate": 8.76910474375407e-05, + "loss": 1.4328, + "step": 62040 + }, + { + "epoch": 2.3686283807814115, + "grad_norm": 0.4783940017223358, + "learning_rate": 8.768016140056247e-05, + "loss": 1.4453, + "step": 62050 + }, + { + "epoch": 2.369170321946924, + "grad_norm": 0.29626142978668213, + "learning_rate": 8.766927131530774e-05, + "loss": 1.4437, + "step": 62060 + }, + { + "epoch": 2.3697122631124357, + "grad_norm": 0.31796202063560486, + "learning_rate": 8.765837718312549e-05, + "loss": 1.4328, + "step": 62070 + }, + { + "epoch": 2.370254204277948, + "grad_norm": 0.6041905283927917, + "learning_rate": 8.764747900536529e-05, + "loss": 1.4374, + "step": 62080 + }, + { + "epoch": 2.3707961454434603, + "grad_norm": 0.2997055649757385, + "learning_rate": 8.76365767833771e-05, + "loss": 1.4417, + "step": 62090 + }, + { + "epoch": 2.3713380866089726, + "grad_norm": 0.23351405560970306, + "learning_rate": 8.76256705185115e-05, + "loss": 1.4327, + "step": 62100 + }, + { + "epoch": 2.371880027774485, + "grad_norm": 0.18303707242012024, + "learning_rate": 8.761476021211947e-05, + "loss": 1.4324, + "step": 62110 + }, + { + "epoch": 2.3723135807068947, + "eval_loss": 2.431164503097534, + "eval_runtime": 21.98, + "eval_samples_per_second": 227.48, + "eval_steps_per_second": 1.228, + "step": 62118 + }, + { + "epoch": 2.372421968939997, + "grad_norm": 0.17883995175361633, + "learning_rate": 8.760384586555255e-05, + "loss": 1.4199, + "step": 62120 + }, + { + "epoch": 2.372963910105509, + "grad_norm": 0.1785135418176651, + "learning_rate": 8.759292748016275e-05, + "loss": 1.4307, + "step": 62130 + }, + { + "epoch": 2.3735058512710214, + "grad_norm": 0.3247034549713135, + "learning_rate": 8.75820050573026e-05, + "loss": 1.4367, + "step": 62140 + }, + { + "epoch": 2.3740477924365337, + "grad_norm": 0.2257276326417923, + "learning_rate": 8.757107859832512e-05, + "loss": 1.4524, + "step": 62150 + }, + { + "epoch": 2.374589733602046, + "grad_norm": 0.34088021516799927, + "learning_rate": 8.756014810458384e-05, + "loss": 1.4359, + "step": 62160 + }, + { + "epoch": 2.375131674767558, + "grad_norm": 0.2236095666885376, + "learning_rate": 8.754921357743277e-05, + "loss": 1.439, + "step": 62170 + }, + { + "epoch": 2.3756736159330702, + "grad_norm": 0.2399052530527115, + "learning_rate": 8.753827501822643e-05, + "loss": 1.4326, + "step": 62180 + }, + { + "epoch": 2.3762155570985826, + "grad_norm": 0.18329595029354095, + "learning_rate": 8.752733242831985e-05, + "loss": 1.4373, + "step": 62190 + }, + { + "epoch": 2.376757498264095, + "grad_norm": 0.2353314459323883, + "learning_rate": 8.751638580906856e-05, + "loss": 1.441, + "step": 62200 + }, + { + "epoch": 2.377028468846851, + "eval_loss": 2.4490318298339844, + "eval_runtime": 21.9823, + "eval_samples_per_second": 227.456, + "eval_steps_per_second": 1.228, + "step": 62205 + }, + { + "epoch": 2.3772994394296068, + "grad_norm": 0.30364421010017395, + "learning_rate": 8.750543516182855e-05, + "loss": 1.4362, + "step": 62210 + }, + { + "epoch": 2.377841380595119, + "grad_norm": 0.1798836588859558, + "learning_rate": 8.749448048795637e-05, + "loss": 1.4259, + "step": 62220 + }, + { + "epoch": 2.3783833217606314, + "grad_norm": 0.24086004495620728, + "learning_rate": 8.748352178880902e-05, + "loss": 1.4362, + "step": 62230 + }, + { + "epoch": 2.3789252629261437, + "grad_norm": 0.23006141185760498, + "learning_rate": 8.747255906574402e-05, + "loss": 1.4397, + "step": 62240 + }, + { + "epoch": 2.3794672040916556, + "grad_norm": 0.23331311345100403, + "learning_rate": 8.74615923201194e-05, + "loss": 1.4479, + "step": 62250 + }, + { + "epoch": 2.380009145257168, + "grad_norm": 0.2259208709001541, + "learning_rate": 8.745062155329363e-05, + "loss": 1.4515, + "step": 62260 + }, + { + "epoch": 2.38055108642268, + "grad_norm": 0.3277144730091095, + "learning_rate": 8.743964676662576e-05, + "loss": 1.4405, + "step": 62270 + }, + { + "epoch": 2.3810930275881925, + "grad_norm": 0.25958386063575745, + "learning_rate": 8.742866796147528e-05, + "loss": 1.4254, + "step": 62280 + }, + { + "epoch": 2.381634968753705, + "grad_norm": 0.2229194939136505, + "learning_rate": 8.741768513920222e-05, + "loss": 1.426, + "step": 62290 + }, + { + "epoch": 2.381743356986807, + "eval_loss": 2.435068368911743, + "eval_runtime": 21.9789, + "eval_samples_per_second": 227.491, + "eval_steps_per_second": 1.228, + "step": 62292 + }, + { + "epoch": 2.3821769099192167, + "grad_norm": 0.4171644449234009, + "learning_rate": 8.740669830116706e-05, + "loss": 1.4468, + "step": 62300 + }, + { + "epoch": 2.382718851084729, + "grad_norm": 0.17529958486557007, + "learning_rate": 8.739570744873081e-05, + "loss": 1.4307, + "step": 62310 + }, + { + "epoch": 2.3832607922502413, + "grad_norm": 0.25872156023979187, + "learning_rate": 8.738471258325499e-05, + "loss": 1.4456, + "step": 62320 + }, + { + "epoch": 2.3838027334157537, + "grad_norm": 0.18120713531970978, + "learning_rate": 8.737371370610156e-05, + "loss": 1.4495, + "step": 62330 + }, + { + "epoch": 2.384344674581266, + "grad_norm": 0.3255040645599365, + "learning_rate": 8.736271081863302e-05, + "loss": 1.4378, + "step": 62340 + }, + { + "epoch": 2.384886615746778, + "grad_norm": 0.22683602571487427, + "learning_rate": 8.735170392221237e-05, + "loss": 1.4417, + "step": 62350 + }, + { + "epoch": 2.38542855691229, + "grad_norm": 0.5052164196968079, + "learning_rate": 8.73406930182031e-05, + "loss": 1.4369, + "step": 62360 + }, + { + "epoch": 2.3859704980778025, + "grad_norm": 0.27900445461273193, + "learning_rate": 8.732967810796918e-05, + "loss": 1.4283, + "step": 62370 + }, + { + "epoch": 2.3864582451267635, + "eval_loss": 2.445622682571411, + "eval_runtime": 21.9893, + "eval_samples_per_second": 227.383, + "eval_steps_per_second": 1.228, + "step": 62379 + }, + { + "epoch": 2.386512439243315, + "grad_norm": 0.30575114488601685, + "learning_rate": 8.73186591928751e-05, + "loss": 1.438, + "step": 62380 + }, + { + "epoch": 2.387054380408827, + "grad_norm": 0.2561473846435547, + "learning_rate": 8.730763627428585e-05, + "loss": 1.4324, + "step": 62390 + }, + { + "epoch": 2.387596321574339, + "grad_norm": 0.2688450515270233, + "learning_rate": 8.729660935356686e-05, + "loss": 1.4184, + "step": 62400 + }, + { + "epoch": 2.3881382627398513, + "grad_norm": 0.3544863164424896, + "learning_rate": 8.728557843208413e-05, + "loss": 1.4436, + "step": 62410 + }, + { + "epoch": 2.3886802039053636, + "grad_norm": 0.30758488178253174, + "learning_rate": 8.727454351120413e-05, + "loss": 1.427, + "step": 62420 + }, + { + "epoch": 2.389222145070876, + "grad_norm": 0.41708388924598694, + "learning_rate": 8.726350459229379e-05, + "loss": 1.4349, + "step": 62430 + }, + { + "epoch": 2.389764086236388, + "grad_norm": 0.5974102020263672, + "learning_rate": 8.725246167672057e-05, + "loss": 1.4272, + "step": 62440 + }, + { + "epoch": 2.3903060274019, + "grad_norm": 0.19719624519348145, + "learning_rate": 8.724141476585244e-05, + "loss": 1.4359, + "step": 62450 + }, + { + "epoch": 2.3908479685674124, + "grad_norm": 0.3106327950954437, + "learning_rate": 8.723036386105781e-05, + "loss": 1.4333, + "step": 62460 + }, + { + "epoch": 2.3911731332667197, + "eval_loss": 2.4473304748535156, + "eval_runtime": 21.9738, + "eval_samples_per_second": 227.544, + "eval_steps_per_second": 1.229, + "step": 62466 + }, + { + "epoch": 2.3913899097329248, + "grad_norm": 0.2692268192768097, + "learning_rate": 8.721930896370563e-05, + "loss": 1.421, + "step": 62470 + }, + { + "epoch": 2.3919318508984366, + "grad_norm": 0.2162933647632599, + "learning_rate": 8.720825007516535e-05, + "loss": 1.4401, + "step": 62480 + }, + { + "epoch": 2.392473792063949, + "grad_norm": 0.3203500509262085, + "learning_rate": 8.719718719680687e-05, + "loss": 1.4334, + "step": 62490 + }, + { + "epoch": 2.3930157332294613, + "grad_norm": 0.3285951018333435, + "learning_rate": 8.718612033000066e-05, + "loss": 1.4321, + "step": 62500 + }, + { + "epoch": 2.3935576743949736, + "grad_norm": 0.4296458065509796, + "learning_rate": 8.717504947611757e-05, + "loss": 1.4345, + "step": 62510 + }, + { + "epoch": 2.394099615560486, + "grad_norm": 0.1817467212677002, + "learning_rate": 8.716397463652907e-05, + "loss": 1.4386, + "step": 62520 + }, + { + "epoch": 2.3946415567259978, + "grad_norm": 0.3981388807296753, + "learning_rate": 8.7152895812607e-05, + "loss": 1.4317, + "step": 62530 + }, + { + "epoch": 2.39518349789151, + "grad_norm": 0.2751355767250061, + "learning_rate": 8.71418130057238e-05, + "loss": 1.4464, + "step": 62540 + }, + { + "epoch": 2.3957254390570224, + "grad_norm": 0.2315666377544403, + "learning_rate": 8.713072621725235e-05, + "loss": 1.4424, + "step": 62550 + }, + { + "epoch": 2.3958880214066762, + "eval_loss": 2.4334030151367188, + "eval_runtime": 22.0386, + "eval_samples_per_second": 226.874, + "eval_steps_per_second": 1.225, + "step": 62553 + }, + { + "epoch": 2.3962673802225347, + "grad_norm": 0.2237350344657898, + "learning_rate": 8.711963544856606e-05, + "loss": 1.4372, + "step": 62560 + }, + { + "epoch": 2.396809321388047, + "grad_norm": 0.17813153564929962, + "learning_rate": 8.710854070103876e-05, + "loss": 1.4269, + "step": 62570 + }, + { + "epoch": 2.397351262553559, + "grad_norm": 0.2021496742963791, + "learning_rate": 8.709744197604483e-05, + "loss": 1.4307, + "step": 62580 + }, + { + "epoch": 2.397893203719071, + "grad_norm": 0.1962280124425888, + "learning_rate": 8.708633927495916e-05, + "loss": 1.4401, + "step": 62590 + }, + { + "epoch": 2.3984351448845835, + "grad_norm": 0.29893505573272705, + "learning_rate": 8.707523259915707e-05, + "loss": 1.4374, + "step": 62600 + }, + { + "epoch": 2.398977086050096, + "grad_norm": 0.24217230081558228, + "learning_rate": 8.706412195001444e-05, + "loss": 1.4346, + "step": 62610 + }, + { + "epoch": 2.3995190272156077, + "grad_norm": 0.24626600742340088, + "learning_rate": 8.705300732890756e-05, + "loss": 1.4344, + "step": 62620 + }, + { + "epoch": 2.40006096838112, + "grad_norm": 0.17191511392593384, + "learning_rate": 8.704188873721332e-05, + "loss": 1.443, + "step": 62630 + }, + { + "epoch": 2.4006029095466324, + "grad_norm": 0.20966501533985138, + "learning_rate": 8.703076617630901e-05, + "loss": 1.4345, + "step": 62640 + }, + { + "epoch": 2.4006029095466324, + "eval_loss": 2.4349613189697266, + "eval_runtime": 21.971, + "eval_samples_per_second": 227.572, + "eval_steps_per_second": 1.229, + "step": 62640 + }, + { + "epoch": 2.4011448507121447, + "grad_norm": 0.190885990858078, + "learning_rate": 8.701963964757245e-05, + "loss": 1.4334, + "step": 62650 + }, + { + "epoch": 2.4016867918776565, + "grad_norm": 0.26069727540016174, + "learning_rate": 8.700850915238195e-05, + "loss": 1.4388, + "step": 62660 + }, + { + "epoch": 2.402228733043169, + "grad_norm": 0.33938780426979065, + "learning_rate": 8.699737469211629e-05, + "loss": 1.4331, + "step": 62670 + }, + { + "epoch": 2.402770674208681, + "grad_norm": 0.23580268025398254, + "learning_rate": 8.698623626815478e-05, + "loss": 1.4318, + "step": 62680 + }, + { + "epoch": 2.4033126153741935, + "grad_norm": 0.23255738615989685, + "learning_rate": 8.697509388187721e-05, + "loss": 1.4455, + "step": 62690 + }, + { + "epoch": 2.403854556539706, + "grad_norm": 0.3351677656173706, + "learning_rate": 8.696394753466381e-05, + "loss": 1.4373, + "step": 62700 + }, + { + "epoch": 2.4043964977052177, + "grad_norm": 0.18452972173690796, + "learning_rate": 8.695279722789536e-05, + "loss": 1.4417, + "step": 62710 + }, + { + "epoch": 2.40493843887073, + "grad_norm": 0.21630007028579712, + "learning_rate": 8.694164296295311e-05, + "loss": 1.434, + "step": 62720 + }, + { + "epoch": 2.4053177976865885, + "eval_loss": 2.435746908187866, + "eval_runtime": 22.0344, + "eval_samples_per_second": 226.918, + "eval_steps_per_second": 1.225, + "step": 62727 + }, + { + "epoch": 2.4054803800362423, + "grad_norm": 0.3593086004257202, + "learning_rate": 8.693048474121883e-05, + "loss": 1.4383, + "step": 62730 + }, + { + "epoch": 2.4060223212017546, + "grad_norm": 0.4216225743293762, + "learning_rate": 8.69193225640747e-05, + "loss": 1.4458, + "step": 62740 + }, + { + "epoch": 2.406564262367267, + "grad_norm": 0.2987191379070282, + "learning_rate": 8.690815643290348e-05, + "loss": 1.4488, + "step": 62750 + }, + { + "epoch": 2.407106203532779, + "grad_norm": 0.2709275186061859, + "learning_rate": 8.68969863490884e-05, + "loss": 1.4483, + "step": 62760 + }, + { + "epoch": 2.407648144698291, + "grad_norm": 0.17814864218235016, + "learning_rate": 8.68858123140131e-05, + "loss": 1.4451, + "step": 62770 + }, + { + "epoch": 2.4081900858638035, + "grad_norm": 0.24348965287208557, + "learning_rate": 8.687463432906182e-05, + "loss": 1.4463, + "step": 62780 + }, + { + "epoch": 2.4087320270293158, + "grad_norm": 0.21512000262737274, + "learning_rate": 8.686345239561921e-05, + "loss": 1.4508, + "step": 62790 + }, + { + "epoch": 2.409273968194828, + "grad_norm": 0.3042374551296234, + "learning_rate": 8.685226651507047e-05, + "loss": 1.4606, + "step": 62800 + }, + { + "epoch": 2.40981590936034, + "grad_norm": 0.22630225121974945, + "learning_rate": 8.684107668880124e-05, + "loss": 1.4499, + "step": 62810 + }, + { + "epoch": 2.410032685826545, + "eval_loss": 2.45141339302063, + "eval_runtime": 21.9807, + "eval_samples_per_second": 227.472, + "eval_steps_per_second": 1.228, + "step": 62814 + }, + { + "epoch": 2.4103578505258523, + "grad_norm": 0.22491927444934845, + "learning_rate": 8.682988291819766e-05, + "loss": 1.4527, + "step": 62820 + }, + { + "epoch": 2.4108997916913646, + "grad_norm": 0.18473847210407257, + "learning_rate": 8.68186852046464e-05, + "loss": 1.4372, + "step": 62830 + }, + { + "epoch": 2.411441732856877, + "grad_norm": 0.18476863205432892, + "learning_rate": 8.680748354953454e-05, + "loss": 1.4481, + "step": 62840 + }, + { + "epoch": 2.411983674022389, + "grad_norm": 0.20057182013988495, + "learning_rate": 8.679627795424973e-05, + "loss": 1.442, + "step": 62850 + }, + { + "epoch": 2.412525615187901, + "grad_norm": 0.2545701563358307, + "learning_rate": 8.678506842018002e-05, + "loss": 1.4425, + "step": 62860 + }, + { + "epoch": 2.4130675563534134, + "grad_norm": 0.18435277044773102, + "learning_rate": 8.677385494871406e-05, + "loss": 1.4428, + "step": 62870 + }, + { + "epoch": 2.4136094975189257, + "grad_norm": 0.3235138952732086, + "learning_rate": 8.676263754124089e-05, + "loss": 1.4548, + "step": 62880 + }, + { + "epoch": 2.4141514386844376, + "grad_norm": 0.36199185252189636, + "learning_rate": 8.675141619915008e-05, + "loss": 1.4421, + "step": 62890 + }, + { + "epoch": 2.41469337984995, + "grad_norm": 0.19079086184501648, + "learning_rate": 8.674019092383168e-05, + "loss": 1.4568, + "step": 62900 + }, + { + "epoch": 2.414747573966501, + "eval_loss": 2.44244122505188, + "eval_runtime": 21.9822, + "eval_samples_per_second": 227.456, + "eval_steps_per_second": 1.228, + "step": 62901 + }, + { + "epoch": 2.4152353210154622, + "grad_norm": 0.20537631213665009, + "learning_rate": 8.672896171667623e-05, + "loss": 1.454, + "step": 62910 + }, + { + "epoch": 2.4157772621809745, + "grad_norm": 0.1923794001340866, + "learning_rate": 8.671772857907476e-05, + "loss": 1.444, + "step": 62920 + }, + { + "epoch": 2.416319203346487, + "grad_norm": 0.3296695649623871, + "learning_rate": 8.670649151241876e-05, + "loss": 1.4404, + "step": 62930 + }, + { + "epoch": 2.4168611445119987, + "grad_norm": 0.4826582670211792, + "learning_rate": 8.669525051810028e-05, + "loss": 1.4398, + "step": 62940 + }, + { + "epoch": 2.417403085677511, + "grad_norm": 0.23408730328083038, + "learning_rate": 8.668400559751175e-05, + "loss": 1.4405, + "step": 62950 + }, + { + "epoch": 2.4179450268430234, + "grad_norm": 0.461852103471756, + "learning_rate": 8.667275675204617e-05, + "loss": 1.4522, + "step": 62960 + }, + { + "epoch": 2.4184869680085357, + "grad_norm": 0.22658464312553406, + "learning_rate": 8.6661503983097e-05, + "loss": 1.4552, + "step": 62970 + }, + { + "epoch": 2.419028909174048, + "grad_norm": 0.1908838152885437, + "learning_rate": 8.665024729205816e-05, + "loss": 1.4352, + "step": 62980 + }, + { + "epoch": 2.4194624621064573, + "eval_loss": 2.4345483779907227, + "eval_runtime": 21.9764, + "eval_samples_per_second": 227.517, + "eval_steps_per_second": 1.229, + "step": 62988 + }, + { + "epoch": 2.41957085033956, + "grad_norm": 0.24472688138484955, + "learning_rate": 8.663898668032412e-05, + "loss": 1.4473, + "step": 62990 + }, + { + "epoch": 2.420112791505072, + "grad_norm": 0.2447074055671692, + "learning_rate": 8.662772214928976e-05, + "loss": 1.4536, + "step": 63000 + }, + { + "epoch": 3.0005419411655123, + "grad_norm": 0.16394081711769104, + "learning_rate": 8.661645370035048e-05, + "loss": 1.4525, + "step": 63010 + }, + { + "epoch": 3.0010838823310246, + "grad_norm": 0.2631014585494995, + "learning_rate": 8.660518133490221e-05, + "loss": 1.4492, + "step": 63020 + }, + { + "epoch": 3.0016258234965365, + "grad_norm": 0.1863064467906952, + "learning_rate": 8.659390505434127e-05, + "loss": 1.4496, + "step": 63030 + }, + { + "epoch": 3.002167764662049, + "grad_norm": 0.1799326092004776, + "learning_rate": 8.658262486006455e-05, + "loss": 1.4565, + "step": 63040 + }, + { + "epoch": 3.002709705827561, + "grad_norm": 0.18670876324176788, + "learning_rate": 8.657134075346938e-05, + "loss": 1.4478, + "step": 63050 + }, + { + "epoch": 3.0032516469930735, + "grad_norm": 0.23742614686489105, + "learning_rate": 8.65600527359536e-05, + "loss": 1.4542, + "step": 63060 + }, + { + "epoch": 3.0037935881585853, + "grad_norm": 0.21548855304718018, + "learning_rate": 8.654876080891547e-05, + "loss": 1.4522, + "step": 63070 + }, + { + "epoch": 3.0040645587413417, + "eval_loss": 2.447054624557495, + "eval_runtime": 22.3909, + "eval_samples_per_second": 223.305, + "eval_steps_per_second": 1.206, + "step": 63075 + }, + { + "epoch": 3.0043355293240976, + "grad_norm": 0.20155511796474457, + "learning_rate": 8.653746497375385e-05, + "loss": 1.4611, + "step": 63080 + }, + { + "epoch": 3.00487747048961, + "grad_norm": 0.21091540157794952, + "learning_rate": 8.652616523186797e-05, + "loss": 1.444, + "step": 63090 + }, + { + "epoch": 3.0054194116551223, + "grad_norm": 0.20185105502605438, + "learning_rate": 8.651486158465764e-05, + "loss": 1.453, + "step": 63100 + }, + { + "epoch": 3.0059613528206346, + "grad_norm": 0.21705321967601776, + "learning_rate": 8.650355403352307e-05, + "loss": 1.4463, + "step": 63110 + }, + { + "epoch": 3.0065032939861465, + "grad_norm": 0.22515122592449188, + "learning_rate": 8.649224257986499e-05, + "loss": 1.4393, + "step": 63120 + }, + { + "epoch": 3.007045235151659, + "grad_norm": 0.2835371494293213, + "learning_rate": 8.648092722508463e-05, + "loss": 1.449, + "step": 63130 + }, + { + "epoch": 3.007587176317171, + "grad_norm": 0.36949828267097473, + "learning_rate": 8.64696079705837e-05, + "loss": 1.4525, + "step": 63140 + }, + { + "epoch": 3.0081291174826834, + "grad_norm": 0.23466630280017853, + "learning_rate": 8.645828481776434e-05, + "loss": 1.4465, + "step": 63150 + }, + { + "epoch": 3.0086710586481953, + "grad_norm": 0.2540377676486969, + "learning_rate": 8.644695776802925e-05, + "loss": 1.454, + "step": 63160 + }, + { + "epoch": 3.008779446881298, + "eval_loss": 2.4413795471191406, + "eval_runtime": 21.9823, + "eval_samples_per_second": 227.456, + "eval_steps_per_second": 1.228, + "step": 63162 + }, + { + "epoch": 3.0092129998137076, + "grad_norm": 0.3684237599372864, + "learning_rate": 8.643562682278154e-05, + "loss": 1.4465, + "step": 63170 + }, + { + "epoch": 3.00975494097922, + "grad_norm": 0.1901058703660965, + "learning_rate": 8.642429198342488e-05, + "loss": 1.4547, + "step": 63180 + }, + { + "epoch": 3.0102968821447322, + "grad_norm": 0.18713931739330292, + "learning_rate": 8.641295325136336e-05, + "loss": 1.441, + "step": 63190 + }, + { + "epoch": 3.0108388233102445, + "grad_norm": 0.18807746469974518, + "learning_rate": 8.640161062800155e-05, + "loss": 1.4493, + "step": 63200 + }, + { + "epoch": 3.0113807644757564, + "grad_norm": 0.19221310317516327, + "learning_rate": 8.639026411474457e-05, + "loss": 1.44, + "step": 63210 + }, + { + "epoch": 3.0119227056412687, + "grad_norm": 0.2589902877807617, + "learning_rate": 8.637891371299796e-05, + "loss": 1.4451, + "step": 63220 + }, + { + "epoch": 3.012464646806781, + "grad_norm": 0.22735221683979034, + "learning_rate": 8.636755942416774e-05, + "loss": 1.4497, + "step": 63230 + }, + { + "epoch": 3.0130065879722934, + "grad_norm": 0.17304742336273193, + "learning_rate": 8.635620124966043e-05, + "loss": 1.4586, + "step": 63240 + }, + { + "epoch": 3.0134943350212544, + "eval_loss": 2.4555513858795166, + "eval_runtime": 21.9864, + "eval_samples_per_second": 227.414, + "eval_steps_per_second": 1.228, + "step": 63249 + }, + { + "epoch": 3.0135485291378057, + "grad_norm": 0.19638432562351227, + "learning_rate": 8.634483919088306e-05, + "loss": 1.4583, + "step": 63250 + }, + { + "epoch": 3.0140904703033176, + "grad_norm": 0.3353283703327179, + "learning_rate": 8.633347324924309e-05, + "loss": 1.4459, + "step": 63260 + }, + { + "epoch": 3.01463241146883, + "grad_norm": 0.23562084138393402, + "learning_rate": 8.63221034261485e-05, + "loss": 1.449, + "step": 63270 + }, + { + "epoch": 3.015174352634342, + "grad_norm": 0.23831287026405334, + "learning_rate": 8.63107297230077e-05, + "loss": 1.4461, + "step": 63280 + }, + { + "epoch": 3.0157162937998545, + "grad_norm": 0.2882630228996277, + "learning_rate": 8.629935214122968e-05, + "loss": 1.453, + "step": 63290 + }, + { + "epoch": 3.0162582349653664, + "grad_norm": 0.26988014578819275, + "learning_rate": 8.628797068222378e-05, + "loss": 1.451, + "step": 63300 + }, + { + "epoch": 3.0168001761308787, + "grad_norm": 0.2622480094432831, + "learning_rate": 8.627658534739992e-05, + "loss": 1.4587, + "step": 63310 + }, + { + "epoch": 3.017342117296391, + "grad_norm": 0.2087298184633255, + "learning_rate": 8.626519613816844e-05, + "loss": 1.4502, + "step": 63320 + }, + { + "epoch": 3.0178840584619033, + "grad_norm": 0.20758210122585297, + "learning_rate": 8.62538030559402e-05, + "loss": 1.4498, + "step": 63330 + }, + { + "epoch": 3.0182092231612105, + "eval_loss": 2.4477882385253906, + "eval_runtime": 21.9833, + "eval_samples_per_second": 227.445, + "eval_steps_per_second": 1.228, + "step": 63336 + }, + { + "epoch": 3.0184259996274156, + "grad_norm": 0.1754673719406128, + "learning_rate": 8.624240610212656e-05, + "loss": 1.4539, + "step": 63340 + }, + { + "epoch": 3.0189679407929275, + "grad_norm": 0.15522533655166626, + "learning_rate": 8.623100527813928e-05, + "loss": 1.4482, + "step": 63350 + }, + { + "epoch": 3.01950988195844, + "grad_norm": 0.22570842504501343, + "learning_rate": 8.621960058539062e-05, + "loss": 1.4564, + "step": 63360 + }, + { + "epoch": 3.020051823123952, + "grad_norm": 0.20296502113342285, + "learning_rate": 8.620819202529342e-05, + "loss": 1.4586, + "step": 63370 + }, + { + "epoch": 3.0205937642894645, + "grad_norm": 0.2882785201072693, + "learning_rate": 8.619677959926089e-05, + "loss": 1.4478, + "step": 63380 + }, + { + "epoch": 3.0211357054549763, + "grad_norm": 0.3313170075416565, + "learning_rate": 8.618536330870673e-05, + "loss": 1.4577, + "step": 63390 + }, + { + "epoch": 3.0216776466204887, + "grad_norm": 0.19241797924041748, + "learning_rate": 8.617394315504516e-05, + "loss": 1.4574, + "step": 63400 + }, + { + "epoch": 3.022219587786001, + "grad_norm": 0.32396382093429565, + "learning_rate": 8.616251913969085e-05, + "loss": 1.4519, + "step": 63410 + }, + { + "epoch": 3.0227615289515133, + "grad_norm": 0.1714586317539215, + "learning_rate": 8.615109126405897e-05, + "loss": 1.4567, + "step": 63420 + }, + { + "epoch": 3.0229241113011667, + "eval_loss": 2.4477341175079346, + "eval_runtime": 21.9816, + "eval_samples_per_second": 227.463, + "eval_steps_per_second": 1.228, + "step": 63423 + }, + { + "epoch": 3.0233034701170256, + "grad_norm": 0.1809176504611969, + "learning_rate": 8.613965952956515e-05, + "loss": 1.446, + "step": 63430 + }, + { + "epoch": 3.0238454112825375, + "grad_norm": 0.20341655611991882, + "learning_rate": 8.61282239376255e-05, + "loss": 1.4551, + "step": 63440 + }, + { + "epoch": 3.02438735244805, + "grad_norm": 0.1895596832036972, + "learning_rate": 8.611678448965661e-05, + "loss": 1.4487, + "step": 63450 + }, + { + "epoch": 3.024929293613562, + "grad_norm": 0.2253957986831665, + "learning_rate": 8.610534118707556e-05, + "loss": 1.4559, + "step": 63460 + }, + { + "epoch": 3.0254712347790744, + "grad_norm": 0.2183631956577301, + "learning_rate": 8.609389403129988e-05, + "loss": 1.4415, + "step": 63470 + }, + { + "epoch": 3.0260131759445863, + "grad_norm": 0.3052237629890442, + "learning_rate": 8.608244302374762e-05, + "loss": 1.4455, + "step": 63480 + }, + { + "epoch": 3.0265551171100986, + "grad_norm": 0.28673261404037476, + "learning_rate": 8.607098816583725e-05, + "loss": 1.4617, + "step": 63490 + }, + { + "epoch": 3.027097058275611, + "grad_norm": 0.20791488885879517, + "learning_rate": 8.605952945898777e-05, + "loss": 1.4532, + "step": 63500 + }, + { + "epoch": 3.0276389994411232, + "grad_norm": 0.34028980135917664, + "learning_rate": 8.604806690461863e-05, + "loss": 1.4535, + "step": 63510 + }, + { + "epoch": 3.0276389994411232, + "eval_loss": 2.4497005939483643, + "eval_runtime": 21.9833, + "eval_samples_per_second": 227.446, + "eval_steps_per_second": 1.228, + "step": 63510 + }, + { + "epoch": 3.0281809406066356, + "grad_norm": 0.22710201144218445, + "learning_rate": 8.603660050414974e-05, + "loss": 1.4425, + "step": 63520 + }, + { + "epoch": 3.0287228817721474, + "grad_norm": 0.17469532787799835, + "learning_rate": 8.602513025900155e-05, + "loss": 1.4445, + "step": 63530 + }, + { + "epoch": 3.0292648229376598, + "grad_norm": 0.21239697933197021, + "learning_rate": 8.601365617059491e-05, + "loss": 1.4371, + "step": 63540 + }, + { + "epoch": 3.029806764103172, + "grad_norm": 0.1866355538368225, + "learning_rate": 8.60021782403512e-05, + "loss": 1.4426, + "step": 63550 + }, + { + "epoch": 3.0303487052686844, + "grad_norm": 0.175773486495018, + "learning_rate": 8.599069646969223e-05, + "loss": 1.4477, + "step": 63560 + }, + { + "epoch": 3.0308906464341967, + "grad_norm": 0.2518947124481201, + "learning_rate": 8.597921086004035e-05, + "loss": 1.4465, + "step": 63570 + }, + { + "epoch": 3.0314325875997086, + "grad_norm": 0.3670835494995117, + "learning_rate": 8.596772141281833e-05, + "loss": 1.4487, + "step": 63580 + }, + { + "epoch": 3.031974528765221, + "grad_norm": 0.3510816991329193, + "learning_rate": 8.59562281294494e-05, + "loss": 1.4518, + "step": 63590 + }, + { + "epoch": 3.0323538875810794, + "eval_loss": 2.4546306133270264, + "eval_runtime": 22.1411, + "eval_samples_per_second": 225.824, + "eval_steps_per_second": 1.219, + "step": 63597 + }, + { + "epoch": 3.032516469930733, + "grad_norm": 0.3301527500152588, + "learning_rate": 8.594473101135734e-05, + "loss": 1.4655, + "step": 63600 + }, + { + "epoch": 3.0330584110962455, + "grad_norm": 0.2271365225315094, + "learning_rate": 8.593323005996638e-05, + "loss": 1.4479, + "step": 63610 + }, + { + "epoch": 3.0336003522617574, + "grad_norm": 0.3536682724952698, + "learning_rate": 8.592172527670114e-05, + "loss": 1.4523, + "step": 63620 + }, + { + "epoch": 3.0341422934272697, + "grad_norm": 0.4474346935749054, + "learning_rate": 8.591021666298684e-05, + "loss": 1.4426, + "step": 63630 + }, + { + "epoch": 3.034684234592782, + "grad_norm": 0.41748547554016113, + "learning_rate": 8.589870422024909e-05, + "loss": 1.4496, + "step": 63640 + }, + { + "epoch": 3.0352261757582943, + "grad_norm": 0.39396533370018005, + "learning_rate": 8.5887187949914e-05, + "loss": 1.4458, + "step": 63650 + }, + { + "epoch": 3.0357681169238067, + "grad_norm": 0.30107173323631287, + "learning_rate": 8.58756678534082e-05, + "loss": 1.4475, + "step": 63660 + }, + { + "epoch": 3.0363100580893185, + "grad_norm": 0.21877449750900269, + "learning_rate": 8.586414393215869e-05, + "loss": 1.4491, + "step": 63670 + }, + { + "epoch": 3.036851999254831, + "grad_norm": 0.3609618842601776, + "learning_rate": 8.585261618759303e-05, + "loss": 1.4532, + "step": 63680 + }, + { + "epoch": 3.037068775721036, + "eval_loss": 2.4435439109802246, + "eval_runtime": 21.984, + "eval_samples_per_second": 227.438, + "eval_steps_per_second": 1.228, + "step": 63684 + }, + { + "epoch": 3.037393940420343, + "grad_norm": 0.21839331090450287, + "learning_rate": 8.584108462113922e-05, + "loss": 1.4429, + "step": 63690 + }, + { + "epoch": 3.0379358815858555, + "grad_norm": 0.24262754619121552, + "learning_rate": 8.582954923422578e-05, + "loss": 1.4498, + "step": 63700 + }, + { + "epoch": 3.0384778227513674, + "grad_norm": 0.18923676013946533, + "learning_rate": 8.581801002828159e-05, + "loss": 1.4449, + "step": 63710 + }, + { + "epoch": 3.0390197639168797, + "grad_norm": 0.2811407446861267, + "learning_rate": 8.580646700473614e-05, + "loss": 1.4465, + "step": 63720 + }, + { + "epoch": 3.039561705082392, + "grad_norm": 0.2581160366535187, + "learning_rate": 8.579492016501929e-05, + "loss": 1.4538, + "step": 63730 + }, + { + "epoch": 3.0401036462479043, + "grad_norm": 0.4067544639110565, + "learning_rate": 8.578336951056145e-05, + "loss": 1.4401, + "step": 63740 + }, + { + "epoch": 3.0406455874134166, + "grad_norm": 0.2783118784427643, + "learning_rate": 8.577181504279342e-05, + "loss": 1.4468, + "step": 63750 + }, + { + "epoch": 3.0411875285789285, + "grad_norm": 0.2964227795600891, + "learning_rate": 8.576025676314654e-05, + "loss": 1.4491, + "step": 63760 + }, + { + "epoch": 3.041729469744441, + "grad_norm": 0.2709335684776306, + "learning_rate": 8.57486946730526e-05, + "loss": 1.4501, + "step": 63770 + }, + { + "epoch": 3.041783663860992, + "eval_loss": 2.4320621490478516, + "eval_runtime": 21.9837, + "eval_samples_per_second": 227.441, + "eval_steps_per_second": 1.228, + "step": 63771 + }, + { + "epoch": 3.042271410909953, + "grad_norm": 0.31969088315963745, + "learning_rate": 8.573712877394387e-05, + "loss": 1.4542, + "step": 63780 + }, + { + "epoch": 3.0428133520754654, + "grad_norm": 0.43009239435195923, + "learning_rate": 8.572555906725309e-05, + "loss": 1.4478, + "step": 63790 + }, + { + "epoch": 3.0433552932409773, + "grad_norm": 0.24113118648529053, + "learning_rate": 8.571398555441344e-05, + "loss": 1.4368, + "step": 63800 + }, + { + "epoch": 3.0438972344064896, + "grad_norm": 0.30968958139419556, + "learning_rate": 8.570240823685858e-05, + "loss": 1.451, + "step": 63810 + }, + { + "epoch": 3.044439175572002, + "grad_norm": 0.2378184050321579, + "learning_rate": 8.569082711602271e-05, + "loss": 1.443, + "step": 63820 + }, + { + "epoch": 3.0449811167375143, + "grad_norm": 0.19775545597076416, + "learning_rate": 8.567924219334042e-05, + "loss": 1.452, + "step": 63830 + }, + { + "epoch": 3.0455230579030266, + "grad_norm": 0.17699836194515228, + "learning_rate": 8.566765347024679e-05, + "loss": 1.452, + "step": 63840 + }, + { + "epoch": 3.0460649990685384, + "grad_norm": 0.1834602653980255, + "learning_rate": 8.565606094817741e-05, + "loss": 1.4525, + "step": 63850 + }, + { + "epoch": 3.046498552000948, + "eval_loss": 2.435060977935791, + "eval_runtime": 21.9853, + "eval_samples_per_second": 227.424, + "eval_steps_per_second": 1.228, + "step": 63858 + }, + { + "epoch": 3.0466069402340508, + "grad_norm": 0.2576271891593933, + "learning_rate": 8.56444646285683e-05, + "loss": 1.4452, + "step": 63860 + }, + { + "epoch": 3.047148881399563, + "grad_norm": 0.20896773040294647, + "learning_rate": 8.563286451285595e-05, + "loss": 1.4548, + "step": 63870 + }, + { + "epoch": 3.0476908225650754, + "grad_norm": 0.2274290919303894, + "learning_rate": 8.562126060247733e-05, + "loss": 1.4541, + "step": 63880 + }, + { + "epoch": 3.0482327637305877, + "grad_norm": 0.2640198767185211, + "learning_rate": 8.560965289886987e-05, + "loss": 1.4511, + "step": 63890 + }, + { + "epoch": 3.0487747048960996, + "grad_norm": 0.20334435999393463, + "learning_rate": 8.559804140347156e-05, + "loss": 1.4497, + "step": 63900 + }, + { + "epoch": 3.049316646061612, + "grad_norm": 0.28822678327560425, + "learning_rate": 8.558642611772069e-05, + "loss": 1.4519, + "step": 63910 + }, + { + "epoch": 3.049858587227124, + "grad_norm": 0.30483877658843994, + "learning_rate": 8.557480704305614e-05, + "loss": 1.4519, + "step": 63920 + }, + { + "epoch": 3.0504005283926365, + "grad_norm": 0.29864802956581116, + "learning_rate": 8.556318418091724e-05, + "loss": 1.449, + "step": 63930 + }, + { + "epoch": 3.0509424695581484, + "grad_norm": 0.43397045135498047, + "learning_rate": 8.555155753274379e-05, + "loss": 1.4485, + "step": 63940 + }, + { + "epoch": 3.051213440140905, + "eval_loss": 2.4513673782348633, + "eval_runtime": 21.9827, + "eval_samples_per_second": 227.451, + "eval_steps_per_second": 1.228, + "step": 63945 + }, + { + "epoch": 3.0514844107236607, + "grad_norm": 0.3487168550491333, + "learning_rate": 8.553992709997602e-05, + "loss": 1.4545, + "step": 63950 + }, + { + "epoch": 3.052026351889173, + "grad_norm": 0.18993450701236725, + "learning_rate": 8.552829288405467e-05, + "loss": 1.4491, + "step": 63960 + }, + { + "epoch": 3.0525682930546854, + "grad_norm": 0.22048941254615784, + "learning_rate": 8.551665488642096e-05, + "loss": 1.442, + "step": 63970 + }, + { + "epoch": 3.0531102342201977, + "grad_norm": 0.2803003191947937, + "learning_rate": 8.55050131085165e-05, + "loss": 1.4496, + "step": 63980 + }, + { + "epoch": 3.0536521753857095, + "grad_norm": 0.43568509817123413, + "learning_rate": 8.549336755178347e-05, + "loss": 1.4516, + "step": 63990 + }, + { + "epoch": 3.054194116551222, + "grad_norm": 0.24292457103729248, + "learning_rate": 8.548171821766448e-05, + "loss": 1.4408, + "step": 64000 + }, + { + "epoch": 3.054736057716734, + "grad_norm": 0.1853036731481552, + "learning_rate": 8.547006510760254e-05, + "loss": 1.4507, + "step": 64010 + }, + { + "epoch": 3.0552779988822465, + "grad_norm": 0.2446528673171997, + "learning_rate": 8.545840822304125e-05, + "loss": 1.4424, + "step": 64020 + }, + { + "epoch": 3.0558199400477584, + "grad_norm": 0.17944113910198212, + "learning_rate": 8.544674756542457e-05, + "loss": 1.455, + "step": 64030 + }, + { + "epoch": 3.055928328280861, + "eval_loss": 2.433335304260254, + "eval_runtime": 21.9868, + "eval_samples_per_second": 227.409, + "eval_steps_per_second": 1.228, + "step": 64032 + }, + { + "epoch": 3.0563618812132707, + "grad_norm": 0.3536889851093292, + "learning_rate": 8.5435083136197e-05, + "loss": 1.4485, + "step": 64040 + }, + { + "epoch": 3.056903822378783, + "grad_norm": 0.1827295571565628, + "learning_rate": 8.542341493680345e-05, + "loss": 1.4464, + "step": 64050 + }, + { + "epoch": 3.0574457635442953, + "grad_norm": 0.1838574856519699, + "learning_rate": 8.541174296868935e-05, + "loss": 1.4464, + "step": 64060 + }, + { + "epoch": 3.0579877047098076, + "grad_norm": 0.2586890757083893, + "learning_rate": 8.540006723330057e-05, + "loss": 1.4497, + "step": 64070 + }, + { + "epoch": 3.0585296458753195, + "grad_norm": 0.3454444408416748, + "learning_rate": 8.538838773208344e-05, + "loss": 1.4462, + "step": 64080 + }, + { + "epoch": 3.059071587040832, + "grad_norm": 0.27531322836875916, + "learning_rate": 8.537670446648477e-05, + "loss": 1.4482, + "step": 64090 + }, + { + "epoch": 3.059613528206344, + "grad_norm": 0.2372499257326126, + "learning_rate": 8.536501743795183e-05, + "loss": 1.4447, + "step": 64100 + }, + { + "epoch": 3.0601554693718565, + "grad_norm": 0.20991750061511993, + "learning_rate": 8.535332664793237e-05, + "loss": 1.4462, + "step": 64110 + }, + { + "epoch": 3.0606432164208175, + "eval_loss": 2.4356119632720947, + "eval_runtime": 21.9868, + "eval_samples_per_second": 227.409, + "eval_steps_per_second": 1.228, + "step": 64119 + }, + { + "epoch": 3.0606974105373683, + "grad_norm": 0.23890291154384613, + "learning_rate": 8.534163209787459e-05, + "loss": 1.4582, + "step": 64120 + }, + { + "epoch": 3.0612393517028806, + "grad_norm": 0.25897133350372314, + "learning_rate": 8.532993378922716e-05, + "loss": 1.4452, + "step": 64130 + }, + { + "epoch": 3.061781292868393, + "grad_norm": 0.2815474569797516, + "learning_rate": 8.53182317234392e-05, + "loss": 1.4509, + "step": 64140 + }, + { + "epoch": 3.0623232340339053, + "grad_norm": 0.22710613906383514, + "learning_rate": 8.530652590196033e-05, + "loss": 1.4503, + "step": 64150 + }, + { + "epoch": 3.0628651751994176, + "grad_norm": 0.26313063502311707, + "learning_rate": 8.529481632624059e-05, + "loss": 1.4433, + "step": 64160 + }, + { + "epoch": 3.0634071163649295, + "grad_norm": 0.17818008363246918, + "learning_rate": 8.528310299773055e-05, + "loss": 1.4435, + "step": 64170 + }, + { + "epoch": 3.0639490575304418, + "grad_norm": 0.1890927255153656, + "learning_rate": 8.527138591788118e-05, + "loss": 1.4366, + "step": 64180 + }, + { + "epoch": 3.064490998695954, + "grad_norm": 0.27579453587532043, + "learning_rate": 8.525966508814396e-05, + "loss": 1.4499, + "step": 64190 + }, + { + "epoch": 3.0650329398614664, + "grad_norm": 0.17526105046272278, + "learning_rate": 8.524794050997079e-05, + "loss": 1.4415, + "step": 64200 + }, + { + "epoch": 3.0653581045607736, + "eval_loss": 2.440138101577759, + "eval_runtime": 21.9841, + "eval_samples_per_second": 227.437, + "eval_steps_per_second": 1.228, + "step": 64206 + }, + { + "epoch": 3.0655748810269783, + "grad_norm": 0.23076623678207397, + "learning_rate": 8.523621218481407e-05, + "loss": 1.4393, + "step": 64210 + }, + { + "epoch": 3.0661168221924906, + "grad_norm": 0.29393690824508667, + "learning_rate": 8.522448011412665e-05, + "loss": 1.4491, + "step": 64220 + }, + { + "epoch": 3.066658763358003, + "grad_norm": 0.16649694740772247, + "learning_rate": 8.521274429936187e-05, + "loss": 1.4364, + "step": 64230 + }, + { + "epoch": 3.0672007045235152, + "grad_norm": 0.23092569410800934, + "learning_rate": 8.520100474197348e-05, + "loss": 1.444, + "step": 64240 + }, + { + "epoch": 3.0677426456890275, + "grad_norm": 0.24326132237911224, + "learning_rate": 8.518926144341577e-05, + "loss": 1.4569, + "step": 64250 + }, + { + "epoch": 3.0682845868545394, + "grad_norm": 0.27092161774635315, + "learning_rate": 8.51775144051434e-05, + "loss": 1.443, + "step": 64260 + }, + { + "epoch": 3.0688265280200517, + "grad_norm": 0.3469485640525818, + "learning_rate": 8.516576362861159e-05, + "loss": 1.4355, + "step": 64270 + }, + { + "epoch": 3.069368469185564, + "grad_norm": 0.22915403544902802, + "learning_rate": 8.515400911527592e-05, + "loss": 1.4623, + "step": 64280 + }, + { + "epoch": 3.0699104103510764, + "grad_norm": 0.2217090129852295, + "learning_rate": 8.514225086659253e-05, + "loss": 1.4463, + "step": 64290 + }, + { + "epoch": 3.0700729927007298, + "eval_loss": 2.4342753887176514, + "eval_runtime": 21.983, + "eval_samples_per_second": 227.448, + "eval_steps_per_second": 1.228, + "step": 64293 + }, + { + "epoch": 3.0704523515165887, + "grad_norm": 0.3960467278957367, + "learning_rate": 8.513048888401795e-05, + "loss": 1.4428, + "step": 64300 + }, + { + "epoch": 3.0709942926821006, + "grad_norm": 0.20913895964622498, + "learning_rate": 8.511872316900925e-05, + "loss": 1.4455, + "step": 64310 + }, + { + "epoch": 3.071536233847613, + "grad_norm": 0.3125520944595337, + "learning_rate": 8.510695372302385e-05, + "loss": 1.4572, + "step": 64320 + }, + { + "epoch": 3.072078175013125, + "grad_norm": 0.3635540306568146, + "learning_rate": 8.509518054751976e-05, + "loss": 1.452, + "step": 64330 + }, + { + "epoch": 3.0726201161786375, + "grad_norm": 0.2593584358692169, + "learning_rate": 8.508340364395536e-05, + "loss": 1.4375, + "step": 64340 + }, + { + "epoch": 3.0731620573441494, + "grad_norm": 0.22683855891227722, + "learning_rate": 8.507162301378952e-05, + "loss": 1.4461, + "step": 64350 + }, + { + "epoch": 3.0737039985096617, + "grad_norm": 0.18193486332893372, + "learning_rate": 8.505983865848158e-05, + "loss": 1.4501, + "step": 64360 + }, + { + "epoch": 3.074245939675174, + "grad_norm": 0.35355544090270996, + "learning_rate": 8.504805057949132e-05, + "loss": 1.444, + "step": 64370 + }, + { + "epoch": 3.0747878808406863, + "grad_norm": 0.16228243708610535, + "learning_rate": 8.503625877827904e-05, + "loss": 1.4489, + "step": 64380 + }, + { + "epoch": 3.0747878808406863, + "eval_loss": 2.4377474784851074, + "eval_runtime": 21.9803, + "eval_samples_per_second": 227.477, + "eval_steps_per_second": 1.228, + "step": 64380 + }, + { + "epoch": 3.0753298220061986, + "grad_norm": 0.26819851994514465, + "learning_rate": 8.50244632563054e-05, + "loss": 1.4487, + "step": 64390 + }, + { + "epoch": 3.0758717631717105, + "grad_norm": 0.21382486820220947, + "learning_rate": 8.501266401503164e-05, + "loss": 1.4461, + "step": 64400 + }, + { + "epoch": 3.076413704337223, + "grad_norm": 0.20011401176452637, + "learning_rate": 8.500086105591935e-05, + "loss": 1.4511, + "step": 64410 + }, + { + "epoch": 3.076955645502735, + "grad_norm": 0.2521159052848816, + "learning_rate": 8.498905438043061e-05, + "loss": 1.4421, + "step": 64420 + }, + { + "epoch": 3.0774975866682475, + "grad_norm": 0.27458107471466064, + "learning_rate": 8.497724399002805e-05, + "loss": 1.4556, + "step": 64430 + }, + { + "epoch": 3.0780395278337593, + "grad_norm": 0.17625468969345093, + "learning_rate": 8.496542988617463e-05, + "loss": 1.4562, + "step": 64440 + }, + { + "epoch": 3.0785814689992717, + "grad_norm": 0.23053233325481415, + "learning_rate": 8.495361207033387e-05, + "loss": 1.4438, + "step": 64450 + }, + { + "epoch": 3.079123410164784, + "grad_norm": 0.20668716728687286, + "learning_rate": 8.494179054396968e-05, + "loss": 1.4421, + "step": 64460 + }, + { + "epoch": 3.0795027689806425, + "eval_loss": 2.4462552070617676, + "eval_runtime": 208.061, + "eval_samples_per_second": 24.031, + "eval_steps_per_second": 0.13, + "step": 64467 + }, + { + "epoch": 3.0796653513302963, + "grad_norm": 0.2157292515039444, + "learning_rate": 8.492996530854646e-05, + "loss": 1.441, + "step": 64470 + }, + { + "epoch": 3.0802072924958086, + "grad_norm": 0.1837170124053955, + "learning_rate": 8.491813636552911e-05, + "loss": 1.4421, + "step": 64480 + }, + { + "epoch": 3.0807492336613205, + "grad_norm": 0.2104165405035019, + "learning_rate": 8.490630371638291e-05, + "loss": 1.4516, + "step": 64490 + }, + { + "epoch": 3.081291174826833, + "grad_norm": 0.24644435942173004, + "learning_rate": 8.489446736257365e-05, + "loss": 1.4451, + "step": 64500 + }, + { + "epoch": 3.081833115992345, + "grad_norm": 0.18532828986644745, + "learning_rate": 8.488262730556754e-05, + "loss": 1.4551, + "step": 64510 + }, + { + "epoch": 3.0823750571578574, + "grad_norm": 0.20310962200164795, + "learning_rate": 8.487078354683132e-05, + "loss": 1.4506, + "step": 64520 + }, + { + "epoch": 3.0829169983233697, + "grad_norm": 0.22750219702720642, + "learning_rate": 8.48589360878321e-05, + "loss": 1.4364, + "step": 64530 + }, + { + "epoch": 3.0834589394888816, + "grad_norm": 0.23471009731292725, + "learning_rate": 8.484708493003753e-05, + "loss": 1.455, + "step": 64540 + }, + { + "epoch": 3.084000880654394, + "grad_norm": 0.17168253660202026, + "learning_rate": 8.483523007491565e-05, + "loss": 1.4408, + "step": 64550 + }, + { + "epoch": 3.084217657120599, + "eval_loss": 2.4336390495300293, + "eval_runtime": 21.9861, + "eval_samples_per_second": 227.416, + "eval_steps_per_second": 1.228, + "step": 64554 + }, + { + "epoch": 3.0845428218199062, + "grad_norm": 0.21957682073116302, + "learning_rate": 8.4823371523935e-05, + "loss": 1.4431, + "step": 64560 + }, + { + "epoch": 3.0850847629854186, + "grad_norm": 0.25939157605171204, + "learning_rate": 8.481150927856458e-05, + "loss": 1.4492, + "step": 64570 + }, + { + "epoch": 3.0856267041509304, + "grad_norm": 0.22554215788841248, + "learning_rate": 8.479964334027381e-05, + "loss": 1.4551, + "step": 64580 + }, + { + "epoch": 3.0861686453164427, + "grad_norm": 0.20122438669204712, + "learning_rate": 8.478777371053259e-05, + "loss": 1.4445, + "step": 64590 + }, + { + "epoch": 3.086710586481955, + "grad_norm": 0.18250145018100739, + "learning_rate": 8.477590039081131e-05, + "loss": 1.4475, + "step": 64600 + }, + { + "epoch": 3.0872525276474674, + "grad_norm": 0.18288695812225342, + "learning_rate": 8.476402338258073e-05, + "loss": 1.4484, + "step": 64610 + }, + { + "epoch": 3.0877944688129797, + "grad_norm": 0.19832782447338104, + "learning_rate": 8.475214268731219e-05, + "loss": 1.4513, + "step": 64620 + }, + { + "epoch": 3.0883364099784916, + "grad_norm": 0.38973701000213623, + "learning_rate": 8.474025830647737e-05, + "loss": 1.442, + "step": 64630 + }, + { + "epoch": 3.088878351144004, + "grad_norm": 0.3162194490432739, + "learning_rate": 8.472837024154847e-05, + "loss": 1.4464, + "step": 64640 + }, + { + "epoch": 3.088932545260555, + "eval_loss": 2.423926830291748, + "eval_runtime": 21.9851, + "eval_samples_per_second": 227.426, + "eval_steps_per_second": 1.228, + "step": 64641 + }, + { + "epoch": 3.089420292309516, + "grad_norm": 0.1702577918767929, + "learning_rate": 8.471647849399815e-05, + "loss": 1.4542, + "step": 64650 + }, + { + "epoch": 3.0899622334750285, + "grad_norm": 0.315075546503067, + "learning_rate": 8.470458306529946e-05, + "loss": 1.4416, + "step": 64660 + }, + { + "epoch": 3.0905041746405404, + "grad_norm": 0.25835493206977844, + "learning_rate": 8.4692683956926e-05, + "loss": 1.436, + "step": 64670 + }, + { + "epoch": 3.0910461158060527, + "grad_norm": 0.6356673240661621, + "learning_rate": 8.468078117035176e-05, + "loss": 1.4472, + "step": 64680 + }, + { + "epoch": 3.091588056971565, + "grad_norm": 0.3583969175815582, + "learning_rate": 8.466887470705121e-05, + "loss": 1.4487, + "step": 64690 + }, + { + "epoch": 3.0921299981370773, + "grad_norm": 0.21938195824623108, + "learning_rate": 8.465696456849928e-05, + "loss": 1.4513, + "step": 64700 + }, + { + "epoch": 3.0926719393025897, + "grad_norm": 0.26139944791793823, + "learning_rate": 8.464505075617133e-05, + "loss": 1.4505, + "step": 64710 + }, + { + "epoch": 3.0932138804681015, + "grad_norm": 0.2597460150718689, + "learning_rate": 8.46331332715432e-05, + "loss": 1.4483, + "step": 64720 + }, + { + "epoch": 3.0936474334005113, + "eval_loss": 2.4286162853240967, + "eval_runtime": 21.6785, + "eval_samples_per_second": 230.643, + "eval_steps_per_second": 1.245, + "step": 64728 + }, + { + "epoch": 3.093755821633614, + "grad_norm": 0.21105916798114777, + "learning_rate": 8.462121211609117e-05, + "loss": 1.4437, + "step": 64730 + }, + { + "epoch": 3.094297762799126, + "grad_norm": 0.1735154688358307, + "learning_rate": 8.4609287291292e-05, + "loss": 1.4413, + "step": 64740 + }, + { + "epoch": 3.0948397039646385, + "grad_norm": 0.1659323126077652, + "learning_rate": 8.459735879862286e-05, + "loss": 1.4488, + "step": 64750 + }, + { + "epoch": 3.0953816451301503, + "grad_norm": 0.19731490314006805, + "learning_rate": 8.45854266395614e-05, + "loss": 1.4478, + "step": 64760 + }, + { + "epoch": 3.0959235862956627, + "grad_norm": 0.19939690828323364, + "learning_rate": 8.457349081558576e-05, + "loss": 1.4378, + "step": 64770 + }, + { + "epoch": 3.096465527461175, + "grad_norm": 0.24715344607830048, + "learning_rate": 8.456155132817443e-05, + "loss": 1.4477, + "step": 64780 + }, + { + "epoch": 3.0970074686266873, + "grad_norm": 0.20696981251239777, + "learning_rate": 8.45496081788065e-05, + "loss": 1.4452, + "step": 64790 + }, + { + "epoch": 3.0975494097921996, + "grad_norm": 0.19315387308597565, + "learning_rate": 8.45376613689614e-05, + "loss": 1.4469, + "step": 64800 + }, + { + "epoch": 3.0980913509577115, + "grad_norm": 0.23861078917980194, + "learning_rate": 8.452571090011905e-05, + "loss": 1.4624, + "step": 64810 + }, + { + "epoch": 3.098362321540468, + "eval_loss": 2.4268784523010254, + "eval_runtime": 22.037, + "eval_samples_per_second": 226.891, + "eval_steps_per_second": 1.225, + "step": 64815 + }, + { + "epoch": 3.098633292123224, + "grad_norm": 0.1983337700366974, + "learning_rate": 8.45137567737598e-05, + "loss": 1.4448, + "step": 64820 + }, + { + "epoch": 3.099175233288736, + "grad_norm": 0.2253299057483673, + "learning_rate": 8.450179899136451e-05, + "loss": 1.4471, + "step": 64830 + }, + { + "epoch": 3.0997171744542484, + "grad_norm": 0.2381051778793335, + "learning_rate": 8.448983755441447e-05, + "loss": 1.4461, + "step": 64840 + }, + { + "epoch": 3.1002591156197603, + "grad_norm": 0.23918847739696503, + "learning_rate": 8.447787246439135e-05, + "loss": 1.4571, + "step": 64850 + }, + { + "epoch": 3.1008010567852726, + "grad_norm": 0.17684470117092133, + "learning_rate": 8.446590372277738e-05, + "loss": 1.4455, + "step": 64860 + }, + { + "epoch": 3.101342997950785, + "grad_norm": 0.1668468713760376, + "learning_rate": 8.445393133105519e-05, + "loss": 1.4437, + "step": 64870 + }, + { + "epoch": 3.1018849391162973, + "grad_norm": 0.18969334661960602, + "learning_rate": 8.444195529070785e-05, + "loss": 1.4375, + "step": 64880 + }, + { + "epoch": 3.1024268802818096, + "grad_norm": 0.1837807595729828, + "learning_rate": 8.442997560321894e-05, + "loss": 1.4407, + "step": 64890 + }, + { + "epoch": 3.1029688214473214, + "grad_norm": 0.22503748536109924, + "learning_rate": 8.44179922700724e-05, + "loss": 1.4465, + "step": 64900 + }, + { + "epoch": 3.103077209680424, + "eval_loss": 2.4285776615142822, + "eval_runtime": 21.9808, + "eval_samples_per_second": 227.472, + "eval_steps_per_second": 1.228, + "step": 64902 + }, + { + "epoch": 3.1035107626128338, + "grad_norm": 0.20925331115722656, + "learning_rate": 8.44060052927527e-05, + "loss": 1.4417, + "step": 64910 + }, + { + "epoch": 3.104052703778346, + "grad_norm": 0.36361682415008545, + "learning_rate": 8.439401467274474e-05, + "loss": 1.4505, + "step": 64920 + }, + { + "epoch": 3.1045946449438584, + "grad_norm": 0.2141452133655548, + "learning_rate": 8.438202041153385e-05, + "loss": 1.4502, + "step": 64930 + }, + { + "epoch": 3.1051365861093707, + "grad_norm": 0.22404509782791138, + "learning_rate": 8.437002251060585e-05, + "loss": 1.4467, + "step": 64940 + }, + { + "epoch": 3.1056785272748826, + "grad_norm": 0.26203832030296326, + "learning_rate": 8.435802097144696e-05, + "loss": 1.4514, + "step": 64950 + }, + { + "epoch": 3.106220468440395, + "grad_norm": 0.265337198972702, + "learning_rate": 8.434601579554389e-05, + "loss": 1.4438, + "step": 64960 + }, + { + "epoch": 3.106762409605907, + "grad_norm": 0.1931227445602417, + "learning_rate": 8.433400698438381e-05, + "loss": 1.4518, + "step": 64970 + }, + { + "epoch": 3.1073043507714195, + "grad_norm": 0.2169187366962433, + "learning_rate": 8.432199453945427e-05, + "loss": 1.4367, + "step": 64980 + }, + { + "epoch": 3.1077920978203806, + "eval_loss": 2.420952796936035, + "eval_runtime": 21.9789, + "eval_samples_per_second": 227.491, + "eval_steps_per_second": 1.228, + "step": 64989 + }, + { + "epoch": 3.1078462919369314, + "grad_norm": 0.2237025499343872, + "learning_rate": 8.430997846224338e-05, + "loss": 1.4437, + "step": 64990 + }, + { + "epoch": 3.1083882331024437, + "grad_norm": 0.18218779563903809, + "learning_rate": 8.42979587542396e-05, + "loss": 1.442, + "step": 65000 + }, + { + "epoch": 3.108930174267956, + "grad_norm": 0.2928388714790344, + "learning_rate": 8.428593541693188e-05, + "loss": 1.4455, + "step": 65010 + }, + { + "epoch": 3.1094721154334684, + "grad_norm": 0.22536158561706543, + "learning_rate": 8.427390845180963e-05, + "loss": 1.4388, + "step": 65020 + }, + { + "epoch": 3.1100140565989807, + "grad_norm": 0.3034621477127075, + "learning_rate": 8.426187786036269e-05, + "loss": 1.4458, + "step": 65030 + }, + { + "epoch": 3.1105559977644925, + "grad_norm": 0.21954983472824097, + "learning_rate": 8.424984364408138e-05, + "loss": 1.4349, + "step": 65040 + }, + { + "epoch": 3.111097938930005, + "grad_norm": 0.18350431323051453, + "learning_rate": 8.423780580445642e-05, + "loss": 1.4489, + "step": 65050 + }, + { + "epoch": 3.111639880095517, + "grad_norm": 0.19277605414390564, + "learning_rate": 8.4225764342979e-05, + "loss": 1.4507, + "step": 65060 + }, + { + "epoch": 3.1121818212610295, + "grad_norm": 0.25510692596435547, + "learning_rate": 8.42137192611408e-05, + "loss": 1.4467, + "step": 65070 + }, + { + "epoch": 3.1125069859603367, + "eval_loss": 2.427020788192749, + "eval_runtime": 21.9917, + "eval_samples_per_second": 227.358, + "eval_steps_per_second": 1.228, + "step": 65076 + }, + { + "epoch": 3.1127237624265414, + "grad_norm": 0.2290118932723999, + "learning_rate": 8.42016705604339e-05, + "loss": 1.4456, + "step": 65080 + }, + { + "epoch": 3.1132657035920537, + "grad_norm": 0.19887521862983704, + "learning_rate": 8.41896182423508e-05, + "loss": 1.4479, + "step": 65090 + }, + { + "epoch": 3.113807644757566, + "grad_norm": 0.17075598239898682, + "learning_rate": 8.417756230838455e-05, + "loss": 1.443, + "step": 65100 + }, + { + "epoch": 3.1143495859230783, + "grad_norm": 0.2500585913658142, + "learning_rate": 8.416550276002853e-05, + "loss": 1.44, + "step": 65110 + }, + { + "epoch": 3.1148915270885906, + "grad_norm": 0.2130533754825592, + "learning_rate": 8.415343959877668e-05, + "loss": 1.4478, + "step": 65120 + }, + { + "epoch": 3.1154334682541025, + "grad_norm": 0.3005780279636383, + "learning_rate": 8.41413728261233e-05, + "loss": 1.4438, + "step": 65130 + }, + { + "epoch": 3.115975409419615, + "grad_norm": 0.23416976630687714, + "learning_rate": 8.412930244356316e-05, + "loss": 1.4435, + "step": 65140 + }, + { + "epoch": 3.116517350585127, + "grad_norm": 0.26363202929496765, + "learning_rate": 8.41172284525915e-05, + "loss": 1.4469, + "step": 65150 + }, + { + "epoch": 3.1170592917506394, + "grad_norm": 0.20009048283100128, + "learning_rate": 8.4105150854704e-05, + "loss": 1.4412, + "step": 65160 + }, + { + "epoch": 3.117221874100293, + "eval_loss": 2.424807071685791, + "eval_runtime": 21.9875, + "eval_samples_per_second": 227.402, + "eval_steps_per_second": 1.228, + "step": 65163 + }, + { + "epoch": 3.1176012329161518, + "grad_norm": 0.2043033242225647, + "learning_rate": 8.409306965139677e-05, + "loss": 1.4525, + "step": 65170 + }, + { + "epoch": 3.1181431740816636, + "grad_norm": 0.27456438541412354, + "learning_rate": 8.408098484416639e-05, + "loss": 1.4432, + "step": 65180 + }, + { + "epoch": 3.118685115247176, + "grad_norm": 0.4368920624256134, + "learning_rate": 8.406889643450984e-05, + "loss": 1.4449, + "step": 65190 + }, + { + "epoch": 3.1192270564126883, + "grad_norm": 0.3747996389865875, + "learning_rate": 8.405680442392464e-05, + "loss": 1.4441, + "step": 65200 + }, + { + "epoch": 3.1197689975782006, + "grad_norm": 0.3094594478607178, + "learning_rate": 8.404470881390863e-05, + "loss": 1.4518, + "step": 65210 + }, + { + "epoch": 3.1203109387437125, + "grad_norm": 0.30118393898010254, + "learning_rate": 8.40326096059602e-05, + "loss": 1.4454, + "step": 65220 + }, + { + "epoch": 3.1208528799092248, + "grad_norm": 0.20605798065662384, + "learning_rate": 8.402050680157816e-05, + "loss": 1.4495, + "step": 65230 + }, + { + "epoch": 3.121394821074737, + "grad_norm": 0.21210525929927826, + "learning_rate": 8.400840040226172e-05, + "loss": 1.4417, + "step": 65240 + }, + { + "epoch": 3.1219367622402494, + "grad_norm": 0.17496547102928162, + "learning_rate": 8.399629040951057e-05, + "loss": 1.4494, + "step": 65250 + }, + { + "epoch": 3.1219367622402494, + "eval_loss": 2.42055082321167, + "eval_runtime": 21.5444, + "eval_samples_per_second": 232.079, + "eval_steps_per_second": 1.253, + "step": 65250 + }, + { + "epoch": 3.1224787034057617, + "grad_norm": 0.26712679862976074, + "learning_rate": 8.398417682482486e-05, + "loss": 1.4512, + "step": 65260 + }, + { + "epoch": 3.1230206445712736, + "grad_norm": 0.2958221733570099, + "learning_rate": 8.397205964970515e-05, + "loss": 1.4454, + "step": 65270 + }, + { + "epoch": 3.123562585736786, + "grad_norm": 0.2679630219936371, + "learning_rate": 8.39599388856525e-05, + "loss": 1.4397, + "step": 65280 + }, + { + "epoch": 3.1241045269022982, + "grad_norm": 0.16526180505752563, + "learning_rate": 8.394781453416832e-05, + "loss": 1.4498, + "step": 65290 + }, + { + "epoch": 3.1246464680678105, + "grad_norm": 0.20681805908679962, + "learning_rate": 8.393568659675458e-05, + "loss": 1.4472, + "step": 65300 + }, + { + "epoch": 3.1251884092333224, + "grad_norm": 0.19305351376533508, + "learning_rate": 8.392355507491361e-05, + "loss": 1.4432, + "step": 65310 + }, + { + "epoch": 3.1257303503988347, + "grad_norm": 0.21879535913467407, + "learning_rate": 8.391141997014819e-05, + "loss": 1.4487, + "step": 65320 + }, + { + "epoch": 3.126272291564347, + "grad_norm": 0.22270238399505615, + "learning_rate": 8.389928128396161e-05, + "loss": 1.4372, + "step": 65330 + }, + { + "epoch": 3.1266516503802055, + "eval_loss": 2.4269917011260986, + "eval_runtime": 21.979, + "eval_samples_per_second": 227.49, + "eval_steps_per_second": 1.228, + "step": 65337 + }, + { + "epoch": 3.1268142327298594, + "grad_norm": 0.2375665158033371, + "learning_rate": 8.388713901785753e-05, + "loss": 1.4447, + "step": 65340 + }, + { + "epoch": 3.1273561738953717, + "grad_norm": 0.30421313643455505, + "learning_rate": 8.387499317334007e-05, + "loss": 1.4416, + "step": 65350 + }, + { + "epoch": 3.1278981150608836, + "grad_norm": 0.3451899290084839, + "learning_rate": 8.386284375191381e-05, + "loss": 1.4467, + "step": 65360 + }, + { + "epoch": 3.128440056226396, + "grad_norm": 0.26824066042900085, + "learning_rate": 8.385069075508379e-05, + "loss": 1.4496, + "step": 65370 + }, + { + "epoch": 3.128981997391908, + "grad_norm": 0.17959991097450256, + "learning_rate": 8.383853418435546e-05, + "loss": 1.4531, + "step": 65380 + }, + { + "epoch": 3.1295239385574205, + "grad_norm": 0.18542522192001343, + "learning_rate": 8.38263740412347e-05, + "loss": 1.4485, + "step": 65390 + }, + { + "epoch": 3.1300658797229324, + "grad_norm": 0.1759880632162094, + "learning_rate": 8.38142103272279e-05, + "loss": 1.4377, + "step": 65400 + }, + { + "epoch": 3.1306078208884447, + "grad_norm": 0.24404051899909973, + "learning_rate": 8.380204304384181e-05, + "loss": 1.4517, + "step": 65410 + }, + { + "epoch": 3.131149762053957, + "grad_norm": 0.18275314569473267, + "learning_rate": 8.37898721925837e-05, + "loss": 1.4604, + "step": 65420 + }, + { + "epoch": 3.131366538520162, + "eval_loss": 2.424156665802002, + "eval_runtime": 21.9886, + "eval_samples_per_second": 227.39, + "eval_steps_per_second": 1.228, + "step": 65424 + }, + { + "epoch": 3.1316917032194693, + "grad_norm": 0.22716642916202545, + "learning_rate": 8.377769777496118e-05, + "loss": 1.4573, + "step": 65430 + }, + { + "epoch": 3.1322336443849816, + "grad_norm": 0.2442425787448883, + "learning_rate": 8.376551979248242e-05, + "loss": 1.4396, + "step": 65440 + }, + { + "epoch": 3.1327755855504935, + "grad_norm": 0.25765150785446167, + "learning_rate": 8.375333824665594e-05, + "loss": 1.4532, + "step": 65450 + }, + { + "epoch": 3.133317526716006, + "grad_norm": 0.22376912832260132, + "learning_rate": 8.374115313899077e-05, + "loss": 1.4431, + "step": 65460 + }, + { + "epoch": 3.133859467881518, + "grad_norm": 0.36618417501449585, + "learning_rate": 8.372896447099634e-05, + "loss": 1.4448, + "step": 65470 + }, + { + "epoch": 3.1344014090470305, + "grad_norm": 0.35767969489097595, + "learning_rate": 8.371677224418248e-05, + "loss": 1.4337, + "step": 65480 + }, + { + "epoch": 3.1349433502125423, + "grad_norm": 0.48897460103034973, + "learning_rate": 8.370457646005957e-05, + "loss": 1.4542, + "step": 65490 + }, + { + "epoch": 3.1354852913780547, + "grad_norm": 0.39159682393074036, + "learning_rate": 8.369237712013835e-05, + "loss": 1.4522, + "step": 65500 + }, + { + "epoch": 3.136027232543567, + "grad_norm": 0.36611253023147583, + "learning_rate": 8.368017422593003e-05, + "loss": 1.441, + "step": 65510 + }, + { + "epoch": 3.1360814266601182, + "eval_loss": 2.427338123321533, + "eval_runtime": 22.8212, + "eval_samples_per_second": 219.095, + "eval_steps_per_second": 1.183, + "step": 65511 + }, + { + "epoch": 3.1365691737090793, + "grad_norm": 0.26878681778907776, + "learning_rate": 8.366796777894624e-05, + "loss": 1.4446, + "step": 65520 + }, + { + "epoch": 3.1371111148745916, + "grad_norm": 0.2367294430732727, + "learning_rate": 8.365575778069907e-05, + "loss": 1.4538, + "step": 65530 + }, + { + "epoch": 3.1376530560401035, + "grad_norm": 0.29069700837135315, + "learning_rate": 8.364354423270102e-05, + "loss": 1.4405, + "step": 65540 + }, + { + "epoch": 3.138194997205616, + "grad_norm": 0.22299885749816895, + "learning_rate": 8.363132713646509e-05, + "loss": 1.4406, + "step": 65550 + }, + { + "epoch": 3.138736938371128, + "grad_norm": 0.2363450676202774, + "learning_rate": 8.361910649350465e-05, + "loss": 1.4323, + "step": 65560 + }, + { + "epoch": 3.1392788795366404, + "grad_norm": 0.2508092224597931, + "learning_rate": 8.360688230533356e-05, + "loss": 1.4348, + "step": 65570 + }, + { + "epoch": 3.1398208207021527, + "grad_norm": 0.20643532276153564, + "learning_rate": 8.359465457346607e-05, + "loss": 1.4358, + "step": 65580 + }, + { + "epoch": 3.1403627618676646, + "grad_norm": 0.22416460514068604, + "learning_rate": 8.358242329941692e-05, + "loss": 1.4501, + "step": 65590 + }, + { + "epoch": 3.1407963148000744, + "eval_loss": 2.4230735301971436, + "eval_runtime": 23.1943, + "eval_samples_per_second": 215.571, + "eval_steps_per_second": 1.164, + "step": 65598 + }, + { + "epoch": 3.140904703033177, + "grad_norm": 0.3181062936782837, + "learning_rate": 8.357018848470128e-05, + "loss": 1.4362, + "step": 65600 + }, + { + "epoch": 3.1414466441986892, + "grad_norm": 0.23214562237262726, + "learning_rate": 8.35579501308347e-05, + "loss": 1.4405, + "step": 65610 + }, + { + "epoch": 3.1419885853642016, + "grad_norm": 0.35660114884376526, + "learning_rate": 8.354570823933327e-05, + "loss": 1.4464, + "step": 65620 + }, + { + "epoch": 3.1425305265297134, + "grad_norm": 0.38225075602531433, + "learning_rate": 8.353346281171343e-05, + "loss": 1.4411, + "step": 65630 + }, + { + "epoch": 3.1430724676952257, + "grad_norm": 0.2181611806154251, + "learning_rate": 8.35212138494921e-05, + "loss": 1.442, + "step": 65640 + }, + { + "epoch": 3.143614408860738, + "grad_norm": 0.23916418850421906, + "learning_rate": 8.35089613541866e-05, + "loss": 1.4485, + "step": 65650 + }, + { + "epoch": 3.1441563500262504, + "grad_norm": 0.19111785292625427, + "learning_rate": 8.349670532731478e-05, + "loss": 1.4476, + "step": 65660 + }, + { + "epoch": 3.1446982911917627, + "grad_norm": 0.31324857473373413, + "learning_rate": 8.34844457703948e-05, + "loss": 1.4374, + "step": 65670 + }, + { + "epoch": 3.1452402323572746, + "grad_norm": 0.3937076926231384, + "learning_rate": 8.347218268494535e-05, + "loss": 1.4406, + "step": 65680 + }, + { + "epoch": 3.145511202940031, + "eval_loss": 2.4279704093933105, + "eval_runtime": 22.8461, + "eval_samples_per_second": 218.856, + "eval_steps_per_second": 1.182, + "step": 65685 + }, + { + "epoch": 3.145782173522787, + "grad_norm": 0.27373912930488586, + "learning_rate": 8.345991607248553e-05, + "loss": 1.4465, + "step": 65690 + }, + { + "epoch": 3.146324114688299, + "grad_norm": 0.16764716804027557, + "learning_rate": 8.344764593453485e-05, + "loss": 1.4447, + "step": 65700 + }, + { + "epoch": 3.1468660558538115, + "grad_norm": 0.21356230974197388, + "learning_rate": 8.343537227261332e-05, + "loss": 1.4484, + "step": 65710 + }, + { + "epoch": 3.1474079970193234, + "grad_norm": 0.17575284838676453, + "learning_rate": 8.342309508824132e-05, + "loss": 1.4387, + "step": 65720 + }, + { + "epoch": 3.1479499381848357, + "grad_norm": 0.20323576033115387, + "learning_rate": 8.34108143829397e-05, + "loss": 1.4325, + "step": 65730 + }, + { + "epoch": 3.148491879350348, + "grad_norm": 0.30600157380104065, + "learning_rate": 8.339853015822974e-05, + "loss": 1.4434, + "step": 65740 + }, + { + "epoch": 3.1490338205158603, + "grad_norm": 0.22637711465358734, + "learning_rate": 8.338624241563316e-05, + "loss": 1.4399, + "step": 65750 + }, + { + "epoch": 3.1495757616813727, + "grad_norm": 0.17684820294380188, + "learning_rate": 8.33739511566721e-05, + "loss": 1.4412, + "step": 65760 + }, + { + "epoch": 3.1501177028468845, + "grad_norm": 0.20868733525276184, + "learning_rate": 8.336165638286916e-05, + "loss": 1.4387, + "step": 65770 + }, + { + "epoch": 3.150226091079987, + "eval_loss": 2.425462245941162, + "eval_runtime": 22.0377, + "eval_samples_per_second": 226.884, + "eval_steps_per_second": 1.225, + "step": 65772 + }, + { + "epoch": 3.150659644012397, + "grad_norm": 0.23098771274089813, + "learning_rate": 8.334935809574738e-05, + "loss": 1.4372, + "step": 65780 + }, + { + "epoch": 3.151201585177909, + "grad_norm": 0.17905355989933014, + "learning_rate": 8.33370562968302e-05, + "loss": 1.4408, + "step": 65790 + }, + { + "epoch": 3.1517435263434215, + "grad_norm": 0.21767891943454742, + "learning_rate": 8.332475098764149e-05, + "loss": 1.4326, + "step": 65800 + }, + { + "epoch": 3.152285467508934, + "grad_norm": 0.2932087779045105, + "learning_rate": 8.331244216970561e-05, + "loss": 1.4417, + "step": 65810 + }, + { + "epoch": 3.1528274086744457, + "grad_norm": 0.2508069574832916, + "learning_rate": 8.330012984454732e-05, + "loss": 1.4398, + "step": 65820 + }, + { + "epoch": 3.153369349839958, + "grad_norm": 0.20580008625984192, + "learning_rate": 8.32878140136918e-05, + "loss": 1.4417, + "step": 65830 + }, + { + "epoch": 3.1539112910054703, + "grad_norm": 0.222931370139122, + "learning_rate": 8.327549467866472e-05, + "loss": 1.4447, + "step": 65840 + }, + { + "epoch": 3.1544532321709826, + "grad_norm": 0.17937925457954407, + "learning_rate": 8.32631718409921e-05, + "loss": 1.4547, + "step": 65850 + }, + { + "epoch": 3.154940979219943, + "eval_loss": 2.4275054931640625, + "eval_runtime": 21.995, + "eval_samples_per_second": 227.324, + "eval_steps_per_second": 1.228, + "step": 65859 + }, + { + "epoch": 3.1549951733364945, + "grad_norm": 0.27297332882881165, + "learning_rate": 8.325084550220046e-05, + "loss": 1.4369, + "step": 65860 + }, + { + "epoch": 3.155537114502007, + "grad_norm": 0.2378816157579422, + "learning_rate": 8.323851566381672e-05, + "loss": 1.4453, + "step": 65870 + }, + { + "epoch": 3.156079055667519, + "grad_norm": 0.30926477909088135, + "learning_rate": 8.322618232736827e-05, + "loss": 1.4455, + "step": 65880 + }, + { + "epoch": 3.1566209968330314, + "grad_norm": 0.19149400293827057, + "learning_rate": 8.32138454943829e-05, + "loss": 1.4509, + "step": 65890 + }, + { + "epoch": 3.1571629379985433, + "grad_norm": 0.18199436366558075, + "learning_rate": 8.320150516638884e-05, + "loss": 1.4574, + "step": 65900 + }, + { + "epoch": 3.1577048791640556, + "grad_norm": 0.3855782747268677, + "learning_rate": 8.318916134491477e-05, + "loss": 1.4557, + "step": 65910 + }, + { + "epoch": 3.158246820329568, + "grad_norm": 0.29194483160972595, + "learning_rate": 8.317681403148978e-05, + "loss": 1.4476, + "step": 65920 + }, + { + "epoch": 3.1587887614950803, + "grad_norm": 0.3535638451576233, + "learning_rate": 8.316446322764338e-05, + "loss": 1.4398, + "step": 65930 + }, + { + "epoch": 3.1593307026605926, + "grad_norm": 0.36056360602378845, + "learning_rate": 8.315210893490556e-05, + "loss": 1.44, + "step": 65940 + }, + { + "epoch": 3.1596558673599, + "eval_loss": 2.4279699325561523, + "eval_runtime": 21.9818, + "eval_samples_per_second": 227.461, + "eval_steps_per_second": 1.228, + "step": 65946 + }, + { + "epoch": 3.1598726438261044, + "grad_norm": 0.17839354276657104, + "learning_rate": 8.313975115480671e-05, + "loss": 1.4455, + "step": 65950 + }, + { + "epoch": 3.1604145849916168, + "grad_norm": 0.21037691831588745, + "learning_rate": 8.312738988887766e-05, + "loss": 1.4449, + "step": 65960 + }, + { + "epoch": 3.160956526157129, + "grad_norm": 0.25134557485580444, + "learning_rate": 8.311502513864966e-05, + "loss": 1.4403, + "step": 65970 + }, + { + "epoch": 3.1614984673226414, + "grad_norm": 0.17751184105873108, + "learning_rate": 8.310265690565443e-05, + "loss": 1.4473, + "step": 65980 + }, + { + "epoch": 3.1620404084881537, + "grad_norm": 0.1860496550798416, + "learning_rate": 8.309028519142406e-05, + "loss": 1.4408, + "step": 65990 + }, + { + "epoch": 3.1625823496536656, + "grad_norm": 0.22845673561096191, + "learning_rate": 8.30779099974911e-05, + "loss": 1.4459, + "step": 66000 + }, + { + "epoch": 3.163124290819178, + "grad_norm": 0.4244441092014313, + "learning_rate": 8.306553132538856e-05, + "loss": 1.4389, + "step": 66010 + }, + { + "epoch": 3.16366623198469, + "grad_norm": 0.3140546679496765, + "learning_rate": 8.305314917664985e-05, + "loss": 1.4415, + "step": 66020 + }, + { + "epoch": 3.1642081731502025, + "grad_norm": 0.34219682216644287, + "learning_rate": 8.304076355280883e-05, + "loss": 1.4586, + "step": 66030 + }, + { + "epoch": 3.164370755499856, + "eval_loss": 2.432257890701294, + "eval_runtime": 21.9773, + "eval_samples_per_second": 227.508, + "eval_steps_per_second": 1.229, + "step": 66033 + }, + { + "epoch": 3.1647501143157144, + "grad_norm": 0.17820283770561218, + "learning_rate": 8.302837445539974e-05, + "loss": 1.4485, + "step": 66040 + }, + { + "epoch": 3.1652920554812267, + "grad_norm": 0.22140522301197052, + "learning_rate": 8.301598188595732e-05, + "loss": 1.4522, + "step": 66050 + }, + { + "epoch": 3.165833996646739, + "grad_norm": 0.24713575839996338, + "learning_rate": 8.300358584601671e-05, + "loss": 1.4491, + "step": 66060 + }, + { + "epoch": 3.1663759378122514, + "grad_norm": 0.32958516478538513, + "learning_rate": 8.299118633711344e-05, + "loss": 1.44, + "step": 66070 + }, + { + "epoch": 3.1669178789777637, + "grad_norm": 0.24236957728862762, + "learning_rate": 8.297878336078354e-05, + "loss": 1.4442, + "step": 66080 + }, + { + "epoch": 3.1674598201432755, + "grad_norm": 0.2900751531124115, + "learning_rate": 8.296637691856342e-05, + "loss": 1.4535, + "step": 66090 + }, + { + "epoch": 3.168001761308788, + "grad_norm": 0.2060800939798355, + "learning_rate": 8.295396701198996e-05, + "loss": 1.4472, + "step": 66100 + }, + { + "epoch": 3.1685437024743, + "grad_norm": 0.45042285323143005, + "learning_rate": 8.294155364260045e-05, + "loss": 1.4494, + "step": 66110 + }, + { + "epoch": 3.1690856436398125, + "grad_norm": 0.21350212395191193, + "learning_rate": 8.292913681193254e-05, + "loss": 1.447, + "step": 66120 + }, + { + "epoch": 3.1690856436398125, + "eval_loss": 2.430042266845703, + "eval_runtime": 21.5428, + "eval_samples_per_second": 232.096, + "eval_steps_per_second": 1.253, + "step": 66120 + }, + { + "epoch": 3.1696275848053244, + "grad_norm": 0.19495175778865814, + "learning_rate": 8.291671652152445e-05, + "loss": 1.448, + "step": 66130 + }, + { + "epoch": 3.1701695259708367, + "grad_norm": 0.22730892896652222, + "learning_rate": 8.290429277291471e-05, + "loss": 1.4446, + "step": 66140 + }, + { + "epoch": 3.170711467136349, + "grad_norm": 0.23822692036628723, + "learning_rate": 8.289186556764233e-05, + "loss": 1.4448, + "step": 66150 + }, + { + "epoch": 3.1712534083018613, + "grad_norm": 0.21188586950302124, + "learning_rate": 8.287943490724673e-05, + "loss": 1.441, + "step": 66160 + }, + { + "epoch": 3.1717953494673736, + "grad_norm": 0.279965877532959, + "learning_rate": 8.286700079326777e-05, + "loss": 1.4367, + "step": 66170 + }, + { + "epoch": 3.1723372906328855, + "grad_norm": 0.16774581372737885, + "learning_rate": 8.285456322724577e-05, + "loss": 1.4426, + "step": 66180 + }, + { + "epoch": 3.172879231798398, + "grad_norm": 0.2504456341266632, + "learning_rate": 8.284212221072139e-05, + "loss": 1.4505, + "step": 66190 + }, + { + "epoch": 3.17342117296391, + "grad_norm": 0.17107921838760376, + "learning_rate": 8.282967774523579e-05, + "loss": 1.4483, + "step": 66200 + }, + { + "epoch": 3.1738005317797686, + "eval_loss": 2.4283711910247803, + "eval_runtime": 21.9868, + "eval_samples_per_second": 227.409, + "eval_steps_per_second": 1.228, + "step": 66207 + }, + { + "epoch": 3.1739631141294224, + "grad_norm": 0.2413926124572754, + "learning_rate": 8.281722983233054e-05, + "loss": 1.4405, + "step": 66210 + }, + { + "epoch": 3.1745050552949348, + "grad_norm": 0.22221823036670685, + "learning_rate": 8.280477847354763e-05, + "loss": 1.4362, + "step": 66220 + }, + { + "epoch": 3.1750469964604466, + "grad_norm": 0.2293311357498169, + "learning_rate": 8.279232367042946e-05, + "loss": 1.4406, + "step": 66230 + }, + { + "epoch": 3.175588937625959, + "grad_norm": 0.22770948708057404, + "learning_rate": 8.27798654245189e-05, + "loss": 1.4537, + "step": 66240 + }, + { + "epoch": 3.1761308787914713, + "grad_norm": 0.2080654501914978, + "learning_rate": 8.276740373735922e-05, + "loss": 1.4435, + "step": 66250 + }, + { + "epoch": 3.1766728199569836, + "grad_norm": 0.23802123963832855, + "learning_rate": 8.275493861049414e-05, + "loss": 1.4486, + "step": 66260 + }, + { + "epoch": 3.1772147611224955, + "grad_norm": 0.18610985577106476, + "learning_rate": 8.274247004546775e-05, + "loss": 1.4385, + "step": 66270 + }, + { + "epoch": 3.1777567022880078, + "grad_norm": 0.2491571456193924, + "learning_rate": 8.272999804382461e-05, + "loss": 1.4416, + "step": 66280 + }, + { + "epoch": 3.17829864345352, + "grad_norm": 0.41166606545448303, + "learning_rate": 8.271752260710972e-05, + "loss": 1.4539, + "step": 66290 + }, + { + "epoch": 3.1785154199197247, + "eval_loss": 2.4275481700897217, + "eval_runtime": 21.9832, + "eval_samples_per_second": 227.446, + "eval_steps_per_second": 1.228, + "step": 66294 + }, + { + "epoch": 3.1788405846190324, + "grad_norm": 0.2705402672290802, + "learning_rate": 8.270504373686846e-05, + "loss": 1.4452, + "step": 66300 + }, + { + "epoch": 3.1793825257845447, + "grad_norm": 0.16571412980556488, + "learning_rate": 8.269256143464666e-05, + "loss": 1.4402, + "step": 66310 + }, + { + "epoch": 3.1799244669500566, + "grad_norm": 0.17406509816646576, + "learning_rate": 8.268007570199058e-05, + "loss": 1.4416, + "step": 66320 + }, + { + "epoch": 3.180466408115569, + "grad_norm": 0.26004037261009216, + "learning_rate": 8.26675865404469e-05, + "loss": 1.4407, + "step": 66330 + }, + { + "epoch": 3.1810083492810812, + "grad_norm": 0.18530094623565674, + "learning_rate": 8.265509395156272e-05, + "loss": 1.4352, + "step": 66340 + }, + { + "epoch": 3.1815502904465935, + "grad_norm": 0.22706040740013123, + "learning_rate": 8.264259793688555e-05, + "loss": 1.4528, + "step": 66350 + }, + { + "epoch": 3.1820922316121054, + "grad_norm": 0.3016396462917328, + "learning_rate": 8.263009849796338e-05, + "loss": 1.4534, + "step": 66360 + }, + { + "epoch": 3.1826341727776177, + "grad_norm": 0.2243020385503769, + "learning_rate": 8.261759563634458e-05, + "loss": 1.4415, + "step": 66370 + }, + { + "epoch": 3.18317611394313, + "grad_norm": 0.18285973370075226, + "learning_rate": 8.260508935357791e-05, + "loss": 1.4299, + "step": 66380 + }, + { + "epoch": 3.1832303080596813, + "eval_loss": 2.425734043121338, + "eval_runtime": 21.9789, + "eval_samples_per_second": 227.491, + "eval_steps_per_second": 1.228, + "step": 66381 + }, + { + "epoch": 3.1837180551086424, + "grad_norm": 0.2271660417318344, + "learning_rate": 8.259257965121263e-05, + "loss": 1.4353, + "step": 66390 + }, + { + "epoch": 3.1842599962741547, + "grad_norm": 0.1780574768781662, + "learning_rate": 8.258006653079838e-05, + "loss": 1.4335, + "step": 66400 + }, + { + "epoch": 3.1848019374396666, + "grad_norm": 0.2370023876428604, + "learning_rate": 8.256754999388522e-05, + "loss": 1.4438, + "step": 66410 + }, + { + "epoch": 3.185343878605179, + "grad_norm": 0.34876930713653564, + "learning_rate": 8.255503004202365e-05, + "loss": 1.4481, + "step": 66420 + }, + { + "epoch": 3.185885819770691, + "grad_norm": 0.19301725924015045, + "learning_rate": 8.254250667676461e-05, + "loss": 1.4412, + "step": 66430 + }, + { + "epoch": 3.1864277609362035, + "grad_norm": 0.22976385056972504, + "learning_rate": 8.252997989965942e-05, + "loss": 1.4483, + "step": 66440 + }, + { + "epoch": 3.186969702101716, + "grad_norm": 0.17878347635269165, + "learning_rate": 8.251744971225982e-05, + "loss": 1.4393, + "step": 66450 + }, + { + "epoch": 3.1875116432672277, + "grad_norm": 0.2477036416530609, + "learning_rate": 8.250491611611803e-05, + "loss": 1.4437, + "step": 66460 + }, + { + "epoch": 3.1879451961996375, + "eval_loss": 2.425063371658325, + "eval_runtime": 21.9818, + "eval_samples_per_second": 227.46, + "eval_steps_per_second": 1.228, + "step": 66468 + }, + { + "epoch": 3.18805358443274, + "grad_norm": 0.26957419514656067, + "learning_rate": 8.249237911278665e-05, + "loss": 1.4446, + "step": 66470 + }, + { + "epoch": 3.1885955255982523, + "grad_norm": 0.19030284881591797, + "learning_rate": 8.24798387038187e-05, + "loss": 1.4414, + "step": 66480 + }, + { + "epoch": 3.1891374667637646, + "grad_norm": 0.18507073819637299, + "learning_rate": 8.246729489076763e-05, + "loss": 1.4501, + "step": 66490 + }, + { + "epoch": 3.1896794079292765, + "grad_norm": 0.17628097534179688, + "learning_rate": 8.245474767518734e-05, + "loss": 1.444, + "step": 66500 + }, + { + "epoch": 3.190221349094789, + "grad_norm": 0.259546160697937, + "learning_rate": 8.24421970586321e-05, + "loss": 1.44, + "step": 66510 + }, + { + "epoch": 3.190763290260301, + "grad_norm": 0.22335298359394073, + "learning_rate": 8.242964304265662e-05, + "loss": 1.4356, + "step": 66520 + }, + { + "epoch": 3.1913052314258135, + "grad_norm": 0.22630515694618225, + "learning_rate": 8.241708562881604e-05, + "loss": 1.4469, + "step": 66530 + }, + { + "epoch": 3.1918471725913253, + "grad_norm": 0.2385028600692749, + "learning_rate": 8.240452481866595e-05, + "loss": 1.4392, + "step": 66540 + }, + { + "epoch": 3.1923891137568376, + "grad_norm": 0.22308200597763062, + "learning_rate": 8.239196061376229e-05, + "loss": 1.4402, + "step": 66550 + }, + { + "epoch": 3.192660084339594, + "eval_loss": 2.420628309249878, + "eval_runtime": 21.9793, + "eval_samples_per_second": 227.487, + "eval_steps_per_second": 1.228, + "step": 66555 + }, + { + "epoch": 3.19293105492235, + "grad_norm": 0.388051837682724, + "learning_rate": 8.237939301566148e-05, + "loss": 1.4462, + "step": 66560 + }, + { + "epoch": 3.1934729960878623, + "grad_norm": 0.18918316066265106, + "learning_rate": 8.236682202592032e-05, + "loss": 1.4424, + "step": 66570 + }, + { + "epoch": 3.1940149372533746, + "grad_norm": 0.392870157957077, + "learning_rate": 8.235424764609607e-05, + "loss": 1.447, + "step": 66580 + }, + { + "epoch": 3.1945568784188865, + "grad_norm": 0.46847355365753174, + "learning_rate": 8.234166987774639e-05, + "loss": 1.4523, + "step": 66590 + }, + { + "epoch": 3.195098819584399, + "grad_norm": 0.23610202968120575, + "learning_rate": 8.232908872242932e-05, + "loss": 1.4373, + "step": 66600 + }, + { + "epoch": 3.195640760749911, + "grad_norm": 0.39547818899154663, + "learning_rate": 8.231650418170343e-05, + "loss": 1.4522, + "step": 66610 + }, + { + "epoch": 3.1961827019154234, + "grad_norm": 0.3930874168872833, + "learning_rate": 8.230391625712759e-05, + "loss": 1.4393, + "step": 66620 + }, + { + "epoch": 3.1967246430809357, + "grad_norm": 0.2097681313753128, + "learning_rate": 8.229132495026113e-05, + "loss": 1.4432, + "step": 66630 + }, + { + "epoch": 3.1972665842464476, + "grad_norm": 0.19665232300758362, + "learning_rate": 8.227873026266383e-05, + "loss": 1.436, + "step": 66640 + }, + { + "epoch": 3.19737497247955, + "eval_loss": 2.4246163368225098, + "eval_runtime": 21.9864, + "eval_samples_per_second": 227.414, + "eval_steps_per_second": 1.228, + "step": 66642 + }, + { + "epoch": 3.19780852541196, + "grad_norm": 0.2388431578874588, + "learning_rate": 8.226613219589583e-05, + "loss": 1.4458, + "step": 66650 + }, + { + "epoch": 3.1983504665774722, + "grad_norm": 0.22420534491539001, + "learning_rate": 8.225353075151781e-05, + "loss": 1.4418, + "step": 66660 + }, + { + "epoch": 3.1988924077429846, + "grad_norm": 0.22993390262126923, + "learning_rate": 8.224092593109068e-05, + "loss": 1.4308, + "step": 66670 + }, + { + "epoch": 3.1994343489084964, + "grad_norm": 0.21597042679786682, + "learning_rate": 8.222831773617592e-05, + "loss": 1.4508, + "step": 66680 + }, + { + "epoch": 3.1999762900740087, + "grad_norm": 0.21598871052265167, + "learning_rate": 8.221570616833538e-05, + "loss": 1.4428, + "step": 66690 + }, + { + "epoch": 3.200518231239521, + "grad_norm": 0.20474159717559814, + "learning_rate": 8.220309122913132e-05, + "loss": 1.4461, + "step": 66700 + }, + { + "epoch": 3.2010601724050334, + "grad_norm": 0.2574845850467682, + "learning_rate": 8.219047292012642e-05, + "loss": 1.4409, + "step": 66710 + }, + { + "epoch": 3.2016021135705457, + "grad_norm": 0.17870694398880005, + "learning_rate": 8.217785124288378e-05, + "loss": 1.4424, + "step": 66720 + }, + { + "epoch": 3.2020898606195063, + "eval_loss": 2.422410726547241, + "eval_runtime": 21.9819, + "eval_samples_per_second": 227.46, + "eval_steps_per_second": 1.228, + "step": 66729 + }, + { + "epoch": 3.2021440547360576, + "grad_norm": 0.2235659956932068, + "learning_rate": 8.216522619896693e-05, + "loss": 1.4361, + "step": 66730 + }, + { + "epoch": 3.20268599590157, + "grad_norm": 0.16267363727092743, + "learning_rate": 8.215259778993979e-05, + "loss": 1.4447, + "step": 66740 + }, + { + "epoch": 3.203227937067082, + "grad_norm": 0.26697710156440735, + "learning_rate": 8.213996601736673e-05, + "loss": 1.4388, + "step": 66750 + }, + { + "epoch": 3.2037698782325945, + "grad_norm": 0.3052631616592407, + "learning_rate": 8.21273308828125e-05, + "loss": 1.4425, + "step": 66760 + }, + { + "epoch": 3.2043118193981064, + "grad_norm": 0.22814327478408813, + "learning_rate": 8.21146923878423e-05, + "loss": 1.4351, + "step": 66770 + }, + { + "epoch": 3.2048537605636187, + "grad_norm": 0.19500422477722168, + "learning_rate": 8.210205053402172e-05, + "loss": 1.4411, + "step": 66780 + }, + { + "epoch": 3.205395701729131, + "grad_norm": 0.1982443779706955, + "learning_rate": 8.20894053229168e-05, + "loss": 1.4501, + "step": 66790 + }, + { + "epoch": 3.2059376428946433, + "grad_norm": 0.2346677929162979, + "learning_rate": 8.207675675609395e-05, + "loss": 1.443, + "step": 66800 + }, + { + "epoch": 3.2064795840601557, + "grad_norm": 0.18981340527534485, + "learning_rate": 8.206410483512004e-05, + "loss": 1.4277, + "step": 66810 + }, + { + "epoch": 3.206804748759463, + "eval_loss": 2.4243276119232178, + "eval_runtime": 21.9797, + "eval_samples_per_second": 227.482, + "eval_steps_per_second": 1.228, + "step": 66816 + }, + { + "epoch": 3.2070215252256675, + "grad_norm": 0.20465369522571564, + "learning_rate": 8.20514495615623e-05, + "loss": 1.4445, + "step": 66820 + }, + { + "epoch": 3.20756346639118, + "grad_norm": 0.18857480585575104, + "learning_rate": 8.203879093698845e-05, + "loss": 1.439, + "step": 66830 + }, + { + "epoch": 3.208105407556692, + "grad_norm": 0.19271798431873322, + "learning_rate": 8.202612896296657e-05, + "loss": 1.4458, + "step": 66840 + }, + { + "epoch": 3.2086473487222045, + "grad_norm": 0.42280739545822144, + "learning_rate": 8.201346364106516e-05, + "loss": 1.4374, + "step": 66850 + }, + { + "epoch": 3.209189289887717, + "grad_norm": 0.21561777591705322, + "learning_rate": 8.200079497285316e-05, + "loss": 1.4565, + "step": 66860 + }, + { + "epoch": 3.2097312310532287, + "grad_norm": 0.20918765664100647, + "learning_rate": 8.198812295989991e-05, + "loss": 1.4396, + "step": 66870 + }, + { + "epoch": 3.210273172218741, + "grad_norm": 0.33416321873664856, + "learning_rate": 8.197544760377514e-05, + "loss": 1.4401, + "step": 66880 + }, + { + "epoch": 3.2108151133842533, + "grad_norm": 0.2590186297893524, + "learning_rate": 8.196276890604906e-05, + "loss": 1.4388, + "step": 66890 + }, + { + "epoch": 3.2113570545497656, + "grad_norm": 0.20215663313865662, + "learning_rate": 8.195008686829222e-05, + "loss": 1.4443, + "step": 66900 + }, + { + "epoch": 3.211519636899419, + "eval_loss": 2.431623935699463, + "eval_runtime": 21.9765, + "eval_samples_per_second": 227.515, + "eval_steps_per_second": 1.229, + "step": 66903 + }, + { + "epoch": 3.2118989957152775, + "grad_norm": 0.3255484700202942, + "learning_rate": 8.193740149207561e-05, + "loss": 1.44, + "step": 66910 + }, + { + "epoch": 3.21244093688079, + "grad_norm": 0.33131662011146545, + "learning_rate": 8.192471277897068e-05, + "loss": 1.4452, + "step": 66920 + }, + { + "epoch": 3.212982878046302, + "grad_norm": 0.2151406705379486, + "learning_rate": 8.191202073054922e-05, + "loss": 1.4488, + "step": 66930 + }, + { + "epoch": 3.2135248192118144, + "grad_norm": 0.2985169291496277, + "learning_rate": 8.189932534838346e-05, + "loss": 1.442, + "step": 66940 + }, + { + "epoch": 3.2140667603773263, + "grad_norm": 0.21713115274906158, + "learning_rate": 8.188662663404607e-05, + "loss": 1.44, + "step": 66950 + }, + { + "epoch": 3.2146087015428386, + "grad_norm": 0.25025808811187744, + "learning_rate": 8.18739245891101e-05, + "loss": 1.4394, + "step": 66960 + }, + { + "epoch": 3.215150642708351, + "grad_norm": 0.4430973529815674, + "learning_rate": 8.186121921514903e-05, + "loss": 1.4407, + "step": 66970 + }, + { + "epoch": 3.2156925838738633, + "grad_norm": 0.25356000661849976, + "learning_rate": 8.184851051373673e-05, + "loss": 1.4364, + "step": 66980 + }, + { + "epoch": 3.2162345250393756, + "grad_norm": 0.3219984471797943, + "learning_rate": 8.183579848644753e-05, + "loss": 1.4377, + "step": 66990 + }, + { + "epoch": 3.2162345250393756, + "eval_loss": 2.4254963397979736, + "eval_runtime": 21.8545, + "eval_samples_per_second": 228.785, + "eval_steps_per_second": 1.235, + "step": 66990 + }, + { + "epoch": 3.2167764662048874, + "grad_norm": 0.17054729163646698, + "learning_rate": 8.18230831348561e-05, + "loss": 1.4393, + "step": 67000 + }, + { + "epoch": 3.2173184073703998, + "grad_norm": 0.26265397667884827, + "learning_rate": 8.181036446053761e-05, + "loss": 1.4521, + "step": 67010 + }, + { + "epoch": 3.217860348535912, + "grad_norm": 0.3703189492225647, + "learning_rate": 8.179764246506755e-05, + "loss": 1.4443, + "step": 67020 + }, + { + "epoch": 3.2184022897014244, + "grad_norm": 0.21587614715099335, + "learning_rate": 8.17849171500219e-05, + "loss": 1.4459, + "step": 67030 + }, + { + "epoch": 3.2189442308669367, + "grad_norm": 0.21136698126792908, + "learning_rate": 8.1772188516977e-05, + "loss": 1.4417, + "step": 67040 + }, + { + "epoch": 3.2194861720324486, + "grad_norm": 0.38758090138435364, + "learning_rate": 8.175945656750962e-05, + "loss": 1.4469, + "step": 67050 + }, + { + "epoch": 3.220028113197961, + "grad_norm": 0.3118115961551666, + "learning_rate": 8.174672130319694e-05, + "loss": 1.4373, + "step": 67060 + }, + { + "epoch": 3.220570054363473, + "grad_norm": 0.19323886930942535, + "learning_rate": 8.173398272561654e-05, + "loss": 1.4394, + "step": 67070 + }, + { + "epoch": 3.2209494131793317, + "eval_loss": 2.42934513092041, + "eval_runtime": 24.88, + "eval_samples_per_second": 200.964, + "eval_steps_per_second": 1.085, + "step": 67077 + }, + { + "epoch": 3.2211119955289855, + "grad_norm": 0.22981129586696625, + "learning_rate": 8.172124083634643e-05, + "loss": 1.4441, + "step": 67080 + }, + { + "epoch": 3.2216539366944974, + "grad_norm": 0.18165723979473114, + "learning_rate": 8.170849563696501e-05, + "loss": 1.4517, + "step": 67090 + }, + { + "epoch": 3.2221958778600097, + "grad_norm": 0.19143837690353394, + "learning_rate": 8.169574712905111e-05, + "loss": 1.4327, + "step": 67100 + }, + { + "epoch": 3.222737819025522, + "grad_norm": 0.3314666152000427, + "learning_rate": 8.168299531418396e-05, + "loss": 1.4345, + "step": 67110 + }, + { + "epoch": 3.2232797601910343, + "grad_norm": 0.2969511151313782, + "learning_rate": 8.167024019394321e-05, + "loss": 1.4327, + "step": 67120 + }, + { + "epoch": 3.2238217013565467, + "grad_norm": 0.2874842882156372, + "learning_rate": 8.165748176990887e-05, + "loss": 1.4512, + "step": 67130 + }, + { + "epoch": 3.2243636425220585, + "grad_norm": 0.2401866316795349, + "learning_rate": 8.164472004366145e-05, + "loss": 1.4391, + "step": 67140 + }, + { + "epoch": 3.224905583687571, + "grad_norm": 0.26989611983299255, + "learning_rate": 8.163195501678177e-05, + "loss": 1.4425, + "step": 67150 + }, + { + "epoch": 3.225447524853083, + "grad_norm": 0.21331489086151123, + "learning_rate": 8.161918669085113e-05, + "loss": 1.4442, + "step": 67160 + }, + { + "epoch": 3.225664301319288, + "eval_loss": 2.4356348514556885, + "eval_runtime": 26.8873, + "eval_samples_per_second": 185.961, + "eval_steps_per_second": 1.004, + "step": 67164 + }, + { + "epoch": 3.2259894660185955, + "grad_norm": 0.28904426097869873, + "learning_rate": 8.160641506745123e-05, + "loss": 1.4346, + "step": 67170 + }, + { + "epoch": 3.2265314071841074, + "grad_norm": 0.3067649304866791, + "learning_rate": 8.159364014816412e-05, + "loss": 1.449, + "step": 67180 + }, + { + "epoch": 3.2270733483496197, + "grad_norm": 0.23630201816558838, + "learning_rate": 8.158086193457234e-05, + "loss": 1.4384, + "step": 67190 + }, + { + "epoch": 3.227615289515132, + "grad_norm": 0.21013253927230835, + "learning_rate": 8.156808042825878e-05, + "loss": 1.4325, + "step": 67200 + }, + { + "epoch": 3.2281572306806443, + "grad_norm": 0.18870244920253754, + "learning_rate": 8.155529563080676e-05, + "loss": 1.4469, + "step": 67210 + }, + { + "epoch": 3.2286991718461566, + "grad_norm": 0.266428142786026, + "learning_rate": 8.154250754380002e-05, + "loss": 1.442, + "step": 67220 + }, + { + "epoch": 3.2292411130116685, + "grad_norm": 0.22033743560314178, + "learning_rate": 8.152971616882269e-05, + "loss": 1.4389, + "step": 67230 + }, + { + "epoch": 3.229783054177181, + "grad_norm": 0.29417768120765686, + "learning_rate": 8.151692150745928e-05, + "loss": 1.4398, + "step": 67240 + }, + { + "epoch": 3.230324995342693, + "grad_norm": 0.2595650255680084, + "learning_rate": 8.150412356129478e-05, + "loss": 1.4334, + "step": 67250 + }, + { + "epoch": 3.2303791894592444, + "eval_loss": 2.4399569034576416, + "eval_runtime": 22.4553, + "eval_samples_per_second": 222.665, + "eval_steps_per_second": 1.202, + "step": 67251 + }, + { + "epoch": 3.2308669365082054, + "grad_norm": 0.16750743985176086, + "learning_rate": 8.14913223319145e-05, + "loss": 1.4424, + "step": 67260 + }, + { + "epoch": 3.2314088776737178, + "grad_norm": 0.17600034177303314, + "learning_rate": 8.147851782090425e-05, + "loss": 1.4473, + "step": 67270 + }, + { + "epoch": 3.2319508188392296, + "grad_norm": 0.18144869804382324, + "learning_rate": 8.146571002985013e-05, + "loss": 1.4402, + "step": 67280 + }, + { + "epoch": 3.232492760004742, + "grad_norm": 0.19220831990242004, + "learning_rate": 8.145289896033879e-05, + "loss": 1.4406, + "step": 67290 + }, + { + "epoch": 3.2330347011702543, + "grad_norm": 0.22534775733947754, + "learning_rate": 8.144008461395716e-05, + "loss": 1.4429, + "step": 67300 + }, + { + "epoch": 3.2335766423357666, + "grad_norm": 0.18956543505191803, + "learning_rate": 8.142726699229265e-05, + "loss": 1.4538, + "step": 67310 + }, + { + "epoch": 3.2341185835012785, + "grad_norm": 0.2575107514858246, + "learning_rate": 8.141444609693302e-05, + "loss": 1.4445, + "step": 67320 + }, + { + "epoch": 3.2346605246667908, + "grad_norm": 0.37571388483047485, + "learning_rate": 8.14016219294665e-05, + "loss": 1.4412, + "step": 67330 + }, + { + "epoch": 3.2350940775992005, + "eval_loss": 2.430121660232544, + "eval_runtime": 24.4358, + "eval_samples_per_second": 204.618, + "eval_steps_per_second": 1.105, + "step": 67338 + }, + { + "epoch": 3.235202465832303, + "grad_norm": 0.2669738531112671, + "learning_rate": 8.138879449148168e-05, + "loss": 1.4511, + "step": 67340 + }, + { + "epoch": 3.2357444069978154, + "grad_norm": 0.16547146439552307, + "learning_rate": 8.137596378456757e-05, + "loss": 1.4397, + "step": 67350 + }, + { + "epoch": 3.2362863481633277, + "grad_norm": 0.168304443359375, + "learning_rate": 8.136312981031358e-05, + "loss": 1.4388, + "step": 67360 + }, + { + "epoch": 3.2368282893288396, + "grad_norm": 0.43757399916648865, + "learning_rate": 8.135029257030953e-05, + "loss": 1.4345, + "step": 67370 + }, + { + "epoch": 3.237370230494352, + "grad_norm": 0.26308125257492065, + "learning_rate": 8.133745206614561e-05, + "loss": 1.4538, + "step": 67380 + }, + { + "epoch": 3.2379121716598642, + "grad_norm": 0.18845511972904205, + "learning_rate": 8.132460829941252e-05, + "loss": 1.4393, + "step": 67390 + }, + { + "epoch": 3.2384541128253765, + "grad_norm": 0.17935842275619507, + "learning_rate": 8.13117612717012e-05, + "loss": 1.437, + "step": 67400 + }, + { + "epoch": 3.2389960539908884, + "grad_norm": 0.18170326948165894, + "learning_rate": 8.129891098460316e-05, + "loss": 1.4439, + "step": 67410 + }, + { + "epoch": 3.2395379951564007, + "grad_norm": 0.2227085679769516, + "learning_rate": 8.128605743971018e-05, + "loss": 1.4394, + "step": 67420 + }, + { + "epoch": 3.239808965739157, + "eval_loss": 2.421604633331299, + "eval_runtime": 22.8082, + "eval_samples_per_second": 219.219, + "eval_steps_per_second": 1.184, + "step": 67425 + }, + { + "epoch": 3.240079936321913, + "grad_norm": 0.26854372024536133, + "learning_rate": 8.127320063861455e-05, + "loss": 1.4398, + "step": 67430 + }, + { + "epoch": 3.2406218774874254, + "grad_norm": 0.2939850389957428, + "learning_rate": 8.126034058290887e-05, + "loss": 1.4404, + "step": 67440 + }, + { + "epoch": 3.2411638186529377, + "grad_norm": 0.20707334578037262, + "learning_rate": 8.124747727418623e-05, + "loss": 1.4451, + "step": 67450 + }, + { + "epoch": 3.2417057598184496, + "grad_norm": 0.17989417910575867, + "learning_rate": 8.123461071404005e-05, + "loss": 1.4282, + "step": 67460 + }, + { + "epoch": 3.242247700983962, + "grad_norm": 0.2010103315114975, + "learning_rate": 8.122174090406418e-05, + "loss": 1.4304, + "step": 67470 + }, + { + "epoch": 3.242789642149474, + "grad_norm": 0.24504514038562775, + "learning_rate": 8.120886784585292e-05, + "loss": 1.4444, + "step": 67480 + }, + { + "epoch": 3.2433315833149865, + "grad_norm": 0.16585427522659302, + "learning_rate": 8.119599154100087e-05, + "loss": 1.4337, + "step": 67490 + }, + { + "epoch": 3.243873524480499, + "grad_norm": 0.2048460990190506, + "learning_rate": 8.118311199110314e-05, + "loss": 1.4403, + "step": 67500 + }, + { + "epoch": 3.2444154656460107, + "grad_norm": 0.18525651097297668, + "learning_rate": 8.117022919775516e-05, + "loss": 1.4412, + "step": 67510 + }, + { + "epoch": 3.2445238538791132, + "eval_loss": 2.4250948429107666, + "eval_runtime": 24.9863, + "eval_samples_per_second": 200.11, + "eval_steps_per_second": 1.081, + "step": 67512 + }, + { + "epoch": 3.244957406811523, + "grad_norm": 0.1921282261610031, + "learning_rate": 8.115734316255281e-05, + "loss": 1.4357, + "step": 67520 + }, + { + "epoch": 3.2454993479770353, + "grad_norm": 0.3551950454711914, + "learning_rate": 8.114445388709236e-05, + "loss": 1.449, + "step": 67530 + }, + { + "epoch": 3.2460412891425476, + "grad_norm": 0.26464539766311646, + "learning_rate": 8.113156137297048e-05, + "loss": 1.4367, + "step": 67540 + }, + { + "epoch": 3.2465832303080595, + "grad_norm": 0.24829427897930145, + "learning_rate": 8.111866562178419e-05, + "loss": 1.4472, + "step": 67550 + }, + { + "epoch": 3.247125171473572, + "grad_norm": 0.2007465362548828, + "learning_rate": 8.110576663513105e-05, + "loss": 1.4395, + "step": 67560 + }, + { + "epoch": 3.247667112639084, + "grad_norm": 0.17801623046398163, + "learning_rate": 8.109286441460885e-05, + "loss": 1.4425, + "step": 67570 + }, + { + "epoch": 3.2482090538045965, + "grad_norm": 0.2939807176589966, + "learning_rate": 8.107995896181588e-05, + "loss": 1.4434, + "step": 67580 + }, + { + "epoch": 3.2487509949701083, + "grad_norm": 0.18530774116516113, + "learning_rate": 8.106705027835083e-05, + "loss": 1.4438, + "step": 67590 + }, + { + "epoch": 3.2492387420190694, + "eval_loss": 2.4217963218688965, + "eval_runtime": 23.4048, + "eval_samples_per_second": 213.631, + "eval_steps_per_second": 1.154, + "step": 67599 + }, + { + "epoch": 3.2492929361356206, + "grad_norm": 0.2213825136423111, + "learning_rate": 8.105413836581277e-05, + "loss": 1.4492, + "step": 67600 + }, + { + "epoch": 3.249834877301133, + "grad_norm": 0.2034333199262619, + "learning_rate": 8.104122322580116e-05, + "loss": 1.4339, + "step": 67610 + }, + { + "epoch": 3.2503768184666453, + "grad_norm": 0.16729718446731567, + "learning_rate": 8.102830485991589e-05, + "loss": 1.4311, + "step": 67620 + }, + { + "epoch": 3.2509187596321576, + "grad_norm": 0.18632495403289795, + "learning_rate": 8.101538326975721e-05, + "loss": 1.4385, + "step": 67630 + }, + { + "epoch": 3.2514607007976695, + "grad_norm": 0.3231146037578583, + "learning_rate": 8.100245845692579e-05, + "loss": 1.4494, + "step": 67640 + }, + { + "epoch": 3.252002641963182, + "grad_norm": 0.37084832787513733, + "learning_rate": 8.09895304230227e-05, + "loss": 1.4487, + "step": 67650 + }, + { + "epoch": 3.252544583128694, + "grad_norm": 0.37098175287246704, + "learning_rate": 8.097659916964943e-05, + "loss": 1.4386, + "step": 67660 + }, + { + "epoch": 3.2530865242942064, + "grad_norm": 0.2461938112974167, + "learning_rate": 8.096366469840785e-05, + "loss": 1.4404, + "step": 67670 + }, + { + "epoch": 3.2536284654597187, + "grad_norm": 0.21850334107875824, + "learning_rate": 8.095072701090019e-05, + "loss": 1.4517, + "step": 67680 + }, + { + "epoch": 3.253953630159026, + "eval_loss": 2.423842191696167, + "eval_runtime": 21.9821, + "eval_samples_per_second": 227.458, + "eval_steps_per_second": 1.228, + "step": 67686 + }, + { + "epoch": 3.2541704066252306, + "grad_norm": 0.18838092684745789, + "learning_rate": 8.093778610872912e-05, + "loss": 1.4467, + "step": 67690 + }, + { + "epoch": 3.254712347790743, + "grad_norm": 0.21146613359451294, + "learning_rate": 8.092484199349775e-05, + "loss": 1.4391, + "step": 67700 + }, + { + "epoch": 3.2552542889562552, + "grad_norm": 0.21148332953453064, + "learning_rate": 8.091189466680948e-05, + "loss": 1.4415, + "step": 67710 + }, + { + "epoch": 3.2557962301217676, + "grad_norm": 0.18964509665966034, + "learning_rate": 8.089894413026823e-05, + "loss": 1.4444, + "step": 67720 + }, + { + "epoch": 3.25633817128728, + "grad_norm": 0.21399128437042236, + "learning_rate": 8.08859903854782e-05, + "loss": 1.4514, + "step": 67730 + }, + { + "epoch": 3.2568801124527917, + "grad_norm": 0.20721645653247833, + "learning_rate": 8.087303343404406e-05, + "loss": 1.4518, + "step": 67740 + }, + { + "epoch": 3.257422053618304, + "grad_norm": 0.2715248763561249, + "learning_rate": 8.086007327757088e-05, + "loss": 1.4342, + "step": 67750 + }, + { + "epoch": 3.2579639947838164, + "grad_norm": 0.19381947815418243, + "learning_rate": 8.08471099176641e-05, + "loss": 1.4455, + "step": 67760 + }, + { + "epoch": 3.2585059359493287, + "grad_norm": 0.169798344373703, + "learning_rate": 8.083414335592955e-05, + "loss": 1.4354, + "step": 67770 + }, + { + "epoch": 3.258668518298982, + "eval_loss": 2.425126075744629, + "eval_runtime": 23.083, + "eval_samples_per_second": 216.61, + "eval_steps_per_second": 1.17, + "step": 67773 + }, + { + "epoch": 3.2590478771148406, + "grad_norm": 0.17293022572994232, + "learning_rate": 8.08211735939735e-05, + "loss": 1.4359, + "step": 67780 + }, + { + "epoch": 3.259589818280353, + "grad_norm": 0.28945857286453247, + "learning_rate": 8.080820063340254e-05, + "loss": 1.4333, + "step": 67790 + }, + { + "epoch": 3.260131759445865, + "grad_norm": 0.18180948495864868, + "learning_rate": 8.079522447582375e-05, + "loss": 1.4428, + "step": 67800 + }, + { + "epoch": 3.2606737006113775, + "grad_norm": 0.2261655181646347, + "learning_rate": 8.078224512284455e-05, + "loss": 1.4295, + "step": 67810 + }, + { + "epoch": 3.2612156417768894, + "grad_norm": 0.5620535612106323, + "learning_rate": 8.076926257607274e-05, + "loss": 1.4416, + "step": 67820 + }, + { + "epoch": 3.2617575829424017, + "grad_norm": 0.24003808200359344, + "learning_rate": 8.075627683711658e-05, + "loss": 1.4422, + "step": 67830 + }, + { + "epoch": 3.262299524107914, + "grad_norm": 0.2081241011619568, + "learning_rate": 8.074328790758466e-05, + "loss": 1.4396, + "step": 67840 + }, + { + "epoch": 3.2628414652734263, + "grad_norm": 0.22388330101966858, + "learning_rate": 8.073029578908601e-05, + "loss": 1.4407, + "step": 67850 + }, + { + "epoch": 3.2633834064389386, + "grad_norm": 0.27499768137931824, + "learning_rate": 8.071730048323002e-05, + "loss": 1.4343, + "step": 67860 + }, + { + "epoch": 3.2633834064389386, + "eval_loss": 2.4300835132598877, + "eval_runtime": 22.2873, + "eval_samples_per_second": 224.343, + "eval_steps_per_second": 1.211, + "step": 67860 + }, + { + "epoch": 3.2639253476044505, + "grad_norm": 0.17892539501190186, + "learning_rate": 8.070430199162648e-05, + "loss": 1.4281, + "step": 67870 + }, + { + "epoch": 3.264467288769963, + "grad_norm": 0.24364718794822693, + "learning_rate": 8.069130031588562e-05, + "loss": 1.4381, + "step": 67880 + }, + { + "epoch": 3.265009229935475, + "grad_norm": 0.22364751994609833, + "learning_rate": 8.067829545761804e-05, + "loss": 1.4383, + "step": 67890 + }, + { + "epoch": 3.2655511711009875, + "grad_norm": 0.3235653340816498, + "learning_rate": 8.066528741843468e-05, + "loss": 1.4419, + "step": 67900 + }, + { + "epoch": 3.2660931122665, + "grad_norm": 0.2588692605495453, + "learning_rate": 8.065227619994695e-05, + "loss": 1.4372, + "step": 67910 + }, + { + "epoch": 3.2666350534320117, + "grad_norm": 0.22628279030323029, + "learning_rate": 8.063926180376661e-05, + "loss": 1.4345, + "step": 67920 + }, + { + "epoch": 3.267176994597524, + "grad_norm": 0.29548385739326477, + "learning_rate": 8.062624423150584e-05, + "loss": 1.4383, + "step": 67930 + }, + { + "epoch": 3.2677189357630363, + "grad_norm": 0.21970342099666595, + "learning_rate": 8.061322348477717e-05, + "loss": 1.4512, + "step": 67940 + }, + { + "epoch": 3.268098294578895, + "eval_loss": 2.420971393585205, + "eval_runtime": 28.5235, + "eval_samples_per_second": 175.294, + "eval_steps_per_second": 0.947, + "step": 67947 + }, + { + "epoch": 3.2682608769285486, + "grad_norm": 0.19080619513988495, + "learning_rate": 8.06001995651936e-05, + "loss": 1.4366, + "step": 67950 + }, + { + "epoch": 3.2688028180940605, + "grad_norm": 0.21438750624656677, + "learning_rate": 8.058717247436845e-05, + "loss": 1.4318, + "step": 67960 + }, + { + "epoch": 3.269344759259573, + "grad_norm": 0.24113547801971436, + "learning_rate": 8.057414221391545e-05, + "loss": 1.443, + "step": 67970 + }, + { + "epoch": 3.269886700425085, + "grad_norm": 0.22619077563285828, + "learning_rate": 8.056110878544875e-05, + "loss": 1.4333, + "step": 67980 + }, + { + "epoch": 3.2704286415905974, + "grad_norm": 0.192766472697258, + "learning_rate": 8.054807219058287e-05, + "loss": 1.4479, + "step": 67990 + }, + { + "epoch": 3.2709705827561093, + "grad_norm": 0.22505764663219452, + "learning_rate": 8.053503243093275e-05, + "loss": 1.4371, + "step": 68000 + }, + { + "epoch": 3.2715125239216216, + "grad_norm": 0.18467269837856293, + "learning_rate": 8.052198950811364e-05, + "loss": 1.4294, + "step": 68010 + }, + { + "epoch": 3.272054465087134, + "grad_norm": 0.21085874736309052, + "learning_rate": 8.050894342374128e-05, + "loss": 1.4519, + "step": 68020 + }, + { + "epoch": 3.2725964062526463, + "grad_norm": 0.3685001730918884, + "learning_rate": 8.049589417943176e-05, + "loss": 1.4364, + "step": 68030 + }, + { + "epoch": 3.272813182718851, + "eval_loss": 2.4268741607666016, + "eval_runtime": 23.9572, + "eval_samples_per_second": 208.705, + "eval_steps_per_second": 1.127, + "step": 68034 + }, + { + "epoch": 3.2731383474181586, + "grad_norm": 0.21414147317409515, + "learning_rate": 8.048284177680158e-05, + "loss": 1.4309, + "step": 68040 + }, + { + "epoch": 3.2736802885836704, + "grad_norm": 0.23878353834152222, + "learning_rate": 8.046978621746759e-05, + "loss": 1.4443, + "step": 68050 + }, + { + "epoch": 3.2742222297491828, + "grad_norm": 0.18432900309562683, + "learning_rate": 8.045672750304703e-05, + "loss": 1.429, + "step": 68060 + }, + { + "epoch": 3.274764170914695, + "grad_norm": 0.18248611688613892, + "learning_rate": 8.044366563515762e-05, + "loss": 1.4367, + "step": 68070 + }, + { + "epoch": 3.2753061120802074, + "grad_norm": 0.22367575764656067, + "learning_rate": 8.043060061541737e-05, + "loss": 1.4467, + "step": 68080 + }, + { + "epoch": 3.2758480532457197, + "grad_norm": 0.17164652049541473, + "learning_rate": 8.041753244544472e-05, + "loss": 1.4351, + "step": 68090 + }, + { + "epoch": 3.2763899944112316, + "grad_norm": 0.2587142288684845, + "learning_rate": 8.04044611268585e-05, + "loss": 1.4501, + "step": 68100 + }, + { + "epoch": 3.276931935576744, + "grad_norm": 0.21978062391281128, + "learning_rate": 8.039138666127793e-05, + "loss": 1.4343, + "step": 68110 + }, + { + "epoch": 3.277473876742256, + "grad_norm": 0.1821766346693039, + "learning_rate": 8.037830905032264e-05, + "loss": 1.4463, + "step": 68120 + }, + { + "epoch": 3.2775280708588075, + "eval_loss": 2.42914080619812, + "eval_runtime": 25.1974, + "eval_samples_per_second": 198.433, + "eval_steps_per_second": 1.072, + "step": 68121 + }, + { + "epoch": 3.2780158179077685, + "grad_norm": 0.20630845427513123, + "learning_rate": 8.036522829561259e-05, + "loss": 1.4386, + "step": 68130 + }, + { + "epoch": 3.278557759073281, + "grad_norm": 0.18165987730026245, + "learning_rate": 8.035214439876818e-05, + "loss": 1.4445, + "step": 68140 + }, + { + "epoch": 3.2790997002387927, + "grad_norm": 0.2866304814815521, + "learning_rate": 8.03390573614102e-05, + "loss": 1.4429, + "step": 68150 + }, + { + "epoch": 3.279641641404305, + "grad_norm": 0.4309597909450531, + "learning_rate": 8.032596718515982e-05, + "loss": 1.4471, + "step": 68160 + }, + { + "epoch": 3.2801835825698173, + "grad_norm": 0.16174156963825226, + "learning_rate": 8.031287387163854e-05, + "loss": 1.4416, + "step": 68170 + }, + { + "epoch": 3.2807255237353297, + "grad_norm": 0.20739299058914185, + "learning_rate": 8.029977742246837e-05, + "loss": 1.434, + "step": 68180 + }, + { + "epoch": 3.2812674649008415, + "grad_norm": 0.30324587225914, + "learning_rate": 8.02866778392716e-05, + "loss": 1.4353, + "step": 68190 + }, + { + "epoch": 3.281809406066354, + "grad_norm": 0.2629935145378113, + "learning_rate": 8.027357512367097e-05, + "loss": 1.4326, + "step": 68200 + }, + { + "epoch": 3.2822429589987636, + "eval_loss": 2.420156955718994, + "eval_runtime": 25.5157, + "eval_samples_per_second": 195.958, + "eval_steps_per_second": 1.058, + "step": 68208 + }, + { + "epoch": 3.282351347231866, + "grad_norm": 0.23528800904750824, + "learning_rate": 8.026046927728959e-05, + "loss": 1.4362, + "step": 68210 + }, + { + "epoch": 3.2828932883973785, + "grad_norm": 0.18353603780269623, + "learning_rate": 8.024736030175092e-05, + "loss": 1.4437, + "step": 68220 + }, + { + "epoch": 3.2834352295628904, + "grad_norm": 0.3570035994052887, + "learning_rate": 8.02342481986789e-05, + "loss": 1.4461, + "step": 68230 + }, + { + "epoch": 3.2839771707284027, + "grad_norm": 0.43758606910705566, + "learning_rate": 8.022113296969773e-05, + "loss": 1.4389, + "step": 68240 + }, + { + "epoch": 3.284519111893915, + "grad_norm": 0.2364160418510437, + "learning_rate": 8.020801461643214e-05, + "loss": 1.4388, + "step": 68250 + }, + { + "epoch": 3.2850610530594273, + "grad_norm": 0.3303424119949341, + "learning_rate": 8.019489314050715e-05, + "loss": 1.4482, + "step": 68260 + }, + { + "epoch": 3.2856029942249396, + "grad_norm": 0.3738381266593933, + "learning_rate": 8.018176854354815e-05, + "loss": 1.4299, + "step": 68270 + }, + { + "epoch": 3.2861449353904515, + "grad_norm": 0.1708504557609558, + "learning_rate": 8.016864082718102e-05, + "loss": 1.4301, + "step": 68280 + }, + { + "epoch": 3.286686876555964, + "grad_norm": 0.22790126502513885, + "learning_rate": 8.015550999303192e-05, + "loss": 1.4363, + "step": 68290 + }, + { + "epoch": 3.28695784713872, + "eval_loss": 2.4150733947753906, + "eval_runtime": 25.2805, + "eval_samples_per_second": 197.781, + "eval_steps_per_second": 1.068, + "step": 68295 + }, + { + "epoch": 3.287228817721476, + "grad_norm": 0.17331331968307495, + "learning_rate": 8.014237604272744e-05, + "loss": 1.4265, + "step": 68300 + }, + { + "epoch": 3.2877707588869884, + "grad_norm": 0.2437063455581665, + "learning_rate": 8.012923897789461e-05, + "loss": 1.4362, + "step": 68310 + }, + { + "epoch": 3.2883127000525008, + "grad_norm": 0.1904052495956421, + "learning_rate": 8.011609880016074e-05, + "loss": 1.4315, + "step": 68320 + }, + { + "epoch": 3.2888546412180126, + "grad_norm": 0.17929191887378693, + "learning_rate": 8.010295551115358e-05, + "loss": 1.4352, + "step": 68330 + }, + { + "epoch": 3.289396582383525, + "grad_norm": 0.24011602997779846, + "learning_rate": 8.00898091125013e-05, + "loss": 1.4416, + "step": 68340 + }, + { + "epoch": 3.2899385235490373, + "grad_norm": 0.26743006706237793, + "learning_rate": 8.007665960583237e-05, + "loss": 1.4295, + "step": 68350 + }, + { + "epoch": 3.2904804647145496, + "grad_norm": 0.21712139248847961, + "learning_rate": 8.006350699277573e-05, + "loss": 1.4409, + "step": 68360 + }, + { + "epoch": 3.2910224058800615, + "grad_norm": 0.19347068667411804, + "learning_rate": 8.005035127496068e-05, + "loss": 1.4342, + "step": 68370 + }, + { + "epoch": 3.2915643470455738, + "grad_norm": 0.2079339623451233, + "learning_rate": 8.003719245401684e-05, + "loss": 1.4394, + "step": 68380 + }, + { + "epoch": 3.2916727352786763, + "eval_loss": 2.4259541034698486, + "eval_runtime": 29.0409, + "eval_samples_per_second": 172.171, + "eval_steps_per_second": 0.93, + "step": 68382 + }, + { + "epoch": 3.292106288211086, + "grad_norm": 0.17019985616207123, + "learning_rate": 8.002403053157432e-05, + "loss": 1.4358, + "step": 68390 + }, + { + "epoch": 3.2926482293765984, + "grad_norm": 0.18729358911514282, + "learning_rate": 8.001086550926354e-05, + "loss": 1.431, + "step": 68400 + }, + { + "epoch": 3.2931901705421103, + "grad_norm": 0.3095646798610687, + "learning_rate": 7.999769738871533e-05, + "loss": 1.4412, + "step": 68410 + }, + { + "epoch": 3.2937321117076226, + "grad_norm": 0.17975299060344696, + "learning_rate": 7.998452617156088e-05, + "loss": 1.4424, + "step": 68420 + }, + { + "epoch": 3.294274052873135, + "grad_norm": 0.15972350537776947, + "learning_rate": 7.997135185943182e-05, + "loss": 1.4329, + "step": 68430 + }, + { + "epoch": 3.2948159940386472, + "grad_norm": 0.19751180708408356, + "learning_rate": 7.995817445396009e-05, + "loss": 1.437, + "step": 68440 + }, + { + "epoch": 3.2953579352041595, + "grad_norm": 0.17910368740558624, + "learning_rate": 7.994499395677807e-05, + "loss": 1.4297, + "step": 68450 + }, + { + "epoch": 3.2958998763696714, + "grad_norm": 0.24977631866931915, + "learning_rate": 7.99318103695185e-05, + "loss": 1.4352, + "step": 68460 + }, + { + "epoch": 3.2963876234186325, + "eval_loss": 2.4190785884857178, + "eval_runtime": 26.6502, + "eval_samples_per_second": 187.616, + "eval_steps_per_second": 1.013, + "step": 68469 + }, + { + "epoch": 3.2964418175351837, + "grad_norm": 0.2186552733182907, + "learning_rate": 7.99186236938145e-05, + "loss": 1.4364, + "step": 68470 + }, + { + "epoch": 3.296983758700696, + "grad_norm": 0.29373136162757874, + "learning_rate": 7.990543393129959e-05, + "loss": 1.4334, + "step": 68480 + }, + { + "epoch": 3.2975256998662084, + "grad_norm": 0.17339174449443817, + "learning_rate": 7.989224108360765e-05, + "loss": 1.4425, + "step": 68490 + }, + { + "epoch": 3.2980676410317207, + "grad_norm": 0.16692684590816498, + "learning_rate": 7.987904515237297e-05, + "loss": 1.445, + "step": 68500 + }, + { + "epoch": 3.2986095821972325, + "grad_norm": 0.17328135669231415, + "learning_rate": 7.986584613923017e-05, + "loss": 1.4348, + "step": 68510 + }, + { + "epoch": 3.299151523362745, + "grad_norm": 0.2851394712924957, + "learning_rate": 7.985264404581431e-05, + "loss": 1.45, + "step": 68520 + }, + { + "epoch": 3.299693464528257, + "grad_norm": 0.18991175293922424, + "learning_rate": 7.983943887376083e-05, + "loss": 1.4392, + "step": 68530 + }, + { + "epoch": 3.3002354056937695, + "grad_norm": 0.18527284264564514, + "learning_rate": 7.982623062470547e-05, + "loss": 1.4445, + "step": 68540 + }, + { + "epoch": 3.300777346859282, + "grad_norm": 0.1802045851945877, + "learning_rate": 7.981301930028446e-05, + "loss": 1.4487, + "step": 68550 + }, + { + "epoch": 3.301102511558589, + "eval_loss": 2.417846441268921, + "eval_runtime": 26.4831, + "eval_samples_per_second": 188.799, + "eval_steps_per_second": 1.02, + "step": 68556 + }, + { + "epoch": 3.3013192880247937, + "grad_norm": 0.15754951536655426, + "learning_rate": 7.979980490213435e-05, + "loss": 1.4363, + "step": 68560 + }, + { + "epoch": 3.301861229190306, + "grad_norm": 0.31062695384025574, + "learning_rate": 7.978658743189205e-05, + "loss": 1.4359, + "step": 68570 + }, + { + "epoch": 3.3024031703558183, + "grad_norm": 0.2875862717628479, + "learning_rate": 7.977336689119495e-05, + "loss": 1.4459, + "step": 68580 + }, + { + "epoch": 3.3029451115213306, + "grad_norm": 0.18160034716129303, + "learning_rate": 7.97601432816807e-05, + "loss": 1.4378, + "step": 68590 + }, + { + "epoch": 3.3034870526868425, + "grad_norm": 0.24987049400806427, + "learning_rate": 7.97469166049874e-05, + "loss": 1.4372, + "step": 68600 + }, + { + "epoch": 3.304028993852355, + "grad_norm": 0.2157442718744278, + "learning_rate": 7.973368686275353e-05, + "loss": 1.4379, + "step": 68610 + }, + { + "epoch": 3.304570935017867, + "grad_norm": 0.20178145170211792, + "learning_rate": 7.972045405661788e-05, + "loss": 1.4259, + "step": 68620 + }, + { + "epoch": 3.3051128761833795, + "grad_norm": 0.24957998096942902, + "learning_rate": 7.970721818821972e-05, + "loss": 1.4474, + "step": 68630 + }, + { + "epoch": 3.3056548173488913, + "grad_norm": 0.16413570940494537, + "learning_rate": 7.969397925919863e-05, + "loss": 1.434, + "step": 68640 + }, + { + "epoch": 3.305817399698545, + "eval_loss": 2.419210433959961, + "eval_runtime": 24.5355, + "eval_samples_per_second": 203.786, + "eval_steps_per_second": 1.1, + "step": 68643 + }, + { + "epoch": 3.3061967585144036, + "grad_norm": 0.31925585865974426, + "learning_rate": 7.968073727119461e-05, + "loss": 1.4409, + "step": 68650 + }, + { + "epoch": 3.306738699679916, + "grad_norm": 0.193577840924263, + "learning_rate": 7.966749222584802e-05, + "loss": 1.4379, + "step": 68660 + }, + { + "epoch": 3.3072806408454283, + "grad_norm": 0.3227108418941498, + "learning_rate": 7.965424412479958e-05, + "loss": 1.4412, + "step": 68670 + }, + { + "epoch": 3.3078225820109406, + "grad_norm": 0.2504270076751709, + "learning_rate": 7.964099296969042e-05, + "loss": 1.4256, + "step": 68680 + }, + { + "epoch": 3.3083645231764525, + "grad_norm": 0.23025809228420258, + "learning_rate": 7.962773876216202e-05, + "loss": 1.4289, + "step": 68690 + }, + { + "epoch": 3.308906464341965, + "grad_norm": 0.17569345235824585, + "learning_rate": 7.961448150385628e-05, + "loss": 1.4433, + "step": 68700 + }, + { + "epoch": 3.309448405507477, + "grad_norm": 0.24907849729061127, + "learning_rate": 7.960122119641542e-05, + "loss": 1.443, + "step": 68710 + }, + { + "epoch": 3.3099903466729894, + "grad_norm": 0.30965548753738403, + "learning_rate": 7.958795784148208e-05, + "loss": 1.4377, + "step": 68720 + }, + { + "epoch": 3.3105322878385017, + "grad_norm": 0.30555593967437744, + "learning_rate": 7.95746914406993e-05, + "loss": 1.441, + "step": 68730 + }, + { + "epoch": 3.3105322878385017, + "eval_loss": 2.420576572418213, + "eval_runtime": 21.9481, + "eval_samples_per_second": 227.81, + "eval_steps_per_second": 1.23, + "step": 68730 + }, + { + "epoch": 3.3110742290040136, + "grad_norm": 0.2148512899875641, + "learning_rate": 7.956142199571042e-05, + "loss": 1.4404, + "step": 68740 + }, + { + "epoch": 3.311616170169526, + "grad_norm": 0.3420555591583252, + "learning_rate": 7.954814950815922e-05, + "loss": 1.4432, + "step": 68750 + }, + { + "epoch": 3.3121581113350382, + "grad_norm": 0.22187262773513794, + "learning_rate": 7.953487397968984e-05, + "loss": 1.4425, + "step": 68760 + }, + { + "epoch": 3.3127000525005506, + "grad_norm": 0.2587040662765503, + "learning_rate": 7.95215954119468e-05, + "loss": 1.451, + "step": 68770 + }, + { + "epoch": 3.313241993666063, + "grad_norm": 0.20211346447467804, + "learning_rate": 7.950831380657496e-05, + "loss": 1.4466, + "step": 68780 + }, + { + "epoch": 3.3137839348315747, + "grad_norm": 0.2853049337863922, + "learning_rate": 7.949502916521962e-05, + "loss": 1.4423, + "step": 68790 + }, + { + "epoch": 3.314325875997087, + "grad_norm": 0.17778904736042023, + "learning_rate": 7.948174148952642e-05, + "loss": 1.4383, + "step": 68800 + }, + { + "epoch": 3.3148678171625994, + "grad_norm": 0.30810484290122986, + "learning_rate": 7.946845078114137e-05, + "loss": 1.4441, + "step": 68810 + }, + { + "epoch": 3.315247175978458, + "eval_loss": 2.4207398891448975, + "eval_runtime": 32.3649, + "eval_samples_per_second": 154.488, + "eval_steps_per_second": 0.834, + "step": 68817 + }, + { + "epoch": 3.3154097583281117, + "grad_norm": 0.19365157186985016, + "learning_rate": 7.945515704171088e-05, + "loss": 1.4428, + "step": 68820 + }, + { + "epoch": 3.3159516994936236, + "grad_norm": 0.17585448920726776, + "learning_rate": 7.944186027288169e-05, + "loss": 1.4359, + "step": 68830 + }, + { + "epoch": 3.316493640659136, + "grad_norm": 0.1955270618200302, + "learning_rate": 7.942856047630098e-05, + "loss": 1.4375, + "step": 68840 + }, + { + "epoch": 3.317035581824648, + "grad_norm": 0.17522399127483368, + "learning_rate": 7.941525765361624e-05, + "loss": 1.4442, + "step": 68850 + }, + { + "epoch": 3.3175775229901605, + "grad_norm": 0.20333018898963928, + "learning_rate": 7.940195180647539e-05, + "loss": 1.4478, + "step": 68860 + }, + { + "epoch": 3.3181194641556724, + "grad_norm": 0.22244490683078766, + "learning_rate": 7.93886429365267e-05, + "loss": 1.428, + "step": 68870 + }, + { + "epoch": 3.3186614053211847, + "grad_norm": 0.22800850868225098, + "learning_rate": 7.93753310454188e-05, + "loss": 1.4382, + "step": 68880 + }, + { + "epoch": 3.319203346486697, + "grad_norm": 0.3180639147758484, + "learning_rate": 7.93620161348007e-05, + "loss": 1.4318, + "step": 68890 + }, + { + "epoch": 3.3197452876522093, + "grad_norm": 0.4034172594547272, + "learning_rate": 7.934869820632183e-05, + "loss": 1.4457, + "step": 68900 + }, + { + "epoch": 3.319962064118414, + "eval_loss": 2.4232521057128906, + "eval_runtime": 24.2726, + "eval_samples_per_second": 205.994, + "eval_steps_per_second": 1.112, + "step": 68904 + }, + { + "epoch": 3.3202872288177216, + "grad_norm": 0.32735925912857056, + "learning_rate": 7.933537726163195e-05, + "loss": 1.4403, + "step": 68910 + }, + { + "epoch": 3.3208291699832335, + "grad_norm": 0.2666130065917969, + "learning_rate": 7.932205330238118e-05, + "loss": 1.4348, + "step": 68920 + }, + { + "epoch": 3.321371111148746, + "grad_norm": 0.20888814330101013, + "learning_rate": 7.930872633022006e-05, + "loss": 1.4446, + "step": 68930 + }, + { + "epoch": 3.321913052314258, + "grad_norm": 0.26774919033050537, + "learning_rate": 7.929539634679941e-05, + "loss": 1.4374, + "step": 68940 + }, + { + "epoch": 3.3224549934797705, + "grad_norm": 0.18306075036525726, + "learning_rate": 7.928206335377057e-05, + "loss": 1.4408, + "step": 68950 + }, + { + "epoch": 3.322996934645283, + "grad_norm": 0.2182762324810028, + "learning_rate": 7.926872735278514e-05, + "loss": 1.439, + "step": 68960 + }, + { + "epoch": 3.3235388758107947, + "grad_norm": 0.23973847925662994, + "learning_rate": 7.925538834549514e-05, + "loss": 1.4403, + "step": 68970 + }, + { + "epoch": 3.324080816976307, + "grad_norm": 0.2488727569580078, + "learning_rate": 7.924204633355295e-05, + "loss": 1.4386, + "step": 68980 + }, + { + "epoch": 3.3246227581418193, + "grad_norm": 0.2777588963508606, + "learning_rate": 7.922870131861127e-05, + "loss": 1.4477, + "step": 68990 + }, + { + "epoch": 3.3246769522583706, + "eval_loss": 2.42183518409729, + "eval_runtime": 21.8322, + "eval_samples_per_second": 229.02, + "eval_steps_per_second": 1.237, + "step": 68991 + }, + { + "epoch": 3.3251646993073316, + "grad_norm": 0.18954722583293915, + "learning_rate": 7.921535330232328e-05, + "loss": 1.4356, + "step": 69000 + }, + { + "epoch": 3.3257066404728435, + "grad_norm": 0.2268332540988922, + "learning_rate": 7.920200228634245e-05, + "loss": 1.4413, + "step": 69010 + }, + { + "epoch": 3.326248581638356, + "grad_norm": 0.21182286739349365, + "learning_rate": 7.918864827232268e-05, + "loss": 1.4295, + "step": 69020 + }, + { + "epoch": 3.326790522803868, + "grad_norm": 0.16312585771083832, + "learning_rate": 7.917529126191815e-05, + "loss": 1.4398, + "step": 69030 + }, + { + "epoch": 3.3273324639693804, + "grad_norm": 0.2508258819580078, + "learning_rate": 7.916193125678349e-05, + "loss": 1.431, + "step": 69040 + }, + { + "epoch": 3.3278744051348923, + "grad_norm": 0.3282955586910248, + "learning_rate": 7.914856825857371e-05, + "loss": 1.4384, + "step": 69050 + }, + { + "epoch": 3.3284163463004046, + "grad_norm": 0.20805367827415466, + "learning_rate": 7.913520226894413e-05, + "loss": 1.4365, + "step": 69060 + }, + { + "epoch": 3.328958287465917, + "grad_norm": 0.32144641876220703, + "learning_rate": 7.912183328955047e-05, + "loss": 1.439, + "step": 69070 + }, + { + "epoch": 3.3293918403983267, + "eval_loss": 2.420147657394409, + "eval_runtime": 23.884, + "eval_samples_per_second": 209.345, + "eval_steps_per_second": 1.13, + "step": 69078 + }, + { + "epoch": 3.3295002286314292, + "grad_norm": 0.27090176939964294, + "learning_rate": 7.910846132204883e-05, + "loss": 1.4386, + "step": 69080 + }, + { + "epoch": 3.3300421697969416, + "grad_norm": 0.25403621792793274, + "learning_rate": 7.909508636809567e-05, + "loss": 1.4382, + "step": 69090 + }, + { + "epoch": 3.3305841109624534, + "grad_norm": 0.23677849769592285, + "learning_rate": 7.908170842934783e-05, + "loss": 1.4414, + "step": 69100 + }, + { + "epoch": 3.3311260521279658, + "grad_norm": 0.2112101912498474, + "learning_rate": 7.90683275074625e-05, + "loss": 1.4353, + "step": 69110 + }, + { + "epoch": 3.331667993293478, + "grad_norm": 0.18061186373233795, + "learning_rate": 7.905494360409725e-05, + "loss": 1.4271, + "step": 69120 + }, + { + "epoch": 3.3322099344589904, + "grad_norm": 0.34150585532188416, + "learning_rate": 7.904155672091002e-05, + "loss": 1.4275, + "step": 69130 + }, + { + "epoch": 3.3327518756245027, + "grad_norm": 0.20496203005313873, + "learning_rate": 7.902816685955912e-05, + "loss": 1.4328, + "step": 69140 + }, + { + "epoch": 3.3332938167900146, + "grad_norm": 0.43717578053474426, + "learning_rate": 7.901477402170323e-05, + "loss": 1.4455, + "step": 69150 + }, + { + "epoch": 3.333835757955527, + "grad_norm": 0.29195889830589294, + "learning_rate": 7.90013782090014e-05, + "loss": 1.4471, + "step": 69160 + }, + { + "epoch": 3.3341067285382833, + "eval_loss": 2.4220991134643555, + "eval_runtime": 23.5219, + "eval_samples_per_second": 212.568, + "eval_steps_per_second": 1.148, + "step": 69165 + }, + { + "epoch": 3.334377699121039, + "grad_norm": 0.27426832914352417, + "learning_rate": 7.898797942311304e-05, + "loss": 1.4368, + "step": 69170 + }, + { + "epoch": 3.3349196402865515, + "grad_norm": 0.21939682960510254, + "learning_rate": 7.897457766569793e-05, + "loss": 1.4365, + "step": 69180 + }, + { + "epoch": 3.335461581452064, + "grad_norm": 0.20194536447525024, + "learning_rate": 7.896117293841622e-05, + "loss": 1.4327, + "step": 69190 + }, + { + "epoch": 3.3360035226175757, + "grad_norm": 0.22306720912456512, + "learning_rate": 7.894776524292845e-05, + "loss": 1.4425, + "step": 69200 + }, + { + "epoch": 3.336545463783088, + "grad_norm": 0.547161877155304, + "learning_rate": 7.893435458089549e-05, + "loss": 1.4359, + "step": 69210 + }, + { + "epoch": 3.3370874049486003, + "grad_norm": 0.17057354748249054, + "learning_rate": 7.89209409539786e-05, + "loss": 1.4307, + "step": 69220 + }, + { + "epoch": 3.3376293461141127, + "grad_norm": 0.4212014973163605, + "learning_rate": 7.890752436383939e-05, + "loss": 1.4284, + "step": 69230 + }, + { + "epoch": 3.3381712872796245, + "grad_norm": 0.272225558757782, + "learning_rate": 7.889410481213986e-05, + "loss": 1.4391, + "step": 69240 + }, + { + "epoch": 3.338713228445137, + "grad_norm": 0.2661522626876831, + "learning_rate": 7.888068230054236e-05, + "loss": 1.4344, + "step": 69250 + }, + { + "epoch": 3.3388216166782394, + "eval_loss": 2.4209210872650146, + "eval_runtime": 23.9348, + "eval_samples_per_second": 208.901, + "eval_steps_per_second": 1.128, + "step": 69252 + }, + { + "epoch": 3.339255169610649, + "grad_norm": 0.21756187081336975, + "learning_rate": 7.886725683070963e-05, + "loss": 1.4455, + "step": 69260 + }, + { + "epoch": 3.3397971107761615, + "grad_norm": 0.1777390092611313, + "learning_rate": 7.885382840430475e-05, + "loss": 1.4404, + "step": 69270 + }, + { + "epoch": 3.3403390519416734, + "grad_norm": 0.21643613278865814, + "learning_rate": 7.884039702299113e-05, + "loss": 1.4348, + "step": 69280 + }, + { + "epoch": 3.3408809931071857, + "grad_norm": 0.21973682940006256, + "learning_rate": 7.882696268843267e-05, + "loss": 1.4467, + "step": 69290 + }, + { + "epoch": 3.341422934272698, + "grad_norm": 0.2839016020298004, + "learning_rate": 7.881352540229351e-05, + "loss": 1.4296, + "step": 69300 + }, + { + "epoch": 3.3419648754382103, + "grad_norm": 0.17551878094673157, + "learning_rate": 7.880008516623822e-05, + "loss": 1.4323, + "step": 69310 + }, + { + "epoch": 3.3425068166037226, + "grad_norm": 0.19211049377918243, + "learning_rate": 7.878664198193169e-05, + "loss": 1.436, + "step": 69320 + }, + { + "epoch": 3.3430487577692345, + "grad_norm": 0.2430991530418396, + "learning_rate": 7.877319585103922e-05, + "loss": 1.4405, + "step": 69330 + }, + { + "epoch": 3.3435365048181955, + "eval_loss": 2.417872190475464, + "eval_runtime": 28.3658, + "eval_samples_per_second": 176.269, + "eval_steps_per_second": 0.952, + "step": 69339 + }, + { + "epoch": 3.343590698934747, + "grad_norm": 0.2997911274433136, + "learning_rate": 7.875974677522648e-05, + "loss": 1.4357, + "step": 69340 + }, + { + "epoch": 3.344132640100259, + "grad_norm": 0.18667905032634735, + "learning_rate": 7.874629475615946e-05, + "loss": 1.4338, + "step": 69350 + }, + { + "epoch": 3.3446745812657714, + "grad_norm": 0.20530866086483002, + "learning_rate": 7.873283979550452e-05, + "loss": 1.4477, + "step": 69360 + }, + { + "epoch": 3.3452165224312838, + "grad_norm": 0.16726155579090118, + "learning_rate": 7.871938189492844e-05, + "loss": 1.4393, + "step": 69370 + }, + { + "epoch": 3.3457584635967956, + "grad_norm": 0.16196750104427338, + "learning_rate": 7.870592105609832e-05, + "loss": 1.4274, + "step": 69380 + }, + { + "epoch": 3.346300404762308, + "grad_norm": 0.18552835285663605, + "learning_rate": 7.86924572806816e-05, + "loss": 1.4396, + "step": 69390 + }, + { + "epoch": 3.3468423459278203, + "grad_norm": 0.19698499143123627, + "learning_rate": 7.867899057034616e-05, + "loss": 1.4374, + "step": 69400 + }, + { + "epoch": 3.3473842870933326, + "grad_norm": 0.23695054650306702, + "learning_rate": 7.866552092676015e-05, + "loss": 1.4411, + "step": 69410 + }, + { + "epoch": 3.347926228258845, + "grad_norm": 0.2107638120651245, + "learning_rate": 7.865204835159217e-05, + "loss": 1.4317, + "step": 69420 + }, + { + "epoch": 3.348251392958152, + "eval_loss": 2.422180652618408, + "eval_runtime": 29.2437, + "eval_samples_per_second": 170.977, + "eval_steps_per_second": 0.923, + "step": 69426 + }, + { + "epoch": 3.3484681694243568, + "grad_norm": 0.24402372539043427, + "learning_rate": 7.863857284651111e-05, + "loss": 1.4371, + "step": 69430 + }, + { + "epoch": 3.349010110589869, + "grad_norm": 0.2646619379520416, + "learning_rate": 7.862509441318627e-05, + "loss": 1.4186, + "step": 69440 + }, + { + "epoch": 3.3495520517553814, + "grad_norm": 0.3436126708984375, + "learning_rate": 7.861161305328733e-05, + "loss": 1.4437, + "step": 69450 + }, + { + "epoch": 3.3500939929208937, + "grad_norm": 0.1728588491678238, + "learning_rate": 7.859812876848426e-05, + "loss": 1.4393, + "step": 69460 + }, + { + "epoch": 3.3506359340864056, + "grad_norm": 0.2518586814403534, + "learning_rate": 7.858464156044745e-05, + "loss": 1.4365, + "step": 69470 + }, + { + "epoch": 3.351177875251918, + "grad_norm": 0.2023596614599228, + "learning_rate": 7.857115143084763e-05, + "loss": 1.4384, + "step": 69480 + }, + { + "epoch": 3.35171981641743, + "grad_norm": 0.22805948555469513, + "learning_rate": 7.855765838135592e-05, + "loss": 1.4287, + "step": 69490 + }, + { + "epoch": 3.3522617575829425, + "grad_norm": 0.18878723680973053, + "learning_rate": 7.854416241364376e-05, + "loss": 1.4451, + "step": 69500 + }, + { + "epoch": 3.3528036987484544, + "grad_norm": 0.22859229147434235, + "learning_rate": 7.8530663529383e-05, + "loss": 1.4299, + "step": 69510 + }, + { + "epoch": 3.3529662810981082, + "eval_loss": 2.420332908630371, + "eval_runtime": 22.0276, + "eval_samples_per_second": 226.988, + "eval_steps_per_second": 1.226, + "step": 69513 + }, + { + "epoch": 3.3533456399139667, + "grad_norm": 0.23224987089633942, + "learning_rate": 7.851716173024578e-05, + "loss": 1.4408, + "step": 69520 + }, + { + "epoch": 3.353887581079479, + "grad_norm": 0.2047203630208969, + "learning_rate": 7.850365701790466e-05, + "loss": 1.4358, + "step": 69530 + }, + { + "epoch": 3.3544295222449914, + "grad_norm": 0.40388110280036926, + "learning_rate": 7.849014939403256e-05, + "loss": 1.43, + "step": 69540 + }, + { + "epoch": 3.3549714634105037, + "grad_norm": 0.21261848509311676, + "learning_rate": 7.847663886030274e-05, + "loss": 1.4404, + "step": 69550 + }, + { + "epoch": 3.3555134045760155, + "grad_norm": 0.2009463608264923, + "learning_rate": 7.846312541838883e-05, + "loss": 1.4355, + "step": 69560 + }, + { + "epoch": 3.356055345741528, + "grad_norm": 0.22957339882850647, + "learning_rate": 7.844960906996481e-05, + "loss": 1.4421, + "step": 69570 + }, + { + "epoch": 3.35659728690704, + "grad_norm": 0.2667635679244995, + "learning_rate": 7.843608981670501e-05, + "loss": 1.4367, + "step": 69580 + }, + { + "epoch": 3.3571392280725525, + "grad_norm": 0.19165495038032532, + "learning_rate": 7.842256766028416e-05, + "loss": 1.4375, + "step": 69590 + }, + { + "epoch": 3.357681169238065, + "grad_norm": 0.2126622498035431, + "learning_rate": 7.840904260237732e-05, + "loss": 1.4342, + "step": 69600 + }, + { + "epoch": 3.357681169238065, + "eval_loss": 2.420774459838867, + "eval_runtime": 21.9323, + "eval_samples_per_second": 227.974, + "eval_steps_per_second": 1.231, + "step": 69600 + }, + { + "epoch": 3.3582231104035767, + "grad_norm": 0.21864204108715057, + "learning_rate": 7.839551464465992e-05, + "loss": 1.4502, + "step": 69610 + }, + { + "epoch": 3.358765051569089, + "grad_norm": 0.1772039234638214, + "learning_rate": 7.838198378880772e-05, + "loss": 1.4457, + "step": 69620 + }, + { + "epoch": 3.3593069927346013, + "grad_norm": 0.16376861929893494, + "learning_rate": 7.83684500364969e-05, + "loss": 1.4307, + "step": 69630 + }, + { + "epoch": 3.3598489339001136, + "grad_norm": 0.19408194720745087, + "learning_rate": 7.835491338940395e-05, + "loss": 1.4434, + "step": 69640 + }, + { + "epoch": 3.3603908750656255, + "grad_norm": 0.18771915137767792, + "learning_rate": 7.834137384920572e-05, + "loss": 1.4245, + "step": 69650 + }, + { + "epoch": 3.360932816231138, + "grad_norm": 0.1942000687122345, + "learning_rate": 7.832783141757943e-05, + "loss": 1.4336, + "step": 69660 + }, + { + "epoch": 3.36147475739665, + "grad_norm": 0.22460921108722687, + "learning_rate": 7.831428609620267e-05, + "loss": 1.4373, + "step": 69670 + }, + { + "epoch": 3.3620166985621625, + "grad_norm": 0.182665154337883, + "learning_rate": 7.830073788675336e-05, + "loss": 1.439, + "step": 69680 + }, + { + "epoch": 3.362396057378021, + "eval_loss": 2.4225962162017822, + "eval_runtime": 26.9741, + "eval_samples_per_second": 185.363, + "eval_steps_per_second": 1.001, + "step": 69687 + }, + { + "epoch": 3.3625586397276743, + "grad_norm": 0.18876463174819946, + "learning_rate": 7.828718679090981e-05, + "loss": 1.4367, + "step": 69690 + }, + { + "epoch": 3.3631005808931866, + "grad_norm": 0.2040254920721054, + "learning_rate": 7.827363281035067e-05, + "loss": 1.4358, + "step": 69700 + }, + { + "epoch": 3.363642522058699, + "grad_norm": 0.375626802444458, + "learning_rate": 7.826007594675493e-05, + "loss": 1.436, + "step": 69710 + }, + { + "epoch": 3.3641844632242113, + "grad_norm": 0.1845981627702713, + "learning_rate": 7.824651620180196e-05, + "loss": 1.4379, + "step": 69720 + }, + { + "epoch": 3.3647264043897236, + "grad_norm": 0.1836758852005005, + "learning_rate": 7.82329535771715e-05, + "loss": 1.4381, + "step": 69730 + }, + { + "epoch": 3.3652683455552355, + "grad_norm": 0.2422439604997635, + "learning_rate": 7.821938807454361e-05, + "loss": 1.4424, + "step": 69740 + }, + { + "epoch": 3.365810286720748, + "grad_norm": 0.2933938205242157, + "learning_rate": 7.820581969559877e-05, + "loss": 1.4292, + "step": 69750 + }, + { + "epoch": 3.36635222788626, + "grad_norm": 0.4664517343044281, + "learning_rate": 7.819224844201769e-05, + "loss": 1.4266, + "step": 69760 + }, + { + "epoch": 3.3668941690517724, + "grad_norm": 0.211371049284935, + "learning_rate": 7.817867431548158e-05, + "loss": 1.4308, + "step": 69770 + }, + { + "epoch": 3.367110945517977, + "eval_loss": 2.440751791000366, + "eval_runtime": 22.2008, + "eval_samples_per_second": 225.217, + "eval_steps_per_second": 1.216, + "step": 69774 + }, + { + "epoch": 3.3674361102172847, + "grad_norm": 0.39874711632728577, + "learning_rate": 7.816509731767191e-05, + "loss": 1.4285, + "step": 69780 + }, + { + "epoch": 3.3679780513827966, + "grad_norm": 0.23130638897418976, + "learning_rate": 7.815151745027058e-05, + "loss": 1.4322, + "step": 69790 + }, + { + "epoch": 3.368519992548309, + "grad_norm": 0.18918637931346893, + "learning_rate": 7.81379347149598e-05, + "loss": 1.4355, + "step": 69800 + }, + { + "epoch": 3.3690619337138212, + "grad_norm": 0.27294501662254333, + "learning_rate": 7.812434911342209e-05, + "loss": 1.4453, + "step": 69810 + }, + { + "epoch": 3.3696038748793335, + "grad_norm": 0.19342243671417236, + "learning_rate": 7.811076064734043e-05, + "loss": 1.4399, + "step": 69820 + }, + { + "epoch": 3.370145816044846, + "grad_norm": 0.2188844233751297, + "learning_rate": 7.809716931839804e-05, + "loss": 1.4368, + "step": 69830 + }, + { + "epoch": 3.3706877572103577, + "grad_norm": 0.19374194741249084, + "learning_rate": 7.808357512827862e-05, + "loss": 1.4304, + "step": 69840 + }, + { + "epoch": 3.37122969837587, + "grad_norm": 0.2000492513179779, + "learning_rate": 7.806997807866614e-05, + "loss": 1.426, + "step": 69850 + }, + { + "epoch": 3.3717716395413824, + "grad_norm": 0.2283279299736023, + "learning_rate": 7.805637817124493e-05, + "loss": 1.4398, + "step": 69860 + }, + { + "epoch": 3.3718258336579336, + "eval_loss": 2.4315900802612305, + "eval_runtime": 24.6052, + "eval_samples_per_second": 203.209, + "eval_steps_per_second": 1.097, + "step": 69861 + }, + { + "epoch": 3.3723135807068947, + "grad_norm": 0.21723055839538574, + "learning_rate": 7.804277540769967e-05, + "loss": 1.4309, + "step": 69870 + }, + { + "epoch": 3.3728555218724066, + "grad_norm": 0.19172869622707367, + "learning_rate": 7.802916978971546e-05, + "loss": 1.4248, + "step": 69880 + }, + { + "epoch": 3.373397463037919, + "grad_norm": 0.16879266500473022, + "learning_rate": 7.801556131897769e-05, + "loss": 1.4338, + "step": 69890 + }, + { + "epoch": 3.373939404203431, + "grad_norm": 0.3514922857284546, + "learning_rate": 7.800194999717207e-05, + "loss": 1.4281, + "step": 69900 + }, + { + "epoch": 3.3744813453689435, + "grad_norm": 0.28702405095100403, + "learning_rate": 7.798833582598476e-05, + "loss": 1.4389, + "step": 69910 + }, + { + "epoch": 3.3750232865344554, + "grad_norm": 0.2494763731956482, + "learning_rate": 7.797471880710223e-05, + "loss": 1.4392, + "step": 69920 + }, + { + "epoch": 3.3755652276999677, + "grad_norm": 0.2236117422580719, + "learning_rate": 7.796109894221127e-05, + "loss": 1.4325, + "step": 69930 + }, + { + "epoch": 3.37610716886548, + "grad_norm": 0.1991259753704071, + "learning_rate": 7.794747623299906e-05, + "loss": 1.4294, + "step": 69940 + }, + { + "epoch": 3.3765407217978898, + "eval_loss": 2.4242312908172607, + "eval_runtime": 22.1933, + "eval_samples_per_second": 225.294, + "eval_steps_per_second": 1.217, + "step": 69948 + }, + { + "epoch": 3.3766491100309923, + "grad_norm": 0.3077497184276581, + "learning_rate": 7.793385068115312e-05, + "loss": 1.4422, + "step": 69950 + }, + { + "epoch": 3.3771910511965046, + "grad_norm": 0.17531952261924744, + "learning_rate": 7.792022228836133e-05, + "loss": 1.4326, + "step": 69960 + }, + { + "epoch": 3.3777329923620165, + "grad_norm": 0.1661709100008011, + "learning_rate": 7.790659105631192e-05, + "loss": 1.4322, + "step": 69970 + }, + { + "epoch": 3.378274933527529, + "grad_norm": 0.21228908002376556, + "learning_rate": 7.789295698669345e-05, + "loss": 1.4374, + "step": 69980 + }, + { + "epoch": 3.378816874693041, + "grad_norm": 0.22769136726856232, + "learning_rate": 7.787932008119487e-05, + "loss": 1.4357, + "step": 69990 + }, + { + "epoch": 3.3793588158585535, + "grad_norm": 0.16828486323356628, + "learning_rate": 7.786568034150545e-05, + "loss": 1.4477, + "step": 70000 + }, + { + "epoch": 3.379900757024066, + "grad_norm": 0.32244718074798584, + "learning_rate": 7.785203776931482e-05, + "loss": 1.4328, + "step": 70010 + }, + { + "epoch": 3.3804426981895777, + "grad_norm": 0.2961633801460266, + "learning_rate": 7.783839236631294e-05, + "loss": 1.4321, + "step": 70020 + }, + { + "epoch": 3.38098463935509, + "grad_norm": 0.1998753696680069, + "learning_rate": 7.78247441341902e-05, + "loss": 1.4415, + "step": 70030 + }, + { + "epoch": 3.3812556099378464, + "eval_loss": 2.421795606613159, + "eval_runtime": 24.1321, + "eval_samples_per_second": 207.193, + "eval_steps_per_second": 1.119, + "step": 70035 + }, + { + "epoch": 3.3815265805206023, + "grad_norm": 0.16955262422561646, + "learning_rate": 7.781109307463725e-05, + "loss": 1.4366, + "step": 70040 + }, + { + "epoch": 3.3820685216861146, + "grad_norm": 0.18801350891590118, + "learning_rate": 7.77974391893451e-05, + "loss": 1.4322, + "step": 70050 + }, + { + "epoch": 3.3826104628516265, + "grad_norm": 0.1863052397966385, + "learning_rate": 7.778378248000517e-05, + "loss": 1.4334, + "step": 70060 + }, + { + "epoch": 3.383152404017139, + "grad_norm": 0.3966461420059204, + "learning_rate": 7.77701229483092e-05, + "loss": 1.4332, + "step": 70070 + }, + { + "epoch": 3.383694345182651, + "grad_norm": 0.2498437464237213, + "learning_rate": 7.775646059594924e-05, + "loss": 1.4432, + "step": 70080 + }, + { + "epoch": 3.3842362863481634, + "grad_norm": 0.22239406406879425, + "learning_rate": 7.774279542461776e-05, + "loss": 1.4272, + "step": 70090 + }, + { + "epoch": 3.3847782275136753, + "grad_norm": 0.26137417554855347, + "learning_rate": 7.77291274360075e-05, + "loss": 1.4275, + "step": 70100 + }, + { + "epoch": 3.3853201686791876, + "grad_norm": 0.28286683559417725, + "learning_rate": 7.771545663181161e-05, + "loss": 1.4328, + "step": 70110 + }, + { + "epoch": 3.3858621098447, + "grad_norm": 0.22739897668361664, + "learning_rate": 7.770178301372361e-05, + "loss": 1.4253, + "step": 70120 + }, + { + "epoch": 3.3859704980778025, + "eval_loss": 2.4276702404022217, + "eval_runtime": 21.9823, + "eval_samples_per_second": 227.455, + "eval_steps_per_second": 1.228, + "step": 70122 + }, + { + "epoch": 3.3864040510102122, + "grad_norm": 0.20265692472457886, + "learning_rate": 7.768810658343724e-05, + "loss": 1.4306, + "step": 70130 + }, + { + "epoch": 3.3869459921757246, + "grad_norm": 0.2867891490459442, + "learning_rate": 7.767442734264677e-05, + "loss": 1.4319, + "step": 70140 + }, + { + "epoch": 3.3874879333412364, + "grad_norm": 0.1854824721813202, + "learning_rate": 7.766074529304666e-05, + "loss": 1.4365, + "step": 70150 + }, + { + "epoch": 3.3880298745067488, + "grad_norm": 0.3422788679599762, + "learning_rate": 7.764706043633183e-05, + "loss": 1.4379, + "step": 70160 + }, + { + "epoch": 3.388571815672261, + "grad_norm": 0.17778363823890686, + "learning_rate": 7.763337277419745e-05, + "loss": 1.4323, + "step": 70170 + }, + { + "epoch": 3.3891137568377734, + "grad_norm": 0.1999989151954651, + "learning_rate": 7.761968230833913e-05, + "loss": 1.4236, + "step": 70180 + }, + { + "epoch": 3.3896556980032857, + "grad_norm": 0.24541445076465607, + "learning_rate": 7.760598904045277e-05, + "loss": 1.4302, + "step": 70190 + }, + { + "epoch": 3.3901976391687976, + "grad_norm": 0.2255592793226242, + "learning_rate": 7.759229297223463e-05, + "loss": 1.441, + "step": 70200 + }, + { + "epoch": 3.3906853862177586, + "eval_loss": 2.4177823066711426, + "eval_runtime": 23.5061, + "eval_samples_per_second": 212.711, + "eval_steps_per_second": 1.149, + "step": 70209 + }, + { + "epoch": 3.39073958033431, + "grad_norm": 0.19751523435115814, + "learning_rate": 7.757859410538131e-05, + "loss": 1.4337, + "step": 70210 + }, + { + "epoch": 3.391281521499822, + "grad_norm": 0.3200652301311493, + "learning_rate": 7.75648924415898e-05, + "loss": 1.4221, + "step": 70220 + }, + { + "epoch": 3.3918234626653345, + "grad_norm": 0.2096845954656601, + "learning_rate": 7.755118798255738e-05, + "loss": 1.4365, + "step": 70230 + }, + { + "epoch": 3.392365403830847, + "grad_norm": 0.2815345525741577, + "learning_rate": 7.753748072998169e-05, + "loss": 1.4421, + "step": 70240 + }, + { + "epoch": 3.3929073449963587, + "grad_norm": 0.21538645029067993, + "learning_rate": 7.752377068556073e-05, + "loss": 1.4339, + "step": 70250 + }, + { + "epoch": 3.393449286161871, + "grad_norm": 0.1685652881860733, + "learning_rate": 7.751005785099286e-05, + "loss": 1.4168, + "step": 70260 + }, + { + "epoch": 3.3939912273273833, + "grad_norm": 0.16810309886932373, + "learning_rate": 7.749634222797674e-05, + "loss": 1.437, + "step": 70270 + }, + { + "epoch": 3.3945331684928957, + "grad_norm": 0.32125574350357056, + "learning_rate": 7.748262381821143e-05, + "loss": 1.435, + "step": 70280 + }, + { + "epoch": 3.3950751096584075, + "grad_norm": 0.2670728266239166, + "learning_rate": 7.746890262339627e-05, + "loss": 1.4457, + "step": 70290 + }, + { + "epoch": 3.395400274357715, + "eval_loss": 2.4280776977539062, + "eval_runtime": 25.8886, + "eval_samples_per_second": 193.135, + "eval_steps_per_second": 1.043, + "step": 70296 + }, + { + "epoch": 3.39561705082392, + "grad_norm": 0.3197952210903168, + "learning_rate": 7.745517864523102e-05, + "loss": 1.4286, + "step": 70300 + }, + { + "epoch": 3.396158991989432, + "grad_norm": 0.19565744698047638, + "learning_rate": 7.744145188541573e-05, + "loss": 1.4334, + "step": 70310 + }, + { + "epoch": 3.3967009331549445, + "grad_norm": 0.22424980998039246, + "learning_rate": 7.742772234565081e-05, + "loss": 1.432, + "step": 70320 + }, + { + "epoch": 3.3972428743204564, + "grad_norm": 0.27934402227401733, + "learning_rate": 7.741399002763702e-05, + "loss": 1.4402, + "step": 70330 + }, + { + "epoch": 3.3977848154859687, + "grad_norm": 0.16731417179107666, + "learning_rate": 7.740025493307543e-05, + "loss": 1.4352, + "step": 70340 + }, + { + "epoch": 3.398326756651481, + "grad_norm": 0.24355627596378326, + "learning_rate": 7.738651706366754e-05, + "loss": 1.4329, + "step": 70350 + }, + { + "epoch": 3.3988686978169933, + "grad_norm": 0.1672547459602356, + "learning_rate": 7.73727764211151e-05, + "loss": 1.4408, + "step": 70360 + }, + { + "epoch": 3.3994106389825056, + "grad_norm": 0.21636275947093964, + "learning_rate": 7.735903300712025e-05, + "loss": 1.4255, + "step": 70370 + }, + { + "epoch": 3.3999525801480175, + "grad_norm": 0.21118460595607758, + "learning_rate": 7.734528682338546e-05, + "loss": 1.4435, + "step": 70380 + }, + { + "epoch": 3.4001151624976713, + "eval_loss": 2.429720163345337, + "eval_runtime": 24.7724, + "eval_samples_per_second": 201.837, + "eval_steps_per_second": 1.09, + "step": 70383 + }, + { + "epoch": 3.40049452131353, + "grad_norm": 0.30819687247276306, + "learning_rate": 7.733153787161356e-05, + "loss": 1.4255, + "step": 70390 + }, + { + "epoch": 3.401036462479042, + "grad_norm": 0.20422889292240143, + "learning_rate": 7.73177861535077e-05, + "loss": 1.4364, + "step": 70400 + }, + { + "epoch": 3.4015784036445544, + "grad_norm": 0.3542598783969879, + "learning_rate": 7.73040316707714e-05, + "loss": 1.4333, + "step": 70410 + }, + { + "epoch": 3.4021203448100668, + "grad_norm": 0.20476211607456207, + "learning_rate": 7.729027442510847e-05, + "loss": 1.4459, + "step": 70420 + }, + { + "epoch": 3.4026622859755786, + "grad_norm": 0.22746922075748444, + "learning_rate": 7.727651441822312e-05, + "loss": 1.4267, + "step": 70430 + }, + { + "epoch": 3.403204227141091, + "grad_norm": 0.23665057122707367, + "learning_rate": 7.726275165181988e-05, + "loss": 1.4302, + "step": 70440 + }, + { + "epoch": 3.4037461683066033, + "grad_norm": 0.2850460112094879, + "learning_rate": 7.724898612760362e-05, + "loss": 1.4315, + "step": 70450 + }, + { + "epoch": 3.4042881094721156, + "grad_norm": 0.17364229261875153, + "learning_rate": 7.723521784727956e-05, + "loss": 1.4364, + "step": 70460 + }, + { + "epoch": 3.404830050637628, + "grad_norm": 0.21966204047203064, + "learning_rate": 7.722144681255325e-05, + "loss": 1.4284, + "step": 70470 + }, + { + "epoch": 3.404830050637628, + "eval_loss": 2.4222443103790283, + "eval_runtime": 21.8336, + "eval_samples_per_second": 229.005, + "eval_steps_per_second": 1.237, + "step": 70470 + }, + { + "epoch": 3.4053719918031398, + "grad_norm": 0.23222704231739044, + "learning_rate": 7.720767302513059e-05, + "loss": 1.4334, + "step": 70480 + }, + { + "epoch": 3.405913932968652, + "grad_norm": 0.19313640892505646, + "learning_rate": 7.719389648671779e-05, + "loss": 1.4361, + "step": 70490 + }, + { + "epoch": 3.4064558741341644, + "grad_norm": 0.17659373581409454, + "learning_rate": 7.718011719902146e-05, + "loss": 1.4402, + "step": 70500 + }, + { + "epoch": 3.4069978152996767, + "grad_norm": 0.2190655916929245, + "learning_rate": 7.716633516374852e-05, + "loss": 1.4326, + "step": 70510 + }, + { + "epoch": 3.4075397564651886, + "grad_norm": 0.2351011484861374, + "learning_rate": 7.715255038260621e-05, + "loss": 1.4198, + "step": 70520 + }, + { + "epoch": 3.408081697630701, + "grad_norm": 0.2913278341293335, + "learning_rate": 7.713876285730213e-05, + "loss": 1.4363, + "step": 70530 + }, + { + "epoch": 3.408623638796213, + "grad_norm": 0.21653717756271362, + "learning_rate": 7.712497258954422e-05, + "loss": 1.4284, + "step": 70540 + }, + { + "epoch": 3.4091655799617255, + "grad_norm": 0.2118827849626541, + "learning_rate": 7.711117958104077e-05, + "loss": 1.4306, + "step": 70550 + }, + { + "epoch": 3.409544938777584, + "eval_loss": 2.4244723320007324, + "eval_runtime": 23.3419, + "eval_samples_per_second": 214.207, + "eval_steps_per_second": 1.157, + "step": 70557 + }, + { + "epoch": 3.4097075211272374, + "grad_norm": 0.3481065630912781, + "learning_rate": 7.70973838335004e-05, + "loss": 1.4373, + "step": 70560 + }, + { + "epoch": 3.4102494622927497, + "grad_norm": 0.16835106909275055, + "learning_rate": 7.708358534863205e-05, + "loss": 1.433, + "step": 70570 + }, + { + "epoch": 3.410791403458262, + "grad_norm": 0.3342517912387848, + "learning_rate": 7.706978412814501e-05, + "loss": 1.4379, + "step": 70580 + }, + { + "epoch": 3.4113333446237744, + "grad_norm": 0.19866886734962463, + "learning_rate": 7.705598017374894e-05, + "loss": 1.4306, + "step": 70590 + }, + { + "epoch": 3.4118752857892867, + "grad_norm": 0.29506245255470276, + "learning_rate": 7.704217348715381e-05, + "loss": 1.4344, + "step": 70600 + }, + { + "epoch": 3.4124172269547985, + "grad_norm": 0.2535003125667572, + "learning_rate": 7.702836407006993e-05, + "loss": 1.441, + "step": 70610 + }, + { + "epoch": 3.412959168120311, + "grad_norm": 0.23304949700832367, + "learning_rate": 7.701455192420793e-05, + "loss": 1.429, + "step": 70620 + }, + { + "epoch": 3.413501109285823, + "grad_norm": 0.2439218908548355, + "learning_rate": 7.700073705127883e-05, + "loss": 1.4379, + "step": 70630 + }, + { + "epoch": 3.4140430504513355, + "grad_norm": 0.2127150297164917, + "learning_rate": 7.698691945299392e-05, + "loss": 1.4366, + "step": 70640 + }, + { + "epoch": 3.41425982691754, + "eval_loss": 2.4175617694854736, + "eval_runtime": 21.9871, + "eval_samples_per_second": 227.406, + "eval_steps_per_second": 1.228, + "step": 70644 + }, + { + "epoch": 3.414584991616848, + "grad_norm": 0.17583894729614258, + "learning_rate": 7.697309913106491e-05, + "loss": 1.4296, + "step": 70650 + }, + { + "epoch": 3.4151269327823597, + "grad_norm": 0.1831618696451187, + "learning_rate": 7.695927608720376e-05, + "loss": 1.435, + "step": 70660 + }, + { + "epoch": 3.415668873947872, + "grad_norm": 0.22078917920589447, + "learning_rate": 7.694545032312284e-05, + "loss": 1.4256, + "step": 70670 + }, + { + "epoch": 3.4162108151133843, + "grad_norm": 0.19926197826862335, + "learning_rate": 7.69316218405348e-05, + "loss": 1.4341, + "step": 70680 + }, + { + "epoch": 3.4167527562788966, + "grad_norm": 0.20558814704418182, + "learning_rate": 7.691779064115267e-05, + "loss": 1.4258, + "step": 70690 + }, + { + "epoch": 3.4172946974444085, + "grad_norm": 0.1772947460412979, + "learning_rate": 7.690395672668979e-05, + "loss": 1.4311, + "step": 70700 + }, + { + "epoch": 3.417836638609921, + "grad_norm": 0.29191502928733826, + "learning_rate": 7.689012009885986e-05, + "loss": 1.4376, + "step": 70710 + }, + { + "epoch": 3.418378579775433, + "grad_norm": 0.21399341523647308, + "learning_rate": 7.687628075937689e-05, + "loss": 1.4347, + "step": 70720 + }, + { + "epoch": 3.4189205209409455, + "grad_norm": 0.2068105787038803, + "learning_rate": 7.686243870995522e-05, + "loss": 1.4269, + "step": 70730 + }, + { + "epoch": 3.4189747150574967, + "eval_loss": 2.414785861968994, + "eval_runtime": 22.7197, + "eval_samples_per_second": 220.073, + "eval_steps_per_second": 1.188, + "step": 70731 + }, + { + "epoch": 3.4194624621064573, + "grad_norm": 0.22110241651535034, + "learning_rate": 7.684859395230956e-05, + "loss": 1.4331, + "step": 70740 + }, + { + "epoch": 3.4200044032719696, + "grad_norm": 0.1638895571231842, + "learning_rate": 7.683474648815496e-05, + "loss": 1.419, + "step": 70750 + }, + { + "epoch": 3.420546344437482, + "grad_norm": 0.20719309151172638, + "learning_rate": 7.682089631920674e-05, + "loss": 1.4246, + "step": 70760 + }, + { + "epoch": 3.4210882856029943, + "grad_norm": 0.19159828126430511, + "learning_rate": 7.680704344718063e-05, + "loss": 1.4245, + "step": 70770 + }, + { + "epoch": 3.4216302267685066, + "grad_norm": 0.29333052039146423, + "learning_rate": 7.679318787379264e-05, + "loss": 1.4397, + "step": 70780 + }, + { + "epoch": 3.4221721679340185, + "grad_norm": 0.22254757583141327, + "learning_rate": 7.677932960075917e-05, + "loss": 1.4231, + "step": 70790 + }, + { + "epoch": 3.4227141090995308, + "grad_norm": 0.2216396927833557, + "learning_rate": 7.67654686297969e-05, + "loss": 1.433, + "step": 70800 + }, + { + "epoch": 3.423256050265043, + "grad_norm": 0.27683907747268677, + "learning_rate": 7.675160496262288e-05, + "loss": 1.4404, + "step": 70810 + }, + { + "epoch": 3.423689603197453, + "eval_loss": 2.416546583175659, + "eval_runtime": 21.9857, + "eval_samples_per_second": 227.421, + "eval_steps_per_second": 1.228, + "step": 70818 + }, + { + "epoch": 3.4237979914305554, + "grad_norm": 0.3137832581996918, + "learning_rate": 7.673773860095445e-05, + "loss": 1.434, + "step": 70820 + }, + { + "epoch": 3.4243399325960677, + "grad_norm": 0.1663368195295334, + "learning_rate": 7.672386954650936e-05, + "loss": 1.427, + "step": 70830 + }, + { + "epoch": 3.4248818737615796, + "grad_norm": 0.18768776953220367, + "learning_rate": 7.670999780100563e-05, + "loss": 1.4314, + "step": 70840 + }, + { + "epoch": 3.425423814927092, + "grad_norm": 0.19177640974521637, + "learning_rate": 7.669612336616162e-05, + "loss": 1.4275, + "step": 70850 + }, + { + "epoch": 3.4259657560926042, + "grad_norm": 0.2142564207315445, + "learning_rate": 7.668224624369603e-05, + "loss": 1.4467, + "step": 70860 + }, + { + "epoch": 3.4265076972581165, + "grad_norm": 0.24929867684841156, + "learning_rate": 7.666836643532793e-05, + "loss": 1.4374, + "step": 70870 + }, + { + "epoch": 3.427049638423629, + "grad_norm": 0.20959332585334778, + "learning_rate": 7.665448394277664e-05, + "loss": 1.4289, + "step": 70880 + }, + { + "epoch": 3.4275915795891407, + "grad_norm": 0.23564212024211884, + "learning_rate": 7.664059876776195e-05, + "loss": 1.4368, + "step": 70890 + }, + { + "epoch": 3.428133520754653, + "grad_norm": 0.29305994510650635, + "learning_rate": 7.662671091200378e-05, + "loss": 1.4361, + "step": 70900 + }, + { + "epoch": 3.4284044913374094, + "eval_loss": 2.4167122840881348, + "eval_runtime": 22.6755, + "eval_samples_per_second": 220.502, + "eval_steps_per_second": 1.191, + "step": 70905 + }, + { + "epoch": 3.4286754619201654, + "grad_norm": 0.19599109888076782, + "learning_rate": 7.66128203772226e-05, + "loss": 1.4286, + "step": 70910 + }, + { + "epoch": 3.4292174030856777, + "grad_norm": 0.2723342478275299, + "learning_rate": 7.659892716513904e-05, + "loss": 1.4253, + "step": 70920 + }, + { + "epoch": 3.4297593442511896, + "grad_norm": 0.25713953375816345, + "learning_rate": 7.658503127747415e-05, + "loss": 1.4388, + "step": 70930 + }, + { + "epoch": 3.430301285416702, + "grad_norm": 0.24171973764896393, + "learning_rate": 7.657113271594931e-05, + "loss": 1.4361, + "step": 70940 + }, + { + "epoch": 3.430843226582214, + "grad_norm": 0.17661350965499878, + "learning_rate": 7.65572314822862e-05, + "loss": 1.4331, + "step": 70950 + }, + { + "epoch": 3.4313851677477265, + "grad_norm": 0.23326051235198975, + "learning_rate": 7.654332757820684e-05, + "loss": 1.4239, + "step": 70960 + }, + { + "epoch": 3.4319271089132384, + "grad_norm": 0.1615256667137146, + "learning_rate": 7.652942100543361e-05, + "loss": 1.4351, + "step": 70970 + }, + { + "epoch": 3.4324690500787507, + "grad_norm": 0.19447456300258636, + "learning_rate": 7.651551176568916e-05, + "loss": 1.4277, + "step": 70980 + }, + { + "epoch": 3.433010991244263, + "grad_norm": 0.3392447829246521, + "learning_rate": 7.650159986069653e-05, + "loss": 1.4348, + "step": 70990 + }, + { + "epoch": 3.4331193794773656, + "eval_loss": 2.42305850982666, + "eval_runtime": 22.2077, + "eval_samples_per_second": 225.147, + "eval_steps_per_second": 1.216, + "step": 70992 + }, + { + "epoch": 3.4335529324097753, + "grad_norm": 0.2070346623659134, + "learning_rate": 7.648768529217907e-05, + "loss": 1.4279, + "step": 71000 + }, + { + "epoch": 3.4340948735752876, + "grad_norm": 0.2491535097360611, + "learning_rate": 7.647376806186043e-05, + "loss": 1.4386, + "step": 71010 + }, + { + "epoch": 3.4346368147407995, + "grad_norm": 0.3890012502670288, + "learning_rate": 7.645984817146464e-05, + "loss": 1.429, + "step": 71020 + }, + { + "epoch": 3.435178755906312, + "grad_norm": 0.19839666783809662, + "learning_rate": 7.644592562271603e-05, + "loss": 1.4271, + "step": 71030 + }, + { + "epoch": 3.435720697071824, + "grad_norm": 0.24999631941318512, + "learning_rate": 7.643200041733926e-05, + "loss": 1.4357, + "step": 71040 + }, + { + "epoch": 3.4362626382373365, + "grad_norm": 0.18307673931121826, + "learning_rate": 7.641807255705932e-05, + "loss": 1.4381, + "step": 71050 + }, + { + "epoch": 3.436804579402849, + "grad_norm": 0.3571625351905823, + "learning_rate": 7.640414204360154e-05, + "loss": 1.4406, + "step": 71060 + }, + { + "epoch": 3.4373465205683607, + "grad_norm": 0.23355412483215332, + "learning_rate": 7.639020887869157e-05, + "loss": 1.4392, + "step": 71070 + }, + { + "epoch": 3.4378342676173217, + "eval_loss": 2.4152307510375977, + "eval_runtime": 22.2007, + "eval_samples_per_second": 225.218, + "eval_steps_per_second": 1.216, + "step": 71079 + }, + { + "epoch": 3.437888461733873, + "grad_norm": 0.20043282210826874, + "learning_rate": 7.63762730640554e-05, + "loss": 1.4451, + "step": 71080 + }, + { + "epoch": 3.4384304028993853, + "grad_norm": 0.19627268612384796, + "learning_rate": 7.636233460141934e-05, + "loss": 1.4379, + "step": 71090 + }, + { + "epoch": 3.4389723440648976, + "grad_norm": 0.24721965193748474, + "learning_rate": 7.634839349251e-05, + "loss": 1.4345, + "step": 71100 + }, + { + "epoch": 3.43951428523041, + "grad_norm": 0.2560265362262726, + "learning_rate": 7.633444973905435e-05, + "loss": 1.4404, + "step": 71110 + }, + { + "epoch": 3.440056226395922, + "grad_norm": 0.21362638473510742, + "learning_rate": 7.63205033427797e-05, + "loss": 1.4416, + "step": 71120 + }, + { + "epoch": 3.440598167561434, + "grad_norm": 0.24023322761058807, + "learning_rate": 7.630655430541367e-05, + "loss": 1.4308, + "step": 71130 + }, + { + "epoch": 3.4411401087269464, + "grad_norm": 0.20237214863300323, + "learning_rate": 7.629260262868419e-05, + "loss": 1.4432, + "step": 71140 + }, + { + "epoch": 3.4416820498924587, + "grad_norm": 0.18272891640663147, + "learning_rate": 7.627864831431955e-05, + "loss": 1.4389, + "step": 71150 + }, + { + "epoch": 3.4422239910579706, + "grad_norm": 0.2650783061981201, + "learning_rate": 7.626469136404834e-05, + "loss": 1.439, + "step": 71160 + }, + { + "epoch": 3.4425491557572783, + "eval_loss": 2.4182510375976562, + "eval_runtime": 22.3053, + "eval_samples_per_second": 224.162, + "eval_steps_per_second": 1.21, + "step": 71166 + }, + { + "epoch": 3.442765932223483, + "grad_norm": 0.2870977222919464, + "learning_rate": 7.625073177959945e-05, + "loss": 1.4253, + "step": 71170 + }, + { + "epoch": 3.4433078733889952, + "grad_norm": 0.20810657739639282, + "learning_rate": 7.62367695627022e-05, + "loss": 1.4484, + "step": 71180 + }, + { + "epoch": 3.4438498145545076, + "grad_norm": 0.177906796336174, + "learning_rate": 7.622280471508611e-05, + "loss": 1.441, + "step": 71190 + }, + { + "epoch": 3.4443917557200194, + "grad_norm": 0.17265500128269196, + "learning_rate": 7.620883723848114e-05, + "loss": 1.4263, + "step": 71200 + }, + { + "epoch": 3.4449336968855317, + "grad_norm": 0.23366230726242065, + "learning_rate": 7.61948671346175e-05, + "loss": 1.4195, + "step": 71210 + }, + { + "epoch": 3.445475638051044, + "grad_norm": 0.18871016800403595, + "learning_rate": 7.618089440522571e-05, + "loss": 1.4275, + "step": 71220 + }, + { + "epoch": 3.4460175792165564, + "grad_norm": 0.1917314976453781, + "learning_rate": 7.61669190520367e-05, + "loss": 1.4325, + "step": 71230 + }, + { + "epoch": 3.4465595203820687, + "grad_norm": 0.2467162013053894, + "learning_rate": 7.615294107678165e-05, + "loss": 1.4374, + "step": 71240 + }, + { + "epoch": 3.4471014615475806, + "grad_norm": 0.1812368631362915, + "learning_rate": 7.61389604811921e-05, + "loss": 1.4434, + "step": 71250 + }, + { + "epoch": 3.4472640438972344, + "eval_loss": 2.420837163925171, + "eval_runtime": 23.1164, + "eval_samples_per_second": 216.296, + "eval_steps_per_second": 1.168, + "step": 71253 + }, + { + "epoch": 3.447643402713093, + "grad_norm": 0.24200530350208282, + "learning_rate": 7.61249772669999e-05, + "loss": 1.4313, + "step": 71260 + }, + { + "epoch": 3.448185343878605, + "grad_norm": 0.19202779233455658, + "learning_rate": 7.61109914359372e-05, + "loss": 1.4212, + "step": 71270 + }, + { + "epoch": 3.4487272850441175, + "grad_norm": 0.34746161103248596, + "learning_rate": 7.609700298973659e-05, + "loss": 1.4312, + "step": 71280 + }, + { + "epoch": 3.44926922620963, + "grad_norm": 0.19444984197616577, + "learning_rate": 7.608301193013082e-05, + "loss": 1.416, + "step": 71290 + }, + { + "epoch": 3.4498111673751417, + "grad_norm": 0.17890185117721558, + "learning_rate": 7.606901825885305e-05, + "loss": 1.431, + "step": 71300 + }, + { + "epoch": 3.450353108540654, + "grad_norm": 0.1906130313873291, + "learning_rate": 7.605502197763678e-05, + "loss": 1.4435, + "step": 71310 + }, + { + "epoch": 3.4508950497061663, + "grad_norm": 0.2459091991186142, + "learning_rate": 7.60410230882158e-05, + "loss": 1.4252, + "step": 71320 + }, + { + "epoch": 3.4514369908716787, + "grad_norm": 0.3785655200481415, + "learning_rate": 7.602702159232424e-05, + "loss": 1.4277, + "step": 71330 + }, + { + "epoch": 3.4519789320371905, + "grad_norm": 0.2577289044857025, + "learning_rate": 7.601301749169652e-05, + "loss": 1.429, + "step": 71340 + }, + { + "epoch": 3.4519789320371905, + "eval_loss": 2.4179348945617676, + "eval_runtime": 21.9382, + "eval_samples_per_second": 227.913, + "eval_steps_per_second": 1.231, + "step": 71340 + }, + { + "epoch": 3.452520873202703, + "grad_norm": 0.2082802802324295, + "learning_rate": 7.599901078806744e-05, + "loss": 1.4214, + "step": 71350 + }, + { + "epoch": 3.453062814368215, + "grad_norm": 0.251081645488739, + "learning_rate": 7.598500148317206e-05, + "loss": 1.4394, + "step": 71360 + }, + { + "epoch": 3.4536047555337275, + "grad_norm": 0.1895270198583603, + "learning_rate": 7.597098957874582e-05, + "loss": 1.4283, + "step": 71370 + }, + { + "epoch": 3.4541466966992393, + "grad_norm": 0.252121239900589, + "learning_rate": 7.595697507652442e-05, + "loss": 1.4305, + "step": 71380 + }, + { + "epoch": 3.4546886378647517, + "grad_norm": 0.32456740736961365, + "learning_rate": 7.594295797824393e-05, + "loss": 1.4383, + "step": 71390 + }, + { + "epoch": 3.455230579030264, + "grad_norm": 0.22235190868377686, + "learning_rate": 7.592893828564073e-05, + "loss": 1.4284, + "step": 71400 + }, + { + "epoch": 3.4557725201957763, + "grad_norm": 0.19792044162750244, + "learning_rate": 7.591491600045155e-05, + "loss": 1.4249, + "step": 71410 + }, + { + "epoch": 3.4563144613612886, + "grad_norm": 0.26035961508750916, + "learning_rate": 7.590089112441336e-05, + "loss": 1.4245, + "step": 71420 + }, + { + "epoch": 3.456693820177147, + "eval_loss": 2.4162979125976562, + "eval_runtime": 21.987, + "eval_samples_per_second": 227.408, + "eval_steps_per_second": 1.228, + "step": 71427 + }, + { + "epoch": 3.4568564025268005, + "grad_norm": 0.18416984379291534, + "learning_rate": 7.58868636592635e-05, + "loss": 1.4349, + "step": 71430 + }, + { + "epoch": 3.457398343692313, + "grad_norm": 0.21309533715248108, + "learning_rate": 7.587283360673969e-05, + "loss": 1.4194, + "step": 71440 + }, + { + "epoch": 3.457940284857825, + "grad_norm": 0.17019227147102356, + "learning_rate": 7.585880096857985e-05, + "loss": 1.4342, + "step": 71450 + }, + { + "epoch": 3.4584822260233374, + "grad_norm": 0.16507531702518463, + "learning_rate": 7.58447657465223e-05, + "loss": 1.424, + "step": 71460 + }, + { + "epoch": 3.4590241671888498, + "grad_norm": 0.21044619381427765, + "learning_rate": 7.583072794230567e-05, + "loss": 1.428, + "step": 71470 + }, + { + "epoch": 3.4595661083543616, + "grad_norm": 0.2250647246837616, + "learning_rate": 7.581668755766891e-05, + "loss": 1.4316, + "step": 71480 + }, + { + "epoch": 3.460108049519874, + "grad_norm": 0.21987579762935638, + "learning_rate": 7.580264459435129e-05, + "loss": 1.4411, + "step": 71490 + }, + { + "epoch": 3.4606499906853863, + "grad_norm": 0.21779508888721466, + "learning_rate": 7.578859905409234e-05, + "loss": 1.4381, + "step": 71500 + }, + { + "epoch": 3.4611919318508986, + "grad_norm": 0.3549853265285492, + "learning_rate": 7.577455093863202e-05, + "loss": 1.4346, + "step": 71510 + }, + { + "epoch": 3.4614087083171032, + "eval_loss": 2.4182517528533936, + "eval_runtime": 21.9784, + "eval_samples_per_second": 227.496, + "eval_steps_per_second": 1.228, + "step": 71514 + }, + { + "epoch": 3.461733873016411, + "grad_norm": 0.46638888120651245, + "learning_rate": 7.57605002497105e-05, + "loss": 1.4274, + "step": 71520 + }, + { + "epoch": 3.4622758141819228, + "grad_norm": 0.28370195627212524, + "learning_rate": 7.574644698906836e-05, + "loss": 1.4282, + "step": 71530 + }, + { + "epoch": 3.462817755347435, + "grad_norm": 0.2209625244140625, + "learning_rate": 7.573239115844644e-05, + "loss": 1.4407, + "step": 71540 + }, + { + "epoch": 3.4633596965129474, + "grad_norm": 0.2586290240287781, + "learning_rate": 7.571833275958591e-05, + "loss": 1.437, + "step": 71550 + }, + { + "epoch": 3.4639016376784597, + "grad_norm": 0.17665983736515045, + "learning_rate": 7.570427179422827e-05, + "loss": 1.4312, + "step": 71560 + }, + { + "epoch": 3.4644435788439716, + "grad_norm": 0.1650412678718567, + "learning_rate": 7.569020826411532e-05, + "loss": 1.4262, + "step": 71570 + }, + { + "epoch": 3.464985520009484, + "grad_norm": 0.19921010732650757, + "learning_rate": 7.56761421709892e-05, + "loss": 1.4333, + "step": 71580 + }, + { + "epoch": 3.465527461174996, + "grad_norm": 0.2843552827835083, + "learning_rate": 7.566207351659236e-05, + "loss": 1.4295, + "step": 71590 + }, + { + "epoch": 3.4660694023405085, + "grad_norm": 0.3298152983188629, + "learning_rate": 7.564800230266756e-05, + "loss": 1.4246, + "step": 71600 + }, + { + "epoch": 3.46612359645706, + "eval_loss": 2.4160807132720947, + "eval_runtime": 21.9823, + "eval_samples_per_second": 227.456, + "eval_steps_per_second": 1.228, + "step": 71601 + }, + { + "epoch": 3.4666113435060204, + "grad_norm": 0.41355177760124207, + "learning_rate": 7.563392853095786e-05, + "loss": 1.4208, + "step": 71610 + }, + { + "epoch": 3.4671532846715327, + "grad_norm": 0.25345563888549805, + "learning_rate": 7.56198522032067e-05, + "loss": 1.4262, + "step": 71620 + }, + { + "epoch": 3.467695225837045, + "grad_norm": 0.2895798981189728, + "learning_rate": 7.560577332115777e-05, + "loss": 1.4344, + "step": 71630 + }, + { + "epoch": 3.4682371670025574, + "grad_norm": 0.18971815705299377, + "learning_rate": 7.559169188655509e-05, + "loss": 1.4359, + "step": 71640 + }, + { + "epoch": 3.4687791081680697, + "grad_norm": 0.24166151881217957, + "learning_rate": 7.557760790114304e-05, + "loss": 1.4282, + "step": 71650 + }, + { + "epoch": 3.4693210493335815, + "grad_norm": 0.18398047983646393, + "learning_rate": 7.556352136666624e-05, + "loss": 1.4319, + "step": 71660 + }, + { + "epoch": 3.469862990499094, + "grad_norm": 0.1882873922586441, + "learning_rate": 7.554943228486969e-05, + "loss": 1.4305, + "step": 71670 + }, + { + "epoch": 3.470404931664606, + "grad_norm": 0.39805862307548523, + "learning_rate": 7.55353406574987e-05, + "loss": 1.4383, + "step": 71680 + }, + { + "epoch": 3.470838484597016, + "eval_loss": 2.422006607055664, + "eval_runtime": 21.9837, + "eval_samples_per_second": 227.442, + "eval_steps_per_second": 1.228, + "step": 71688 + }, + { + "epoch": 3.4709468728301185, + "grad_norm": 0.4627326726913452, + "learning_rate": 7.552124648629887e-05, + "loss": 1.423, + "step": 71690 + }, + { + "epoch": 3.471488813995631, + "grad_norm": 0.20239268243312836, + "learning_rate": 7.55071497730161e-05, + "loss": 1.4374, + "step": 71700 + }, + { + "epoch": 3.4720307551611427, + "grad_norm": 0.21879947185516357, + "learning_rate": 7.549305051939665e-05, + "loss": 1.4308, + "step": 71710 + }, + { + "epoch": 3.472572696326655, + "grad_norm": 0.2617088854312897, + "learning_rate": 7.547894872718709e-05, + "loss": 1.4352, + "step": 71720 + }, + { + "epoch": 3.4731146374921673, + "grad_norm": 0.24820886552333832, + "learning_rate": 7.546484439813427e-05, + "loss": 1.4237, + "step": 71730 + }, + { + "epoch": 3.4736565786576796, + "grad_norm": 0.22648629546165466, + "learning_rate": 7.545073753398537e-05, + "loss": 1.435, + "step": 71740 + }, + { + "epoch": 3.474198519823192, + "grad_norm": 0.20121285319328308, + "learning_rate": 7.543662813648789e-05, + "loss": 1.4264, + "step": 71750 + }, + { + "epoch": 3.474740460988704, + "grad_norm": 0.32857656478881836, + "learning_rate": 7.542251620738964e-05, + "loss": 1.4422, + "step": 71760 + }, + { + "epoch": 3.475282402154216, + "grad_norm": 0.1786399483680725, + "learning_rate": 7.540840174843876e-05, + "loss": 1.4308, + "step": 71770 + }, + { + "epoch": 3.475553372736972, + "eval_loss": 2.418339490890503, + "eval_runtime": 21.9871, + "eval_samples_per_second": 227.406, + "eval_steps_per_second": 1.228, + "step": 71775 + }, + { + "epoch": 3.4758243433197284, + "grad_norm": 0.202467679977417, + "learning_rate": 7.539428476138367e-05, + "loss": 1.4229, + "step": 71780 + }, + { + "epoch": 3.4763662844852408, + "grad_norm": 0.18343421816825867, + "learning_rate": 7.538016524797313e-05, + "loss": 1.4268, + "step": 71790 + }, + { + "epoch": 3.4769082256507526, + "grad_norm": 0.3386618494987488, + "learning_rate": 7.53660432099562e-05, + "loss": 1.4319, + "step": 71800 + }, + { + "epoch": 3.477450166816265, + "grad_norm": 0.2889906167984009, + "learning_rate": 7.535191864908224e-05, + "loss": 1.4279, + "step": 71810 + }, + { + "epoch": 3.4779921079817773, + "grad_norm": 0.25574788451194763, + "learning_rate": 7.533779156710098e-05, + "loss": 1.4254, + "step": 71820 + }, + { + "epoch": 3.4785340491472896, + "grad_norm": 0.282247930765152, + "learning_rate": 7.532366196576238e-05, + "loss": 1.4315, + "step": 71830 + }, + { + "epoch": 3.4790759903128015, + "grad_norm": 0.22523202002048492, + "learning_rate": 7.530952984681679e-05, + "loss": 1.4407, + "step": 71840 + }, + { + "epoch": 3.4796179314783138, + "grad_norm": 0.2516654431819916, + "learning_rate": 7.529539521201481e-05, + "loss": 1.4203, + "step": 71850 + }, + { + "epoch": 3.480159872643826, + "grad_norm": 0.19450673460960388, + "learning_rate": 7.528125806310737e-05, + "loss": 1.4341, + "step": 71860 + }, + { + "epoch": 3.4802682608769286, + "eval_loss": 2.420518636703491, + "eval_runtime": 21.9826, + "eval_samples_per_second": 227.453, + "eval_steps_per_second": 1.228, + "step": 71862 + }, + { + "epoch": 3.4807018138093384, + "grad_norm": 0.17963695526123047, + "learning_rate": 7.526711840184576e-05, + "loss": 1.4376, + "step": 71870 + }, + { + "epoch": 3.4812437549748507, + "grad_norm": 0.1823495328426361, + "learning_rate": 7.525297622998151e-05, + "loss": 1.4332, + "step": 71880 + }, + { + "epoch": 3.4817856961403626, + "grad_norm": 0.18945930898189545, + "learning_rate": 7.523883154926648e-05, + "loss": 1.4278, + "step": 71890 + }, + { + "epoch": 3.482327637305875, + "grad_norm": 0.20653122663497925, + "learning_rate": 7.522468436145288e-05, + "loss": 1.4371, + "step": 71900 + }, + { + "epoch": 3.4828695784713872, + "grad_norm": 0.17666640877723694, + "learning_rate": 7.521053466829317e-05, + "loss": 1.4255, + "step": 71910 + }, + { + "epoch": 3.4834115196368995, + "grad_norm": 0.21840277314186096, + "learning_rate": 7.51963824715402e-05, + "loss": 1.4243, + "step": 71920 + }, + { + "epoch": 3.483953460802412, + "grad_norm": 0.19102028012275696, + "learning_rate": 7.518222777294703e-05, + "loss": 1.4402, + "step": 71930 + }, + { + "epoch": 3.4844954019679237, + "grad_norm": 0.17519988119602203, + "learning_rate": 7.51680705742671e-05, + "loss": 1.4324, + "step": 71940 + }, + { + "epoch": 3.4849831490168848, + "eval_loss": 2.4200730323791504, + "eval_runtime": 21.6386, + "eval_samples_per_second": 231.068, + "eval_steps_per_second": 1.248, + "step": 71949 + }, + { + "epoch": 3.485037343133436, + "grad_norm": 0.18477900326251984, + "learning_rate": 7.515391087725416e-05, + "loss": 1.4264, + "step": 71950 + }, + { + "epoch": 3.4855792842989484, + "grad_norm": 0.21089692413806915, + "learning_rate": 7.513974868366224e-05, + "loss": 1.4269, + "step": 71960 + }, + { + "epoch": 3.4861212254644607, + "grad_norm": 0.34691062569618225, + "learning_rate": 7.51255839952457e-05, + "loss": 1.4232, + "step": 71970 + }, + { + "epoch": 3.4866631666299726, + "grad_norm": 0.2588226795196533, + "learning_rate": 7.511141681375917e-05, + "loss": 1.4332, + "step": 71980 + }, + { + "epoch": 3.487205107795485, + "grad_norm": 0.16295021772384644, + "learning_rate": 7.509724714095766e-05, + "loss": 1.4296, + "step": 71990 + }, + { + "epoch": 3.487747048960997, + "grad_norm": 0.22175998985767365, + "learning_rate": 7.508307497859641e-05, + "loss": 1.4286, + "step": 72000 + }, + { + "epoch": 3.0005419411655123, + "grad_norm": 0.28885793685913086, + "learning_rate": 7.506890032843104e-05, + "loss": 1.4315, + "step": 72010 + }, + { + "epoch": 3.0010838823310246, + "grad_norm": 0.17862290143966675, + "learning_rate": 7.505472319221742e-05, + "loss": 1.4237, + "step": 72020 + }, + { + "epoch": 3.0016258234965365, + "grad_norm": 0.1830783486366272, + "learning_rate": 7.504054357171176e-05, + "loss": 1.4236, + "step": 72030 + }, + { + "epoch": 3.001950988195844, + "eval_loss": 2.42079496383667, + "eval_runtime": 22.1932, + "eval_samples_per_second": 225.294, + "eval_steps_per_second": 1.217, + "step": 72036 + }, + { + "epoch": 3.002167764662049, + "grad_norm": 0.22852027416229248, + "learning_rate": 7.502636146867058e-05, + "loss": 1.4378, + "step": 72040 + }, + { + "epoch": 3.002709705827561, + "grad_norm": 0.17318575084209442, + "learning_rate": 7.501217688485067e-05, + "loss": 1.4238, + "step": 72050 + }, + { + "epoch": 3.0032516469930735, + "grad_norm": 0.32434844970703125, + "learning_rate": 7.499798982200917e-05, + "loss": 1.4348, + "step": 72060 + }, + { + "epoch": 3.0037935881585853, + "grad_norm": 0.3870048522949219, + "learning_rate": 7.498380028190353e-05, + "loss": 1.4307, + "step": 72070 + }, + { + "epoch": 3.0043355293240976, + "grad_norm": 0.21093975007534027, + "learning_rate": 7.496960826629147e-05, + "loss": 1.4365, + "step": 72080 + }, + { + "epoch": 3.00487747048961, + "grad_norm": 0.1736556887626648, + "learning_rate": 7.495541377693103e-05, + "loss": 1.4264, + "step": 72090 + }, + { + "epoch": 3.0054194116551223, + "grad_norm": 0.23631495237350464, + "learning_rate": 7.494121681558056e-05, + "loss": 1.4222, + "step": 72100 + }, + { + "epoch": 3.0059613528206346, + "grad_norm": 0.1855953484773636, + "learning_rate": 7.492701738399874e-05, + "loss": 1.4191, + "step": 72110 + }, + { + "epoch": 3.0065032939861465, + "grad_norm": 0.16358767449855804, + "learning_rate": 7.491281548394454e-05, + "loss": 1.4286, + "step": 72120 + }, + { + "epoch": 3.0066658763358003, + "eval_loss": 2.4213919639587402, + "eval_runtime": 22.016, + "eval_samples_per_second": 227.108, + "eval_steps_per_second": 1.226, + "step": 72123 + }, + { + "epoch": 3.007045235151659, + "grad_norm": 0.17632584273815155, + "learning_rate": 7.489861111717718e-05, + "loss": 1.4182, + "step": 72130 + }, + { + "epoch": 3.007587176317171, + "grad_norm": 0.16463103890419006, + "learning_rate": 7.488440428545626e-05, + "loss": 1.4179, + "step": 72140 + }, + { + "epoch": 3.0081291174826834, + "grad_norm": 0.3055421710014343, + "learning_rate": 7.487019499054169e-05, + "loss": 1.426, + "step": 72150 + }, + { + "epoch": 3.0086710586481953, + "grad_norm": 0.27146390080451965, + "learning_rate": 7.485598323419362e-05, + "loss": 1.425, + "step": 72160 + }, + { + "epoch": 3.0092129998137076, + "grad_norm": 0.2870032787322998, + "learning_rate": 7.484176901817256e-05, + "loss": 1.4395, + "step": 72170 + }, + { + "epoch": 3.00975494097922, + "grad_norm": 0.18291185796260834, + "learning_rate": 7.482755234423931e-05, + "loss": 1.428, + "step": 72180 + }, + { + "epoch": 3.0102968821447322, + "grad_norm": 0.16777105629444122, + "learning_rate": 7.481333321415493e-05, + "loss": 1.4148, + "step": 72190 + }, + { + "epoch": 3.0108388233102445, + "grad_norm": 0.19106002151966095, + "learning_rate": 7.479911162968087e-05, + "loss": 1.4297, + "step": 72200 + }, + { + "epoch": 3.0113807644757564, + "grad_norm": 0.22009027004241943, + "learning_rate": 7.478488759257882e-05, + "loss": 1.4288, + "step": 72210 + }, + { + "epoch": 3.0113807644757564, + "eval_loss": 2.4199483394622803, + "eval_runtime": 22.9392, + "eval_samples_per_second": 217.968, + "eval_steps_per_second": 1.177, + "step": 72210 + }, + { + "epoch": 3.0119227056412687, + "grad_norm": 0.20799531042575836, + "learning_rate": 7.477066110461078e-05, + "loss": 1.4217, + "step": 72220 + }, + { + "epoch": 3.012464646806781, + "grad_norm": 0.17034943401813507, + "learning_rate": 7.475643216753909e-05, + "loss": 1.4257, + "step": 72230 + }, + { + "epoch": 3.0130065879722934, + "grad_norm": 0.16152597963809967, + "learning_rate": 7.474220078312636e-05, + "loss": 1.4231, + "step": 72240 + }, + { + "epoch": 3.0135485291378057, + "grad_norm": 0.23870690166950226, + "learning_rate": 7.47279669531355e-05, + "loss": 1.4311, + "step": 72250 + }, + { + "epoch": 3.0140904703033176, + "grad_norm": 0.19177548587322235, + "learning_rate": 7.471373067932975e-05, + "loss": 1.4227, + "step": 72260 + }, + { + "epoch": 3.01463241146883, + "grad_norm": 0.23941627144813538, + "learning_rate": 7.469949196347263e-05, + "loss": 1.4267, + "step": 72270 + }, + { + "epoch": 3.015174352634342, + "grad_norm": 0.1809268444776535, + "learning_rate": 7.468525080732798e-05, + "loss": 1.421, + "step": 72280 + }, + { + "epoch": 3.0157162937998545, + "grad_norm": 0.18789643049240112, + "learning_rate": 7.46710072126599e-05, + "loss": 1.4302, + "step": 72290 + }, + { + "epoch": 3.016095652615713, + "eval_loss": 2.415761947631836, + "eval_runtime": 22.0228, + "eval_samples_per_second": 227.038, + "eval_steps_per_second": 1.226, + "step": 72297 + }, + { + "epoch": 3.0162582349653664, + "grad_norm": 0.18077979981899261, + "learning_rate": 7.465676118123287e-05, + "loss": 1.4176, + "step": 72300 + }, + { + "epoch": 3.0168001761308787, + "grad_norm": 0.4619799852371216, + "learning_rate": 7.46425127148116e-05, + "loss": 1.4209, + "step": 72310 + }, + { + "epoch": 3.017342117296391, + "grad_norm": 0.1716815084218979, + "learning_rate": 7.462826181516115e-05, + "loss": 1.4313, + "step": 72320 + }, + { + "epoch": 3.0178840584619033, + "grad_norm": 0.21145406365394592, + "learning_rate": 7.461400848404681e-05, + "loss": 1.4226, + "step": 72330 + }, + { + "epoch": 3.0184259996274156, + "grad_norm": 0.19309388101100922, + "learning_rate": 7.459975272323427e-05, + "loss": 1.4252, + "step": 72340 + }, + { + "epoch": 3.0189679407929275, + "grad_norm": 0.37424296140670776, + "learning_rate": 7.458549453448948e-05, + "loss": 1.4245, + "step": 72350 + }, + { + "epoch": 3.01950988195844, + "grad_norm": 0.36267608404159546, + "learning_rate": 7.457123391957863e-05, + "loss": 1.4271, + "step": 72360 + }, + { + "epoch": 3.020051823123952, + "grad_norm": 0.1854252964258194, + "learning_rate": 7.455697088026831e-05, + "loss": 1.4313, + "step": 72370 + }, + { + "epoch": 3.0205937642894645, + "grad_norm": 0.2632382810115814, + "learning_rate": 7.454270541832532e-05, + "loss": 1.4254, + "step": 72380 + }, + { + "epoch": 3.020810540755669, + "eval_loss": 2.422041416168213, + "eval_runtime": 21.9894, + "eval_samples_per_second": 227.382, + "eval_steps_per_second": 1.228, + "step": 72384 + }, + { + "epoch": 3.0211357054549763, + "grad_norm": 0.16941377520561218, + "learning_rate": 7.452843753551687e-05, + "loss": 1.433, + "step": 72390 + }, + { + "epoch": 3.0216776466204887, + "grad_norm": 0.20150338113307953, + "learning_rate": 7.451416723361033e-05, + "loss": 1.4228, + "step": 72400 + }, + { + "epoch": 3.022219587786001, + "grad_norm": 0.20586276054382324, + "learning_rate": 7.449989451437347e-05, + "loss": 1.4336, + "step": 72410 + }, + { + "epoch": 3.0227615289515133, + "grad_norm": 0.180009663105011, + "learning_rate": 7.448561937957435e-05, + "loss": 1.4399, + "step": 72420 + }, + { + "epoch": 3.0233034701170256, + "grad_norm": 0.17290925979614258, + "learning_rate": 7.44713418309813e-05, + "loss": 1.4349, + "step": 72430 + }, + { + "epoch": 3.0238454112825375, + "grad_norm": 0.19423924386501312, + "learning_rate": 7.445706187036295e-05, + "loss": 1.4175, + "step": 72440 + }, + { + "epoch": 3.02438735244805, + "grad_norm": 0.19001413881778717, + "learning_rate": 7.444277949948826e-05, + "loss": 1.4253, + "step": 72450 + }, + { + "epoch": 3.024929293613562, + "grad_norm": 0.27171728014945984, + "learning_rate": 7.442849472012648e-05, + "loss": 1.4299, + "step": 72460 + }, + { + "epoch": 3.0254712347790744, + "grad_norm": 0.24754732847213745, + "learning_rate": 7.441420753404709e-05, + "loss": 1.4204, + "step": 72470 + }, + { + "epoch": 3.0255254288956257, + "eval_loss": 2.420412063598633, + "eval_runtime": 21.9948, + "eval_samples_per_second": 227.326, + "eval_steps_per_second": 1.228, + "step": 72471 + }, + { + "epoch": 3.0260131759445863, + "grad_norm": 0.18741434812545776, + "learning_rate": 7.439991794301997e-05, + "loss": 1.4214, + "step": 72480 + }, + { + "epoch": 3.0265551171100986, + "grad_norm": 0.2331148236989975, + "learning_rate": 7.438562594881523e-05, + "loss": 1.4267, + "step": 72490 + }, + { + "epoch": 3.027097058275611, + "grad_norm": 0.21943721175193787, + "learning_rate": 7.437133155320333e-05, + "loss": 1.4202, + "step": 72500 + }, + { + "epoch": 3.0276389994411232, + "grad_norm": 0.1892043948173523, + "learning_rate": 7.435703475795498e-05, + "loss": 1.4201, + "step": 72510 + }, + { + "epoch": 3.0281809406066356, + "grad_norm": 0.2600436210632324, + "learning_rate": 7.434273556484119e-05, + "loss": 1.4357, + "step": 72520 + }, + { + "epoch": 3.0287228817721474, + "grad_norm": 0.32084086537361145, + "learning_rate": 7.43284339756333e-05, + "loss": 1.4274, + "step": 72530 + }, + { + "epoch": 3.0292648229376598, + "grad_norm": 0.23586052656173706, + "learning_rate": 7.431412999210292e-05, + "loss": 1.4261, + "step": 72540 + }, + { + "epoch": 3.029806764103172, + "grad_norm": 0.2831364870071411, + "learning_rate": 7.429982361602198e-05, + "loss": 1.4191, + "step": 72550 + }, + { + "epoch": 3.030240317035582, + "eval_loss": 2.41964054107666, + "eval_runtime": 21.992, + "eval_samples_per_second": 227.355, + "eval_steps_per_second": 1.228, + "step": 72558 + }, + { + "epoch": 3.0303487052686844, + "grad_norm": 0.1781311184167862, + "learning_rate": 7.428551484916269e-05, + "loss": 1.4297, + "step": 72560 + }, + { + "epoch": 3.0308906464341967, + "grad_norm": 0.34944701194763184, + "learning_rate": 7.427120369329753e-05, + "loss": 1.4319, + "step": 72570 + }, + { + "epoch": 3.0314325875997086, + "grad_norm": 0.17898167669773102, + "learning_rate": 7.425689015019932e-05, + "loss": 1.4229, + "step": 72580 + }, + { + "epoch": 3.031974528765221, + "grad_norm": 0.19159488379955292, + "learning_rate": 7.42425742216412e-05, + "loss": 1.4248, + "step": 72590 + }, + { + "epoch": 3.032516469930733, + "grad_norm": 0.1828126460313797, + "learning_rate": 7.422825590939648e-05, + "loss": 1.4298, + "step": 72600 + }, + { + "epoch": 3.0330584110962455, + "grad_norm": 0.1625342071056366, + "learning_rate": 7.421393521523893e-05, + "loss": 1.425, + "step": 72610 + }, + { + "epoch": 3.0336003522617574, + "grad_norm": 0.1856265813112259, + "learning_rate": 7.41996121409425e-05, + "loss": 1.4231, + "step": 72620 + }, + { + "epoch": 3.0341422934272697, + "grad_norm": 0.20504432916641235, + "learning_rate": 7.418528668828147e-05, + "loss": 1.4211, + "step": 72630 + }, + { + "epoch": 3.034684234592782, + "grad_norm": 0.1928977072238922, + "learning_rate": 7.417095885903043e-05, + "loss": 1.421, + "step": 72640 + }, + { + "epoch": 3.034955205175538, + "eval_loss": 2.4224603176116943, + "eval_runtime": 21.9942, + "eval_samples_per_second": 227.333, + "eval_steps_per_second": 1.228, + "step": 72645 + }, + { + "epoch": 3.0352261757582943, + "grad_norm": 0.26011061668395996, + "learning_rate": 7.415662865496421e-05, + "loss": 1.4156, + "step": 72650 + }, + { + "epoch": 3.0357681169238067, + "grad_norm": 0.1762949675321579, + "learning_rate": 7.414229607785803e-05, + "loss": 1.4256, + "step": 72660 + }, + { + "epoch": 3.0363100580893185, + "grad_norm": 0.23641474545001984, + "learning_rate": 7.412796112948731e-05, + "loss": 1.4283, + "step": 72670 + }, + { + "epoch": 3.036851999254831, + "grad_norm": 0.2524915635585785, + "learning_rate": 7.41136238116278e-05, + "loss": 1.4353, + "step": 72680 + }, + { + "epoch": 3.037393940420343, + "grad_norm": 0.16015665233135223, + "learning_rate": 7.409928412605557e-05, + "loss": 1.4262, + "step": 72690 + }, + { + "epoch": 3.0379358815858555, + "grad_norm": 0.252564936876297, + "learning_rate": 7.408494207454694e-05, + "loss": 1.4269, + "step": 72700 + }, + { + "epoch": 3.0384778227513674, + "grad_norm": 0.2023906707763672, + "learning_rate": 7.407059765887854e-05, + "loss": 1.4323, + "step": 72710 + }, + { + "epoch": 3.0390197639168797, + "grad_norm": 0.22321388125419617, + "learning_rate": 7.40562508808273e-05, + "loss": 1.4271, + "step": 72720 + }, + { + "epoch": 3.039561705082392, + "grad_norm": 0.26535144448280334, + "learning_rate": 7.404190174217044e-05, + "loss": 1.4305, + "step": 72730 + }, + { + "epoch": 3.0396700933154945, + "eval_loss": 2.43088436126709, + "eval_runtime": 21.9914, + "eval_samples_per_second": 227.362, + "eval_steps_per_second": 1.228, + "step": 72732 + }, + { + "epoch": 3.0401036462479043, + "grad_norm": 0.20327869057655334, + "learning_rate": 7.402755024468547e-05, + "loss": 1.4258, + "step": 72740 + }, + { + "epoch": 3.0406455874134166, + "grad_norm": 0.2531057596206665, + "learning_rate": 7.401319639015018e-05, + "loss": 1.4226, + "step": 72750 + }, + { + "epoch": 3.0411875285789285, + "grad_norm": 0.2527235746383667, + "learning_rate": 7.399884018034265e-05, + "loss": 1.4343, + "step": 72760 + }, + { + "epoch": 3.041729469744441, + "grad_norm": 0.16884733736515045, + "learning_rate": 7.398448161704131e-05, + "loss": 1.4123, + "step": 72770 + }, + { + "epoch": 3.042271410909953, + "grad_norm": 0.2351938784122467, + "learning_rate": 7.397012070202478e-05, + "loss": 1.4316, + "step": 72780 + }, + { + "epoch": 3.0428133520754654, + "grad_norm": 0.17390334606170654, + "learning_rate": 7.395575743707208e-05, + "loss": 1.4276, + "step": 72790 + }, + { + "epoch": 3.0433552932409773, + "grad_norm": 0.23687273263931274, + "learning_rate": 7.394139182396245e-05, + "loss": 1.4308, + "step": 72800 + }, + { + "epoch": 3.0438972344064896, + "grad_norm": 0.22024738788604736, + "learning_rate": 7.39270238644754e-05, + "loss": 1.4276, + "step": 72810 + }, + { + "epoch": 3.0443849814554507, + "eval_loss": 2.4263041019439697, + "eval_runtime": 21.995, + "eval_samples_per_second": 227.324, + "eval_steps_per_second": 1.228, + "step": 72819 + }, + { + "epoch": 3.044439175572002, + "grad_norm": 0.37231162190437317, + "learning_rate": 7.391265356039084e-05, + "loss": 1.4351, + "step": 72820 + }, + { + "epoch": 3.0449811167375143, + "grad_norm": 0.1920226663351059, + "learning_rate": 7.389828091348888e-05, + "loss": 1.4319, + "step": 72830 + }, + { + "epoch": 3.0455230579030266, + "grad_norm": 0.16061918437480927, + "learning_rate": 7.388390592554989e-05, + "loss": 1.4258, + "step": 72840 + }, + { + "epoch": 3.0460649990685384, + "grad_norm": 0.20206622779369354, + "learning_rate": 7.386952859835466e-05, + "loss": 1.4202, + "step": 72850 + }, + { + "epoch": 3.0466069402340508, + "grad_norm": 0.31317874789237976, + "learning_rate": 7.385514893368411e-05, + "loss": 1.4229, + "step": 72860 + }, + { + "epoch": 3.047148881399563, + "grad_norm": 0.17705164849758148, + "learning_rate": 7.38407669333196e-05, + "loss": 1.4275, + "step": 72870 + }, + { + "epoch": 3.0476908225650754, + "grad_norm": 0.2397981733083725, + "learning_rate": 7.382638259904268e-05, + "loss": 1.4225, + "step": 72880 + }, + { + "epoch": 3.0482327637305877, + "grad_norm": 0.19595515727996826, + "learning_rate": 7.38119959326352e-05, + "loss": 1.437, + "step": 72890 + }, + { + "epoch": 3.0487747048960996, + "grad_norm": 0.18026091158390045, + "learning_rate": 7.379760693587935e-05, + "loss": 1.43, + "step": 72900 + }, + { + "epoch": 3.0490998695954072, + "eval_loss": 2.430786371231079, + "eval_runtime": 22.1488, + "eval_samples_per_second": 225.746, + "eval_steps_per_second": 1.219, + "step": 72906 + }, + { + "epoch": 3.049316646061612, + "grad_norm": 0.26389095187187195, + "learning_rate": 7.378321561055756e-05, + "loss": 1.4251, + "step": 72910 + }, + { + "epoch": 3.049858587227124, + "grad_norm": 0.33883413672447205, + "learning_rate": 7.376882195845256e-05, + "loss": 1.4346, + "step": 72920 + }, + { + "epoch": 3.0504005283926365, + "grad_norm": 0.17455531656742096, + "learning_rate": 7.37544259813474e-05, + "loss": 1.4272, + "step": 72930 + }, + { + "epoch": 3.0509424695581484, + "grad_norm": 0.1896173059940338, + "learning_rate": 7.374002768102535e-05, + "loss": 1.4286, + "step": 72940 + }, + { + "epoch": 3.0514844107236607, + "grad_norm": 0.1727907508611679, + "learning_rate": 7.372562705927006e-05, + "loss": 1.4246, + "step": 72950 + }, + { + "epoch": 3.052026351889173, + "grad_norm": 0.17617569863796234, + "learning_rate": 7.371122411786538e-05, + "loss": 1.433, + "step": 72960 + }, + { + "epoch": 3.0525682930546854, + "grad_norm": 0.2908771336078644, + "learning_rate": 7.369681885859548e-05, + "loss": 1.4263, + "step": 72970 + }, + { + "epoch": 3.0531102342201977, + "grad_norm": 0.2100307047367096, + "learning_rate": 7.368241128324485e-05, + "loss": 1.4221, + "step": 72980 + }, + { + "epoch": 3.0536521753857095, + "grad_norm": 0.24599306285381317, + "learning_rate": 7.366800139359822e-05, + "loss": 1.4234, + "step": 72990 + }, + { + "epoch": 3.0538147577353634, + "eval_loss": 2.424680709838867, + "eval_runtime": 22.0534, + "eval_samples_per_second": 226.723, + "eval_steps_per_second": 1.224, + "step": 72993 + }, + { + "epoch": 3.054194116551222, + "grad_norm": 0.18987296521663666, + "learning_rate": 7.365358919144063e-05, + "loss": 1.427, + "step": 73000 + }, + { + "epoch": 3.054736057716734, + "grad_norm": 0.17865373194217682, + "learning_rate": 7.363917467855736e-05, + "loss": 1.426, + "step": 73010 + }, + { + "epoch": 3.0552779988822465, + "grad_norm": 0.17104874551296234, + "learning_rate": 7.362475785673409e-05, + "loss": 1.4333, + "step": 73020 + }, + { + "epoch": 3.0558199400477584, + "grad_norm": 0.17414012551307678, + "learning_rate": 7.361033872775667e-05, + "loss": 1.4307, + "step": 73030 + }, + { + "epoch": 3.0563618812132707, + "grad_norm": 0.2933975160121918, + "learning_rate": 7.359591729341128e-05, + "loss": 1.4239, + "step": 73040 + }, + { + "epoch": 3.056903822378783, + "grad_norm": 0.1879478394985199, + "learning_rate": 7.358149355548439e-05, + "loss": 1.4258, + "step": 73050 + }, + { + "epoch": 3.0574457635442953, + "grad_norm": 0.18025019764900208, + "learning_rate": 7.356706751576274e-05, + "loss": 1.421, + "step": 73060 + }, + { + "epoch": 3.0579877047098076, + "grad_norm": 0.16190853714942932, + "learning_rate": 7.355263917603341e-05, + "loss": 1.4229, + "step": 73070 + }, + { + "epoch": 3.0585296458753195, + "grad_norm": 0.19061139225959778, + "learning_rate": 7.353820853808365e-05, + "loss": 1.4202, + "step": 73080 + }, + { + "epoch": 3.0585296458753195, + "eval_loss": 2.4280989170074463, + "eval_runtime": 21.9869, + "eval_samples_per_second": 227.408, + "eval_steps_per_second": 1.228, + "step": 73080 + }, + { + "epoch": 3.059071587040832, + "grad_norm": 0.24955511093139648, + "learning_rate": 7.352377560370113e-05, + "loss": 1.4388, + "step": 73090 + }, + { + "epoch": 3.059613528206344, + "grad_norm": 0.24170927703380585, + "learning_rate": 7.350934037467371e-05, + "loss": 1.4251, + "step": 73100 + }, + { + "epoch": 3.0601554693718565, + "grad_norm": 0.19894978404045105, + "learning_rate": 7.349490285278953e-05, + "loss": 1.4229, + "step": 73110 + }, + { + "epoch": 3.0606974105373683, + "grad_norm": 0.16050221025943756, + "learning_rate": 7.348046303983713e-05, + "loss": 1.4217, + "step": 73120 + }, + { + "epoch": 3.0612393517028806, + "grad_norm": 0.20622624456882477, + "learning_rate": 7.346602093760516e-05, + "loss": 1.4358, + "step": 73130 + }, + { + "epoch": 3.061781292868393, + "grad_norm": 0.17460785806179047, + "learning_rate": 7.345157654788272e-05, + "loss": 1.4205, + "step": 73140 + }, + { + "epoch": 3.0623232340339053, + "grad_norm": 0.2245752215385437, + "learning_rate": 7.343712987245908e-05, + "loss": 1.4378, + "step": 73150 + }, + { + "epoch": 3.0628651751994176, + "grad_norm": 0.2324395626783371, + "learning_rate": 7.342268091312381e-05, + "loss": 1.4331, + "step": 73160 + }, + { + "epoch": 3.063244534015276, + "eval_loss": 2.4189603328704834, + "eval_runtime": 22.2302, + "eval_samples_per_second": 224.919, + "eval_steps_per_second": 1.215, + "step": 73167 + }, + { + "epoch": 3.0634071163649295, + "grad_norm": 0.16124655306339264, + "learning_rate": 7.340822967166684e-05, + "loss": 1.427, + "step": 73170 + }, + { + "epoch": 3.0639490575304418, + "grad_norm": 0.171858012676239, + "learning_rate": 7.339377614987827e-05, + "loss": 1.4179, + "step": 73180 + }, + { + "epoch": 3.064490998695954, + "grad_norm": 0.28253042697906494, + "learning_rate": 7.33793203495486e-05, + "loss": 1.4261, + "step": 73190 + }, + { + "epoch": 3.0650329398614664, + "grad_norm": 0.1876271665096283, + "learning_rate": 7.336486227246851e-05, + "loss": 1.431, + "step": 73200 + }, + { + "epoch": 3.0655748810269783, + "grad_norm": 0.2576892077922821, + "learning_rate": 7.335040192042901e-05, + "loss": 1.4216, + "step": 73210 + }, + { + "epoch": 3.0661168221924906, + "grad_norm": 0.28171801567077637, + "learning_rate": 7.333593929522138e-05, + "loss": 1.4285, + "step": 73220 + }, + { + "epoch": 3.066658763358003, + "grad_norm": 0.16934189200401306, + "learning_rate": 7.33214743986372e-05, + "loss": 1.4277, + "step": 73230 + }, + { + "epoch": 3.0672007045235152, + "grad_norm": 0.17389722168445587, + "learning_rate": 7.330700723246835e-05, + "loss": 1.4219, + "step": 73240 + }, + { + "epoch": 3.0677426456890275, + "grad_norm": 0.18714025616645813, + "learning_rate": 7.32925377985069e-05, + "loss": 1.4354, + "step": 73250 + }, + { + "epoch": 3.067959422155232, + "eval_loss": 2.4224860668182373, + "eval_runtime": 21.9928, + "eval_samples_per_second": 227.347, + "eval_steps_per_second": 1.228, + "step": 73254 + }, + { + "epoch": 3.0682845868545394, + "grad_norm": 0.1830187737941742, + "learning_rate": 7.327806609854527e-05, + "loss": 1.4227, + "step": 73260 + }, + { + "epoch": 3.0688265280200517, + "grad_norm": 0.1765843778848648, + "learning_rate": 7.326359213437618e-05, + "loss": 1.4215, + "step": 73270 + }, + { + "epoch": 3.069368469185564, + "grad_norm": 0.19436825811862946, + "learning_rate": 7.32491159077926e-05, + "loss": 1.423, + "step": 73280 + }, + { + "epoch": 3.0699104103510764, + "grad_norm": 0.31252750754356384, + "learning_rate": 7.323463742058776e-05, + "loss": 1.4223, + "step": 73290 + }, + { + "epoch": 3.0704523515165887, + "grad_norm": 0.2948218286037445, + "learning_rate": 7.322015667455521e-05, + "loss": 1.4295, + "step": 73300 + }, + { + "epoch": 3.0709942926821006, + "grad_norm": 0.1795574426651001, + "learning_rate": 7.320567367148875e-05, + "loss": 1.4268, + "step": 73310 + }, + { + "epoch": 3.071536233847613, + "grad_norm": 0.2362855225801468, + "learning_rate": 7.319118841318246e-05, + "loss": 1.4224, + "step": 73320 + }, + { + "epoch": 3.072078175013125, + "grad_norm": 0.22740645706653595, + "learning_rate": 7.317670090143076e-05, + "loss": 1.4203, + "step": 73330 + }, + { + "epoch": 3.0726201161786375, + "grad_norm": 0.1676589697599411, + "learning_rate": 7.316221113802825e-05, + "loss": 1.4286, + "step": 73340 + }, + { + "epoch": 3.072674310295189, + "eval_loss": 2.4217214584350586, + "eval_runtime": 21.9907, + "eval_samples_per_second": 227.369, + "eval_steps_per_second": 1.228, + "step": 73341 + }, + { + "epoch": 3.0731620573441494, + "grad_norm": 0.4173310697078705, + "learning_rate": 7.314771912476987e-05, + "loss": 1.4293, + "step": 73350 + }, + { + "epoch": 3.0737039985096617, + "grad_norm": 0.198992520570755, + "learning_rate": 7.313322486345085e-05, + "loss": 1.4195, + "step": 73360 + }, + { + "epoch": 3.074245939675174, + "grad_norm": 0.19896645843982697, + "learning_rate": 7.311872835586665e-05, + "loss": 1.4355, + "step": 73370 + }, + { + "epoch": 3.0747878808406863, + "grad_norm": 0.1793803870677948, + "learning_rate": 7.310422960381305e-05, + "loss": 1.4228, + "step": 73380 + }, + { + "epoch": 3.0753298220061986, + "grad_norm": 0.21066676080226898, + "learning_rate": 7.308972860908609e-05, + "loss": 1.4219, + "step": 73390 + }, + { + "epoch": 3.0758717631717105, + "grad_norm": 0.18298877775669098, + "learning_rate": 7.30752253734821e-05, + "loss": 1.4309, + "step": 73400 + }, + { + "epoch": 3.076413704337223, + "grad_norm": 0.202785462141037, + "learning_rate": 7.306071989879762e-05, + "loss": 1.423, + "step": 73410 + }, + { + "epoch": 3.076955645502735, + "grad_norm": 0.17988531291484833, + "learning_rate": 7.304621218682961e-05, + "loss": 1.4266, + "step": 73420 + }, + { + "epoch": 3.077389198435145, + "eval_loss": 2.4217171669006348, + "eval_runtime": 22.0012, + "eval_samples_per_second": 227.26, + "eval_steps_per_second": 1.227, + "step": 73428 + }, + { + "epoch": 3.0774975866682475, + "grad_norm": 0.29091671109199524, + "learning_rate": 7.303170223937518e-05, + "loss": 1.4327, + "step": 73430 + }, + { + "epoch": 3.0780395278337593, + "grad_norm": 0.1689477562904358, + "learning_rate": 7.301719005823175e-05, + "loss": 1.4226, + "step": 73440 + }, + { + "epoch": 3.0785814689992717, + "grad_norm": 0.21074718236923218, + "learning_rate": 7.300267564519703e-05, + "loss": 1.4233, + "step": 73450 + }, + { + "epoch": 3.079123410164784, + "grad_norm": 0.18468187749385834, + "learning_rate": 7.298815900206904e-05, + "loss": 1.4186, + "step": 73460 + }, + { + "epoch": 3.0796653513302963, + "grad_norm": 0.16234566271305084, + "learning_rate": 7.2973640130646e-05, + "loss": 1.42, + "step": 73470 + }, + { + "epoch": 3.0802072924958086, + "grad_norm": 0.20264677703380585, + "learning_rate": 7.295911903272643e-05, + "loss": 1.4322, + "step": 73480 + }, + { + "epoch": 3.0807492336613205, + "grad_norm": 0.20148052275180817, + "learning_rate": 7.29445957101092e-05, + "loss": 1.42, + "step": 73490 + }, + { + "epoch": 3.081291174826833, + "grad_norm": 0.1632404625415802, + "learning_rate": 7.293007016459333e-05, + "loss": 1.4228, + "step": 73500 + }, + { + "epoch": 3.081833115992345, + "grad_norm": 0.28163808584213257, + "learning_rate": 7.291554239797823e-05, + "loss": 1.4194, + "step": 73510 + }, + { + "epoch": 3.082104086575101, + "eval_loss": 2.420088768005371, + "eval_runtime": 21.9934, + "eval_samples_per_second": 227.341, + "eval_steps_per_second": 1.228, + "step": 73515 + }, + { + "epoch": 3.0823750571578574, + "grad_norm": 0.2194111943244934, + "learning_rate": 7.290101241206354e-05, + "loss": 1.4236, + "step": 73520 + }, + { + "epoch": 3.0829169983233697, + "grad_norm": 0.3260360658168793, + "learning_rate": 7.288648020864911e-05, + "loss": 1.436, + "step": 73530 + }, + { + "epoch": 3.0834589394888816, + "grad_norm": 0.21551568806171417, + "learning_rate": 7.287194578953518e-05, + "loss": 1.423, + "step": 73540 + }, + { + "epoch": 3.084000880654394, + "grad_norm": 0.2473076581954956, + "learning_rate": 7.28574091565222e-05, + "loss": 1.4256, + "step": 73550 + }, + { + "epoch": 3.0845428218199062, + "grad_norm": 0.24069726467132568, + "learning_rate": 7.28428703114109e-05, + "loss": 1.4311, + "step": 73560 + }, + { + "epoch": 3.0850847629854186, + "grad_norm": 0.24383601546287537, + "learning_rate": 7.28283292560023e-05, + "loss": 1.4249, + "step": 73570 + }, + { + "epoch": 3.0856267041509304, + "grad_norm": 0.17306429147720337, + "learning_rate": 7.281378599209768e-05, + "loss": 1.4223, + "step": 73580 + }, + { + "epoch": 3.0861686453164427, + "grad_norm": 0.3249399960041046, + "learning_rate": 7.279924052149858e-05, + "loss": 1.4173, + "step": 73590 + }, + { + "epoch": 3.086710586481955, + "grad_norm": 0.33424919843673706, + "learning_rate": 7.278469284600684e-05, + "loss": 1.4187, + "step": 73600 + }, + { + "epoch": 3.0868189747150576, + "eval_loss": 2.4198853969573975, + "eval_runtime": 21.9949, + "eval_samples_per_second": 227.326, + "eval_steps_per_second": 1.228, + "step": 73602 + }, + { + "epoch": 3.0872525276474674, + "grad_norm": 0.25549885630607605, + "learning_rate": 7.277014296742457e-05, + "loss": 1.4192, + "step": 73610 + }, + { + "epoch": 3.0877944688129797, + "grad_norm": 0.3509141206741333, + "learning_rate": 7.275559088755414e-05, + "loss": 1.4217, + "step": 73620 + }, + { + "epoch": 3.0883364099784916, + "grad_norm": 0.1701222062110901, + "learning_rate": 7.27410366081982e-05, + "loss": 1.4255, + "step": 73630 + }, + { + "epoch": 3.088878351144004, + "grad_norm": 0.4111487865447998, + "learning_rate": 7.272648013115965e-05, + "loss": 1.4136, + "step": 73640 + }, + { + "epoch": 3.089420292309516, + "grad_norm": 0.292900949716568, + "learning_rate": 7.271192145824172e-05, + "loss": 1.4301, + "step": 73650 + }, + { + "epoch": 3.0899622334750285, + "grad_norm": 0.24990233778953552, + "learning_rate": 7.269736059124785e-05, + "loss": 1.4301, + "step": 73660 + }, + { + "epoch": 3.0905041746405404, + "grad_norm": 0.2537482678890228, + "learning_rate": 7.26827975319818e-05, + "loss": 1.4215, + "step": 73670 + }, + { + "epoch": 3.0910461158060527, + "grad_norm": 0.23013080656528473, + "learning_rate": 7.266823228224754e-05, + "loss": 1.4313, + "step": 73680 + }, + { + "epoch": 3.0915338628550137, + "eval_loss": 2.418518304824829, + "eval_runtime": 21.9932, + "eval_samples_per_second": 227.343, + "eval_steps_per_second": 1.228, + "step": 73689 + }, + { + "epoch": 3.091588056971565, + "grad_norm": 0.20117320120334625, + "learning_rate": 7.265366484384938e-05, + "loss": 1.4179, + "step": 73690 + }, + { + "epoch": 3.0921299981370773, + "grad_norm": 0.25527241826057434, + "learning_rate": 7.263909521859186e-05, + "loss": 1.4265, + "step": 73700 + }, + { + "epoch": 3.0926719393025897, + "grad_norm": 0.1841808259487152, + "learning_rate": 7.262452340827981e-05, + "loss": 1.4271, + "step": 73710 + }, + { + "epoch": 3.0932138804681015, + "grad_norm": 0.16479361057281494, + "learning_rate": 7.260994941471832e-05, + "loss": 1.4119, + "step": 73720 + }, + { + "epoch": 3.093755821633614, + "grad_norm": 0.23891477286815643, + "learning_rate": 7.259537323971276e-05, + "loss": 1.424, + "step": 73730 + }, + { + "epoch": 3.094297762799126, + "grad_norm": 0.22275440394878387, + "learning_rate": 7.258079488506874e-05, + "loss": 1.4334, + "step": 73740 + }, + { + "epoch": 3.0948397039646385, + "grad_norm": 0.19591204822063446, + "learning_rate": 7.256621435259218e-05, + "loss": 1.4379, + "step": 73750 + }, + { + "epoch": 3.0953816451301503, + "grad_norm": 0.19313859939575195, + "learning_rate": 7.255163164408926e-05, + "loss": 1.4185, + "step": 73760 + }, + { + "epoch": 3.0959235862956627, + "grad_norm": 0.23939813673496246, + "learning_rate": 7.25370467613664e-05, + "loss": 1.4255, + "step": 73770 + }, + { + "epoch": 3.0962487509949703, + "eval_loss": 2.417570114135742, + "eval_runtime": 21.9951, + "eval_samples_per_second": 227.323, + "eval_steps_per_second": 1.228, + "step": 73776 + }, + { + "epoch": 3.096465527461175, + "grad_norm": 0.23741766810417175, + "learning_rate": 7.252245970623035e-05, + "loss": 1.4347, + "step": 73780 + }, + { + "epoch": 3.0970074686266873, + "grad_norm": 0.17643536627292633, + "learning_rate": 7.250787048048804e-05, + "loss": 1.4208, + "step": 73790 + }, + { + "epoch": 3.0975494097921996, + "grad_norm": 0.1725606471300125, + "learning_rate": 7.249327908594675e-05, + "loss": 1.4172, + "step": 73800 + }, + { + "epoch": 3.0980913509577115, + "grad_norm": 0.17194822430610657, + "learning_rate": 7.247868552441402e-05, + "loss": 1.4279, + "step": 73810 + }, + { + "epoch": 3.098633292123224, + "grad_norm": 0.17184831202030182, + "learning_rate": 7.246408979769759e-05, + "loss": 1.4215, + "step": 73820 + }, + { + "epoch": 3.099175233288736, + "grad_norm": 0.15591196715831757, + "learning_rate": 7.244949190760553e-05, + "loss": 1.413, + "step": 73830 + }, + { + "epoch": 3.0997171744542484, + "grad_norm": 0.2347261756658554, + "learning_rate": 7.243489185594618e-05, + "loss": 1.4282, + "step": 73840 + }, + { + "epoch": 3.1002591156197603, + "grad_norm": 0.17219695448875427, + "learning_rate": 7.242028964452812e-05, + "loss": 1.4218, + "step": 73850 + }, + { + "epoch": 3.1008010567852726, + "grad_norm": 0.19845928251743317, + "learning_rate": 7.240568527516022e-05, + "loss": 1.4226, + "step": 73860 + }, + { + "epoch": 3.1009636391349265, + "eval_loss": 2.4213979244232178, + "eval_runtime": 21.9911, + "eval_samples_per_second": 227.365, + "eval_steps_per_second": 1.228, + "step": 73863 + }, + { + "epoch": 3.101342997950785, + "grad_norm": 0.18098603188991547, + "learning_rate": 7.239107874965158e-05, + "loss": 1.4348, + "step": 73870 + }, + { + "epoch": 3.1018849391162973, + "grad_norm": 0.19403134286403656, + "learning_rate": 7.237647006981162e-05, + "loss": 1.4321, + "step": 73880 + }, + { + "epoch": 3.1024268802818096, + "grad_norm": 0.2447945922613144, + "learning_rate": 7.236185923745e-05, + "loss": 1.4228, + "step": 73890 + }, + { + "epoch": 3.1029688214473214, + "grad_norm": 0.20486268401145935, + "learning_rate": 7.234724625437661e-05, + "loss": 1.4183, + "step": 73900 + }, + { + "epoch": 3.1035107626128338, + "grad_norm": 0.18675167858600616, + "learning_rate": 7.233263112240168e-05, + "loss": 1.4293, + "step": 73910 + }, + { + "epoch": 3.104052703778346, + "grad_norm": 0.3332042396068573, + "learning_rate": 7.231801384333567e-05, + "loss": 1.4271, + "step": 73920 + }, + { + "epoch": 3.1045946449438584, + "grad_norm": 0.20681266486644745, + "learning_rate": 7.230339441898928e-05, + "loss": 1.4215, + "step": 73930 + }, + { + "epoch": 3.1051365861093707, + "grad_norm": 0.2513537108898163, + "learning_rate": 7.228877285117352e-05, + "loss": 1.4296, + "step": 73940 + }, + { + "epoch": 3.1056785272748826, + "grad_norm": 0.17482328414916992, + "learning_rate": 7.227414914169961e-05, + "loss": 1.4263, + "step": 73950 + }, + { + "epoch": 3.1056785272748826, + "eval_loss": 2.4194869995117188, + "eval_runtime": 21.9853, + "eval_samples_per_second": 227.425, + "eval_steps_per_second": 1.228, + "step": 73950 + }, + { + "epoch": 3.106220468440395, + "grad_norm": 0.2170400321483612, + "learning_rate": 7.225952329237913e-05, + "loss": 1.4152, + "step": 73960 + }, + { + "epoch": 3.106762409605907, + "grad_norm": 0.22855044901371002, + "learning_rate": 7.224489530502384e-05, + "loss": 1.4359, + "step": 73970 + }, + { + "epoch": 3.1073043507714195, + "grad_norm": 0.18862365186214447, + "learning_rate": 7.223026518144578e-05, + "loss": 1.4132, + "step": 73980 + }, + { + "epoch": 3.1078462919369314, + "grad_norm": 0.21314261853694916, + "learning_rate": 7.221563292345728e-05, + "loss": 1.4295, + "step": 73990 + }, + { + "epoch": 3.1083882331024437, + "grad_norm": 0.3071286976337433, + "learning_rate": 7.220099853287091e-05, + "loss": 1.4306, + "step": 74000 + }, + { + "epoch": 3.108930174267956, + "grad_norm": 0.20251207053661346, + "learning_rate": 7.218636201149952e-05, + "loss": 1.4239, + "step": 74010 + }, + { + "epoch": 3.1094721154334684, + "grad_norm": 0.18129009008407593, + "learning_rate": 7.217172336115622e-05, + "loss": 1.4233, + "step": 74020 + }, + { + "epoch": 3.1100140565989807, + "grad_norm": 0.21674220263957977, + "learning_rate": 7.21570825836544e-05, + "loss": 1.426, + "step": 74030 + }, + { + "epoch": 3.110393415414839, + "eval_loss": 2.4165384769439697, + "eval_runtime": 21.9918, + "eval_samples_per_second": 227.357, + "eval_steps_per_second": 1.228, + "step": 74037 + }, + { + "epoch": 3.1105559977644925, + "grad_norm": 0.23506887257099152, + "learning_rate": 7.214243968080766e-05, + "loss": 1.4339, + "step": 74040 + }, + { + "epoch": 3.111097938930005, + "grad_norm": 0.19796548783779144, + "learning_rate": 7.212779465442993e-05, + "loss": 1.4343, + "step": 74050 + }, + { + "epoch": 3.111639880095517, + "grad_norm": 0.22716623544692993, + "learning_rate": 7.211314750633534e-05, + "loss": 1.4137, + "step": 74060 + }, + { + "epoch": 3.1121818212610295, + "grad_norm": 0.3029038608074188, + "learning_rate": 7.209849823833835e-05, + "loss": 1.4262, + "step": 74070 + }, + { + "epoch": 3.1127237624265414, + "grad_norm": 0.175888329744339, + "learning_rate": 7.208384685225364e-05, + "loss": 1.4147, + "step": 74080 + }, + { + "epoch": 3.1132657035920537, + "grad_norm": 0.28215131163597107, + "learning_rate": 7.206919334989613e-05, + "loss": 1.4238, + "step": 74090 + }, + { + "epoch": 3.113807644757566, + "grad_norm": 0.26978573203086853, + "learning_rate": 7.205453773308107e-05, + "loss": 1.4235, + "step": 74100 + }, + { + "epoch": 3.1143495859230783, + "grad_norm": 0.16594207286834717, + "learning_rate": 7.20398800036239e-05, + "loss": 1.4189, + "step": 74110 + }, + { + "epoch": 3.1148915270885906, + "grad_norm": 0.22124223411083221, + "learning_rate": 7.20252201633404e-05, + "loss": 1.4344, + "step": 74120 + }, + { + "epoch": 3.1151083035547953, + "eval_loss": 2.4167044162750244, + "eval_runtime": 21.9938, + "eval_samples_per_second": 227.337, + "eval_steps_per_second": 1.228, + "step": 74124 + }, + { + "epoch": 3.1154334682541025, + "grad_norm": 0.16019515693187714, + "learning_rate": 7.201055821404653e-05, + "loss": 1.4233, + "step": 74130 + }, + { + "epoch": 3.115975409419615, + "grad_norm": 0.19272254407405853, + "learning_rate": 7.199589415755856e-05, + "loss": 1.4239, + "step": 74140 + }, + { + "epoch": 3.116517350585127, + "grad_norm": 0.3636123538017273, + "learning_rate": 7.198122799569302e-05, + "loss": 1.4233, + "step": 74150 + }, + { + "epoch": 3.1170592917506394, + "grad_norm": 0.31708648800849915, + "learning_rate": 7.196655973026667e-05, + "loss": 1.4282, + "step": 74160 + }, + { + "epoch": 3.1176012329161518, + "grad_norm": 0.19407716393470764, + "learning_rate": 7.195188936309657e-05, + "loss": 1.4302, + "step": 74170 + }, + { + "epoch": 3.1181431740816636, + "grad_norm": 0.17568087577819824, + "learning_rate": 7.193721689600002e-05, + "loss": 1.4249, + "step": 74180 + }, + { + "epoch": 3.118685115247176, + "grad_norm": 0.36815229058265686, + "learning_rate": 7.192254233079455e-05, + "loss": 1.4195, + "step": 74190 + }, + { + "epoch": 3.1192270564126883, + "grad_norm": 0.21112428605556488, + "learning_rate": 7.190786566929802e-05, + "loss": 1.4215, + "step": 74200 + }, + { + "epoch": 3.1197689975782006, + "grad_norm": 0.19633443653583527, + "learning_rate": 7.189318691332851e-05, + "loss": 1.4301, + "step": 74210 + }, + { + "epoch": 3.119823191694752, + "eval_loss": 2.416978359222412, + "eval_runtime": 21.9898, + "eval_samples_per_second": 227.378, + "eval_steps_per_second": 1.228, + "step": 74211 + }, + { + "epoch": 3.1203109387437125, + "grad_norm": 0.32902562618255615, + "learning_rate": 7.187850606470435e-05, + "loss": 1.4162, + "step": 74220 + }, + { + "epoch": 3.1208528799092248, + "grad_norm": 0.19823166728019714, + "learning_rate": 7.186382312524411e-05, + "loss": 1.4167, + "step": 74230 + }, + { + "epoch": 3.121394821074737, + "grad_norm": 0.1761886328458786, + "learning_rate": 7.18491380967667e-05, + "loss": 1.4312, + "step": 74240 + }, + { + "epoch": 3.1219367622402494, + "grad_norm": 0.3041189908981323, + "learning_rate": 7.18344509810912e-05, + "loss": 1.4233, + "step": 74250 + }, + { + "epoch": 3.1224787034057617, + "grad_norm": 0.39153873920440674, + "learning_rate": 7.181976178003701e-05, + "loss": 1.4268, + "step": 74260 + }, + { + "epoch": 3.1230206445712736, + "grad_norm": 0.3814428448677063, + "learning_rate": 7.180507049542376e-05, + "loss": 1.4169, + "step": 74270 + }, + { + "epoch": 3.123562585736786, + "grad_norm": 0.29164746403694153, + "learning_rate": 7.179037712907131e-05, + "loss": 1.4271, + "step": 74280 + }, + { + "epoch": 3.1241045269022982, + "grad_norm": 0.20582854747772217, + "learning_rate": 7.177568168279987e-05, + "loss": 1.4168, + "step": 74290 + }, + { + "epoch": 3.124538079834708, + "eval_loss": 2.4158596992492676, + "eval_runtime": 22.0314, + "eval_samples_per_second": 226.949, + "eval_steps_per_second": 1.226, + "step": 74298 + }, + { + "epoch": 3.1246464680678105, + "grad_norm": 0.18478041887283325, + "learning_rate": 7.17609841584298e-05, + "loss": 1.4319, + "step": 74300 + }, + { + "epoch": 3.1251884092333224, + "grad_norm": 0.2710546851158142, + "learning_rate": 7.174628455778178e-05, + "loss": 1.4217, + "step": 74310 + }, + { + "epoch": 3.1257303503988347, + "grad_norm": 0.18582426011562347, + "learning_rate": 7.173158288267674e-05, + "loss": 1.427, + "step": 74320 + }, + { + "epoch": 3.126272291564347, + "grad_norm": 0.1683790534734726, + "learning_rate": 7.171687913493586e-05, + "loss": 1.4333, + "step": 74330 + }, + { + "epoch": 3.1268142327298594, + "grad_norm": 0.16057170927524567, + "learning_rate": 7.170217331638056e-05, + "loss": 1.4262, + "step": 74340 + }, + { + "epoch": 3.1273561738953717, + "grad_norm": 0.19682690501213074, + "learning_rate": 7.168746542883254e-05, + "loss": 1.4231, + "step": 74350 + }, + { + "epoch": 3.1278981150608836, + "grad_norm": 0.2511323094367981, + "learning_rate": 7.167275547411376e-05, + "loss": 1.4138, + "step": 74360 + }, + { + "epoch": 3.128440056226396, + "grad_norm": 0.2534506320953369, + "learning_rate": 7.16580434540464e-05, + "loss": 1.4225, + "step": 74370 + }, + { + "epoch": 3.128981997391908, + "grad_norm": 0.26025325059890747, + "learning_rate": 7.164332937045295e-05, + "loss": 1.425, + "step": 74380 + }, + { + "epoch": 3.129252967974664, + "eval_loss": 2.41723370552063, + "eval_runtime": 22.0929, + "eval_samples_per_second": 226.317, + "eval_steps_per_second": 1.222, + "step": 74385 + }, + { + "epoch": 3.1295239385574205, + "grad_norm": 0.3324603736400604, + "learning_rate": 7.162861322515613e-05, + "loss": 1.43, + "step": 74390 + }, + { + "epoch": 3.1300658797229324, + "grad_norm": 0.21369896829128265, + "learning_rate": 7.161389501997889e-05, + "loss": 1.4211, + "step": 74400 + }, + { + "epoch": 3.1306078208884447, + "grad_norm": 0.5364189743995667, + "learning_rate": 7.159917475674447e-05, + "loss": 1.424, + "step": 74410 + }, + { + "epoch": 3.131149762053957, + "grad_norm": 0.29370662569999695, + "learning_rate": 7.158445243727632e-05, + "loss": 1.4254, + "step": 74420 + }, + { + "epoch": 3.1316917032194693, + "grad_norm": 0.4463825821876526, + "learning_rate": 7.156972806339823e-05, + "loss": 1.4264, + "step": 74430 + }, + { + "epoch": 3.1322336443849816, + "grad_norm": 0.181520015001297, + "learning_rate": 7.155500163693417e-05, + "loss": 1.422, + "step": 74440 + }, + { + "epoch": 3.1327755855504935, + "grad_norm": 0.18724457919597626, + "learning_rate": 7.154027315970838e-05, + "loss": 1.4197, + "step": 74450 + }, + { + "epoch": 3.133317526716006, + "grad_norm": 0.19526837766170502, + "learning_rate": 7.152554263354535e-05, + "loss": 1.4262, + "step": 74460 + }, + { + "epoch": 3.133859467881518, + "grad_norm": 0.21833248436450958, + "learning_rate": 7.151081006026984e-05, + "loss": 1.4281, + "step": 74470 + }, + { + "epoch": 3.1339678561146207, + "eval_loss": 2.4146862030029297, + "eval_runtime": 22.0465, + "eval_samples_per_second": 226.794, + "eval_steps_per_second": 1.225, + "step": 74472 + }, + { + "epoch": 3.1344014090470305, + "grad_norm": 0.22797681391239166, + "learning_rate": 7.149607544170687e-05, + "loss": 1.4222, + "step": 74480 + }, + { + "epoch": 3.1349433502125423, + "grad_norm": 0.16637133061885834, + "learning_rate": 7.14813387796817e-05, + "loss": 1.4116, + "step": 74490 + }, + { + "epoch": 3.1354852913780547, + "grad_norm": 0.28121089935302734, + "learning_rate": 7.146660007601984e-05, + "loss": 1.417, + "step": 74500 + }, + { + "epoch": 3.136027232543567, + "grad_norm": 0.19828984141349792, + "learning_rate": 7.145185933254705e-05, + "loss": 1.4322, + "step": 74510 + }, + { + "epoch": 3.1365691737090793, + "grad_norm": 0.2876355051994324, + "learning_rate": 7.143711655108935e-05, + "loss": 1.435, + "step": 74520 + }, + { + "epoch": 3.1371111148745916, + "grad_norm": 0.1621021181344986, + "learning_rate": 7.1422371733473e-05, + "loss": 1.4288, + "step": 74530 + }, + { + "epoch": 3.1376530560401035, + "grad_norm": 0.3681042492389679, + "learning_rate": 7.140762488152457e-05, + "loss": 1.429, + "step": 74540 + }, + { + "epoch": 3.138194997205616, + "grad_norm": 0.22536900639533997, + "learning_rate": 7.139287599707078e-05, + "loss": 1.4203, + "step": 74550 + }, + { + "epoch": 3.138682744254577, + "eval_loss": 2.4159655570983887, + "eval_runtime": 22.1157, + "eval_samples_per_second": 226.084, + "eval_steps_per_second": 1.221, + "step": 74559 + }, + { + "epoch": 3.138736938371128, + "grad_norm": 0.18401791155338287, + "learning_rate": 7.137812508193868e-05, + "loss": 1.4321, + "step": 74560 + }, + { + "epoch": 3.1392788795366404, + "grad_norm": 0.38875266909599304, + "learning_rate": 7.136337213795556e-05, + "loss": 1.4213, + "step": 74570 + }, + { + "epoch": 3.1398208207021527, + "grad_norm": 0.28335729241371155, + "learning_rate": 7.134861716694894e-05, + "loss": 1.4259, + "step": 74580 + }, + { + "epoch": 3.1403627618676646, + "grad_norm": 0.4349845051765442, + "learning_rate": 7.13338601707466e-05, + "loss": 1.4315, + "step": 74590 + }, + { + "epoch": 3.140904703033177, + "grad_norm": 0.2121918946504593, + "learning_rate": 7.131910115117659e-05, + "loss": 1.4343, + "step": 74600 + }, + { + "epoch": 3.1414466441986892, + "grad_norm": 0.281886488199234, + "learning_rate": 7.130434011006716e-05, + "loss": 1.4218, + "step": 74610 + }, + { + "epoch": 3.1419885853642016, + "grad_norm": 0.1884024739265442, + "learning_rate": 7.128957704924689e-05, + "loss": 1.4322, + "step": 74620 + }, + { + "epoch": 3.1425305265297134, + "grad_norm": 0.3853340446949005, + "learning_rate": 7.127481197054452e-05, + "loss": 1.4274, + "step": 74630 + }, + { + "epoch": 3.1430724676952257, + "grad_norm": 0.18657207489013672, + "learning_rate": 7.12600448757891e-05, + "loss": 1.4131, + "step": 74640 + }, + { + "epoch": 3.1433976323945334, + "eval_loss": 2.421818733215332, + "eval_runtime": 22.0955, + "eval_samples_per_second": 226.29, + "eval_steps_per_second": 1.222, + "step": 74646 + }, + { + "epoch": 3.143614408860738, + "grad_norm": 0.17503514885902405, + "learning_rate": 7.124527576680993e-05, + "loss": 1.425, + "step": 74650 + }, + { + "epoch": 3.1441563500262504, + "grad_norm": 0.17085334658622742, + "learning_rate": 7.123050464543652e-05, + "loss": 1.4221, + "step": 74660 + }, + { + "epoch": 3.1446982911917627, + "grad_norm": 0.19825150072574615, + "learning_rate": 7.121573151349863e-05, + "loss": 1.4195, + "step": 74670 + }, + { + "epoch": 3.1452402323572746, + "grad_norm": 0.29708579182624817, + "learning_rate": 7.120095637282636e-05, + "loss": 1.4341, + "step": 74680 + }, + { + "epoch": 3.145782173522787, + "grad_norm": 0.2100587636232376, + "learning_rate": 7.118617922524996e-05, + "loss": 1.4208, + "step": 74690 + }, + { + "epoch": 3.146324114688299, + "grad_norm": 0.2220253199338913, + "learning_rate": 7.117140007259993e-05, + "loss": 1.4305, + "step": 74700 + }, + { + "epoch": 3.1468660558538115, + "grad_norm": 0.17661844193935394, + "learning_rate": 7.115661891670706e-05, + "loss": 1.4279, + "step": 74710 + }, + { + "epoch": 3.1474079970193234, + "grad_norm": 0.19139379262924194, + "learning_rate": 7.11418357594024e-05, + "loss": 1.4318, + "step": 74720 + }, + { + "epoch": 3.1479499381848357, + "grad_norm": 0.17980216443538666, + "learning_rate": 7.11270506025172e-05, + "loss": 1.4247, + "step": 74730 + }, + { + "epoch": 3.1481125205344895, + "eval_loss": 2.416665554046631, + "eval_runtime": 22.0909, + "eval_samples_per_second": 226.337, + "eval_steps_per_second": 1.222, + "step": 74733 + }, + { + "epoch": 3.148491879350348, + "grad_norm": 0.24544402956962585, + "learning_rate": 7.111226344788298e-05, + "loss": 1.4119, + "step": 74740 + }, + { + "epoch": 3.1490338205158603, + "grad_norm": 0.26411178708076477, + "learning_rate": 7.109747429733154e-05, + "loss": 1.4213, + "step": 74750 + }, + { + "epoch": 3.1495757616813727, + "grad_norm": 0.2932426333427429, + "learning_rate": 7.108268315269485e-05, + "loss": 1.4356, + "step": 74760 + }, + { + "epoch": 3.1501177028468845, + "grad_norm": 0.35350051522254944, + "learning_rate": 7.106789001580521e-05, + "loss": 1.4253, + "step": 74770 + }, + { + "epoch": 3.150659644012397, + "grad_norm": 0.1805257648229599, + "learning_rate": 7.10530948884951e-05, + "loss": 1.4194, + "step": 74780 + }, + { + "epoch": 3.151201585177909, + "grad_norm": 0.20616401731967926, + "learning_rate": 7.10382977725973e-05, + "loss": 1.4175, + "step": 74790 + }, + { + "epoch": 3.1517435263434215, + "grad_norm": 0.16186383366584778, + "learning_rate": 7.10234986699448e-05, + "loss": 1.4185, + "step": 74800 + }, + { + "epoch": 3.152285467508934, + "grad_norm": 0.31783926486968994, + "learning_rate": 7.100869758237085e-05, + "loss": 1.4254, + "step": 74810 + }, + { + "epoch": 3.1528274086744457, + "grad_norm": 0.20256149768829346, + "learning_rate": 7.099389451170897e-05, + "loss": 1.4193, + "step": 74820 + }, + { + "epoch": 3.1528274086744457, + "eval_loss": 2.4210216999053955, + "eval_runtime": 22.0823, + "eval_samples_per_second": 226.426, + "eval_steps_per_second": 1.223, + "step": 74820 + }, + { + "epoch": 3.153369349839958, + "grad_norm": 0.169488787651062, + "learning_rate": 7.097908945979284e-05, + "loss": 1.4263, + "step": 74830 + }, + { + "epoch": 3.1539112910054703, + "grad_norm": 0.1810005009174347, + "learning_rate": 7.09642824284565e-05, + "loss": 1.4223, + "step": 74840 + }, + { + "epoch": 3.1544532321709826, + "grad_norm": 0.21549201011657715, + "learning_rate": 7.094947341953417e-05, + "loss": 1.4189, + "step": 74850 + }, + { + "epoch": 3.1549951733364945, + "grad_norm": 0.274050772190094, + "learning_rate": 7.09346624348603e-05, + "loss": 1.4256, + "step": 74860 + }, + { + "epoch": 3.155537114502007, + "grad_norm": 0.21615564823150635, + "learning_rate": 7.091984947626964e-05, + "loss": 1.4145, + "step": 74870 + }, + { + "epoch": 3.156079055667519, + "grad_norm": 0.17899556457996368, + "learning_rate": 7.090503454559715e-05, + "loss": 1.4263, + "step": 74880 + }, + { + "epoch": 3.1566209968330314, + "grad_norm": 0.2249787449836731, + "learning_rate": 7.089021764467803e-05, + "loss": 1.4118, + "step": 74890 + }, + { + "epoch": 3.1571629379985433, + "grad_norm": 0.17908982932567596, + "learning_rate": 7.087539877534772e-05, + "loss": 1.4206, + "step": 74900 + }, + { + "epoch": 3.1575422968144022, + "eval_loss": 2.4155755043029785, + "eval_runtime": 21.9967, + "eval_samples_per_second": 227.307, + "eval_steps_per_second": 1.227, + "step": 74907 + }, + { + "epoch": 3.1577048791640556, + "grad_norm": 0.17990335822105408, + "learning_rate": 7.086057793944195e-05, + "loss": 1.4283, + "step": 74910 + }, + { + "epoch": 3.158246820329568, + "grad_norm": 0.19940659403800964, + "learning_rate": 7.084575513879664e-05, + "loss": 1.4323, + "step": 74920 + }, + { + "epoch": 3.1587887614950803, + "grad_norm": 0.1748953014612198, + "learning_rate": 7.083093037524798e-05, + "loss": 1.4191, + "step": 74930 + }, + { + "epoch": 3.1593307026605926, + "grad_norm": 0.16563956439495087, + "learning_rate": 7.08161036506324e-05, + "loss": 1.4231, + "step": 74940 + }, + { + "epoch": 3.1598726438261044, + "grad_norm": 0.21732689440250397, + "learning_rate": 7.080127496678655e-05, + "loss": 1.4266, + "step": 74950 + }, + { + "epoch": 3.1604145849916168, + "grad_norm": 0.17851050198078156, + "learning_rate": 7.078644432554738e-05, + "loss": 1.4195, + "step": 74960 + }, + { + "epoch": 3.160956526157129, + "grad_norm": 0.2197248935699463, + "learning_rate": 7.077161172875202e-05, + "loss": 1.4252, + "step": 74970 + }, + { + "epoch": 3.1614984673226414, + "grad_norm": 0.2679365575313568, + "learning_rate": 7.075677717823787e-05, + "loss": 1.4283, + "step": 74980 + }, + { + "epoch": 3.1620404084881537, + "grad_norm": 0.18566496670246124, + "learning_rate": 7.074194067584256e-05, + "loss": 1.4232, + "step": 74990 + }, + { + "epoch": 3.1622571849543584, + "eval_loss": 2.4319992065429688, + "eval_runtime": 21.9934, + "eval_samples_per_second": 227.341, + "eval_steps_per_second": 1.228, + "step": 74994 + }, + { + "epoch": 3.1625823496536656, + "grad_norm": 0.28876206278800964, + "learning_rate": 7.072710222340399e-05, + "loss": 1.4202, + "step": 75000 + }, + { + "epoch": 3.163124290819178, + "grad_norm": 0.22490562498569489, + "learning_rate": 7.07122618227603e-05, + "loss": 1.424, + "step": 75010 + }, + { + "epoch": 3.16366623198469, + "grad_norm": 0.17399635910987854, + "learning_rate": 7.069741947574981e-05, + "loss": 1.4218, + "step": 75020 + }, + { + "epoch": 3.1642081731502025, + "grad_norm": 0.2350047528743744, + "learning_rate": 7.068257518421116e-05, + "loss": 1.4214, + "step": 75030 + }, + { + "epoch": 3.1647501143157144, + "grad_norm": 0.5656566619873047, + "learning_rate": 7.066772894998316e-05, + "loss": 1.4225, + "step": 75040 + }, + { + "epoch": 3.1652920554812267, + "grad_norm": 0.22501088678836823, + "learning_rate": 7.065288077490493e-05, + "loss": 1.4295, + "step": 75050 + }, + { + "epoch": 3.165833996646739, + "grad_norm": 0.2622585594654083, + "learning_rate": 7.06380306608158e-05, + "loss": 1.411, + "step": 75060 + }, + { + "epoch": 3.1663759378122514, + "grad_norm": 0.2026064693927765, + "learning_rate": 7.062317860955529e-05, + "loss": 1.4129, + "step": 75070 + }, + { + "epoch": 3.1669178789777637, + "grad_norm": 0.16883380711078644, + "learning_rate": 7.060832462296329e-05, + "loss": 1.4127, + "step": 75080 + }, + { + "epoch": 3.1669720730943145, + "eval_loss": 2.4277634620666504, + "eval_runtime": 22.0408, + "eval_samples_per_second": 226.852, + "eval_steps_per_second": 1.225, + "step": 75081 + }, + { + "epoch": 3.1674598201432755, + "grad_norm": 0.17658884823322296, + "learning_rate": 7.059346870287978e-05, + "loss": 1.4141, + "step": 75090 + }, + { + "epoch": 3.168001761308788, + "grad_norm": 0.16324864327907562, + "learning_rate": 7.057861085114506e-05, + "loss": 1.4321, + "step": 75100 + }, + { + "epoch": 3.1685437024743, + "grad_norm": 0.17185302078723907, + "learning_rate": 7.056375106959967e-05, + "loss": 1.4238, + "step": 75110 + }, + { + "epoch": 3.1690856436398125, + "grad_norm": 0.23366904258728027, + "learning_rate": 7.054888936008437e-05, + "loss": 1.4175, + "step": 75120 + }, + { + "epoch": 3.1696275848053244, + "grad_norm": 0.23290032148361206, + "learning_rate": 7.053402572444017e-05, + "loss": 1.4307, + "step": 75130 + }, + { + "epoch": 3.1701695259708367, + "grad_norm": 0.19138121604919434, + "learning_rate": 7.05191601645083e-05, + "loss": 1.4317, + "step": 75140 + }, + { + "epoch": 3.170711467136349, + "grad_norm": 0.1849122941493988, + "learning_rate": 7.050429268213023e-05, + "loss": 1.431, + "step": 75150 + }, + { + "epoch": 3.1712534083018613, + "grad_norm": 0.2830689549446106, + "learning_rate": 7.04894232791477e-05, + "loss": 1.413, + "step": 75160 + }, + { + "epoch": 3.171686961234271, + "eval_loss": 2.426955223083496, + "eval_runtime": 22.0447, + "eval_samples_per_second": 226.812, + "eval_steps_per_second": 1.225, + "step": 75168 + }, + { + "epoch": 3.1717953494673736, + "grad_norm": 0.21067143976688385, + "learning_rate": 7.047455195740268e-05, + "loss": 1.4298, + "step": 75170 + }, + { + "epoch": 3.1723372906328855, + "grad_norm": 0.2845075726509094, + "learning_rate": 7.045967871873734e-05, + "loss": 1.4164, + "step": 75180 + }, + { + "epoch": 3.172879231798398, + "grad_norm": 0.2222658097743988, + "learning_rate": 7.044480356499412e-05, + "loss": 1.4213, + "step": 75190 + }, + { + "epoch": 3.17342117296391, + "grad_norm": 0.2342739850282669, + "learning_rate": 7.042992649801568e-05, + "loss": 1.425, + "step": 75200 + }, + { + "epoch": 3.1739631141294224, + "grad_norm": 0.16035151481628418, + "learning_rate": 7.041504751964494e-05, + "loss": 1.4337, + "step": 75210 + }, + { + "epoch": 3.1745050552949348, + "grad_norm": 0.1617584526538849, + "learning_rate": 7.040016663172505e-05, + "loss": 1.4214, + "step": 75220 + }, + { + "epoch": 3.1750469964604466, + "grad_norm": 0.18865042924880981, + "learning_rate": 7.038528383609936e-05, + "loss": 1.4262, + "step": 75230 + }, + { + "epoch": 3.175588937625959, + "grad_norm": 0.15919503569602966, + "learning_rate": 7.037039913461152e-05, + "loss": 1.4259, + "step": 75240 + }, + { + "epoch": 3.1761308787914713, + "grad_norm": 0.18104073405265808, + "learning_rate": 7.035551252910535e-05, + "loss": 1.4228, + "step": 75250 + }, + { + "epoch": 3.176401849374227, + "eval_loss": 2.4324724674224854, + "eval_runtime": 22.1726, + "eval_samples_per_second": 225.504, + "eval_steps_per_second": 1.218, + "step": 75255 + }, + { + "epoch": 3.1766728199569836, + "grad_norm": 0.24971145391464233, + "learning_rate": 7.034062402142497e-05, + "loss": 1.4224, + "step": 75260 + }, + { + "epoch": 3.1772147611224955, + "grad_norm": 0.22338011860847473, + "learning_rate": 7.032573361341469e-05, + "loss": 1.4086, + "step": 75270 + }, + { + "epoch": 3.1777567022880078, + "grad_norm": 0.2989422082901001, + "learning_rate": 7.031084130691905e-05, + "loss": 1.4188, + "step": 75280 + }, + { + "epoch": 3.17829864345352, + "grad_norm": 0.38781970739364624, + "learning_rate": 7.029594710378288e-05, + "loss": 1.4263, + "step": 75290 + }, + { + "epoch": 3.1788405846190324, + "grad_norm": 0.27515822649002075, + "learning_rate": 7.028105100585118e-05, + "loss": 1.4258, + "step": 75300 + }, + { + "epoch": 3.1793825257845447, + "grad_norm": 0.24076929688453674, + "learning_rate": 7.026615301496923e-05, + "loss": 1.4264, + "step": 75310 + }, + { + "epoch": 3.1799244669500566, + "grad_norm": 0.1977168768644333, + "learning_rate": 7.025125313298253e-05, + "loss": 1.4267, + "step": 75320 + }, + { + "epoch": 3.180466408115569, + "grad_norm": 0.20575913786888123, + "learning_rate": 7.02363513617368e-05, + "loss": 1.4178, + "step": 75330 + }, + { + "epoch": 3.1810083492810812, + "grad_norm": 0.25656020641326904, + "learning_rate": 7.022144770307801e-05, + "loss": 1.4224, + "step": 75340 + }, + { + "epoch": 3.181116737514184, + "eval_loss": 2.4162216186523438, + "eval_runtime": 21.9936, + "eval_samples_per_second": 227.339, + "eval_steps_per_second": 1.228, + "step": 75342 + }, + { + "epoch": 3.1815502904465935, + "grad_norm": 0.20052428543567657, + "learning_rate": 7.020654215885236e-05, + "loss": 1.4175, + "step": 75350 + }, + { + "epoch": 3.1820922316121054, + "grad_norm": 0.2730656564235687, + "learning_rate": 7.01916347309063e-05, + "loss": 1.427, + "step": 75360 + }, + { + "epoch": 3.1826341727776177, + "grad_norm": 0.19795359671115875, + "learning_rate": 7.017672542108648e-05, + "loss": 1.4233, + "step": 75370 + }, + { + "epoch": 3.18317611394313, + "grad_norm": 0.25802189111709595, + "learning_rate": 7.016181423123981e-05, + "loss": 1.4181, + "step": 75380 + }, + { + "epoch": 3.1837180551086424, + "grad_norm": 0.2566634714603424, + "learning_rate": 7.01469011632134e-05, + "loss": 1.4277, + "step": 75390 + }, + { + "epoch": 3.1842599962741547, + "grad_norm": 0.18765607476234436, + "learning_rate": 7.013198621885465e-05, + "loss": 1.4108, + "step": 75400 + }, + { + "epoch": 3.1848019374396666, + "grad_norm": 0.2734335660934448, + "learning_rate": 7.011706940001113e-05, + "loss": 1.4238, + "step": 75410 + }, + { + "epoch": 3.185343878605179, + "grad_norm": 0.2739078402519226, + "learning_rate": 7.010215070853071e-05, + "loss": 1.4204, + "step": 75420 + }, + { + "epoch": 3.18583162565414, + "eval_loss": 2.4123969078063965, + "eval_runtime": 21.9921, + "eval_samples_per_second": 227.354, + "eval_steps_per_second": 1.228, + "step": 75429 + }, + { + "epoch": 3.185885819770691, + "grad_norm": 0.18177048861980438, + "learning_rate": 7.008723014626142e-05, + "loss": 1.4138, + "step": 75430 + }, + { + "epoch": 3.1864277609362035, + "grad_norm": 0.2982783615589142, + "learning_rate": 7.007230771505155e-05, + "loss": 1.4217, + "step": 75440 + }, + { + "epoch": 3.186969702101716, + "grad_norm": 0.24831917881965637, + "learning_rate": 7.005738341674964e-05, + "loss": 1.4239, + "step": 75450 + }, + { + "epoch": 3.1875116432672277, + "grad_norm": 0.20293238759040833, + "learning_rate": 7.004245725320445e-05, + "loss": 1.4171, + "step": 75460 + }, + { + "epoch": 3.18805358443274, + "grad_norm": 0.27422091364860535, + "learning_rate": 7.002752922626496e-05, + "loss": 1.4212, + "step": 75470 + }, + { + "epoch": 3.1885955255982523, + "grad_norm": 0.19502641260623932, + "learning_rate": 7.001259933778041e-05, + "loss": 1.4212, + "step": 75480 + }, + { + "epoch": 3.1891374667637646, + "grad_norm": 0.1884526014328003, + "learning_rate": 6.999766758960019e-05, + "loss": 1.4176, + "step": 75490 + }, + { + "epoch": 3.1896794079292765, + "grad_norm": 0.1656298190355301, + "learning_rate": 6.998273398357406e-05, + "loss": 1.429, + "step": 75500 + }, + { + "epoch": 3.190221349094789, + "grad_norm": 0.17676104605197906, + "learning_rate": 6.996779852155189e-05, + "loss": 1.423, + "step": 75510 + }, + { + "epoch": 3.190546513794096, + "eval_loss": 2.412719488143921, + "eval_runtime": 21.9959, + "eval_samples_per_second": 227.315, + "eval_steps_per_second": 1.228, + "step": 75516 + }, + { + "epoch": 3.190763290260301, + "grad_norm": 0.3444202244281769, + "learning_rate": 6.99528612053838e-05, + "loss": 1.4231, + "step": 75520 + }, + { + "epoch": 3.1913052314258135, + "grad_norm": 0.20383748412132263, + "learning_rate": 6.99379220369202e-05, + "loss": 1.4305, + "step": 75530 + }, + { + "epoch": 3.1918471725913253, + "grad_norm": 0.2849511206150055, + "learning_rate": 6.992298101801167e-05, + "loss": 1.4264, + "step": 75540 + }, + { + "epoch": 3.1923891137568376, + "grad_norm": 0.20015236735343933, + "learning_rate": 6.990803815050903e-05, + "loss": 1.4288, + "step": 75550 + }, + { + "epoch": 3.19293105492235, + "grad_norm": 0.19224634766578674, + "learning_rate": 6.989309343626337e-05, + "loss": 1.4244, + "step": 75560 + }, + { + "epoch": 3.1934729960878623, + "grad_norm": 0.16140466928482056, + "learning_rate": 6.987814687712592e-05, + "loss": 1.4274, + "step": 75570 + }, + { + "epoch": 3.1940149372533746, + "grad_norm": 0.1749192178249359, + "learning_rate": 6.986319847494826e-05, + "loss": 1.4347, + "step": 75580 + }, + { + "epoch": 3.1945568784188865, + "grad_norm": 0.16757184267044067, + "learning_rate": 6.98482482315821e-05, + "loss": 1.419, + "step": 75590 + }, + { + "epoch": 3.195098819584399, + "grad_norm": 0.15887753665447235, + "learning_rate": 6.98332961488794e-05, + "loss": 1.4307, + "step": 75600 + }, + { + "epoch": 3.1952614019340526, + "eval_loss": 2.424544095993042, + "eval_runtime": 21.9927, + "eval_samples_per_second": 227.348, + "eval_steps_per_second": 1.228, + "step": 75603 + }, + { + "epoch": 3.195640760749911, + "grad_norm": 0.3752150237560272, + "learning_rate": 6.98183422286924e-05, + "loss": 1.421, + "step": 75610 + }, + { + "epoch": 3.1961827019154234, + "grad_norm": 0.18618738651275635, + "learning_rate": 6.980338647287347e-05, + "loss": 1.4293, + "step": 75620 + }, + { + "epoch": 3.1967246430809357, + "grad_norm": 0.19021612405776978, + "learning_rate": 6.97884288832753e-05, + "loss": 1.4218, + "step": 75630 + }, + { + "epoch": 3.1972665842464476, + "grad_norm": 0.1741076409816742, + "learning_rate": 6.977346946175078e-05, + "loss": 1.4211, + "step": 75640 + }, + { + "epoch": 3.19780852541196, + "grad_norm": 0.1816461980342865, + "learning_rate": 6.975850821015298e-05, + "loss": 1.4124, + "step": 75650 + }, + { + "epoch": 3.1983504665774722, + "grad_norm": 0.22760295867919922, + "learning_rate": 6.974354513033528e-05, + "loss": 1.4251, + "step": 75660 + }, + { + "epoch": 3.1988924077429846, + "grad_norm": 0.255193293094635, + "learning_rate": 6.972858022415119e-05, + "loss": 1.4269, + "step": 75670 + }, + { + "epoch": 3.1994343489084964, + "grad_norm": 0.1877882033586502, + "learning_rate": 6.971361349345455e-05, + "loss": 1.4093, + "step": 75680 + }, + { + "epoch": 3.1999762900740087, + "grad_norm": 0.17674243450164795, + "learning_rate": 6.969864494009934e-05, + "loss": 1.4263, + "step": 75690 + }, + { + "epoch": 3.1999762900740087, + "eval_loss": 2.420743942260742, + "eval_runtime": 21.9872, + "eval_samples_per_second": 227.405, + "eval_steps_per_second": 1.228, + "step": 75690 + }, + { + "epoch": 3.200518231239521, + "grad_norm": 0.1850854456424713, + "learning_rate": 6.968367456593983e-05, + "loss": 1.4203, + "step": 75700 + }, + { + "epoch": 3.2010601724050334, + "grad_norm": 0.25063949823379517, + "learning_rate": 6.966870237283045e-05, + "loss": 1.4245, + "step": 75710 + }, + { + "epoch": 3.2016021135705457, + "grad_norm": 0.23362627625465393, + "learning_rate": 6.96537283626259e-05, + "loss": 1.4203, + "step": 75720 + }, + { + "epoch": 3.2021440547360576, + "grad_norm": 0.17846883833408356, + "learning_rate": 6.96387525371811e-05, + "loss": 1.4251, + "step": 75730 + }, + { + "epoch": 3.20268599590157, + "grad_norm": 0.2213391810655594, + "learning_rate": 6.96237748983512e-05, + "loss": 1.4155, + "step": 75740 + }, + { + "epoch": 3.203227937067082, + "grad_norm": 0.1812654733657837, + "learning_rate": 6.960879544799156e-05, + "loss": 1.4178, + "step": 75750 + }, + { + "epoch": 3.2037698782325945, + "grad_norm": 0.18986418843269348, + "learning_rate": 6.959381418795775e-05, + "loss": 1.4217, + "step": 75760 + }, + { + "epoch": 3.2043118193981064, + "grad_norm": 0.15823641419410706, + "learning_rate": 6.957883112010563e-05, + "loss": 1.4217, + "step": 75770 + }, + { + "epoch": 3.2046911782139653, + "eval_loss": 2.4173123836517334, + "eval_runtime": 21.9872, + "eval_samples_per_second": 227.405, + "eval_steps_per_second": 1.228, + "step": 75777 + }, + { + "epoch": 3.2048537605636187, + "grad_norm": 0.25696510076522827, + "learning_rate": 6.956384624629117e-05, + "loss": 1.4149, + "step": 75780 + }, + { + "epoch": 3.205395701729131, + "grad_norm": 0.18557502329349518, + "learning_rate": 6.954885956837067e-05, + "loss": 1.4202, + "step": 75790 + }, + { + "epoch": 3.2059376428946433, + "grad_norm": 0.2925853133201599, + "learning_rate": 6.953387108820064e-05, + "loss": 1.4157, + "step": 75800 + }, + { + "epoch": 3.2064795840601557, + "grad_norm": 0.17210768163204193, + "learning_rate": 6.951888080763772e-05, + "loss": 1.4136, + "step": 75810 + }, + { + "epoch": 3.2070215252256675, + "grad_norm": 0.17783254384994507, + "learning_rate": 6.950388872853891e-05, + "loss": 1.4264, + "step": 75820 + }, + { + "epoch": 3.20756346639118, + "grad_norm": 0.1816641092300415, + "learning_rate": 6.948889485276132e-05, + "loss": 1.4121, + "step": 75830 + }, + { + "epoch": 3.208105407556692, + "grad_norm": 0.25515398383140564, + "learning_rate": 6.947389918216234e-05, + "loss": 1.4282, + "step": 75840 + }, + { + "epoch": 3.2086473487222045, + "grad_norm": 0.16730128228664398, + "learning_rate": 6.94589017185996e-05, + "loss": 1.4233, + "step": 75850 + }, + { + "epoch": 3.209189289887717, + "grad_norm": 0.2866290509700775, + "learning_rate": 6.944390246393085e-05, + "loss": 1.4215, + "step": 75860 + }, + { + "epoch": 3.2094060663539214, + "eval_loss": 2.418275833129883, + "eval_runtime": 21.9933, + "eval_samples_per_second": 227.342, + "eval_steps_per_second": 1.228, + "step": 75864 + }, + { + "epoch": 3.2097312310532287, + "grad_norm": 0.21577335894107819, + "learning_rate": 6.942890142001418e-05, + "loss": 1.4179, + "step": 75870 + }, + { + "epoch": 3.210273172218741, + "grad_norm": 0.20668751001358032, + "learning_rate": 6.941389858870785e-05, + "loss": 1.4216, + "step": 75880 + }, + { + "epoch": 3.2108151133842533, + "grad_norm": 0.23815388977527618, + "learning_rate": 6.939889397187034e-05, + "loss": 1.4147, + "step": 75890 + }, + { + "epoch": 3.2113570545497656, + "grad_norm": 0.18798255920410156, + "learning_rate": 6.938388757136036e-05, + "loss": 1.4145, + "step": 75900 + }, + { + "epoch": 3.2118989957152775, + "grad_norm": 0.24447955191135406, + "learning_rate": 6.936887938903684e-05, + "loss": 1.43, + "step": 75910 + }, + { + "epoch": 3.21244093688079, + "grad_norm": 0.288022518157959, + "learning_rate": 6.935386942675892e-05, + "loss": 1.4171, + "step": 75920 + }, + { + "epoch": 3.212982878046302, + "grad_norm": 0.22881217300891876, + "learning_rate": 6.933885768638598e-05, + "loss": 1.4228, + "step": 75930 + }, + { + "epoch": 3.2135248192118144, + "grad_norm": 0.20432133972644806, + "learning_rate": 6.932384416977759e-05, + "loss": 1.4242, + "step": 75940 + }, + { + "epoch": 3.2140667603773263, + "grad_norm": 0.21099141240119934, + "learning_rate": 6.930882887879359e-05, + "loss": 1.4182, + "step": 75950 + }, + { + "epoch": 3.2141209544938776, + "eval_loss": 2.428762674331665, + "eval_runtime": 21.9941, + "eval_samples_per_second": 227.334, + "eval_steps_per_second": 1.228, + "step": 75951 + }, + { + "epoch": 3.2146087015428386, + "grad_norm": 0.19293814897537231, + "learning_rate": 6.929381181529399e-05, + "loss": 1.4111, + "step": 75960 + }, + { + "epoch": 3.215150642708351, + "grad_norm": 0.209365114569664, + "learning_rate": 6.927879298113901e-05, + "loss": 1.4083, + "step": 75970 + }, + { + "epoch": 3.2156925838738633, + "grad_norm": 0.20382948219776154, + "learning_rate": 6.926377237818917e-05, + "loss": 1.4229, + "step": 75980 + }, + { + "epoch": 3.2162345250393756, + "grad_norm": 0.18336911499500275, + "learning_rate": 6.924875000830513e-05, + "loss": 1.4244, + "step": 75990 + }, + { + "epoch": 3.2167764662048874, + "grad_norm": 0.18866483867168427, + "learning_rate": 6.92337258733478e-05, + "loss": 1.4241, + "step": 76000 + }, + { + "epoch": 3.2173184073703998, + "grad_norm": 0.21788087487220764, + "learning_rate": 6.921869997517834e-05, + "loss": 1.4319, + "step": 76010 + }, + { + "epoch": 3.217860348535912, + "grad_norm": 0.1925356537103653, + "learning_rate": 6.9203672315658e-05, + "loss": 1.4204, + "step": 76020 + }, + { + "epoch": 3.2184022897014244, + "grad_norm": 0.24953441321849823, + "learning_rate": 6.918864289664845e-05, + "loss": 1.4209, + "step": 76030 + }, + { + "epoch": 3.218835842633834, + "eval_loss": 2.4290771484375, + "eval_runtime": 21.9942, + "eval_samples_per_second": 227.333, + "eval_steps_per_second": 1.228, + "step": 76038 + }, + { + "epoch": 3.2189442308669367, + "grad_norm": 0.18474015593528748, + "learning_rate": 6.917361172001139e-05, + "loss": 1.4223, + "step": 76040 + }, + { + "epoch": 3.2194861720324486, + "grad_norm": 0.27110907435417175, + "learning_rate": 6.915857878760885e-05, + "loss": 1.4256, + "step": 76050 + }, + { + "epoch": 3.220028113197961, + "grad_norm": 0.19559454917907715, + "learning_rate": 6.914354410130305e-05, + "loss": 1.4299, + "step": 76060 + }, + { + "epoch": 3.220570054363473, + "grad_norm": 0.16140873730182648, + "learning_rate": 6.912850766295641e-05, + "loss": 1.4068, + "step": 76070 + }, + { + "epoch": 3.2211119955289855, + "grad_norm": 0.17691193521022797, + "learning_rate": 6.911346947443157e-05, + "loss": 1.4216, + "step": 76080 + }, + { + "epoch": 3.2216539366944974, + "grad_norm": 0.173243448138237, + "learning_rate": 6.909842953759142e-05, + "loss": 1.4172, + "step": 76090 + }, + { + "epoch": 3.2221958778600097, + "grad_norm": 0.3001802861690521, + "learning_rate": 6.908338785429901e-05, + "loss": 1.4205, + "step": 76100 + }, + { + "epoch": 3.222737819025522, + "grad_norm": 0.21718403697013855, + "learning_rate": 6.906834442641768e-05, + "loss": 1.4201, + "step": 76110 + }, + { + "epoch": 3.2232797601910343, + "grad_norm": 0.19174130260944366, + "learning_rate": 6.905329925581089e-05, + "loss": 1.4161, + "step": 76120 + }, + { + "epoch": 3.2235507307737903, + "eval_loss": 2.4190409183502197, + "eval_runtime": 21.9914, + "eval_samples_per_second": 227.361, + "eval_steps_per_second": 1.228, + "step": 76125 + }, + { + "epoch": 3.2238217013565467, + "grad_norm": 0.28363704681396484, + "learning_rate": 6.903825234434243e-05, + "loss": 1.4222, + "step": 76130 + }, + { + "epoch": 3.2243636425220585, + "grad_norm": 0.21950308978557587, + "learning_rate": 6.902320369387621e-05, + "loss": 1.4202, + "step": 76140 + }, + { + "epoch": 3.224905583687571, + "grad_norm": 0.22058185935020447, + "learning_rate": 6.90081533062764e-05, + "loss": 1.4147, + "step": 76150 + }, + { + "epoch": 3.225447524853083, + "grad_norm": 0.354383260011673, + "learning_rate": 6.899310118340736e-05, + "loss": 1.4167, + "step": 76160 + }, + { + "epoch": 3.2259894660185955, + "grad_norm": 0.3099483251571655, + "learning_rate": 6.897804732713371e-05, + "loss": 1.4339, + "step": 76170 + }, + { + "epoch": 3.2265314071841074, + "grad_norm": 0.33063191175460815, + "learning_rate": 6.896299173932023e-05, + "loss": 1.4213, + "step": 76180 + }, + { + "epoch": 3.2270733483496197, + "grad_norm": 0.17252486944198608, + "learning_rate": 6.894793442183199e-05, + "loss": 1.4295, + "step": 76190 + }, + { + "epoch": 3.227615289515132, + "grad_norm": 0.20971451699733734, + "learning_rate": 6.893287537653417e-05, + "loss": 1.4133, + "step": 76200 + }, + { + "epoch": 3.2281572306806443, + "grad_norm": 0.16854190826416016, + "learning_rate": 6.891781460529223e-05, + "loss": 1.4208, + "step": 76210 + }, + { + "epoch": 3.228265618913747, + "eval_loss": 2.4187822341918945, + "eval_runtime": 21.9892, + "eval_samples_per_second": 227.385, + "eval_steps_per_second": 1.228, + "step": 76212 + }, + { + "epoch": 3.2286991718461566, + "grad_norm": 0.1864662617444992, + "learning_rate": 6.890275210997185e-05, + "loss": 1.4251, + "step": 76220 + }, + { + "epoch": 3.2292411130116685, + "grad_norm": 0.15556700527668, + "learning_rate": 6.888768789243889e-05, + "loss": 1.4134, + "step": 76230 + }, + { + "epoch": 3.229783054177181, + "grad_norm": 0.17359033226966858, + "learning_rate": 6.887262195455946e-05, + "loss": 1.4146, + "step": 76240 + }, + { + "epoch": 3.230324995342693, + "grad_norm": 0.29231011867523193, + "learning_rate": 6.885755429819985e-05, + "loss": 1.4274, + "step": 76250 + }, + { + "epoch": 3.2308669365082054, + "grad_norm": 0.16642163693904877, + "learning_rate": 6.884248492522656e-05, + "loss": 1.4075, + "step": 76260 + }, + { + "epoch": 3.2314088776737178, + "grad_norm": 0.4484625458717346, + "learning_rate": 6.882741383750635e-05, + "loss": 1.4122, + "step": 76270 + }, + { + "epoch": 3.2319508188392296, + "grad_norm": 0.14720787107944489, + "learning_rate": 6.881234103690616e-05, + "loss": 1.4155, + "step": 76280 + }, + { + "epoch": 3.232492760004742, + "grad_norm": 0.19290123879909515, + "learning_rate": 6.87972665252931e-05, + "loss": 1.4226, + "step": 76290 + }, + { + "epoch": 3.232980507053703, + "eval_loss": 2.4077162742614746, + "eval_runtime": 21.993, + "eval_samples_per_second": 227.345, + "eval_steps_per_second": 1.228, + "step": 76299 + }, + { + "epoch": 3.2330347011702543, + "grad_norm": 0.18437008559703827, + "learning_rate": 6.878219030453459e-05, + "loss": 1.4208, + "step": 76300 + }, + { + "epoch": 3.2335766423357666, + "grad_norm": 0.1884869635105133, + "learning_rate": 6.876711237649816e-05, + "loss": 1.416, + "step": 76310 + }, + { + "epoch": 3.2341185835012785, + "grad_norm": 0.19494092464447021, + "learning_rate": 6.875203274305163e-05, + "loss": 1.4133, + "step": 76320 + }, + { + "epoch": 3.2346605246667908, + "grad_norm": 0.16780586540699005, + "learning_rate": 6.8736951406063e-05, + "loss": 1.4222, + "step": 76330 + }, + { + "epoch": 3.235202465832303, + "grad_norm": 0.24331122636795044, + "learning_rate": 6.872186836740046e-05, + "loss": 1.4253, + "step": 76340 + }, + { + "epoch": 3.2357444069978154, + "grad_norm": 0.2462978959083557, + "learning_rate": 6.870678362893243e-05, + "loss": 1.4215, + "step": 76350 + }, + { + "epoch": 3.2362863481633277, + "grad_norm": 0.3127591907978058, + "learning_rate": 6.869169719252756e-05, + "loss": 1.4199, + "step": 76360 + }, + { + "epoch": 3.2368282893288396, + "grad_norm": 0.19799236953258514, + "learning_rate": 6.867660906005467e-05, + "loss": 1.4101, + "step": 76370 + }, + { + "epoch": 3.237370230494352, + "grad_norm": 0.4551452696323395, + "learning_rate": 6.866151923338286e-05, + "loss": 1.4229, + "step": 76380 + }, + { + "epoch": 3.237695395193659, + "eval_loss": 2.4275617599487305, + "eval_runtime": 21.99, + "eval_samples_per_second": 227.376, + "eval_steps_per_second": 1.228, + "step": 76386 + }, + { + "epoch": 3.2379121716598642, + "grad_norm": 0.19475868344306946, + "learning_rate": 6.864642771438136e-05, + "loss": 1.4142, + "step": 76390 + }, + { + "epoch": 3.2384541128253765, + "grad_norm": 0.5060222148895264, + "learning_rate": 6.863133450491961e-05, + "loss": 1.4153, + "step": 76400 + }, + { + "epoch": 3.2389960539908884, + "grad_norm": 0.2045438289642334, + "learning_rate": 6.861623960686734e-05, + "loss": 1.413, + "step": 76410 + }, + { + "epoch": 3.2395379951564007, + "grad_norm": 0.24009686708450317, + "learning_rate": 6.860114302209443e-05, + "loss": 1.4246, + "step": 76420 + }, + { + "epoch": 3.240079936321913, + "grad_norm": 0.1824415624141693, + "learning_rate": 6.858604475247097e-05, + "loss": 1.4248, + "step": 76430 + }, + { + "epoch": 3.2406218774874254, + "grad_norm": 0.19345873594284058, + "learning_rate": 6.857094479986726e-05, + "loss": 1.4301, + "step": 76440 + }, + { + "epoch": 3.2411638186529377, + "grad_norm": 0.17767898738384247, + "learning_rate": 6.855584316615384e-05, + "loss": 1.4193, + "step": 76450 + }, + { + "epoch": 3.2417057598184496, + "grad_norm": 0.24474495649337769, + "learning_rate": 6.854073985320141e-05, + "loss": 1.4279, + "step": 76460 + }, + { + "epoch": 3.242247700983962, + "grad_norm": 0.2586842179298401, + "learning_rate": 6.852563486288093e-05, + "loss": 1.4149, + "step": 76470 + }, + { + "epoch": 3.2424102833336157, + "eval_loss": 2.4281370639801025, + "eval_runtime": 21.9931, + "eval_samples_per_second": 227.344, + "eval_steps_per_second": 1.228, + "step": 76473 + }, + { + "epoch": 3.242789642149474, + "grad_norm": 0.20308630168437958, + "learning_rate": 6.85105281970635e-05, + "loss": 1.4204, + "step": 76480 + }, + { + "epoch": 3.2433315833149865, + "grad_norm": 0.17210350930690765, + "learning_rate": 6.849541985762053e-05, + "loss": 1.4308, + "step": 76490 + }, + { + "epoch": 3.243873524480499, + "grad_norm": 0.351959228515625, + "learning_rate": 6.848030984642351e-05, + "loss": 1.4203, + "step": 76500 + }, + { + "epoch": 3.2444154656460107, + "grad_norm": 0.22208164632320404, + "learning_rate": 6.846519816534423e-05, + "loss": 1.4174, + "step": 76510 + }, + { + "epoch": 3.244957406811523, + "grad_norm": 0.3489255905151367, + "learning_rate": 6.845008481625468e-05, + "loss": 1.4243, + "step": 76520 + }, + { + "epoch": 3.2454993479770353, + "grad_norm": 0.20681335031986237, + "learning_rate": 6.843496980102702e-05, + "loss": 1.4338, + "step": 76530 + }, + { + "epoch": 3.2460412891425476, + "grad_norm": 0.20921902358531952, + "learning_rate": 6.841985312153362e-05, + "loss": 1.4329, + "step": 76540 + }, + { + "epoch": 3.2465832303080595, + "grad_norm": 0.2758723199367523, + "learning_rate": 6.840473477964707e-05, + "loss": 1.4197, + "step": 76550 + }, + { + "epoch": 3.247125171473572, + "grad_norm": 0.19785663485527039, + "learning_rate": 6.838961477724018e-05, + "loss": 1.4155, + "step": 76560 + }, + { + "epoch": 3.247125171473572, + "eval_loss": 2.4455416202545166, + "eval_runtime": 21.9858, + "eval_samples_per_second": 227.419, + "eval_steps_per_second": 1.228, + "step": 76560 + }, + { + "epoch": 3.247667112639084, + "grad_norm": 0.16675598919391632, + "learning_rate": 6.837449311618595e-05, + "loss": 1.4148, + "step": 76570 + }, + { + "epoch": 3.2482090538045965, + "grad_norm": 0.2686634957790375, + "learning_rate": 6.835936979835757e-05, + "loss": 1.4247, + "step": 76580 + }, + { + "epoch": 3.2487509949701083, + "grad_norm": 0.16704584658145905, + "learning_rate": 6.834424482562846e-05, + "loss": 1.412, + "step": 76590 + }, + { + "epoch": 3.2492929361356206, + "grad_norm": 0.2601518929004669, + "learning_rate": 6.832911819987224e-05, + "loss": 1.4192, + "step": 76600 + }, + { + "epoch": 3.249834877301133, + "grad_norm": 0.21344710886478424, + "learning_rate": 6.831398992296273e-05, + "loss": 1.4149, + "step": 76610 + }, + { + "epoch": 3.2503768184666453, + "grad_norm": 0.267600953578949, + "learning_rate": 6.829885999677395e-05, + "loss": 1.4142, + "step": 76620 + }, + { + "epoch": 3.2509187596321576, + "grad_norm": 0.1773858517408371, + "learning_rate": 6.828372842318013e-05, + "loss": 1.4171, + "step": 76630 + }, + { + "epoch": 3.2514607007976695, + "grad_norm": 0.1974133402109146, + "learning_rate": 6.826859520405572e-05, + "loss": 1.4088, + "step": 76640 + }, + { + "epoch": 3.2518400596135284, + "eval_loss": 2.4362905025482178, + "eval_runtime": 21.9908, + "eval_samples_per_second": 227.368, + "eval_steps_per_second": 1.228, + "step": 76647 + }, + { + "epoch": 3.252002641963182, + "grad_norm": 0.3224498927593231, + "learning_rate": 6.825346034127531e-05, + "loss": 1.417, + "step": 76650 + }, + { + "epoch": 3.252544583128694, + "grad_norm": 0.32483381032943726, + "learning_rate": 6.823832383671379e-05, + "loss": 1.4267, + "step": 76660 + }, + { + "epoch": 3.2530865242942064, + "grad_norm": 0.1879141479730606, + "learning_rate": 6.82231856922462e-05, + "loss": 1.4178, + "step": 76670 + }, + { + "epoch": 3.2536284654597187, + "grad_norm": 0.21414776146411896, + "learning_rate": 6.820804590974776e-05, + "loss": 1.4175, + "step": 76680 + }, + { + "epoch": 3.2541704066252306, + "grad_norm": 0.19383347034454346, + "learning_rate": 6.819290449109395e-05, + "loss": 1.4178, + "step": 76690 + }, + { + "epoch": 3.254712347790743, + "grad_norm": 0.17373976111412048, + "learning_rate": 6.81777614381604e-05, + "loss": 1.4182, + "step": 76700 + }, + { + "epoch": 3.2552542889562552, + "grad_norm": 0.18311308324337006, + "learning_rate": 6.816261675282297e-05, + "loss": 1.4162, + "step": 76710 + }, + { + "epoch": 3.2557962301217676, + "grad_norm": 0.1700367033481598, + "learning_rate": 6.814747043695772e-05, + "loss": 1.4163, + "step": 76720 + }, + { + "epoch": 3.25633817128728, + "grad_norm": 0.4533218443393707, + "learning_rate": 6.813232249244093e-05, + "loss": 1.4266, + "step": 76730 + }, + { + "epoch": 3.2565549477534845, + "eval_loss": 2.439307689666748, + "eval_runtime": 21.9896, + "eval_samples_per_second": 227.38, + "eval_steps_per_second": 1.228, + "step": 76734 + }, + { + "epoch": 3.2568801124527917, + "grad_norm": 0.2232131063938141, + "learning_rate": 6.811717292114904e-05, + "loss": 1.4208, + "step": 76740 + }, + { + "epoch": 3.257422053618304, + "grad_norm": 0.24600377678871155, + "learning_rate": 6.810202172495873e-05, + "loss": 1.4128, + "step": 76750 + }, + { + "epoch": 3.2579639947838164, + "grad_norm": 0.1848914921283722, + "learning_rate": 6.808686890574683e-05, + "loss": 1.4298, + "step": 76760 + }, + { + "epoch": 3.2585059359493287, + "grad_norm": 0.2184385061264038, + "learning_rate": 6.807171446539042e-05, + "loss": 1.407, + "step": 76770 + }, + { + "epoch": 3.2590478771148406, + "grad_norm": 0.18162274360656738, + "learning_rate": 6.80565584057668e-05, + "loss": 1.4158, + "step": 76780 + }, + { + "epoch": 3.259589818280353, + "grad_norm": 0.18968087434768677, + "learning_rate": 6.804140072875338e-05, + "loss": 1.416, + "step": 76790 + }, + { + "epoch": 3.260131759445865, + "grad_norm": 0.19787006080150604, + "learning_rate": 6.802624143622789e-05, + "loss": 1.4232, + "step": 76800 + }, + { + "epoch": 3.2606737006113775, + "grad_norm": 0.19424383342266083, + "learning_rate": 6.801108053006815e-05, + "loss": 1.4144, + "step": 76810 + }, + { + "epoch": 3.2612156417768894, + "grad_norm": 0.23365655541419983, + "learning_rate": 6.799591801215223e-05, + "loss": 1.4212, + "step": 76820 + }, + { + "epoch": 3.2612698358934407, + "eval_loss": 2.4371750354766846, + "eval_runtime": 21.9951, + "eval_samples_per_second": 227.324, + "eval_steps_per_second": 1.228, + "step": 76821 + }, + { + "epoch": 3.2617575829424017, + "grad_norm": 0.20863786339759827, + "learning_rate": 6.798075388435845e-05, + "loss": 1.4189, + "step": 76830 + }, + { + "epoch": 3.262299524107914, + "grad_norm": 0.18672865629196167, + "learning_rate": 6.79655881485652e-05, + "loss": 1.4129, + "step": 76840 + }, + { + "epoch": 3.2628414652734263, + "grad_norm": 0.3191605508327484, + "learning_rate": 6.795042080665117e-05, + "loss": 1.4204, + "step": 76850 + }, + { + "epoch": 3.2633834064389386, + "grad_norm": 0.1636675000190735, + "learning_rate": 6.793525186049527e-05, + "loss": 1.422, + "step": 76860 + }, + { + "epoch": 3.2639253476044505, + "grad_norm": 0.2027965933084488, + "learning_rate": 6.792008131197651e-05, + "loss": 1.4215, + "step": 76870 + }, + { + "epoch": 3.264467288769963, + "grad_norm": 0.1797472983598709, + "learning_rate": 6.790490916297419e-05, + "loss": 1.4197, + "step": 76880 + }, + { + "epoch": 3.265009229935475, + "grad_norm": 0.2633003294467926, + "learning_rate": 6.788973541536772e-05, + "loss": 1.418, + "step": 76890 + }, + { + "epoch": 3.2655511711009875, + "grad_norm": 0.30197060108184814, + "learning_rate": 6.78745600710368e-05, + "loss": 1.4259, + "step": 76900 + }, + { + "epoch": 3.2659847240333972, + "eval_loss": 2.4383397102355957, + "eval_runtime": 21.995, + "eval_samples_per_second": 227.325, + "eval_steps_per_second": 1.228, + "step": 76908 + }, + { + "epoch": 3.2660931122665, + "grad_norm": 0.21342779695987701, + "learning_rate": 6.785938313186128e-05, + "loss": 1.4201, + "step": 76910 + }, + { + "epoch": 3.2666350534320117, + "grad_norm": 0.18133488297462463, + "learning_rate": 6.784420459972122e-05, + "loss": 1.4201, + "step": 76920 + }, + { + "epoch": 3.267176994597524, + "grad_norm": 0.2161795049905777, + "learning_rate": 6.782902447649684e-05, + "loss": 1.4147, + "step": 76930 + }, + { + "epoch": 3.2677189357630363, + "grad_norm": 0.22516366839408875, + "learning_rate": 6.78138427640686e-05, + "loss": 1.4197, + "step": 76940 + }, + { + "epoch": 3.2682608769285486, + "grad_norm": 0.20277120172977448, + "learning_rate": 6.779865946431716e-05, + "loss": 1.4132, + "step": 76950 + }, + { + "epoch": 3.2688028180940605, + "grad_norm": 0.2785748243331909, + "learning_rate": 6.778347457912335e-05, + "loss": 1.4275, + "step": 76960 + }, + { + "epoch": 3.269344759259573, + "grad_norm": 0.2544669210910797, + "learning_rate": 6.776828811036821e-05, + "loss": 1.4225, + "step": 76970 + }, + { + "epoch": 3.269886700425085, + "grad_norm": 0.16335433721542358, + "learning_rate": 6.775310005993297e-05, + "loss": 1.4102, + "step": 76980 + }, + { + "epoch": 3.2704286415905974, + "grad_norm": 0.249395951628685, + "learning_rate": 6.773791042969907e-05, + "loss": 1.4151, + "step": 76990 + }, + { + "epoch": 3.2706996121733534, + "eval_loss": 2.4316024780273438, + "eval_runtime": 21.9962, + "eval_samples_per_second": 227.312, + "eval_steps_per_second": 1.227, + "step": 76995 + }, + { + "epoch": 3.2709705827561093, + "grad_norm": 0.17765724658966064, + "learning_rate": 6.772271922154814e-05, + "loss": 1.4182, + "step": 77000 + }, + { + "epoch": 3.2715125239216216, + "grad_norm": 0.2077697366476059, + "learning_rate": 6.770752643736196e-05, + "loss": 1.4264, + "step": 77010 + }, + { + "epoch": 3.272054465087134, + "grad_norm": 0.18246911466121674, + "learning_rate": 6.769233207902261e-05, + "loss": 1.4242, + "step": 77020 + }, + { + "epoch": 3.2725964062526463, + "grad_norm": 0.3925098478794098, + "learning_rate": 6.767713614841223e-05, + "loss": 1.4246, + "step": 77030 + }, + { + "epoch": 3.2731383474181586, + "grad_norm": 0.21297501027584076, + "learning_rate": 6.766193864741327e-05, + "loss": 1.4159, + "step": 77040 + }, + { + "epoch": 3.2736802885836704, + "grad_norm": 0.19493433833122253, + "learning_rate": 6.764673957790834e-05, + "loss": 1.4209, + "step": 77050 + }, + { + "epoch": 3.2742222297491828, + "grad_norm": 0.26771730184555054, + "learning_rate": 6.763153894178022e-05, + "loss": 1.4178, + "step": 77060 + }, + { + "epoch": 3.274764170914695, + "grad_norm": 0.2222745716571808, + "learning_rate": 6.761633674091187e-05, + "loss": 1.4288, + "step": 77070 + }, + { + "epoch": 3.2753061120802074, + "grad_norm": 0.27417925000190735, + "learning_rate": 6.760113297718653e-05, + "loss": 1.4272, + "step": 77080 + }, + { + "epoch": 3.2754145003133095, + "eval_loss": 2.4333410263061523, + "eval_runtime": 21.988, + "eval_samples_per_second": 227.397, + "eval_steps_per_second": 1.228, + "step": 77082 + }, + { + "epoch": 3.2758480532457197, + "grad_norm": 0.4127950966358185, + "learning_rate": 6.758592765248752e-05, + "loss": 1.4098, + "step": 77090 + }, + { + "epoch": 3.2763899944112316, + "grad_norm": 0.2694561779499054, + "learning_rate": 6.757072076869845e-05, + "loss": 1.4175, + "step": 77100 + }, + { + "epoch": 3.276931935576744, + "grad_norm": 0.4011683166027069, + "learning_rate": 6.755551232770306e-05, + "loss": 1.4237, + "step": 77110 + }, + { + "epoch": 3.277473876742256, + "grad_norm": 0.20086224377155304, + "learning_rate": 6.754030233138533e-05, + "loss": 1.4171, + "step": 77120 + }, + { + "epoch": 3.2780158179077685, + "grad_norm": 0.2316787838935852, + "learning_rate": 6.752509078162938e-05, + "loss": 1.4164, + "step": 77130 + }, + { + "epoch": 3.278557759073281, + "grad_norm": 0.259022980928421, + "learning_rate": 6.750987768031954e-05, + "loss": 1.4197, + "step": 77140 + }, + { + "epoch": 3.2790997002387927, + "grad_norm": 0.42053425312042236, + "learning_rate": 6.749466302934042e-05, + "loss": 1.4367, + "step": 77150 + }, + { + "epoch": 3.279641641404305, + "grad_norm": 0.1725689321756363, + "learning_rate": 6.747944683057666e-05, + "loss": 1.4162, + "step": 77160 + }, + { + "epoch": 3.280129388453266, + "eval_loss": 2.433736801147461, + "eval_runtime": 21.9879, + "eval_samples_per_second": 227.397, + "eval_steps_per_second": 1.228, + "step": 77169 + }, + { + "epoch": 3.2801835825698173, + "grad_norm": 0.183704674243927, + "learning_rate": 6.746422908591318e-05, + "loss": 1.413, + "step": 77170 + }, + { + "epoch": 3.2807255237353297, + "grad_norm": 0.2713957130908966, + "learning_rate": 6.744900979723515e-05, + "loss": 1.4109, + "step": 77180 + }, + { + "epoch": 3.2812674649008415, + "grad_norm": 0.18868058919906616, + "learning_rate": 6.743378896642781e-05, + "loss": 1.4193, + "step": 77190 + }, + { + "epoch": 3.281809406066354, + "grad_norm": 0.22179383039474487, + "learning_rate": 6.741856659537669e-05, + "loss": 1.4177, + "step": 77200 + }, + { + "epoch": 3.282351347231866, + "grad_norm": 0.18624810874462128, + "learning_rate": 6.740334268596746e-05, + "loss": 1.4239, + "step": 77210 + }, + { + "epoch": 3.2828932883973785, + "grad_norm": 0.2454560250043869, + "learning_rate": 6.738811724008598e-05, + "loss": 1.425, + "step": 77220 + }, + { + "epoch": 3.2834352295628904, + "grad_norm": 0.26571282744407654, + "learning_rate": 6.737289025961835e-05, + "loss": 1.4224, + "step": 77230 + }, + { + "epoch": 3.2839771707284027, + "grad_norm": 0.19458989799022675, + "learning_rate": 6.735766174645075e-05, + "loss": 1.4143, + "step": 77240 + }, + { + "epoch": 3.284519111893915, + "grad_norm": 0.2220762073993683, + "learning_rate": 6.734243170246968e-05, + "loss": 1.4226, + "step": 77250 + }, + { + "epoch": 3.284844276593222, + "eval_loss": 2.4338366985321045, + "eval_runtime": 22.2711, + "eval_samples_per_second": 224.507, + "eval_steps_per_second": 1.212, + "step": 77256 + }, + { + "epoch": 3.2850610530594273, + "grad_norm": 0.1980799287557602, + "learning_rate": 6.732720012956175e-05, + "loss": 1.4256, + "step": 77260 + }, + { + "epoch": 3.2856029942249396, + "grad_norm": 0.2740095555782318, + "learning_rate": 6.731196702961381e-05, + "loss": 1.4159, + "step": 77270 + }, + { + "epoch": 3.2861449353904515, + "grad_norm": 0.16144628822803497, + "learning_rate": 6.729673240451283e-05, + "loss": 1.4139, + "step": 77280 + }, + { + "epoch": 3.286686876555964, + "grad_norm": 0.20206600427627563, + "learning_rate": 6.728149625614602e-05, + "loss": 1.4197, + "step": 77290 + }, + { + "epoch": 3.287228817721476, + "grad_norm": 0.2360743135213852, + "learning_rate": 6.726625858640078e-05, + "loss": 1.422, + "step": 77300 + }, + { + "epoch": 3.2877707588869884, + "grad_norm": 0.2006218433380127, + "learning_rate": 6.72510193971647e-05, + "loss": 1.4153, + "step": 77310 + }, + { + "epoch": 3.2883127000525008, + "grad_norm": 0.2214326560497284, + "learning_rate": 6.72357786903255e-05, + "loss": 1.4235, + "step": 77320 + }, + { + "epoch": 3.2888546412180126, + "grad_norm": 0.183137446641922, + "learning_rate": 6.722053646777116e-05, + "loss": 1.419, + "step": 77330 + }, + { + "epoch": 3.289396582383525, + "grad_norm": 0.18729977309703827, + "learning_rate": 6.720529273138983e-05, + "loss": 1.4225, + "step": 77340 + }, + { + "epoch": 3.2895591647331788, + "eval_loss": 2.433459758758545, + "eval_runtime": 21.993, + "eval_samples_per_second": 227.345, + "eval_steps_per_second": 1.228, + "step": 77343 + }, + { + "epoch": 3.2899385235490373, + "grad_norm": 0.19643135368824005, + "learning_rate": 6.719004748306982e-05, + "loss": 1.4253, + "step": 77350 + }, + { + "epoch": 3.2904804647145496, + "grad_norm": 0.21529223024845123, + "learning_rate": 6.717480072469967e-05, + "loss": 1.4184, + "step": 77360 + }, + { + "epoch": 3.2910224058800615, + "grad_norm": 0.1587986797094345, + "learning_rate": 6.715955245816804e-05, + "loss": 1.4235, + "step": 77370 + }, + { + "epoch": 3.2915643470455738, + "grad_norm": 0.23578478395938873, + "learning_rate": 6.714430268536384e-05, + "loss": 1.4172, + "step": 77380 + }, + { + "epoch": 3.292106288211086, + "grad_norm": 0.33754247426986694, + "learning_rate": 6.712905140817616e-05, + "loss": 1.4207, + "step": 77390 + }, + { + "epoch": 3.2926482293765984, + "grad_norm": 0.20548036694526672, + "learning_rate": 6.711379862849426e-05, + "loss": 1.4004, + "step": 77400 + }, + { + "epoch": 3.2931901705421103, + "grad_norm": 0.1700463443994522, + "learning_rate": 6.709854434820757e-05, + "loss": 1.419, + "step": 77410 + }, + { + "epoch": 3.2937321117076226, + "grad_norm": 0.381428062915802, + "learning_rate": 6.708328856920574e-05, + "loss": 1.4164, + "step": 77420 + }, + { + "epoch": 3.294274052873135, + "grad_norm": 0.19036360085010529, + "learning_rate": 6.706803129337856e-05, + "loss": 1.4059, + "step": 77430 + }, + { + "epoch": 3.294274052873135, + "eval_loss": 2.4337708950042725, + "eval_runtime": 22.2936, + "eval_samples_per_second": 224.279, + "eval_steps_per_second": 1.211, + "step": 77430 + }, + { + "epoch": 3.2948159940386472, + "grad_norm": 0.2967754900455475, + "learning_rate": 6.705277252261608e-05, + "loss": 1.4183, + "step": 77440 + }, + { + "epoch": 3.2953579352041595, + "grad_norm": 0.18681344389915466, + "learning_rate": 6.703751225880847e-05, + "loss": 1.4228, + "step": 77450 + }, + { + "epoch": 3.2958998763696714, + "grad_norm": 0.15400539338588715, + "learning_rate": 6.70222505038461e-05, + "loss": 1.4108, + "step": 77460 + }, + { + "epoch": 3.2964418175351837, + "grad_norm": 0.16726654767990112, + "learning_rate": 6.700698725961952e-05, + "loss": 1.4053, + "step": 77470 + }, + { + "epoch": 3.296983758700696, + "grad_norm": 0.2558479309082031, + "learning_rate": 6.699172252801948e-05, + "loss": 1.4146, + "step": 77480 + }, + { + "epoch": 3.2975256998662084, + "grad_norm": 0.16007401049137115, + "learning_rate": 6.697645631093694e-05, + "loss": 1.4116, + "step": 77490 + }, + { + "epoch": 3.2980676410317207, + "grad_norm": 0.23393070697784424, + "learning_rate": 6.696118861026297e-05, + "loss": 1.4125, + "step": 77500 + }, + { + "epoch": 3.2986095821972325, + "grad_norm": 0.18870221078395844, + "learning_rate": 6.69459194278889e-05, + "loss": 1.4135, + "step": 77510 + }, + { + "epoch": 3.298988941013091, + "eval_loss": 2.4385015964508057, + "eval_runtime": 21.9917, + "eval_samples_per_second": 227.358, + "eval_steps_per_second": 1.228, + "step": 77517 + }, + { + "epoch": 3.299151523362745, + "grad_norm": 0.2689937651157379, + "learning_rate": 6.69306487657062e-05, + "loss": 1.4227, + "step": 77520 + }, + { + "epoch": 3.299693464528257, + "grad_norm": 0.28837496042251587, + "learning_rate": 6.691537662560651e-05, + "loss": 1.4243, + "step": 77530 + }, + { + "epoch": 3.3002354056937695, + "grad_norm": 0.18641532957553864, + "learning_rate": 6.69001030094817e-05, + "loss": 1.4236, + "step": 77540 + }, + { + "epoch": 3.300777346859282, + "grad_norm": 0.18917597830295563, + "learning_rate": 6.68848279192238e-05, + "loss": 1.4141, + "step": 77550 + }, + { + "epoch": 3.3013192880247937, + "grad_norm": 0.201051265001297, + "learning_rate": 6.6869551356725e-05, + "loss": 1.422, + "step": 77560 + }, + { + "epoch": 3.301861229190306, + "grad_norm": 0.18273517489433289, + "learning_rate": 6.685427332387774e-05, + "loss": 1.4275, + "step": 77570 + }, + { + "epoch": 3.3024031703558183, + "grad_norm": 0.2597062587738037, + "learning_rate": 6.683899382257454e-05, + "loss": 1.426, + "step": 77580 + }, + { + "epoch": 3.3029451115213306, + "grad_norm": 0.33226585388183594, + "learning_rate": 6.682371285470819e-05, + "loss": 1.4262, + "step": 77590 + }, + { + "epoch": 3.3034870526868425, + "grad_norm": 0.22054164111614227, + "learning_rate": 6.680843042217165e-05, + "loss": 1.4183, + "step": 77600 + }, + { + "epoch": 3.3037038291530476, + "eval_loss": 2.4387526512145996, + "eval_runtime": 21.991, + "eval_samples_per_second": 227.366, + "eval_steps_per_second": 1.228, + "step": 77604 + }, + { + "epoch": 3.304028993852355, + "grad_norm": 0.19982264935970306, + "learning_rate": 6.679314652685798e-05, + "loss": 1.4194, + "step": 77610 + }, + { + "epoch": 3.304570935017867, + "grad_norm": 0.3592228293418884, + "learning_rate": 6.677786117066054e-05, + "loss": 1.4139, + "step": 77620 + }, + { + "epoch": 3.3051128761833795, + "grad_norm": 0.27388885617256165, + "learning_rate": 6.676257435547279e-05, + "loss": 1.4126, + "step": 77630 + }, + { + "epoch": 3.3056548173488913, + "grad_norm": 0.165022611618042, + "learning_rate": 6.674728608318839e-05, + "loss": 1.426, + "step": 77640 + }, + { + "epoch": 3.3061967585144036, + "grad_norm": 0.28118664026260376, + "learning_rate": 6.67319963557012e-05, + "loss": 1.4054, + "step": 77650 + }, + { + "epoch": 3.306738699679916, + "grad_norm": 0.16331221163272858, + "learning_rate": 6.671670517490525e-05, + "loss": 1.419, + "step": 77660 + }, + { + "epoch": 3.3072806408454283, + "grad_norm": 0.16730548441410065, + "learning_rate": 6.67014125426947e-05, + "loss": 1.4121, + "step": 77670 + }, + { + "epoch": 3.3078225820109406, + "grad_norm": 0.22583581507205963, + "learning_rate": 6.668611846096397e-05, + "loss": 1.4194, + "step": 77680 + }, + { + "epoch": 3.3083645231764525, + "grad_norm": 0.195210799574852, + "learning_rate": 6.667082293160766e-05, + "loss": 1.4119, + "step": 77690 + }, + { + "epoch": 3.3084187172930037, + "eval_loss": 2.4286701679229736, + "eval_runtime": 21.9887, + "eval_samples_per_second": 227.39, + "eval_steps_per_second": 1.228, + "step": 77691 + }, + { + "epoch": 3.308906464341965, + "grad_norm": 0.18108178675174713, + "learning_rate": 6.665552595652043e-05, + "loss": 1.4121, + "step": 77700 + }, + { + "epoch": 3.309448405507477, + "grad_norm": 0.2247098833322525, + "learning_rate": 6.664022753759728e-05, + "loss": 1.4101, + "step": 77710 + }, + { + "epoch": 3.3099903466729894, + "grad_norm": 0.3786323666572571, + "learning_rate": 6.662492767673325e-05, + "loss": 1.4119, + "step": 77720 + }, + { + "epoch": 3.3105322878385017, + "grad_norm": 0.19558481872081757, + "learning_rate": 6.660962637582366e-05, + "loss": 1.4074, + "step": 77730 + }, + { + "epoch": 3.3110742290040136, + "grad_norm": 0.2246645838022232, + "learning_rate": 6.659432363676397e-05, + "loss": 1.4203, + "step": 77740 + }, + { + "epoch": 3.311616170169526, + "grad_norm": 0.18004010617733002, + "learning_rate": 6.65790194614498e-05, + "loss": 1.4246, + "step": 77750 + }, + { + "epoch": 3.3121581113350382, + "grad_norm": 0.19459094107151031, + "learning_rate": 6.656371385177697e-05, + "loss": 1.4134, + "step": 77760 + }, + { + "epoch": 3.3127000525005506, + "grad_norm": 0.16068406403064728, + "learning_rate": 6.654840680964148e-05, + "loss": 1.4124, + "step": 77770 + }, + { + "epoch": 3.3131336054329603, + "eval_loss": 2.4155197143554688, + "eval_runtime": 21.9967, + "eval_samples_per_second": 227.307, + "eval_steps_per_second": 1.227, + "step": 77778 + }, + { + "epoch": 3.313241993666063, + "grad_norm": 0.19042399525642395, + "learning_rate": 6.653309833693947e-05, + "loss": 1.4129, + "step": 77780 + }, + { + "epoch": 3.3137839348315747, + "grad_norm": 0.17300648987293243, + "learning_rate": 6.651778843556734e-05, + "loss": 1.4206, + "step": 77790 + }, + { + "epoch": 3.314325875997087, + "grad_norm": 0.23803412914276123, + "learning_rate": 6.650247710742156e-05, + "loss": 1.4187, + "step": 77800 + }, + { + "epoch": 3.3148678171625994, + "grad_norm": 0.26673057675361633, + "learning_rate": 6.648716435439887e-05, + "loss": 1.4223, + "step": 77810 + }, + { + "epoch": 3.3154097583281117, + "grad_norm": 0.25232914090156555, + "learning_rate": 6.647185017839612e-05, + "loss": 1.4211, + "step": 77820 + }, + { + "epoch": 3.3159516994936236, + "grad_norm": 0.22854040563106537, + "learning_rate": 6.645653458131037e-05, + "loss": 1.4249, + "step": 77830 + }, + { + "epoch": 3.316493640659136, + "grad_norm": 0.1900588870048523, + "learning_rate": 6.644121756503888e-05, + "loss": 1.4148, + "step": 77840 + }, + { + "epoch": 3.317035581824648, + "grad_norm": 0.23278219997882843, + "learning_rate": 6.642589913147902e-05, + "loss": 1.4233, + "step": 77850 + }, + { + "epoch": 3.3175775229901605, + "grad_norm": 0.2536775767803192, + "learning_rate": 6.641057928252837e-05, + "loss": 1.412, + "step": 77860 + }, + { + "epoch": 3.3178484935729164, + "eval_loss": 2.4168598651885986, + "eval_runtime": 21.9945, + "eval_samples_per_second": 227.329, + "eval_steps_per_second": 1.228, + "step": 77865 + }, + { + "epoch": 3.3181194641556724, + "grad_norm": 0.16754235327243805, + "learning_rate": 6.63952580200847e-05, + "loss": 1.4147, + "step": 77870 + }, + { + "epoch": 3.3186614053211847, + "grad_norm": 0.1937492936849594, + "learning_rate": 6.637993534604595e-05, + "loss": 1.4192, + "step": 77880 + }, + { + "epoch": 3.319203346486697, + "grad_norm": 0.17745104432106018, + "learning_rate": 6.636461126231022e-05, + "loss": 1.4204, + "step": 77890 + }, + { + "epoch": 3.3197452876522093, + "grad_norm": 0.21982987225055695, + "learning_rate": 6.634928577077577e-05, + "loss": 1.41, + "step": 77900 + }, + { + "epoch": 3.3202872288177216, + "grad_norm": 0.2929491102695465, + "learning_rate": 6.633395887334108e-05, + "loss": 1.4172, + "step": 77910 + }, + { + "epoch": 3.3208291699832335, + "grad_norm": 0.17290639877319336, + "learning_rate": 6.631863057190479e-05, + "loss": 1.4262, + "step": 77920 + }, + { + "epoch": 3.321371111148746, + "grad_norm": 0.18496091663837433, + "learning_rate": 6.630330086836565e-05, + "loss": 1.4092, + "step": 77930 + }, + { + "epoch": 3.321913052314258, + "grad_norm": 0.3094109892845154, + "learning_rate": 6.62879697646227e-05, + "loss": 1.422, + "step": 77940 + }, + { + "epoch": 3.3224549934797705, + "grad_norm": 0.21480301022529602, + "learning_rate": 6.627263726257506e-05, + "loss": 1.4147, + "step": 77950 + }, + { + "epoch": 3.3225633817128726, + "eval_loss": 2.4209840297698975, + "eval_runtime": 21.9901, + "eval_samples_per_second": 227.376, + "eval_steps_per_second": 1.228, + "step": 77952 + }, + { + "epoch": 3.322996934645283, + "grad_norm": 0.19507268071174622, + "learning_rate": 6.625730336412204e-05, + "loss": 1.4109, + "step": 77960 + }, + { + "epoch": 3.3235388758107947, + "grad_norm": 0.21125446259975433, + "learning_rate": 6.624196807116317e-05, + "loss": 1.4112, + "step": 77970 + }, + { + "epoch": 3.324080816976307, + "grad_norm": 0.2264462113380432, + "learning_rate": 6.62266313855981e-05, + "loss": 1.4279, + "step": 77980 + }, + { + "epoch": 3.3246227581418193, + "grad_norm": 0.2783096432685852, + "learning_rate": 6.621129330932668e-05, + "loss": 1.4212, + "step": 77990 + }, + { + "epoch": 3.3251646993073316, + "grad_norm": 0.17773371934890747, + "learning_rate": 6.61959538442489e-05, + "loss": 1.4149, + "step": 78000 + }, + { + "epoch": 3.3257066404728435, + "grad_norm": 0.1886042207479477, + "learning_rate": 6.618061299226497e-05, + "loss": 1.4196, + "step": 78010 + }, + { + "epoch": 3.326248581638356, + "grad_norm": 0.2692718505859375, + "learning_rate": 6.616527075527527e-05, + "loss": 1.4199, + "step": 78020 + }, + { + "epoch": 3.326790522803868, + "grad_norm": 0.1580006331205368, + "learning_rate": 6.61499271351803e-05, + "loss": 1.4211, + "step": 78030 + }, + { + "epoch": 3.327278269852829, + "eval_loss": 2.4329586029052734, + "eval_runtime": 21.9893, + "eval_samples_per_second": 227.384, + "eval_steps_per_second": 1.228, + "step": 78039 + }, + { + "epoch": 3.3273324639693804, + "grad_norm": 0.18620441854000092, + "learning_rate": 6.613458213388073e-05, + "loss": 1.4193, + "step": 78040 + }, + { + "epoch": 3.3278744051348923, + "grad_norm": 0.20754049718379974, + "learning_rate": 6.61192357532775e-05, + "loss": 1.4075, + "step": 78050 + }, + { + "epoch": 3.3284163463004046, + "grad_norm": 0.19851942360401154, + "learning_rate": 6.61038879952716e-05, + "loss": 1.4155, + "step": 78060 + }, + { + "epoch": 3.328958287465917, + "grad_norm": 0.1629553735256195, + "learning_rate": 6.608853886176426e-05, + "loss": 1.4237, + "step": 78070 + }, + { + "epoch": 3.3295002286314292, + "grad_norm": 0.15505240857601166, + "learning_rate": 6.607318835465689e-05, + "loss": 1.4255, + "step": 78080 + }, + { + "epoch": 3.3300421697969416, + "grad_norm": 0.3402341902256012, + "learning_rate": 6.6057836475851e-05, + "loss": 1.4163, + "step": 78090 + }, + { + "epoch": 3.3305841109624534, + "grad_norm": 0.1934230774641037, + "learning_rate": 6.604248322724835e-05, + "loss": 1.4173, + "step": 78100 + }, + { + "epoch": 3.3311260521279658, + "grad_norm": 0.17497047781944275, + "learning_rate": 6.602712861075082e-05, + "loss": 1.4166, + "step": 78110 + }, + { + "epoch": 3.331667993293478, + "grad_norm": 0.19935786724090576, + "learning_rate": 6.601177262826046e-05, + "loss": 1.4221, + "step": 78120 + }, + { + "epoch": 3.3319931579927853, + "eval_loss": 2.4348928928375244, + "eval_runtime": 21.9935, + "eval_samples_per_second": 227.34, + "eval_steps_per_second": 1.228, + "step": 78126 + }, + { + "epoch": 3.3322099344589904, + "grad_norm": 0.17210763692855835, + "learning_rate": 6.599641528167952e-05, + "loss": 1.4111, + "step": 78130 + }, + { + "epoch": 3.3327518756245027, + "grad_norm": 0.175918847322464, + "learning_rate": 6.598105657291041e-05, + "loss": 1.4183, + "step": 78140 + }, + { + "epoch": 3.3332938167900146, + "grad_norm": 0.1733727604150772, + "learning_rate": 6.596569650385567e-05, + "loss": 1.4179, + "step": 78150 + }, + { + "epoch": 3.333835757955527, + "grad_norm": 0.16511158645153046, + "learning_rate": 6.595033507641806e-05, + "loss": 1.415, + "step": 78160 + }, + { + "epoch": 3.334377699121039, + "grad_norm": 0.3177291750907898, + "learning_rate": 6.593497229250048e-05, + "loss": 1.4095, + "step": 78170 + }, + { + "epoch": 3.3349196402865515, + "grad_norm": 0.172699436545372, + "learning_rate": 6.591960815400601e-05, + "loss": 1.4035, + "step": 78180 + }, + { + "epoch": 3.335461581452064, + "grad_norm": 0.28057149052619934, + "learning_rate": 6.590424266283791e-05, + "loss": 1.4162, + "step": 78190 + }, + { + "epoch": 3.3360035226175757, + "grad_norm": 0.23236146569252014, + "learning_rate": 6.588887582089955e-05, + "loss": 1.417, + "step": 78200 + }, + { + "epoch": 3.336545463783088, + "grad_norm": 0.2135874629020691, + "learning_rate": 6.587350763009452e-05, + "loss": 1.4192, + "step": 78210 + }, + { + "epoch": 3.336708046132742, + "eval_loss": 2.4323973655700684, + "eval_runtime": 21.9907, + "eval_samples_per_second": 227.369, + "eval_steps_per_second": 1.228, + "step": 78213 + }, + { + "epoch": 3.3370874049486003, + "grad_norm": 0.4779936671257019, + "learning_rate": 6.585813809232659e-05, + "loss": 1.415, + "step": 78220 + }, + { + "epoch": 3.3376293461141127, + "grad_norm": 0.22560039162635803, + "learning_rate": 6.584276720949964e-05, + "loss": 1.4238, + "step": 78230 + }, + { + "epoch": 3.3381712872796245, + "grad_norm": 0.20752812922000885, + "learning_rate": 6.582739498351778e-05, + "loss": 1.4179, + "step": 78240 + }, + { + "epoch": 3.338713228445137, + "grad_norm": 0.16067805886268616, + "learning_rate": 6.581202141628522e-05, + "loss": 1.4103, + "step": 78250 + }, + { + "epoch": 3.339255169610649, + "grad_norm": 0.18570862710475922, + "learning_rate": 6.579664650970638e-05, + "loss": 1.4131, + "step": 78260 + }, + { + "epoch": 3.3397971107761615, + "grad_norm": 0.2738288938999176, + "learning_rate": 6.578127026568587e-05, + "loss": 1.4215, + "step": 78270 + }, + { + "epoch": 3.3403390519416734, + "grad_norm": 0.26719287037849426, + "learning_rate": 6.57658926861284e-05, + "loss": 1.4188, + "step": 78280 + }, + { + "epoch": 3.3408809931071857, + "grad_norm": 0.23562686145305634, + "learning_rate": 6.575051377293888e-05, + "loss": 1.4081, + "step": 78290 + }, + { + "epoch": 3.341422934272698, + "grad_norm": 0.17355681955814362, + "learning_rate": 6.573513352802239e-05, + "loss": 1.4196, + "step": 78300 + }, + { + "epoch": 3.341422934272698, + "eval_loss": 2.430081605911255, + "eval_runtime": 21.985, + "eval_samples_per_second": 227.428, + "eval_steps_per_second": 1.228, + "step": 78300 + }, + { + "epoch": 3.3419648754382103, + "grad_norm": 0.19452211260795593, + "learning_rate": 6.571975195328416e-05, + "loss": 1.4239, + "step": 78310 + }, + { + "epoch": 3.3425068166037226, + "grad_norm": 0.20565609633922577, + "learning_rate": 6.570436905062961e-05, + "loss": 1.4145, + "step": 78320 + }, + { + "epoch": 3.3430487577692345, + "grad_norm": 0.163151815533638, + "learning_rate": 6.568898482196428e-05, + "loss": 1.4153, + "step": 78330 + }, + { + "epoch": 3.343590698934747, + "grad_norm": 0.17055930197238922, + "learning_rate": 6.567359926919394e-05, + "loss": 1.4203, + "step": 78340 + }, + { + "epoch": 3.344132640100259, + "grad_norm": 0.17260503768920898, + "learning_rate": 6.565821239422444e-05, + "loss": 1.4126, + "step": 78350 + }, + { + "epoch": 3.3446745812657714, + "grad_norm": 0.19056035578250885, + "learning_rate": 6.564282419896187e-05, + "loss": 1.4296, + "step": 78360 + }, + { + "epoch": 3.3452165224312838, + "grad_norm": 0.1595456898212433, + "learning_rate": 6.562743468531244e-05, + "loss": 1.4219, + "step": 78370 + }, + { + "epoch": 3.3457584635967956, + "grad_norm": 0.16452792286872864, + "learning_rate": 6.561204385518257e-05, + "loss": 1.4136, + "step": 78380 + }, + { + "epoch": 3.346137822412654, + "eval_loss": 2.4075536727905273, + "eval_runtime": 21.9892, + "eval_samples_per_second": 227.384, + "eval_steps_per_second": 1.228, + "step": 78387 + }, + { + "epoch": 3.346300404762308, + "grad_norm": 0.3267938196659088, + "learning_rate": 6.559665171047876e-05, + "loss": 1.4088, + "step": 78390 + }, + { + "epoch": 3.3468423459278203, + "grad_norm": 0.24453267455101013, + "learning_rate": 6.558125825310772e-05, + "loss": 1.4215, + "step": 78400 + }, + { + "epoch": 3.3473842870933326, + "grad_norm": 0.20705412328243256, + "learning_rate": 6.556586348497637e-05, + "loss": 1.4251, + "step": 78410 + }, + { + "epoch": 3.347926228258845, + "grad_norm": 0.2045924812555313, + "learning_rate": 6.555046740799173e-05, + "loss": 1.4172, + "step": 78420 + }, + { + "epoch": 3.3484681694243568, + "grad_norm": 0.1746375858783722, + "learning_rate": 6.553507002406099e-05, + "loss": 1.4219, + "step": 78430 + }, + { + "epoch": 3.349010110589869, + "grad_norm": 0.23190824687480927, + "learning_rate": 6.551967133509149e-05, + "loss": 1.4169, + "step": 78440 + }, + { + "epoch": 3.3495520517553814, + "grad_norm": 0.21592742204666138, + "learning_rate": 6.550427134299079e-05, + "loss": 1.415, + "step": 78450 + }, + { + "epoch": 3.3500939929208937, + "grad_norm": 0.18433372676372528, + "learning_rate": 6.548887004966658e-05, + "loss": 1.415, + "step": 78460 + }, + { + "epoch": 3.3506359340864056, + "grad_norm": 0.1842111349105835, + "learning_rate": 6.547346745702666e-05, + "loss": 1.4273, + "step": 78470 + }, + { + "epoch": 3.3508527105526107, + "eval_loss": 2.4247753620147705, + "eval_runtime": 21.9917, + "eval_samples_per_second": 227.358, + "eval_steps_per_second": 1.228, + "step": 78474 + }, + { + "epoch": 3.351177875251918, + "grad_norm": 0.18961256742477417, + "learning_rate": 6.545806356697908e-05, + "loss": 1.4107, + "step": 78480 + }, + { + "epoch": 3.35171981641743, + "grad_norm": 0.15932081639766693, + "learning_rate": 6.544265838143197e-05, + "loss": 1.4176, + "step": 78490 + }, + { + "epoch": 3.3522617575829425, + "grad_norm": 0.1957116425037384, + "learning_rate": 6.54272519022937e-05, + "loss": 1.4084, + "step": 78500 + }, + { + "epoch": 3.3528036987484544, + "grad_norm": 0.2836982011795044, + "learning_rate": 6.541184413147273e-05, + "loss": 1.4135, + "step": 78510 + }, + { + "epoch": 3.3533456399139667, + "grad_norm": 0.19204333424568176, + "learning_rate": 6.53964350708777e-05, + "loss": 1.4181, + "step": 78520 + }, + { + "epoch": 3.353887581079479, + "grad_norm": 0.173109233379364, + "learning_rate": 6.538102472241745e-05, + "loss": 1.4161, + "step": 78530 + }, + { + "epoch": 3.3544295222449914, + "grad_norm": 0.24930110573768616, + "learning_rate": 6.536561308800091e-05, + "loss": 1.4201, + "step": 78540 + }, + { + "epoch": 3.3549714634105037, + "grad_norm": 0.1809731125831604, + "learning_rate": 6.535020016953723e-05, + "loss": 1.4107, + "step": 78550 + }, + { + "epoch": 3.3555134045760155, + "grad_norm": 0.18697808682918549, + "learning_rate": 6.53347859689357e-05, + "loss": 1.415, + "step": 78560 + }, + { + "epoch": 3.355567598692567, + "eval_loss": 2.433493137359619, + "eval_runtime": 21.9938, + "eval_samples_per_second": 227.337, + "eval_steps_per_second": 1.228, + "step": 78561 + }, + { + "epoch": 3.356055345741528, + "grad_norm": 0.24766592681407928, + "learning_rate": 6.531937048810573e-05, + "loss": 1.4146, + "step": 78570 + }, + { + "epoch": 3.35659728690704, + "grad_norm": 0.16486966609954834, + "learning_rate": 6.530395372895697e-05, + "loss": 1.4123, + "step": 78580 + }, + { + "epoch": 3.3571392280725525, + "grad_norm": 0.20207463204860687, + "learning_rate": 6.528853569339913e-05, + "loss": 1.4197, + "step": 78590 + }, + { + "epoch": 3.357681169238065, + "grad_norm": 0.17322023212909698, + "learning_rate": 6.527311638334218e-05, + "loss": 1.4166, + "step": 78600 + }, + { + "epoch": 3.3582231104035767, + "grad_norm": 0.18780970573425293, + "learning_rate": 6.525769580069617e-05, + "loss": 1.4212, + "step": 78610 + }, + { + "epoch": 3.358765051569089, + "grad_norm": 0.2877364456653595, + "learning_rate": 6.524227394737135e-05, + "loss": 1.4195, + "step": 78620 + }, + { + "epoch": 3.3593069927346013, + "grad_norm": 0.2212965041399002, + "learning_rate": 6.522685082527807e-05, + "loss": 1.4172, + "step": 78630 + }, + { + "epoch": 3.3598489339001136, + "grad_norm": 0.2948903441429138, + "learning_rate": 6.521142643632692e-05, + "loss": 1.4171, + "step": 78640 + }, + { + "epoch": 3.3602824868325234, + "eval_loss": 2.4318501949310303, + "eval_runtime": 24.8893, + "eval_samples_per_second": 200.89, + "eval_steps_per_second": 1.085, + "step": 78648 + }, + { + "epoch": 3.3603908750656255, + "grad_norm": 0.1802278459072113, + "learning_rate": 6.51960007824286e-05, + "loss": 1.4057, + "step": 78650 + }, + { + "epoch": 3.360932816231138, + "grad_norm": 0.2904506027698517, + "learning_rate": 6.518057386549398e-05, + "loss": 1.4219, + "step": 78660 + }, + { + "epoch": 3.36147475739665, + "grad_norm": 0.3286694586277008, + "learning_rate": 6.516514568743407e-05, + "loss": 1.4142, + "step": 78670 + }, + { + "epoch": 3.3620166985621625, + "grad_norm": 0.1851622760295868, + "learning_rate": 6.514971625016004e-05, + "loss": 1.4056, + "step": 78680 + }, + { + "epoch": 3.3625586397276743, + "grad_norm": 0.1889377385377884, + "learning_rate": 6.513428555558321e-05, + "loss": 1.4122, + "step": 78690 + }, + { + "epoch": 3.3631005808931866, + "grad_norm": 0.15249784290790558, + "learning_rate": 6.51188536056151e-05, + "loss": 1.4211, + "step": 78700 + }, + { + "epoch": 3.363642522058699, + "grad_norm": 0.3704091012477875, + "learning_rate": 6.510342040216733e-05, + "loss": 1.4221, + "step": 78710 + }, + { + "epoch": 3.3641844632242113, + "grad_norm": 0.2278876155614853, + "learning_rate": 6.508798594715172e-05, + "loss": 1.4273, + "step": 78720 + }, + { + "epoch": 3.3647264043897236, + "grad_norm": 0.21338103711605072, + "learning_rate": 6.507255024248019e-05, + "loss": 1.4185, + "step": 78730 + }, + { + "epoch": 3.3649973749724795, + "eval_loss": 2.418487548828125, + "eval_runtime": 21.9902, + "eval_samples_per_second": 227.374, + "eval_steps_per_second": 1.228, + "step": 78735 + }, + { + "epoch": 3.3652683455552355, + "grad_norm": 0.3524332642555237, + "learning_rate": 6.505711329006488e-05, + "loss": 1.4218, + "step": 78740 + }, + { + "epoch": 3.365810286720748, + "grad_norm": 0.1828557401895523, + "learning_rate": 6.504167509181804e-05, + "loss": 1.4172, + "step": 78750 + }, + { + "epoch": 3.36635222788626, + "grad_norm": 0.21107490360736847, + "learning_rate": 6.502623564965206e-05, + "loss": 1.4145, + "step": 78760 + }, + { + "epoch": 3.3668941690517724, + "grad_norm": 0.26644033193588257, + "learning_rate": 6.501079496547957e-05, + "loss": 1.4113, + "step": 78770 + }, + { + "epoch": 3.3674361102172847, + "grad_norm": 0.15477254986763, + "learning_rate": 6.499535304121324e-05, + "loss": 1.4176, + "step": 78780 + }, + { + "epoch": 3.3679780513827966, + "grad_norm": 0.1923697292804718, + "learning_rate": 6.497990987876598e-05, + "loss": 1.4192, + "step": 78790 + }, + { + "epoch": 3.368519992548309, + "grad_norm": 0.18154162168502808, + "learning_rate": 6.496446548005082e-05, + "loss": 1.4184, + "step": 78800 + }, + { + "epoch": 3.3690619337138212, + "grad_norm": 0.24058213829994202, + "learning_rate": 6.494901984698093e-05, + "loss": 1.4109, + "step": 78810 + }, + { + "epoch": 3.3696038748793335, + "grad_norm": 0.1790485382080078, + "learning_rate": 6.493357298146965e-05, + "loss": 1.4162, + "step": 78820 + }, + { + "epoch": 3.3697122631124357, + "eval_loss": 2.420318126678467, + "eval_runtime": 21.9925, + "eval_samples_per_second": 227.35, + "eval_steps_per_second": 1.228, + "step": 78822 + }, + { + "epoch": 3.370145816044846, + "grad_norm": 0.1824759691953659, + "learning_rate": 6.491812488543049e-05, + "loss": 1.4181, + "step": 78830 + }, + { + "epoch": 3.3706877572103577, + "grad_norm": 0.27409785985946655, + "learning_rate": 6.490267556077706e-05, + "loss": 1.4215, + "step": 78840 + }, + { + "epoch": 3.37122969837587, + "grad_norm": 0.17618605494499207, + "learning_rate": 6.488722500942321e-05, + "loss": 1.4215, + "step": 78850 + }, + { + "epoch": 3.3717716395413824, + "grad_norm": 0.21089158952236176, + "learning_rate": 6.487177323328282e-05, + "loss": 1.4105, + "step": 78860 + }, + { + "epoch": 3.3723135807068947, + "grad_norm": 0.1864428073167801, + "learning_rate": 6.485632023427003e-05, + "loss": 1.4122, + "step": 78870 + }, + { + "epoch": 3.3728555218724066, + "grad_norm": 0.21700777113437653, + "learning_rate": 6.484086601429907e-05, + "loss": 1.4169, + "step": 78880 + }, + { + "epoch": 3.373397463037919, + "grad_norm": 0.1815451681613922, + "learning_rate": 6.482541057528437e-05, + "loss": 1.428, + "step": 78890 + }, + { + "epoch": 3.373939404203431, + "grad_norm": 0.23216386139392853, + "learning_rate": 6.480995391914046e-05, + "loss": 1.4166, + "step": 78900 + }, + { + "epoch": 3.3744271512523922, + "eval_loss": 2.411614418029785, + "eval_runtime": 21.9934, + "eval_samples_per_second": 227.341, + "eval_steps_per_second": 1.228, + "step": 78909 + }, + { + "epoch": 3.3744813453689435, + "grad_norm": 0.1651027947664261, + "learning_rate": 6.479449604778206e-05, + "loss": 1.4183, + "step": 78910 + }, + { + "epoch": 3.3750232865344554, + "grad_norm": 0.20378755033016205, + "learning_rate": 6.477903696312398e-05, + "loss": 1.4167, + "step": 78920 + }, + { + "epoch": 3.3755652276999677, + "grad_norm": 0.19732166826725006, + "learning_rate": 6.476357666708129e-05, + "loss": 1.4083, + "step": 78930 + }, + { + "epoch": 3.37610716886548, + "grad_norm": 0.23895850777626038, + "learning_rate": 6.47481151615691e-05, + "loss": 1.4173, + "step": 78940 + }, + { + "epoch": 3.3766491100309923, + "grad_norm": 0.15536242723464966, + "learning_rate": 6.473265244850273e-05, + "loss": 1.4142, + "step": 78950 + }, + { + "epoch": 3.3771910511965046, + "grad_norm": 0.23873640596866608, + "learning_rate": 6.471718852979762e-05, + "loss": 1.4163, + "step": 78960 + }, + { + "epoch": 3.3777329923620165, + "grad_norm": 0.3809368908405304, + "learning_rate": 6.47017234073694e-05, + "loss": 1.404, + "step": 78970 + }, + { + "epoch": 3.378274933527529, + "grad_norm": 0.19484516978263855, + "learning_rate": 6.468625708313378e-05, + "loss": 1.4137, + "step": 78980 + }, + { + "epoch": 3.378816874693041, + "grad_norm": 0.23201866447925568, + "learning_rate": 6.46707895590067e-05, + "loss": 1.4246, + "step": 78990 + }, + { + "epoch": 3.3791420393923484, + "eval_loss": 2.4239916801452637, + "eval_runtime": 21.9918, + "eval_samples_per_second": 227.358, + "eval_steps_per_second": 1.228, + "step": 78996 + }, + { + "epoch": 3.3793588158585535, + "grad_norm": 0.17255160212516785, + "learning_rate": 6.46553208369042e-05, + "loss": 1.4182, + "step": 79000 + }, + { + "epoch": 3.379900757024066, + "grad_norm": 0.148577019572258, + "learning_rate": 6.463985091874248e-05, + "loss": 1.4222, + "step": 79010 + }, + { + "epoch": 3.3804426981895777, + "grad_norm": 0.17819197475910187, + "learning_rate": 6.462437980643786e-05, + "loss": 1.4207, + "step": 79020 + }, + { + "epoch": 3.38098463935509, + "grad_norm": 0.18190248310565948, + "learning_rate": 6.460890750190686e-05, + "loss": 1.4113, + "step": 79030 + }, + { + "epoch": 3.3815265805206023, + "grad_norm": 0.17424237728118896, + "learning_rate": 6.459343400706612e-05, + "loss": 1.4333, + "step": 79040 + }, + { + "epoch": 3.3820685216861146, + "grad_norm": 0.16150598227977753, + "learning_rate": 6.457795932383242e-05, + "loss": 1.4171, + "step": 79050 + }, + { + "epoch": 3.3826104628516265, + "grad_norm": 0.1738124042749405, + "learning_rate": 6.456248345412272e-05, + "loss": 1.4096, + "step": 79060 + }, + { + "epoch": 3.383152404017139, + "grad_norm": 0.21694537997245789, + "learning_rate": 6.454700639985408e-05, + "loss": 1.4149, + "step": 79070 + }, + { + "epoch": 3.383694345182651, + "grad_norm": 0.170954167842865, + "learning_rate": 6.453152816294373e-05, + "loss": 1.4234, + "step": 79080 + }, + { + "epoch": 3.383856927532305, + "eval_loss": 2.4151031970977783, + "eval_runtime": 21.9982, + "eval_samples_per_second": 227.291, + "eval_steps_per_second": 1.227, + "step": 79083 + }, + { + "epoch": 3.3842362863481634, + "grad_norm": 0.2635592818260193, + "learning_rate": 6.451604874530905e-05, + "loss": 1.4245, + "step": 79090 + }, + { + "epoch": 3.3847782275136753, + "grad_norm": 0.2918653190135956, + "learning_rate": 6.450056814886756e-05, + "loss": 1.4131, + "step": 79100 + }, + { + "epoch": 3.3853201686791876, + "grad_norm": 0.3705879747867584, + "learning_rate": 6.448508637553695e-05, + "loss": 1.427, + "step": 79110 + }, + { + "epoch": 3.3858621098447, + "grad_norm": 0.18804314732551575, + "learning_rate": 6.446960342723503e-05, + "loss": 1.4223, + "step": 79120 + }, + { + "epoch": 3.3864040510102122, + "grad_norm": 0.15354295074939728, + "learning_rate": 6.445411930587971e-05, + "loss": 1.4193, + "step": 79130 + }, + { + "epoch": 3.3869459921757246, + "grad_norm": 0.2109123021364212, + "learning_rate": 6.443863401338917e-05, + "loss": 1.4122, + "step": 79140 + }, + { + "epoch": 3.3874879333412364, + "grad_norm": 0.2178303450345993, + "learning_rate": 6.442314755168162e-05, + "loss": 1.4152, + "step": 79150 + }, + { + "epoch": 3.3880298745067488, + "grad_norm": 0.24825222790241241, + "learning_rate": 6.440765992267546e-05, + "loss": 1.4112, + "step": 79160 + }, + { + "epoch": 3.388571815672261, + "grad_norm": 0.18957504630088806, + "learning_rate": 6.439217112828924e-05, + "loss": 1.4217, + "step": 79170 + }, + { + "epoch": 3.388571815672261, + "eval_loss": 2.423619270324707, + "eval_runtime": 21.99, + "eval_samples_per_second": 227.376, + "eval_steps_per_second": 1.228, + "step": 79170 + }, + { + "epoch": 3.3891137568377734, + "grad_norm": 0.16275407373905182, + "learning_rate": 6.437668117044162e-05, + "loss": 1.4085, + "step": 79180 + }, + { + "epoch": 3.3896556980032857, + "grad_norm": 0.26474809646606445, + "learning_rate": 6.436119005105145e-05, + "loss": 1.4169, + "step": 79190 + }, + { + "epoch": 3.3901976391687976, + "grad_norm": 0.3433353900909424, + "learning_rate": 6.434569777203772e-05, + "loss": 1.4049, + "step": 79200 + }, + { + "epoch": 3.39073958033431, + "grad_norm": 0.31058382987976074, + "learning_rate": 6.433020433531951e-05, + "loss": 1.4222, + "step": 79210 + }, + { + "epoch": 3.391281521499822, + "grad_norm": 0.1867647022008896, + "learning_rate": 6.43147097428161e-05, + "loss": 1.4368, + "step": 79220 + }, + { + "epoch": 3.3918234626653345, + "grad_norm": 0.38746219873428345, + "learning_rate": 6.429921399644687e-05, + "loss": 1.4072, + "step": 79230 + }, + { + "epoch": 3.392365403830847, + "grad_norm": 0.21169978380203247, + "learning_rate": 6.42837170981314e-05, + "loss": 1.4154, + "step": 79240 + }, + { + "epoch": 3.3929073449963587, + "grad_norm": 0.17615120112895966, + "learning_rate": 6.426821904978936e-05, + "loss": 1.4117, + "step": 79250 + }, + { + "epoch": 3.393286703812217, + "eval_loss": 2.415933609008789, + "eval_runtime": 21.9936, + "eval_samples_per_second": 227.339, + "eval_steps_per_second": 1.228, + "step": 79257 + }, + { + "epoch": 3.393449286161871, + "grad_norm": 0.2107549011707306, + "learning_rate": 6.425271985334059e-05, + "loss": 1.4119, + "step": 79260 + }, + { + "epoch": 3.3939912273273833, + "grad_norm": 0.30386942625045776, + "learning_rate": 6.423721951070502e-05, + "loss": 1.408, + "step": 79270 + }, + { + "epoch": 3.3945331684928957, + "grad_norm": 0.17193304002285004, + "learning_rate": 6.422171802380283e-05, + "loss": 1.4055, + "step": 79280 + }, + { + "epoch": 3.3950751096584075, + "grad_norm": 0.30357927083969116, + "learning_rate": 6.420621539455426e-05, + "loss": 1.411, + "step": 79290 + }, + { + "epoch": 3.39561705082392, + "grad_norm": 0.20240160822868347, + "learning_rate": 6.419071162487969e-05, + "loss": 1.4088, + "step": 79300 + }, + { + "epoch": 3.396158991989432, + "grad_norm": 0.18283316493034363, + "learning_rate": 6.417520671669964e-05, + "loss": 1.4094, + "step": 79310 + }, + { + "epoch": 3.3967009331549445, + "grad_norm": 0.16190268099308014, + "learning_rate": 6.415970067193483e-05, + "loss": 1.4148, + "step": 79320 + }, + { + "epoch": 3.3972428743204564, + "grad_norm": 0.22923493385314941, + "learning_rate": 6.414419349250608e-05, + "loss": 1.409, + "step": 79330 + }, + { + "epoch": 3.3977848154859687, + "grad_norm": 0.3721964359283447, + "learning_rate": 6.412868518033432e-05, + "loss": 1.4198, + "step": 79340 + }, + { + "epoch": 3.3980015919521738, + "eval_loss": 2.421030044555664, + "eval_runtime": 21.9926, + "eval_samples_per_second": 227.349, + "eval_steps_per_second": 1.228, + "step": 79344 + }, + { + "epoch": 3.398326756651481, + "grad_norm": 0.392916738986969, + "learning_rate": 6.41131757373407e-05, + "loss": 1.4077, + "step": 79350 + }, + { + "epoch": 3.3988686978169933, + "grad_norm": 0.5418851375579834, + "learning_rate": 6.409766516544642e-05, + "loss": 1.4154, + "step": 79360 + }, + { + "epoch": 3.3994106389825056, + "grad_norm": 0.20894989371299744, + "learning_rate": 6.408215346657287e-05, + "loss": 1.4091, + "step": 79370 + }, + { + "epoch": 3.3999525801480175, + "grad_norm": 0.2258988916873932, + "learning_rate": 6.406664064264159e-05, + "loss": 1.4104, + "step": 79380 + }, + { + "epoch": 3.40049452131353, + "grad_norm": 0.1968545764684677, + "learning_rate": 6.405112669557424e-05, + "loss": 1.4136, + "step": 79390 + }, + { + "epoch": 3.401036462479042, + "grad_norm": 0.2986617684364319, + "learning_rate": 6.403561162729257e-05, + "loss": 1.4032, + "step": 79400 + }, + { + "epoch": 3.4015784036445544, + "grad_norm": 0.16921205818653107, + "learning_rate": 6.402009543971861e-05, + "loss": 1.4121, + "step": 79410 + }, + { + "epoch": 3.4021203448100668, + "grad_norm": 0.21860603988170624, + "learning_rate": 6.400457813477435e-05, + "loss": 1.418, + "step": 79420 + }, + { + "epoch": 3.4026622859755786, + "grad_norm": 0.15775620937347412, + "learning_rate": 6.398905971438207e-05, + "loss": 1.4144, + "step": 79430 + }, + { + "epoch": 3.40271648009213, + "eval_loss": 2.416477680206299, + "eval_runtime": 21.9934, + "eval_samples_per_second": 227.341, + "eval_steps_per_second": 1.228, + "step": 79431 + }, + { + "epoch": 3.403204227141091, + "grad_norm": 0.16970393061637878, + "learning_rate": 6.39735401804641e-05, + "loss": 1.4145, + "step": 79440 + }, + { + "epoch": 3.4037461683066033, + "grad_norm": 0.21157051622867584, + "learning_rate": 6.395801953494292e-05, + "loss": 1.4297, + "step": 79450 + }, + { + "epoch": 3.4042881094721156, + "grad_norm": 0.24515606462955475, + "learning_rate": 6.394249777974118e-05, + "loss": 1.4207, + "step": 79460 + }, + { + "epoch": 3.404830050637628, + "grad_norm": 0.20401237905025482, + "learning_rate": 6.392697491678163e-05, + "loss": 1.4104, + "step": 79470 + }, + { + "epoch": 3.4053719918031398, + "grad_norm": 0.2022889405488968, + "learning_rate": 6.391145094798718e-05, + "loss": 1.4191, + "step": 79480 + }, + { + "epoch": 3.405913932968652, + "grad_norm": 0.2641688287258148, + "learning_rate": 6.389592587528089e-05, + "loss": 1.4175, + "step": 79490 + }, + { + "epoch": 3.4064558741341644, + "grad_norm": 0.30329614877700806, + "learning_rate": 6.388039970058591e-05, + "loss": 1.4097, + "step": 79500 + }, + { + "epoch": 3.4069978152996767, + "grad_norm": 0.19015826284885406, + "learning_rate": 6.386487242582559e-05, + "loss": 1.4183, + "step": 79510 + }, + { + "epoch": 3.4074313682320865, + "eval_loss": 2.4201693534851074, + "eval_runtime": 22.0994, + "eval_samples_per_second": 226.25, + "eval_steps_per_second": 1.222, + "step": 79518 + }, + { + "epoch": 3.4075397564651886, + "grad_norm": 0.2884151041507721, + "learning_rate": 6.384934405292335e-05, + "loss": 1.4081, + "step": 79520 + }, + { + "epoch": 3.408081697630701, + "grad_norm": 0.2025626003742218, + "learning_rate": 6.383381458380279e-05, + "loss": 1.408, + "step": 79530 + }, + { + "epoch": 3.408623638796213, + "grad_norm": 0.20512153208255768, + "learning_rate": 6.381828402038763e-05, + "loss": 1.414, + "step": 79540 + }, + { + "epoch": 3.4091655799617255, + "grad_norm": 0.23173965513706207, + "learning_rate": 6.380275236460174e-05, + "loss": 1.4234, + "step": 79550 + }, + { + "epoch": 3.4097075211272374, + "grad_norm": 0.21376845240592957, + "learning_rate": 6.378721961836908e-05, + "loss": 1.4114, + "step": 79560 + }, + { + "epoch": 3.4102494622927497, + "grad_norm": 0.25625619292259216, + "learning_rate": 6.377168578361383e-05, + "loss": 1.4096, + "step": 79570 + }, + { + "epoch": 3.410791403458262, + "grad_norm": 0.1973353773355484, + "learning_rate": 6.375615086226022e-05, + "loss": 1.4196, + "step": 79580 + }, + { + "epoch": 3.4113333446237744, + "grad_norm": 0.21326379477977753, + "learning_rate": 6.374061485623266e-05, + "loss": 1.4115, + "step": 79590 + }, + { + "epoch": 3.4118752857892867, + "grad_norm": 0.16970664262771606, + "learning_rate": 6.372507776745567e-05, + "loss": 1.4198, + "step": 79600 + }, + { + "epoch": 3.4121462563720426, + "eval_loss": 2.4215962886810303, + "eval_runtime": 21.994, + "eval_samples_per_second": 227.335, + "eval_steps_per_second": 1.228, + "step": 79605 + }, + { + "epoch": 3.4124172269547985, + "grad_norm": 0.15813182294368744, + "learning_rate": 6.370953959785393e-05, + "loss": 1.4056, + "step": 79610 + }, + { + "epoch": 3.412959168120311, + "grad_norm": 0.3504032492637634, + "learning_rate": 6.369400034935224e-05, + "loss": 1.4224, + "step": 79620 + }, + { + "epoch": 3.413501109285823, + "grad_norm": 0.4499680995941162, + "learning_rate": 6.367846002387552e-05, + "loss": 1.4056, + "step": 79630 + }, + { + "epoch": 3.4140430504513355, + "grad_norm": 0.2306382656097412, + "learning_rate": 6.366291862334887e-05, + "loss": 1.416, + "step": 79640 + }, + { + "epoch": 3.414584991616848, + "grad_norm": 0.1756628006696701, + "learning_rate": 6.364737614969747e-05, + "loss": 1.4121, + "step": 79650 + }, + { + "epoch": 3.4151269327823597, + "grad_norm": 0.20633266866207123, + "learning_rate": 6.363183260484665e-05, + "loss": 1.4095, + "step": 79660 + }, + { + "epoch": 3.415668873947872, + "grad_norm": 0.17313161492347717, + "learning_rate": 6.361628799072187e-05, + "loss": 1.4148, + "step": 79670 + }, + { + "epoch": 3.4162108151133843, + "grad_norm": 0.16287872195243835, + "learning_rate": 6.360074230924877e-05, + "loss": 1.399, + "step": 79680 + }, + { + "epoch": 3.4167527562788966, + "grad_norm": 0.18536217510700226, + "learning_rate": 6.358519556235302e-05, + "loss": 1.4155, + "step": 79690 + }, + { + "epoch": 3.4168611445119987, + "eval_loss": 2.429548501968384, + "eval_runtime": 21.9964, + "eval_samples_per_second": 227.31, + "eval_steps_per_second": 1.227, + "step": 79692 + }, + { + "epoch": 3.4172946974444085, + "grad_norm": 0.3025163412094116, + "learning_rate": 6.356964775196055e-05, + "loss": 1.4113, + "step": 79700 + }, + { + "epoch": 3.417836638609921, + "grad_norm": 0.30600860714912415, + "learning_rate": 6.35540988799973e-05, + "loss": 1.4125, + "step": 79710 + }, + { + "epoch": 3.418378579775433, + "grad_norm": 0.2034313678741455, + "learning_rate": 6.353854894838942e-05, + "loss": 1.4084, + "step": 79720 + }, + { + "epoch": 3.4189205209409455, + "grad_norm": 0.16151131689548492, + "learning_rate": 6.352299795906317e-05, + "loss": 1.4079, + "step": 79730 + }, + { + "epoch": 3.4194624621064573, + "grad_norm": 0.3311507999897003, + "learning_rate": 6.350744591394494e-05, + "loss": 1.4156, + "step": 79740 + }, + { + "epoch": 3.4200044032719696, + "grad_norm": 0.21057330071926117, + "learning_rate": 6.349189281496124e-05, + "loss": 1.4149, + "step": 79750 + }, + { + "epoch": 3.420546344437482, + "grad_norm": 0.2549402117729187, + "learning_rate": 6.347633866403873e-05, + "loss": 1.4086, + "step": 79760 + }, + { + "epoch": 3.4210882856029943, + "grad_norm": 0.2768961787223816, + "learning_rate": 6.346078346310417e-05, + "loss": 1.4118, + "step": 79770 + }, + { + "epoch": 3.4215760326519553, + "eval_loss": 2.4365317821502686, + "eval_runtime": 21.997, + "eval_samples_per_second": 227.304, + "eval_steps_per_second": 1.227, + "step": 79779 + }, + { + "epoch": 3.4216302267685066, + "grad_norm": 0.16737370193004608, + "learning_rate": 6.344522721408453e-05, + "loss": 1.416, + "step": 79780 + }, + { + "epoch": 3.4221721679340185, + "grad_norm": 0.1741742491722107, + "learning_rate": 6.342966991890677e-05, + "loss": 1.4094, + "step": 79790 + }, + { + "epoch": 3.4227141090995308, + "grad_norm": 0.19367846846580505, + "learning_rate": 6.341411157949812e-05, + "loss": 1.4087, + "step": 79800 + }, + { + "epoch": 3.423256050265043, + "grad_norm": 0.17661771178245544, + "learning_rate": 6.339855219778586e-05, + "loss": 1.4114, + "step": 79810 + }, + { + "epoch": 3.4237979914305554, + "grad_norm": 0.1794527918100357, + "learning_rate": 6.338299177569739e-05, + "loss": 1.4256, + "step": 79820 + }, + { + "epoch": 3.4243399325960677, + "grad_norm": 0.16882766783237457, + "learning_rate": 6.336743031516031e-05, + "loss": 1.4138, + "step": 79830 + }, + { + "epoch": 3.4248818737615796, + "grad_norm": 0.2601432204246521, + "learning_rate": 6.33518678181023e-05, + "loss": 1.4253, + "step": 79840 + }, + { + "epoch": 3.425423814927092, + "grad_norm": 0.15983355045318604, + "learning_rate": 6.333630428645116e-05, + "loss": 1.4191, + "step": 79850 + }, + { + "epoch": 3.4259657560926042, + "grad_norm": 0.2166626751422882, + "learning_rate": 6.332073972213482e-05, + "loss": 1.4176, + "step": 79860 + }, + { + "epoch": 3.4262909207919114, + "eval_loss": 2.441222906112671, + "eval_runtime": 21.995, + "eval_samples_per_second": 227.324, + "eval_steps_per_second": 1.228, + "step": 79866 + }, + { + "epoch": 3.4265076972581165, + "grad_norm": 0.27949678897857666, + "learning_rate": 6.330517412708138e-05, + "loss": 1.4055, + "step": 79870 + }, + { + "epoch": 3.427049638423629, + "grad_norm": 0.23983129858970642, + "learning_rate": 6.328960750321903e-05, + "loss": 1.4154, + "step": 79880 + }, + { + "epoch": 3.4275915795891407, + "grad_norm": 0.18861855566501617, + "learning_rate": 6.32740398524761e-05, + "loss": 1.4094, + "step": 79890 + }, + { + "epoch": 3.428133520754653, + "grad_norm": 0.2114916443824768, + "learning_rate": 6.325847117678102e-05, + "loss": 1.4138, + "step": 79900 + }, + { + "epoch": 3.4286754619201654, + "grad_norm": 0.22669847309589386, + "learning_rate": 6.32429014780624e-05, + "loss": 1.4116, + "step": 79910 + }, + { + "epoch": 3.4292174030856777, + "grad_norm": 0.18799784779548645, + "learning_rate": 6.322733075824891e-05, + "loss": 1.4199, + "step": 79920 + }, + { + "epoch": 3.4297593442511896, + "grad_norm": 0.2161915898323059, + "learning_rate": 6.321175901926941e-05, + "loss": 1.4138, + "step": 79930 + }, + { + "epoch": 3.430301285416702, + "grad_norm": 0.17232954502105713, + "learning_rate": 6.319618626305288e-05, + "loss": 1.4154, + "step": 79940 + }, + { + "epoch": 3.430843226582214, + "grad_norm": 0.20791089534759521, + "learning_rate": 6.318061249152835e-05, + "loss": 1.4172, + "step": 79950 + }, + { + "epoch": 3.431005808931868, + "eval_loss": 2.4264652729034424, + "eval_runtime": 21.9927, + "eval_samples_per_second": 227.349, + "eval_steps_per_second": 1.228, + "step": 79953 + }, + { + "epoch": 3.4313851677477265, + "grad_norm": 0.22153067588806152, + "learning_rate": 6.316503770662508e-05, + "loss": 1.4078, + "step": 79960 + }, + { + "epoch": 3.4319271089132384, + "grad_norm": 0.17650219798088074, + "learning_rate": 6.314946191027238e-05, + "loss": 1.4162, + "step": 79970 + }, + { + "epoch": 3.4324690500787507, + "grad_norm": 0.24956724047660828, + "learning_rate": 6.313388510439972e-05, + "loss": 1.4138, + "step": 79980 + }, + { + "epoch": 3.433010991244263, + "grad_norm": 0.17035385966300964, + "learning_rate": 6.311830729093669e-05, + "loss": 1.4043, + "step": 79990 + }, + { + "epoch": 3.4335529324097753, + "grad_norm": 0.16698023676872253, + "learning_rate": 6.3102728471813e-05, + "loss": 1.4169, + "step": 80000 + }, + { + "epoch": 3.4340948735752876, + "grad_norm": 0.19518424570560455, + "learning_rate": 6.308714864895847e-05, + "loss": 1.4078, + "step": 80010 + }, + { + "epoch": 3.4346368147407995, + "grad_norm": 0.27307751774787903, + "learning_rate": 6.30715678243031e-05, + "loss": 1.4151, + "step": 80020 + }, + { + "epoch": 3.435178755906312, + "grad_norm": 0.2107393890619278, + "learning_rate": 6.305598599977694e-05, + "loss": 1.418, + "step": 80030 + }, + { + "epoch": 3.435720697071824, + "grad_norm": 0.20705653727054596, + "learning_rate": 6.304040317731022e-05, + "loss": 1.4223, + "step": 80040 + }, + { + "epoch": 3.435720697071824, + "eval_loss": 2.445282459259033, + "eval_runtime": 21.9862, + "eval_samples_per_second": 227.416, + "eval_steps_per_second": 1.228, + "step": 80040 + }, + { + "epoch": 3.4362626382373365, + "grad_norm": 0.1651361584663391, + "learning_rate": 6.302481935883325e-05, + "loss": 1.4108, + "step": 80050 + }, + { + "epoch": 3.436804579402849, + "grad_norm": 0.16168753802776337, + "learning_rate": 6.300923454627649e-05, + "loss": 1.4176, + "step": 80060 + }, + { + "epoch": 3.4373465205683607, + "grad_norm": 0.18073947727680206, + "learning_rate": 6.299364874157054e-05, + "loss": 1.3954, + "step": 80070 + }, + { + "epoch": 3.437888461733873, + "grad_norm": 0.2578413784503937, + "learning_rate": 6.297806194664609e-05, + "loss": 1.4147, + "step": 80080 + }, + { + "epoch": 3.4384304028993853, + "grad_norm": 0.16976307332515717, + "learning_rate": 6.296247416343396e-05, + "loss": 1.4134, + "step": 80090 + }, + { + "epoch": 3.4389723440648976, + "grad_norm": 0.16870634257793427, + "learning_rate": 6.29468853938651e-05, + "loss": 1.4163, + "step": 80100 + }, + { + "epoch": 3.43951428523041, + "grad_norm": 0.23185555636882782, + "learning_rate": 6.293129563987057e-05, + "loss": 1.4076, + "step": 80110 + }, + { + "epoch": 3.440056226395922, + "grad_norm": 0.15721189975738525, + "learning_rate": 6.291570490338159e-05, + "loss": 1.412, + "step": 80120 + }, + { + "epoch": 3.4404355852117803, + "eval_loss": 2.428380250930786, + "eval_runtime": 21.9892, + "eval_samples_per_second": 227.384, + "eval_steps_per_second": 1.228, + "step": 80127 + }, + { + "epoch": 3.440598167561434, + "grad_norm": 0.2671518623828888, + "learning_rate": 6.290011318632945e-05, + "loss": 1.4068, + "step": 80130 + }, + { + "epoch": 3.4411401087269464, + "grad_norm": 0.1867525279521942, + "learning_rate": 6.288452049064558e-05, + "loss": 1.4163, + "step": 80140 + }, + { + "epoch": 3.4416820498924587, + "grad_norm": 0.17865586280822754, + "learning_rate": 6.286892681826154e-05, + "loss": 1.409, + "step": 80150 + }, + { + "epoch": 3.4422239910579706, + "grad_norm": 0.19090549647808075, + "learning_rate": 6.285333217110901e-05, + "loss": 1.4124, + "step": 80160 + }, + { + "epoch": 3.442765932223483, + "grad_norm": 0.17862895131111145, + "learning_rate": 6.283773655111979e-05, + "loss": 1.4123, + "step": 80170 + }, + { + "epoch": 3.4433078733889952, + "grad_norm": 0.17863786220550537, + "learning_rate": 6.28221399602258e-05, + "loss": 1.4109, + "step": 80180 + }, + { + "epoch": 3.4438498145545076, + "grad_norm": 0.18190784752368927, + "learning_rate": 6.280654240035906e-05, + "loss": 1.4188, + "step": 80190 + }, + { + "epoch": 3.4443917557200194, + "grad_norm": 0.21105967462062836, + "learning_rate": 6.279094387345173e-05, + "loss": 1.4069, + "step": 80200 + }, + { + "epoch": 3.4449336968855317, + "grad_norm": 0.24444468319416046, + "learning_rate": 6.277534438143612e-05, + "loss": 1.4072, + "step": 80210 + }, + { + "epoch": 3.445150473351737, + "eval_loss": 2.4381296634674072, + "eval_runtime": 21.9948, + "eval_samples_per_second": 227.326, + "eval_steps_per_second": 1.228, + "step": 80214 + }, + { + "epoch": 3.445475638051044, + "grad_norm": 0.16191159188747406, + "learning_rate": 6.27597439262446e-05, + "loss": 1.4072, + "step": 80220 + }, + { + "epoch": 3.4460175792165564, + "grad_norm": 0.19114407896995544, + "learning_rate": 6.27441425098097e-05, + "loss": 1.4165, + "step": 80230 + }, + { + "epoch": 3.4465595203820687, + "grad_norm": 0.18071117997169495, + "learning_rate": 6.272854013406403e-05, + "loss": 1.4204, + "step": 80240 + }, + { + "epoch": 3.4471014615475806, + "grad_norm": 0.2497524917125702, + "learning_rate": 6.271293680094037e-05, + "loss": 1.4179, + "step": 80250 + }, + { + "epoch": 3.447643402713093, + "grad_norm": 0.17332278192043304, + "learning_rate": 6.269733251237159e-05, + "loss": 1.4071, + "step": 80260 + }, + { + "epoch": 3.448185343878605, + "grad_norm": 0.24143235385417938, + "learning_rate": 6.268172727029065e-05, + "loss": 1.4087, + "step": 80270 + }, + { + "epoch": 3.4487272850441175, + "grad_norm": 0.18241146206855774, + "learning_rate": 6.266612107663072e-05, + "loss": 1.4128, + "step": 80280 + }, + { + "epoch": 3.44926922620963, + "grad_norm": 0.386482834815979, + "learning_rate": 6.265051393332498e-05, + "loss": 1.398, + "step": 80290 + }, + { + "epoch": 3.4498111673751417, + "grad_norm": 0.22998222708702087, + "learning_rate": 6.26349058423068e-05, + "loss": 1.4189, + "step": 80300 + }, + { + "epoch": 3.449865361491693, + "eval_loss": 2.4301164150238037, + "eval_runtime": 21.9925, + "eval_samples_per_second": 227.35, + "eval_steps_per_second": 1.228, + "step": 80301 + }, + { + "epoch": 3.450353108540654, + "grad_norm": 0.37004634737968445, + "learning_rate": 6.261929680550963e-05, + "loss": 1.4088, + "step": 80310 + }, + { + "epoch": 3.4508950497061663, + "grad_norm": 0.19532030820846558, + "learning_rate": 6.260368682486704e-05, + "loss": 1.4071, + "step": 80320 + }, + { + "epoch": 3.4514369908716787, + "grad_norm": 0.1866229772567749, + "learning_rate": 6.258807590231275e-05, + "loss": 1.4214, + "step": 80330 + }, + { + "epoch": 3.4519789320371905, + "grad_norm": 0.18280602991580963, + "learning_rate": 6.257246403978056e-05, + "loss": 1.4061, + "step": 80340 + }, + { + "epoch": 3.452520873202703, + "grad_norm": 0.2126053422689438, + "learning_rate": 6.255685123920437e-05, + "loss": 1.4077, + "step": 80350 + }, + { + "epoch": 3.453062814368215, + "grad_norm": 0.1738966852426529, + "learning_rate": 6.25412375025183e-05, + "loss": 1.4177, + "step": 80360 + }, + { + "epoch": 3.4536047555337275, + "grad_norm": 0.23334649205207825, + "learning_rate": 6.252562283165645e-05, + "loss": 1.412, + "step": 80370 + }, + { + "epoch": 3.4541466966992393, + "grad_norm": 0.20518024265766144, + "learning_rate": 6.251000722855312e-05, + "loss": 1.4076, + "step": 80380 + }, + { + "epoch": 3.4545802496316496, + "eval_loss": 2.419971227645874, + "eval_runtime": 22.0015, + "eval_samples_per_second": 227.257, + "eval_steps_per_second": 1.227, + "step": 80388 + }, + { + "epoch": 3.4546886378647517, + "grad_norm": 0.23325951397418976, + "learning_rate": 6.249439069514269e-05, + "loss": 1.4109, + "step": 80390 + }, + { + "epoch": 3.455230579030264, + "grad_norm": 0.20998698472976685, + "learning_rate": 6.247877323335967e-05, + "loss": 1.4102, + "step": 80400 + }, + { + "epoch": 3.4557725201957763, + "grad_norm": 0.17500971257686615, + "learning_rate": 6.246315484513873e-05, + "loss": 1.4089, + "step": 80410 + }, + { + "epoch": 3.4563144613612886, + "grad_norm": 0.19114546477794647, + "learning_rate": 6.244753553241455e-05, + "loss": 1.4072, + "step": 80420 + }, + { + "epoch": 3.4568564025268005, + "grad_norm": 0.28155219554901123, + "learning_rate": 6.243191529712199e-05, + "loss": 1.4249, + "step": 80430 + }, + { + "epoch": 3.457398343692313, + "grad_norm": 0.18383963406085968, + "learning_rate": 6.241629414119603e-05, + "loss": 1.4128, + "step": 80440 + }, + { + "epoch": 3.457940284857825, + "grad_norm": 0.2947709262371063, + "learning_rate": 6.240067206657177e-05, + "loss": 1.4164, + "step": 80450 + }, + { + "epoch": 3.4584822260233374, + "grad_norm": 0.26993468403816223, + "learning_rate": 6.238504907518437e-05, + "loss": 1.4158, + "step": 80460 + }, + { + "epoch": 3.4590241671888498, + "grad_norm": 0.1699138730764389, + "learning_rate": 6.236942516896915e-05, + "loss": 1.4134, + "step": 80470 + }, + { + "epoch": 3.4592951377716057, + "eval_loss": 2.409397840499878, + "eval_runtime": 21.9932, + "eval_samples_per_second": 227.343, + "eval_steps_per_second": 1.228, + "step": 80475 + }, + { + "epoch": 3.4595661083543616, + "grad_norm": 0.1866215318441391, + "learning_rate": 6.235380034986154e-05, + "loss": 1.4173, + "step": 80480 + }, + { + "epoch": 3.460108049519874, + "grad_norm": 0.1942782998085022, + "learning_rate": 6.233817461979707e-05, + "loss": 1.4136, + "step": 80490 + }, + { + "epoch": 3.4606499906853863, + "grad_norm": 0.23292642831802368, + "learning_rate": 6.23225479807114e-05, + "loss": 1.4177, + "step": 80500 + }, + { + "epoch": 3.4611919318508986, + "grad_norm": 0.19163425266742706, + "learning_rate": 6.230692043454027e-05, + "loss": 1.4147, + "step": 80510 + }, + { + "epoch": 3.461733873016411, + "grad_norm": 0.18934902548789978, + "learning_rate": 6.229129198321955e-05, + "loss": 1.4062, + "step": 80520 + }, + { + "epoch": 3.4622758141819228, + "grad_norm": 0.16462522745132446, + "learning_rate": 6.227566262868523e-05, + "loss": 1.419, + "step": 80530 + }, + { + "epoch": 3.462817755347435, + "grad_norm": 0.16511207818984985, + "learning_rate": 6.226003237287343e-05, + "loss": 1.4138, + "step": 80540 + }, + { + "epoch": 3.4633596965129474, + "grad_norm": 0.23615549504756927, + "learning_rate": 6.224440121772034e-05, + "loss": 1.4121, + "step": 80550 + }, + { + "epoch": 3.4639016376784597, + "grad_norm": 0.19872863590717316, + "learning_rate": 6.222876916516225e-05, + "loss": 1.4056, + "step": 80560 + }, + { + "epoch": 3.464010025911562, + "eval_loss": 2.413600444793701, + "eval_runtime": 21.9886, + "eval_samples_per_second": 227.391, + "eval_steps_per_second": 1.228, + "step": 80562 + }, + { + "epoch": 3.4644435788439716, + "grad_norm": 0.2045847475528717, + "learning_rate": 6.221313621713566e-05, + "loss": 1.4146, + "step": 80570 + }, + { + "epoch": 3.464985520009484, + "grad_norm": 0.3229454755783081, + "learning_rate": 6.219750237557704e-05, + "loss": 1.4039, + "step": 80580 + }, + { + "epoch": 3.465527461174996, + "grad_norm": 0.20860926806926727, + "learning_rate": 6.218186764242308e-05, + "loss": 1.414, + "step": 80590 + }, + { + "epoch": 3.4660694023405085, + "grad_norm": 0.22892901301383972, + "learning_rate": 6.216623201961054e-05, + "loss": 1.4056, + "step": 80600 + }, + { + "epoch": 3.4666113435060204, + "grad_norm": 0.20201945304870605, + "learning_rate": 6.215059550907632e-05, + "loss": 1.4123, + "step": 80610 + }, + { + "epoch": 3.4671532846715327, + "grad_norm": 0.21173055469989777, + "learning_rate": 6.213495811275733e-05, + "loss": 1.4081, + "step": 80620 + }, + { + "epoch": 3.467695225837045, + "grad_norm": 0.20124772191047668, + "learning_rate": 6.211931983259073e-05, + "loss": 1.4108, + "step": 80630 + }, + { + "epoch": 3.4682371670025574, + "grad_norm": 0.2871626019477844, + "learning_rate": 6.210368067051368e-05, + "loss": 1.4115, + "step": 80640 + }, + { + "epoch": 3.4687249140515184, + "eval_loss": 2.4178643226623535, + "eval_runtime": 21.9935, + "eval_samples_per_second": 227.34, + "eval_steps_per_second": 1.228, + "step": 80649 + }, + { + "epoch": 3.4687791081680697, + "grad_norm": 0.17180606722831726, + "learning_rate": 6.208804062846353e-05, + "loss": 1.4096, + "step": 80650 + }, + { + "epoch": 3.4693210493335815, + "grad_norm": 0.250835657119751, + "learning_rate": 6.207239970837767e-05, + "loss": 1.4025, + "step": 80660 + }, + { + "epoch": 3.469862990499094, + "grad_norm": 0.20965339243412018, + "learning_rate": 6.205675791219365e-05, + "loss": 1.423, + "step": 80670 + }, + { + "epoch": 3.470404931664606, + "grad_norm": 0.1662900745868683, + "learning_rate": 6.204111524184907e-05, + "loss": 1.406, + "step": 80680 + }, + { + "epoch": 3.4709468728301185, + "grad_norm": 0.16801480948925018, + "learning_rate": 6.202547169928173e-05, + "loss": 1.4126, + "step": 80690 + }, + { + "epoch": 3.471488813995631, + "grad_norm": 0.1683754026889801, + "learning_rate": 6.200982728642945e-05, + "loss": 1.4136, + "step": 80700 + }, + { + "epoch": 3.4720307551611427, + "grad_norm": 0.17398451268672943, + "learning_rate": 6.19941820052302e-05, + "loss": 1.396, + "step": 80710 + }, + { + "epoch": 3.472572696326655, + "grad_norm": 0.25240159034729004, + "learning_rate": 6.197853585762204e-05, + "loss": 1.4273, + "step": 80720 + }, + { + "epoch": 3.4731146374921673, + "grad_norm": 0.2683483064174652, + "learning_rate": 6.196288884554315e-05, + "loss": 1.4114, + "step": 80730 + }, + { + "epoch": 3.4734398021914745, + "eval_loss": 2.4308483600616455, + "eval_runtime": 21.9921, + "eval_samples_per_second": 227.355, + "eval_steps_per_second": 1.228, + "step": 80736 + }, + { + "epoch": 3.4736565786576796, + "grad_norm": 0.1892116814851761, + "learning_rate": 6.194724097093184e-05, + "loss": 1.4226, + "step": 80740 + }, + { + "epoch": 3.474198519823192, + "grad_norm": 0.22284260392189026, + "learning_rate": 6.193159223572647e-05, + "loss": 1.4117, + "step": 80750 + }, + { + "epoch": 3.474740460988704, + "grad_norm": 0.32861995697021484, + "learning_rate": 6.191594264186556e-05, + "loss": 1.4114, + "step": 80760 + }, + { + "epoch": 3.475282402154216, + "grad_norm": 0.16212435066699982, + "learning_rate": 6.190029219128769e-05, + "loss": 1.4031, + "step": 80770 + }, + { + "epoch": 3.4758243433197284, + "grad_norm": 0.16907572746276855, + "learning_rate": 6.188464088593157e-05, + "loss": 1.4082, + "step": 80780 + }, + { + "epoch": 3.4763662844852408, + "grad_norm": 0.16995957493782043, + "learning_rate": 6.186898872773605e-05, + "loss": 1.4242, + "step": 80790 + }, + { + "epoch": 3.4769082256507526, + "grad_norm": 0.21154280006885529, + "learning_rate": 6.185333571864001e-05, + "loss": 1.4095, + "step": 80800 + }, + { + "epoch": 3.477450166816265, + "grad_norm": 0.22190575301647186, + "learning_rate": 6.183768186058252e-05, + "loss": 1.4173, + "step": 80810 + }, + { + "epoch": 3.4779921079817773, + "grad_norm": 0.1600603610277176, + "learning_rate": 6.182202715550266e-05, + "loss": 1.4161, + "step": 80820 + }, + { + "epoch": 3.478154690331431, + "eval_loss": 2.4302024841308594, + "eval_runtime": 21.996, + "eval_samples_per_second": 227.314, + "eval_steps_per_second": 1.227, + "step": 80823 + }, + { + "epoch": 3.4785340491472896, + "grad_norm": 0.15820740163326263, + "learning_rate": 6.18063716053397e-05, + "loss": 1.4155, + "step": 80830 + }, + { + "epoch": 3.4790759903128015, + "grad_norm": 0.20332497358322144, + "learning_rate": 6.179071521203299e-05, + "loss": 1.4101, + "step": 80840 + }, + { + "epoch": 3.4796179314783138, + "grad_norm": 0.23405461013317108, + "learning_rate": 6.177505797752195e-05, + "loss": 1.4245, + "step": 80850 + }, + { + "epoch": 3.480159872643826, + "grad_norm": 0.36314642429351807, + "learning_rate": 6.175939990374613e-05, + "loss": 1.415, + "step": 80860 + }, + { + "epoch": 3.4807018138093384, + "grad_norm": 0.3759239614009857, + "learning_rate": 6.174374099264522e-05, + "loss": 1.4123, + "step": 80870 + }, + { + "epoch": 3.4812437549748507, + "grad_norm": 0.18687903881072998, + "learning_rate": 6.172808124615895e-05, + "loss": 1.4108, + "step": 80880 + }, + { + "epoch": 3.4817856961403626, + "grad_norm": 0.27162638306617737, + "learning_rate": 6.171242066622718e-05, + "loss": 1.4069, + "step": 80890 + }, + { + "epoch": 3.482327637305875, + "grad_norm": 0.16891661286354065, + "learning_rate": 6.16967592547899e-05, + "loss": 1.4246, + "step": 80900 + }, + { + "epoch": 3.4828695784713872, + "grad_norm": 0.24053658545017242, + "learning_rate": 6.168109701378712e-05, + "loss": 1.4098, + "step": 80910 + }, + { + "epoch": 3.4828695784713872, + "eval_loss": 2.433274269104004, + "eval_runtime": 22.0653, + "eval_samples_per_second": 226.601, + "eval_steps_per_second": 1.224, + "step": 80910 + }, + { + "epoch": 3.4834115196368995, + "grad_norm": 0.16287986934185028, + "learning_rate": 6.166543394515906e-05, + "loss": 1.4179, + "step": 80920 + }, + { + "epoch": 3.483953460802412, + "grad_norm": 0.2310248166322708, + "learning_rate": 6.1649770050846e-05, + "loss": 1.4039, + "step": 80930 + }, + { + "epoch": 3.4844954019679237, + "grad_norm": 0.1746436357498169, + "learning_rate": 6.16341053327883e-05, + "loss": 1.4139, + "step": 80940 + }, + { + "epoch": 3.485037343133436, + "grad_norm": 0.17406900227069855, + "learning_rate": 6.16184397929264e-05, + "loss": 1.4117, + "step": 80950 + }, + { + "epoch": 3.4855792842989484, + "grad_norm": 0.17554210126399994, + "learning_rate": 6.160277343320095e-05, + "loss": 1.4133, + "step": 80960 + }, + { + "epoch": 3.4861212254644607, + "grad_norm": 0.1602802872657776, + "learning_rate": 6.158710625555257e-05, + "loss": 1.4056, + "step": 80970 + }, + { + "epoch": 3.4866631666299726, + "grad_norm": 0.19458161294460297, + "learning_rate": 6.157143826192207e-05, + "loss": 1.4136, + "step": 80980 + }, + { + "epoch": 3.487205107795485, + "grad_norm": 0.23610615730285645, + "learning_rate": 6.155576945425032e-05, + "loss": 1.414, + "step": 80990 + }, + { + "epoch": 3.4875844666113434, + "eval_loss": 2.429168701171875, + "eval_runtime": 22.2262, + "eval_samples_per_second": 224.96, + "eval_steps_per_second": 1.215, + "step": 80997 + }, + { + "epoch": 3.487747048960997, + "grad_norm": 0.18065819144248962, + "learning_rate": 6.154009983447834e-05, + "loss": 1.4076, + "step": 81000 + }, + { + "epoch": 4.000541941165512, + "grad_norm": 0.2180778533220291, + "learning_rate": 6.152442940454717e-05, + "loss": 1.3974, + "step": 81010 + }, + { + "epoch": 4.001083882331025, + "grad_norm": 0.20014463365077972, + "learning_rate": 6.1508758166398e-05, + "loss": 1.4054, + "step": 81020 + }, + { + "epoch": 4.0016258234965365, + "grad_norm": 0.1851683109998703, + "learning_rate": 6.149308612197213e-05, + "loss": 1.4141, + "step": 81030 + }, + { + "epoch": 4.002167764662049, + "grad_norm": 0.1777936816215515, + "learning_rate": 6.147741327321095e-05, + "loss": 1.4072, + "step": 81040 + }, + { + "epoch": 4.002709705827561, + "grad_norm": 0.17153799533843994, + "learning_rate": 6.146173962205594e-05, + "loss": 1.4019, + "step": 81050 + }, + { + "epoch": 4.003251646993073, + "grad_norm": 0.18811866641044617, + "learning_rate": 6.144606517044869e-05, + "loss": 1.4112, + "step": 81060 + }, + { + "epoch": 4.003793588158586, + "grad_norm": 0.23102541267871857, + "learning_rate": 6.143038992033083e-05, + "loss": 1.4195, + "step": 81070 + }, + { + "epoch": 4.004335529324098, + "grad_norm": 0.2683873176574707, + "learning_rate": 6.141471387364423e-05, + "loss": 1.3947, + "step": 81080 + }, + { + "epoch": 4.004552305790303, + "eval_loss": 2.4371650218963623, + "eval_runtime": 22.4585, + "eval_samples_per_second": 222.633, + "eval_steps_per_second": 1.202, + "step": 81084 + }, + { + "epoch": 4.00487747048961, + "grad_norm": 0.15493124723434448, + "learning_rate": 6.139903703233069e-05, + "loss": 1.4126, + "step": 81090 + }, + { + "epoch": 4.005419411655122, + "grad_norm": 0.17667770385742188, + "learning_rate": 6.138335939833225e-05, + "loss": 1.403, + "step": 81100 + }, + { + "epoch": 4.005961352820634, + "grad_norm": 0.1961461454629898, + "learning_rate": 6.136768097359096e-05, + "loss": 1.4091, + "step": 81110 + }, + { + "epoch": 4.006503293986147, + "grad_norm": 0.15184424817562103, + "learning_rate": 6.135200176004897e-05, + "loss": 1.4149, + "step": 81120 + }, + { + "epoch": 4.007045235151659, + "grad_norm": 0.18130755424499512, + "learning_rate": 6.133632175964861e-05, + "loss": 1.4037, + "step": 81130 + }, + { + "epoch": 4.007587176317171, + "grad_norm": 0.1635865420103073, + "learning_rate": 6.132064097433222e-05, + "loss": 1.4111, + "step": 81140 + }, + { + "epoch": 4.008129117482683, + "grad_norm": 0.22441507875919342, + "learning_rate": 6.130495940604225e-05, + "loss": 1.4135, + "step": 81150 + }, + { + "epoch": 4.008671058648195, + "grad_norm": 0.30155861377716064, + "learning_rate": 6.128927705672129e-05, + "loss": 1.4027, + "step": 81160 + }, + { + "epoch": 4.009212999813708, + "grad_norm": 0.3162151873111725, + "learning_rate": 6.127359392831198e-05, + "loss": 1.4073, + "step": 81170 + }, + { + "epoch": 4.009267193930259, + "eval_loss": 2.4355599880218506, + "eval_runtime": 21.9746, + "eval_samples_per_second": 227.536, + "eval_steps_per_second": 1.229, + "step": 81171 + }, + { + "epoch": 4.00975494097922, + "grad_norm": 0.22945579886436462, + "learning_rate": 6.12579100227571e-05, + "loss": 1.4077, + "step": 81180 + }, + { + "epoch": 4.010296882144732, + "grad_norm": 0.25422438979148865, + "learning_rate": 6.124222534199952e-05, + "loss": 1.4, + "step": 81190 + }, + { + "epoch": 4.0108388233102445, + "grad_norm": 0.21404403448104858, + "learning_rate": 6.122653988798214e-05, + "loss": 1.4092, + "step": 81200 + }, + { + "epoch": 4.011380764475756, + "grad_norm": 0.17690123617649078, + "learning_rate": 6.121085366264802e-05, + "loss": 1.4084, + "step": 81210 + }, + { + "epoch": 4.011922705641269, + "grad_norm": 0.18206678330898285, + "learning_rate": 6.119516666794034e-05, + "loss": 1.4086, + "step": 81220 + }, + { + "epoch": 4.012464646806781, + "grad_norm": 0.20577572286128998, + "learning_rate": 6.11794789058023e-05, + "loss": 1.4019, + "step": 81230 + }, + { + "epoch": 4.013006587972293, + "grad_norm": 0.16592557728290558, + "learning_rate": 6.116379037817724e-05, + "loss": 1.4091, + "step": 81240 + }, + { + "epoch": 4.013548529137806, + "grad_norm": 0.1908210813999176, + "learning_rate": 6.114810108700857e-05, + "loss": 1.4164, + "step": 81250 + }, + { + "epoch": 4.013982082070215, + "eval_loss": 2.4322612285614014, + "eval_runtime": 21.9747, + "eval_samples_per_second": 227.534, + "eval_steps_per_second": 1.229, + "step": 81258 + }, + { + "epoch": 4.014090470303318, + "grad_norm": 0.1619943529367447, + "learning_rate": 6.113241103423986e-05, + "loss": 1.4089, + "step": 81260 + }, + { + "epoch": 4.01463241146883, + "grad_norm": 0.17974944412708282, + "learning_rate": 6.11167202218147e-05, + "loss": 1.4134, + "step": 81270 + }, + { + "epoch": 4.015174352634342, + "grad_norm": 0.1860845535993576, + "learning_rate": 6.110102865167677e-05, + "loss": 1.4062, + "step": 81280 + }, + { + "epoch": 4.015716293799854, + "grad_norm": 0.16427412629127502, + "learning_rate": 6.108533632576992e-05, + "loss": 1.3899, + "step": 81290 + }, + { + "epoch": 4.016258234965367, + "grad_norm": 0.17551249265670776, + "learning_rate": 6.106964324603801e-05, + "loss": 1.4103, + "step": 81300 + }, + { + "epoch": 4.016800176130879, + "grad_norm": 0.20582075417041779, + "learning_rate": 6.105394941442503e-05, + "loss": 1.4072, + "step": 81310 + }, + { + "epoch": 4.017342117296391, + "grad_norm": 0.1872161328792572, + "learning_rate": 6.10382548328751e-05, + "loss": 1.4127, + "step": 81320 + }, + { + "epoch": 4.017884058461903, + "grad_norm": 0.18607717752456665, + "learning_rate": 6.102255950333239e-05, + "loss": 1.4179, + "step": 81330 + }, + { + "epoch": 4.018425999627415, + "grad_norm": 0.16646206378936768, + "learning_rate": 6.100686342774115e-05, + "loss": 1.4067, + "step": 81340 + }, + { + "epoch": 4.018696970210171, + "eval_loss": 2.431658983230591, + "eval_runtime": 21.9738, + "eval_samples_per_second": 227.544, + "eval_steps_per_second": 1.229, + "step": 81345 + }, + { + "epoch": 4.018967940792928, + "grad_norm": 0.33718639612197876, + "learning_rate": 6.0991166608045745e-05, + "loss": 1.4178, + "step": 81350 + }, + { + "epoch": 4.01950988195844, + "grad_norm": 0.32710033655166626, + "learning_rate": 6.097546904619061e-05, + "loss": 1.4108, + "step": 81360 + }, + { + "epoch": 4.020051823123952, + "grad_norm": 0.21620182693004608, + "learning_rate": 6.095977074412034e-05, + "loss": 1.4062, + "step": 81370 + }, + { + "epoch": 4.0205937642894645, + "grad_norm": 0.2620053291320801, + "learning_rate": 6.0944071703779536e-05, + "loss": 1.4072, + "step": 81380 + }, + { + "epoch": 4.021135705454976, + "grad_norm": 0.16901114583015442, + "learning_rate": 6.092837192711294e-05, + "loss": 1.4053, + "step": 81390 + }, + { + "epoch": 4.021677646620489, + "grad_norm": 0.2617866098880768, + "learning_rate": 6.091267141606537e-05, + "loss": 1.4131, + "step": 81400 + }, + { + "epoch": 4.022219587786001, + "grad_norm": 0.218822181224823, + "learning_rate": 6.089697017258171e-05, + "loss": 1.4063, + "step": 81410 + }, + { + "epoch": 4.022761528951513, + "grad_norm": 0.167058527469635, + "learning_rate": 6.088126819860701e-05, + "loss": 1.4009, + "step": 81420 + }, + { + "epoch": 4.023303470117026, + "grad_norm": 0.1802741438150406, + "learning_rate": 6.086556549608634e-05, + "loss": 1.4077, + "step": 81430 + }, + { + "epoch": 4.023411858350128, + "eval_loss": 2.4219465255737305, + "eval_runtime": 21.9753, + "eval_samples_per_second": 227.528, + "eval_steps_per_second": 1.229, + "step": 81432 + }, + { + "epoch": 4.0238454112825375, + "grad_norm": 0.1666201651096344, + "learning_rate": 6.084986206696487e-05, + "loss": 1.4031, + "step": 81440 + }, + { + "epoch": 4.02438735244805, + "grad_norm": 0.19865860044956207, + "learning_rate": 6.08341579131879e-05, + "loss": 1.3978, + "step": 81450 + }, + { + "epoch": 4.024929293613562, + "grad_norm": 0.1948530673980713, + "learning_rate": 6.081845303670077e-05, + "loss": 1.4148, + "step": 81460 + }, + { + "epoch": 4.025471234779074, + "grad_norm": 0.1879536211490631, + "learning_rate": 6.0802747439448935e-05, + "loss": 1.411, + "step": 81470 + }, + { + "epoch": 4.026013175944587, + "grad_norm": 0.20297576487064362, + "learning_rate": 6.078704112337795e-05, + "loss": 1.406, + "step": 81480 + }, + { + "epoch": 4.026555117110099, + "grad_norm": 0.17799319326877594, + "learning_rate": 6.077133409043342e-05, + "loss": 1.4034, + "step": 81490 + }, + { + "epoch": 4.027097058275611, + "grad_norm": 0.23970834910869598, + "learning_rate": 6.075562634256109e-05, + "loss": 1.4028, + "step": 81500 + }, + { + "epoch": 4.027638999441123, + "grad_norm": 0.28241363167762756, + "learning_rate": 6.073991788170675e-05, + "loss": 1.4104, + "step": 81510 + }, + { + "epoch": 4.028126746490084, + "eval_loss": 2.4167044162750244, + "eval_runtime": 21.9669, + "eval_samples_per_second": 227.615, + "eval_steps_per_second": 1.229, + "step": 81519 + }, + { + "epoch": 4.028180940606635, + "grad_norm": 0.22441983222961426, + "learning_rate": 6.072420870981631e-05, + "loss": 1.4092, + "step": 81520 + }, + { + "epoch": 4.028722881772148, + "grad_norm": 0.2237686812877655, + "learning_rate": 6.070849882883576e-05, + "loss": 1.4094, + "step": 81530 + }, + { + "epoch": 4.02926482293766, + "grad_norm": 0.17908205091953278, + "learning_rate": 6.069278824071114e-05, + "loss": 1.4032, + "step": 81540 + }, + { + "epoch": 4.029806764103172, + "grad_norm": 0.18530701100826263, + "learning_rate": 6.0677076947388635e-05, + "loss": 1.4038, + "step": 81550 + }, + { + "epoch": 4.030348705268684, + "grad_norm": 0.17356233298778534, + "learning_rate": 6.066136495081448e-05, + "loss": 1.4042, + "step": 81560 + }, + { + "epoch": 4.030890646434196, + "grad_norm": 0.1672087162733078, + "learning_rate": 6.0645652252935005e-05, + "loss": 1.408, + "step": 81570 + }, + { + "epoch": 4.031432587599709, + "grad_norm": 0.24931927025318146, + "learning_rate": 6.0629938855696655e-05, + "loss": 1.4034, + "step": 81580 + }, + { + "epoch": 4.031974528765221, + "grad_norm": 0.297122061252594, + "learning_rate": 6.061422476104592e-05, + "loss": 1.4126, + "step": 81590 + }, + { + "epoch": 4.032516469930733, + "grad_norm": 0.1879911869764328, + "learning_rate": 6.0598509970929396e-05, + "loss": 1.4148, + "step": 81600 + }, + { + "epoch": 4.03284163463004, + "eval_loss": 2.4190409183502197, + "eval_runtime": 21.9863, + "eval_samples_per_second": 227.414, + "eval_steps_per_second": 1.228, + "step": 81606 + }, + { + "epoch": 4.0330584110962455, + "grad_norm": 0.16557785868644714, + "learning_rate": 6.058279448729376e-05, + "loss": 1.3992, + "step": 81610 + }, + { + "epoch": 4.033600352261757, + "grad_norm": 0.21484564244747162, + "learning_rate": 6.056707831208579e-05, + "loss": 1.4228, + "step": 81620 + }, + { + "epoch": 4.03414229342727, + "grad_norm": 0.21425080299377441, + "learning_rate": 6.055136144725232e-05, + "loss": 1.4058, + "step": 81630 + }, + { + "epoch": 4.034684234592782, + "grad_norm": 0.16896581649780273, + "learning_rate": 6.0535643894740304e-05, + "loss": 1.4159, + "step": 81640 + }, + { + "epoch": 4.035226175758294, + "grad_norm": 0.20105527341365814, + "learning_rate": 6.0519925656496746e-05, + "loss": 1.4198, + "step": 81650 + }, + { + "epoch": 4.035768116923807, + "grad_norm": 0.18627329170703888, + "learning_rate": 6.050420673446876e-05, + "loss": 1.3985, + "step": 81660 + }, + { + "epoch": 4.0363100580893185, + "grad_norm": 0.1602090746164322, + "learning_rate": 6.048848713060354e-05, + "loss": 1.4063, + "step": 81670 + }, + { + "epoch": 4.036851999254831, + "grad_norm": 0.27577728033065796, + "learning_rate": 6.0472766846848384e-05, + "loss": 1.4078, + "step": 81680 + }, + { + "epoch": 4.037393940420343, + "grad_norm": 0.24706071615219116, + "learning_rate": 6.045704588515062e-05, + "loss": 1.4088, + "step": 81690 + }, + { + "epoch": 4.0375565227699965, + "eval_loss": 2.4332613945007324, + "eval_runtime": 21.9896, + "eval_samples_per_second": 227.38, + "eval_steps_per_second": 1.228, + "step": 81693 + }, + { + "epoch": 4.037935881585855, + "grad_norm": 0.18013040721416473, + "learning_rate": 6.0441324247457686e-05, + "loss": 1.4039, + "step": 81700 + }, + { + "epoch": 4.038477822751368, + "grad_norm": 0.2291117012500763, + "learning_rate": 6.042560193571714e-05, + "loss": 1.4175, + "step": 81710 + }, + { + "epoch": 4.03901976391688, + "grad_norm": 0.24475926160812378, + "learning_rate": 6.04098789518766e-05, + "loss": 1.4029, + "step": 81720 + }, + { + "epoch": 4.039561705082392, + "grad_norm": 0.31280967593193054, + "learning_rate": 6.039415529788372e-05, + "loss": 1.4068, + "step": 81730 + }, + { + "epoch": 4.040103646247904, + "grad_norm": 0.274687796831131, + "learning_rate": 6.037843097568631e-05, + "loss": 1.4094, + "step": 81740 + }, + { + "epoch": 4.040645587413416, + "grad_norm": 0.1743050217628479, + "learning_rate": 6.036270598723222e-05, + "loss": 1.4009, + "step": 81750 + }, + { + "epoch": 4.041187528578929, + "grad_norm": 0.17927388846874237, + "learning_rate": 6.0346980334469386e-05, + "loss": 1.4118, + "step": 81760 + }, + { + "epoch": 4.041729469744441, + "grad_norm": 0.2120426744222641, + "learning_rate": 6.0331254019345864e-05, + "loss": 1.3993, + "step": 81770 + }, + { + "epoch": 4.042271410909953, + "grad_norm": 0.2474319189786911, + "learning_rate": 6.0315527043809726e-05, + "loss": 1.4069, + "step": 81780 + }, + { + "epoch": 4.042271410909953, + "eval_loss": 2.4317309856414795, + "eval_runtime": 21.9711, + "eval_samples_per_second": 227.572, + "eval_steps_per_second": 1.229, + "step": 81780 + }, + { + "epoch": 4.042813352075465, + "grad_norm": 0.22434458136558533, + "learning_rate": 6.0299799409809175e-05, + "loss": 1.4062, + "step": 81790 + }, + { + "epoch": 4.043355293240977, + "grad_norm": 0.16757360100746155, + "learning_rate": 6.028407111929248e-05, + "loss": 1.4082, + "step": 81800 + }, + { + "epoch": 4.04389723440649, + "grad_norm": 0.23129674792289734, + "learning_rate": 6.0268342174207994e-05, + "loss": 1.3953, + "step": 81810 + }, + { + "epoch": 4.044439175572002, + "grad_norm": 0.21264296770095825, + "learning_rate": 6.025261257650416e-05, + "loss": 1.4025, + "step": 81820 + }, + { + "epoch": 4.044981116737514, + "grad_norm": 0.20318235456943512, + "learning_rate": 6.0236882328129495e-05, + "loss": 1.4144, + "step": 81830 + }, + { + "epoch": 4.045523057903027, + "grad_norm": 0.15029390156269073, + "learning_rate": 6.022115143103256e-05, + "loss": 1.4105, + "step": 81840 + }, + { + "epoch": 4.0460649990685384, + "grad_norm": 0.23690800368785858, + "learning_rate": 6.020541988716207e-05, + "loss": 1.4113, + "step": 81850 + }, + { + "epoch": 4.046606940234051, + "grad_norm": 0.16659775376319885, + "learning_rate": 6.018968769846676e-05, + "loss": 1.4208, + "step": 81860 + }, + { + "epoch": 4.04698629904991, + "eval_loss": 2.436213731765747, + "eval_runtime": 21.9755, + "eval_samples_per_second": 227.526, + "eval_steps_per_second": 1.229, + "step": 81867 + }, + { + "epoch": 4.047148881399563, + "grad_norm": 0.25907376408576965, + "learning_rate": 6.017395486689547e-05, + "loss": 1.4063, + "step": 81870 + }, + { + "epoch": 4.047690822565075, + "grad_norm": 0.15979789197444916, + "learning_rate": 6.015822139439712e-05, + "loss": 1.4152, + "step": 81880 + }, + { + "epoch": 4.048232763730588, + "grad_norm": 0.1618310660123825, + "learning_rate": 6.014248728292068e-05, + "loss": 1.4019, + "step": 81890 + }, + { + "epoch": 4.0487747048961, + "grad_norm": 0.22831520438194275, + "learning_rate": 6.0126752534415255e-05, + "loss": 1.4113, + "step": 81900 + }, + { + "epoch": 4.049316646061612, + "grad_norm": 0.24901092052459717, + "learning_rate": 6.011101715082997e-05, + "loss": 1.4011, + "step": 81910 + }, + { + "epoch": 4.049858587227124, + "grad_norm": 0.18291537463665009, + "learning_rate": 6.009528113411409e-05, + "loss": 1.4053, + "step": 81920 + }, + { + "epoch": 4.050400528392636, + "grad_norm": 0.3508703410625458, + "learning_rate": 6.007954448621691e-05, + "loss": 1.4052, + "step": 81930 + }, + { + "epoch": 4.050942469558149, + "grad_norm": 0.22802935540676117, + "learning_rate": 6.0063807209087784e-05, + "loss": 1.4107, + "step": 81940 + }, + { + "epoch": 4.051484410723661, + "grad_norm": 0.3096991777420044, + "learning_rate": 6.004806930467621e-05, + "loss": 1.4167, + "step": 81950 + }, + { + "epoch": 4.051701187189866, + "eval_loss": 2.434500217437744, + "eval_runtime": 21.9752, + "eval_samples_per_second": 227.529, + "eval_steps_per_second": 1.229, + "step": 81954 + }, + { + "epoch": 4.052026351889173, + "grad_norm": 0.2774633765220642, + "learning_rate": 6.0032330774931754e-05, + "loss": 1.4139, + "step": 81960 + }, + { + "epoch": 4.052568293054685, + "grad_norm": 0.16032350063323975, + "learning_rate": 6.0016591621803986e-05, + "loss": 1.4121, + "step": 81970 + }, + { + "epoch": 4.053110234220197, + "grad_norm": 0.26607418060302734, + "learning_rate": 6.000085184724265e-05, + "loss": 1.4051, + "step": 81980 + }, + { + "epoch": 4.05365217538571, + "grad_norm": 0.26877331733703613, + "learning_rate": 5.998511145319748e-05, + "loss": 1.4075, + "step": 81990 + }, + { + "epoch": 4.054194116551222, + "grad_norm": 0.18772627413272858, + "learning_rate": 5.996937044161835e-05, + "loss": 1.41, + "step": 82000 + }, + { + "epoch": 4.054736057716734, + "grad_norm": 0.17956435680389404, + "learning_rate": 5.9953628814455184e-05, + "loss": 1.4223, + "step": 82010 + }, + { + "epoch": 4.0552779988822465, + "grad_norm": 0.38991832733154297, + "learning_rate": 5.9937886573657986e-05, + "loss": 1.4082, + "step": 82020 + }, + { + "epoch": 4.055819940047758, + "grad_norm": 0.20310132205486298, + "learning_rate": 5.9922143721176846e-05, + "loss": 1.4081, + "step": 82030 + }, + { + "epoch": 4.056361881213271, + "grad_norm": 0.3149273693561554, + "learning_rate": 5.990640025896189e-05, + "loss": 1.4048, + "step": 82040 + }, + { + "epoch": 4.056416075329822, + "eval_loss": 2.435964822769165, + "eval_runtime": 21.972, + "eval_samples_per_second": 227.562, + "eval_steps_per_second": 1.229, + "step": 82041 + }, + { + "epoch": 4.056903822378783, + "grad_norm": 0.16741731762886047, + "learning_rate": 5.989065618896339e-05, + "loss": 1.4019, + "step": 82050 + }, + { + "epoch": 4.057445763544295, + "grad_norm": 0.19802165031433105, + "learning_rate": 5.987491151313164e-05, + "loss": 1.4076, + "step": 82060 + }, + { + "epoch": 4.057987704709808, + "grad_norm": 0.17228524386882782, + "learning_rate": 5.9859166233417016e-05, + "loss": 1.4031, + "step": 82070 + }, + { + "epoch": 4.0585296458753195, + "grad_norm": 0.21210512518882751, + "learning_rate": 5.984342035176996e-05, + "loss": 1.4075, + "step": 82080 + }, + { + "epoch": 4.059071587040832, + "grad_norm": 0.2215586155653, + "learning_rate": 5.982767387014102e-05, + "loss": 1.3966, + "step": 82090 + }, + { + "epoch": 4.059613528206344, + "grad_norm": 0.19042372703552246, + "learning_rate": 5.98119267904808e-05, + "loss": 1.4053, + "step": 82100 + }, + { + "epoch": 4.060155469371856, + "grad_norm": 0.2051682025194168, + "learning_rate": 5.979617911473999e-05, + "loss": 1.4144, + "step": 82110 + }, + { + "epoch": 4.060697410537369, + "grad_norm": 0.20389145612716675, + "learning_rate": 5.978043084486934e-05, + "loss": 1.4136, + "step": 82120 + }, + { + "epoch": 4.061130963469778, + "eval_loss": 2.4284167289733887, + "eval_runtime": 21.9746, + "eval_samples_per_second": 227.536, + "eval_steps_per_second": 1.229, + "step": 82128 + }, + { + "epoch": 4.061239351702881, + "grad_norm": 0.18237143754959106, + "learning_rate": 5.9764681982819656e-05, + "loss": 1.4024, + "step": 82130 + }, + { + "epoch": 4.061781292868393, + "grad_norm": 0.17889219522476196, + "learning_rate": 5.974893253054186e-05, + "loss": 1.4037, + "step": 82140 + }, + { + "epoch": 4.062323234033905, + "grad_norm": 0.23920495808124542, + "learning_rate": 5.973318248998693e-05, + "loss": 1.4037, + "step": 82150 + }, + { + "epoch": 4.062865175199417, + "grad_norm": 0.18914079666137695, + "learning_rate": 5.971743186310589e-05, + "loss": 1.41, + "step": 82160 + }, + { + "epoch": 4.06340711636493, + "grad_norm": 0.21431829035282135, + "learning_rate": 5.970168065184987e-05, + "loss": 1.4089, + "step": 82170 + }, + { + "epoch": 4.063949057530442, + "grad_norm": 0.16956184804439545, + "learning_rate": 5.968592885817007e-05, + "loss": 1.409, + "step": 82180 + }, + { + "epoch": 4.064490998695954, + "grad_norm": 0.23388327658176422, + "learning_rate": 5.967017648401775e-05, + "loss": 1.4165, + "step": 82190 + }, + { + "epoch": 4.065032939861466, + "grad_norm": 0.22132398188114166, + "learning_rate": 5.965442353134424e-05, + "loss": 1.3967, + "step": 82200 + }, + { + "epoch": 4.065574881026978, + "grad_norm": 0.32104483246803284, + "learning_rate": 5.963867000210094e-05, + "loss": 1.4003, + "step": 82210 + }, + { + "epoch": 4.065845851609734, + "eval_loss": 2.434609889984131, + "eval_runtime": 21.9734, + "eval_samples_per_second": 227.548, + "eval_steps_per_second": 1.229, + "step": 82215 + }, + { + "epoch": 4.066116822192491, + "grad_norm": 0.27653661370277405, + "learning_rate": 5.962291589823935e-05, + "loss": 1.4099, + "step": 82220 + }, + { + "epoch": 4.066658763358003, + "grad_norm": 0.18210026621818542, + "learning_rate": 5.960716122171102e-05, + "loss": 1.4126, + "step": 82230 + }, + { + "epoch": 4.067200704523515, + "grad_norm": 0.20089492201805115, + "learning_rate": 5.959140597446753e-05, + "loss": 1.4042, + "step": 82240 + }, + { + "epoch": 4.0677426456890275, + "grad_norm": 0.20778271555900574, + "learning_rate": 5.957565015846063e-05, + "loss": 1.4037, + "step": 82250 + }, + { + "epoch": 4.068284586854539, + "grad_norm": 0.31842777132987976, + "learning_rate": 5.955989377564203e-05, + "loss": 1.4034, + "step": 82260 + }, + { + "epoch": 4.068826528020052, + "grad_norm": 0.1979900300502777, + "learning_rate": 5.95441368279636e-05, + "loss": 1.391, + "step": 82270 + }, + { + "epoch": 4.069368469185564, + "grad_norm": 0.1853083223104477, + "learning_rate": 5.952837931737723e-05, + "loss": 1.417, + "step": 82280 + }, + { + "epoch": 4.069910410351076, + "grad_norm": 0.18772120773792267, + "learning_rate": 5.951262124583489e-05, + "loss": 1.4061, + "step": 82290 + }, + { + "epoch": 4.070452351516589, + "grad_norm": 0.207114115357399, + "learning_rate": 5.9496862615288615e-05, + "loss": 1.4112, + "step": 82300 + }, + { + "epoch": 4.070560739749691, + "eval_loss": 2.4208648204803467, + "eval_runtime": 21.9732, + "eval_samples_per_second": 227.549, + "eval_steps_per_second": 1.229, + "step": 82302 + }, + { + "epoch": 4.070994292682101, + "grad_norm": 0.21243199706077576, + "learning_rate": 5.948110342769054e-05, + "loss": 1.4098, + "step": 82310 + }, + { + "epoch": 4.071536233847613, + "grad_norm": 0.18174535036087036, + "learning_rate": 5.946534368499281e-05, + "loss": 1.3921, + "step": 82320 + }, + { + "epoch": 4.072078175013125, + "grad_norm": 0.195078507065773, + "learning_rate": 5.944958338914769e-05, + "loss": 1.3993, + "step": 82330 + }, + { + "epoch": 4.072620116178637, + "grad_norm": 0.1927531659603119, + "learning_rate": 5.943382254210751e-05, + "loss": 1.4068, + "step": 82340 + }, + { + "epoch": 4.07316205734415, + "grad_norm": 0.3011392652988434, + "learning_rate": 5.941806114582464e-05, + "loss": 1.4216, + "step": 82350 + }, + { + "epoch": 4.073703998509662, + "grad_norm": 0.22039276361465454, + "learning_rate": 5.940229920225154e-05, + "loss": 1.4045, + "step": 82360 + }, + { + "epoch": 4.0742459396751745, + "grad_norm": 0.1818740963935852, + "learning_rate": 5.9386536713340726e-05, + "loss": 1.4074, + "step": 82370 + }, + { + "epoch": 4.074787880840686, + "grad_norm": 0.1722990870475769, + "learning_rate": 5.9370773681044776e-05, + "loss": 1.4028, + "step": 82380 + }, + { + "epoch": 4.075275627889647, + "eval_loss": 2.426165819168091, + "eval_runtime": 21.9727, + "eval_samples_per_second": 227.555, + "eval_steps_per_second": 1.229, + "step": 82389 + }, + { + "epoch": 4.075329822006198, + "grad_norm": 0.17619258165359497, + "learning_rate": 5.935501010731637e-05, + "loss": 1.4043, + "step": 82390 + }, + { + "epoch": 4.075871763171711, + "grad_norm": 0.1646595150232315, + "learning_rate": 5.93392459941082e-05, + "loss": 1.4092, + "step": 82400 + }, + { + "epoch": 4.076413704337223, + "grad_norm": 0.2252381443977356, + "learning_rate": 5.932348134337311e-05, + "loss": 1.4022, + "step": 82410 + }, + { + "epoch": 4.076955645502735, + "grad_norm": 0.23596051335334778, + "learning_rate": 5.9307716157063895e-05, + "loss": 1.4146, + "step": 82420 + }, + { + "epoch": 4.0774975866682475, + "grad_norm": 0.20619599521160126, + "learning_rate": 5.9291950437133515e-05, + "loss": 1.3984, + "step": 82430 + }, + { + "epoch": 4.078039527833759, + "grad_norm": 0.2613559663295746, + "learning_rate": 5.927618418553495e-05, + "loss": 1.4036, + "step": 82440 + }, + { + "epoch": 4.078581468999272, + "grad_norm": 0.26218506693840027, + "learning_rate": 5.9260417404221245e-05, + "loss": 1.4058, + "step": 82450 + }, + { + "epoch": 4.079123410164784, + "grad_norm": 0.17391842603683472, + "learning_rate": 5.924465009514554e-05, + "loss": 1.4002, + "step": 82460 + }, + { + "epoch": 4.079665351330296, + "grad_norm": 0.2079029232263565, + "learning_rate": 5.9228882260261e-05, + "loss": 1.4079, + "step": 82470 + }, + { + "epoch": 4.0799905160296035, + "eval_loss": 2.4345686435699463, + "eval_runtime": 21.9671, + "eval_samples_per_second": 227.614, + "eval_steps_per_second": 1.229, + "step": 82476 + }, + { + "epoch": 4.080207292495809, + "grad_norm": 0.16714847087860107, + "learning_rate": 5.9213113901520875e-05, + "loss": 1.4065, + "step": 82480 + }, + { + "epoch": 4.0807492336613205, + "grad_norm": 0.19766834378242493, + "learning_rate": 5.9197345020878515e-05, + "loss": 1.4094, + "step": 82490 + }, + { + "epoch": 4.081291174826833, + "grad_norm": 0.26485520601272583, + "learning_rate": 5.918157562028726e-05, + "loss": 1.4029, + "step": 82500 + }, + { + "epoch": 4.081833115992345, + "grad_norm": 0.25532785058021545, + "learning_rate": 5.9165805701700595e-05, + "loss": 1.3951, + "step": 82510 + }, + { + "epoch": 4.082375057157857, + "grad_norm": 0.28087395429611206, + "learning_rate": 5.915003526707198e-05, + "loss": 1.4069, + "step": 82520 + }, + { + "epoch": 4.08291699832337, + "grad_norm": 0.306506872177124, + "learning_rate": 5.9134264318355025e-05, + "loss": 1.3996, + "step": 82530 + }, + { + "epoch": 4.083458939488882, + "grad_norm": 0.2110404223203659, + "learning_rate": 5.911849285750335e-05, + "loss": 1.418, + "step": 82540 + }, + { + "epoch": 4.084000880654394, + "grad_norm": 0.22511455416679382, + "learning_rate": 5.910272088647067e-05, + "loss": 1.4133, + "step": 82550 + }, + { + "epoch": 4.084542821819906, + "grad_norm": 0.17213839292526245, + "learning_rate": 5.9086948407210727e-05, + "loss": 1.4103, + "step": 82560 + }, + { + "epoch": 4.08470540416956, + "eval_loss": 2.431107521057129, + "eval_runtime": 21.9786, + "eval_samples_per_second": 227.494, + "eval_steps_per_second": 1.228, + "step": 82563 + }, + { + "epoch": 4.085084762985418, + "grad_norm": 0.26532599329948425, + "learning_rate": 5.907117542167737e-05, + "loss": 1.4042, + "step": 82570 + }, + { + "epoch": 4.085626704150931, + "grad_norm": 0.16096952557563782, + "learning_rate": 5.905540193182446e-05, + "loss": 1.4038, + "step": 82580 + }, + { + "epoch": 4.086168645316443, + "grad_norm": 0.1794336587190628, + "learning_rate": 5.903962793960599e-05, + "loss": 1.4073, + "step": 82590 + }, + { + "epoch": 4.086710586481955, + "grad_norm": 0.17377042770385742, + "learning_rate": 5.902385344697594e-05, + "loss": 1.3989, + "step": 82600 + }, + { + "epoch": 4.087252527647467, + "grad_norm": 0.16201172769069672, + "learning_rate": 5.9008078455888394e-05, + "loss": 1.3959, + "step": 82610 + }, + { + "epoch": 4.087794468812979, + "grad_norm": 0.27610713243484497, + "learning_rate": 5.899230296829748e-05, + "loss": 1.3948, + "step": 82620 + }, + { + "epoch": 4.088336409978492, + "grad_norm": 0.25443559885025024, + "learning_rate": 5.897652698615741e-05, + "loss": 1.396, + "step": 82630 + }, + { + "epoch": 4.088878351144004, + "grad_norm": 0.24116799235343933, + "learning_rate": 5.896075051142246e-05, + "loss": 1.404, + "step": 82640 + }, + { + "epoch": 4.089420292309516, + "grad_norm": 0.20483553409576416, + "learning_rate": 5.894497354604692e-05, + "loss": 1.405, + "step": 82650 + }, + { + "epoch": 4.089420292309516, + "eval_loss": 2.4280037879943848, + "eval_runtime": 21.9762, + "eval_samples_per_second": 227.519, + "eval_steps_per_second": 1.229, + "step": 82650 + }, + { + "epoch": 4.0899622334750285, + "grad_norm": 0.26821398735046387, + "learning_rate": 5.892919609198517e-05, + "loss": 1.4138, + "step": 82660 + }, + { + "epoch": 4.09050417464054, + "grad_norm": 0.1844187080860138, + "learning_rate": 5.891341815119168e-05, + "loss": 1.4063, + "step": 82670 + }, + { + "epoch": 4.091046115806053, + "grad_norm": 0.3293144702911377, + "learning_rate": 5.8897639725620956e-05, + "loss": 1.4092, + "step": 82680 + }, + { + "epoch": 4.091588056971565, + "grad_norm": 0.20657339692115784, + "learning_rate": 5.888186081722752e-05, + "loss": 1.3962, + "step": 82690 + }, + { + "epoch": 4.092129998137077, + "grad_norm": 0.16567468643188477, + "learning_rate": 5.8866081427966036e-05, + "loss": 1.4108, + "step": 82700 + }, + { + "epoch": 4.09267193930259, + "grad_norm": 0.18625999987125397, + "learning_rate": 5.885030155979116e-05, + "loss": 1.4004, + "step": 82710 + }, + { + "epoch": 4.0932138804681015, + "grad_norm": 0.16223831474781036, + "learning_rate": 5.8834521214657635e-05, + "loss": 1.4017, + "step": 82720 + }, + { + "epoch": 4.093755821633614, + "grad_norm": 0.1961243897676468, + "learning_rate": 5.881874039452029e-05, + "loss": 1.4039, + "step": 82730 + }, + { + "epoch": 4.094135180449473, + "eval_loss": 2.4336957931518555, + "eval_runtime": 21.6769, + "eval_samples_per_second": 230.66, + "eval_steps_per_second": 1.246, + "step": 82737 + }, + { + "epoch": 4.094297762799126, + "grad_norm": 0.20036591589450836, + "learning_rate": 5.880295910133394e-05, + "loss": 1.4055, + "step": 82740 + }, + { + "epoch": 4.094839703964638, + "grad_norm": 0.20665618777275085, + "learning_rate": 5.8787177337053555e-05, + "loss": 1.4043, + "step": 82750 + }, + { + "epoch": 4.095381645130151, + "grad_norm": 0.18086576461791992, + "learning_rate": 5.8771395103634065e-05, + "loss": 1.41, + "step": 82760 + }, + { + "epoch": 4.095923586295663, + "grad_norm": 0.15253938734531403, + "learning_rate": 5.8755612403030524e-05, + "loss": 1.4057, + "step": 82770 + }, + { + "epoch": 4.096465527461175, + "grad_norm": 0.17018210887908936, + "learning_rate": 5.873982923719804e-05, + "loss": 1.4059, + "step": 82780 + }, + { + "epoch": 4.097007468626687, + "grad_norm": 0.1573159396648407, + "learning_rate": 5.872404560809173e-05, + "loss": 1.4166, + "step": 82790 + }, + { + "epoch": 4.097549409792199, + "grad_norm": 0.2046526074409485, + "learning_rate": 5.870826151766683e-05, + "loss": 1.4067, + "step": 82800 + }, + { + "epoch": 4.098091350957712, + "grad_norm": 0.1698278784751892, + "learning_rate": 5.869247696787857e-05, + "loss": 1.4116, + "step": 82810 + }, + { + "epoch": 4.098633292123224, + "grad_norm": 0.24143236875534058, + "learning_rate": 5.867669196068231e-05, + "loss": 1.3992, + "step": 82820 + }, + { + "epoch": 4.098850068589429, + "eval_loss": 2.4264564514160156, + "eval_runtime": 21.9736, + "eval_samples_per_second": 227.546, + "eval_steps_per_second": 1.229, + "step": 82824 + }, + { + "epoch": 4.099175233288736, + "grad_norm": 0.16382378339767456, + "learning_rate": 5.866090649803342e-05, + "loss": 1.4064, + "step": 82830 + }, + { + "epoch": 4.099717174454248, + "grad_norm": 0.18778187036514282, + "learning_rate": 5.864512058188733e-05, + "loss": 1.398, + "step": 82840 + }, + { + "epoch": 4.10025911561976, + "grad_norm": 0.1958286166191101, + "learning_rate": 5.862933421419952e-05, + "loss": 1.4076, + "step": 82850 + }, + { + "epoch": 4.100801056785273, + "grad_norm": 0.18923796713352203, + "learning_rate": 5.861354739692553e-05, + "loss": 1.4033, + "step": 82860 + }, + { + "epoch": 4.101342997950785, + "grad_norm": 0.155537948012352, + "learning_rate": 5.859776013202098e-05, + "loss": 1.4019, + "step": 82870 + }, + { + "epoch": 4.101884939116297, + "grad_norm": 0.2815153896808624, + "learning_rate": 5.858197242144155e-05, + "loss": 1.4076, + "step": 82880 + }, + { + "epoch": 4.10242688028181, + "grad_norm": 0.20242029428482056, + "learning_rate": 5.856618426714291e-05, + "loss": 1.3973, + "step": 82890 + }, + { + "epoch": 4.1029688214473214, + "grad_norm": 0.18118534982204437, + "learning_rate": 5.855039567108084e-05, + "loss": 1.4034, + "step": 82900 + }, + { + "epoch": 4.103510762612834, + "grad_norm": 0.18179315328598022, + "learning_rate": 5.853460663521117e-05, + "loss": 1.3943, + "step": 82910 + }, + { + "epoch": 4.103564956729385, + "eval_loss": 2.43251633644104, + "eval_runtime": 21.9701, + "eval_samples_per_second": 227.582, + "eval_steps_per_second": 1.229, + "step": 82911 + }, + { + "epoch": 4.104052703778346, + "grad_norm": 0.3180215358734131, + "learning_rate": 5.851881716148979e-05, + "loss": 1.3989, + "step": 82920 + }, + { + "epoch": 4.104594644943858, + "grad_norm": 0.22346888482570648, + "learning_rate": 5.850302725187261e-05, + "loss": 1.4041, + "step": 82930 + }, + { + "epoch": 4.105136586109371, + "grad_norm": 0.1704285740852356, + "learning_rate": 5.8487236908315635e-05, + "loss": 1.4034, + "step": 82940 + }, + { + "epoch": 4.105678527274883, + "grad_norm": 0.1817091554403305, + "learning_rate": 5.8471446132774864e-05, + "loss": 1.402, + "step": 82950 + }, + { + "epoch": 4.106220468440395, + "grad_norm": 0.16778716444969177, + "learning_rate": 5.8455654927206436e-05, + "loss": 1.399, + "step": 82960 + }, + { + "epoch": 4.106762409605907, + "grad_norm": 0.20462718605995178, + "learning_rate": 5.8439863293566476e-05, + "loss": 1.4, + "step": 82970 + }, + { + "epoch": 4.107304350771419, + "grad_norm": 0.16164131462574005, + "learning_rate": 5.842407123381118e-05, + "loss": 1.4022, + "step": 82980 + }, + { + "epoch": 4.107846291936932, + "grad_norm": 0.17389550805091858, + "learning_rate": 5.8408278749896816e-05, + "loss": 1.4129, + "step": 82990 + }, + { + "epoch": 4.108279844869341, + "eval_loss": 2.4385242462158203, + "eval_runtime": 21.9887, + "eval_samples_per_second": 227.39, + "eval_steps_per_second": 1.228, + "step": 82998 + }, + { + "epoch": 4.108388233102444, + "grad_norm": 0.3119494616985321, + "learning_rate": 5.8392485843779676e-05, + "loss": 1.41, + "step": 83000 + }, + { + "epoch": 4.108930174267956, + "grad_norm": 0.21673525869846344, + "learning_rate": 5.83766925174161e-05, + "loss": 1.4092, + "step": 83010 + }, + { + "epoch": 4.109472115433468, + "grad_norm": 0.17000888288021088, + "learning_rate": 5.836089877276254e-05, + "loss": 1.4118, + "step": 83020 + }, + { + "epoch": 4.11001405659898, + "grad_norm": 0.1890566647052765, + "learning_rate": 5.83451046117754e-05, + "loss": 1.4017, + "step": 83030 + }, + { + "epoch": 4.110555997764493, + "grad_norm": 0.21135641634464264, + "learning_rate": 5.832931003641127e-05, + "loss": 1.393, + "step": 83040 + }, + { + "epoch": 4.111097938930005, + "grad_norm": 0.15873472392559052, + "learning_rate": 5.8313515048626634e-05, + "loss": 1.4016, + "step": 83050 + }, + { + "epoch": 4.111639880095517, + "grad_norm": 0.24823595583438873, + "learning_rate": 5.8297719650378136e-05, + "loss": 1.41, + "step": 83060 + }, + { + "epoch": 4.1121818212610295, + "grad_norm": 0.18484194576740265, + "learning_rate": 5.828192384362245e-05, + "loss": 1.4021, + "step": 83070 + }, + { + "epoch": 4.112723762426541, + "grad_norm": 0.26047736406326294, + "learning_rate": 5.826612763031632e-05, + "loss": 1.4074, + "step": 83080 + }, + { + "epoch": 4.112994733009297, + "eval_loss": 2.4378349781036377, + "eval_runtime": 21.9749, + "eval_samples_per_second": 227.532, + "eval_steps_per_second": 1.229, + "step": 83085 + }, + { + "epoch": 4.113265703592054, + "grad_norm": 0.38496696949005127, + "learning_rate": 5.825033101241644e-05, + "loss": 1.4136, + "step": 83090 + }, + { + "epoch": 4.113807644757566, + "grad_norm": 0.19482018053531647, + "learning_rate": 5.823453399187967e-05, + "loss": 1.4096, + "step": 83100 + }, + { + "epoch": 4.114349585923078, + "grad_norm": 0.17476175725460052, + "learning_rate": 5.821873657066288e-05, + "loss": 1.4118, + "step": 83110 + }, + { + "epoch": 4.114891527088591, + "grad_norm": 0.19466517865657806, + "learning_rate": 5.820293875072298e-05, + "loss": 1.4088, + "step": 83120 + }, + { + "epoch": 4.1154334682541025, + "grad_norm": 0.20809796452522278, + "learning_rate": 5.818714053401695e-05, + "loss": 1.4084, + "step": 83130 + }, + { + "epoch": 4.115975409419615, + "grad_norm": 0.16254140436649323, + "learning_rate": 5.8171341922501755e-05, + "loss": 1.4097, + "step": 83140 + }, + { + "epoch": 4.116517350585127, + "grad_norm": 0.15851789712905884, + "learning_rate": 5.8155542918134496e-05, + "loss": 1.4123, + "step": 83150 + }, + { + "epoch": 4.117059291750639, + "grad_norm": 0.29275140166282654, + "learning_rate": 5.8139743522872306e-05, + "loss": 1.4053, + "step": 83160 + }, + { + "epoch": 4.117601232916152, + "grad_norm": 0.2825930714607239, + "learning_rate": 5.812394373867229e-05, + "loss": 1.4093, + "step": 83170 + }, + { + "epoch": 4.117709621149254, + "eval_loss": 2.4347915649414062, + "eval_runtime": 21.9741, + "eval_samples_per_second": 227.54, + "eval_steps_per_second": 1.229, + "step": 83172 + }, + { + "epoch": 4.118143174081664, + "grad_norm": 0.24042131006717682, + "learning_rate": 5.81081435674917e-05, + "loss": 1.4135, + "step": 83180 + }, + { + "epoch": 4.118685115247176, + "grad_norm": 0.22210003435611725, + "learning_rate": 5.809234301128779e-05, + "loss": 1.4164, + "step": 83190 + }, + { + "epoch": 4.119227056412688, + "grad_norm": 0.15581659972667694, + "learning_rate": 5.807654207201784e-05, + "loss": 1.3988, + "step": 83200 + }, + { + "epoch": 4.1197689975782, + "grad_norm": 0.19380082190036774, + "learning_rate": 5.8060740751639217e-05, + "loss": 1.4111, + "step": 83210 + }, + { + "epoch": 4.120310938743713, + "grad_norm": 0.1942562609910965, + "learning_rate": 5.8044939052109315e-05, + "loss": 1.4082, + "step": 83220 + }, + { + "epoch": 4.120852879909225, + "grad_norm": 0.25676843523979187, + "learning_rate": 5.802913697538559e-05, + "loss": 1.4086, + "step": 83230 + }, + { + "epoch": 4.121394821074737, + "grad_norm": 0.21797262132167816, + "learning_rate": 5.8013334523425536e-05, + "loss": 1.408, + "step": 83240 + }, + { + "epoch": 4.121936762240249, + "grad_norm": 0.18944242596626282, + "learning_rate": 5.799753169818666e-05, + "loss": 1.4046, + "step": 83250 + }, + { + "epoch": 4.1224245092892104, + "eval_loss": 2.430614709854126, + "eval_runtime": 21.9688, + "eval_samples_per_second": 227.596, + "eval_steps_per_second": 1.229, + "step": 83259 + }, + { + "epoch": 4.122478703405761, + "grad_norm": 0.24879737198352814, + "learning_rate": 5.798172850162658e-05, + "loss": 1.4057, + "step": 83260 + }, + { + "epoch": 4.123020644571274, + "grad_norm": 0.16706986725330353, + "learning_rate": 5.796592493570292e-05, + "loss": 1.4101, + "step": 83270 + }, + { + "epoch": 4.123562585736786, + "grad_norm": 0.16859695315361023, + "learning_rate": 5.795012100237337e-05, + "loss": 1.3993, + "step": 83280 + }, + { + "epoch": 4.124104526902298, + "grad_norm": 0.17874988913536072, + "learning_rate": 5.7934316703595625e-05, + "loss": 1.4068, + "step": 83290 + }, + { + "epoch": 4.1246464680678105, + "grad_norm": 0.2080361247062683, + "learning_rate": 5.791851204132745e-05, + "loss": 1.4041, + "step": 83300 + }, + { + "epoch": 4.125188409233322, + "grad_norm": 0.20870813727378845, + "learning_rate": 5.79027070175267e-05, + "loss": 1.4055, + "step": 83310 + }, + { + "epoch": 4.125730350398835, + "grad_norm": 0.17612333595752716, + "learning_rate": 5.7886901634151205e-05, + "loss": 1.4071, + "step": 83320 + }, + { + "epoch": 4.126272291564347, + "grad_norm": 0.18745100498199463, + "learning_rate": 5.787109589315887e-05, + "loss": 1.4132, + "step": 83330 + }, + { + "epoch": 4.126814232729859, + "grad_norm": 0.2577305734157562, + "learning_rate": 5.7855289796507627e-05, + "loss": 1.4039, + "step": 83340 + }, + { + "epoch": 4.127139397429167, + "eval_loss": 2.428053855895996, + "eval_runtime": 21.9759, + "eval_samples_per_second": 227.521, + "eval_steps_per_second": 1.229, + "step": 83346 + }, + { + "epoch": 4.127356173895372, + "grad_norm": 0.1890375018119812, + "learning_rate": 5.783948334615548e-05, + "loss": 1.3949, + "step": 83350 + }, + { + "epoch": 4.1278981150608836, + "grad_norm": 0.34618306159973145, + "learning_rate": 5.782367654406047e-05, + "loss": 1.4095, + "step": 83360 + }, + { + "epoch": 4.128440056226396, + "grad_norm": 0.19756010174751282, + "learning_rate": 5.780786939218068e-05, + "loss": 1.4154, + "step": 83370 + }, + { + "epoch": 4.128981997391908, + "grad_norm": 0.2689013183116913, + "learning_rate": 5.77920618924742e-05, + "loss": 1.4067, + "step": 83380 + }, + { + "epoch": 4.12952393855742, + "grad_norm": 0.17390170693397522, + "learning_rate": 5.777625404689924e-05, + "loss": 1.4065, + "step": 83390 + }, + { + "epoch": 4.130065879722933, + "grad_norm": 0.2956944704055786, + "learning_rate": 5.7760445857413946e-05, + "loss": 1.4047, + "step": 83400 + }, + { + "epoch": 4.130607820888445, + "grad_norm": 0.2633947432041168, + "learning_rate": 5.774463732597662e-05, + "loss": 1.4067, + "step": 83410 + }, + { + "epoch": 4.131149762053957, + "grad_norm": 0.17089660465717316, + "learning_rate": 5.772882845454556e-05, + "loss": 1.4171, + "step": 83420 + }, + { + "epoch": 4.131691703219469, + "grad_norm": 0.18730990588665009, + "learning_rate": 5.771301924507905e-05, + "loss": 1.4067, + "step": 83430 + }, + { + "epoch": 4.131854285569123, + "eval_loss": 2.4326930046081543, + "eval_runtime": 21.9721, + "eval_samples_per_second": 227.562, + "eval_steps_per_second": 1.229, + "step": 83433 + }, + { + "epoch": 4.132233644384981, + "grad_norm": 0.16217251121997833, + "learning_rate": 5.7697209699535514e-05, + "loss": 1.4039, + "step": 83440 + }, + { + "epoch": 4.132775585550494, + "grad_norm": 0.16589990258216858, + "learning_rate": 5.768139981987334e-05, + "loss": 1.3967, + "step": 83450 + }, + { + "epoch": 4.133317526716006, + "grad_norm": 0.1889846920967102, + "learning_rate": 5.7665589608051e-05, + "loss": 1.4068, + "step": 83460 + }, + { + "epoch": 4.133859467881518, + "grad_norm": 0.18602542579174042, + "learning_rate": 5.7649779066026985e-05, + "loss": 1.4024, + "step": 83470 + }, + { + "epoch": 4.1344014090470305, + "grad_norm": 0.24643942713737488, + "learning_rate": 5.7633968195759855e-05, + "loss": 1.4069, + "step": 83480 + }, + { + "epoch": 4.134943350212542, + "grad_norm": 0.18482206761837006, + "learning_rate": 5.7618156999208186e-05, + "loss": 1.4186, + "step": 83490 + }, + { + "epoch": 4.135485291378055, + "grad_norm": 0.1943228840827942, + "learning_rate": 5.7602345478330577e-05, + "loss": 1.4047, + "step": 83500 + }, + { + "epoch": 4.136027232543567, + "grad_norm": 0.2460983842611313, + "learning_rate": 5.7586533635085705e-05, + "loss": 1.4117, + "step": 83510 + }, + { + "epoch": 4.136569173709079, + "grad_norm": 0.22592556476593018, + "learning_rate": 5.75707214714323e-05, + "loss": 1.407, + "step": 83520 + }, + { + "epoch": 4.136569173709079, + "eval_loss": 2.431594133377075, + "eval_runtime": 21.9667, + "eval_samples_per_second": 227.617, + "eval_steps_per_second": 1.229, + "step": 83520 + }, + { + "epoch": 4.137111114874592, + "grad_norm": 0.22698943316936493, + "learning_rate": 5.7554908989329084e-05, + "loss": 1.409, + "step": 83530 + }, + { + "epoch": 4.1376530560401035, + "grad_norm": 0.1876407265663147, + "learning_rate": 5.7539096190734806e-05, + "loss": 1.4023, + "step": 83540 + }, + { + "epoch": 4.138194997205616, + "grad_norm": 0.17315153777599335, + "learning_rate": 5.752328307760833e-05, + "loss": 1.3965, + "step": 83550 + }, + { + "epoch": 4.138736938371128, + "grad_norm": 0.1867133527994156, + "learning_rate": 5.7507469651908486e-05, + "loss": 1.414, + "step": 83560 + }, + { + "epoch": 4.13927887953664, + "grad_norm": 0.2372438907623291, + "learning_rate": 5.74916559155942e-05, + "loss": 1.41, + "step": 83570 + }, + { + "epoch": 4.139820820702153, + "grad_norm": 0.16649603843688965, + "learning_rate": 5.7475841870624416e-05, + "loss": 1.3959, + "step": 83580 + }, + { + "epoch": 4.140362761867665, + "grad_norm": 0.2065044343471527, + "learning_rate": 5.746002751895807e-05, + "loss": 1.4078, + "step": 83590 + }, + { + "epoch": 4.140904703033177, + "grad_norm": 0.20945148169994354, + "learning_rate": 5.744421286255418e-05, + "loss": 1.3964, + "step": 83600 + }, + { + "epoch": 4.141284061849036, + "eval_loss": 2.4306061267852783, + "eval_runtime": 21.9679, + "eval_samples_per_second": 227.605, + "eval_steps_per_second": 1.229, + "step": 83607 + }, + { + "epoch": 4.141446644198689, + "grad_norm": 0.21159766614437103, + "learning_rate": 5.7428397903371845e-05, + "loss": 1.4085, + "step": 83610 + }, + { + "epoch": 4.141988585364201, + "grad_norm": 0.1735406070947647, + "learning_rate": 5.741258264337009e-05, + "loss": 1.4062, + "step": 83620 + }, + { + "epoch": 4.142530526529714, + "grad_norm": 0.21971061825752258, + "learning_rate": 5.739676708450809e-05, + "loss": 1.4088, + "step": 83630 + }, + { + "epoch": 4.143072467695226, + "grad_norm": 0.1740388572216034, + "learning_rate": 5.7380951228744985e-05, + "loss": 1.4059, + "step": 83640 + }, + { + "epoch": 4.1436144088607385, + "grad_norm": 0.21105538308620453, + "learning_rate": 5.736513507803999e-05, + "loss": 1.4035, + "step": 83650 + }, + { + "epoch": 4.14415635002625, + "grad_norm": 0.21012961864471436, + "learning_rate": 5.7349318634352325e-05, + "loss": 1.4016, + "step": 83660 + }, + { + "epoch": 4.144698291191762, + "grad_norm": 0.21430379152297974, + "learning_rate": 5.7333501899641254e-05, + "loss": 1.399, + "step": 83670 + }, + { + "epoch": 4.145240232357275, + "grad_norm": 0.16836793720722198, + "learning_rate": 5.731768487586611e-05, + "loss": 1.3952, + "step": 83680 + }, + { + "epoch": 4.145782173522787, + "grad_norm": 0.22241011261940002, + "learning_rate": 5.730186756498622e-05, + "loss": 1.4089, + "step": 83690 + }, + { + "epoch": 4.145998949988992, + "eval_loss": 2.433333396911621, + "eval_runtime": 21.9717, + "eval_samples_per_second": 227.566, + "eval_steps_per_second": 1.229, + "step": 83694 + }, + { + "epoch": 4.146324114688299, + "grad_norm": 0.34341859817504883, + "learning_rate": 5.728604996896096e-05, + "loss": 1.4097, + "step": 83700 + }, + { + "epoch": 4.1468660558538115, + "grad_norm": 0.4072984755039215, + "learning_rate": 5.7270232089749775e-05, + "loss": 1.398, + "step": 83710 + }, + { + "epoch": 4.147407997019323, + "grad_norm": 0.21958887577056885, + "learning_rate": 5.725441392931209e-05, + "loss": 1.4012, + "step": 83720 + }, + { + "epoch": 4.147949938184836, + "grad_norm": 0.18960371613502502, + "learning_rate": 5.7238595489607396e-05, + "loss": 1.4089, + "step": 83730 + }, + { + "epoch": 4.148491879350348, + "grad_norm": 0.16692198812961578, + "learning_rate": 5.722277677259521e-05, + "loss": 1.4043, + "step": 83740 + }, + { + "epoch": 4.14903382051586, + "grad_norm": 0.1643580198287964, + "learning_rate": 5.720695778023508e-05, + "loss": 1.395, + "step": 83750 + }, + { + "epoch": 4.149575761681373, + "grad_norm": 0.23333659768104553, + "learning_rate": 5.7191138514486605e-05, + "loss": 1.4017, + "step": 83760 + }, + { + "epoch": 4.1501177028468845, + "grad_norm": 0.19941531121730804, + "learning_rate": 5.7175318977309424e-05, + "loss": 1.4022, + "step": 83770 + }, + { + "epoch": 4.150659644012397, + "grad_norm": 0.19911114871501923, + "learning_rate": 5.7159499170663144e-05, + "loss": 1.4112, + "step": 83780 + }, + { + "epoch": 4.150713838128948, + "eval_loss": 2.4322268962860107, + "eval_runtime": 21.9745, + "eval_samples_per_second": 227.537, + "eval_steps_per_second": 1.229, + "step": 83781 + }, + { + "epoch": 4.151201585177909, + "grad_norm": 0.18389646708965302, + "learning_rate": 5.7143679096507494e-05, + "loss": 1.402, + "step": 83790 + }, + { + "epoch": 4.151743526343421, + "grad_norm": 0.1758451610803604, + "learning_rate": 5.712785875680218e-05, + "loss": 1.4039, + "step": 83800 + }, + { + "epoch": 4.152285467508934, + "grad_norm": 0.16958458721637726, + "learning_rate": 5.711203815350696e-05, + "loss": 1.4069, + "step": 83810 + }, + { + "epoch": 4.152827408674446, + "grad_norm": 0.16103295981884003, + "learning_rate": 5.709621728858164e-05, + "loss": 1.404, + "step": 83820 + }, + { + "epoch": 4.153369349839958, + "grad_norm": 0.17550979554653168, + "learning_rate": 5.7080396163986004e-05, + "loss": 1.402, + "step": 83830 + }, + { + "epoch": 4.15391129100547, + "grad_norm": 0.2050134539604187, + "learning_rate": 5.706457478167992e-05, + "loss": 1.4078, + "step": 83840 + }, + { + "epoch": 4.154453232170982, + "grad_norm": 0.1583109349012375, + "learning_rate": 5.704875314362331e-05, + "loss": 1.414, + "step": 83850 + }, + { + "epoch": 4.154995173336495, + "grad_norm": 0.20555706322193146, + "learning_rate": 5.703293125177602e-05, + "loss": 1.4025, + "step": 83860 + }, + { + "epoch": 4.155428726268904, + "eval_loss": 2.4321482181549072, + "eval_runtime": 21.9695, + "eval_samples_per_second": 227.588, + "eval_steps_per_second": 1.229, + "step": 83868 + }, + { + "epoch": 4.155537114502007, + "grad_norm": 0.2936631739139557, + "learning_rate": 5.701710910809805e-05, + "loss": 1.4003, + "step": 83870 + }, + { + "epoch": 4.156079055667519, + "grad_norm": 0.24286337196826935, + "learning_rate": 5.700128671454935e-05, + "loss": 1.4065, + "step": 83880 + }, + { + "epoch": 4.156620996833031, + "grad_norm": 0.3643859624862671, + "learning_rate": 5.6985464073089944e-05, + "loss": 1.3969, + "step": 83890 + }, + { + "epoch": 4.157162937998543, + "grad_norm": 0.3168624937534332, + "learning_rate": 5.696964118567988e-05, + "loss": 1.3999, + "step": 83900 + }, + { + "epoch": 4.157704879164056, + "grad_norm": 0.24165990948677063, + "learning_rate": 5.6953818054279206e-05, + "loss": 1.3952, + "step": 83910 + }, + { + "epoch": 4.158246820329568, + "grad_norm": 0.16725140810012817, + "learning_rate": 5.693799468084804e-05, + "loss": 1.4162, + "step": 83920 + }, + { + "epoch": 4.15878876149508, + "grad_norm": 0.2385065108537674, + "learning_rate": 5.6922171067346495e-05, + "loss": 1.3966, + "step": 83930 + }, + { + "epoch": 4.159330702660593, + "grad_norm": 0.23973079025745392, + "learning_rate": 5.690634721573475e-05, + "loss": 1.409, + "step": 83940 + }, + { + "epoch": 4.159872643826104, + "grad_norm": 0.18828116357326508, + "learning_rate": 5.6890523127972993e-05, + "loss": 1.3998, + "step": 83950 + }, + { + "epoch": 4.16014361440886, + "eval_loss": 2.4352917671203613, + "eval_runtime": 21.9698, + "eval_samples_per_second": 227.585, + "eval_steps_per_second": 1.229, + "step": 83955 + }, + { + "epoch": 4.160414584991617, + "grad_norm": 0.1964593529701233, + "learning_rate": 5.687469880602143e-05, + "loss": 1.4101, + "step": 83960 + }, + { + "epoch": 4.160956526157129, + "grad_norm": 0.2196066826581955, + "learning_rate": 5.685887425184033e-05, + "loss": 1.4071, + "step": 83970 + }, + { + "epoch": 4.161498467322641, + "grad_norm": 0.16051217913627625, + "learning_rate": 5.684304946738995e-05, + "loss": 1.411, + "step": 83980 + }, + { + "epoch": 4.162040408488154, + "grad_norm": 0.23995092511177063, + "learning_rate": 5.68272244546306e-05, + "loss": 1.4054, + "step": 83990 + }, + { + "epoch": 4.162582349653666, + "grad_norm": 0.35630932450294495, + "learning_rate": 5.6811399215522634e-05, + "loss": 1.4047, + "step": 84000 + }, + { + "epoch": 4.163124290819178, + "grad_norm": 0.26559507846832275, + "learning_rate": 5.6795573752026386e-05, + "loss": 1.4162, + "step": 84010 + }, + { + "epoch": 4.16366623198469, + "grad_norm": 0.28465500473976135, + "learning_rate": 5.6779748066102254e-05, + "loss": 1.4168, + "step": 84020 + }, + { + "epoch": 4.164208173150202, + "grad_norm": 0.3914095163345337, + "learning_rate": 5.676392215971066e-05, + "loss": 1.4054, + "step": 84030 + }, + { + "epoch": 4.164750114315715, + "grad_norm": 0.2624354958534241, + "learning_rate": 5.6748096034812047e-05, + "loss": 1.4156, + "step": 84040 + }, + { + "epoch": 4.164858502548817, + "eval_loss": 2.431173324584961, + "eval_runtime": 21.9748, + "eval_samples_per_second": 227.534, + "eval_steps_per_second": 1.229, + "step": 84042 + }, + { + "epoch": 4.165292055481227, + "grad_norm": 0.25440993905067444, + "learning_rate": 5.6732269693366894e-05, + "loss": 1.4029, + "step": 84050 + }, + { + "epoch": 4.1658339966467395, + "grad_norm": 0.1714615821838379, + "learning_rate": 5.6716443137335695e-05, + "loss": 1.3996, + "step": 84060 + }, + { + "epoch": 4.166375937812251, + "grad_norm": 0.3408913016319275, + "learning_rate": 5.670061636867896e-05, + "loss": 1.4094, + "step": 84070 + }, + { + "epoch": 4.166917878977763, + "grad_norm": 0.25224050879478455, + "learning_rate": 5.6684789389357254e-05, + "loss": 1.4019, + "step": 84080 + }, + { + "epoch": 4.167459820143276, + "grad_norm": 0.18869130313396454, + "learning_rate": 5.666896220133116e-05, + "loss": 1.398, + "step": 84090 + }, + { + "epoch": 4.168001761308788, + "grad_norm": 0.23559992015361786, + "learning_rate": 5.665313480656127e-05, + "loss": 1.3896, + "step": 84100 + }, + { + "epoch": 4.1685437024743, + "grad_norm": 0.33458632230758667, + "learning_rate": 5.663730720700822e-05, + "loss": 1.4115, + "step": 84110 + }, + { + "epoch": 4.1690856436398125, + "grad_norm": 0.24740028381347656, + "learning_rate": 5.662147940463265e-05, + "loss": 1.4121, + "step": 84120 + }, + { + "epoch": 4.1695733906887735, + "eval_loss": 2.432617664337158, + "eval_runtime": 21.9739, + "eval_samples_per_second": 227.543, + "eval_steps_per_second": 1.229, + "step": 84129 + }, + { + "epoch": 4.169627584805324, + "grad_norm": 0.15262192487716675, + "learning_rate": 5.6605651401395265e-05, + "loss": 1.3928, + "step": 84130 + }, + { + "epoch": 4.170169525970837, + "grad_norm": 0.16818031668663025, + "learning_rate": 5.658982319925675e-05, + "loss": 1.4084, + "step": 84140 + }, + { + "epoch": 4.170711467136349, + "grad_norm": 0.16727863252162933, + "learning_rate": 5.6573994800177844e-05, + "loss": 1.3928, + "step": 84150 + }, + { + "epoch": 4.171253408301861, + "grad_norm": 0.21401916444301605, + "learning_rate": 5.655816620611929e-05, + "loss": 1.3994, + "step": 84160 + }, + { + "epoch": 4.171795349467374, + "grad_norm": 0.21131601929664612, + "learning_rate": 5.6542337419041866e-05, + "loss": 1.4001, + "step": 84170 + }, + { + "epoch": 4.1723372906328855, + "grad_norm": 0.17307646572589874, + "learning_rate": 5.652650844090639e-05, + "loss": 1.3984, + "step": 84180 + }, + { + "epoch": 4.172879231798398, + "grad_norm": 0.16799761354923248, + "learning_rate": 5.651067927367367e-05, + "loss": 1.405, + "step": 84190 + }, + { + "epoch": 4.17342117296391, + "grad_norm": 0.20099224150180817, + "learning_rate": 5.649484991930456e-05, + "loss": 1.4075, + "step": 84200 + }, + { + "epoch": 4.173963114129422, + "grad_norm": 0.28969597816467285, + "learning_rate": 5.647902037975994e-05, + "loss": 1.4005, + "step": 84210 + }, + { + "epoch": 4.17428827882873, + "eval_loss": 2.434582471847534, + "eval_runtime": 21.9717, + "eval_samples_per_second": 227.565, + "eval_steps_per_second": 1.229, + "step": 84216 + }, + { + "epoch": 4.174505055294935, + "grad_norm": 0.272678017616272, + "learning_rate": 5.646319065700068e-05, + "loss": 1.4166, + "step": 84220 + }, + { + "epoch": 4.175046996460447, + "grad_norm": 0.1885080486536026, + "learning_rate": 5.644736075298772e-05, + "loss": 1.4171, + "step": 84230 + }, + { + "epoch": 4.175588937625959, + "grad_norm": 0.1917075365781784, + "learning_rate": 5.643153066968201e-05, + "loss": 1.3912, + "step": 84240 + }, + { + "epoch": 4.176130878791471, + "grad_norm": 0.1733640879392624, + "learning_rate": 5.641570040904448e-05, + "loss": 1.4086, + "step": 84250 + }, + { + "epoch": 4.176672819956983, + "grad_norm": 0.21975889801979065, + "learning_rate": 5.6399869973036136e-05, + "loss": 1.4118, + "step": 84260 + }, + { + "epoch": 4.177214761122496, + "grad_norm": 0.1629042774438858, + "learning_rate": 5.638403936361797e-05, + "loss": 1.4043, + "step": 84270 + }, + { + "epoch": 4.177756702288008, + "grad_norm": 0.16598139703273773, + "learning_rate": 5.6368208582751026e-05, + "loss": 1.413, + "step": 84280 + }, + { + "epoch": 4.17829864345352, + "grad_norm": 0.16502542793750763, + "learning_rate": 5.6352377632396347e-05, + "loss": 1.404, + "step": 84290 + }, + { + "epoch": 4.178840584619032, + "grad_norm": 0.16018883883953094, + "learning_rate": 5.633654651451499e-05, + "loss": 1.3957, + "step": 84300 + }, + { + "epoch": 4.179003166968686, + "eval_loss": 2.4341461658477783, + "eval_runtime": 21.9742, + "eval_samples_per_second": 227.539, + "eval_steps_per_second": 1.229, + "step": 84303 + }, + { + "epoch": 4.179382525784544, + "grad_norm": 0.1671213060617447, + "learning_rate": 5.632071523106805e-05, + "loss": 1.4043, + "step": 84310 + }, + { + "epoch": 4.179924466950057, + "grad_norm": 0.35982993245124817, + "learning_rate": 5.630488378401665e-05, + "loss": 1.3999, + "step": 84320 + }, + { + "epoch": 4.180466408115569, + "grad_norm": 0.3010498881340027, + "learning_rate": 5.62890521753219e-05, + "loss": 1.4013, + "step": 84330 + }, + { + "epoch": 4.181008349281081, + "grad_norm": 0.27373287081718445, + "learning_rate": 5.627322040694497e-05, + "loss": 1.3974, + "step": 84340 + }, + { + "epoch": 4.1815502904465935, + "grad_norm": 0.21938002109527588, + "learning_rate": 5.6257388480847026e-05, + "loss": 1.4011, + "step": 84350 + }, + { + "epoch": 4.182092231612105, + "grad_norm": 0.184707373380661, + "learning_rate": 5.6241556398989246e-05, + "loss": 1.3999, + "step": 84360 + }, + { + "epoch": 4.182634172777618, + "grad_norm": 0.15879768133163452, + "learning_rate": 5.622572416333286e-05, + "loss": 1.4061, + "step": 84370 + }, + { + "epoch": 4.18317611394313, + "grad_norm": 0.16178719699382782, + "learning_rate": 5.620989177583908e-05, + "loss": 1.4106, + "step": 84380 + }, + { + "epoch": 4.183718055108642, + "grad_norm": 0.18774659931659698, + "learning_rate": 5.619405923846916e-05, + "loss": 1.4041, + "step": 84390 + }, + { + "epoch": 4.183718055108642, + "eval_loss": 2.4340009689331055, + "eval_runtime": 21.9661, + "eval_samples_per_second": 227.623, + "eval_steps_per_second": 1.229, + "step": 84390 + }, + { + "epoch": 4.184259996274155, + "grad_norm": 0.29684072732925415, + "learning_rate": 5.617822655318438e-05, + "loss": 1.4182, + "step": 84400 + }, + { + "epoch": 4.1848019374396666, + "grad_norm": 0.24184374511241913, + "learning_rate": 5.616239372194599e-05, + "loss": 1.406, + "step": 84410 + }, + { + "epoch": 4.185343878605179, + "grad_norm": 0.17345279455184937, + "learning_rate": 5.614656074671532e-05, + "loss": 1.4158, + "step": 84420 + }, + { + "epoch": 4.185885819770691, + "grad_norm": 0.18434324860572815, + "learning_rate": 5.613072762945369e-05, + "loss": 1.415, + "step": 84430 + }, + { + "epoch": 4.186427760936203, + "grad_norm": 0.16675621271133423, + "learning_rate": 5.6114894372122415e-05, + "loss": 1.4093, + "step": 84440 + }, + { + "epoch": 4.186969702101716, + "grad_norm": 0.18245235085487366, + "learning_rate": 5.6099060976682883e-05, + "loss": 1.4074, + "step": 84450 + }, + { + "epoch": 4.187511643267228, + "grad_norm": 0.18735003471374512, + "learning_rate": 5.608322744509644e-05, + "loss": 1.403, + "step": 84460 + }, + { + "epoch": 4.1880535844327405, + "grad_norm": 0.22122140228748322, + "learning_rate": 5.606739377932447e-05, + "loss": 1.3978, + "step": 84470 + }, + { + "epoch": 4.188432943248599, + "eval_loss": 2.4321837425231934, + "eval_runtime": 21.9752, + "eval_samples_per_second": 227.529, + "eval_steps_per_second": 1.229, + "step": 84477 + }, + { + "epoch": 4.188595525598252, + "grad_norm": 0.19624446332454681, + "learning_rate": 5.605155998132843e-05, + "loss": 1.4006, + "step": 84480 + }, + { + "epoch": 4.189137466763764, + "grad_norm": 0.2357572615146637, + "learning_rate": 5.603572605306967e-05, + "loss": 1.3976, + "step": 84490 + }, + { + "epoch": 4.189679407929277, + "grad_norm": 0.1909378468990326, + "learning_rate": 5.6019891996509676e-05, + "loss": 1.3962, + "step": 84500 + }, + { + "epoch": 4.190221349094789, + "grad_norm": 0.2646555006504059, + "learning_rate": 5.600405781360989e-05, + "loss": 1.4053, + "step": 84510 + }, + { + "epoch": 4.190763290260301, + "grad_norm": 0.16853564977645874, + "learning_rate": 5.598822350633177e-05, + "loss": 1.4133, + "step": 84520 + }, + { + "epoch": 4.1913052314258135, + "grad_norm": 0.1681780219078064, + "learning_rate": 5.5972389076636833e-05, + "loss": 1.4132, + "step": 84530 + }, + { + "epoch": 4.191847172591325, + "grad_norm": 0.15586897730827332, + "learning_rate": 5.595655452648655e-05, + "loss": 1.4015, + "step": 84540 + }, + { + "epoch": 4.192389113756838, + "grad_norm": 0.1730291098356247, + "learning_rate": 5.594071985784244e-05, + "loss": 1.3961, + "step": 84550 + }, + { + "epoch": 4.19293105492235, + "grad_norm": 0.16029633581638336, + "learning_rate": 5.5924885072666045e-05, + "loss": 1.4024, + "step": 84560 + }, + { + "epoch": 4.193147831388555, + "eval_loss": 2.4340431690216064, + "eval_runtime": 21.9997, + "eval_samples_per_second": 227.276, + "eval_steps_per_second": 1.227, + "step": 84564 + }, + { + "epoch": 4.193472996087862, + "grad_norm": 0.2048177421092987, + "learning_rate": 5.5909050172918896e-05, + "loss": 1.403, + "step": 84570 + }, + { + "epoch": 4.194014937253375, + "grad_norm": 0.1830647587776184, + "learning_rate": 5.589321516056256e-05, + "loss": 1.4039, + "step": 84580 + }, + { + "epoch": 4.1945568784188865, + "grad_norm": 0.18727242946624756, + "learning_rate": 5.587738003755861e-05, + "loss": 1.4054, + "step": 84590 + }, + { + "epoch": 4.195098819584399, + "grad_norm": 0.2537229657173157, + "learning_rate": 5.5861544805868624e-05, + "loss": 1.4062, + "step": 84600 + }, + { + "epoch": 4.195640760749911, + "grad_norm": 0.23978114128112793, + "learning_rate": 5.584570946745422e-05, + "loss": 1.4057, + "step": 84610 + }, + { + "epoch": 4.196182701915423, + "grad_norm": 0.19209878146648407, + "learning_rate": 5.5829874024276995e-05, + "loss": 1.409, + "step": 84620 + }, + { + "epoch": 4.196724643080936, + "grad_norm": 0.22839245200157166, + "learning_rate": 5.581403847829857e-05, + "loss": 1.4135, + "step": 84630 + }, + { + "epoch": 4.197266584246448, + "grad_norm": 0.1937345266342163, + "learning_rate": 5.5798202831480605e-05, + "loss": 1.3996, + "step": 84640 + }, + { + "epoch": 4.19780852541196, + "grad_norm": 0.23591110110282898, + "learning_rate": 5.5782367085784725e-05, + "loss": 1.4079, + "step": 84650 + }, + { + "epoch": 4.197862719528511, + "eval_loss": 2.4296658039093018, + "eval_runtime": 22.048, + "eval_samples_per_second": 226.778, + "eval_steps_per_second": 1.225, + "step": 84651 + }, + { + "epoch": 4.198350466577472, + "grad_norm": 0.17365294694900513, + "learning_rate": 5.5766531243172616e-05, + "loss": 1.3987, + "step": 84660 + }, + { + "epoch": 4.198892407742984, + "grad_norm": 0.20040321350097656, + "learning_rate": 5.575069530560594e-05, + "loss": 1.4116, + "step": 84670 + }, + { + "epoch": 4.199434348908497, + "grad_norm": 0.1579209417104721, + "learning_rate": 5.573485927504639e-05, + "loss": 1.4013, + "step": 84680 + }, + { + "epoch": 4.199976290074009, + "grad_norm": 0.2494509518146515, + "learning_rate": 5.5719023153455674e-05, + "loss": 1.4098, + "step": 84690 + }, + { + "epoch": 4.200518231239521, + "grad_norm": 0.18554720282554626, + "learning_rate": 5.5703186942795484e-05, + "loss": 1.4108, + "step": 84700 + }, + { + "epoch": 4.201060172405033, + "grad_norm": 0.1679755598306656, + "learning_rate": 5.5687350645027544e-05, + "loss": 1.4001, + "step": 84710 + }, + { + "epoch": 4.201602113570545, + "grad_norm": 0.17022711038589478, + "learning_rate": 5.56715142621136e-05, + "loss": 1.3902, + "step": 84720 + }, + { + "epoch": 4.202144054736058, + "grad_norm": 0.17805786430835724, + "learning_rate": 5.5655677796015374e-05, + "loss": 1.4025, + "step": 84730 + }, + { + "epoch": 4.202577607668467, + "eval_loss": 2.4276254177093506, + "eval_runtime": 22.04, + "eval_samples_per_second": 226.86, + "eval_steps_per_second": 1.225, + "step": 84738 + }, + { + "epoch": 4.20268599590157, + "grad_norm": 0.18491090834140778, + "learning_rate": 5.563984124869463e-05, + "loss": 1.4003, + "step": 84740 + }, + { + "epoch": 4.203227937067082, + "grad_norm": 0.1828235238790512, + "learning_rate": 5.5624004622113125e-05, + "loss": 1.4063, + "step": 84750 + }, + { + "epoch": 4.2037698782325945, + "grad_norm": 0.160418301820755, + "learning_rate": 5.560816791823264e-05, + "loss": 1.4013, + "step": 84760 + }, + { + "epoch": 4.204311819398106, + "grad_norm": 0.17551931738853455, + "learning_rate": 5.559233113901498e-05, + "loss": 1.4134, + "step": 84770 + }, + { + "epoch": 4.204853760563619, + "grad_norm": 0.18492551147937775, + "learning_rate": 5.557649428642189e-05, + "loss": 1.4118, + "step": 84780 + }, + { + "epoch": 4.205395701729131, + "grad_norm": 0.1690473109483719, + "learning_rate": 5.556065736241518e-05, + "loss": 1.4027, + "step": 84790 + }, + { + "epoch": 4.205937642894643, + "grad_norm": 0.15582244098186493, + "learning_rate": 5.5544820368956674e-05, + "loss": 1.3997, + "step": 84800 + }, + { + "epoch": 4.206479584060156, + "grad_norm": 0.21982191503047943, + "learning_rate": 5.55289833080082e-05, + "loss": 1.3989, + "step": 84810 + }, + { + "epoch": 4.2070215252256675, + "grad_norm": 0.1695873886346817, + "learning_rate": 5.551314618153156e-05, + "loss": 1.4016, + "step": 84820 + }, + { + "epoch": 4.2072924958084235, + "eval_loss": 2.426410436630249, + "eval_runtime": 22.0191, + "eval_samples_per_second": 227.076, + "eval_steps_per_second": 1.226, + "step": 84825 + }, + { + "epoch": 4.20756346639118, + "grad_norm": 0.15506160259246826, + "learning_rate": 5.5497308991488606e-05, + "loss": 1.4008, + "step": 84830 + }, + { + "epoch": 4.208105407556692, + "grad_norm": 0.21147039532661438, + "learning_rate": 5.548147173984116e-05, + "loss": 1.3972, + "step": 84840 + }, + { + "epoch": 4.208647348722204, + "grad_norm": 0.3284474313259125, + "learning_rate": 5.546563442855108e-05, + "loss": 1.405, + "step": 84850 + }, + { + "epoch": 4.209189289887717, + "grad_norm": 0.2625653147697449, + "learning_rate": 5.544979705958025e-05, + "loss": 1.4091, + "step": 84860 + }, + { + "epoch": 4.209731231053229, + "grad_norm": 0.24892371892929077, + "learning_rate": 5.543395963489051e-05, + "loss": 1.4066, + "step": 84870 + }, + { + "epoch": 4.210273172218741, + "grad_norm": 0.26100948452949524, + "learning_rate": 5.541812215644373e-05, + "loss": 1.4048, + "step": 84880 + }, + { + "epoch": 4.210815113384253, + "grad_norm": 0.18004588782787323, + "learning_rate": 5.5402284626201794e-05, + "loss": 1.3915, + "step": 84890 + }, + { + "epoch": 4.211357054549765, + "grad_norm": 0.21767373383045197, + "learning_rate": 5.538644704612658e-05, + "loss": 1.4005, + "step": 84900 + }, + { + "epoch": 4.211898995715278, + "grad_norm": 0.18447516858577728, + "learning_rate": 5.537060941817999e-05, + "loss": 1.4034, + "step": 84910 + }, + { + "epoch": 4.2120073839483805, + "eval_loss": 2.4239137172698975, + "eval_runtime": 21.9695, + "eval_samples_per_second": 227.588, + "eval_steps_per_second": 1.229, + "step": 84912 + }, + { + "epoch": 4.21244093688079, + "grad_norm": 0.1832926869392395, + "learning_rate": 5.535477174432391e-05, + "loss": 1.4034, + "step": 84920 + }, + { + "epoch": 4.212982878046302, + "grad_norm": 0.1812276393175125, + "learning_rate": 5.533893402652025e-05, + "loss": 1.392, + "step": 84930 + }, + { + "epoch": 4.213524819211814, + "grad_norm": 0.18913184106349945, + "learning_rate": 5.5323096266730914e-05, + "loss": 1.4024, + "step": 84940 + }, + { + "epoch": 4.214066760377326, + "grad_norm": 0.22599507868289948, + "learning_rate": 5.530725846691781e-05, + "loss": 1.4047, + "step": 84950 + }, + { + "epoch": 4.214608701542839, + "grad_norm": 0.1804644763469696, + "learning_rate": 5.529142062904286e-05, + "loss": 1.4025, + "step": 84960 + }, + { + "epoch": 4.215150642708351, + "grad_norm": 0.19616399705410004, + "learning_rate": 5.527558275506799e-05, + "loss": 1.3948, + "step": 84970 + }, + { + "epoch": 4.215692583873863, + "grad_norm": 0.22985543310642242, + "learning_rate": 5.5259744846955145e-05, + "loss": 1.3937, + "step": 84980 + }, + { + "epoch": 4.216234525039376, + "grad_norm": 0.20359660685062408, + "learning_rate": 5.52439069066662e-05, + "loss": 1.3941, + "step": 84990 + }, + { + "epoch": 4.216722272088337, + "eval_loss": 2.4288182258605957, + "eval_runtime": 21.9691, + "eval_samples_per_second": 227.592, + "eval_steps_per_second": 1.229, + "step": 84999 + }, + { + "epoch": 4.216776466204887, + "grad_norm": 0.18419791758060455, + "learning_rate": 5.5228068936163134e-05, + "loss": 1.4054, + "step": 85000 + }, + { + "epoch": 4.2173184073704, + "grad_norm": 0.17802853882312775, + "learning_rate": 5.52122309374079e-05, + "loss": 1.4009, + "step": 85010 + }, + { + "epoch": 4.217860348535912, + "grad_norm": 0.16491647064685822, + "learning_rate": 5.519639291236241e-05, + "loss": 1.4079, + "step": 85020 + }, + { + "epoch": 4.218402289701424, + "grad_norm": 0.26249879598617554, + "learning_rate": 5.518055486298862e-05, + "loss": 1.4104, + "step": 85030 + }, + { + "epoch": 4.218944230866937, + "grad_norm": 0.16674381494522095, + "learning_rate": 5.516471679124846e-05, + "loss": 1.4072, + "step": 85040 + }, + { + "epoch": 4.219486172032449, + "grad_norm": 0.1718483567237854, + "learning_rate": 5.514887869910391e-05, + "loss": 1.4052, + "step": 85050 + }, + { + "epoch": 4.220028113197961, + "grad_norm": 0.18503984808921814, + "learning_rate": 5.5133040588516925e-05, + "loss": 1.3951, + "step": 85060 + }, + { + "epoch": 4.220570054363473, + "grad_norm": 0.25337323546409607, + "learning_rate": 5.511720246144945e-05, + "loss": 1.399, + "step": 85070 + }, + { + "epoch": 4.221111995528985, + "grad_norm": 0.17536914348602295, + "learning_rate": 5.510136431986342e-05, + "loss": 1.401, + "step": 85080 + }, + { + "epoch": 4.221437160228293, + "eval_loss": 2.4300763607025146, + "eval_runtime": 21.9731, + "eval_samples_per_second": 227.551, + "eval_steps_per_second": 1.229, + "step": 85086 + }, + { + "epoch": 4.221653936694498, + "grad_norm": 0.25776609778404236, + "learning_rate": 5.5085526165720835e-05, + "loss": 1.4017, + "step": 85090 + }, + { + "epoch": 4.22219587786001, + "grad_norm": 0.23579788208007812, + "learning_rate": 5.5069688000983635e-05, + "loss": 1.3972, + "step": 85100 + }, + { + "epoch": 4.222737819025522, + "grad_norm": 0.23605017364025116, + "learning_rate": 5.505384982761379e-05, + "loss": 1.4034, + "step": 85110 + }, + { + "epoch": 4.223279760191034, + "grad_norm": 0.23162227869033813, + "learning_rate": 5.503801164757327e-05, + "loss": 1.403, + "step": 85120 + }, + { + "epoch": 4.223821701356546, + "grad_norm": 0.21863441169261932, + "learning_rate": 5.502217346282401e-05, + "loss": 1.4017, + "step": 85130 + }, + { + "epoch": 4.224363642522059, + "grad_norm": 0.1740538477897644, + "learning_rate": 5.500633527532802e-05, + "loss": 1.4107, + "step": 85140 + }, + { + "epoch": 4.224905583687571, + "grad_norm": 0.1974310427904129, + "learning_rate": 5.499049708704723e-05, + "loss": 1.4013, + "step": 85150 + }, + { + "epoch": 4.225447524853083, + "grad_norm": 0.24513813853263855, + "learning_rate": 5.497465889994362e-05, + "loss": 1.4011, + "step": 85160 + }, + { + "epoch": 4.2259894660185955, + "grad_norm": 0.28687775135040283, + "learning_rate": 5.495882071597915e-05, + "loss": 1.4033, + "step": 85170 + }, + { + "epoch": 4.226152048368249, + "eval_loss": 2.429248332977295, + "eval_runtime": 21.97, + "eval_samples_per_second": 227.583, + "eval_steps_per_second": 1.229, + "step": 85173 + }, + { + "epoch": 4.226531407184107, + "grad_norm": 0.17238466441631317, + "learning_rate": 5.4942982537115826e-05, + "loss": 1.4076, + "step": 85180 + }, + { + "epoch": 4.22707334834962, + "grad_norm": 0.1770007312297821, + "learning_rate": 5.4927144365315544e-05, + "loss": 1.4096, + "step": 85190 + }, + { + "epoch": 4.227615289515132, + "grad_norm": 0.1884445995092392, + "learning_rate": 5.49113062025403e-05, + "loss": 1.3935, + "step": 85200 + }, + { + "epoch": 4.228157230680644, + "grad_norm": 0.17585532367229462, + "learning_rate": 5.489546805075209e-05, + "loss": 1.4014, + "step": 85210 + }, + { + "epoch": 4.228699171846157, + "grad_norm": 0.15574230253696442, + "learning_rate": 5.487962991191281e-05, + "loss": 1.4228, + "step": 85220 + }, + { + "epoch": 4.2292411130116685, + "grad_norm": 0.1755153387784958, + "learning_rate": 5.486379178798446e-05, + "loss": 1.4031, + "step": 85230 + }, + { + "epoch": 4.229783054177181, + "grad_norm": 0.22659145295619965, + "learning_rate": 5.484795368092901e-05, + "loss": 1.4086, + "step": 85240 + }, + { + "epoch": 4.230324995342693, + "grad_norm": 0.2872506082057953, + "learning_rate": 5.483211559270838e-05, + "loss": 1.4091, + "step": 85250 + }, + { + "epoch": 4.230866936508205, + "grad_norm": 0.21537162363529205, + "learning_rate": 5.4816277525284544e-05, + "loss": 1.4028, + "step": 85260 + }, + { + "epoch": 4.230866936508205, + "eval_loss": 2.42655086517334, + "eval_runtime": 21.9673, + "eval_samples_per_second": 227.611, + "eval_steps_per_second": 1.229, + "step": 85260 + }, + { + "epoch": 4.231408877673718, + "grad_norm": 0.17939117550849915, + "learning_rate": 5.480043948061947e-05, + "loss": 1.4065, + "step": 85270 + }, + { + "epoch": 4.23195081883923, + "grad_norm": 0.5240494012832642, + "learning_rate": 5.4784601460675064e-05, + "loss": 1.4144, + "step": 85280 + }, + { + "epoch": 4.232492760004742, + "grad_norm": 0.18183523416519165, + "learning_rate": 5.47687634674133e-05, + "loss": 1.3938, + "step": 85290 + }, + { + "epoch": 4.233034701170254, + "grad_norm": 0.16505871713161469, + "learning_rate": 5.4752925502796105e-05, + "loss": 1.3924, + "step": 85300 + }, + { + "epoch": 4.233576642335766, + "grad_norm": 0.20003126561641693, + "learning_rate": 5.473708756878545e-05, + "loss": 1.4059, + "step": 85310 + }, + { + "epoch": 4.234118583501279, + "grad_norm": 0.21324165165424347, + "learning_rate": 5.472124966734322e-05, + "loss": 1.4002, + "step": 85320 + }, + { + "epoch": 4.234660524666791, + "grad_norm": 0.32130008935928345, + "learning_rate": 5.4705411800431386e-05, + "loss": 1.4071, + "step": 85330 + }, + { + "epoch": 4.2352024658323035, + "grad_norm": 0.18662355840206146, + "learning_rate": 5.468957397001186e-05, + "loss": 1.3962, + "step": 85340 + }, + { + "epoch": 4.235581824648162, + "eval_loss": 2.428666830062866, + "eval_runtime": 21.9762, + "eval_samples_per_second": 227.519, + "eval_steps_per_second": 1.229, + "step": 85347 + }, + { + "epoch": 4.235744406997815, + "grad_norm": 0.271916002035141, + "learning_rate": 5.467373617804655e-05, + "loss": 1.4044, + "step": 85350 + }, + { + "epoch": 4.236286348163327, + "grad_norm": 0.27290457487106323, + "learning_rate": 5.465789842649739e-05, + "loss": 1.3982, + "step": 85360 + }, + { + "epoch": 4.23682828932884, + "grad_norm": 0.25049006938934326, + "learning_rate": 5.4642060717326305e-05, + "loss": 1.4102, + "step": 85370 + }, + { + "epoch": 4.237370230494352, + "grad_norm": 0.17757458984851837, + "learning_rate": 5.462622305249517e-05, + "loss": 1.4, + "step": 85380 + }, + { + "epoch": 4.237912171659864, + "grad_norm": 0.16723614931106567, + "learning_rate": 5.461038543396589e-05, + "loss": 1.4082, + "step": 85390 + }, + { + "epoch": 4.2384541128253765, + "grad_norm": 0.14943601191043854, + "learning_rate": 5.4594547863700396e-05, + "loss": 1.4074, + "step": 85400 + }, + { + "epoch": 4.238996053990888, + "grad_norm": 0.18632861971855164, + "learning_rate": 5.457871034366053e-05, + "loss": 1.4057, + "step": 85410 + }, + { + "epoch": 4.239537995156401, + "grad_norm": 0.16720369458198547, + "learning_rate": 5.456287287580821e-05, + "loss": 1.3995, + "step": 85420 + }, + { + "epoch": 4.240079936321913, + "grad_norm": 0.23148633539676666, + "learning_rate": 5.45470354621053e-05, + "loss": 1.3956, + "step": 85430 + }, + { + "epoch": 4.240296712788118, + "eval_loss": 2.4289402961730957, + "eval_runtime": 21.9722, + "eval_samples_per_second": 227.561, + "eval_steps_per_second": 1.229, + "step": 85434 + }, + { + "epoch": 4.240621877487425, + "grad_norm": 0.3228211998939514, + "learning_rate": 5.453119810451366e-05, + "loss": 1.3988, + "step": 85440 + }, + { + "epoch": 4.241163818652938, + "grad_norm": 0.28075844049453735, + "learning_rate": 5.451536080499518e-05, + "loss": 1.3984, + "step": 85450 + }, + { + "epoch": 4.2417057598184496, + "grad_norm": 0.23300626873970032, + "learning_rate": 5.449952356551168e-05, + "loss": 1.3963, + "step": 85460 + }, + { + "epoch": 4.242247700983962, + "grad_norm": 0.2622208595275879, + "learning_rate": 5.448368638802506e-05, + "loss": 1.3987, + "step": 85470 + }, + { + "epoch": 4.242789642149474, + "grad_norm": 0.15895061194896698, + "learning_rate": 5.4467849274497094e-05, + "loss": 1.4072, + "step": 85480 + }, + { + "epoch": 4.243331583314986, + "grad_norm": 0.17281389236450195, + "learning_rate": 5.4452012226889646e-05, + "loss": 1.4066, + "step": 85490 + }, + { + "epoch": 4.243873524480499, + "grad_norm": 0.2625819444656372, + "learning_rate": 5.443617524716458e-05, + "loss": 1.4131, + "step": 85500 + }, + { + "epoch": 4.244415465646011, + "grad_norm": 0.20588402450084686, + "learning_rate": 5.442033833728365e-05, + "loss": 1.4123, + "step": 85510 + }, + { + "epoch": 4.2449574068115234, + "grad_norm": 0.22198638319969177, + "learning_rate": 5.440450149920869e-05, + "loss": 1.4101, + "step": 85520 + }, + { + "epoch": 4.245011600928074, + "eval_loss": 2.4276344776153564, + "eval_runtime": 21.9945, + "eval_samples_per_second": 227.33, + "eval_steps_per_second": 1.228, + "step": 85521 + }, + { + "epoch": 4.245499347977035, + "grad_norm": 0.16215014457702637, + "learning_rate": 5.438866473490152e-05, + "loss": 1.3984, + "step": 85530 + }, + { + "epoch": 4.246041289142547, + "grad_norm": 0.1525203287601471, + "learning_rate": 5.4372828046323885e-05, + "loss": 1.3939, + "step": 85540 + }, + { + "epoch": 4.24658323030806, + "grad_norm": 0.16212084889411926, + "learning_rate": 5.43569914354376e-05, + "loss": 1.4041, + "step": 85550 + }, + { + "epoch": 4.247125171473572, + "grad_norm": 0.17924129962921143, + "learning_rate": 5.434115490420443e-05, + "loss": 1.3977, + "step": 85560 + }, + { + "epoch": 4.247667112639084, + "grad_norm": 0.22136323153972626, + "learning_rate": 5.432531845458612e-05, + "loss": 1.4001, + "step": 85570 + }, + { + "epoch": 4.2482090538045965, + "grad_norm": 0.26103144884109497, + "learning_rate": 5.4309482088544416e-05, + "loss": 1.405, + "step": 85580 + }, + { + "epoch": 4.248750994970108, + "grad_norm": 0.23051413893699646, + "learning_rate": 5.429364580804111e-05, + "loss": 1.4005, + "step": 85590 + }, + { + "epoch": 4.249292936135621, + "grad_norm": 0.19414301216602325, + "learning_rate": 5.427780961503787e-05, + "loss": 1.4039, + "step": 85600 + }, + { + "epoch": 4.24972648906803, + "eval_loss": 2.4262514114379883, + "eval_runtime": 21.973, + "eval_samples_per_second": 227.551, + "eval_steps_per_second": 1.229, + "step": 85608 + }, + { + "epoch": 4.249834877301133, + "grad_norm": 0.1806284487247467, + "learning_rate": 5.426197351149645e-05, + "loss": 1.4023, + "step": 85610 + }, + { + "epoch": 4.250376818466645, + "grad_norm": 0.1894899308681488, + "learning_rate": 5.424613749937852e-05, + "loss": 1.3989, + "step": 85620 + }, + { + "epoch": 4.250918759632158, + "grad_norm": 0.1757279932498932, + "learning_rate": 5.423030158064584e-05, + "loss": 1.4181, + "step": 85630 + }, + { + "epoch": 4.2514607007976695, + "grad_norm": 0.345478892326355, + "learning_rate": 5.421446575726004e-05, + "loss": 1.4023, + "step": 85640 + }, + { + "epoch": 4.252002641963182, + "grad_norm": 0.36112889647483826, + "learning_rate": 5.419863003118281e-05, + "loss": 1.4054, + "step": 85650 + }, + { + "epoch": 4.252544583128694, + "grad_norm": 0.1516331434249878, + "learning_rate": 5.418279440437582e-05, + "loss": 1.4045, + "step": 85660 + }, + { + "epoch": 4.253086524294206, + "grad_norm": 0.23912928998470306, + "learning_rate": 5.416695887880071e-05, + "loss": 1.4001, + "step": 85670 + }, + { + "epoch": 4.253628465459719, + "grad_norm": 0.27274298667907715, + "learning_rate": 5.41511234564191e-05, + "loss": 1.4137, + "step": 85680 + }, + { + "epoch": 4.254170406625231, + "grad_norm": 0.25264525413513184, + "learning_rate": 5.4135288139192664e-05, + "loss": 1.4097, + "step": 85690 + }, + { + "epoch": 4.2544413772079865, + "eval_loss": 2.4301393032073975, + "eval_runtime": 21.9696, + "eval_samples_per_second": 227.587, + "eval_steps_per_second": 1.229, + "step": 85695 + }, + { + "epoch": 4.254712347790743, + "grad_norm": 0.1786414533853531, + "learning_rate": 5.411945292908296e-05, + "loss": 1.4005, + "step": 85700 + }, + { + "epoch": 4.255254288956255, + "grad_norm": 0.17254769802093506, + "learning_rate": 5.41036178280516e-05, + "loss": 1.4008, + "step": 85710 + }, + { + "epoch": 4.255796230121767, + "grad_norm": 0.31122511625289917, + "learning_rate": 5.408778283806018e-05, + "loss": 1.4009, + "step": 85720 + }, + { + "epoch": 4.25633817128728, + "grad_norm": 0.2594967484474182, + "learning_rate": 5.407194796107027e-05, + "loss": 1.4123, + "step": 85730 + }, + { + "epoch": 4.256880112452792, + "grad_norm": 0.26264312863349915, + "learning_rate": 5.405611319904339e-05, + "loss": 1.39, + "step": 85740 + }, + { + "epoch": 4.2574220536183045, + "grad_norm": 0.18427279591560364, + "learning_rate": 5.404027855394113e-05, + "loss": 1.3978, + "step": 85750 + }, + { + "epoch": 4.257963994783816, + "grad_norm": 0.2185283601284027, + "learning_rate": 5.4024444027725006e-05, + "loss": 1.4082, + "step": 85760 + }, + { + "epoch": 4.258505935949328, + "grad_norm": 0.16532769799232483, + "learning_rate": 5.40086096223565e-05, + "loss": 1.4063, + "step": 85770 + }, + { + "epoch": 4.259047877114841, + "grad_norm": 0.17376570403575897, + "learning_rate": 5.399277533979712e-05, + "loss": 1.4046, + "step": 85780 + }, + { + "epoch": 4.259156265347944, + "eval_loss": 2.4298863410949707, + "eval_runtime": 21.973, + "eval_samples_per_second": 227.552, + "eval_steps_per_second": 1.229, + "step": 85782 + }, + { + "epoch": 4.259589818280353, + "grad_norm": 0.17817267775535583, + "learning_rate": 5.397694118200839e-05, + "loss": 1.4041, + "step": 85790 + }, + { + "epoch": 4.260131759445865, + "grad_norm": 0.21518279612064362, + "learning_rate": 5.396110715095173e-05, + "loss": 1.3957, + "step": 85800 + }, + { + "epoch": 4.2606737006113775, + "grad_norm": 0.1868053376674652, + "learning_rate": 5.3945273248588604e-05, + "loss": 1.3996, + "step": 85810 + }, + { + "epoch": 4.261215641776889, + "grad_norm": 0.16047891974449158, + "learning_rate": 5.392943947688045e-05, + "loss": 1.4125, + "step": 85820 + }, + { + "epoch": 4.261757582942402, + "grad_norm": 0.16372744739055634, + "learning_rate": 5.3913605837788686e-05, + "loss": 1.399, + "step": 85830 + }, + { + "epoch": 4.262299524107914, + "grad_norm": 0.15229913592338562, + "learning_rate": 5.3897772333274696e-05, + "loss": 1.4013, + "step": 85840 + }, + { + "epoch": 4.262841465273426, + "grad_norm": 0.2408658266067505, + "learning_rate": 5.3881938965299916e-05, + "loss": 1.4027, + "step": 85850 + }, + { + "epoch": 4.263383406438939, + "grad_norm": 0.1828664392232895, + "learning_rate": 5.386610573582567e-05, + "loss": 1.398, + "step": 85860 + }, + { + "epoch": 4.2638711534879, + "eval_loss": 2.43064022064209, + "eval_runtime": 21.9716, + "eval_samples_per_second": 227.567, + "eval_steps_per_second": 1.229, + "step": 85869 + }, + { + "epoch": 4.2639253476044505, + "grad_norm": 0.24171289801597595, + "learning_rate": 5.385027264681332e-05, + "loss": 1.4029, + "step": 85870 + }, + { + "epoch": 4.264467288769963, + "grad_norm": 0.19249555468559265, + "learning_rate": 5.38344397002242e-05, + "loss": 1.4006, + "step": 85880 + }, + { + "epoch": 4.265009229935475, + "grad_norm": 0.1631373018026352, + "learning_rate": 5.381860689801963e-05, + "loss": 1.4004, + "step": 85890 + }, + { + "epoch": 4.265551171100987, + "grad_norm": 0.2803032398223877, + "learning_rate": 5.38027742421609e-05, + "loss": 1.4041, + "step": 85900 + }, + { + "epoch": 4.2660931122665, + "grad_norm": 0.1907922625541687, + "learning_rate": 5.378694173460932e-05, + "loss": 1.4021, + "step": 85910 + }, + { + "epoch": 4.266635053432012, + "grad_norm": 0.182045578956604, + "learning_rate": 5.377110937732612e-05, + "loss": 1.4019, + "step": 85920 + }, + { + "epoch": 4.267176994597524, + "grad_norm": 0.17628154158592224, + "learning_rate": 5.3755277172272556e-05, + "loss": 1.3939, + "step": 85930 + }, + { + "epoch": 4.267718935763036, + "grad_norm": 0.18110013008117676, + "learning_rate": 5.3739445121409846e-05, + "loss": 1.4047, + "step": 85940 + }, + { + "epoch": 4.268260876928548, + "grad_norm": 0.2522640824317932, + "learning_rate": 5.372361322669922e-05, + "loss": 1.4082, + "step": 85950 + }, + { + "epoch": 4.268586041627856, + "eval_loss": 2.4356658458709717, + "eval_runtime": 21.9696, + "eval_samples_per_second": 227.588, + "eval_steps_per_second": 1.229, + "step": 85956 + }, + { + "epoch": 4.268802818094061, + "grad_norm": 0.17792746424674988, + "learning_rate": 5.370778149010184e-05, + "loss": 1.3923, + "step": 85960 + }, + { + "epoch": 4.269344759259573, + "grad_norm": 0.20900298655033112, + "learning_rate": 5.3691949913578865e-05, + "loss": 1.3991, + "step": 85970 + }, + { + "epoch": 4.269886700425085, + "grad_norm": 0.18769441545009613, + "learning_rate": 5.3676118499091466e-05, + "loss": 1.4003, + "step": 85980 + }, + { + "epoch": 4.270428641590597, + "grad_norm": 0.20186538994312286, + "learning_rate": 5.366028724860076e-05, + "loss": 1.3982, + "step": 85990 + }, + { + "epoch": 4.270970582756109, + "grad_norm": 0.15831774473190308, + "learning_rate": 5.3644456164067847e-05, + "loss": 1.4019, + "step": 86000 + }, + { + "epoch": 4.271512523921622, + "grad_norm": 0.20776121318340302, + "learning_rate": 5.362862524745382e-05, + "loss": 1.3977, + "step": 86010 + }, + { + "epoch": 4.272054465087134, + "grad_norm": 0.1834535449743271, + "learning_rate": 5.3612794500719745e-05, + "loss": 1.4064, + "step": 86020 + }, + { + "epoch": 4.272596406252646, + "grad_norm": 0.18609929084777832, + "learning_rate": 5.3596963925826657e-05, + "loss": 1.4038, + "step": 86030 + }, + { + "epoch": 4.273138347418159, + "grad_norm": 0.19254687428474426, + "learning_rate": 5.3581133524735614e-05, + "loss": 1.3849, + "step": 86040 + }, + { + "epoch": 4.273300929767812, + "eval_loss": 2.434671640396118, + "eval_runtime": 21.6596, + "eval_samples_per_second": 230.845, + "eval_steps_per_second": 1.247, + "step": 86043 + }, + { + "epoch": 4.27368028858367, + "grad_norm": 0.1640702337026596, + "learning_rate": 5.356530329940757e-05, + "loss": 1.4098, + "step": 86050 + }, + { + "epoch": 4.274222229749183, + "grad_norm": 0.20824840664863586, + "learning_rate": 5.3549473251803514e-05, + "loss": 1.4025, + "step": 86060 + }, + { + "epoch": 4.274764170914695, + "grad_norm": 0.18921475112438202, + "learning_rate": 5.353364338388442e-05, + "loss": 1.399, + "step": 86070 + }, + { + "epoch": 4.275306112080207, + "grad_norm": 0.27768686413764954, + "learning_rate": 5.351781369761124e-05, + "loss": 1.407, + "step": 86080 + }, + { + "epoch": 4.27584805324572, + "grad_norm": 0.2548810839653015, + "learning_rate": 5.350198419494484e-05, + "loss": 1.4034, + "step": 86090 + }, + { + "epoch": 4.276389994411232, + "grad_norm": 0.1735634207725525, + "learning_rate": 5.348615487784614e-05, + "loss": 1.4001, + "step": 86100 + }, + { + "epoch": 4.276931935576744, + "grad_norm": 0.16922864317893982, + "learning_rate": 5.3470325748276004e-05, + "loss": 1.3943, + "step": 86110 + }, + { + "epoch": 4.277473876742256, + "grad_norm": 0.20345307886600494, + "learning_rate": 5.3454496808195264e-05, + "loss": 1.41, + "step": 86120 + }, + { + "epoch": 4.278015817907768, + "grad_norm": 0.20915158092975616, + "learning_rate": 5.343866805956476e-05, + "loss": 1.4036, + "step": 86130 + }, + { + "epoch": 4.278015817907768, + "eval_loss": 2.4313769340515137, + "eval_runtime": 21.964, + "eval_samples_per_second": 227.645, + "eval_steps_per_second": 1.229, + "step": 86130 + }, + { + "epoch": 4.278557759073281, + "grad_norm": 0.2022494673728943, + "learning_rate": 5.342283950434529e-05, + "loss": 1.3995, + "step": 86140 + }, + { + "epoch": 4.279099700238793, + "grad_norm": 0.1643056720495224, + "learning_rate": 5.3407011144497596e-05, + "loss": 1.4006, + "step": 86150 + }, + { + "epoch": 4.2796416414043055, + "grad_norm": 0.23833966255187988, + "learning_rate": 5.339118298198245e-05, + "loss": 1.3962, + "step": 86160 + }, + { + "epoch": 4.280183582569817, + "grad_norm": 0.22995562851428986, + "learning_rate": 5.337535501876057e-05, + "loss": 1.4049, + "step": 86170 + }, + { + "epoch": 4.280725523735329, + "grad_norm": 0.29235386848449707, + "learning_rate": 5.335952725679265e-05, + "loss": 1.3905, + "step": 86180 + }, + { + "epoch": 4.281267464900842, + "grad_norm": 0.3634209632873535, + "learning_rate": 5.334369969803937e-05, + "loss": 1.4187, + "step": 86190 + }, + { + "epoch": 4.281809406066354, + "grad_norm": 0.2359488159418106, + "learning_rate": 5.33278723444614e-05, + "loss": 1.3963, + "step": 86200 + }, + { + "epoch": 4.282351347231867, + "grad_norm": 0.19134867191314697, + "learning_rate": 5.331204519801933e-05, + "loss": 1.4091, + "step": 86210 + }, + { + "epoch": 4.282730706047725, + "eval_loss": 2.434406280517578, + "eval_runtime": 21.9736, + "eval_samples_per_second": 227.546, + "eval_steps_per_second": 1.229, + "step": 86217 + }, + { + "epoch": 4.2828932883973785, + "grad_norm": 0.22415363788604736, + "learning_rate": 5.329621826067377e-05, + "loss": 1.4002, + "step": 86220 + }, + { + "epoch": 4.28343522956289, + "grad_norm": 0.22894415259361267, + "learning_rate": 5.32803915343853e-05, + "loss": 1.4073, + "step": 86230 + }, + { + "epoch": 4.283977170728403, + "grad_norm": 0.21755249798297882, + "learning_rate": 5.3264565021114474e-05, + "loss": 1.3983, + "step": 86240 + }, + { + "epoch": 4.284519111893915, + "grad_norm": 0.22237862646579742, + "learning_rate": 5.324873872282179e-05, + "loss": 1.41, + "step": 86250 + }, + { + "epoch": 4.285061053059427, + "grad_norm": 0.2396155744791031, + "learning_rate": 5.323291264146774e-05, + "loss": 1.3943, + "step": 86260 + }, + { + "epoch": 4.28560299422494, + "grad_norm": 0.1976693719625473, + "learning_rate": 5.321708677901282e-05, + "loss": 1.4141, + "step": 86270 + }, + { + "epoch": 4.2861449353904515, + "grad_norm": 0.23698921501636505, + "learning_rate": 5.320126113741742e-05, + "loss": 1.3918, + "step": 86280 + }, + { + "epoch": 4.286686876555964, + "grad_norm": 0.2332429438829422, + "learning_rate": 5.3185435718641993e-05, + "loss": 1.3988, + "step": 86290 + }, + { + "epoch": 4.287228817721476, + "grad_norm": 0.18696242570877075, + "learning_rate": 5.3169610524646916e-05, + "loss": 1.4141, + "step": 86300 + }, + { + "epoch": 4.287445594187681, + "eval_loss": 2.429657459259033, + "eval_runtime": 21.9725, + "eval_samples_per_second": 227.558, + "eval_steps_per_second": 1.229, + "step": 86304 + }, + { + "epoch": 4.287770758886988, + "grad_norm": 0.15365861356258392, + "learning_rate": 5.315378555739254e-05, + "loss": 1.4071, + "step": 86310 + }, + { + "epoch": 4.288312700052501, + "grad_norm": 0.16417507827281952, + "learning_rate": 5.313796081883918e-05, + "loss": 1.3956, + "step": 86320 + }, + { + "epoch": 4.288854641218013, + "grad_norm": 0.19014696776866913, + "learning_rate": 5.3122136310947194e-05, + "loss": 1.4021, + "step": 86330 + }, + { + "epoch": 4.289396582383525, + "grad_norm": 0.20190609991550446, + "learning_rate": 5.3106312035676766e-05, + "loss": 1.3925, + "step": 86340 + }, + { + "epoch": 4.289938523549037, + "grad_norm": 0.16114814579486847, + "learning_rate": 5.3090487994988184e-05, + "loss": 1.3964, + "step": 86350 + }, + { + "epoch": 4.290480464714549, + "grad_norm": 0.3137826919555664, + "learning_rate": 5.307466419084166e-05, + "loss": 1.4145, + "step": 86360 + }, + { + "epoch": 4.291022405880062, + "grad_norm": 0.23774784803390503, + "learning_rate": 5.3058840625197394e-05, + "loss": 1.4024, + "step": 86370 + }, + { + "epoch": 4.291564347045574, + "grad_norm": 0.23315760493278503, + "learning_rate": 5.304301730001552e-05, + "loss": 1.3947, + "step": 86380 + }, + { + "epoch": 4.292106288211086, + "grad_norm": 0.34956422448158264, + "learning_rate": 5.302719421725615e-05, + "loss": 1.3983, + "step": 86390 + }, + { + "epoch": 4.292160482327637, + "eval_loss": 2.433197498321533, + "eval_runtime": 21.9709, + "eval_samples_per_second": 227.574, + "eval_steps_per_second": 1.229, + "step": 86391 + }, + { + "epoch": 4.292648229376598, + "grad_norm": 0.17017148435115814, + "learning_rate": 5.30113713788794e-05, + "loss": 1.3998, + "step": 86400 + }, + { + "epoch": 4.29319017054211, + "grad_norm": 0.17241883277893066, + "learning_rate": 5.2995548786845336e-05, + "loss": 1.3969, + "step": 86410 + }, + { + "epoch": 4.293732111707623, + "grad_norm": 0.19140473008155823, + "learning_rate": 5.2979726443113985e-05, + "loss": 1.3943, + "step": 86420 + }, + { + "epoch": 4.294274052873135, + "grad_norm": 0.18566745519638062, + "learning_rate": 5.296390434964537e-05, + "loss": 1.4065, + "step": 86430 + }, + { + "epoch": 4.294815994038647, + "grad_norm": 0.19149713218212128, + "learning_rate": 5.294808250839942e-05, + "loss": 1.3848, + "step": 86440 + }, + { + "epoch": 4.2953579352041595, + "grad_norm": 0.2127964198589325, + "learning_rate": 5.2932260921336106e-05, + "loss": 1.4022, + "step": 86450 + }, + { + "epoch": 4.295899876369671, + "grad_norm": 0.17330385744571686, + "learning_rate": 5.291643959041533e-05, + "loss": 1.4034, + "step": 86460 + }, + { + "epoch": 4.296441817535184, + "grad_norm": 0.21223121881484985, + "learning_rate": 5.290061851759698e-05, + "loss": 1.4036, + "step": 86470 + }, + { + "epoch": 4.2968753704675935, + "eval_loss": 2.4309399127960205, + "eval_runtime": 21.9752, + "eval_samples_per_second": 227.529, + "eval_steps_per_second": 1.229, + "step": 86478 + }, + { + "epoch": 4.296983758700696, + "grad_norm": 0.2610706388950348, + "learning_rate": 5.288479770484088e-05, + "loss": 1.4027, + "step": 86480 + }, + { + "epoch": 4.297525699866208, + "grad_norm": 0.159620001912117, + "learning_rate": 5.2868977154106866e-05, + "loss": 1.4052, + "step": 86490 + }, + { + "epoch": 4.298067641031721, + "grad_norm": 0.2618952691555023, + "learning_rate": 5.28531568673547e-05, + "loss": 1.4027, + "step": 86500 + }, + { + "epoch": 4.2986095821972325, + "grad_norm": 0.2610114812850952, + "learning_rate": 5.283733684654415e-05, + "loss": 1.4044, + "step": 86510 + }, + { + "epoch": 4.299151523362745, + "grad_norm": 0.22828315198421478, + "learning_rate": 5.282151709363492e-05, + "loss": 1.4014, + "step": 86520 + }, + { + "epoch": 4.299693464528257, + "grad_norm": 0.1908428966999054, + "learning_rate": 5.2805697610586716e-05, + "loss": 1.4008, + "step": 86530 + }, + { + "epoch": 4.300235405693769, + "grad_norm": 0.21566815674304962, + "learning_rate": 5.278987839935913e-05, + "loss": 1.4012, + "step": 86540 + }, + { + "epoch": 4.300777346859282, + "grad_norm": 0.23819507658481598, + "learning_rate": 5.277405946191183e-05, + "loss": 1.4001, + "step": 86550 + }, + { + "epoch": 4.301319288024794, + "grad_norm": 0.19423484802246094, + "learning_rate": 5.275824080020439e-05, + "loss": 1.4052, + "step": 86560 + }, + { + "epoch": 4.30159025860755, + "eval_loss": 2.4245753288269043, + "eval_runtime": 21.9738, + "eval_samples_per_second": 227.543, + "eval_steps_per_second": 1.229, + "step": 86565 + }, + { + "epoch": 4.3018612291903064, + "grad_norm": 0.19236032664775848, + "learning_rate": 5.2742422416196325e-05, + "loss": 1.3929, + "step": 86570 + }, + { + "epoch": 4.302403170355818, + "grad_norm": 0.1677989512681961, + "learning_rate": 5.272660431184717e-05, + "loss": 1.3995, + "step": 86580 + }, + { + "epoch": 4.30294511152133, + "grad_norm": 0.19151169061660767, + "learning_rate": 5.2710786489116416e-05, + "loss": 1.4002, + "step": 86590 + }, + { + "epoch": 4.303487052686843, + "grad_norm": 0.1588166505098343, + "learning_rate": 5.2694968949963485e-05, + "loss": 1.4059, + "step": 86600 + }, + { + "epoch": 4.304028993852355, + "grad_norm": 0.1966225802898407, + "learning_rate": 5.267915169634779e-05, + "loss": 1.4075, + "step": 86610 + }, + { + "epoch": 4.304570935017868, + "grad_norm": 0.16342392563819885, + "learning_rate": 5.266333473022873e-05, + "loss": 1.4083, + "step": 86620 + }, + { + "epoch": 4.3051128761833795, + "grad_norm": 0.1621856838464737, + "learning_rate": 5.26475180535656e-05, + "loss": 1.3987, + "step": 86630 + }, + { + "epoch": 4.305654817348891, + "grad_norm": 0.17513424158096313, + "learning_rate": 5.2631701668317726e-05, + "loss": 1.4009, + "step": 86640 + }, + { + "epoch": 4.306196758514404, + "grad_norm": 0.20303316414356232, + "learning_rate": 5.261588557644437e-05, + "loss": 1.3863, + "step": 86650 + }, + { + "epoch": 4.306305146747507, + "eval_loss": 2.4262466430664062, + "eval_runtime": 21.9658, + "eval_samples_per_second": 227.627, + "eval_steps_per_second": 1.229, + "step": 86652 + }, + { + "epoch": 4.306738699679916, + "grad_norm": 0.1640668660402298, + "learning_rate": 5.260006977990474e-05, + "loss": 1.4091, + "step": 86660 + }, + { + "epoch": 4.307280640845428, + "grad_norm": 0.16020847856998444, + "learning_rate": 5.258425428065805e-05, + "loss": 1.4089, + "step": 86670 + }, + { + "epoch": 4.307822582010941, + "grad_norm": 0.20218387246131897, + "learning_rate": 5.256843908066346e-05, + "loss": 1.3933, + "step": 86680 + }, + { + "epoch": 4.3083645231764525, + "grad_norm": 0.2258821278810501, + "learning_rate": 5.2552624181880086e-05, + "loss": 1.406, + "step": 86690 + }, + { + "epoch": 4.308906464341965, + "grad_norm": 0.2921973168849945, + "learning_rate": 5.253680958626699e-05, + "loss": 1.3927, + "step": 86700 + }, + { + "epoch": 4.309448405507477, + "grad_norm": 0.29356056451797485, + "learning_rate": 5.252099529578323e-05, + "loss": 1.398, + "step": 86710 + }, + { + "epoch": 4.309990346672989, + "grad_norm": 0.16703596711158752, + "learning_rate": 5.250518131238783e-05, + "loss": 1.4071, + "step": 86720 + }, + { + "epoch": 4.310532287838502, + "grad_norm": 0.1876375526189804, + "learning_rate": 5.248936763803972e-05, + "loss": 1.4027, + "step": 86730 + }, + { + "epoch": 4.311020034887463, + "eval_loss": 2.4287726879119873, + "eval_runtime": 21.9767, + "eval_samples_per_second": 227.513, + "eval_steps_per_second": 1.229, + "step": 86739 + }, + { + "epoch": 4.311074229004014, + "grad_norm": 0.20849046111106873, + "learning_rate": 5.2473554274697846e-05, + "loss": 1.3907, + "step": 86740 + }, + { + "epoch": 4.311616170169526, + "grad_norm": 0.24068142473697662, + "learning_rate": 5.24577412243211e-05, + "loss": 1.3999, + "step": 86750 + }, + { + "epoch": 4.312158111335038, + "grad_norm": 0.3270260691642761, + "learning_rate": 5.244192848886834e-05, + "loss": 1.4041, + "step": 86760 + }, + { + "epoch": 4.31270005250055, + "grad_norm": 0.2979717552661896, + "learning_rate": 5.242611607029836e-05, + "loss": 1.4006, + "step": 86770 + }, + { + "epoch": 4.313241993666063, + "grad_norm": 0.16698554158210754, + "learning_rate": 5.241030397056996e-05, + "loss": 1.4058, + "step": 86780 + }, + { + "epoch": 4.313783934831575, + "grad_norm": 0.16404001414775848, + "learning_rate": 5.2394492191641854e-05, + "loss": 1.4078, + "step": 86790 + }, + { + "epoch": 4.314325875997087, + "grad_norm": 0.16933251917362213, + "learning_rate": 5.237868073547274e-05, + "loss": 1.3915, + "step": 86800 + }, + { + "epoch": 4.314867817162599, + "grad_norm": 0.18435190618038177, + "learning_rate": 5.2362869604021304e-05, + "loss": 1.3983, + "step": 86810 + }, + { + "epoch": 4.315409758328111, + "grad_norm": 0.1849733144044876, + "learning_rate": 5.2347058799246104e-05, + "loss": 1.3964, + "step": 86820 + }, + { + "epoch": 4.315734923027419, + "eval_loss": 2.4305026531219482, + "eval_runtime": 21.9708, + "eval_samples_per_second": 227.574, + "eval_steps_per_second": 1.229, + "step": 86826 + }, + { + "epoch": 4.315951699493624, + "grad_norm": 0.2581421732902527, + "learning_rate": 5.233124832310574e-05, + "loss": 1.3981, + "step": 86830 + }, + { + "epoch": 4.316493640659136, + "grad_norm": 0.19301174581050873, + "learning_rate": 5.2315438177558754e-05, + "loss": 1.3909, + "step": 86840 + }, + { + "epoch": 4.317035581824648, + "grad_norm": 0.3168368637561798, + "learning_rate": 5.229962836456364e-05, + "loss": 1.4017, + "step": 86850 + }, + { + "epoch": 4.3175775229901605, + "grad_norm": 0.30351701378822327, + "learning_rate": 5.2283818886078827e-05, + "loss": 1.3929, + "step": 86860 + }, + { + "epoch": 4.318119464155672, + "grad_norm": 0.2167241871356964, + "learning_rate": 5.226800974406274e-05, + "loss": 1.4048, + "step": 86870 + }, + { + "epoch": 4.318661405321185, + "grad_norm": 0.24982495605945587, + "learning_rate": 5.225220094047375e-05, + "loss": 1.4042, + "step": 86880 + }, + { + "epoch": 4.319203346486697, + "grad_norm": 0.15455639362335205, + "learning_rate": 5.2236392477270165e-05, + "loss": 1.4019, + "step": 86890 + }, + { + "epoch": 4.319745287652209, + "grad_norm": 0.20189005136489868, + "learning_rate": 5.22205843564103e-05, + "loss": 1.3887, + "step": 86900 + }, + { + "epoch": 4.320287228817722, + "grad_norm": 0.17087921500205994, + "learning_rate": 5.2204776579852375e-05, + "loss": 1.403, + "step": 86910 + }, + { + "epoch": 4.320449811167375, + "eval_loss": 2.429542303085327, + "eval_runtime": 21.9702, + "eval_samples_per_second": 227.581, + "eval_steps_per_second": 1.229, + "step": 86913 + }, + { + "epoch": 4.3208291699832335, + "grad_norm": 0.19037020206451416, + "learning_rate": 5.218896914955459e-05, + "loss": 1.3954, + "step": 86920 + }, + { + "epoch": 4.321371111148746, + "grad_norm": 0.29596278071403503, + "learning_rate": 5.217316206747509e-05, + "loss": 1.3994, + "step": 86930 + }, + { + "epoch": 4.321913052314258, + "grad_norm": 0.2806200385093689, + "learning_rate": 5.2157355335572024e-05, + "loss": 1.4008, + "step": 86940 + }, + { + "epoch": 4.32245499347977, + "grad_norm": 0.19318336248397827, + "learning_rate": 5.214154895580342e-05, + "loss": 1.4017, + "step": 86950 + }, + { + "epoch": 4.322996934645283, + "grad_norm": 0.266025185585022, + "learning_rate": 5.2125742930127316e-05, + "loss": 1.3989, + "step": 86960 + }, + { + "epoch": 4.323538875810795, + "grad_norm": 0.20118410885334015, + "learning_rate": 5.210993726050173e-05, + "loss": 1.4009, + "step": 86970 + }, + { + "epoch": 4.324080816976307, + "grad_norm": 0.3275969922542572, + "learning_rate": 5.209413194888453e-05, + "loss": 1.3998, + "step": 86980 + }, + { + "epoch": 4.324622758141819, + "grad_norm": 0.22696785628795624, + "learning_rate": 5.2078326997233665e-05, + "loss": 1.4047, + "step": 86990 + }, + { + "epoch": 4.325164699307331, + "grad_norm": 0.1600492149591446, + "learning_rate": 5.206252240750695e-05, + "loss": 1.392, + "step": 87000 + }, + { + "epoch": 4.325164699307331, + "eval_loss": 2.4212775230407715, + "eval_runtime": 21.9652, + "eval_samples_per_second": 227.633, + "eval_steps_per_second": 1.229, + "step": 87000 + }, + { + "epoch": 4.325706640472844, + "grad_norm": 0.17092597484588623, + "learning_rate": 5.204671818166225e-05, + "loss": 1.3931, + "step": 87010 + }, + { + "epoch": 4.326248581638356, + "grad_norm": 0.3140299320220947, + "learning_rate": 5.203091432165724e-05, + "loss": 1.3986, + "step": 87020 + }, + { + "epoch": 4.3267905228038686, + "grad_norm": 0.2115614265203476, + "learning_rate": 5.201511082944968e-05, + "loss": 1.4102, + "step": 87030 + }, + { + "epoch": 4.32733246396938, + "grad_norm": 0.15958638489246368, + "learning_rate": 5.199930770699725e-05, + "loss": 1.4012, + "step": 87040 + }, + { + "epoch": 4.327874405134892, + "grad_norm": 0.15782247483730316, + "learning_rate": 5.198350495625753e-05, + "loss": 1.4026, + "step": 87050 + }, + { + "epoch": 4.328416346300405, + "grad_norm": 0.32305723428726196, + "learning_rate": 5.1967702579188125e-05, + "loss": 1.3988, + "step": 87060 + }, + { + "epoch": 4.328958287465917, + "grad_norm": 0.23694272339344025, + "learning_rate": 5.1951900577746584e-05, + "loss": 1.4041, + "step": 87070 + }, + { + "epoch": 4.329500228631429, + "grad_norm": 0.21021704375743866, + "learning_rate": 5.1936098953890335e-05, + "loss": 1.3879, + "step": 87080 + }, + { + "epoch": 4.329879587447287, + "eval_loss": 2.4319331645965576, + "eval_runtime": 21.9684, + "eval_samples_per_second": 227.6, + "eval_steps_per_second": 1.229, + "step": 87087 + }, + { + "epoch": 4.330042169796942, + "grad_norm": 0.2358713448047638, + "learning_rate": 5.192029770957685e-05, + "loss": 1.3971, + "step": 87090 + }, + { + "epoch": 4.330584110962453, + "grad_norm": 0.19418524205684662, + "learning_rate": 5.1904496846763536e-05, + "loss": 1.4053, + "step": 87100 + }, + { + "epoch": 4.331126052127966, + "grad_norm": 0.21557055413722992, + "learning_rate": 5.1888696367407696e-05, + "loss": 1.408, + "step": 87110 + }, + { + "epoch": 4.331667993293478, + "grad_norm": 0.15955239534378052, + "learning_rate": 5.1872896273466645e-05, + "loss": 1.4037, + "step": 87120 + }, + { + "epoch": 4.33220993445899, + "grad_norm": 0.24472442269325256, + "learning_rate": 5.1857096566897614e-05, + "loss": 1.3886, + "step": 87130 + }, + { + "epoch": 4.332751875624503, + "grad_norm": 0.24180065095424652, + "learning_rate": 5.184129724965784e-05, + "loss": 1.3925, + "step": 87140 + }, + { + "epoch": 4.333293816790015, + "grad_norm": 0.3072583079338074, + "learning_rate": 5.1825498323704423e-05, + "loss": 1.4073, + "step": 87150 + }, + { + "epoch": 4.333835757955527, + "grad_norm": 0.36256688833236694, + "learning_rate": 5.1809699790994505e-05, + "loss": 1.4036, + "step": 87160 + }, + { + "epoch": 4.334377699121039, + "grad_norm": 0.3161357641220093, + "learning_rate": 5.179390165348514e-05, + "loss": 1.3945, + "step": 87170 + }, + { + "epoch": 4.334594475587244, + "eval_loss": 2.4241182804107666, + "eval_runtime": 21.9706, + "eval_samples_per_second": 227.577, + "eval_steps_per_second": 1.229, + "step": 87174 + }, + { + "epoch": 4.334919640286551, + "grad_norm": 0.1909865289926529, + "learning_rate": 5.177810391313329e-05, + "loss": 1.4059, + "step": 87180 + }, + { + "epoch": 4.335461581452064, + "grad_norm": 0.2220858484506607, + "learning_rate": 5.176230657189596e-05, + "loss": 1.4005, + "step": 87190 + }, + { + "epoch": 4.336003522617576, + "grad_norm": 0.23918281495571136, + "learning_rate": 5.1746509631730035e-05, + "loss": 1.3936, + "step": 87200 + }, + { + "epoch": 4.336545463783088, + "grad_norm": 0.25223907828330994, + "learning_rate": 5.173071309459236e-05, + "loss": 1.4003, + "step": 87210 + }, + { + "epoch": 4.3370874049486, + "grad_norm": 0.1692775934934616, + "learning_rate": 5.171491696243976e-05, + "loss": 1.4004, + "step": 87220 + }, + { + "epoch": 4.337629346114112, + "grad_norm": 0.22520475089550018, + "learning_rate": 5.1699121237228984e-05, + "loss": 1.4074, + "step": 87230 + }, + { + "epoch": 4.338171287279625, + "grad_norm": 0.23879382014274597, + "learning_rate": 5.168332592091673e-05, + "loss": 1.3965, + "step": 87240 + }, + { + "epoch": 4.338713228445137, + "grad_norm": 0.18485189974308014, + "learning_rate": 5.166753101545967e-05, + "loss": 1.4024, + "step": 87250 + }, + { + "epoch": 4.339255169610649, + "grad_norm": 0.2033102959394455, + "learning_rate": 5.165173652281441e-05, + "loss": 1.4067, + "step": 87260 + }, + { + "epoch": 4.3393093637272, + "eval_loss": 2.4278345108032227, + "eval_runtime": 21.973, + "eval_samples_per_second": 227.552, + "eval_steps_per_second": 1.229, + "step": 87261 + }, + { + "epoch": 4.3397971107761615, + "grad_norm": 0.16319087147712708, + "learning_rate": 5.163594244493748e-05, + "loss": 1.4038, + "step": 87270 + }, + { + "epoch": 4.340339051941673, + "grad_norm": 0.18227989971637726, + "learning_rate": 5.1620148783785385e-05, + "loss": 1.4148, + "step": 87280 + }, + { + "epoch": 4.340880993107186, + "grad_norm": 0.17856183648109436, + "learning_rate": 5.1604355541314586e-05, + "loss": 1.4062, + "step": 87290 + }, + { + "epoch": 4.341422934272698, + "grad_norm": 0.21139036118984222, + "learning_rate": 5.15885627194815e-05, + "loss": 1.399, + "step": 87300 + }, + { + "epoch": 4.34196487543821, + "grad_norm": 0.3597468435764313, + "learning_rate": 5.157277032024245e-05, + "loss": 1.3997, + "step": 87310 + }, + { + "epoch": 4.342506816603723, + "grad_norm": 0.18101923167705536, + "learning_rate": 5.155697834555372e-05, + "loss": 1.3952, + "step": 87320 + }, + { + "epoch": 4.3430487577692345, + "grad_norm": 0.20467126369476318, + "learning_rate": 5.154118679737158e-05, + "loss": 1.4021, + "step": 87330 + }, + { + "epoch": 4.343590698934747, + "grad_norm": 0.2586832046508789, + "learning_rate": 5.152539567765219e-05, + "loss": 1.4073, + "step": 87340 + }, + { + "epoch": 4.344024251867157, + "eval_loss": 2.429697036743164, + "eval_runtime": 21.9753, + "eval_samples_per_second": 227.528, + "eval_steps_per_second": 1.229, + "step": 87348 + }, + { + "epoch": 4.344132640100259, + "grad_norm": 0.20841944217681885, + "learning_rate": 5.150960498835169e-05, + "loss": 1.3959, + "step": 87350 + }, + { + "epoch": 4.344674581265771, + "grad_norm": 0.15651927888393402, + "learning_rate": 5.149381473142621e-05, + "loss": 1.4009, + "step": 87360 + }, + { + "epoch": 4.345216522431284, + "grad_norm": 0.22933441400527954, + "learning_rate": 5.147802490883169e-05, + "loss": 1.3964, + "step": 87370 + }, + { + "epoch": 4.345758463596796, + "grad_norm": 0.2737453281879425, + "learning_rate": 5.146223552252416e-05, + "loss": 1.3928, + "step": 87380 + }, + { + "epoch": 4.346300404762308, + "grad_norm": 0.22717493772506714, + "learning_rate": 5.144644657445957e-05, + "loss": 1.3937, + "step": 87390 + }, + { + "epoch": 4.34684234592782, + "grad_norm": 0.16106519103050232, + "learning_rate": 5.143065806659373e-05, + "loss": 1.4036, + "step": 87400 + }, + { + "epoch": 4.347384287093332, + "grad_norm": 0.2053665816783905, + "learning_rate": 5.141487000088245e-05, + "loss": 1.4002, + "step": 87410 + }, + { + "epoch": 4.347926228258845, + "grad_norm": 0.14804090559482574, + "learning_rate": 5.139908237928155e-05, + "loss": 1.3886, + "step": 87420 + }, + { + "epoch": 4.348468169424357, + "grad_norm": 0.2843748927116394, + "learning_rate": 5.138329520374666e-05, + "loss": 1.4034, + "step": 87430 + }, + { + "epoch": 4.348739140007113, + "eval_loss": 2.4298503398895264, + "eval_runtime": 21.9707, + "eval_samples_per_second": 227.576, + "eval_steps_per_second": 1.229, + "step": 87435 + }, + { + "epoch": 4.3490101105898695, + "grad_norm": 0.15882396697998047, + "learning_rate": 5.1367508476233474e-05, + "loss": 1.3942, + "step": 87440 + }, + { + "epoch": 4.349552051755381, + "grad_norm": 0.23637013137340546, + "learning_rate": 5.135172219869755e-05, + "loss": 1.4045, + "step": 87450 + }, + { + "epoch": 4.350093992920893, + "grad_norm": 0.20774583518505096, + "learning_rate": 5.1335936373094475e-05, + "loss": 1.3984, + "step": 87460 + }, + { + "epoch": 4.350635934086406, + "grad_norm": 0.21993571519851685, + "learning_rate": 5.132015100137966e-05, + "loss": 1.4071, + "step": 87470 + }, + { + "epoch": 4.351177875251918, + "grad_norm": 0.16270707547664642, + "learning_rate": 5.130436608550856e-05, + "loss": 1.3982, + "step": 87480 + }, + { + "epoch": 4.35171981641743, + "grad_norm": 0.16741152107715607, + "learning_rate": 5.128858162743658e-05, + "loss": 1.3972, + "step": 87490 + }, + { + "epoch": 4.3522617575829425, + "grad_norm": 0.19168831408023834, + "learning_rate": 5.1272797629118976e-05, + "loss": 1.4038, + "step": 87500 + }, + { + "epoch": 4.352803698748454, + "grad_norm": 0.18986192345619202, + "learning_rate": 5.1257014092511e-05, + "loss": 1.4015, + "step": 87510 + }, + { + "epoch": 4.353345639913967, + "grad_norm": 0.15182600915431976, + "learning_rate": 5.12412310195679e-05, + "loss": 1.3979, + "step": 87520 + }, + { + "epoch": 4.35345402814707, + "eval_loss": 2.431755542755127, + "eval_runtime": 21.9949, + "eval_samples_per_second": 227.325, + "eval_steps_per_second": 1.228, + "step": 87522 + }, + { + "epoch": 4.353887581079479, + "grad_norm": 0.23063354194164276, + "learning_rate": 5.122544841224476e-05, + "loss": 1.3934, + "step": 87530 + }, + { + "epoch": 4.354429522244991, + "grad_norm": 0.2470499873161316, + "learning_rate": 5.120966627249669e-05, + "loss": 1.4009, + "step": 87540 + }, + { + "epoch": 4.354971463410504, + "grad_norm": 0.15349188446998596, + "learning_rate": 5.119388460227872e-05, + "loss": 1.3953, + "step": 87550 + }, + { + "epoch": 4.3555134045760155, + "grad_norm": 0.2129431664943695, + "learning_rate": 5.1178103403545794e-05, + "loss": 1.3964, + "step": 87560 + }, + { + "epoch": 4.356055345741528, + "grad_norm": 0.16350790858268738, + "learning_rate": 5.116232267825279e-05, + "loss": 1.3981, + "step": 87570 + }, + { + "epoch": 4.35659728690704, + "grad_norm": 0.19389735162258148, + "learning_rate": 5.114654242835465e-05, + "loss": 1.4019, + "step": 87580 + }, + { + "epoch": 4.357139228072552, + "grad_norm": 0.2006743848323822, + "learning_rate": 5.113076265580606e-05, + "loss": 1.3975, + "step": 87590 + }, + { + "epoch": 4.357681169238065, + "grad_norm": 0.1707877218723297, + "learning_rate": 5.111498336256181e-05, + "loss": 1.403, + "step": 87600 + }, + { + "epoch": 4.358168916287026, + "eval_loss": 2.4279067516326904, + "eval_runtime": 21.9698, + "eval_samples_per_second": 227.585, + "eval_steps_per_second": 1.229, + "step": 87609 + }, + { + "epoch": 4.358223110403577, + "grad_norm": 0.2142249345779419, + "learning_rate": 5.109920455057655e-05, + "loss": 1.403, + "step": 87610 + }, + { + "epoch": 4.358765051569089, + "grad_norm": 0.1643165647983551, + "learning_rate": 5.10834262218049e-05, + "loss": 1.3975, + "step": 87620 + }, + { + "epoch": 4.359306992734601, + "grad_norm": 0.20058096945285797, + "learning_rate": 5.106764837820141e-05, + "loss": 1.4049, + "step": 87630 + }, + { + "epoch": 4.359848933900113, + "grad_norm": 0.17463991045951843, + "learning_rate": 5.1051871021720546e-05, + "loss": 1.4036, + "step": 87640 + }, + { + "epoch": 4.360390875065626, + "grad_norm": 0.20207591354846954, + "learning_rate": 5.103609415431678e-05, + "loss": 1.4025, + "step": 87650 + }, + { + "epoch": 4.360932816231138, + "grad_norm": 0.1925676167011261, + "learning_rate": 5.102031777794446e-05, + "loss": 1.3931, + "step": 87660 + }, + { + "epoch": 4.36147475739665, + "grad_norm": 0.2542779743671417, + "learning_rate": 5.100454189455787e-05, + "loss": 1.3996, + "step": 87670 + }, + { + "epoch": 4.3620166985621625, + "grad_norm": 0.21328918635845184, + "learning_rate": 5.0988766506111316e-05, + "loss": 1.398, + "step": 87680 + }, + { + "epoch": 4.362558639727674, + "grad_norm": 0.23086091876029968, + "learning_rate": 5.097299161455893e-05, + "loss": 1.4014, + "step": 87690 + }, + { + "epoch": 4.362883804426982, + "eval_loss": 2.4321718215942383, + "eval_runtime": 21.9729, + "eval_samples_per_second": 227.553, + "eval_steps_per_second": 1.229, + "step": 87696 + }, + { + "epoch": 4.363100580893187, + "grad_norm": 0.15713347494602203, + "learning_rate": 5.095721722185487e-05, + "loss": 1.3932, + "step": 87700 + }, + { + "epoch": 4.363642522058699, + "grad_norm": 0.16743972897529602, + "learning_rate": 5.0941443329953185e-05, + "loss": 1.399, + "step": 87710 + }, + { + "epoch": 4.364184463224211, + "grad_norm": 0.15407240390777588, + "learning_rate": 5.0925669940807885e-05, + "loss": 1.4004, + "step": 87720 + }, + { + "epoch": 4.364726404389724, + "grad_norm": 0.1776094287633896, + "learning_rate": 5.090989705637289e-05, + "loss": 1.3983, + "step": 87730 + }, + { + "epoch": 4.3652683455552355, + "grad_norm": 0.16441775858402252, + "learning_rate": 5.0894124678602116e-05, + "loss": 1.3959, + "step": 87740 + }, + { + "epoch": 4.365810286720748, + "grad_norm": 0.21521979570388794, + "learning_rate": 5.0878352809449336e-05, + "loss": 1.3909, + "step": 87750 + }, + { + "epoch": 4.36635222788626, + "grad_norm": 0.1725737303495407, + "learning_rate": 5.086258145086831e-05, + "loss": 1.3974, + "step": 87760 + }, + { + "epoch": 4.366894169051772, + "grad_norm": 0.1918652206659317, + "learning_rate": 5.084681060481271e-05, + "loss": 1.3925, + "step": 87770 + }, + { + "epoch": 4.367436110217285, + "grad_norm": 0.16911828517913818, + "learning_rate": 5.083104027323623e-05, + "loss": 1.391, + "step": 87780 + }, + { + "epoch": 4.367598692566938, + "eval_loss": 2.4334969520568848, + "eval_runtime": 21.9702, + "eval_samples_per_second": 227.581, + "eval_steps_per_second": 1.229, + "step": 87783 + }, + { + "epoch": 4.367978051382797, + "grad_norm": 0.16556398570537567, + "learning_rate": 5.081527045809236e-05, + "loss": 1.3964, + "step": 87790 + }, + { + "epoch": 4.368519992548309, + "grad_norm": 0.17277011275291443, + "learning_rate": 5.0799501161334606e-05, + "loss": 1.3978, + "step": 87800 + }, + { + "epoch": 4.369061933713821, + "grad_norm": 0.17669574916362762, + "learning_rate": 5.078373238491642e-05, + "loss": 1.4088, + "step": 87810 + }, + { + "epoch": 4.369603874879333, + "grad_norm": 0.19774076342582703, + "learning_rate": 5.076796413079116e-05, + "loss": 1.4003, + "step": 87820 + }, + { + "epoch": 4.370145816044846, + "grad_norm": 0.22697632014751434, + "learning_rate": 5.075219640091212e-05, + "loss": 1.3944, + "step": 87830 + }, + { + "epoch": 4.370687757210358, + "grad_norm": 0.19095872342586517, + "learning_rate": 5.0736429197232574e-05, + "loss": 1.3905, + "step": 87840 + }, + { + "epoch": 4.3712296983758705, + "grad_norm": 0.2845691740512848, + "learning_rate": 5.072066252170565e-05, + "loss": 1.4046, + "step": 87850 + }, + { + "epoch": 4.371771639541382, + "grad_norm": 0.20539964735507965, + "learning_rate": 5.070489637628447e-05, + "loss": 1.3944, + "step": 87860 + }, + { + "epoch": 4.372313580706894, + "grad_norm": 0.22066408395767212, + "learning_rate": 5.06891307629221e-05, + "loss": 1.3861, + "step": 87870 + }, + { + "epoch": 4.372313580706894, + "eval_loss": 2.430873394012451, + "eval_runtime": 22.02, + "eval_samples_per_second": 227.067, + "eval_steps_per_second": 1.226, + "step": 87870 + }, + { + "epoch": 4.372855521872407, + "grad_norm": 0.25216028094291687, + "learning_rate": 5.067336568357147e-05, + "loss": 1.3944, + "step": 87880 + }, + { + "epoch": 4.373397463037919, + "grad_norm": 0.16091406345367432, + "learning_rate": 5.065760114018553e-05, + "loss": 1.4024, + "step": 87890 + }, + { + "epoch": 4.373939404203432, + "grad_norm": 0.19635522365570068, + "learning_rate": 5.0641837134717095e-05, + "loss": 1.409, + "step": 87900 + }, + { + "epoch": 4.3744813453689435, + "grad_norm": 0.24364635348320007, + "learning_rate": 5.062607366911897e-05, + "loss": 1.3902, + "step": 87910 + }, + { + "epoch": 4.375023286534455, + "grad_norm": 0.24988219141960144, + "learning_rate": 5.0610310745343836e-05, + "loss": 1.4047, + "step": 87920 + }, + { + "epoch": 4.375565227699968, + "grad_norm": 0.18812812864780426, + "learning_rate": 5.0594548365344354e-05, + "loss": 1.3898, + "step": 87930 + }, + { + "epoch": 4.37610716886548, + "grad_norm": 0.3591182231903076, + "learning_rate": 5.057878653107311e-05, + "loss": 1.4025, + "step": 87940 + }, + { + "epoch": 4.376649110030992, + "grad_norm": 0.23308515548706055, + "learning_rate": 5.0563025244482574e-05, + "loss": 1.4003, + "step": 87950 + }, + { + "epoch": 4.37702846884685, + "eval_loss": 2.4251065254211426, + "eval_runtime": 21.9772, + "eval_samples_per_second": 227.509, + "eval_steps_per_second": 1.229, + "step": 87957 + }, + { + "epoch": 4.377191051196505, + "grad_norm": 0.17095594108104706, + "learning_rate": 5.054726450752521e-05, + "loss": 1.4007, + "step": 87960 + }, + { + "epoch": 4.3777329923620165, + "grad_norm": 0.19432257115840912, + "learning_rate": 5.05315043221534e-05, + "loss": 1.4043, + "step": 87970 + }, + { + "epoch": 4.378274933527529, + "grad_norm": 0.18219472467899323, + "learning_rate": 5.051574469031942e-05, + "loss": 1.4022, + "step": 87980 + }, + { + "epoch": 4.378816874693041, + "grad_norm": 0.17361074686050415, + "learning_rate": 5.049998561397552e-05, + "loss": 1.3977, + "step": 87990 + }, + { + "epoch": 4.379358815858553, + "grad_norm": 0.21151654422283173, + "learning_rate": 5.0484227095073865e-05, + "loss": 1.3887, + "step": 88000 + }, + { + "epoch": 4.379900757024066, + "grad_norm": 0.16382147371768951, + "learning_rate": 5.0468469135566554e-05, + "loss": 1.3901, + "step": 88010 + }, + { + "epoch": 4.380442698189578, + "grad_norm": 0.1616712063550949, + "learning_rate": 5.045271173740562e-05, + "loss": 1.4068, + "step": 88020 + }, + { + "epoch": 4.38098463935509, + "grad_norm": 0.17013293504714966, + "learning_rate": 5.043695490254302e-05, + "loss": 1.3819, + "step": 88030 + }, + { + "epoch": 4.381526580520602, + "grad_norm": 0.1871418058872223, + "learning_rate": 5.0421198632930624e-05, + "loss": 1.4056, + "step": 88040 + }, + { + "epoch": 4.381743356986807, + "eval_loss": 2.4317269325256348, + "eval_runtime": 21.9684, + "eval_samples_per_second": 227.6, + "eval_steps_per_second": 1.229, + "step": 88044 + }, + { + "epoch": 4.382068521686114, + "grad_norm": 0.2265293151140213, + "learning_rate": 5.0405442930520253e-05, + "loss": 1.3987, + "step": 88050 + }, + { + "epoch": 4.382610462851627, + "grad_norm": 0.26006123423576355, + "learning_rate": 5.0389687797263664e-05, + "loss": 1.397, + "step": 88060 + }, + { + "epoch": 4.383152404017139, + "grad_norm": 0.19830361008644104, + "learning_rate": 5.037393323511256e-05, + "loss": 1.421, + "step": 88070 + }, + { + "epoch": 4.383694345182651, + "grad_norm": 0.2708401083946228, + "learning_rate": 5.03581792460185e-05, + "loss": 1.3956, + "step": 88080 + }, + { + "epoch": 4.384236286348163, + "grad_norm": 0.18328110873699188, + "learning_rate": 5.034242583193305e-05, + "loss": 1.3926, + "step": 88090 + }, + { + "epoch": 4.384778227513675, + "grad_norm": 0.19660696387290955, + "learning_rate": 5.032667299480767e-05, + "loss": 1.386, + "step": 88100 + }, + { + "epoch": 4.385320168679188, + "grad_norm": 0.21113744378089905, + "learning_rate": 5.0310920736593737e-05, + "loss": 1.4048, + "step": 88110 + }, + { + "epoch": 4.3858621098447, + "grad_norm": 0.22649426758289337, + "learning_rate": 5.029516905924258e-05, + "loss": 1.3993, + "step": 88120 + }, + { + "epoch": 4.386404051010212, + "grad_norm": 0.16807417571544647, + "learning_rate": 5.02794179647055e-05, + "loss": 1.3882, + "step": 88130 + }, + { + "epoch": 4.3864582451267635, + "eval_loss": 2.411639451980591, + "eval_runtime": 21.9652, + "eval_samples_per_second": 227.633, + "eval_steps_per_second": 1.229, + "step": 88131 + }, + { + "epoch": 4.386945992175725, + "grad_norm": 0.18633247911930084, + "learning_rate": 5.026366745493359e-05, + "loss": 1.4067, + "step": 88140 + }, + { + "epoch": 4.387487933341236, + "grad_norm": 0.16578739881515503, + "learning_rate": 5.0247917531877985e-05, + "loss": 1.3962, + "step": 88150 + }, + { + "epoch": 4.388029874506749, + "grad_norm": 0.23297452926635742, + "learning_rate": 5.0232168197489746e-05, + "loss": 1.4044, + "step": 88160 + }, + { + "epoch": 4.388571815672261, + "grad_norm": 0.24161213636398315, + "learning_rate": 5.02164194537198e-05, + "loss": 1.4081, + "step": 88170 + }, + { + "epoch": 4.389113756837773, + "grad_norm": 0.20169183611869812, + "learning_rate": 5.020067130251904e-05, + "loss": 1.3996, + "step": 88180 + }, + { + "epoch": 4.389655698003286, + "grad_norm": 0.21233811974525452, + "learning_rate": 5.01849237458383e-05, + "loss": 1.4046, + "step": 88190 + }, + { + "epoch": 4.390197639168798, + "grad_norm": 0.15420041978359222, + "learning_rate": 5.016917678562828e-05, + "loss": 1.4007, + "step": 88200 + }, + { + "epoch": 4.39073958033431, + "grad_norm": 0.1760631501674652, + "learning_rate": 5.0153430423839676e-05, + "loss": 1.3998, + "step": 88210 + }, + { + "epoch": 4.39117313326672, + "eval_loss": 2.422487258911133, + "eval_runtime": 21.9702, + "eval_samples_per_second": 227.581, + "eval_steps_per_second": 1.229, + "step": 88218 + }, + { + "epoch": 4.391281521499822, + "grad_norm": 0.18712304532527924, + "learning_rate": 5.013768466242307e-05, + "loss": 1.403, + "step": 88220 + }, + { + "epoch": 4.391823462665334, + "grad_norm": 0.1596558541059494, + "learning_rate": 5.0121939503328985e-05, + "loss": 1.3987, + "step": 88230 + }, + { + "epoch": 4.392365403830847, + "grad_norm": 0.1593562513589859, + "learning_rate": 5.010619494850785e-05, + "loss": 1.4045, + "step": 88240 + }, + { + "epoch": 4.392907344996359, + "grad_norm": 0.18083150684833527, + "learning_rate": 5.0090450999910035e-05, + "loss": 1.3972, + "step": 88250 + }, + { + "epoch": 4.3934492861618715, + "grad_norm": 0.16882987320423126, + "learning_rate": 5.007470765948584e-05, + "loss": 1.4041, + "step": 88260 + }, + { + "epoch": 4.393991227327383, + "grad_norm": 0.17636427283287048, + "learning_rate": 5.005896492918547e-05, + "loss": 1.3923, + "step": 88270 + }, + { + "epoch": 4.394533168492895, + "grad_norm": 0.16823971271514893, + "learning_rate": 5.004322281095907e-05, + "loss": 1.395, + "step": 88280 + }, + { + "epoch": 4.395075109658408, + "grad_norm": 0.1983443945646286, + "learning_rate": 5.002748130675672e-05, + "loss": 1.3915, + "step": 88290 + }, + { + "epoch": 4.39561705082392, + "grad_norm": 0.16008202731609344, + "learning_rate": 5.001174041852839e-05, + "loss": 1.3959, + "step": 88300 + }, + { + "epoch": 4.395888021406676, + "eval_loss": 2.420740842819214, + "eval_runtime": 21.9717, + "eval_samples_per_second": 227.565, + "eval_steps_per_second": 1.229, + "step": 88305 + }, + { + "epoch": 4.396158991989433, + "grad_norm": 0.1860068142414093, + "learning_rate": 4.9996000148224e-05, + "loss": 1.3967, + "step": 88310 + }, + { + "epoch": 4.3967009331549445, + "grad_norm": 0.2595568001270294, + "learning_rate": 4.99802604977934e-05, + "loss": 1.3915, + "step": 88320 + }, + { + "epoch": 4.397242874320456, + "grad_norm": 0.16008210182189941, + "learning_rate": 4.996452146918632e-05, + "loss": 1.396, + "step": 88330 + }, + { + "epoch": 4.397784815485969, + "grad_norm": 0.16884075105190277, + "learning_rate": 4.994878306435245e-05, + "loss": 1.3935, + "step": 88340 + }, + { + "epoch": 4.398326756651481, + "grad_norm": 0.17851825058460236, + "learning_rate": 4.993304528524143e-05, + "loss": 1.3962, + "step": 88350 + }, + { + "epoch": 4.398868697816993, + "grad_norm": 0.16581206023693085, + "learning_rate": 4.9917308133802745e-05, + "loss": 1.399, + "step": 88360 + }, + { + "epoch": 4.399410638982506, + "grad_norm": 0.2509933412075043, + "learning_rate": 4.9901571611985855e-05, + "loss": 1.3986, + "step": 88370 + }, + { + "epoch": 4.3999525801480175, + "grad_norm": 0.1800008863210678, + "learning_rate": 4.988583572174015e-05, + "loss": 1.4032, + "step": 88380 + }, + { + "epoch": 4.40049452131353, + "grad_norm": 0.19821348786354065, + "learning_rate": 4.987010046501491e-05, + "loss": 1.3933, + "step": 88390 + }, + { + "epoch": 4.400602909546633, + "eval_loss": 2.4244353771209717, + "eval_runtime": 22.0054, + "eval_samples_per_second": 227.217, + "eval_steps_per_second": 1.227, + "step": 88392 + }, + { + "epoch": 4.401036462479042, + "grad_norm": 0.18530848622322083, + "learning_rate": 4.9854365843759354e-05, + "loss": 1.405, + "step": 88400 + }, + { + "epoch": 4.401578403644554, + "grad_norm": 0.1724652349948883, + "learning_rate": 4.983863185992261e-05, + "loss": 1.3891, + "step": 88410 + }, + { + "epoch": 4.402120344810067, + "grad_norm": 0.2382229119539261, + "learning_rate": 4.982289851545376e-05, + "loss": 1.3989, + "step": 88420 + }, + { + "epoch": 4.402662285975579, + "grad_norm": 0.22667750716209412, + "learning_rate": 4.980716581230176e-05, + "loss": 1.3992, + "step": 88430 + }, + { + "epoch": 4.403204227141091, + "grad_norm": 0.2111603170633316, + "learning_rate": 4.9791433752415494e-05, + "loss": 1.3973, + "step": 88440 + }, + { + "epoch": 4.403746168306603, + "grad_norm": 0.16338708996772766, + "learning_rate": 4.977570233774382e-05, + "loss": 1.4002, + "step": 88450 + }, + { + "epoch": 4.404288109472115, + "grad_norm": 0.27522462606430054, + "learning_rate": 4.9759971570235454e-05, + "loss": 1.4011, + "step": 88460 + }, + { + "epoch": 4.404830050637628, + "grad_norm": 0.2958241105079651, + "learning_rate": 4.9744241451839056e-05, + "loss": 1.4013, + "step": 88470 + }, + { + "epoch": 4.405317797686589, + "eval_loss": 2.416937828063965, + "eval_runtime": 22.0681, + "eval_samples_per_second": 226.571, + "eval_steps_per_second": 1.223, + "step": 88479 + }, + { + "epoch": 4.40537199180314, + "grad_norm": 0.1971891224384308, + "learning_rate": 4.9728511984503223e-05, + "loss": 1.3898, + "step": 88480 + }, + { + "epoch": 4.405913932968652, + "grad_norm": 0.18956254422664642, + "learning_rate": 4.971278317017642e-05, + "loss": 1.3957, + "step": 88490 + }, + { + "epoch": 4.406455874134164, + "grad_norm": 0.2962309420108795, + "learning_rate": 4.969705501080709e-05, + "loss": 1.3972, + "step": 88500 + }, + { + "epoch": 4.406997815299676, + "grad_norm": 0.2366226315498352, + "learning_rate": 4.968132750834358e-05, + "loss": 1.4011, + "step": 88510 + }, + { + "epoch": 4.407539756465189, + "grad_norm": 0.25303101539611816, + "learning_rate": 4.9665600664734104e-05, + "loss": 1.3946, + "step": 88520 + }, + { + "epoch": 4.408081697630701, + "grad_norm": 0.1666572242975235, + "learning_rate": 4.964987448192686e-05, + "loss": 1.3924, + "step": 88530 + }, + { + "epoch": 4.408623638796213, + "grad_norm": 0.21928876638412476, + "learning_rate": 4.9634148961869945e-05, + "loss": 1.3978, + "step": 88540 + }, + { + "epoch": 4.4091655799617255, + "grad_norm": 0.15686646103858948, + "learning_rate": 4.961842410651135e-05, + "loss": 1.3989, + "step": 88550 + }, + { + "epoch": 4.409707521127237, + "grad_norm": 0.2927147448062897, + "learning_rate": 4.960269991779902e-05, + "loss": 1.3994, + "step": 88560 + }, + { + "epoch": 4.410032685826545, + "eval_loss": 2.4300734996795654, + "eval_runtime": 22.0915, + "eval_samples_per_second": 226.331, + "eval_steps_per_second": 1.222, + "step": 88566 + }, + { + "epoch": 4.41024946229275, + "grad_norm": 0.16612303256988525, + "learning_rate": 4.958697639768078e-05, + "loss": 1.3973, + "step": 88570 + }, + { + "epoch": 4.410791403458262, + "grad_norm": 0.2341911494731903, + "learning_rate": 4.957125354810444e-05, + "loss": 1.399, + "step": 88580 + }, + { + "epoch": 4.411333344623774, + "grad_norm": 0.2245527058839798, + "learning_rate": 4.9555531371017604e-05, + "loss": 1.4014, + "step": 88590 + }, + { + "epoch": 4.411875285789287, + "grad_norm": 0.25589320063591003, + "learning_rate": 4.9539809868367906e-05, + "loss": 1.395, + "step": 88600 + }, + { + "epoch": 4.4124172269547985, + "grad_norm": 0.19590523838996887, + "learning_rate": 4.952408904210288e-05, + "loss": 1.3888, + "step": 88610 + }, + { + "epoch": 4.412959168120311, + "grad_norm": 0.1683030128479004, + "learning_rate": 4.950836889416991e-05, + "loss": 1.3929, + "step": 88620 + }, + { + "epoch": 4.413501109285823, + "grad_norm": 0.20308978855609894, + "learning_rate": 4.949264942651637e-05, + "loss": 1.4006, + "step": 88630 + }, + { + "epoch": 4.414043050451335, + "grad_norm": 0.167439803481102, + "learning_rate": 4.9476930641089506e-05, + "loss": 1.3978, + "step": 88640 + }, + { + "epoch": 4.414584991616848, + "grad_norm": 0.2057618796825409, + "learning_rate": 4.9461212539836486e-05, + "loss": 1.3981, + "step": 88650 + }, + { + "epoch": 4.414747573966501, + "eval_loss": 2.4309983253479004, + "eval_runtime": 21.9716, + "eval_samples_per_second": 227.566, + "eval_steps_per_second": 1.229, + "step": 88653 + }, + { + "epoch": 4.41512693278236, + "grad_norm": 0.17505063116550446, + "learning_rate": 4.944549512470441e-05, + "loss": 1.3983, + "step": 88660 + }, + { + "epoch": 4.415668873947872, + "grad_norm": 0.16853992640972137, + "learning_rate": 4.942977839764028e-05, + "loss": 1.3883, + "step": 88670 + }, + { + "epoch": 4.416210815113384, + "grad_norm": 0.3285401165485382, + "learning_rate": 4.941406236059104e-05, + "loss": 1.3933, + "step": 88680 + }, + { + "epoch": 4.416752756278896, + "grad_norm": 0.22175738215446472, + "learning_rate": 4.9398347015503474e-05, + "loss": 1.3954, + "step": 88690 + }, + { + "epoch": 4.417294697444409, + "grad_norm": 0.17914403975009918, + "learning_rate": 4.938263236432438e-05, + "loss": 1.3919, + "step": 88700 + }, + { + "epoch": 4.417836638609921, + "grad_norm": 0.2186654806137085, + "learning_rate": 4.936691840900041e-05, + "loss": 1.3968, + "step": 88710 + }, + { + "epoch": 4.418378579775434, + "grad_norm": 0.2118908166885376, + "learning_rate": 4.935120515147811e-05, + "loss": 1.3969, + "step": 88720 + }, + { + "epoch": 4.4189205209409455, + "grad_norm": 0.2079256772994995, + "learning_rate": 4.9335492593704e-05, + "loss": 1.3966, + "step": 88730 + }, + { + "epoch": 4.419462462106457, + "grad_norm": 0.17602074146270752, + "learning_rate": 4.931978073762448e-05, + "loss": 1.404, + "step": 88740 + }, + { + "epoch": 4.419462462106457, + "eval_loss": 2.4278478622436523, + "eval_runtime": 21.9718, + "eval_samples_per_second": 227.565, + "eval_steps_per_second": 1.229, + "step": 88740 + }, + { + "epoch": 4.42000440327197, + "grad_norm": 0.16428543627262115, + "learning_rate": 4.930406958518584e-05, + "loss": 1.415, + "step": 88750 + }, + { + "epoch": 4.420546344437482, + "grad_norm": 0.18856200575828552, + "learning_rate": 4.928835913833435e-05, + "loss": 1.4063, + "step": 88760 + }, + { + "epoch": 4.421088285602994, + "grad_norm": 0.22589999437332153, + "learning_rate": 4.9272649399016134e-05, + "loss": 1.4013, + "step": 88770 + }, + { + "epoch": 4.421630226768507, + "grad_norm": 0.17148156464099884, + "learning_rate": 4.925694036917723e-05, + "loss": 1.3959, + "step": 88780 + }, + { + "epoch": 4.4221721679340185, + "grad_norm": 0.19456231594085693, + "learning_rate": 4.924123205076362e-05, + "loss": 1.3995, + "step": 88790 + }, + { + "epoch": 4.422714109099531, + "grad_norm": 0.18404072523117065, + "learning_rate": 4.92255244457212e-05, + "loss": 1.3852, + "step": 88800 + }, + { + "epoch": 4.423256050265043, + "grad_norm": 0.16325946152210236, + "learning_rate": 4.920981755599573e-05, + "loss": 1.399, + "step": 88810 + }, + { + "epoch": 4.423797991430555, + "grad_norm": 0.16499677300453186, + "learning_rate": 4.9194111383532914e-05, + "loss": 1.3996, + "step": 88820 + }, + { + "epoch": 4.4241773502464135, + "eval_loss": 2.4304182529449463, + "eval_runtime": 21.9655, + "eval_samples_per_second": 227.63, + "eval_steps_per_second": 1.229, + "step": 88827 + }, + { + "epoch": 4.424339932596068, + "grad_norm": 0.20651055872440338, + "learning_rate": 4.917840593027838e-05, + "loss": 1.3967, + "step": 88830 + }, + { + "epoch": 4.42488187376158, + "grad_norm": 0.16554446518421173, + "learning_rate": 4.9162701198177655e-05, + "loss": 1.3991, + "step": 88840 + }, + { + "epoch": 4.425423814927092, + "grad_norm": 0.19531118869781494, + "learning_rate": 4.914699718917615e-05, + "loss": 1.3937, + "step": 88850 + }, + { + "epoch": 4.425965756092604, + "grad_norm": 0.23367935419082642, + "learning_rate": 4.913129390521922e-05, + "loss": 1.3936, + "step": 88860 + }, + { + "epoch": 4.426507697258116, + "grad_norm": 0.2578764855861664, + "learning_rate": 4.911559134825213e-05, + "loss": 1.3988, + "step": 88870 + }, + { + "epoch": 4.427049638423629, + "grad_norm": 0.2369229793548584, + "learning_rate": 4.9099889520220034e-05, + "loss": 1.3805, + "step": 88880 + }, + { + "epoch": 4.427591579589141, + "grad_norm": 0.20222489535808563, + "learning_rate": 4.9084188423068e-05, + "loss": 1.3985, + "step": 88890 + }, + { + "epoch": 4.428133520754653, + "grad_norm": 0.1944282501935959, + "learning_rate": 4.9068488058741044e-05, + "loss": 1.3978, + "step": 88900 + }, + { + "epoch": 4.428675461920165, + "grad_norm": 0.2579196095466614, + "learning_rate": 4.905278842918402e-05, + "loss": 1.4046, + "step": 88910 + }, + { + "epoch": 4.4288922383863705, + "eval_loss": 2.4295578002929688, + "eval_runtime": 21.969, + "eval_samples_per_second": 227.593, + "eval_steps_per_second": 1.229, + "step": 88914 + }, + { + "epoch": 4.429217403085677, + "grad_norm": 0.21468320488929749, + "learning_rate": 4.903708953634174e-05, + "loss": 1.3975, + "step": 88920 + }, + { + "epoch": 4.42975934425119, + "grad_norm": 0.35410991311073303, + "learning_rate": 4.902139138215893e-05, + "loss": 1.3927, + "step": 88930 + }, + { + "epoch": 4.430301285416702, + "grad_norm": 0.3901987373828888, + "learning_rate": 4.900569396858019e-05, + "loss": 1.4055, + "step": 88940 + }, + { + "epoch": 4.430843226582214, + "grad_norm": 0.20195499062538147, + "learning_rate": 4.898999729755006e-05, + "loss": 1.3868, + "step": 88950 + }, + { + "epoch": 4.4313851677477265, + "grad_norm": 0.28525593876838684, + "learning_rate": 4.8974301371012986e-05, + "loss": 1.4099, + "step": 88960 + }, + { + "epoch": 4.431927108913238, + "grad_norm": 0.17980363965034485, + "learning_rate": 4.895860619091327e-05, + "loss": 1.3877, + "step": 88970 + }, + { + "epoch": 4.432469050078751, + "grad_norm": 0.18058229982852936, + "learning_rate": 4.8942911759195196e-05, + "loss": 1.3908, + "step": 88980 + }, + { + "epoch": 4.433010991244263, + "grad_norm": 0.17202942073345184, + "learning_rate": 4.892721807780293e-05, + "loss": 1.399, + "step": 88990 + }, + { + "epoch": 4.433552932409775, + "grad_norm": 0.24401873350143433, + "learning_rate": 4.891152514868052e-05, + "loss": 1.386, + "step": 89000 + }, + { + "epoch": 4.433607126526327, + "eval_loss": 2.4285366535186768, + "eval_runtime": 21.9665, + "eval_samples_per_second": 227.619, + "eval_steps_per_second": 1.229, + "step": 89001 + }, + { + "epoch": 4.434094873575288, + "grad_norm": 0.1815049946308136, + "learning_rate": 4.889583297377194e-05, + "loss": 1.398, + "step": 89010 + }, + { + "epoch": 4.4346368147407995, + "grad_norm": 0.2082160860300064, + "learning_rate": 4.8880141555021055e-05, + "loss": 1.4029, + "step": 89020 + }, + { + "epoch": 4.435178755906312, + "grad_norm": 0.20171265304088593, + "learning_rate": 4.8864450894371683e-05, + "loss": 1.397, + "step": 89030 + }, + { + "epoch": 4.435720697071824, + "grad_norm": 0.2109455019235611, + "learning_rate": 4.8848760993767497e-05, + "loss": 1.3919, + "step": 89040 + }, + { + "epoch": 4.436262638237336, + "grad_norm": 0.2744022011756897, + "learning_rate": 4.883307185515207e-05, + "loss": 1.4098, + "step": 89050 + }, + { + "epoch": 4.436804579402849, + "grad_norm": 0.1699230968952179, + "learning_rate": 4.881738348046897e-05, + "loss": 1.3943, + "step": 89060 + }, + { + "epoch": 4.437346520568361, + "grad_norm": 0.16242730617523193, + "learning_rate": 4.880169587166151e-05, + "loss": 1.4005, + "step": 89070 + }, + { + "epoch": 4.437888461733873, + "grad_norm": 0.14713962376117706, + "learning_rate": 4.8786009030673084e-05, + "loss": 1.3952, + "step": 89080 + }, + { + "epoch": 4.438322014666283, + "eval_loss": 2.425018548965454, + "eval_runtime": 21.9685, + "eval_samples_per_second": 227.599, + "eval_steps_per_second": 1.229, + "step": 89088 + }, + { + "epoch": 4.438430402899385, + "grad_norm": 0.1697177290916443, + "learning_rate": 4.877032295944689e-05, + "loss": 1.4059, + "step": 89090 + }, + { + "epoch": 4.438972344064897, + "grad_norm": 0.18118594586849213, + "learning_rate": 4.875463765992603e-05, + "loss": 1.3885, + "step": 89100 + }, + { + "epoch": 4.43951428523041, + "grad_norm": 0.3551163375377655, + "learning_rate": 4.8738953134053535e-05, + "loss": 1.3941, + "step": 89110 + }, + { + "epoch": 4.440056226395922, + "grad_norm": 0.18483349680900574, + "learning_rate": 4.872326938377235e-05, + "loss": 1.3873, + "step": 89120 + }, + { + "epoch": 4.4405981675614346, + "grad_norm": 0.22286327183246613, + "learning_rate": 4.870758641102531e-05, + "loss": 1.3998, + "step": 89130 + }, + { + "epoch": 4.441140108726946, + "grad_norm": 0.2854524850845337, + "learning_rate": 4.8691904217755126e-05, + "loss": 1.3987, + "step": 89140 + }, + { + "epoch": 4.441682049892458, + "grad_norm": 0.37562283873558044, + "learning_rate": 4.867622280590447e-05, + "loss": 1.3821, + "step": 89150 + }, + { + "epoch": 4.442223991057971, + "grad_norm": 0.20074434578418732, + "learning_rate": 4.866054217741589e-05, + "loss": 1.3955, + "step": 89160 + }, + { + "epoch": 4.442765932223483, + "grad_norm": 0.1584572046995163, + "learning_rate": 4.86448623342318e-05, + "loss": 1.3925, + "step": 89170 + }, + { + "epoch": 4.443036902806239, + "eval_loss": 2.4282519817352295, + "eval_runtime": 21.9667, + "eval_samples_per_second": 227.618, + "eval_steps_per_second": 1.229, + "step": 89175 + }, + { + "epoch": 4.443307873388995, + "grad_norm": 0.18110856413841248, + "learning_rate": 4.8629183278294584e-05, + "loss": 1.4015, + "step": 89180 + }, + { + "epoch": 4.443849814554508, + "grad_norm": 0.1557188332080841, + "learning_rate": 4.86135050115465e-05, + "loss": 1.4022, + "step": 89190 + }, + { + "epoch": 4.444391755720019, + "grad_norm": 0.15835806727409363, + "learning_rate": 4.859782753592968e-05, + "loss": 1.3943, + "step": 89200 + }, + { + "epoch": 4.444933696885532, + "grad_norm": 0.24644139409065247, + "learning_rate": 4.858215085338617e-05, + "loss": 1.4002, + "step": 89210 + }, + { + "epoch": 4.445475638051044, + "grad_norm": 0.3007756471633911, + "learning_rate": 4.856647496585797e-05, + "loss": 1.3948, + "step": 89220 + }, + { + "epoch": 4.446017579216556, + "grad_norm": 0.15734322369098663, + "learning_rate": 4.8550799875286904e-05, + "loss": 1.4099, + "step": 89230 + }, + { + "epoch": 4.446559520382069, + "grad_norm": 0.19473432004451752, + "learning_rate": 4.853512558361475e-05, + "loss": 1.4047, + "step": 89240 + }, + { + "epoch": 4.447101461547581, + "grad_norm": 0.17770782113075256, + "learning_rate": 4.8519452092783193e-05, + "loss": 1.3881, + "step": 89250 + }, + { + "epoch": 4.447643402713093, + "grad_norm": 0.19613268971443176, + "learning_rate": 4.850377940473375e-05, + "loss": 1.4063, + "step": 89260 + }, + { + "epoch": 4.447751790946195, + "eval_loss": 2.4215874671936035, + "eval_runtime": 21.964, + "eval_samples_per_second": 227.645, + "eval_steps_per_second": 1.229, + "step": 89262 + }, + { + "epoch": 4.448185343878605, + "grad_norm": 0.23467159271240234, + "learning_rate": 4.848810752140791e-05, + "loss": 1.3918, + "step": 89270 + }, + { + "epoch": 4.448727285044117, + "grad_norm": 0.1882312148809433, + "learning_rate": 4.847243644474707e-05, + "loss": 1.3882, + "step": 89280 + }, + { + "epoch": 4.44926922620963, + "grad_norm": 0.16581347584724426, + "learning_rate": 4.8456766176692435e-05, + "loss": 1.3909, + "step": 89290 + }, + { + "epoch": 4.449811167375142, + "grad_norm": 0.2854909300804138, + "learning_rate": 4.84410967191852e-05, + "loss": 1.4066, + "step": 89300 + }, + { + "epoch": 4.4503531085406545, + "grad_norm": 0.28027182817459106, + "learning_rate": 4.842542807416644e-05, + "loss": 1.4093, + "step": 89310 + }, + { + "epoch": 4.450895049706166, + "grad_norm": 0.2496391236782074, + "learning_rate": 4.840976024357709e-05, + "loss": 1.3993, + "step": 89320 + }, + { + "epoch": 4.451436990871678, + "grad_norm": 0.17088313400745392, + "learning_rate": 4.839409322935804e-05, + "loss": 1.3956, + "step": 89330 + }, + { + "epoch": 4.451978932037191, + "grad_norm": 0.20805992186069489, + "learning_rate": 4.837842703345003e-05, + "loss": 1.3964, + "step": 89340 + }, + { + "epoch": 4.452466679086152, + "eval_loss": 2.4300498962402344, + "eval_runtime": 21.9632, + "eval_samples_per_second": 227.654, + "eval_steps_per_second": 1.229, + "step": 89349 + }, + { + "epoch": 4.452520873202703, + "grad_norm": 0.22051380574703217, + "learning_rate": 4.8362761657793756e-05, + "loss": 1.3976, + "step": 89350 + }, + { + "epoch": 4.453062814368215, + "grad_norm": 0.20481519401073456, + "learning_rate": 4.834709710432972e-05, + "loss": 1.3972, + "step": 89360 + }, + { + "epoch": 4.4536047555337275, + "grad_norm": 0.17479705810546875, + "learning_rate": 4.8331433374998426e-05, + "loss": 1.4061, + "step": 89370 + }, + { + "epoch": 4.454146696699239, + "grad_norm": 0.16444361209869385, + "learning_rate": 4.831577047174023e-05, + "loss": 1.3974, + "step": 89380 + }, + { + "epoch": 4.454688637864752, + "grad_norm": 0.18008925020694733, + "learning_rate": 4.830010839649535e-05, + "loss": 1.4009, + "step": 89390 + }, + { + "epoch": 4.455230579030264, + "grad_norm": 0.17722876369953156, + "learning_rate": 4.828444715120395e-05, + "loss": 1.397, + "step": 89400 + }, + { + "epoch": 4.455772520195776, + "grad_norm": 0.21487845480442047, + "learning_rate": 4.826878673780609e-05, + "loss": 1.398, + "step": 89410 + }, + { + "epoch": 4.456314461361289, + "grad_norm": 0.16381223499774933, + "learning_rate": 4.8253127158241693e-05, + "loss": 1.4073, + "step": 89420 + }, + { + "epoch": 4.4568564025268005, + "grad_norm": 0.20386900007724762, + "learning_rate": 4.823746841445062e-05, + "loss": 1.3958, + "step": 89430 + }, + { + "epoch": 4.457181567226108, + "eval_loss": 2.4307987689971924, + "eval_runtime": 21.974, + "eval_samples_per_second": 227.542, + "eval_steps_per_second": 1.229, + "step": 89436 + }, + { + "epoch": 4.457398343692313, + "grad_norm": 0.2596319019794464, + "learning_rate": 4.82218105083726e-05, + "loss": 1.4007, + "step": 89440 + }, + { + "epoch": 4.457940284857825, + "grad_norm": 0.1717374473810196, + "learning_rate": 4.820615344194728e-05, + "loss": 1.3932, + "step": 89450 + }, + { + "epoch": 4.458482226023337, + "grad_norm": 0.16155289113521576, + "learning_rate": 4.819049721711415e-05, + "loss": 1.3987, + "step": 89460 + }, + { + "epoch": 4.45902416718885, + "grad_norm": 0.19415195286273956, + "learning_rate": 4.8174841835812665e-05, + "loss": 1.3963, + "step": 89470 + }, + { + "epoch": 4.459566108354362, + "grad_norm": 0.20656591653823853, + "learning_rate": 4.815918729998217e-05, + "loss": 1.3838, + "step": 89480 + }, + { + "epoch": 4.460108049519874, + "grad_norm": 0.33386117219924927, + "learning_rate": 4.814353361156182e-05, + "loss": 1.3849, + "step": 89490 + }, + { + "epoch": 4.460649990685386, + "grad_norm": 0.21893806755542755, + "learning_rate": 4.8127880772490764e-05, + "loss": 1.3819, + "step": 89500 + }, + { + "epoch": 4.461191931850898, + "grad_norm": 0.22136323153972626, + "learning_rate": 4.811222878470801e-05, + "loss": 1.3927, + "step": 89510 + }, + { + "epoch": 4.461733873016411, + "grad_norm": 0.40589383244514465, + "learning_rate": 4.809657765015245e-05, + "loss": 1.4007, + "step": 89520 + }, + { + "epoch": 4.461896455366064, + "eval_loss": 2.4328644275665283, + "eval_runtime": 21.9657, + "eval_samples_per_second": 227.627, + "eval_steps_per_second": 1.229, + "step": 89523 + }, + { + "epoch": 4.462275814181923, + "grad_norm": 0.2112714648246765, + "learning_rate": 4.808092737076287e-05, + "loss": 1.3999, + "step": 89530 + }, + { + "epoch": 4.4628177553474355, + "grad_norm": 0.22988109290599823, + "learning_rate": 4.8065277948477996e-05, + "loss": 1.4017, + "step": 89540 + }, + { + "epoch": 4.463359696512947, + "grad_norm": 0.2842319905757904, + "learning_rate": 4.804962938523636e-05, + "loss": 1.3933, + "step": 89550 + }, + { + "epoch": 4.463901637678459, + "grad_norm": 0.16226890683174133, + "learning_rate": 4.803398168297645e-05, + "loss": 1.4046, + "step": 89560 + }, + { + "epoch": 4.464443578843972, + "grad_norm": 0.18163561820983887, + "learning_rate": 4.801833484363667e-05, + "loss": 1.3999, + "step": 89570 + }, + { + "epoch": 4.464985520009484, + "grad_norm": 0.1625688374042511, + "learning_rate": 4.8002688869155246e-05, + "loss": 1.3945, + "step": 89580 + }, + { + "epoch": 4.465527461174997, + "grad_norm": 0.15731951594352722, + "learning_rate": 4.798704376147034e-05, + "loss": 1.3903, + "step": 89590 + }, + { + "epoch": 4.4660694023405085, + "grad_norm": 0.2153928130865097, + "learning_rate": 4.797139952252e-05, + "loss": 1.4023, + "step": 89600 + }, + { + "epoch": 4.46661134350602, + "grad_norm": 0.2910442650318146, + "learning_rate": 4.795575615424219e-05, + "loss": 1.3847, + "step": 89610 + }, + { + "epoch": 4.46661134350602, + "eval_loss": 2.4339542388916016, + "eval_runtime": 21.954, + "eval_samples_per_second": 227.749, + "eval_steps_per_second": 1.23, + "step": 89610 + }, + { + "epoch": 4.467153284671533, + "grad_norm": 0.17365294694900513, + "learning_rate": 4.794011365857472e-05, + "loss": 1.3971, + "step": 89620 + }, + { + "epoch": 4.467695225837045, + "grad_norm": 0.3318266272544861, + "learning_rate": 4.7924472037455304e-05, + "loss": 1.3983, + "step": 89630 + }, + { + "epoch": 4.468237167002557, + "grad_norm": 0.26000094413757324, + "learning_rate": 4.790883129282161e-05, + "loss": 1.3969, + "step": 89640 + }, + { + "epoch": 4.46877910816807, + "grad_norm": 0.1824985295534134, + "learning_rate": 4.789319142661107e-05, + "loss": 1.3838, + "step": 89650 + }, + { + "epoch": 4.4693210493335815, + "grad_norm": 0.1758066862821579, + "learning_rate": 4.787755244076112e-05, + "loss": 1.3934, + "step": 89660 + }, + { + "epoch": 4.469862990499094, + "grad_norm": 0.17066720128059387, + "learning_rate": 4.786191433720909e-05, + "loss": 1.4048, + "step": 89670 + }, + { + "epoch": 4.470404931664606, + "grad_norm": 0.1667661815881729, + "learning_rate": 4.784627711789209e-05, + "loss": 1.3922, + "step": 89680 + }, + { + "epoch": 4.470946872830118, + "grad_norm": 0.1883508414030075, + "learning_rate": 4.7830640784747225e-05, + "loss": 1.3911, + "step": 89690 + }, + { + "epoch": 4.4713262316459765, + "eval_loss": 2.428875207901001, + "eval_runtime": 21.9652, + "eval_samples_per_second": 227.633, + "eval_steps_per_second": 1.229, + "step": 89697 + }, + { + "epoch": 4.471488813995631, + "grad_norm": 0.21949131786823273, + "learning_rate": 4.781500533971147e-05, + "loss": 1.3919, + "step": 89700 + }, + { + "epoch": 4.472030755161143, + "grad_norm": 0.19700855016708374, + "learning_rate": 4.779937078472164e-05, + "loss": 1.4053, + "step": 89710 + }, + { + "epoch": 4.472572696326655, + "grad_norm": 0.17879433929920197, + "learning_rate": 4.7783737121714504e-05, + "loss": 1.3874, + "step": 89720 + }, + { + "epoch": 4.473114637492167, + "grad_norm": 0.16354680061340332, + "learning_rate": 4.77681043526267e-05, + "loss": 1.4007, + "step": 89730 + }, + { + "epoch": 4.473656578657679, + "grad_norm": 0.23582887649536133, + "learning_rate": 4.7752472479394706e-05, + "loss": 1.4049, + "step": 89740 + }, + { + "epoch": 4.474198519823192, + "grad_norm": 0.1751241385936737, + "learning_rate": 4.7736841503954956e-05, + "loss": 1.3926, + "step": 89750 + }, + { + "epoch": 4.474740460988704, + "grad_norm": 0.2157001942396164, + "learning_rate": 4.772121142824374e-05, + "loss": 1.3967, + "step": 89760 + }, + { + "epoch": 4.475282402154216, + "grad_norm": 0.2224288284778595, + "learning_rate": 4.770558225419728e-05, + "loss": 1.4, + "step": 89770 + }, + { + "epoch": 4.4758243433197284, + "grad_norm": 0.1734488606452942, + "learning_rate": 4.7689953983751604e-05, + "loss": 1.3875, + "step": 89780 + }, + { + "epoch": 4.4760411197859336, + "eval_loss": 2.4124958515167236, + "eval_runtime": 21.9708, + "eval_samples_per_second": 227.574, + "eval_steps_per_second": 1.229, + "step": 89784 + }, + { + "epoch": 4.47636628448524, + "grad_norm": 0.17843768000602722, + "learning_rate": 4.767432661884269e-05, + "loss": 1.3945, + "step": 89790 + }, + { + "epoch": 4.476908225650753, + "grad_norm": 0.165896475315094, + "learning_rate": 4.76587001614064e-05, + "loss": 1.4107, + "step": 89800 + }, + { + "epoch": 4.477450166816265, + "grad_norm": 0.19189293682575226, + "learning_rate": 4.7643074613378455e-05, + "loss": 1.3998, + "step": 89810 + }, + { + "epoch": 4.477992107981777, + "grad_norm": 0.26775965094566345, + "learning_rate": 4.7627449976694486e-05, + "loss": 1.3911, + "step": 89820 + }, + { + "epoch": 4.47853404914729, + "grad_norm": 0.2671341896057129, + "learning_rate": 4.761182625329001e-05, + "loss": 1.4019, + "step": 89830 + }, + { + "epoch": 4.4790759903128015, + "grad_norm": 0.16249962151050568, + "learning_rate": 4.7596203445100416e-05, + "loss": 1.3971, + "step": 89840 + }, + { + "epoch": 4.479617931478314, + "grad_norm": 0.273316353559494, + "learning_rate": 4.7580581554060986e-05, + "loss": 1.39, + "step": 89850 + }, + { + "epoch": 4.480159872643826, + "grad_norm": 0.19733589887619019, + "learning_rate": 4.7564960582106933e-05, + "loss": 1.4028, + "step": 89860 + }, + { + "epoch": 4.480701813809338, + "grad_norm": 0.17823106050491333, + "learning_rate": 4.754934053117325e-05, + "loss": 1.4006, + "step": 89870 + }, + { + "epoch": 4.48075600792589, + "eval_loss": 2.4114816188812256, + "eval_runtime": 21.9671, + "eval_samples_per_second": 227.613, + "eval_steps_per_second": 1.229, + "step": 89871 + }, + { + "epoch": 4.481243754974851, + "grad_norm": 0.18662744760513306, + "learning_rate": 4.753372140319492e-05, + "loss": 1.3969, + "step": 89880 + }, + { + "epoch": 4.481785696140363, + "grad_norm": 0.23681077361106873, + "learning_rate": 4.751810320010678e-05, + "loss": 1.4017, + "step": 89890 + }, + { + "epoch": 4.482327637305875, + "grad_norm": 0.3316483795642853, + "learning_rate": 4.750248592384352e-05, + "loss": 1.3945, + "step": 89900 + }, + { + "epoch": 4.482869578471387, + "grad_norm": 0.261080801486969, + "learning_rate": 4.748686957633975e-05, + "loss": 1.3913, + "step": 89910 + }, + { + "epoch": 4.483411519636899, + "grad_norm": 0.1946885585784912, + "learning_rate": 4.747125415952998e-05, + "loss": 1.3939, + "step": 89920 + }, + { + "epoch": 4.483953460802412, + "grad_norm": 0.2138025015592575, + "learning_rate": 4.745563967534855e-05, + "loss": 1.3982, + "step": 89930 + }, + { + "epoch": 4.484495401967924, + "grad_norm": 0.2956826984882355, + "learning_rate": 4.744002612572972e-05, + "loss": 1.3999, + "step": 89940 + }, + { + "epoch": 4.4850373431334365, + "grad_norm": 0.30752211809158325, + "learning_rate": 4.742441351260761e-05, + "loss": 1.3892, + "step": 89950 + }, + { + "epoch": 4.485470896065846, + "eval_loss": 2.4178147315979004, + "eval_runtime": 21.9672, + "eval_samples_per_second": 227.613, + "eval_steps_per_second": 1.229, + "step": 89958 + }, + { + "epoch": 4.485579284298948, + "grad_norm": 0.2102116495370865, + "learning_rate": 4.7408801837916306e-05, + "loss": 1.4062, + "step": 89960 + }, + { + "epoch": 4.48612122546446, + "grad_norm": 0.3934997320175171, + "learning_rate": 4.7393191103589643e-05, + "loss": 1.3999, + "step": 89970 + }, + { + "epoch": 4.486663166629973, + "grad_norm": 0.2734357416629791, + "learning_rate": 4.737758131156144e-05, + "loss": 1.3997, + "step": 89980 + }, + { + "epoch": 4.487205107795485, + "grad_norm": 0.1576622873544693, + "learning_rate": 4.736197246376538e-05, + "loss": 1.3969, + "step": 89990 + }, + { + "epoch": 4.487747048960998, + "grad_norm": 0.18201720714569092, + "learning_rate": 4.7346364562135e-05, + "loss": 1.3923, + "step": 90000 + } + ], + "logging_steps": 10, + "max_steps": 92260, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 3000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.8386602891165565e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}