diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,22181 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9997401507125867, + "eval_steps": 100, + "global_step": 3126, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00031981450758560037, + "grad_norm": 0.5762398838996887, + "learning_rate": 0.0, + "loss": 11.9102, + "step": 1 + }, + { + "epoch": 0.0006396290151712007, + "grad_norm": 0.5624870657920837, + "learning_rate": 6.382978723404255e-06, + "loss": 11.9077, + "step": 2 + }, + { + "epoch": 0.0009594435227568011, + "grad_norm": 0.6053410768508911, + "learning_rate": 1.276595744680851e-05, + "loss": 11.9013, + "step": 3 + }, + { + "epoch": 0.0012792580303424015, + "grad_norm": 0.598536491394043, + "learning_rate": 1.9148936170212762e-05, + "loss": 11.8807, + "step": 4 + }, + { + "epoch": 0.0015990725379280018, + "grad_norm": 0.5812181234359741, + "learning_rate": 2.553191489361702e-05, + "loss": 11.848, + "step": 5 + }, + { + "epoch": 0.0019188870455136022, + "grad_norm": 0.5945044755935669, + "learning_rate": 3.1914893617021275e-05, + "loss": 11.8085, + "step": 6 + }, + { + "epoch": 0.0022387015530992023, + "grad_norm": 0.6150892376899719, + "learning_rate": 3.8297872340425525e-05, + "loss": 11.765, + "step": 7 + }, + { + "epoch": 0.002558516060684803, + "grad_norm": 0.6108999252319336, + "learning_rate": 4.468085106382978e-05, + "loss": 11.7116, + "step": 8 + }, + { + "epoch": 0.002878330568270403, + "grad_norm": 0.6705823540687561, + "learning_rate": 5.106382978723404e-05, + "loss": 11.644, + "step": 9 + }, + { + "epoch": 0.0031981450758560037, + "grad_norm": 0.7061710357666016, + "learning_rate": 5.7446808510638294e-05, + "loss": 11.5856, + "step": 10 + }, + { + "epoch": 0.003517959583441604, + "grad_norm": 0.7612898349761963, + "learning_rate": 6.382978723404255e-05, + "loss": 11.5082, + "step": 11 + }, + { + "epoch": 0.0038377740910272044, + "grad_norm": 0.9055056571960449, + "learning_rate": 7.02127659574468e-05, + "loss": 11.3855, + "step": 12 + }, + { + "epoch": 0.0041575885986128045, + "grad_norm": 0.9641930460929871, + "learning_rate": 7.659574468085105e-05, + "loss": 11.3033, + "step": 13 + }, + { + "epoch": 0.004477403106198405, + "grad_norm": 1.0467524528503418, + "learning_rate": 8.297872340425531e-05, + "loss": 11.2125, + "step": 14 + }, + { + "epoch": 0.004797217613784005, + "grad_norm": 1.1716750860214233, + "learning_rate": 8.936170212765956e-05, + "loss": 11.0821, + "step": 15 + }, + { + "epoch": 0.005117032121369606, + "grad_norm": 1.3222126960754395, + "learning_rate": 9.574468085106382e-05, + "loss": 10.9287, + "step": 16 + }, + { + "epoch": 0.005436846628955206, + "grad_norm": 1.4363603591918945, + "learning_rate": 0.00010212765957446807, + "loss": 10.8168, + "step": 17 + }, + { + "epoch": 0.005756661136540806, + "grad_norm": 1.5512793064117432, + "learning_rate": 0.00010851063829787234, + "loss": 10.6823, + "step": 18 + }, + { + "epoch": 0.006076475644126406, + "grad_norm": 1.624642014503479, + "learning_rate": 0.00011489361702127659, + "loss": 10.552, + "step": 19 + }, + { + "epoch": 0.006396290151712007, + "grad_norm": 1.6283295154571533, + "learning_rate": 0.00012127659574468084, + "loss": 10.4734, + "step": 20 + }, + { + "epoch": 0.0067161046592976075, + "grad_norm": 1.698269248008728, + "learning_rate": 0.0001276595744680851, + "loss": 10.3521, + "step": 21 + }, + { + "epoch": 0.007035919166883208, + "grad_norm": 1.7003437280654907, + "learning_rate": 0.00013404255319148935, + "loss": 10.2487, + "step": 22 + }, + { + "epoch": 0.007355733674468808, + "grad_norm": 1.6472309827804565, + "learning_rate": 0.0001404255319148936, + "loss": 10.1751, + "step": 23 + }, + { + "epoch": 0.007675548182054409, + "grad_norm": 1.6952213048934937, + "learning_rate": 0.00014680851063829785, + "loss": 10.0406, + "step": 24 + }, + { + "epoch": 0.007995362689640009, + "grad_norm": 1.6617240905761719, + "learning_rate": 0.0001531914893617021, + "loss": 9.9372, + "step": 25 + }, + { + "epoch": 0.008315177197225609, + "grad_norm": 1.6541690826416016, + "learning_rate": 0.00015957446808510637, + "loss": 9.8292, + "step": 26 + }, + { + "epoch": 0.00863499170481121, + "grad_norm": 1.5970962047576904, + "learning_rate": 0.00016595744680851062, + "loss": 9.7187, + "step": 27 + }, + { + "epoch": 0.00895480621239681, + "grad_norm": 1.5352132320404053, + "learning_rate": 0.0001723404255319149, + "loss": 9.6444, + "step": 28 + }, + { + "epoch": 0.00927462071998241, + "grad_norm": 1.5574132204055786, + "learning_rate": 0.00017872340425531912, + "loss": 9.5396, + "step": 29 + }, + { + "epoch": 0.00959443522756801, + "grad_norm": 1.7909637689590454, + "learning_rate": 0.0001851063829787234, + "loss": 9.4365, + "step": 30 + }, + { + "epoch": 0.009914249735153612, + "grad_norm": 2.3043689727783203, + "learning_rate": 0.00019148936170212765, + "loss": 9.3488, + "step": 31 + }, + { + "epoch": 0.010234064242739212, + "grad_norm": 1.7960251569747925, + "learning_rate": 0.00019787234042553187, + "loss": 9.215, + "step": 32 + }, + { + "epoch": 0.010553878750324812, + "grad_norm": 1.5585463047027588, + "learning_rate": 0.00020425531914893615, + "loss": 9.0985, + "step": 33 + }, + { + "epoch": 0.010873693257910412, + "grad_norm": 1.8078670501708984, + "learning_rate": 0.0002106382978723404, + "loss": 9.007, + "step": 34 + }, + { + "epoch": 0.011193507765496012, + "grad_norm": 1.4333995580673218, + "learning_rate": 0.00021702127659574468, + "loss": 8.8843, + "step": 35 + }, + { + "epoch": 0.011513322273081612, + "grad_norm": 1.474975347518921, + "learning_rate": 0.0002234042553191489, + "loss": 8.8019, + "step": 36 + }, + { + "epoch": 0.011833136780667212, + "grad_norm": 1.3332194089889526, + "learning_rate": 0.00022978723404255317, + "loss": 8.727, + "step": 37 + }, + { + "epoch": 0.012152951288252813, + "grad_norm": 1.425425410270691, + "learning_rate": 0.00023617021276595742, + "loss": 8.5894, + "step": 38 + }, + { + "epoch": 0.012472765795838415, + "grad_norm": 1.230981707572937, + "learning_rate": 0.00024255319148936167, + "loss": 8.5641, + "step": 39 + }, + { + "epoch": 0.012792580303424015, + "grad_norm": 1.2399401664733887, + "learning_rate": 0.0002489361702127659, + "loss": 8.3774, + "step": 40 + }, + { + "epoch": 0.013112394811009615, + "grad_norm": 1.1850652694702148, + "learning_rate": 0.0002553191489361702, + "loss": 8.3388, + "step": 41 + }, + { + "epoch": 0.013432209318595215, + "grad_norm": 1.085748314857483, + "learning_rate": 0.0002617021276595745, + "loss": 8.2185, + "step": 42 + }, + { + "epoch": 0.013752023826180815, + "grad_norm": 1.026831865310669, + "learning_rate": 0.0002680851063829787, + "loss": 8.1171, + "step": 43 + }, + { + "epoch": 0.014071838333766415, + "grad_norm": 1.0322154760360718, + "learning_rate": 0.000274468085106383, + "loss": 8.0373, + "step": 44 + }, + { + "epoch": 0.014391652841352015, + "grad_norm": 0.8840400576591492, + "learning_rate": 0.0002808510638297872, + "loss": 7.9007, + "step": 45 + }, + { + "epoch": 0.014711467348937616, + "grad_norm": 0.7530202269554138, + "learning_rate": 0.0002872340425531915, + "loss": 7.8504, + "step": 46 + }, + { + "epoch": 0.015031281856523216, + "grad_norm": 0.8164878487586975, + "learning_rate": 0.0002936170212765957, + "loss": 7.7963, + "step": 47 + }, + { + "epoch": 0.015351096364108818, + "grad_norm": 0.6531561613082886, + "learning_rate": 0.0003, + "loss": 7.8051, + "step": 48 + }, + { + "epoch": 0.015670910871694418, + "grad_norm": 0.5979849100112915, + "learning_rate": 0.0003063829787234042, + "loss": 7.6738, + "step": 49 + }, + { + "epoch": 0.015990725379280018, + "grad_norm": 0.7686684727668762, + "learning_rate": 0.0003127659574468085, + "loss": 7.6786, + "step": 50 + }, + { + "epoch": 0.016310539886865618, + "grad_norm": 0.7229753136634827, + "learning_rate": 0.00031914893617021275, + "loss": 7.6438, + "step": 51 + }, + { + "epoch": 0.016630354394451218, + "grad_norm": 0.40795406699180603, + "learning_rate": 0.00032553191489361697, + "loss": 7.6013, + "step": 52 + }, + { + "epoch": 0.01695016890203682, + "grad_norm": 0.7603433728218079, + "learning_rate": 0.00033191489361702125, + "loss": 7.5757, + "step": 53 + }, + { + "epoch": 0.01726998340962242, + "grad_norm": 0.5100083351135254, + "learning_rate": 0.00033829787234042547, + "loss": 7.5115, + "step": 54 + }, + { + "epoch": 0.01758979791720802, + "grad_norm": 0.42650240659713745, + "learning_rate": 0.0003446808510638298, + "loss": 7.4833, + "step": 55 + }, + { + "epoch": 0.01790961242479362, + "grad_norm": 0.4726926386356354, + "learning_rate": 0.000351063829787234, + "loss": 7.499, + "step": 56 + }, + { + "epoch": 0.01822942693237922, + "grad_norm": 0.6424689292907715, + "learning_rate": 0.00035744680851063825, + "loss": 7.3896, + "step": 57 + }, + { + "epoch": 0.01854924143996482, + "grad_norm": 0.969096839427948, + "learning_rate": 0.0003638297872340425, + "loss": 7.4446, + "step": 58 + }, + { + "epoch": 0.01886905594755042, + "grad_norm": 1.238357424736023, + "learning_rate": 0.0003702127659574468, + "loss": 7.4339, + "step": 59 + }, + { + "epoch": 0.01918887045513602, + "grad_norm": 0.5127294659614563, + "learning_rate": 0.000376595744680851, + "loss": 7.3979, + "step": 60 + }, + { + "epoch": 0.019508684962721623, + "grad_norm": 0.5544953942298889, + "learning_rate": 0.0003829787234042553, + "loss": 7.3483, + "step": 61 + }, + { + "epoch": 0.019828499470307223, + "grad_norm": 0.7398856282234192, + "learning_rate": 0.0003893617021276595, + "loss": 7.2954, + "step": 62 + }, + { + "epoch": 0.020148313977892823, + "grad_norm": 0.39174893498420715, + "learning_rate": 0.00039574468085106374, + "loss": 7.3479, + "step": 63 + }, + { + "epoch": 0.020468128485478423, + "grad_norm": 0.41279804706573486, + "learning_rate": 0.0004021276595744681, + "loss": 7.4516, + "step": 64 + }, + { + "epoch": 0.020787942993064024, + "grad_norm": 1.256068229675293, + "learning_rate": 0.0004085106382978723, + "loss": 7.3337, + "step": 65 + }, + { + "epoch": 0.021107757500649624, + "grad_norm": 0.7139375805854797, + "learning_rate": 0.0004148936170212766, + "loss": 7.2613, + "step": 66 + }, + { + "epoch": 0.021427572008235224, + "grad_norm": 0.6181330680847168, + "learning_rate": 0.0004212765957446808, + "loss": 7.3833, + "step": 67 + }, + { + "epoch": 0.021747386515820824, + "grad_norm": 0.6431315541267395, + "learning_rate": 0.0004276595744680851, + "loss": 7.3363, + "step": 68 + }, + { + "epoch": 0.022067201023406424, + "grad_norm": 0.5013834834098816, + "learning_rate": 0.00043404255319148935, + "loss": 7.2818, + "step": 69 + }, + { + "epoch": 0.022387015530992024, + "grad_norm": 0.331126868724823, + "learning_rate": 0.00044042553191489357, + "loss": 7.3045, + "step": 70 + }, + { + "epoch": 0.022706830038577624, + "grad_norm": 0.46566659212112427, + "learning_rate": 0.0004468085106382978, + "loss": 7.2218, + "step": 71 + }, + { + "epoch": 0.023026644546163225, + "grad_norm": 0.3669161796569824, + "learning_rate": 0.0004531914893617021, + "loss": 7.1607, + "step": 72 + }, + { + "epoch": 0.023346459053748825, + "grad_norm": 0.35284051299095154, + "learning_rate": 0.00045957446808510635, + "loss": 7.1419, + "step": 73 + }, + { + "epoch": 0.023666273561334425, + "grad_norm": 0.3425213396549225, + "learning_rate": 0.00046595744680851057, + "loss": 7.1929, + "step": 74 + }, + { + "epoch": 0.023986088068920025, + "grad_norm": 0.40044257044792175, + "learning_rate": 0.00047234042553191485, + "loss": 7.163, + "step": 75 + }, + { + "epoch": 0.024305902576505625, + "grad_norm": 0.35946935415267944, + "learning_rate": 0.0004787234042553191, + "loss": 7.0943, + "step": 76 + }, + { + "epoch": 0.024625717084091225, + "grad_norm": 0.3463836908340454, + "learning_rate": 0.00048510638297872335, + "loss": 7.0932, + "step": 77 + }, + { + "epoch": 0.02494553159167683, + "grad_norm": 0.37853115797042847, + "learning_rate": 0.0004914893617021277, + "loss": 7.1608, + "step": 78 + }, + { + "epoch": 0.02526534609926243, + "grad_norm": 0.9995279908180237, + "learning_rate": 0.0004978723404255318, + "loss": 7.0951, + "step": 79 + }, + { + "epoch": 0.02558516060684803, + "grad_norm": 0.4370875358581543, + "learning_rate": 0.0005042553191489361, + "loss": 7.0864, + "step": 80 + }, + { + "epoch": 0.02590497511443363, + "grad_norm": 0.39335861802101135, + "learning_rate": 0.0005106382978723404, + "loss": 7.1225, + "step": 81 + }, + { + "epoch": 0.02622478962201923, + "grad_norm": 0.5575166940689087, + "learning_rate": 0.0005170212765957446, + "loss": 7.0128, + "step": 82 + }, + { + "epoch": 0.02654460412960483, + "grad_norm": 0.382291704416275, + "learning_rate": 0.000523404255319149, + "loss": 7.0568, + "step": 83 + }, + { + "epoch": 0.02686441863719043, + "grad_norm": 0.37663501501083374, + "learning_rate": 0.0005297872340425531, + "loss": 7.0283, + "step": 84 + }, + { + "epoch": 0.02718423314477603, + "grad_norm": 0.324942022562027, + "learning_rate": 0.0005361702127659574, + "loss": 6.9855, + "step": 85 + }, + { + "epoch": 0.02750404765236163, + "grad_norm": 0.3324145972728729, + "learning_rate": 0.0005425531914893617, + "loss": 6.9152, + "step": 86 + }, + { + "epoch": 0.02782386215994723, + "grad_norm": 0.29377809166908264, + "learning_rate": 0.000548936170212766, + "loss": 6.9509, + "step": 87 + }, + { + "epoch": 0.02814367666753283, + "grad_norm": 0.30234116315841675, + "learning_rate": 0.0005553191489361701, + "loss": 6.9518, + "step": 88 + }, + { + "epoch": 0.02846349117511843, + "grad_norm": 0.42053472995758057, + "learning_rate": 0.0005617021276595744, + "loss": 6.8915, + "step": 89 + }, + { + "epoch": 0.02878330568270403, + "grad_norm": 0.3598235249519348, + "learning_rate": 0.0005680851063829787, + "loss": 6.9226, + "step": 90 + }, + { + "epoch": 0.02910312019028963, + "grad_norm": 0.35407304763793945, + "learning_rate": 0.000574468085106383, + "loss": 6.901, + "step": 91 + }, + { + "epoch": 0.02942293469787523, + "grad_norm": 0.29050391912460327, + "learning_rate": 0.0005808510638297872, + "loss": 6.816, + "step": 92 + }, + { + "epoch": 0.02974274920546083, + "grad_norm": 0.32192665338516235, + "learning_rate": 0.0005872340425531914, + "loss": 6.796, + "step": 93 + }, + { + "epoch": 0.03006256371304643, + "grad_norm": 0.3513169288635254, + "learning_rate": 0.0005936170212765957, + "loss": 6.8849, + "step": 94 + }, + { + "epoch": 0.030382378220632035, + "grad_norm": 0.4323410987854004, + "learning_rate": 0.0006, + "loss": 6.8101, + "step": 95 + }, + { + "epoch": 0.030702192728217635, + "grad_norm": 0.2884387671947479, + "learning_rate": 0.0005999998389604413, + "loss": 6.8199, + "step": 96 + }, + { + "epoch": 0.031022007235803235, + "grad_norm": 0.3581418991088867, + "learning_rate": 0.0005999993558419382, + "loss": 6.8157, + "step": 97 + }, + { + "epoch": 0.031341821743388835, + "grad_norm": 0.3392150402069092, + "learning_rate": 0.0005999985506450094, + "loss": 6.8469, + "step": 98 + }, + { + "epoch": 0.031661636250974436, + "grad_norm": 0.3693081736564636, + "learning_rate": 0.0005999974233705192, + "loss": 6.8005, + "step": 99 + }, + { + "epoch": 0.031981450758560036, + "grad_norm": 0.3570478856563568, + "learning_rate": 0.000599995974019678, + "loss": 6.8532, + "step": 100 + }, + { + "epoch": 0.031981450758560036, + "eval_loss": 6.748331546783447, + "eval_runtime": 76.957, + "eval_samples_per_second": 24.65, + "eval_steps_per_second": 6.172, + "step": 100 + }, + { + "epoch": 0.032301265266145636, + "grad_norm": 0.41135793924331665, + "learning_rate": 0.0005999942025940418, + "loss": 6.7633, + "step": 101 + }, + { + "epoch": 0.032621079773731236, + "grad_norm": 0.3616463840007782, + "learning_rate": 0.0005999921090955123, + "loss": 6.8156, + "step": 102 + }, + { + "epoch": 0.032940894281316836, + "grad_norm": 0.3172069787979126, + "learning_rate": 0.0005999896935263372, + "loss": 6.7868, + "step": 103 + }, + { + "epoch": 0.033260708788902436, + "grad_norm": 0.5328623652458191, + "learning_rate": 0.0005999869558891097, + "loss": 6.8434, + "step": 104 + }, + { + "epoch": 0.033580523296488037, + "grad_norm": 0.37089961767196655, + "learning_rate": 0.000599983896186769, + "loss": 6.7699, + "step": 105 + }, + { + "epoch": 0.03390033780407364, + "grad_norm": 0.4071977138519287, + "learning_rate": 0.0005999805144226, + "loss": 6.7143, + "step": 106 + }, + { + "epoch": 0.03422015231165924, + "grad_norm": 0.33601731061935425, + "learning_rate": 0.0005999768106002334, + "loss": 6.7098, + "step": 107 + }, + { + "epoch": 0.03453996681924484, + "grad_norm": 0.34338271617889404, + "learning_rate": 0.0005999727847236454, + "loss": 6.7295, + "step": 108 + }, + { + "epoch": 0.03485978132683044, + "grad_norm": 0.3717418611049652, + "learning_rate": 0.0005999684367971584, + "loss": 6.6593, + "step": 109 + }, + { + "epoch": 0.03517959583441604, + "grad_norm": 0.41923823952674866, + "learning_rate": 0.0005999637668254403, + "loss": 6.6612, + "step": 110 + }, + { + "epoch": 0.03549941034200164, + "grad_norm": 0.3013748824596405, + "learning_rate": 0.0005999587748135047, + "loss": 6.5929, + "step": 111 + }, + { + "epoch": 0.03581922484958724, + "grad_norm": 0.4821736216545105, + "learning_rate": 0.000599953460766711, + "loss": 6.6543, + "step": 112 + }, + { + "epoch": 0.03613903935717284, + "grad_norm": 0.32597431540489197, + "learning_rate": 0.0005999478246907643, + "loss": 6.7071, + "step": 113 + }, + { + "epoch": 0.03645885386475844, + "grad_norm": 0.33268311619758606, + "learning_rate": 0.0005999418665917157, + "loss": 6.6091, + "step": 114 + }, + { + "epoch": 0.03677866837234404, + "grad_norm": 0.30232611298561096, + "learning_rate": 0.0005999355864759614, + "loss": 6.6904, + "step": 115 + }, + { + "epoch": 0.03709848287992964, + "grad_norm": 0.3010026812553406, + "learning_rate": 0.0005999289843502441, + "loss": 6.6532, + "step": 116 + }, + { + "epoch": 0.03741829738751524, + "grad_norm": 0.34711113572120667, + "learning_rate": 0.0005999220602216517, + "loss": 6.5868, + "step": 117 + }, + { + "epoch": 0.03773811189510084, + "grad_norm": 0.4245126247406006, + "learning_rate": 0.0005999148140976179, + "loss": 6.6374, + "step": 118 + }, + { + "epoch": 0.03805792640268644, + "grad_norm": 0.43401578068733215, + "learning_rate": 0.0005999072459859221, + "loss": 6.5992, + "step": 119 + }, + { + "epoch": 0.03837774091027204, + "grad_norm": 0.4109058976173401, + "learning_rate": 0.0005998993558946892, + "loss": 6.5269, + "step": 120 + }, + { + "epoch": 0.038697555417857646, + "grad_norm": 0.3726668059825897, + "learning_rate": 0.0005998911438323904, + "loss": 6.6078, + "step": 121 + }, + { + "epoch": 0.039017369925443246, + "grad_norm": 0.3629209101200104, + "learning_rate": 0.000599882609807842, + "loss": 6.5175, + "step": 122 + }, + { + "epoch": 0.039337184433028846, + "grad_norm": 0.4317244291305542, + "learning_rate": 0.000599873753830206, + "loss": 6.5939, + "step": 123 + }, + { + "epoch": 0.039656998940614446, + "grad_norm": 0.2928583025932312, + "learning_rate": 0.0005998645759089901, + "loss": 6.5584, + "step": 124 + }, + { + "epoch": 0.039976813448200046, + "grad_norm": 0.3732014000415802, + "learning_rate": 0.0005998550760540478, + "loss": 6.5541, + "step": 125 + }, + { + "epoch": 0.04029662795578565, + "grad_norm": 0.27851733565330505, + "learning_rate": 0.000599845254275578, + "loss": 6.4988, + "step": 126 + }, + { + "epoch": 0.04061644246337125, + "grad_norm": 0.3921887278556824, + "learning_rate": 0.0005998351105841257, + "loss": 6.6184, + "step": 127 + }, + { + "epoch": 0.04093625697095685, + "grad_norm": 0.3566243052482605, + "learning_rate": 0.0005998246449905807, + "loss": 6.503, + "step": 128 + }, + { + "epoch": 0.04125607147854245, + "grad_norm": 0.39187195897102356, + "learning_rate": 0.0005998138575061791, + "loss": 6.4724, + "step": 129 + }, + { + "epoch": 0.04157588598612805, + "grad_norm": 0.3276127874851227, + "learning_rate": 0.000599802748142502, + "loss": 6.5288, + "step": 130 + }, + { + "epoch": 0.04189570049371365, + "grad_norm": 0.39741045236587524, + "learning_rate": 0.0005997913169114768, + "loss": 6.4702, + "step": 131 + }, + { + "epoch": 0.04221551500129925, + "grad_norm": 0.4539431035518646, + "learning_rate": 0.0005997795638253759, + "loss": 6.4902, + "step": 132 + }, + { + "epoch": 0.04253532950888485, + "grad_norm": 0.41237616539001465, + "learning_rate": 0.0005997674888968171, + "loss": 6.5647, + "step": 133 + }, + { + "epoch": 0.04285514401647045, + "grad_norm": 0.3613832890987396, + "learning_rate": 0.0005997550921387643, + "loss": 6.4609, + "step": 134 + }, + { + "epoch": 0.04317495852405605, + "grad_norm": 0.33870750665664673, + "learning_rate": 0.0005997423735645265, + "loss": 6.4513, + "step": 135 + }, + { + "epoch": 0.04349477303164165, + "grad_norm": 0.3336218297481537, + "learning_rate": 0.0005997293331877584, + "loss": 6.4568, + "step": 136 + }, + { + "epoch": 0.04381458753922725, + "grad_norm": 0.3454459309577942, + "learning_rate": 0.0005997159710224602, + "loss": 6.4306, + "step": 137 + }, + { + "epoch": 0.04413440204681285, + "grad_norm": 0.35895201563835144, + "learning_rate": 0.0005997022870829771, + "loss": 6.411, + "step": 138 + }, + { + "epoch": 0.04445421655439845, + "grad_norm": 0.335130900144577, + "learning_rate": 0.0005996882813840005, + "loss": 6.3966, + "step": 139 + }, + { + "epoch": 0.04477403106198405, + "grad_norm": 0.29891467094421387, + "learning_rate": 0.0005996739539405668, + "loss": 6.4543, + "step": 140 + }, + { + "epoch": 0.04509384556956965, + "grad_norm": 0.3919838070869446, + "learning_rate": 0.0005996593047680579, + "loss": 6.3899, + "step": 141 + }, + { + "epoch": 0.04541366007715525, + "grad_norm": 0.43485793471336365, + "learning_rate": 0.0005996443338822011, + "loss": 6.473, + "step": 142 + }, + { + "epoch": 0.04573347458474085, + "grad_norm": 0.5233163833618164, + "learning_rate": 0.000599629041299069, + "loss": 6.4152, + "step": 143 + }, + { + "epoch": 0.04605328909232645, + "grad_norm": 0.38263949751853943, + "learning_rate": 0.0005996134270350797, + "loss": 6.4465, + "step": 144 + }, + { + "epoch": 0.04637310359991205, + "grad_norm": 0.38162732124328613, + "learning_rate": 0.0005995974911069968, + "loss": 6.3812, + "step": 145 + }, + { + "epoch": 0.04669291810749765, + "grad_norm": 0.4471980333328247, + "learning_rate": 0.0005995812335319289, + "loss": 6.4205, + "step": 146 + }, + { + "epoch": 0.04701273261508325, + "grad_norm": 0.3626723885536194, + "learning_rate": 0.0005995646543273301, + "loss": 6.3803, + "step": 147 + }, + { + "epoch": 0.04733254712266885, + "grad_norm": 0.37030258774757385, + "learning_rate": 0.0005995477535109998, + "loss": 6.4335, + "step": 148 + }, + { + "epoch": 0.04765236163025445, + "grad_norm": 0.30875375866889954, + "learning_rate": 0.0005995305311010826, + "loss": 6.386, + "step": 149 + }, + { + "epoch": 0.04797217613784005, + "grad_norm": 0.41564837098121643, + "learning_rate": 0.0005995129871160688, + "loss": 6.3846, + "step": 150 + }, + { + "epoch": 0.04829199064542565, + "grad_norm": 0.3652048110961914, + "learning_rate": 0.000599495121574793, + "loss": 6.4395, + "step": 151 + }, + { + "epoch": 0.04861180515301125, + "grad_norm": 0.33011215925216675, + "learning_rate": 0.0005994769344964359, + "loss": 6.4096, + "step": 152 + }, + { + "epoch": 0.04893161966059685, + "grad_norm": 0.28985321521759033, + "learning_rate": 0.0005994584259005232, + "loss": 6.3923, + "step": 153 + }, + { + "epoch": 0.04925143416818245, + "grad_norm": 0.32829374074935913, + "learning_rate": 0.0005994395958069254, + "loss": 6.351, + "step": 154 + }, + { + "epoch": 0.04957124867576806, + "grad_norm": 0.32175174355506897, + "learning_rate": 0.0005994204442358586, + "loss": 6.3592, + "step": 155 + }, + { + "epoch": 0.04989106318335366, + "grad_norm": 0.3477462828159332, + "learning_rate": 0.0005994009712078839, + "loss": 6.3272, + "step": 156 + }, + { + "epoch": 0.05021087769093926, + "grad_norm": 0.31097620725631714, + "learning_rate": 0.0005993811767439074, + "loss": 6.358, + "step": 157 + }, + { + "epoch": 0.05053069219852486, + "grad_norm": 0.3622291386127472, + "learning_rate": 0.0005993610608651804, + "loss": 6.3131, + "step": 158 + }, + { + "epoch": 0.05085050670611046, + "grad_norm": 0.3798142373561859, + "learning_rate": 0.0005993406235932992, + "loss": 6.3733, + "step": 159 + }, + { + "epoch": 0.05117032121369606, + "grad_norm": 0.3285475969314575, + "learning_rate": 0.0005993198649502054, + "loss": 6.3525, + "step": 160 + }, + { + "epoch": 0.05149013572128166, + "grad_norm": 0.3842519223690033, + "learning_rate": 0.0005992987849581852, + "loss": 6.3819, + "step": 161 + }, + { + "epoch": 0.05180995022886726, + "grad_norm": 0.38792696595191956, + "learning_rate": 0.00059927738363987, + "loss": 6.383, + "step": 162 + }, + { + "epoch": 0.05212976473645286, + "grad_norm": 0.3751309812068939, + "learning_rate": 0.0005992556610182364, + "loss": 6.3039, + "step": 163 + }, + { + "epoch": 0.05244957924403846, + "grad_norm": 0.4142090976238251, + "learning_rate": 0.0005992336171166056, + "loss": 6.3056, + "step": 164 + }, + { + "epoch": 0.05276939375162406, + "grad_norm": 0.6167832016944885, + "learning_rate": 0.0005992112519586438, + "loss": 6.2986, + "step": 165 + }, + { + "epoch": 0.05308920825920966, + "grad_norm": 0.5728801488876343, + "learning_rate": 0.0005991885655683624, + "loss": 6.2972, + "step": 166 + }, + { + "epoch": 0.05340902276679526, + "grad_norm": 0.3504704236984253, + "learning_rate": 0.0005991655579701171, + "loss": 6.1997, + "step": 167 + }, + { + "epoch": 0.05372883727438086, + "grad_norm": 0.4289745092391968, + "learning_rate": 0.0005991422291886092, + "loss": 6.2569, + "step": 168 + }, + { + "epoch": 0.05404865178196646, + "grad_norm": 0.3686276972293854, + "learning_rate": 0.000599118579248884, + "loss": 6.2246, + "step": 169 + }, + { + "epoch": 0.05436846628955206, + "grad_norm": 0.3959043323993683, + "learning_rate": 0.0005990946081763324, + "loss": 6.3558, + "step": 170 + }, + { + "epoch": 0.05468828079713766, + "grad_norm": 0.31661829352378845, + "learning_rate": 0.0005990703159966894, + "loss": 6.2752, + "step": 171 + }, + { + "epoch": 0.05500809530472326, + "grad_norm": 0.3865572512149811, + "learning_rate": 0.000599045702736035, + "loss": 6.2855, + "step": 172 + }, + { + "epoch": 0.05532790981230886, + "grad_norm": 0.31952956318855286, + "learning_rate": 0.000599020768420794, + "loss": 6.2626, + "step": 173 + }, + { + "epoch": 0.05564772431989446, + "grad_norm": 0.40415337681770325, + "learning_rate": 0.000598995513077736, + "loss": 6.3349, + "step": 174 + }, + { + "epoch": 0.05596753882748006, + "grad_norm": 0.39738693833351135, + "learning_rate": 0.0005989699367339748, + "loss": 6.358, + "step": 175 + }, + { + "epoch": 0.05628735333506566, + "grad_norm": 0.34964892268180847, + "learning_rate": 0.0005989440394169692, + "loss": 6.2258, + "step": 176 + }, + { + "epoch": 0.05660716784265126, + "grad_norm": 0.3460441529750824, + "learning_rate": 0.0005989178211545223, + "loss": 6.2828, + "step": 177 + }, + { + "epoch": 0.05692698235023686, + "grad_norm": 0.38462570309638977, + "learning_rate": 0.0005988912819747822, + "loss": 6.2042, + "step": 178 + }, + { + "epoch": 0.05724679685782246, + "grad_norm": 0.35884854197502136, + "learning_rate": 0.0005988644219062412, + "loss": 6.2393, + "step": 179 + }, + { + "epoch": 0.05756661136540806, + "grad_norm": 0.3842756748199463, + "learning_rate": 0.0005988372409777362, + "loss": 6.1717, + "step": 180 + }, + { + "epoch": 0.05788642587299366, + "grad_norm": 0.3338313698768616, + "learning_rate": 0.0005988097392184486, + "loss": 6.2315, + "step": 181 + }, + { + "epoch": 0.05820624038057926, + "grad_norm": 0.36342915892601013, + "learning_rate": 0.000598781916657904, + "loss": 6.2506, + "step": 182 + }, + { + "epoch": 0.05852605488816486, + "grad_norm": 0.3592490255832672, + "learning_rate": 0.0005987537733259729, + "loss": 6.2037, + "step": 183 + }, + { + "epoch": 0.05884586939575046, + "grad_norm": 0.40599575638771057, + "learning_rate": 0.0005987253092528697, + "loss": 6.1787, + "step": 184 + }, + { + "epoch": 0.05916568390333606, + "grad_norm": 0.38951805233955383, + "learning_rate": 0.0005986965244691533, + "loss": 6.1482, + "step": 185 + }, + { + "epoch": 0.05948549841092166, + "grad_norm": 0.3523838520050049, + "learning_rate": 0.0005986674190057274, + "loss": 6.1634, + "step": 186 + }, + { + "epoch": 0.05980531291850726, + "grad_norm": 0.3295622766017914, + "learning_rate": 0.0005986379928938389, + "loss": 6.1444, + "step": 187 + }, + { + "epoch": 0.06012512742609286, + "grad_norm": 0.3437183201313019, + "learning_rate": 0.0005986082461650801, + "loss": 6.1733, + "step": 188 + }, + { + "epoch": 0.06044494193367847, + "grad_norm": 0.4113365709781647, + "learning_rate": 0.0005985781788513867, + "loss": 6.1967, + "step": 189 + }, + { + "epoch": 0.06076475644126407, + "grad_norm": 0.3398386240005493, + "learning_rate": 0.000598547790985039, + "loss": 6.1989, + "step": 190 + }, + { + "epoch": 0.06108457094884967, + "grad_norm": 0.3166162669658661, + "learning_rate": 0.0005985170825986613, + "loss": 6.1402, + "step": 191 + }, + { + "epoch": 0.06140438545643527, + "grad_norm": 0.4324597120285034, + "learning_rate": 0.000598486053725222, + "loss": 6.2131, + "step": 192 + }, + { + "epoch": 0.06172419996402087, + "grad_norm": 0.4721795320510864, + "learning_rate": 0.0005984547043980338, + "loss": 6.1555, + "step": 193 + }, + { + "epoch": 0.06204401447160647, + "grad_norm": 0.4092848002910614, + "learning_rate": 0.0005984230346507529, + "loss": 6.1317, + "step": 194 + }, + { + "epoch": 0.06236382897919207, + "grad_norm": 0.3960781991481781, + "learning_rate": 0.0005983910445173802, + "loss": 6.1638, + "step": 195 + }, + { + "epoch": 0.06268364348677767, + "grad_norm": 0.4284694492816925, + "learning_rate": 0.00059835873403226, + "loss": 6.1422, + "step": 196 + }, + { + "epoch": 0.06300345799436327, + "grad_norm": 0.3202615976333618, + "learning_rate": 0.0005983261032300807, + "loss": 6.0783, + "step": 197 + }, + { + "epoch": 0.06332327250194887, + "grad_norm": 0.40215131640434265, + "learning_rate": 0.0005982931521458747, + "loss": 6.0568, + "step": 198 + }, + { + "epoch": 0.06364308700953447, + "grad_norm": 0.36528280377388, + "learning_rate": 0.0005982598808150184, + "loss": 6.2003, + "step": 199 + }, + { + "epoch": 0.06396290151712007, + "grad_norm": 0.3844723701477051, + "learning_rate": 0.0005982262892732315, + "loss": 6.1754, + "step": 200 + }, + { + "epoch": 0.06396290151712007, + "eval_loss": 6.124255180358887, + "eval_runtime": 79.4151, + "eval_samples_per_second": 23.887, + "eval_steps_per_second": 5.981, + "step": 200 + }, + { + "epoch": 0.06428271602470567, + "grad_norm": 0.3817574679851532, + "learning_rate": 0.000598192377556578, + "loss": 6.1477, + "step": 201 + }, + { + "epoch": 0.06460253053229127, + "grad_norm": 0.36154475808143616, + "learning_rate": 0.0005981581457014652, + "loss": 6.1077, + "step": 202 + }, + { + "epoch": 0.06492234503987687, + "grad_norm": 0.33983469009399414, + "learning_rate": 0.0005981235937446446, + "loss": 6.1379, + "step": 203 + }, + { + "epoch": 0.06524215954746247, + "grad_norm": 0.3408897817134857, + "learning_rate": 0.0005980887217232107, + "loss": 6.1238, + "step": 204 + }, + { + "epoch": 0.06556197405504807, + "grad_norm": 0.3715430200099945, + "learning_rate": 0.0005980535296746023, + "loss": 6.1099, + "step": 205 + }, + { + "epoch": 0.06588178856263367, + "grad_norm": 0.39493903517723083, + "learning_rate": 0.0005980180176366013, + "loss": 6.1089, + "step": 206 + }, + { + "epoch": 0.06620160307021927, + "grad_norm": 0.35184791684150696, + "learning_rate": 0.0005979821856473336, + "loss": 6.0774, + "step": 207 + }, + { + "epoch": 0.06652141757780487, + "grad_norm": 0.3386211097240448, + "learning_rate": 0.0005979460337452681, + "loss": 6.0681, + "step": 208 + }, + { + "epoch": 0.06684123208539047, + "grad_norm": 0.34366345405578613, + "learning_rate": 0.0005979095619692172, + "loss": 6.0457, + "step": 209 + }, + { + "epoch": 0.06716104659297607, + "grad_norm": 0.4233472943305969, + "learning_rate": 0.0005978727703583374, + "loss": 6.0769, + "step": 210 + }, + { + "epoch": 0.06748086110056167, + "grad_norm": 0.38509127497673035, + "learning_rate": 0.0005978356589521276, + "loss": 6.0947, + "step": 211 + }, + { + "epoch": 0.06780067560814727, + "grad_norm": 0.3696242570877075, + "learning_rate": 0.0005977982277904306, + "loss": 6.0721, + "step": 212 + }, + { + "epoch": 0.06812049011573287, + "grad_norm": 0.4310404658317566, + "learning_rate": 0.0005977604769134325, + "loss": 6.0199, + "step": 213 + }, + { + "epoch": 0.06844030462331847, + "grad_norm": 0.47304555773735046, + "learning_rate": 0.0005977224063616625, + "loss": 6.115, + "step": 214 + }, + { + "epoch": 0.06876011913090407, + "grad_norm": 0.3975367844104767, + "learning_rate": 0.0005976840161759931, + "loss": 6.0713, + "step": 215 + }, + { + "epoch": 0.06907993363848967, + "grad_norm": 0.35958242416381836, + "learning_rate": 0.0005976453063976396, + "loss": 6.1155, + "step": 216 + }, + { + "epoch": 0.06939974814607527, + "grad_norm": 0.3872879147529602, + "learning_rate": 0.000597606277068161, + "loss": 6.0932, + "step": 217 + }, + { + "epoch": 0.06971956265366087, + "grad_norm": 0.4409867525100708, + "learning_rate": 0.000597566928229459, + "loss": 6.062, + "step": 218 + }, + { + "epoch": 0.07003937716124647, + "grad_norm": 0.39331111311912537, + "learning_rate": 0.0005975272599237784, + "loss": 5.9937, + "step": 219 + }, + { + "epoch": 0.07035919166883207, + "grad_norm": 0.42473429441452026, + "learning_rate": 0.0005974872721937069, + "loss": 6.038, + "step": 220 + }, + { + "epoch": 0.07067900617641767, + "grad_norm": 0.5901650190353394, + "learning_rate": 0.0005974469650821753, + "loss": 6.0971, + "step": 221 + }, + { + "epoch": 0.07099882068400327, + "grad_norm": 0.4795404374599457, + "learning_rate": 0.0005974063386324571, + "loss": 6.0316, + "step": 222 + }, + { + "epoch": 0.07131863519158887, + "grad_norm": 0.4145892858505249, + "learning_rate": 0.0005973653928881688, + "loss": 5.9712, + "step": 223 + }, + { + "epoch": 0.07163844969917448, + "grad_norm": 0.36704781651496887, + "learning_rate": 0.0005973241278932695, + "loss": 6.0002, + "step": 224 + }, + { + "epoch": 0.07195826420676008, + "grad_norm": 0.3889116048812866, + "learning_rate": 0.0005972825436920615, + "loss": 5.9938, + "step": 225 + }, + { + "epoch": 0.07227807871434568, + "grad_norm": 0.39109617471694946, + "learning_rate": 0.0005972406403291893, + "loss": 6.0182, + "step": 226 + }, + { + "epoch": 0.07259789322193128, + "grad_norm": 0.3956647217273712, + "learning_rate": 0.00059719841784964, + "loss": 6.0772, + "step": 227 + }, + { + "epoch": 0.07291770772951688, + "grad_norm": 0.3841460943222046, + "learning_rate": 0.0005971558762987439, + "loss": 5.9538, + "step": 228 + }, + { + "epoch": 0.07323752223710248, + "grad_norm": 0.3795830011367798, + "learning_rate": 0.0005971130157221733, + "loss": 6.022, + "step": 229 + }, + { + "epoch": 0.07355733674468808, + "grad_norm": 0.3490051031112671, + "learning_rate": 0.0005970698361659431, + "loss": 6.0195, + "step": 230 + }, + { + "epoch": 0.07387715125227368, + "grad_norm": 0.3971054255962372, + "learning_rate": 0.000597026337676411, + "loss": 6.0745, + "step": 231 + }, + { + "epoch": 0.07419696575985928, + "grad_norm": 0.3888954520225525, + "learning_rate": 0.0005969825203002765, + "loss": 5.998, + "step": 232 + }, + { + "epoch": 0.07451678026744488, + "grad_norm": 0.3782314360141754, + "learning_rate": 0.0005969383840845822, + "loss": 6.0144, + "step": 233 + }, + { + "epoch": 0.07483659477503048, + "grad_norm": 0.37992793321609497, + "learning_rate": 0.0005968939290767123, + "loss": 5.9202, + "step": 234 + }, + { + "epoch": 0.07515640928261608, + "grad_norm": 0.3844238817691803, + "learning_rate": 0.0005968491553243937, + "loss": 6.0261, + "step": 235 + }, + { + "epoch": 0.07547622379020168, + "grad_norm": 0.31843745708465576, + "learning_rate": 0.0005968040628756955, + "loss": 5.9868, + "step": 236 + }, + { + "epoch": 0.07579603829778728, + "grad_norm": 0.38442689180374146, + "learning_rate": 0.0005967586517790285, + "loss": 5.873, + "step": 237 + }, + { + "epoch": 0.07611585280537288, + "grad_norm": 0.4192902445793152, + "learning_rate": 0.0005967129220831461, + "loss": 6.0088, + "step": 238 + }, + { + "epoch": 0.07643566731295848, + "grad_norm": 0.4624556005001068, + "learning_rate": 0.0005966668738371436, + "loss": 5.9821, + "step": 239 + }, + { + "epoch": 0.07675548182054408, + "grad_norm": 0.46568813920021057, + "learning_rate": 0.0005966205070904582, + "loss": 5.9741, + "step": 240 + }, + { + "epoch": 0.07707529632812969, + "grad_norm": 0.41743966937065125, + "learning_rate": 0.0005965738218928693, + "loss": 5.8992, + "step": 241 + }, + { + "epoch": 0.07739511083571529, + "grad_norm": 0.4026016891002655, + "learning_rate": 0.0005965268182944976, + "loss": 5.9726, + "step": 242 + }, + { + "epoch": 0.07771492534330089, + "grad_norm": 0.3927863538265228, + "learning_rate": 0.0005964794963458063, + "loss": 6.0312, + "step": 243 + }, + { + "epoch": 0.07803473985088649, + "grad_norm": 0.3907330632209778, + "learning_rate": 0.0005964318560976001, + "loss": 5.9757, + "step": 244 + }, + { + "epoch": 0.07835455435847209, + "grad_norm": 0.42175161838531494, + "learning_rate": 0.0005963838976010252, + "loss": 5.8983, + "step": 245 + }, + { + "epoch": 0.07867436886605769, + "grad_norm": 0.44585391879081726, + "learning_rate": 0.0005963356209075701, + "loss": 5.9396, + "step": 246 + }, + { + "epoch": 0.07899418337364329, + "grad_norm": 0.40885382890701294, + "learning_rate": 0.0005962870260690641, + "loss": 6.0422, + "step": 247 + }, + { + "epoch": 0.07931399788122889, + "grad_norm": 0.3636566996574402, + "learning_rate": 0.0005962381131376788, + "loss": 5.9002, + "step": 248 + }, + { + "epoch": 0.07963381238881449, + "grad_norm": 0.3666331171989441, + "learning_rate": 0.0005961888821659268, + "loss": 5.9966, + "step": 249 + }, + { + "epoch": 0.07995362689640009, + "grad_norm": 0.3743114173412323, + "learning_rate": 0.0005961393332066623, + "loss": 5.9821, + "step": 250 + }, + { + "epoch": 0.08027344140398569, + "grad_norm": 0.41628390550613403, + "learning_rate": 0.0005960894663130811, + "loss": 5.9146, + "step": 251 + }, + { + "epoch": 0.0805932559115713, + "grad_norm": 0.5151297450065613, + "learning_rate": 0.0005960392815387201, + "loss": 5.9578, + "step": 252 + }, + { + "epoch": 0.0809130704191569, + "grad_norm": 0.45721518993377686, + "learning_rate": 0.0005959887789374573, + "loss": 5.9206, + "step": 253 + }, + { + "epoch": 0.0812328849267425, + "grad_norm": 0.402630090713501, + "learning_rate": 0.0005959379585635124, + "loss": 5.9299, + "step": 254 + }, + { + "epoch": 0.0815526994343281, + "grad_norm": 0.40605592727661133, + "learning_rate": 0.0005958868204714459, + "loss": 5.9394, + "step": 255 + }, + { + "epoch": 0.0818725139419137, + "grad_norm": 0.35721316933631897, + "learning_rate": 0.0005958353647161595, + "loss": 5.9359, + "step": 256 + }, + { + "epoch": 0.0821923284494993, + "grad_norm": 0.38720184564590454, + "learning_rate": 0.0005957835913528959, + "loss": 5.8879, + "step": 257 + }, + { + "epoch": 0.0825121429570849, + "grad_norm": 0.3456253111362457, + "learning_rate": 0.0005957315004372391, + "loss": 5.8717, + "step": 258 + }, + { + "epoch": 0.0828319574646705, + "grad_norm": 0.3766682744026184, + "learning_rate": 0.0005956790920251133, + "loss": 5.8761, + "step": 259 + }, + { + "epoch": 0.0831517719722561, + "grad_norm": 0.3425726294517517, + "learning_rate": 0.0005956263661727844, + "loss": 5.9843, + "step": 260 + }, + { + "epoch": 0.0834715864798417, + "grad_norm": 0.33851897716522217, + "learning_rate": 0.0005955733229368586, + "loss": 5.964, + "step": 261 + }, + { + "epoch": 0.0837914009874273, + "grad_norm": 0.3460259437561035, + "learning_rate": 0.000595519962374283, + "loss": 5.92, + "step": 262 + }, + { + "epoch": 0.0841112154950129, + "grad_norm": 0.40014368295669556, + "learning_rate": 0.0005954662845423452, + "loss": 5.8951, + "step": 263 + }, + { + "epoch": 0.0844310300025985, + "grad_norm": 0.4758973717689514, + "learning_rate": 0.0005954122894986736, + "loss": 5.9198, + "step": 264 + }, + { + "epoch": 0.0847508445101841, + "grad_norm": 0.34136953949928284, + "learning_rate": 0.0005953579773012374, + "loss": 5.9156, + "step": 265 + }, + { + "epoch": 0.0850706590177697, + "grad_norm": 0.41231444478034973, + "learning_rate": 0.0005953033480083456, + "loss": 5.8732, + "step": 266 + }, + { + "epoch": 0.0853904735253553, + "grad_norm": 0.4707014560699463, + "learning_rate": 0.0005952484016786483, + "loss": 5.8981, + "step": 267 + }, + { + "epoch": 0.0857102880329409, + "grad_norm": 0.375381201505661, + "learning_rate": 0.0005951931383711357, + "loss": 5.8859, + "step": 268 + }, + { + "epoch": 0.0860301025405265, + "grad_norm": 0.40857964754104614, + "learning_rate": 0.0005951375581451382, + "loss": 5.8945, + "step": 269 + }, + { + "epoch": 0.0863499170481121, + "grad_norm": 0.4175202548503876, + "learning_rate": 0.0005950816610603266, + "loss": 5.9622, + "step": 270 + }, + { + "epoch": 0.0866697315556977, + "grad_norm": 0.3857949376106262, + "learning_rate": 0.0005950254471767119, + "loss": 5.9279, + "step": 271 + }, + { + "epoch": 0.0869895460632833, + "grad_norm": 0.3293483853340149, + "learning_rate": 0.0005949689165546453, + "loss": 5.9038, + "step": 272 + }, + { + "epoch": 0.0873093605708689, + "grad_norm": 0.3591293394565582, + "learning_rate": 0.0005949120692548177, + "loss": 5.8429, + "step": 273 + }, + { + "epoch": 0.0876291750784545, + "grad_norm": 0.3662240505218506, + "learning_rate": 0.0005948549053382602, + "loss": 5.8856, + "step": 274 + }, + { + "epoch": 0.0879489895860401, + "grad_norm": 0.3530612289905548, + "learning_rate": 0.0005947974248663439, + "loss": 5.9053, + "step": 275 + }, + { + "epoch": 0.0882688040936257, + "grad_norm": 0.4124448001384735, + "learning_rate": 0.0005947396279007796, + "loss": 5.8455, + "step": 276 + }, + { + "epoch": 0.0885886186012113, + "grad_norm": 0.46663254499435425, + "learning_rate": 0.0005946815145036181, + "loss": 5.9011, + "step": 277 + }, + { + "epoch": 0.0889084331087969, + "grad_norm": 0.48376330733299255, + "learning_rate": 0.0005946230847372496, + "loss": 5.8198, + "step": 278 + }, + { + "epoch": 0.0892282476163825, + "grad_norm": 0.43925702571868896, + "learning_rate": 0.0005945643386644041, + "loss": 5.8751, + "step": 279 + }, + { + "epoch": 0.0895480621239681, + "grad_norm": 0.38675469160079956, + "learning_rate": 0.0005945052763481514, + "loss": 5.8037, + "step": 280 + }, + { + "epoch": 0.0898678766315537, + "grad_norm": 0.3925098776817322, + "learning_rate": 0.0005944458978519006, + "loss": 5.8462, + "step": 281 + }, + { + "epoch": 0.0901876911391393, + "grad_norm": 0.4551983177661896, + "learning_rate": 0.0005943862032394, + "loss": 5.9222, + "step": 282 + }, + { + "epoch": 0.0905075056467249, + "grad_norm": 0.3979947566986084, + "learning_rate": 0.000594326192574738, + "loss": 5.7826, + "step": 283 + }, + { + "epoch": 0.0908273201543105, + "grad_norm": 0.36182329058647156, + "learning_rate": 0.0005942658659223415, + "loss": 5.781, + "step": 284 + }, + { + "epoch": 0.0911471346618961, + "grad_norm": 0.34769323468208313, + "learning_rate": 0.0005942052233469771, + "loss": 5.8048, + "step": 285 + }, + { + "epoch": 0.0914669491694817, + "grad_norm": 0.3327469825744629, + "learning_rate": 0.0005941442649137507, + "loss": 5.8698, + "step": 286 + }, + { + "epoch": 0.0917867636770673, + "grad_norm": 0.3392864465713501, + "learning_rate": 0.0005940829906881066, + "loss": 5.7951, + "step": 287 + }, + { + "epoch": 0.0921065781846529, + "grad_norm": 0.3782911002635956, + "learning_rate": 0.0005940214007358293, + "loss": 5.8339, + "step": 288 + }, + { + "epoch": 0.0924263926922385, + "grad_norm": 0.36674779653549194, + "learning_rate": 0.0005939594951230412, + "loss": 5.7968, + "step": 289 + }, + { + "epoch": 0.0927462071998241, + "grad_norm": 0.3638828694820404, + "learning_rate": 0.0005938972739162041, + "loss": 5.8608, + "step": 290 + }, + { + "epoch": 0.0930660217074097, + "grad_norm": 0.367432177066803, + "learning_rate": 0.0005938347371821183, + "loss": 5.8873, + "step": 291 + }, + { + "epoch": 0.0933858362149953, + "grad_norm": 0.3655487895011902, + "learning_rate": 0.0005937718849879232, + "loss": 5.8549, + "step": 292 + }, + { + "epoch": 0.0937056507225809, + "grad_norm": 0.36033663153648376, + "learning_rate": 0.0005937087174010968, + "loss": 5.86, + "step": 293 + }, + { + "epoch": 0.0940254652301665, + "grad_norm": 0.36835548281669617, + "learning_rate": 0.0005936452344894556, + "loss": 5.7836, + "step": 294 + }, + { + "epoch": 0.0943452797377521, + "grad_norm": 0.4200371205806732, + "learning_rate": 0.0005935814363211546, + "loss": 5.7613, + "step": 295 + }, + { + "epoch": 0.0946650942453377, + "grad_norm": 0.6512305736541748, + "learning_rate": 0.0005935173229646873, + "loss": 5.7951, + "step": 296 + }, + { + "epoch": 0.0949849087529233, + "grad_norm": 0.37604695558547974, + "learning_rate": 0.0005934528944888857, + "loss": 5.7957, + "step": 297 + }, + { + "epoch": 0.0953047232605089, + "grad_norm": 0.38934704661369324, + "learning_rate": 0.0005933881509629201, + "loss": 5.8117, + "step": 298 + }, + { + "epoch": 0.0956245377680945, + "grad_norm": 0.3900265097618103, + "learning_rate": 0.0005933230924562987, + "loss": 5.8096, + "step": 299 + }, + { + "epoch": 0.0959443522756801, + "grad_norm": 0.41541120409965515, + "learning_rate": 0.0005932577190388684, + "loss": 5.8756, + "step": 300 + }, + { + "epoch": 0.0959443522756801, + "eval_loss": 5.780359268188477, + "eval_runtime": 79.6584, + "eval_samples_per_second": 23.814, + "eval_steps_per_second": 5.963, + "step": 300 + }, + { + "epoch": 0.0962641667832657, + "grad_norm": 0.4308043122291565, + "learning_rate": 0.0005931920307808138, + "loss": 5.8541, + "step": 301 + }, + { + "epoch": 0.0965839812908513, + "grad_norm": 0.4143083691596985, + "learning_rate": 0.0005931260277526574, + "loss": 5.7856, + "step": 302 + }, + { + "epoch": 0.0969037957984369, + "grad_norm": 0.5045465230941772, + "learning_rate": 0.0005930597100252602, + "loss": 5.7552, + "step": 303 + }, + { + "epoch": 0.0972236103060225, + "grad_norm": 0.6526516079902649, + "learning_rate": 0.0005929930776698205, + "loss": 5.8308, + "step": 304 + }, + { + "epoch": 0.0975434248136081, + "grad_norm": 0.5426544547080994, + "learning_rate": 0.0005929261307578747, + "loss": 5.813, + "step": 305 + }, + { + "epoch": 0.0978632393211937, + "grad_norm": 0.5369417071342468, + "learning_rate": 0.0005928588693612969, + "loss": 5.7372, + "step": 306 + }, + { + "epoch": 0.0981830538287793, + "grad_norm": 0.44663500785827637, + "learning_rate": 0.0005927912935522985, + "loss": 5.7072, + "step": 307 + }, + { + "epoch": 0.0985028683363649, + "grad_norm": 0.4832947850227356, + "learning_rate": 0.0005927234034034289, + "loss": 5.8266, + "step": 308 + }, + { + "epoch": 0.09882268284395052, + "grad_norm": 0.40185096859931946, + "learning_rate": 0.0005926551989875746, + "loss": 5.7626, + "step": 309 + }, + { + "epoch": 0.09914249735153612, + "grad_norm": 0.4085821807384491, + "learning_rate": 0.0005925866803779598, + "loss": 5.7838, + "step": 310 + }, + { + "epoch": 0.09946231185912172, + "grad_norm": 0.3931988477706909, + "learning_rate": 0.0005925178476481458, + "loss": 5.7797, + "step": 311 + }, + { + "epoch": 0.09978212636670732, + "grad_norm": 0.3664226830005646, + "learning_rate": 0.0005924487008720313, + "loss": 5.8671, + "step": 312 + }, + { + "epoch": 0.10010194087429292, + "grad_norm": 0.3390718698501587, + "learning_rate": 0.0005923792401238519, + "loss": 5.7122, + "step": 313 + }, + { + "epoch": 0.10042175538187852, + "grad_norm": 0.3831624984741211, + "learning_rate": 0.0005923094654781805, + "loss": 5.7979, + "step": 314 + }, + { + "epoch": 0.10074156988946412, + "grad_norm": 0.31763604283332825, + "learning_rate": 0.0005922393770099271, + "loss": 5.7992, + "step": 315 + }, + { + "epoch": 0.10106138439704972, + "grad_norm": 0.4179720878601074, + "learning_rate": 0.0005921689747943384, + "loss": 5.7227, + "step": 316 + }, + { + "epoch": 0.10138119890463532, + "grad_norm": 0.3841486871242523, + "learning_rate": 0.0005920982589069979, + "loss": 5.6847, + "step": 317 + }, + { + "epoch": 0.10170101341222092, + "grad_norm": 0.3815637230873108, + "learning_rate": 0.0005920272294238261, + "loss": 5.6553, + "step": 318 + }, + { + "epoch": 0.10202082791980652, + "grad_norm": 0.39948034286499023, + "learning_rate": 0.0005919558864210801, + "loss": 5.7416, + "step": 319 + }, + { + "epoch": 0.10234064242739212, + "grad_norm": 0.38976314663887024, + "learning_rate": 0.0005918842299753534, + "loss": 5.6859, + "step": 320 + }, + { + "epoch": 0.10266045693497772, + "grad_norm": 0.3874850273132324, + "learning_rate": 0.0005918122601635763, + "loss": 5.6806, + "step": 321 + }, + { + "epoch": 0.10298027144256332, + "grad_norm": 0.3757673501968384, + "learning_rate": 0.0005917399770630151, + "loss": 5.7353, + "step": 322 + }, + { + "epoch": 0.10330008595014892, + "grad_norm": 0.36281487345695496, + "learning_rate": 0.000591667380751273, + "loss": 5.7287, + "step": 323 + }, + { + "epoch": 0.10361990045773452, + "grad_norm": 0.353299617767334, + "learning_rate": 0.0005915944713062891, + "loss": 5.7731, + "step": 324 + }, + { + "epoch": 0.10393971496532012, + "grad_norm": 0.40751269459724426, + "learning_rate": 0.0005915212488063387, + "loss": 5.6843, + "step": 325 + }, + { + "epoch": 0.10425952947290572, + "grad_norm": 0.35061565041542053, + "learning_rate": 0.0005914477133300333, + "loss": 5.7085, + "step": 326 + }, + { + "epoch": 0.10457934398049132, + "grad_norm": 0.42064836621284485, + "learning_rate": 0.0005913738649563205, + "loss": 5.6999, + "step": 327 + }, + { + "epoch": 0.10489915848807692, + "grad_norm": 0.39509114623069763, + "learning_rate": 0.0005912997037644834, + "loss": 5.7637, + "step": 328 + }, + { + "epoch": 0.10521897299566252, + "grad_norm": 0.39371103048324585, + "learning_rate": 0.0005912252298341416, + "loss": 5.7397, + "step": 329 + }, + { + "epoch": 0.10553878750324812, + "grad_norm": 0.39554670453071594, + "learning_rate": 0.0005911504432452498, + "loss": 5.7249, + "step": 330 + }, + { + "epoch": 0.10585860201083372, + "grad_norm": 0.4175823926925659, + "learning_rate": 0.0005910753440780988, + "loss": 5.6705, + "step": 331 + }, + { + "epoch": 0.10617841651841932, + "grad_norm": 0.4640887677669525, + "learning_rate": 0.0005909999324133148, + "loss": 5.6927, + "step": 332 + }, + { + "epoch": 0.10649823102600492, + "grad_norm": 0.39599356055259705, + "learning_rate": 0.0005909242083318596, + "loss": 5.7207, + "step": 333 + }, + { + "epoch": 0.10681804553359052, + "grad_norm": 0.38180121779441833, + "learning_rate": 0.0005908481719150303, + "loss": 5.7066, + "step": 334 + }, + { + "epoch": 0.10713786004117612, + "grad_norm": 0.4411526918411255, + "learning_rate": 0.0005907718232444594, + "loss": 5.651, + "step": 335 + }, + { + "epoch": 0.10745767454876172, + "grad_norm": 0.3934768736362457, + "learning_rate": 0.0005906951624021147, + "loss": 5.6699, + "step": 336 + }, + { + "epoch": 0.10777748905634732, + "grad_norm": 0.366379976272583, + "learning_rate": 0.0005906181894702987, + "loss": 5.6752, + "step": 337 + }, + { + "epoch": 0.10809730356393292, + "grad_norm": 0.45642510056495667, + "learning_rate": 0.0005905409045316497, + "loss": 5.6655, + "step": 338 + }, + { + "epoch": 0.10841711807151852, + "grad_norm": 0.490781307220459, + "learning_rate": 0.0005904633076691404, + "loss": 5.7221, + "step": 339 + }, + { + "epoch": 0.10873693257910412, + "grad_norm": 0.47324830293655396, + "learning_rate": 0.0005903853989660787, + "loss": 5.7254, + "step": 340 + }, + { + "epoch": 0.10905674708668972, + "grad_norm": 0.38551396131515503, + "learning_rate": 0.0005903071785061069, + "loss": 5.6606, + "step": 341 + }, + { + "epoch": 0.10937656159427532, + "grad_norm": 0.4228317439556122, + "learning_rate": 0.0005902286463732026, + "loss": 5.7021, + "step": 342 + }, + { + "epoch": 0.10969637610186092, + "grad_norm": 0.4366403818130493, + "learning_rate": 0.0005901498026516774, + "loss": 5.6812, + "step": 343 + }, + { + "epoch": 0.11001619060944652, + "grad_norm": 0.43695297837257385, + "learning_rate": 0.0005900706474261778, + "loss": 5.7154, + "step": 344 + }, + { + "epoch": 0.11033600511703212, + "grad_norm": 0.43707820773124695, + "learning_rate": 0.0005899911807816844, + "loss": 5.6471, + "step": 345 + }, + { + "epoch": 0.11065581962461772, + "grad_norm": 0.505287766456604, + "learning_rate": 0.0005899114028035128, + "loss": 5.7538, + "step": 346 + }, + { + "epoch": 0.11097563413220332, + "grad_norm": 0.4542320668697357, + "learning_rate": 0.0005898313135773121, + "loss": 5.6872, + "step": 347 + }, + { + "epoch": 0.11129544863978892, + "grad_norm": 0.3504129648208618, + "learning_rate": 0.0005897509131890658, + "loss": 5.6519, + "step": 348 + }, + { + "epoch": 0.11161526314737452, + "grad_norm": 0.4545753598213196, + "learning_rate": 0.0005896702017250916, + "loss": 5.6922, + "step": 349 + }, + { + "epoch": 0.11193507765496012, + "grad_norm": 0.36486905813217163, + "learning_rate": 0.0005895891792720413, + "loss": 5.6931, + "step": 350 + }, + { + "epoch": 0.11225489216254572, + "grad_norm": 0.41182687878608704, + "learning_rate": 0.0005895078459169, + "loss": 5.7055, + "step": 351 + }, + { + "epoch": 0.11257470667013132, + "grad_norm": 0.40154850482940674, + "learning_rate": 0.0005894262017469872, + "loss": 5.6622, + "step": 352 + }, + { + "epoch": 0.11289452117771692, + "grad_norm": 0.40887096524238586, + "learning_rate": 0.0005893442468499557, + "loss": 5.5879, + "step": 353 + }, + { + "epoch": 0.11321433568530252, + "grad_norm": 0.4395909607410431, + "learning_rate": 0.0005892619813137923, + "loss": 5.6652, + "step": 354 + }, + { + "epoch": 0.11353415019288812, + "grad_norm": 0.4587445557117462, + "learning_rate": 0.0005891794052268167, + "loss": 5.5781, + "step": 355 + }, + { + "epoch": 0.11385396470047372, + "grad_norm": 0.5725453495979309, + "learning_rate": 0.0005890965186776825, + "loss": 5.6664, + "step": 356 + }, + { + "epoch": 0.11417377920805932, + "grad_norm": 0.45469459891319275, + "learning_rate": 0.0005890133217553765, + "loss": 5.6017, + "step": 357 + }, + { + "epoch": 0.11449359371564492, + "grad_norm": 0.39412954449653625, + "learning_rate": 0.0005889298145492185, + "loss": 5.6518, + "step": 358 + }, + { + "epoch": 0.11481340822323052, + "grad_norm": 0.3986184000968933, + "learning_rate": 0.0005888459971488618, + "loss": 5.6875, + "step": 359 + }, + { + "epoch": 0.11513322273081612, + "grad_norm": 0.4425166845321655, + "learning_rate": 0.0005887618696442925, + "loss": 5.7008, + "step": 360 + }, + { + "epoch": 0.11545303723840172, + "grad_norm": 0.4173751473426819, + "learning_rate": 0.0005886774321258294, + "loss": 5.5739, + "step": 361 + }, + { + "epoch": 0.11577285174598732, + "grad_norm": 0.44372227787971497, + "learning_rate": 0.0005885926846841246, + "loss": 5.6711, + "step": 362 + }, + { + "epoch": 0.11609266625357292, + "grad_norm": 0.4772733449935913, + "learning_rate": 0.0005885076274101627, + "loss": 5.6079, + "step": 363 + }, + { + "epoch": 0.11641248076115852, + "grad_norm": 0.4153997004032135, + "learning_rate": 0.0005884222603952608, + "loss": 5.6619, + "step": 364 + }, + { + "epoch": 0.11673229526874412, + "grad_norm": 0.37677884101867676, + "learning_rate": 0.0005883365837310689, + "loss": 5.5207, + "step": 365 + }, + { + "epoch": 0.11705210977632972, + "grad_norm": 0.42494794726371765, + "learning_rate": 0.0005882505975095689, + "loss": 5.5696, + "step": 366 + }, + { + "epoch": 0.11737192428391532, + "grad_norm": 0.6638725399971008, + "learning_rate": 0.0005881643018230755, + "loss": 5.659, + "step": 367 + }, + { + "epoch": 0.11769173879150092, + "grad_norm": 0.47131872177124023, + "learning_rate": 0.0005880776967642355, + "loss": 5.5253, + "step": 368 + }, + { + "epoch": 0.11801155329908652, + "grad_norm": 0.5139328241348267, + "learning_rate": 0.0005879907824260281, + "loss": 5.5355, + "step": 369 + }, + { + "epoch": 0.11833136780667212, + "grad_norm": 0.3897528350353241, + "learning_rate": 0.0005879035589017638, + "loss": 5.6067, + "step": 370 + }, + { + "epoch": 0.11865118231425772, + "grad_norm": 0.4552856385707855, + "learning_rate": 0.0005878160262850859, + "loss": 5.6653, + "step": 371 + }, + { + "epoch": 0.11897099682184333, + "grad_norm": 0.457346111536026, + "learning_rate": 0.0005877281846699689, + "loss": 5.5748, + "step": 372 + }, + { + "epoch": 0.11929081132942893, + "grad_norm": 0.40474575757980347, + "learning_rate": 0.0005876400341507194, + "loss": 5.5896, + "step": 373 + }, + { + "epoch": 0.11961062583701453, + "grad_norm": 0.4695594012737274, + "learning_rate": 0.0005875515748219757, + "loss": 5.6558, + "step": 374 + }, + { + "epoch": 0.11993044034460013, + "grad_norm": 0.41759058833122253, + "learning_rate": 0.0005874628067787072, + "loss": 5.6042, + "step": 375 + }, + { + "epoch": 0.12025025485218573, + "grad_norm": 0.47090569138526917, + "learning_rate": 0.0005873737301162151, + "loss": 5.5964, + "step": 376 + }, + { + "epoch": 0.12057006935977133, + "grad_norm": 0.4760238826274872, + "learning_rate": 0.000587284344930132, + "loss": 5.5101, + "step": 377 + }, + { + "epoch": 0.12088988386735694, + "grad_norm": 0.3871617615222931, + "learning_rate": 0.0005871946513164213, + "loss": 5.5711, + "step": 378 + }, + { + "epoch": 0.12120969837494254, + "grad_norm": 0.43261823058128357, + "learning_rate": 0.000587104649371378, + "loss": 5.5596, + "step": 379 + }, + { + "epoch": 0.12152951288252814, + "grad_norm": 0.3882465362548828, + "learning_rate": 0.000587014339191628, + "loss": 5.5447, + "step": 380 + }, + { + "epoch": 0.12184932739011374, + "grad_norm": 0.35880735516548157, + "learning_rate": 0.0005869237208741278, + "loss": 5.5992, + "step": 381 + }, + { + "epoch": 0.12216914189769934, + "grad_norm": 0.3885442614555359, + "learning_rate": 0.0005868327945161651, + "loss": 5.6215, + "step": 382 + }, + { + "epoch": 0.12248895640528494, + "grad_norm": 0.4411607086658478, + "learning_rate": 0.0005867415602153582, + "loss": 5.6445, + "step": 383 + }, + { + "epoch": 0.12280877091287054, + "grad_norm": 0.37853386998176575, + "learning_rate": 0.0005866500180696558, + "loss": 5.5588, + "step": 384 + }, + { + "epoch": 0.12312858542045614, + "grad_norm": 0.36723509430885315, + "learning_rate": 0.0005865581681773374, + "loss": 5.516, + "step": 385 + }, + { + "epoch": 0.12344839992804174, + "grad_norm": 0.37725216150283813, + "learning_rate": 0.000586466010637013, + "loss": 5.5922, + "step": 386 + }, + { + "epoch": 0.12376821443562734, + "grad_norm": 0.36253562569618225, + "learning_rate": 0.0005863735455476222, + "loss": 5.5197, + "step": 387 + }, + { + "epoch": 0.12408802894321294, + "grad_norm": 0.3463018834590912, + "learning_rate": 0.0005862807730084356, + "loss": 5.6061, + "step": 388 + }, + { + "epoch": 0.12440784345079854, + "grad_norm": 0.34062060713768005, + "learning_rate": 0.0005861876931190534, + "loss": 5.6057, + "step": 389 + }, + { + "epoch": 0.12472765795838414, + "grad_norm": 0.39427444338798523, + "learning_rate": 0.0005860943059794059, + "loss": 5.5445, + "step": 390 + }, + { + "epoch": 0.12504747246596973, + "grad_norm": 0.37826618552207947, + "learning_rate": 0.0005860006116897533, + "loss": 5.4999, + "step": 391 + }, + { + "epoch": 0.12536728697355534, + "grad_norm": 0.37145209312438965, + "learning_rate": 0.0005859066103506853, + "loss": 5.4957, + "step": 392 + }, + { + "epoch": 0.12568710148114093, + "grad_norm": 0.3861520290374756, + "learning_rate": 0.0005858123020631218, + "loss": 5.5209, + "step": 393 + }, + { + "epoch": 0.12600691598872654, + "grad_norm": 0.447878360748291, + "learning_rate": 0.0005857176869283118, + "loss": 5.5265, + "step": 394 + }, + { + "epoch": 0.12632673049631213, + "grad_norm": 0.37489327788352966, + "learning_rate": 0.0005856227650478335, + "loss": 5.5166, + "step": 395 + }, + { + "epoch": 0.12664654500389774, + "grad_norm": 0.6625049710273743, + "learning_rate": 0.0005855275365235953, + "loss": 5.5915, + "step": 396 + }, + { + "epoch": 0.12696635951148333, + "grad_norm": 0.44229626655578613, + "learning_rate": 0.0005854320014578338, + "loss": 5.5095, + "step": 397 + }, + { + "epoch": 0.12728617401906894, + "grad_norm": 0.4284876585006714, + "learning_rate": 0.0005853361599531155, + "loss": 5.4948, + "step": 398 + }, + { + "epoch": 0.12760598852665453, + "grad_norm": 0.3634096086025238, + "learning_rate": 0.0005852400121123353, + "loss": 5.5658, + "step": 399 + }, + { + "epoch": 0.12792580303424014, + "grad_norm": 0.43918880820274353, + "learning_rate": 0.0005851435580387175, + "loss": 5.5348, + "step": 400 + }, + { + "epoch": 0.12792580303424014, + "eval_loss": 5.526098251342773, + "eval_runtime": 82.1635, + "eval_samples_per_second": 23.088, + "eval_steps_per_second": 5.781, + "step": 400 + }, + { + "epoch": 0.12824561754182573, + "grad_norm": 0.3572410047054291, + "learning_rate": 0.0005850467978358146, + "loss": 5.5987, + "step": 401 + }, + { + "epoch": 0.12856543204941134, + "grad_norm": 0.3999462127685547, + "learning_rate": 0.0005849497316075084, + "loss": 5.4837, + "step": 402 + }, + { + "epoch": 0.12888524655699693, + "grad_norm": 0.4488193988800049, + "learning_rate": 0.0005848523594580086, + "loss": 5.5807, + "step": 403 + }, + { + "epoch": 0.12920506106458254, + "grad_norm": 0.3959190845489502, + "learning_rate": 0.0005847546814918538, + "loss": 5.5433, + "step": 404 + }, + { + "epoch": 0.12952487557216813, + "grad_norm": 0.4110461175441742, + "learning_rate": 0.0005846566978139108, + "loss": 5.4933, + "step": 405 + }, + { + "epoch": 0.12984469007975374, + "grad_norm": 0.4625667333602905, + "learning_rate": 0.0005845584085293745, + "loss": 5.4707, + "step": 406 + }, + { + "epoch": 0.13016450458733933, + "grad_norm": 0.3771616816520691, + "learning_rate": 0.0005844598137437682, + "loss": 5.4576, + "step": 407 + }, + { + "epoch": 0.13048431909492494, + "grad_norm": 0.42803341150283813, + "learning_rate": 0.0005843609135629427, + "loss": 5.5858, + "step": 408 + }, + { + "epoch": 0.13080413360251053, + "grad_norm": 0.4550051987171173, + "learning_rate": 0.0005842617080930771, + "loss": 5.549, + "step": 409 + }, + { + "epoch": 0.13112394811009614, + "grad_norm": 0.41399329900741577, + "learning_rate": 0.000584162197440678, + "loss": 5.6118, + "step": 410 + }, + { + "epoch": 0.13144376261768173, + "grad_norm": 0.41662803292274475, + "learning_rate": 0.0005840623817125799, + "loss": 5.4915, + "step": 411 + }, + { + "epoch": 0.13176357712526734, + "grad_norm": 0.4127683639526367, + "learning_rate": 0.0005839622610159446, + "loss": 5.5255, + "step": 412 + }, + { + "epoch": 0.13208339163285296, + "grad_norm": 0.4265100955963135, + "learning_rate": 0.0005838618354582612, + "loss": 5.4756, + "step": 413 + }, + { + "epoch": 0.13240320614043855, + "grad_norm": 0.38000795245170593, + "learning_rate": 0.0005837611051473466, + "loss": 5.4627, + "step": 414 + }, + { + "epoch": 0.13272302064802416, + "grad_norm": 0.43064582347869873, + "learning_rate": 0.0005836600701913443, + "loss": 5.4952, + "step": 415 + }, + { + "epoch": 0.13304283515560975, + "grad_norm": 0.38073885440826416, + "learning_rate": 0.0005835587306987255, + "loss": 5.5138, + "step": 416 + }, + { + "epoch": 0.13336264966319536, + "grad_norm": 0.37120160460472107, + "learning_rate": 0.0005834570867782875, + "loss": 5.5417, + "step": 417 + }, + { + "epoch": 0.13368246417078095, + "grad_norm": 0.3759710490703583, + "learning_rate": 0.0005833551385391551, + "loss": 5.5581, + "step": 418 + }, + { + "epoch": 0.13400227867836656, + "grad_norm": 0.4189684987068176, + "learning_rate": 0.0005832528860907798, + "loss": 5.4671, + "step": 419 + }, + { + "epoch": 0.13432209318595215, + "grad_norm": 0.4628429412841797, + "learning_rate": 0.0005831503295429393, + "loss": 5.4805, + "step": 420 + }, + { + "epoch": 0.13464190769353776, + "grad_norm": 0.4366797208786011, + "learning_rate": 0.0005830474690057383, + "loss": 5.5822, + "step": 421 + }, + { + "epoch": 0.13496172220112335, + "grad_norm": 0.3592755198478699, + "learning_rate": 0.0005829443045896072, + "loss": 5.5895, + "step": 422 + }, + { + "epoch": 0.13528153670870896, + "grad_norm": 0.4137701392173767, + "learning_rate": 0.0005828408364053031, + "loss": 5.4889, + "step": 423 + }, + { + "epoch": 0.13560135121629455, + "grad_norm": 0.3506132960319519, + "learning_rate": 0.0005827370645639095, + "loss": 5.5088, + "step": 424 + }, + { + "epoch": 0.13592116572388016, + "grad_norm": 0.34037062525749207, + "learning_rate": 0.0005826329891768351, + "loss": 5.4807, + "step": 425 + }, + { + "epoch": 0.13624098023146575, + "grad_norm": 0.5105721950531006, + "learning_rate": 0.0005825286103558151, + "loss": 5.4829, + "step": 426 + }, + { + "epoch": 0.13656079473905136, + "grad_norm": 0.39539438486099243, + "learning_rate": 0.0005824239282129103, + "loss": 5.5657, + "step": 427 + }, + { + "epoch": 0.13688060924663695, + "grad_norm": 0.45470479130744934, + "learning_rate": 0.0005823189428605072, + "loss": 5.5231, + "step": 428 + }, + { + "epoch": 0.13720042375422256, + "grad_norm": 0.42423999309539795, + "learning_rate": 0.0005822136544113177, + "loss": 5.3981, + "step": 429 + }, + { + "epoch": 0.13752023826180815, + "grad_norm": 0.34813690185546875, + "learning_rate": 0.000582108062978379, + "loss": 5.454, + "step": 430 + }, + { + "epoch": 0.13784005276939376, + "grad_norm": 0.36712193489074707, + "learning_rate": 0.0005820021686750542, + "loss": 5.4303, + "step": 431 + }, + { + "epoch": 0.13815986727697935, + "grad_norm": 0.3907421827316284, + "learning_rate": 0.0005818959716150306, + "loss": 5.4179, + "step": 432 + }, + { + "epoch": 0.13847968178456496, + "grad_norm": 0.38730525970458984, + "learning_rate": 0.0005817894719123214, + "loss": 5.4916, + "step": 433 + }, + { + "epoch": 0.13879949629215055, + "grad_norm": 0.4636697471141815, + "learning_rate": 0.0005816826696812643, + "loss": 5.4353, + "step": 434 + }, + { + "epoch": 0.13911931079973616, + "grad_norm": 0.4625189006328583, + "learning_rate": 0.0005815755650365217, + "loss": 5.4693, + "step": 435 + }, + { + "epoch": 0.13943912530732175, + "grad_norm": 0.39926567673683167, + "learning_rate": 0.000581468158093081, + "loss": 5.4779, + "step": 436 + }, + { + "epoch": 0.13975893981490736, + "grad_norm": 0.4575376510620117, + "learning_rate": 0.0005813604489662539, + "loss": 5.5438, + "step": 437 + }, + { + "epoch": 0.14007875432249295, + "grad_norm": 0.36058712005615234, + "learning_rate": 0.0005812524377716766, + "loss": 5.5096, + "step": 438 + }, + { + "epoch": 0.14039856883007856, + "grad_norm": 0.4126695990562439, + "learning_rate": 0.0005811441246253098, + "loss": 5.4493, + "step": 439 + }, + { + "epoch": 0.14071838333766415, + "grad_norm": 0.41481906175613403, + "learning_rate": 0.0005810355096434378, + "loss": 5.4734, + "step": 440 + }, + { + "epoch": 0.14103819784524976, + "grad_norm": 0.4174092710018158, + "learning_rate": 0.0005809265929426696, + "loss": 5.4339, + "step": 441 + }, + { + "epoch": 0.14135801235283535, + "grad_norm": 0.4385882318019867, + "learning_rate": 0.0005808173746399377, + "loss": 5.4296, + "step": 442 + }, + { + "epoch": 0.14167782686042096, + "grad_norm": 0.40480849146842957, + "learning_rate": 0.0005807078548524988, + "loss": 5.3593, + "step": 443 + }, + { + "epoch": 0.14199764136800655, + "grad_norm": 0.40930604934692383, + "learning_rate": 0.0005805980336979327, + "loss": 5.5535, + "step": 444 + }, + { + "epoch": 0.14231745587559216, + "grad_norm": 0.4092625677585602, + "learning_rate": 0.0005804879112941433, + "loss": 5.3996, + "step": 445 + }, + { + "epoch": 0.14263727038317775, + "grad_norm": 0.3877846300601959, + "learning_rate": 0.0005803774877593575, + "loss": 5.4833, + "step": 446 + }, + { + "epoch": 0.14295708489076336, + "grad_norm": 0.43150779604911804, + "learning_rate": 0.000580266763212126, + "loss": 5.4727, + "step": 447 + }, + { + "epoch": 0.14327689939834895, + "grad_norm": 0.3717440366744995, + "learning_rate": 0.0005801557377713218, + "loss": 5.4995, + "step": 448 + }, + { + "epoch": 0.14359671390593456, + "grad_norm": 0.39955979585647583, + "learning_rate": 0.0005800444115561422, + "loss": 5.511, + "step": 449 + }, + { + "epoch": 0.14391652841352015, + "grad_norm": 0.36227232217788696, + "learning_rate": 0.000579932784686106, + "loss": 5.4445, + "step": 450 + }, + { + "epoch": 0.14423634292110576, + "grad_norm": 0.36275947093963623, + "learning_rate": 0.000579820857281056, + "loss": 5.4402, + "step": 451 + }, + { + "epoch": 0.14455615742869135, + "grad_norm": 0.38074591755867004, + "learning_rate": 0.0005797086294611569, + "loss": 5.4352, + "step": 452 + }, + { + "epoch": 0.14487597193627696, + "grad_norm": 0.37614187598228455, + "learning_rate": 0.0005795961013468961, + "loss": 5.4581, + "step": 453 + }, + { + "epoch": 0.14519578644386255, + "grad_norm": 0.35029304027557373, + "learning_rate": 0.0005794832730590836, + "loss": 5.4321, + "step": 454 + }, + { + "epoch": 0.14551560095144817, + "grad_norm": 0.3891676366329193, + "learning_rate": 0.0005793701447188514, + "loss": 5.3738, + "step": 455 + }, + { + "epoch": 0.14583541545903375, + "grad_norm": 0.41309744119644165, + "learning_rate": 0.0005792567164476539, + "loss": 5.4222, + "step": 456 + }, + { + "epoch": 0.14615522996661937, + "grad_norm": 0.40795260667800903, + "learning_rate": 0.0005791429883672672, + "loss": 5.3891, + "step": 457 + }, + { + "epoch": 0.14647504447420495, + "grad_norm": 0.4105323255062103, + "learning_rate": 0.0005790289605997895, + "loss": 5.3823, + "step": 458 + }, + { + "epoch": 0.14679485898179057, + "grad_norm": 0.3415970504283905, + "learning_rate": 0.0005789146332676407, + "loss": 5.3935, + "step": 459 + }, + { + "epoch": 0.14711467348937615, + "grad_norm": 0.5799669027328491, + "learning_rate": 0.0005788000064935623, + "loss": 5.4125, + "step": 460 + }, + { + "epoch": 0.14743448799696177, + "grad_norm": 0.3847658038139343, + "learning_rate": 0.0005786850804006172, + "loss": 5.4022, + "step": 461 + }, + { + "epoch": 0.14775430250454735, + "grad_norm": 0.4211963713169098, + "learning_rate": 0.0005785698551121897, + "loss": 5.4022, + "step": 462 + }, + { + "epoch": 0.14807411701213297, + "grad_norm": 0.4019568860530853, + "learning_rate": 0.0005784543307519854, + "loss": 5.3992, + "step": 463 + }, + { + "epoch": 0.14839393151971855, + "grad_norm": 0.3978749215602875, + "learning_rate": 0.000578338507444031, + "loss": 5.4448, + "step": 464 + }, + { + "epoch": 0.14871374602730417, + "grad_norm": 0.44494131207466125, + "learning_rate": 0.0005782223853126739, + "loss": 5.4292, + "step": 465 + }, + { + "epoch": 0.14903356053488975, + "grad_norm": 0.4366230070590973, + "learning_rate": 0.0005781059644825824, + "loss": 5.4311, + "step": 466 + }, + { + "epoch": 0.14935337504247537, + "grad_norm": 0.3958189785480499, + "learning_rate": 0.0005779892450787458, + "loss": 5.3312, + "step": 467 + }, + { + "epoch": 0.14967318955006095, + "grad_norm": 0.43146812915802, + "learning_rate": 0.0005778722272264736, + "loss": 5.4564, + "step": 468 + }, + { + "epoch": 0.14999300405764657, + "grad_norm": 0.5479041337966919, + "learning_rate": 0.0005777549110513959, + "loss": 5.4525, + "step": 469 + }, + { + "epoch": 0.15031281856523215, + "grad_norm": 0.4975782632827759, + "learning_rate": 0.0005776372966794628, + "loss": 5.4587, + "step": 470 + }, + { + "epoch": 0.15063263307281777, + "grad_norm": 0.44812631607055664, + "learning_rate": 0.000577519384236945, + "loss": 5.4789, + "step": 471 + }, + { + "epoch": 0.15095244758040335, + "grad_norm": 0.4622386395931244, + "learning_rate": 0.0005774011738504326, + "loss": 5.4506, + "step": 472 + }, + { + "epoch": 0.15127226208798897, + "grad_norm": 0.4067244827747345, + "learning_rate": 0.0005772826656468363, + "loss": 5.465, + "step": 473 + }, + { + "epoch": 0.15159207659557455, + "grad_norm": 0.41158872842788696, + "learning_rate": 0.000577163859753386, + "loss": 5.4287, + "step": 474 + }, + { + "epoch": 0.15191189110316017, + "grad_norm": 0.38296622037887573, + "learning_rate": 0.0005770447562976313, + "loss": 5.3948, + "step": 475 + }, + { + "epoch": 0.15223170561074575, + "grad_norm": 0.479064017534256, + "learning_rate": 0.0005769253554074414, + "loss": 5.4274, + "step": 476 + }, + { + "epoch": 0.15255152011833137, + "grad_norm": 0.598816990852356, + "learning_rate": 0.0005768056572110047, + "loss": 5.4695, + "step": 477 + }, + { + "epoch": 0.15287133462591695, + "grad_norm": 0.4154305160045624, + "learning_rate": 0.000576685661836829, + "loss": 5.3828, + "step": 478 + }, + { + "epoch": 0.15319114913350257, + "grad_norm": 0.5114104747772217, + "learning_rate": 0.0005765653694137406, + "loss": 5.3977, + "step": 479 + }, + { + "epoch": 0.15351096364108816, + "grad_norm": 0.40716323256492615, + "learning_rate": 0.0005764447800708856, + "loss": 5.3884, + "step": 480 + }, + { + "epoch": 0.15383077814867377, + "grad_norm": 0.41213738918304443, + "learning_rate": 0.0005763238939377278, + "loss": 5.3991, + "step": 481 + }, + { + "epoch": 0.15415059265625938, + "grad_norm": 0.4534163773059845, + "learning_rate": 0.0005762027111440506, + "loss": 5.4046, + "step": 482 + }, + { + "epoch": 0.15447040716384497, + "grad_norm": 0.42364469170570374, + "learning_rate": 0.0005760812318199555, + "loss": 5.4144, + "step": 483 + }, + { + "epoch": 0.15479022167143058, + "grad_norm": 0.486278235912323, + "learning_rate": 0.000575959456095862, + "loss": 5.3027, + "step": 484 + }, + { + "epoch": 0.15511003617901617, + "grad_norm": 0.4105694890022278, + "learning_rate": 0.0005758373841025085, + "loss": 5.3753, + "step": 485 + }, + { + "epoch": 0.15542985068660178, + "grad_norm": 0.41940709948539734, + "learning_rate": 0.000575715015970951, + "loss": 5.358, + "step": 486 + }, + { + "epoch": 0.15574966519418737, + "grad_norm": 0.4332304894924164, + "learning_rate": 0.0005755923518325637, + "loss": 5.3866, + "step": 487 + }, + { + "epoch": 0.15606947970177298, + "grad_norm": 0.4478780925273895, + "learning_rate": 0.0005754693918190382, + "loss": 5.4167, + "step": 488 + }, + { + "epoch": 0.15638929420935857, + "grad_norm": 0.39930951595306396, + "learning_rate": 0.0005753461360623842, + "loss": 5.3498, + "step": 489 + }, + { + "epoch": 0.15670910871694418, + "grad_norm": 0.511391282081604, + "learning_rate": 0.0005752225846949287, + "loss": 5.4319, + "step": 490 + }, + { + "epoch": 0.15702892322452977, + "grad_norm": 0.44350582361221313, + "learning_rate": 0.000575098737849316, + "loss": 5.3892, + "step": 491 + }, + { + "epoch": 0.15734873773211538, + "grad_norm": 0.5511606335639954, + "learning_rate": 0.0005749745956585077, + "loss": 5.3716, + "step": 492 + }, + { + "epoch": 0.15766855223970097, + "grad_norm": 0.44419968128204346, + "learning_rate": 0.0005748501582557825, + "loss": 5.4444, + "step": 493 + }, + { + "epoch": 0.15798836674728658, + "grad_norm": 0.37331676483154297, + "learning_rate": 0.0005747254257747362, + "loss": 5.39, + "step": 494 + }, + { + "epoch": 0.15830818125487217, + "grad_norm": 0.4771307408809662, + "learning_rate": 0.0005746003983492811, + "loss": 5.344, + "step": 495 + }, + { + "epoch": 0.15862799576245779, + "grad_norm": 0.44543081521987915, + "learning_rate": 0.0005744750761136463, + "loss": 5.3273, + "step": 496 + }, + { + "epoch": 0.15894781027004337, + "grad_norm": 0.5002357363700867, + "learning_rate": 0.0005743494592023773, + "loss": 5.3547, + "step": 497 + }, + { + "epoch": 0.15926762477762899, + "grad_norm": 0.7490878105163574, + "learning_rate": 0.0005742235477503362, + "loss": 5.3641, + "step": 498 + }, + { + "epoch": 0.15958743928521457, + "grad_norm": 0.4287881553173065, + "learning_rate": 0.000574097341892701, + "loss": 5.333, + "step": 499 + }, + { + "epoch": 0.15990725379280019, + "grad_norm": 0.4294663071632385, + "learning_rate": 0.0005739708417649659, + "loss": 5.2918, + "step": 500 + }, + { + "epoch": 0.15990725379280019, + "eval_loss": 5.372103691101074, + "eval_runtime": 78.0671, + "eval_samples_per_second": 24.3, + "eval_steps_per_second": 6.085, + "step": 500 + }, + { + "epoch": 0.16022706830038577, + "grad_norm": 0.42622506618499756, + "learning_rate": 0.0005738440475029414, + "loss": 5.3715, + "step": 501 + }, + { + "epoch": 0.16054688280797139, + "grad_norm": 0.5203431248664856, + "learning_rate": 0.0005737169592427531, + "loss": 5.4029, + "step": 502 + }, + { + "epoch": 0.16086669731555697, + "grad_norm": 0.4429241120815277, + "learning_rate": 0.0005735895771208427, + "loss": 5.4216, + "step": 503 + }, + { + "epoch": 0.1611865118231426, + "grad_norm": 0.478929728269577, + "learning_rate": 0.0005734619012739673, + "loss": 5.4132, + "step": 504 + }, + { + "epoch": 0.16150632633072817, + "grad_norm": 0.3934062719345093, + "learning_rate": 0.0005733339318391992, + "loss": 5.4462, + "step": 505 + }, + { + "epoch": 0.1618261408383138, + "grad_norm": 0.4275869131088257, + "learning_rate": 0.0005732056689539262, + "loss": 5.3483, + "step": 506 + }, + { + "epoch": 0.16214595534589937, + "grad_norm": 1.1098144054412842, + "learning_rate": 0.0005730771127558508, + "loss": 5.3647, + "step": 507 + }, + { + "epoch": 0.162465769853485, + "grad_norm": 0.4583177864551544, + "learning_rate": 0.0005729482633829906, + "loss": 5.3677, + "step": 508 + }, + { + "epoch": 0.16278558436107057, + "grad_norm": 0.4252079725265503, + "learning_rate": 0.000572819120973678, + "loss": 5.3601, + "step": 509 + }, + { + "epoch": 0.1631053988686562, + "grad_norm": 0.5495724678039551, + "learning_rate": 0.0005726896856665599, + "loss": 5.3454, + "step": 510 + }, + { + "epoch": 0.16342521337624177, + "grad_norm": 0.40474510192871094, + "learning_rate": 0.0005725599576005975, + "loss": 5.3744, + "step": 511 + }, + { + "epoch": 0.1637450278838274, + "grad_norm": 0.4442523717880249, + "learning_rate": 0.0005724299369150665, + "loss": 5.396, + "step": 512 + }, + { + "epoch": 0.16406484239141297, + "grad_norm": 0.4059533178806305, + "learning_rate": 0.0005722996237495569, + "loss": 5.3996, + "step": 513 + }, + { + "epoch": 0.1643846568989986, + "grad_norm": 0.613540768623352, + "learning_rate": 0.0005721690182439724, + "loss": 5.4113, + "step": 514 + }, + { + "epoch": 0.16470447140658417, + "grad_norm": 0.42122897505760193, + "learning_rate": 0.0005720381205385306, + "loss": 5.3639, + "step": 515 + }, + { + "epoch": 0.1650242859141698, + "grad_norm": 0.3783654570579529, + "learning_rate": 0.000571906930773763, + "loss": 5.3616, + "step": 516 + }, + { + "epoch": 0.16534410042175537, + "grad_norm": 0.52512127161026, + "learning_rate": 0.0005717754490905146, + "loss": 5.3951, + "step": 517 + }, + { + "epoch": 0.165663914929341, + "grad_norm": 0.3968490958213806, + "learning_rate": 0.0005716436756299437, + "loss": 5.337, + "step": 518 + }, + { + "epoch": 0.16598372943692657, + "grad_norm": 0.40351352095603943, + "learning_rate": 0.000571511610533522, + "loss": 5.3883, + "step": 519 + }, + { + "epoch": 0.1663035439445122, + "grad_norm": 0.43766388297080994, + "learning_rate": 0.0005713792539430339, + "loss": 5.3675, + "step": 520 + }, + { + "epoch": 0.16662335845209778, + "grad_norm": 0.413519024848938, + "learning_rate": 0.0005712466060005774, + "loss": 5.368, + "step": 521 + }, + { + "epoch": 0.1669431729596834, + "grad_norm": 0.42428046464920044, + "learning_rate": 0.0005711136668485626, + "loss": 5.2983, + "step": 522 + }, + { + "epoch": 0.16726298746726898, + "grad_norm": 0.4255489110946655, + "learning_rate": 0.0005709804366297129, + "loss": 5.3115, + "step": 523 + }, + { + "epoch": 0.1675828019748546, + "grad_norm": 0.44080090522766113, + "learning_rate": 0.0005708469154870636, + "loss": 5.3866, + "step": 524 + }, + { + "epoch": 0.16790261648244018, + "grad_norm": 0.38031458854675293, + "learning_rate": 0.0005707131035639629, + "loss": 5.3553, + "step": 525 + }, + { + "epoch": 0.1682224309900258, + "grad_norm": 0.4256940484046936, + "learning_rate": 0.0005705790010040707, + "loss": 5.3747, + "step": 526 + }, + { + "epoch": 0.16854224549761138, + "grad_norm": 0.41666266322135925, + "learning_rate": 0.000570444607951359, + "loss": 5.3955, + "step": 527 + }, + { + "epoch": 0.168862060005197, + "grad_norm": 0.4019726514816284, + "learning_rate": 0.000570309924550112, + "loss": 5.3084, + "step": 528 + }, + { + "epoch": 0.16918187451278258, + "grad_norm": 0.75457763671875, + "learning_rate": 0.0005701749509449253, + "loss": 5.2837, + "step": 529 + }, + { + "epoch": 0.1695016890203682, + "grad_norm": 0.41248270869255066, + "learning_rate": 0.0005700396872807062, + "loss": 5.3335, + "step": 530 + }, + { + "epoch": 0.16982150352795378, + "grad_norm": 0.4266038239002228, + "learning_rate": 0.0005699041337026734, + "loss": 5.3278, + "step": 531 + }, + { + "epoch": 0.1701413180355394, + "grad_norm": 0.41244661808013916, + "learning_rate": 0.0005697682903563568, + "loss": 5.3348, + "step": 532 + }, + { + "epoch": 0.17046113254312498, + "grad_norm": 0.39695993065834045, + "learning_rate": 0.0005696321573875974, + "loss": 5.3294, + "step": 533 + }, + { + "epoch": 0.1707809470507106, + "grad_norm": 0.4267037808895111, + "learning_rate": 0.0005694957349425472, + "loss": 5.3353, + "step": 534 + }, + { + "epoch": 0.17110076155829618, + "grad_norm": 0.4073215425014496, + "learning_rate": 0.0005693590231676688, + "loss": 5.3505, + "step": 535 + }, + { + "epoch": 0.1714205760658818, + "grad_norm": 0.4800320565700531, + "learning_rate": 0.0005692220222097357, + "loss": 5.3556, + "step": 536 + }, + { + "epoch": 0.17174039057346738, + "grad_norm": 0.42410972714424133, + "learning_rate": 0.0005690847322158317, + "loss": 5.3686, + "step": 537 + }, + { + "epoch": 0.172060205081053, + "grad_norm": 0.4676796495914459, + "learning_rate": 0.0005689471533333508, + "loss": 5.2979, + "step": 538 + }, + { + "epoch": 0.17238001958863858, + "grad_norm": 0.4038192927837372, + "learning_rate": 0.0005688092857099974, + "loss": 5.2512, + "step": 539 + }, + { + "epoch": 0.1726998340962242, + "grad_norm": 0.42532771825790405, + "learning_rate": 0.0005686711294937858, + "loss": 5.247, + "step": 540 + }, + { + "epoch": 0.17301964860380978, + "grad_norm": 0.46493101119995117, + "learning_rate": 0.0005685326848330402, + "loss": 5.3337, + "step": 541 + }, + { + "epoch": 0.1733394631113954, + "grad_norm": 0.45587480068206787, + "learning_rate": 0.0005683939518763942, + "loss": 5.2597, + "step": 542 + }, + { + "epoch": 0.17365927761898098, + "grad_norm": 0.4194573760032654, + "learning_rate": 0.000568254930772791, + "loss": 5.3739, + "step": 543 + }, + { + "epoch": 0.1739790921265666, + "grad_norm": 0.4531188905239105, + "learning_rate": 0.0005681156216714836, + "loss": 5.2942, + "step": 544 + }, + { + "epoch": 0.17429890663415218, + "grad_norm": 0.4515056014060974, + "learning_rate": 0.0005679760247220336, + "loss": 5.3101, + "step": 545 + }, + { + "epoch": 0.1746187211417378, + "grad_norm": 0.43099555373191833, + "learning_rate": 0.0005678361400743119, + "loss": 5.3211, + "step": 546 + }, + { + "epoch": 0.17493853564932338, + "grad_norm": 0.48675426840782166, + "learning_rate": 0.0005676959678784982, + "loss": 5.215, + "step": 547 + }, + { + "epoch": 0.175258350156909, + "grad_norm": 0.40001335740089417, + "learning_rate": 0.000567555508285081, + "loss": 5.3301, + "step": 548 + }, + { + "epoch": 0.17557816466449458, + "grad_norm": 0.45683255791664124, + "learning_rate": 0.0005674147614448574, + "loss": 5.3602, + "step": 549 + }, + { + "epoch": 0.1758979791720802, + "grad_norm": 0.4169895052909851, + "learning_rate": 0.0005672737275089327, + "loss": 5.3581, + "step": 550 + }, + { + "epoch": 0.1762177936796658, + "grad_norm": 0.4237796664237976, + "learning_rate": 0.0005671324066287205, + "loss": 5.2729, + "step": 551 + }, + { + "epoch": 0.1765376081872514, + "grad_norm": 0.47753238677978516, + "learning_rate": 0.0005669907989559426, + "loss": 5.3352, + "step": 552 + }, + { + "epoch": 0.176857422694837, + "grad_norm": 0.4339083731174469, + "learning_rate": 0.0005668489046426285, + "loss": 5.3531, + "step": 553 + }, + { + "epoch": 0.1771772372024226, + "grad_norm": 0.4622672200202942, + "learning_rate": 0.0005667067238411153, + "loss": 5.3213, + "step": 554 + }, + { + "epoch": 0.1774970517100082, + "grad_norm": 0.4333019554615021, + "learning_rate": 0.0005665642567040483, + "loss": 5.3419, + "step": 555 + }, + { + "epoch": 0.1778168662175938, + "grad_norm": 0.3928539752960205, + "learning_rate": 0.0005664215033843796, + "loss": 5.2607, + "step": 556 + }, + { + "epoch": 0.1781366807251794, + "grad_norm": 0.387408584356308, + "learning_rate": 0.0005662784640353688, + "loss": 5.2742, + "step": 557 + }, + { + "epoch": 0.178456495232765, + "grad_norm": 0.4588526487350464, + "learning_rate": 0.0005661351388105823, + "loss": 5.3356, + "step": 558 + }, + { + "epoch": 0.1787763097403506, + "grad_norm": 0.440641850233078, + "learning_rate": 0.0005659915278638939, + "loss": 5.3318, + "step": 559 + }, + { + "epoch": 0.1790961242479362, + "grad_norm": 0.3728678822517395, + "learning_rate": 0.0005658476313494839, + "loss": 5.2345, + "step": 560 + }, + { + "epoch": 0.1794159387555218, + "grad_norm": 0.37536266446113586, + "learning_rate": 0.0005657034494218389, + "loss": 5.282, + "step": 561 + }, + { + "epoch": 0.1797357532631074, + "grad_norm": 0.3769688904285431, + "learning_rate": 0.0005655589822357526, + "loss": 5.2509, + "step": 562 + }, + { + "epoch": 0.180055567770693, + "grad_norm": 0.4108069837093353, + "learning_rate": 0.0005654142299463241, + "loss": 5.3465, + "step": 563 + }, + { + "epoch": 0.1803753822782786, + "grad_norm": 0.3620480000972748, + "learning_rate": 0.0005652691927089593, + "loss": 5.2491, + "step": 564 + }, + { + "epoch": 0.1806951967858642, + "grad_norm": 0.3661453127861023, + "learning_rate": 0.0005651238706793697, + "loss": 5.2158, + "step": 565 + }, + { + "epoch": 0.1810150112934498, + "grad_norm": 0.4069892466068268, + "learning_rate": 0.0005649782640135727, + "loss": 5.2745, + "step": 566 + }, + { + "epoch": 0.1813348258010354, + "grad_norm": 0.42880722880363464, + "learning_rate": 0.000564832372867891, + "loss": 5.2006, + "step": 567 + }, + { + "epoch": 0.181654640308621, + "grad_norm": 0.39120304584503174, + "learning_rate": 0.0005646861973989531, + "loss": 5.3242, + "step": 568 + }, + { + "epoch": 0.1819744548162066, + "grad_norm": 0.35623669624328613, + "learning_rate": 0.0005645397377636922, + "loss": 5.2269, + "step": 569 + }, + { + "epoch": 0.1822942693237922, + "grad_norm": 0.39440712332725525, + "learning_rate": 0.0005643929941193474, + "loss": 5.2249, + "step": 570 + }, + { + "epoch": 0.1826140838313778, + "grad_norm": 0.37591278553009033, + "learning_rate": 0.000564245966623462, + "loss": 5.2233, + "step": 571 + }, + { + "epoch": 0.1829338983389634, + "grad_norm": 0.4042278230190277, + "learning_rate": 0.0005640986554338842, + "loss": 5.2269, + "step": 572 + }, + { + "epoch": 0.183253712846549, + "grad_norm": 0.4501861035823822, + "learning_rate": 0.0005639510607087673, + "loss": 5.3752, + "step": 573 + }, + { + "epoch": 0.1835735273541346, + "grad_norm": 0.3743823170661926, + "learning_rate": 0.0005638031826065679, + "loss": 5.2696, + "step": 574 + }, + { + "epoch": 0.1838933418617202, + "grad_norm": 0.4235236942768097, + "learning_rate": 0.0005636550212860479, + "loss": 5.2862, + "step": 575 + }, + { + "epoch": 0.1842131563693058, + "grad_norm": 0.3701503872871399, + "learning_rate": 0.0005635065769062728, + "loss": 5.2726, + "step": 576 + }, + { + "epoch": 0.1845329708768914, + "grad_norm": 0.45752373337745667, + "learning_rate": 0.0005633578496266121, + "loss": 5.2393, + "step": 577 + }, + { + "epoch": 0.184852785384477, + "grad_norm": 0.35952097177505493, + "learning_rate": 0.0005632088396067389, + "loss": 5.2572, + "step": 578 + }, + { + "epoch": 0.1851725998920626, + "grad_norm": 0.5012151598930359, + "learning_rate": 0.0005630595470066299, + "loss": 5.3061, + "step": 579 + }, + { + "epoch": 0.1854924143996482, + "grad_norm": 0.42023152112960815, + "learning_rate": 0.0005629099719865652, + "loss": 5.337, + "step": 580 + }, + { + "epoch": 0.1858122289072338, + "grad_norm": 0.4002183973789215, + "learning_rate": 0.0005627601147071282, + "loss": 5.2125, + "step": 581 + }, + { + "epoch": 0.1861320434148194, + "grad_norm": 0.4116981327533722, + "learning_rate": 0.000562609975329205, + "loss": 5.2079, + "step": 582 + }, + { + "epoch": 0.186451857922405, + "grad_norm": 0.41482430696487427, + "learning_rate": 0.0005624595540139851, + "loss": 5.2986, + "step": 583 + }, + { + "epoch": 0.1867716724299906, + "grad_norm": 0.380515456199646, + "learning_rate": 0.0005623088509229602, + "loss": 5.2007, + "step": 584 + }, + { + "epoch": 0.1870914869375762, + "grad_norm": 0.42362508177757263, + "learning_rate": 0.0005621578662179247, + "loss": 5.2153, + "step": 585 + }, + { + "epoch": 0.1874113014451618, + "grad_norm": 0.43335291743278503, + "learning_rate": 0.0005620066000609755, + "loss": 5.2784, + "step": 586 + }, + { + "epoch": 0.1877311159527474, + "grad_norm": 0.39207449555397034, + "learning_rate": 0.0005618550526145113, + "loss": 5.256, + "step": 587 + }, + { + "epoch": 0.188050930460333, + "grad_norm": 0.4123283326625824, + "learning_rate": 0.0005617032240412329, + "loss": 5.2194, + "step": 588 + }, + { + "epoch": 0.1883707449679186, + "grad_norm": 0.43690216541290283, + "learning_rate": 0.0005615511145041433, + "loss": 5.2638, + "step": 589 + }, + { + "epoch": 0.1886905594755042, + "grad_norm": 0.41950878500938416, + "learning_rate": 0.0005613987241665468, + "loss": 5.3, + "step": 590 + }, + { + "epoch": 0.1890103739830898, + "grad_norm": 0.4976102113723755, + "learning_rate": 0.000561246053192049, + "loss": 5.2027, + "step": 591 + }, + { + "epoch": 0.1893301884906754, + "grad_norm": 0.4183708131313324, + "learning_rate": 0.0005610931017445573, + "loss": 5.264, + "step": 592 + }, + { + "epoch": 0.189650002998261, + "grad_norm": 0.5086091756820679, + "learning_rate": 0.0005609398699882796, + "loss": 5.2579, + "step": 593 + }, + { + "epoch": 0.1899698175058466, + "grad_norm": 0.3733188509941101, + "learning_rate": 0.0005607863580877253, + "loss": 5.282, + "step": 594 + }, + { + "epoch": 0.19028963201343221, + "grad_norm": 0.5005497336387634, + "learning_rate": 0.0005606325662077042, + "loss": 5.2219, + "step": 595 + }, + { + "epoch": 0.1906094465210178, + "grad_norm": 0.381599485874176, + "learning_rate": 0.0005604784945133271, + "loss": 5.1787, + "step": 596 + }, + { + "epoch": 0.19092926102860341, + "grad_norm": 0.47054019570350647, + "learning_rate": 0.0005603241431700045, + "loss": 5.2808, + "step": 597 + }, + { + "epoch": 0.191249075536189, + "grad_norm": 0.38837242126464844, + "learning_rate": 0.0005601695123434477, + "loss": 5.1762, + "step": 598 + }, + { + "epoch": 0.19156889004377461, + "grad_norm": 0.48838260769844055, + "learning_rate": 0.000560014602199668, + "loss": 5.2408, + "step": 599 + }, + { + "epoch": 0.1918887045513602, + "grad_norm": 0.3695312440395355, + "learning_rate": 0.0005598594129049765, + "loss": 5.329, + "step": 600 + }, + { + "epoch": 0.1918887045513602, + "eval_loss": 5.246670246124268, + "eval_runtime": 80.6874, + "eval_samples_per_second": 23.51, + "eval_steps_per_second": 5.887, + "step": 600 + }, + { + "epoch": 0.19220851905894581, + "grad_norm": 0.41941291093826294, + "learning_rate": 0.0005597039446259837, + "loss": 5.3453, + "step": 601 + }, + { + "epoch": 0.1925283335665314, + "grad_norm": 0.4879932999610901, + "learning_rate": 0.0005595481975296002, + "loss": 5.3331, + "step": 602 + }, + { + "epoch": 0.19284814807411702, + "grad_norm": 0.4067867398262024, + "learning_rate": 0.0005593921717830354, + "loss": 5.2184, + "step": 603 + }, + { + "epoch": 0.1931679625817026, + "grad_norm": 0.43347740173339844, + "learning_rate": 0.0005592358675537983, + "loss": 5.2875, + "step": 604 + }, + { + "epoch": 0.19348777708928822, + "grad_norm": 0.41474223136901855, + "learning_rate": 0.0005590792850096965, + "loss": 5.2204, + "step": 605 + }, + { + "epoch": 0.1938075915968738, + "grad_norm": 0.4278968870639801, + "learning_rate": 0.0005589224243188365, + "loss": 5.2632, + "step": 606 + }, + { + "epoch": 0.19412740610445942, + "grad_norm": 0.4322011470794678, + "learning_rate": 0.0005587652856496236, + "loss": 5.1834, + "step": 607 + }, + { + "epoch": 0.194447220612045, + "grad_norm": 0.3791482150554657, + "learning_rate": 0.0005586078691707614, + "loss": 5.1847, + "step": 608 + }, + { + "epoch": 0.19476703511963062, + "grad_norm": 0.37472835183143616, + "learning_rate": 0.0005584501750512516, + "loss": 5.2734, + "step": 609 + }, + { + "epoch": 0.1950868496272162, + "grad_norm": 0.3757517337799072, + "learning_rate": 0.0005582922034603945, + "loss": 5.2306, + "step": 610 + }, + { + "epoch": 0.19540666413480182, + "grad_norm": 0.37529265880584717, + "learning_rate": 0.0005581339545677877, + "loss": 5.2716, + "step": 611 + }, + { + "epoch": 0.1957264786423874, + "grad_norm": 0.38108786940574646, + "learning_rate": 0.0005579754285433269, + "loss": 5.1982, + "step": 612 + }, + { + "epoch": 0.19604629314997302, + "grad_norm": 0.4154306650161743, + "learning_rate": 0.0005578166255572048, + "loss": 5.2879, + "step": 613 + }, + { + "epoch": 0.1963661076575586, + "grad_norm": 0.40405797958374023, + "learning_rate": 0.0005576575457799122, + "loss": 5.2084, + "step": 614 + }, + { + "epoch": 0.19668592216514422, + "grad_norm": 0.4085008502006531, + "learning_rate": 0.0005574981893822365, + "loss": 5.2127, + "step": 615 + }, + { + "epoch": 0.1970057366727298, + "grad_norm": 0.39384084939956665, + "learning_rate": 0.0005573385565352622, + "loss": 5.2061, + "step": 616 + }, + { + "epoch": 0.19732555118031542, + "grad_norm": 0.3667563498020172, + "learning_rate": 0.0005571786474103709, + "loss": 5.305, + "step": 617 + }, + { + "epoch": 0.19764536568790103, + "grad_norm": 0.4089621603488922, + "learning_rate": 0.0005570184621792405, + "loss": 5.1365, + "step": 618 + }, + { + "epoch": 0.19796518019548662, + "grad_norm": 0.3874637484550476, + "learning_rate": 0.0005568580010138452, + "loss": 5.2534, + "step": 619 + }, + { + "epoch": 0.19828499470307223, + "grad_norm": 0.371349036693573, + "learning_rate": 0.0005566972640864558, + "loss": 5.2055, + "step": 620 + }, + { + "epoch": 0.19860480921065782, + "grad_norm": 0.36935508251190186, + "learning_rate": 0.0005565362515696389, + "loss": 5.2411, + "step": 621 + }, + { + "epoch": 0.19892462371824343, + "grad_norm": 0.4087619483470917, + "learning_rate": 0.0005563749636362572, + "loss": 5.1594, + "step": 622 + }, + { + "epoch": 0.19924443822582902, + "grad_norm": 0.3479459285736084, + "learning_rate": 0.0005562134004594687, + "loss": 5.2632, + "step": 623 + }, + { + "epoch": 0.19956425273341463, + "grad_norm": 0.4259833097457886, + "learning_rate": 0.0005560515622127276, + "loss": 5.1791, + "step": 624 + }, + { + "epoch": 0.19988406724100022, + "grad_norm": 0.4216997027397156, + "learning_rate": 0.0005558894490697824, + "loss": 5.1627, + "step": 625 + }, + { + "epoch": 0.20020388174858583, + "grad_norm": 0.37855878472328186, + "learning_rate": 0.0005557270612046777, + "loss": 5.1793, + "step": 626 + }, + { + "epoch": 0.20052369625617142, + "grad_norm": 0.35549864172935486, + "learning_rate": 0.0005555643987917525, + "loss": 5.1747, + "step": 627 + }, + { + "epoch": 0.20084351076375703, + "grad_norm": 0.3608039915561676, + "learning_rate": 0.0005554014620056406, + "loss": 5.1809, + "step": 628 + }, + { + "epoch": 0.20116332527134262, + "grad_norm": 0.35643693804740906, + "learning_rate": 0.0005552382510212706, + "loss": 5.1599, + "step": 629 + }, + { + "epoch": 0.20148313977892823, + "grad_norm": 0.38763564825057983, + "learning_rate": 0.0005550747660138653, + "loss": 5.2292, + "step": 630 + }, + { + "epoch": 0.20180295428651382, + "grad_norm": 0.4711453914642334, + "learning_rate": 0.0005549110071589418, + "loss": 5.1897, + "step": 631 + }, + { + "epoch": 0.20212276879409943, + "grad_norm": 0.3638678789138794, + "learning_rate": 0.0005547469746323109, + "loss": 5.1936, + "step": 632 + }, + { + "epoch": 0.20244258330168502, + "grad_norm": 0.3598167896270752, + "learning_rate": 0.0005545826686100776, + "loss": 5.2621, + "step": 633 + }, + { + "epoch": 0.20276239780927063, + "grad_norm": 0.38647595047950745, + "learning_rate": 0.0005544180892686403, + "loss": 5.1097, + "step": 634 + }, + { + "epoch": 0.20308221231685622, + "grad_norm": 0.38954514265060425, + "learning_rate": 0.000554253236784691, + "loss": 5.145, + "step": 635 + }, + { + "epoch": 0.20340202682444183, + "grad_norm": 0.580151379108429, + "learning_rate": 0.0005540881113352148, + "loss": 5.1903, + "step": 636 + }, + { + "epoch": 0.20372184133202742, + "grad_norm": 0.3926187753677368, + "learning_rate": 0.0005539227130974898, + "loss": 5.2114, + "step": 637 + }, + { + "epoch": 0.20404165583961303, + "grad_norm": 0.38736334443092346, + "learning_rate": 0.0005537570422490871, + "loss": 5.1262, + "step": 638 + }, + { + "epoch": 0.20436147034719862, + "grad_norm": 0.3514604866504669, + "learning_rate": 0.0005535910989678706, + "loss": 5.1722, + "step": 639 + }, + { + "epoch": 0.20468128485478423, + "grad_norm": 0.466145396232605, + "learning_rate": 0.0005534248834319962, + "loss": 5.1735, + "step": 640 + }, + { + "epoch": 0.20500109936236982, + "grad_norm": 0.3967139720916748, + "learning_rate": 0.0005532583958199126, + "loss": 5.1577, + "step": 641 + }, + { + "epoch": 0.20532091386995543, + "grad_norm": 0.8810538053512573, + "learning_rate": 0.0005530916363103605, + "loss": 5.2778, + "step": 642 + }, + { + "epoch": 0.20564072837754102, + "grad_norm": 0.367849737405777, + "learning_rate": 0.0005529246050823723, + "loss": 5.1614, + "step": 643 + }, + { + "epoch": 0.20596054288512664, + "grad_norm": 0.3990894556045532, + "learning_rate": 0.0005527573023152722, + "loss": 5.1537, + "step": 644 + }, + { + "epoch": 0.20628035739271222, + "grad_norm": 0.40600651502609253, + "learning_rate": 0.0005525897281886761, + "loss": 5.1424, + "step": 645 + }, + { + "epoch": 0.20660017190029784, + "grad_norm": 0.3771745264530182, + "learning_rate": 0.000552421882882491, + "loss": 5.1969, + "step": 646 + }, + { + "epoch": 0.20691998640788342, + "grad_norm": 0.4336475133895874, + "learning_rate": 0.000552253766576915, + "loss": 5.1765, + "step": 647 + }, + { + "epoch": 0.20723980091546904, + "grad_norm": 0.36182352900505066, + "learning_rate": 0.0005520853794524375, + "loss": 5.2697, + "step": 648 + }, + { + "epoch": 0.20755961542305462, + "grad_norm": 0.47667577862739563, + "learning_rate": 0.0005519167216898383, + "loss": 5.2595, + "step": 649 + }, + { + "epoch": 0.20787942993064024, + "grad_norm": 0.41349339485168457, + "learning_rate": 0.0005517477934701879, + "loss": 5.134, + "step": 650 + }, + { + "epoch": 0.20819924443822582, + "grad_norm": 0.3495538830757141, + "learning_rate": 0.0005515785949748471, + "loss": 5.1849, + "step": 651 + }, + { + "epoch": 0.20851905894581144, + "grad_norm": 0.3815857768058777, + "learning_rate": 0.0005514091263854671, + "loss": 5.1395, + "step": 652 + }, + { + "epoch": 0.20883887345339702, + "grad_norm": 0.3903793394565582, + "learning_rate": 0.0005512393878839885, + "loss": 5.0982, + "step": 653 + }, + { + "epoch": 0.20915868796098264, + "grad_norm": 0.4030245542526245, + "learning_rate": 0.0005510693796526425, + "loss": 5.0803, + "step": 654 + }, + { + "epoch": 0.20947850246856822, + "grad_norm": 0.4484241008758545, + "learning_rate": 0.000550899101873949, + "loss": 5.2476, + "step": 655 + }, + { + "epoch": 0.20979831697615384, + "grad_norm": 0.43497729301452637, + "learning_rate": 0.0005507285547307181, + "loss": 5.1655, + "step": 656 + }, + { + "epoch": 0.21011813148373942, + "grad_norm": 0.4137641489505768, + "learning_rate": 0.0005505577384060485, + "loss": 5.17, + "step": 657 + }, + { + "epoch": 0.21043794599132504, + "grad_norm": 0.4317357838153839, + "learning_rate": 0.0005503866530833281, + "loss": 5.219, + "step": 658 + }, + { + "epoch": 0.21075776049891062, + "grad_norm": 0.3765553832054138, + "learning_rate": 0.0005502152989462337, + "loss": 5.243, + "step": 659 + }, + { + "epoch": 0.21107757500649624, + "grad_norm": 0.4000135362148285, + "learning_rate": 0.0005500436761787306, + "loss": 5.1108, + "step": 660 + }, + { + "epoch": 0.21139738951408182, + "grad_norm": 0.39066606760025024, + "learning_rate": 0.0005498717849650724, + "loss": 5.2114, + "step": 661 + }, + { + "epoch": 0.21171720402166744, + "grad_norm": 0.3841003179550171, + "learning_rate": 0.0005496996254898011, + "loss": 5.1906, + "step": 662 + }, + { + "epoch": 0.21203701852925302, + "grad_norm": 0.4065331518650055, + "learning_rate": 0.0005495271979377464, + "loss": 5.1637, + "step": 663 + }, + { + "epoch": 0.21235683303683864, + "grad_norm": 0.37493783235549927, + "learning_rate": 0.0005493545024940264, + "loss": 5.1488, + "step": 664 + }, + { + "epoch": 0.21267664754442422, + "grad_norm": 0.3737407922744751, + "learning_rate": 0.000549181539344046, + "loss": 5.1694, + "step": 665 + }, + { + "epoch": 0.21299646205200984, + "grad_norm": 0.3587566018104553, + "learning_rate": 0.0005490083086734982, + "loss": 5.1737, + "step": 666 + }, + { + "epoch": 0.21331627655959542, + "grad_norm": 0.42007777094841003, + "learning_rate": 0.000548834810668363, + "loss": 5.2466, + "step": 667 + }, + { + "epoch": 0.21363609106718104, + "grad_norm": 0.3630130887031555, + "learning_rate": 0.0005486610455149069, + "loss": 5.2188, + "step": 668 + }, + { + "epoch": 0.21395590557476662, + "grad_norm": 0.4182991683483124, + "learning_rate": 0.0005484870133996842, + "loss": 5.2046, + "step": 669 + }, + { + "epoch": 0.21427572008235224, + "grad_norm": 0.40020623803138733, + "learning_rate": 0.0005483127145095349, + "loss": 5.1565, + "step": 670 + }, + { + "epoch": 0.21459553458993783, + "grad_norm": 0.41598305106163025, + "learning_rate": 0.0005481381490315859, + "loss": 5.2775, + "step": 671 + }, + { + "epoch": 0.21491534909752344, + "grad_norm": 0.3689316511154175, + "learning_rate": 0.0005479633171532503, + "loss": 5.1098, + "step": 672 + }, + { + "epoch": 0.21523516360510903, + "grad_norm": 0.40048545598983765, + "learning_rate": 0.0005477882190622269, + "loss": 5.1345, + "step": 673 + }, + { + "epoch": 0.21555497811269464, + "grad_norm": 0.36160582304000854, + "learning_rate": 0.0005476128549465006, + "loss": 5.1598, + "step": 674 + }, + { + "epoch": 0.21587479262028023, + "grad_norm": 0.3673115074634552, + "learning_rate": 0.0005474372249943417, + "loss": 5.2201, + "step": 675 + }, + { + "epoch": 0.21619460712786584, + "grad_norm": 0.4571950435638428, + "learning_rate": 0.0005472613293943062, + "loss": 5.202, + "step": 676 + }, + { + "epoch": 0.21651442163545143, + "grad_norm": 0.35925915837287903, + "learning_rate": 0.0005470851683352349, + "loss": 5.1643, + "step": 677 + }, + { + "epoch": 0.21683423614303704, + "grad_norm": 0.3808862864971161, + "learning_rate": 0.0005469087420062538, + "loss": 5.1623, + "step": 678 + }, + { + "epoch": 0.21715405065062263, + "grad_norm": 0.36828768253326416, + "learning_rate": 0.0005467320505967739, + "loss": 5.1198, + "step": 679 + }, + { + "epoch": 0.21747386515820824, + "grad_norm": 0.3488163352012634, + "learning_rate": 0.0005465550942964903, + "loss": 5.1343, + "step": 680 + }, + { + "epoch": 0.21779367966579383, + "grad_norm": 0.40178030729293823, + "learning_rate": 0.000546377873295383, + "loss": 5.0838, + "step": 681 + }, + { + "epoch": 0.21811349417337944, + "grad_norm": 0.37855249643325806, + "learning_rate": 0.0005462003877837157, + "loss": 5.1824, + "step": 682 + }, + { + "epoch": 0.21843330868096503, + "grad_norm": 0.39485079050064087, + "learning_rate": 0.0005460226379520365, + "loss": 5.1908, + "step": 683 + }, + { + "epoch": 0.21875312318855064, + "grad_norm": 0.4116956889629364, + "learning_rate": 0.0005458446239911772, + "loss": 5.1255, + "step": 684 + }, + { + "epoch": 0.21907293769613623, + "grad_norm": 0.38690185546875, + "learning_rate": 0.0005456663460922528, + "loss": 5.1903, + "step": 685 + }, + { + "epoch": 0.21939275220372184, + "grad_norm": 0.6163234114646912, + "learning_rate": 0.000545487804446662, + "loss": 5.1338, + "step": 686 + }, + { + "epoch": 0.21971256671130746, + "grad_norm": 0.3915090262889862, + "learning_rate": 0.0005453089992460868, + "loss": 5.1987, + "step": 687 + }, + { + "epoch": 0.22003238121889304, + "grad_norm": 0.4104084372520447, + "learning_rate": 0.0005451299306824917, + "loss": 5.1334, + "step": 688 + }, + { + "epoch": 0.22035219572647866, + "grad_norm": 0.44581642746925354, + "learning_rate": 0.0005449505989481243, + "loss": 5.1779, + "step": 689 + }, + { + "epoch": 0.22067201023406424, + "grad_norm": 0.41805586218833923, + "learning_rate": 0.0005447710042355145, + "loss": 5.1203, + "step": 690 + }, + { + "epoch": 0.22099182474164986, + "grad_norm": 0.42157623171806335, + "learning_rate": 0.0005445911467374747, + "loss": 5.1803, + "step": 691 + }, + { + "epoch": 0.22131163924923544, + "grad_norm": 0.38313740491867065, + "learning_rate": 0.0005444110266470995, + "loss": 5.2184, + "step": 692 + }, + { + "epoch": 0.22163145375682106, + "grad_norm": 0.35705432295799255, + "learning_rate": 0.0005442306441577651, + "loss": 5.3459, + "step": 693 + }, + { + "epoch": 0.22195126826440664, + "grad_norm": 0.3512997329235077, + "learning_rate": 0.0005440499994631299, + "loss": 5.1653, + "step": 694 + }, + { + "epoch": 0.22227108277199226, + "grad_norm": 0.38805651664733887, + "learning_rate": 0.0005438690927571332, + "loss": 5.1115, + "step": 695 + }, + { + "epoch": 0.22259089727957784, + "grad_norm": 0.37657782435417175, + "learning_rate": 0.000543687924233996, + "loss": 5.0689, + "step": 696 + }, + { + "epoch": 0.22291071178716346, + "grad_norm": 0.36555126309394836, + "learning_rate": 0.0005435064940882204, + "loss": 5.1798, + "step": 697 + }, + { + "epoch": 0.22323052629474904, + "grad_norm": 0.3793173134326935, + "learning_rate": 0.0005433248025145894, + "loss": 5.1051, + "step": 698 + }, + { + "epoch": 0.22355034080233466, + "grad_norm": 0.434103786945343, + "learning_rate": 0.0005431428497081661, + "loss": 5.1889, + "step": 699 + }, + { + "epoch": 0.22387015530992024, + "grad_norm": 0.3871440291404724, + "learning_rate": 0.0005429606358642948, + "loss": 5.0479, + "step": 700 + }, + { + "epoch": 0.22387015530992024, + "eval_loss": 5.1346116065979, + "eval_runtime": 83.8832, + "eval_samples_per_second": 22.615, + "eval_steps_per_second": 5.663, + "step": 700 + }, + { + "epoch": 0.22418996981750586, + "grad_norm": 0.37413862347602844, + "learning_rate": 0.0005427781611785998, + "loss": 5.0907, + "step": 701 + }, + { + "epoch": 0.22450978432509144, + "grad_norm": 0.41222137212753296, + "learning_rate": 0.0005425954258469852, + "loss": 5.1388, + "step": 702 + }, + { + "epoch": 0.22482959883267706, + "grad_norm": 0.3832525908946991, + "learning_rate": 0.000542412430065635, + "loss": 5.1396, + "step": 703 + }, + { + "epoch": 0.22514941334026264, + "grad_norm": 0.3836553692817688, + "learning_rate": 0.0005422291740310134, + "loss": 5.0898, + "step": 704 + }, + { + "epoch": 0.22546922784784826, + "grad_norm": 0.37601372599601746, + "learning_rate": 0.0005420456579398632, + "loss": 5.1271, + "step": 705 + }, + { + "epoch": 0.22578904235543384, + "grad_norm": 0.44405072927474976, + "learning_rate": 0.0005418618819892067, + "loss": 5.1508, + "step": 706 + }, + { + "epoch": 0.22610885686301946, + "grad_norm": 0.3710222542285919, + "learning_rate": 0.0005416778463763454, + "loss": 5.0919, + "step": 707 + }, + { + "epoch": 0.22642867137060504, + "grad_norm": 0.38442203402519226, + "learning_rate": 0.0005414935512988593, + "loss": 5.1242, + "step": 708 + }, + { + "epoch": 0.22674848587819066, + "grad_norm": 0.3756135106086731, + "learning_rate": 0.0005413089969546071, + "loss": 5.1882, + "step": 709 + }, + { + "epoch": 0.22706830038577624, + "grad_norm": 0.38981232047080994, + "learning_rate": 0.0005411241835417256, + "loss": 5.1904, + "step": 710 + }, + { + "epoch": 0.22738811489336186, + "grad_norm": 0.41756415367126465, + "learning_rate": 0.0005409391112586303, + "loss": 5.1239, + "step": 711 + }, + { + "epoch": 0.22770792940094745, + "grad_norm": 0.38702163100242615, + "learning_rate": 0.0005407537803040139, + "loss": 5.1117, + "step": 712 + }, + { + "epoch": 0.22802774390853306, + "grad_norm": 0.40529951453208923, + "learning_rate": 0.0005405681908768475, + "loss": 5.0795, + "step": 713 + }, + { + "epoch": 0.22834755841611865, + "grad_norm": 0.36242255568504333, + "learning_rate": 0.0005403823431763791, + "loss": 5.1368, + "step": 714 + }, + { + "epoch": 0.22866737292370426, + "grad_norm": 0.3760404884815216, + "learning_rate": 0.0005401962374021342, + "loss": 5.0858, + "step": 715 + }, + { + "epoch": 0.22898718743128985, + "grad_norm": 0.3962754011154175, + "learning_rate": 0.0005400098737539157, + "loss": 5.2717, + "step": 716 + }, + { + "epoch": 0.22930700193887546, + "grad_norm": 0.658300518989563, + "learning_rate": 0.0005398232524318029, + "loss": 5.1172, + "step": 717 + }, + { + "epoch": 0.22962681644646105, + "grad_norm": 0.3779653012752533, + "learning_rate": 0.0005396363736361519, + "loss": 5.1571, + "step": 718 + }, + { + "epoch": 0.22994663095404666, + "grad_norm": 0.4019310772418976, + "learning_rate": 0.0005394492375675953, + "loss": 5.0618, + "step": 719 + }, + { + "epoch": 0.23026644546163225, + "grad_norm": 0.38351598381996155, + "learning_rate": 0.0005392618444270417, + "loss": 5.0987, + "step": 720 + }, + { + "epoch": 0.23058625996921786, + "grad_norm": 0.4161638617515564, + "learning_rate": 0.0005390741944156759, + "loss": 5.1888, + "step": 721 + }, + { + "epoch": 0.23090607447680345, + "grad_norm": 0.3698093891143799, + "learning_rate": 0.0005388862877349584, + "loss": 5.0928, + "step": 722 + }, + { + "epoch": 0.23122588898438906, + "grad_norm": 0.4438035786151886, + "learning_rate": 0.0005386981245866252, + "loss": 5.0899, + "step": 723 + }, + { + "epoch": 0.23154570349197465, + "grad_norm": 0.3572853207588196, + "learning_rate": 0.0005385097051726879, + "loss": 5.1191, + "step": 724 + }, + { + "epoch": 0.23186551799956026, + "grad_norm": 0.4035443365573883, + "learning_rate": 0.0005383210296954328, + "loss": 5.1538, + "step": 725 + }, + { + "epoch": 0.23218533250714585, + "grad_norm": 0.3826752305030823, + "learning_rate": 0.0005381320983574214, + "loss": 5.188, + "step": 726 + }, + { + "epoch": 0.23250514701473146, + "grad_norm": 0.38229018449783325, + "learning_rate": 0.0005379429113614898, + "loss": 5.2084, + "step": 727 + }, + { + "epoch": 0.23282496152231705, + "grad_norm": 0.41644829511642456, + "learning_rate": 0.0005377534689107487, + "loss": 5.0967, + "step": 728 + }, + { + "epoch": 0.23314477602990266, + "grad_norm": 0.3715463876724243, + "learning_rate": 0.0005375637712085829, + "loss": 5.0421, + "step": 729 + }, + { + "epoch": 0.23346459053748825, + "grad_norm": 0.3967476785182953, + "learning_rate": 0.0005373738184586514, + "loss": 5.104, + "step": 730 + }, + { + "epoch": 0.23378440504507386, + "grad_norm": 0.3768517076969147, + "learning_rate": 0.0005371836108648868, + "loss": 5.1045, + "step": 731 + }, + { + "epoch": 0.23410421955265945, + "grad_norm": 0.4107721447944641, + "learning_rate": 0.0005369931486314953, + "loss": 5.0859, + "step": 732 + }, + { + "epoch": 0.23442403406024506, + "grad_norm": 0.3913835883140564, + "learning_rate": 0.0005368024319629569, + "loss": 5.2196, + "step": 733 + }, + { + "epoch": 0.23474384856783065, + "grad_norm": 0.4043770730495453, + "learning_rate": 0.0005366114610640241, + "loss": 5.1076, + "step": 734 + }, + { + "epoch": 0.23506366307541626, + "grad_norm": 0.3797147274017334, + "learning_rate": 0.000536420236139723, + "loss": 5.0718, + "step": 735 + }, + { + "epoch": 0.23538347758300185, + "grad_norm": 0.40881407260894775, + "learning_rate": 0.000536228757395352, + "loss": 5.0545, + "step": 736 + }, + { + "epoch": 0.23570329209058746, + "grad_norm": 0.3897201120853424, + "learning_rate": 0.000536037025036482, + "loss": 5.124, + "step": 737 + }, + { + "epoch": 0.23602310659817305, + "grad_norm": 0.3821205198764801, + "learning_rate": 0.0005358450392689564, + "loss": 5.0568, + "step": 738 + }, + { + "epoch": 0.23634292110575866, + "grad_norm": 0.36248835921287537, + "learning_rate": 0.0005356528002988907, + "loss": 5.1143, + "step": 739 + }, + { + "epoch": 0.23666273561334425, + "grad_norm": 0.3769552409648895, + "learning_rate": 0.000535460308332672, + "loss": 5.107, + "step": 740 + }, + { + "epoch": 0.23698255012092986, + "grad_norm": 0.4510996341705322, + "learning_rate": 0.0005352675635769589, + "loss": 5.2007, + "step": 741 + }, + { + "epoch": 0.23730236462851545, + "grad_norm": 0.39315706491470337, + "learning_rate": 0.0005350745662386818, + "loss": 5.115, + "step": 742 + }, + { + "epoch": 0.23762217913610106, + "grad_norm": 0.4506649076938629, + "learning_rate": 0.000534881316525042, + "loss": 5.0953, + "step": 743 + }, + { + "epoch": 0.23794199364368665, + "grad_norm": 0.4265020489692688, + "learning_rate": 0.0005346878146435119, + "loss": 5.0854, + "step": 744 + }, + { + "epoch": 0.23826180815127226, + "grad_norm": 0.7627310752868652, + "learning_rate": 0.0005344940608018345, + "loss": 5.0993, + "step": 745 + }, + { + "epoch": 0.23858162265885785, + "grad_norm": 0.40325039625167847, + "learning_rate": 0.0005343000552080235, + "loss": 5.1046, + "step": 746 + }, + { + "epoch": 0.23890143716644346, + "grad_norm": 0.4026595652103424, + "learning_rate": 0.0005341057980703624, + "loss": 5.0665, + "step": 747 + }, + { + "epoch": 0.23922125167402905, + "grad_norm": 0.38960710167884827, + "learning_rate": 0.0005339112895974054, + "loss": 5.1161, + "step": 748 + }, + { + "epoch": 0.23954106618161466, + "grad_norm": 0.37810978293418884, + "learning_rate": 0.0005337165299979761, + "loss": 5.1018, + "step": 749 + }, + { + "epoch": 0.23986088068920025, + "grad_norm": 0.406655877828598, + "learning_rate": 0.0005335215194811678, + "loss": 5.1116, + "step": 750 + }, + { + "epoch": 0.24018069519678586, + "grad_norm": 0.47911736369132996, + "learning_rate": 0.0005333262582563434, + "loss": 5.1131, + "step": 751 + }, + { + "epoch": 0.24050050970437145, + "grad_norm": 0.4555525779724121, + "learning_rate": 0.0005331307465331346, + "loss": 5.1441, + "step": 752 + }, + { + "epoch": 0.24082032421195707, + "grad_norm": 0.43813085556030273, + "learning_rate": 0.0005329349845214421, + "loss": 5.099, + "step": 753 + }, + { + "epoch": 0.24114013871954265, + "grad_norm": 0.411918967962265, + "learning_rate": 0.0005327389724314357, + "loss": 5.1283, + "step": 754 + }, + { + "epoch": 0.24145995322712827, + "grad_norm": 0.4153035879135132, + "learning_rate": 0.0005325427104735533, + "loss": 5.0294, + "step": 755 + }, + { + "epoch": 0.24177976773471388, + "grad_norm": 0.4288122057914734, + "learning_rate": 0.0005323461988585011, + "loss": 5.0778, + "step": 756 + }, + { + "epoch": 0.24209958224229947, + "grad_norm": 0.4059446454048157, + "learning_rate": 0.0005321494377972534, + "loss": 5.092, + "step": 757 + }, + { + "epoch": 0.24241939674988508, + "grad_norm": 0.4147324860095978, + "learning_rate": 0.0005319524275010524, + "loss": 5.0944, + "step": 758 + }, + { + "epoch": 0.24273921125747067, + "grad_norm": 0.43272823095321655, + "learning_rate": 0.0005317551681814076, + "loss": 5.1102, + "step": 759 + }, + { + "epoch": 0.24305902576505628, + "grad_norm": 0.39714720845222473, + "learning_rate": 0.0005315576600500962, + "loss": 5.1273, + "step": 760 + }, + { + "epoch": 0.24337884027264187, + "grad_norm": 0.38973182439804077, + "learning_rate": 0.0005313599033191622, + "loss": 5.0397, + "step": 761 + }, + { + "epoch": 0.24369865478022748, + "grad_norm": 0.5939362645149231, + "learning_rate": 0.0005311618982009168, + "loss": 5.1157, + "step": 762 + }, + { + "epoch": 0.24401846928781307, + "grad_norm": 0.38486504554748535, + "learning_rate": 0.0005309636449079377, + "loss": 5.1182, + "step": 763 + }, + { + "epoch": 0.24433828379539868, + "grad_norm": 0.4241164028644562, + "learning_rate": 0.0005307651436530688, + "loss": 5.1045, + "step": 764 + }, + { + "epoch": 0.24465809830298427, + "grad_norm": 0.39092934131622314, + "learning_rate": 0.0005305663946494208, + "loss": 5.0244, + "step": 765 + }, + { + "epoch": 0.24497791281056988, + "grad_norm": 0.3757341802120209, + "learning_rate": 0.0005303673981103698, + "loss": 5.0916, + "step": 766 + }, + { + "epoch": 0.24529772731815547, + "grad_norm": 0.41403621435165405, + "learning_rate": 0.000530168154249558, + "loss": 5.034, + "step": 767 + }, + { + "epoch": 0.24561754182574108, + "grad_norm": 0.4083942770957947, + "learning_rate": 0.000529968663280893, + "loss": 5.0816, + "step": 768 + }, + { + "epoch": 0.24593735633332667, + "grad_norm": 0.4417659044265747, + "learning_rate": 0.0005297689254185478, + "loss": 5.1167, + "step": 769 + }, + { + "epoch": 0.24625717084091228, + "grad_norm": 0.3944707214832306, + "learning_rate": 0.0005295689408769602, + "loss": 5.0785, + "step": 770 + }, + { + "epoch": 0.24657698534849787, + "grad_norm": 0.3857533037662506, + "learning_rate": 0.0005293687098708332, + "loss": 5.1196, + "step": 771 + }, + { + "epoch": 0.24689679985608348, + "grad_norm": 0.4001021981239319, + "learning_rate": 0.0005291682326151342, + "loss": 5.0776, + "step": 772 + }, + { + "epoch": 0.24721661436366907, + "grad_norm": 0.4205099046230316, + "learning_rate": 0.0005289675093250949, + "loss": 5.1358, + "step": 773 + }, + { + "epoch": 0.24753642887125468, + "grad_norm": 0.4752465784549713, + "learning_rate": 0.0005287665402162112, + "loss": 5.0899, + "step": 774 + }, + { + "epoch": 0.24785624337884027, + "grad_norm": 0.40020492672920227, + "learning_rate": 0.0005285653255042432, + "loss": 5.0728, + "step": 775 + }, + { + "epoch": 0.24817605788642588, + "grad_norm": 0.38226863741874695, + "learning_rate": 0.0005283638654052141, + "loss": 5.0339, + "step": 776 + }, + { + "epoch": 0.24849587239401147, + "grad_norm": 0.39739298820495605, + "learning_rate": 0.000528162160135411, + "loss": 5.1028, + "step": 777 + }, + { + "epoch": 0.24881568690159708, + "grad_norm": 0.41745543479919434, + "learning_rate": 0.000527960209911384, + "loss": 5.0628, + "step": 778 + }, + { + "epoch": 0.24913550140918267, + "grad_norm": 0.4319940507411957, + "learning_rate": 0.0005277580149499465, + "loss": 5.0945, + "step": 779 + }, + { + "epoch": 0.24945531591676828, + "grad_norm": 0.36128103733062744, + "learning_rate": 0.0005275555754681742, + "loss": 5.0755, + "step": 780 + }, + { + "epoch": 0.24977513042435387, + "grad_norm": 0.3810884356498718, + "learning_rate": 0.0005273528916834056, + "loss": 5.0289, + "step": 781 + }, + { + "epoch": 0.25009494493193946, + "grad_norm": 0.37757015228271484, + "learning_rate": 0.0005271499638132415, + "loss": 5.0682, + "step": 782 + }, + { + "epoch": 0.25041475943952507, + "grad_norm": 0.34914740920066833, + "learning_rate": 0.0005269467920755446, + "loss": 5.0937, + "step": 783 + }, + { + "epoch": 0.2507345739471107, + "grad_norm": 0.43093082308769226, + "learning_rate": 0.0005267433766884394, + "loss": 5.0231, + "step": 784 + }, + { + "epoch": 0.2510543884546963, + "grad_norm": 0.36913996934890747, + "learning_rate": 0.0005265397178703122, + "loss": 5.0134, + "step": 785 + }, + { + "epoch": 0.25137420296228186, + "grad_norm": 0.34622567892074585, + "learning_rate": 0.0005263358158398104, + "loss": 4.979, + "step": 786 + }, + { + "epoch": 0.25169401746986747, + "grad_norm": 0.3433881998062134, + "learning_rate": 0.0005261316708158426, + "loss": 5.0447, + "step": 787 + }, + { + "epoch": 0.2520138319774531, + "grad_norm": 0.34745779633522034, + "learning_rate": 0.0005259272830175784, + "loss": 5.015, + "step": 788 + }, + { + "epoch": 0.2523336464850387, + "grad_norm": 0.38063815236091614, + "learning_rate": 0.0005257226526644478, + "loss": 5.0884, + "step": 789 + }, + { + "epoch": 0.25265346099262426, + "grad_norm": 0.355397492647171, + "learning_rate": 0.0005255177799761416, + "loss": 5.0463, + "step": 790 + }, + { + "epoch": 0.25297327550020987, + "grad_norm": 0.4084603190422058, + "learning_rate": 0.0005253126651726102, + "loss": 5.0605, + "step": 791 + }, + { + "epoch": 0.2532930900077955, + "grad_norm": 0.37883585691452026, + "learning_rate": 0.0005251073084740646, + "loss": 5.058, + "step": 792 + }, + { + "epoch": 0.2536129045153811, + "grad_norm": 0.36867570877075195, + "learning_rate": 0.0005249017101009747, + "loss": 5.0808, + "step": 793 + }, + { + "epoch": 0.25393271902296666, + "grad_norm": 0.38485613465309143, + "learning_rate": 0.0005246958702740707, + "loss": 5.0665, + "step": 794 + }, + { + "epoch": 0.25425253353055227, + "grad_norm": 0.3903096318244934, + "learning_rate": 0.0005244897892143414, + "loss": 4.9922, + "step": 795 + }, + { + "epoch": 0.2545723480381379, + "grad_norm": 0.38079750537872314, + "learning_rate": 0.0005242834671430349, + "loss": 5.0481, + "step": 796 + }, + { + "epoch": 0.2548921625457235, + "grad_norm": 0.36984172463417053, + "learning_rate": 0.0005240769042816581, + "loss": 4.9691, + "step": 797 + }, + { + "epoch": 0.25521197705330906, + "grad_norm": 0.3557301461696625, + "learning_rate": 0.0005238701008519761, + "loss": 4.9433, + "step": 798 + }, + { + "epoch": 0.25553179156089467, + "grad_norm": 0.36051568388938904, + "learning_rate": 0.0005236630570760126, + "loss": 5.0263, + "step": 799 + }, + { + "epoch": 0.2558516060684803, + "grad_norm": 0.3609025180339813, + "learning_rate": 0.0005234557731760489, + "loss": 5.0769, + "step": 800 + }, + { + "epoch": 0.2558516060684803, + "eval_loss": 5.047708988189697, + "eval_runtime": 82.315, + "eval_samples_per_second": 23.046, + "eval_steps_per_second": 5.771, + "step": 800 + }, + { + "epoch": 0.2561714205760659, + "grad_norm": 0.35658442974090576, + "learning_rate": 0.0005232482493746247, + "loss": 5.0813, + "step": 801 + }, + { + "epoch": 0.25649123508365146, + "grad_norm": 0.37020763754844666, + "learning_rate": 0.0005230404858945369, + "loss": 5.1019, + "step": 802 + }, + { + "epoch": 0.2568110495912371, + "grad_norm": 0.34233710169792175, + "learning_rate": 0.0005228324829588396, + "loss": 5.1039, + "step": 803 + }, + { + "epoch": 0.2571308640988227, + "grad_norm": 0.37765568494796753, + "learning_rate": 0.0005226242407908441, + "loss": 5.0349, + "step": 804 + }, + { + "epoch": 0.2574506786064083, + "grad_norm": 0.35139763355255127, + "learning_rate": 0.0005224157596141189, + "loss": 5.0445, + "step": 805 + }, + { + "epoch": 0.25777049311399386, + "grad_norm": 0.3577726483345032, + "learning_rate": 0.0005222070396524886, + "loss": 4.9836, + "step": 806 + }, + { + "epoch": 0.2580903076215795, + "grad_norm": 0.34711799025535583, + "learning_rate": 0.0005219980811300342, + "loss": 5.1969, + "step": 807 + }, + { + "epoch": 0.2584101221291651, + "grad_norm": 0.36387181282043457, + "learning_rate": 0.0005217888842710931, + "loss": 5.0998, + "step": 808 + }, + { + "epoch": 0.2587299366367507, + "grad_norm": 0.3965149521827698, + "learning_rate": 0.0005215794493002583, + "loss": 4.9794, + "step": 809 + }, + { + "epoch": 0.25904975114433626, + "grad_norm": 0.39265790581703186, + "learning_rate": 0.000521369776442379, + "loss": 5.0195, + "step": 810 + }, + { + "epoch": 0.2593695656519219, + "grad_norm": 0.381270170211792, + "learning_rate": 0.0005211598659225588, + "loss": 5.0358, + "step": 811 + }, + { + "epoch": 0.2596893801595075, + "grad_norm": 0.38809168338775635, + "learning_rate": 0.0005209497179661573, + "loss": 5.1098, + "step": 812 + }, + { + "epoch": 0.2600091946670931, + "grad_norm": 0.4025643765926361, + "learning_rate": 0.0005207393327987886, + "loss": 5.0135, + "step": 813 + }, + { + "epoch": 0.26032900917467866, + "grad_norm": 0.3857475817203522, + "learning_rate": 0.0005205287106463219, + "loss": 5.0203, + "step": 814 + }, + { + "epoch": 0.2606488236822643, + "grad_norm": 0.386263906955719, + "learning_rate": 0.0005203178517348801, + "loss": 5.0744, + "step": 815 + }, + { + "epoch": 0.2609686381898499, + "grad_norm": 0.4239501357078552, + "learning_rate": 0.0005201067562908409, + "loss": 4.9913, + "step": 816 + }, + { + "epoch": 0.2612884526974355, + "grad_norm": 0.3845202922821045, + "learning_rate": 0.0005198954245408359, + "loss": 4.9916, + "step": 817 + }, + { + "epoch": 0.26160826720502106, + "grad_norm": 0.3505474328994751, + "learning_rate": 0.00051968385671175, + "loss": 5.0195, + "step": 818 + }, + { + "epoch": 0.2619280817126067, + "grad_norm": 0.36485254764556885, + "learning_rate": 0.000519472053030722, + "loss": 5.023, + "step": 819 + }, + { + "epoch": 0.2622478962201923, + "grad_norm": 0.3782816231250763, + "learning_rate": 0.0005192600137251435, + "loss": 5.0162, + "step": 820 + }, + { + "epoch": 0.2625677107277779, + "grad_norm": 0.5833194255828857, + "learning_rate": 0.0005190477390226595, + "loss": 5.1193, + "step": 821 + }, + { + "epoch": 0.26288752523536346, + "grad_norm": 0.3927282989025116, + "learning_rate": 0.0005188352291511673, + "loss": 4.9848, + "step": 822 + }, + { + "epoch": 0.2632073397429491, + "grad_norm": 0.4369112253189087, + "learning_rate": 0.000518622484338817, + "loss": 5.0558, + "step": 823 + }, + { + "epoch": 0.2635271542505347, + "grad_norm": 0.3752604126930237, + "learning_rate": 0.0005184095048140106, + "loss": 5.0508, + "step": 824 + }, + { + "epoch": 0.2638469687581203, + "grad_norm": 0.372988224029541, + "learning_rate": 0.0005181962908054027, + "loss": 5.0706, + "step": 825 + }, + { + "epoch": 0.2641667832657059, + "grad_norm": 0.371185839176178, + "learning_rate": 0.0005179828425418988, + "loss": 5.029, + "step": 826 + }, + { + "epoch": 0.2644865977732915, + "grad_norm": 0.4701564908027649, + "learning_rate": 0.0005177691602526566, + "loss": 5.0012, + "step": 827 + }, + { + "epoch": 0.2648064122808771, + "grad_norm": 0.37562939524650574, + "learning_rate": 0.0005175552441670847, + "loss": 5.0122, + "step": 828 + }, + { + "epoch": 0.2651262267884627, + "grad_norm": 0.3713657855987549, + "learning_rate": 0.0005173410945148427, + "loss": 5.0855, + "step": 829 + }, + { + "epoch": 0.2654460412960483, + "grad_norm": 0.3984169661998749, + "learning_rate": 0.0005171267115258412, + "loss": 5.0673, + "step": 830 + }, + { + "epoch": 0.2657658558036339, + "grad_norm": 0.3704157769680023, + "learning_rate": 0.0005169120954302409, + "loss": 5.0133, + "step": 831 + }, + { + "epoch": 0.2660856703112195, + "grad_norm": 0.38717636466026306, + "learning_rate": 0.0005166972464584532, + "loss": 4.9717, + "step": 832 + }, + { + "epoch": 0.2664054848188051, + "grad_norm": 0.37879058718681335, + "learning_rate": 0.0005164821648411394, + "loss": 5.0024, + "step": 833 + }, + { + "epoch": 0.2667252993263907, + "grad_norm": 0.37203672528266907, + "learning_rate": 0.0005162668508092103, + "loss": 4.9128, + "step": 834 + }, + { + "epoch": 0.2670451138339763, + "grad_norm": 0.3519008755683899, + "learning_rate": 0.0005160513045938265, + "loss": 4.9973, + "step": 835 + }, + { + "epoch": 0.2673649283415619, + "grad_norm": 0.36659252643585205, + "learning_rate": 0.0005158355264263978, + "loss": 5.0137, + "step": 836 + }, + { + "epoch": 0.2676847428491475, + "grad_norm": 0.36538055539131165, + "learning_rate": 0.0005156195165385829, + "loss": 5.009, + "step": 837 + }, + { + "epoch": 0.2680045573567331, + "grad_norm": 0.3755475580692291, + "learning_rate": 0.0005154032751622894, + "loss": 5.0259, + "step": 838 + }, + { + "epoch": 0.2683243718643187, + "grad_norm": 0.3508605360984802, + "learning_rate": 0.0005151868025296736, + "loss": 5.095, + "step": 839 + }, + { + "epoch": 0.2686441863719043, + "grad_norm": 0.3421269655227661, + "learning_rate": 0.0005149700988731397, + "loss": 4.9484, + "step": 840 + }, + { + "epoch": 0.2689640008794899, + "grad_norm": 0.42560285329818726, + "learning_rate": 0.0005147531644253402, + "loss": 5.11, + "step": 841 + }, + { + "epoch": 0.2692838153870755, + "grad_norm": 0.3709728717803955, + "learning_rate": 0.0005145359994191751, + "loss": 5.0851, + "step": 842 + }, + { + "epoch": 0.2696036298946611, + "grad_norm": 0.34871870279312134, + "learning_rate": 0.0005143186040877923, + "loss": 4.9554, + "step": 843 + }, + { + "epoch": 0.2699234444022467, + "grad_norm": 0.3683784604072571, + "learning_rate": 0.0005141009786645868, + "loss": 4.9996, + "step": 844 + }, + { + "epoch": 0.2702432589098323, + "grad_norm": 0.3769448697566986, + "learning_rate": 0.0005138831233832005, + "loss": 5.0486, + "step": 845 + }, + { + "epoch": 0.2705630734174179, + "grad_norm": 0.36065971851348877, + "learning_rate": 0.0005136650384775221, + "loss": 4.9998, + "step": 846 + }, + { + "epoch": 0.2708828879250035, + "grad_norm": 0.37307313084602356, + "learning_rate": 0.0005134467241816872, + "loss": 4.9706, + "step": 847 + }, + { + "epoch": 0.2712027024325891, + "grad_norm": 0.35749685764312744, + "learning_rate": 0.0005132281807300773, + "loss": 5.0893, + "step": 848 + }, + { + "epoch": 0.2715225169401747, + "grad_norm": 0.3517495095729828, + "learning_rate": 0.0005130094083573198, + "loss": 4.993, + "step": 849 + }, + { + "epoch": 0.2718423314477603, + "grad_norm": 0.3668519854545593, + "learning_rate": 0.0005127904072982884, + "loss": 5.0027, + "step": 850 + }, + { + "epoch": 0.2721621459553459, + "grad_norm": 0.3489197790622711, + "learning_rate": 0.0005125711777881016, + "loss": 4.9578, + "step": 851 + }, + { + "epoch": 0.2724819604629315, + "grad_norm": 0.39180049300193787, + "learning_rate": 0.0005123517200621238, + "loss": 5.0029, + "step": 852 + }, + { + "epoch": 0.2728017749705171, + "grad_norm": 0.37387964129447937, + "learning_rate": 0.0005121320343559641, + "loss": 4.9993, + "step": 853 + }, + { + "epoch": 0.2731215894781027, + "grad_norm": 0.369540274143219, + "learning_rate": 0.0005119121209054767, + "loss": 5.0809, + "step": 854 + }, + { + "epoch": 0.2734414039856883, + "grad_norm": 0.3686256408691406, + "learning_rate": 0.0005116919799467597, + "loss": 5.0736, + "step": 855 + }, + { + "epoch": 0.2737612184932739, + "grad_norm": 0.36162644624710083, + "learning_rate": 0.0005114716117161558, + "loss": 5.0013, + "step": 856 + }, + { + "epoch": 0.2740810330008595, + "grad_norm": 0.3603357970714569, + "learning_rate": 0.0005112510164502518, + "loss": 4.982, + "step": 857 + }, + { + "epoch": 0.2744008475084451, + "grad_norm": 0.41782650351524353, + "learning_rate": 0.000511030194385878, + "loss": 4.9961, + "step": 858 + }, + { + "epoch": 0.2747206620160307, + "grad_norm": 0.37291574478149414, + "learning_rate": 0.0005108091457601085, + "loss": 4.9476, + "step": 859 + }, + { + "epoch": 0.2750404765236163, + "grad_norm": 0.3500765562057495, + "learning_rate": 0.0005105878708102604, + "loss": 4.9956, + "step": 860 + }, + { + "epoch": 0.2753602910312019, + "grad_norm": 0.6664988398551941, + "learning_rate": 0.0005103663697738937, + "loss": 4.9269, + "step": 861 + }, + { + "epoch": 0.2756801055387875, + "grad_norm": 0.45024263858795166, + "learning_rate": 0.0005101446428888115, + "loss": 5.0055, + "step": 862 + }, + { + "epoch": 0.2759999200463731, + "grad_norm": 0.3707965314388275, + "learning_rate": 0.0005099226903930589, + "loss": 5.037, + "step": 863 + }, + { + "epoch": 0.2763197345539587, + "grad_norm": 0.4126085937023163, + "learning_rate": 0.0005097005125249236, + "loss": 4.9696, + "step": 864 + }, + { + "epoch": 0.2766395490615443, + "grad_norm": 0.34763750433921814, + "learning_rate": 0.0005094781095229352, + "loss": 4.9535, + "step": 865 + }, + { + "epoch": 0.2769593635691299, + "grad_norm": 0.35203567147254944, + "learning_rate": 0.0005092554816258644, + "loss": 4.9696, + "step": 866 + }, + { + "epoch": 0.2772791780767155, + "grad_norm": 0.36334866285324097, + "learning_rate": 0.0005090326290727245, + "loss": 4.9889, + "step": 867 + }, + { + "epoch": 0.2775989925843011, + "grad_norm": 0.40206074714660645, + "learning_rate": 0.0005088095521027689, + "loss": 4.9872, + "step": 868 + }, + { + "epoch": 0.2779188070918867, + "grad_norm": 0.35013046860694885, + "learning_rate": 0.0005085862509554926, + "loss": 4.9679, + "step": 869 + }, + { + "epoch": 0.2782386215994723, + "grad_norm": 0.40070170164108276, + "learning_rate": 0.000508362725870631, + "loss": 4.9543, + "step": 870 + }, + { + "epoch": 0.2785584361070579, + "grad_norm": 0.3722264766693115, + "learning_rate": 0.0005081389770881599, + "loss": 5.0307, + "step": 871 + }, + { + "epoch": 0.2788782506146435, + "grad_norm": 0.3882618844509125, + "learning_rate": 0.0005079150048482954, + "loss": 4.9114, + "step": 872 + }, + { + "epoch": 0.2791980651222291, + "grad_norm": 0.3612721562385559, + "learning_rate": 0.0005076908093914936, + "loss": 4.9894, + "step": 873 + }, + { + "epoch": 0.2795178796298147, + "grad_norm": 0.39893192052841187, + "learning_rate": 0.0005074663909584498, + "loss": 4.9934, + "step": 874 + }, + { + "epoch": 0.2798376941374003, + "grad_norm": 0.37248483300209045, + "learning_rate": 0.000507241749790099, + "loss": 4.9761, + "step": 875 + }, + { + "epoch": 0.2801575086449859, + "grad_norm": 0.41064324975013733, + "learning_rate": 0.0005070168861276155, + "loss": 4.9979, + "step": 876 + }, + { + "epoch": 0.2804773231525715, + "grad_norm": 0.5655444264411926, + "learning_rate": 0.0005067918002124121, + "loss": 4.9685, + "step": 877 + }, + { + "epoch": 0.2807971376601571, + "grad_norm": 0.37055379152297974, + "learning_rate": 0.0005065664922861405, + "loss": 5.0583, + "step": 878 + }, + { + "epoch": 0.2811169521677427, + "grad_norm": 0.342821329832077, + "learning_rate": 0.0005063409625906905, + "loss": 4.962, + "step": 879 + }, + { + "epoch": 0.2814367666753283, + "grad_norm": 0.3737742006778717, + "learning_rate": 0.0005061152113681901, + "loss": 4.9767, + "step": 880 + }, + { + "epoch": 0.2817565811829139, + "grad_norm": 0.34838446974754333, + "learning_rate": 0.0005058892388610053, + "loss": 4.9858, + "step": 881 + }, + { + "epoch": 0.2820763956904995, + "grad_norm": 0.362216979265213, + "learning_rate": 0.0005056630453117394, + "loss": 4.986, + "step": 882 + }, + { + "epoch": 0.2823962101980851, + "grad_norm": 0.6060847640037537, + "learning_rate": 0.0005054366309632333, + "loss": 5.0062, + "step": 883 + }, + { + "epoch": 0.2827160247056707, + "grad_norm": 0.3720206022262573, + "learning_rate": 0.0005052099960585645, + "loss": 4.9763, + "step": 884 + }, + { + "epoch": 0.2830358392132563, + "grad_norm": 0.3505547046661377, + "learning_rate": 0.0005049831408410478, + "loss": 4.9872, + "step": 885 + }, + { + "epoch": 0.2833556537208419, + "grad_norm": 0.3697142004966736, + "learning_rate": 0.0005047560655542342, + "loss": 4.9589, + "step": 886 + }, + { + "epoch": 0.2836754682284275, + "grad_norm": 0.3885795474052429, + "learning_rate": 0.000504528770441911, + "loss": 4.974, + "step": 887 + }, + { + "epoch": 0.2839952827360131, + "grad_norm": 0.36806532740592957, + "learning_rate": 0.0005043012557481016, + "loss": 4.9719, + "step": 888 + }, + { + "epoch": 0.2843150972435987, + "grad_norm": 0.3809673488140106, + "learning_rate": 0.0005040735217170653, + "loss": 4.9729, + "step": 889 + }, + { + "epoch": 0.2846349117511843, + "grad_norm": 0.38288503885269165, + "learning_rate": 0.0005038455685932964, + "loss": 4.9595, + "step": 890 + }, + { + "epoch": 0.2849547262587699, + "grad_norm": 0.3831663429737091, + "learning_rate": 0.0005036173966215248, + "loss": 4.9489, + "step": 891 + }, + { + "epoch": 0.2852745407663555, + "grad_norm": 0.3987366557121277, + "learning_rate": 0.0005033890060467153, + "loss": 4.9182, + "step": 892 + }, + { + "epoch": 0.2855943552739411, + "grad_norm": 0.37844398617744446, + "learning_rate": 0.0005031603971140674, + "loss": 4.9515, + "step": 893 + }, + { + "epoch": 0.28591416978152673, + "grad_norm": 0.4271279275417328, + "learning_rate": 0.000502931570069015, + "loss": 4.8621, + "step": 894 + }, + { + "epoch": 0.28623398428911234, + "grad_norm": 0.36089277267456055, + "learning_rate": 0.0005027025251572259, + "loss": 4.973, + "step": 895 + }, + { + "epoch": 0.2865537987966979, + "grad_norm": 0.47260555624961853, + "learning_rate": 0.0005024732626246022, + "loss": 4.9528, + "step": 896 + }, + { + "epoch": 0.2868736133042835, + "grad_norm": 0.3725051283836365, + "learning_rate": 0.0005022437827172795, + "loss": 5.0413, + "step": 897 + }, + { + "epoch": 0.28719342781186913, + "grad_norm": 0.3459242284297943, + "learning_rate": 0.0005020140856816268, + "loss": 5.0664, + "step": 898 + }, + { + "epoch": 0.28751324231945474, + "grad_norm": 0.3797368109226227, + "learning_rate": 0.0005017841717642461, + "loss": 5.0057, + "step": 899 + }, + { + "epoch": 0.2878330568270403, + "grad_norm": 0.3647208511829376, + "learning_rate": 0.0005015540412119721, + "loss": 4.9082, + "step": 900 + }, + { + "epoch": 0.2878330568270403, + "eval_loss": 4.972633361816406, + "eval_runtime": 83.5318, + "eval_samples_per_second": 22.71, + "eval_steps_per_second": 5.686, + "step": 900 + }, + { + "epoch": 0.2881528713346259, + "grad_norm": 0.3690331280231476, + "learning_rate": 0.0005013236942718725, + "loss": 4.9479, + "step": 901 + }, + { + "epoch": 0.28847268584221153, + "grad_norm": 0.3647288680076599, + "learning_rate": 0.0005010931311912473, + "loss": 5.0071, + "step": 902 + }, + { + "epoch": 0.28879250034979714, + "grad_norm": 0.4730885326862335, + "learning_rate": 0.0005008623522176279, + "loss": 4.9958, + "step": 903 + }, + { + "epoch": 0.2891123148573827, + "grad_norm": 0.3861388564109802, + "learning_rate": 0.0005006313575987784, + "loss": 4.951, + "step": 904 + }, + { + "epoch": 0.2894321293649683, + "grad_norm": 0.36299601197242737, + "learning_rate": 0.0005004001475826935, + "loss": 5.0293, + "step": 905 + }, + { + "epoch": 0.28975194387255393, + "grad_norm": 0.41219601035118103, + "learning_rate": 0.0005001687224175999, + "loss": 5.0395, + "step": 906 + }, + { + "epoch": 0.29007175838013954, + "grad_norm": 0.36094823479652405, + "learning_rate": 0.0004999370823519548, + "loss": 5.0214, + "step": 907 + }, + { + "epoch": 0.2903915728877251, + "grad_norm": 0.3846047818660736, + "learning_rate": 0.0004997052276344463, + "loss": 5.0025, + "step": 908 + }, + { + "epoch": 0.2907113873953107, + "grad_norm": 0.3636736571788788, + "learning_rate": 0.000499473158513993, + "loss": 5.0134, + "step": 909 + }, + { + "epoch": 0.29103120190289633, + "grad_norm": 0.38272932171821594, + "learning_rate": 0.0004992408752397437, + "loss": 4.9437, + "step": 910 + }, + { + "epoch": 0.29135101641048194, + "grad_norm": 0.34067991375923157, + "learning_rate": 0.0004990083780610769, + "loss": 4.8482, + "step": 911 + }, + { + "epoch": 0.2916708309180675, + "grad_norm": 0.35028305649757385, + "learning_rate": 0.000498775667227601, + "loss": 4.9277, + "step": 912 + }, + { + "epoch": 0.2919906454256531, + "grad_norm": 0.37718984484672546, + "learning_rate": 0.0004985427429891536, + "loss": 5.0258, + "step": 913 + }, + { + "epoch": 0.29231045993323873, + "grad_norm": 0.33929359912872314, + "learning_rate": 0.0004983096055958014, + "loss": 4.9383, + "step": 914 + }, + { + "epoch": 0.29263027444082435, + "grad_norm": 0.4247644245624542, + "learning_rate": 0.0004980762552978403, + "loss": 4.9067, + "step": 915 + }, + { + "epoch": 0.2929500889484099, + "grad_norm": 0.3345932364463806, + "learning_rate": 0.0004978426923457942, + "loss": 4.9354, + "step": 916 + }, + { + "epoch": 0.2932699034559955, + "grad_norm": 0.36324384808540344, + "learning_rate": 0.0004976089169904156, + "loss": 4.8719, + "step": 917 + }, + { + "epoch": 0.29358971796358113, + "grad_norm": 0.37019455432891846, + "learning_rate": 0.0004973749294826853, + "loss": 4.9591, + "step": 918 + }, + { + "epoch": 0.29390953247116675, + "grad_norm": 0.35735613107681274, + "learning_rate": 0.0004971407300738114, + "loss": 4.8741, + "step": 919 + }, + { + "epoch": 0.2942293469787523, + "grad_norm": 0.34016358852386475, + "learning_rate": 0.0004969063190152297, + "loss": 4.8985, + "step": 920 + }, + { + "epoch": 0.2945491614863379, + "grad_norm": 0.3893618881702423, + "learning_rate": 0.0004966716965586033, + "loss": 4.9197, + "step": 921 + }, + { + "epoch": 0.29486897599392353, + "grad_norm": 0.34983229637145996, + "learning_rate": 0.0004964368629558221, + "loss": 4.9452, + "step": 922 + }, + { + "epoch": 0.29518879050150915, + "grad_norm": 0.3541092574596405, + "learning_rate": 0.0004962018184590028, + "loss": 4.9591, + "step": 923 + }, + { + "epoch": 0.2955086050090947, + "grad_norm": 0.39640942215919495, + "learning_rate": 0.0004959665633204885, + "loss": 4.8656, + "step": 924 + }, + { + "epoch": 0.2958284195166803, + "grad_norm": 0.3901992440223694, + "learning_rate": 0.0004957310977928484, + "loss": 4.9126, + "step": 925 + }, + { + "epoch": 0.29614823402426593, + "grad_norm": 0.36173298954963684, + "learning_rate": 0.0004954954221288775, + "loss": 4.9545, + "step": 926 + }, + { + "epoch": 0.29646804853185155, + "grad_norm": 0.3484973907470703, + "learning_rate": 0.0004952595365815967, + "loss": 4.9946, + "step": 927 + }, + { + "epoch": 0.2967878630394371, + "grad_norm": 0.366262823343277, + "learning_rate": 0.0004950234414042519, + "loss": 4.8764, + "step": 928 + }, + { + "epoch": 0.2971076775470227, + "grad_norm": 0.348197877407074, + "learning_rate": 0.0004947871368503143, + "loss": 4.9088, + "step": 929 + }, + { + "epoch": 0.29742749205460833, + "grad_norm": 0.4134189486503601, + "learning_rate": 0.0004945506231734796, + "loss": 4.985, + "step": 930 + }, + { + "epoch": 0.29774730656219395, + "grad_norm": 0.3315630555152893, + "learning_rate": 0.0004943139006276683, + "loss": 4.9724, + "step": 931 + }, + { + "epoch": 0.2980671210697795, + "grad_norm": 0.3424299955368042, + "learning_rate": 0.0004940769694670251, + "loss": 5.0209, + "step": 932 + }, + { + "epoch": 0.2983869355773651, + "grad_norm": 0.3581695556640625, + "learning_rate": 0.0004938398299459183, + "loss": 4.9245, + "step": 933 + }, + { + "epoch": 0.29870675008495073, + "grad_norm": 0.34082433581352234, + "learning_rate": 0.0004936024823189406, + "loss": 4.882, + "step": 934 + }, + { + "epoch": 0.29902656459253635, + "grad_norm": 0.40066200494766235, + "learning_rate": 0.0004933649268409073, + "loss": 4.8821, + "step": 935 + }, + { + "epoch": 0.2993463791001219, + "grad_norm": 0.3584911823272705, + "learning_rate": 0.0004931271637668577, + "loss": 5.0226, + "step": 936 + }, + { + "epoch": 0.2996661936077075, + "grad_norm": 0.37688034772872925, + "learning_rate": 0.0004928891933520533, + "loss": 4.9522, + "step": 937 + }, + { + "epoch": 0.29998600811529313, + "grad_norm": 0.37159818410873413, + "learning_rate": 0.0004926510158519784, + "loss": 4.9429, + "step": 938 + }, + { + "epoch": 0.30030582262287875, + "grad_norm": 0.39578378200531006, + "learning_rate": 0.0004924126315223396, + "loss": 5.0257, + "step": 939 + }, + { + "epoch": 0.3006256371304643, + "grad_norm": 0.36488571763038635, + "learning_rate": 0.0004921740406190659, + "loss": 4.898, + "step": 940 + }, + { + "epoch": 0.3009454516380499, + "grad_norm": 0.3469265103340149, + "learning_rate": 0.0004919352433983075, + "loss": 4.877, + "step": 941 + }, + { + "epoch": 0.30126526614563554, + "grad_norm": 0.3934706151485443, + "learning_rate": 0.0004916962401164365, + "loss": 4.9137, + "step": 942 + }, + { + "epoch": 0.30158508065322115, + "grad_norm": 0.3785736858844757, + "learning_rate": 0.0004914570310300462, + "loss": 4.9126, + "step": 943 + }, + { + "epoch": 0.3019048951608067, + "grad_norm": 0.39063695073127747, + "learning_rate": 0.0004912176163959506, + "loss": 4.975, + "step": 944 + }, + { + "epoch": 0.3022247096683923, + "grad_norm": 0.3680741488933563, + "learning_rate": 0.0004909779964711848, + "loss": 4.8925, + "step": 945 + }, + { + "epoch": 0.30254452417597794, + "grad_norm": 0.3956775963306427, + "learning_rate": 0.0004907381715130038, + "loss": 4.8729, + "step": 946 + }, + { + "epoch": 0.30286433868356355, + "grad_norm": 0.39570778608322144, + "learning_rate": 0.000490498141778883, + "loss": 4.9321, + "step": 947 + }, + { + "epoch": 0.3031841531911491, + "grad_norm": 0.38764604926109314, + "learning_rate": 0.0004902579075265178, + "loss": 4.9018, + "step": 948 + }, + { + "epoch": 0.3035039676987347, + "grad_norm": 0.3573993742465973, + "learning_rate": 0.0004900174690138229, + "loss": 4.9348, + "step": 949 + }, + { + "epoch": 0.30382378220632034, + "grad_norm": 0.3773062825202942, + "learning_rate": 0.0004897768264989323, + "loss": 4.9789, + "step": 950 + }, + { + "epoch": 0.30414359671390595, + "grad_norm": 0.36863696575164795, + "learning_rate": 0.0004895359802401992, + "loss": 4.973, + "step": 951 + }, + { + "epoch": 0.3044634112214915, + "grad_norm": 0.3877570629119873, + "learning_rate": 0.0004892949304961952, + "loss": 4.9151, + "step": 952 + }, + { + "epoch": 0.3047832257290771, + "grad_norm": 0.3498269021511078, + "learning_rate": 0.0004890536775257109, + "loss": 4.8523, + "step": 953 + }, + { + "epoch": 0.30510304023666274, + "grad_norm": 0.35669517517089844, + "learning_rate": 0.0004888122215877547, + "loss": 4.9247, + "step": 954 + }, + { + "epoch": 0.30542285474424835, + "grad_norm": 0.3852956295013428, + "learning_rate": 0.0004885705629415528, + "loss": 4.9261, + "step": 955 + }, + { + "epoch": 0.3057426692518339, + "grad_norm": 0.3802240788936615, + "learning_rate": 0.0004883287018465494, + "loss": 4.9425, + "step": 956 + }, + { + "epoch": 0.3060624837594195, + "grad_norm": 0.3908936381340027, + "learning_rate": 0.00048808663856240596, + "loss": 5.0087, + "step": 957 + }, + { + "epoch": 0.30638229826700514, + "grad_norm": 0.39753517508506775, + "learning_rate": 0.0004878443733490006, + "loss": 4.919, + "step": 958 + }, + { + "epoch": 0.30670211277459075, + "grad_norm": 0.3648401200771332, + "learning_rate": 0.00048760190646642866, + "loss": 4.8682, + "step": 959 + }, + { + "epoch": 0.3070219272821763, + "grad_norm": 0.4140999913215637, + "learning_rate": 0.000487359238175002, + "loss": 4.9643, + "step": 960 + }, + { + "epoch": 0.3073417417897619, + "grad_norm": 0.3538423180580139, + "learning_rate": 0.00048711636873524856, + "loss": 4.9116, + "step": 961 + }, + { + "epoch": 0.30766155629734754, + "grad_norm": 0.43048593401908875, + "learning_rate": 0.00048687329840791207, + "loss": 4.9888, + "step": 962 + }, + { + "epoch": 0.30798137080493315, + "grad_norm": 0.3531091511249542, + "learning_rate": 0.0004866300274539523, + "loss": 4.9137, + "step": 963 + }, + { + "epoch": 0.30830118531251877, + "grad_norm": 0.38962531089782715, + "learning_rate": 0.0004863865561345442, + "loss": 4.9497, + "step": 964 + }, + { + "epoch": 0.3086209998201043, + "grad_norm": 0.3531169295310974, + "learning_rate": 0.00048614288471107774, + "loss": 4.856, + "step": 965 + }, + { + "epoch": 0.30894081432768994, + "grad_norm": 0.36637869477272034, + "learning_rate": 0.00048589901344515805, + "loss": 4.9839, + "step": 966 + }, + { + "epoch": 0.30926062883527555, + "grad_norm": 0.3657850921154022, + "learning_rate": 0.00048565494259860434, + "loss": 4.9139, + "step": 967 + }, + { + "epoch": 0.30958044334286117, + "grad_norm": 0.35608503222465515, + "learning_rate": 0.00048541067243345064, + "loss": 4.9159, + "step": 968 + }, + { + "epoch": 0.3099002578504467, + "grad_norm": 0.38662147521972656, + "learning_rate": 0.00048516620321194443, + "loss": 4.9352, + "step": 969 + }, + { + "epoch": 0.31022007235803234, + "grad_norm": 0.3447380065917969, + "learning_rate": 0.0004849215351965474, + "loss": 4.8427, + "step": 970 + }, + { + "epoch": 0.31053988686561795, + "grad_norm": 0.3676728308200836, + "learning_rate": 0.0004846766686499342, + "loss": 4.9424, + "step": 971 + }, + { + "epoch": 0.31085970137320357, + "grad_norm": 0.34721434116363525, + "learning_rate": 0.0004844316038349929, + "loss": 4.8913, + "step": 972 + }, + { + "epoch": 0.3111795158807891, + "grad_norm": 0.3613666892051697, + "learning_rate": 0.00048418634101482435, + "loss": 4.8762, + "step": 973 + }, + { + "epoch": 0.31149933038837474, + "grad_norm": 0.3623393774032593, + "learning_rate": 0.000483940880452742, + "loss": 4.9168, + "step": 974 + }, + { + "epoch": 0.31181914489596035, + "grad_norm": 0.3604717254638672, + "learning_rate": 0.0004836952224122716, + "loss": 4.9698, + "step": 975 + }, + { + "epoch": 0.31213895940354597, + "grad_norm": 0.3825310468673706, + "learning_rate": 0.00048344936715715104, + "loss": 4.9762, + "step": 976 + }, + { + "epoch": 0.3124587739111315, + "grad_norm": 0.34435907006263733, + "learning_rate": 0.0004832033149513295, + "loss": 4.8628, + "step": 977 + }, + { + "epoch": 0.31277858841871714, + "grad_norm": 0.6025840640068054, + "learning_rate": 0.0004829570660589681, + "loss": 4.9957, + "step": 978 + }, + { + "epoch": 0.31309840292630275, + "grad_norm": 0.3436499238014221, + "learning_rate": 0.0004827106207444389, + "loss": 4.8879, + "step": 979 + }, + { + "epoch": 0.31341821743388837, + "grad_norm": 0.4185909032821655, + "learning_rate": 0.00048246397927232483, + "loss": 4.9352, + "step": 980 + }, + { + "epoch": 0.3137380319414739, + "grad_norm": 0.358967125415802, + "learning_rate": 0.00048221714190741947, + "loss": 4.9247, + "step": 981 + }, + { + "epoch": 0.31405784644905954, + "grad_norm": 0.34398168325424194, + "learning_rate": 0.00048197010891472665, + "loss": 4.8922, + "step": 982 + }, + { + "epoch": 0.31437766095664516, + "grad_norm": 0.3618887662887573, + "learning_rate": 0.00048172288055946033, + "loss": 4.9811, + "step": 983 + }, + { + "epoch": 0.31469747546423077, + "grad_norm": 0.3518047332763672, + "learning_rate": 0.0004814754571070442, + "loss": 4.9583, + "step": 984 + }, + { + "epoch": 0.3150172899718163, + "grad_norm": 0.383455753326416, + "learning_rate": 0.00048122783882311126, + "loss": 4.9536, + "step": 985 + }, + { + "epoch": 0.31533710447940194, + "grad_norm": 0.38077494502067566, + "learning_rate": 0.0004809800259735038, + "loss": 4.9451, + "step": 986 + }, + { + "epoch": 0.31565691898698756, + "grad_norm": 0.3760354220867157, + "learning_rate": 0.0004807320188242728, + "loss": 4.8219, + "step": 987 + }, + { + "epoch": 0.31597673349457317, + "grad_norm": 0.3830978274345398, + "learning_rate": 0.0004804838176416782, + "loss": 4.8887, + "step": 988 + }, + { + "epoch": 0.31629654800215873, + "grad_norm": 0.40143024921417236, + "learning_rate": 0.000480235422692188, + "loss": 4.9925, + "step": 989 + }, + { + "epoch": 0.31661636250974434, + "grad_norm": 0.3880362808704376, + "learning_rate": 0.0004799868342424784, + "loss": 4.8642, + "step": 990 + }, + { + "epoch": 0.31693617701732996, + "grad_norm": 0.36687639355659485, + "learning_rate": 0.00047973805255943305, + "loss": 4.9327, + "step": 991 + }, + { + "epoch": 0.31725599152491557, + "grad_norm": 0.3998223841190338, + "learning_rate": 0.0004794890779101434, + "loss": 4.9667, + "step": 992 + }, + { + "epoch": 0.31757580603250113, + "grad_norm": 0.3674963712692261, + "learning_rate": 0.0004792399105619077, + "loss": 4.9583, + "step": 993 + }, + { + "epoch": 0.31789562054008674, + "grad_norm": 0.45018628239631653, + "learning_rate": 0.0004789905507822314, + "loss": 4.8859, + "step": 994 + }, + { + "epoch": 0.31821543504767236, + "grad_norm": 0.3655175268650055, + "learning_rate": 0.00047874099883882644, + "loss": 4.9264, + "step": 995 + }, + { + "epoch": 0.31853524955525797, + "grad_norm": 0.36511993408203125, + "learning_rate": 0.000478491254999611, + "loss": 4.8902, + "step": 996 + }, + { + "epoch": 0.31885506406284353, + "grad_norm": 0.3576342761516571, + "learning_rate": 0.0004782413195327094, + "loss": 4.937, + "step": 997 + }, + { + "epoch": 0.31917487857042914, + "grad_norm": 0.3686801791191101, + "learning_rate": 0.0004779911927064516, + "loss": 4.8042, + "step": 998 + }, + { + "epoch": 0.31949469307801476, + "grad_norm": 0.3380935490131378, + "learning_rate": 0.000477740874789373, + "loss": 4.9013, + "step": 999 + }, + { + "epoch": 0.31981450758560037, + "grad_norm": 0.36893823742866516, + "learning_rate": 0.0004774903660502142, + "loss": 4.8851, + "step": 1000 + }, + { + "epoch": 0.31981450758560037, + "eval_loss": 4.902451992034912, + "eval_runtime": 79.3732, + "eval_samples_per_second": 23.9, + "eval_steps_per_second": 5.984, + "step": 1000 + }, + { + "epoch": 0.32013432209318593, + "grad_norm": 0.37070271372795105, + "learning_rate": 0.0004772396667579205, + "loss": 4.962, + "step": 1001 + }, + { + "epoch": 0.32045413660077154, + "grad_norm": 0.3702748417854309, + "learning_rate": 0.0004769887771816422, + "loss": 4.8628, + "step": 1002 + }, + { + "epoch": 0.32077395110835716, + "grad_norm": 0.39788708090782166, + "learning_rate": 0.0004767376975907334, + "loss": 4.9365, + "step": 1003 + }, + { + "epoch": 0.32109376561594277, + "grad_norm": 0.36875808238983154, + "learning_rate": 0.00047648642825475255, + "loss": 4.8822, + "step": 1004 + }, + { + "epoch": 0.32141358012352833, + "grad_norm": 0.356067419052124, + "learning_rate": 0.0004762349694434615, + "loss": 4.8964, + "step": 1005 + }, + { + "epoch": 0.32173339463111394, + "grad_norm": 0.37602582573890686, + "learning_rate": 0.0004759833214268259, + "loss": 4.8708, + "step": 1006 + }, + { + "epoch": 0.32205320913869956, + "grad_norm": 0.36407309770584106, + "learning_rate": 0.0004757314844750141, + "loss": 4.854, + "step": 1007 + }, + { + "epoch": 0.3223730236462852, + "grad_norm": 0.35537827014923096, + "learning_rate": 0.00047547945885839763, + "loss": 4.9622, + "step": 1008 + }, + { + "epoch": 0.32269283815387073, + "grad_norm": 4.673837661743164, + "learning_rate": 0.00047522724484755054, + "loss": 4.937, + "step": 1009 + }, + { + "epoch": 0.32301265266145635, + "grad_norm": 0.4190693795681, + "learning_rate": 0.0004749748427132488, + "loss": 4.8952, + "step": 1010 + }, + { + "epoch": 0.32333246716904196, + "grad_norm": 0.3714112639427185, + "learning_rate": 0.00047472225272647084, + "loss": 4.9616, + "step": 1011 + }, + { + "epoch": 0.3236522816766276, + "grad_norm": 0.3778044283390045, + "learning_rate": 0.00047446947515839634, + "loss": 4.9127, + "step": 1012 + }, + { + "epoch": 0.32397209618421313, + "grad_norm": 0.36721158027648926, + "learning_rate": 0.0004742165102804067, + "loss": 4.9322, + "step": 1013 + }, + { + "epoch": 0.32429191069179875, + "grad_norm": 0.3614025413990021, + "learning_rate": 0.00047396335836408427, + "loss": 4.9971, + "step": 1014 + }, + { + "epoch": 0.32461172519938436, + "grad_norm": 0.3511156737804413, + "learning_rate": 0.0004737100196812121, + "loss": 4.8961, + "step": 1015 + }, + { + "epoch": 0.32493153970697, + "grad_norm": 0.3765702247619629, + "learning_rate": 0.00047345649450377395, + "loss": 4.8508, + "step": 1016 + }, + { + "epoch": 0.32525135421455553, + "grad_norm": 0.3439734876155853, + "learning_rate": 0.0004732027831039536, + "loss": 4.7858, + "step": 1017 + }, + { + "epoch": 0.32557116872214115, + "grad_norm": 0.3662970960140228, + "learning_rate": 0.00047294888575413486, + "loss": 4.9161, + "step": 1018 + }, + { + "epoch": 0.32589098322972676, + "grad_norm": 0.3613886535167694, + "learning_rate": 0.0004726948027269013, + "loss": 4.9031, + "step": 1019 + }, + { + "epoch": 0.3262107977373124, + "grad_norm": 0.38528895378112793, + "learning_rate": 0.00047244053429503565, + "loss": 4.9852, + "step": 1020 + }, + { + "epoch": 0.32653061224489793, + "grad_norm": 0.3607567846775055, + "learning_rate": 0.00047218608073151976, + "loss": 4.8439, + "step": 1021 + }, + { + "epoch": 0.32685042675248355, + "grad_norm": 0.36917412281036377, + "learning_rate": 0.0004719314423095342, + "loss": 4.9332, + "step": 1022 + }, + { + "epoch": 0.32717024126006916, + "grad_norm": 0.3716620206832886, + "learning_rate": 0.00047167661930245803, + "loss": 4.805, + "step": 1023 + }, + { + "epoch": 0.3274900557676548, + "grad_norm": 0.3702612519264221, + "learning_rate": 0.0004714216119838685, + "loss": 4.8916, + "step": 1024 + }, + { + "epoch": 0.32780987027524033, + "grad_norm": 0.3598913252353668, + "learning_rate": 0.00047116642062754074, + "loss": 4.8999, + "step": 1025 + }, + { + "epoch": 0.32812968478282595, + "grad_norm": 0.34733474254608154, + "learning_rate": 0.00047091104550744733, + "loss": 4.8999, + "step": 1026 + }, + { + "epoch": 0.32844949929041156, + "grad_norm": 0.34953516721725464, + "learning_rate": 0.00047065548689775844, + "loss": 4.844, + "step": 1027 + }, + { + "epoch": 0.3287693137979972, + "grad_norm": 0.37234926223754883, + "learning_rate": 0.00047039974507284086, + "loss": 4.9322, + "step": 1028 + }, + { + "epoch": 0.32908912830558273, + "grad_norm": 0.35275664925575256, + "learning_rate": 0.0004701438203072584, + "loss": 4.8971, + "step": 1029 + }, + { + "epoch": 0.32940894281316835, + "grad_norm": 0.365399569272995, + "learning_rate": 0.00046988771287577105, + "loss": 4.9074, + "step": 1030 + }, + { + "epoch": 0.32972875732075396, + "grad_norm": 0.345048725605011, + "learning_rate": 0.0004696314230533349, + "loss": 4.8695, + "step": 1031 + }, + { + "epoch": 0.3300485718283396, + "grad_norm": 0.33528172969818115, + "learning_rate": 0.00046937495111510204, + "loss": 4.8512, + "step": 1032 + }, + { + "epoch": 0.3303683863359252, + "grad_norm": 0.3688417375087738, + "learning_rate": 0.00046911829733642016, + "loss": 4.8542, + "step": 1033 + }, + { + "epoch": 0.33068820084351075, + "grad_norm": 0.3624173104763031, + "learning_rate": 0.0004688614619928318, + "loss": 4.9194, + "step": 1034 + }, + { + "epoch": 0.33100801535109636, + "grad_norm": 0.35962924361228943, + "learning_rate": 0.00046860444536007473, + "loss": 4.9694, + "step": 1035 + }, + { + "epoch": 0.331327829858682, + "grad_norm": 0.36196452379226685, + "learning_rate": 0.0004683472477140811, + "loss": 4.911, + "step": 1036 + }, + { + "epoch": 0.3316476443662676, + "grad_norm": 0.347702294588089, + "learning_rate": 0.0004680898693309777, + "loss": 4.859, + "step": 1037 + }, + { + "epoch": 0.33196745887385315, + "grad_norm": 0.36738497018814087, + "learning_rate": 0.0004678323104870852, + "loss": 4.9272, + "step": 1038 + }, + { + "epoch": 0.33228727338143876, + "grad_norm": 0.3768245577812195, + "learning_rate": 0.00046757457145891784, + "loss": 4.812, + "step": 1039 + }, + { + "epoch": 0.3326070878890244, + "grad_norm": 0.36470353603363037, + "learning_rate": 0.00046731665252318366, + "loss": 4.871, + "step": 1040 + }, + { + "epoch": 0.33292690239661, + "grad_norm": 0.388289213180542, + "learning_rate": 0.0004670585539567837, + "loss": 4.868, + "step": 1041 + }, + { + "epoch": 0.33324671690419555, + "grad_norm": 0.35525208711624146, + "learning_rate": 0.00046680027603681164, + "loss": 4.8379, + "step": 1042 + }, + { + "epoch": 0.33356653141178116, + "grad_norm": 0.4119868874549866, + "learning_rate": 0.0004665418190405541, + "loss": 4.8108, + "step": 1043 + }, + { + "epoch": 0.3338863459193668, + "grad_norm": 0.35186243057250977, + "learning_rate": 0.0004662831832454895, + "loss": 4.9152, + "step": 1044 + }, + { + "epoch": 0.3342061604269524, + "grad_norm": 0.3671918511390686, + "learning_rate": 0.00046602436892928875, + "loss": 4.8589, + "step": 1045 + }, + { + "epoch": 0.33452597493453795, + "grad_norm": 0.34154975414276123, + "learning_rate": 0.000465765376369814, + "loss": 4.8987, + "step": 1046 + }, + { + "epoch": 0.33484578944212356, + "grad_norm": 0.3686893582344055, + "learning_rate": 0.00046550620584511886, + "loss": 4.8835, + "step": 1047 + }, + { + "epoch": 0.3351656039497092, + "grad_norm": 0.3374258279800415, + "learning_rate": 0.00046524685763344803, + "loss": 4.8285, + "step": 1048 + }, + { + "epoch": 0.3354854184572948, + "grad_norm": 0.3705580532550812, + "learning_rate": 0.00046498733201323715, + "loss": 4.84, + "step": 1049 + }, + { + "epoch": 0.33580523296488035, + "grad_norm": 0.36040836572647095, + "learning_rate": 0.000464727629263112, + "loss": 4.8931, + "step": 1050 + }, + { + "epoch": 0.33612504747246597, + "grad_norm": 0.38583824038505554, + "learning_rate": 0.0004644677496618887, + "loss": 4.8973, + "step": 1051 + }, + { + "epoch": 0.3364448619800516, + "grad_norm": 0.36153537034988403, + "learning_rate": 0.00046420769348857343, + "loss": 4.8919, + "step": 1052 + }, + { + "epoch": 0.3367646764876372, + "grad_norm": 0.3621228039264679, + "learning_rate": 0.00046394746102236144, + "loss": 4.8117, + "step": 1053 + }, + { + "epoch": 0.33708449099522275, + "grad_norm": 0.41207414865493774, + "learning_rate": 0.00046368705254263773, + "loss": 4.853, + "step": 1054 + }, + { + "epoch": 0.33740430550280837, + "grad_norm": 0.37452879548072815, + "learning_rate": 0.000463426468328976, + "loss": 4.839, + "step": 1055 + }, + { + "epoch": 0.337724120010394, + "grad_norm": 0.3425996005535126, + "learning_rate": 0.0004631657086611387, + "loss": 4.8584, + "step": 1056 + }, + { + "epoch": 0.3380439345179796, + "grad_norm": 0.36205095052719116, + "learning_rate": 0.0004629047738190767, + "loss": 4.8762, + "step": 1057 + }, + { + "epoch": 0.33836374902556515, + "grad_norm": 0.3805515170097351, + "learning_rate": 0.00046264366408292883, + "loss": 4.8817, + "step": 1058 + }, + { + "epoch": 0.33868356353315077, + "grad_norm": 0.3517549932003021, + "learning_rate": 0.0004623823797330216, + "loss": 4.9096, + "step": 1059 + }, + { + "epoch": 0.3390033780407364, + "grad_norm": 0.35046592354774475, + "learning_rate": 0.00046212092104986946, + "loss": 4.9018, + "step": 1060 + }, + { + "epoch": 0.339323192548322, + "grad_norm": 0.3718402087688446, + "learning_rate": 0.0004618592883141734, + "loss": 4.8226, + "step": 1061 + }, + { + "epoch": 0.33964300705590755, + "grad_norm": 0.4067389965057373, + "learning_rate": 0.00046159748180682166, + "loss": 4.8947, + "step": 1062 + }, + { + "epoch": 0.33996282156349317, + "grad_norm": 0.3579217791557312, + "learning_rate": 0.0004613355018088889, + "loss": 4.7653, + "step": 1063 + }, + { + "epoch": 0.3402826360710788, + "grad_norm": 0.3520847260951996, + "learning_rate": 0.00046107334860163605, + "loss": 4.8847, + "step": 1064 + }, + { + "epoch": 0.3406024505786644, + "grad_norm": 0.37844428420066833, + "learning_rate": 0.00046081102246651014, + "loss": 4.8943, + "step": 1065 + }, + { + "epoch": 0.34092226508624995, + "grad_norm": 0.343375027179718, + "learning_rate": 0.0004605485236851436, + "loss": 4.8446, + "step": 1066 + }, + { + "epoch": 0.34124207959383557, + "grad_norm": 0.3797420263290405, + "learning_rate": 0.0004602858525393544, + "loss": 4.7879, + "step": 1067 + }, + { + "epoch": 0.3415618941014212, + "grad_norm": 0.3718990385532379, + "learning_rate": 0.00046002300931114555, + "loss": 4.8804, + "step": 1068 + }, + { + "epoch": 0.3418817086090068, + "grad_norm": 0.35296258330345154, + "learning_rate": 0.0004597599942827048, + "loss": 4.8964, + "step": 1069 + }, + { + "epoch": 0.34220152311659235, + "grad_norm": 0.3609677851200104, + "learning_rate": 0.0004594968077364041, + "loss": 4.9593, + "step": 1070 + }, + { + "epoch": 0.34252133762417797, + "grad_norm": 0.354596346616745, + "learning_rate": 0.00045923344995480006, + "loss": 4.8428, + "step": 1071 + }, + { + "epoch": 0.3428411521317636, + "grad_norm": 0.357670396566391, + "learning_rate": 0.0004589699212206325, + "loss": 4.9326, + "step": 1072 + }, + { + "epoch": 0.3431609666393492, + "grad_norm": 0.3854842185974121, + "learning_rate": 0.0004587062218168253, + "loss": 4.9109, + "step": 1073 + }, + { + "epoch": 0.34348078114693475, + "grad_norm": 0.3449469804763794, + "learning_rate": 0.0004584423520264853, + "loss": 4.9041, + "step": 1074 + }, + { + "epoch": 0.34380059565452037, + "grad_norm": 0.3718896508216858, + "learning_rate": 0.0004581783121329024, + "loss": 4.8633, + "step": 1075 + }, + { + "epoch": 0.344120410162106, + "grad_norm": 0.3840758502483368, + "learning_rate": 0.00045791410241954894, + "loss": 4.7397, + "step": 1076 + }, + { + "epoch": 0.3444402246696916, + "grad_norm": 0.3534940183162689, + "learning_rate": 0.0004576497231700798, + "loss": 4.8555, + "step": 1077 + }, + { + "epoch": 0.34476003917727716, + "grad_norm": 0.3727824091911316, + "learning_rate": 0.0004573851746683317, + "loss": 4.8823, + "step": 1078 + }, + { + "epoch": 0.34507985368486277, + "grad_norm": 0.345478355884552, + "learning_rate": 0.00045712045719832313, + "loss": 4.7997, + "step": 1079 + }, + { + "epoch": 0.3453996681924484, + "grad_norm": 0.3740197420120239, + "learning_rate": 0.00045685557104425397, + "loss": 4.9766, + "step": 1080 + }, + { + "epoch": 0.345719482700034, + "grad_norm": 0.36381879448890686, + "learning_rate": 0.00045659051649050525, + "loss": 4.8279, + "step": 1081 + }, + { + "epoch": 0.34603929720761956, + "grad_norm": 0.35513120889663696, + "learning_rate": 0.00045632529382163883, + "loss": 4.822, + "step": 1082 + }, + { + "epoch": 0.34635911171520517, + "grad_norm": 0.3540807068347931, + "learning_rate": 0.00045605990332239684, + "loss": 4.8205, + "step": 1083 + }, + { + "epoch": 0.3466789262227908, + "grad_norm": 0.3478928804397583, + "learning_rate": 0.00045579434527770186, + "loss": 4.8185, + "step": 1084 + }, + { + "epoch": 0.3469987407303764, + "grad_norm": 0.3550429344177246, + "learning_rate": 0.0004555286199726561, + "loss": 4.795, + "step": 1085 + }, + { + "epoch": 0.34731855523796196, + "grad_norm": 0.35866010189056396, + "learning_rate": 0.0004552627276925416, + "loss": 4.8276, + "step": 1086 + }, + { + "epoch": 0.34763836974554757, + "grad_norm": 0.35746774077415466, + "learning_rate": 0.0004549966687228195, + "loss": 4.8081, + "step": 1087 + }, + { + "epoch": 0.3479581842531332, + "grad_norm": 0.3648238778114319, + "learning_rate": 0.0004547304433491299, + "loss": 4.8687, + "step": 1088 + }, + { + "epoch": 0.3482779987607188, + "grad_norm": 0.3464508354663849, + "learning_rate": 0.00045446405185729154, + "loss": 4.8114, + "step": 1089 + }, + { + "epoch": 0.34859781326830436, + "grad_norm": 0.38740813732147217, + "learning_rate": 0.00045419749453330167, + "loss": 4.827, + "step": 1090 + }, + { + "epoch": 0.34891762777588997, + "grad_norm": 0.36972126364707947, + "learning_rate": 0.00045393077166333524, + "loss": 4.8716, + "step": 1091 + }, + { + "epoch": 0.3492374422834756, + "grad_norm": 0.3767491579055786, + "learning_rate": 0.0004536638835337452, + "loss": 4.8603, + "step": 1092 + }, + { + "epoch": 0.3495572567910612, + "grad_norm": 0.3509376347064972, + "learning_rate": 0.00045339683043106214, + "loss": 4.8471, + "step": 1093 + }, + { + "epoch": 0.34987707129864676, + "grad_norm": 0.34784337878227234, + "learning_rate": 0.00045312961264199316, + "loss": 4.7883, + "step": 1094 + }, + { + "epoch": 0.35019688580623237, + "grad_norm": 0.35170140862464905, + "learning_rate": 0.0004528622304534225, + "loss": 4.8756, + "step": 1095 + }, + { + "epoch": 0.350516700313818, + "grad_norm": 0.33361902832984924, + "learning_rate": 0.00045259468415241117, + "loss": 4.7779, + "step": 1096 + }, + { + "epoch": 0.3508365148214036, + "grad_norm": 0.38126787543296814, + "learning_rate": 0.0004523269740261957, + "loss": 4.8836, + "step": 1097 + }, + { + "epoch": 0.35115632932898916, + "grad_norm": 0.34420838952064514, + "learning_rate": 0.0004520591003621892, + "loss": 4.793, + "step": 1098 + }, + { + "epoch": 0.3514761438365748, + "grad_norm": 0.3590407371520996, + "learning_rate": 0.00045179106344798005, + "loss": 4.863, + "step": 1099 + }, + { + "epoch": 0.3517959583441604, + "grad_norm": 0.3648085594177246, + "learning_rate": 0.00045152286357133157, + "loss": 4.8578, + "step": 1100 + }, + { + "epoch": 0.3517959583441604, + "eval_loss": 4.842448711395264, + "eval_runtime": 78.0099, + "eval_samples_per_second": 24.317, + "eval_steps_per_second": 6.089, + "step": 1100 + }, + { + "epoch": 0.352115772851746, + "grad_norm": 0.47429999709129333, + "learning_rate": 0.0004512545010201828, + "loss": 4.77, + "step": 1101 + }, + { + "epoch": 0.3524355873593316, + "grad_norm": 0.36832159757614136, + "learning_rate": 0.0004509859760826466, + "loss": 4.756, + "step": 1102 + }, + { + "epoch": 0.3527554018669172, + "grad_norm": 0.34895962476730347, + "learning_rate": 0.0004507172890470108, + "loss": 4.8125, + "step": 1103 + }, + { + "epoch": 0.3530752163745028, + "grad_norm": 0.34782490134239197, + "learning_rate": 0.000450448440201737, + "loss": 4.7803, + "step": 1104 + }, + { + "epoch": 0.3533950308820884, + "grad_norm": 0.36187058687210083, + "learning_rate": 0.0004501794298354603, + "loss": 4.8202, + "step": 1105 + }, + { + "epoch": 0.353714845389674, + "grad_norm": 0.3739657998085022, + "learning_rate": 0.0004499102582369897, + "loss": 4.7097, + "step": 1106 + }, + { + "epoch": 0.3540346598972596, + "grad_norm": 0.36065033078193665, + "learning_rate": 0.0004496409256953069, + "loss": 4.9222, + "step": 1107 + }, + { + "epoch": 0.3543544744048452, + "grad_norm": 0.3667559027671814, + "learning_rate": 0.0004493714324995666, + "loss": 4.894, + "step": 1108 + }, + { + "epoch": 0.3546742889124308, + "grad_norm": 0.390155166387558, + "learning_rate": 0.00044910177893909577, + "loss": 4.9217, + "step": 1109 + }, + { + "epoch": 0.3549941034200164, + "grad_norm": 0.3407348394393921, + "learning_rate": 0.00044883196530339376, + "loss": 4.8408, + "step": 1110 + }, + { + "epoch": 0.355313917927602, + "grad_norm": 0.3738819360733032, + "learning_rate": 0.0004485619918821318, + "loss": 4.8303, + "step": 1111 + }, + { + "epoch": 0.3556337324351876, + "grad_norm": 0.3555871248245239, + "learning_rate": 0.00044829185896515245, + "loss": 4.933, + "step": 1112 + }, + { + "epoch": 0.3559535469427732, + "grad_norm": 0.3620026409626007, + "learning_rate": 0.0004480215668424696, + "loss": 4.8471, + "step": 1113 + }, + { + "epoch": 0.3562733614503588, + "grad_norm": 0.34683966636657715, + "learning_rate": 0.00044775111580426817, + "loss": 4.832, + "step": 1114 + }, + { + "epoch": 0.3565931759579444, + "grad_norm": 0.3447222411632538, + "learning_rate": 0.00044748050614090343, + "loss": 4.8587, + "step": 1115 + }, + { + "epoch": 0.35691299046553, + "grad_norm": 0.35056382417678833, + "learning_rate": 0.00044720973814290125, + "loss": 4.8076, + "step": 1116 + }, + { + "epoch": 0.3572328049731156, + "grad_norm": 0.3493582308292389, + "learning_rate": 0.0004469388121009574, + "loss": 4.8278, + "step": 1117 + }, + { + "epoch": 0.3575526194807012, + "grad_norm": 0.36693739891052246, + "learning_rate": 0.00044666772830593714, + "loss": 4.8642, + "step": 1118 + }, + { + "epoch": 0.3578724339882868, + "grad_norm": 0.34915661811828613, + "learning_rate": 0.00044639648704887535, + "loss": 4.8292, + "step": 1119 + }, + { + "epoch": 0.3581922484958724, + "grad_norm": 0.3442689776420593, + "learning_rate": 0.00044612508862097575, + "loss": 4.8267, + "step": 1120 + }, + { + "epoch": 0.358512063003458, + "grad_norm": 0.37427109479904175, + "learning_rate": 0.00044585353331361095, + "loss": 4.8233, + "step": 1121 + }, + { + "epoch": 0.3588318775110436, + "grad_norm": 0.3511858582496643, + "learning_rate": 0.000445581821418322, + "loss": 4.8552, + "step": 1122 + }, + { + "epoch": 0.3591516920186292, + "grad_norm": 0.36977121233940125, + "learning_rate": 0.0004453099532268178, + "loss": 4.8884, + "step": 1123 + }, + { + "epoch": 0.3594715065262148, + "grad_norm": 0.35378748178482056, + "learning_rate": 0.0004450379290309755, + "loss": 4.699, + "step": 1124 + }, + { + "epoch": 0.3597913210338004, + "grad_norm": 0.36219707131385803, + "learning_rate": 0.0004447657491228392, + "loss": 4.7922, + "step": 1125 + }, + { + "epoch": 0.360111135541386, + "grad_norm": 0.36497360467910767, + "learning_rate": 0.0004444934137946207, + "loss": 4.8067, + "step": 1126 + }, + { + "epoch": 0.3604309500489716, + "grad_norm": 0.35565951466560364, + "learning_rate": 0.00044422092333869814, + "loss": 4.8199, + "step": 1127 + }, + { + "epoch": 0.3607507645565572, + "grad_norm": 0.3564634621143341, + "learning_rate": 0.00044394827804761667, + "loss": 4.8902, + "step": 1128 + }, + { + "epoch": 0.3610705790641428, + "grad_norm": 0.3579818308353424, + "learning_rate": 0.0004436754782140875, + "loss": 4.7855, + "step": 1129 + }, + { + "epoch": 0.3613903935717284, + "grad_norm": 0.3743307888507843, + "learning_rate": 0.0004434025241309876, + "loss": 4.8971, + "step": 1130 + }, + { + "epoch": 0.361710208079314, + "grad_norm": 0.3698958158493042, + "learning_rate": 0.0004431294160913597, + "loss": 4.7787, + "step": 1131 + }, + { + "epoch": 0.3620300225868996, + "grad_norm": 0.37127307057380676, + "learning_rate": 0.0004428561543884118, + "loss": 4.6915, + "step": 1132 + }, + { + "epoch": 0.3623498370944852, + "grad_norm": 0.36235642433166504, + "learning_rate": 0.0004425827393155169, + "loss": 4.8175, + "step": 1133 + }, + { + "epoch": 0.3626696516020708, + "grad_norm": 0.3819771707057953, + "learning_rate": 0.00044230917116621266, + "loss": 4.7539, + "step": 1134 + }, + { + "epoch": 0.3629894661096564, + "grad_norm": 0.35926157236099243, + "learning_rate": 0.00044203545023420085, + "loss": 4.9012, + "step": 1135 + }, + { + "epoch": 0.363309280617242, + "grad_norm": 0.37980303168296814, + "learning_rate": 0.00044176157681334767, + "loss": 4.7817, + "step": 1136 + }, + { + "epoch": 0.3636290951248276, + "grad_norm": 0.35066235065460205, + "learning_rate": 0.0004414875511976827, + "loss": 4.7868, + "step": 1137 + }, + { + "epoch": 0.3639489096324132, + "grad_norm": 0.36465728282928467, + "learning_rate": 0.00044121337368139906, + "loss": 4.8841, + "step": 1138 + }, + { + "epoch": 0.3642687241399988, + "grad_norm": 0.3646154999732971, + "learning_rate": 0.0004409390445588528, + "loss": 4.8033, + "step": 1139 + }, + { + "epoch": 0.3645885386475844, + "grad_norm": 0.3409954309463501, + "learning_rate": 0.0004406645641245631, + "loss": 4.8258, + "step": 1140 + }, + { + "epoch": 0.36490835315517, + "grad_norm": 0.35481762886047363, + "learning_rate": 0.0004403899326732112, + "loss": 4.7841, + "step": 1141 + }, + { + "epoch": 0.3652281676627556, + "grad_norm": 0.35458990931510925, + "learning_rate": 0.00044011515049964073, + "loss": 4.8184, + "step": 1142 + }, + { + "epoch": 0.3655479821703412, + "grad_norm": 0.4338507354259491, + "learning_rate": 0.0004398402178988568, + "loss": 4.8885, + "step": 1143 + }, + { + "epoch": 0.3658677966779268, + "grad_norm": 0.36764827370643616, + "learning_rate": 0.00043956513516602653, + "loss": 4.8525, + "step": 1144 + }, + { + "epoch": 0.3661876111855124, + "grad_norm": 0.3740900158882141, + "learning_rate": 0.00043928990259647764, + "loss": 4.7446, + "step": 1145 + }, + { + "epoch": 0.366507425693098, + "grad_norm": 0.3610120117664337, + "learning_rate": 0.00043901452048569913, + "loss": 4.7707, + "step": 1146 + }, + { + "epoch": 0.3668272402006836, + "grad_norm": 0.37914660573005676, + "learning_rate": 0.00043873898912934054, + "loss": 4.9002, + "step": 1147 + }, + { + "epoch": 0.3671470547082692, + "grad_norm": 0.37821099162101746, + "learning_rate": 0.00043846330882321146, + "loss": 4.8313, + "step": 1148 + }, + { + "epoch": 0.3674668692158548, + "grad_norm": 0.3644234240055084, + "learning_rate": 0.00043818747986328136, + "loss": 4.7384, + "step": 1149 + }, + { + "epoch": 0.3677866837234404, + "grad_norm": 0.3613603413105011, + "learning_rate": 0.0004379115025456795, + "loss": 4.8134, + "step": 1150 + }, + { + "epoch": 0.368106498231026, + "grad_norm": 0.3736517131328583, + "learning_rate": 0.0004376353771666942, + "loss": 4.8469, + "step": 1151 + }, + { + "epoch": 0.3684263127386116, + "grad_norm": 0.37014541029930115, + "learning_rate": 0.000437359104022773, + "loss": 4.8588, + "step": 1152 + }, + { + "epoch": 0.3687461272461972, + "grad_norm": 0.3674084544181824, + "learning_rate": 0.00043708268341052185, + "loss": 4.771, + "step": 1153 + }, + { + "epoch": 0.3690659417537828, + "grad_norm": 0.35765981674194336, + "learning_rate": 0.00043680611562670513, + "loss": 4.7784, + "step": 1154 + }, + { + "epoch": 0.3693857562613684, + "grad_norm": 0.3745180368423462, + "learning_rate": 0.00043652940096824516, + "loss": 4.8395, + "step": 1155 + }, + { + "epoch": 0.369705570768954, + "grad_norm": 0.3918968439102173, + "learning_rate": 0.00043625253973222206, + "loss": 4.783, + "step": 1156 + }, + { + "epoch": 0.3700253852765396, + "grad_norm": 0.34982675313949585, + "learning_rate": 0.00043597553221587316, + "loss": 4.8801, + "step": 1157 + }, + { + "epoch": 0.3703451997841252, + "grad_norm": 0.3849962055683136, + "learning_rate": 0.00043569837871659296, + "loss": 4.7992, + "step": 1158 + }, + { + "epoch": 0.3706650142917108, + "grad_norm": 0.35203996300697327, + "learning_rate": 0.0004354210795319327, + "loss": 4.87, + "step": 1159 + }, + { + "epoch": 0.3709848287992964, + "grad_norm": 0.3672322928905487, + "learning_rate": 0.00043514363495959985, + "loss": 4.7667, + "step": 1160 + }, + { + "epoch": 0.371304643306882, + "grad_norm": 0.35468590259552, + "learning_rate": 0.0004348660452974581, + "loss": 4.7612, + "step": 1161 + }, + { + "epoch": 0.3716244578144676, + "grad_norm": 0.3686932325363159, + "learning_rate": 0.00043458831084352705, + "loss": 4.8025, + "step": 1162 + }, + { + "epoch": 0.3719442723220532, + "grad_norm": 0.3381997346878052, + "learning_rate": 0.00043431043189598125, + "loss": 4.8255, + "step": 1163 + }, + { + "epoch": 0.3722640868296388, + "grad_norm": 0.3542218506336212, + "learning_rate": 0.0004340324087531511, + "loss": 4.8247, + "step": 1164 + }, + { + "epoch": 0.3725839013372244, + "grad_norm": 0.3275507390499115, + "learning_rate": 0.00043375424171352133, + "loss": 4.7272, + "step": 1165 + }, + { + "epoch": 0.37290371584481, + "grad_norm": 0.3618198037147522, + "learning_rate": 0.00043347593107573106, + "loss": 4.8289, + "step": 1166 + }, + { + "epoch": 0.37322353035239564, + "grad_norm": 0.3532826006412506, + "learning_rate": 0.000433197477138574, + "loss": 4.8198, + "step": 1167 + }, + { + "epoch": 0.3735433448599812, + "grad_norm": 0.35923513770103455, + "learning_rate": 0.00043291888020099723, + "loss": 4.7377, + "step": 1168 + }, + { + "epoch": 0.3738631593675668, + "grad_norm": 0.34886351227760315, + "learning_rate": 0.0004326401405621019, + "loss": 4.8268, + "step": 1169 + }, + { + "epoch": 0.3741829738751524, + "grad_norm": 0.3776870369911194, + "learning_rate": 0.0004323612585211419, + "loss": 4.8158, + "step": 1170 + }, + { + "epoch": 0.37450278838273804, + "grad_norm": 0.35528770089149475, + "learning_rate": 0.0004320822343775242, + "loss": 4.9009, + "step": 1171 + }, + { + "epoch": 0.3748226028903236, + "grad_norm": 0.38312670588493347, + "learning_rate": 0.00043180306843080836, + "loss": 4.8545, + "step": 1172 + }, + { + "epoch": 0.3751424173979092, + "grad_norm": 0.40025246143341064, + "learning_rate": 0.0004315237609807059, + "loss": 4.881, + "step": 1173 + }, + { + "epoch": 0.3754622319054948, + "grad_norm": 0.35279229283332825, + "learning_rate": 0.00043124431232708076, + "loss": 4.7822, + "step": 1174 + }, + { + "epoch": 0.37578204641308044, + "grad_norm": 0.3803034722805023, + "learning_rate": 0.000430964722769948, + "loss": 4.7747, + "step": 1175 + }, + { + "epoch": 0.376101860920666, + "grad_norm": 0.3923249840736389, + "learning_rate": 0.0004306849926094742, + "loss": 4.7027, + "step": 1176 + }, + { + "epoch": 0.3764216754282516, + "grad_norm": 0.3511454164981842, + "learning_rate": 0.00043040512214597684, + "loss": 4.7863, + "step": 1177 + }, + { + "epoch": 0.3767414899358372, + "grad_norm": 0.34750327467918396, + "learning_rate": 0.00043012511167992405, + "loss": 4.7971, + "step": 1178 + }, + { + "epoch": 0.37706130444342284, + "grad_norm": 0.3603704273700714, + "learning_rate": 0.0004298449615119343, + "loss": 4.7601, + "step": 1179 + }, + { + "epoch": 0.3773811189510084, + "grad_norm": 0.3695945143699646, + "learning_rate": 0.0004295646719427758, + "loss": 4.8054, + "step": 1180 + }, + { + "epoch": 0.377700933458594, + "grad_norm": 0.3885416090488434, + "learning_rate": 0.00042928424327336667, + "loss": 4.7725, + "step": 1181 + }, + { + "epoch": 0.3780207479661796, + "grad_norm": 0.339851051568985, + "learning_rate": 0.00042900367580477446, + "loss": 4.7132, + "step": 1182 + }, + { + "epoch": 0.37834056247376524, + "grad_norm": 0.35797902941703796, + "learning_rate": 0.0004287229698382154, + "loss": 4.7972, + "step": 1183 + }, + { + "epoch": 0.3786603769813508, + "grad_norm": 0.3440041244029999, + "learning_rate": 0.0004284421256750547, + "loss": 4.8355, + "step": 1184 + }, + { + "epoch": 0.3789801914889364, + "grad_norm": 0.3346846401691437, + "learning_rate": 0.0004281611436168059, + "loss": 4.7913, + "step": 1185 + }, + { + "epoch": 0.379300005996522, + "grad_norm": 0.47180992364883423, + "learning_rate": 0.00042788002396513023, + "loss": 4.7696, + "step": 1186 + }, + { + "epoch": 0.37961982050410764, + "grad_norm": 0.34173375368118286, + "learning_rate": 0.00042759876702183706, + "loss": 4.7845, + "step": 1187 + }, + { + "epoch": 0.3799396350116932, + "grad_norm": 0.3632044494152069, + "learning_rate": 0.0004273173730888831, + "loss": 4.7338, + "step": 1188 + }, + { + "epoch": 0.3802594495192788, + "grad_norm": 0.3603549003601074, + "learning_rate": 0.00042703584246837206, + "loss": 4.8058, + "step": 1189 + }, + { + "epoch": 0.38057926402686443, + "grad_norm": 0.397983193397522, + "learning_rate": 0.0004267541754625543, + "loss": 4.8072, + "step": 1190 + }, + { + "epoch": 0.38089907853445004, + "grad_norm": 0.34411120414733887, + "learning_rate": 0.00042647237237382666, + "loss": 4.7369, + "step": 1191 + }, + { + "epoch": 0.3812188930420356, + "grad_norm": 0.35892680287361145, + "learning_rate": 0.00042619043350473223, + "loss": 4.8927, + "step": 1192 + }, + { + "epoch": 0.3815387075496212, + "grad_norm": 0.35901933908462524, + "learning_rate": 0.0004259083591579596, + "loss": 4.7778, + "step": 1193 + }, + { + "epoch": 0.38185852205720683, + "grad_norm": 0.3576551079750061, + "learning_rate": 0.000425626149636343, + "loss": 4.7477, + "step": 1194 + }, + { + "epoch": 0.38217833656479244, + "grad_norm": 0.3562488257884979, + "learning_rate": 0.0004253438052428619, + "loss": 4.7469, + "step": 1195 + }, + { + "epoch": 0.382498151072378, + "grad_norm": 0.3653848171234131, + "learning_rate": 0.00042506132628064016, + "loss": 4.8627, + "step": 1196 + }, + { + "epoch": 0.3828179655799636, + "grad_norm": 0.36694326996803284, + "learning_rate": 0.00042477871305294655, + "loss": 4.8451, + "step": 1197 + }, + { + "epoch": 0.38313778008754923, + "grad_norm": 0.3641037940979004, + "learning_rate": 0.0004244959658631938, + "loss": 4.7344, + "step": 1198 + }, + { + "epoch": 0.38345759459513484, + "grad_norm": 0.368745893239975, + "learning_rate": 0.00042421308501493823, + "loss": 4.6974, + "step": 1199 + }, + { + "epoch": 0.3837774091027204, + "grad_norm": 0.35546058416366577, + "learning_rate": 0.0004239300708118802, + "loss": 4.7683, + "step": 1200 + }, + { + "epoch": 0.3837774091027204, + "eval_loss": 4.789093494415283, + "eval_runtime": 79.2193, + "eval_samples_per_second": 23.946, + "eval_steps_per_second": 5.996, + "step": 1200 + }, + { + "epoch": 0.384097223610306, + "grad_norm": 0.3464343547821045, + "learning_rate": 0.0004236469235578627, + "loss": 4.7838, + "step": 1201 + }, + { + "epoch": 0.38441703811789163, + "grad_norm": 0.36398279666900635, + "learning_rate": 0.0004233636435568719, + "loss": 4.8208, + "step": 1202 + }, + { + "epoch": 0.38473685262547724, + "grad_norm": 0.359862357378006, + "learning_rate": 0.00042308023111303636, + "loss": 4.7881, + "step": 1203 + }, + { + "epoch": 0.3850566671330628, + "grad_norm": 0.4442932903766632, + "learning_rate": 0.00042279668653062686, + "loss": 4.8383, + "step": 1204 + }, + { + "epoch": 0.3853764816406484, + "grad_norm": 0.3936426043510437, + "learning_rate": 0.0004225130101140559, + "loss": 4.7622, + "step": 1205 + }, + { + "epoch": 0.38569629614823403, + "grad_norm": 0.3605417013168335, + "learning_rate": 0.00042222920216787786, + "loss": 4.7688, + "step": 1206 + }, + { + "epoch": 0.38601611065581964, + "grad_norm": 0.3449859917163849, + "learning_rate": 0.000421945262996788, + "loss": 4.8179, + "step": 1207 + }, + { + "epoch": 0.3863359251634052, + "grad_norm": 0.36619365215301514, + "learning_rate": 0.0004216611929056225, + "loss": 4.7845, + "step": 1208 + }, + { + "epoch": 0.3866557396709908, + "grad_norm": 0.3378742039203644, + "learning_rate": 0.0004213769921993583, + "loss": 4.8207, + "step": 1209 + }, + { + "epoch": 0.38697555417857643, + "grad_norm": 0.375707745552063, + "learning_rate": 0.0004210926611831124, + "loss": 4.6434, + "step": 1210 + }, + { + "epoch": 0.38729536868616204, + "grad_norm": 0.3499867916107178, + "learning_rate": 0.0004208082001621417, + "loss": 4.7915, + "step": 1211 + }, + { + "epoch": 0.3876151831937476, + "grad_norm": 0.36418724060058594, + "learning_rate": 0.0004205236094418428, + "loss": 4.8163, + "step": 1212 + }, + { + "epoch": 0.3879349977013332, + "grad_norm": 0.3566012978553772, + "learning_rate": 0.0004202388893277515, + "loss": 4.7468, + "step": 1213 + }, + { + "epoch": 0.38825481220891883, + "grad_norm": 0.3427860140800476, + "learning_rate": 0.00041995404012554226, + "loss": 4.7259, + "step": 1214 + }, + { + "epoch": 0.38857462671650445, + "grad_norm": 0.36360323429107666, + "learning_rate": 0.0004196690621410285, + "loss": 4.7334, + "step": 1215 + }, + { + "epoch": 0.38889444122409, + "grad_norm": 0.37315964698791504, + "learning_rate": 0.0004193839556801617, + "loss": 4.7683, + "step": 1216 + }, + { + "epoch": 0.3892142557316756, + "grad_norm": 0.348958820104599, + "learning_rate": 0.0004190987210490314, + "loss": 4.8029, + "step": 1217 + }, + { + "epoch": 0.38953407023926123, + "grad_norm": 0.3609643578529358, + "learning_rate": 0.00041881335855386463, + "loss": 4.7958, + "step": 1218 + }, + { + "epoch": 0.38985388474684685, + "grad_norm": 0.36503320932388306, + "learning_rate": 0.00041852786850102557, + "loss": 4.7606, + "step": 1219 + }, + { + "epoch": 0.3901736992544324, + "grad_norm": 0.3636496961116791, + "learning_rate": 0.00041824225119701576, + "loss": 4.8224, + "step": 1220 + }, + { + "epoch": 0.390493513762018, + "grad_norm": 0.7249408960342407, + "learning_rate": 0.0004179565069484729, + "loss": 4.7619, + "step": 1221 + }, + { + "epoch": 0.39081332826960363, + "grad_norm": 0.35281887650489807, + "learning_rate": 0.0004176706360621713, + "loss": 4.7195, + "step": 1222 + }, + { + "epoch": 0.39113314277718925, + "grad_norm": 0.37250128388404846, + "learning_rate": 0.0004173846388450209, + "loss": 4.7523, + "step": 1223 + }, + { + "epoch": 0.3914529572847748, + "grad_norm": 0.3570784628391266, + "learning_rate": 0.0004170985156040677, + "loss": 4.7547, + "step": 1224 + }, + { + "epoch": 0.3917727717923604, + "grad_norm": 0.4446999132633209, + "learning_rate": 0.0004168122666464927, + "loss": 4.7975, + "step": 1225 + }, + { + "epoch": 0.39209258629994603, + "grad_norm": 0.37899062037467957, + "learning_rate": 0.0004165258922796119, + "loss": 4.818, + "step": 1226 + }, + { + "epoch": 0.39241240080753165, + "grad_norm": 0.3786870837211609, + "learning_rate": 0.00041623939281087605, + "loss": 4.757, + "step": 1227 + }, + { + "epoch": 0.3927322153151172, + "grad_norm": 0.4067842662334442, + "learning_rate": 0.00041595276854787007, + "loss": 4.6869, + "step": 1228 + }, + { + "epoch": 0.3930520298227028, + "grad_norm": 0.36138680577278137, + "learning_rate": 0.00041566601979831287, + "loss": 4.7871, + "step": 1229 + }, + { + "epoch": 0.39337184433028843, + "grad_norm": 0.3776302933692932, + "learning_rate": 0.00041537914687005714, + "loss": 4.818, + "step": 1230 + }, + { + "epoch": 0.39369165883787405, + "grad_norm": 0.3752312660217285, + "learning_rate": 0.00041509215007108885, + "loss": 4.8022, + "step": 1231 + }, + { + "epoch": 0.3940114733454596, + "grad_norm": 0.3481239974498749, + "learning_rate": 0.0004148050297095269, + "loss": 4.7868, + "step": 1232 + }, + { + "epoch": 0.3943312878530452, + "grad_norm": 0.3753473460674286, + "learning_rate": 0.00041451778609362286, + "loss": 4.7985, + "step": 1233 + }, + { + "epoch": 0.39465110236063083, + "grad_norm": 0.38862380385398865, + "learning_rate": 0.0004142304195317605, + "loss": 4.7357, + "step": 1234 + }, + { + "epoch": 0.39497091686821645, + "grad_norm": 0.36923497915267944, + "learning_rate": 0.00041394293033245597, + "loss": 4.7071, + "step": 1235 + }, + { + "epoch": 0.39529073137580206, + "grad_norm": 0.3899265229701996, + "learning_rate": 0.00041365531880435647, + "loss": 4.7769, + "step": 1236 + }, + { + "epoch": 0.3956105458833876, + "grad_norm": 0.3533836603164673, + "learning_rate": 0.0004133675852562413, + "loss": 4.8711, + "step": 1237 + }, + { + "epoch": 0.39593036039097323, + "grad_norm": 0.3839608430862427, + "learning_rate": 0.00041307972999702014, + "loss": 4.7194, + "step": 1238 + }, + { + "epoch": 0.39625017489855885, + "grad_norm": 0.4235544800758362, + "learning_rate": 0.00041279175333573345, + "loss": 4.8387, + "step": 1239 + }, + { + "epoch": 0.39656998940614446, + "grad_norm": 0.3789507746696472, + "learning_rate": 0.00041250365558155236, + "loss": 4.8791, + "step": 1240 + }, + { + "epoch": 0.39688980391373, + "grad_norm": 0.4530871510505676, + "learning_rate": 0.0004122154370437776, + "loss": 4.799, + "step": 1241 + }, + { + "epoch": 0.39720961842131564, + "grad_norm": 0.38132578134536743, + "learning_rate": 0.0004119270980318398, + "loss": 4.79, + "step": 1242 + }, + { + "epoch": 0.39752943292890125, + "grad_norm": 0.38641002774238586, + "learning_rate": 0.0004116386388552988, + "loss": 4.8165, + "step": 1243 + }, + { + "epoch": 0.39784924743648686, + "grad_norm": 0.37316030263900757, + "learning_rate": 0.0004113500598238437, + "loss": 4.8027, + "step": 1244 + }, + { + "epoch": 0.3981690619440724, + "grad_norm": 0.34834080934524536, + "learning_rate": 0.000411061361247292, + "loss": 4.793, + "step": 1245 + }, + { + "epoch": 0.39848887645165804, + "grad_norm": 0.36298882961273193, + "learning_rate": 0.00041077254343558955, + "loss": 4.7775, + "step": 1246 + }, + { + "epoch": 0.39880869095924365, + "grad_norm": 0.378531277179718, + "learning_rate": 0.00041048360669881055, + "loss": 4.8365, + "step": 1247 + }, + { + "epoch": 0.39912850546682926, + "grad_norm": 0.3716171383857727, + "learning_rate": 0.0004101945513471563, + "loss": 4.7765, + "step": 1248 + }, + { + "epoch": 0.3994483199744148, + "grad_norm": 0.37697818875312805, + "learning_rate": 0.000409905377690956, + "loss": 4.7976, + "step": 1249 + }, + { + "epoch": 0.39976813448200044, + "grad_norm": 0.37140795588493347, + "learning_rate": 0.00040961608604066566, + "loss": 4.8138, + "step": 1250 + }, + { + "epoch": 0.40008794898958605, + "grad_norm": 0.3761359751224518, + "learning_rate": 0.0004093266767068677, + "loss": 4.7103, + "step": 1251 + }, + { + "epoch": 0.40040776349717166, + "grad_norm": 0.3735464811325073, + "learning_rate": 0.0004090371500002715, + "loss": 4.7285, + "step": 1252 + }, + { + "epoch": 0.4007275780047572, + "grad_norm": 0.36980125308036804, + "learning_rate": 0.00040874750623171176, + "loss": 4.6936, + "step": 1253 + }, + { + "epoch": 0.40104739251234284, + "grad_norm": 0.3474436402320862, + "learning_rate": 0.00040845774571214924, + "loss": 4.7765, + "step": 1254 + }, + { + "epoch": 0.40136720701992845, + "grad_norm": 0.3448978364467621, + "learning_rate": 0.0004081678687526701, + "loss": 4.7407, + "step": 1255 + }, + { + "epoch": 0.40168702152751407, + "grad_norm": 0.3554507791996002, + "learning_rate": 0.0004078778756644854, + "loss": 4.8064, + "step": 1256 + }, + { + "epoch": 0.4020068360350996, + "grad_norm": 0.36760538816452026, + "learning_rate": 0.00040758776675893065, + "loss": 4.7797, + "step": 1257 + }, + { + "epoch": 0.40232665054268524, + "grad_norm": 0.3476412892341614, + "learning_rate": 0.00040729754234746613, + "loss": 4.7432, + "step": 1258 + }, + { + "epoch": 0.40264646505027085, + "grad_norm": 0.3805728256702423, + "learning_rate": 0.0004070072027416758, + "loss": 4.7659, + "step": 1259 + }, + { + "epoch": 0.40296627955785647, + "grad_norm": 0.3726625442504883, + "learning_rate": 0.00040671674825326745, + "loss": 4.7388, + "step": 1260 + }, + { + "epoch": 0.403286094065442, + "grad_norm": 0.3568096160888672, + "learning_rate": 0.0004064261791940723, + "loss": 4.7267, + "step": 1261 + }, + { + "epoch": 0.40360590857302764, + "grad_norm": 0.4026634991168976, + "learning_rate": 0.0004061354958760441, + "loss": 4.8277, + "step": 1262 + }, + { + "epoch": 0.40392572308061325, + "grad_norm": 0.35849010944366455, + "learning_rate": 0.00040584469861126, + "loss": 4.7226, + "step": 1263 + }, + { + "epoch": 0.40424553758819887, + "grad_norm": 0.37187543511390686, + "learning_rate": 0.00040555378771191876, + "loss": 4.7215, + "step": 1264 + }, + { + "epoch": 0.4045653520957844, + "grad_norm": 0.35207071900367737, + "learning_rate": 0.0004052627634903417, + "loss": 4.8124, + "step": 1265 + }, + { + "epoch": 0.40488516660337004, + "grad_norm": 0.3443794846534729, + "learning_rate": 0.00040497162625897134, + "loss": 4.7859, + "step": 1266 + }, + { + "epoch": 0.40520498111095565, + "grad_norm": 0.3849242329597473, + "learning_rate": 0.00040468037633037196, + "loss": 4.6921, + "step": 1267 + }, + { + "epoch": 0.40552479561854127, + "grad_norm": 0.3563462197780609, + "learning_rate": 0.0004043890140172286, + "loss": 4.708, + "step": 1268 + }, + { + "epoch": 0.4058446101261268, + "grad_norm": 0.3593246638774872, + "learning_rate": 0.00040409753963234675, + "loss": 4.7976, + "step": 1269 + }, + { + "epoch": 0.40616442463371244, + "grad_norm": 0.34887611865997314, + "learning_rate": 0.00040380595348865286, + "loss": 4.6867, + "step": 1270 + }, + { + "epoch": 0.40648423914129805, + "grad_norm": 0.36668267846107483, + "learning_rate": 0.00040351425589919257, + "loss": 4.8058, + "step": 1271 + }, + { + "epoch": 0.40680405364888367, + "grad_norm": 0.35184717178344727, + "learning_rate": 0.0004032224471771317, + "loss": 4.7682, + "step": 1272 + }, + { + "epoch": 0.4071238681564692, + "grad_norm": 0.34717682003974915, + "learning_rate": 0.00040293052763575537, + "loss": 4.7093, + "step": 1273 + }, + { + "epoch": 0.40744368266405484, + "grad_norm": 0.3581322729587555, + "learning_rate": 0.0004026384975884673, + "loss": 4.7513, + "step": 1274 + }, + { + "epoch": 0.40776349717164045, + "grad_norm": 0.3602757453918457, + "learning_rate": 0.00040234635734879036, + "loss": 4.8043, + "step": 1275 + }, + { + "epoch": 0.40808331167922607, + "grad_norm": 0.3304402530193329, + "learning_rate": 0.00040205410723036526, + "loss": 4.7174, + "step": 1276 + }, + { + "epoch": 0.4084031261868116, + "grad_norm": 0.3786298632621765, + "learning_rate": 0.0004017617475469508, + "loss": 4.7561, + "step": 1277 + }, + { + "epoch": 0.40872294069439724, + "grad_norm": 0.352568119764328, + "learning_rate": 0.00040146927861242366, + "loss": 4.6942, + "step": 1278 + }, + { + "epoch": 0.40904275520198285, + "grad_norm": 0.35060355067253113, + "learning_rate": 0.00040117670074077747, + "loss": 4.7754, + "step": 1279 + }, + { + "epoch": 0.40936256970956847, + "grad_norm": 0.3489457964897156, + "learning_rate": 0.00040088401424612317, + "loss": 4.7322, + "step": 1280 + }, + { + "epoch": 0.409682384217154, + "grad_norm": 0.3488384783267975, + "learning_rate": 0.000400591219442688, + "loss": 4.6923, + "step": 1281 + }, + { + "epoch": 0.41000219872473964, + "grad_norm": 0.3631167411804199, + "learning_rate": 0.0004002983166448155, + "loss": 4.7409, + "step": 1282 + }, + { + "epoch": 0.41032201323232526, + "grad_norm": 0.3414135277271271, + "learning_rate": 0.0004000053061669654, + "loss": 4.7438, + "step": 1283 + }, + { + "epoch": 0.41064182773991087, + "grad_norm": 0.3368418216705322, + "learning_rate": 0.00039971218832371284, + "loss": 4.7802, + "step": 1284 + }, + { + "epoch": 0.41096164224749643, + "grad_norm": 0.3384615182876587, + "learning_rate": 0.0003994189634297483, + "loss": 4.6527, + "step": 1285 + }, + { + "epoch": 0.41128145675508204, + "grad_norm": 0.40064677596092224, + "learning_rate": 0.00039912563179987713, + "loss": 4.8604, + "step": 1286 + }, + { + "epoch": 0.41160127126266766, + "grad_norm": 0.3540748357772827, + "learning_rate": 0.00039883219374901933, + "loss": 4.713, + "step": 1287 + }, + { + "epoch": 0.41192108577025327, + "grad_norm": 0.34424421191215515, + "learning_rate": 0.0003985386495922091, + "loss": 4.7114, + "step": 1288 + }, + { + "epoch": 0.41224090027783883, + "grad_norm": 0.3402925133705139, + "learning_rate": 0.00039824499964459455, + "loss": 4.7385, + "step": 1289 + }, + { + "epoch": 0.41256071478542444, + "grad_norm": 0.38066187500953674, + "learning_rate": 0.00039795124422143746, + "loss": 4.7384, + "step": 1290 + }, + { + "epoch": 0.41288052929301006, + "grad_norm": 0.34309983253479004, + "learning_rate": 0.0003976573836381128, + "loss": 4.7604, + "step": 1291 + }, + { + "epoch": 0.41320034380059567, + "grad_norm": 0.3329739570617676, + "learning_rate": 0.00039736341821010833, + "loss": 4.77, + "step": 1292 + }, + { + "epoch": 0.41352015830818123, + "grad_norm": 0.35351860523223877, + "learning_rate": 0.0003970693482530247, + "loss": 4.7382, + "step": 1293 + }, + { + "epoch": 0.41383997281576684, + "grad_norm": 0.33818474411964417, + "learning_rate": 0.00039677517408257424, + "loss": 4.7161, + "step": 1294 + }, + { + "epoch": 0.41415978732335246, + "grad_norm": 0.4118068218231201, + "learning_rate": 0.00039648089601458165, + "loss": 4.733, + "step": 1295 + }, + { + "epoch": 0.41447960183093807, + "grad_norm": 0.3382696509361267, + "learning_rate": 0.000396186514364983, + "loss": 4.8091, + "step": 1296 + }, + { + "epoch": 0.41479941633852363, + "grad_norm": 0.348286509513855, + "learning_rate": 0.0003958920294498255, + "loss": 4.7826, + "step": 1297 + }, + { + "epoch": 0.41511923084610924, + "grad_norm": 0.3513914942741394, + "learning_rate": 0.00039559744158526735, + "loss": 4.7369, + "step": 1298 + }, + { + "epoch": 0.41543904535369486, + "grad_norm": 0.3593021333217621, + "learning_rate": 0.0003953027510875772, + "loss": 4.7276, + "step": 1299 + }, + { + "epoch": 0.41575885986128047, + "grad_norm": 0.3372235596179962, + "learning_rate": 0.0003950079582731339, + "loss": 4.7845, + "step": 1300 + }, + { + "epoch": 0.41575885986128047, + "eval_loss": 4.742072105407715, + "eval_runtime": 83.4036, + "eval_samples_per_second": 22.745, + "eval_steps_per_second": 5.695, + "step": 1300 + }, + { + "epoch": 0.41607867436886603, + "grad_norm": 0.4168403744697571, + "learning_rate": 0.0003947130634584261, + "loss": 4.7408, + "step": 1301 + }, + { + "epoch": 0.41639848887645164, + "grad_norm": 0.337454617023468, + "learning_rate": 0.000394418066960052, + "loss": 4.7456, + "step": 1302 + }, + { + "epoch": 0.41671830338403726, + "grad_norm": 0.3640049695968628, + "learning_rate": 0.00039412296909471914, + "loss": 4.7155, + "step": 1303 + }, + { + "epoch": 0.4170381178916229, + "grad_norm": 0.35823458433151245, + "learning_rate": 0.00039382777017924354, + "loss": 4.7275, + "step": 1304 + }, + { + "epoch": 0.4173579323992085, + "grad_norm": 0.3543967008590698, + "learning_rate": 0.00039353247053054984, + "loss": 4.7342, + "step": 1305 + }, + { + "epoch": 0.41767774690679405, + "grad_norm": 0.37220895290374756, + "learning_rate": 0.0003932370704656711, + "loss": 4.7876, + "step": 1306 + }, + { + "epoch": 0.41799756141437966, + "grad_norm": 0.34149494767189026, + "learning_rate": 0.00039294157030174783, + "loss": 4.7389, + "step": 1307 + }, + { + "epoch": 0.4183173759219653, + "grad_norm": 0.33853477239608765, + "learning_rate": 0.00039264597035602807, + "loss": 4.6651, + "step": 1308 + }, + { + "epoch": 0.4186371904295509, + "grad_norm": 0.33498257398605347, + "learning_rate": 0.0003923502709458672, + "loss": 4.7405, + "step": 1309 + }, + { + "epoch": 0.41895700493713645, + "grad_norm": 0.3531733453273773, + "learning_rate": 0.00039205447238872706, + "loss": 4.7414, + "step": 1310 + }, + { + "epoch": 0.41927681944472206, + "grad_norm": 0.34058284759521484, + "learning_rate": 0.0003917585750021763, + "loss": 4.718, + "step": 1311 + }, + { + "epoch": 0.4195966339523077, + "grad_norm": 0.428093284368515, + "learning_rate": 0.0003914625791038893, + "loss": 4.7236, + "step": 1312 + }, + { + "epoch": 0.4199164484598933, + "grad_norm": 0.35845211148262024, + "learning_rate": 0.00039116648501164665, + "loss": 4.6984, + "step": 1313 + }, + { + "epoch": 0.42023626296747885, + "grad_norm": 0.35991171002388, + "learning_rate": 0.0003908702930433338, + "loss": 4.7357, + "step": 1314 + }, + { + "epoch": 0.42055607747506446, + "grad_norm": 0.33685556054115295, + "learning_rate": 0.0003905740035169417, + "loss": 4.7028, + "step": 1315 + }, + { + "epoch": 0.4208758919826501, + "grad_norm": 0.3488212823867798, + "learning_rate": 0.00039027761675056595, + "loss": 4.6826, + "step": 1316 + }, + { + "epoch": 0.4211957064902357, + "grad_norm": 0.35829851031303406, + "learning_rate": 0.0003899811330624065, + "loss": 4.684, + "step": 1317 + }, + { + "epoch": 0.42151552099782125, + "grad_norm": 0.34577038884162903, + "learning_rate": 0.0003896845527707673, + "loss": 4.705, + "step": 1318 + }, + { + "epoch": 0.42183533550540686, + "grad_norm": 0.3443538248538971, + "learning_rate": 0.00038938787619405616, + "loss": 4.7583, + "step": 1319 + }, + { + "epoch": 0.4221551500129925, + "grad_norm": 0.35939159989356995, + "learning_rate": 0.00038909110365078413, + "loss": 4.6987, + "step": 1320 + }, + { + "epoch": 0.4224749645205781, + "grad_norm": 0.3435399830341339, + "learning_rate": 0.00038879423545956534, + "loss": 4.6453, + "step": 1321 + }, + { + "epoch": 0.42279477902816365, + "grad_norm": 0.3486972749233246, + "learning_rate": 0.00038849727193911664, + "loss": 4.7149, + "step": 1322 + }, + { + "epoch": 0.42311459353574926, + "grad_norm": 0.38485008478164673, + "learning_rate": 0.0003882002134082571, + "loss": 4.7308, + "step": 1323 + }, + { + "epoch": 0.4234344080433349, + "grad_norm": 0.3632776737213135, + "learning_rate": 0.000387903060185908, + "loss": 4.6965, + "step": 1324 + }, + { + "epoch": 0.4237542225509205, + "grad_norm": 0.36035025119781494, + "learning_rate": 0.00038760581259109214, + "loss": 4.7418, + "step": 1325 + }, + { + "epoch": 0.42407403705850605, + "grad_norm": 0.35747748613357544, + "learning_rate": 0.0003873084709429336, + "loss": 4.7514, + "step": 1326 + }, + { + "epoch": 0.42439385156609166, + "grad_norm": 0.35926157236099243, + "learning_rate": 0.00038701103556065754, + "loss": 4.8184, + "step": 1327 + }, + { + "epoch": 0.4247136660736773, + "grad_norm": 0.3767782747745514, + "learning_rate": 0.0003867135067635898, + "loss": 4.7554, + "step": 1328 + }, + { + "epoch": 0.4250334805812629, + "grad_norm": 0.34841591119766235, + "learning_rate": 0.0003864158848711562, + "loss": 4.705, + "step": 1329 + }, + { + "epoch": 0.42535329508884845, + "grad_norm": 0.36564236879348755, + "learning_rate": 0.000386118170202883, + "loss": 4.71, + "step": 1330 + }, + { + "epoch": 0.42567310959643406, + "grad_norm": 0.3872424066066742, + "learning_rate": 0.00038582036307839557, + "loss": 4.7814, + "step": 1331 + }, + { + "epoch": 0.4259929241040197, + "grad_norm": 0.3488774299621582, + "learning_rate": 0.00038552246381741884, + "loss": 4.6798, + "step": 1332 + }, + { + "epoch": 0.4263127386116053, + "grad_norm": 0.347474068403244, + "learning_rate": 0.0003852244727397766, + "loss": 4.8074, + "step": 1333 + }, + { + "epoch": 0.42663255311919085, + "grad_norm": 0.3365819752216339, + "learning_rate": 0.00038492639016539116, + "loss": 4.7403, + "step": 1334 + }, + { + "epoch": 0.42695236762677646, + "grad_norm": 0.3785269856452942, + "learning_rate": 0.0003846282164142831, + "loss": 4.836, + "step": 1335 + }, + { + "epoch": 0.4272721821343621, + "grad_norm": 0.3417024612426758, + "learning_rate": 0.00038432995180657094, + "loss": 4.6824, + "step": 1336 + }, + { + "epoch": 0.4275919966419477, + "grad_norm": 0.3502133786678314, + "learning_rate": 0.00038403159666247063, + "loss": 4.6996, + "step": 1337 + }, + { + "epoch": 0.42791181114953325, + "grad_norm": 0.3363844156265259, + "learning_rate": 0.0003837331513022954, + "loss": 4.701, + "step": 1338 + }, + { + "epoch": 0.42823162565711886, + "grad_norm": 0.3410629630088806, + "learning_rate": 0.0003834346160464553, + "loss": 4.7362, + "step": 1339 + }, + { + "epoch": 0.4285514401647045, + "grad_norm": 0.3651580214500427, + "learning_rate": 0.0003831359912154569, + "loss": 4.7333, + "step": 1340 + }, + { + "epoch": 0.4288712546722901, + "grad_norm": 0.38412901759147644, + "learning_rate": 0.0003828372771299029, + "loss": 4.6356, + "step": 1341 + }, + { + "epoch": 0.42919106917987565, + "grad_norm": 0.37616869807243347, + "learning_rate": 0.00038253847411049194, + "loss": 4.63, + "step": 1342 + }, + { + "epoch": 0.42951088368746126, + "grad_norm": 0.3305797576904297, + "learning_rate": 0.000382239582478018, + "loss": 4.783, + "step": 1343 + }, + { + "epoch": 0.4298306981950469, + "grad_norm": 0.3697926104068756, + "learning_rate": 0.00038194060255337026, + "loss": 4.8082, + "step": 1344 + }, + { + "epoch": 0.4301505127026325, + "grad_norm": 0.3631756007671356, + "learning_rate": 0.0003816415346575327, + "loss": 4.7397, + "step": 1345 + }, + { + "epoch": 0.43047032721021805, + "grad_norm": 0.35470134019851685, + "learning_rate": 0.0003813423791115838, + "loss": 4.6279, + "step": 1346 + }, + { + "epoch": 0.43079014171780367, + "grad_norm": 0.3372836410999298, + "learning_rate": 0.00038104313623669604, + "loss": 4.7687, + "step": 1347 + }, + { + "epoch": 0.4311099562253893, + "grad_norm": 0.3620103895664215, + "learning_rate": 0.0003807438063541356, + "loss": 4.7333, + "step": 1348 + }, + { + "epoch": 0.4314297707329749, + "grad_norm": 0.35666435956954956, + "learning_rate": 0.00038044438978526235, + "loss": 4.7729, + "step": 1349 + }, + { + "epoch": 0.43174958524056045, + "grad_norm": 0.3575831949710846, + "learning_rate": 0.0003801448868515287, + "loss": 4.7119, + "step": 1350 + }, + { + "epoch": 0.43206939974814607, + "grad_norm": 0.3621557354927063, + "learning_rate": 0.00037984529787448047, + "loss": 4.7006, + "step": 1351 + }, + { + "epoch": 0.4323892142557317, + "grad_norm": 0.360970675945282, + "learning_rate": 0.0003795456231757554, + "loss": 4.7223, + "step": 1352 + }, + { + "epoch": 0.4327090287633173, + "grad_norm": 0.33878061175346375, + "learning_rate": 0.0003792458630770833, + "loss": 4.7156, + "step": 1353 + }, + { + "epoch": 0.43302884327090285, + "grad_norm": 0.3453442454338074, + "learning_rate": 0.00037894601790028576, + "loss": 4.7931, + "step": 1354 + }, + { + "epoch": 0.43334865777848847, + "grad_norm": 0.34347814321517944, + "learning_rate": 0.0003786460879672756, + "loss": 4.7, + "step": 1355 + }, + { + "epoch": 0.4336684722860741, + "grad_norm": 0.3537500500679016, + "learning_rate": 0.0003783460736000569, + "loss": 4.6134, + "step": 1356 + }, + { + "epoch": 0.4339882867936597, + "grad_norm": 0.422690212726593, + "learning_rate": 0.0003780459751207241, + "loss": 4.7167, + "step": 1357 + }, + { + "epoch": 0.43430810130124525, + "grad_norm": 0.3382203280925751, + "learning_rate": 0.0003777457928514619, + "loss": 4.7304, + "step": 1358 + }, + { + "epoch": 0.43462791580883087, + "grad_norm": 0.3374168574810028, + "learning_rate": 0.0003774455271145454, + "loss": 4.6354, + "step": 1359 + }, + { + "epoch": 0.4349477303164165, + "grad_norm": 0.34140485525131226, + "learning_rate": 0.0003771451782323388, + "loss": 4.6944, + "step": 1360 + }, + { + "epoch": 0.4352675448240021, + "grad_norm": 0.35734114050865173, + "learning_rate": 0.0003768447465272959, + "loss": 4.667, + "step": 1361 + }, + { + "epoch": 0.43558735933158765, + "grad_norm": 0.3632422089576721, + "learning_rate": 0.0003765442323219591, + "loss": 4.6941, + "step": 1362 + }, + { + "epoch": 0.43590717383917327, + "grad_norm": 0.34204337000846863, + "learning_rate": 0.00037624363593895976, + "loss": 4.6961, + "step": 1363 + }, + { + "epoch": 0.4362269883467589, + "grad_norm": 0.37258926033973694, + "learning_rate": 0.00037594295770101716, + "loss": 4.728, + "step": 1364 + }, + { + "epoch": 0.4365468028543445, + "grad_norm": 0.34576255083084106, + "learning_rate": 0.0003756421979309387, + "loss": 4.7215, + "step": 1365 + }, + { + "epoch": 0.43686661736193005, + "grad_norm": 0.34704098105430603, + "learning_rate": 0.00037534135695161904, + "loss": 4.6805, + "step": 1366 + }, + { + "epoch": 0.43718643186951567, + "grad_norm": 0.3493821322917938, + "learning_rate": 0.0003750404350860402, + "loss": 4.7144, + "step": 1367 + }, + { + "epoch": 0.4375062463771013, + "grad_norm": 0.33115461468696594, + "learning_rate": 0.00037473943265727114, + "loss": 4.7186, + "step": 1368 + }, + { + "epoch": 0.4378260608846869, + "grad_norm": 0.34023842215538025, + "learning_rate": 0.000374438349988467, + "loss": 4.7335, + "step": 1369 + }, + { + "epoch": 0.43814587539227245, + "grad_norm": 0.3522568941116333, + "learning_rate": 0.00037413718740286935, + "loss": 4.6449, + "step": 1370 + }, + { + "epoch": 0.43846568989985807, + "grad_norm": 0.3428913652896881, + "learning_rate": 0.00037383594522380546, + "loss": 4.7108, + "step": 1371 + }, + { + "epoch": 0.4387855044074437, + "grad_norm": 0.3575150966644287, + "learning_rate": 0.00037353462377468806, + "loss": 4.761, + "step": 1372 + }, + { + "epoch": 0.4391053189150293, + "grad_norm": 0.34879520535469055, + "learning_rate": 0.0003732332233790149, + "loss": 4.6459, + "step": 1373 + }, + { + "epoch": 0.4394251334226149, + "grad_norm": 0.3344023525714874, + "learning_rate": 0.00037293174436036855, + "loss": 4.8296, + "step": 1374 + }, + { + "epoch": 0.43974494793020047, + "grad_norm": 0.49138399958610535, + "learning_rate": 0.000372630187042416, + "loss": 4.7823, + "step": 1375 + }, + { + "epoch": 0.4400647624377861, + "grad_norm": 0.3501664698123932, + "learning_rate": 0.0003723285517489084, + "loss": 4.7195, + "step": 1376 + }, + { + "epoch": 0.4403845769453717, + "grad_norm": 0.3450270891189575, + "learning_rate": 0.0003720268388036805, + "loss": 4.6751, + "step": 1377 + }, + { + "epoch": 0.4407043914529573, + "grad_norm": 0.3368213474750519, + "learning_rate": 0.0003717250485306503, + "loss": 4.7517, + "step": 1378 + }, + { + "epoch": 0.44102420596054287, + "grad_norm": 0.3620702624320984, + "learning_rate": 0.00037142318125381915, + "loss": 4.7359, + "step": 1379 + }, + { + "epoch": 0.4413440204681285, + "grad_norm": 0.3435147702693939, + "learning_rate": 0.0003711212372972706, + "loss": 4.6931, + "step": 1380 + }, + { + "epoch": 0.4416638349757141, + "grad_norm": 0.39190882444381714, + "learning_rate": 0.000370819216985171, + "loss": 4.803, + "step": 1381 + }, + { + "epoch": 0.4419836494832997, + "grad_norm": 0.40365469455718994, + "learning_rate": 0.0003705171206417685, + "loss": 4.728, + "step": 1382 + }, + { + "epoch": 0.44230346399088527, + "grad_norm": 0.3486280143260956, + "learning_rate": 0.0003702149485913926, + "loss": 4.6316, + "step": 1383 + }, + { + "epoch": 0.4426232784984709, + "grad_norm": 0.3493972718715668, + "learning_rate": 0.0003699127011584546, + "loss": 4.746, + "step": 1384 + }, + { + "epoch": 0.4429430930060565, + "grad_norm": 0.33820289373397827, + "learning_rate": 0.0003696103786674463, + "loss": 4.7163, + "step": 1385 + }, + { + "epoch": 0.4432629075136421, + "grad_norm": 0.35788559913635254, + "learning_rate": 0.0003693079814429403, + "loss": 4.7594, + "step": 1386 + }, + { + "epoch": 0.44358272202122767, + "grad_norm": 0.4123647212982178, + "learning_rate": 0.00036900550980958934, + "loss": 4.6385, + "step": 1387 + }, + { + "epoch": 0.4439025365288133, + "grad_norm": 0.339077889919281, + "learning_rate": 0.000368702964092126, + "loss": 4.6516, + "step": 1388 + }, + { + "epoch": 0.4442223510363989, + "grad_norm": 0.39766621589660645, + "learning_rate": 0.0003684003446153627, + "loss": 4.6859, + "step": 1389 + }, + { + "epoch": 0.4445421655439845, + "grad_norm": 0.33863818645477295, + "learning_rate": 0.0003680976517041905, + "loss": 4.6316, + "step": 1390 + }, + { + "epoch": 0.44486198005157007, + "grad_norm": 0.3599453568458557, + "learning_rate": 0.00036779488568358, + "loss": 4.6533, + "step": 1391 + }, + { + "epoch": 0.4451817945591557, + "grad_norm": 0.35659059882164, + "learning_rate": 0.00036749204687857955, + "loss": 4.6248, + "step": 1392 + }, + { + "epoch": 0.4455016090667413, + "grad_norm": 0.3393639624118805, + "learning_rate": 0.00036718913561431613, + "loss": 4.6681, + "step": 1393 + }, + { + "epoch": 0.4458214235743269, + "grad_norm": 0.39188846945762634, + "learning_rate": 0.0003668861522159945, + "loss": 4.6124, + "step": 1394 + }, + { + "epoch": 0.4461412380819125, + "grad_norm": 0.3615441918373108, + "learning_rate": 0.00036658309700889655, + "loss": 4.6809, + "step": 1395 + }, + { + "epoch": 0.4464610525894981, + "grad_norm": 0.34237948060035706, + "learning_rate": 0.0003662799703183817, + "loss": 4.6823, + "step": 1396 + }, + { + "epoch": 0.4467808670970837, + "grad_norm": 0.334337055683136, + "learning_rate": 0.00036597677246988564, + "loss": 4.6697, + "step": 1397 + }, + { + "epoch": 0.4471006816046693, + "grad_norm": 0.3497103154659271, + "learning_rate": 0.00036567350378892074, + "loss": 4.7279, + "step": 1398 + }, + { + "epoch": 0.4474204961122549, + "grad_norm": 0.3531494140625, + "learning_rate": 0.00036537016460107545, + "loss": 4.7062, + "step": 1399 + }, + { + "epoch": 0.4477403106198405, + "grad_norm": 0.34297361969947815, + "learning_rate": 0.00036506675523201385, + "loss": 4.7651, + "step": 1400 + }, + { + "epoch": 0.4477403106198405, + "eval_loss": 4.698581218719482, + "eval_runtime": 82.2605, + "eval_samples_per_second": 23.061, + "eval_steps_per_second": 5.774, + "step": 1400 + }, + { + "epoch": 0.4480601251274261, + "grad_norm": 0.3714090585708618, + "learning_rate": 0.0003647632760074751, + "loss": 4.6971, + "step": 1401 + }, + { + "epoch": 0.4483799396350117, + "grad_norm": 0.34016844630241394, + "learning_rate": 0.0003644597272532739, + "loss": 4.7252, + "step": 1402 + }, + { + "epoch": 0.4486997541425973, + "grad_norm": 0.3468213379383087, + "learning_rate": 0.00036415610929529913, + "loss": 4.7265, + "step": 1403 + }, + { + "epoch": 0.4490195686501829, + "grad_norm": 0.3513246178627014, + "learning_rate": 0.00036385242245951427, + "loss": 4.755, + "step": 1404 + }, + { + "epoch": 0.4493393831577685, + "grad_norm": 0.34193092584609985, + "learning_rate": 0.0003635486670719564, + "loss": 4.7513, + "step": 1405 + }, + { + "epoch": 0.4496591976653541, + "grad_norm": 0.3676043152809143, + "learning_rate": 0.0003632448434587366, + "loss": 4.6737, + "step": 1406 + }, + { + "epoch": 0.4499790121729397, + "grad_norm": 0.379415363073349, + "learning_rate": 0.00036294095194603905, + "loss": 4.6807, + "step": 1407 + }, + { + "epoch": 0.4502988266805253, + "grad_norm": 0.3363446593284607, + "learning_rate": 0.00036263699286012056, + "loss": 4.7128, + "step": 1408 + }, + { + "epoch": 0.4506186411881109, + "grad_norm": 0.3659687042236328, + "learning_rate": 0.0003623329665273108, + "loss": 4.6283, + "step": 1409 + }, + { + "epoch": 0.4509384556956965, + "grad_norm": 0.3622778654098511, + "learning_rate": 0.00036202887327401167, + "loss": 4.6883, + "step": 1410 + }, + { + "epoch": 0.4512582702032821, + "grad_norm": 0.4010184705257416, + "learning_rate": 0.00036172471342669663, + "loss": 4.772, + "step": 1411 + }, + { + "epoch": 0.4515780847108677, + "grad_norm": 0.3728751838207245, + "learning_rate": 0.000361420487311911, + "loss": 4.7281, + "step": 1412 + }, + { + "epoch": 0.4518978992184533, + "grad_norm": 0.3629361093044281, + "learning_rate": 0.0003611161952562707, + "loss": 4.7045, + "step": 1413 + }, + { + "epoch": 0.4522177137260389, + "grad_norm": 0.3519171178340912, + "learning_rate": 0.00036081183758646313, + "loss": 4.6861, + "step": 1414 + }, + { + "epoch": 0.4525375282336245, + "grad_norm": 0.3553248941898346, + "learning_rate": 0.00036050741462924563, + "loss": 4.724, + "step": 1415 + }, + { + "epoch": 0.4528573427412101, + "grad_norm": 0.37955769896507263, + "learning_rate": 0.0003602029267114457, + "loss": 4.6819, + "step": 1416 + }, + { + "epoch": 0.4531771572487957, + "grad_norm": 0.3557082712650299, + "learning_rate": 0.000359898374159961, + "loss": 4.6762, + "step": 1417 + }, + { + "epoch": 0.4534969717563813, + "grad_norm": 0.36919450759887695, + "learning_rate": 0.0003595937573017579, + "loss": 4.7242, + "step": 1418 + }, + { + "epoch": 0.4538167862639669, + "grad_norm": 0.35419464111328125, + "learning_rate": 0.00035928907646387234, + "loss": 4.7052, + "step": 1419 + }, + { + "epoch": 0.4541366007715525, + "grad_norm": 0.3797052502632141, + "learning_rate": 0.00035898433197340874, + "loss": 4.748, + "step": 1420 + }, + { + "epoch": 0.4544564152791381, + "grad_norm": 0.33481892943382263, + "learning_rate": 0.0003586795241575398, + "loss": 4.7317, + "step": 1421 + }, + { + "epoch": 0.4547762297867237, + "grad_norm": 0.336603045463562, + "learning_rate": 0.00035837465334350637, + "loss": 4.7278, + "step": 1422 + }, + { + "epoch": 0.4550960442943093, + "grad_norm": 0.3555077314376831, + "learning_rate": 0.0003580697198586169, + "loss": 4.6954, + "step": 1423 + }, + { + "epoch": 0.4554158588018949, + "grad_norm": 0.34736326336860657, + "learning_rate": 0.0003577647240302471, + "loss": 4.8181, + "step": 1424 + }, + { + "epoch": 0.4557356733094805, + "grad_norm": 0.34664371609687805, + "learning_rate": 0.0003574596661858396, + "loss": 4.6458, + "step": 1425 + }, + { + "epoch": 0.4560554878170661, + "grad_norm": 0.3364536464214325, + "learning_rate": 0.00035715454665290343, + "loss": 4.6564, + "step": 1426 + }, + { + "epoch": 0.4563753023246517, + "grad_norm": 0.3512004017829895, + "learning_rate": 0.0003568493657590142, + "loss": 4.6482, + "step": 1427 + }, + { + "epoch": 0.4566951168322373, + "grad_norm": 0.3391799032688141, + "learning_rate": 0.0003565441238318131, + "loss": 4.7488, + "step": 1428 + }, + { + "epoch": 0.4570149313398229, + "grad_norm": 0.33632373809814453, + "learning_rate": 0.000356238821199007, + "loss": 4.6127, + "step": 1429 + }, + { + "epoch": 0.4573347458474085, + "grad_norm": 0.3592793941497803, + "learning_rate": 0.000355933458188368, + "loss": 4.7661, + "step": 1430 + }, + { + "epoch": 0.4576545603549941, + "grad_norm": 0.34287509322166443, + "learning_rate": 0.00035562803512773284, + "loss": 4.6601, + "step": 1431 + }, + { + "epoch": 0.4579743748625797, + "grad_norm": 0.331759512424469, + "learning_rate": 0.00035532255234500284, + "loss": 4.6593, + "step": 1432 + }, + { + "epoch": 0.4582941893701653, + "grad_norm": 0.38584470748901367, + "learning_rate": 0.0003550170101681434, + "loss": 4.6229, + "step": 1433 + }, + { + "epoch": 0.4586140038777509, + "grad_norm": 0.3618987798690796, + "learning_rate": 0.00035471140892518366, + "loss": 4.6855, + "step": 1434 + }, + { + "epoch": 0.4589338183853365, + "grad_norm": 0.3290684223175049, + "learning_rate": 0.0003544057489442164, + "loss": 4.6551, + "step": 1435 + }, + { + "epoch": 0.4592536328929221, + "grad_norm": 0.35568806529045105, + "learning_rate": 0.0003541000305533971, + "loss": 4.7414, + "step": 1436 + }, + { + "epoch": 0.4595734474005077, + "grad_norm": 0.35015153884887695, + "learning_rate": 0.00035379425408094416, + "loss": 4.7583, + "step": 1437 + }, + { + "epoch": 0.4598932619080933, + "grad_norm": 0.3360646069049835, + "learning_rate": 0.00035348841985513834, + "loss": 4.7037, + "step": 1438 + }, + { + "epoch": 0.4602130764156789, + "grad_norm": 0.34291842579841614, + "learning_rate": 0.00035318252820432236, + "loss": 4.6583, + "step": 1439 + }, + { + "epoch": 0.4605328909232645, + "grad_norm": 0.328739732503891, + "learning_rate": 0.00035287657945690045, + "loss": 4.6659, + "step": 1440 + }, + { + "epoch": 0.4608527054308501, + "grad_norm": 0.34876030683517456, + "learning_rate": 0.0003525705739413385, + "loss": 4.7138, + "step": 1441 + }, + { + "epoch": 0.4611725199384357, + "grad_norm": 0.34805476665496826, + "learning_rate": 0.000352264511986163, + "loss": 4.6653, + "step": 1442 + }, + { + "epoch": 0.46149233444602134, + "grad_norm": 0.33524250984191895, + "learning_rate": 0.0003519583939199613, + "loss": 4.6185, + "step": 1443 + }, + { + "epoch": 0.4618121489536069, + "grad_norm": 0.3647015690803528, + "learning_rate": 0.00035165222007138076, + "loss": 4.7487, + "step": 1444 + }, + { + "epoch": 0.4621319634611925, + "grad_norm": 0.3870471119880676, + "learning_rate": 0.00035134599076912866, + "loss": 4.66, + "step": 1445 + }, + { + "epoch": 0.4624517779687781, + "grad_norm": 0.3421085476875305, + "learning_rate": 0.00035103970634197193, + "loss": 4.7053, + "step": 1446 + }, + { + "epoch": 0.46277159247636374, + "grad_norm": 0.35573166608810425, + "learning_rate": 0.00035073336711873666, + "loss": 4.775, + "step": 1447 + }, + { + "epoch": 0.4630914069839493, + "grad_norm": 0.3303551971912384, + "learning_rate": 0.00035042697342830783, + "loss": 4.718, + "step": 1448 + }, + { + "epoch": 0.4634112214915349, + "grad_norm": 0.37305688858032227, + "learning_rate": 0.00035012052559962853, + "loss": 4.714, + "step": 1449 + }, + { + "epoch": 0.4637310359991205, + "grad_norm": 0.33113083243370056, + "learning_rate": 0.0003498140239617005, + "loss": 4.7113, + "step": 1450 + }, + { + "epoch": 0.46405085050670614, + "grad_norm": 0.3569343090057373, + "learning_rate": 0.0003495074688435829, + "loss": 4.6924, + "step": 1451 + }, + { + "epoch": 0.4643706650142917, + "grad_norm": 0.3453904986381531, + "learning_rate": 0.00034920086057439243, + "loss": 4.7093, + "step": 1452 + }, + { + "epoch": 0.4646904795218773, + "grad_norm": 0.3387502431869507, + "learning_rate": 0.0003488941994833028, + "loss": 4.6262, + "step": 1453 + }, + { + "epoch": 0.4650102940294629, + "grad_norm": 0.35884377360343933, + "learning_rate": 0.00034858748589954437, + "loss": 4.6482, + "step": 1454 + }, + { + "epoch": 0.46533010853704854, + "grad_norm": 0.3423829972743988, + "learning_rate": 0.0003482807201524042, + "loss": 4.6878, + "step": 1455 + }, + { + "epoch": 0.4656499230446341, + "grad_norm": 0.360689640045166, + "learning_rate": 0.00034797390257122486, + "loss": 4.6499, + "step": 1456 + }, + { + "epoch": 0.4659697375522197, + "grad_norm": 0.3595859408378601, + "learning_rate": 0.0003476670334854049, + "loss": 4.7275, + "step": 1457 + }, + { + "epoch": 0.4662895520598053, + "grad_norm": 0.35201844573020935, + "learning_rate": 0.00034736011322439796, + "loss": 4.6911, + "step": 1458 + }, + { + "epoch": 0.46660936656739094, + "grad_norm": 0.3574092388153076, + "learning_rate": 0.0003470531421177128, + "loss": 4.6644, + "step": 1459 + }, + { + "epoch": 0.4669291810749765, + "grad_norm": 0.36814460158348083, + "learning_rate": 0.00034674612049491276, + "loss": 4.6686, + "step": 1460 + }, + { + "epoch": 0.4672489955825621, + "grad_norm": 0.3635476529598236, + "learning_rate": 0.0003464390486856153, + "loss": 4.7105, + "step": 1461 + }, + { + "epoch": 0.4675688100901477, + "grad_norm": 0.38265570998191833, + "learning_rate": 0.0003461319270194919, + "loss": 4.6519, + "step": 1462 + }, + { + "epoch": 0.46788862459773334, + "grad_norm": 0.36408907175064087, + "learning_rate": 0.0003458247558262672, + "loss": 4.6651, + "step": 1463 + }, + { + "epoch": 0.4682084391053189, + "grad_norm": 0.38444340229034424, + "learning_rate": 0.0003455175354357195, + "loss": 4.7486, + "step": 1464 + }, + { + "epoch": 0.4685282536129045, + "grad_norm": 0.36975181102752686, + "learning_rate": 0.0003452102661776798, + "loss": 4.564, + "step": 1465 + }, + { + "epoch": 0.4688480681204901, + "grad_norm": 0.35873138904571533, + "learning_rate": 0.0003449029483820313, + "loss": 4.5793, + "step": 1466 + }, + { + "epoch": 0.46916788262807574, + "grad_norm": 0.3803260028362274, + "learning_rate": 0.00034459558237870955, + "loss": 4.6854, + "step": 1467 + }, + { + "epoch": 0.4694876971356613, + "grad_norm": 0.3775056004524231, + "learning_rate": 0.00034428816849770173, + "loss": 4.6818, + "step": 1468 + }, + { + "epoch": 0.4698075116432469, + "grad_norm": 0.3574797809123993, + "learning_rate": 0.00034398070706904657, + "loss": 4.6641, + "step": 1469 + }, + { + "epoch": 0.4701273261508325, + "grad_norm": 0.3633163273334503, + "learning_rate": 0.0003436731984228336, + "loss": 4.6348, + "step": 1470 + }, + { + "epoch": 0.47044714065841814, + "grad_norm": 0.36290299892425537, + "learning_rate": 0.00034336564288920334, + "loss": 4.545, + "step": 1471 + }, + { + "epoch": 0.4707669551660037, + "grad_norm": 0.36241206526756287, + "learning_rate": 0.0003430580407983465, + "loss": 4.6508, + "step": 1472 + }, + { + "epoch": 0.4710867696735893, + "grad_norm": 0.3603331446647644, + "learning_rate": 0.00034275039248050384, + "loss": 4.6032, + "step": 1473 + }, + { + "epoch": 0.4714065841811749, + "grad_norm": 0.3851202726364136, + "learning_rate": 0.00034244269826596543, + "loss": 4.6484, + "step": 1474 + }, + { + "epoch": 0.47172639868876054, + "grad_norm": 0.3676314353942871, + "learning_rate": 0.0003421349584850711, + "loss": 4.7145, + "step": 1475 + }, + { + "epoch": 0.4720462131963461, + "grad_norm": 0.35233455896377563, + "learning_rate": 0.0003418271734682093, + "loss": 4.6362, + "step": 1476 + }, + { + "epoch": 0.4723660277039317, + "grad_norm": 0.36112335324287415, + "learning_rate": 0.00034151934354581715, + "loss": 4.6983, + "step": 1477 + }, + { + "epoch": 0.4726858422115173, + "grad_norm": 0.3636215329170227, + "learning_rate": 0.00034121146904837995, + "loss": 4.678, + "step": 1478 + }, + { + "epoch": 0.47300565671910294, + "grad_norm": 0.3911776542663574, + "learning_rate": 0.00034090355030643083, + "loss": 4.6516, + "step": 1479 + }, + { + "epoch": 0.4733254712266885, + "grad_norm": 0.45845794677734375, + "learning_rate": 0.00034059558765055047, + "loss": 4.7059, + "step": 1480 + }, + { + "epoch": 0.4736452857342741, + "grad_norm": 0.3456306755542755, + "learning_rate": 0.0003402875814113666, + "loss": 4.591, + "step": 1481 + }, + { + "epoch": 0.4739651002418597, + "grad_norm": 0.3881906270980835, + "learning_rate": 0.00033997953191955383, + "loss": 4.6141, + "step": 1482 + }, + { + "epoch": 0.47428491474944534, + "grad_norm": 0.3685750365257263, + "learning_rate": 0.0003396714395058333, + "loss": 4.6435, + "step": 1483 + }, + { + "epoch": 0.4746047292570309, + "grad_norm": 0.3566403090953827, + "learning_rate": 0.00033936330450097193, + "loss": 4.643, + "step": 1484 + }, + { + "epoch": 0.4749245437646165, + "grad_norm": 0.3359985947608948, + "learning_rate": 0.0003390551272357829, + "loss": 4.6454, + "step": 1485 + }, + { + "epoch": 0.4752443582722021, + "grad_norm": 0.36924082040786743, + "learning_rate": 0.00033874690804112397, + "loss": 4.676, + "step": 1486 + }, + { + "epoch": 0.47556417277978774, + "grad_norm": 0.35759541392326355, + "learning_rate": 0.00033843864724789866, + "loss": 4.6631, + "step": 1487 + }, + { + "epoch": 0.4758839872873733, + "grad_norm": 0.34228649735450745, + "learning_rate": 0.00033813034518705463, + "loss": 4.6948, + "step": 1488 + }, + { + "epoch": 0.4762038017949589, + "grad_norm": 0.3689830005168915, + "learning_rate": 0.00033782200218958433, + "loss": 4.6772, + "step": 1489 + }, + { + "epoch": 0.47652361630254453, + "grad_norm": 0.36067578196525574, + "learning_rate": 0.00033751361858652375, + "loss": 4.6559, + "step": 1490 + }, + { + "epoch": 0.47684343081013014, + "grad_norm": 0.3399069309234619, + "learning_rate": 0.0003372051947089526, + "loss": 4.7165, + "step": 1491 + }, + { + "epoch": 0.4771632453177157, + "grad_norm": 0.33639630675315857, + "learning_rate": 0.0003368967308879939, + "loss": 4.7684, + "step": 1492 + }, + { + "epoch": 0.4774830598253013, + "grad_norm": 0.37365174293518066, + "learning_rate": 0.0003365882274548135, + "loss": 4.6071, + "step": 1493 + }, + { + "epoch": 0.47780287433288693, + "grad_norm": 0.343077152967453, + "learning_rate": 0.00033627968474061966, + "loss": 4.6191, + "step": 1494 + }, + { + "epoch": 0.47812268884047254, + "grad_norm": 0.36870089173316956, + "learning_rate": 0.0003359711030766631, + "loss": 4.7342, + "step": 1495 + }, + { + "epoch": 0.4784425033480581, + "grad_norm": 0.3427372872829437, + "learning_rate": 0.0003356624827942361, + "loss": 4.6408, + "step": 1496 + }, + { + "epoch": 0.4787623178556437, + "grad_norm": 0.3489178419113159, + "learning_rate": 0.00033535382422467255, + "loss": 4.5872, + "step": 1497 + }, + { + "epoch": 0.47908213236322933, + "grad_norm": 0.34867149591445923, + "learning_rate": 0.0003350451276993473, + "loss": 4.6825, + "step": 1498 + }, + { + "epoch": 0.47940194687081494, + "grad_norm": 0.3406824469566345, + "learning_rate": 0.000334736393549676, + "loss": 4.5928, + "step": 1499 + }, + { + "epoch": 0.4797217613784005, + "grad_norm": 0.34946388006210327, + "learning_rate": 0.00033442762210711483, + "loss": 4.6101, + "step": 1500 + }, + { + "epoch": 0.4797217613784005, + "eval_loss": 4.658926010131836, + "eval_runtime": 80.6757, + "eval_samples_per_second": 23.514, + "eval_steps_per_second": 5.888, + "step": 1500 + }, + { + "epoch": 0.4800415758859861, + "grad_norm": 0.487714558839798, + "learning_rate": 0.0003341188137031599, + "loss": 4.6049, + "step": 1501 + }, + { + "epoch": 0.48036139039357173, + "grad_norm": 0.3670230805873871, + "learning_rate": 0.0003338099686693469, + "loss": 4.7125, + "step": 1502 + }, + { + "epoch": 0.48068120490115734, + "grad_norm": 0.3611529469490051, + "learning_rate": 0.00033350108733725103, + "loss": 4.6454, + "step": 1503 + }, + { + "epoch": 0.4810010194087429, + "grad_norm": 0.3804737627506256, + "learning_rate": 0.00033319217003848644, + "loss": 4.7163, + "step": 1504 + }, + { + "epoch": 0.4813208339163285, + "grad_norm": 0.3475566506385803, + "learning_rate": 0.0003328832171047057, + "loss": 4.6357, + "step": 1505 + }, + { + "epoch": 0.48164064842391413, + "grad_norm": 0.40150049328804016, + "learning_rate": 0.0003325742288675998, + "loss": 4.6686, + "step": 1506 + }, + { + "epoch": 0.48196046293149974, + "grad_norm": 0.35256749391555786, + "learning_rate": 0.0003322652056588976, + "loss": 4.6878, + "step": 1507 + }, + { + "epoch": 0.4822802774390853, + "grad_norm": 0.324462890625, + "learning_rate": 0.0003319561478103656, + "loss": 4.6671, + "step": 1508 + }, + { + "epoch": 0.4826000919466709, + "grad_norm": 0.3362123966217041, + "learning_rate": 0.00033164705565380737, + "loss": 4.6155, + "step": 1509 + }, + { + "epoch": 0.48291990645425653, + "grad_norm": 0.3507479131221771, + "learning_rate": 0.00033133792952106327, + "loss": 4.6358, + "step": 1510 + }, + { + "epoch": 0.48323972096184215, + "grad_norm": 0.35958319902420044, + "learning_rate": 0.0003310287697440102, + "loss": 4.5733, + "step": 1511 + }, + { + "epoch": 0.48355953546942776, + "grad_norm": 0.3396565020084381, + "learning_rate": 0.0003307195766545612, + "loss": 4.5693, + "step": 1512 + }, + { + "epoch": 0.4838793499770133, + "grad_norm": 0.32788902521133423, + "learning_rate": 0.00033041035058466525, + "loss": 4.6957, + "step": 1513 + }, + { + "epoch": 0.48419916448459893, + "grad_norm": 0.35396474599838257, + "learning_rate": 0.00033010109186630625, + "loss": 4.6261, + "step": 1514 + }, + { + "epoch": 0.48451897899218455, + "grad_norm": 0.3637486696243286, + "learning_rate": 0.00032979180083150366, + "loss": 4.6156, + "step": 1515 + }, + { + "epoch": 0.48483879349977016, + "grad_norm": 0.3381948173046112, + "learning_rate": 0.00032948247781231134, + "loss": 4.6401, + "step": 1516 + }, + { + "epoch": 0.4851586080073557, + "grad_norm": 0.3423517346382141, + "learning_rate": 0.0003291731231408175, + "loss": 4.6505, + "step": 1517 + }, + { + "epoch": 0.48547842251494133, + "grad_norm": 0.34794390201568604, + "learning_rate": 0.00032886373714914455, + "loss": 4.6943, + "step": 1518 + }, + { + "epoch": 0.48579823702252695, + "grad_norm": 0.3557645082473755, + "learning_rate": 0.00032855432016944835, + "loss": 4.6012, + "step": 1519 + }, + { + "epoch": 0.48611805153011256, + "grad_norm": 0.3364381790161133, + "learning_rate": 0.000328244872533918, + "loss": 4.6509, + "step": 1520 + }, + { + "epoch": 0.4864378660376981, + "grad_norm": 0.3535098731517792, + "learning_rate": 0.00032793539457477564, + "loss": 4.6239, + "step": 1521 + }, + { + "epoch": 0.48675768054528373, + "grad_norm": 0.34493860602378845, + "learning_rate": 0.00032762588662427585, + "loss": 4.571, + "step": 1522 + }, + { + "epoch": 0.48707749505286935, + "grad_norm": 0.37122923135757446, + "learning_rate": 0.0003273163490147054, + "loss": 4.5478, + "step": 1523 + }, + { + "epoch": 0.48739730956045496, + "grad_norm": 0.34408897161483765, + "learning_rate": 0.0003270067820783831, + "loss": 4.6441, + "step": 1524 + }, + { + "epoch": 0.4877171240680405, + "grad_norm": 0.36329254508018494, + "learning_rate": 0.0003266971861476589, + "loss": 4.6485, + "step": 1525 + }, + { + "epoch": 0.48803693857562613, + "grad_norm": 0.352076917886734, + "learning_rate": 0.00032638756155491436, + "loss": 4.6351, + "step": 1526 + }, + { + "epoch": 0.48835675308321175, + "grad_norm": 0.37958183884620667, + "learning_rate": 0.0003260779086325612, + "loss": 4.6328, + "step": 1527 + }, + { + "epoch": 0.48867656759079736, + "grad_norm": 0.3417004644870758, + "learning_rate": 0.0003257682277130422, + "loss": 4.5971, + "step": 1528 + }, + { + "epoch": 0.4889963820983829, + "grad_norm": 0.3330473005771637, + "learning_rate": 0.0003254585191288297, + "loss": 4.6064, + "step": 1529 + }, + { + "epoch": 0.48931619660596853, + "grad_norm": 0.3555782735347748, + "learning_rate": 0.0003251487832124259, + "loss": 4.5578, + "step": 1530 + }, + { + "epoch": 0.48963601111355415, + "grad_norm": 0.33937859535217285, + "learning_rate": 0.00032483902029636257, + "loss": 4.6298, + "step": 1531 + }, + { + "epoch": 0.48995582562113976, + "grad_norm": 0.3552972972393036, + "learning_rate": 0.00032452923071320006, + "loss": 4.6884, + "step": 1532 + }, + { + "epoch": 0.4902756401287253, + "grad_norm": 0.34347274899482727, + "learning_rate": 0.00032421941479552767, + "loss": 4.6478, + "step": 1533 + }, + { + "epoch": 0.49059545463631093, + "grad_norm": 0.3543516993522644, + "learning_rate": 0.00032390957287596275, + "loss": 4.7032, + "step": 1534 + }, + { + "epoch": 0.49091526914389655, + "grad_norm": 0.37930676341056824, + "learning_rate": 0.0003235997052871508, + "loss": 4.6913, + "step": 1535 + }, + { + "epoch": 0.49123508365148216, + "grad_norm": 0.3467349410057068, + "learning_rate": 0.00032328981236176465, + "loss": 4.5694, + "step": 1536 + }, + { + "epoch": 0.4915548981590677, + "grad_norm": 0.3399907946586609, + "learning_rate": 0.00032297989443250445, + "loss": 4.617, + "step": 1537 + }, + { + "epoch": 0.49187471266665334, + "grad_norm": 0.33236512541770935, + "learning_rate": 0.0003226699518320973, + "loss": 4.654, + "step": 1538 + }, + { + "epoch": 0.49219452717423895, + "grad_norm": 0.3547270894050598, + "learning_rate": 0.0003223599848932964, + "loss": 4.6355, + "step": 1539 + }, + { + "epoch": 0.49251434168182456, + "grad_norm": 0.3350517749786377, + "learning_rate": 0.0003220499939488817, + "loss": 4.5831, + "step": 1540 + }, + { + "epoch": 0.4928341561894101, + "grad_norm": 0.3335596024990082, + "learning_rate": 0.0003217399793316583, + "loss": 4.6857, + "step": 1541 + }, + { + "epoch": 0.49315397069699574, + "grad_norm": 0.3638198673725128, + "learning_rate": 0.00032142994137445693, + "loss": 4.5726, + "step": 1542 + }, + { + "epoch": 0.49347378520458135, + "grad_norm": 0.3430887460708618, + "learning_rate": 0.0003211198804101337, + "loss": 4.6377, + "step": 1543 + }, + { + "epoch": 0.49379359971216696, + "grad_norm": 0.3397575914859772, + "learning_rate": 0.000320809796771569, + "loss": 4.6102, + "step": 1544 + }, + { + "epoch": 0.4941134142197525, + "grad_norm": 0.33723217248916626, + "learning_rate": 0.00032049969079166765, + "loss": 4.5556, + "step": 1545 + }, + { + "epoch": 0.49443322872733814, + "grad_norm": 0.34441104531288147, + "learning_rate": 0.0003201895628033587, + "loss": 4.6884, + "step": 1546 + }, + { + "epoch": 0.49475304323492375, + "grad_norm": 0.3217085897922516, + "learning_rate": 0.00031987941313959433, + "loss": 4.678, + "step": 1547 + }, + { + "epoch": 0.49507285774250936, + "grad_norm": 0.34782636165618896, + "learning_rate": 0.0003195692421333506, + "loss": 4.6236, + "step": 1548 + }, + { + "epoch": 0.4953926722500949, + "grad_norm": 0.3566958010196686, + "learning_rate": 0.0003192590501176261, + "loss": 4.6062, + "step": 1549 + }, + { + "epoch": 0.49571248675768054, + "grad_norm": 0.36577853560447693, + "learning_rate": 0.0003189488374254421, + "loss": 4.6045, + "step": 1550 + }, + { + "epoch": 0.49603230126526615, + "grad_norm": 0.3512285649776459, + "learning_rate": 0.00031863860438984193, + "loss": 4.6837, + "step": 1551 + }, + { + "epoch": 0.49635211577285177, + "grad_norm": 0.3427387773990631, + "learning_rate": 0.00031832835134389093, + "loss": 4.6715, + "step": 1552 + }, + { + "epoch": 0.4966719302804373, + "grad_norm": 0.3531494140625, + "learning_rate": 0.0003180180786206759, + "loss": 4.5646, + "step": 1553 + }, + { + "epoch": 0.49699174478802294, + "grad_norm": 0.35843127965927124, + "learning_rate": 0.0003177077865533046, + "loss": 4.6221, + "step": 1554 + }, + { + "epoch": 0.49731155929560855, + "grad_norm": 0.35052454471588135, + "learning_rate": 0.00031739747547490584, + "loss": 4.611, + "step": 1555 + }, + { + "epoch": 0.49763137380319417, + "grad_norm": 0.364162802696228, + "learning_rate": 0.0003170871457186286, + "loss": 4.6817, + "step": 1556 + }, + { + "epoch": 0.4979511883107797, + "grad_norm": 0.35862216353416443, + "learning_rate": 0.0003167767976176419, + "loss": 4.485, + "step": 1557 + }, + { + "epoch": 0.49827100281836534, + "grad_norm": 0.35927194356918335, + "learning_rate": 0.0003164664315051347, + "loss": 4.6354, + "step": 1558 + }, + { + "epoch": 0.49859081732595095, + "grad_norm": 0.3603445887565613, + "learning_rate": 0.00031615604771431514, + "loss": 4.5751, + "step": 1559 + }, + { + "epoch": 0.49891063183353657, + "grad_norm": 0.37982043623924255, + "learning_rate": 0.00031584564657841015, + "loss": 4.6733, + "step": 1560 + }, + { + "epoch": 0.4992304463411221, + "grad_norm": 0.3557567000389099, + "learning_rate": 0.0003155352284306657, + "loss": 4.5831, + "step": 1561 + }, + { + "epoch": 0.49955026084870774, + "grad_norm": 0.35779282450675964, + "learning_rate": 0.00031522479360434567, + "loss": 4.6149, + "step": 1562 + }, + { + "epoch": 0.49987007535629335, + "grad_norm": 0.33635514974594116, + "learning_rate": 0.00031491434243273214, + "loss": 4.7235, + "step": 1563 + }, + { + "epoch": 0.5001898898638789, + "grad_norm": 0.38213077187538147, + "learning_rate": 0.00031460387524912437, + "loss": 4.6656, + "step": 1564 + }, + { + "epoch": 0.5005097043714646, + "grad_norm": 0.3529198467731476, + "learning_rate": 0.0003142933923868391, + "loss": 4.489, + "step": 1565 + }, + { + "epoch": 0.5008295188790501, + "grad_norm": 0.34217679500579834, + "learning_rate": 0.00031398289417920976, + "loss": 4.6375, + "step": 1566 + }, + { + "epoch": 0.5011493333866357, + "grad_norm": 0.37172558903694153, + "learning_rate": 0.00031367238095958644, + "loss": 4.6723, + "step": 1567 + }, + { + "epoch": 0.5014691478942214, + "grad_norm": 0.3407527208328247, + "learning_rate": 0.00031336185306133523, + "loss": 4.5211, + "step": 1568 + }, + { + "epoch": 0.5017889624018069, + "grad_norm": 0.36462146043777466, + "learning_rate": 0.0003130513108178378, + "loss": 4.5811, + "step": 1569 + }, + { + "epoch": 0.5021087769093926, + "grad_norm": 0.3509719669818878, + "learning_rate": 0.0003127407545624915, + "loss": 4.756, + "step": 1570 + }, + { + "epoch": 0.5024285914169782, + "grad_norm": 0.375997394323349, + "learning_rate": 0.0003124301846287085, + "loss": 4.6367, + "step": 1571 + }, + { + "epoch": 0.5027484059245637, + "grad_norm": 0.3393837809562683, + "learning_rate": 0.00031211960134991596, + "loss": 4.6095, + "step": 1572 + }, + { + "epoch": 0.5030682204321494, + "grad_norm": 0.3639393150806427, + "learning_rate": 0.00031180900505955496, + "loss": 4.7104, + "step": 1573 + }, + { + "epoch": 0.5033880349397349, + "grad_norm": 0.34335988759994507, + "learning_rate": 0.000311498396091081, + "loss": 4.6459, + "step": 1574 + }, + { + "epoch": 0.5037078494473205, + "grad_norm": 0.350800484418869, + "learning_rate": 0.00031118777477796275, + "loss": 4.6279, + "step": 1575 + }, + { + "epoch": 0.5040276639549062, + "grad_norm": 0.35865524411201477, + "learning_rate": 0.0003108771414536825, + "loss": 4.6563, + "step": 1576 + }, + { + "epoch": 0.5043474784624917, + "grad_norm": 0.346827894449234, + "learning_rate": 0.0003105664964517351, + "loss": 4.5878, + "step": 1577 + }, + { + "epoch": 0.5046672929700774, + "grad_norm": 0.34434160590171814, + "learning_rate": 0.0003102558401056282, + "loss": 4.5747, + "step": 1578 + }, + { + "epoch": 0.504987107477663, + "grad_norm": 0.358316570520401, + "learning_rate": 0.00030994517274888155, + "loss": 4.6122, + "step": 1579 + }, + { + "epoch": 0.5053069219852485, + "grad_norm": 0.3511278033256531, + "learning_rate": 0.00030963449471502674, + "loss": 4.6333, + "step": 1580 + }, + { + "epoch": 0.5056267364928342, + "grad_norm": 0.3433941900730133, + "learning_rate": 0.0003093238063376068, + "loss": 4.6083, + "step": 1581 + }, + { + "epoch": 0.5059465510004197, + "grad_norm": 0.34738221764564514, + "learning_rate": 0.00030901310795017567, + "loss": 4.6832, + "step": 1582 + }, + { + "epoch": 0.5062663655080054, + "grad_norm": 0.35411185026168823, + "learning_rate": 0.00030870239988629844, + "loss": 4.6223, + "step": 1583 + }, + { + "epoch": 0.506586180015591, + "grad_norm": 0.3453824818134308, + "learning_rate": 0.0003083916824795503, + "loss": 4.6439, + "step": 1584 + }, + { + "epoch": 0.5069059945231765, + "grad_norm": 0.35654762387275696, + "learning_rate": 0.0003080809560635165, + "loss": 4.5631, + "step": 1585 + }, + { + "epoch": 0.5072258090307622, + "grad_norm": 0.3844980299472809, + "learning_rate": 0.0003077702209717921, + "loss": 4.7018, + "step": 1586 + }, + { + "epoch": 0.5075456235383478, + "grad_norm": 0.3394923508167267, + "learning_rate": 0.0003074594775379812, + "loss": 4.5698, + "step": 1587 + }, + { + "epoch": 0.5078654380459333, + "grad_norm": 0.34746694564819336, + "learning_rate": 0.00030714872609569733, + "loss": 4.5785, + "step": 1588 + }, + { + "epoch": 0.508185252553519, + "grad_norm": 0.3709687292575836, + "learning_rate": 0.0003068379669785622, + "loss": 4.6214, + "step": 1589 + }, + { + "epoch": 0.5085050670611045, + "grad_norm": 0.3458937108516693, + "learning_rate": 0.0003065272005202056, + "loss": 4.5854, + "step": 1590 + }, + { + "epoch": 0.5088248815686902, + "grad_norm": 0.37032395601272583, + "learning_rate": 0.00030621642705426586, + "loss": 4.6531, + "step": 1591 + }, + { + "epoch": 0.5091446960762758, + "grad_norm": 0.35681501030921936, + "learning_rate": 0.0003059056469143884, + "loss": 4.5877, + "step": 1592 + }, + { + "epoch": 0.5094645105838613, + "grad_norm": 0.34055960178375244, + "learning_rate": 0.0003055948604342257, + "loss": 4.5802, + "step": 1593 + }, + { + "epoch": 0.509784325091447, + "grad_norm": 0.3622954785823822, + "learning_rate": 0.0003052840679474373, + "loss": 4.5284, + "step": 1594 + }, + { + "epoch": 0.5101041395990326, + "grad_norm": 0.33317726850509644, + "learning_rate": 0.0003049732697876891, + "loss": 4.6278, + "step": 1595 + }, + { + "epoch": 0.5104239541066181, + "grad_norm": 0.3395687937736511, + "learning_rate": 0.000304662466288653, + "loss": 4.6818, + "step": 1596 + }, + { + "epoch": 0.5107437686142038, + "grad_norm": 0.3536480963230133, + "learning_rate": 0.000304351657784007, + "loss": 4.5842, + "step": 1597 + }, + { + "epoch": 0.5110635831217893, + "grad_norm": 0.34140917658805847, + "learning_rate": 0.0003040408446074339, + "loss": 4.7312, + "step": 1598 + }, + { + "epoch": 0.511383397629375, + "grad_norm": 0.33931636810302734, + "learning_rate": 0.000303730027092622, + "loss": 4.5617, + "step": 1599 + }, + { + "epoch": 0.5117032121369606, + "grad_norm": 0.336770236492157, + "learning_rate": 0.00030341920557326385, + "loss": 4.5814, + "step": 1600 + }, + { + "epoch": 0.5117032121369606, + "eval_loss": 4.6179680824279785, + "eval_runtime": 80.4923, + "eval_samples_per_second": 23.567, + "eval_steps_per_second": 5.901, + "step": 1600 + }, + { + "epoch": 0.5120230266445461, + "grad_norm": 0.3277044892311096, + "learning_rate": 0.0003031083803830567, + "loss": 4.5805, + "step": 1601 + }, + { + "epoch": 0.5123428411521318, + "grad_norm": 0.34600555896759033, + "learning_rate": 0.0003027975518557016, + "loss": 4.6399, + "step": 1602 + }, + { + "epoch": 0.5126626556597174, + "grad_norm": 0.34565469622612, + "learning_rate": 0.00030248672032490295, + "loss": 4.6341, + "step": 1603 + }, + { + "epoch": 0.5129824701673029, + "grad_norm": 0.3367885649204254, + "learning_rate": 0.0003021758861243688, + "loss": 4.6379, + "step": 1604 + }, + { + "epoch": 0.5133022846748886, + "grad_norm": 0.3366295099258423, + "learning_rate": 0.0003018650495878096, + "loss": 4.6036, + "step": 1605 + }, + { + "epoch": 0.5136220991824741, + "grad_norm": 0.34721916913986206, + "learning_rate": 0.0003015542110489387, + "loss": 4.5726, + "step": 1606 + }, + { + "epoch": 0.5139419136900598, + "grad_norm": 0.33897995948791504, + "learning_rate": 0.00030124337084147144, + "loss": 4.5874, + "step": 1607 + }, + { + "epoch": 0.5142617281976454, + "grad_norm": 0.3487263023853302, + "learning_rate": 0.0003009325292991247, + "loss": 4.5789, + "step": 1608 + }, + { + "epoch": 0.5145815427052309, + "grad_norm": 0.3403370678424835, + "learning_rate": 0.0003006216867556175, + "loss": 4.625, + "step": 1609 + }, + { + "epoch": 0.5149013572128166, + "grad_norm": 0.33869045972824097, + "learning_rate": 0.00030031084354466904, + "loss": 4.5981, + "step": 1610 + }, + { + "epoch": 0.5152211717204022, + "grad_norm": 0.34933406114578247, + "learning_rate": 0.0003, + "loss": 4.6767, + "step": 1611 + }, + { + "epoch": 0.5155409862279877, + "grad_norm": 0.3558529317378998, + "learning_rate": 0.00029968915645533085, + "loss": 4.6249, + "step": 1612 + }, + { + "epoch": 0.5158608007355734, + "grad_norm": 0.3501514196395874, + "learning_rate": 0.0002993783132443825, + "loss": 4.5925, + "step": 1613 + }, + { + "epoch": 0.516180615243159, + "grad_norm": 0.34425514936447144, + "learning_rate": 0.0002990674707008752, + "loss": 4.6, + "step": 1614 + }, + { + "epoch": 0.5165004297507446, + "grad_norm": 0.3402040898799896, + "learning_rate": 0.0002987566291585286, + "loss": 4.6288, + "step": 1615 + }, + { + "epoch": 0.5168202442583302, + "grad_norm": 0.33086714148521423, + "learning_rate": 0.00029844578895106127, + "loss": 4.5629, + "step": 1616 + }, + { + "epoch": 0.5171400587659157, + "grad_norm": 0.35298269987106323, + "learning_rate": 0.0002981349504121904, + "loss": 4.5526, + "step": 1617 + }, + { + "epoch": 0.5174598732735014, + "grad_norm": 0.3435682952404022, + "learning_rate": 0.0002978241138756312, + "loss": 4.5722, + "step": 1618 + }, + { + "epoch": 0.517779687781087, + "grad_norm": 0.34460437297821045, + "learning_rate": 0.00029751327967509695, + "loss": 4.5969, + "step": 1619 + }, + { + "epoch": 0.5180995022886725, + "grad_norm": 0.3217136561870575, + "learning_rate": 0.0002972024481442984, + "loss": 4.5586, + "step": 1620 + }, + { + "epoch": 0.5184193167962582, + "grad_norm": 0.36733126640319824, + "learning_rate": 0.00029689161961694323, + "loss": 4.5857, + "step": 1621 + }, + { + "epoch": 0.5187391313038437, + "grad_norm": 0.3302042484283447, + "learning_rate": 0.00029658079442673616, + "loss": 4.6349, + "step": 1622 + }, + { + "epoch": 0.5190589458114294, + "grad_norm": 0.3308677673339844, + "learning_rate": 0.000296269972907378, + "loss": 4.5493, + "step": 1623 + }, + { + "epoch": 0.519378760319015, + "grad_norm": 0.34367772936820984, + "learning_rate": 0.00029595915539256605, + "loss": 4.6464, + "step": 1624 + }, + { + "epoch": 0.5196985748266005, + "grad_norm": 0.37406760454177856, + "learning_rate": 0.000295648342215993, + "loss": 4.6505, + "step": 1625 + }, + { + "epoch": 0.5200183893341862, + "grad_norm": 0.3676385283470154, + "learning_rate": 0.0002953375337113468, + "loss": 4.5778, + "step": 1626 + }, + { + "epoch": 0.5203382038417718, + "grad_norm": 0.3593791127204895, + "learning_rate": 0.00029502673021231096, + "loss": 4.6247, + "step": 1627 + }, + { + "epoch": 0.5206580183493573, + "grad_norm": 0.34721025824546814, + "learning_rate": 0.0002947159320525627, + "loss": 4.6347, + "step": 1628 + }, + { + "epoch": 0.520977832856943, + "grad_norm": 0.36313459277153015, + "learning_rate": 0.0002944051395657744, + "loss": 4.5717, + "step": 1629 + }, + { + "epoch": 0.5212976473645285, + "grad_norm": 0.34549617767333984, + "learning_rate": 0.0002940943530856116, + "loss": 4.6189, + "step": 1630 + }, + { + "epoch": 0.5216174618721142, + "grad_norm": 0.34426239132881165, + "learning_rate": 0.00029378357294573403, + "loss": 4.5404, + "step": 1631 + }, + { + "epoch": 0.5219372763796998, + "grad_norm": 0.3499269187450409, + "learning_rate": 0.0002934727994797944, + "loss": 4.6828, + "step": 1632 + }, + { + "epoch": 0.5222570908872853, + "grad_norm": 0.33449286222457886, + "learning_rate": 0.0002931620330214378, + "loss": 4.6162, + "step": 1633 + }, + { + "epoch": 0.522576905394871, + "grad_norm": 0.3505636751651764, + "learning_rate": 0.00029285127390430273, + "loss": 4.5799, + "step": 1634 + }, + { + "epoch": 0.5228967199024566, + "grad_norm": 0.3406105637550354, + "learning_rate": 0.00029254052246201873, + "loss": 4.6334, + "step": 1635 + }, + { + "epoch": 0.5232165344100421, + "grad_norm": 0.3492126762866974, + "learning_rate": 0.00029222977902820785, + "loss": 4.6696, + "step": 1636 + }, + { + "epoch": 0.5235363489176278, + "grad_norm": 0.3455886244773865, + "learning_rate": 0.0002919190439364835, + "loss": 4.5658, + "step": 1637 + }, + { + "epoch": 0.5238561634252134, + "grad_norm": 0.3499142825603485, + "learning_rate": 0.00029160831752044966, + "loss": 4.6524, + "step": 1638 + }, + { + "epoch": 0.524175977932799, + "grad_norm": 0.34150341153144836, + "learning_rate": 0.00029129760011370156, + "loss": 4.5589, + "step": 1639 + }, + { + "epoch": 0.5244957924403846, + "grad_norm": 0.36174193024635315, + "learning_rate": 0.00029098689204982433, + "loss": 4.6428, + "step": 1640 + }, + { + "epoch": 0.5248156069479701, + "grad_norm": 0.3357899785041809, + "learning_rate": 0.00029067619366239327, + "loss": 4.5669, + "step": 1641 + }, + { + "epoch": 0.5251354214555558, + "grad_norm": 0.3400886058807373, + "learning_rate": 0.00029036550528497326, + "loss": 4.5824, + "step": 1642 + }, + { + "epoch": 0.5254552359631414, + "grad_norm": 0.32499825954437256, + "learning_rate": 0.0002900548272511183, + "loss": 4.6356, + "step": 1643 + }, + { + "epoch": 0.5257750504707269, + "grad_norm": 0.34948495030403137, + "learning_rate": 0.00028974415989437176, + "loss": 4.5922, + "step": 1644 + }, + { + "epoch": 0.5260948649783126, + "grad_norm": 0.3675726056098938, + "learning_rate": 0.0002894335035482649, + "loss": 4.6599, + "step": 1645 + }, + { + "epoch": 0.5264146794858982, + "grad_norm": 0.3842601180076599, + "learning_rate": 0.00028912285854631754, + "loss": 4.552, + "step": 1646 + }, + { + "epoch": 0.5267344939934838, + "grad_norm": 0.33656397461891174, + "learning_rate": 0.0002888122252220372, + "loss": 4.5866, + "step": 1647 + }, + { + "epoch": 0.5270543085010694, + "grad_norm": 0.3506307005882263, + "learning_rate": 0.00028850160390891895, + "loss": 4.6307, + "step": 1648 + }, + { + "epoch": 0.5273741230086549, + "grad_norm": 0.40038666129112244, + "learning_rate": 0.000288190994940445, + "loss": 4.5926, + "step": 1649 + }, + { + "epoch": 0.5276939375162406, + "grad_norm": 0.350915789604187, + "learning_rate": 0.00028788039865008404, + "loss": 4.664, + "step": 1650 + }, + { + "epoch": 0.5280137520238262, + "grad_norm": 0.34243419766426086, + "learning_rate": 0.0002875698153712915, + "loss": 4.5545, + "step": 1651 + }, + { + "epoch": 0.5283335665314118, + "grad_norm": 0.33768656849861145, + "learning_rate": 0.0002872592454375086, + "loss": 4.5719, + "step": 1652 + }, + { + "epoch": 0.5286533810389974, + "grad_norm": 0.34391385316848755, + "learning_rate": 0.00028694868918216227, + "loss": 4.5932, + "step": 1653 + }, + { + "epoch": 0.528973195546583, + "grad_norm": 0.3686625361442566, + "learning_rate": 0.0002866381469386648, + "loss": 4.6416, + "step": 1654 + }, + { + "epoch": 0.5292930100541686, + "grad_norm": 0.3312521278858185, + "learning_rate": 0.0002863276190404135, + "loss": 4.5455, + "step": 1655 + }, + { + "epoch": 0.5296128245617542, + "grad_norm": 0.3539404273033142, + "learning_rate": 0.0002860171058207902, + "loss": 4.6432, + "step": 1656 + }, + { + "epoch": 0.5299326390693397, + "grad_norm": 0.34247922897338867, + "learning_rate": 0.0002857066076131609, + "loss": 4.5633, + "step": 1657 + }, + { + "epoch": 0.5302524535769254, + "grad_norm": 0.3436299264431, + "learning_rate": 0.00028539612475087563, + "loss": 4.5739, + "step": 1658 + }, + { + "epoch": 0.530572268084511, + "grad_norm": 0.36008891463279724, + "learning_rate": 0.0002850856575672679, + "loss": 4.5924, + "step": 1659 + }, + { + "epoch": 0.5308920825920966, + "grad_norm": 0.33665239810943604, + "learning_rate": 0.0002847752063956543, + "loss": 4.5593, + "step": 1660 + }, + { + "epoch": 0.5312118970996822, + "grad_norm": 0.35827916860580444, + "learning_rate": 0.00028446477156933425, + "loss": 4.5354, + "step": 1661 + }, + { + "epoch": 0.5315317116072678, + "grad_norm": 0.33464479446411133, + "learning_rate": 0.0002841543534215898, + "loss": 4.6346, + "step": 1662 + }, + { + "epoch": 0.5318515261148534, + "grad_norm": 0.3329927623271942, + "learning_rate": 0.0002838439522856849, + "loss": 4.5882, + "step": 1663 + }, + { + "epoch": 0.532171340622439, + "grad_norm": 0.34070804715156555, + "learning_rate": 0.00028353356849486526, + "loss": 4.571, + "step": 1664 + }, + { + "epoch": 0.5324911551300245, + "grad_norm": 0.339232861995697, + "learning_rate": 0.00028322320238235814, + "loss": 4.5275, + "step": 1665 + }, + { + "epoch": 0.5328109696376102, + "grad_norm": 0.3289678990840912, + "learning_rate": 0.00028291285428137146, + "loss": 4.6546, + "step": 1666 + }, + { + "epoch": 0.5331307841451958, + "grad_norm": 0.36252179741859436, + "learning_rate": 0.0002826025245250941, + "loss": 4.5838, + "step": 1667 + }, + { + "epoch": 0.5334505986527814, + "grad_norm": 0.3532697856426239, + "learning_rate": 0.00028229221344669534, + "loss": 4.6553, + "step": 1668 + }, + { + "epoch": 0.533770413160367, + "grad_norm": 0.33651626110076904, + "learning_rate": 0.0002819819213793241, + "loss": 4.6096, + "step": 1669 + }, + { + "epoch": 0.5340902276679526, + "grad_norm": 0.36147356033325195, + "learning_rate": 0.00028167164865610907, + "loss": 4.6065, + "step": 1670 + }, + { + "epoch": 0.5344100421755382, + "grad_norm": 0.3495180904865265, + "learning_rate": 0.00028136139561015807, + "loss": 4.5254, + "step": 1671 + }, + { + "epoch": 0.5347298566831238, + "grad_norm": 0.3535555899143219, + "learning_rate": 0.00028105116257455786, + "loss": 4.5102, + "step": 1672 + }, + { + "epoch": 0.5350496711907093, + "grad_norm": 0.35404103994369507, + "learning_rate": 0.00028074094988237385, + "loss": 4.5907, + "step": 1673 + }, + { + "epoch": 0.535369485698295, + "grad_norm": 0.3414469361305237, + "learning_rate": 0.00028043075786664934, + "loss": 4.5869, + "step": 1674 + }, + { + "epoch": 0.5356893002058806, + "grad_norm": 0.35111403465270996, + "learning_rate": 0.0002801205868604057, + "loss": 4.5031, + "step": 1675 + }, + { + "epoch": 0.5360091147134662, + "grad_norm": 0.3515930473804474, + "learning_rate": 0.0002798104371966414, + "loss": 4.5849, + "step": 1676 + }, + { + "epoch": 0.5363289292210518, + "grad_norm": 0.34862038493156433, + "learning_rate": 0.0002795003092083324, + "loss": 4.6823, + "step": 1677 + }, + { + "epoch": 0.5366487437286374, + "grad_norm": 0.3787916302680969, + "learning_rate": 0.000279190203228431, + "loss": 4.5382, + "step": 1678 + }, + { + "epoch": 0.536968558236223, + "grad_norm": 0.3493860960006714, + "learning_rate": 0.00027888011958986623, + "loss": 4.6592, + "step": 1679 + }, + { + "epoch": 0.5372883727438086, + "grad_norm": 0.35074809193611145, + "learning_rate": 0.00027857005862554307, + "loss": 4.5425, + "step": 1680 + }, + { + "epoch": 0.5376081872513941, + "grad_norm": 0.35839346051216125, + "learning_rate": 0.00027826002066834167, + "loss": 4.6724, + "step": 1681 + }, + { + "epoch": 0.5379280017589798, + "grad_norm": 0.3281191885471344, + "learning_rate": 0.0002779500060511184, + "loss": 4.5329, + "step": 1682 + }, + { + "epoch": 0.5382478162665654, + "grad_norm": 0.36021775007247925, + "learning_rate": 0.00027764001510670354, + "loss": 4.4596, + "step": 1683 + }, + { + "epoch": 0.538567630774151, + "grad_norm": 0.36002200841903687, + "learning_rate": 0.00027733004816790267, + "loss": 4.5739, + "step": 1684 + }, + { + "epoch": 0.5388874452817366, + "grad_norm": 0.33183857798576355, + "learning_rate": 0.00027702010556749556, + "loss": 4.5799, + "step": 1685 + }, + { + "epoch": 0.5392072597893222, + "grad_norm": 0.3585663437843323, + "learning_rate": 0.0002767101876382353, + "loss": 4.5949, + "step": 1686 + }, + { + "epoch": 0.5395270742969078, + "grad_norm": 0.36853551864624023, + "learning_rate": 0.00027640029471284923, + "loss": 4.553, + "step": 1687 + }, + { + "epoch": 0.5398468888044934, + "grad_norm": 0.33454397320747375, + "learning_rate": 0.00027609042712403725, + "loss": 4.6197, + "step": 1688 + }, + { + "epoch": 0.5401667033120789, + "grad_norm": 0.36533334851264954, + "learning_rate": 0.0002757805852044724, + "loss": 4.5755, + "step": 1689 + }, + { + "epoch": 0.5404865178196646, + "grad_norm": 0.3548142611980438, + "learning_rate": 0.00027547076928679994, + "loss": 4.5953, + "step": 1690 + }, + { + "epoch": 0.5408063323272502, + "grad_norm": 0.33640915155410767, + "learning_rate": 0.0002751609797036373, + "loss": 4.5421, + "step": 1691 + }, + { + "epoch": 0.5411261468348358, + "grad_norm": 0.3413470685482025, + "learning_rate": 0.00027485121678757406, + "loss": 4.6046, + "step": 1692 + }, + { + "epoch": 0.5414459613424214, + "grad_norm": 0.34951040148735046, + "learning_rate": 0.0002745414808711703, + "loss": 4.5637, + "step": 1693 + }, + { + "epoch": 0.541765775850007, + "grad_norm": 0.3462730348110199, + "learning_rate": 0.00027423177228695785, + "loss": 4.5405, + "step": 1694 + }, + { + "epoch": 0.5420855903575926, + "grad_norm": 0.3536907732486725, + "learning_rate": 0.00027392209136743875, + "loss": 4.5413, + "step": 1695 + }, + { + "epoch": 0.5424054048651782, + "grad_norm": 0.3455154299736023, + "learning_rate": 0.0002736124384450857, + "loss": 4.5782, + "step": 1696 + }, + { + "epoch": 0.5427252193727637, + "grad_norm": 0.35326552391052246, + "learning_rate": 0.0002733028138523411, + "loss": 4.5603, + "step": 1697 + }, + { + "epoch": 0.5430450338803494, + "grad_norm": 0.35846778750419617, + "learning_rate": 0.0002729932179216169, + "loss": 4.5149, + "step": 1698 + }, + { + "epoch": 0.543364848387935, + "grad_norm": 0.34490907192230225, + "learning_rate": 0.0002726836509852946, + "loss": 4.5543, + "step": 1699 + }, + { + "epoch": 0.5436846628955206, + "grad_norm": 0.35876429080963135, + "learning_rate": 0.0002723741133757242, + "loss": 4.5607, + "step": 1700 + }, + { + "epoch": 0.5436846628955206, + "eval_loss": 4.5858354568481445, + "eval_runtime": 80.5895, + "eval_samples_per_second": 23.539, + "eval_steps_per_second": 5.894, + "step": 1700 + }, + { + "epoch": 0.5440044774031062, + "grad_norm": 0.34980279207229614, + "learning_rate": 0.0002720646054252244, + "loss": 4.6041, + "step": 1701 + }, + { + "epoch": 0.5443242919106918, + "grad_norm": 0.3580838441848755, + "learning_rate": 0.000271755127466082, + "loss": 4.6248, + "step": 1702 + }, + { + "epoch": 0.5446441064182774, + "grad_norm": 0.34394776821136475, + "learning_rate": 0.0002714456798305516, + "loss": 4.6201, + "step": 1703 + }, + { + "epoch": 0.544963920925863, + "grad_norm": 0.35680657625198364, + "learning_rate": 0.0002711362628508554, + "loss": 4.5247, + "step": 1704 + }, + { + "epoch": 0.5452837354334485, + "grad_norm": 0.33691632747650146, + "learning_rate": 0.0002708268768591825, + "loss": 4.5839, + "step": 1705 + }, + { + "epoch": 0.5456035499410342, + "grad_norm": 0.35764801502227783, + "learning_rate": 0.0002705175221876887, + "loss": 4.6062, + "step": 1706 + }, + { + "epoch": 0.5459233644486198, + "grad_norm": 0.37481939792633057, + "learning_rate": 0.00027020819916849634, + "loss": 4.5672, + "step": 1707 + }, + { + "epoch": 0.5462431789562054, + "grad_norm": 0.347343772649765, + "learning_rate": 0.0002698989081336937, + "loss": 4.5451, + "step": 1708 + }, + { + "epoch": 0.546562993463791, + "grad_norm": 0.3330977261066437, + "learning_rate": 0.00026958964941533475, + "loss": 4.5537, + "step": 1709 + }, + { + "epoch": 0.5468828079713766, + "grad_norm": 0.35509002208709717, + "learning_rate": 0.00026928042334543867, + "loss": 4.5283, + "step": 1710 + }, + { + "epoch": 0.5472026224789622, + "grad_norm": 0.3351483941078186, + "learning_rate": 0.0002689712302559898, + "loss": 4.6409, + "step": 1711 + }, + { + "epoch": 0.5475224369865478, + "grad_norm": 0.3378784954547882, + "learning_rate": 0.0002686620704789367, + "loss": 4.5829, + "step": 1712 + }, + { + "epoch": 0.5478422514941333, + "grad_norm": 0.3531706929206848, + "learning_rate": 0.00026835294434619263, + "loss": 4.5944, + "step": 1713 + }, + { + "epoch": 0.548162066001719, + "grad_norm": 0.3312965929508209, + "learning_rate": 0.0002680438521896343, + "loss": 4.6036, + "step": 1714 + }, + { + "epoch": 0.5484818805093046, + "grad_norm": 0.3529874086380005, + "learning_rate": 0.0002677347943411023, + "loss": 4.6288, + "step": 1715 + }, + { + "epoch": 0.5488016950168902, + "grad_norm": 0.34308186173439026, + "learning_rate": 0.0002674257711324002, + "loss": 4.544, + "step": 1716 + }, + { + "epoch": 0.5491215095244758, + "grad_norm": 0.33933305740356445, + "learning_rate": 0.0002671167828952943, + "loss": 4.5742, + "step": 1717 + }, + { + "epoch": 0.5494413240320614, + "grad_norm": 0.3590443730354309, + "learning_rate": 0.0002668078299615136, + "loss": 4.5491, + "step": 1718 + }, + { + "epoch": 0.549761138539647, + "grad_norm": 0.3418152332305908, + "learning_rate": 0.0002664989126627489, + "loss": 4.6603, + "step": 1719 + }, + { + "epoch": 0.5500809530472326, + "grad_norm": 0.33432820439338684, + "learning_rate": 0.00026619003133065305, + "loss": 4.6488, + "step": 1720 + }, + { + "epoch": 0.5504007675548183, + "grad_norm": 0.36660119891166687, + "learning_rate": 0.0002658811862968401, + "loss": 4.5412, + "step": 1721 + }, + { + "epoch": 0.5507205820624038, + "grad_norm": 0.34112343192100525, + "learning_rate": 0.0002655723778928851, + "loss": 4.5507, + "step": 1722 + }, + { + "epoch": 0.5510403965699894, + "grad_norm": 0.3553185760974884, + "learning_rate": 0.00026526360645032405, + "loss": 4.5742, + "step": 1723 + }, + { + "epoch": 0.551360211077575, + "grad_norm": 0.34461116790771484, + "learning_rate": 0.0002649548723006527, + "loss": 4.6606, + "step": 1724 + }, + { + "epoch": 0.5516800255851606, + "grad_norm": 0.3275977671146393, + "learning_rate": 0.00026464617577532756, + "loss": 4.6868, + "step": 1725 + }, + { + "epoch": 0.5519998400927462, + "grad_norm": 0.340777188539505, + "learning_rate": 0.00026433751720576385, + "loss": 4.5074, + "step": 1726 + }, + { + "epoch": 0.5523196546003318, + "grad_norm": 0.3257087469100952, + "learning_rate": 0.00026402889692333684, + "loss": 4.5068, + "step": 1727 + }, + { + "epoch": 0.5526394691079174, + "grad_norm": 0.3451725244522095, + "learning_rate": 0.00026372031525938034, + "loss": 4.4852, + "step": 1728 + }, + { + "epoch": 0.5529592836155031, + "grad_norm": 0.3478621542453766, + "learning_rate": 0.0002634117725451865, + "loss": 4.5435, + "step": 1729 + }, + { + "epoch": 0.5532790981230886, + "grad_norm": 0.33903783559799194, + "learning_rate": 0.00026310326911200616, + "loss": 4.5651, + "step": 1730 + }, + { + "epoch": 0.5535989126306742, + "grad_norm": 0.3416825830936432, + "learning_rate": 0.0002627948052910474, + "loss": 4.6645, + "step": 1731 + }, + { + "epoch": 0.5539187271382598, + "grad_norm": 0.3245784044265747, + "learning_rate": 0.00026248638141347614, + "loss": 4.538, + "step": 1732 + }, + { + "epoch": 0.5542385416458454, + "grad_norm": 0.36666932702064514, + "learning_rate": 0.00026217799781041567, + "loss": 4.6333, + "step": 1733 + }, + { + "epoch": 0.554558356153431, + "grad_norm": 0.33497753739356995, + "learning_rate": 0.00026186965481294526, + "loss": 4.4563, + "step": 1734 + }, + { + "epoch": 0.5548781706610166, + "grad_norm": 0.34304019808769226, + "learning_rate": 0.0002615613527521014, + "loss": 4.5803, + "step": 1735 + }, + { + "epoch": 0.5551979851686022, + "grad_norm": 0.3408200144767761, + "learning_rate": 0.00026125309195887603, + "loss": 4.5838, + "step": 1736 + }, + { + "epoch": 0.5555177996761879, + "grad_norm": 0.36223822832107544, + "learning_rate": 0.0002609448727642172, + "loss": 4.5038, + "step": 1737 + }, + { + "epoch": 0.5558376141837734, + "grad_norm": 0.3321765959262848, + "learning_rate": 0.000260636695499028, + "loss": 4.5712, + "step": 1738 + }, + { + "epoch": 0.556157428691359, + "grad_norm": 0.33548539876937866, + "learning_rate": 0.00026032856049416664, + "loss": 4.5174, + "step": 1739 + }, + { + "epoch": 0.5564772431989446, + "grad_norm": 0.3426455855369568, + "learning_rate": 0.00026002046808044617, + "loss": 4.6059, + "step": 1740 + }, + { + "epoch": 0.5567970577065302, + "grad_norm": 0.34694841504096985, + "learning_rate": 0.0002597124185886334, + "loss": 4.6188, + "step": 1741 + }, + { + "epoch": 0.5571168722141158, + "grad_norm": 0.330301970243454, + "learning_rate": 0.0002594044123494496, + "loss": 4.558, + "step": 1742 + }, + { + "epoch": 0.5574366867217014, + "grad_norm": 0.3402824401855469, + "learning_rate": 0.00025909644969356917, + "loss": 4.5588, + "step": 1743 + }, + { + "epoch": 0.557756501229287, + "grad_norm": 0.3349056541919708, + "learning_rate": 0.00025878853095162, + "loss": 4.5558, + "step": 1744 + }, + { + "epoch": 0.5580763157368727, + "grad_norm": 0.3273567259311676, + "learning_rate": 0.00025848065645418285, + "loss": 4.4886, + "step": 1745 + }, + { + "epoch": 0.5583961302444582, + "grad_norm": 0.37428098917007446, + "learning_rate": 0.00025817282653179064, + "loss": 4.6228, + "step": 1746 + }, + { + "epoch": 0.5587159447520438, + "grad_norm": 0.33602315187454224, + "learning_rate": 0.0002578650415149289, + "loss": 4.5614, + "step": 1747 + }, + { + "epoch": 0.5590357592596295, + "grad_norm": 0.37221935391426086, + "learning_rate": 0.00025755730173403457, + "loss": 4.5068, + "step": 1748 + }, + { + "epoch": 0.559355573767215, + "grad_norm": 0.34931477904319763, + "learning_rate": 0.0002572496075194963, + "loss": 4.5419, + "step": 1749 + }, + { + "epoch": 0.5596753882748006, + "grad_norm": 0.33774423599243164, + "learning_rate": 0.00025694195920165344, + "loss": 4.6304, + "step": 1750 + }, + { + "epoch": 0.5599952027823862, + "grad_norm": 0.355263888835907, + "learning_rate": 0.0002566343571107966, + "loss": 4.5588, + "step": 1751 + }, + { + "epoch": 0.5603150172899718, + "grad_norm": 0.34730982780456543, + "learning_rate": 0.0002563268015771664, + "loss": 4.5754, + "step": 1752 + }, + { + "epoch": 0.5606348317975575, + "grad_norm": 0.32993969321250916, + "learning_rate": 0.00025601929293095344, + "loss": 4.5407, + "step": 1753 + }, + { + "epoch": 0.560954646305143, + "grad_norm": 0.3502591848373413, + "learning_rate": 0.00025571183150229827, + "loss": 4.5203, + "step": 1754 + }, + { + "epoch": 0.5612744608127286, + "grad_norm": 0.38740217685699463, + "learning_rate": 0.00025540441762129045, + "loss": 4.5163, + "step": 1755 + }, + { + "epoch": 0.5615942753203143, + "grad_norm": 0.34605130553245544, + "learning_rate": 0.00025509705161796866, + "loss": 4.5708, + "step": 1756 + }, + { + "epoch": 0.5619140898278998, + "grad_norm": 0.3458990156650543, + "learning_rate": 0.0002547897338223202, + "loss": 4.5134, + "step": 1757 + }, + { + "epoch": 0.5622339043354854, + "grad_norm": 0.3720369338989258, + "learning_rate": 0.0002544824645642804, + "loss": 4.5965, + "step": 1758 + }, + { + "epoch": 0.562553718843071, + "grad_norm": 0.3522976040840149, + "learning_rate": 0.00025417524417373276, + "loss": 4.5462, + "step": 1759 + }, + { + "epoch": 0.5628735333506566, + "grad_norm": 0.35671481490135193, + "learning_rate": 0.00025386807298050817, + "loss": 4.6235, + "step": 1760 + }, + { + "epoch": 0.5631933478582423, + "grad_norm": 0.35742253065109253, + "learning_rate": 0.00025356095131438464, + "loss": 4.6114, + "step": 1761 + }, + { + "epoch": 0.5635131623658278, + "grad_norm": 0.36183950304985046, + "learning_rate": 0.0002532538795050872, + "loss": 4.6162, + "step": 1762 + }, + { + "epoch": 0.5638329768734134, + "grad_norm": 0.33316004276275635, + "learning_rate": 0.0002529468578822871, + "loss": 4.5738, + "step": 1763 + }, + { + "epoch": 0.564152791380999, + "grad_norm": 0.3625221252441406, + "learning_rate": 0.00025263988677560204, + "loss": 4.4923, + "step": 1764 + }, + { + "epoch": 0.5644726058885846, + "grad_norm": 0.3555978536605835, + "learning_rate": 0.0002523329665145951, + "loss": 4.4812, + "step": 1765 + }, + { + "epoch": 0.5647924203961702, + "grad_norm": 0.38111501932144165, + "learning_rate": 0.00025202609742877515, + "loss": 4.5462, + "step": 1766 + }, + { + "epoch": 0.5651122349037558, + "grad_norm": 0.37184572219848633, + "learning_rate": 0.0002517192798475958, + "loss": 4.4111, + "step": 1767 + }, + { + "epoch": 0.5654320494113414, + "grad_norm": 0.3523752689361572, + "learning_rate": 0.00025141251410045547, + "loss": 4.4912, + "step": 1768 + }, + { + "epoch": 0.5657518639189271, + "grad_norm": 0.3493099808692932, + "learning_rate": 0.0002511058005166972, + "loss": 4.4431, + "step": 1769 + }, + { + "epoch": 0.5660716784265126, + "grad_norm": 0.3703976571559906, + "learning_rate": 0.0002507991394256075, + "loss": 4.5189, + "step": 1770 + }, + { + "epoch": 0.5663914929340982, + "grad_norm": 0.3478959798812866, + "learning_rate": 0.00025049253115641713, + "loss": 4.5718, + "step": 1771 + }, + { + "epoch": 0.5667113074416839, + "grad_norm": 0.3373987674713135, + "learning_rate": 0.00025018597603829944, + "loss": 4.5871, + "step": 1772 + }, + { + "epoch": 0.5670311219492694, + "grad_norm": 0.371184378862381, + "learning_rate": 0.0002498794744003715, + "loss": 4.5144, + "step": 1773 + }, + { + "epoch": 0.567350936456855, + "grad_norm": 0.35939913988113403, + "learning_rate": 0.0002495730265716922, + "loss": 4.5341, + "step": 1774 + }, + { + "epoch": 0.5676707509644406, + "grad_norm": 0.3704868257045746, + "learning_rate": 0.00024926663288126323, + "loss": 4.5542, + "step": 1775 + }, + { + "epoch": 0.5679905654720262, + "grad_norm": 0.3416334390640259, + "learning_rate": 0.00024896029365802807, + "loss": 4.4877, + "step": 1776 + }, + { + "epoch": 0.5683103799796119, + "grad_norm": 0.36683937907218933, + "learning_rate": 0.0002486540092308713, + "loss": 4.6201, + "step": 1777 + }, + { + "epoch": 0.5686301944871974, + "grad_norm": 0.34621694684028625, + "learning_rate": 0.00024834777992861935, + "loss": 4.4863, + "step": 1778 + }, + { + "epoch": 0.568950008994783, + "grad_norm": 0.34777936339378357, + "learning_rate": 0.0002480416060800387, + "loss": 4.5469, + "step": 1779 + }, + { + "epoch": 0.5692698235023687, + "grad_norm": 0.34788841009140015, + "learning_rate": 0.0002477354880138369, + "loss": 4.5659, + "step": 1780 + }, + { + "epoch": 0.5695896380099542, + "grad_norm": 0.3381100296974182, + "learning_rate": 0.0002474294260586615, + "loss": 4.5329, + "step": 1781 + }, + { + "epoch": 0.5699094525175398, + "grad_norm": 0.348910927772522, + "learning_rate": 0.00024712342054309945, + "loss": 4.5827, + "step": 1782 + }, + { + "epoch": 0.5702292670251254, + "grad_norm": 0.3376369774341583, + "learning_rate": 0.00024681747179567775, + "loss": 4.5388, + "step": 1783 + }, + { + "epoch": 0.570549081532711, + "grad_norm": 0.3344530761241913, + "learning_rate": 0.0002465115801448617, + "loss": 4.5565, + "step": 1784 + }, + { + "epoch": 0.5708688960402967, + "grad_norm": 0.3593077063560486, + "learning_rate": 0.0002462057459190559, + "loss": 4.5538, + "step": 1785 + }, + { + "epoch": 0.5711887105478822, + "grad_norm": 0.33933642506599426, + "learning_rate": 0.0002458999694466029, + "loss": 4.5417, + "step": 1786 + }, + { + "epoch": 0.5715085250554678, + "grad_norm": 0.3367229998111725, + "learning_rate": 0.0002455942510557836, + "loss": 4.5613, + "step": 1787 + }, + { + "epoch": 0.5718283395630535, + "grad_norm": 0.3529382646083832, + "learning_rate": 0.0002452885910748163, + "loss": 4.5653, + "step": 1788 + }, + { + "epoch": 0.572148154070639, + "grad_norm": 0.34044578671455383, + "learning_rate": 0.0002449829898318566, + "loss": 4.5278, + "step": 1789 + }, + { + "epoch": 0.5724679685782247, + "grad_norm": 0.3467525839805603, + "learning_rate": 0.0002446774476549972, + "loss": 4.57, + "step": 1790 + }, + { + "epoch": 0.5727877830858102, + "grad_norm": 0.344215989112854, + "learning_rate": 0.00024437196487226716, + "loss": 4.4932, + "step": 1791 + }, + { + "epoch": 0.5731075975933958, + "grad_norm": 0.3458828032016754, + "learning_rate": 0.00024406654181163197, + "loss": 4.5249, + "step": 1792 + }, + { + "epoch": 0.5734274121009815, + "grad_norm": 0.3492017984390259, + "learning_rate": 0.000243761178800993, + "loss": 4.5409, + "step": 1793 + }, + { + "epoch": 0.573747226608567, + "grad_norm": 0.3367227613925934, + "learning_rate": 0.00024345587616818692, + "loss": 4.4586, + "step": 1794 + }, + { + "epoch": 0.5740670411161526, + "grad_norm": 0.3362981379032135, + "learning_rate": 0.00024315063424098585, + "loss": 4.534, + "step": 1795 + }, + { + "epoch": 0.5743868556237383, + "grad_norm": 0.3493768870830536, + "learning_rate": 0.00024284545334709657, + "loss": 4.6799, + "step": 1796 + }, + { + "epoch": 0.5747066701313238, + "grad_norm": 0.35034531354904175, + "learning_rate": 0.00024254033381416047, + "loss": 4.4829, + "step": 1797 + }, + { + "epoch": 0.5750264846389095, + "grad_norm": 0.326031357049942, + "learning_rate": 0.00024223527596975284, + "loss": 4.5284, + "step": 1798 + }, + { + "epoch": 0.575346299146495, + "grad_norm": 0.3683282434940338, + "learning_rate": 0.000241930280141383, + "loss": 4.5942, + "step": 1799 + }, + { + "epoch": 0.5756661136540806, + "grad_norm": 0.3724258542060852, + "learning_rate": 0.00024162534665649358, + "loss": 4.62, + "step": 1800 + }, + { + "epoch": 0.5756661136540806, + "eval_loss": 4.554470062255859, + "eval_runtime": 97.1859, + "eval_samples_per_second": 19.519, + "eval_steps_per_second": 4.888, + "step": 1800 + }, + { + "epoch": 0.5759859281616663, + "grad_norm": 0.3389361500740051, + "learning_rate": 0.0002413204758424602, + "loss": 4.4803, + "step": 1801 + }, + { + "epoch": 0.5763057426692518, + "grad_norm": 0.3431956171989441, + "learning_rate": 0.00024101566802659137, + "loss": 4.5875, + "step": 1802 + }, + { + "epoch": 0.5766255571768374, + "grad_norm": 0.3464840352535248, + "learning_rate": 0.0002407109235361277, + "loss": 4.5123, + "step": 1803 + }, + { + "epoch": 0.5769453716844231, + "grad_norm": 0.3432200849056244, + "learning_rate": 0.0002404062426982421, + "loss": 4.5653, + "step": 1804 + }, + { + "epoch": 0.5772651861920086, + "grad_norm": 0.3582673668861389, + "learning_rate": 0.00024010162584003905, + "loss": 4.5433, + "step": 1805 + }, + { + "epoch": 0.5775850006995943, + "grad_norm": 0.3397902846336365, + "learning_rate": 0.0002397970732885542, + "loss": 4.6512, + "step": 1806 + }, + { + "epoch": 0.5779048152071798, + "grad_norm": 0.3517637252807617, + "learning_rate": 0.0002394925853707544, + "loss": 4.5119, + "step": 1807 + }, + { + "epoch": 0.5782246297147654, + "grad_norm": 0.3295648992061615, + "learning_rate": 0.00023918816241353684, + "loss": 4.5526, + "step": 1808 + }, + { + "epoch": 0.5785444442223511, + "grad_norm": 0.36371392011642456, + "learning_rate": 0.0002388838047437293, + "loss": 4.5532, + "step": 1809 + }, + { + "epoch": 0.5788642587299366, + "grad_norm": 0.38383764028549194, + "learning_rate": 0.00023857951268808905, + "loss": 4.5058, + "step": 1810 + }, + { + "epoch": 0.5791840732375222, + "grad_norm": 0.34932655096054077, + "learning_rate": 0.00023827528657330331, + "loss": 4.6644, + "step": 1811 + }, + { + "epoch": 0.5795038877451079, + "grad_norm": 0.3224482536315918, + "learning_rate": 0.00023797112672598833, + "loss": 4.5296, + "step": 1812 + }, + { + "epoch": 0.5798237022526934, + "grad_norm": 0.3408745527267456, + "learning_rate": 0.0002376670334726891, + "loss": 4.5024, + "step": 1813 + }, + { + "epoch": 0.5801435167602791, + "grad_norm": 0.3669995963573456, + "learning_rate": 0.00023736300713987946, + "loss": 4.6073, + "step": 1814 + }, + { + "epoch": 0.5804633312678646, + "grad_norm": 0.336173951625824, + "learning_rate": 0.00023705904805396095, + "loss": 4.5538, + "step": 1815 + }, + { + "epoch": 0.5807831457754502, + "grad_norm": 0.33492520451545715, + "learning_rate": 0.00023675515654126327, + "loss": 4.5486, + "step": 1816 + }, + { + "epoch": 0.5811029602830359, + "grad_norm": 0.3570772707462311, + "learning_rate": 0.00023645133292804352, + "loss": 4.5727, + "step": 1817 + }, + { + "epoch": 0.5814227747906214, + "grad_norm": 0.35234716534614563, + "learning_rate": 0.0002361475775404857, + "loss": 4.4869, + "step": 1818 + }, + { + "epoch": 0.581742589298207, + "grad_norm": 0.34041503071784973, + "learning_rate": 0.00023584389070470087, + "loss": 4.5867, + "step": 1819 + }, + { + "epoch": 0.5820624038057927, + "grad_norm": 0.3548620045185089, + "learning_rate": 0.000235540272746726, + "loss": 4.55, + "step": 1820 + }, + { + "epoch": 0.5823822183133782, + "grad_norm": 0.34405526518821716, + "learning_rate": 0.00023523672399252492, + "loss": 4.5571, + "step": 1821 + }, + { + "epoch": 0.5827020328209639, + "grad_norm": 0.33513495326042175, + "learning_rate": 0.00023493324476798618, + "loss": 4.5564, + "step": 1822 + }, + { + "epoch": 0.5830218473285494, + "grad_norm": 0.34113237261772156, + "learning_rate": 0.0002346298353989245, + "loss": 4.4995, + "step": 1823 + }, + { + "epoch": 0.583341661836135, + "grad_norm": 0.340901643037796, + "learning_rate": 0.00023432649621107928, + "loss": 4.6311, + "step": 1824 + }, + { + "epoch": 0.5836614763437207, + "grad_norm": 0.3488660156726837, + "learning_rate": 0.00023402322753011433, + "loss": 4.4622, + "step": 1825 + }, + { + "epoch": 0.5839812908513062, + "grad_norm": 0.3383903503417969, + "learning_rate": 0.0002337200296816184, + "loss": 4.5033, + "step": 1826 + }, + { + "epoch": 0.5843011053588918, + "grad_norm": 0.35267290472984314, + "learning_rate": 0.00023341690299110338, + "loss": 4.5771, + "step": 1827 + }, + { + "epoch": 0.5846209198664775, + "grad_norm": 0.34412881731987, + "learning_rate": 0.0002331138477840054, + "loss": 4.4763, + "step": 1828 + }, + { + "epoch": 0.584940734374063, + "grad_norm": 0.3485495448112488, + "learning_rate": 0.00023281086438568384, + "loss": 4.5253, + "step": 1829 + }, + { + "epoch": 0.5852605488816487, + "grad_norm": 0.337989866733551, + "learning_rate": 0.0002325079531214204, + "loss": 4.5676, + "step": 1830 + }, + { + "epoch": 0.5855803633892342, + "grad_norm": 0.37945863604545593, + "learning_rate": 0.00023220511431642008, + "loss": 4.5198, + "step": 1831 + }, + { + "epoch": 0.5859001778968198, + "grad_norm": 0.33420321345329285, + "learning_rate": 0.00023190234829580943, + "loss": 4.5406, + "step": 1832 + }, + { + "epoch": 0.5862199924044055, + "grad_norm": 0.33344873785972595, + "learning_rate": 0.00023159965538463738, + "loss": 4.4691, + "step": 1833 + }, + { + "epoch": 0.586539806911991, + "grad_norm": 0.34540778398513794, + "learning_rate": 0.00023129703590787394, + "loss": 4.4858, + "step": 1834 + }, + { + "epoch": 0.5868596214195766, + "grad_norm": 0.33878621459007263, + "learning_rate": 0.0002309944901904107, + "loss": 4.5232, + "step": 1835 + }, + { + "epoch": 0.5871794359271623, + "grad_norm": 0.35938650369644165, + "learning_rate": 0.00023069201855705973, + "loss": 4.5278, + "step": 1836 + }, + { + "epoch": 0.5874992504347478, + "grad_norm": 0.35607171058654785, + "learning_rate": 0.00023038962133255366, + "loss": 4.5165, + "step": 1837 + }, + { + "epoch": 0.5878190649423335, + "grad_norm": 0.35275357961654663, + "learning_rate": 0.00023008729884154542, + "loss": 4.5, + "step": 1838 + }, + { + "epoch": 0.588138879449919, + "grad_norm": 0.3373669385910034, + "learning_rate": 0.00022978505140860736, + "loss": 4.5829, + "step": 1839 + }, + { + "epoch": 0.5884586939575046, + "grad_norm": 0.3494403660297394, + "learning_rate": 0.00022948287935823153, + "loss": 4.542, + "step": 1840 + }, + { + "epoch": 0.5887785084650903, + "grad_norm": 0.33572816848754883, + "learning_rate": 0.00022918078301482897, + "loss": 4.4934, + "step": 1841 + }, + { + "epoch": 0.5890983229726758, + "grad_norm": 0.3375544846057892, + "learning_rate": 0.00022887876270272938, + "loss": 4.5353, + "step": 1842 + }, + { + "epoch": 0.5894181374802614, + "grad_norm": 0.3462027907371521, + "learning_rate": 0.0002285768187461809, + "loss": 4.5498, + "step": 1843 + }, + { + "epoch": 0.5897379519878471, + "grad_norm": 0.33625560998916626, + "learning_rate": 0.00022827495146934964, + "loss": 4.4935, + "step": 1844 + }, + { + "epoch": 0.5900577664954326, + "grad_norm": 0.3321734070777893, + "learning_rate": 0.00022797316119631952, + "loss": 4.5493, + "step": 1845 + }, + { + "epoch": 0.5903775810030183, + "grad_norm": 0.3436463475227356, + "learning_rate": 0.00022767144825109153, + "loss": 4.4957, + "step": 1846 + }, + { + "epoch": 0.5906973955106039, + "grad_norm": 0.34126192331314087, + "learning_rate": 0.00022736981295758393, + "loss": 4.4996, + "step": 1847 + }, + { + "epoch": 0.5910172100181894, + "grad_norm": 0.33446893095970154, + "learning_rate": 0.00022706825563963148, + "loss": 4.5688, + "step": 1848 + }, + { + "epoch": 0.5913370245257751, + "grad_norm": 0.3394505977630615, + "learning_rate": 0.00022676677662098512, + "loss": 4.5422, + "step": 1849 + }, + { + "epoch": 0.5916568390333606, + "grad_norm": 0.3588700592517853, + "learning_rate": 0.00022646537622531197, + "loss": 4.5228, + "step": 1850 + }, + { + "epoch": 0.5919766535409462, + "grad_norm": 0.33262884616851807, + "learning_rate": 0.00022616405477619448, + "loss": 4.5603, + "step": 1851 + }, + { + "epoch": 0.5922964680485319, + "grad_norm": 0.3373951315879822, + "learning_rate": 0.00022586281259713055, + "loss": 4.501, + "step": 1852 + }, + { + "epoch": 0.5926162825561174, + "grad_norm": 0.3469450771808624, + "learning_rate": 0.00022556165001153295, + "loss": 4.5772, + "step": 1853 + }, + { + "epoch": 0.5929360970637031, + "grad_norm": 0.3388214707374573, + "learning_rate": 0.0002252605673427288, + "loss": 4.5937, + "step": 1854 + }, + { + "epoch": 0.5932559115712887, + "grad_norm": 0.33294713497161865, + "learning_rate": 0.0002249595649139597, + "loss": 4.5361, + "step": 1855 + }, + { + "epoch": 0.5935757260788742, + "grad_norm": 0.33870381116867065, + "learning_rate": 0.0002246586430483809, + "loss": 4.4634, + "step": 1856 + }, + { + "epoch": 0.5938955405864599, + "grad_norm": 0.33916759490966797, + "learning_rate": 0.00022435780206906132, + "loss": 4.6296, + "step": 1857 + }, + { + "epoch": 0.5942153550940454, + "grad_norm": 0.33612513542175293, + "learning_rate": 0.00022405704229898278, + "loss": 4.5925, + "step": 1858 + }, + { + "epoch": 0.5945351696016311, + "grad_norm": 0.35525017976760864, + "learning_rate": 0.00022375636406104022, + "loss": 4.5298, + "step": 1859 + }, + { + "epoch": 0.5948549841092167, + "grad_norm": 0.35198476910591125, + "learning_rate": 0.00022345576767804085, + "loss": 4.5011, + "step": 1860 + }, + { + "epoch": 0.5951747986168022, + "grad_norm": 0.3347802460193634, + "learning_rate": 0.00022315525347270412, + "loss": 4.5497, + "step": 1861 + }, + { + "epoch": 0.5954946131243879, + "grad_norm": 0.332305371761322, + "learning_rate": 0.00022285482176766122, + "loss": 4.4805, + "step": 1862 + }, + { + "epoch": 0.5958144276319735, + "grad_norm": 0.35164013504981995, + "learning_rate": 0.00022255447288545453, + "loss": 4.5509, + "step": 1863 + }, + { + "epoch": 0.596134242139559, + "grad_norm": 0.3494516909122467, + "learning_rate": 0.00022225420714853798, + "loss": 4.505, + "step": 1864 + }, + { + "epoch": 0.5964540566471447, + "grad_norm": 0.349423885345459, + "learning_rate": 0.00022195402487927592, + "loss": 4.5237, + "step": 1865 + }, + { + "epoch": 0.5967738711547302, + "grad_norm": 0.3518355190753937, + "learning_rate": 0.00022165392639994307, + "loss": 4.4976, + "step": 1866 + }, + { + "epoch": 0.5970936856623159, + "grad_norm": 0.34486064314842224, + "learning_rate": 0.00022135391203272441, + "loss": 4.4665, + "step": 1867 + }, + { + "epoch": 0.5974135001699015, + "grad_norm": 0.3417609632015228, + "learning_rate": 0.00022105398209971424, + "loss": 4.5459, + "step": 1868 + }, + { + "epoch": 0.597733314677487, + "grad_norm": 0.35042083263397217, + "learning_rate": 0.00022075413692291678, + "loss": 4.4606, + "step": 1869 + }, + { + "epoch": 0.5980531291850727, + "grad_norm": 0.3334195613861084, + "learning_rate": 0.00022045437682424458, + "loss": 4.5156, + "step": 1870 + }, + { + "epoch": 0.5983729436926583, + "grad_norm": 0.3369036018848419, + "learning_rate": 0.00022015470212551942, + "loss": 4.5203, + "step": 1871 + }, + { + "epoch": 0.5986927582002438, + "grad_norm": 0.35034123063087463, + "learning_rate": 0.00021985511314847128, + "loss": 4.5367, + "step": 1872 + }, + { + "epoch": 0.5990125727078295, + "grad_norm": 0.34471485018730164, + "learning_rate": 0.00021955561021473765, + "loss": 4.5716, + "step": 1873 + }, + { + "epoch": 0.599332387215415, + "grad_norm": 0.3398720324039459, + "learning_rate": 0.0002192561936458644, + "loss": 4.4874, + "step": 1874 + }, + { + "epoch": 0.5996522017230007, + "grad_norm": 0.3795250654220581, + "learning_rate": 0.00021895686376330396, + "loss": 4.4272, + "step": 1875 + }, + { + "epoch": 0.5999720162305863, + "grad_norm": 0.34881579875946045, + "learning_rate": 0.00021865762088841607, + "loss": 4.4228, + "step": 1876 + }, + { + "epoch": 0.6002918307381718, + "grad_norm": 0.3544309437274933, + "learning_rate": 0.00021835846534246726, + "loss": 4.5057, + "step": 1877 + }, + { + "epoch": 0.6006116452457575, + "grad_norm": 0.39712315797805786, + "learning_rate": 0.00021805939744662964, + "loss": 4.5367, + "step": 1878 + }, + { + "epoch": 0.6009314597533431, + "grad_norm": 0.38285964727401733, + "learning_rate": 0.00021776041752198202, + "loss": 4.5301, + "step": 1879 + }, + { + "epoch": 0.6012512742609286, + "grad_norm": 0.3336244523525238, + "learning_rate": 0.00021746152588950809, + "loss": 4.5421, + "step": 1880 + }, + { + "epoch": 0.6015710887685143, + "grad_norm": 0.3482455313205719, + "learning_rate": 0.00021716272287009713, + "loss": 4.583, + "step": 1881 + }, + { + "epoch": 0.6018909032760998, + "grad_norm": 0.33469104766845703, + "learning_rate": 0.00021686400878454312, + "loss": 4.3977, + "step": 1882 + }, + { + "epoch": 0.6022107177836855, + "grad_norm": 0.34940576553344727, + "learning_rate": 0.0002165653839535447, + "loss": 4.4293, + "step": 1883 + }, + { + "epoch": 0.6025305322912711, + "grad_norm": 0.34258222579956055, + "learning_rate": 0.00021626684869770462, + "loss": 4.4359, + "step": 1884 + }, + { + "epoch": 0.6028503467988566, + "grad_norm": 0.3487623631954193, + "learning_rate": 0.00021596840333752934, + "loss": 4.5172, + "step": 1885 + }, + { + "epoch": 0.6031701613064423, + "grad_norm": 0.3448670208454132, + "learning_rate": 0.00021567004819342907, + "loss": 4.6143, + "step": 1886 + }, + { + "epoch": 0.6034899758140279, + "grad_norm": 0.36221206188201904, + "learning_rate": 0.00021537178358571686, + "loss": 4.4608, + "step": 1887 + }, + { + "epoch": 0.6038097903216134, + "grad_norm": 0.3512362241744995, + "learning_rate": 0.00021507360983460882, + "loss": 4.5247, + "step": 1888 + }, + { + "epoch": 0.6041296048291991, + "grad_norm": 0.38084450364112854, + "learning_rate": 0.0002147755272602234, + "loss": 4.4477, + "step": 1889 + }, + { + "epoch": 0.6044494193367846, + "grad_norm": 0.35751184821128845, + "learning_rate": 0.00021447753618258116, + "loss": 4.5502, + "step": 1890 + }, + { + "epoch": 0.6047692338443703, + "grad_norm": 0.3478350043296814, + "learning_rate": 0.00021417963692160448, + "loss": 4.4548, + "step": 1891 + }, + { + "epoch": 0.6050890483519559, + "grad_norm": 0.36147207021713257, + "learning_rate": 0.00021388182979711703, + "loss": 4.4134, + "step": 1892 + }, + { + "epoch": 0.6054088628595414, + "grad_norm": 0.34922581911087036, + "learning_rate": 0.0002135841151288438, + "loss": 4.5662, + "step": 1893 + }, + { + "epoch": 0.6057286773671271, + "grad_norm": 0.33871176838874817, + "learning_rate": 0.00021328649323641022, + "loss": 4.4339, + "step": 1894 + }, + { + "epoch": 0.6060484918747127, + "grad_norm": 0.3543529808521271, + "learning_rate": 0.00021298896443934238, + "loss": 4.4614, + "step": 1895 + }, + { + "epoch": 0.6063683063822982, + "grad_norm": 0.3440582752227783, + "learning_rate": 0.00021269152905706637, + "loss": 4.5051, + "step": 1896 + }, + { + "epoch": 0.6066881208898839, + "grad_norm": 0.3369908332824707, + "learning_rate": 0.00021239418740890786, + "loss": 4.5262, + "step": 1897 + }, + { + "epoch": 0.6070079353974694, + "grad_norm": 0.3528311848640442, + "learning_rate": 0.000212096939814092, + "loss": 4.531, + "step": 1898 + }, + { + "epoch": 0.6073277499050551, + "grad_norm": 0.3660440146923065, + "learning_rate": 0.00021179978659174284, + "loss": 4.4864, + "step": 1899 + }, + { + "epoch": 0.6076475644126407, + "grad_norm": 0.33674898743629456, + "learning_rate": 0.00021150272806088333, + "loss": 4.4465, + "step": 1900 + }, + { + "epoch": 0.6076475644126407, + "eval_loss": 4.525420665740967, + "eval_runtime": 90.4335, + "eval_samples_per_second": 20.977, + "eval_steps_per_second": 5.252, + "step": 1900 + }, + { + "epoch": 0.6079673789202262, + "grad_norm": 0.36991459131240845, + "learning_rate": 0.00021120576454043463, + "loss": 4.4703, + "step": 1901 + }, + { + "epoch": 0.6082871934278119, + "grad_norm": 0.3455876410007477, + "learning_rate": 0.00021090889634921585, + "loss": 4.6068, + "step": 1902 + }, + { + "epoch": 0.6086070079353975, + "grad_norm": 0.34351128339767456, + "learning_rate": 0.00021061212380594382, + "loss": 4.5503, + "step": 1903 + }, + { + "epoch": 0.608926822442983, + "grad_norm": 0.3383599519729614, + "learning_rate": 0.00021031544722923266, + "loss": 4.6339, + "step": 1904 + }, + { + "epoch": 0.6092466369505687, + "grad_norm": 0.35959696769714355, + "learning_rate": 0.0002100188669375935, + "loss": 4.5512, + "step": 1905 + }, + { + "epoch": 0.6095664514581542, + "grad_norm": 0.3345339298248291, + "learning_rate": 0.000209722383249434, + "loss": 4.4128, + "step": 1906 + }, + { + "epoch": 0.6098862659657399, + "grad_norm": 0.34817659854888916, + "learning_rate": 0.0002094259964830582, + "loss": 4.4591, + "step": 1907 + }, + { + "epoch": 0.6102060804733255, + "grad_norm": 0.3305789530277252, + "learning_rate": 0.0002091297069566662, + "loss": 4.3654, + "step": 1908 + }, + { + "epoch": 0.610525894980911, + "grad_norm": 0.3378927707672119, + "learning_rate": 0.00020883351498835335, + "loss": 4.5551, + "step": 1909 + }, + { + "epoch": 0.6108457094884967, + "grad_norm": 0.34520092606544495, + "learning_rate": 0.00020853742089611067, + "loss": 4.601, + "step": 1910 + }, + { + "epoch": 0.6111655239960823, + "grad_norm": 0.34389182925224304, + "learning_rate": 0.00020824142499782368, + "loss": 4.5051, + "step": 1911 + }, + { + "epoch": 0.6114853385036678, + "grad_norm": 0.33250677585601807, + "learning_rate": 0.00020794552761127283, + "loss": 4.3949, + "step": 1912 + }, + { + "epoch": 0.6118051530112535, + "grad_norm": 0.3374512493610382, + "learning_rate": 0.0002076497290541328, + "loss": 4.5135, + "step": 1913 + }, + { + "epoch": 0.612124967518839, + "grad_norm": 0.332403302192688, + "learning_rate": 0.0002073540296439719, + "loss": 4.4235, + "step": 1914 + }, + { + "epoch": 0.6124447820264247, + "grad_norm": 0.3468950390815735, + "learning_rate": 0.00020705842969825225, + "loss": 4.6296, + "step": 1915 + }, + { + "epoch": 0.6127645965340103, + "grad_norm": 0.34265509247779846, + "learning_rate": 0.00020676292953432886, + "loss": 4.4676, + "step": 1916 + }, + { + "epoch": 0.6130844110415958, + "grad_norm": 0.342316597700119, + "learning_rate": 0.00020646752946945016, + "loss": 4.4868, + "step": 1917 + }, + { + "epoch": 0.6134042255491815, + "grad_norm": 0.3344189524650574, + "learning_rate": 0.00020617222982075646, + "loss": 4.4635, + "step": 1918 + }, + { + "epoch": 0.6137240400567671, + "grad_norm": 0.33452966809272766, + "learning_rate": 0.0002058770309052808, + "loss": 4.4877, + "step": 1919 + }, + { + "epoch": 0.6140438545643526, + "grad_norm": 0.33984753489494324, + "learning_rate": 0.00020558193303994797, + "loss": 4.4969, + "step": 1920 + }, + { + "epoch": 0.6143636690719383, + "grad_norm": 0.32579725980758667, + "learning_rate": 0.0002052869365415738, + "loss": 4.5268, + "step": 1921 + }, + { + "epoch": 0.6146834835795238, + "grad_norm": 0.3360597789287567, + "learning_rate": 0.00020499204172686616, + "loss": 4.5063, + "step": 1922 + }, + { + "epoch": 0.6150032980871095, + "grad_norm": 0.33437472581863403, + "learning_rate": 0.00020469724891242281, + "loss": 4.4823, + "step": 1923 + }, + { + "epoch": 0.6153231125946951, + "grad_norm": 0.34467798471450806, + "learning_rate": 0.00020440255841473252, + "loss": 4.4709, + "step": 1924 + }, + { + "epoch": 0.6156429271022806, + "grad_norm": 0.39499175548553467, + "learning_rate": 0.0002041079705501745, + "loss": 4.4726, + "step": 1925 + }, + { + "epoch": 0.6159627416098663, + "grad_norm": 0.3400439918041229, + "learning_rate": 0.00020381348563501694, + "loss": 4.461, + "step": 1926 + }, + { + "epoch": 0.6162825561174519, + "grad_norm": 0.3481975495815277, + "learning_rate": 0.00020351910398541835, + "loss": 4.5781, + "step": 1927 + }, + { + "epoch": 0.6166023706250375, + "grad_norm": 0.3361368179321289, + "learning_rate": 0.00020322482591742576, + "loss": 4.4866, + "step": 1928 + }, + { + "epoch": 0.6169221851326231, + "grad_norm": 0.3281785845756531, + "learning_rate": 0.0002029306517469754, + "loss": 4.4865, + "step": 1929 + }, + { + "epoch": 0.6172419996402086, + "grad_norm": 0.35954657196998596, + "learning_rate": 0.00020263658178989162, + "loss": 4.5366, + "step": 1930 + }, + { + "epoch": 0.6175618141477943, + "grad_norm": 0.35379758477211, + "learning_rate": 0.0002023426163618872, + "loss": 4.4782, + "step": 1931 + }, + { + "epoch": 0.6178816286553799, + "grad_norm": 0.33757951855659485, + "learning_rate": 0.00020204875577856256, + "loss": 4.5129, + "step": 1932 + }, + { + "epoch": 0.6182014431629654, + "grad_norm": 0.35059356689453125, + "learning_rate": 0.00020175500035540545, + "loss": 4.4797, + "step": 1933 + }, + { + "epoch": 0.6185212576705511, + "grad_norm": 0.3704817295074463, + "learning_rate": 0.00020146135040779097, + "loss": 4.4134, + "step": 1934 + }, + { + "epoch": 0.6188410721781367, + "grad_norm": 0.33202195167541504, + "learning_rate": 0.0002011678062509807, + "loss": 4.5071, + "step": 1935 + }, + { + "epoch": 0.6191608866857223, + "grad_norm": 0.3939604163169861, + "learning_rate": 0.00020087436820012287, + "loss": 4.5482, + "step": 1936 + }, + { + "epoch": 0.6194807011933079, + "grad_norm": 0.34221702814102173, + "learning_rate": 0.0002005810365702517, + "loss": 4.4425, + "step": 1937 + }, + { + "epoch": 0.6198005157008935, + "grad_norm": 0.324089914560318, + "learning_rate": 0.00020028781167628714, + "loss": 4.5057, + "step": 1938 + }, + { + "epoch": 0.6201203302084791, + "grad_norm": 0.3476245403289795, + "learning_rate": 0.0001999946938330346, + "loss": 4.4755, + "step": 1939 + }, + { + "epoch": 0.6204401447160647, + "grad_norm": 0.3370070457458496, + "learning_rate": 0.0001997016833551845, + "loss": 4.5277, + "step": 1940 + }, + { + "epoch": 0.6207599592236502, + "grad_norm": 0.34495389461517334, + "learning_rate": 0.00019940878055731208, + "loss": 4.4686, + "step": 1941 + }, + { + "epoch": 0.6210797737312359, + "grad_norm": 0.34336042404174805, + "learning_rate": 0.00019911598575387683, + "loss": 4.4037, + "step": 1942 + }, + { + "epoch": 0.6213995882388215, + "grad_norm": 0.3384661078453064, + "learning_rate": 0.00019882329925922245, + "loss": 4.5484, + "step": 1943 + }, + { + "epoch": 0.6217194027464071, + "grad_norm": 0.32501736283302307, + "learning_rate": 0.00019853072138757637, + "loss": 4.4984, + "step": 1944 + }, + { + "epoch": 0.6220392172539927, + "grad_norm": 0.35760048031806946, + "learning_rate": 0.00019823825245304918, + "loss": 4.5562, + "step": 1945 + }, + { + "epoch": 0.6223590317615783, + "grad_norm": 0.3294287919998169, + "learning_rate": 0.00019794589276963482, + "loss": 4.5322, + "step": 1946 + }, + { + "epoch": 0.6226788462691639, + "grad_norm": 0.33806145191192627, + "learning_rate": 0.00019765364265120962, + "loss": 4.5018, + "step": 1947 + }, + { + "epoch": 0.6229986607767495, + "grad_norm": 0.3356688320636749, + "learning_rate": 0.00019736150241153258, + "loss": 4.4559, + "step": 1948 + }, + { + "epoch": 0.623318475284335, + "grad_norm": 0.3312791883945465, + "learning_rate": 0.0001970694723642446, + "loss": 4.4325, + "step": 1949 + }, + { + "epoch": 0.6236382897919207, + "grad_norm": 0.3365389406681061, + "learning_rate": 0.00019677755282286822, + "loss": 4.4863, + "step": 1950 + }, + { + "epoch": 0.6239581042995063, + "grad_norm": 0.3383220136165619, + "learning_rate": 0.00019648574410080743, + "loss": 4.4953, + "step": 1951 + }, + { + "epoch": 0.6242779188070919, + "grad_norm": 0.33379092812538147, + "learning_rate": 0.00019619404651134717, + "loss": 4.5105, + "step": 1952 + }, + { + "epoch": 0.6245977333146775, + "grad_norm": 0.33623144030570984, + "learning_rate": 0.0001959024603676532, + "loss": 4.6028, + "step": 1953 + }, + { + "epoch": 0.624917547822263, + "grad_norm": 0.34045708179473877, + "learning_rate": 0.00019561098598277145, + "loss": 4.6114, + "step": 1954 + }, + { + "epoch": 0.6252373623298487, + "grad_norm": 0.33441922068595886, + "learning_rate": 0.000195319623669628, + "loss": 4.5134, + "step": 1955 + }, + { + "epoch": 0.6255571768374343, + "grad_norm": 0.33533331751823425, + "learning_rate": 0.00019502837374102866, + "loss": 4.5119, + "step": 1956 + }, + { + "epoch": 0.6258769913450198, + "grad_norm": 0.3240436315536499, + "learning_rate": 0.00019473723650965832, + "loss": 4.5549, + "step": 1957 + }, + { + "epoch": 0.6261968058526055, + "grad_norm": 0.35325974225997925, + "learning_rate": 0.0001944462122880813, + "loss": 4.5761, + "step": 1958 + }, + { + "epoch": 0.6265166203601911, + "grad_norm": 0.34944358468055725, + "learning_rate": 0.00019415530138874, + "loss": 4.4868, + "step": 1959 + }, + { + "epoch": 0.6268364348677767, + "grad_norm": 0.33490127325057983, + "learning_rate": 0.0001938645041239558, + "loss": 4.4705, + "step": 1960 + }, + { + "epoch": 0.6271562493753623, + "grad_norm": 0.32527029514312744, + "learning_rate": 0.00019357382080592773, + "loss": 4.5074, + "step": 1961 + }, + { + "epoch": 0.6274760638829479, + "grad_norm": 0.3463835120201111, + "learning_rate": 0.00019328325174673247, + "loss": 4.556, + "step": 1962 + }, + { + "epoch": 0.6277958783905335, + "grad_norm": 0.3363448977470398, + "learning_rate": 0.0001929927972583242, + "loss": 4.5661, + "step": 1963 + }, + { + "epoch": 0.6281156928981191, + "grad_norm": 0.33256959915161133, + "learning_rate": 0.00019270245765253382, + "loss": 4.5234, + "step": 1964 + }, + { + "epoch": 0.6284355074057046, + "grad_norm": 0.3460303843021393, + "learning_rate": 0.0001924122332410694, + "loss": 4.5785, + "step": 1965 + }, + { + "epoch": 0.6287553219132903, + "grad_norm": 0.342033326625824, + "learning_rate": 0.00019212212433551465, + "loss": 4.5518, + "step": 1966 + }, + { + "epoch": 0.6290751364208759, + "grad_norm": 0.37389683723449707, + "learning_rate": 0.00019183213124732986, + "loss": 4.6119, + "step": 1967 + }, + { + "epoch": 0.6293949509284615, + "grad_norm": 0.34783628582954407, + "learning_rate": 0.0001915422542878508, + "loss": 4.4027, + "step": 1968 + }, + { + "epoch": 0.6297147654360471, + "grad_norm": 0.334176242351532, + "learning_rate": 0.00019125249376828824, + "loss": 4.5087, + "step": 1969 + }, + { + "epoch": 0.6300345799436327, + "grad_norm": 0.3315781056880951, + "learning_rate": 0.00019096284999972862, + "loss": 4.3967, + "step": 1970 + }, + { + "epoch": 0.6303543944512183, + "grad_norm": 0.3476397395133972, + "learning_rate": 0.00019067332329313226, + "loss": 4.4586, + "step": 1971 + }, + { + "epoch": 0.6306742089588039, + "grad_norm": 0.343718945980072, + "learning_rate": 0.0001903839139593343, + "loss": 4.4096, + "step": 1972 + }, + { + "epoch": 0.6309940234663894, + "grad_norm": 0.3535640835762024, + "learning_rate": 0.00019009462230904398, + "loss": 4.5057, + "step": 1973 + }, + { + "epoch": 0.6313138379739751, + "grad_norm": 0.3376810848712921, + "learning_rate": 0.0001898054486528436, + "loss": 4.5772, + "step": 1974 + }, + { + "epoch": 0.6316336524815607, + "grad_norm": 0.3592750132083893, + "learning_rate": 0.00018951639330118953, + "loss": 4.4987, + "step": 1975 + }, + { + "epoch": 0.6319534669891463, + "grad_norm": 0.3285256326198578, + "learning_rate": 0.0001892274565644104, + "loss": 4.4203, + "step": 1976 + }, + { + "epoch": 0.6322732814967319, + "grad_norm": 0.34887608885765076, + "learning_rate": 0.000188938638752708, + "loss": 4.3972, + "step": 1977 + }, + { + "epoch": 0.6325930960043175, + "grad_norm": 0.34385180473327637, + "learning_rate": 0.00018864994017615624, + "loss": 4.4383, + "step": 1978 + }, + { + "epoch": 0.6329129105119031, + "grad_norm": 0.3433148264884949, + "learning_rate": 0.0001883613611447011, + "loss": 4.58, + "step": 1979 + }, + { + "epoch": 0.6332327250194887, + "grad_norm": 0.34894120693206787, + "learning_rate": 0.00018807290196816022, + "loss": 4.4366, + "step": 1980 + }, + { + "epoch": 0.6335525395270742, + "grad_norm": 0.3402814269065857, + "learning_rate": 0.00018778456295622239, + "loss": 4.441, + "step": 1981 + }, + { + "epoch": 0.6338723540346599, + "grad_norm": 0.44206148386001587, + "learning_rate": 0.00018749634441844764, + "loss": 4.4753, + "step": 1982 + }, + { + "epoch": 0.6341921685422455, + "grad_norm": 0.3443446159362793, + "learning_rate": 0.00018720824666426647, + "loss": 4.5124, + "step": 1983 + }, + { + "epoch": 0.6345119830498311, + "grad_norm": 0.3371545672416687, + "learning_rate": 0.00018692027000297986, + "loss": 4.5428, + "step": 1984 + }, + { + "epoch": 0.6348317975574167, + "grad_norm": 0.3495273292064667, + "learning_rate": 0.0001866324147437587, + "loss": 4.5698, + "step": 1985 + }, + { + "epoch": 0.6351516120650023, + "grad_norm": 0.3294757306575775, + "learning_rate": 0.00018634468119564342, + "loss": 4.5179, + "step": 1986 + }, + { + "epoch": 0.6354714265725879, + "grad_norm": 0.338234007358551, + "learning_rate": 0.00018605706966754408, + "loss": 4.4306, + "step": 1987 + }, + { + "epoch": 0.6357912410801735, + "grad_norm": 0.39578574895858765, + "learning_rate": 0.00018576958046823944, + "loss": 4.4886, + "step": 1988 + }, + { + "epoch": 0.636111055587759, + "grad_norm": 0.3399364948272705, + "learning_rate": 0.0001854822139063772, + "loss": 4.5338, + "step": 1989 + }, + { + "epoch": 0.6364308700953447, + "grad_norm": 0.35335099697113037, + "learning_rate": 0.00018519497029047307, + "loss": 4.5156, + "step": 1990 + }, + { + "epoch": 0.6367506846029303, + "grad_norm": 0.4992465674877167, + "learning_rate": 0.00018490784992891107, + "loss": 4.5368, + "step": 1991 + }, + { + "epoch": 0.6370704991105159, + "grad_norm": 0.34836506843566895, + "learning_rate": 0.00018462085312994278, + "loss": 4.4756, + "step": 1992 + }, + { + "epoch": 0.6373903136181015, + "grad_norm": 0.36463114619255066, + "learning_rate": 0.0001843339802016871, + "loss": 4.6071, + "step": 1993 + }, + { + "epoch": 0.6377101281256871, + "grad_norm": 0.3420564532279968, + "learning_rate": 0.00018404723145212993, + "loss": 4.4792, + "step": 1994 + }, + { + "epoch": 0.6380299426332727, + "grad_norm": 0.3417738974094391, + "learning_rate": 0.00018376060718912392, + "loss": 4.5, + "step": 1995 + }, + { + "epoch": 0.6383497571408583, + "grad_norm": 0.3472539484500885, + "learning_rate": 0.00018347410772038807, + "loss": 4.4523, + "step": 1996 + }, + { + "epoch": 0.638669571648444, + "grad_norm": 0.3467002809047699, + "learning_rate": 0.00018318773335350723, + "loss": 4.4398, + "step": 1997 + }, + { + "epoch": 0.6389893861560295, + "grad_norm": 0.3370647430419922, + "learning_rate": 0.0001829014843959322, + "loss": 4.4692, + "step": 1998 + }, + { + "epoch": 0.6393092006636151, + "grad_norm": 0.3472033143043518, + "learning_rate": 0.00018261536115497904, + "loss": 4.5119, + "step": 1999 + }, + { + "epoch": 0.6396290151712007, + "grad_norm": 0.3552742600440979, + "learning_rate": 0.0001823293639378287, + "loss": 4.5038, + "step": 2000 + }, + { + "epoch": 0.6396290151712007, + "eval_loss": 4.501763820648193, + "eval_runtime": 85.2654, + "eval_samples_per_second": 22.248, + "eval_steps_per_second": 5.571, + "step": 2000 + }, + { + "epoch": 0.6399488296787863, + "grad_norm": 0.33789336681365967, + "learning_rate": 0.00018204349305152707, + "loss": 4.4083, + "step": 2001 + }, + { + "epoch": 0.6402686441863719, + "grad_norm": 0.35798177123069763, + "learning_rate": 0.00018175774880298422, + "loss": 4.6027, + "step": 2002 + }, + { + "epoch": 0.6405884586939575, + "grad_norm": 0.35995379090309143, + "learning_rate": 0.0001814721314989743, + "loss": 4.5795, + "step": 2003 + }, + { + "epoch": 0.6409082732015431, + "grad_norm": 0.3444795608520508, + "learning_rate": 0.0001811866414461354, + "loss": 4.4867, + "step": 2004 + }, + { + "epoch": 0.6412280877091288, + "grad_norm": 0.3556591272354126, + "learning_rate": 0.00018090127895096855, + "loss": 4.4126, + "step": 2005 + }, + { + "epoch": 0.6415479022167143, + "grad_norm": 0.34814828634262085, + "learning_rate": 0.0001806160443198383, + "loss": 4.5066, + "step": 2006 + }, + { + "epoch": 0.6418677167242999, + "grad_norm": 0.3337419629096985, + "learning_rate": 0.00018033093785897145, + "loss": 4.4302, + "step": 2007 + }, + { + "epoch": 0.6421875312318855, + "grad_norm": 0.3529178202152252, + "learning_rate": 0.00018004595987445782, + "loss": 4.5251, + "step": 2008 + }, + { + "epoch": 0.6425073457394711, + "grad_norm": 0.36757129430770874, + "learning_rate": 0.00017976111067224854, + "loss": 4.5572, + "step": 2009 + }, + { + "epoch": 0.6428271602470567, + "grad_norm": 0.33173325657844543, + "learning_rate": 0.00017947639055815713, + "loss": 4.5201, + "step": 2010 + }, + { + "epoch": 0.6431469747546423, + "grad_norm": 0.3335769772529602, + "learning_rate": 0.00017919179983785828, + "loss": 4.5098, + "step": 2011 + }, + { + "epoch": 0.6434667892622279, + "grad_norm": 0.3455420136451721, + "learning_rate": 0.00017890733881688754, + "loss": 4.5294, + "step": 2012 + }, + { + "epoch": 0.6437866037698136, + "grad_norm": 0.33455345034599304, + "learning_rate": 0.0001786230078006417, + "loss": 4.4092, + "step": 2013 + }, + { + "epoch": 0.6441064182773991, + "grad_norm": 0.34600844979286194, + "learning_rate": 0.00017833880709437752, + "loss": 4.4824, + "step": 2014 + }, + { + "epoch": 0.6444262327849847, + "grad_norm": 0.3213809132575989, + "learning_rate": 0.00017805473700321193, + "loss": 4.3922, + "step": 2015 + }, + { + "epoch": 0.6447460472925703, + "grad_norm": 0.32600635290145874, + "learning_rate": 0.00017777079783212215, + "loss": 4.5513, + "step": 2016 + }, + { + "epoch": 0.6450658618001559, + "grad_norm": 0.35062530636787415, + "learning_rate": 0.00017748698988594394, + "loss": 4.4991, + "step": 2017 + }, + { + "epoch": 0.6453856763077415, + "grad_norm": 0.3276106119155884, + "learning_rate": 0.00017720331346937317, + "loss": 4.4505, + "step": 2018 + }, + { + "epoch": 0.6457054908153271, + "grad_norm": 0.3346673250198364, + "learning_rate": 0.0001769197688869636, + "loss": 4.4227, + "step": 2019 + }, + { + "epoch": 0.6460253053229127, + "grad_norm": 0.3427330255508423, + "learning_rate": 0.0001766363564431281, + "loss": 4.4448, + "step": 2020 + }, + { + "epoch": 0.6463451198304984, + "grad_norm": 0.3343569338321686, + "learning_rate": 0.00017635307644213726, + "loss": 4.4658, + "step": 2021 + }, + { + "epoch": 0.6466649343380839, + "grad_norm": 0.3355761766433716, + "learning_rate": 0.00017606992918811976, + "loss": 4.4333, + "step": 2022 + }, + { + "epoch": 0.6469847488456695, + "grad_norm": 0.3303985297679901, + "learning_rate": 0.00017578691498506177, + "loss": 4.4637, + "step": 2023 + }, + { + "epoch": 0.6473045633532551, + "grad_norm": 0.3363431990146637, + "learning_rate": 0.00017550403413680625, + "loss": 4.5341, + "step": 2024 + }, + { + "epoch": 0.6476243778608407, + "grad_norm": 0.33544039726257324, + "learning_rate": 0.00017522128694705347, + "loss": 4.4159, + "step": 2025 + }, + { + "epoch": 0.6479441923684263, + "grad_norm": 0.33282896876335144, + "learning_rate": 0.0001749386737193598, + "loss": 4.462, + "step": 2026 + }, + { + "epoch": 0.6482640068760119, + "grad_norm": 0.34504929184913635, + "learning_rate": 0.00017465619475713813, + "loss": 4.5122, + "step": 2027 + }, + { + "epoch": 0.6485838213835975, + "grad_norm": 0.3385733664035797, + "learning_rate": 0.00017437385036365695, + "loss": 4.4972, + "step": 2028 + }, + { + "epoch": 0.6489036358911832, + "grad_norm": 0.3179207146167755, + "learning_rate": 0.00017409164084204037, + "loss": 4.4297, + "step": 2029 + }, + { + "epoch": 0.6492234503987687, + "grad_norm": 0.3247489631175995, + "learning_rate": 0.00017380956649526785, + "loss": 4.4343, + "step": 2030 + }, + { + "epoch": 0.6495432649063543, + "grad_norm": 0.3349881172180176, + "learning_rate": 0.00017352762762617334, + "loss": 4.4495, + "step": 2031 + }, + { + "epoch": 0.64986307941394, + "grad_norm": 0.35694414377212524, + "learning_rate": 0.00017324582453744577, + "loss": 4.436, + "step": 2032 + }, + { + "epoch": 0.6501828939215255, + "grad_norm": 0.36272308230400085, + "learning_rate": 0.00017296415753162786, + "loss": 4.4708, + "step": 2033 + }, + { + "epoch": 0.6505027084291111, + "grad_norm": 0.3409692049026489, + "learning_rate": 0.00017268262691111675, + "loss": 4.5294, + "step": 2034 + }, + { + "epoch": 0.6508225229366967, + "grad_norm": 0.3303185999393463, + "learning_rate": 0.0001724012329781629, + "loss": 4.4625, + "step": 2035 + }, + { + "epoch": 0.6511423374442823, + "grad_norm": 0.3401007056236267, + "learning_rate": 0.0001721199760348698, + "loss": 4.4891, + "step": 2036 + }, + { + "epoch": 0.651462151951868, + "grad_norm": 0.3425493836402893, + "learning_rate": 0.00017183885638319426, + "loss": 4.468, + "step": 2037 + }, + { + "epoch": 0.6517819664594535, + "grad_norm": 0.34367531538009644, + "learning_rate": 0.00017155787432494529, + "loss": 4.485, + "step": 2038 + }, + { + "epoch": 0.6521017809670391, + "grad_norm": 0.33023732900619507, + "learning_rate": 0.00017127703016178445, + "loss": 4.5316, + "step": 2039 + }, + { + "epoch": 0.6524215954746247, + "grad_norm": 0.33133843541145325, + "learning_rate": 0.00017099632419522552, + "loss": 4.5366, + "step": 2040 + }, + { + "epoch": 0.6527414099822103, + "grad_norm": 0.34126347303390503, + "learning_rate": 0.00017071575672663325, + "loss": 4.4601, + "step": 2041 + }, + { + "epoch": 0.6530612244897959, + "grad_norm": 0.34515446424484253, + "learning_rate": 0.0001704353280572243, + "loss": 4.5834, + "step": 2042 + }, + { + "epoch": 0.6533810389973815, + "grad_norm": 0.3388916850090027, + "learning_rate": 0.0001701550384880658, + "loss": 4.4507, + "step": 2043 + }, + { + "epoch": 0.6537008535049671, + "grad_norm": 0.3309282958507538, + "learning_rate": 0.00016987488832007593, + "loss": 4.4855, + "step": 2044 + }, + { + "epoch": 0.6540206680125528, + "grad_norm": 0.33507040143013, + "learning_rate": 0.00016959487785402313, + "loss": 4.4535, + "step": 2045 + }, + { + "epoch": 0.6543404825201383, + "grad_norm": 0.32252126932144165, + "learning_rate": 0.00016931500739052576, + "loss": 4.4948, + "step": 2046 + }, + { + "epoch": 0.6546602970277239, + "grad_norm": 0.33887749910354614, + "learning_rate": 0.00016903527723005206, + "loss": 4.534, + "step": 2047 + }, + { + "epoch": 0.6549801115353096, + "grad_norm": 0.3391673266887665, + "learning_rate": 0.0001687556876729193, + "loss": 4.5171, + "step": 2048 + }, + { + "epoch": 0.6552999260428951, + "grad_norm": 0.3284638226032257, + "learning_rate": 0.00016847623901929408, + "loss": 4.3993, + "step": 2049 + }, + { + "epoch": 0.6556197405504807, + "grad_norm": 0.33401697874069214, + "learning_rate": 0.00016819693156919167, + "loss": 4.4099, + "step": 2050 + }, + { + "epoch": 0.6559395550580663, + "grad_norm": 0.32520854473114014, + "learning_rate": 0.00016791776562247572, + "loss": 4.3699, + "step": 2051 + }, + { + "epoch": 0.6562593695656519, + "grad_norm": 0.33887824416160583, + "learning_rate": 0.0001676387414788581, + "loss": 4.337, + "step": 2052 + }, + { + "epoch": 0.6565791840732376, + "grad_norm": 0.3305310904979706, + "learning_rate": 0.00016735985943789808, + "loss": 4.458, + "step": 2053 + }, + { + "epoch": 0.6568989985808231, + "grad_norm": 0.33141234517097473, + "learning_rate": 0.0001670811197990027, + "loss": 4.3534, + "step": 2054 + }, + { + "epoch": 0.6572188130884087, + "grad_norm": 0.3346043825149536, + "learning_rate": 0.000166802522861426, + "loss": 4.3786, + "step": 2055 + }, + { + "epoch": 0.6575386275959944, + "grad_norm": 0.34695038199424744, + "learning_rate": 0.00016652406892426902, + "loss": 4.4992, + "step": 2056 + }, + { + "epoch": 0.6578584421035799, + "grad_norm": 0.32746338844299316, + "learning_rate": 0.00016624575828647878, + "loss": 4.4396, + "step": 2057 + }, + { + "epoch": 0.6581782566111655, + "grad_norm": 0.3358507454395294, + "learning_rate": 0.0001659675912468489, + "loss": 4.4843, + "step": 2058 + }, + { + "epoch": 0.6584980711187511, + "grad_norm": 0.35679417848587036, + "learning_rate": 0.00016568956810401867, + "loss": 4.4237, + "step": 2059 + }, + { + "epoch": 0.6588178856263367, + "grad_norm": 0.3554205298423767, + "learning_rate": 0.00016541168915647298, + "loss": 4.4, + "step": 2060 + }, + { + "epoch": 0.6591377001339224, + "grad_norm": 0.3345361053943634, + "learning_rate": 0.00016513395470254194, + "loss": 4.4388, + "step": 2061 + }, + { + "epoch": 0.6594575146415079, + "grad_norm": 0.33950522541999817, + "learning_rate": 0.00016485636504040015, + "loss": 4.4784, + "step": 2062 + }, + { + "epoch": 0.6597773291490935, + "grad_norm": 0.370823472738266, + "learning_rate": 0.00016457892046806727, + "loss": 4.4151, + "step": 2063 + }, + { + "epoch": 0.6600971436566792, + "grad_norm": 0.34681758284568787, + "learning_rate": 0.00016430162128340693, + "loss": 4.4152, + "step": 2064 + }, + { + "epoch": 0.6604169581642647, + "grad_norm": 0.3444735109806061, + "learning_rate": 0.0001640244677841267, + "loss": 4.472, + "step": 2065 + }, + { + "epoch": 0.6607367726718504, + "grad_norm": 0.3534565269947052, + "learning_rate": 0.00016374746026777794, + "loss": 4.4343, + "step": 2066 + }, + { + "epoch": 0.6610565871794359, + "grad_norm": 0.3453407883644104, + "learning_rate": 0.0001634705990317548, + "loss": 4.4922, + "step": 2067 + }, + { + "epoch": 0.6613764016870215, + "grad_norm": 0.3331710696220398, + "learning_rate": 0.00016319388437329482, + "loss": 4.4478, + "step": 2068 + }, + { + "epoch": 0.6616962161946072, + "grad_norm": 0.3514850437641144, + "learning_rate": 0.00016291731658947808, + "loss": 4.4396, + "step": 2069 + }, + { + "epoch": 0.6620160307021927, + "grad_norm": 0.344855934381485, + "learning_rate": 0.0001626408959772269, + "loss": 4.4916, + "step": 2070 + }, + { + "epoch": 0.6623358452097783, + "grad_norm": 0.40698984265327454, + "learning_rate": 0.00016236462283330578, + "loss": 4.4119, + "step": 2071 + }, + { + "epoch": 0.662655659717364, + "grad_norm": 0.3326229155063629, + "learning_rate": 0.0001620884974543205, + "loss": 4.4911, + "step": 2072 + }, + { + "epoch": 0.6629754742249495, + "grad_norm": 0.34035009145736694, + "learning_rate": 0.00016181252013671858, + "loss": 4.4649, + "step": 2073 + }, + { + "epoch": 0.6632952887325352, + "grad_norm": 0.34327614307403564, + "learning_rate": 0.00016153669117678848, + "loss": 4.4943, + "step": 2074 + }, + { + "epoch": 0.6636151032401207, + "grad_norm": 0.3320022225379944, + "learning_rate": 0.00016126101087065933, + "loss": 4.5206, + "step": 2075 + }, + { + "epoch": 0.6639349177477063, + "grad_norm": 0.350603312253952, + "learning_rate": 0.00016098547951430082, + "loss": 4.5001, + "step": 2076 + }, + { + "epoch": 0.664254732255292, + "grad_norm": 0.3385373055934906, + "learning_rate": 0.00016071009740352237, + "loss": 4.4999, + "step": 2077 + }, + { + "epoch": 0.6645745467628775, + "grad_norm": 0.35841625928878784, + "learning_rate": 0.0001604348648339736, + "loss": 4.426, + "step": 2078 + }, + { + "epoch": 0.6648943612704631, + "grad_norm": 0.331993967294693, + "learning_rate": 0.0001601597821011431, + "loss": 4.5417, + "step": 2079 + }, + { + "epoch": 0.6652141757780488, + "grad_norm": 0.34224316477775574, + "learning_rate": 0.0001598848495003593, + "loss": 4.4207, + "step": 2080 + }, + { + "epoch": 0.6655339902856343, + "grad_norm": 0.3511262834072113, + "learning_rate": 0.00015961006732678873, + "loss": 4.4842, + "step": 2081 + }, + { + "epoch": 0.66585380479322, + "grad_norm": 0.3389959931373596, + "learning_rate": 0.00015933543587543682, + "loss": 4.4316, + "step": 2082 + }, + { + "epoch": 0.6661736193008055, + "grad_norm": 0.34232959151268005, + "learning_rate": 0.0001590609554411472, + "loss": 4.5411, + "step": 2083 + }, + { + "epoch": 0.6664934338083911, + "grad_norm": 0.3344050347805023, + "learning_rate": 0.0001587866263186009, + "loss": 4.4495, + "step": 2084 + }, + { + "epoch": 0.6668132483159768, + "grad_norm": 0.35482099652290344, + "learning_rate": 0.0001585124488023173, + "loss": 4.5453, + "step": 2085 + }, + { + "epoch": 0.6671330628235623, + "grad_norm": 0.3336874842643738, + "learning_rate": 0.00015823842318665233, + "loss": 4.3472, + "step": 2086 + }, + { + "epoch": 0.6674528773311479, + "grad_norm": 0.366793155670166, + "learning_rate": 0.00015796454976579901, + "loss": 4.497, + "step": 2087 + }, + { + "epoch": 0.6677726918387336, + "grad_norm": 0.3410011827945709, + "learning_rate": 0.00015769082883378737, + "loss": 4.3986, + "step": 2088 + }, + { + "epoch": 0.6680925063463191, + "grad_norm": 0.3739314079284668, + "learning_rate": 0.00015741726068448293, + "loss": 4.4505, + "step": 2089 + }, + { + "epoch": 0.6684123208539048, + "grad_norm": 0.33777886629104614, + "learning_rate": 0.0001571438456115881, + "loss": 4.4421, + "step": 2090 + }, + { + "epoch": 0.6687321353614903, + "grad_norm": 0.33963218331336975, + "learning_rate": 0.0001568705839086402, + "loss": 4.4998, + "step": 2091 + }, + { + "epoch": 0.6690519498690759, + "grad_norm": 0.3247828185558319, + "learning_rate": 0.00015659747586901243, + "loss": 4.4794, + "step": 2092 + }, + { + "epoch": 0.6693717643766616, + "grad_norm": 0.32846730947494507, + "learning_rate": 0.00015632452178591252, + "loss": 4.4371, + "step": 2093 + }, + { + "epoch": 0.6696915788842471, + "grad_norm": 0.3390513062477112, + "learning_rate": 0.00015605172195238314, + "loss": 4.4305, + "step": 2094 + }, + { + "epoch": 0.6700113933918327, + "grad_norm": 0.33395376801490784, + "learning_rate": 0.00015577907666130178, + "loss": 4.5377, + "step": 2095 + }, + { + "epoch": 0.6703312078994184, + "grad_norm": 0.33547675609588623, + "learning_rate": 0.00015550658620537932, + "loss": 4.5431, + "step": 2096 + }, + { + "epoch": 0.6706510224070039, + "grad_norm": 0.34296077489852905, + "learning_rate": 0.0001552342508771608, + "loss": 4.386, + "step": 2097 + }, + { + "epoch": 0.6709708369145896, + "grad_norm": 0.36239510774612427, + "learning_rate": 0.00015496207096902457, + "loss": 4.4442, + "step": 2098 + }, + { + "epoch": 0.6712906514221751, + "grad_norm": 0.37812966108322144, + "learning_rate": 0.00015469004677318214, + "loss": 4.4708, + "step": 2099 + }, + { + "epoch": 0.6716104659297607, + "grad_norm": 0.34064537286758423, + "learning_rate": 0.000154418178581678, + "loss": 4.4746, + "step": 2100 + }, + { + "epoch": 0.6716104659297607, + "eval_loss": 4.476485729217529, + "eval_runtime": 99.2816, + "eval_samples_per_second": 19.107, + "eval_steps_per_second": 4.784, + "step": 2100 + }, + { + "epoch": 0.6719302804373464, + "grad_norm": 0.33424270153045654, + "learning_rate": 0.00015414646668638897, + "loss": 4.3999, + "step": 2101 + }, + { + "epoch": 0.6722500949449319, + "grad_norm": 0.33114364743232727, + "learning_rate": 0.00015387491137902428, + "loss": 4.4414, + "step": 2102 + }, + { + "epoch": 0.6725699094525175, + "grad_norm": 0.34432920813560486, + "learning_rate": 0.00015360351295112468, + "loss": 4.3984, + "step": 2103 + }, + { + "epoch": 0.6728897239601032, + "grad_norm": 0.33681678771972656, + "learning_rate": 0.00015333227169406284, + "loss": 4.4823, + "step": 2104 + }, + { + "epoch": 0.6732095384676887, + "grad_norm": 0.34930625557899475, + "learning_rate": 0.0001530611878990426, + "loss": 4.4129, + "step": 2105 + }, + { + "epoch": 0.6735293529752744, + "grad_norm": 0.3283677101135254, + "learning_rate": 0.00015279026185709865, + "loss": 4.3738, + "step": 2106 + }, + { + "epoch": 0.67384916748286, + "grad_norm": 0.3268563449382782, + "learning_rate": 0.0001525194938590966, + "loss": 4.5099, + "step": 2107 + }, + { + "epoch": 0.6741689819904455, + "grad_norm": 0.3501228094100952, + "learning_rate": 0.0001522488841957319, + "loss": 4.5102, + "step": 2108 + }, + { + "epoch": 0.6744887964980312, + "grad_norm": 0.33682167530059814, + "learning_rate": 0.00015197843315753034, + "loss": 4.5232, + "step": 2109 + }, + { + "epoch": 0.6748086110056167, + "grad_norm": 0.3597559630870819, + "learning_rate": 0.00015170814103484747, + "loss": 4.4373, + "step": 2110 + }, + { + "epoch": 0.6751284255132023, + "grad_norm": 0.340017169713974, + "learning_rate": 0.00015143800811786805, + "loss": 4.4699, + "step": 2111 + }, + { + "epoch": 0.675448240020788, + "grad_norm": 0.332932710647583, + "learning_rate": 0.00015116803469660616, + "loss": 4.4958, + "step": 2112 + }, + { + "epoch": 0.6757680545283735, + "grad_norm": 0.3516872227191925, + "learning_rate": 0.00015089822106090418, + "loss": 4.5335, + "step": 2113 + }, + { + "epoch": 0.6760878690359592, + "grad_norm": 0.35002401471138, + "learning_rate": 0.00015062856750043343, + "loss": 4.4151, + "step": 2114 + }, + { + "epoch": 0.6764076835435447, + "grad_norm": 0.3430773913860321, + "learning_rate": 0.00015035907430469304, + "loss": 4.4774, + "step": 2115 + }, + { + "epoch": 0.6767274980511303, + "grad_norm": 0.3273634910583496, + "learning_rate": 0.00015008974176301031, + "loss": 4.498, + "step": 2116 + }, + { + "epoch": 0.677047312558716, + "grad_norm": 0.3389027416706085, + "learning_rate": 0.00014982057016453969, + "loss": 4.4551, + "step": 2117 + }, + { + "epoch": 0.6773671270663015, + "grad_norm": 0.35352566838264465, + "learning_rate": 0.00014955155979826302, + "loss": 4.4706, + "step": 2118 + }, + { + "epoch": 0.6776869415738871, + "grad_norm": 0.3369107246398926, + "learning_rate": 0.00014928271095298912, + "loss": 4.3326, + "step": 2119 + }, + { + "epoch": 0.6780067560814728, + "grad_norm": 0.3422829508781433, + "learning_rate": 0.00014901402391735328, + "loss": 4.4644, + "step": 2120 + }, + { + "epoch": 0.6783265705890583, + "grad_norm": 0.33077096939086914, + "learning_rate": 0.00014874549897981725, + "loss": 4.4121, + "step": 2121 + }, + { + "epoch": 0.678646385096644, + "grad_norm": 0.33814603090286255, + "learning_rate": 0.00014847713642866835, + "loss": 4.5341, + "step": 2122 + }, + { + "epoch": 0.6789661996042295, + "grad_norm": 0.34842199087142944, + "learning_rate": 0.00014820893655201998, + "loss": 4.4643, + "step": 2123 + }, + { + "epoch": 0.6792860141118151, + "grad_norm": 0.33767151832580566, + "learning_rate": 0.0001479408996378107, + "loss": 4.424, + "step": 2124 + }, + { + "epoch": 0.6796058286194008, + "grad_norm": 0.33817440271377563, + "learning_rate": 0.00014767302597380418, + "loss": 4.5601, + "step": 2125 + }, + { + "epoch": 0.6799256431269863, + "grad_norm": 0.3428094983100891, + "learning_rate": 0.0001474053158475889, + "loss": 4.4309, + "step": 2126 + }, + { + "epoch": 0.6802454576345719, + "grad_norm": 0.33630338311195374, + "learning_rate": 0.00014713776954657743, + "loss": 4.4203, + "step": 2127 + }, + { + "epoch": 0.6805652721421576, + "grad_norm": 0.34329938888549805, + "learning_rate": 0.00014687038735800693, + "loss": 4.5438, + "step": 2128 + }, + { + "epoch": 0.6808850866497431, + "grad_norm": 0.3318649232387543, + "learning_rate": 0.0001466031695689378, + "loss": 4.4217, + "step": 2129 + }, + { + "epoch": 0.6812049011573288, + "grad_norm": 0.3329257071018219, + "learning_rate": 0.0001463361164662546, + "loss": 4.413, + "step": 2130 + }, + { + "epoch": 0.6815247156649143, + "grad_norm": 0.3311763107776642, + "learning_rate": 0.00014606922833666476, + "loss": 4.4273, + "step": 2131 + }, + { + "epoch": 0.6818445301724999, + "grad_norm": 0.3407959043979645, + "learning_rate": 0.00014580250546669836, + "loss": 4.4846, + "step": 2132 + }, + { + "epoch": 0.6821643446800856, + "grad_norm": 0.34549182653427124, + "learning_rate": 0.0001455359481427085, + "loss": 4.5137, + "step": 2133 + }, + { + "epoch": 0.6824841591876711, + "grad_norm": 0.33701956272125244, + "learning_rate": 0.00014526955665087013, + "loss": 4.403, + "step": 2134 + }, + { + "epoch": 0.6828039736952568, + "grad_norm": 0.327288419008255, + "learning_rate": 0.00014500333127718035, + "loss": 4.4906, + "step": 2135 + }, + { + "epoch": 0.6831237882028424, + "grad_norm": 0.33447083830833435, + "learning_rate": 0.00014473727230745833, + "loss": 4.4476, + "step": 2136 + }, + { + "epoch": 0.6834436027104279, + "grad_norm": 0.3489927053451538, + "learning_rate": 0.0001444713800273438, + "loss": 4.4885, + "step": 2137 + }, + { + "epoch": 0.6837634172180136, + "grad_norm": 0.3353646397590637, + "learning_rate": 0.0001442056547222982, + "loss": 4.496, + "step": 2138 + }, + { + "epoch": 0.6840832317255992, + "grad_norm": 0.33472129702568054, + "learning_rate": 0.0001439400966776032, + "loss": 4.4526, + "step": 2139 + }, + { + "epoch": 0.6844030462331847, + "grad_norm": 0.34008723497390747, + "learning_rate": 0.00014367470617836117, + "loss": 4.5634, + "step": 2140 + }, + { + "epoch": 0.6847228607407704, + "grad_norm": 0.3388221859931946, + "learning_rate": 0.00014340948350949467, + "loss": 4.5802, + "step": 2141 + }, + { + "epoch": 0.6850426752483559, + "grad_norm": 0.3305363953113556, + "learning_rate": 0.00014314442895574595, + "loss": 4.495, + "step": 2142 + }, + { + "epoch": 0.6853624897559416, + "grad_norm": 0.3456083834171295, + "learning_rate": 0.00014287954280167695, + "loss": 4.4121, + "step": 2143 + }, + { + "epoch": 0.6856823042635272, + "grad_norm": 0.3401589095592499, + "learning_rate": 0.00014261482533166832, + "loss": 4.3316, + "step": 2144 + }, + { + "epoch": 0.6860021187711127, + "grad_norm": 0.3346431851387024, + "learning_rate": 0.0001423502768299202, + "loss": 4.4217, + "step": 2145 + }, + { + "epoch": 0.6863219332786984, + "grad_norm": 0.3352597951889038, + "learning_rate": 0.00014208589758045098, + "loss": 4.4556, + "step": 2146 + }, + { + "epoch": 0.686641747786284, + "grad_norm": 0.3395027816295624, + "learning_rate": 0.00014182168786709755, + "loss": 4.4635, + "step": 2147 + }, + { + "epoch": 0.6869615622938695, + "grad_norm": 0.3329550623893738, + "learning_rate": 0.00014155764797351472, + "loss": 4.5404, + "step": 2148 + }, + { + "epoch": 0.6872813768014552, + "grad_norm": 0.33801010251045227, + "learning_rate": 0.0001412937781831747, + "loss": 4.5546, + "step": 2149 + }, + { + "epoch": 0.6876011913090407, + "grad_norm": 0.3325575590133667, + "learning_rate": 0.0001410300787793675, + "loss": 4.4383, + "step": 2150 + }, + { + "epoch": 0.6879210058166264, + "grad_norm": 0.33519449830055237, + "learning_rate": 0.00014076655004519997, + "loss": 4.4429, + "step": 2151 + }, + { + "epoch": 0.688240820324212, + "grad_norm": 0.3456709682941437, + "learning_rate": 0.00014050319226359593, + "loss": 4.5446, + "step": 2152 + }, + { + "epoch": 0.6885606348317975, + "grad_norm": 0.3320106565952301, + "learning_rate": 0.00014024000571729526, + "loss": 4.4131, + "step": 2153 + }, + { + "epoch": 0.6888804493393832, + "grad_norm": 0.32259705662727356, + "learning_rate": 0.00013997699068885443, + "loss": 4.4047, + "step": 2154 + }, + { + "epoch": 0.6892002638469688, + "grad_norm": 0.32922127842903137, + "learning_rate": 0.00013971414746064554, + "loss": 4.4786, + "step": 2155 + }, + { + "epoch": 0.6895200783545543, + "grad_norm": 0.3358190655708313, + "learning_rate": 0.00013945147631485634, + "loss": 4.3915, + "step": 2156 + }, + { + "epoch": 0.68983989286214, + "grad_norm": 0.34597843885421753, + "learning_rate": 0.00013918897753348991, + "loss": 4.4127, + "step": 2157 + }, + { + "epoch": 0.6901597073697255, + "grad_norm": 0.34270811080932617, + "learning_rate": 0.00013892665139836392, + "loss": 4.4276, + "step": 2158 + }, + { + "epoch": 0.6904795218773112, + "grad_norm": 0.33038774132728577, + "learning_rate": 0.0001386644981911111, + "loss": 4.5058, + "step": 2159 + }, + { + "epoch": 0.6907993363848968, + "grad_norm": 0.5337318181991577, + "learning_rate": 0.00013840251819317832, + "loss": 4.437, + "step": 2160 + }, + { + "epoch": 0.6911191508924823, + "grad_norm": 0.3278668522834778, + "learning_rate": 0.00013814071168582654, + "loss": 4.4028, + "step": 2161 + }, + { + "epoch": 0.691438965400068, + "grad_norm": 0.3297139108181, + "learning_rate": 0.00013787907895013054, + "loss": 4.4416, + "step": 2162 + }, + { + "epoch": 0.6917587799076536, + "grad_norm": 0.33460527658462524, + "learning_rate": 0.0001376176202669783, + "loss": 4.3965, + "step": 2163 + }, + { + "epoch": 0.6920785944152391, + "grad_norm": 0.33878377079963684, + "learning_rate": 0.00013735633591707117, + "loss": 4.5049, + "step": 2164 + }, + { + "epoch": 0.6923984089228248, + "grad_norm": 0.3428264558315277, + "learning_rate": 0.00013709522618092328, + "loss": 4.4022, + "step": 2165 + }, + { + "epoch": 0.6927182234304103, + "grad_norm": 0.33774280548095703, + "learning_rate": 0.00013683429133886122, + "loss": 4.4205, + "step": 2166 + }, + { + "epoch": 0.693038037937996, + "grad_norm": 0.33137834072113037, + "learning_rate": 0.00013657353167102401, + "loss": 4.4648, + "step": 2167 + }, + { + "epoch": 0.6933578524455816, + "grad_norm": 0.33991193771362305, + "learning_rate": 0.00013631294745736227, + "loss": 4.4886, + "step": 2168 + }, + { + "epoch": 0.6936776669531671, + "grad_norm": 0.3367961049079895, + "learning_rate": 0.0001360525389776385, + "loss": 4.4017, + "step": 2169 + }, + { + "epoch": 0.6939974814607528, + "grad_norm": 0.3382626175880432, + "learning_rate": 0.00013579230651142654, + "loss": 4.4184, + "step": 2170 + }, + { + "epoch": 0.6943172959683384, + "grad_norm": 0.3440368175506592, + "learning_rate": 0.00013553225033811114, + "loss": 4.4781, + "step": 2171 + }, + { + "epoch": 0.6946371104759239, + "grad_norm": 0.3414704501628876, + "learning_rate": 0.00013527237073688797, + "loss": 4.5359, + "step": 2172 + }, + { + "epoch": 0.6949569249835096, + "grad_norm": 0.3314661383628845, + "learning_rate": 0.00013501266798676283, + "loss": 4.4856, + "step": 2173 + }, + { + "epoch": 0.6952767394910951, + "grad_norm": 0.3305390477180481, + "learning_rate": 0.000134753142366552, + "loss": 4.4384, + "step": 2174 + }, + { + "epoch": 0.6955965539986808, + "grad_norm": 0.33603882789611816, + "learning_rate": 0.0001344937941548811, + "loss": 4.4986, + "step": 2175 + }, + { + "epoch": 0.6959163685062664, + "grad_norm": 0.3437805771827698, + "learning_rate": 0.00013423462363018604, + "loss": 4.3386, + "step": 2176 + }, + { + "epoch": 0.6962361830138519, + "grad_norm": 0.328469842672348, + "learning_rate": 0.00013397563107071125, + "loss": 4.412, + "step": 2177 + }, + { + "epoch": 0.6965559975214376, + "grad_norm": 0.3432283103466034, + "learning_rate": 0.0001337168167545104, + "loss": 4.5509, + "step": 2178 + }, + { + "epoch": 0.6968758120290232, + "grad_norm": 0.3334380090236664, + "learning_rate": 0.000133458180959446, + "loss": 4.4809, + "step": 2179 + }, + { + "epoch": 0.6971956265366087, + "grad_norm": 0.32941102981567383, + "learning_rate": 0.00013319972396318828, + "loss": 4.4263, + "step": 2180 + }, + { + "epoch": 0.6975154410441944, + "grad_norm": 0.34028035402297974, + "learning_rate": 0.00013294144604321633, + "loss": 4.4874, + "step": 2181 + }, + { + "epoch": 0.6978352555517799, + "grad_norm": 0.34419766068458557, + "learning_rate": 0.00013268334747681626, + "loss": 4.4144, + "step": 2182 + }, + { + "epoch": 0.6981550700593656, + "grad_norm": 0.32000261545181274, + "learning_rate": 0.0001324254285410821, + "loss": 4.4725, + "step": 2183 + }, + { + "epoch": 0.6984748845669512, + "grad_norm": 0.3259260058403015, + "learning_rate": 0.0001321676895129149, + "loss": 4.4092, + "step": 2184 + }, + { + "epoch": 0.6987946990745367, + "grad_norm": 0.33444148302078247, + "learning_rate": 0.0001319101306690222, + "loss": 4.3921, + "step": 2185 + }, + { + "epoch": 0.6991145135821224, + "grad_norm": 0.3239077627658844, + "learning_rate": 0.0001316527522859189, + "loss": 4.4585, + "step": 2186 + }, + { + "epoch": 0.699434328089708, + "grad_norm": 0.32290899753570557, + "learning_rate": 0.00013139555463992527, + "loss": 4.3708, + "step": 2187 + }, + { + "epoch": 0.6997541425972935, + "grad_norm": 0.33842989802360535, + "learning_rate": 0.00013113853800716824, + "loss": 4.4469, + "step": 2188 + }, + { + "epoch": 0.7000739571048792, + "grad_norm": 0.32952606678009033, + "learning_rate": 0.00013088170266357986, + "loss": 4.4598, + "step": 2189 + }, + { + "epoch": 0.7003937716124647, + "grad_norm": 0.3406091034412384, + "learning_rate": 0.00013062504888489788, + "loss": 4.484, + "step": 2190 + }, + { + "epoch": 0.7007135861200504, + "grad_norm": 0.3301216661930084, + "learning_rate": 0.0001303685769466651, + "loss": 4.5346, + "step": 2191 + }, + { + "epoch": 0.701033400627636, + "grad_norm": 0.34070494771003723, + "learning_rate": 0.00013011228712422898, + "loss": 4.4581, + "step": 2192 + }, + { + "epoch": 0.7013532151352215, + "grad_norm": 0.3262787163257599, + "learning_rate": 0.0001298561796927417, + "loss": 4.5433, + "step": 2193 + }, + { + "epoch": 0.7016730296428072, + "grad_norm": 0.33801570534706116, + "learning_rate": 0.00012960025492715914, + "loss": 4.4782, + "step": 2194 + }, + { + "epoch": 0.7019928441503928, + "grad_norm": 0.3243328630924225, + "learning_rate": 0.0001293445131022416, + "loss": 4.4121, + "step": 2195 + }, + { + "epoch": 0.7023126586579783, + "grad_norm": 0.323103666305542, + "learning_rate": 0.00012908895449255262, + "loss": 4.3293, + "step": 2196 + }, + { + "epoch": 0.702632473165564, + "grad_norm": 0.3357979953289032, + "learning_rate": 0.0001288335793724592, + "loss": 4.4821, + "step": 2197 + }, + { + "epoch": 0.7029522876731495, + "grad_norm": 0.3401612341403961, + "learning_rate": 0.00012857838801613153, + "loss": 4.4517, + "step": 2198 + }, + { + "epoch": 0.7032721021807352, + "grad_norm": 0.3437880873680115, + "learning_rate": 0.000128323380697542, + "loss": 4.3027, + "step": 2199 + }, + { + "epoch": 0.7035919166883208, + "grad_norm": 0.32506972551345825, + "learning_rate": 0.0001280685576904658, + "loss": 4.4328, + "step": 2200 + }, + { + "epoch": 0.7035919166883208, + "eval_loss": 4.454440593719482, + "eval_runtime": 96.9049, + "eval_samples_per_second": 19.576, + "eval_steps_per_second": 4.902, + "step": 2200 + }, + { + "epoch": 0.7039117311959063, + "grad_norm": 0.3431197702884674, + "learning_rate": 0.0001278139192684802, + "loss": 4.4512, + "step": 2201 + }, + { + "epoch": 0.704231545703492, + "grad_norm": 0.338541716337204, + "learning_rate": 0.00012755946570496427, + "loss": 4.4354, + "step": 2202 + }, + { + "epoch": 0.7045513602110776, + "grad_norm": 0.3421216905117035, + "learning_rate": 0.0001273051972730987, + "loss": 4.3548, + "step": 2203 + }, + { + "epoch": 0.7048711747186632, + "grad_norm": 0.352405309677124, + "learning_rate": 0.00012705111424586512, + "loss": 4.5309, + "step": 2204 + }, + { + "epoch": 0.7051909892262488, + "grad_norm": 0.34158819913864136, + "learning_rate": 0.00012679721689604642, + "loss": 4.4749, + "step": 2205 + }, + { + "epoch": 0.7055108037338343, + "grad_norm": 0.3365214765071869, + "learning_rate": 0.00012654350549622605, + "loss": 4.4697, + "step": 2206 + }, + { + "epoch": 0.70583061824142, + "grad_norm": 0.3539983034133911, + "learning_rate": 0.00012628998031878784, + "loss": 4.4178, + "step": 2207 + }, + { + "epoch": 0.7061504327490056, + "grad_norm": 0.34218186140060425, + "learning_rate": 0.00012603664163591573, + "loss": 4.5161, + "step": 2208 + }, + { + "epoch": 0.7064702472565911, + "grad_norm": 0.34383469820022583, + "learning_rate": 0.00012578348971959324, + "loss": 4.3953, + "step": 2209 + }, + { + "epoch": 0.7067900617641768, + "grad_norm": 0.3489435911178589, + "learning_rate": 0.0001255305248416036, + "loss": 4.3898, + "step": 2210 + }, + { + "epoch": 0.7071098762717624, + "grad_norm": 0.3475740849971771, + "learning_rate": 0.0001252777472735291, + "loss": 4.4867, + "step": 2211 + }, + { + "epoch": 0.707429690779348, + "grad_norm": 0.33726832270622253, + "learning_rate": 0.00012502515728675124, + "loss": 4.4221, + "step": 2212 + }, + { + "epoch": 0.7077495052869336, + "grad_norm": 0.33398839831352234, + "learning_rate": 0.00012477275515244951, + "loss": 4.4643, + "step": 2213 + }, + { + "epoch": 0.7080693197945191, + "grad_norm": 0.33227816224098206, + "learning_rate": 0.00012452054114160232, + "loss": 4.4726, + "step": 2214 + }, + { + "epoch": 0.7083891343021048, + "grad_norm": 0.348752498626709, + "learning_rate": 0.00012426851552498584, + "loss": 4.4492, + "step": 2215 + }, + { + "epoch": 0.7087089488096904, + "grad_norm": 0.3545939326286316, + "learning_rate": 0.00012401667857317406, + "loss": 4.4339, + "step": 2216 + }, + { + "epoch": 0.7090287633172759, + "grad_norm": 0.3521256148815155, + "learning_rate": 0.0001237650305565385, + "loss": 4.5827, + "step": 2217 + }, + { + "epoch": 0.7093485778248616, + "grad_norm": 0.33230698108673096, + "learning_rate": 0.00012351357174524745, + "loss": 4.4767, + "step": 2218 + }, + { + "epoch": 0.7096683923324472, + "grad_norm": 0.3276318311691284, + "learning_rate": 0.00012326230240926653, + "loss": 4.5138, + "step": 2219 + }, + { + "epoch": 0.7099882068400328, + "grad_norm": 0.3415897488594055, + "learning_rate": 0.00012301122281835772, + "loss": 4.4965, + "step": 2220 + }, + { + "epoch": 0.7103080213476184, + "grad_norm": 0.3361230790615082, + "learning_rate": 0.00012276033324207935, + "loss": 4.2935, + "step": 2221 + }, + { + "epoch": 0.710627835855204, + "grad_norm": 0.32287877798080444, + "learning_rate": 0.00012250963394978584, + "loss": 4.4713, + "step": 2222 + }, + { + "epoch": 0.7109476503627896, + "grad_norm": 0.3255848288536072, + "learning_rate": 0.00012225912521062702, + "loss": 4.492, + "step": 2223 + }, + { + "epoch": 0.7112674648703752, + "grad_norm": 0.3354206681251526, + "learning_rate": 0.00012200880729354847, + "loss": 4.475, + "step": 2224 + }, + { + "epoch": 0.7115872793779607, + "grad_norm": 0.3278037905693054, + "learning_rate": 0.0001217586804672905, + "loss": 4.4227, + "step": 2225 + }, + { + "epoch": 0.7119070938855464, + "grad_norm": 0.3404330611228943, + "learning_rate": 0.0001215087450003889, + "loss": 4.5188, + "step": 2226 + }, + { + "epoch": 0.712226908393132, + "grad_norm": 0.3332688510417938, + "learning_rate": 0.00012125900116117357, + "loss": 4.2328, + "step": 2227 + }, + { + "epoch": 0.7125467229007176, + "grad_norm": 0.33184289932250977, + "learning_rate": 0.0001210094492177686, + "loss": 4.3894, + "step": 2228 + }, + { + "epoch": 0.7128665374083032, + "grad_norm": 0.32943254709243774, + "learning_rate": 0.00012076008943809238, + "loss": 4.4396, + "step": 2229 + }, + { + "epoch": 0.7131863519158887, + "grad_norm": 0.32169508934020996, + "learning_rate": 0.00012051092208985671, + "loss": 4.452, + "step": 2230 + }, + { + "epoch": 0.7135061664234744, + "grad_norm": 0.3315311372280121, + "learning_rate": 0.00012026194744056684, + "loss": 4.4236, + "step": 2231 + }, + { + "epoch": 0.71382598093106, + "grad_norm": 0.33657577633857727, + "learning_rate": 0.00012001316575752159, + "loss": 4.4091, + "step": 2232 + }, + { + "epoch": 0.7141457954386455, + "grad_norm": 0.339559406042099, + "learning_rate": 0.00011976457730781191, + "loss": 4.5155, + "step": 2233 + }, + { + "epoch": 0.7144656099462312, + "grad_norm": 0.3332586884498596, + "learning_rate": 0.00011951618235832183, + "loss": 4.418, + "step": 2234 + }, + { + "epoch": 0.7147854244538168, + "grad_norm": 0.3349529206752777, + "learning_rate": 0.00011926798117572722, + "loss": 4.5207, + "step": 2235 + }, + { + "epoch": 0.7151052389614024, + "grad_norm": 0.35063642263412476, + "learning_rate": 0.00011901997402649629, + "loss": 4.4715, + "step": 2236 + }, + { + "epoch": 0.715425053468988, + "grad_norm": 0.340107262134552, + "learning_rate": 0.00011877216117688875, + "loss": 4.4252, + "step": 2237 + }, + { + "epoch": 0.7157448679765736, + "grad_norm": 0.3303718864917755, + "learning_rate": 0.00011852454289295575, + "loss": 4.3497, + "step": 2238 + }, + { + "epoch": 0.7160646824841592, + "grad_norm": 0.3243018686771393, + "learning_rate": 0.00011827711944053962, + "loss": 4.3824, + "step": 2239 + }, + { + "epoch": 0.7163844969917448, + "grad_norm": 0.3365945816040039, + "learning_rate": 0.00011802989108527331, + "loss": 4.5189, + "step": 2240 + }, + { + "epoch": 0.7167043114993303, + "grad_norm": 0.33397188782691956, + "learning_rate": 0.00011778285809258052, + "loss": 4.4392, + "step": 2241 + }, + { + "epoch": 0.717024126006916, + "grad_norm": 0.33258089423179626, + "learning_rate": 0.00011753602072767514, + "loss": 4.4494, + "step": 2242 + }, + { + "epoch": 0.7173439405145016, + "grad_norm": 0.3365824222564697, + "learning_rate": 0.00011728937925556107, + "loss": 4.3737, + "step": 2243 + }, + { + "epoch": 0.7176637550220872, + "grad_norm": 0.354343980550766, + "learning_rate": 0.00011704293394103194, + "loss": 4.4178, + "step": 2244 + }, + { + "epoch": 0.7179835695296728, + "grad_norm": 0.3422267735004425, + "learning_rate": 0.00011679668504867051, + "loss": 4.4577, + "step": 2245 + }, + { + "epoch": 0.7183033840372584, + "grad_norm": 0.3237408697605133, + "learning_rate": 0.00011655063284284901, + "loss": 4.356, + "step": 2246 + }, + { + "epoch": 0.718623198544844, + "grad_norm": 0.33587223291397095, + "learning_rate": 0.0001163047775877283, + "loss": 4.3869, + "step": 2247 + }, + { + "epoch": 0.7189430130524296, + "grad_norm": 0.3292006552219391, + "learning_rate": 0.00011605911954725802, + "loss": 4.3966, + "step": 2248 + }, + { + "epoch": 0.7192628275600151, + "grad_norm": 0.3411811590194702, + "learning_rate": 0.00011581365898517567, + "loss": 4.327, + "step": 2249 + }, + { + "epoch": 0.7195826420676008, + "grad_norm": 0.3547792434692383, + "learning_rate": 0.0001155683961650071, + "loss": 4.5197, + "step": 2250 + }, + { + "epoch": 0.7199024565751864, + "grad_norm": 0.33441030979156494, + "learning_rate": 0.00011532333135006579, + "loss": 4.3558, + "step": 2251 + }, + { + "epoch": 0.720222271082772, + "grad_norm": 0.3281584680080414, + "learning_rate": 0.00011507846480345255, + "loss": 4.4791, + "step": 2252 + }, + { + "epoch": 0.7205420855903576, + "grad_norm": 0.3454711139202118, + "learning_rate": 0.00011483379678805551, + "loss": 4.3724, + "step": 2253 + }, + { + "epoch": 0.7208619000979432, + "grad_norm": 0.33835241198539734, + "learning_rate": 0.00011458932756654938, + "loss": 4.4171, + "step": 2254 + }, + { + "epoch": 0.7211817146055288, + "grad_norm": 0.3432719111442566, + "learning_rate": 0.00011434505740139558, + "loss": 4.4025, + "step": 2255 + }, + { + "epoch": 0.7215015291131144, + "grad_norm": 0.34123605489730835, + "learning_rate": 0.00011410098655484194, + "loss": 4.4138, + "step": 2256 + }, + { + "epoch": 0.7218213436206999, + "grad_norm": 0.34249627590179443, + "learning_rate": 0.00011385711528892216, + "loss": 4.4132, + "step": 2257 + }, + { + "epoch": 0.7221411581282856, + "grad_norm": 0.3301515579223633, + "learning_rate": 0.00011361344386545585, + "loss": 4.4961, + "step": 2258 + }, + { + "epoch": 0.7224609726358712, + "grad_norm": 0.33210471272468567, + "learning_rate": 0.00011336997254604769, + "loss": 4.3918, + "step": 2259 + }, + { + "epoch": 0.7227807871434568, + "grad_norm": 0.3336414396762848, + "learning_rate": 0.0001131267015920879, + "loss": 4.4551, + "step": 2260 + }, + { + "epoch": 0.7231006016510424, + "grad_norm": 0.34347960352897644, + "learning_rate": 0.0001128836312647514, + "loss": 4.3648, + "step": 2261 + }, + { + "epoch": 0.723420416158628, + "grad_norm": 0.33358657360076904, + "learning_rate": 0.00011264076182499787, + "loss": 4.4994, + "step": 2262 + }, + { + "epoch": 0.7237402306662136, + "grad_norm": 0.32134950160980225, + "learning_rate": 0.00011239809353357127, + "loss": 4.4902, + "step": 2263 + }, + { + "epoch": 0.7240600451737992, + "grad_norm": 0.3383727967739105, + "learning_rate": 0.00011215562665099941, + "loss": 4.3234, + "step": 2264 + }, + { + "epoch": 0.7243798596813849, + "grad_norm": 0.36297017335891724, + "learning_rate": 0.00011191336143759417, + "loss": 4.4825, + "step": 2265 + }, + { + "epoch": 0.7246996741889704, + "grad_norm": 0.3291953206062317, + "learning_rate": 0.00011167129815345048, + "loss": 4.4349, + "step": 2266 + }, + { + "epoch": 0.725019488696556, + "grad_norm": 0.35270607471466064, + "learning_rate": 0.0001114294370584471, + "loss": 4.4035, + "step": 2267 + }, + { + "epoch": 0.7253393032041416, + "grad_norm": 0.3327333927154541, + "learning_rate": 0.00011118777841224534, + "loss": 4.444, + "step": 2268 + }, + { + "epoch": 0.7256591177117272, + "grad_norm": 0.3331069350242615, + "learning_rate": 0.00011094632247428907, + "loss": 4.3419, + "step": 2269 + }, + { + "epoch": 0.7259789322193128, + "grad_norm": 0.33473634719848633, + "learning_rate": 0.00011070506950380483, + "loss": 4.3794, + "step": 2270 + }, + { + "epoch": 0.7262987467268984, + "grad_norm": 0.3344692885875702, + "learning_rate": 0.0001104640197598008, + "loss": 4.4044, + "step": 2271 + }, + { + "epoch": 0.726618561234484, + "grad_norm": 0.3378191888332367, + "learning_rate": 0.00011022317350106774, + "loss": 4.4291, + "step": 2272 + }, + { + "epoch": 0.7269383757420697, + "grad_norm": 0.3254040479660034, + "learning_rate": 0.00010998253098617707, + "loss": 4.4667, + "step": 2273 + }, + { + "epoch": 0.7272581902496552, + "grad_norm": 0.3407931327819824, + "learning_rate": 0.00010974209247348211, + "loss": 4.439, + "step": 2274 + }, + { + "epoch": 0.7275780047572408, + "grad_norm": 0.3590623140335083, + "learning_rate": 0.00010950185822111697, + "loss": 4.3551, + "step": 2275 + }, + { + "epoch": 0.7278978192648264, + "grad_norm": 0.3413101136684418, + "learning_rate": 0.00010926182848699613, + "loss": 4.4105, + "step": 2276 + }, + { + "epoch": 0.728217633772412, + "grad_norm": 0.33191800117492676, + "learning_rate": 0.00010902200352881522, + "loss": 4.5755, + "step": 2277 + }, + { + "epoch": 0.7285374482799976, + "grad_norm": 0.3360489010810852, + "learning_rate": 0.00010878238360404934, + "loss": 4.383, + "step": 2278 + }, + { + "epoch": 0.7288572627875832, + "grad_norm": 0.33308154344558716, + "learning_rate": 0.00010854296896995379, + "loss": 4.4871, + "step": 2279 + }, + { + "epoch": 0.7291770772951688, + "grad_norm": 0.3325072228908539, + "learning_rate": 0.00010830375988356354, + "loss": 4.4002, + "step": 2280 + }, + { + "epoch": 0.7294968918027545, + "grad_norm": 0.32435134053230286, + "learning_rate": 0.00010806475660169243, + "loss": 4.3895, + "step": 2281 + }, + { + "epoch": 0.72981670631034, + "grad_norm": 0.3442571461200714, + "learning_rate": 0.00010782595938093417, + "loss": 4.4782, + "step": 2282 + }, + { + "epoch": 0.7301365208179256, + "grad_norm": 0.3612309992313385, + "learning_rate": 0.00010758736847766033, + "loss": 4.422, + "step": 2283 + }, + { + "epoch": 0.7304563353255112, + "grad_norm": 0.3383840024471283, + "learning_rate": 0.00010734898414802169, + "loss": 4.4108, + "step": 2284 + }, + { + "epoch": 0.7307761498330968, + "grad_norm": 0.3336951434612274, + "learning_rate": 0.00010711080664794676, + "loss": 4.4591, + "step": 2285 + }, + { + "epoch": 0.7310959643406824, + "grad_norm": 0.3335397243499756, + "learning_rate": 0.00010687283623314225, + "loss": 4.3975, + "step": 2286 + }, + { + "epoch": 0.731415778848268, + "grad_norm": 0.39656996726989746, + "learning_rate": 0.00010663507315909255, + "loss": 4.4759, + "step": 2287 + }, + { + "epoch": 0.7317355933558536, + "grad_norm": 0.347331702709198, + "learning_rate": 0.00010639751768105936, + "loss": 4.445, + "step": 2288 + }, + { + "epoch": 0.7320554078634393, + "grad_norm": 0.3473789095878601, + "learning_rate": 0.00010616017005408167, + "loss": 4.4213, + "step": 2289 + }, + { + "epoch": 0.7323752223710248, + "grad_norm": 0.32854992151260376, + "learning_rate": 0.00010592303053297499, + "loss": 4.3913, + "step": 2290 + }, + { + "epoch": 0.7326950368786104, + "grad_norm": 0.35310834646224976, + "learning_rate": 0.00010568609937233168, + "loss": 4.4185, + "step": 2291 + }, + { + "epoch": 0.733014851386196, + "grad_norm": 0.3509746491909027, + "learning_rate": 0.00010544937682652035, + "loss": 4.4345, + "step": 2292 + }, + { + "epoch": 0.7333346658937816, + "grad_norm": 0.3366645276546478, + "learning_rate": 0.00010521286314968567, + "loss": 4.3897, + "step": 2293 + }, + { + "epoch": 0.7336544804013672, + "grad_norm": 0.322303831577301, + "learning_rate": 0.00010497655859574809, + "loss": 4.4076, + "step": 2294 + }, + { + "epoch": 0.7339742949089528, + "grad_norm": 0.3318299949169159, + "learning_rate": 0.00010474046341840329, + "loss": 4.431, + "step": 2295 + }, + { + "epoch": 0.7342941094165384, + "grad_norm": 0.3476223945617676, + "learning_rate": 0.00010450457787112246, + "loss": 4.4455, + "step": 2296 + }, + { + "epoch": 0.7346139239241241, + "grad_norm": 0.34152400493621826, + "learning_rate": 0.00010426890220715164, + "loss": 4.3992, + "step": 2297 + }, + { + "epoch": 0.7349337384317096, + "grad_norm": 0.34011977910995483, + "learning_rate": 0.00010403343667951149, + "loss": 4.3288, + "step": 2298 + }, + { + "epoch": 0.7352535529392952, + "grad_norm": 0.3339643180370331, + "learning_rate": 0.00010379818154099724, + "loss": 4.4182, + "step": 2299 + }, + { + "epoch": 0.7355733674468808, + "grad_norm": 0.33768823742866516, + "learning_rate": 0.00010356313704417794, + "loss": 4.4182, + "step": 2300 + }, + { + "epoch": 0.7355733674468808, + "eval_loss": 4.4368157386779785, + "eval_runtime": 89.2711, + "eval_samples_per_second": 21.25, + "eval_steps_per_second": 5.321, + "step": 2300 + }, + { + "epoch": 0.7358931819544664, + "grad_norm": 0.33821696043014526, + "learning_rate": 0.0001033283034413967, + "loss": 4.3648, + "step": 2301 + }, + { + "epoch": 0.736212996462052, + "grad_norm": 0.3329372704029083, + "learning_rate": 0.00010309368098477025, + "loss": 4.4616, + "step": 2302 + }, + { + "epoch": 0.7365328109696376, + "grad_norm": 0.3347349464893341, + "learning_rate": 0.00010285926992618855, + "loss": 4.374, + "step": 2303 + }, + { + "epoch": 0.7368526254772232, + "grad_norm": 0.3423709273338318, + "learning_rate": 0.0001026250705173147, + "loss": 4.374, + "step": 2304 + }, + { + "epoch": 0.7371724399848089, + "grad_norm": 0.3551722466945648, + "learning_rate": 0.00010239108300958432, + "loss": 4.4644, + "step": 2305 + }, + { + "epoch": 0.7374922544923944, + "grad_norm": 0.3246476948261261, + "learning_rate": 0.00010215730765420579, + "loss": 4.4353, + "step": 2306 + }, + { + "epoch": 0.73781206899998, + "grad_norm": 0.3443451523780823, + "learning_rate": 0.00010192374470215969, + "loss": 4.4584, + "step": 2307 + }, + { + "epoch": 0.7381318835075656, + "grad_norm": 0.3336166441440582, + "learning_rate": 0.00010169039440419855, + "loss": 4.4641, + "step": 2308 + }, + { + "epoch": 0.7384516980151512, + "grad_norm": 0.34944331645965576, + "learning_rate": 0.00010145725701084643, + "loss": 4.3898, + "step": 2309 + }, + { + "epoch": 0.7387715125227368, + "grad_norm": 0.34261205792427063, + "learning_rate": 0.000101224332772399, + "loss": 4.3359, + "step": 2310 + }, + { + "epoch": 0.7390913270303224, + "grad_norm": 0.34464573860168457, + "learning_rate": 0.00010099162193892303, + "loss": 4.4617, + "step": 2311 + }, + { + "epoch": 0.739411141537908, + "grad_norm": 0.3316618800163269, + "learning_rate": 0.00010075912476025623, + "loss": 4.4845, + "step": 2312 + }, + { + "epoch": 0.7397309560454937, + "grad_norm": 0.32946211099624634, + "learning_rate": 0.000100526841486007, + "loss": 4.3392, + "step": 2313 + }, + { + "epoch": 0.7400507705530792, + "grad_norm": 0.34942230582237244, + "learning_rate": 0.00010029477236555372, + "loss": 4.4224, + "step": 2314 + }, + { + "epoch": 0.7403705850606648, + "grad_norm": 0.32532206177711487, + "learning_rate": 0.00010006291764804523, + "loss": 4.4112, + "step": 2315 + }, + { + "epoch": 0.7406903995682504, + "grad_norm": 0.3478979766368866, + "learning_rate": 9.98312775824001e-05, + "loss": 4.3782, + "step": 2316 + }, + { + "epoch": 0.741010214075836, + "grad_norm": 0.3328978419303894, + "learning_rate": 9.959985241730641e-05, + "loss": 4.426, + "step": 2317 + }, + { + "epoch": 0.7413300285834216, + "grad_norm": 0.33060571551322937, + "learning_rate": 9.936864240122164e-05, + "loss": 4.3671, + "step": 2318 + }, + { + "epoch": 0.7416498430910072, + "grad_norm": 0.3444348871707916, + "learning_rate": 9.913764778237196e-05, + "loss": 4.3798, + "step": 2319 + }, + { + "epoch": 0.7419696575985928, + "grad_norm": 0.3299075961112976, + "learning_rate": 9.890686880875274e-05, + "loss": 4.5089, + "step": 2320 + }, + { + "epoch": 0.7422894721061785, + "grad_norm": 0.347756028175354, + "learning_rate": 9.86763057281273e-05, + "loss": 4.3846, + "step": 2321 + }, + { + "epoch": 0.742609286613764, + "grad_norm": 0.3371478319168091, + "learning_rate": 9.844595878802778e-05, + "loss": 4.3443, + "step": 2322 + }, + { + "epoch": 0.7429291011213496, + "grad_norm": 0.3344487249851227, + "learning_rate": 9.821582823575398e-05, + "loss": 4.4405, + "step": 2323 + }, + { + "epoch": 0.7432489156289352, + "grad_norm": 0.3376719057559967, + "learning_rate": 9.79859143183732e-05, + "loss": 4.4945, + "step": 2324 + }, + { + "epoch": 0.7435687301365208, + "grad_norm": 0.3353802561759949, + "learning_rate": 9.77562172827205e-05, + "loss": 4.4184, + "step": 2325 + }, + { + "epoch": 0.7438885446441064, + "grad_norm": 0.3342118561267853, + "learning_rate": 9.752673737539779e-05, + "loss": 4.4159, + "step": 2326 + }, + { + "epoch": 0.744208359151692, + "grad_norm": 0.3366515338420868, + "learning_rate": 9.729747484277402e-05, + "loss": 4.3736, + "step": 2327 + }, + { + "epoch": 0.7445281736592776, + "grad_norm": 0.3395729959011078, + "learning_rate": 9.706842993098503e-05, + "loss": 4.4127, + "step": 2328 + }, + { + "epoch": 0.7448479881668633, + "grad_norm": 0.33426910638809204, + "learning_rate": 9.683960288593249e-05, + "loss": 4.3812, + "step": 2329 + }, + { + "epoch": 0.7451678026744488, + "grad_norm": 0.35259851813316345, + "learning_rate": 9.661099395328463e-05, + "loss": 4.3867, + "step": 2330 + }, + { + "epoch": 0.7454876171820344, + "grad_norm": 0.3474106192588806, + "learning_rate": 9.638260337847513e-05, + "loss": 4.3845, + "step": 2331 + }, + { + "epoch": 0.74580743168962, + "grad_norm": 0.331312358379364, + "learning_rate": 9.615443140670357e-05, + "loss": 4.4059, + "step": 2332 + }, + { + "epoch": 0.7461272461972056, + "grad_norm": 0.33790484070777893, + "learning_rate": 9.592647828293468e-05, + "loss": 4.3753, + "step": 2333 + }, + { + "epoch": 0.7464470607047913, + "grad_norm": 0.34516483545303345, + "learning_rate": 9.569874425189827e-05, + "loss": 4.3413, + "step": 2334 + }, + { + "epoch": 0.7467668752123768, + "grad_norm": 0.34916952252388, + "learning_rate": 9.547122955808902e-05, + "loss": 4.5153, + "step": 2335 + }, + { + "epoch": 0.7470866897199624, + "grad_norm": 0.3533429801464081, + "learning_rate": 9.524393444576585e-05, + "loss": 4.4815, + "step": 2336 + }, + { + "epoch": 0.7474065042275481, + "grad_norm": 0.32154449820518494, + "learning_rate": 9.501685915895218e-05, + "loss": 4.3711, + "step": 2337 + }, + { + "epoch": 0.7477263187351336, + "grad_norm": 0.33027729392051697, + "learning_rate": 9.479000394143543e-05, + "loss": 4.396, + "step": 2338 + }, + { + "epoch": 0.7480461332427192, + "grad_norm": 0.3365285098552704, + "learning_rate": 9.456336903676666e-05, + "loss": 4.4671, + "step": 2339 + }, + { + "epoch": 0.7483659477503048, + "grad_norm": 0.33891135454177856, + "learning_rate": 9.433695468826055e-05, + "loss": 4.4509, + "step": 2340 + }, + { + "epoch": 0.7486857622578904, + "grad_norm": 0.3212648332118988, + "learning_rate": 9.411076113899465e-05, + "loss": 4.4377, + "step": 2341 + }, + { + "epoch": 0.7490055767654761, + "grad_norm": 0.33179762959480286, + "learning_rate": 9.388478863180982e-05, + "loss": 4.4276, + "step": 2342 + }, + { + "epoch": 0.7493253912730616, + "grad_norm": 0.3465440273284912, + "learning_rate": 9.365903740930947e-05, + "loss": 4.4863, + "step": 2343 + }, + { + "epoch": 0.7496452057806472, + "grad_norm": 0.32808244228363037, + "learning_rate": 9.343350771385957e-05, + "loss": 4.4296, + "step": 2344 + }, + { + "epoch": 0.7499650202882329, + "grad_norm": 0.3241809010505676, + "learning_rate": 9.320819978758787e-05, + "loss": 4.3921, + "step": 2345 + }, + { + "epoch": 0.7502848347958184, + "grad_norm": 0.3299873173236847, + "learning_rate": 9.298311387238449e-05, + "loss": 4.484, + "step": 2346 + }, + { + "epoch": 0.750604649303404, + "grad_norm": 0.3408411145210266, + "learning_rate": 9.275825020990092e-05, + "loss": 4.4399, + "step": 2347 + }, + { + "epoch": 0.7509244638109897, + "grad_norm": 0.3207140564918518, + "learning_rate": 9.25336090415502e-05, + "loss": 4.3208, + "step": 2348 + }, + { + "epoch": 0.7512442783185752, + "grad_norm": 0.33310171961784363, + "learning_rate": 9.230919060850645e-05, + "loss": 4.5034, + "step": 2349 + }, + { + "epoch": 0.7515640928261609, + "grad_norm": 0.344112366437912, + "learning_rate": 9.208499515170451e-05, + "loss": 4.3852, + "step": 2350 + }, + { + "epoch": 0.7518839073337464, + "grad_norm": 0.33422914147377014, + "learning_rate": 9.186102291184003e-05, + "loss": 4.4972, + "step": 2351 + }, + { + "epoch": 0.752203721841332, + "grad_norm": 0.3382188081741333, + "learning_rate": 9.163727412936895e-05, + "loss": 4.455, + "step": 2352 + }, + { + "epoch": 0.7525235363489177, + "grad_norm": 0.3393407464027405, + "learning_rate": 9.141374904450733e-05, + "loss": 4.4865, + "step": 2353 + }, + { + "epoch": 0.7528433508565032, + "grad_norm": 0.34060433506965637, + "learning_rate": 9.119044789723108e-05, + "loss": 4.3234, + "step": 2354 + }, + { + "epoch": 0.7531631653640888, + "grad_norm": 0.33734777569770813, + "learning_rate": 9.09673709272755e-05, + "loss": 4.5168, + "step": 2355 + }, + { + "epoch": 0.7534829798716745, + "grad_norm": 0.3249700963497162, + "learning_rate": 9.07445183741355e-05, + "loss": 4.3355, + "step": 2356 + }, + { + "epoch": 0.75380279437926, + "grad_norm": 0.3302399516105652, + "learning_rate": 9.052189047706484e-05, + "loss": 4.3356, + "step": 2357 + }, + { + "epoch": 0.7541226088868457, + "grad_norm": 0.3629155158996582, + "learning_rate": 9.029948747507627e-05, + "loss": 4.3506, + "step": 2358 + }, + { + "epoch": 0.7544424233944312, + "grad_norm": 0.34194469451904297, + "learning_rate": 9.0077309606941e-05, + "loss": 4.4767, + "step": 2359 + }, + { + "epoch": 0.7547622379020168, + "grad_norm": 0.3354686498641968, + "learning_rate": 8.985535711118844e-05, + "loss": 4.4364, + "step": 2360 + }, + { + "epoch": 0.7550820524096025, + "grad_norm": 0.34276947379112244, + "learning_rate": 8.963363022610623e-05, + "loss": 4.3009, + "step": 2361 + }, + { + "epoch": 0.755401866917188, + "grad_norm": 0.33784785866737366, + "learning_rate": 8.941212918973952e-05, + "loss": 4.4614, + "step": 2362 + }, + { + "epoch": 0.7557216814247736, + "grad_norm": 0.32760488986968994, + "learning_rate": 8.919085423989135e-05, + "loss": 4.3488, + "step": 2363 + }, + { + "epoch": 0.7560414959323593, + "grad_norm": 0.32915598154067993, + "learning_rate": 8.896980561412196e-05, + "loss": 4.4392, + "step": 2364 + }, + { + "epoch": 0.7563613104399448, + "grad_norm": 0.33573564887046814, + "learning_rate": 8.874898354974821e-05, + "loss": 4.3963, + "step": 2365 + }, + { + "epoch": 0.7566811249475305, + "grad_norm": 0.3680020570755005, + "learning_rate": 8.85283882838443e-05, + "loss": 4.4194, + "step": 2366 + }, + { + "epoch": 0.757000939455116, + "grad_norm": 0.31760096549987793, + "learning_rate": 8.830802005324031e-05, + "loss": 4.3387, + "step": 2367 + }, + { + "epoch": 0.7573207539627016, + "grad_norm": 0.32655471563339233, + "learning_rate": 8.808787909452334e-05, + "loss": 4.3461, + "step": 2368 + }, + { + "epoch": 0.7576405684702873, + "grad_norm": 0.32854411005973816, + "learning_rate": 8.786796564403575e-05, + "loss": 4.354, + "step": 2369 + }, + { + "epoch": 0.7579603829778728, + "grad_norm": 0.35439810156822205, + "learning_rate": 8.764827993787613e-05, + "loss": 4.4464, + "step": 2370 + }, + { + "epoch": 0.7582801974854584, + "grad_norm": 0.34276890754699707, + "learning_rate": 8.742882221189844e-05, + "loss": 4.4719, + "step": 2371 + }, + { + "epoch": 0.758600011993044, + "grad_norm": 0.33621183037757874, + "learning_rate": 8.720959270171162e-05, + "loss": 4.3776, + "step": 2372 + }, + { + "epoch": 0.7589198265006296, + "grad_norm": 0.34009408950805664, + "learning_rate": 8.699059164268015e-05, + "loss": 4.4472, + "step": 2373 + }, + { + "epoch": 0.7592396410082153, + "grad_norm": 0.32150155305862427, + "learning_rate": 8.677181926992271e-05, + "loss": 4.3584, + "step": 2374 + }, + { + "epoch": 0.7595594555158008, + "grad_norm": 0.332084596157074, + "learning_rate": 8.655327581831279e-05, + "loss": 4.4232, + "step": 2375 + }, + { + "epoch": 0.7598792700233864, + "grad_norm": 0.32362791895866394, + "learning_rate": 8.633496152247784e-05, + "loss": 4.4576, + "step": 2376 + }, + { + "epoch": 0.7601990845309721, + "grad_norm": 0.3358210623264313, + "learning_rate": 8.611687661679945e-05, + "loss": 4.4467, + "step": 2377 + }, + { + "epoch": 0.7605188990385576, + "grad_norm": 0.33137738704681396, + "learning_rate": 8.589902133541323e-05, + "loss": 4.4064, + "step": 2378 + }, + { + "epoch": 0.7608387135461432, + "grad_norm": 0.3294398784637451, + "learning_rate": 8.568139591220764e-05, + "loss": 4.2972, + "step": 2379 + }, + { + "epoch": 0.7611585280537289, + "grad_norm": 0.32762813568115234, + "learning_rate": 8.546400058082492e-05, + "loss": 4.3664, + "step": 2380 + }, + { + "epoch": 0.7614783425613144, + "grad_norm": 0.3572172522544861, + "learning_rate": 8.524683557465987e-05, + "loss": 4.4907, + "step": 2381 + }, + { + "epoch": 0.7617981570689001, + "grad_norm": 0.33606159687042236, + "learning_rate": 8.502990112686028e-05, + "loss": 4.5022, + "step": 2382 + }, + { + "epoch": 0.7621179715764856, + "grad_norm": 0.3445335626602173, + "learning_rate": 8.481319747032635e-05, + "loss": 4.5111, + "step": 2383 + }, + { + "epoch": 0.7624377860840712, + "grad_norm": 0.3442610502243042, + "learning_rate": 8.459672483771046e-05, + "loss": 4.446, + "step": 2384 + }, + { + "epoch": 0.7627576005916569, + "grad_norm": 0.3225667476654053, + "learning_rate": 8.438048346141713e-05, + "loss": 4.3515, + "step": 2385 + }, + { + "epoch": 0.7630774150992424, + "grad_norm": 0.3303259313106537, + "learning_rate": 8.416447357360224e-05, + "loss": 4.4814, + "step": 2386 + }, + { + "epoch": 0.763397229606828, + "grad_norm": 0.3265552222728729, + "learning_rate": 8.394869540617347e-05, + "loss": 4.4112, + "step": 2387 + }, + { + "epoch": 0.7637170441144137, + "grad_norm": 0.35281017422676086, + "learning_rate": 8.373314919078964e-05, + "loss": 4.3664, + "step": 2388 + }, + { + "epoch": 0.7640368586219992, + "grad_norm": 0.3419521152973175, + "learning_rate": 8.35178351588605e-05, + "loss": 4.3578, + "step": 2389 + }, + { + "epoch": 0.7643566731295849, + "grad_norm": 0.3403926491737366, + "learning_rate": 8.330275354154672e-05, + "loss": 4.3963, + "step": 2390 + }, + { + "epoch": 0.7646764876371704, + "grad_norm": 0.3328573703765869, + "learning_rate": 8.308790456975905e-05, + "loss": 4.4007, + "step": 2391 + }, + { + "epoch": 0.764996302144756, + "grad_norm": 0.39904558658599854, + "learning_rate": 8.28732884741588e-05, + "loss": 4.4595, + "step": 2392 + }, + { + "epoch": 0.7653161166523417, + "grad_norm": 0.3315344750881195, + "learning_rate": 8.265890548515723e-05, + "loss": 4.3695, + "step": 2393 + }, + { + "epoch": 0.7656359311599272, + "grad_norm": 0.34847211837768555, + "learning_rate": 8.244475583291522e-05, + "loss": 4.3524, + "step": 2394 + }, + { + "epoch": 0.7659557456675128, + "grad_norm": 0.33161666989326477, + "learning_rate": 8.223083974734336e-05, + "loss": 4.4508, + "step": 2395 + }, + { + "epoch": 0.7662755601750985, + "grad_norm": 0.342219740152359, + "learning_rate": 8.201715745810112e-05, + "loss": 4.4734, + "step": 2396 + }, + { + "epoch": 0.766595374682684, + "grad_norm": 0.33304139971733093, + "learning_rate": 8.180370919459728e-05, + "loss": 4.3801, + "step": 2397 + }, + { + "epoch": 0.7669151891902697, + "grad_norm": 0.354841023683548, + "learning_rate": 8.159049518598924e-05, + "loss": 4.3773, + "step": 2398 + }, + { + "epoch": 0.7672350036978552, + "grad_norm": 0.32176217436790466, + "learning_rate": 8.137751566118306e-05, + "loss": 4.3688, + "step": 2399 + }, + { + "epoch": 0.7675548182054408, + "grad_norm": 0.3267159163951874, + "learning_rate": 8.11647708488327e-05, + "loss": 4.4987, + "step": 2400 + }, + { + "epoch": 0.7675548182054408, + "eval_loss": 4.421462059020996, + "eval_runtime": 84.969, + "eval_samples_per_second": 22.326, + "eval_steps_per_second": 5.59, + "step": 2400 + }, + { + "epoch": 0.7678746327130265, + "grad_norm": 0.33585166931152344, + "learning_rate": 8.09522609773405e-05, + "loss": 4.3732, + "step": 2401 + }, + { + "epoch": 0.768194447220612, + "grad_norm": 0.34492063522338867, + "learning_rate": 8.073998627485641e-05, + "loss": 4.4611, + "step": 2402 + }, + { + "epoch": 0.7685142617281977, + "grad_norm": 0.3376120626926422, + "learning_rate": 8.052794696927796e-05, + "loss": 4.3214, + "step": 2403 + }, + { + "epoch": 0.7688340762357833, + "grad_norm": 0.3616439998149872, + "learning_rate": 8.031614328824998e-05, + "loss": 4.3799, + "step": 2404 + }, + { + "epoch": 0.7691538907433688, + "grad_norm": 0.3344370722770691, + "learning_rate": 8.010457545916408e-05, + "loss": 4.4234, + "step": 2405 + }, + { + "epoch": 0.7694737052509545, + "grad_norm": 0.32834044098854065, + "learning_rate": 7.989324370915899e-05, + "loss": 4.4505, + "step": 2406 + }, + { + "epoch": 0.76979351975854, + "grad_norm": 0.3269968032836914, + "learning_rate": 7.968214826511987e-05, + "loss": 4.4195, + "step": 2407 + }, + { + "epoch": 0.7701133342661256, + "grad_norm": 0.47119635343551636, + "learning_rate": 7.947128935367813e-05, + "loss": 4.4604, + "step": 2408 + }, + { + "epoch": 0.7704331487737113, + "grad_norm": 0.3617459833621979, + "learning_rate": 7.926066720121134e-05, + "loss": 4.4275, + "step": 2409 + }, + { + "epoch": 0.7707529632812968, + "grad_norm": 0.3359169363975525, + "learning_rate": 7.905028203384269e-05, + "loss": 4.3953, + "step": 2410 + }, + { + "epoch": 0.7710727777888825, + "grad_norm": 0.34564802050590515, + "learning_rate": 7.884013407744129e-05, + "loss": 4.4501, + "step": 2411 + }, + { + "epoch": 0.7713925922964681, + "grad_norm": 0.3374510109424591, + "learning_rate": 7.863022355762101e-05, + "loss": 4.4675, + "step": 2412 + }, + { + "epoch": 0.7717124068040536, + "grad_norm": 0.3311443030834198, + "learning_rate": 7.842055069974149e-05, + "loss": 4.4123, + "step": 2413 + }, + { + "epoch": 0.7720322213116393, + "grad_norm": 0.33511215448379517, + "learning_rate": 7.82111157289069e-05, + "loss": 4.4288, + "step": 2414 + }, + { + "epoch": 0.7723520358192248, + "grad_norm": 0.342142790555954, + "learning_rate": 7.800191886996578e-05, + "loss": 4.2889, + "step": 2415 + }, + { + "epoch": 0.7726718503268104, + "grad_norm": 0.3379803001880646, + "learning_rate": 7.779296034751152e-05, + "loss": 4.3346, + "step": 2416 + }, + { + "epoch": 0.7729916648343961, + "grad_norm": 0.3355945944786072, + "learning_rate": 7.75842403858811e-05, + "loss": 4.4643, + "step": 2417 + }, + { + "epoch": 0.7733114793419816, + "grad_norm": 0.3274465799331665, + "learning_rate": 7.737575920915574e-05, + "loss": 4.3874, + "step": 2418 + }, + { + "epoch": 0.7736312938495673, + "grad_norm": 0.3239230811595917, + "learning_rate": 7.716751704116042e-05, + "loss": 4.3716, + "step": 2419 + }, + { + "epoch": 0.7739511083571529, + "grad_norm": 0.3335186541080475, + "learning_rate": 7.695951410546311e-05, + "loss": 4.5191, + "step": 2420 + }, + { + "epoch": 0.7742709228647384, + "grad_norm": 0.3376699388027191, + "learning_rate": 7.67517506253753e-05, + "loss": 4.4352, + "step": 2421 + }, + { + "epoch": 0.7745907373723241, + "grad_norm": 0.36317139863967896, + "learning_rate": 7.654422682395106e-05, + "loss": 4.424, + "step": 2422 + }, + { + "epoch": 0.7749105518799096, + "grad_norm": 0.3489604592323303, + "learning_rate": 7.633694292398745e-05, + "loss": 4.3503, + "step": 2423 + }, + { + "epoch": 0.7752303663874952, + "grad_norm": 0.33292216062545776, + "learning_rate": 7.612989914802383e-05, + "loss": 4.4104, + "step": 2424 + }, + { + "epoch": 0.7755501808950809, + "grad_norm": 0.3456326425075531, + "learning_rate": 7.592309571834179e-05, + "loss": 4.3774, + "step": 2425 + }, + { + "epoch": 0.7758699954026664, + "grad_norm": 0.3363443613052368, + "learning_rate": 7.5716532856965e-05, + "loss": 4.3974, + "step": 2426 + }, + { + "epoch": 0.7761898099102521, + "grad_norm": 0.3309082090854645, + "learning_rate": 7.551021078565857e-05, + "loss": 4.3973, + "step": 2427 + }, + { + "epoch": 0.7765096244178377, + "grad_norm": 0.3364388644695282, + "learning_rate": 7.530412972592928e-05, + "loss": 4.4558, + "step": 2428 + }, + { + "epoch": 0.7768294389254232, + "grad_norm": 0.33900633454322815, + "learning_rate": 7.509828989902525e-05, + "loss": 4.425, + "step": 2429 + }, + { + "epoch": 0.7771492534330089, + "grad_norm": 0.34174394607543945, + "learning_rate": 7.489269152593543e-05, + "loss": 4.3907, + "step": 2430 + }, + { + "epoch": 0.7774690679405944, + "grad_norm": 0.3305186629295349, + "learning_rate": 7.468733482738976e-05, + "loss": 4.4916, + "step": 2431 + }, + { + "epoch": 0.77778888244818, + "grad_norm": 0.34548285603523254, + "learning_rate": 7.44822200238584e-05, + "loss": 4.4056, + "step": 2432 + }, + { + "epoch": 0.7781086969557657, + "grad_norm": 0.3213754892349243, + "learning_rate": 7.42773473355521e-05, + "loss": 4.4359, + "step": 2433 + }, + { + "epoch": 0.7784285114633512, + "grad_norm": 0.35759109258651733, + "learning_rate": 7.407271698242155e-05, + "loss": 4.4189, + "step": 2434 + }, + { + "epoch": 0.7787483259709369, + "grad_norm": 0.3435339331626892, + "learning_rate": 7.386832918415741e-05, + "loss": 4.3936, + "step": 2435 + }, + { + "epoch": 0.7790681404785225, + "grad_norm": 0.34353944659233093, + "learning_rate": 7.366418416018963e-05, + "loss": 4.3856, + "step": 2436 + }, + { + "epoch": 0.779387954986108, + "grad_norm": 0.3284934461116791, + "learning_rate": 7.346028212968778e-05, + "loss": 4.3705, + "step": 2437 + }, + { + "epoch": 0.7797077694936937, + "grad_norm": 0.3299228549003601, + "learning_rate": 7.325662331156049e-05, + "loss": 4.3706, + "step": 2438 + }, + { + "epoch": 0.7800275840012793, + "grad_norm": 0.3226522207260132, + "learning_rate": 7.305320792445532e-05, + "loss": 4.3188, + "step": 2439 + }, + { + "epoch": 0.7803473985088648, + "grad_norm": 0.3250078558921814, + "learning_rate": 7.285003618675842e-05, + "loss": 4.3658, + "step": 2440 + }, + { + "epoch": 0.7806672130164505, + "grad_norm": 0.3348226249217987, + "learning_rate": 7.264710831659426e-05, + "loss": 4.3454, + "step": 2441 + }, + { + "epoch": 0.780987027524036, + "grad_norm": 0.32369184494018555, + "learning_rate": 7.24444245318257e-05, + "loss": 4.3334, + "step": 2442 + }, + { + "epoch": 0.7813068420316217, + "grad_norm": 0.37221813201904297, + "learning_rate": 7.224198505005344e-05, + "loss": 4.4436, + "step": 2443 + }, + { + "epoch": 0.7816266565392073, + "grad_norm": 0.3341996967792511, + "learning_rate": 7.203979008861588e-05, + "loss": 4.4195, + "step": 2444 + }, + { + "epoch": 0.7819464710467928, + "grad_norm": 0.3328430950641632, + "learning_rate": 7.183783986458906e-05, + "loss": 4.4021, + "step": 2445 + }, + { + "epoch": 0.7822662855543785, + "grad_norm": 0.3489823341369629, + "learning_rate": 7.163613459478595e-05, + "loss": 4.4735, + "step": 2446 + }, + { + "epoch": 0.782586100061964, + "grad_norm": 0.34073081612586975, + "learning_rate": 7.143467449575682e-05, + "loss": 4.3651, + "step": 2447 + }, + { + "epoch": 0.7829059145695496, + "grad_norm": 0.33331355452537537, + "learning_rate": 7.12334597837887e-05, + "loss": 4.4408, + "step": 2448 + }, + { + "epoch": 0.7832257290771353, + "grad_norm": 0.3399198651313782, + "learning_rate": 7.103249067490502e-05, + "loss": 4.4111, + "step": 2449 + }, + { + "epoch": 0.7835455435847208, + "grad_norm": 0.3442201614379883, + "learning_rate": 7.083176738486578e-05, + "loss": 4.3269, + "step": 2450 + }, + { + "epoch": 0.7838653580923065, + "grad_norm": 0.3376154899597168, + "learning_rate": 7.063129012916671e-05, + "loss": 4.3702, + "step": 2451 + }, + { + "epoch": 0.7841851725998921, + "grad_norm": 0.3253526985645294, + "learning_rate": 7.04310591230397e-05, + "loss": 4.3661, + "step": 2452 + }, + { + "epoch": 0.7845049871074776, + "grad_norm": 0.336834192276001, + "learning_rate": 7.023107458145214e-05, + "loss": 4.4983, + "step": 2453 + }, + { + "epoch": 0.7848248016150633, + "grad_norm": 0.32964494824409485, + "learning_rate": 7.003133671910688e-05, + "loss": 4.4248, + "step": 2454 + }, + { + "epoch": 0.7851446161226489, + "grad_norm": 0.32808932662010193, + "learning_rate": 6.983184575044199e-05, + "loss": 4.3788, + "step": 2455 + }, + { + "epoch": 0.7854644306302344, + "grad_norm": 0.3290889859199524, + "learning_rate": 6.963260188963016e-05, + "loss": 4.3742, + "step": 2456 + }, + { + "epoch": 0.7857842451378201, + "grad_norm": 0.3432980179786682, + "learning_rate": 6.943360535057926e-05, + "loss": 4.4134, + "step": 2457 + }, + { + "epoch": 0.7861040596454056, + "grad_norm": 0.34411704540252686, + "learning_rate": 6.923485634693109e-05, + "loss": 4.4038, + "step": 2458 + }, + { + "epoch": 0.7864238741529913, + "grad_norm": 0.3406968116760254, + "learning_rate": 6.903635509206234e-05, + "loss": 4.4335, + "step": 2459 + }, + { + "epoch": 0.7867436886605769, + "grad_norm": 0.3379204273223877, + "learning_rate": 6.883810179908315e-05, + "loss": 4.3759, + "step": 2460 + }, + { + "epoch": 0.7870635031681624, + "grad_norm": 0.335379034280777, + "learning_rate": 6.86400966808377e-05, + "loss": 4.4072, + "step": 2461 + }, + { + "epoch": 0.7873833176757481, + "grad_norm": 0.33987942337989807, + "learning_rate": 6.844233994990382e-05, + "loss": 4.404, + "step": 2462 + }, + { + "epoch": 0.7877031321833337, + "grad_norm": 0.3290582299232483, + "learning_rate": 6.824483181859231e-05, + "loss": 4.4041, + "step": 2463 + }, + { + "epoch": 0.7880229466909192, + "grad_norm": 0.3431604504585266, + "learning_rate": 6.804757249894762e-05, + "loss": 4.3937, + "step": 2464 + }, + { + "epoch": 0.7883427611985049, + "grad_norm": 0.3415065109729767, + "learning_rate": 6.785056220274658e-05, + "loss": 4.3634, + "step": 2465 + }, + { + "epoch": 0.7886625757060904, + "grad_norm": 0.34374210238456726, + "learning_rate": 6.765380114149887e-05, + "loss": 4.4099, + "step": 2466 + }, + { + "epoch": 0.7889823902136761, + "grad_norm": 0.3328897953033447, + "learning_rate": 6.745728952644675e-05, + "loss": 4.4099, + "step": 2467 + }, + { + "epoch": 0.7893022047212617, + "grad_norm": 0.32808518409729004, + "learning_rate": 6.726102756856422e-05, + "loss": 4.4275, + "step": 2468 + }, + { + "epoch": 0.7896220192288472, + "grad_norm": 0.33417269587516785, + "learning_rate": 6.706501547855787e-05, + "loss": 4.4154, + "step": 2469 + }, + { + "epoch": 0.7899418337364329, + "grad_norm": 0.3304135203361511, + "learning_rate": 6.686925346686544e-05, + "loss": 4.3633, + "step": 2470 + }, + { + "epoch": 0.7902616482440185, + "grad_norm": 0.3450307548046112, + "learning_rate": 6.667374174365667e-05, + "loss": 4.4377, + "step": 2471 + }, + { + "epoch": 0.7905814627516041, + "grad_norm": 0.3265368640422821, + "learning_rate": 6.647848051883217e-05, + "loss": 4.39, + "step": 2472 + }, + { + "epoch": 0.7909012772591897, + "grad_norm": 0.33910951018333435, + "learning_rate": 6.628347000202381e-05, + "loss": 4.3719, + "step": 2473 + }, + { + "epoch": 0.7912210917667752, + "grad_norm": 0.33357036113739014, + "learning_rate": 6.608871040259457e-05, + "loss": 4.38, + "step": 2474 + }, + { + "epoch": 0.7915409062743609, + "grad_norm": 0.33957427740097046, + "learning_rate": 6.589420192963754e-05, + "loss": 4.3611, + "step": 2475 + }, + { + "epoch": 0.7918607207819465, + "grad_norm": 0.34905606508255005, + "learning_rate": 6.56999447919766e-05, + "loss": 4.4632, + "step": 2476 + }, + { + "epoch": 0.792180535289532, + "grad_norm": 0.3371109962463379, + "learning_rate": 6.550593919816545e-05, + "loss": 4.3714, + "step": 2477 + }, + { + "epoch": 0.7925003497971177, + "grad_norm": 0.33412280678749084, + "learning_rate": 6.531218535648807e-05, + "loss": 4.3534, + "step": 2478 + }, + { + "epoch": 0.7928201643047033, + "grad_norm": 0.3325359523296356, + "learning_rate": 6.511868347495793e-05, + "loss": 4.3707, + "step": 2479 + }, + { + "epoch": 0.7931399788122889, + "grad_norm": 0.33235684037208557, + "learning_rate": 6.492543376131817e-05, + "loss": 4.464, + "step": 2480 + }, + { + "epoch": 0.7934597933198745, + "grad_norm": 0.3194396495819092, + "learning_rate": 6.473243642304114e-05, + "loss": 4.4164, + "step": 2481 + }, + { + "epoch": 0.79377960782746, + "grad_norm": 0.34822776913642883, + "learning_rate": 6.453969166732808e-05, + "loss": 4.4403, + "step": 2482 + }, + { + "epoch": 0.7940994223350457, + "grad_norm": 0.3341214656829834, + "learning_rate": 6.434719970110923e-05, + "loss": 4.2944, + "step": 2483 + }, + { + "epoch": 0.7944192368426313, + "grad_norm": 0.3367195427417755, + "learning_rate": 6.415496073104344e-05, + "loss": 4.4819, + "step": 2484 + }, + { + "epoch": 0.7947390513502168, + "grad_norm": 0.33086076378822327, + "learning_rate": 6.396297496351791e-05, + "loss": 4.4394, + "step": 2485 + }, + { + "epoch": 0.7950588658578025, + "grad_norm": 0.3196883499622345, + "learning_rate": 6.377124260464804e-05, + "loss": 4.3904, + "step": 2486 + }, + { + "epoch": 0.7953786803653881, + "grad_norm": 0.3320275843143463, + "learning_rate": 6.357976386027697e-05, + "loss": 4.4522, + "step": 2487 + }, + { + "epoch": 0.7956984948729737, + "grad_norm": 0.3390849828720093, + "learning_rate": 6.338853893597584e-05, + "loss": 4.4185, + "step": 2488 + }, + { + "epoch": 0.7960183093805593, + "grad_norm": 0.3309732973575592, + "learning_rate": 6.319756803704311e-05, + "loss": 4.403, + "step": 2489 + }, + { + "epoch": 0.7963381238881448, + "grad_norm": 0.33671632409095764, + "learning_rate": 6.300685136850458e-05, + "loss": 4.4218, + "step": 2490 + }, + { + "epoch": 0.7966579383957305, + "grad_norm": 0.32891610264778137, + "learning_rate": 6.281638913511324e-05, + "loss": 4.3904, + "step": 2491 + }, + { + "epoch": 0.7969777529033161, + "grad_norm": 0.3296626806259155, + "learning_rate": 6.262618154134858e-05, + "loss": 4.3143, + "step": 2492 + }, + { + "epoch": 0.7972975674109016, + "grad_norm": 0.32553809881210327, + "learning_rate": 6.2436228791417e-05, + "loss": 4.4164, + "step": 2493 + }, + { + "epoch": 0.7976173819184873, + "grad_norm": 0.3539281189441681, + "learning_rate": 6.224653108925122e-05, + "loss": 4.4039, + "step": 2494 + }, + { + "epoch": 0.7979371964260729, + "grad_norm": 0.32112589478492737, + "learning_rate": 6.205708863851019e-05, + "loss": 4.3172, + "step": 2495 + }, + { + "epoch": 0.7982570109336585, + "grad_norm": 0.3262506425380707, + "learning_rate": 6.186790164257866e-05, + "loss": 4.421, + "step": 2496 + }, + { + "epoch": 0.7985768254412441, + "grad_norm": 0.3287886679172516, + "learning_rate": 6.167897030456725e-05, + "loss": 4.4184, + "step": 2497 + }, + { + "epoch": 0.7988966399488296, + "grad_norm": 0.3291045129299164, + "learning_rate": 6.149029482731211e-05, + "loss": 4.3694, + "step": 2498 + }, + { + "epoch": 0.7992164544564153, + "grad_norm": 0.3448774814605713, + "learning_rate": 6.13018754133747e-05, + "loss": 4.4399, + "step": 2499 + }, + { + "epoch": 0.7995362689640009, + "grad_norm": 0.3265831470489502, + "learning_rate": 6.111371226504162e-05, + "loss": 4.4017, + "step": 2500 + }, + { + "epoch": 0.7995362689640009, + "eval_loss": 4.408499717712402, + "eval_runtime": 96.591, + "eval_samples_per_second": 19.64, + "eval_steps_per_second": 4.918, + "step": 2500 + }, + { + "epoch": 0.7998560834715864, + "grad_norm": 0.3272258937358856, + "learning_rate": 6.092580558432409e-05, + "loss": 4.4209, + "step": 2501 + }, + { + "epoch": 0.8001758979791721, + "grad_norm": 0.3352862596511841, + "learning_rate": 6.073815557295827e-05, + "loss": 4.3778, + "step": 2502 + }, + { + "epoch": 0.8004957124867577, + "grad_norm": 0.3348987400531769, + "learning_rate": 6.055076243240467e-05, + "loss": 4.3724, + "step": 2503 + }, + { + "epoch": 0.8008155269943433, + "grad_norm": 0.3495721220970154, + "learning_rate": 6.036362636384798e-05, + "loss": 4.3764, + "step": 2504 + }, + { + "epoch": 0.8011353415019289, + "grad_norm": 0.3421262204647064, + "learning_rate": 6.017674756819705e-05, + "loss": 4.334, + "step": 2505 + }, + { + "epoch": 0.8014551560095144, + "grad_norm": 0.3377733528614044, + "learning_rate": 5.9990126246084204e-05, + "loss": 4.3369, + "step": 2506 + }, + { + "epoch": 0.8017749705171001, + "grad_norm": 0.336994469165802, + "learning_rate": 5.9803762597865745e-05, + "loss": 4.3393, + "step": 2507 + }, + { + "epoch": 0.8020947850246857, + "grad_norm": 0.34005388617515564, + "learning_rate": 5.96176568236209e-05, + "loss": 4.4098, + "step": 2508 + }, + { + "epoch": 0.8024145995322712, + "grad_norm": 0.32835811376571655, + "learning_rate": 5.9431809123152465e-05, + "loss": 4.3953, + "step": 2509 + }, + { + "epoch": 0.8027344140398569, + "grad_norm": 0.34376657009124756, + "learning_rate": 5.924621969598604e-05, + "loss": 4.4965, + "step": 2510 + }, + { + "epoch": 0.8030542285474425, + "grad_norm": 0.3300655782222748, + "learning_rate": 5.906088874136968e-05, + "loss": 4.4316, + "step": 2511 + }, + { + "epoch": 0.8033740430550281, + "grad_norm": 0.3265095055103302, + "learning_rate": 5.887581645827436e-05, + "loss": 4.3927, + "step": 2512 + }, + { + "epoch": 0.8036938575626137, + "grad_norm": 0.3215543031692505, + "learning_rate": 5.869100304539297e-05, + "loss": 4.3649, + "step": 2513 + }, + { + "epoch": 0.8040136720701992, + "grad_norm": 0.32721421122550964, + "learning_rate": 5.850644870114063e-05, + "loss": 4.3213, + "step": 2514 + }, + { + "epoch": 0.8043334865777849, + "grad_norm": 0.3254854083061218, + "learning_rate": 5.832215362365458e-05, + "loss": 4.4192, + "step": 2515 + }, + { + "epoch": 0.8046533010853705, + "grad_norm": 0.34419766068458557, + "learning_rate": 5.813811801079325e-05, + "loss": 4.458, + "step": 2516 + }, + { + "epoch": 0.804973115592956, + "grad_norm": 0.3316987156867981, + "learning_rate": 5.795434206013685e-05, + "loss": 4.3995, + "step": 2517 + }, + { + "epoch": 0.8052929301005417, + "grad_norm": 0.3386606276035309, + "learning_rate": 5.77708259689866e-05, + "loss": 4.4321, + "step": 2518 + }, + { + "epoch": 0.8056127446081273, + "grad_norm": 0.3319477140903473, + "learning_rate": 5.7587569934364896e-05, + "loss": 4.4673, + "step": 2519 + }, + { + "epoch": 0.8059325591157129, + "grad_norm": 0.3327592611312866, + "learning_rate": 5.740457415301486e-05, + "loss": 4.4537, + "step": 2520 + }, + { + "epoch": 0.8062523736232985, + "grad_norm": 0.331850528717041, + "learning_rate": 5.72218388214002e-05, + "loss": 4.3952, + "step": 2521 + }, + { + "epoch": 0.806572188130884, + "grad_norm": 0.323560893535614, + "learning_rate": 5.703936413570519e-05, + "loss": 4.3919, + "step": 2522 + }, + { + "epoch": 0.8068920026384697, + "grad_norm": 0.4162808954715729, + "learning_rate": 5.6857150291833884e-05, + "loss": 4.4768, + "step": 2523 + }, + { + "epoch": 0.8072118171460553, + "grad_norm": 0.3326115310192108, + "learning_rate": 5.667519748541064e-05, + "loss": 4.3733, + "step": 2524 + }, + { + "epoch": 0.8075316316536408, + "grad_norm": 0.3389906585216522, + "learning_rate": 5.649350591177946e-05, + "loss": 4.336, + "step": 2525 + }, + { + "epoch": 0.8078514461612265, + "grad_norm": 0.32866370677948, + "learning_rate": 5.6312075766003876e-05, + "loss": 4.4592, + "step": 2526 + }, + { + "epoch": 0.8081712606688121, + "grad_norm": 0.32848337292671204, + "learning_rate": 5.613090724286681e-05, + "loss": 4.3391, + "step": 2527 + }, + { + "epoch": 0.8084910751763977, + "grad_norm": 0.3331284523010254, + "learning_rate": 5.595000053687014e-05, + "loss": 4.4384, + "step": 2528 + }, + { + "epoch": 0.8088108896839833, + "grad_norm": 0.33180293440818787, + "learning_rate": 5.576935584223482e-05, + "loss": 4.3779, + "step": 2529 + }, + { + "epoch": 0.8091307041915689, + "grad_norm": 0.3350355625152588, + "learning_rate": 5.55889733529005e-05, + "loss": 4.4041, + "step": 2530 + }, + { + "epoch": 0.8094505186991545, + "grad_norm": 0.3404220640659332, + "learning_rate": 5.540885326252531e-05, + "loss": 4.537, + "step": 2531 + }, + { + "epoch": 0.8097703332067401, + "grad_norm": 0.3256952166557312, + "learning_rate": 5.5228995764485564e-05, + "loss": 4.3572, + "step": 2532 + }, + { + "epoch": 0.8100901477143256, + "grad_norm": 0.3268665075302124, + "learning_rate": 5.5049401051875765e-05, + "loss": 4.4335, + "step": 2533 + }, + { + "epoch": 0.8104099622219113, + "grad_norm": 0.32366156578063965, + "learning_rate": 5.487006931750828e-05, + "loss": 4.3503, + "step": 2534 + }, + { + "epoch": 0.8107297767294969, + "grad_norm": 0.33809319138526917, + "learning_rate": 5.469100075391314e-05, + "loss": 4.4448, + "step": 2535 + }, + { + "epoch": 0.8110495912370825, + "grad_norm": 0.3393336236476898, + "learning_rate": 5.451219555333792e-05, + "loss": 4.3668, + "step": 2536 + }, + { + "epoch": 0.8113694057446681, + "grad_norm": 0.32927775382995605, + "learning_rate": 5.4333653907747174e-05, + "loss": 4.2971, + "step": 2537 + }, + { + "epoch": 0.8116892202522537, + "grad_norm": 0.33429858088493347, + "learning_rate": 5.4155376008822805e-05, + "loss": 4.3864, + "step": 2538 + }, + { + "epoch": 0.8120090347598393, + "grad_norm": 0.3288191258907318, + "learning_rate": 5.397736204796337e-05, + "loss": 4.4077, + "step": 2539 + }, + { + "epoch": 0.8123288492674249, + "grad_norm": 0.336532324552536, + "learning_rate": 5.37996122162842e-05, + "loss": 4.386, + "step": 2540 + }, + { + "epoch": 0.8126486637750105, + "grad_norm": 0.3313378691673279, + "learning_rate": 5.362212670461706e-05, + "loss": 4.4191, + "step": 2541 + }, + { + "epoch": 0.8129684782825961, + "grad_norm": 0.33725032210350037, + "learning_rate": 5.3444905703509687e-05, + "loss": 4.3506, + "step": 2542 + }, + { + "epoch": 0.8132882927901817, + "grad_norm": 0.324349582195282, + "learning_rate": 5.3267949403226104e-05, + "loss": 4.3942, + "step": 2543 + }, + { + "epoch": 0.8136081072977673, + "grad_norm": 0.33053717017173767, + "learning_rate": 5.3091257993746115e-05, + "loss": 4.3883, + "step": 2544 + }, + { + "epoch": 0.8139279218053529, + "grad_norm": 0.3250136971473694, + "learning_rate": 5.2914831664765045e-05, + "loss": 4.3464, + "step": 2545 + }, + { + "epoch": 0.8142477363129385, + "grad_norm": 0.3636740446090698, + "learning_rate": 5.2738670605693814e-05, + "loss": 4.4609, + "step": 2546 + }, + { + "epoch": 0.8145675508205241, + "grad_norm": 0.33198001980781555, + "learning_rate": 5.256277500565823e-05, + "loss": 4.3835, + "step": 2547 + }, + { + "epoch": 0.8148873653281097, + "grad_norm": 0.3311094641685486, + "learning_rate": 5.238714505349938e-05, + "loss": 4.378, + "step": 2548 + }, + { + "epoch": 0.8152071798356954, + "grad_norm": 0.3309457302093506, + "learning_rate": 5.221178093777303e-05, + "loss": 4.4277, + "step": 2549 + }, + { + "epoch": 0.8155269943432809, + "grad_norm": 0.3373015820980072, + "learning_rate": 5.2036682846749645e-05, + "loss": 4.4029, + "step": 2550 + }, + { + "epoch": 0.8158468088508665, + "grad_norm": 0.32732510566711426, + "learning_rate": 5.186185096841402e-05, + "loss": 4.4289, + "step": 2551 + }, + { + "epoch": 0.8161666233584521, + "grad_norm": 0.32910993695259094, + "learning_rate": 5.168728549046508e-05, + "loss": 4.4051, + "step": 2552 + }, + { + "epoch": 0.8164864378660377, + "grad_norm": 0.3277089595794678, + "learning_rate": 5.151298660031587e-05, + "loss": 4.4015, + "step": 2553 + }, + { + "epoch": 0.8168062523736233, + "grad_norm": 0.33765482902526855, + "learning_rate": 5.133895448509299e-05, + "loss": 4.369, + "step": 2554 + }, + { + "epoch": 0.8171260668812089, + "grad_norm": 0.33570003509521484, + "learning_rate": 5.116518933163709e-05, + "loss": 4.3564, + "step": 2555 + }, + { + "epoch": 0.8174458813887945, + "grad_norm": 0.33497217297554016, + "learning_rate": 5.099169132650173e-05, + "loss": 4.3783, + "step": 2556 + }, + { + "epoch": 0.8177656958963802, + "grad_norm": 0.330083429813385, + "learning_rate": 5.0818460655953894e-05, + "loss": 4.3159, + "step": 2557 + }, + { + "epoch": 0.8180855104039657, + "grad_norm": 0.3331679403781891, + "learning_rate": 5.0645497505973633e-05, + "loss": 4.3378, + "step": 2558 + }, + { + "epoch": 0.8184053249115513, + "grad_norm": 0.32330459356307983, + "learning_rate": 5.0472802062253426e-05, + "loss": 4.3493, + "step": 2559 + }, + { + "epoch": 0.8187251394191369, + "grad_norm": 0.32560333609580994, + "learning_rate": 5.0300374510198904e-05, + "loss": 4.3681, + "step": 2560 + }, + { + "epoch": 0.8190449539267225, + "grad_norm": 0.35731586813926697, + "learning_rate": 5.012821503492755e-05, + "loss": 4.4038, + "step": 2561 + }, + { + "epoch": 0.819364768434308, + "grad_norm": 0.34077370166778564, + "learning_rate": 4.9956323821269326e-05, + "loss": 4.4004, + "step": 2562 + }, + { + "epoch": 0.8196845829418937, + "grad_norm": 0.3387181758880615, + "learning_rate": 4.978470105376627e-05, + "loss": 4.4366, + "step": 2563 + }, + { + "epoch": 0.8200043974494793, + "grad_norm": 0.34127897024154663, + "learning_rate": 4.961334691667177e-05, + "loss": 4.374, + "step": 2564 + }, + { + "epoch": 0.820324211957065, + "grad_norm": 0.3408231735229492, + "learning_rate": 4.9442261593951496e-05, + "loss": 4.4105, + "step": 2565 + }, + { + "epoch": 0.8206440264646505, + "grad_norm": 0.33096012473106384, + "learning_rate": 4.9271445269281884e-05, + "loss": 4.4505, + "step": 2566 + }, + { + "epoch": 0.8209638409722361, + "grad_norm": 0.3267434537410736, + "learning_rate": 4.910089812605098e-05, + "loss": 4.3676, + "step": 2567 + }, + { + "epoch": 0.8212836554798217, + "grad_norm": 0.33827266097068787, + "learning_rate": 4.893062034735758e-05, + "loss": 4.3366, + "step": 2568 + }, + { + "epoch": 0.8216034699874073, + "grad_norm": 0.33350881934165955, + "learning_rate": 4.8760612116011464e-05, + "loss": 4.3751, + "step": 2569 + }, + { + "epoch": 0.8219232844949929, + "grad_norm": 0.3247975707054138, + "learning_rate": 4.8590873614532956e-05, + "loss": 4.3698, + "step": 2570 + }, + { + "epoch": 0.8222430990025785, + "grad_norm": 0.3426414728164673, + "learning_rate": 4.842140502515282e-05, + "loss": 4.3529, + "step": 2571 + }, + { + "epoch": 0.8225629135101641, + "grad_norm": 0.3254157602787018, + "learning_rate": 4.825220652981211e-05, + "loss": 4.3941, + "step": 2572 + }, + { + "epoch": 0.8228827280177498, + "grad_norm": 0.3290571868419647, + "learning_rate": 4.80832783101617e-05, + "loss": 4.3833, + "step": 2573 + }, + { + "epoch": 0.8232025425253353, + "grad_norm": 0.33417990803718567, + "learning_rate": 4.7914620547562475e-05, + "loss": 4.3641, + "step": 2574 + }, + { + "epoch": 0.8235223570329209, + "grad_norm": 0.3283510208129883, + "learning_rate": 4.7746233423084965e-05, + "loss": 4.3072, + "step": 2575 + }, + { + "epoch": 0.8238421715405065, + "grad_norm": 0.3323967456817627, + "learning_rate": 4.757811711750903e-05, + "loss": 4.4042, + "step": 2576 + }, + { + "epoch": 0.8241619860480921, + "grad_norm": 0.32422980666160583, + "learning_rate": 4.741027181132392e-05, + "loss": 4.4286, + "step": 2577 + }, + { + "epoch": 0.8244818005556777, + "grad_norm": 0.353149950504303, + "learning_rate": 4.724269768472776e-05, + "loss": 4.394, + "step": 2578 + }, + { + "epoch": 0.8248016150632633, + "grad_norm": 0.3301638066768646, + "learning_rate": 4.707539491762767e-05, + "loss": 4.4815, + "step": 2579 + }, + { + "epoch": 0.8251214295708489, + "grad_norm": 0.32457199692726135, + "learning_rate": 4.690836368963945e-05, + "loss": 4.4616, + "step": 2580 + }, + { + "epoch": 0.8254412440784346, + "grad_norm": 0.33951979875564575, + "learning_rate": 4.674160418008728e-05, + "loss": 4.3569, + "step": 2581 + }, + { + "epoch": 0.8257610585860201, + "grad_norm": 0.33899393677711487, + "learning_rate": 4.657511656800381e-05, + "loss": 4.4591, + "step": 2582 + }, + { + "epoch": 0.8260808730936057, + "grad_norm": 0.32934606075286865, + "learning_rate": 4.6408901032129476e-05, + "loss": 4.4011, + "step": 2583 + }, + { + "epoch": 0.8264006876011913, + "grad_norm": 0.3325296938419342, + "learning_rate": 4.624295775091282e-05, + "loss": 4.253, + "step": 2584 + }, + { + "epoch": 0.8267205021087769, + "grad_norm": 0.35208362340927124, + "learning_rate": 4.6077286902510144e-05, + "loss": 4.3729, + "step": 2585 + }, + { + "epoch": 0.8270403166163625, + "grad_norm": 0.3326524496078491, + "learning_rate": 4.591188866478513e-05, + "loss": 4.3585, + "step": 2586 + }, + { + "epoch": 0.8273601311239481, + "grad_norm": 0.3358716368675232, + "learning_rate": 4.574676321530891e-05, + "loss": 4.3566, + "step": 2587 + }, + { + "epoch": 0.8276799456315337, + "grad_norm": 0.3392607271671295, + "learning_rate": 4.558191073135957e-05, + "loss": 4.4831, + "step": 2588 + }, + { + "epoch": 0.8279997601391194, + "grad_norm": 0.32785317301750183, + "learning_rate": 4.541733138992231e-05, + "loss": 4.406, + "step": 2589 + }, + { + "epoch": 0.8283195746467049, + "grad_norm": 0.3403478264808655, + "learning_rate": 4.525302536768901e-05, + "loss": 4.4775, + "step": 2590 + }, + { + "epoch": 0.8286393891542905, + "grad_norm": 0.332083523273468, + "learning_rate": 4.5088992841058214e-05, + "loss": 4.3948, + "step": 2591 + }, + { + "epoch": 0.8289592036618761, + "grad_norm": 0.333533376455307, + "learning_rate": 4.4925233986134614e-05, + "loss": 4.4332, + "step": 2592 + }, + { + "epoch": 0.8292790181694617, + "grad_norm": 0.34167930483818054, + "learning_rate": 4.4761748978729305e-05, + "loss": 4.301, + "step": 2593 + }, + { + "epoch": 0.8295988326770473, + "grad_norm": 0.33838021755218506, + "learning_rate": 4.4598537994359297e-05, + "loss": 4.3024, + "step": 2594 + }, + { + "epoch": 0.8299186471846329, + "grad_norm": 0.3343757390975952, + "learning_rate": 4.443560120824748e-05, + "loss": 4.2999, + "step": 2595 + }, + { + "epoch": 0.8302384616922185, + "grad_norm": 0.3297710418701172, + "learning_rate": 4.427293879532231e-05, + "loss": 4.3453, + "step": 2596 + }, + { + "epoch": 0.8305582761998042, + "grad_norm": 0.3453871011734009, + "learning_rate": 4.411055093021758e-05, + "loss": 4.3338, + "step": 2597 + }, + { + "epoch": 0.8308780907073897, + "grad_norm": 0.334453821182251, + "learning_rate": 4.394843778727247e-05, + "loss": 4.368, + "step": 2598 + }, + { + "epoch": 0.8311979052149753, + "grad_norm": 0.338049978017807, + "learning_rate": 4.3786599540531164e-05, + "loss": 4.406, + "step": 2599 + }, + { + "epoch": 0.8315177197225609, + "grad_norm": 0.32910865545272827, + "learning_rate": 4.362503636374277e-05, + "loss": 4.4284, + "step": 2600 + }, + { + "epoch": 0.8315177197225609, + "eval_loss": 4.398345947265625, + "eval_runtime": 96.1854, + "eval_samples_per_second": 19.722, + "eval_steps_per_second": 4.938, + "step": 2600 + }, + { + "epoch": 0.8318375342301465, + "grad_norm": 0.34660476446151733, + "learning_rate": 4.346374843036104e-05, + "loss": 4.3165, + "step": 2601 + }, + { + "epoch": 0.8321573487377321, + "grad_norm": 0.3316264748573303, + "learning_rate": 4.3302735913544174e-05, + "loss": 4.3164, + "step": 2602 + }, + { + "epoch": 0.8324771632453177, + "grad_norm": 0.3275536894798279, + "learning_rate": 4.314199898615481e-05, + "loss": 4.3702, + "step": 2603 + }, + { + "epoch": 0.8327969777529033, + "grad_norm": 0.3246834874153137, + "learning_rate": 4.298153782075946e-05, + "loss": 4.3451, + "step": 2604 + }, + { + "epoch": 0.833116792260489, + "grad_norm": 0.3249054253101349, + "learning_rate": 4.2821352589628944e-05, + "loss": 4.3683, + "step": 2605 + }, + { + "epoch": 0.8334366067680745, + "grad_norm": 0.34387335181236267, + "learning_rate": 4.26614434647377e-05, + "loss": 4.4254, + "step": 2606 + }, + { + "epoch": 0.8337564212756601, + "grad_norm": 0.3431605100631714, + "learning_rate": 4.25018106177635e-05, + "loss": 4.416, + "step": 2607 + }, + { + "epoch": 0.8340762357832457, + "grad_norm": 0.35249772667884827, + "learning_rate": 4.2342454220087855e-05, + "loss": 4.3307, + "step": 2608 + }, + { + "epoch": 0.8343960502908313, + "grad_norm": 0.34468838572502136, + "learning_rate": 4.21833744427952e-05, + "loss": 4.3808, + "step": 2609 + }, + { + "epoch": 0.834715864798417, + "grad_norm": 0.32654619216918945, + "learning_rate": 4.202457145667311e-05, + "loss": 4.4191, + "step": 2610 + }, + { + "epoch": 0.8350356793060025, + "grad_norm": 0.3252173662185669, + "learning_rate": 4.1866045432212214e-05, + "loss": 4.4372, + "step": 2611 + }, + { + "epoch": 0.8353554938135881, + "grad_norm": 0.3516523540019989, + "learning_rate": 4.1707796539605385e-05, + "loss": 4.4323, + "step": 2612 + }, + { + "epoch": 0.8356753083211738, + "grad_norm": 0.33103737235069275, + "learning_rate": 4.154982494874829e-05, + "loss": 4.3093, + "step": 2613 + }, + { + "epoch": 0.8359951228287593, + "grad_norm": 0.3542514443397522, + "learning_rate": 4.139213082923862e-05, + "loss": 4.4259, + "step": 2614 + }, + { + "epoch": 0.8363149373363449, + "grad_norm": 0.341665118932724, + "learning_rate": 4.12347143503764e-05, + "loss": 4.4255, + "step": 2615 + }, + { + "epoch": 0.8366347518439305, + "grad_norm": 0.32837289571762085, + "learning_rate": 4.107757568116352e-05, + "loss": 4.405, + "step": 2616 + }, + { + "epoch": 0.8369545663515161, + "grad_norm": 0.3300032317638397, + "learning_rate": 4.092071499030355e-05, + "loss": 4.3612, + "step": 2617 + }, + { + "epoch": 0.8372743808591018, + "grad_norm": 0.33197590708732605, + "learning_rate": 4.076413244620177e-05, + "loss": 4.3632, + "step": 2618 + }, + { + "epoch": 0.8375941953666873, + "grad_norm": 0.33739814162254333, + "learning_rate": 4.060782821696458e-05, + "loss": 4.3407, + "step": 2619 + }, + { + "epoch": 0.8379140098742729, + "grad_norm": 0.3367348313331604, + "learning_rate": 4.0451802470399805e-05, + "loss": 4.4094, + "step": 2620 + }, + { + "epoch": 0.8382338243818586, + "grad_norm": 0.33014410734176636, + "learning_rate": 4.029605537401623e-05, + "loss": 4.3781, + "step": 2621 + }, + { + "epoch": 0.8385536388894441, + "grad_norm": 0.3287923336029053, + "learning_rate": 4.01405870950235e-05, + "loss": 4.3943, + "step": 2622 + }, + { + "epoch": 0.8388734533970297, + "grad_norm": 0.33316370844841003, + "learning_rate": 3.9985397800331965e-05, + "loss": 4.3524, + "step": 2623 + }, + { + "epoch": 0.8391932679046153, + "grad_norm": 0.33590611815452576, + "learning_rate": 3.983048765655225e-05, + "loss": 4.3676, + "step": 2624 + }, + { + "epoch": 0.8395130824122009, + "grad_norm": 0.33867955207824707, + "learning_rate": 3.9675856829995513e-05, + "loss": 4.365, + "step": 2625 + }, + { + "epoch": 0.8398328969197866, + "grad_norm": 0.32774361968040466, + "learning_rate": 3.95215054866729e-05, + "loss": 4.387, + "step": 2626 + }, + { + "epoch": 0.8401527114273721, + "grad_norm": 0.3267671465873718, + "learning_rate": 3.936743379229572e-05, + "loss": 4.3414, + "step": 2627 + }, + { + "epoch": 0.8404725259349577, + "grad_norm": 0.3394142687320709, + "learning_rate": 3.921364191227466e-05, + "loss": 4.4019, + "step": 2628 + }, + { + "epoch": 0.8407923404425434, + "grad_norm": 0.32703539729118347, + "learning_rate": 3.9060130011720345e-05, + "loss": 4.4492, + "step": 2629 + }, + { + "epoch": 0.8411121549501289, + "grad_norm": 0.3314916491508484, + "learning_rate": 3.890689825544271e-05, + "loss": 4.423, + "step": 2630 + }, + { + "epoch": 0.8414319694577145, + "grad_norm": 0.3226998746395111, + "learning_rate": 3.875394680795092e-05, + "loss": 4.344, + "step": 2631 + }, + { + "epoch": 0.8417517839653001, + "grad_norm": 0.3287048935890198, + "learning_rate": 3.8601275833453224e-05, + "loss": 4.3911, + "step": 2632 + }, + { + "epoch": 0.8420715984728857, + "grad_norm": 0.3339630365371704, + "learning_rate": 3.844888549585662e-05, + "loss": 4.4108, + "step": 2633 + }, + { + "epoch": 0.8423914129804714, + "grad_norm": 0.33046579360961914, + "learning_rate": 3.829677595876699e-05, + "loss": 4.3197, + "step": 2634 + }, + { + "epoch": 0.8427112274880569, + "grad_norm": 0.34412693977355957, + "learning_rate": 3.814494738548871e-05, + "loss": 4.3802, + "step": 2635 + }, + { + "epoch": 0.8430310419956425, + "grad_norm": 0.33722245693206787, + "learning_rate": 3.799339993902446e-05, + "loss": 4.3095, + "step": 2636 + }, + { + "epoch": 0.8433508565032282, + "grad_norm": 0.324649840593338, + "learning_rate": 3.784213378207522e-05, + "loss": 4.4115, + "step": 2637 + }, + { + "epoch": 0.8436706710108137, + "grad_norm": 0.32171830534935, + "learning_rate": 3.769114907703973e-05, + "loss": 4.3975, + "step": 2638 + }, + { + "epoch": 0.8439904855183993, + "grad_norm": 0.3386608064174652, + "learning_rate": 3.7540445986014845e-05, + "loss": 4.2348, + "step": 2639 + }, + { + "epoch": 0.844310300025985, + "grad_norm": 0.3258531093597412, + "learning_rate": 3.739002467079488e-05, + "loss": 4.4067, + "step": 2640 + }, + { + "epoch": 0.8446301145335705, + "grad_norm": 0.33536481857299805, + "learning_rate": 3.723988529287176e-05, + "loss": 4.4138, + "step": 2641 + }, + { + "epoch": 0.8449499290411562, + "grad_norm": 0.3187597692012787, + "learning_rate": 3.709002801343478e-05, + "loss": 4.3633, + "step": 2642 + }, + { + "epoch": 0.8452697435487417, + "grad_norm": 0.3352622985839844, + "learning_rate": 3.6940452993370105e-05, + "loss": 4.443, + "step": 2643 + }, + { + "epoch": 0.8455895580563273, + "grad_norm": 0.3221001625061035, + "learning_rate": 3.679116039326115e-05, + "loss": 4.3375, + "step": 2644 + }, + { + "epoch": 0.845909372563913, + "grad_norm": 0.3328348398208618, + "learning_rate": 3.664215037338785e-05, + "loss": 4.3941, + "step": 2645 + }, + { + "epoch": 0.8462291870714985, + "grad_norm": 0.3309707045555115, + "learning_rate": 3.6493423093727084e-05, + "loss": 4.4173, + "step": 2646 + }, + { + "epoch": 0.8465490015790841, + "grad_norm": 0.32627466320991516, + "learning_rate": 3.634497871395207e-05, + "loss": 4.457, + "step": 2647 + }, + { + "epoch": 0.8468688160866698, + "grad_norm": 0.3152695298194885, + "learning_rate": 3.6196817393432085e-05, + "loss": 4.3698, + "step": 2648 + }, + { + "epoch": 0.8471886305942553, + "grad_norm": 0.3303844928741455, + "learning_rate": 3.604893929123284e-05, + "loss": 4.4141, + "step": 2649 + }, + { + "epoch": 0.847508445101841, + "grad_norm": 0.32189106941223145, + "learning_rate": 3.590134456611562e-05, + "loss": 4.3802, + "step": 2650 + }, + { + "epoch": 0.8478282596094265, + "grad_norm": 0.3331794738769531, + "learning_rate": 3.5754033376537947e-05, + "loss": 4.4424, + "step": 2651 + }, + { + "epoch": 0.8481480741170121, + "grad_norm": 0.32761675119400024, + "learning_rate": 3.560700588065252e-05, + "loss": 4.3987, + "step": 2652 + }, + { + "epoch": 0.8484678886245978, + "grad_norm": 0.3419148027896881, + "learning_rate": 3.5460262236307657e-05, + "loss": 4.3915, + "step": 2653 + }, + { + "epoch": 0.8487877031321833, + "grad_norm": 0.34858790040016174, + "learning_rate": 3.531380260104698e-05, + "loss": 4.4045, + "step": 2654 + }, + { + "epoch": 0.8491075176397689, + "grad_norm": 0.3266744017601013, + "learning_rate": 3.516762713210891e-05, + "loss": 4.4382, + "step": 2655 + }, + { + "epoch": 0.8494273321473546, + "grad_norm": 0.32765451073646545, + "learning_rate": 3.502173598642728e-05, + "loss": 4.4018, + "step": 2656 + }, + { + "epoch": 0.8497471466549401, + "grad_norm": 0.33169153332710266, + "learning_rate": 3.4876129320630196e-05, + "loss": 4.3031, + "step": 2657 + }, + { + "epoch": 0.8500669611625258, + "grad_norm": 0.3545640707015991, + "learning_rate": 3.473080729104062e-05, + "loss": 4.4478, + "step": 2658 + }, + { + "epoch": 0.8503867756701113, + "grad_norm": 0.3219090402126312, + "learning_rate": 3.4585770053675876e-05, + "loss": 4.3893, + "step": 2659 + }, + { + "epoch": 0.8507065901776969, + "grad_norm": 0.3292368948459625, + "learning_rate": 3.444101776424738e-05, + "loss": 4.3986, + "step": 2660 + }, + { + "epoch": 0.8510264046852826, + "grad_norm": 0.32431018352508545, + "learning_rate": 3.429655057816099e-05, + "loss": 4.3436, + "step": 2661 + }, + { + "epoch": 0.8513462191928681, + "grad_norm": 0.3349042534828186, + "learning_rate": 3.415236865051606e-05, + "loss": 4.3661, + "step": 2662 + }, + { + "epoch": 0.8516660337004537, + "grad_norm": 0.32648709416389465, + "learning_rate": 3.4008472136106046e-05, + "loss": 4.3795, + "step": 2663 + }, + { + "epoch": 0.8519858482080394, + "grad_norm": 0.32753390073776245, + "learning_rate": 3.3864861189417636e-05, + "loss": 4.4567, + "step": 2664 + }, + { + "epoch": 0.8523056627156249, + "grad_norm": 0.3344663977622986, + "learning_rate": 3.3721535964631195e-05, + "loss": 4.3676, + "step": 2665 + }, + { + "epoch": 0.8526254772232106, + "grad_norm": 0.33271554112434387, + "learning_rate": 3.3578496615620307e-05, + "loss": 4.3831, + "step": 2666 + }, + { + "epoch": 0.8529452917307961, + "grad_norm": 0.32605451345443726, + "learning_rate": 3.343574329595157e-05, + "loss": 4.4372, + "step": 2667 + }, + { + "epoch": 0.8532651062383817, + "grad_norm": 0.3346996307373047, + "learning_rate": 3.329327615888461e-05, + "loss": 4.3936, + "step": 2668 + }, + { + "epoch": 0.8535849207459674, + "grad_norm": 0.33022093772888184, + "learning_rate": 3.315109535737155e-05, + "loss": 4.3627, + "step": 2669 + }, + { + "epoch": 0.8539047352535529, + "grad_norm": 0.33286476135253906, + "learning_rate": 3.300920104405739e-05, + "loss": 4.3585, + "step": 2670 + }, + { + "epoch": 0.8542245497611385, + "grad_norm": 0.3309535086154938, + "learning_rate": 3.2867593371279434e-05, + "loss": 4.4029, + "step": 2671 + }, + { + "epoch": 0.8545443642687242, + "grad_norm": 0.33394062519073486, + "learning_rate": 3.272627249106724e-05, + "loss": 4.3053, + "step": 2672 + }, + { + "epoch": 0.8548641787763097, + "grad_norm": 0.3265056908130646, + "learning_rate": 3.258523855514258e-05, + "loss": 4.2647, + "step": 2673 + }, + { + "epoch": 0.8551839932838954, + "grad_norm": 0.32501220703125, + "learning_rate": 3.244449171491896e-05, + "loss": 4.3984, + "step": 2674 + }, + { + "epoch": 0.8555038077914809, + "grad_norm": 0.33929458260536194, + "learning_rate": 3.230403212150179e-05, + "loss": 4.4622, + "step": 2675 + }, + { + "epoch": 0.8558236222990665, + "grad_norm": 0.3200635313987732, + "learning_rate": 3.216385992568813e-05, + "loss": 4.4147, + "step": 2676 + }, + { + "epoch": 0.8561434368066522, + "grad_norm": 0.3386853039264679, + "learning_rate": 3.202397527796637e-05, + "loss": 4.3412, + "step": 2677 + }, + { + "epoch": 0.8564632513142377, + "grad_norm": 0.3138151466846466, + "learning_rate": 3.188437832851639e-05, + "loss": 4.3613, + "step": 2678 + }, + { + "epoch": 0.8567830658218234, + "grad_norm": 0.3236668109893799, + "learning_rate": 3.1745069227208894e-05, + "loss": 4.3597, + "step": 2679 + }, + { + "epoch": 0.857102880329409, + "grad_norm": 0.3210524022579193, + "learning_rate": 3.160604812360579e-05, + "loss": 4.3687, + "step": 2680 + }, + { + "epoch": 0.8574226948369945, + "grad_norm": 0.32683470845222473, + "learning_rate": 3.146731516695974e-05, + "loss": 4.3062, + "step": 2681 + }, + { + "epoch": 0.8577425093445802, + "grad_norm": 0.3700844645500183, + "learning_rate": 3.1328870506214044e-05, + "loss": 4.3855, + "step": 2682 + }, + { + "epoch": 0.8580623238521657, + "grad_norm": 0.32976222038269043, + "learning_rate": 3.119071429000254e-05, + "loss": 4.4458, + "step": 2683 + }, + { + "epoch": 0.8583821383597513, + "grad_norm": 0.32784411311149597, + "learning_rate": 3.105284666664918e-05, + "loss": 4.3289, + "step": 2684 + }, + { + "epoch": 0.858701952867337, + "grad_norm": 0.31430763006210327, + "learning_rate": 3.091526778416833e-05, + "loss": 4.3308, + "step": 2685 + }, + { + "epoch": 0.8590217673749225, + "grad_norm": 0.3267063498497009, + "learning_rate": 3.077797779026428e-05, + "loss": 4.3948, + "step": 2686 + }, + { + "epoch": 0.8593415818825082, + "grad_norm": 0.33419206738471985, + "learning_rate": 3.064097683233121e-05, + "loss": 4.3861, + "step": 2687 + }, + { + "epoch": 0.8596613963900938, + "grad_norm": 0.3286486268043518, + "learning_rate": 3.0504265057452815e-05, + "loss": 4.2863, + "step": 2688 + }, + { + "epoch": 0.8599812108976793, + "grad_norm": 0.33864644169807434, + "learning_rate": 3.036784261240255e-05, + "loss": 4.2062, + "step": 2689 + }, + { + "epoch": 0.860301025405265, + "grad_norm": 0.33045342564582825, + "learning_rate": 3.0231709643643086e-05, + "loss": 4.3897, + "step": 2690 + }, + { + "epoch": 0.8606208399128505, + "grad_norm": 0.33123284578323364, + "learning_rate": 3.0095866297326455e-05, + "loss": 4.3802, + "step": 2691 + }, + { + "epoch": 0.8609406544204361, + "grad_norm": 0.33128055930137634, + "learning_rate": 2.996031271929369e-05, + "loss": 4.3809, + "step": 2692 + }, + { + "epoch": 0.8612604689280218, + "grad_norm": 0.3309783339500427, + "learning_rate": 2.982504905507461e-05, + "loss": 4.331, + "step": 2693 + }, + { + "epoch": 0.8615802834356073, + "grad_norm": 0.3281680941581726, + "learning_rate": 2.969007544988793e-05, + "loss": 4.2814, + "step": 2694 + }, + { + "epoch": 0.861900097943193, + "grad_norm": 0.3542790114879608, + "learning_rate": 2.9555392048640924e-05, + "loss": 4.3908, + "step": 2695 + }, + { + "epoch": 0.8622199124507786, + "grad_norm": 0.3346249461174011, + "learning_rate": 2.9420998995929267e-05, + "loss": 4.4109, + "step": 2696 + }, + { + "epoch": 0.8625397269583641, + "grad_norm": 0.3290999233722687, + "learning_rate": 2.9286896436037076e-05, + "loss": 4.389, + "step": 2697 + }, + { + "epoch": 0.8628595414659498, + "grad_norm": 0.337666779756546, + "learning_rate": 2.9153084512936285e-05, + "loss": 4.422, + "step": 2698 + }, + { + "epoch": 0.8631793559735353, + "grad_norm": 0.3327541947364807, + "learning_rate": 2.9019563370287112e-05, + "loss": 4.3749, + "step": 2699 + }, + { + "epoch": 0.8634991704811209, + "grad_norm": 0.31937623023986816, + "learning_rate": 2.8886333151437292e-05, + "loss": 4.3105, + "step": 2700 + }, + { + "epoch": 0.8634991704811209, + "eval_loss": 4.390111923217773, + "eval_runtime": 92.48, + "eval_samples_per_second": 20.513, + "eval_steps_per_second": 5.136, + "step": 2700 + }, + { + "epoch": 0.8638189849887066, + "grad_norm": 0.32369351387023926, + "learning_rate": 2.875339399942257e-05, + "loss": 4.3694, + "step": 2701 + }, + { + "epoch": 0.8641387994962921, + "grad_norm": 0.3404898941516876, + "learning_rate": 2.862074605696605e-05, + "loss": 4.3563, + "step": 2702 + }, + { + "epoch": 0.8644586140038778, + "grad_norm": 0.3288138508796692, + "learning_rate": 2.848838946647801e-05, + "loss": 4.3834, + "step": 2703 + }, + { + "epoch": 0.8647784285114634, + "grad_norm": 0.35545358061790466, + "learning_rate": 2.835632437005626e-05, + "loss": 4.2707, + "step": 2704 + }, + { + "epoch": 0.8650982430190489, + "grad_norm": 0.32958588004112244, + "learning_rate": 2.8224550909485344e-05, + "loss": 4.4501, + "step": 2705 + }, + { + "epoch": 0.8654180575266346, + "grad_norm": 0.32998397946357727, + "learning_rate": 2.8093069226236865e-05, + "loss": 4.3847, + "step": 2706 + }, + { + "epoch": 0.8657378720342201, + "grad_norm": 0.3279590308666229, + "learning_rate": 2.796187946146937e-05, + "loss": 4.3963, + "step": 2707 + }, + { + "epoch": 0.8660576865418057, + "grad_norm": 0.34240469336509705, + "learning_rate": 2.7830981756027636e-05, + "loss": 4.2706, + "step": 2708 + }, + { + "epoch": 0.8663775010493914, + "grad_norm": 0.32866594195365906, + "learning_rate": 2.7700376250443147e-05, + "loss": 4.3242, + "step": 2709 + }, + { + "epoch": 0.8666973155569769, + "grad_norm": 0.3390978276729584, + "learning_rate": 2.757006308493347e-05, + "loss": 4.3714, + "step": 2710 + }, + { + "epoch": 0.8670171300645626, + "grad_norm": 0.3310787081718445, + "learning_rate": 2.7440042399402496e-05, + "loss": 4.3714, + "step": 2711 + }, + { + "epoch": 0.8673369445721482, + "grad_norm": 0.3252509832382202, + "learning_rate": 2.7310314333440097e-05, + "loss": 4.2953, + "step": 2712 + }, + { + "epoch": 0.8676567590797337, + "grad_norm": 0.3651213049888611, + "learning_rate": 2.7180879026321866e-05, + "loss": 4.4167, + "step": 2713 + }, + { + "epoch": 0.8679765735873194, + "grad_norm": 0.3748612105846405, + "learning_rate": 2.7051736617009277e-05, + "loss": 4.3505, + "step": 2714 + }, + { + "epoch": 0.868296388094905, + "grad_norm": 0.34408658742904663, + "learning_rate": 2.6922887244149126e-05, + "loss": 4.4079, + "step": 2715 + }, + { + "epoch": 0.8686162026024905, + "grad_norm": 0.33819928765296936, + "learning_rate": 2.6794331046073724e-05, + "loss": 4.2749, + "step": 2716 + }, + { + "epoch": 0.8689360171100762, + "grad_norm": 0.33545467257499695, + "learning_rate": 2.6666068160800702e-05, + "loss": 4.4416, + "step": 2717 + }, + { + "epoch": 0.8692558316176617, + "grad_norm": 0.3370988667011261, + "learning_rate": 2.6538098726032675e-05, + "loss": 4.3412, + "step": 2718 + }, + { + "epoch": 0.8695756461252474, + "grad_norm": 0.33135709166526794, + "learning_rate": 2.6410422879157313e-05, + "loss": 4.3847, + "step": 2719 + }, + { + "epoch": 0.869895460632833, + "grad_norm": 0.36448410153388977, + "learning_rate": 2.628304075724693e-05, + "loss": 4.3429, + "step": 2720 + }, + { + "epoch": 0.8702152751404185, + "grad_norm": 0.39434826374053955, + "learning_rate": 2.6155952497058643e-05, + "loss": 4.3748, + "step": 2721 + }, + { + "epoch": 0.8705350896480042, + "grad_norm": 0.3282822072505951, + "learning_rate": 2.6029158235033997e-05, + "loss": 4.4443, + "step": 2722 + }, + { + "epoch": 0.8708549041555897, + "grad_norm": 0.3259486258029938, + "learning_rate": 2.5902658107299078e-05, + "loss": 4.4201, + "step": 2723 + }, + { + "epoch": 0.8711747186631753, + "grad_norm": 0.35804739594459534, + "learning_rate": 2.5776452249663847e-05, + "loss": 4.3938, + "step": 2724 + }, + { + "epoch": 0.871494533170761, + "grad_norm": 0.32361823320388794, + "learning_rate": 2.5650540797622687e-05, + "loss": 4.4667, + "step": 2725 + }, + { + "epoch": 0.8718143476783465, + "grad_norm": 0.32772985100746155, + "learning_rate": 2.5524923886353697e-05, + "loss": 4.3217, + "step": 2726 + }, + { + "epoch": 0.8721341621859322, + "grad_norm": 0.3569638729095459, + "learning_rate": 2.5399601650718838e-05, + "loss": 4.4559, + "step": 2727 + }, + { + "epoch": 0.8724539766935178, + "grad_norm": 0.3326680064201355, + "learning_rate": 2.5274574225263776e-05, + "loss": 4.366, + "step": 2728 + }, + { + "epoch": 0.8727737912011033, + "grad_norm": 0.3196980655193329, + "learning_rate": 2.5149841744217415e-05, + "loss": 4.293, + "step": 2729 + }, + { + "epoch": 0.873093605708689, + "grad_norm": 0.331160306930542, + "learning_rate": 2.5025404341492327e-05, + "loss": 4.3402, + "step": 2730 + }, + { + "epoch": 0.8734134202162745, + "grad_norm": 0.33455970883369446, + "learning_rate": 2.4901262150684055e-05, + "loss": 4.4117, + "step": 2731 + }, + { + "epoch": 0.8737332347238601, + "grad_norm": 0.3468061089515686, + "learning_rate": 2.4777415305071346e-05, + "loss": 4.4355, + "step": 2732 + }, + { + "epoch": 0.8740530492314458, + "grad_norm": 0.3290843963623047, + "learning_rate": 2.4653863937615813e-05, + "loss": 4.4007, + "step": 2733 + }, + { + "epoch": 0.8743728637390313, + "grad_norm": 0.3317509889602661, + "learning_rate": 2.4530608180961786e-05, + "loss": 4.3998, + "step": 2734 + }, + { + "epoch": 0.874692678246617, + "grad_norm": 0.3440130949020386, + "learning_rate": 2.440764816743631e-05, + "loss": 4.362, + "step": 2735 + }, + { + "epoch": 0.8750124927542026, + "grad_norm": 0.35315367579460144, + "learning_rate": 2.428498402904889e-05, + "loss": 4.3674, + "step": 2736 + }, + { + "epoch": 0.8753323072617881, + "grad_norm": 0.3442535698413849, + "learning_rate": 2.416261589749139e-05, + "loss": 4.4137, + "step": 2737 + }, + { + "epoch": 0.8756521217693738, + "grad_norm": 0.3287537097930908, + "learning_rate": 2.4040543904137942e-05, + "loss": 4.3668, + "step": 2738 + }, + { + "epoch": 0.8759719362769594, + "grad_norm": 0.32724133133888245, + "learning_rate": 2.391876818004452e-05, + "loss": 4.3551, + "step": 2739 + }, + { + "epoch": 0.8762917507845449, + "grad_norm": 0.3481774628162384, + "learning_rate": 2.3797288855949382e-05, + "loss": 4.2624, + "step": 2740 + }, + { + "epoch": 0.8766115652921306, + "grad_norm": 0.33640459179878235, + "learning_rate": 2.3676106062272126e-05, + "loss": 4.4033, + "step": 2741 + }, + { + "epoch": 0.8769313797997161, + "grad_norm": 0.33367154002189636, + "learning_rate": 2.3555219929114454e-05, + "loss": 4.4157, + "step": 2742 + }, + { + "epoch": 0.8772511943073018, + "grad_norm": 0.32960766553878784, + "learning_rate": 2.343463058625932e-05, + "loss": 4.3441, + "step": 2743 + }, + { + "epoch": 0.8775710088148874, + "grad_norm": 0.33007586002349854, + "learning_rate": 2.331433816317102e-05, + "loss": 4.4697, + "step": 2744 + }, + { + "epoch": 0.8778908233224729, + "grad_norm": 0.32566601037979126, + "learning_rate": 2.3194342788995257e-05, + "loss": 4.2543, + "step": 2745 + }, + { + "epoch": 0.8782106378300586, + "grad_norm": 0.3600383698940277, + "learning_rate": 2.307464459255851e-05, + "loss": 4.3637, + "step": 2746 + }, + { + "epoch": 0.8785304523376442, + "grad_norm": 0.3245023488998413, + "learning_rate": 2.2955243702368652e-05, + "loss": 4.3556, + "step": 2747 + }, + { + "epoch": 0.8788502668452298, + "grad_norm": 0.32192283868789673, + "learning_rate": 2.2836140246613977e-05, + "loss": 4.3702, + "step": 2748 + }, + { + "epoch": 0.8791700813528154, + "grad_norm": 0.32928332686424255, + "learning_rate": 2.271733435316363e-05, + "loss": 4.4101, + "step": 2749 + }, + { + "epoch": 0.8794898958604009, + "grad_norm": 0.3395914137363434, + "learning_rate": 2.2598826149567352e-05, + "loss": 4.4447, + "step": 2750 + }, + { + "epoch": 0.8798097103679866, + "grad_norm": 0.33036407828330994, + "learning_rate": 2.2480615763055032e-05, + "loss": 4.335, + "step": 2751 + }, + { + "epoch": 0.8801295248755722, + "grad_norm": 0.331396222114563, + "learning_rate": 2.2362703320537156e-05, + "loss": 4.3357, + "step": 2752 + }, + { + "epoch": 0.8804493393831577, + "grad_norm": 0.32844099402427673, + "learning_rate": 2.2245088948604095e-05, + "loss": 4.3891, + "step": 2753 + }, + { + "epoch": 0.8807691538907434, + "grad_norm": 0.3302816152572632, + "learning_rate": 2.2127772773526342e-05, + "loss": 4.3532, + "step": 2754 + }, + { + "epoch": 0.881088968398329, + "grad_norm": 0.33271288871765137, + "learning_rate": 2.201075492125415e-05, + "loss": 4.3602, + "step": 2755 + }, + { + "epoch": 0.8814087829059146, + "grad_norm": 0.32084935903549194, + "learning_rate": 2.1894035517417486e-05, + "loss": 4.3989, + "step": 2756 + }, + { + "epoch": 0.8817285974135002, + "grad_norm": 0.33213767409324646, + "learning_rate": 2.1777614687326116e-05, + "loss": 4.3145, + "step": 2757 + }, + { + "epoch": 0.8820484119210857, + "grad_norm": 0.3316733241081238, + "learning_rate": 2.166149255596896e-05, + "loss": 4.3445, + "step": 2758 + }, + { + "epoch": 0.8823682264286714, + "grad_norm": 0.3338378369808197, + "learning_rate": 2.154566924801453e-05, + "loss": 4.3665, + "step": 2759 + }, + { + "epoch": 0.882688040936257, + "grad_norm": 0.32423871755599976, + "learning_rate": 2.1430144887810218e-05, + "loss": 4.3968, + "step": 2760 + }, + { + "epoch": 0.8830078554438425, + "grad_norm": 0.3304431736469269, + "learning_rate": 2.131491959938275e-05, + "loss": 4.364, + "step": 2761 + }, + { + "epoch": 0.8833276699514282, + "grad_norm": 0.3281329572200775, + "learning_rate": 2.119999350643764e-05, + "loss": 4.3357, + "step": 2762 + }, + { + "epoch": 0.8836474844590138, + "grad_norm": 0.3241371512413025, + "learning_rate": 2.108536673235922e-05, + "loss": 4.3018, + "step": 2763 + }, + { + "epoch": 0.8839672989665994, + "grad_norm": 0.3227525055408478, + "learning_rate": 2.0971039400210453e-05, + "loss": 4.3876, + "step": 2764 + }, + { + "epoch": 0.884287113474185, + "grad_norm": 0.31972551345825195, + "learning_rate": 2.0857011632732755e-05, + "loss": 4.335, + "step": 2765 + }, + { + "epoch": 0.8846069279817705, + "grad_norm": 0.3417794704437256, + "learning_rate": 2.0743283552346067e-05, + "loss": 4.3918, + "step": 2766 + }, + { + "epoch": 0.8849267424893562, + "grad_norm": 0.3201994001865387, + "learning_rate": 2.062985528114852e-05, + "loss": 4.3126, + "step": 2767 + }, + { + "epoch": 0.8852465569969418, + "grad_norm": 0.32395732402801514, + "learning_rate": 2.0516726940916372e-05, + "loss": 4.2857, + "step": 2768 + }, + { + "epoch": 0.8855663715045273, + "grad_norm": 0.3288622498512268, + "learning_rate": 2.0403898653103867e-05, + "loss": 4.4953, + "step": 2769 + }, + { + "epoch": 0.885886186012113, + "grad_norm": 0.3226605951786041, + "learning_rate": 2.029137053884311e-05, + "loss": 4.4309, + "step": 2770 + }, + { + "epoch": 0.8862060005196986, + "grad_norm": 0.31975656747817993, + "learning_rate": 2.0179142718943964e-05, + "loss": 4.3941, + "step": 2771 + }, + { + "epoch": 0.8865258150272842, + "grad_norm": 0.32429590821266174, + "learning_rate": 2.006721531389388e-05, + "loss": 4.4056, + "step": 2772 + }, + { + "epoch": 0.8868456295348698, + "grad_norm": 0.3391755521297455, + "learning_rate": 1.9955588443857807e-05, + "loss": 4.3843, + "step": 2773 + }, + { + "epoch": 0.8871654440424553, + "grad_norm": 0.3246627151966095, + "learning_rate": 1.9844262228678077e-05, + "loss": 4.3636, + "step": 2774 + }, + { + "epoch": 0.887485258550041, + "grad_norm": 0.3355211913585663, + "learning_rate": 1.9733236787874053e-05, + "loss": 4.3891, + "step": 2775 + }, + { + "epoch": 0.8878050730576266, + "grad_norm": 0.3338679075241089, + "learning_rate": 1.9622512240642386e-05, + "loss": 4.3871, + "step": 2776 + }, + { + "epoch": 0.8881248875652121, + "grad_norm": 0.3314814567565918, + "learning_rate": 1.9512088705856654e-05, + "loss": 4.4157, + "step": 2777 + }, + { + "epoch": 0.8884447020727978, + "grad_norm": 0.32310590147972107, + "learning_rate": 1.9401966302067262e-05, + "loss": 4.3733, + "step": 2778 + }, + { + "epoch": 0.8887645165803834, + "grad_norm": 0.35283005237579346, + "learning_rate": 1.9292145147501204e-05, + "loss": 4.3552, + "step": 2779 + }, + { + "epoch": 0.889084331087969, + "grad_norm": 0.3279760181903839, + "learning_rate": 1.91826253600622e-05, + "loss": 4.3163, + "step": 2780 + }, + { + "epoch": 0.8894041455955546, + "grad_norm": 0.3331490159034729, + "learning_rate": 1.907340705733036e-05, + "loss": 4.3924, + "step": 2781 + }, + { + "epoch": 0.8897239601031401, + "grad_norm": 0.32731348276138306, + "learning_rate": 1.8964490356562155e-05, + "loss": 4.3928, + "step": 2782 + }, + { + "epoch": 0.8900437746107258, + "grad_norm": 0.32343342900276184, + "learning_rate": 1.8855875374690288e-05, + "loss": 4.3203, + "step": 2783 + }, + { + "epoch": 0.8903635891183114, + "grad_norm": 0.37520188093185425, + "learning_rate": 1.8747562228323344e-05, + "loss": 4.3195, + "step": 2784 + }, + { + "epoch": 0.8906834036258969, + "grad_norm": 0.32275810837745667, + "learning_rate": 1.863955103374607e-05, + "loss": 4.3777, + "step": 2785 + }, + { + "epoch": 0.8910032181334826, + "grad_norm": 0.35362449288368225, + "learning_rate": 1.8531841906918976e-05, + "loss": 4.3911, + "step": 2786 + }, + { + "epoch": 0.8913230326410682, + "grad_norm": 0.3307854235172272, + "learning_rate": 1.8424434963478262e-05, + "loss": 4.3747, + "step": 2787 + }, + { + "epoch": 0.8916428471486538, + "grad_norm": 0.33758893609046936, + "learning_rate": 1.8317330318735757e-05, + "loss": 4.3987, + "step": 2788 + }, + { + "epoch": 0.8919626616562394, + "grad_norm": 0.33458662033081055, + "learning_rate": 1.8210528087678577e-05, + "loss": 4.3698, + "step": 2789 + }, + { + "epoch": 0.892282476163825, + "grad_norm": 0.31996577978134155, + "learning_rate": 1.810402838496937e-05, + "loss": 4.3393, + "step": 2790 + }, + { + "epoch": 0.8926022906714106, + "grad_norm": 0.32762593030929565, + "learning_rate": 1.799783132494581e-05, + "loss": 4.3347, + "step": 2791 + }, + { + "epoch": 0.8929221051789962, + "grad_norm": 0.3323177397251129, + "learning_rate": 1.789193702162086e-05, + "loss": 4.4891, + "step": 2792 + }, + { + "epoch": 0.8932419196865817, + "grad_norm": 0.33863160014152527, + "learning_rate": 1.7786345588682317e-05, + "loss": 4.4102, + "step": 2793 + }, + { + "epoch": 0.8935617341941674, + "grad_norm": 0.3298216462135315, + "learning_rate": 1.7681057139492792e-05, + "loss": 4.3999, + "step": 2794 + }, + { + "epoch": 0.893881548701753, + "grad_norm": 0.33105114102363586, + "learning_rate": 1.7576071787089672e-05, + "loss": 4.359, + "step": 2795 + }, + { + "epoch": 0.8942013632093386, + "grad_norm": 0.32095324993133545, + "learning_rate": 1.7471389644184897e-05, + "loss": 4.4448, + "step": 2796 + }, + { + "epoch": 0.8945211777169242, + "grad_norm": 0.3180572986602783, + "learning_rate": 1.7367010823164862e-05, + "loss": 4.4066, + "step": 2797 + }, + { + "epoch": 0.8948409922245097, + "grad_norm": 0.32465660572052, + "learning_rate": 1.726293543609053e-05, + "loss": 4.3098, + "step": 2798 + }, + { + "epoch": 0.8951608067320954, + "grad_norm": 0.32698917388916016, + "learning_rate": 1.7159163594696756e-05, + "loss": 4.3551, + "step": 2799 + }, + { + "epoch": 0.895480621239681, + "grad_norm": 0.3233741223812103, + "learning_rate": 1.7055695410392823e-05, + "loss": 4.2949, + "step": 2800 + }, + { + "epoch": 0.895480621239681, + "eval_loss": 4.384608268737793, + "eval_runtime": 97.7804, + "eval_samples_per_second": 19.401, + "eval_steps_per_second": 4.858, + "step": 2800 + }, + { + "epoch": 0.8958004357472665, + "grad_norm": 0.32978156208992004, + "learning_rate": 1.695253099426177e-05, + "loss": 4.2676, + "step": 2801 + }, + { + "epoch": 0.8961202502548522, + "grad_norm": 0.3305906057357788, + "learning_rate": 1.6849670457060605e-05, + "loss": 4.353, + "step": 2802 + }, + { + "epoch": 0.8964400647624378, + "grad_norm": 0.33238425850868225, + "learning_rate": 1.6747113909220155e-05, + "loss": 4.374, + "step": 2803 + }, + { + "epoch": 0.8967598792700234, + "grad_norm": 0.33963367342948914, + "learning_rate": 1.6644861460844782e-05, + "loss": 4.2773, + "step": 2804 + }, + { + "epoch": 0.897079693777609, + "grad_norm": 0.3328540623188019, + "learning_rate": 1.6542913221712506e-05, + "loss": 4.4412, + "step": 2805 + }, + { + "epoch": 0.8973995082851945, + "grad_norm": 0.3270185887813568, + "learning_rate": 1.6441269301274572e-05, + "loss": 4.3709, + "step": 2806 + }, + { + "epoch": 0.8977193227927802, + "grad_norm": 0.31789711117744446, + "learning_rate": 1.633992980865556e-05, + "loss": 4.3758, + "step": 2807 + }, + { + "epoch": 0.8980391373003658, + "grad_norm": 0.33433958888053894, + "learning_rate": 1.6238894852653338e-05, + "loss": 4.3488, + "step": 2808 + }, + { + "epoch": 0.8983589518079513, + "grad_norm": 0.3263933062553406, + "learning_rate": 1.6138164541738674e-05, + "loss": 4.4204, + "step": 2809 + }, + { + "epoch": 0.898678766315537, + "grad_norm": 0.3347971737384796, + "learning_rate": 1.6037738984055425e-05, + "loss": 4.34, + "step": 2810 + }, + { + "epoch": 0.8989985808231226, + "grad_norm": 0.32892242074012756, + "learning_rate": 1.5937618287420052e-05, + "loss": 4.3679, + "step": 2811 + }, + { + "epoch": 0.8993183953307082, + "grad_norm": 0.3295742869377136, + "learning_rate": 1.583780255932193e-05, + "loss": 4.3666, + "step": 2812 + }, + { + "epoch": 0.8996382098382938, + "grad_norm": 0.33569231629371643, + "learning_rate": 1.5738291906922883e-05, + "loss": 4.3431, + "step": 2813 + }, + { + "epoch": 0.8999580243458793, + "grad_norm": 0.3311055898666382, + "learning_rate": 1.5639086437057314e-05, + "loss": 4.3241, + "step": 2814 + }, + { + "epoch": 0.900277838853465, + "grad_norm": 0.4988267421722412, + "learning_rate": 1.5540186256231823e-05, + "loss": 4.3952, + "step": 2815 + }, + { + "epoch": 0.9005976533610506, + "grad_norm": 0.3264239728450775, + "learning_rate": 1.5441591470625414e-05, + "loss": 4.333, + "step": 2816 + }, + { + "epoch": 0.9009174678686362, + "grad_norm": 0.3201085031032562, + "learning_rate": 1.534330218608918e-05, + "loss": 4.4191, + "step": 2817 + }, + { + "epoch": 0.9012372823762218, + "grad_norm": 0.3359006643295288, + "learning_rate": 1.5245318508146175e-05, + "loss": 4.3771, + "step": 2818 + }, + { + "epoch": 0.9015570968838074, + "grad_norm": 0.3259337544441223, + "learning_rate": 1.5147640541991424e-05, + "loss": 4.4356, + "step": 2819 + }, + { + "epoch": 0.901876911391393, + "grad_norm": 0.3287445306777954, + "learning_rate": 1.5050268392491639e-05, + "loss": 4.4308, + "step": 2820 + }, + { + "epoch": 0.9021967258989786, + "grad_norm": 0.33562859892845154, + "learning_rate": 1.4953202164185297e-05, + "loss": 4.4679, + "step": 2821 + }, + { + "epoch": 0.9025165404065641, + "grad_norm": 0.3279714584350586, + "learning_rate": 1.4856441961282472e-05, + "loss": 4.4241, + "step": 2822 + }, + { + "epoch": 0.9028363549141498, + "grad_norm": 0.3326357305049896, + "learning_rate": 1.4759987887664537e-05, + "loss": 4.3265, + "step": 2823 + }, + { + "epoch": 0.9031561694217354, + "grad_norm": 0.3204142451286316, + "learning_rate": 1.4663840046884423e-05, + "loss": 4.29, + "step": 2824 + }, + { + "epoch": 0.903475983929321, + "grad_norm": 0.32729199528694153, + "learning_rate": 1.456799854216606e-05, + "loss": 4.3067, + "step": 2825 + }, + { + "epoch": 0.9037957984369066, + "grad_norm": 0.3270171880722046, + "learning_rate": 1.447246347640464e-05, + "loss": 4.3767, + "step": 2826 + }, + { + "epoch": 0.9041156129444922, + "grad_norm": 0.3349086344242096, + "learning_rate": 1.437723495216635e-05, + "loss": 4.2788, + "step": 2827 + }, + { + "epoch": 0.9044354274520778, + "grad_norm": 0.3228219449520111, + "learning_rate": 1.4282313071688211e-05, + "loss": 4.4143, + "step": 2828 + }, + { + "epoch": 0.9047552419596634, + "grad_norm": 0.3361509442329407, + "learning_rate": 1.4187697936878172e-05, + "loss": 4.3191, + "step": 2829 + }, + { + "epoch": 0.905075056467249, + "grad_norm": 0.3282942473888397, + "learning_rate": 1.4093389649314613e-05, + "loss": 4.3967, + "step": 2830 + }, + { + "epoch": 0.9053948709748346, + "grad_norm": 0.3477500379085541, + "learning_rate": 1.399938831024674e-05, + "loss": 4.5111, + "step": 2831 + }, + { + "epoch": 0.9057146854824202, + "grad_norm": 0.3260105550289154, + "learning_rate": 1.3905694020594093e-05, + "loss": 4.4794, + "step": 2832 + }, + { + "epoch": 0.9060344999900058, + "grad_norm": 0.3467327952384949, + "learning_rate": 1.3812306880946577e-05, + "loss": 4.3209, + "step": 2833 + }, + { + "epoch": 0.9063543144975914, + "grad_norm": 0.3284205496311188, + "learning_rate": 1.3719226991564392e-05, + "loss": 4.3277, + "step": 2834 + }, + { + "epoch": 0.906674129005177, + "grad_norm": 0.33202722668647766, + "learning_rate": 1.3626454452377734e-05, + "loss": 4.4188, + "step": 2835 + }, + { + "epoch": 0.9069939435127626, + "grad_norm": 0.33401980996131897, + "learning_rate": 1.3533989362987063e-05, + "loss": 4.4344, + "step": 2836 + }, + { + "epoch": 0.9073137580203482, + "grad_norm": 0.330626517534256, + "learning_rate": 1.3441831822662441e-05, + "loss": 4.3946, + "step": 2837 + }, + { + "epoch": 0.9076335725279338, + "grad_norm": 0.3449816107749939, + "learning_rate": 1.3349981930344156e-05, + "loss": 4.346, + "step": 2838 + }, + { + "epoch": 0.9079533870355194, + "grad_norm": 0.33034005761146545, + "learning_rate": 1.3258439784641795e-05, + "loss": 4.3935, + "step": 2839 + }, + { + "epoch": 0.908273201543105, + "grad_norm": 0.33299288153648376, + "learning_rate": 1.3167205483834842e-05, + "loss": 4.3037, + "step": 2840 + }, + { + "epoch": 0.9085930160506906, + "grad_norm": 0.3254457712173462, + "learning_rate": 1.307627912587218e-05, + "loss": 4.3259, + "step": 2841 + }, + { + "epoch": 0.9089128305582762, + "grad_norm": 0.32772693037986755, + "learning_rate": 1.2985660808371955e-05, + "loss": 4.435, + "step": 2842 + }, + { + "epoch": 0.9092326450658618, + "grad_norm": 0.3270621597766876, + "learning_rate": 1.2895350628621882e-05, + "loss": 4.2919, + "step": 2843 + }, + { + "epoch": 0.9095524595734474, + "grad_norm": 0.32787418365478516, + "learning_rate": 1.2805348683578598e-05, + "loss": 4.3179, + "step": 2844 + }, + { + "epoch": 0.909872274081033, + "grad_norm": 0.32550475001335144, + "learning_rate": 1.271565506986798e-05, + "loss": 4.3153, + "step": 2845 + }, + { + "epoch": 0.9101920885886186, + "grad_norm": 0.3306429982185364, + "learning_rate": 1.2626269883784834e-05, + "loss": 4.3148, + "step": 2846 + }, + { + "epoch": 0.9105119030962042, + "grad_norm": 0.32415875792503357, + "learning_rate": 1.2537193221292763e-05, + "loss": 4.3869, + "step": 2847 + }, + { + "epoch": 0.9108317176037898, + "grad_norm": 0.3241727352142334, + "learning_rate": 1.2448425178024302e-05, + "loss": 4.408, + "step": 2848 + }, + { + "epoch": 0.9111515321113755, + "grad_norm": 0.34127408266067505, + "learning_rate": 1.2359965849280518e-05, + "loss": 4.3713, + "step": 2849 + }, + { + "epoch": 0.911471346618961, + "grad_norm": 0.3368206322193146, + "learning_rate": 1.2271815330031076e-05, + "loss": 4.4106, + "step": 2850 + }, + { + "epoch": 0.9117911611265466, + "grad_norm": 0.32262179255485535, + "learning_rate": 1.218397371491414e-05, + "loss": 4.3404, + "step": 2851 + }, + { + "epoch": 0.9121109756341322, + "grad_norm": 0.3209473788738251, + "learning_rate": 1.2096441098236108e-05, + "loss": 4.3591, + "step": 2852 + }, + { + "epoch": 0.9124307901417178, + "grad_norm": 0.33111003041267395, + "learning_rate": 1.2009217573971907e-05, + "loss": 4.3861, + "step": 2853 + }, + { + "epoch": 0.9127506046493034, + "grad_norm": 0.3383842408657074, + "learning_rate": 1.1922303235764363e-05, + "loss": 4.4078, + "step": 2854 + }, + { + "epoch": 0.913070419156889, + "grad_norm": 0.3423125743865967, + "learning_rate": 1.1835698176924468e-05, + "loss": 4.3925, + "step": 2855 + }, + { + "epoch": 0.9133902336644746, + "grad_norm": 0.3296370208263397, + "learning_rate": 1.1749402490431148e-05, + "loss": 4.364, + "step": 2856 + }, + { + "epoch": 0.9137100481720603, + "grad_norm": 0.32709506154060364, + "learning_rate": 1.1663416268931192e-05, + "loss": 4.3299, + "step": 2857 + }, + { + "epoch": 0.9140298626796458, + "grad_norm": 0.3346802294254303, + "learning_rate": 1.1577739604739155e-05, + "loss": 4.3629, + "step": 2858 + }, + { + "epoch": 0.9143496771872314, + "grad_norm": 0.32957184314727783, + "learning_rate": 1.1492372589837261e-05, + "loss": 4.3879, + "step": 2859 + }, + { + "epoch": 0.914669491694817, + "grad_norm": 0.3254009783267975, + "learning_rate": 1.1407315315875365e-05, + "loss": 4.4061, + "step": 2860 + }, + { + "epoch": 0.9149893062024026, + "grad_norm": 0.3332465589046478, + "learning_rate": 1.1322567874170552e-05, + "loss": 4.3795, + "step": 2861 + }, + { + "epoch": 0.9153091207099882, + "grad_norm": 0.31989049911499023, + "learning_rate": 1.1238130355707509e-05, + "loss": 4.3036, + "step": 2862 + }, + { + "epoch": 0.9156289352175738, + "grad_norm": 0.320244699716568, + "learning_rate": 1.1154002851138122e-05, + "loss": 4.3326, + "step": 2863 + }, + { + "epoch": 0.9159487497251594, + "grad_norm": 0.314765065908432, + "learning_rate": 1.107018545078141e-05, + "loss": 4.2839, + "step": 2864 + }, + { + "epoch": 0.916268564232745, + "grad_norm": 0.31837067008018494, + "learning_rate": 1.0986678244623526e-05, + "loss": 4.308, + "step": 2865 + }, + { + "epoch": 0.9165883787403306, + "grad_norm": 0.32831281423568726, + "learning_rate": 1.0903481322317486e-05, + "loss": 4.3934, + "step": 2866 + }, + { + "epoch": 0.9169081932479162, + "grad_norm": 0.320701003074646, + "learning_rate": 1.0820594773183278e-05, + "loss": 4.3102, + "step": 2867 + }, + { + "epoch": 0.9172280077555018, + "grad_norm": 0.37862440943717957, + "learning_rate": 1.0738018686207683e-05, + "loss": 4.3674, + "step": 2868 + }, + { + "epoch": 0.9175478222630874, + "grad_norm": 0.34594613313674927, + "learning_rate": 1.0655753150044155e-05, + "loss": 4.2845, + "step": 2869 + }, + { + "epoch": 0.917867636770673, + "grad_norm": 0.33938634395599365, + "learning_rate": 1.0573798253012778e-05, + "loss": 4.3945, + "step": 2870 + }, + { + "epoch": 0.9181874512782586, + "grad_norm": 0.3470006287097931, + "learning_rate": 1.0492154083099968e-05, + "loss": 4.4398, + "step": 2871 + }, + { + "epoch": 0.9185072657858442, + "grad_norm": 0.3180997371673584, + "learning_rate": 1.0410820727958712e-05, + "loss": 4.3954, + "step": 2872 + }, + { + "epoch": 0.9188270802934299, + "grad_norm": 0.32326364517211914, + "learning_rate": 1.0329798274908297e-05, + "loss": 4.3679, + "step": 2873 + }, + { + "epoch": 0.9191468948010154, + "grad_norm": 0.3454664945602417, + "learning_rate": 1.0249086810934204e-05, + "loss": 4.3929, + "step": 2874 + }, + { + "epoch": 0.919466709308601, + "grad_norm": 0.33620408177375793, + "learning_rate": 1.0168686422687921e-05, + "loss": 4.3501, + "step": 2875 + }, + { + "epoch": 0.9197865238161866, + "grad_norm": 0.335411936044693, + "learning_rate": 1.008859719648717e-05, + "loss": 4.3344, + "step": 2876 + }, + { + "epoch": 0.9201063383237722, + "grad_norm": 0.32632091641426086, + "learning_rate": 1.0008819218315434e-05, + "loss": 4.3609, + "step": 2877 + }, + { + "epoch": 0.9204261528313578, + "grad_norm": 0.3331916630268097, + "learning_rate": 9.929352573822203e-06, + "loss": 4.287, + "step": 2878 + }, + { + "epoch": 0.9207459673389434, + "grad_norm": 0.3235928416252136, + "learning_rate": 9.850197348322597e-06, + "loss": 4.3305, + "step": 2879 + }, + { + "epoch": 0.921065781846529, + "grad_norm": 0.3344513177871704, + "learning_rate": 9.771353626797373e-06, + "loss": 4.35, + "step": 2880 + }, + { + "epoch": 0.9213855963541147, + "grad_norm": 0.33483588695526123, + "learning_rate": 9.692821493892988e-06, + "loss": 4.4333, + "step": 2881 + }, + { + "epoch": 0.9217054108617002, + "grad_norm": 0.32479128241539, + "learning_rate": 9.614601033921266e-06, + "loss": 4.2633, + "step": 2882 + }, + { + "epoch": 0.9220252253692858, + "grad_norm": 0.35839805006980896, + "learning_rate": 9.536692330859497e-06, + "loss": 4.4617, + "step": 2883 + }, + { + "epoch": 0.9223450398768714, + "grad_norm": 0.3326389193534851, + "learning_rate": 9.459095468350241e-06, + "loss": 4.3522, + "step": 2884 + }, + { + "epoch": 0.922664854384457, + "grad_norm": 0.3288221061229706, + "learning_rate": 9.381810529701228e-06, + "loss": 4.3787, + "step": 2885 + }, + { + "epoch": 0.9229846688920427, + "grad_norm": 0.3186630606651306, + "learning_rate": 9.30483759788535e-06, + "loss": 4.403, + "step": 2886 + }, + { + "epoch": 0.9233044833996282, + "grad_norm": 0.3173036277294159, + "learning_rate": 9.228176755540506e-06, + "loss": 4.3659, + "step": 2887 + }, + { + "epoch": 0.9236242979072138, + "grad_norm": 0.3245466649532318, + "learning_rate": 9.151828084969593e-06, + "loss": 4.3796, + "step": 2888 + }, + { + "epoch": 0.9239441124147995, + "grad_norm": 0.31835073232650757, + "learning_rate": 9.075791668140308e-06, + "loss": 4.4037, + "step": 2889 + }, + { + "epoch": 0.924263926922385, + "grad_norm": 0.34545114636421204, + "learning_rate": 9.000067586685089e-06, + "loss": 4.358, + "step": 2890 + }, + { + "epoch": 0.9245837414299706, + "grad_norm": 0.3350619077682495, + "learning_rate": 8.924655921901135e-06, + "loss": 4.3785, + "step": 2891 + }, + { + "epoch": 0.9249035559375562, + "grad_norm": 0.3419342041015625, + "learning_rate": 8.849556754750153e-06, + "loss": 4.3976, + "step": 2892 + }, + { + "epoch": 0.9252233704451418, + "grad_norm": 1.3042224645614624, + "learning_rate": 8.774770165858347e-06, + "loss": 4.424, + "step": 2893 + }, + { + "epoch": 0.9255431849527275, + "grad_norm": 0.3174867331981659, + "learning_rate": 8.70029623551649e-06, + "loss": 4.3476, + "step": 2894 + }, + { + "epoch": 0.925862999460313, + "grad_norm": 0.3302491307258606, + "learning_rate": 8.626135043679495e-06, + "loss": 4.3753, + "step": 2895 + }, + { + "epoch": 0.9261828139678986, + "grad_norm": 0.3348652422428131, + "learning_rate": 8.552286669966635e-06, + "loss": 4.3164, + "step": 2896 + }, + { + "epoch": 0.9265026284754843, + "grad_norm": 0.3382120430469513, + "learning_rate": 8.47875119366126e-06, + "loss": 4.4176, + "step": 2897 + }, + { + "epoch": 0.9268224429830698, + "grad_norm": 0.3329518437385559, + "learning_rate": 8.405528693710883e-06, + "loss": 4.3972, + "step": 2898 + }, + { + "epoch": 0.9271422574906554, + "grad_norm": 0.34764277935028076, + "learning_rate": 8.332619248726957e-06, + "loss": 4.4171, + "step": 2899 + }, + { + "epoch": 0.927462071998241, + "grad_norm": 0.3330550491809845, + "learning_rate": 8.260022936984833e-06, + "loss": 4.3673, + "step": 2900 + }, + { + "epoch": 0.927462071998241, + "eval_loss": 4.38118314743042, + "eval_runtime": 96.8272, + "eval_samples_per_second": 19.592, + "eval_steps_per_second": 4.906, + "step": 2900 + }, + { + "epoch": 0.9277818865058266, + "grad_norm": 0.3248373866081238, + "learning_rate": 8.187739836423734e-06, + "loss": 4.3651, + "step": 2901 + }, + { + "epoch": 0.9281017010134123, + "grad_norm": 0.32036662101745605, + "learning_rate": 8.115770024646518e-06, + "loss": 4.3364, + "step": 2902 + }, + { + "epoch": 0.9284215155209978, + "grad_norm": 0.3199023902416229, + "learning_rate": 8.044113578919842e-06, + "loss": 4.4604, + "step": 2903 + }, + { + "epoch": 0.9287413300285834, + "grad_norm": 0.32586199045181274, + "learning_rate": 7.97277057617377e-06, + "loss": 4.392, + "step": 2904 + }, + { + "epoch": 0.9290611445361691, + "grad_norm": 0.34327834844589233, + "learning_rate": 7.901741093002002e-06, + "loss": 4.3379, + "step": 2905 + }, + { + "epoch": 0.9293809590437546, + "grad_norm": 0.3284776508808136, + "learning_rate": 7.8310252056616e-06, + "loss": 4.3835, + "step": 2906 + }, + { + "epoch": 0.9297007735513402, + "grad_norm": 0.3319602608680725, + "learning_rate": 7.760622990072873e-06, + "loss": 4.4086, + "step": 2907 + }, + { + "epoch": 0.9300205880589258, + "grad_norm": 0.3359242379665375, + "learning_rate": 7.690534521819458e-06, + "loss": 4.3714, + "step": 2908 + }, + { + "epoch": 0.9303404025665114, + "grad_norm": 0.3331519067287445, + "learning_rate": 7.6207598761481305e-06, + "loss": 4.3064, + "step": 2909 + }, + { + "epoch": 0.9306602170740971, + "grad_norm": 0.32268157601356506, + "learning_rate": 7.5512991279687684e-06, + "loss": 4.3458, + "step": 2910 + }, + { + "epoch": 0.9309800315816826, + "grad_norm": 0.32395583391189575, + "learning_rate": 7.482152351854187e-06, + "loss": 4.348, + "step": 2911 + }, + { + "epoch": 0.9312998460892682, + "grad_norm": 0.33460599184036255, + "learning_rate": 7.413319622040137e-06, + "loss": 4.3118, + "step": 2912 + }, + { + "epoch": 0.9316196605968539, + "grad_norm": 0.3139401972293854, + "learning_rate": 7.344801012425306e-06, + "loss": 4.3487, + "step": 2913 + }, + { + "epoch": 0.9319394751044394, + "grad_norm": 0.3270988464355469, + "learning_rate": 7.276596596571016e-06, + "loss": 4.3667, + "step": 2914 + }, + { + "epoch": 0.932259289612025, + "grad_norm": 0.32469436526298523, + "learning_rate": 7.208706447701395e-06, + "loss": 4.3818, + "step": 2915 + }, + { + "epoch": 0.9325791041196106, + "grad_norm": 0.3502923250198364, + "learning_rate": 7.141130638703041e-06, + "loss": 4.3693, + "step": 2916 + }, + { + "epoch": 0.9328989186271962, + "grad_norm": 0.33618977665901184, + "learning_rate": 7.073869242125152e-06, + "loss": 4.4508, + "step": 2917 + }, + { + "epoch": 0.9332187331347819, + "grad_norm": 0.3267005980014801, + "learning_rate": 7.006922330179398e-06, + "loss": 4.4196, + "step": 2918 + }, + { + "epoch": 0.9335385476423674, + "grad_norm": 0.32550060749053955, + "learning_rate": 6.940289974739754e-06, + "loss": 4.2636, + "step": 2919 + }, + { + "epoch": 0.933858362149953, + "grad_norm": 0.32697173953056335, + "learning_rate": 6.8739722473425295e-06, + "loss": 4.2765, + "step": 2920 + }, + { + "epoch": 0.9341781766575387, + "grad_norm": 0.3203597068786621, + "learning_rate": 6.807969219186271e-06, + "loss": 4.3186, + "step": 2921 + }, + { + "epoch": 0.9344979911651242, + "grad_norm": 0.32253390550613403, + "learning_rate": 6.742280961131563e-06, + "loss": 4.4395, + "step": 2922 + }, + { + "epoch": 0.9348178056727098, + "grad_norm": 0.3350317180156708, + "learning_rate": 6.676907543701227e-06, + "loss": 4.4133, + "step": 2923 + }, + { + "epoch": 0.9351376201802954, + "grad_norm": 0.35089296102523804, + "learning_rate": 6.611849037079886e-06, + "loss": 4.3996, + "step": 2924 + }, + { + "epoch": 0.935457434687881, + "grad_norm": 0.3362065851688385, + "learning_rate": 6.5471055111142035e-06, + "loss": 4.3639, + "step": 2925 + }, + { + "epoch": 0.9357772491954667, + "grad_norm": 0.33695685863494873, + "learning_rate": 6.4826770353126115e-06, + "loss": 4.3329, + "step": 2926 + }, + { + "epoch": 0.9360970637030522, + "grad_norm": 0.3362276256084442, + "learning_rate": 6.418563678845379e-06, + "loss": 4.3527, + "step": 2927 + }, + { + "epoch": 0.9364168782106378, + "grad_norm": 0.3312215805053711, + "learning_rate": 6.354765510544346e-06, + "loss": 4.4177, + "step": 2928 + }, + { + "epoch": 0.9367366927182235, + "grad_norm": 0.31979092955589294, + "learning_rate": 6.291282598903091e-06, + "loss": 4.3469, + "step": 2929 + }, + { + "epoch": 0.937056507225809, + "grad_norm": 0.32077279686927795, + "learning_rate": 6.228115012076729e-06, + "loss": 4.4415, + "step": 2930 + }, + { + "epoch": 0.9373763217333946, + "grad_norm": 0.3242551386356354, + "learning_rate": 6.165262817881678e-06, + "loss": 4.3477, + "step": 2931 + }, + { + "epoch": 0.9376961362409802, + "grad_norm": 0.32322248816490173, + "learning_rate": 6.102726083795961e-06, + "loss": 4.3688, + "step": 2932 + }, + { + "epoch": 0.9380159507485658, + "grad_norm": 0.323236346244812, + "learning_rate": 6.040504876958741e-06, + "loss": 4.367, + "step": 2933 + }, + { + "epoch": 0.9383357652561515, + "grad_norm": 0.3199426233768463, + "learning_rate": 5.978599264170614e-06, + "loss": 4.3521, + "step": 2934 + }, + { + "epoch": 0.938655579763737, + "grad_norm": 0.3212834298610687, + "learning_rate": 5.917009311893217e-06, + "loss": 4.3101, + "step": 2935 + }, + { + "epoch": 0.9389753942713226, + "grad_norm": 0.3298892080783844, + "learning_rate": 5.855735086249358e-06, + "loss": 4.4033, + "step": 2936 + }, + { + "epoch": 0.9392952087789083, + "grad_norm": 0.320708692073822, + "learning_rate": 5.794776653022881e-06, + "loss": 4.329, + "step": 2937 + }, + { + "epoch": 0.9396150232864938, + "grad_norm": 0.33011531829833984, + "learning_rate": 5.7341340776585035e-06, + "loss": 4.3612, + "step": 2938 + }, + { + "epoch": 0.9399348377940794, + "grad_norm": 0.3417920768260956, + "learning_rate": 5.673807425262045e-06, + "loss": 4.3838, + "step": 2939 + }, + { + "epoch": 0.940254652301665, + "grad_norm": 0.32390668988227844, + "learning_rate": 5.613796760599898e-06, + "loss": 4.3027, + "step": 2940 + }, + { + "epoch": 0.9405744668092506, + "grad_norm": 0.3507520854473114, + "learning_rate": 5.554102148099393e-06, + "loss": 4.4617, + "step": 2941 + }, + { + "epoch": 0.9408942813168363, + "grad_norm": 0.32159173488616943, + "learning_rate": 5.494723651848532e-06, + "loss": 4.4397, + "step": 2942 + }, + { + "epoch": 0.9412140958244218, + "grad_norm": 0.33860674500465393, + "learning_rate": 5.435661335595753e-06, + "loss": 4.3651, + "step": 2943 + }, + { + "epoch": 0.9415339103320074, + "grad_norm": 0.3142724931240082, + "learning_rate": 5.376915262750369e-06, + "loss": 4.3949, + "step": 2944 + }, + { + "epoch": 0.9418537248395931, + "grad_norm": 0.3150574564933777, + "learning_rate": 5.3184854963818305e-06, + "loss": 4.2867, + "step": 2945 + }, + { + "epoch": 0.9421735393471786, + "grad_norm": 0.32347816228866577, + "learning_rate": 5.260372099220289e-06, + "loss": 4.3839, + "step": 2946 + }, + { + "epoch": 0.9424933538547642, + "grad_norm": 0.32392770051956177, + "learning_rate": 5.202575133656039e-06, + "loss": 4.4329, + "step": 2947 + }, + { + "epoch": 0.9428131683623499, + "grad_norm": 0.334563672542572, + "learning_rate": 5.145094661739746e-06, + "loss": 4.3614, + "step": 2948 + }, + { + "epoch": 0.9431329828699354, + "grad_norm": 0.32934969663619995, + "learning_rate": 5.087930745182278e-06, + "loss": 4.3988, + "step": 2949 + }, + { + "epoch": 0.9434527973775211, + "grad_norm": 0.332389771938324, + "learning_rate": 5.031083445354644e-06, + "loss": 4.3827, + "step": 2950 + }, + { + "epoch": 0.9437726118851066, + "grad_norm": 0.3233090341091156, + "learning_rate": 4.9745528232879915e-06, + "loss": 4.3413, + "step": 2951 + }, + { + "epoch": 0.9440924263926922, + "grad_norm": 0.3395127058029175, + "learning_rate": 4.918338939673372e-06, + "loss": 4.4381, + "step": 2952 + }, + { + "epoch": 0.9444122409002779, + "grad_norm": 0.321155846118927, + "learning_rate": 4.862441854861809e-06, + "loss": 4.3916, + "step": 2953 + }, + { + "epoch": 0.9447320554078634, + "grad_norm": 0.33099478483200073, + "learning_rate": 4.806861628864333e-06, + "loss": 4.4219, + "step": 2954 + }, + { + "epoch": 0.9450518699154491, + "grad_norm": 0.33024051785469055, + "learning_rate": 4.751598321351679e-06, + "loss": 4.3912, + "step": 2955 + }, + { + "epoch": 0.9453716844230347, + "grad_norm": 0.3497087359428406, + "learning_rate": 4.6966519916543875e-06, + "loss": 4.3261, + "step": 2956 + }, + { + "epoch": 0.9456914989306202, + "grad_norm": 0.31810522079467773, + "learning_rate": 4.642022698762638e-06, + "loss": 4.2891, + "step": 2957 + }, + { + "epoch": 0.9460113134382059, + "grad_norm": 0.3298070728778839, + "learning_rate": 4.5877105013262805e-06, + "loss": 4.3088, + "step": 2958 + }, + { + "epoch": 0.9463311279457914, + "grad_norm": 0.32999950647354126, + "learning_rate": 4.533715457654741e-06, + "loss": 4.4058, + "step": 2959 + }, + { + "epoch": 0.946650942453377, + "grad_norm": 0.3260367214679718, + "learning_rate": 4.480037625716981e-06, + "loss": 4.3343, + "step": 2960 + }, + { + "epoch": 0.9469707569609627, + "grad_norm": 0.3239196538925171, + "learning_rate": 4.4266770631413374e-06, + "loss": 4.4208, + "step": 2961 + }, + { + "epoch": 0.9472905714685482, + "grad_norm": 0.3258492350578308, + "learning_rate": 4.373633827215517e-06, + "loss": 4.5037, + "step": 2962 + }, + { + "epoch": 0.9476103859761339, + "grad_norm": 0.3244808316230774, + "learning_rate": 4.3209079748866e-06, + "loss": 4.3016, + "step": 2963 + }, + { + "epoch": 0.9479302004837195, + "grad_norm": 0.33479514718055725, + "learning_rate": 4.268499562760907e-06, + "loss": 4.3761, + "step": 2964 + }, + { + "epoch": 0.948250014991305, + "grad_norm": 0.31352558732032776, + "learning_rate": 4.216408647103997e-06, + "loss": 4.3636, + "step": 2965 + }, + { + "epoch": 0.9485698294988907, + "grad_norm": 0.3271879255771637, + "learning_rate": 4.164635283840468e-06, + "loss": 4.3915, + "step": 2966 + }, + { + "epoch": 0.9488896440064762, + "grad_norm": 0.32334235310554504, + "learning_rate": 4.113179528554089e-06, + "loss": 4.3971, + "step": 2967 + }, + { + "epoch": 0.9492094585140618, + "grad_norm": 0.3343781530857086, + "learning_rate": 4.062041436487573e-06, + "loss": 4.3883, + "step": 2968 + }, + { + "epoch": 0.9495292730216475, + "grad_norm": 0.31923946738243103, + "learning_rate": 4.011221062542636e-06, + "loss": 4.2875, + "step": 2969 + }, + { + "epoch": 0.949849087529233, + "grad_norm": 0.334395170211792, + "learning_rate": 3.9607184612799325e-06, + "loss": 4.3944, + "step": 2970 + }, + { + "epoch": 0.9501689020368187, + "grad_norm": 0.33013853430747986, + "learning_rate": 3.910533686918826e-06, + "loss": 4.3853, + "step": 2971 + }, + { + "epoch": 0.9504887165444043, + "grad_norm": 0.3307196795940399, + "learning_rate": 3.860666793337585e-06, + "loss": 4.3385, + "step": 2972 + }, + { + "epoch": 0.9508085310519898, + "grad_norm": 0.3317532241344452, + "learning_rate": 3.811117834073152e-06, + "loss": 4.3694, + "step": 2973 + }, + { + "epoch": 0.9511283455595755, + "grad_norm": 0.3383411467075348, + "learning_rate": 3.761886862321173e-06, + "loss": 4.3688, + "step": 2974 + }, + { + "epoch": 0.951448160067161, + "grad_norm": 0.32362377643585205, + "learning_rate": 3.7129739309358362e-06, + "loss": 4.3074, + "step": 2975 + }, + { + "epoch": 0.9517679745747466, + "grad_norm": 0.32669341564178467, + "learning_rate": 3.664379092429903e-06, + "loss": 4.3258, + "step": 2976 + }, + { + "epoch": 0.9520877890823323, + "grad_norm": 0.32052507996559143, + "learning_rate": 3.6161023989747075e-06, + "loss": 4.4202, + "step": 2977 + }, + { + "epoch": 0.9524076035899178, + "grad_norm": 0.3372294306755066, + "learning_rate": 3.5681439023999224e-06, + "loss": 4.3761, + "step": 2978 + }, + { + "epoch": 0.9527274180975035, + "grad_norm": 0.33484935760498047, + "learning_rate": 3.5205036541936626e-06, + "loss": 4.3767, + "step": 2979 + }, + { + "epoch": 0.9530472326050891, + "grad_norm": 0.3463588356971741, + "learning_rate": 3.4731817055023812e-06, + "loss": 4.3565, + "step": 2980 + }, + { + "epoch": 0.9533670471126746, + "grad_norm": 0.325015664100647, + "learning_rate": 3.4261781071307393e-06, + "loss": 4.3474, + "step": 2981 + }, + { + "epoch": 0.9536868616202603, + "grad_norm": 0.32021352648735046, + "learning_rate": 3.3794929095417034e-06, + "loss": 4.3902, + "step": 2982 + }, + { + "epoch": 0.9540066761278458, + "grad_norm": 0.34606412053108215, + "learning_rate": 3.3331261628563145e-06, + "loss": 4.2884, + "step": 2983 + }, + { + "epoch": 0.9543264906354314, + "grad_norm": 0.3238740861415863, + "learning_rate": 3.2870779168538196e-06, + "loss": 4.3385, + "step": 2984 + }, + { + "epoch": 0.9546463051430171, + "grad_norm": 0.3227710723876953, + "learning_rate": 3.2413482209714737e-06, + "loss": 4.2974, + "step": 2985 + }, + { + "epoch": 0.9549661196506026, + "grad_norm": 0.33221638202667236, + "learning_rate": 3.195937124304504e-06, + "loss": 4.44, + "step": 2986 + }, + { + "epoch": 0.9552859341581883, + "grad_norm": 0.32845816016197205, + "learning_rate": 3.150844675606212e-06, + "loss": 4.2868, + "step": 2987 + }, + { + "epoch": 0.9556057486657739, + "grad_norm": 0.33255472779273987, + "learning_rate": 3.10607092328764e-06, + "loss": 4.3865, + "step": 2988 + }, + { + "epoch": 0.9559255631733594, + "grad_norm": 0.3340242803096771, + "learning_rate": 3.0616159154177366e-06, + "loss": 4.3735, + "step": 2989 + }, + { + "epoch": 0.9562453776809451, + "grad_norm": 0.3314898610115051, + "learning_rate": 3.0174796997233908e-06, + "loss": 4.4163, + "step": 2990 + }, + { + "epoch": 0.9565651921885306, + "grad_norm": 0.33092954754829407, + "learning_rate": 2.973662323588999e-06, + "loss": 4.3312, + "step": 2991 + }, + { + "epoch": 0.9568850066961162, + "grad_norm": 0.3198767900466919, + "learning_rate": 2.930163834056831e-06, + "loss": 4.4544, + "step": 2992 + }, + { + "epoch": 0.9572048212037019, + "grad_norm": 0.3256797194480896, + "learning_rate": 2.8869842778266983e-06, + "loss": 4.401, + "step": 2993 + }, + { + "epoch": 0.9575246357112874, + "grad_norm": 0.32482126355171204, + "learning_rate": 2.844123701256051e-06, + "loss": 4.3991, + "step": 2994 + }, + { + "epoch": 0.9578444502188731, + "grad_norm": 0.34271669387817383, + "learning_rate": 2.801582150359882e-06, + "loss": 4.3053, + "step": 2995 + }, + { + "epoch": 0.9581642647264587, + "grad_norm": 0.32339465618133545, + "learning_rate": 2.7593596708106904e-06, + "loss": 4.2509, + "step": 2996 + }, + { + "epoch": 0.9584840792340442, + "grad_norm": 0.3229231834411621, + "learning_rate": 2.717456307938415e-06, + "loss": 4.356, + "step": 2997 + }, + { + "epoch": 0.9588038937416299, + "grad_norm": 0.3340475559234619, + "learning_rate": 2.6758721067303367e-06, + "loss": 4.4262, + "step": 2998 + }, + { + "epoch": 0.9591237082492154, + "grad_norm": 0.3331576883792877, + "learning_rate": 2.634607111831177e-06, + "loss": 4.3648, + "step": 2999 + }, + { + "epoch": 0.959443522756801, + "grad_norm": 0.3264421820640564, + "learning_rate": 2.5936613675428985e-06, + "loss": 4.3048, + "step": 3000 + }, + { + "epoch": 0.959443522756801, + "eval_loss": 4.379649639129639, + "eval_runtime": 97.9927, + "eval_samples_per_second": 19.359, + "eval_steps_per_second": 4.847, + "step": 3000 + }, + { + "epoch": 0.9597633372643867, + "grad_norm": 0.3356820046901703, + "learning_rate": 2.5530349178247033e-06, + "loss": 4.3879, + "step": 3001 + }, + { + "epoch": 0.9600831517719722, + "grad_norm": 0.3238847553730011, + "learning_rate": 2.512727806293069e-06, + "loss": 4.3802, + "step": 3002 + }, + { + "epoch": 0.9604029662795579, + "grad_norm": 0.32261836528778076, + "learning_rate": 2.4727400762215798e-06, + "loss": 4.3, + "step": 3003 + }, + { + "epoch": 0.9607227807871435, + "grad_norm": 0.34502753615379333, + "learning_rate": 2.4330717705409287e-06, + "loss": 4.2756, + "step": 3004 + }, + { + "epoch": 0.961042595294729, + "grad_norm": 0.321431040763855, + "learning_rate": 2.393722931838882e-06, + "loss": 4.387, + "step": 3005 + }, + { + "epoch": 0.9613624098023147, + "grad_norm": 0.33116859197616577, + "learning_rate": 2.3546936023603134e-06, + "loss": 4.4409, + "step": 3006 + }, + { + "epoch": 0.9616822243099002, + "grad_norm": 0.32576796412467957, + "learning_rate": 2.315983824006906e-06, + "loss": 4.4106, + "step": 3007 + }, + { + "epoch": 0.9620020388174858, + "grad_norm": 0.32816702127456665, + "learning_rate": 2.277593638337416e-06, + "loss": 4.3597, + "step": 3008 + }, + { + "epoch": 0.9623218533250715, + "grad_norm": 0.3323589265346527, + "learning_rate": 2.2395230865674075e-06, + "loss": 4.3409, + "step": 3009 + }, + { + "epoch": 0.962641667832657, + "grad_norm": 0.3263186514377594, + "learning_rate": 2.201772209569319e-06, + "loss": 4.3907, + "step": 3010 + }, + { + "epoch": 0.9629614823402427, + "grad_norm": 0.3258233964443207, + "learning_rate": 2.164341047872398e-06, + "loss": 4.4135, + "step": 3011 + }, + { + "epoch": 0.9632812968478283, + "grad_norm": 0.33841806650161743, + "learning_rate": 2.127229641662598e-06, + "loss": 4.3527, + "step": 3012 + }, + { + "epoch": 0.9636011113554138, + "grad_norm": 0.3296872079372406, + "learning_rate": 2.0904380307826483e-06, + "loss": 4.3534, + "step": 3013 + }, + { + "epoch": 0.9639209258629995, + "grad_norm": 0.3302517533302307, + "learning_rate": 2.053966254731887e-06, + "loss": 4.322, + "step": 3014 + }, + { + "epoch": 0.964240740370585, + "grad_norm": 0.33398133516311646, + "learning_rate": 2.0178143526663248e-06, + "loss": 4.4822, + "step": 3015 + }, + { + "epoch": 0.9645605548781706, + "grad_norm": 0.3158261775970459, + "learning_rate": 1.981982363398549e-06, + "loss": 4.3978, + "step": 3016 + }, + { + "epoch": 0.9648803693857563, + "grad_norm": 0.32655230164527893, + "learning_rate": 1.9464703253976533e-06, + "loss": 4.2977, + "step": 3017 + }, + { + "epoch": 0.9652001838933418, + "grad_norm": 0.330531507730484, + "learning_rate": 1.911278276789241e-06, + "loss": 4.3358, + "step": 3018 + }, + { + "epoch": 0.9655199984009275, + "grad_norm": 0.3409190773963928, + "learning_rate": 1.8764062553554227e-06, + "loss": 4.2726, + "step": 3019 + }, + { + "epoch": 0.9658398129085131, + "grad_norm": 0.33117616176605225, + "learning_rate": 1.8418542985347174e-06, + "loss": 4.4635, + "step": 3020 + }, + { + "epoch": 0.9661596274160986, + "grad_norm": 0.3396887183189392, + "learning_rate": 1.8076224434219523e-06, + "loss": 4.3382, + "step": 3021 + }, + { + "epoch": 0.9664794419236843, + "grad_norm": 0.3328210115432739, + "learning_rate": 1.773710726768396e-06, + "loss": 4.3677, + "step": 3022 + }, + { + "epoch": 0.9667992564312698, + "grad_norm": 0.3235970139503479, + "learning_rate": 1.7401191849815255e-06, + "loss": 4.3789, + "step": 3023 + }, + { + "epoch": 0.9671190709388555, + "grad_norm": 0.32505133748054504, + "learning_rate": 1.7068478541251263e-06, + "loss": 4.3603, + "step": 3024 + }, + { + "epoch": 0.9674388854464411, + "grad_norm": 0.327471524477005, + "learning_rate": 1.673896769919192e-06, + "loss": 4.3199, + "step": 3025 + }, + { + "epoch": 0.9677586999540266, + "grad_norm": 0.3187865614891052, + "learning_rate": 1.6412659677399908e-06, + "loss": 4.3572, + "step": 3026 + }, + { + "epoch": 0.9680785144616123, + "grad_norm": 0.3221952021121979, + "learning_rate": 1.608955482619767e-06, + "loss": 4.369, + "step": 3027 + }, + { + "epoch": 0.9683983289691979, + "grad_norm": 0.3291928172111511, + "learning_rate": 1.5769653492470057e-06, + "loss": 4.3607, + "step": 3028 + }, + { + "epoch": 0.9687181434767834, + "grad_norm": 0.3323615491390228, + "learning_rate": 1.5452956019661678e-06, + "loss": 4.3528, + "step": 3029 + }, + { + "epoch": 0.9690379579843691, + "grad_norm": 0.3374388515949249, + "learning_rate": 1.5139462747778885e-06, + "loss": 4.3617, + "step": 3030 + }, + { + "epoch": 0.9693577724919547, + "grad_norm": 0.3264639973640442, + "learning_rate": 1.4829174013386126e-06, + "loss": 4.3405, + "step": 3031 + }, + { + "epoch": 0.9696775869995403, + "grad_norm": 0.33052435517311096, + "learning_rate": 1.4522090149609256e-06, + "loss": 4.3701, + "step": 3032 + }, + { + "epoch": 0.9699974015071259, + "grad_norm": 0.3322097063064575, + "learning_rate": 1.4218211486132558e-06, + "loss": 4.4074, + "step": 3033 + }, + { + "epoch": 0.9703172160147114, + "grad_norm": 0.3176632225513458, + "learning_rate": 1.3917538349198731e-06, + "loss": 4.3757, + "step": 3034 + }, + { + "epoch": 0.9706370305222971, + "grad_norm": 0.32025620341300964, + "learning_rate": 1.3620071061609894e-06, + "loss": 4.318, + "step": 3035 + }, + { + "epoch": 0.9709568450298827, + "grad_norm": 0.3187826871871948, + "learning_rate": 1.332580994272625e-06, + "loss": 4.2726, + "step": 3036 + }, + { + "epoch": 0.9712766595374682, + "grad_norm": 0.3278151750564575, + "learning_rate": 1.3034755308465428e-06, + "loss": 4.4687, + "step": 3037 + }, + { + "epoch": 0.9715964740450539, + "grad_norm": 0.31633511185646057, + "learning_rate": 1.2746907471302803e-06, + "loss": 4.3152, + "step": 3038 + }, + { + "epoch": 0.9719162885526395, + "grad_norm": 0.3345247805118561, + "learning_rate": 1.2462266740270843e-06, + "loss": 4.3626, + "step": 3039 + }, + { + "epoch": 0.9722361030602251, + "grad_norm": 0.33534467220306396, + "learning_rate": 1.2180833420959436e-06, + "loss": 4.4277, + "step": 3040 + }, + { + "epoch": 0.9725559175678107, + "grad_norm": 0.3333605229854584, + "learning_rate": 1.190260781551422e-06, + "loss": 4.314, + "step": 3041 + }, + { + "epoch": 0.9728757320753962, + "grad_norm": 0.32544615864753723, + "learning_rate": 1.1627590222637594e-06, + "loss": 4.3561, + "step": 3042 + }, + { + "epoch": 0.9731955465829819, + "grad_norm": 0.32639381289482117, + "learning_rate": 1.1355780937587378e-06, + "loss": 4.3612, + "step": 3043 + }, + { + "epoch": 0.9735153610905675, + "grad_norm": 0.33417561650276184, + "learning_rate": 1.1087180252177475e-06, + "loss": 4.3262, + "step": 3044 + }, + { + "epoch": 0.973835175598153, + "grad_norm": 0.33775198459625244, + "learning_rate": 1.0821788454776548e-06, + "loss": 4.4527, + "step": 3045 + }, + { + "epoch": 0.9741549901057387, + "grad_norm": 0.32477089762687683, + "learning_rate": 1.0559605830308682e-06, + "loss": 4.3682, + "step": 3046 + }, + { + "epoch": 0.9744748046133243, + "grad_norm": 0.3236415386199951, + "learning_rate": 1.030063266025205e-06, + "loss": 4.345, + "step": 3047 + }, + { + "epoch": 0.9747946191209099, + "grad_norm": 0.32303833961486816, + "learning_rate": 1.0044869222639917e-06, + "loss": 4.4157, + "step": 3048 + }, + { + "epoch": 0.9751144336284955, + "grad_norm": 0.32153502106666565, + "learning_rate": 9.79231579205897e-07, + "loss": 4.3407, + "step": 3049 + }, + { + "epoch": 0.975434248136081, + "grad_norm": 0.3337421715259552, + "learning_rate": 9.54297263964965e-07, + "loss": 4.2743, + "step": 3050 + }, + { + "epoch": 0.9757540626436667, + "grad_norm": 0.3250197768211365, + "learning_rate": 9.29684003310649e-07, + "loss": 4.4275, + "step": 3051 + }, + { + "epoch": 0.9760738771512523, + "grad_norm": 0.3775787949562073, + "learning_rate": 9.053918236676116e-07, + "loss": 4.4613, + "step": 3052 + }, + { + "epoch": 0.9763936916588378, + "grad_norm": 0.3378108739852905, + "learning_rate": 8.814207511159243e-07, + "loss": 4.4499, + "step": 3053 + }, + { + "epoch": 0.9767135061664235, + "grad_norm": 0.3372562527656555, + "learning_rate": 8.577708113908011e-07, + "loss": 4.48, + "step": 3054 + }, + { + "epoch": 0.977033320674009, + "grad_norm": 0.3256985545158386, + "learning_rate": 8.344420298827981e-07, + "loss": 4.3823, + "step": 3055 + }, + { + "epoch": 0.9773531351815947, + "grad_norm": 0.33073222637176514, + "learning_rate": 8.114344316376143e-07, + "loss": 4.4387, + "step": 3056 + }, + { + "epoch": 0.9776729496891803, + "grad_norm": 0.3283195197582245, + "learning_rate": 7.887480413561243e-07, + "loss": 4.3839, + "step": 3057 + }, + { + "epoch": 0.9779927641967658, + "grad_norm": 0.32535162568092346, + "learning_rate": 7.663828833943786e-07, + "loss": 4.3539, + "step": 3058 + }, + { + "epoch": 0.9783125787043515, + "grad_norm": 0.34874841570854187, + "learning_rate": 7.443389817635371e-07, + "loss": 4.3777, + "step": 3059 + }, + { + "epoch": 0.9786323932119371, + "grad_norm": 0.3269280195236206, + "learning_rate": 7.226163601298685e-07, + "loss": 4.367, + "step": 3060 + }, + { + "epoch": 0.9789522077195226, + "grad_norm": 0.3236599266529083, + "learning_rate": 7.01215041814751e-07, + "loss": 4.4251, + "step": 3061 + }, + { + "epoch": 0.9792720222271083, + "grad_norm": 0.31829267740249634, + "learning_rate": 6.801350497945391e-07, + "loss": 4.3718, + "step": 3062 + }, + { + "epoch": 0.9795918367346939, + "grad_norm": 0.32019856572151184, + "learning_rate": 6.593764067006624e-07, + "loss": 4.2499, + "step": 3063 + }, + { + "epoch": 0.9799116512422795, + "grad_norm": 0.318994402885437, + "learning_rate": 6.389391348195272e-07, + "loss": 4.3874, + "step": 3064 + }, + { + "epoch": 0.9802314657498651, + "grad_norm": 0.32636478543281555, + "learning_rate": 6.188232560925155e-07, + "loss": 4.3053, + "step": 3065 + }, + { + "epoch": 0.9805512802574506, + "grad_norm": 0.34945639967918396, + "learning_rate": 5.990287921160186e-07, + "loss": 4.404, + "step": 3066 + }, + { + "epoch": 0.9808710947650363, + "grad_norm": 0.32556045055389404, + "learning_rate": 5.79555764141304e-07, + "loss": 4.3463, + "step": 3067 + }, + { + "epoch": 0.9811909092726219, + "grad_norm": 0.3282671272754669, + "learning_rate": 5.604041930745485e-07, + "loss": 4.4157, + "step": 3068 + }, + { + "epoch": 0.9815107237802074, + "grad_norm": 0.3228117823600769, + "learning_rate": 5.415740994768048e-07, + "loss": 4.4524, + "step": 3069 + }, + { + "epoch": 0.9818305382877931, + "grad_norm": 0.3230701982975006, + "learning_rate": 5.230655035640352e-07, + "loss": 4.3208, + "step": 3070 + }, + { + "epoch": 0.9821503527953787, + "grad_norm": 0.42493200302124023, + "learning_rate": 5.048784252069782e-07, + "loss": 4.3802, + "step": 3071 + }, + { + "epoch": 0.9824701673029643, + "grad_norm": 0.3209701478481293, + "learning_rate": 4.870128839312815e-07, + "loss": 4.402, + "step": 3072 + }, + { + "epoch": 0.9827899818105499, + "grad_norm": 0.3352445363998413, + "learning_rate": 4.6946889891726903e-07, + "loss": 4.3347, + "step": 3073 + }, + { + "epoch": 0.9831097963181354, + "grad_norm": 0.3174760937690735, + "learning_rate": 4.5224648900017424e-07, + "loss": 4.3444, + "step": 3074 + }, + { + "epoch": 0.9834296108257211, + "grad_norm": 0.3161444664001465, + "learning_rate": 4.353456726699067e-07, + "loss": 4.319, + "step": 3075 + }, + { + "epoch": 0.9837494253333067, + "grad_norm": 0.33450281620025635, + "learning_rate": 4.1876646807111893e-07, + "loss": 4.327, + "step": 3076 + }, + { + "epoch": 0.9840692398408922, + "grad_norm": 0.3306039869785309, + "learning_rate": 4.025088930031728e-07, + "loss": 4.3109, + "step": 3077 + }, + { + "epoch": 0.9843890543484779, + "grad_norm": 0.31524422764778137, + "learning_rate": 3.8657296492023984e-07, + "loss": 4.2918, + "step": 3078 + }, + { + "epoch": 0.9847088688560635, + "grad_norm": 0.32267364859580994, + "learning_rate": 3.709587009309678e-07, + "loss": 4.3028, + "step": 3079 + }, + { + "epoch": 0.9850286833636491, + "grad_norm": 0.3193490505218506, + "learning_rate": 3.5566611779888066e-07, + "loss": 4.2841, + "step": 3080 + }, + { + "epoch": 0.9853484978712347, + "grad_norm": 0.32063576579093933, + "learning_rate": 3.406952319420453e-07, + "loss": 4.165, + "step": 3081 + }, + { + "epoch": 0.9856683123788202, + "grad_norm": 0.3379741311073303, + "learning_rate": 3.260460594330716e-07, + "loss": 4.406, + "step": 3082 + }, + { + "epoch": 0.9859881268864059, + "grad_norm": 0.3285538852214813, + "learning_rate": 3.1171861599937896e-07, + "loss": 4.4421, + "step": 3083 + }, + { + "epoch": 0.9863079413939915, + "grad_norm": 0.33019211888313293, + "learning_rate": 2.9771291702279655e-07, + "loss": 4.4411, + "step": 3084 + }, + { + "epoch": 0.986627755901577, + "grad_norm": 0.32944613695144653, + "learning_rate": 2.840289775398297e-07, + "loss": 4.4086, + "step": 3085 + }, + { + "epoch": 0.9869475704091627, + "grad_norm": 0.32963770627975464, + "learning_rate": 2.7066681224149344e-07, + "loss": 4.387, + "step": 3086 + }, + { + "epoch": 0.9872673849167483, + "grad_norm": 0.32858437299728394, + "learning_rate": 2.5762643547337924e-07, + "loss": 4.3697, + "step": 3087 + }, + { + "epoch": 0.9875871994243339, + "grad_norm": 0.33206164836883545, + "learning_rate": 2.4490786123562144e-07, + "loss": 4.3878, + "step": 3088 + }, + { + "epoch": 0.9879070139319195, + "grad_norm": 0.332179993391037, + "learning_rate": 2.3251110318283083e-07, + "loss": 4.3124, + "step": 3089 + }, + { + "epoch": 0.988226828439505, + "grad_norm": 0.32216018438339233, + "learning_rate": 2.204361746241279e-07, + "loss": 4.3728, + "step": 3090 + }, + { + "epoch": 0.9885466429470907, + "grad_norm": 0.33033496141433716, + "learning_rate": 2.0868308852310943e-07, + "loss": 4.3396, + "step": 3091 + }, + { + "epoch": 0.9888664574546763, + "grad_norm": 0.32683953642845154, + "learning_rate": 1.9725185749784879e-07, + "loss": 4.3295, + "step": 3092 + }, + { + "epoch": 0.9891862719622619, + "grad_norm": 0.3205666244029999, + "learning_rate": 1.861424938208955e-07, + "loss": 4.2917, + "step": 3093 + }, + { + "epoch": 0.9895060864698475, + "grad_norm": 0.3305319547653198, + "learning_rate": 1.753550094192424e-07, + "loss": 4.3785, + "step": 3094 + }, + { + "epoch": 0.9898259009774331, + "grad_norm": 0.3250020444393158, + "learning_rate": 1.6488941587429193e-07, + "loss": 4.3505, + "step": 3095 + }, + { + "epoch": 0.9901457154850187, + "grad_norm": 0.33178406953811646, + "learning_rate": 1.547457244218564e-07, + "loss": 4.3275, + "step": 3096 + }, + { + "epoch": 0.9904655299926043, + "grad_norm": 0.32208383083343506, + "learning_rate": 1.4492394595219115e-07, + "loss": 4.3009, + "step": 3097 + }, + { + "epoch": 0.9907853445001898, + "grad_norm": 0.3235897123813629, + "learning_rate": 1.3542409100992802e-07, + "loss": 4.3555, + "step": 3098 + }, + { + "epoch": 0.9911051590077755, + "grad_norm": 0.3169964551925659, + "learning_rate": 1.2624616979407532e-07, + "loss": 4.388, + "step": 3099 + }, + { + "epoch": 0.9914249735153611, + "grad_norm": 0.33814671635627747, + "learning_rate": 1.1739019215801781e-07, + "loss": 4.4036, + "step": 3100 + }, + { + "epoch": 0.9914249735153611, + "eval_loss": 4.37929630279541, + "eval_runtime": 85.0817, + "eval_samples_per_second": 22.296, + "eval_steps_per_second": 5.583, + "step": 3100 + }, + { + "epoch": 0.9917447880229467, + "grad_norm": 0.3211912214756012, + "learning_rate": 1.0885616760951676e-07, + "loss": 4.3015, + "step": 3101 + }, + { + "epoch": 0.9920646025305323, + "grad_norm": 0.31432220339775085, + "learning_rate": 1.0064410531067657e-07, + "loss": 4.3175, + "step": 3102 + }, + { + "epoch": 0.9923844170381179, + "grad_norm": 0.34299880266189575, + "learning_rate": 9.27540140779448e-08, + "loss": 4.4855, + "step": 3103 + }, + { + "epoch": 0.9927042315457035, + "grad_norm": 0.33430150151252747, + "learning_rate": 8.51859023821122e-08, + "loss": 4.4051, + "step": 3104 + }, + { + "epoch": 0.9930240460532891, + "grad_norm": 0.3370993137359619, + "learning_rate": 7.793977834824605e-08, + "loss": 4.3902, + "step": 3105 + }, + { + "epoch": 0.9933438605608746, + "grad_norm": 0.32714369893074036, + "learning_rate": 7.101564975579011e-08, + "loss": 4.3716, + "step": 3106 + }, + { + "epoch": 0.9936636750684603, + "grad_norm": 0.3260760009288788, + "learning_rate": 6.441352403849798e-08, + "loss": 4.3759, + "step": 3107 + }, + { + "epoch": 0.9939834895760459, + "grad_norm": 0.33366405963897705, + "learning_rate": 5.813340828429991e-08, + "loss": 4.3563, + "step": 3108 + }, + { + "epoch": 0.9943033040836315, + "grad_norm": 0.32677289843559265, + "learning_rate": 5.217530923560254e-08, + "loss": 4.3215, + "step": 3109 + }, + { + "epoch": 0.9946231185912171, + "grad_norm": 0.3401174545288086, + "learning_rate": 4.6539233288955816e-08, + "loss": 4.4004, + "step": 3110 + }, + { + "epoch": 0.9949429330988027, + "grad_norm": 0.37048086524009705, + "learning_rate": 4.122518649525286e-08, + "loss": 4.3808, + "step": 3111 + }, + { + "epoch": 0.9952627476063883, + "grad_norm": 0.3258591890335083, + "learning_rate": 3.623317455959673e-08, + "loss": 4.3797, + "step": 3112 + }, + { + "epoch": 0.9955825621139739, + "grad_norm": 0.32999947667121887, + "learning_rate": 3.156320284146696e-08, + "loss": 4.2535, + "step": 3113 + }, + { + "epoch": 0.9959023766215594, + "grad_norm": 0.33407601714134216, + "learning_rate": 2.7215276354486393e-08, + "loss": 4.3589, + "step": 3114 + }, + { + "epoch": 0.9962221911291451, + "grad_norm": 0.32332849502563477, + "learning_rate": 2.3189399766587735e-08, + "loss": 4.316, + "step": 3115 + }, + { + "epoch": 0.9965420056367307, + "grad_norm": 0.3339650332927704, + "learning_rate": 1.948557739994694e-08, + "loss": 4.4301, + "step": 3116 + }, + { + "epoch": 0.9968618201443163, + "grad_norm": 0.3142257034778595, + "learning_rate": 1.6103813230949892e-08, + "loss": 4.3146, + "step": 3117 + }, + { + "epoch": 0.9971816346519019, + "grad_norm": 0.31975337862968445, + "learning_rate": 1.3044110890292336e-08, + "loss": 4.4019, + "step": 3118 + }, + { + "epoch": 0.9975014491594875, + "grad_norm": 0.3598925471305847, + "learning_rate": 1.0306473662813341e-08, + "loss": 4.3411, + "step": 3119 + }, + { + "epoch": 0.9978212636670731, + "grad_norm": 0.3364954888820648, + "learning_rate": 7.89090448766183e-09, + "loss": 4.3687, + "step": 3120 + }, + { + "epoch": 0.9981410781746587, + "grad_norm": 0.32584497332572937, + "learning_rate": 5.7974059581633595e-09, + "loss": 4.3917, + "step": 3121 + }, + { + "epoch": 0.9984608926822442, + "grad_norm": 0.32605403661727905, + "learning_rate": 4.02598032192003e-09, + "loss": 4.4047, + "step": 3122 + }, + { + "epoch": 0.9987807071898299, + "grad_norm": 0.31860044598579407, + "learning_rate": 2.5766294807438858e-09, + "loss": 4.2947, + "step": 3123 + }, + { + "epoch": 0.9991005216974155, + "grad_norm": 0.32749542593955994, + "learning_rate": 1.4493549905902902e-09, + "loss": 4.3667, + "step": 3124 + }, + { + "epoch": 0.9994203362050011, + "grad_norm": 0.3176465630531311, + "learning_rate": 6.441580617577713e-10, + "loss": 4.3476, + "step": 3125 + }, + { + "epoch": 0.9997401507125867, + "grad_norm": 0.32228946685791016, + "learning_rate": 1.6103955865487407e-10, + "loss": 4.3319, + "step": 3126 + }, + { + "epoch": 0.9997401507125867, + "step": 3126, + "total_flos": 1.503421305716736e+17, + "train_loss": 4.916973250231068, + "train_runtime": 23463.9242, + "train_samples_per_second": 8.529, + "train_steps_per_second": 0.133 + }, + { + "epoch": 0.9997401507125867, + "eval_loss": 4.379289150238037, + "eval_runtime": 88.5733, + "eval_samples_per_second": 21.417, + "eval_steps_per_second": 5.363, + "step": 3126 + } + ], + "logging_steps": 1, + "max_steps": 3126, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.503421305716736e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}