{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997401507125867, "eval_steps": 100, "global_step": 3126, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00031981450758560037, "grad_norm": 0.8101146817207336, "learning_rate": 0.0, "loss": 11.9211, "step": 1 }, { "epoch": 0.0006396290151712007, "grad_norm": 0.7823522090911865, "learning_rate": 6.382978723404255e-06, "loss": 11.9218, "step": 2 }, { "epoch": 0.0009594435227568011, "grad_norm": 0.8412644267082214, "learning_rate": 1.276595744680851e-05, "loss": 11.8978, "step": 3 }, { "epoch": 0.0012792580303424015, "grad_norm": 0.8397166132926941, "learning_rate": 1.9148936170212762e-05, "loss": 11.8575, "step": 4 }, { "epoch": 0.0015990725379280018, "grad_norm": 0.8257705569267273, "learning_rate": 2.553191489361702e-05, "loss": 11.8123, "step": 5 }, { "epoch": 0.0019188870455136022, "grad_norm": 0.8732887506484985, "learning_rate": 3.1914893617021275e-05, "loss": 11.7332, "step": 6 }, { "epoch": 0.0022387015530992023, "grad_norm": 0.9519888758659363, "learning_rate": 3.8297872340425525e-05, "loss": 11.6434, "step": 7 }, { "epoch": 0.002558516060684803, "grad_norm": 1.003143072128296, "learning_rate": 4.468085106382978e-05, "loss": 11.552, "step": 8 }, { "epoch": 0.002878330568270403, "grad_norm": 1.1833689212799072, "learning_rate": 5.106382978723404e-05, "loss": 11.4068, "step": 9 }, { "epoch": 0.0031981450758560037, "grad_norm": 1.3067516088485718, "learning_rate": 5.7446808510638294e-05, "loss": 11.303, "step": 10 }, { "epoch": 0.003517959583441604, "grad_norm": 1.4812865257263184, "learning_rate": 6.382978723404255e-05, "loss": 11.1217, "step": 11 }, { "epoch": 0.0038377740910272044, "grad_norm": 1.7294337749481201, "learning_rate": 7.02127659574468e-05, "loss": 10.9054, "step": 12 }, { "epoch": 0.0041575885986128045, "grad_norm": 1.7894771099090576, "learning_rate": 7.659574468085105e-05, "loss": 10.7686, "step": 13 }, { "epoch": 0.004477403106198405, "grad_norm": 1.8553861379623413, "learning_rate": 8.297872340425531e-05, "loss": 10.6206, "step": 14 }, { "epoch": 0.004797217613784005, "grad_norm": 1.9344090223312378, "learning_rate": 8.936170212765956e-05, "loss": 10.4684, "step": 15 }, { "epoch": 0.005117032121369606, "grad_norm": 1.9899150133132935, "learning_rate": 9.574468085106382e-05, "loss": 10.3348, "step": 16 }, { "epoch": 0.005436846628955206, "grad_norm": 1.9777766466140747, "learning_rate": 0.00010212765957446807, "loss": 10.2333, "step": 17 }, { "epoch": 0.005756661136540806, "grad_norm": 1.972184658050537, "learning_rate": 0.00010851063829787234, "loss": 10.1109, "step": 18 }, { "epoch": 0.006076475644126406, "grad_norm": 1.9564199447631836, "learning_rate": 0.00011489361702127659, "loss": 10.0072, "step": 19 }, { "epoch": 0.006396290151712007, "grad_norm": 1.8650798797607422, "learning_rate": 0.00012127659574468084, "loss": 9.9349, "step": 20 }, { "epoch": 0.0067161046592976075, "grad_norm": 1.8822625875473022, "learning_rate": 0.0001276595744680851, "loss": 9.7968, "step": 21 }, { "epoch": 0.007035919166883208, "grad_norm": 1.845629334449768, "learning_rate": 0.00013404255319148935, "loss": 9.6948, "step": 22 }, { "epoch": 0.007355733674468808, "grad_norm": 1.7649613618850708, "learning_rate": 0.0001404255319148936, "loss": 9.6186, "step": 23 }, { "epoch": 0.007675548182054409, "grad_norm": 1.8104851245880127, "learning_rate": 0.00014680851063829785, "loss": 9.4502, "step": 24 }, { "epoch": 0.007995362689640009, "grad_norm": 1.757771611213684, "learning_rate": 0.0001531914893617021, "loss": 9.3377, "step": 25 }, { "epoch": 0.008315177197225609, "grad_norm": 1.7168152332305908, "learning_rate": 0.00015957446808510637, "loss": 9.2014, "step": 26 }, { "epoch": 0.00863499170481121, "grad_norm": 1.626467227935791, "learning_rate": 0.00016595744680851062, "loss": 9.0764, "step": 27 }, { "epoch": 0.00895480621239681, "grad_norm": 1.5004876852035522, "learning_rate": 0.0001723404255319149, "loss": 8.9716, "step": 28 }, { "epoch": 0.00927462071998241, "grad_norm": 1.4192017316818237, "learning_rate": 0.00017872340425531912, "loss": 8.8541, "step": 29 }, { "epoch": 0.00959443522756801, "grad_norm": 1.3647350072860718, "learning_rate": 0.0001851063829787234, "loss": 8.719, "step": 30 }, { "epoch": 0.009914249735153612, "grad_norm": 1.2844560146331787, "learning_rate": 0.00019148936170212765, "loss": 8.5866, "step": 31 }, { "epoch": 0.010234064242739212, "grad_norm": 1.2200034856796265, "learning_rate": 0.00019787234042553187, "loss": 8.4658, "step": 32 }, { "epoch": 0.010553878750324812, "grad_norm": 1.1399619579315186, "learning_rate": 0.00020425531914893615, "loss": 8.3508, "step": 33 }, { "epoch": 0.010873693257910412, "grad_norm": 1.028397798538208, "learning_rate": 0.0002106382978723404, "loss": 8.2486, "step": 34 }, { "epoch": 0.011193507765496012, "grad_norm": 0.9449910521507263, "learning_rate": 0.00021702127659574468, "loss": 8.1285, "step": 35 }, { "epoch": 0.011513322273081612, "grad_norm": 0.8323965668678284, "learning_rate": 0.0002234042553191489, "loss": 8.0544, "step": 36 }, { "epoch": 0.011833136780667212, "grad_norm": 0.8072855472564697, "learning_rate": 0.00022978723404255317, "loss": 7.9962, "step": 37 }, { "epoch": 0.012152951288252813, "grad_norm": 0.731778621673584, "learning_rate": 0.00023617021276595742, "loss": 7.8723, "step": 38 }, { "epoch": 0.012472765795838415, "grad_norm": 0.6545089483261108, "learning_rate": 0.00024255319148936167, "loss": 7.898, "step": 39 }, { "epoch": 0.012792580303424015, "grad_norm": 0.547839343547821, "learning_rate": 0.0002489361702127659, "loss": 7.7362, "step": 40 }, { "epoch": 0.013112394811009615, "grad_norm": 0.5083307027816772, "learning_rate": 0.0002553191489361702, "loss": 7.7618, "step": 41 }, { "epoch": 0.013432209318595215, "grad_norm": 0.4308398365974426, "learning_rate": 0.0002617021276595745, "loss": 7.6737, "step": 42 }, { "epoch": 0.013752023826180815, "grad_norm": 0.3842655122280121, "learning_rate": 0.0002680851063829787, "loss": 7.6312, "step": 43 }, { "epoch": 0.014071838333766415, "grad_norm": 0.35573652386665344, "learning_rate": 0.000274468085106383, "loss": 7.6023, "step": 44 }, { "epoch": 0.014391652841352015, "grad_norm": 0.46006205677986145, "learning_rate": 0.0002808510638297872, "loss": 7.4966, "step": 45 }, { "epoch": 0.014711467348937616, "grad_norm": 0.558365523815155, "learning_rate": 0.0002872340425531915, "loss": 7.5102, "step": 46 }, { "epoch": 0.015031281856523216, "grad_norm": 0.5497884154319763, "learning_rate": 0.0002936170212765957, "loss": 7.495, "step": 47 }, { "epoch": 0.015351096364108818, "grad_norm": 0.516185998916626, "learning_rate": 0.0003, "loss": 7.5394, "step": 48 }, { "epoch": 0.015670910871694418, "grad_norm": 0.46434205770492554, "learning_rate": 0.0003063829787234042, "loss": 7.4388, "step": 49 }, { "epoch": 0.015990725379280018, "grad_norm": 0.4425893723964691, "learning_rate": 0.0003127659574468085, "loss": 7.4612, "step": 50 }, { "epoch": 0.016310539886865618, "grad_norm": 0.3779885172843933, "learning_rate": 0.00031914893617021275, "loss": 7.4265, "step": 51 }, { "epoch": 0.016630354394451218, "grad_norm": 0.4956253170967102, "learning_rate": 0.00032553191489361697, "loss": 7.4218, "step": 52 }, { "epoch": 0.01695016890203682, "grad_norm": 0.32399898767471313, "learning_rate": 0.00033191489361702125, "loss": 7.3844, "step": 53 }, { "epoch": 0.01726998340962242, "grad_norm": 0.41598471999168396, "learning_rate": 0.00033829787234042547, "loss": 7.3073, "step": 54 }, { "epoch": 0.01758979791720802, "grad_norm": 0.32356274127960205, "learning_rate": 0.0003446808510638298, "loss": 7.2783, "step": 55 }, { "epoch": 0.01790961242479362, "grad_norm": 0.3733367919921875, "learning_rate": 0.000351063829787234, "loss": 7.306, "step": 56 }, { "epoch": 0.01822942693237922, "grad_norm": 0.517358660697937, "learning_rate": 0.00035744680851063825, "loss": 7.1725, "step": 57 }, { "epoch": 0.01854924143996482, "grad_norm": 0.507341742515564, "learning_rate": 0.0003638297872340425, "loss": 7.2059, "step": 58 }, { "epoch": 0.01886905594755042, "grad_norm": 0.5811232924461365, "learning_rate": 0.0003702127659574468, "loss": 7.1753, "step": 59 }, { "epoch": 0.01918887045513602, "grad_norm": 0.7594526410102844, "learning_rate": 0.000376595744680851, "loss": 7.1748, "step": 60 }, { "epoch": 0.019508684962721623, "grad_norm": 0.43021154403686523, "learning_rate": 0.0003829787234042553, "loss": 7.0965, "step": 61 }, { "epoch": 0.019828499470307223, "grad_norm": 0.5109601616859436, "learning_rate": 0.0003893617021276595, "loss": 7.0344, "step": 62 }, { "epoch": 0.020148313977892823, "grad_norm": 0.7585037350654602, "learning_rate": 0.00039574468085106374, "loss": 7.109, "step": 63 }, { "epoch": 0.020468128485478423, "grad_norm": 0.4438766539096832, "learning_rate": 0.0004021276595744681, "loss": 7.1915, "step": 64 }, { "epoch": 0.020787942993064024, "grad_norm": 1.0357528924942017, "learning_rate": 0.0004085106382978723, "loss": 7.0837, "step": 65 }, { "epoch": 0.021107757500649624, "grad_norm": 1.0639543533325195, "learning_rate": 0.0004148936170212766, "loss": 7.0189, "step": 66 }, { "epoch": 0.021427572008235224, "grad_norm": 1.2553629875183105, "learning_rate": 0.0004212765957446808, "loss": 7.1504, "step": 67 }, { "epoch": 0.021747386515820824, "grad_norm": 0.788976788520813, "learning_rate": 0.0004276595744680851, "loss": 7.0696, "step": 68 }, { "epoch": 0.022067201023406424, "grad_norm": 0.8938030004501343, "learning_rate": 0.00043404255319148935, "loss": 7.0524, "step": 69 }, { "epoch": 0.022387015530992024, "grad_norm": 0.8874337673187256, "learning_rate": 0.00044042553191489357, "loss": 7.0474, "step": 70 }, { "epoch": 0.022706830038577624, "grad_norm": 0.4377603828907013, "learning_rate": 0.0004468085106382978, "loss": 6.9641, "step": 71 }, { "epoch": 0.023026644546163225, "grad_norm": 0.7803600430488586, "learning_rate": 0.0004531914893617021, "loss": 6.9051, "step": 72 }, { "epoch": 0.023346459053748825, "grad_norm": 0.4542195498943329, "learning_rate": 0.00045957446808510635, "loss": 6.8628, "step": 73 }, { "epoch": 0.023666273561334425, "grad_norm": 0.47478944063186646, "learning_rate": 0.00046595744680851057, "loss": 6.9279, "step": 74 }, { "epoch": 0.023986088068920025, "grad_norm": 0.6075134873390198, "learning_rate": 0.00047234042553191485, "loss": 6.8808, "step": 75 }, { "epoch": 0.024305902576505625, "grad_norm": 0.3671388328075409, "learning_rate": 0.0004787234042553191, "loss": 6.8134, "step": 76 }, { "epoch": 0.024625717084091225, "grad_norm": 0.4649251401424408, "learning_rate": 0.00048510638297872335, "loss": 6.8064, "step": 77 }, { "epoch": 0.02494553159167683, "grad_norm": 0.504503071308136, "learning_rate": 0.0004914893617021277, "loss": 6.8879, "step": 78 }, { "epoch": 0.02526534609926243, "grad_norm": 0.4091486632823944, "learning_rate": 0.0004978723404255318, "loss": 6.8009, "step": 79 }, { "epoch": 0.02558516060684803, "grad_norm": 0.4504472315311432, "learning_rate": 0.0005042553191489361, "loss": 6.775, "step": 80 }, { "epoch": 0.02590497511443363, "grad_norm": 0.33001548051834106, "learning_rate": 0.0005106382978723404, "loss": 6.8068, "step": 81 }, { "epoch": 0.02622478962201923, "grad_norm": 0.4123372435569763, "learning_rate": 0.0005170212765957446, "loss": 6.6953, "step": 82 }, { "epoch": 0.02654460412960483, "grad_norm": 0.36776983737945557, "learning_rate": 0.000523404255319149, "loss": 6.755, "step": 83 }, { "epoch": 0.02686441863719043, "grad_norm": 0.3411658704280853, "learning_rate": 0.0005297872340425531, "loss": 6.7129, "step": 84 }, { "epoch": 0.02718423314477603, "grad_norm": 0.313671737909317, "learning_rate": 0.0005361702127659574, "loss": 6.6733, "step": 85 }, { "epoch": 0.02750404765236163, "grad_norm": 0.3086223602294922, "learning_rate": 0.0005425531914893617, "loss": 6.6093, "step": 86 }, { "epoch": 0.02782386215994723, "grad_norm": 0.32093822956085205, "learning_rate": 0.000548936170212766, "loss": 6.6547, "step": 87 }, { "epoch": 0.02814367666753283, "grad_norm": 0.3176129162311554, "learning_rate": 0.0005553191489361701, "loss": 6.6388, "step": 88 }, { "epoch": 0.02846349117511843, "grad_norm": 0.3457834720611572, "learning_rate": 0.0005617021276595744, "loss": 6.5775, "step": 89 }, { "epoch": 0.02878330568270403, "grad_norm": 0.35802027583122253, "learning_rate": 0.0005680851063829787, "loss": 6.61, "step": 90 }, { "epoch": 0.02910312019028963, "grad_norm": 0.34588831663131714, "learning_rate": 0.000574468085106383, "loss": 6.5866, "step": 91 }, { "epoch": 0.02942293469787523, "grad_norm": 0.3585263788700104, "learning_rate": 0.0005808510638297872, "loss": 6.5106, "step": 92 }, { "epoch": 0.02974274920546083, "grad_norm": 0.3470555245876312, "learning_rate": 0.0005872340425531914, "loss": 6.4859, "step": 93 }, { "epoch": 0.03006256371304643, "grad_norm": 0.3274666666984558, "learning_rate": 0.0005936170212765957, "loss": 6.5668, "step": 94 }, { "epoch": 0.030382378220632035, "grad_norm": 0.28370654582977295, "learning_rate": 0.0006, "loss": 6.5063, "step": 95 }, { "epoch": 0.030702192728217635, "grad_norm": 0.344675213098526, "learning_rate": 0.0005999998389604413, "loss": 6.5151, "step": 96 }, { "epoch": 0.031022007235803235, "grad_norm": 0.3831152617931366, "learning_rate": 0.0005999993558419382, "loss": 6.5066, "step": 97 }, { "epoch": 0.031341821743388835, "grad_norm": 0.35721099376678467, "learning_rate": 0.0005999985506450094, "loss": 6.5545, "step": 98 }, { "epoch": 0.031661636250974436, "grad_norm": 0.4410526156425476, "learning_rate": 0.0005999974233705192, "loss": 6.5003, "step": 99 }, { "epoch": 0.031981450758560036, "grad_norm": 0.4082791805267334, "learning_rate": 0.000599995974019678, "loss": 6.5629, "step": 100 }, { "epoch": 0.031981450758560036, "eval_runtime": 46.1192, "eval_samples_per_second": 41.133, "eval_steps_per_second": 10.299, "step": 100 }, { "epoch": 0.032301265266145636, "grad_norm": 0.32218196988105774, "learning_rate": 0.0005999942025940418, "loss": 6.4668, "step": 101 }, { "epoch": 0.032621079773731236, "grad_norm": 0.364271342754364, "learning_rate": 0.0005999921090955123, "loss": 6.5182, "step": 102 }, { "epoch": 0.032940894281316836, "grad_norm": 0.40794607996940613, "learning_rate": 0.0005999896935263372, "loss": 6.5041, "step": 103 }, { "epoch": 0.033260708788902436, "grad_norm": 0.4575202763080597, "learning_rate": 0.0005999869558891097, "loss": 6.5551, "step": 104 }, { "epoch": 0.033580523296488037, "grad_norm": 0.37083232402801514, "learning_rate": 0.000599983896186769, "loss": 6.4738, "step": 105 }, { "epoch": 0.03390033780407364, "grad_norm": 0.3427277207374573, "learning_rate": 0.0005999805144226, "loss": 6.4142, "step": 106 }, { "epoch": 0.03422015231165924, "grad_norm": 0.3857482075691223, "learning_rate": 0.0005999768106002334, "loss": 6.4109, "step": 107 }, { "epoch": 0.03453996681924484, "grad_norm": 0.3202458620071411, "learning_rate": 0.0005999727847236454, "loss": 6.4334, "step": 108 }, { "epoch": 0.03485978132683044, "grad_norm": 0.3509925305843353, "learning_rate": 0.0005999684367971584, "loss": 6.3379, "step": 109 }, { "epoch": 0.03517959583441604, "grad_norm": 0.33614200353622437, "learning_rate": 0.0005999637668254403, "loss": 6.3766, "step": 110 }, { "epoch": 0.03549941034200164, "grad_norm": 0.29926881194114685, "learning_rate": 0.0005999587748135047, "loss": 6.2963, "step": 111 }, { "epoch": 0.03581922484958724, "grad_norm": 0.34271731972694397, "learning_rate": 0.000599953460766711, "loss": 6.351, "step": 112 }, { "epoch": 0.03613903935717284, "grad_norm": 0.34930479526519775, "learning_rate": 0.0005999478246907643, "loss": 6.4004, "step": 113 }, { "epoch": 0.03645885386475844, "grad_norm": 0.3114824593067169, "learning_rate": 0.0005999418665917157, "loss": 6.2976, "step": 114 }, { "epoch": 0.03677866837234404, "grad_norm": 0.3479638993740082, "learning_rate": 0.0005999355864759614, "loss": 6.3891, "step": 115 }, { "epoch": 0.03709848287992964, "grad_norm": 0.3126205503940582, "learning_rate": 0.0005999289843502441, "loss": 6.3475, "step": 116 }, { "epoch": 0.03741829738751524, "grad_norm": 0.320515513420105, "learning_rate": 0.0005999220602216517, "loss": 6.2821, "step": 117 }, { "epoch": 0.03773811189510084, "grad_norm": 0.329067587852478, "learning_rate": 0.0005999148140976179, "loss": 6.3336, "step": 118 }, { "epoch": 0.03805792640268644, "grad_norm": 0.29812154173851013, "learning_rate": 0.0005999072459859221, "loss": 6.2908, "step": 119 }, { "epoch": 0.03837774091027204, "grad_norm": 0.2918648421764374, "learning_rate": 0.0005998993558946892, "loss": 6.2169, "step": 120 }, { "epoch": 0.038697555417857646, "grad_norm": 0.36118754744529724, "learning_rate": 0.0005998911438323904, "loss": 6.3045, "step": 121 }, { "epoch": 0.039017369925443246, "grad_norm": 0.371866375207901, "learning_rate": 0.000599882609807842, "loss": 6.2067, "step": 122 }, { "epoch": 0.039337184433028846, "grad_norm": 0.34631702303886414, "learning_rate": 0.000599873753830206, "loss": 6.2903, "step": 123 }, { "epoch": 0.039656998940614446, "grad_norm": 0.2922532260417938, "learning_rate": 0.0005998645759089901, "loss": 6.246, "step": 124 }, { "epoch": 0.039976813448200046, "grad_norm": 0.31321561336517334, "learning_rate": 0.0005998550760540478, "loss": 6.241, "step": 125 }, { "epoch": 0.04029662795578565, "grad_norm": 0.3488491177558899, "learning_rate": 0.000599845254275578, "loss": 6.1873, "step": 126 }, { "epoch": 0.04061644246337125, "grad_norm": 0.37727785110473633, "learning_rate": 0.0005998351105841257, "loss": 6.3039, "step": 127 }, { "epoch": 0.04093625697095685, "grad_norm": 0.3407670855522156, "learning_rate": 0.0005998246449905807, "loss": 6.1843, "step": 128 }, { "epoch": 0.04125607147854245, "grad_norm": 0.42200320959091187, "learning_rate": 0.0005998138575061791, "loss": 6.1552, "step": 129 }, { "epoch": 0.04157588598612805, "grad_norm": 0.45605847239494324, "learning_rate": 0.000599802748142502, "loss": 6.2322, "step": 130 }, { "epoch": 0.04189570049371365, "grad_norm": 0.34003040194511414, "learning_rate": 0.0005997913169114768, "loss": 6.1414, "step": 131 }, { "epoch": 0.04221551500129925, "grad_norm": 0.3052242696285248, "learning_rate": 0.0005997795638253759, "loss": 6.1699, "step": 132 }, { "epoch": 0.04253532950888485, "grad_norm": 0.38838160037994385, "learning_rate": 0.0005997674888968171, "loss": 6.252, "step": 133 }, { "epoch": 0.04285514401647045, "grad_norm": 0.28517138957977295, "learning_rate": 0.0005997550921387643, "loss": 6.1289, "step": 134 }, { "epoch": 0.04317495852405605, "grad_norm": 0.3472943902015686, "learning_rate": 0.0005997423735645265, "loss": 6.1262, "step": 135 }, { "epoch": 0.04349477303164165, "grad_norm": 0.3441876471042633, "learning_rate": 0.0005997293331877584, "loss": 6.1377, "step": 136 }, { "epoch": 0.04381458753922725, "grad_norm": 0.33104434609413147, "learning_rate": 0.0005997159710224602, "loss": 6.1146, "step": 137 }, { "epoch": 0.04413440204681285, "grad_norm": 0.35501518845558167, "learning_rate": 0.0005997022870829771, "loss": 6.0823, "step": 138 }, { "epoch": 0.04445421655439845, "grad_norm": 0.35409846901893616, "learning_rate": 0.0005996882813840005, "loss": 6.0689, "step": 139 }, { "epoch": 0.04477403106198405, "grad_norm": 0.3716523349285126, "learning_rate": 0.0005996739539405668, "loss": 6.1359, "step": 140 }, { "epoch": 0.04509384556956965, "grad_norm": 0.33184531331062317, "learning_rate": 0.0005996593047680579, "loss": 6.0616, "step": 141 }, { "epoch": 0.04541366007715525, "grad_norm": 0.34964922070503235, "learning_rate": 0.0005996443338822011, "loss": 6.142, "step": 142 }, { "epoch": 0.04573347458474085, "grad_norm": 0.32616034150123596, "learning_rate": 0.000599629041299069, "loss": 6.0785, "step": 143 }, { "epoch": 0.04605328909232645, "grad_norm": 0.30416902899742126, "learning_rate": 0.0005996134270350797, "loss": 6.1104, "step": 144 }, { "epoch": 0.04637310359991205, "grad_norm": 0.3815024197101593, "learning_rate": 0.0005995974911069968, "loss": 6.0285, "step": 145 }, { "epoch": 0.04669291810749765, "grad_norm": 0.5830708146095276, "learning_rate": 0.0005995812335319289, "loss": 6.097, "step": 146 }, { "epoch": 0.04701273261508325, "grad_norm": 0.7699606418609619, "learning_rate": 0.0005995646543273301, "loss": 6.1042, "step": 147 }, { "epoch": 0.04733254712266885, "grad_norm": 0.4435134530067444, "learning_rate": 0.0005995477535109998, "loss": 6.1143, "step": 148 }, { "epoch": 0.04765236163025445, "grad_norm": 0.47873392701148987, "learning_rate": 0.0005995305311010826, "loss": 6.0679, "step": 149 }, { "epoch": 0.04797217613784005, "grad_norm": 0.4431877136230469, "learning_rate": 0.0005995129871160688, "loss": 6.0625, "step": 150 }, { "epoch": 0.04829199064542565, "grad_norm": 0.6152800917625427, "learning_rate": 0.000599495121574793, "loss": 6.1348, "step": 151 }, { "epoch": 0.04861180515301125, "grad_norm": 0.4578916132450104, "learning_rate": 0.0005994769344964359, "loss": 6.0972, "step": 152 }, { "epoch": 0.04893161966059685, "grad_norm": 0.39392390847206116, "learning_rate": 0.0005994584259005232, "loss": 6.082, "step": 153 }, { "epoch": 0.04925143416818245, "grad_norm": 0.4243738651275635, "learning_rate": 0.0005994395958069254, "loss": 6.0196, "step": 154 }, { "epoch": 0.04957124867576806, "grad_norm": 0.3653479814529419, "learning_rate": 0.0005994204442358586, "loss": 6.0361, "step": 155 }, { "epoch": 0.04989106318335366, "grad_norm": 0.3577592372894287, "learning_rate": 0.0005994009712078839, "loss": 5.9958, "step": 156 }, { "epoch": 0.05021087769093926, "grad_norm": 0.35219165682792664, "learning_rate": 0.0005993811767439074, "loss": 6.0172, "step": 157 }, { "epoch": 0.05053069219852486, "grad_norm": 0.3088356554508209, "learning_rate": 0.0005993610608651804, "loss": 5.961, "step": 158 }, { "epoch": 0.05085050670611046, "grad_norm": 0.3456643223762512, "learning_rate": 0.0005993406235932992, "loss": 6.0369, "step": 159 }, { "epoch": 0.05117032121369606, "grad_norm": 0.3554942011833191, "learning_rate": 0.0005993198649502054, "loss": 5.9997, "step": 160 }, { "epoch": 0.05149013572128166, "grad_norm": 0.319603830575943, "learning_rate": 0.0005992987849581852, "loss": 6.0243, "step": 161 }, { "epoch": 0.05180995022886726, "grad_norm": 0.3472403585910797, "learning_rate": 0.00059927738363987, "loss": 6.0251, "step": 162 }, { "epoch": 0.05212976473645286, "grad_norm": 0.2756575345993042, "learning_rate": 0.0005992556610182364, "loss": 5.9523, "step": 163 }, { "epoch": 0.05244957924403846, "grad_norm": 0.30597907304763794, "learning_rate": 0.0005992336171166056, "loss": 5.9312, "step": 164 }, { "epoch": 0.05276939375162406, "grad_norm": 0.2721783220767975, "learning_rate": 0.0005992112519586438, "loss": 5.9293, "step": 165 }, { "epoch": 0.05308920825920966, "grad_norm": 0.3228548467159271, "learning_rate": 0.0005991885655683624, "loss": 5.9248, "step": 166 }, { "epoch": 0.05340902276679526, "grad_norm": 0.2784801423549652, "learning_rate": 0.0005991655579701171, "loss": 5.8224, "step": 167 }, { "epoch": 0.05372883727438086, "grad_norm": 0.2944832742214203, "learning_rate": 0.0005991422291886092, "loss": 5.9014, "step": 168 }, { "epoch": 0.05404865178196646, "grad_norm": 0.3072747588157654, "learning_rate": 0.000599118579248884, "loss": 5.8539, "step": 169 }, { "epoch": 0.05436846628955206, "grad_norm": 0.3246116638183594, "learning_rate": 0.0005990946081763324, "loss": 6.0012, "step": 170 }, { "epoch": 0.05468828079713766, "grad_norm": 0.2892024517059326, "learning_rate": 0.0005990703159966894, "loss": 5.9108, "step": 171 }, { "epoch": 0.05500809530472326, "grad_norm": 0.28960946202278137, "learning_rate": 0.000599045702736035, "loss": 5.9105, "step": 172 }, { "epoch": 0.05532790981230886, "grad_norm": 0.2955184280872345, "learning_rate": 0.000599020768420794, "loss": 5.887, "step": 173 }, { "epoch": 0.05564772431989446, "grad_norm": 0.2761589586734772, "learning_rate": 0.000598995513077736, "loss": 5.9759, "step": 174 }, { "epoch": 0.05596753882748006, "grad_norm": 0.2988210916519165, "learning_rate": 0.0005989699367339748, "loss": 5.9915, "step": 175 }, { "epoch": 0.05628735333506566, "grad_norm": 0.31130966544151306, "learning_rate": 0.0005989440394169692, "loss": 5.8571, "step": 176 }, { "epoch": 0.05660716784265126, "grad_norm": 0.28806987404823303, "learning_rate": 0.0005989178211545223, "loss": 5.9104, "step": 177 }, { "epoch": 0.05692698235023686, "grad_norm": 0.3409833610057831, "learning_rate": 0.0005988912819747822, "loss": 5.8124, "step": 178 }, { "epoch": 0.05724679685782246, "grad_norm": 0.41102084517478943, "learning_rate": 0.0005988644219062412, "loss": 5.8563, "step": 179 }, { "epoch": 0.05756661136540806, "grad_norm": 0.37034937739372253, "learning_rate": 0.0005988372409777362, "loss": 5.7895, "step": 180 }, { "epoch": 0.05788642587299366, "grad_norm": 0.3778378963470459, "learning_rate": 0.0005988097392184486, "loss": 5.8574, "step": 181 }, { "epoch": 0.05820624038057926, "grad_norm": 0.33154475688934326, "learning_rate": 0.000598781916657904, "loss": 5.885, "step": 182 }, { "epoch": 0.05852605488816486, "grad_norm": 0.3387259840965271, "learning_rate": 0.0005987537733259729, "loss": 5.8296, "step": 183 }, { "epoch": 0.05884586939575046, "grad_norm": 0.32705774903297424, "learning_rate": 0.0005987253092528697, "loss": 5.799, "step": 184 }, { "epoch": 0.05916568390333606, "grad_norm": 0.4102655351161957, "learning_rate": 0.0005986965244691533, "loss": 5.7681, "step": 185 }, { "epoch": 0.05948549841092166, "grad_norm": 0.2976662516593933, "learning_rate": 0.0005986674190057274, "loss": 5.7718, "step": 186 }, { "epoch": 0.05980531291850726, "grad_norm": 0.3745904266834259, "learning_rate": 0.0005986379928938389, "loss": 5.7579, "step": 187 }, { "epoch": 0.06012512742609286, "grad_norm": 0.3143707513809204, "learning_rate": 0.0005986082461650801, "loss": 5.7825, "step": 188 }, { "epoch": 0.06044494193367847, "grad_norm": 0.35436129570007324, "learning_rate": 0.0005985781788513867, "loss": 5.8054, "step": 189 }, { "epoch": 0.06076475644126407, "grad_norm": 0.313580185174942, "learning_rate": 0.000598547790985039, "loss": 5.8129, "step": 190 }, { "epoch": 0.06108457094884967, "grad_norm": 0.30826863646507263, "learning_rate": 0.0005985170825986613, "loss": 5.7603, "step": 191 }, { "epoch": 0.06140438545643527, "grad_norm": 0.3150077760219574, "learning_rate": 0.000598486053725222, "loss": 5.8161, "step": 192 }, { "epoch": 0.06172419996402087, "grad_norm": 0.34280553460121155, "learning_rate": 0.0005984547043980338, "loss": 5.7618, "step": 193 }, { "epoch": 0.06204401447160647, "grad_norm": 0.3738536238670349, "learning_rate": 0.0005984230346507529, "loss": 5.7289, "step": 194 }, { "epoch": 0.06236382897919207, "grad_norm": 0.33746597170829773, "learning_rate": 0.0005983910445173802, "loss": 5.7768, "step": 195 }, { "epoch": 0.06268364348677767, "grad_norm": 0.3918536603450775, "learning_rate": 0.00059835873403226, "loss": 5.7319, "step": 196 }, { "epoch": 0.06300345799436327, "grad_norm": 0.33554381132125854, "learning_rate": 0.0005983261032300807, "loss": 5.6813, "step": 197 }, { "epoch": 0.06332327250194887, "grad_norm": 0.348682165145874, "learning_rate": 0.0005982931521458747, "loss": 5.6592, "step": 198 }, { "epoch": 0.06364308700953447, "grad_norm": 0.3651124835014343, "learning_rate": 0.0005982598808150184, "loss": 5.7978, "step": 199 }, { "epoch": 0.06396290151712007, "grad_norm": 0.32559144496917725, "learning_rate": 0.0005982262892732315, "loss": 5.7813, "step": 200 }, { "epoch": 0.06396290151712007, "eval_runtime": 42.2108, "eval_samples_per_second": 44.941, "eval_steps_per_second": 11.253, "step": 200 }, { "epoch": 0.06428271602470567, "grad_norm": 0.3056526184082031, "learning_rate": 0.000598192377556578, "loss": 5.746, "step": 201 }, { "epoch": 0.06460253053229127, "grad_norm": 0.3324020206928253, "learning_rate": 0.0005981581457014652, "loss": 5.7171, "step": 202 }, { "epoch": 0.06492234503987687, "grad_norm": 0.32687661051750183, "learning_rate": 0.0005981235937446446, "loss": 5.7428, "step": 203 }, { "epoch": 0.06524215954746247, "grad_norm": 0.3477574288845062, "learning_rate": 0.0005980887217232107, "loss": 5.712, "step": 204 }, { "epoch": 0.06556197405504807, "grad_norm": 0.41119998693466187, "learning_rate": 0.0005980535296746023, "loss": 5.7269, "step": 205 }, { "epoch": 0.06588178856263367, "grad_norm": 0.33011364936828613, "learning_rate": 0.0005980180176366013, "loss": 5.6997, "step": 206 }, { "epoch": 0.06620160307021927, "grad_norm": 0.3344496190547943, "learning_rate": 0.0005979821856473336, "loss": 5.6729, "step": 207 }, { "epoch": 0.06652141757780487, "grad_norm": 0.3459053635597229, "learning_rate": 0.0005979460337452681, "loss": 5.6608, "step": 208 }, { "epoch": 0.06684123208539047, "grad_norm": 0.3595856726169586, "learning_rate": 0.0005979095619692172, "loss": 5.655, "step": 209 }, { "epoch": 0.06716104659297607, "grad_norm": 0.32227370142936707, "learning_rate": 0.0005978727703583374, "loss": 5.6479, "step": 210 }, { "epoch": 0.06748086110056167, "grad_norm": 0.32912585139274597, "learning_rate": 0.0005978356589521276, "loss": 5.6792, "step": 211 }, { "epoch": 0.06780067560814727, "grad_norm": 0.349470317363739, "learning_rate": 0.0005977982277904306, "loss": 5.6501, "step": 212 }, { "epoch": 0.06812049011573287, "grad_norm": 0.4150371849536896, "learning_rate": 0.0005977604769134325, "loss": 5.6039, "step": 213 }, { "epoch": 0.06844030462331847, "grad_norm": 0.3921232223510742, "learning_rate": 0.0005977224063616625, "loss": 5.7126, "step": 214 }, { "epoch": 0.06876011913090407, "grad_norm": 0.4018678665161133, "learning_rate": 0.0005976840161759931, "loss": 5.6602, "step": 215 }, { "epoch": 0.06907993363848967, "grad_norm": 0.3683200180530548, "learning_rate": 0.0005976453063976396, "loss": 5.6882, "step": 216 }, { "epoch": 0.06939974814607527, "grad_norm": 0.32523593306541443, "learning_rate": 0.000597606277068161, "loss": 5.6614, "step": 217 }, { "epoch": 0.06971956265366087, "grad_norm": 0.40541139245033264, "learning_rate": 0.000597566928229459, "loss": 5.6461, "step": 218 }, { "epoch": 0.07003937716124647, "grad_norm": 0.3462112843990326, "learning_rate": 0.0005975272599237784, "loss": 5.5652, "step": 219 }, { "epoch": 0.07035919166883207, "grad_norm": 0.328555166721344, "learning_rate": 0.0005974872721937069, "loss": 5.627, "step": 220 }, { "epoch": 0.07067900617641767, "grad_norm": 0.3438108265399933, "learning_rate": 0.0005974469650821753, "loss": 5.6603, "step": 221 }, { "epoch": 0.07099882068400327, "grad_norm": 0.32311320304870605, "learning_rate": 0.0005974063386324571, "loss": 5.6007, "step": 222 }, { "epoch": 0.07131863519158887, "grad_norm": 0.33150622248649597, "learning_rate": 0.0005973653928881688, "loss": 5.5348, "step": 223 }, { "epoch": 0.07163844969917448, "grad_norm": 0.42325443029403687, "learning_rate": 0.0005973241278932695, "loss": 5.566, "step": 224 }, { "epoch": 0.07195826420676008, "grad_norm": 0.37048596143722534, "learning_rate": 0.0005972825436920615, "loss": 5.5738, "step": 225 }, { "epoch": 0.07227807871434568, "grad_norm": 0.38833028078079224, "learning_rate": 0.0005972406403291893, "loss": 5.5963, "step": 226 }, { "epoch": 0.07259789322193128, "grad_norm": 0.41031408309936523, "learning_rate": 0.00059719841784964, "loss": 5.6343, "step": 227 }, { "epoch": 0.07291770772951688, "grad_norm": 0.3515760898590088, "learning_rate": 0.0005971558762987439, "loss": 5.5195, "step": 228 }, { "epoch": 0.07323752223710248, "grad_norm": 0.3203849196434021, "learning_rate": 0.0005971130157221733, "loss": 5.6058, "step": 229 }, { "epoch": 0.07355733674468808, "grad_norm": 0.4178776443004608, "learning_rate": 0.0005970698361659431, "loss": 5.5824, "step": 230 }, { "epoch": 0.07387715125227368, "grad_norm": 0.383701354265213, "learning_rate": 0.000597026337676411, "loss": 5.6442, "step": 231 }, { "epoch": 0.07419696575985928, "grad_norm": 0.434335857629776, "learning_rate": 0.0005969825203002765, "loss": 5.5736, "step": 232 }, { "epoch": 0.07451678026744488, "grad_norm": 0.3091406524181366, "learning_rate": 0.0005969383840845822, "loss": 5.5805, "step": 233 }, { "epoch": 0.07483659477503048, "grad_norm": 0.4250103235244751, "learning_rate": 0.0005968939290767123, "loss": 5.5032, "step": 234 }, { "epoch": 0.07515640928261608, "grad_norm": 0.3331656754016876, "learning_rate": 0.0005968491553243937, "loss": 5.5849, "step": 235 }, { "epoch": 0.07547622379020168, "grad_norm": 0.38698214292526245, "learning_rate": 0.0005968040628756955, "loss": 5.545, "step": 236 }, { "epoch": 0.07579603829778728, "grad_norm": 0.3226473033428192, "learning_rate": 0.0005967586517790285, "loss": 5.4426, "step": 237 }, { "epoch": 0.07611585280537288, "grad_norm": 0.33931753039360046, "learning_rate": 0.0005967129220831461, "loss": 5.5836, "step": 238 }, { "epoch": 0.07643566731295848, "grad_norm": 0.2982843518257141, "learning_rate": 0.0005966668738371436, "loss": 5.5282, "step": 239 }, { "epoch": 0.07675548182054408, "grad_norm": 0.3152060806751251, "learning_rate": 0.0005966205070904582, "loss": 5.5153, "step": 240 }, { "epoch": 0.07707529632812969, "grad_norm": 0.33332109451293945, "learning_rate": 0.0005965738218928693, "loss": 5.4703, "step": 241 }, { "epoch": 0.07739511083571529, "grad_norm": 0.3464970886707306, "learning_rate": 0.0005965268182944976, "loss": 5.5278, "step": 242 }, { "epoch": 0.07771492534330089, "grad_norm": 0.344780296087265, "learning_rate": 0.0005964794963458063, "loss": 5.5934, "step": 243 }, { "epoch": 0.07803473985088649, "grad_norm": 0.32048627734184265, "learning_rate": 0.0005964318560976001, "loss": 5.5239, "step": 244 }, { "epoch": 0.07835455435847209, "grad_norm": 0.30416032671928406, "learning_rate": 0.0005963838976010252, "loss": 5.4648, "step": 245 }, { "epoch": 0.07867436886605769, "grad_norm": 0.3211548924446106, "learning_rate": 0.0005963356209075701, "loss": 5.4866, "step": 246 }, { "epoch": 0.07899418337364329, "grad_norm": 0.3106992840766907, "learning_rate": 0.0005962870260690641, "loss": 5.5892, "step": 247 }, { "epoch": 0.07931399788122889, "grad_norm": 0.3421781063079834, "learning_rate": 0.0005962381131376788, "loss": 5.4546, "step": 248 }, { "epoch": 0.07963381238881449, "grad_norm": 0.32125163078308105, "learning_rate": 0.0005961888821659268, "loss": 5.5583, "step": 249 }, { "epoch": 0.07995362689640009, "grad_norm": 0.3202410638332367, "learning_rate": 0.0005961393332066623, "loss": 5.5122, "step": 250 }, { "epoch": 0.08027344140398569, "grad_norm": 0.33361053466796875, "learning_rate": 0.0005960894663130811, "loss": 5.4626, "step": 251 }, { "epoch": 0.0805932559115713, "grad_norm": 0.31106850504875183, "learning_rate": 0.0005960392815387201, "loss": 5.4941, "step": 252 }, { "epoch": 0.0809130704191569, "grad_norm": 0.3716835081577301, "learning_rate": 0.0005959887789374573, "loss": 5.4643, "step": 253 }, { "epoch": 0.0812328849267425, "grad_norm": 0.3157914876937866, "learning_rate": 0.0005959379585635124, "loss": 5.4807, "step": 254 }, { "epoch": 0.0815526994343281, "grad_norm": 0.3213408589363098, "learning_rate": 0.0005958868204714459, "loss": 5.4739, "step": 255 }, { "epoch": 0.0818725139419137, "grad_norm": 0.34843477606773376, "learning_rate": 0.0005958353647161595, "loss": 5.484, "step": 256 }, { "epoch": 0.0821923284494993, "grad_norm": 0.3158227801322937, "learning_rate": 0.0005957835913528959, "loss": 5.4016, "step": 257 }, { "epoch": 0.0825121429570849, "grad_norm": 0.37651175260543823, "learning_rate": 0.0005957315004372391, "loss": 5.4141, "step": 258 }, { "epoch": 0.0828319574646705, "grad_norm": 0.3332608640193939, "learning_rate": 0.0005956790920251133, "loss": 5.3981, "step": 259 }, { "epoch": 0.0831517719722561, "grad_norm": 0.2973606288433075, "learning_rate": 0.0005956263661727844, "loss": 5.5307, "step": 260 }, { "epoch": 0.0834715864798417, "grad_norm": 0.28823429346084595, "learning_rate": 0.0005955733229368586, "loss": 5.5241, "step": 261 }, { "epoch": 0.0837914009874273, "grad_norm": 0.32572367787361145, "learning_rate": 0.000595519962374283, "loss": 5.462, "step": 262 }, { "epoch": 0.0841112154950129, "grad_norm": 0.33712708950042725, "learning_rate": 0.0005954662845423452, "loss": 5.4315, "step": 263 }, { "epoch": 0.0844310300025985, "grad_norm": 0.36812883615493774, "learning_rate": 0.0005954122894986736, "loss": 5.4614, "step": 264 }, { "epoch": 0.0847508445101841, "grad_norm": 0.3349354565143585, "learning_rate": 0.0005953579773012374, "loss": 5.4445, "step": 265 }, { "epoch": 0.0850706590177697, "grad_norm": 0.3700368106365204, "learning_rate": 0.0005953033480083456, "loss": 5.3913, "step": 266 }, { "epoch": 0.0853904735253553, "grad_norm": 0.3214753270149231, "learning_rate": 0.0005952484016786483, "loss": 5.4352, "step": 267 }, { "epoch": 0.0857102880329409, "grad_norm": 0.35860490798950195, "learning_rate": 0.0005951931383711357, "loss": 5.428, "step": 268 }, { "epoch": 0.0860301025405265, "grad_norm": 0.32006773352622986, "learning_rate": 0.0005951375581451382, "loss": 5.4264, "step": 269 }, { "epoch": 0.0863499170481121, "grad_norm": 0.3063651919364929, "learning_rate": 0.0005950816610603266, "loss": 5.4869, "step": 270 }, { "epoch": 0.0866697315556977, "grad_norm": 0.3260275423526764, "learning_rate": 0.0005950254471767119, "loss": 5.4577, "step": 271 }, { "epoch": 0.0869895460632833, "grad_norm": 0.33964160084724426, "learning_rate": 0.0005949689165546453, "loss": 5.444, "step": 272 }, { "epoch": 0.0873093605708689, "grad_norm": 0.30213218927383423, "learning_rate": 0.0005949120692548177, "loss": 5.3891, "step": 273 }, { "epoch": 0.0876291750784545, "grad_norm": 0.29977500438690186, "learning_rate": 0.0005948549053382602, "loss": 5.4368, "step": 274 }, { "epoch": 0.0879489895860401, "grad_norm": 0.3072340488433838, "learning_rate": 0.0005947974248663439, "loss": 5.4545, "step": 275 }, { "epoch": 0.0882688040936257, "grad_norm": 0.2842322289943695, "learning_rate": 0.0005947396279007796, "loss": 5.4032, "step": 276 }, { "epoch": 0.0885886186012113, "grad_norm": 0.3031630218029022, "learning_rate": 0.0005946815145036181, "loss": 5.4349, "step": 277 }, { "epoch": 0.0889084331087969, "grad_norm": 0.35890865325927734, "learning_rate": 0.0005946230847372496, "loss": 5.3557, "step": 278 }, { "epoch": 0.0892282476163825, "grad_norm": 0.3267708420753479, "learning_rate": 0.0005945643386644041, "loss": 5.4143, "step": 279 }, { "epoch": 0.0895480621239681, "grad_norm": 0.36174091696739197, "learning_rate": 0.0005945052763481514, "loss": 5.3397, "step": 280 }, { "epoch": 0.0898678766315537, "grad_norm": 0.3225559592247009, "learning_rate": 0.0005944458978519006, "loss": 5.3599, "step": 281 }, { "epoch": 0.0901876911391393, "grad_norm": 0.32492998242378235, "learning_rate": 0.0005943862032394, "loss": 5.457, "step": 282 }, { "epoch": 0.0905075056467249, "grad_norm": 0.3274608552455902, "learning_rate": 0.000594326192574738, "loss": 5.3203, "step": 283 }, { "epoch": 0.0908273201543105, "grad_norm": 0.3054322898387909, "learning_rate": 0.0005942658659223415, "loss": 5.3049, "step": 284 }, { "epoch": 0.0911471346618961, "grad_norm": 0.3142618238925934, "learning_rate": 0.0005942052233469771, "loss": 5.329, "step": 285 }, { "epoch": 0.0914669491694817, "grad_norm": 0.3154004216194153, "learning_rate": 0.0005941442649137507, "loss": 5.3936, "step": 286 }, { "epoch": 0.0917867636770673, "grad_norm": 0.29942500591278076, "learning_rate": 0.0005940829906881066, "loss": 5.3333, "step": 287 }, { "epoch": 0.0921065781846529, "grad_norm": 0.31971099972724915, "learning_rate": 0.0005940214007358293, "loss": 5.3768, "step": 288 }, { "epoch": 0.0924263926922385, "grad_norm": 0.3242865204811096, "learning_rate": 0.0005939594951230412, "loss": 5.3302, "step": 289 }, { "epoch": 0.0927462071998241, "grad_norm": 0.32814836502075195, "learning_rate": 0.0005938972739162041, "loss": 5.3998, "step": 290 }, { "epoch": 0.0930660217074097, "grad_norm": 0.33235788345336914, "learning_rate": 0.0005938347371821183, "loss": 5.419, "step": 291 }, { "epoch": 0.0933858362149953, "grad_norm": 0.3215448260307312, "learning_rate": 0.0005937718849879232, "loss": 5.378, "step": 292 }, { "epoch": 0.0937056507225809, "grad_norm": 0.3052229583263397, "learning_rate": 0.0005937087174010968, "loss": 5.3721, "step": 293 }, { "epoch": 0.0940254652301665, "grad_norm": 0.29222971200942993, "learning_rate": 0.0005936452344894556, "loss": 5.3122, "step": 294 }, { "epoch": 0.0943452797377521, "grad_norm": 0.33696553111076355, "learning_rate": 0.0005935814363211546, "loss": 5.2752, "step": 295 }, { "epoch": 0.0946650942453377, "grad_norm": 0.29393741488456726, "learning_rate": 0.0005935173229646873, "loss": 5.3129, "step": 296 }, { "epoch": 0.0949849087529233, "grad_norm": 0.33300068974494934, "learning_rate": 0.0005934528944888857, "loss": 5.3366, "step": 297 }, { "epoch": 0.0953047232605089, "grad_norm": 0.3062058389186859, "learning_rate": 0.0005933881509629201, "loss": 5.3504, "step": 298 }, { "epoch": 0.0956245377680945, "grad_norm": 0.32169604301452637, "learning_rate": 0.0005933230924562987, "loss": 5.3565, "step": 299 }, { "epoch": 0.0959443522756801, "grad_norm": 0.3492211401462555, "learning_rate": 0.0005932577190388684, "loss": 5.3941, "step": 300 }, { "epoch": 0.0959443522756801, "eval_runtime": 42.5366, "eval_samples_per_second": 44.597, "eval_steps_per_second": 11.167, "step": 300 }, { "epoch": 0.0962641667832657, "grad_norm": 0.3271489441394806, "learning_rate": 0.0005931920307808138, "loss": 5.3494, "step": 301 }, { "epoch": 0.0965839812908513, "grad_norm": 0.3323195278644562, "learning_rate": 0.0005931260277526574, "loss": 5.3137, "step": 302 }, { "epoch": 0.0969037957984369, "grad_norm": 0.30432942509651184, "learning_rate": 0.0005930597100252602, "loss": 5.256, "step": 303 }, { "epoch": 0.0972236103060225, "grad_norm": 0.3033139109611511, "learning_rate": 0.0005929930776698205, "loss": 5.3255, "step": 304 }, { "epoch": 0.0975434248136081, "grad_norm": 0.2943139970302582, "learning_rate": 0.0005929261307578747, "loss": 5.3268, "step": 305 }, { "epoch": 0.0978632393211937, "grad_norm": 0.27769729495048523, "learning_rate": 0.0005928588693612969, "loss": 5.244, "step": 306 }, { "epoch": 0.0981830538287793, "grad_norm": 0.2837480902671814, "learning_rate": 0.0005927912935522985, "loss": 5.202, "step": 307 }, { "epoch": 0.0985028683363649, "grad_norm": 0.30128923058509827, "learning_rate": 0.0005927234034034289, "loss": 5.3183, "step": 308 }, { "epoch": 0.09882268284395052, "grad_norm": 0.31779980659484863, "learning_rate": 0.0005926551989875746, "loss": 5.2969, "step": 309 }, { "epoch": 0.09914249735153612, "grad_norm": 0.3054303824901581, "learning_rate": 0.0005925866803779598, "loss": 5.2856, "step": 310 }, { "epoch": 0.09946231185912172, "grad_norm": 0.31985148787498474, "learning_rate": 0.0005925178476481458, "loss": 5.2866, "step": 311 }, { "epoch": 0.09978212636670732, "grad_norm": 0.2819589674472809, "learning_rate": 0.0005924487008720313, "loss": 5.3985, "step": 312 }, { "epoch": 0.10010194087429292, "grad_norm": 0.3312123119831085, "learning_rate": 0.0005923792401238519, "loss": 5.2398, "step": 313 }, { "epoch": 0.10042175538187852, "grad_norm": 0.3334448039531708, "learning_rate": 0.0005923094654781805, "loss": 5.326, "step": 314 }, { "epoch": 0.10074156988946412, "grad_norm": 0.3000335693359375, "learning_rate": 0.0005922393770099271, "loss": 5.3338, "step": 315 }, { "epoch": 0.10106138439704972, "grad_norm": 0.33112016320228577, "learning_rate": 0.0005921689747943384, "loss": 5.2423, "step": 316 }, { "epoch": 0.10138119890463532, "grad_norm": 0.27599653601646423, "learning_rate": 0.0005920982589069979, "loss": 5.2019, "step": 317 }, { "epoch": 0.10170101341222092, "grad_norm": 0.29816755652427673, "learning_rate": 0.0005920272294238261, "loss": 5.136, "step": 318 }, { "epoch": 0.10202082791980652, "grad_norm": 0.3009512424468994, "learning_rate": 0.0005919558864210801, "loss": 5.2698, "step": 319 }, { "epoch": 0.10234064242739212, "grad_norm": 0.32471615076065063, "learning_rate": 0.0005918842299753534, "loss": 5.2168, "step": 320 }, { "epoch": 0.10266045693497772, "grad_norm": 0.32111233472824097, "learning_rate": 0.0005918122601635763, "loss": 5.2078, "step": 321 }, { "epoch": 0.10298027144256332, "grad_norm": 0.3042207360267639, "learning_rate": 0.0005917399770630151, "loss": 5.2639, "step": 322 }, { "epoch": 0.10330008595014892, "grad_norm": 0.29698646068573, "learning_rate": 0.000591667380751273, "loss": 5.2497, "step": 323 }, { "epoch": 0.10361990045773452, "grad_norm": 0.32809239625930786, "learning_rate": 0.0005915944713062891, "loss": 5.3087, "step": 324 }, { "epoch": 0.10393971496532012, "grad_norm": 0.3331183195114136, "learning_rate": 0.0005915212488063387, "loss": 5.2019, "step": 325 }, { "epoch": 0.10425952947290572, "grad_norm": 0.31039348244667053, "learning_rate": 0.0005914477133300333, "loss": 5.2409, "step": 326 }, { "epoch": 0.10457934398049132, "grad_norm": 0.30676165223121643, "learning_rate": 0.0005913738649563205, "loss": 5.216, "step": 327 }, { "epoch": 0.10489915848807692, "grad_norm": 0.36066240072250366, "learning_rate": 0.0005912997037644834, "loss": 5.3115, "step": 328 }, { "epoch": 0.10521897299566252, "grad_norm": 0.3313657343387604, "learning_rate": 0.0005912252298341416, "loss": 5.2791, "step": 329 }, { "epoch": 0.10553878750324812, "grad_norm": 0.31698256731033325, "learning_rate": 0.0005911504432452498, "loss": 5.2528, "step": 330 }, { "epoch": 0.10585860201083372, "grad_norm": 0.3029792010784149, "learning_rate": 0.0005910753440780988, "loss": 5.1874, "step": 331 }, { "epoch": 0.10617841651841932, "grad_norm": 0.2993261516094208, "learning_rate": 0.0005909999324133148, "loss": 5.2175, "step": 332 }, { "epoch": 0.10649823102600492, "grad_norm": 0.2945845425128937, "learning_rate": 0.0005909242083318596, "loss": 5.2562, "step": 333 }, { "epoch": 0.10681804553359052, "grad_norm": 0.3010497987270355, "learning_rate": 0.0005908481719150303, "loss": 5.2296, "step": 334 }, { "epoch": 0.10713786004117612, "grad_norm": 0.31832021474838257, "learning_rate": 0.0005907718232444594, "loss": 5.1728, "step": 335 }, { "epoch": 0.10745767454876172, "grad_norm": 0.345431923866272, "learning_rate": 0.0005906951624021147, "loss": 5.1974, "step": 336 }, { "epoch": 0.10777748905634732, "grad_norm": 0.29008468985557556, "learning_rate": 0.0005906181894702987, "loss": 5.2137, "step": 337 }, { "epoch": 0.10809730356393292, "grad_norm": 0.3625525236129761, "learning_rate": 0.0005905409045316497, "loss": 5.198, "step": 338 }, { "epoch": 0.10841711807151852, "grad_norm": 0.3357219994068146, "learning_rate": 0.0005904633076691404, "loss": 5.2058, "step": 339 }, { "epoch": 0.10873693257910412, "grad_norm": 0.30873924493789673, "learning_rate": 0.0005903853989660787, "loss": 5.2532, "step": 340 }, { "epoch": 0.10905674708668972, "grad_norm": 0.31059667468070984, "learning_rate": 0.0005903071785061069, "loss": 5.2044, "step": 341 }, { "epoch": 0.10937656159427532, "grad_norm": 0.33196550607681274, "learning_rate": 0.0005902286463732026, "loss": 5.2215, "step": 342 }, { "epoch": 0.10969637610186092, "grad_norm": 0.3098610043525696, "learning_rate": 0.0005901498026516774, "loss": 5.2058, "step": 343 }, { "epoch": 0.11001619060944652, "grad_norm": 0.37928423285484314, "learning_rate": 0.0005900706474261778, "loss": 5.2491, "step": 344 }, { "epoch": 0.11033600511703212, "grad_norm": 0.3329963982105255, "learning_rate": 0.0005899911807816844, "loss": 5.1749, "step": 345 }, { "epoch": 0.11065581962461772, "grad_norm": 0.2967645823955536, "learning_rate": 0.0005899114028035128, "loss": 5.2702, "step": 346 }, { "epoch": 0.11097563413220332, "grad_norm": 0.2964327335357666, "learning_rate": 0.0005898313135773121, "loss": 5.2088, "step": 347 }, { "epoch": 0.11129544863978892, "grad_norm": 0.2979515492916107, "learning_rate": 0.0005897509131890658, "loss": 5.1853, "step": 348 }, { "epoch": 0.11161526314737452, "grad_norm": 0.2919325530529022, "learning_rate": 0.0005896702017250916, "loss": 5.2291, "step": 349 }, { "epoch": 0.11193507765496012, "grad_norm": 0.30002135038375854, "learning_rate": 0.0005895891792720413, "loss": 5.2152, "step": 350 }, { "epoch": 0.11225489216254572, "grad_norm": 0.29685893654823303, "learning_rate": 0.0005895078459169, "loss": 5.2294, "step": 351 }, { "epoch": 0.11257470667013132, "grad_norm": 0.3043173849582672, "learning_rate": 0.0005894262017469872, "loss": 5.1851, "step": 352 }, { "epoch": 0.11289452117771692, "grad_norm": 0.2991015315055847, "learning_rate": 0.0005893442468499557, "loss": 5.0902, "step": 353 }, { "epoch": 0.11321433568530252, "grad_norm": 0.2895256280899048, "learning_rate": 0.0005892619813137923, "loss": 5.1836, "step": 354 }, { "epoch": 0.11353415019288812, "grad_norm": 0.3127066195011139, "learning_rate": 0.0005891794052268167, "loss": 5.1031, "step": 355 }, { "epoch": 0.11385396470047372, "grad_norm": 0.29016047716140747, "learning_rate": 0.0005890965186776825, "loss": 5.1812, "step": 356 }, { "epoch": 0.11417377920805932, "grad_norm": 0.2890208661556244, "learning_rate": 0.0005890133217553765, "loss": 5.1109, "step": 357 }, { "epoch": 0.11449359371564492, "grad_norm": 0.30912068486213684, "learning_rate": 0.0005889298145492185, "loss": 5.1906, "step": 358 }, { "epoch": 0.11481340822323052, "grad_norm": 0.30817946791648865, "learning_rate": 0.0005888459971488618, "loss": 5.2195, "step": 359 }, { "epoch": 0.11513322273081612, "grad_norm": 0.359819620847702, "learning_rate": 0.0005887618696442925, "loss": 5.2424, "step": 360 }, { "epoch": 0.11545303723840172, "grad_norm": 0.33261290192604065, "learning_rate": 0.0005886774321258294, "loss": 5.0774, "step": 361 }, { "epoch": 0.11577285174598732, "grad_norm": 0.3220166563987732, "learning_rate": 0.0005885926846841246, "loss": 5.1801, "step": 362 }, { "epoch": 0.11609266625357292, "grad_norm": 0.3206515610218048, "learning_rate": 0.0005885076274101627, "loss": 5.1212, "step": 363 }, { "epoch": 0.11641248076115852, "grad_norm": 0.31633198261260986, "learning_rate": 0.0005884222603952608, "loss": 5.1849, "step": 364 }, { "epoch": 0.11673229526874412, "grad_norm": 0.338256299495697, "learning_rate": 0.0005883365837310689, "loss": 5.0525, "step": 365 }, { "epoch": 0.11705210977632972, "grad_norm": 0.34129032492637634, "learning_rate": 0.0005882505975095689, "loss": 5.1021, "step": 366 }, { "epoch": 0.11737192428391532, "grad_norm": 0.3289518654346466, "learning_rate": 0.0005881643018230755, "loss": 5.1746, "step": 367 }, { "epoch": 0.11769173879150092, "grad_norm": 0.3415473699569702, "learning_rate": 0.0005880776967642355, "loss": 5.0374, "step": 368 }, { "epoch": 0.11801155329908652, "grad_norm": 0.29813775420188904, "learning_rate": 0.0005879907824260281, "loss": 5.0537, "step": 369 }, { "epoch": 0.11833136780667212, "grad_norm": 0.31397852301597595, "learning_rate": 0.0005879035589017638, "loss": 5.1292, "step": 370 }, { "epoch": 0.11865118231425772, "grad_norm": 0.29997017979621887, "learning_rate": 0.0005878160262850859, "loss": 5.1783, "step": 371 }, { "epoch": 0.11897099682184333, "grad_norm": 0.3558555245399475, "learning_rate": 0.0005877281846699689, "loss": 5.1115, "step": 372 }, { "epoch": 0.11929081132942893, "grad_norm": 0.3018968999385834, "learning_rate": 0.0005876400341507194, "loss": 5.0909, "step": 373 }, { "epoch": 0.11961062583701453, "grad_norm": 0.31483566761016846, "learning_rate": 0.0005875515748219757, "loss": 5.1896, "step": 374 }, { "epoch": 0.11993044034460013, "grad_norm": 0.29223746061325073, "learning_rate": 0.0005874628067787072, "loss": 5.1234, "step": 375 }, { "epoch": 0.12025025485218573, "grad_norm": 0.3251889646053314, "learning_rate": 0.0005873737301162151, "loss": 5.107, "step": 376 }, { "epoch": 0.12057006935977133, "grad_norm": 0.3011768162250519, "learning_rate": 0.000587284344930132, "loss": 5.0241, "step": 377 }, { "epoch": 0.12088988386735694, "grad_norm": 0.3017674386501312, "learning_rate": 0.0005871946513164213, "loss": 5.1043, "step": 378 }, { "epoch": 0.12120969837494254, "grad_norm": 0.32426270842552185, "learning_rate": 0.000587104649371378, "loss": 5.1025, "step": 379 }, { "epoch": 0.12152951288252814, "grad_norm": 0.2931612432003021, "learning_rate": 0.000587014339191628, "loss": 5.0622, "step": 380 }, { "epoch": 0.12184932739011374, "grad_norm": 0.2938215434551239, "learning_rate": 0.0005869237208741278, "loss": 5.1391, "step": 381 }, { "epoch": 0.12216914189769934, "grad_norm": 0.295305460691452, "learning_rate": 0.0005868327945161651, "loss": 5.1377, "step": 382 }, { "epoch": 0.12248895640528494, "grad_norm": 0.2914290726184845, "learning_rate": 0.0005867415602153582, "loss": 5.173, "step": 383 }, { "epoch": 0.12280877091287054, "grad_norm": 0.3063279688358307, "learning_rate": 0.0005866500180696558, "loss": 5.0527, "step": 384 }, { "epoch": 0.12312858542045614, "grad_norm": 0.3160459101200104, "learning_rate": 0.0005865581681773374, "loss": 5.0403, "step": 385 }, { "epoch": 0.12344839992804174, "grad_norm": 0.3257586359977722, "learning_rate": 0.000586466010637013, "loss": 5.1326, "step": 386 }, { "epoch": 0.12376821443562734, "grad_norm": 0.3460622727870941, "learning_rate": 0.0005863735455476222, "loss": 5.0481, "step": 387 }, { "epoch": 0.12408802894321294, "grad_norm": 0.31491711735725403, "learning_rate": 0.0005862807730084356, "loss": 5.1264, "step": 388 }, { "epoch": 0.12440784345079854, "grad_norm": 0.31746017932891846, "learning_rate": 0.0005861876931190534, "loss": 5.1427, "step": 389 }, { "epoch": 0.12472765795838414, "grad_norm": 0.32399696111679077, "learning_rate": 0.0005860943059794059, "loss": 5.087, "step": 390 }, { "epoch": 0.12504747246596973, "grad_norm": 0.3315321207046509, "learning_rate": 0.0005860006116897533, "loss": 5.0249, "step": 391 }, { "epoch": 0.12536728697355534, "grad_norm": 0.3068821430206299, "learning_rate": 0.0005859066103506853, "loss": 5.0232, "step": 392 }, { "epoch": 0.12568710148114093, "grad_norm": 0.3034381866455078, "learning_rate": 0.0005858123020631218, "loss": 5.0493, "step": 393 }, { "epoch": 0.12600691598872654, "grad_norm": 0.335399866104126, "learning_rate": 0.0005857176869283118, "loss": 5.0446, "step": 394 }, { "epoch": 0.12632673049631213, "grad_norm": 0.368830144405365, "learning_rate": 0.0005856227650478335, "loss": 5.0532, "step": 395 }, { "epoch": 0.12664654500389774, "grad_norm": 0.2932795286178589, "learning_rate": 0.0005855275365235953, "loss": 5.1059, "step": 396 }, { "epoch": 0.12696635951148333, "grad_norm": 0.32992836833000183, "learning_rate": 0.0005854320014578338, "loss": 5.0431, "step": 397 }, { "epoch": 0.12728617401906894, "grad_norm": 0.3175584375858307, "learning_rate": 0.0005853361599531155, "loss": 5.0178, "step": 398 }, { "epoch": 0.12760598852665453, "grad_norm": 0.29655924439430237, "learning_rate": 0.0005852400121123353, "loss": 5.0832, "step": 399 }, { "epoch": 0.12792580303424014, "grad_norm": 0.317992627620697, "learning_rate": 0.0005851435580387175, "loss": 5.066, "step": 400 }, { "epoch": 0.12792580303424014, "eval_runtime": 43.6781, "eval_samples_per_second": 43.431, "eval_steps_per_second": 10.875, "step": 400 }, { "epoch": 0.12824561754182573, "grad_norm": 0.31353482604026794, "learning_rate": 0.0005850467978358146, "loss": 5.1083, "step": 401 }, { "epoch": 0.12856543204941134, "grad_norm": 0.2888651490211487, "learning_rate": 0.0005849497316075084, "loss": 5.0149, "step": 402 }, { "epoch": 0.12888524655699693, "grad_norm": 0.3115566670894623, "learning_rate": 0.0005848523594580086, "loss": 5.1122, "step": 403 }, { "epoch": 0.12920506106458254, "grad_norm": 0.29510271549224854, "learning_rate": 0.0005847546814918538, "loss": 5.0697, "step": 404 }, { "epoch": 0.12952487557216813, "grad_norm": 0.2880918085575104, "learning_rate": 0.0005846566978139108, "loss": 5.041, "step": 405 }, { "epoch": 0.12984469007975374, "grad_norm": 0.3263723850250244, "learning_rate": 0.0005845584085293745, "loss": 4.999, "step": 406 }, { "epoch": 0.13016450458733933, "grad_norm": 0.29180020093917847, "learning_rate": 0.0005844598137437682, "loss": 4.9831, "step": 407 }, { "epoch": 0.13048431909492494, "grad_norm": 0.3123887777328491, "learning_rate": 0.0005843609135629427, "loss": 5.1215, "step": 408 }, { "epoch": 0.13080413360251053, "grad_norm": 0.2909485101699829, "learning_rate": 0.0005842617080930771, "loss": 5.073, "step": 409 }, { "epoch": 0.13112394811009614, "grad_norm": 0.3076760470867157, "learning_rate": 0.000584162197440678, "loss": 5.147, "step": 410 }, { "epoch": 0.13144376261768173, "grad_norm": 0.3147362470626831, "learning_rate": 0.0005840623817125799, "loss": 5.0205, "step": 411 }, { "epoch": 0.13176357712526734, "grad_norm": 0.3290916383266449, "learning_rate": 0.0005839622610159446, "loss": 5.0531, "step": 412 }, { "epoch": 0.13208339163285296, "grad_norm": 0.30235767364501953, "learning_rate": 0.0005838618354582612, "loss": 4.9903, "step": 413 }, { "epoch": 0.13240320614043855, "grad_norm": 0.33554261922836304, "learning_rate": 0.0005837611051473466, "loss": 4.9994, "step": 414 }, { "epoch": 0.13272302064802416, "grad_norm": 0.33954572677612305, "learning_rate": 0.0005836600701913443, "loss": 5.0126, "step": 415 }, { "epoch": 0.13304283515560975, "grad_norm": 0.31126219034194946, "learning_rate": 0.0005835587306987255, "loss": 5.0533, "step": 416 }, { "epoch": 0.13336264966319536, "grad_norm": 0.3165084421634674, "learning_rate": 0.0005834570867782875, "loss": 5.0812, "step": 417 }, { "epoch": 0.13368246417078095, "grad_norm": 0.3064981698989868, "learning_rate": 0.0005833551385391551, "loss": 5.0939, "step": 418 }, { "epoch": 0.13400227867836656, "grad_norm": 0.31144943833351135, "learning_rate": 0.0005832528860907798, "loss": 5.0032, "step": 419 }, { "epoch": 0.13432209318595215, "grad_norm": 0.31108570098876953, "learning_rate": 0.0005831503295429393, "loss": 4.9976, "step": 420 }, { "epoch": 0.13464190769353776, "grad_norm": 0.3092106878757477, "learning_rate": 0.0005830474690057383, "loss": 5.1104, "step": 421 }, { "epoch": 0.13496172220112335, "grad_norm": 0.30084457993507385, "learning_rate": 0.0005829443045896072, "loss": 5.1218, "step": 422 }, { "epoch": 0.13528153670870896, "grad_norm": 0.3126887083053589, "learning_rate": 0.0005828408364053031, "loss": 5.0283, "step": 423 }, { "epoch": 0.13560135121629455, "grad_norm": 0.31660887598991394, "learning_rate": 0.0005827370645639095, "loss": 5.0397, "step": 424 }, { "epoch": 0.13592116572388016, "grad_norm": 0.6651709675788879, "learning_rate": 0.0005826329891768351, "loss": 5.027, "step": 425 }, { "epoch": 0.13624098023146575, "grad_norm": 0.3309241831302643, "learning_rate": 0.0005825286103558151, "loss": 5.0134, "step": 426 }, { "epoch": 0.13656079473905136, "grad_norm": 0.2886042296886444, "learning_rate": 0.0005824239282129103, "loss": 5.1065, "step": 427 }, { "epoch": 0.13688060924663695, "grad_norm": 0.3092151880264282, "learning_rate": 0.0005823189428605072, "loss": 5.056, "step": 428 }, { "epoch": 0.13720042375422256, "grad_norm": 0.3057742118835449, "learning_rate": 0.0005822136544113177, "loss": 4.9193, "step": 429 }, { "epoch": 0.13752023826180815, "grad_norm": 0.28699538111686707, "learning_rate": 0.000582108062978379, "loss": 5.0065, "step": 430 }, { "epoch": 0.13784005276939376, "grad_norm": 0.31693604588508606, "learning_rate": 0.0005820021686750542, "loss": 4.9576, "step": 431 }, { "epoch": 0.13815986727697935, "grad_norm": 0.2998126149177551, "learning_rate": 0.0005818959716150306, "loss": 4.9428, "step": 432 }, { "epoch": 0.13847968178456496, "grad_norm": 0.36986225843429565, "learning_rate": 0.0005817894719123214, "loss": 5.0158, "step": 433 }, { "epoch": 0.13879949629215055, "grad_norm": 0.29460081458091736, "learning_rate": 0.0005816826696812643, "loss": 4.9671, "step": 434 }, { "epoch": 0.13911931079973616, "grad_norm": 0.3076103925704956, "learning_rate": 0.0005815755650365217, "loss": 4.9775, "step": 435 }, { "epoch": 0.13943912530732175, "grad_norm": 0.27557173371315, "learning_rate": 0.000581468158093081, "loss": 5.0156, "step": 436 }, { "epoch": 0.13975893981490736, "grad_norm": 0.3328568637371063, "learning_rate": 0.0005813604489662539, "loss": 5.0576, "step": 437 }, { "epoch": 0.14007875432249295, "grad_norm": 0.295427143573761, "learning_rate": 0.0005812524377716766, "loss": 5.0356, "step": 438 }, { "epoch": 0.14039856883007856, "grad_norm": 0.30137887597084045, "learning_rate": 0.0005811441246253098, "loss": 4.9678, "step": 439 }, { "epoch": 0.14071838333766415, "grad_norm": 0.2910289466381073, "learning_rate": 0.0005810355096434378, "loss": 4.997, "step": 440 }, { "epoch": 0.14103819784524976, "grad_norm": 0.30388736724853516, "learning_rate": 0.0005809265929426696, "loss": 4.9691, "step": 441 }, { "epoch": 0.14135801235283535, "grad_norm": 0.30900922417640686, "learning_rate": 0.0005808173746399377, "loss": 4.9435, "step": 442 }, { "epoch": 0.14167782686042096, "grad_norm": 0.29543375968933105, "learning_rate": 0.0005807078548524988, "loss": 4.8837, "step": 443 }, { "epoch": 0.14199764136800655, "grad_norm": 0.3063761293888092, "learning_rate": 0.0005805980336979327, "loss": 5.0769, "step": 444 }, { "epoch": 0.14231745587559216, "grad_norm": 0.3085290491580963, "learning_rate": 0.0005804879112941433, "loss": 4.9091, "step": 445 }, { "epoch": 0.14263727038317775, "grad_norm": 0.304557204246521, "learning_rate": 0.0005803774877593575, "loss": 4.9991, "step": 446 }, { "epoch": 0.14295708489076336, "grad_norm": 0.29000672698020935, "learning_rate": 0.000580266763212126, "loss": 5.0087, "step": 447 }, { "epoch": 0.14327689939834895, "grad_norm": 0.30279868841171265, "learning_rate": 0.0005801557377713218, "loss": 5.0417, "step": 448 }, { "epoch": 0.14359671390593456, "grad_norm": 0.296049565076828, "learning_rate": 0.0005800444115561422, "loss": 5.0325, "step": 449 }, { "epoch": 0.14391652841352015, "grad_norm": 0.31404370069503784, "learning_rate": 0.000579932784686106, "loss": 4.9775, "step": 450 }, { "epoch": 0.14423634292110576, "grad_norm": 0.30332455039024353, "learning_rate": 0.000579820857281056, "loss": 4.9691, "step": 451 }, { "epoch": 0.14455615742869135, "grad_norm": 0.2961044907569885, "learning_rate": 0.0005797086294611569, "loss": 4.9676, "step": 452 }, { "epoch": 0.14487597193627696, "grad_norm": 0.2976911664009094, "learning_rate": 0.0005795961013468961, "loss": 4.9942, "step": 453 }, { "epoch": 0.14519578644386255, "grad_norm": 0.30885469913482666, "learning_rate": 0.0005794832730590836, "loss": 4.9536, "step": 454 }, { "epoch": 0.14551560095144817, "grad_norm": 0.3050510585308075, "learning_rate": 0.0005793701447188514, "loss": 4.9145, "step": 455 }, { "epoch": 0.14583541545903375, "grad_norm": 0.30285489559173584, "learning_rate": 0.0005792567164476539, "loss": 4.955, "step": 456 }, { "epoch": 0.14615522996661937, "grad_norm": 0.29599839448928833, "learning_rate": 0.0005791429883672672, "loss": 4.9213, "step": 457 }, { "epoch": 0.14647504447420495, "grad_norm": 0.291484534740448, "learning_rate": 0.0005790289605997895, "loss": 4.9025, "step": 458 }, { "epoch": 0.14679485898179057, "grad_norm": 0.3018636405467987, "learning_rate": 0.0005789146332676407, "loss": 4.918, "step": 459 }, { "epoch": 0.14711467348937615, "grad_norm": 0.3085641860961914, "learning_rate": 0.0005788000064935623, "loss": 4.9425, "step": 460 }, { "epoch": 0.14743448799696177, "grad_norm": 0.32512131333351135, "learning_rate": 0.0005786850804006172, "loss": 4.9441, "step": 461 }, { "epoch": 0.14775430250454735, "grad_norm": 0.2937241494655609, "learning_rate": 0.0005785698551121897, "loss": 4.9349, "step": 462 }, { "epoch": 0.14807411701213297, "grad_norm": 0.3139166533946991, "learning_rate": 0.0005784543307519854, "loss": 4.9242, "step": 463 }, { "epoch": 0.14839393151971855, "grad_norm": 0.31503674387931824, "learning_rate": 0.000578338507444031, "loss": 4.9832, "step": 464 }, { "epoch": 0.14871374602730417, "grad_norm": 0.30212894082069397, "learning_rate": 0.0005782223853126739, "loss": 4.9715, "step": 465 }, { "epoch": 0.14903356053488975, "grad_norm": 0.37920045852661133, "learning_rate": 0.0005781059644825824, "loss": 4.9611, "step": 466 }, { "epoch": 0.14935337504247537, "grad_norm": 0.3025848865509033, "learning_rate": 0.0005779892450787458, "loss": 4.8641, "step": 467 }, { "epoch": 0.14967318955006095, "grad_norm": 0.3095141351222992, "learning_rate": 0.0005778722272264736, "loss": 4.9905, "step": 468 }, { "epoch": 0.14999300405764657, "grad_norm": 0.3070904612541199, "learning_rate": 0.0005777549110513959, "loss": 4.964, "step": 469 }, { "epoch": 0.15031281856523215, "grad_norm": 0.30658942461013794, "learning_rate": 0.0005776372966794628, "loss": 4.9664, "step": 470 }, { "epoch": 0.15063263307281777, "grad_norm": 0.3043235242366791, "learning_rate": 0.000577519384236945, "loss": 4.9942, "step": 471 }, { "epoch": 0.15095244758040335, "grad_norm": 0.2998588979244232, "learning_rate": 0.0005774011738504326, "loss": 4.9555, "step": 472 }, { "epoch": 0.15127226208798897, "grad_norm": 0.310573011636734, "learning_rate": 0.0005772826656468363, "loss": 4.9766, "step": 473 }, { "epoch": 0.15159207659557455, "grad_norm": 0.31677335500717163, "learning_rate": 0.000577163859753386, "loss": 4.9357, "step": 474 }, { "epoch": 0.15191189110316017, "grad_norm": 0.3008494973182678, "learning_rate": 0.0005770447562976313, "loss": 4.9198, "step": 475 }, { "epoch": 0.15223170561074575, "grad_norm": 0.3489666283130646, "learning_rate": 0.0005769253554074414, "loss": 4.9451, "step": 476 }, { "epoch": 0.15255152011833137, "grad_norm": 0.2944929003715515, "learning_rate": 0.0005768056572110047, "loss": 4.9871, "step": 477 }, { "epoch": 0.15287133462591695, "grad_norm": 0.35133224725723267, "learning_rate": 0.000576685661836829, "loss": 4.9248, "step": 478 }, { "epoch": 0.15319114913350257, "grad_norm": 0.34021714329719543, "learning_rate": 0.0005765653694137406, "loss": 4.9223, "step": 479 }, { "epoch": 0.15351096364108816, "grad_norm": 0.3776319622993469, "learning_rate": 0.0005764447800708856, "loss": 4.9016, "step": 480 }, { "epoch": 0.15383077814867377, "grad_norm": 0.2997561991214752, "learning_rate": 0.0005763238939377278, "loss": 4.9415, "step": 481 }, { "epoch": 0.15415059265625938, "grad_norm": 0.3601900637149811, "learning_rate": 0.0005762027111440506, "loss": 4.9301, "step": 482 }, { "epoch": 0.15447040716384497, "grad_norm": 0.3257822096347809, "learning_rate": 0.0005760812318199555, "loss": 4.9315, "step": 483 }, { "epoch": 0.15479022167143058, "grad_norm": 0.3159526586532593, "learning_rate": 0.000575959456095862, "loss": 4.8219, "step": 484 }, { "epoch": 0.15511003617901617, "grad_norm": 0.34556034207344055, "learning_rate": 0.0005758373841025085, "loss": 4.8768, "step": 485 }, { "epoch": 0.15542985068660178, "grad_norm": 0.3053138852119446, "learning_rate": 0.000575715015970951, "loss": 4.8725, "step": 486 }, { "epoch": 0.15574966519418737, "grad_norm": 0.3239409327507019, "learning_rate": 0.0005755923518325637, "loss": 4.8944, "step": 487 }, { "epoch": 0.15606947970177298, "grad_norm": 0.35138851404190063, "learning_rate": 0.0005754693918190382, "loss": 4.9292, "step": 488 }, { "epoch": 0.15638929420935857, "grad_norm": 0.29475781321525574, "learning_rate": 0.0005753461360623842, "loss": 4.8833, "step": 489 }, { "epoch": 0.15670910871694418, "grad_norm": 0.4046892523765564, "learning_rate": 0.0005752225846949287, "loss": 4.9586, "step": 490 }, { "epoch": 0.15702892322452977, "grad_norm": 0.3703584372997284, "learning_rate": 0.000575098737849316, "loss": 4.9072, "step": 491 }, { "epoch": 0.15734873773211538, "grad_norm": 0.33279603719711304, "learning_rate": 0.0005749745956585077, "loss": 4.899, "step": 492 }, { "epoch": 0.15766855223970097, "grad_norm": 0.35400599241256714, "learning_rate": 0.0005748501582557825, "loss": 4.9757, "step": 493 }, { "epoch": 0.15798836674728658, "grad_norm": 0.3420694172382355, "learning_rate": 0.0005747254257747362, "loss": 4.9324, "step": 494 }, { "epoch": 0.15830818125487217, "grad_norm": 0.31306788325309753, "learning_rate": 0.0005746003983492811, "loss": 4.8597, "step": 495 }, { "epoch": 0.15862799576245779, "grad_norm": 0.336323082447052, "learning_rate": 0.0005744750761136463, "loss": 4.8622, "step": 496 }, { "epoch": 0.15894781027004337, "grad_norm": 0.308992862701416, "learning_rate": 0.0005743494592023773, "loss": 4.8827, "step": 497 }, { "epoch": 0.15926762477762899, "grad_norm": 0.3386521637439728, "learning_rate": 0.0005742235477503362, "loss": 4.8792, "step": 498 }, { "epoch": 0.15958743928521457, "grad_norm": 0.3046622574329376, "learning_rate": 0.000574097341892701, "loss": 4.8313, "step": 499 }, { "epoch": 0.15990725379280019, "grad_norm": 0.3147921562194824, "learning_rate": 0.0005739708417649659, "loss": 4.812, "step": 500 }, { "epoch": 0.15990725379280019, "eval_runtime": 44.1342, "eval_samples_per_second": 42.983, "eval_steps_per_second": 10.763, "step": 500 }, { "epoch": 0.16022706830038577, "grad_norm": 0.3024921417236328, "learning_rate": 0.0005738440475029414, "loss": 4.89, "step": 501 }, { "epoch": 0.16054688280797139, "grad_norm": 0.29367366433143616, "learning_rate": 0.0005737169592427531, "loss": 4.9191, "step": 502 }, { "epoch": 0.16086669731555697, "grad_norm": 0.29385578632354736, "learning_rate": 0.0005735895771208427, "loss": 4.9347, "step": 503 }, { "epoch": 0.1611865118231426, "grad_norm": 0.30841246247291565, "learning_rate": 0.0005734619012739673, "loss": 4.9322, "step": 504 }, { "epoch": 0.16150632633072817, "grad_norm": 0.29118648171424866, "learning_rate": 0.0005733339318391992, "loss": 4.9593, "step": 505 }, { "epoch": 0.1618261408383138, "grad_norm": 0.2874990403652191, "learning_rate": 0.0005732056689539262, "loss": 4.8523, "step": 506 }, { "epoch": 0.16214595534589937, "grad_norm": 0.2819806933403015, "learning_rate": 0.0005730771127558508, "loss": 4.8806, "step": 507 }, { "epoch": 0.162465769853485, "grad_norm": 0.2971401512622833, "learning_rate": 0.0005729482633829906, "loss": 4.8832, "step": 508 }, { "epoch": 0.16278558436107057, "grad_norm": 0.30105161666870117, "learning_rate": 0.000572819120973678, "loss": 4.8716, "step": 509 }, { "epoch": 0.1631053988686562, "grad_norm": 0.29176127910614014, "learning_rate": 0.0005726896856665599, "loss": 4.8584, "step": 510 }, { "epoch": 0.16342521337624177, "grad_norm": 0.31242042779922485, "learning_rate": 0.0005725599576005975, "loss": 4.8787, "step": 511 }, { "epoch": 0.1637450278838274, "grad_norm": 0.2941184639930725, "learning_rate": 0.0005724299369150665, "loss": 4.9049, "step": 512 }, { "epoch": 0.16406484239141297, "grad_norm": 0.2790621221065521, "learning_rate": 0.0005722996237495569, "loss": 4.9173, "step": 513 }, { "epoch": 0.1643846568989986, "grad_norm": 0.3209446668624878, "learning_rate": 0.0005721690182439724, "loss": 4.9238, "step": 514 }, { "epoch": 0.16470447140658417, "grad_norm": 0.2904362976551056, "learning_rate": 0.0005720381205385306, "loss": 4.8683, "step": 515 }, { "epoch": 0.1650242859141698, "grad_norm": 0.29575833678245544, "learning_rate": 0.000571906930773763, "loss": 4.8849, "step": 516 }, { "epoch": 0.16534410042175537, "grad_norm": 0.29062676429748535, "learning_rate": 0.0005717754490905146, "loss": 4.9019, "step": 517 }, { "epoch": 0.165663914929341, "grad_norm": 0.2932336628437042, "learning_rate": 0.0005716436756299437, "loss": 4.843, "step": 518 }, { "epoch": 0.16598372943692657, "grad_norm": 0.2936388850212097, "learning_rate": 0.000571511610533522, "loss": 4.8932, "step": 519 }, { "epoch": 0.1663035439445122, "grad_norm": 0.2803901731967926, "learning_rate": 0.0005713792539430339, "loss": 4.8838, "step": 520 }, { "epoch": 0.16662335845209778, "grad_norm": 0.31033194065093994, "learning_rate": 0.0005712466060005774, "loss": 4.8704, "step": 521 }, { "epoch": 0.1669431729596834, "grad_norm": 0.30137529969215393, "learning_rate": 0.0005711136668485626, "loss": 4.8095, "step": 522 }, { "epoch": 0.16726298746726898, "grad_norm": 0.2859789729118347, "learning_rate": 0.0005709804366297129, "loss": 4.8315, "step": 523 }, { "epoch": 0.1675828019748546, "grad_norm": 0.3099839985370636, "learning_rate": 0.0005708469154870636, "loss": 4.8802, "step": 524 }, { "epoch": 0.16790261648244018, "grad_norm": 0.2786038815975189, "learning_rate": 0.0005707131035639629, "loss": 4.8547, "step": 525 }, { "epoch": 0.1682224309900258, "grad_norm": 0.2959405779838562, "learning_rate": 0.0005705790010040707, "loss": 4.89, "step": 526 }, { "epoch": 0.16854224549761138, "grad_norm": 0.28984883427619934, "learning_rate": 0.000570444607951359, "loss": 4.8956, "step": 527 }, { "epoch": 0.168862060005197, "grad_norm": 0.30197766423225403, "learning_rate": 0.000570309924550112, "loss": 4.8194, "step": 528 }, { "epoch": 0.16918187451278258, "grad_norm": 0.29613879323005676, "learning_rate": 0.0005701749509449253, "loss": 4.7951, "step": 529 }, { "epoch": 0.1695016890203682, "grad_norm": 0.3111899197101593, "learning_rate": 0.0005700396872807062, "loss": 4.82, "step": 530 }, { "epoch": 0.16982150352795378, "grad_norm": 0.3030010163784027, "learning_rate": 0.0005699041337026734, "loss": 4.8259, "step": 531 }, { "epoch": 0.1701413180355394, "grad_norm": 0.31297561526298523, "learning_rate": 0.0005697682903563568, "loss": 4.8472, "step": 532 }, { "epoch": 0.17046113254312498, "grad_norm": 0.30666249990463257, "learning_rate": 0.0005696321573875974, "loss": 4.8454, "step": 533 }, { "epoch": 0.1707809470507106, "grad_norm": 0.3130928575992584, "learning_rate": 0.0005694957349425472, "loss": 4.8436, "step": 534 }, { "epoch": 0.17110076155829618, "grad_norm": 0.31319954991340637, "learning_rate": 0.0005693590231676688, "loss": 4.8623, "step": 535 }, { "epoch": 0.1714205760658818, "grad_norm": 0.3089316189289093, "learning_rate": 0.0005692220222097357, "loss": 4.8735, "step": 536 }, { "epoch": 0.17174039057346738, "grad_norm": 0.30962470173835754, "learning_rate": 0.0005690847322158317, "loss": 4.8749, "step": 537 }, { "epoch": 0.172060205081053, "grad_norm": 0.3034002482891083, "learning_rate": 0.0005689471533333508, "loss": 4.7948, "step": 538 }, { "epoch": 0.17238001958863858, "grad_norm": 0.2823324501514435, "learning_rate": 0.0005688092857099974, "loss": 4.7469, "step": 539 }, { "epoch": 0.1726998340962242, "grad_norm": 0.3232250511646271, "learning_rate": 0.0005686711294937858, "loss": 4.7407, "step": 540 }, { "epoch": 0.17301964860380978, "grad_norm": 0.2919607162475586, "learning_rate": 0.0005685326848330402, "loss": 4.8575, "step": 541 }, { "epoch": 0.1733394631113954, "grad_norm": 0.3029126524925232, "learning_rate": 0.0005683939518763942, "loss": 4.7783, "step": 542 }, { "epoch": 0.17365927761898098, "grad_norm": 0.2910093665122986, "learning_rate": 0.000568254930772791, "loss": 4.8718, "step": 543 }, { "epoch": 0.1739790921265666, "grad_norm": 0.3077618479728699, "learning_rate": 0.0005681156216714836, "loss": 4.7973, "step": 544 }, { "epoch": 0.17429890663415218, "grad_norm": 0.2984183430671692, "learning_rate": 0.0005679760247220336, "loss": 4.8164, "step": 545 }, { "epoch": 0.1746187211417378, "grad_norm": 0.2927844524383545, "learning_rate": 0.0005678361400743119, "loss": 4.8145, "step": 546 }, { "epoch": 0.17493853564932338, "grad_norm": 0.3073475658893585, "learning_rate": 0.0005676959678784982, "loss": 4.7253, "step": 547 }, { "epoch": 0.175258350156909, "grad_norm": 0.28819167613983154, "learning_rate": 0.000567555508285081, "loss": 4.8276, "step": 548 }, { "epoch": 0.17557816466449458, "grad_norm": 0.30008575320243835, "learning_rate": 0.0005674147614448574, "loss": 4.8883, "step": 549 }, { "epoch": 0.1758979791720802, "grad_norm": 0.3039601743221283, "learning_rate": 0.0005672737275089327, "loss": 4.8682, "step": 550 }, { "epoch": 0.1762177936796658, "grad_norm": 0.3032839894294739, "learning_rate": 0.0005671324066287205, "loss": 4.7863, "step": 551 }, { "epoch": 0.1765376081872514, "grad_norm": 0.297080934047699, "learning_rate": 0.0005669907989559426, "loss": 4.8554, "step": 552 }, { "epoch": 0.176857422694837, "grad_norm": 0.29981404542922974, "learning_rate": 0.0005668489046426285, "loss": 4.8663, "step": 553 }, { "epoch": 0.1771772372024226, "grad_norm": 0.2833574712276459, "learning_rate": 0.0005667067238411153, "loss": 4.8102, "step": 554 }, { "epoch": 0.1774970517100082, "grad_norm": 0.2933471202850342, "learning_rate": 0.0005665642567040483, "loss": 4.8438, "step": 555 }, { "epoch": 0.1778168662175938, "grad_norm": 0.2967337667942047, "learning_rate": 0.0005664215033843796, "loss": 4.7604, "step": 556 }, { "epoch": 0.1781366807251794, "grad_norm": 0.2810646593570709, "learning_rate": 0.0005662784640353688, "loss": 4.7869, "step": 557 }, { "epoch": 0.178456495232765, "grad_norm": 0.3116728365421295, "learning_rate": 0.0005661351388105823, "loss": 4.8392, "step": 558 }, { "epoch": 0.1787763097403506, "grad_norm": 0.2967308759689331, "learning_rate": 0.0005659915278638939, "loss": 4.8455, "step": 559 }, { "epoch": 0.1790961242479362, "grad_norm": 0.28990232944488525, "learning_rate": 0.0005658476313494839, "loss": 4.7536, "step": 560 }, { "epoch": 0.1794159387555218, "grad_norm": 0.2723807394504547, "learning_rate": 0.0005657034494218389, "loss": 4.8071, "step": 561 }, { "epoch": 0.1797357532631074, "grad_norm": 0.300227552652359, "learning_rate": 0.0005655589822357526, "loss": 4.7658, "step": 562 }, { "epoch": 0.180055567770693, "grad_norm": 0.3008899390697479, "learning_rate": 0.0005654142299463241, "loss": 4.8264, "step": 563 }, { "epoch": 0.1803753822782786, "grad_norm": 0.29611489176750183, "learning_rate": 0.0005652691927089593, "loss": 4.7667, "step": 564 }, { "epoch": 0.1806951967858642, "grad_norm": 0.29880276322364807, "learning_rate": 0.0005651238706793697, "loss": 4.7314, "step": 565 }, { "epoch": 0.1810150112934498, "grad_norm": 0.31382426619529724, "learning_rate": 0.0005649782640135727, "loss": 4.757, "step": 566 }, { "epoch": 0.1813348258010354, "grad_norm": 0.2812778949737549, "learning_rate": 0.000564832372867891, "loss": 4.7308, "step": 567 }, { "epoch": 0.181654640308621, "grad_norm": 0.32051247358322144, "learning_rate": 0.0005646861973989531, "loss": 4.8435, "step": 568 }, { "epoch": 0.1819744548162066, "grad_norm": 0.27928078174591064, "learning_rate": 0.0005645397377636922, "loss": 4.7303, "step": 569 }, { "epoch": 0.1822942693237922, "grad_norm": 0.2898458242416382, "learning_rate": 0.0005643929941193474, "loss": 4.7149, "step": 570 }, { "epoch": 0.1826140838313778, "grad_norm": 0.28139132261276245, "learning_rate": 0.000564245966623462, "loss": 4.74, "step": 571 }, { "epoch": 0.1829338983389634, "grad_norm": 0.2959553003311157, "learning_rate": 0.0005640986554338842, "loss": 4.7293, "step": 572 }, { "epoch": 0.183253712846549, "grad_norm": 0.2750091254711151, "learning_rate": 0.0005639510607087673, "loss": 4.891, "step": 573 }, { "epoch": 0.1835735273541346, "grad_norm": 0.28795307874679565, "learning_rate": 0.0005638031826065679, "loss": 4.7883, "step": 574 }, { "epoch": 0.1838933418617202, "grad_norm": 0.2937767803668976, "learning_rate": 0.0005636550212860479, "loss": 4.7973, "step": 575 }, { "epoch": 0.1842131563693058, "grad_norm": 0.2969697117805481, "learning_rate": 0.0005635065769062728, "loss": 4.7899, "step": 576 }, { "epoch": 0.1845329708768914, "grad_norm": 0.28628045320510864, "learning_rate": 0.0005633578496266121, "loss": 4.7428, "step": 577 }, { "epoch": 0.184852785384477, "grad_norm": 0.2963985502719879, "learning_rate": 0.0005632088396067389, "loss": 4.7637, "step": 578 }, { "epoch": 0.1851725998920626, "grad_norm": 0.30827540159225464, "learning_rate": 0.0005630595470066299, "loss": 4.8327, "step": 579 }, { "epoch": 0.1854924143996482, "grad_norm": 0.3127327859401703, "learning_rate": 0.0005629099719865652, "loss": 4.858, "step": 580 }, { "epoch": 0.1858122289072338, "grad_norm": 0.28379836678504944, "learning_rate": 0.0005627601147071282, "loss": 4.7403, "step": 581 }, { "epoch": 0.1861320434148194, "grad_norm": 0.30736714601516724, "learning_rate": 0.000562609975329205, "loss": 4.7297, "step": 582 }, { "epoch": 0.186451857922405, "grad_norm": 0.2943870425224304, "learning_rate": 0.0005624595540139851, "loss": 4.809, "step": 583 }, { "epoch": 0.1867716724299906, "grad_norm": 0.31398749351501465, "learning_rate": 0.0005623088509229602, "loss": 4.7054, "step": 584 }, { "epoch": 0.1870914869375762, "grad_norm": 0.2906714379787445, "learning_rate": 0.0005621578662179247, "loss": 4.7288, "step": 585 }, { "epoch": 0.1874113014451618, "grad_norm": 0.3153475224971771, "learning_rate": 0.0005620066000609755, "loss": 4.7974, "step": 586 }, { "epoch": 0.1877311159527474, "grad_norm": 0.3062897324562073, "learning_rate": 0.0005618550526145113, "loss": 4.7636, "step": 587 }, { "epoch": 0.188050930460333, "grad_norm": 0.3051263988018036, "learning_rate": 0.0005617032240412329, "loss": 4.7146, "step": 588 }, { "epoch": 0.1883707449679186, "grad_norm": 0.3300788700580597, "learning_rate": 0.0005615511145041433, "loss": 4.7797, "step": 589 }, { "epoch": 0.1886905594755042, "grad_norm": 0.2894851863384247, "learning_rate": 0.0005613987241665468, "loss": 4.8063, "step": 590 }, { "epoch": 0.1890103739830898, "grad_norm": 0.35211434960365295, "learning_rate": 0.000561246053192049, "loss": 4.7154, "step": 591 }, { "epoch": 0.1893301884906754, "grad_norm": 0.2770704925060272, "learning_rate": 0.0005610931017445573, "loss": 4.763, "step": 592 }, { "epoch": 0.189650002998261, "grad_norm": 0.3280916213989258, "learning_rate": 0.0005609398699882796, "loss": 4.7627, "step": 593 }, { "epoch": 0.1899698175058466, "grad_norm": 0.2923208475112915, "learning_rate": 0.0005607863580877253, "loss": 4.8022, "step": 594 }, { "epoch": 0.19028963201343221, "grad_norm": 0.35542020201683044, "learning_rate": 0.0005606325662077042, "loss": 4.7335, "step": 595 }, { "epoch": 0.1906094465210178, "grad_norm": 0.2958972454071045, "learning_rate": 0.0005604784945133271, "loss": 4.6845, "step": 596 }, { "epoch": 0.19092926102860341, "grad_norm": 0.32878950238227844, "learning_rate": 0.0005603241431700045, "loss": 4.7863, "step": 597 }, { "epoch": 0.191249075536189, "grad_norm": 0.3126453757286072, "learning_rate": 0.0005601695123434477, "loss": 4.6924, "step": 598 }, { "epoch": 0.19156889004377461, "grad_norm": 0.3100943863391876, "learning_rate": 0.000560014602199668, "loss": 4.739, "step": 599 }, { "epoch": 0.1918887045513602, "grad_norm": 0.32049739360809326, "learning_rate": 0.0005598594129049765, "loss": 4.861, "step": 600 }, { "epoch": 0.1918887045513602, "eval_runtime": 43.661, "eval_samples_per_second": 43.448, "eval_steps_per_second": 10.879, "step": 600 }, { "epoch": 0.19220851905894581, "grad_norm": 0.29994630813598633, "learning_rate": 0.0005597039446259837, "loss": 4.8562, "step": 601 }, { "epoch": 0.1925283335665314, "grad_norm": 0.3189934492111206, "learning_rate": 0.0005595481975296002, "loss": 4.8536, "step": 602 }, { "epoch": 0.19284814807411702, "grad_norm": 0.2819173038005829, "learning_rate": 0.0005593921717830354, "loss": 4.7392, "step": 603 }, { "epoch": 0.1931679625817026, "grad_norm": 0.3349928557872772, "learning_rate": 0.0005592358675537983, "loss": 4.7877, "step": 604 }, { "epoch": 0.19348777708928822, "grad_norm": 0.32551273703575134, "learning_rate": 0.0005590792850096965, "loss": 4.731, "step": 605 }, { "epoch": 0.1938075915968738, "grad_norm": 0.32351675629615784, "learning_rate": 0.0005589224243188365, "loss": 4.7883, "step": 606 }, { "epoch": 0.19412740610445942, "grad_norm": 0.30613231658935547, "learning_rate": 0.0005587652856496236, "loss": 4.708, "step": 607 }, { "epoch": 0.194447220612045, "grad_norm": 0.2942259609699249, "learning_rate": 0.0005586078691707614, "loss": 4.6949, "step": 608 }, { "epoch": 0.19476703511963062, "grad_norm": 0.3091714084148407, "learning_rate": 0.0005584501750512516, "loss": 4.7987, "step": 609 }, { "epoch": 0.1950868496272162, "grad_norm": 0.29207688570022583, "learning_rate": 0.0005582922034603945, "loss": 4.7497, "step": 610 }, { "epoch": 0.19540666413480182, "grad_norm": 0.3144815266132355, "learning_rate": 0.0005581339545677877, "loss": 4.7766, "step": 611 }, { "epoch": 0.1957264786423874, "grad_norm": 0.2998262643814087, "learning_rate": 0.0005579754285433269, "loss": 4.7061, "step": 612 }, { "epoch": 0.19604629314997302, "grad_norm": 0.32424458861351013, "learning_rate": 0.0005578166255572048, "loss": 4.7937, "step": 613 }, { "epoch": 0.1963661076575586, "grad_norm": 0.3043598234653473, "learning_rate": 0.0005576575457799122, "loss": 4.7379, "step": 614 }, { "epoch": 0.19668592216514422, "grad_norm": 0.29779666662216187, "learning_rate": 0.0005574981893822365, "loss": 4.7057, "step": 615 }, { "epoch": 0.1970057366727298, "grad_norm": 0.28995975852012634, "learning_rate": 0.0005573385565352622, "loss": 4.7275, "step": 616 }, { "epoch": 0.19732555118031542, "grad_norm": 0.3021906614303589, "learning_rate": 0.0005571786474103709, "loss": 4.8216, "step": 617 }, { "epoch": 0.19764536568790103, "grad_norm": 0.3078979551792145, "learning_rate": 0.0005570184621792405, "loss": 4.6524, "step": 618 }, { "epoch": 0.19796518019548662, "grad_norm": 0.3037913143634796, "learning_rate": 0.0005568580010138452, "loss": 4.761, "step": 619 }, { "epoch": 0.19828499470307223, "grad_norm": 0.2881077229976654, "learning_rate": 0.0005566972640864558, "loss": 4.7218, "step": 620 }, { "epoch": 0.19860480921065782, "grad_norm": 0.3089888095855713, "learning_rate": 0.0005565362515696389, "loss": 4.7586, "step": 621 }, { "epoch": 0.19892462371824343, "grad_norm": 0.2849983274936676, "learning_rate": 0.0005563749636362572, "loss": 4.6937, "step": 622 }, { "epoch": 0.19924443822582902, "grad_norm": 0.3052695393562317, "learning_rate": 0.0005562134004594687, "loss": 4.7755, "step": 623 }, { "epoch": 0.19956425273341463, "grad_norm": 0.29641565680503845, "learning_rate": 0.0005560515622127276, "loss": 4.6769, "step": 624 }, { "epoch": 0.19988406724100022, "grad_norm": 0.31382811069488525, "learning_rate": 0.0005558894490697824, "loss": 4.6917, "step": 625 }, { "epoch": 0.20020388174858583, "grad_norm": 0.3017825484275818, "learning_rate": 0.0005557270612046777, "loss": 4.6935, "step": 626 }, { "epoch": 0.20052369625617142, "grad_norm": 0.29594001173973083, "learning_rate": 0.0005555643987917525, "loss": 4.6875, "step": 627 }, { "epoch": 0.20084351076375703, "grad_norm": 0.2880837619304657, "learning_rate": 0.0005554014620056406, "loss": 4.7043, "step": 628 }, { "epoch": 0.20116332527134262, "grad_norm": 0.29555588960647583, "learning_rate": 0.0005552382510212706, "loss": 4.6632, "step": 629 }, { "epoch": 0.20148313977892823, "grad_norm": 0.30356717109680176, "learning_rate": 0.0005550747660138653, "loss": 4.7396, "step": 630 }, { "epoch": 0.20180295428651382, "grad_norm": 0.28041693568229675, "learning_rate": 0.0005549110071589418, "loss": 4.7105, "step": 631 }, { "epoch": 0.20212276879409943, "grad_norm": 0.28780633211135864, "learning_rate": 0.0005547469746323109, "loss": 4.6914, "step": 632 }, { "epoch": 0.20244258330168502, "grad_norm": 0.28109389543533325, "learning_rate": 0.0005545826686100776, "loss": 4.7766, "step": 633 }, { "epoch": 0.20276239780927063, "grad_norm": 0.297075480222702, "learning_rate": 0.0005544180892686403, "loss": 4.6094, "step": 634 }, { "epoch": 0.20308221231685622, "grad_norm": 0.28346458077430725, "learning_rate": 0.000554253236784691, "loss": 4.6592, "step": 635 }, { "epoch": 0.20340202682444183, "grad_norm": 0.2926829159259796, "learning_rate": 0.0005540881113352148, "loss": 4.7178, "step": 636 }, { "epoch": 0.20372184133202742, "grad_norm": 0.30800721049308777, "learning_rate": 0.0005539227130974898, "loss": 4.7305, "step": 637 }, { "epoch": 0.20404165583961303, "grad_norm": 0.3070976436138153, "learning_rate": 0.0005537570422490871, "loss": 4.6457, "step": 638 }, { "epoch": 0.20436147034719862, "grad_norm": 0.30414000153541565, "learning_rate": 0.0005535910989678706, "loss": 4.6876, "step": 639 }, { "epoch": 0.20468128485478423, "grad_norm": 0.2945687472820282, "learning_rate": 0.0005534248834319962, "loss": 4.6796, "step": 640 }, { "epoch": 0.20500109936236982, "grad_norm": 0.2937379479408264, "learning_rate": 0.0005532583958199126, "loss": 4.6688, "step": 641 }, { "epoch": 0.20532091386995543, "grad_norm": 0.29582861065864563, "learning_rate": 0.0005530916363103605, "loss": 4.7749, "step": 642 }, { "epoch": 0.20564072837754102, "grad_norm": 0.310712993144989, "learning_rate": 0.0005529246050823723, "loss": 4.699, "step": 643 }, { "epoch": 0.20596054288512664, "grad_norm": 0.30164024233818054, "learning_rate": 0.0005527573023152722, "loss": 4.6708, "step": 644 }, { "epoch": 0.20628035739271222, "grad_norm": 0.2931421399116516, "learning_rate": 0.0005525897281886761, "loss": 4.6349, "step": 645 }, { "epoch": 0.20660017190029784, "grad_norm": 0.28729385137557983, "learning_rate": 0.000552421882882491, "loss": 4.6923, "step": 646 }, { "epoch": 0.20691998640788342, "grad_norm": 0.3129110634326935, "learning_rate": 0.000552253766576915, "loss": 4.6813, "step": 647 }, { "epoch": 0.20723980091546904, "grad_norm": 0.29494187235832214, "learning_rate": 0.0005520853794524375, "loss": 4.7932, "step": 648 }, { "epoch": 0.20755961542305462, "grad_norm": 0.37377920746803284, "learning_rate": 0.0005519167216898383, "loss": 4.7495, "step": 649 }, { "epoch": 0.20787942993064024, "grad_norm": 0.3005518913269043, "learning_rate": 0.0005517477934701879, "loss": 4.6161, "step": 650 }, { "epoch": 0.20819924443822582, "grad_norm": 0.289385586977005, "learning_rate": 0.0005515785949748471, "loss": 4.7144, "step": 651 }, { "epoch": 0.20851905894581144, "grad_norm": 0.2991845905780792, "learning_rate": 0.0005514091263854671, "loss": 4.6491, "step": 652 }, { "epoch": 0.20883887345339702, "grad_norm": 0.2991924583911896, "learning_rate": 0.0005512393878839885, "loss": 4.6294, "step": 653 }, { "epoch": 0.20915868796098264, "grad_norm": 0.2967759966850281, "learning_rate": 0.0005510693796526425, "loss": 4.5914, "step": 654 }, { "epoch": 0.20947850246856822, "grad_norm": 0.3146279454231262, "learning_rate": 0.000550899101873949, "loss": 4.7732, "step": 655 }, { "epoch": 0.20979831697615384, "grad_norm": 0.3095434904098511, "learning_rate": 0.0005507285547307181, "loss": 4.6648, "step": 656 }, { "epoch": 0.21011813148373942, "grad_norm": 0.30334800481796265, "learning_rate": 0.0005505577384060485, "loss": 4.672, "step": 657 }, { "epoch": 0.21043794599132504, "grad_norm": 0.3130626976490021, "learning_rate": 0.0005503866530833281, "loss": 4.7307, "step": 658 }, { "epoch": 0.21075776049891062, "grad_norm": 0.2948593497276306, "learning_rate": 0.0005502152989462337, "loss": 4.7609, "step": 659 }, { "epoch": 0.21107757500649624, "grad_norm": 0.2727290391921997, "learning_rate": 0.0005500436761787306, "loss": 4.6029, "step": 660 }, { "epoch": 0.21139738951408182, "grad_norm": 0.2965226471424103, "learning_rate": 0.0005498717849650724, "loss": 4.7178, "step": 661 }, { "epoch": 0.21171720402166744, "grad_norm": 0.29029273986816406, "learning_rate": 0.0005496996254898011, "loss": 4.7071, "step": 662 }, { "epoch": 0.21203701852925302, "grad_norm": 0.30917757749557495, "learning_rate": 0.0005495271979377464, "loss": 4.6762, "step": 663 }, { "epoch": 0.21235683303683864, "grad_norm": 0.2794337570667267, "learning_rate": 0.0005493545024940264, "loss": 4.6741, "step": 664 }, { "epoch": 0.21267664754442422, "grad_norm": 0.3020618259906769, "learning_rate": 0.000549181539344046, "loss": 4.6785, "step": 665 }, { "epoch": 0.21299646205200984, "grad_norm": 0.2958187758922577, "learning_rate": 0.0005490083086734982, "loss": 4.6894, "step": 666 }, { "epoch": 0.21331627655959542, "grad_norm": 0.3331874907016754, "learning_rate": 0.000548834810668363, "loss": 4.7506, "step": 667 }, { "epoch": 0.21363609106718104, "grad_norm": 0.2972181737422943, "learning_rate": 0.0005486610455149069, "loss": 4.746, "step": 668 }, { "epoch": 0.21395590557476662, "grad_norm": 0.33447733521461487, "learning_rate": 0.0005484870133996842, "loss": 4.7227, "step": 669 }, { "epoch": 0.21427572008235224, "grad_norm": 0.3140539526939392, "learning_rate": 0.0005483127145095349, "loss": 4.6576, "step": 670 }, { "epoch": 0.21459553458993783, "grad_norm": 0.3433819115161896, "learning_rate": 0.0005481381490315859, "loss": 4.7702, "step": 671 }, { "epoch": 0.21491534909752344, "grad_norm": 0.3164707124233246, "learning_rate": 0.0005479633171532503, "loss": 4.6362, "step": 672 }, { "epoch": 0.21523516360510903, "grad_norm": 0.3185357451438904, "learning_rate": 0.0005477882190622269, "loss": 4.6532, "step": 673 }, { "epoch": 0.21555497811269464, "grad_norm": 0.33554694056510925, "learning_rate": 0.0005476128549465006, "loss": 4.6832, "step": 674 }, { "epoch": 0.21587479262028023, "grad_norm": 0.3078819513320923, "learning_rate": 0.0005474372249943417, "loss": 4.7325, "step": 675 }, { "epoch": 0.21619460712786584, "grad_norm": 0.3173082768917084, "learning_rate": 0.0005472613293943062, "loss": 4.6965, "step": 676 }, { "epoch": 0.21651442163545143, "grad_norm": 0.32218828797340393, "learning_rate": 0.0005470851683352349, "loss": 4.6792, "step": 677 }, { "epoch": 0.21683423614303704, "grad_norm": 0.3067447543144226, "learning_rate": 0.0005469087420062538, "loss": 4.6965, "step": 678 }, { "epoch": 0.21715405065062263, "grad_norm": 0.304235577583313, "learning_rate": 0.0005467320505967739, "loss": 4.6229, "step": 679 }, { "epoch": 0.21747386515820824, "grad_norm": 0.31875747442245483, "learning_rate": 0.0005465550942964903, "loss": 4.6635, "step": 680 }, { "epoch": 0.21779367966579383, "grad_norm": 0.30761486291885376, "learning_rate": 0.000546377873295383, "loss": 4.5984, "step": 681 }, { "epoch": 0.21811349417337944, "grad_norm": 0.3020329475402832, "learning_rate": 0.0005462003877837157, "loss": 4.7169, "step": 682 }, { "epoch": 0.21843330868096503, "grad_norm": 0.33058735728263855, "learning_rate": 0.0005460226379520365, "loss": 4.7092, "step": 683 }, { "epoch": 0.21875312318855064, "grad_norm": 0.3102986216545105, "learning_rate": 0.0005458446239911772, "loss": 4.6436, "step": 684 }, { "epoch": 0.21907293769613623, "grad_norm": 0.29908737540245056, "learning_rate": 0.0005456663460922528, "loss": 4.7082, "step": 685 }, { "epoch": 0.21939275220372184, "grad_norm": 0.32533496618270874, "learning_rate": 0.000545487804446662, "loss": 4.6387, "step": 686 }, { "epoch": 0.21971256671130746, "grad_norm": 0.3325665295124054, "learning_rate": 0.0005453089992460868, "loss": 4.7023, "step": 687 }, { "epoch": 0.22003238121889304, "grad_norm": 0.3323059380054474, "learning_rate": 0.0005451299306824917, "loss": 4.6604, "step": 688 }, { "epoch": 0.22035219572647866, "grad_norm": 0.2909764051437378, "learning_rate": 0.0005449505989481243, "loss": 4.6913, "step": 689 }, { "epoch": 0.22067201023406424, "grad_norm": 0.31438586115837097, "learning_rate": 0.0005447710042355145, "loss": 4.6227, "step": 690 }, { "epoch": 0.22099182474164986, "grad_norm": 0.31848856806755066, "learning_rate": 0.0005445911467374747, "loss": 4.6928, "step": 691 }, { "epoch": 0.22131163924923544, "grad_norm": 0.32235923409461975, "learning_rate": 0.0005444110266470995, "loss": 4.7288, "step": 692 }, { "epoch": 0.22163145375682106, "grad_norm": 0.3019062876701355, "learning_rate": 0.0005442306441577651, "loss": 4.8696, "step": 693 }, { "epoch": 0.22195126826440664, "grad_norm": 0.2892726957798004, "learning_rate": 0.0005440499994631299, "loss": 4.6942, "step": 694 }, { "epoch": 0.22227108277199226, "grad_norm": 0.31549590826034546, "learning_rate": 0.0005438690927571332, "loss": 4.6131, "step": 695 }, { "epoch": 0.22259089727957784, "grad_norm": 0.29862073063850403, "learning_rate": 0.000543687924233996, "loss": 4.5702, "step": 696 }, { "epoch": 0.22291071178716346, "grad_norm": 0.296994149684906, "learning_rate": 0.0005435064940882204, "loss": 4.6865, "step": 697 }, { "epoch": 0.22323052629474904, "grad_norm": 0.3024628460407257, "learning_rate": 0.0005433248025145894, "loss": 4.5858, "step": 698 }, { "epoch": 0.22355034080233466, "grad_norm": 0.29701897501945496, "learning_rate": 0.0005431428497081661, "loss": 4.6855, "step": 699 }, { "epoch": 0.22387015530992024, "grad_norm": 0.30706948041915894, "learning_rate": 0.0005429606358642948, "loss": 4.5519, "step": 700 }, { "epoch": 0.22387015530992024, "eval_runtime": 42.4426, "eval_samples_per_second": 44.696, "eval_steps_per_second": 11.192, "step": 700 }, { "epoch": 0.22418996981750586, "grad_norm": 0.31330904364585876, "learning_rate": 0.0005427781611785998, "loss": 4.5952, "step": 701 }, { "epoch": 0.22450978432509144, "grad_norm": 0.3005469739437103, "learning_rate": 0.0005425954258469852, "loss": 4.6477, "step": 702 }, { "epoch": 0.22482959883267706, "grad_norm": 0.27588513493537903, "learning_rate": 0.000542412430065635, "loss": 4.65, "step": 703 }, { "epoch": 0.22514941334026264, "grad_norm": 0.2997315227985382, "learning_rate": 0.0005422291740310134, "loss": 4.6089, "step": 704 }, { "epoch": 0.22546922784784826, "grad_norm": 0.28503403067588806, "learning_rate": 0.0005420456579398632, "loss": 4.6205, "step": 705 }, { "epoch": 0.22578904235543384, "grad_norm": 0.4010476768016815, "learning_rate": 0.0005418618819892067, "loss": 4.6744, "step": 706 }, { "epoch": 0.22610885686301946, "grad_norm": 0.2963140606880188, "learning_rate": 0.0005416778463763454, "loss": 4.6059, "step": 707 }, { "epoch": 0.22642867137060504, "grad_norm": 0.30308544635772705, "learning_rate": 0.0005414935512988593, "loss": 4.6381, "step": 708 }, { "epoch": 0.22674848587819066, "grad_norm": 0.2945327162742615, "learning_rate": 0.0005413089969546071, "loss": 4.7126, "step": 709 }, { "epoch": 0.22706830038577624, "grad_norm": 0.31786808371543884, "learning_rate": 0.0005411241835417256, "loss": 4.6997, "step": 710 }, { "epoch": 0.22738811489336186, "grad_norm": 0.28789451718330383, "learning_rate": 0.0005409391112586303, "loss": 4.6289, "step": 711 }, { "epoch": 0.22770792940094745, "grad_norm": 0.3239929974079132, "learning_rate": 0.0005407537803040139, "loss": 4.6015, "step": 712 }, { "epoch": 0.22802774390853306, "grad_norm": 0.3091432750225067, "learning_rate": 0.0005405681908768475, "loss": 4.5794, "step": 713 }, { "epoch": 0.22834755841611865, "grad_norm": 0.29155635833740234, "learning_rate": 0.0005403823431763791, "loss": 4.6651, "step": 714 }, { "epoch": 0.22866737292370426, "grad_norm": 0.2865404486656189, "learning_rate": 0.0005401962374021342, "loss": 4.5989, "step": 715 }, { "epoch": 0.22898718743128985, "grad_norm": 0.3179633617401123, "learning_rate": 0.0005400098737539157, "loss": 4.7592, "step": 716 }, { "epoch": 0.22930700193887546, "grad_norm": 0.2965264916419983, "learning_rate": 0.0005398232524318029, "loss": 4.6123, "step": 717 }, { "epoch": 0.22962681644646105, "grad_norm": 0.3025377094745636, "learning_rate": 0.0005396363736361519, "loss": 4.6592, "step": 718 }, { "epoch": 0.22994663095404666, "grad_norm": 0.3015200197696686, "learning_rate": 0.0005394492375675953, "loss": 4.5654, "step": 719 }, { "epoch": 0.23026644546163225, "grad_norm": 0.31419816613197327, "learning_rate": 0.0005392618444270417, "loss": 4.5821, "step": 720 }, { "epoch": 0.23058625996921786, "grad_norm": 0.33039045333862305, "learning_rate": 0.0005390741944156759, "loss": 4.6946, "step": 721 }, { "epoch": 0.23090607447680345, "grad_norm": 0.3312971293926239, "learning_rate": 0.0005388862877349584, "loss": 4.6006, "step": 722 }, { "epoch": 0.23122588898438906, "grad_norm": 0.3019008934497833, "learning_rate": 0.0005386981245866252, "loss": 4.6071, "step": 723 }, { "epoch": 0.23154570349197465, "grad_norm": 0.29676640033721924, "learning_rate": 0.0005385097051726879, "loss": 4.6217, "step": 724 }, { "epoch": 0.23186551799956026, "grad_norm": 0.29920849204063416, "learning_rate": 0.0005383210296954328, "loss": 4.6746, "step": 725 }, { "epoch": 0.23218533250714585, "grad_norm": 0.31218525767326355, "learning_rate": 0.0005381320983574214, "loss": 4.6973, "step": 726 }, { "epoch": 0.23250514701473146, "grad_norm": 0.30252861976623535, "learning_rate": 0.0005379429113614898, "loss": 4.7121, "step": 727 }, { "epoch": 0.23282496152231705, "grad_norm": 0.3055766522884369, "learning_rate": 0.0005377534689107487, "loss": 4.5993, "step": 728 }, { "epoch": 0.23314477602990266, "grad_norm": 0.3270404636859894, "learning_rate": 0.0005375637712085829, "loss": 4.5425, "step": 729 }, { "epoch": 0.23346459053748825, "grad_norm": 0.2844412326812744, "learning_rate": 0.0005373738184586514, "loss": 4.6039, "step": 730 }, { "epoch": 0.23378440504507386, "grad_norm": 0.2935783267021179, "learning_rate": 0.0005371836108648868, "loss": 4.6469, "step": 731 }, { "epoch": 0.23410421955265945, "grad_norm": 0.27497968077659607, "learning_rate": 0.0005369931486314953, "loss": 4.586, "step": 732 }, { "epoch": 0.23442403406024506, "grad_norm": 0.30102384090423584, "learning_rate": 0.0005368024319629569, "loss": 4.7417, "step": 733 }, { "epoch": 0.23474384856783065, "grad_norm": 0.3343384563922882, "learning_rate": 0.0005366114610640241, "loss": 4.6293, "step": 734 }, { "epoch": 0.23506366307541626, "grad_norm": 0.29406625032424927, "learning_rate": 0.000536420236139723, "loss": 4.5799, "step": 735 }, { "epoch": 0.23538347758300185, "grad_norm": 0.3229942321777344, "learning_rate": 0.000536228757395352, "loss": 4.548, "step": 736 }, { "epoch": 0.23570329209058746, "grad_norm": 0.2915140390396118, "learning_rate": 0.000536037025036482, "loss": 4.6266, "step": 737 }, { "epoch": 0.23602310659817305, "grad_norm": 0.3146757185459137, "learning_rate": 0.0005358450392689564, "loss": 4.5522, "step": 738 }, { "epoch": 0.23634292110575866, "grad_norm": 0.29982420802116394, "learning_rate": 0.0005356528002988907, "loss": 4.6231, "step": 739 }, { "epoch": 0.23666273561334425, "grad_norm": 0.3044486343860626, "learning_rate": 0.000535460308332672, "loss": 4.643, "step": 740 }, { "epoch": 0.23698255012092986, "grad_norm": 0.29942041635513306, "learning_rate": 0.0005352675635769589, "loss": 4.6973, "step": 741 }, { "epoch": 0.23730236462851545, "grad_norm": 0.30295315384864807, "learning_rate": 0.0005350745662386818, "loss": 4.61, "step": 742 }, { "epoch": 0.23762217913610106, "grad_norm": 0.2998339831829071, "learning_rate": 0.000534881316525042, "loss": 4.6015, "step": 743 }, { "epoch": 0.23794199364368665, "grad_norm": 0.32832202315330505, "learning_rate": 0.0005346878146435119, "loss": 4.599, "step": 744 }, { "epoch": 0.23826180815127226, "grad_norm": 0.30961769819259644, "learning_rate": 0.0005344940608018345, "loss": 4.5964, "step": 745 }, { "epoch": 0.23858162265885785, "grad_norm": 0.2965133786201477, "learning_rate": 0.0005343000552080235, "loss": 4.6187, "step": 746 }, { "epoch": 0.23890143716644346, "grad_norm": 0.3049655258655548, "learning_rate": 0.0005341057980703624, "loss": 4.5913, "step": 747 }, { "epoch": 0.23922125167402905, "grad_norm": 0.2973322868347168, "learning_rate": 0.0005339112895974054, "loss": 4.6168, "step": 748 }, { "epoch": 0.23954106618161466, "grad_norm": 0.29436466097831726, "learning_rate": 0.0005337165299979761, "loss": 4.5937, "step": 749 }, { "epoch": 0.23986088068920025, "grad_norm": 0.31333065032958984, "learning_rate": 0.0005335215194811678, "loss": 4.6051, "step": 750 }, { "epoch": 0.24018069519678586, "grad_norm": 0.30850592255592346, "learning_rate": 0.0005333262582563434, "loss": 4.5975, "step": 751 }, { "epoch": 0.24050050970437145, "grad_norm": 0.29627135396003723, "learning_rate": 0.0005331307465331346, "loss": 4.6516, "step": 752 }, { "epoch": 0.24082032421195707, "grad_norm": 0.31854382157325745, "learning_rate": 0.0005329349845214421, "loss": 4.5914, "step": 753 }, { "epoch": 0.24114013871954265, "grad_norm": 0.2998705506324768, "learning_rate": 0.0005327389724314357, "loss": 4.6102, "step": 754 }, { "epoch": 0.24145995322712827, "grad_norm": 0.2999056279659271, "learning_rate": 0.0005325427104735533, "loss": 4.5066, "step": 755 }, { "epoch": 0.24177976773471388, "grad_norm": 0.3036806583404541, "learning_rate": 0.0005323461988585011, "loss": 4.5633, "step": 756 }, { "epoch": 0.24209958224229947, "grad_norm": 0.30324384570121765, "learning_rate": 0.0005321494377972534, "loss": 4.5776, "step": 757 }, { "epoch": 0.24241939674988508, "grad_norm": 0.31028205156326294, "learning_rate": 0.0005319524275010524, "loss": 4.5835, "step": 758 }, { "epoch": 0.24273921125747067, "grad_norm": 0.3162449896335602, "learning_rate": 0.0005317551681814076, "loss": 4.6091, "step": 759 }, { "epoch": 0.24305902576505628, "grad_norm": 0.30578404664993286, "learning_rate": 0.0005315576600500962, "loss": 4.6308, "step": 760 }, { "epoch": 0.24337884027264187, "grad_norm": 0.2690615653991699, "learning_rate": 0.0005313599033191622, "loss": 4.5216, "step": 761 }, { "epoch": 0.24369865478022748, "grad_norm": 0.3034672439098358, "learning_rate": 0.0005311618982009168, "loss": 4.631, "step": 762 }, { "epoch": 0.24401846928781307, "grad_norm": 0.3006930947303772, "learning_rate": 0.0005309636449079377, "loss": 4.627, "step": 763 }, { "epoch": 0.24433828379539868, "grad_norm": 0.3088935613632202, "learning_rate": 0.0005307651436530688, "loss": 4.6071, "step": 764 }, { "epoch": 0.24465809830298427, "grad_norm": 0.29248863458633423, "learning_rate": 0.0005305663946494208, "loss": 4.5153, "step": 765 }, { "epoch": 0.24497791281056988, "grad_norm": 0.3239281177520752, "learning_rate": 0.0005303673981103698, "loss": 4.5689, "step": 766 }, { "epoch": 0.24529772731815547, "grad_norm": 0.30040568113327026, "learning_rate": 0.000530168154249558, "loss": 4.5336, "step": 767 }, { "epoch": 0.24561754182574108, "grad_norm": 0.3012166917324066, "learning_rate": 0.000529968663280893, "loss": 4.57, "step": 768 }, { "epoch": 0.24593735633332667, "grad_norm": 0.32919690012931824, "learning_rate": 0.0005297689254185478, "loss": 4.5974, "step": 769 }, { "epoch": 0.24625717084091228, "grad_norm": 0.2880009114742279, "learning_rate": 0.0005295689408769602, "loss": 4.5903, "step": 770 }, { "epoch": 0.24657698534849787, "grad_norm": 0.3026755452156067, "learning_rate": 0.0005293687098708332, "loss": 4.6259, "step": 771 }, { "epoch": 0.24689679985608348, "grad_norm": 0.30567294359207153, "learning_rate": 0.0005291682326151342, "loss": 4.5785, "step": 772 }, { "epoch": 0.24721661436366907, "grad_norm": 0.31355470418930054, "learning_rate": 0.0005289675093250949, "loss": 4.6089, "step": 773 }, { "epoch": 0.24753642887125468, "grad_norm": 0.30152633786201477, "learning_rate": 0.0005287665402162112, "loss": 4.558, "step": 774 }, { "epoch": 0.24785624337884027, "grad_norm": 0.30632874369621277, "learning_rate": 0.0005285653255042432, "loss": 4.5721, "step": 775 }, { "epoch": 0.24817605788642588, "grad_norm": 0.33047720789909363, "learning_rate": 0.0005283638654052141, "loss": 4.5352, "step": 776 }, { "epoch": 0.24849587239401147, "grad_norm": 0.30601057410240173, "learning_rate": 0.000528162160135411, "loss": 4.5966, "step": 777 }, { "epoch": 0.24881568690159708, "grad_norm": 0.3034043610095978, "learning_rate": 0.000527960209911384, "loss": 4.5561, "step": 778 }, { "epoch": 0.24913550140918267, "grad_norm": 0.2902142405509949, "learning_rate": 0.0005277580149499465, "loss": 4.5624, "step": 779 }, { "epoch": 0.24945531591676828, "grad_norm": 0.3587496280670166, "learning_rate": 0.0005275555754681742, "loss": 4.562, "step": 780 }, { "epoch": 0.24977513042435387, "grad_norm": 0.2910183370113373, "learning_rate": 0.0005273528916834056, "loss": 4.5076, "step": 781 }, { "epoch": 0.25009494493193946, "grad_norm": 0.29718324542045593, "learning_rate": 0.0005271499638132415, "loss": 4.5528, "step": 782 }, { "epoch": 0.25041475943952507, "grad_norm": 0.30006152391433716, "learning_rate": 0.0005269467920755446, "loss": 4.5877, "step": 783 }, { "epoch": 0.2507345739471107, "grad_norm": 0.29712775349617004, "learning_rate": 0.0005267433766884394, "loss": 4.5218, "step": 784 }, { "epoch": 0.2510543884546963, "grad_norm": 0.29716482758522034, "learning_rate": 0.0005265397178703122, "loss": 4.4928, "step": 785 }, { "epoch": 0.25137420296228186, "grad_norm": 0.28058889508247375, "learning_rate": 0.0005263358158398104, "loss": 4.4912, "step": 786 }, { "epoch": 0.25169401746986747, "grad_norm": 0.3047398030757904, "learning_rate": 0.0005261316708158426, "loss": 4.5619, "step": 787 }, { "epoch": 0.2520138319774531, "grad_norm": 0.3007103204727173, "learning_rate": 0.0005259272830175784, "loss": 4.526, "step": 788 }, { "epoch": 0.2523336464850387, "grad_norm": 0.2976570725440979, "learning_rate": 0.0005257226526644478, "loss": 4.5989, "step": 789 }, { "epoch": 0.25265346099262426, "grad_norm": 0.2854878306388855, "learning_rate": 0.0005255177799761416, "loss": 4.562, "step": 790 }, { "epoch": 0.25297327550020987, "grad_norm": 0.29306548833847046, "learning_rate": 0.0005253126651726102, "loss": 4.5492, "step": 791 }, { "epoch": 0.2532930900077955, "grad_norm": 0.3061801791191101, "learning_rate": 0.0005251073084740646, "loss": 4.5475, "step": 792 }, { "epoch": 0.2536129045153811, "grad_norm": 0.3675724267959595, "learning_rate": 0.0005249017101009747, "loss": 4.571, "step": 793 }, { "epoch": 0.25393271902296666, "grad_norm": 0.3222801983356476, "learning_rate": 0.0005246958702740707, "loss": 4.544, "step": 794 }, { "epoch": 0.25425253353055227, "grad_norm": 0.3157154619693756, "learning_rate": 0.0005244897892143414, "loss": 4.4648, "step": 795 }, { "epoch": 0.2545723480381379, "grad_norm": 0.3390721082687378, "learning_rate": 0.0005242834671430349, "loss": 4.5567, "step": 796 }, { "epoch": 0.2548921625457235, "grad_norm": 0.3196147382259369, "learning_rate": 0.0005240769042816581, "loss": 4.4746, "step": 797 }, { "epoch": 0.25521197705330906, "grad_norm": 0.28795325756073, "learning_rate": 0.0005238701008519761, "loss": 4.4333, "step": 798 }, { "epoch": 0.25553179156089467, "grad_norm": 0.29653698205947876, "learning_rate": 0.0005236630570760126, "loss": 4.5217, "step": 799 }, { "epoch": 0.2558516060684803, "grad_norm": 0.3081868886947632, "learning_rate": 0.0005234557731760489, "loss": 4.5709, "step": 800 }, { "epoch": 0.2558516060684803, "eval_runtime": 43.8338, "eval_samples_per_second": 43.277, "eval_steps_per_second": 10.836, "step": 800 }, { "epoch": 0.2561714205760659, "grad_norm": 0.30065667629241943, "learning_rate": 0.0005232482493746247, "loss": 4.5754, "step": 801 }, { "epoch": 0.25649123508365146, "grad_norm": 0.303011953830719, "learning_rate": 0.0005230404858945369, "loss": 4.5989, "step": 802 }, { "epoch": 0.2568110495912371, "grad_norm": 0.29960060119628906, "learning_rate": 0.0005228324829588396, "loss": 4.5953, "step": 803 }, { "epoch": 0.2571308640988227, "grad_norm": 0.306133508682251, "learning_rate": 0.0005226242407908441, "loss": 4.5284, "step": 804 }, { "epoch": 0.2574506786064083, "grad_norm": 0.3094584047794342, "learning_rate": 0.0005224157596141189, "loss": 4.5089, "step": 805 }, { "epoch": 0.25777049311399386, "grad_norm": 0.29571986198425293, "learning_rate": 0.0005222070396524886, "loss": 4.4823, "step": 806 }, { "epoch": 0.2580903076215795, "grad_norm": 0.3143523633480072, "learning_rate": 0.0005219980811300342, "loss": 4.7122, "step": 807 }, { "epoch": 0.2584101221291651, "grad_norm": 0.31211498379707336, "learning_rate": 0.0005217888842710931, "loss": 4.5949, "step": 808 }, { "epoch": 0.2587299366367507, "grad_norm": 0.2941458821296692, "learning_rate": 0.0005215794493002583, "loss": 4.4566, "step": 809 }, { "epoch": 0.25904975114433626, "grad_norm": 0.2992144227027893, "learning_rate": 0.000521369776442379, "loss": 4.5316, "step": 810 }, { "epoch": 0.2593695656519219, "grad_norm": 0.303811639547348, "learning_rate": 0.0005211598659225588, "loss": 4.5409, "step": 811 }, { "epoch": 0.2596893801595075, "grad_norm": 0.3056770861148834, "learning_rate": 0.0005209497179661573, "loss": 4.5962, "step": 812 }, { "epoch": 0.2600091946670931, "grad_norm": 0.28062760829925537, "learning_rate": 0.0005207393327987886, "loss": 4.5057, "step": 813 }, { "epoch": 0.26032900917467866, "grad_norm": 0.2896265387535095, "learning_rate": 0.0005205287106463219, "loss": 4.5084, "step": 814 }, { "epoch": 0.2606488236822643, "grad_norm": 0.31087687611579895, "learning_rate": 0.0005203178517348801, "loss": 4.5997, "step": 815 }, { "epoch": 0.2609686381898499, "grad_norm": 0.2979047894477844, "learning_rate": 0.0005201067562908409, "loss": 4.5078, "step": 816 }, { "epoch": 0.2612884526974355, "grad_norm": 0.3160172998905182, "learning_rate": 0.0005198954245408359, "loss": 4.4875, "step": 817 }, { "epoch": 0.26160826720502106, "grad_norm": 0.2979666292667389, "learning_rate": 0.00051968385671175, "loss": 4.5275, "step": 818 }, { "epoch": 0.2619280817126067, "grad_norm": 0.2992227077484131, "learning_rate": 0.000519472053030722, "loss": 4.5082, "step": 819 }, { "epoch": 0.2622478962201923, "grad_norm": 0.3047192692756653, "learning_rate": 0.0005192600137251435, "loss": 4.52, "step": 820 }, { "epoch": 0.2625677107277779, "grad_norm": 0.32497283816337585, "learning_rate": 0.0005190477390226595, "loss": 4.609, "step": 821 }, { "epoch": 0.26288752523536346, "grad_norm": 0.32369890809059143, "learning_rate": 0.0005188352291511673, "loss": 4.4654, "step": 822 }, { "epoch": 0.2632073397429491, "grad_norm": 0.30935385823249817, "learning_rate": 0.000518622484338817, "loss": 4.5397, "step": 823 }, { "epoch": 0.2635271542505347, "grad_norm": 0.3115449845790863, "learning_rate": 0.0005184095048140106, "loss": 4.5532, "step": 824 }, { "epoch": 0.2638469687581203, "grad_norm": 0.29413673281669617, "learning_rate": 0.0005181962908054027, "loss": 4.5763, "step": 825 }, { "epoch": 0.2641667832657059, "grad_norm": 0.2929513156414032, "learning_rate": 0.0005179828425418988, "loss": 4.5208, "step": 826 }, { "epoch": 0.2644865977732915, "grad_norm": 0.3271793723106384, "learning_rate": 0.0005177691602526566, "loss": 4.4725, "step": 827 }, { "epoch": 0.2648064122808771, "grad_norm": 0.3443414568901062, "learning_rate": 0.0005175552441670847, "loss": 4.5058, "step": 828 }, { "epoch": 0.2651262267884627, "grad_norm": 0.31311389803886414, "learning_rate": 0.0005173410945148427, "loss": 4.571, "step": 829 }, { "epoch": 0.2654460412960483, "grad_norm": 0.3115324079990387, "learning_rate": 0.0005171267115258412, "loss": 4.5391, "step": 830 }, { "epoch": 0.2657658558036339, "grad_norm": 0.3240220844745636, "learning_rate": 0.0005169120954302409, "loss": 4.5257, "step": 831 }, { "epoch": 0.2660856703112195, "grad_norm": 0.3160390853881836, "learning_rate": 0.0005166972464584532, "loss": 4.4549, "step": 832 }, { "epoch": 0.2664054848188051, "grad_norm": 0.31099945306777954, "learning_rate": 0.0005164821648411394, "loss": 4.5003, "step": 833 }, { "epoch": 0.2667252993263907, "grad_norm": 0.30797210335731506, "learning_rate": 0.0005162668508092103, "loss": 4.4093, "step": 834 }, { "epoch": 0.2670451138339763, "grad_norm": 0.34178709983825684, "learning_rate": 0.0005160513045938265, "loss": 4.5046, "step": 835 }, { "epoch": 0.2673649283415619, "grad_norm": 0.29707714915275574, "learning_rate": 0.0005158355264263978, "loss": 4.5163, "step": 836 }, { "epoch": 0.2676847428491475, "grad_norm": 0.3115328252315521, "learning_rate": 0.0005156195165385829, "loss": 4.496, "step": 837 }, { "epoch": 0.2680045573567331, "grad_norm": 0.30233150720596313, "learning_rate": 0.0005154032751622894, "loss": 4.5185, "step": 838 }, { "epoch": 0.2683243718643187, "grad_norm": 0.30327290296554565, "learning_rate": 0.0005151868025296736, "loss": 4.5897, "step": 839 }, { "epoch": 0.2686441863719043, "grad_norm": 0.3013684153556824, "learning_rate": 0.0005149700988731397, "loss": 4.4728, "step": 840 }, { "epoch": 0.2689640008794899, "grad_norm": 0.370692640542984, "learning_rate": 0.0005147531644253402, "loss": 4.5868, "step": 841 }, { "epoch": 0.2692838153870755, "grad_norm": 0.3172016739845276, "learning_rate": 0.0005145359994191751, "loss": 4.5846, "step": 842 }, { "epoch": 0.2696036298946611, "grad_norm": 0.3058904707431793, "learning_rate": 0.0005143186040877923, "loss": 4.4485, "step": 843 }, { "epoch": 0.2699234444022467, "grad_norm": 0.29665982723236084, "learning_rate": 0.0005141009786645868, "loss": 4.4847, "step": 844 }, { "epoch": 0.2702432589098323, "grad_norm": 0.3275633156299591, "learning_rate": 0.0005138831233832005, "loss": 4.5423, "step": 845 }, { "epoch": 0.2705630734174179, "grad_norm": 0.33318156003952026, "learning_rate": 0.0005136650384775221, "loss": 4.4994, "step": 846 }, { "epoch": 0.2708828879250035, "grad_norm": 0.3003494441509247, "learning_rate": 0.0005134467241816872, "loss": 4.4761, "step": 847 }, { "epoch": 0.2712027024325891, "grad_norm": 0.30181971192359924, "learning_rate": 0.0005132281807300773, "loss": 4.5881, "step": 848 }, { "epoch": 0.2715225169401747, "grad_norm": 0.3056182861328125, "learning_rate": 0.0005130094083573198, "loss": 4.4947, "step": 849 }, { "epoch": 0.2718423314477603, "grad_norm": 0.29251542687416077, "learning_rate": 0.0005127904072982884, "loss": 4.516, "step": 850 }, { "epoch": 0.2721621459553459, "grad_norm": 0.30300524830818176, "learning_rate": 0.0005125711777881016, "loss": 4.4637, "step": 851 }, { "epoch": 0.2724819604629315, "grad_norm": 0.3046075105667114, "learning_rate": 0.0005123517200621238, "loss": 4.5179, "step": 852 }, { "epoch": 0.2728017749705171, "grad_norm": 0.3496110439300537, "learning_rate": 0.0005121320343559641, "loss": 4.4844, "step": 853 }, { "epoch": 0.2731215894781027, "grad_norm": 0.3065270185470581, "learning_rate": 0.0005119121209054767, "loss": 4.5721, "step": 854 }, { "epoch": 0.2734414039856883, "grad_norm": 0.3314630091190338, "learning_rate": 0.0005116919799467597, "loss": 4.5692, "step": 855 }, { "epoch": 0.2737612184932739, "grad_norm": 0.3025791645050049, "learning_rate": 0.0005114716117161558, "loss": 4.5017, "step": 856 }, { "epoch": 0.2740810330008595, "grad_norm": 0.2894890308380127, "learning_rate": 0.0005112510164502518, "loss": 4.4788, "step": 857 }, { "epoch": 0.2744008475084451, "grad_norm": 0.3204091191291809, "learning_rate": 0.000511030194385878, "loss": 4.4753, "step": 858 }, { "epoch": 0.2747206620160307, "grad_norm": 0.3035833537578583, "learning_rate": 0.0005108091457601085, "loss": 4.4445, "step": 859 }, { "epoch": 0.2750404765236163, "grad_norm": 0.30635136365890503, "learning_rate": 0.0005105878708102604, "loss": 4.488, "step": 860 }, { "epoch": 0.2753602910312019, "grad_norm": 0.36212730407714844, "learning_rate": 0.0005103663697738937, "loss": 4.4256, "step": 861 }, { "epoch": 0.2756801055387875, "grad_norm": 0.3171091377735138, "learning_rate": 0.0005101446428888115, "loss": 4.4777, "step": 862 }, { "epoch": 0.2759999200463731, "grad_norm": 0.3786751329898834, "learning_rate": 0.0005099226903930589, "loss": 4.522, "step": 863 }, { "epoch": 0.2763197345539587, "grad_norm": 0.33254367113113403, "learning_rate": 0.0005097005125249236, "loss": 4.4385, "step": 864 }, { "epoch": 0.2766395490615443, "grad_norm": 0.3374261260032654, "learning_rate": 0.0005094781095229352, "loss": 4.4368, "step": 865 }, { "epoch": 0.2769593635691299, "grad_norm": 0.298818975687027, "learning_rate": 0.0005092554816258644, "loss": 4.472, "step": 866 }, { "epoch": 0.2772791780767155, "grad_norm": 0.3123992383480072, "learning_rate": 0.0005090326290727245, "loss": 4.4893, "step": 867 }, { "epoch": 0.2775989925843011, "grad_norm": 0.3864470422267914, "learning_rate": 0.0005088095521027689, "loss": 4.4666, "step": 868 }, { "epoch": 0.2779188070918867, "grad_norm": 0.3071383237838745, "learning_rate": 0.0005085862509554926, "loss": 4.4342, "step": 869 }, { "epoch": 0.2782386215994723, "grad_norm": 0.3254585564136505, "learning_rate": 0.000508362725870631, "loss": 4.445, "step": 870 }, { "epoch": 0.2785584361070579, "grad_norm": 0.31379327178001404, "learning_rate": 0.0005081389770881599, "loss": 4.5255, "step": 871 }, { "epoch": 0.2788782506146435, "grad_norm": 0.32465967535972595, "learning_rate": 0.0005079150048482954, "loss": 4.4083, "step": 872 }, { "epoch": 0.2791980651222291, "grad_norm": 0.2967774569988251, "learning_rate": 0.0005076908093914936, "loss": 4.4684, "step": 873 }, { "epoch": 0.2795178796298147, "grad_norm": 0.3091309070587158, "learning_rate": 0.0005074663909584498, "loss": 4.4727, "step": 874 }, { "epoch": 0.2798376941374003, "grad_norm": 0.31195878982543945, "learning_rate": 0.000507241749790099, "loss": 4.4643, "step": 875 }, { "epoch": 0.2801575086449859, "grad_norm": 0.3253560960292816, "learning_rate": 0.0005070168861276155, "loss": 4.493, "step": 876 }, { "epoch": 0.2804773231525715, "grad_norm": 0.30696171522140503, "learning_rate": 0.0005067918002124121, "loss": 4.4351, "step": 877 }, { "epoch": 0.2807971376601571, "grad_norm": 0.296865314245224, "learning_rate": 0.0005065664922861405, "loss": 4.5331, "step": 878 }, { "epoch": 0.2811169521677427, "grad_norm": 0.3351920545101166, "learning_rate": 0.0005063409625906905, "loss": 4.453, "step": 879 }, { "epoch": 0.2814367666753283, "grad_norm": 0.3060401678085327, "learning_rate": 0.0005061152113681901, "loss": 4.4676, "step": 880 }, { "epoch": 0.2817565811829139, "grad_norm": 0.3067130148410797, "learning_rate": 0.0005058892388610053, "loss": 4.4903, "step": 881 }, { "epoch": 0.2820763956904995, "grad_norm": 0.29070737957954407, "learning_rate": 0.0005056630453117394, "loss": 4.4948, "step": 882 }, { "epoch": 0.2823962101980851, "grad_norm": 0.28819337487220764, "learning_rate": 0.0005054366309632333, "loss": 4.507, "step": 883 }, { "epoch": 0.2827160247056707, "grad_norm": 0.318706750869751, "learning_rate": 0.0005052099960585645, "loss": 4.4734, "step": 884 }, { "epoch": 0.2830358392132563, "grad_norm": 0.31502071022987366, "learning_rate": 0.0005049831408410478, "loss": 4.5008, "step": 885 }, { "epoch": 0.2833556537208419, "grad_norm": 0.31208616495132446, "learning_rate": 0.0005047560655542342, "loss": 4.4481, "step": 886 }, { "epoch": 0.2836754682284275, "grad_norm": 0.3279353976249695, "learning_rate": 0.000504528770441911, "loss": 4.4686, "step": 887 }, { "epoch": 0.2839952827360131, "grad_norm": 0.30125418305397034, "learning_rate": 0.0005043012557481016, "loss": 4.4647, "step": 888 }, { "epoch": 0.2843150972435987, "grad_norm": 0.31265607476234436, "learning_rate": 0.0005040735217170653, "loss": 4.4795, "step": 889 }, { "epoch": 0.2846349117511843, "grad_norm": 0.2953793406486511, "learning_rate": 0.0005038455685932964, "loss": 4.4573, "step": 890 }, { "epoch": 0.2849547262587699, "grad_norm": 0.3046894073486328, "learning_rate": 0.0005036173966215248, "loss": 4.403, "step": 891 }, { "epoch": 0.2852745407663555, "grad_norm": 0.2988243103027344, "learning_rate": 0.0005033890060467153, "loss": 4.4117, "step": 892 }, { "epoch": 0.2855943552739411, "grad_norm": 0.2956027686595917, "learning_rate": 0.0005031603971140674, "loss": 4.4555, "step": 893 }, { "epoch": 0.28591416978152673, "grad_norm": 0.3157637119293213, "learning_rate": 0.000502931570069015, "loss": 4.335, "step": 894 }, { "epoch": 0.28623398428911234, "grad_norm": 0.3274424970149994, "learning_rate": 0.0005027025251572259, "loss": 4.465, "step": 895 }, { "epoch": 0.2865537987966979, "grad_norm": 0.31196221709251404, "learning_rate": 0.0005024732626246022, "loss": 4.4236, "step": 896 }, { "epoch": 0.2868736133042835, "grad_norm": 0.3047398626804352, "learning_rate": 0.0005022437827172795, "loss": 4.5308, "step": 897 }, { "epoch": 0.28719342781186913, "grad_norm": 0.32717710733413696, "learning_rate": 0.0005020140856816268, "loss": 4.5639, "step": 898 }, { "epoch": 0.28751324231945474, "grad_norm": 0.3181251883506775, "learning_rate": 0.0005017841717642461, "loss": 4.4707, "step": 899 }, { "epoch": 0.2878330568270403, "grad_norm": 0.2859209477901459, "learning_rate": 0.0005015540412119721, "loss": 4.3949, "step": 900 }, { "epoch": 0.2878330568270403, "eval_runtime": 44.1274, "eval_samples_per_second": 42.989, "eval_steps_per_second": 10.764, "step": 900 }, { "epoch": 0.2881528713346259, "grad_norm": 0.322801411151886, "learning_rate": 0.0005013236942718725, "loss": 4.4386, "step": 901 }, { "epoch": 0.28847268584221153, "grad_norm": 0.3823021948337555, "learning_rate": 0.0005010931311912473, "loss": 4.4901, "step": 902 }, { "epoch": 0.28879250034979714, "grad_norm": 0.3377479016780853, "learning_rate": 0.0005008623522176279, "loss": 4.4821, "step": 903 }, { "epoch": 0.2891123148573827, "grad_norm": 0.32171210646629333, "learning_rate": 0.0005006313575987784, "loss": 4.4326, "step": 904 }, { "epoch": 0.2894321293649683, "grad_norm": 0.3168190121650696, "learning_rate": 0.0005004001475826935, "loss": 4.5371, "step": 905 }, { "epoch": 0.28975194387255393, "grad_norm": 0.35529348254203796, "learning_rate": 0.0005001687224175999, "loss": 4.4957, "step": 906 }, { "epoch": 0.29007175838013954, "grad_norm": 0.30193379521369934, "learning_rate": 0.0004999370823519548, "loss": 4.4856, "step": 907 }, { "epoch": 0.2903915728877251, "grad_norm": 0.31604552268981934, "learning_rate": 0.0004997052276344463, "loss": 4.4793, "step": 908 }, { "epoch": 0.2907113873953107, "grad_norm": 0.31218579411506653, "learning_rate": 0.000499473158513993, "loss": 4.5012, "step": 909 }, { "epoch": 0.29103120190289633, "grad_norm": 0.3007553517818451, "learning_rate": 0.0004992408752397437, "loss": 4.4452, "step": 910 }, { "epoch": 0.29135101641048194, "grad_norm": 0.3200906813144684, "learning_rate": 0.0004990083780610769, "loss": 4.3383, "step": 911 }, { "epoch": 0.2916708309180675, "grad_norm": 0.36419957876205444, "learning_rate": 0.000498775667227601, "loss": 4.433, "step": 912 }, { "epoch": 0.2919906454256531, "grad_norm": 0.3441981077194214, "learning_rate": 0.0004985427429891536, "loss": 4.5175, "step": 913 }, { "epoch": 0.29231045993323873, "grad_norm": 0.3522200584411621, "learning_rate": 0.0004983096055958014, "loss": 4.4386, "step": 914 }, { "epoch": 0.29263027444082435, "grad_norm": 0.3122841715812683, "learning_rate": 0.0004980762552978403, "loss": 4.3775, "step": 915 }, { "epoch": 0.2929500889484099, "grad_norm": 0.28400447964668274, "learning_rate": 0.0004978426923457942, "loss": 4.4238, "step": 916 }, { "epoch": 0.2932699034559955, "grad_norm": 0.2767007648944855, "learning_rate": 0.0004976089169904156, "loss": 4.364, "step": 917 }, { "epoch": 0.29358971796358113, "grad_norm": 0.3220713138580322, "learning_rate": 0.0004973749294826853, "loss": 4.4427, "step": 918 }, { "epoch": 0.29390953247116675, "grad_norm": 0.2896125316619873, "learning_rate": 0.0004971407300738114, "loss": 4.3645, "step": 919 }, { "epoch": 0.2942293469787523, "grad_norm": 0.3019895851612091, "learning_rate": 0.0004969063190152297, "loss": 4.387, "step": 920 }, { "epoch": 0.2945491614863379, "grad_norm": 0.29725131392478943, "learning_rate": 0.0004966716965586033, "loss": 4.4265, "step": 921 }, { "epoch": 0.29486897599392353, "grad_norm": 0.309479683637619, "learning_rate": 0.0004964368629558221, "loss": 4.4504, "step": 922 }, { "epoch": 0.29518879050150915, "grad_norm": 0.312238872051239, "learning_rate": 0.0004962018184590028, "loss": 4.4524, "step": 923 }, { "epoch": 0.2955086050090947, "grad_norm": 0.39455848932266235, "learning_rate": 0.0004959665633204885, "loss": 4.3496, "step": 924 }, { "epoch": 0.2958284195166803, "grad_norm": 0.3318155109882355, "learning_rate": 0.0004957310977928484, "loss": 4.3925, "step": 925 }, { "epoch": 0.29614823402426593, "grad_norm": 0.31263166666030884, "learning_rate": 0.0004954954221288775, "loss": 4.4383, "step": 926 }, { "epoch": 0.29646804853185155, "grad_norm": 0.3029886484146118, "learning_rate": 0.0004952595365815967, "loss": 4.4976, "step": 927 }, { "epoch": 0.2967878630394371, "grad_norm": 0.3193996250629425, "learning_rate": 0.0004950234414042519, "loss": 4.3577, "step": 928 }, { "epoch": 0.2971076775470227, "grad_norm": 0.3641533851623535, "learning_rate": 0.0004947871368503143, "loss": 4.3743, "step": 929 }, { "epoch": 0.29742749205460833, "grad_norm": 0.30906450748443604, "learning_rate": 0.0004945506231734796, "loss": 4.4657, "step": 930 }, { "epoch": 0.29774730656219395, "grad_norm": 0.3369085192680359, "learning_rate": 0.0004943139006276683, "loss": 4.4608, "step": 931 }, { "epoch": 0.2980671210697795, "grad_norm": 0.3784608542919159, "learning_rate": 0.0004940769694670251, "loss": 4.5055, "step": 932 }, { "epoch": 0.2983869355773651, "grad_norm": 0.29169726371765137, "learning_rate": 0.0004938398299459183, "loss": 4.4255, "step": 933 }, { "epoch": 0.29870675008495073, "grad_norm": 0.3588141202926636, "learning_rate": 0.0004936024823189406, "loss": 4.3692, "step": 934 }, { "epoch": 0.29902656459253635, "grad_norm": 0.3325275480747223, "learning_rate": 0.0004933649268409073, "loss": 4.356, "step": 935 }, { "epoch": 0.2993463791001219, "grad_norm": 0.31794947385787964, "learning_rate": 0.0004931271637668577, "loss": 4.5139, "step": 936 }, { "epoch": 0.2996661936077075, "grad_norm": 0.31381064653396606, "learning_rate": 0.0004928891933520533, "loss": 4.4287, "step": 937 }, { "epoch": 0.29998600811529313, "grad_norm": 0.31029853224754333, "learning_rate": 0.0004926510158519784, "loss": 4.4079, "step": 938 }, { "epoch": 0.30030582262287875, "grad_norm": 0.3128076195716858, "learning_rate": 0.0004924126315223396, "loss": 4.4961, "step": 939 }, { "epoch": 0.3006256371304643, "grad_norm": 0.2869124412536621, "learning_rate": 0.0004921740406190659, "loss": 4.3926, "step": 940 }, { "epoch": 0.3009454516380499, "grad_norm": 0.3829290568828583, "learning_rate": 0.0004919352433983075, "loss": 4.3339, "step": 941 }, { "epoch": 0.30126526614563554, "grad_norm": 0.30775660276412964, "learning_rate": 0.0004916962401164365, "loss": 4.41, "step": 942 }, { "epoch": 0.30158508065322115, "grad_norm": 0.30188286304473877, "learning_rate": 0.0004914570310300462, "loss": 4.3936, "step": 943 }, { "epoch": 0.3019048951608067, "grad_norm": 0.36703312397003174, "learning_rate": 0.0004912176163959506, "loss": 4.4462, "step": 944 }, { "epoch": 0.3022247096683923, "grad_norm": 0.4332222044467926, "learning_rate": 0.0004909779964711848, "loss": 4.3785, "step": 945 }, { "epoch": 0.30254452417597794, "grad_norm": 0.3340286612510681, "learning_rate": 0.0004907381715130038, "loss": 4.3255, "step": 946 }, { "epoch": 0.30286433868356355, "grad_norm": 0.4300132691860199, "learning_rate": 0.000490498141778883, "loss": 4.3907, "step": 947 }, { "epoch": 0.3031841531911491, "grad_norm": 0.3359420895576477, "learning_rate": 0.0004902579075265178, "loss": 4.3563, "step": 948 }, { "epoch": 0.3035039676987347, "grad_norm": 0.32604947686195374, "learning_rate": 0.0004900174690138229, "loss": 4.4156, "step": 949 }, { "epoch": 0.30382378220632034, "grad_norm": 0.33923494815826416, "learning_rate": 0.0004897768264989323, "loss": 4.4603, "step": 950 }, { "epoch": 0.30414359671390595, "grad_norm": 0.33348721265792847, "learning_rate": 0.0004895359802401992, "loss": 4.4567, "step": 951 }, { "epoch": 0.3044634112214915, "grad_norm": 0.36403873562812805, "learning_rate": 0.0004892949304961952, "loss": 4.4047, "step": 952 }, { "epoch": 0.3047832257290771, "grad_norm": 0.33704331517219543, "learning_rate": 0.0004890536775257109, "loss": 4.3462, "step": 953 }, { "epoch": 0.30510304023666274, "grad_norm": 0.30426234006881714, "learning_rate": 0.0004888122215877547, "loss": 4.4224, "step": 954 }, { "epoch": 0.30542285474424835, "grad_norm": 0.3160933554172516, "learning_rate": 0.0004885705629415528, "loss": 4.408, "step": 955 }, { "epoch": 0.3057426692518339, "grad_norm": 0.4620996117591858, "learning_rate": 0.0004883287018465494, "loss": 4.4154, "step": 956 }, { "epoch": 0.3060624837594195, "grad_norm": 0.30563884973526, "learning_rate": 0.00048808663856240596, "loss": 4.4956, "step": 957 }, { "epoch": 0.30638229826700514, "grad_norm": 0.29823562502861023, "learning_rate": 0.0004878443733490006, "loss": 4.3881, "step": 958 }, { "epoch": 0.30670211277459075, "grad_norm": 0.3017095625400543, "learning_rate": 0.00048760190646642866, "loss": 4.3448, "step": 959 }, { "epoch": 0.3070219272821763, "grad_norm": 0.3225107491016388, "learning_rate": 0.000487359238175002, "loss": 4.4178, "step": 960 }, { "epoch": 0.3073417417897619, "grad_norm": 0.34174293279647827, "learning_rate": 0.00048711636873524856, "loss": 4.4109, "step": 961 }, { "epoch": 0.30766155629734754, "grad_norm": 0.3300286531448364, "learning_rate": 0.00048687329840791207, "loss": 4.4565, "step": 962 }, { "epoch": 0.30798137080493315, "grad_norm": 0.3178355395793915, "learning_rate": 0.0004866300274539523, "loss": 4.3932, "step": 963 }, { "epoch": 0.30830118531251877, "grad_norm": 0.2987779378890991, "learning_rate": 0.0004863865561345442, "loss": 4.4181, "step": 964 }, { "epoch": 0.3086209998201043, "grad_norm": 0.28862786293029785, "learning_rate": 0.00048614288471107774, "loss": 4.3402, "step": 965 }, { "epoch": 0.30894081432768994, "grad_norm": 0.3404427468776703, "learning_rate": 0.00048589901344515805, "loss": 4.4609, "step": 966 }, { "epoch": 0.30926062883527555, "grad_norm": 0.35388022661209106, "learning_rate": 0.00048565494259860434, "loss": 4.3715, "step": 967 }, { "epoch": 0.30958044334286117, "grad_norm": 0.3838047385215759, "learning_rate": 0.00048541067243345064, "loss": 4.4187, "step": 968 }, { "epoch": 0.3099002578504467, "grad_norm": 0.30805131793022156, "learning_rate": 0.00048516620321194443, "loss": 4.3982, "step": 969 }, { "epoch": 0.31022007235803234, "grad_norm": 0.32373467087745667, "learning_rate": 0.0004849215351965474, "loss": 4.3169, "step": 970 }, { "epoch": 0.31053988686561795, "grad_norm": 0.3159528076648712, "learning_rate": 0.0004846766686499342, "loss": 4.4129, "step": 971 }, { "epoch": 0.31085970137320357, "grad_norm": 0.3269490897655487, "learning_rate": 0.0004844316038349929, "loss": 4.3783, "step": 972 }, { "epoch": 0.3111795158807891, "grad_norm": 0.29986268281936646, "learning_rate": 0.00048418634101482435, "loss": 4.3646, "step": 973 }, { "epoch": 0.31149933038837474, "grad_norm": 0.3188677132129669, "learning_rate": 0.000483940880452742, "loss": 4.4084, "step": 974 }, { "epoch": 0.31181914489596035, "grad_norm": 0.3339698016643524, "learning_rate": 0.0004836952224122716, "loss": 4.4616, "step": 975 }, { "epoch": 0.31213895940354597, "grad_norm": 0.3420525789260864, "learning_rate": 0.00048344936715715104, "loss": 4.4595, "step": 976 }, { "epoch": 0.3124587739111315, "grad_norm": 0.2960668206214905, "learning_rate": 0.0004832033149513295, "loss": 4.347, "step": 977 }, { "epoch": 0.31277858841871714, "grad_norm": 0.4462215006351471, "learning_rate": 0.0004829570660589681, "loss": 4.4674, "step": 978 }, { "epoch": 0.31309840292630275, "grad_norm": 0.31284236907958984, "learning_rate": 0.0004827106207444389, "loss": 4.3866, "step": 979 }, { "epoch": 0.31341821743388837, "grad_norm": 0.33517399430274963, "learning_rate": 0.00048246397927232483, "loss": 4.3998, "step": 980 }, { "epoch": 0.3137380319414739, "grad_norm": 0.32631775736808777, "learning_rate": 0.00048221714190741947, "loss": 4.3914, "step": 981 }, { "epoch": 0.31405784644905954, "grad_norm": 0.35559579730033875, "learning_rate": 0.00048197010891472665, "loss": 4.3739, "step": 982 }, { "epoch": 0.31437766095664516, "grad_norm": 0.32086825370788574, "learning_rate": 0.00048172288055946033, "loss": 4.4245, "step": 983 }, { "epoch": 0.31469747546423077, "grad_norm": 0.35193315148353577, "learning_rate": 0.0004814754571070442, "loss": 4.4306, "step": 984 }, { "epoch": 0.3150172899718163, "grad_norm": 0.3815382421016693, "learning_rate": 0.00048122783882311126, "loss": 4.415, "step": 985 }, { "epoch": 0.31533710447940194, "grad_norm": 0.35121268033981323, "learning_rate": 0.0004809800259735038, "loss": 4.3986, "step": 986 }, { "epoch": 0.31565691898698756, "grad_norm": 0.30291619896888733, "learning_rate": 0.0004807320188242728, "loss": 4.3048, "step": 987 }, { "epoch": 0.31597673349457317, "grad_norm": 0.39759495854377747, "learning_rate": 0.0004804838176416782, "loss": 4.3604, "step": 988 }, { "epoch": 0.31629654800215873, "grad_norm": 0.4281408488750458, "learning_rate": 0.000480235422692188, "loss": 4.4629, "step": 989 }, { "epoch": 0.31661636250974434, "grad_norm": 0.3828846216201782, "learning_rate": 0.0004799868342424784, "loss": 4.3242, "step": 990 }, { "epoch": 0.31693617701732996, "grad_norm": 0.2975770831108093, "learning_rate": 0.00047973805255943305, "loss": 4.428, "step": 991 }, { "epoch": 0.31725599152491557, "grad_norm": 0.44388994574546814, "learning_rate": 0.0004794890779101434, "loss": 4.4103, "step": 992 }, { "epoch": 0.31757580603250113, "grad_norm": 0.3990457355976105, "learning_rate": 0.0004792399105619077, "loss": 4.4268, "step": 993 }, { "epoch": 0.31789562054008674, "grad_norm": 0.39087843894958496, "learning_rate": 0.0004789905507822314, "loss": 4.3693, "step": 994 }, { "epoch": 0.31821543504767236, "grad_norm": 0.3066149950027466, "learning_rate": 0.00047874099883882644, "loss": 4.4177, "step": 995 }, { "epoch": 0.31853524955525797, "grad_norm": 0.33759185671806335, "learning_rate": 0.000478491254999611, "loss": 4.3704, "step": 996 }, { "epoch": 0.31885506406284353, "grad_norm": 0.36516574025154114, "learning_rate": 0.0004782413195327094, "loss": 4.4011, "step": 997 }, { "epoch": 0.31917487857042914, "grad_norm": 0.3744923174381256, "learning_rate": 0.0004779911927064516, "loss": 4.276, "step": 998 }, { "epoch": 0.31949469307801476, "grad_norm": 0.32529783248901367, "learning_rate": 0.000477740874789373, "loss": 4.3723, "step": 999 }, { "epoch": 0.31981450758560037, "grad_norm": 0.3058508634567261, "learning_rate": 0.0004774903660502142, "loss": 4.345, "step": 1000 }, { "epoch": 0.31981450758560037, "eval_runtime": 42.5864, "eval_samples_per_second": 44.545, "eval_steps_per_second": 11.154, "step": 1000 }, { "epoch": 0.32013432209318593, "grad_norm": 0.3245224952697754, "learning_rate": 0.0004772396667579205, "loss": 4.4359, "step": 1001 }, { "epoch": 0.32045413660077154, "grad_norm": 0.33880415558815, "learning_rate": 0.0004769887771816422, "loss": 4.3315, "step": 1002 }, { "epoch": 0.32077395110835716, "grad_norm": 0.3146042227745056, "learning_rate": 0.0004767376975907334, "loss": 4.4185, "step": 1003 }, { "epoch": 0.32109376561594277, "grad_norm": 0.4236794412136078, "learning_rate": 0.00047648642825475255, "loss": 4.3659, "step": 1004 }, { "epoch": 0.32141358012352833, "grad_norm": 0.40911680459976196, "learning_rate": 0.0004762349694434615, "loss": 4.3701, "step": 1005 }, { "epoch": 0.32173339463111394, "grad_norm": 0.32179972529411316, "learning_rate": 0.0004759833214268259, "loss": 4.3617, "step": 1006 }, { "epoch": 0.32205320913869956, "grad_norm": 0.31925222277641296, "learning_rate": 0.0004757314844750141, "loss": 4.3345, "step": 1007 }, { "epoch": 0.3223730236462852, "grad_norm": 0.329226553440094, "learning_rate": 0.00047547945885839763, "loss": 4.4564, "step": 1008 }, { "epoch": 0.32269283815387073, "grad_norm": 0.29546889662742615, "learning_rate": 0.00047522724484755054, "loss": 4.4153, "step": 1009 }, { "epoch": 0.32301265266145635, "grad_norm": 0.30273008346557617, "learning_rate": 0.0004749748427132488, "loss": 4.3656, "step": 1010 }, { "epoch": 0.32333246716904196, "grad_norm": 0.49392780661582947, "learning_rate": 0.00047472225272647084, "loss": 4.4288, "step": 1011 }, { "epoch": 0.3236522816766276, "grad_norm": 0.303698867559433, "learning_rate": 0.00047446947515839634, "loss": 4.3793, "step": 1012 }, { "epoch": 0.32397209618421313, "grad_norm": 0.3202921152114868, "learning_rate": 0.0004742165102804067, "loss": 4.4124, "step": 1013 }, { "epoch": 0.32429191069179875, "grad_norm": 0.48569950461387634, "learning_rate": 0.00047396335836408427, "loss": 4.4659, "step": 1014 }, { "epoch": 0.32461172519938436, "grad_norm": 0.30552586913108826, "learning_rate": 0.0004737100196812121, "loss": 4.3715, "step": 1015 }, { "epoch": 0.32493153970697, "grad_norm": 0.4701118469238281, "learning_rate": 0.00047345649450377395, "loss": 4.3149, "step": 1016 }, { "epoch": 0.32525135421455553, "grad_norm": 0.36418306827545166, "learning_rate": 0.0004732027831039536, "loss": 4.2729, "step": 1017 }, { "epoch": 0.32557116872214115, "grad_norm": 0.32645609974861145, "learning_rate": 0.00047294888575413486, "loss": 4.4051, "step": 1018 }, { "epoch": 0.32589098322972676, "grad_norm": 0.329334020614624, "learning_rate": 0.0004726948027269013, "loss": 4.3671, "step": 1019 }, { "epoch": 0.3262107977373124, "grad_norm": 0.31493499875068665, "learning_rate": 0.00047244053429503565, "loss": 4.466, "step": 1020 }, { "epoch": 0.32653061224489793, "grad_norm": 0.34572502970695496, "learning_rate": 0.00047218608073151976, "loss": 4.3213, "step": 1021 }, { "epoch": 0.32685042675248355, "grad_norm": 0.2968219220638275, "learning_rate": 0.0004719314423095342, "loss": 4.4019, "step": 1022 }, { "epoch": 0.32717024126006916, "grad_norm": 0.2953996956348419, "learning_rate": 0.00047167661930245803, "loss": 4.2683, "step": 1023 }, { "epoch": 0.3274900557676548, "grad_norm": 0.3686489760875702, "learning_rate": 0.0004714216119838685, "loss": 4.3611, "step": 1024 }, { "epoch": 0.32780987027524033, "grad_norm": 0.3282267451286316, "learning_rate": 0.00047116642062754074, "loss": 4.3784, "step": 1025 }, { "epoch": 0.32812968478282595, "grad_norm": 0.2965298593044281, "learning_rate": 0.00047091104550744733, "loss": 4.3731, "step": 1026 }, { "epoch": 0.32844949929041156, "grad_norm": 0.3799196779727936, "learning_rate": 0.00047065548689775844, "loss": 4.3258, "step": 1027 }, { "epoch": 0.3287693137979972, "grad_norm": 0.4065827429294586, "learning_rate": 0.00047039974507284086, "loss": 4.4055, "step": 1028 }, { "epoch": 0.32908912830558273, "grad_norm": 0.4028932750225067, "learning_rate": 0.0004701438203072584, "loss": 4.3897, "step": 1029 }, { "epoch": 0.32940894281316835, "grad_norm": 0.3512417674064636, "learning_rate": 0.00046988771287577105, "loss": 4.3816, "step": 1030 }, { "epoch": 0.32972875732075396, "grad_norm": 0.3803594708442688, "learning_rate": 0.0004696314230533349, "loss": 4.3544, "step": 1031 }, { "epoch": 0.3300485718283396, "grad_norm": 0.33042091131210327, "learning_rate": 0.00046937495111510204, "loss": 4.34, "step": 1032 }, { "epoch": 0.3303683863359252, "grad_norm": 0.37239307165145874, "learning_rate": 0.00046911829733642016, "loss": 4.3284, "step": 1033 }, { "epoch": 0.33068820084351075, "grad_norm": 0.37064096331596375, "learning_rate": 0.0004688614619928318, "loss": 4.4142, "step": 1034 }, { "epoch": 0.33100801535109636, "grad_norm": 0.3694692850112915, "learning_rate": 0.00046860444536007473, "loss": 4.4338, "step": 1035 }, { "epoch": 0.331327829858682, "grad_norm": 0.33115631341934204, "learning_rate": 0.0004683472477140811, "loss": 4.3731, "step": 1036 }, { "epoch": 0.3316476443662676, "grad_norm": 0.36821213364601135, "learning_rate": 0.0004680898693309777, "loss": 4.3462, "step": 1037 }, { "epoch": 0.33196745887385315, "grad_norm": 0.38638320565223694, "learning_rate": 0.0004678323104870852, "loss": 4.3788, "step": 1038 }, { "epoch": 0.33228727338143876, "grad_norm": 0.30784502625465393, "learning_rate": 0.00046757457145891784, "loss": 4.2782, "step": 1039 }, { "epoch": 0.3326070878890244, "grad_norm": 0.41412702202796936, "learning_rate": 0.00046731665252318366, "loss": 4.3603, "step": 1040 }, { "epoch": 0.33292690239661, "grad_norm": 0.31608834862709045, "learning_rate": 0.0004670585539567837, "loss": 4.3506, "step": 1041 }, { "epoch": 0.33324671690419555, "grad_norm": 0.32441073656082153, "learning_rate": 0.00046680027603681164, "loss": 4.2949, "step": 1042 }, { "epoch": 0.33356653141178116, "grad_norm": 0.2947441339492798, "learning_rate": 0.0004665418190405541, "loss": 4.2632, "step": 1043 }, { "epoch": 0.3338863459193668, "grad_norm": 0.3132975697517395, "learning_rate": 0.0004662831832454895, "loss": 4.3906, "step": 1044 }, { "epoch": 0.3342061604269524, "grad_norm": 0.3060220777988434, "learning_rate": 0.00046602436892928875, "loss": 4.3315, "step": 1045 }, { "epoch": 0.33452597493453795, "grad_norm": 0.3844990134239197, "learning_rate": 0.000465765376369814, "loss": 4.3901, "step": 1046 }, { "epoch": 0.33484578944212356, "grad_norm": 0.3193272650241852, "learning_rate": 0.00046550620584511886, "loss": 4.3546, "step": 1047 }, { "epoch": 0.3351656039497092, "grad_norm": 0.31956154108047485, "learning_rate": 0.00046524685763344803, "loss": 4.2959, "step": 1048 }, { "epoch": 0.3354854184572948, "grad_norm": 0.34097185730934143, "learning_rate": 0.00046498733201323715, "loss": 4.3099, "step": 1049 }, { "epoch": 0.33580523296488035, "grad_norm": 0.35974618792533875, "learning_rate": 0.000464727629263112, "loss": 4.3478, "step": 1050 }, { "epoch": 0.33612504747246597, "grad_norm": 0.31820425391197205, "learning_rate": 0.0004644677496618887, "loss": 4.3582, "step": 1051 }, { "epoch": 0.3364448619800516, "grad_norm": 0.3453025817871094, "learning_rate": 0.00046420769348857343, "loss": 4.3433, "step": 1052 }, { "epoch": 0.3367646764876372, "grad_norm": 0.3787394165992737, "learning_rate": 0.00046394746102236144, "loss": 4.2923, "step": 1053 }, { "epoch": 0.33708449099522275, "grad_norm": 0.3330345153808594, "learning_rate": 0.00046368705254263773, "loss": 4.2958, "step": 1054 }, { "epoch": 0.33740430550280837, "grad_norm": 0.3871642053127289, "learning_rate": 0.000463426468328976, "loss": 4.3105, "step": 1055 }, { "epoch": 0.337724120010394, "grad_norm": 0.46061065793037415, "learning_rate": 0.0004631657086611387, "loss": 4.3242, "step": 1056 }, { "epoch": 0.3380439345179796, "grad_norm": 0.30320876836776733, "learning_rate": 0.0004629047738190767, "loss": 4.3487, "step": 1057 }, { "epoch": 0.33836374902556515, "grad_norm": 0.33296167850494385, "learning_rate": 0.00046264366408292883, "loss": 4.3494, "step": 1058 }, { "epoch": 0.33868356353315077, "grad_norm": 0.30399850010871887, "learning_rate": 0.0004623823797330216, "loss": 4.3774, "step": 1059 }, { "epoch": 0.3390033780407364, "grad_norm": 0.4569533169269562, "learning_rate": 0.00046212092104986946, "loss": 4.3789, "step": 1060 }, { "epoch": 0.339323192548322, "grad_norm": 0.3754693269729614, "learning_rate": 0.0004618592883141734, "loss": 4.2856, "step": 1061 }, { "epoch": 0.33964300705590755, "grad_norm": 0.3247743546962738, "learning_rate": 0.00046159748180682166, "loss": 4.3513, "step": 1062 }, { "epoch": 0.33996282156349317, "grad_norm": 0.5447053909301758, "learning_rate": 0.0004613355018088889, "loss": 4.2327, "step": 1063 }, { "epoch": 0.3402826360710788, "grad_norm": 0.494564414024353, "learning_rate": 0.00046107334860163605, "loss": 4.3347, "step": 1064 }, { "epoch": 0.3406024505786644, "grad_norm": 0.585490345954895, "learning_rate": 0.00046081102246651014, "loss": 4.3566, "step": 1065 }, { "epoch": 0.34092226508624995, "grad_norm": 0.5277570486068726, "learning_rate": 0.0004605485236851436, "loss": 4.3176, "step": 1066 }, { "epoch": 0.34124207959383557, "grad_norm": 0.3707808256149292, "learning_rate": 0.0004602858525393544, "loss": 4.2546, "step": 1067 }, { "epoch": 0.3415618941014212, "grad_norm": 0.3869034945964813, "learning_rate": 0.00046002300931114555, "loss": 4.3346, "step": 1068 }, { "epoch": 0.3418817086090068, "grad_norm": 0.31987568736076355, "learning_rate": 0.0004597599942827048, "loss": 4.3582, "step": 1069 }, { "epoch": 0.34220152311659235, "grad_norm": 0.36692267656326294, "learning_rate": 0.0004594968077364041, "loss": 4.4443, "step": 1070 }, { "epoch": 0.34252133762417797, "grad_norm": 0.5996853113174438, "learning_rate": 0.00045923344995480006, "loss": 4.3172, "step": 1071 }, { "epoch": 0.3428411521317636, "grad_norm": 0.5550093054771423, "learning_rate": 0.0004589699212206325, "loss": 4.3949, "step": 1072 }, { "epoch": 0.3431609666393492, "grad_norm": 0.4480937719345093, "learning_rate": 0.0004587062218168253, "loss": 4.3765, "step": 1073 }, { "epoch": 0.34348078114693475, "grad_norm": 0.43137478828430176, "learning_rate": 0.0004584423520264853, "loss": 4.3786, "step": 1074 }, { "epoch": 0.34380059565452037, "grad_norm": 0.6294241547584534, "learning_rate": 0.0004581783121329024, "loss": 4.353, "step": 1075 }, { "epoch": 0.344120410162106, "grad_norm": 0.3459145724773407, "learning_rate": 0.00045791410241954894, "loss": 4.195, "step": 1076 }, { "epoch": 0.3444402246696916, "grad_norm": 0.43369606137275696, "learning_rate": 0.0004576497231700798, "loss": 4.3213, "step": 1077 }, { "epoch": 0.34476003917727716, "grad_norm": 0.3029576241970062, "learning_rate": 0.0004573851746683317, "loss": 4.3785, "step": 1078 }, { "epoch": 0.34507985368486277, "grad_norm": 0.566472053527832, "learning_rate": 0.00045712045719832313, "loss": 4.2466, "step": 1079 }, { "epoch": 0.3453996681924484, "grad_norm": 0.4368940591812134, "learning_rate": 0.00045685557104425397, "loss": 4.4498, "step": 1080 }, { "epoch": 0.345719482700034, "grad_norm": 0.33139288425445557, "learning_rate": 0.00045659051649050525, "loss": 4.2762, "step": 1081 }, { "epoch": 0.34603929720761956, "grad_norm": 0.3286347985267639, "learning_rate": 0.00045632529382163883, "loss": 4.2881, "step": 1082 }, { "epoch": 0.34635911171520517, "grad_norm": 0.34062454104423523, "learning_rate": 0.00045605990332239684, "loss": 4.2939, "step": 1083 }, { "epoch": 0.3466789262227908, "grad_norm": 0.3304431736469269, "learning_rate": 0.00045579434527770186, "loss": 4.2885, "step": 1084 }, { "epoch": 0.3469987407303764, "grad_norm": 0.3614174723625183, "learning_rate": 0.0004555286199726561, "loss": 4.2811, "step": 1085 }, { "epoch": 0.34731855523796196, "grad_norm": 0.31524088978767395, "learning_rate": 0.0004552627276925416, "loss": 4.2908, "step": 1086 }, { "epoch": 0.34763836974554757, "grad_norm": 0.29279032349586487, "learning_rate": 0.0004549966687228195, "loss": 4.2779, "step": 1087 }, { "epoch": 0.3479581842531332, "grad_norm": 0.35258224606513977, "learning_rate": 0.0004547304433491299, "loss": 4.3404, "step": 1088 }, { "epoch": 0.3482779987607188, "grad_norm": 0.31771084666252136, "learning_rate": 0.00045446405185729154, "loss": 4.2901, "step": 1089 }, { "epoch": 0.34859781326830436, "grad_norm": 0.34701991081237793, "learning_rate": 0.00045419749453330167, "loss": 4.305, "step": 1090 }, { "epoch": 0.34891762777588997, "grad_norm": 0.3273800015449524, "learning_rate": 0.00045393077166333524, "loss": 4.3159, "step": 1091 }, { "epoch": 0.3492374422834756, "grad_norm": 0.3439023792743683, "learning_rate": 0.0004536638835337452, "loss": 4.3294, "step": 1092 }, { "epoch": 0.3495572567910612, "grad_norm": 0.4060024917125702, "learning_rate": 0.00045339683043106214, "loss": 4.3201, "step": 1093 }, { "epoch": 0.34987707129864676, "grad_norm": 0.4527241289615631, "learning_rate": 0.00045312961264199316, "loss": 4.2653, "step": 1094 }, { "epoch": 0.35019688580623237, "grad_norm": 0.3257511258125305, "learning_rate": 0.0004528622304534225, "loss": 4.3552, "step": 1095 }, { "epoch": 0.350516700313818, "grad_norm": 0.47020405530929565, "learning_rate": 0.00045259468415241117, "loss": 4.2856, "step": 1096 }, { "epoch": 0.3508365148214036, "grad_norm": 0.39292362332344055, "learning_rate": 0.0004523269740261957, "loss": 4.3446, "step": 1097 }, { "epoch": 0.35115632932898916, "grad_norm": 0.3462170660495758, "learning_rate": 0.0004520591003621892, "loss": 4.2602, "step": 1098 }, { "epoch": 0.3514761438365748, "grad_norm": 0.34140291810035706, "learning_rate": 0.00045179106344798005, "loss": 4.3126, "step": 1099 }, { "epoch": 0.3517959583441604, "grad_norm": 0.45510709285736084, "learning_rate": 0.00045152286357133157, "loss": 4.2856, "step": 1100 }, { "epoch": 0.3517959583441604, "eval_runtime": 42.7228, "eval_samples_per_second": 44.402, "eval_steps_per_second": 11.118, "step": 1100 }, { "epoch": 0.352115772851746, "grad_norm": 0.3210175931453705, "learning_rate": 0.0004512545010201828, "loss": 4.2392, "step": 1101 }, { "epoch": 0.3524355873593316, "grad_norm": 0.3366824984550476, "learning_rate": 0.0004509859760826466, "loss": 4.2224, "step": 1102 }, { "epoch": 0.3527554018669172, "grad_norm": 0.39086446166038513, "learning_rate": 0.0004507172890470108, "loss": 4.2948, "step": 1103 }, { "epoch": 0.3530752163745028, "grad_norm": 0.3005397617816925, "learning_rate": 0.000450448440201737, "loss": 4.2543, "step": 1104 }, { "epoch": 0.3533950308820884, "grad_norm": 0.44945546984672546, "learning_rate": 0.0004501794298354603, "loss": 4.2856, "step": 1105 }, { "epoch": 0.353714845389674, "grad_norm": 0.41856294870376587, "learning_rate": 0.0004499102582369897, "loss": 4.1783, "step": 1106 }, { "epoch": 0.3540346598972596, "grad_norm": 0.35962650179862976, "learning_rate": 0.0004496409256953069, "loss": 4.3726, "step": 1107 }, { "epoch": 0.3543544744048452, "grad_norm": 0.6032203435897827, "learning_rate": 0.0004493714324995666, "loss": 4.3605, "step": 1108 }, { "epoch": 0.3546742889124308, "grad_norm": 0.580268144607544, "learning_rate": 0.00044910177893909577, "loss": 4.3948, "step": 1109 }, { "epoch": 0.3549941034200164, "grad_norm": 0.3460249602794647, "learning_rate": 0.00044883196530339376, "loss": 4.297, "step": 1110 }, { "epoch": 0.355313917927602, "grad_norm": 0.3502854108810425, "learning_rate": 0.0004485619918821318, "loss": 4.2981, "step": 1111 }, { "epoch": 0.3556337324351876, "grad_norm": 0.43106362223625183, "learning_rate": 0.00044829185896515245, "loss": 4.3908, "step": 1112 }, { "epoch": 0.3559535469427732, "grad_norm": 0.5422151684761047, "learning_rate": 0.0004480215668424696, "loss": 4.3121, "step": 1113 }, { "epoch": 0.3562733614503588, "grad_norm": 0.30447298288345337, "learning_rate": 0.00044775111580426817, "loss": 4.3218, "step": 1114 }, { "epoch": 0.3565931759579444, "grad_norm": 0.4284678101539612, "learning_rate": 0.00044748050614090343, "loss": 4.3455, "step": 1115 }, { "epoch": 0.35691299046553, "grad_norm": 0.3075186312198639, "learning_rate": 0.00044720973814290125, "loss": 4.2798, "step": 1116 }, { "epoch": 0.3572328049731156, "grad_norm": 0.3467978239059448, "learning_rate": 0.0004469388121009574, "loss": 4.3088, "step": 1117 }, { "epoch": 0.3575526194807012, "grad_norm": 0.3193303644657135, "learning_rate": 0.00044666772830593714, "loss": 4.3323, "step": 1118 }, { "epoch": 0.3578724339882868, "grad_norm": 0.30850112438201904, "learning_rate": 0.00044639648704887535, "loss": 4.3012, "step": 1119 }, { "epoch": 0.3581922484958724, "grad_norm": 0.5923464894294739, "learning_rate": 0.00044612508862097575, "loss": 4.3013, "step": 1120 }, { "epoch": 0.358512063003458, "grad_norm": 0.3257342576980591, "learning_rate": 0.00044585353331361095, "loss": 4.2905, "step": 1121 }, { "epoch": 0.3588318775110436, "grad_norm": 0.3831838369369507, "learning_rate": 0.000445581821418322, "loss": 4.3218, "step": 1122 }, { "epoch": 0.3591516920186292, "grad_norm": 0.49090829491615295, "learning_rate": 0.0004453099532268178, "loss": 4.3451, "step": 1123 }, { "epoch": 0.3594715065262148, "grad_norm": 0.4272654354572296, "learning_rate": 0.0004450379290309755, "loss": 4.1814, "step": 1124 }, { "epoch": 0.3597913210338004, "grad_norm": 0.3223675787448883, "learning_rate": 0.0004447657491228392, "loss": 4.2544, "step": 1125 }, { "epoch": 0.360111135541386, "grad_norm": 0.5000753402709961, "learning_rate": 0.0004444934137946207, "loss": 4.2566, "step": 1126 }, { "epoch": 0.3604309500489716, "grad_norm": 0.41143205761909485, "learning_rate": 0.00044422092333869814, "loss": 4.3063, "step": 1127 }, { "epoch": 0.3607507645565572, "grad_norm": 0.3561902940273285, "learning_rate": 0.00044394827804761667, "loss": 4.3567, "step": 1128 }, { "epoch": 0.3610705790641428, "grad_norm": 0.36221325397491455, "learning_rate": 0.0004436754782140875, "loss": 4.2407, "step": 1129 }, { "epoch": 0.3613903935717284, "grad_norm": 0.3408898711204529, "learning_rate": 0.0004434025241309876, "loss": 4.36, "step": 1130 }, { "epoch": 0.361710208079314, "grad_norm": 0.39159896969795227, "learning_rate": 0.0004431294160913597, "loss": 4.2251, "step": 1131 }, { "epoch": 0.3620300225868996, "grad_norm": 0.31581205129623413, "learning_rate": 0.0004428561543884118, "loss": 4.1653, "step": 1132 }, { "epoch": 0.3623498370944852, "grad_norm": 0.32266056537628174, "learning_rate": 0.0004425827393155169, "loss": 4.2836, "step": 1133 }, { "epoch": 0.3626696516020708, "grad_norm": 0.34637635946273804, "learning_rate": 0.00044230917116621266, "loss": 4.2229, "step": 1134 }, { "epoch": 0.3629894661096564, "grad_norm": 0.30707281827926636, "learning_rate": 0.00044203545023420085, "loss": 4.3584, "step": 1135 }, { "epoch": 0.363309280617242, "grad_norm": 0.3290161192417145, "learning_rate": 0.00044176157681334767, "loss": 4.2233, "step": 1136 }, { "epoch": 0.3636290951248276, "grad_norm": 0.3440206050872803, "learning_rate": 0.0004414875511976827, "loss": 4.2527, "step": 1137 }, { "epoch": 0.3639489096324132, "grad_norm": 0.29877492785453796, "learning_rate": 0.00044121337368139906, "loss": 4.3465, "step": 1138 }, { "epoch": 0.3642687241399988, "grad_norm": 0.355345219373703, "learning_rate": 0.0004409390445588528, "loss": 4.2787, "step": 1139 }, { "epoch": 0.3645885386475844, "grad_norm": 0.3147085905075073, "learning_rate": 0.0004406645641245631, "loss": 4.2728, "step": 1140 }, { "epoch": 0.36490835315517, "grad_norm": 0.42567288875579834, "learning_rate": 0.0004403899326732112, "loss": 4.2434, "step": 1141 }, { "epoch": 0.3652281676627556, "grad_norm": 0.40198370814323425, "learning_rate": 0.00044011515049964073, "loss": 4.2719, "step": 1142 }, { "epoch": 0.3655479821703412, "grad_norm": 0.41583725810050964, "learning_rate": 0.0004398402178988568, "loss": 4.3428, "step": 1143 }, { "epoch": 0.3658677966779268, "grad_norm": 0.33910584449768066, "learning_rate": 0.00043956513516602653, "loss": 4.3242, "step": 1144 }, { "epoch": 0.3661876111855124, "grad_norm": 0.31835636496543884, "learning_rate": 0.00043928990259647764, "loss": 4.2005, "step": 1145 }, { "epoch": 0.366507425693098, "grad_norm": 0.3116024136543274, "learning_rate": 0.00043901452048569913, "loss": 4.2312, "step": 1146 }, { "epoch": 0.3668272402006836, "grad_norm": 0.32156339287757874, "learning_rate": 0.00043873898912934054, "loss": 4.3572, "step": 1147 }, { "epoch": 0.3671470547082692, "grad_norm": 0.32407522201538086, "learning_rate": 0.00043846330882321146, "loss": 4.2835, "step": 1148 }, { "epoch": 0.3674668692158548, "grad_norm": 0.3374769687652588, "learning_rate": 0.00043818747986328136, "loss": 4.1995, "step": 1149 }, { "epoch": 0.3677866837234404, "grad_norm": 0.5640645623207092, "learning_rate": 0.0004379115025456795, "loss": 4.2871, "step": 1150 }, { "epoch": 0.368106498231026, "grad_norm": 0.32684746384620667, "learning_rate": 0.0004376353771666942, "loss": 4.3147, "step": 1151 }, { "epoch": 0.3684263127386116, "grad_norm": 0.5162673592567444, "learning_rate": 0.000437359104022773, "loss": 4.3293, "step": 1152 }, { "epoch": 0.3687461272461972, "grad_norm": 0.3636702001094818, "learning_rate": 0.00043708268341052185, "loss": 4.2496, "step": 1153 }, { "epoch": 0.3690659417537828, "grad_norm": 0.29849907755851746, "learning_rate": 0.00043680611562670513, "loss": 4.2728, "step": 1154 }, { "epoch": 0.3693857562613684, "grad_norm": 0.4179359972476959, "learning_rate": 0.00043652940096824516, "loss": 4.2995, "step": 1155 }, { "epoch": 0.369705570768954, "grad_norm": 0.350577175617218, "learning_rate": 0.00043625253973222206, "loss": 4.2286, "step": 1156 }, { "epoch": 0.3700253852765396, "grad_norm": 0.4183795750141144, "learning_rate": 0.00043597553221587316, "loss": 4.3407, "step": 1157 }, { "epoch": 0.3703451997841252, "grad_norm": 0.3485710322856903, "learning_rate": 0.00043569837871659296, "loss": 4.2534, "step": 1158 }, { "epoch": 0.3706650142917108, "grad_norm": 0.37208688259124756, "learning_rate": 0.0004354210795319327, "loss": 4.3233, "step": 1159 }, { "epoch": 0.3709848287992964, "grad_norm": 0.33319997787475586, "learning_rate": 0.00043514363495959985, "loss": 4.2208, "step": 1160 }, { "epoch": 0.371304643306882, "grad_norm": 0.37885740399360657, "learning_rate": 0.0004348660452974581, "loss": 4.2123, "step": 1161 }, { "epoch": 0.3716244578144676, "grad_norm": 0.32580146193504333, "learning_rate": 0.00043458831084352705, "loss": 4.279, "step": 1162 }, { "epoch": 0.3719442723220532, "grad_norm": 0.323013037443161, "learning_rate": 0.00043431043189598125, "loss": 4.2975, "step": 1163 }, { "epoch": 0.3722640868296388, "grad_norm": 0.350331574678421, "learning_rate": 0.0004340324087531511, "loss": 4.297, "step": 1164 }, { "epoch": 0.3725839013372244, "grad_norm": 0.42210012674331665, "learning_rate": 0.00043375424171352133, "loss": 4.213, "step": 1165 }, { "epoch": 0.37290371584481, "grad_norm": 0.384178102016449, "learning_rate": 0.00043347593107573106, "loss": 4.2866, "step": 1166 }, { "epoch": 0.37322353035239564, "grad_norm": 0.3340718150138855, "learning_rate": 0.000433197477138574, "loss": 4.2846, "step": 1167 }, { "epoch": 0.3735433448599812, "grad_norm": 0.5975145101547241, "learning_rate": 0.00043291888020099723, "loss": 4.1832, "step": 1168 }, { "epoch": 0.3738631593675668, "grad_norm": 0.39390960335731506, "learning_rate": 0.0004326401405621019, "loss": 4.2972, "step": 1169 }, { "epoch": 0.3741829738751524, "grad_norm": 0.3531685769557953, "learning_rate": 0.0004323612585211419, "loss": 4.2833, "step": 1170 }, { "epoch": 0.37450278838273804, "grad_norm": 0.34088510274887085, "learning_rate": 0.0004320822343775242, "loss": 4.357, "step": 1171 }, { "epoch": 0.3748226028903236, "grad_norm": 0.3222389817237854, "learning_rate": 0.00043180306843080836, "loss": 4.3207, "step": 1172 }, { "epoch": 0.3751424173979092, "grad_norm": 0.31497499346733093, "learning_rate": 0.0004315237609807059, "loss": 4.3176, "step": 1173 }, { "epoch": 0.3754622319054948, "grad_norm": 0.5189108848571777, "learning_rate": 0.00043124431232708076, "loss": 4.2546, "step": 1174 }, { "epoch": 0.37578204641308044, "grad_norm": 0.36304277181625366, "learning_rate": 0.000430964722769948, "loss": 4.2302, "step": 1175 }, { "epoch": 0.376101860920666, "grad_norm": 0.35259848833084106, "learning_rate": 0.0004306849926094742, "loss": 4.1562, "step": 1176 }, { "epoch": 0.3764216754282516, "grad_norm": 0.329343318939209, "learning_rate": 0.00043040512214597684, "loss": 4.2487, "step": 1177 }, { "epoch": 0.3767414899358372, "grad_norm": 0.3913806974887848, "learning_rate": 0.00043012511167992405, "loss": 4.2689, "step": 1178 }, { "epoch": 0.37706130444342284, "grad_norm": 0.36462417244911194, "learning_rate": 0.0004298449615119343, "loss": 4.2306, "step": 1179 }, { "epoch": 0.3773811189510084, "grad_norm": 0.3268580138683319, "learning_rate": 0.0004295646719427758, "loss": 4.2826, "step": 1180 }, { "epoch": 0.377700933458594, "grad_norm": 0.44085097312927246, "learning_rate": 0.00042928424327336667, "loss": 4.2645, "step": 1181 }, { "epoch": 0.3780207479661796, "grad_norm": 0.31006789207458496, "learning_rate": 0.00042900367580477446, "loss": 4.1979, "step": 1182 }, { "epoch": 0.37834056247376524, "grad_norm": 0.3364875316619873, "learning_rate": 0.0004287229698382154, "loss": 4.2647, "step": 1183 }, { "epoch": 0.3786603769813508, "grad_norm": 0.45685797929763794, "learning_rate": 0.0004284421256750547, "loss": 4.3052, "step": 1184 }, { "epoch": 0.3789801914889364, "grad_norm": 0.4049988389015198, "learning_rate": 0.0004281611436168059, "loss": 4.2515, "step": 1185 }, { "epoch": 0.379300005996522, "grad_norm": 0.3493066430091858, "learning_rate": 0.00042788002396513023, "loss": 4.2413, "step": 1186 }, { "epoch": 0.37961982050410764, "grad_norm": 0.40198519825935364, "learning_rate": 0.00042759876702183706, "loss": 4.2685, "step": 1187 }, { "epoch": 0.3799396350116932, "grad_norm": 0.3397846519947052, "learning_rate": 0.0004273173730888831, "loss": 4.2085, "step": 1188 }, { "epoch": 0.3802594495192788, "grad_norm": 0.5147667527198792, "learning_rate": 0.00042703584246837206, "loss": 4.2582, "step": 1189 }, { "epoch": 0.38057926402686443, "grad_norm": 0.32970157265663147, "learning_rate": 0.0004267541754625543, "loss": 4.2705, "step": 1190 }, { "epoch": 0.38089907853445004, "grad_norm": 0.3348868489265442, "learning_rate": 0.00042647237237382666, "loss": 4.1939, "step": 1191 }, { "epoch": 0.3812188930420356, "grad_norm": 0.33571240305900574, "learning_rate": 0.00042619043350473223, "loss": 4.3685, "step": 1192 }, { "epoch": 0.3815387075496212, "grad_norm": 0.37644755840301514, "learning_rate": 0.0004259083591579596, "loss": 4.2379, "step": 1193 }, { "epoch": 0.38185852205720683, "grad_norm": 0.40287286043167114, "learning_rate": 0.000425626149636343, "loss": 4.1694, "step": 1194 }, { "epoch": 0.38217833656479244, "grad_norm": 0.32822495698928833, "learning_rate": 0.0004253438052428619, "loss": 4.212, "step": 1195 }, { "epoch": 0.382498151072378, "grad_norm": 0.34690216183662415, "learning_rate": 0.00042506132628064016, "loss": 4.2939, "step": 1196 }, { "epoch": 0.3828179655799636, "grad_norm": 0.34783118963241577, "learning_rate": 0.00042477871305294655, "loss": 4.3151, "step": 1197 }, { "epoch": 0.38313778008754923, "grad_norm": 0.4219350218772888, "learning_rate": 0.0004244959658631938, "loss": 4.1929, "step": 1198 }, { "epoch": 0.38345759459513484, "grad_norm": 0.3672184646129608, "learning_rate": 0.00042421308501493823, "loss": 4.1591, "step": 1199 }, { "epoch": 0.3837774091027204, "grad_norm": 0.30949458479881287, "learning_rate": 0.0004239300708118802, "loss": 4.2211, "step": 1200 }, { "epoch": 0.3837774091027204, "eval_runtime": 42.2661, "eval_samples_per_second": 44.882, "eval_steps_per_second": 11.238, "step": 1200 }, { "epoch": 0.384097223610306, "grad_norm": 0.4713312089443207, "learning_rate": 0.0004236469235578627, "loss": 4.2575, "step": 1201 }, { "epoch": 0.38441703811789163, "grad_norm": 0.670041561126709, "learning_rate": 0.0004233636435568719, "loss": 4.2631, "step": 1202 }, { "epoch": 0.38473685262547724, "grad_norm": 0.3080516457557678, "learning_rate": 0.00042308023111303636, "loss": 4.242, "step": 1203 }, { "epoch": 0.3850566671330628, "grad_norm": 0.45693066716194153, "learning_rate": 0.00042279668653062686, "loss": 4.2663, "step": 1204 }, { "epoch": 0.3853764816406484, "grad_norm": 0.32466092705726624, "learning_rate": 0.0004225130101140559, "loss": 4.2215, "step": 1205 }, { "epoch": 0.38569629614823403, "grad_norm": 0.3173944652080536, "learning_rate": 0.00042222920216787786, "loss": 4.2295, "step": 1206 }, { "epoch": 0.38601611065581964, "grad_norm": 0.3243696391582489, "learning_rate": 0.000421945262996788, "loss": 4.2847, "step": 1207 }, { "epoch": 0.3863359251634052, "grad_norm": 0.3683353364467621, "learning_rate": 0.0004216611929056225, "loss": 4.2735, "step": 1208 }, { "epoch": 0.3866557396709908, "grad_norm": 0.44163763523101807, "learning_rate": 0.0004213769921993583, "loss": 4.2795, "step": 1209 }, { "epoch": 0.38697555417857643, "grad_norm": 0.3980725109577179, "learning_rate": 0.0004210926611831124, "loss": 4.0819, "step": 1210 }, { "epoch": 0.38729536868616204, "grad_norm": 0.40014225244522095, "learning_rate": 0.0004208082001621417, "loss": 4.2705, "step": 1211 }, { "epoch": 0.3876151831937476, "grad_norm": 0.5397045016288757, "learning_rate": 0.0004205236094418428, "loss": 4.2801, "step": 1212 }, { "epoch": 0.3879349977013332, "grad_norm": 0.36510613560676575, "learning_rate": 0.0004202388893277515, "loss": 4.2174, "step": 1213 }, { "epoch": 0.38825481220891883, "grad_norm": 0.4227857291698456, "learning_rate": 0.00041995404012554226, "loss": 4.1985, "step": 1214 }, { "epoch": 0.38857462671650445, "grad_norm": 0.3449905216693878, "learning_rate": 0.0004196690621410285, "loss": 4.2093, "step": 1215 }, { "epoch": 0.38889444122409, "grad_norm": 0.4645538032054901, "learning_rate": 0.0004193839556801617, "loss": 4.2472, "step": 1216 }, { "epoch": 0.3892142557316756, "grad_norm": 0.48015058040618896, "learning_rate": 0.0004190987210490314, "loss": 4.2528, "step": 1217 }, { "epoch": 0.38953407023926123, "grad_norm": 0.41599658131599426, "learning_rate": 0.00041881335855386463, "loss": 4.2649, "step": 1218 }, { "epoch": 0.38985388474684685, "grad_norm": 0.3830937147140503, "learning_rate": 0.00041852786850102557, "loss": 4.1982, "step": 1219 }, { "epoch": 0.3901736992544324, "grad_norm": 0.393541544675827, "learning_rate": 0.00041824225119701576, "loss": 4.2637, "step": 1220 }, { "epoch": 0.390493513762018, "grad_norm": 0.3500911593437195, "learning_rate": 0.0004179565069484729, "loss": 4.2202, "step": 1221 }, { "epoch": 0.39081332826960363, "grad_norm": 0.3576711118221283, "learning_rate": 0.0004176706360621713, "loss": 4.1918, "step": 1222 }, { "epoch": 0.39113314277718925, "grad_norm": 0.4570639431476593, "learning_rate": 0.0004173846388450209, "loss": 4.1981, "step": 1223 }, { "epoch": 0.3914529572847748, "grad_norm": 0.3474108874797821, "learning_rate": 0.0004170985156040677, "loss": 4.1862, "step": 1224 }, { "epoch": 0.3917727717923604, "grad_norm": 0.32948338985443115, "learning_rate": 0.0004168122666464927, "loss": 4.263, "step": 1225 }, { "epoch": 0.39209258629994603, "grad_norm": 0.33027228713035583, "learning_rate": 0.0004165258922796119, "loss": 4.2863, "step": 1226 }, { "epoch": 0.39241240080753165, "grad_norm": 0.30686843395233154, "learning_rate": 0.00041623939281087605, "loss": 4.2372, "step": 1227 }, { "epoch": 0.3927322153151172, "grad_norm": 0.31884413957595825, "learning_rate": 0.00041595276854787007, "loss": 4.1383, "step": 1228 }, { "epoch": 0.3930520298227028, "grad_norm": 0.46335259079933167, "learning_rate": 0.00041566601979831287, "loss": 4.245, "step": 1229 }, { "epoch": 0.39337184433028843, "grad_norm": 0.41647928953170776, "learning_rate": 0.00041537914687005714, "loss": 4.2791, "step": 1230 }, { "epoch": 0.39369165883787405, "grad_norm": 0.5393695831298828, "learning_rate": 0.00041509215007108885, "loss": 4.2444, "step": 1231 }, { "epoch": 0.3940114733454596, "grad_norm": 0.3126046061515808, "learning_rate": 0.0004148050297095269, "loss": 4.2636, "step": 1232 }, { "epoch": 0.3943312878530452, "grad_norm": 0.3176352381706238, "learning_rate": 0.00041451778609362286, "loss": 4.2205, "step": 1233 }, { "epoch": 0.39465110236063083, "grad_norm": 0.6550034284591675, "learning_rate": 0.0004142304195317605, "loss": 4.1842, "step": 1234 }, { "epoch": 0.39497091686821645, "grad_norm": 0.39726129174232483, "learning_rate": 0.00041394293033245597, "loss": 4.1665, "step": 1235 }, { "epoch": 0.39529073137580206, "grad_norm": 0.34352433681488037, "learning_rate": 0.00041365531880435647, "loss": 4.242, "step": 1236 }, { "epoch": 0.3956105458833876, "grad_norm": 0.3200819194316864, "learning_rate": 0.0004133675852562413, "loss": 4.3594, "step": 1237 }, { "epoch": 0.39593036039097323, "grad_norm": 0.7176745533943176, "learning_rate": 0.00041307972999702014, "loss": 4.1622, "step": 1238 }, { "epoch": 0.39625017489855885, "grad_norm": 0.33239173889160156, "learning_rate": 0.00041279175333573345, "loss": 4.3232, "step": 1239 }, { "epoch": 0.39656998940614446, "grad_norm": 0.3267849087715149, "learning_rate": 0.00041250365558155236, "loss": 4.322, "step": 1240 }, { "epoch": 0.39688980391373, "grad_norm": 0.3703368008136749, "learning_rate": 0.0004122154370437776, "loss": 4.232, "step": 1241 }, { "epoch": 0.39720961842131564, "grad_norm": 0.3273005485534668, "learning_rate": 0.0004119270980318398, "loss": 4.2255, "step": 1242 }, { "epoch": 0.39752943292890125, "grad_norm": 0.37401920557022095, "learning_rate": 0.0004116386388552988, "loss": 4.249, "step": 1243 }, { "epoch": 0.39784924743648686, "grad_norm": 0.35054463148117065, "learning_rate": 0.0004113500598238437, "loss": 4.2677, "step": 1244 }, { "epoch": 0.3981690619440724, "grad_norm": 0.35126587748527527, "learning_rate": 0.000411061361247292, "loss": 4.2701, "step": 1245 }, { "epoch": 0.39848887645165804, "grad_norm": 0.3643576204776764, "learning_rate": 0.00041077254343558955, "loss": 4.2452, "step": 1246 }, { "epoch": 0.39880869095924365, "grad_norm": 0.4299747943878174, "learning_rate": 0.00041048360669881055, "loss": 4.2913, "step": 1247 }, { "epoch": 0.39912850546682926, "grad_norm": 0.3413592576980591, "learning_rate": 0.0004101945513471563, "loss": 4.2381, "step": 1248 }, { "epoch": 0.3994483199744148, "grad_norm": 0.410049706697464, "learning_rate": 0.000409905377690956, "loss": 4.2595, "step": 1249 }, { "epoch": 0.39976813448200044, "grad_norm": 0.3631017804145813, "learning_rate": 0.00040961608604066566, "loss": 4.2574, "step": 1250 }, { "epoch": 0.40008794898958605, "grad_norm": 0.33316096663475037, "learning_rate": 0.0004093266767068677, "loss": 4.1716, "step": 1251 }, { "epoch": 0.40040776349717166, "grad_norm": 0.33460763096809387, "learning_rate": 0.0004090371500002715, "loss": 4.2001, "step": 1252 }, { "epoch": 0.4007275780047572, "grad_norm": 0.3317466974258423, "learning_rate": 0.00040874750623171176, "loss": 4.1485, "step": 1253 }, { "epoch": 0.40104739251234284, "grad_norm": 0.305465966463089, "learning_rate": 0.00040845774571214924, "loss": 4.2558, "step": 1254 }, { "epoch": 0.40136720701992845, "grad_norm": 0.35846203565597534, "learning_rate": 0.0004081678687526701, "loss": 4.2194, "step": 1255 }, { "epoch": 0.40168702152751407, "grad_norm": 0.3070729076862335, "learning_rate": 0.0004078778756644854, "loss": 4.2918, "step": 1256 }, { "epoch": 0.4020068360350996, "grad_norm": 0.40230226516723633, "learning_rate": 0.00040758776675893065, "loss": 4.2516, "step": 1257 }, { "epoch": 0.40232665054268524, "grad_norm": 0.5927528142929077, "learning_rate": 0.00040729754234746613, "loss": 4.1899, "step": 1258 }, { "epoch": 0.40264646505027085, "grad_norm": 0.35053226351737976, "learning_rate": 0.0004070072027416758, "loss": 4.2241, "step": 1259 }, { "epoch": 0.40296627955785647, "grad_norm": 0.3327595293521881, "learning_rate": 0.00040671674825326745, "loss": 4.2157, "step": 1260 }, { "epoch": 0.403286094065442, "grad_norm": 0.3997345566749573, "learning_rate": 0.0004064261791940723, "loss": 4.1673, "step": 1261 }, { "epoch": 0.40360590857302764, "grad_norm": 0.6464436650276184, "learning_rate": 0.0004061354958760441, "loss": 4.2945, "step": 1262 }, { "epoch": 0.40392572308061325, "grad_norm": 0.3762631118297577, "learning_rate": 0.00040584469861126, "loss": 4.1757, "step": 1263 }, { "epoch": 0.40424553758819887, "grad_norm": 0.31638339161872864, "learning_rate": 0.00040555378771191876, "loss": 4.1744, "step": 1264 }, { "epoch": 0.4045653520957844, "grad_norm": 0.43923285603523254, "learning_rate": 0.0004052627634903417, "loss": 4.2814, "step": 1265 }, { "epoch": 0.40488516660337004, "grad_norm": 0.4571131467819214, "learning_rate": 0.00040497162625897134, "loss": 4.2227, "step": 1266 }, { "epoch": 0.40520498111095565, "grad_norm": 0.4038392901420593, "learning_rate": 0.00040468037633037196, "loss": 4.1466, "step": 1267 }, { "epoch": 0.40552479561854127, "grad_norm": 0.5921345353126526, "learning_rate": 0.0004043890140172286, "loss": 4.1655, "step": 1268 }, { "epoch": 0.4058446101261268, "grad_norm": 0.4224293529987335, "learning_rate": 0.00040409753963234675, "loss": 4.2793, "step": 1269 }, { "epoch": 0.40616442463371244, "grad_norm": 0.36951300501823425, "learning_rate": 0.00040380595348865286, "loss": 4.1462, "step": 1270 }, { "epoch": 0.40648423914129805, "grad_norm": 0.33291858434677124, "learning_rate": 0.00040351425589919257, "loss": 4.2967, "step": 1271 }, { "epoch": 0.40680405364888367, "grad_norm": 0.4488460421562195, "learning_rate": 0.0004032224471771317, "loss": 4.229, "step": 1272 }, { "epoch": 0.4071238681564692, "grad_norm": 0.47688713669776917, "learning_rate": 0.00040293052763575537, "loss": 4.1741, "step": 1273 }, { "epoch": 0.40744368266405484, "grad_norm": 0.3574417233467102, "learning_rate": 0.0004026384975884673, "loss": 4.2141, "step": 1274 }, { "epoch": 0.40776349717164045, "grad_norm": 0.3303288221359253, "learning_rate": 0.00040234635734879036, "loss": 4.2708, "step": 1275 }, { "epoch": 0.40808331167922607, "grad_norm": 0.33420050144195557, "learning_rate": 0.00040205410723036526, "loss": 4.1872, "step": 1276 }, { "epoch": 0.4084031261868116, "grad_norm": 0.3651393949985504, "learning_rate": 0.0004017617475469508, "loss": 4.2027, "step": 1277 }, { "epoch": 0.40872294069439724, "grad_norm": 0.46024513244628906, "learning_rate": 0.00040146927861242366, "loss": 4.1241, "step": 1278 }, { "epoch": 0.40904275520198285, "grad_norm": 0.35662344098091125, "learning_rate": 0.00040117670074077747, "loss": 4.2101, "step": 1279 }, { "epoch": 0.40936256970956847, "grad_norm": 0.4037419855594635, "learning_rate": 0.00040088401424612317, "loss": 4.2017, "step": 1280 }, { "epoch": 0.409682384217154, "grad_norm": 0.3635391294956207, "learning_rate": 0.000400591219442688, "loss": 4.1656, "step": 1281 }, { "epoch": 0.41000219872473964, "grad_norm": 0.3675207495689392, "learning_rate": 0.0004002983166448155, "loss": 4.1989, "step": 1282 }, { "epoch": 0.41032201323232526, "grad_norm": 0.34774187207221985, "learning_rate": 0.0004000053061669654, "loss": 4.2134, "step": 1283 }, { "epoch": 0.41064182773991087, "grad_norm": 0.34826135635375977, "learning_rate": 0.00039971218832371284, "loss": 4.2726, "step": 1284 }, { "epoch": 0.41096164224749643, "grad_norm": 0.32837414741516113, "learning_rate": 0.0003994189634297483, "loss": 4.107, "step": 1285 }, { "epoch": 0.41128145675508204, "grad_norm": 0.5760036706924438, "learning_rate": 0.00039912563179987713, "loss": 4.2871, "step": 1286 }, { "epoch": 0.41160127126266766, "grad_norm": 0.32344427704811096, "learning_rate": 0.00039883219374901933, "loss": 4.17, "step": 1287 }, { "epoch": 0.41192108577025327, "grad_norm": 0.32329803705215454, "learning_rate": 0.0003985386495922091, "loss": 4.182, "step": 1288 }, { "epoch": 0.41224090027783883, "grad_norm": 0.4408464729785919, "learning_rate": 0.00039824499964459455, "loss": 4.1857, "step": 1289 }, { "epoch": 0.41256071478542444, "grad_norm": 0.4378527104854584, "learning_rate": 0.00039795124422143746, "loss": 4.1227, "step": 1290 }, { "epoch": 0.41288052929301006, "grad_norm": 0.3393475413322449, "learning_rate": 0.0003976573836381128, "loss": 4.2468, "step": 1291 }, { "epoch": 0.41320034380059567, "grad_norm": 0.4020253121852875, "learning_rate": 0.00039736341821010833, "loss": 4.2474, "step": 1292 }, { "epoch": 0.41352015830818123, "grad_norm": 0.36049744486808777, "learning_rate": 0.0003970693482530247, "loss": 4.1996, "step": 1293 }, { "epoch": 0.41383997281576684, "grad_norm": 0.39351141452789307, "learning_rate": 0.00039677517408257424, "loss": 4.1736, "step": 1294 }, { "epoch": 0.41415978732335246, "grad_norm": 0.3423965573310852, "learning_rate": 0.00039648089601458165, "loss": 4.167, "step": 1295 }, { "epoch": 0.41447960183093807, "grad_norm": 0.38224858045578003, "learning_rate": 0.000396186514364983, "loss": 4.3002, "step": 1296 }, { "epoch": 0.41479941633852363, "grad_norm": 0.3113231062889099, "learning_rate": 0.0003958920294498255, "loss": 4.2659, "step": 1297 }, { "epoch": 0.41511923084610924, "grad_norm": 0.3761517107486725, "learning_rate": 0.00039559744158526735, "loss": 4.2073, "step": 1298 }, { "epoch": 0.41543904535369486, "grad_norm": 0.4494081437587738, "learning_rate": 0.0003953027510875772, "loss": 4.1748, "step": 1299 }, { "epoch": 0.41575885986128047, "grad_norm": 0.3728586435317993, "learning_rate": 0.0003950079582731339, "loss": 4.2584, "step": 1300 }, { "epoch": 0.41575885986128047, "eval_runtime": 43.2753, "eval_samples_per_second": 43.836, "eval_steps_per_second": 10.976, "step": 1300 }, { "epoch": 0.41607867436886603, "grad_norm": 0.351593941450119, "learning_rate": 0.0003947130634584261, "loss": 4.2141, "step": 1301 }, { "epoch": 0.41639848887645164, "grad_norm": 0.34159013628959656, "learning_rate": 0.000394418066960052, "loss": 4.2135, "step": 1302 }, { "epoch": 0.41671830338403726, "grad_norm": 0.3718115985393524, "learning_rate": 0.00039412296909471914, "loss": 4.1869, "step": 1303 }, { "epoch": 0.4170381178916229, "grad_norm": 0.37006789445877075, "learning_rate": 0.00039382777017924354, "loss": 4.1954, "step": 1304 }, { "epoch": 0.4173579323992085, "grad_norm": 0.5150507092475891, "learning_rate": 0.00039353247053054984, "loss": 4.2007, "step": 1305 }, { "epoch": 0.41767774690679405, "grad_norm": 0.4240642488002777, "learning_rate": 0.0003932370704656711, "loss": 4.2219, "step": 1306 }, { "epoch": 0.41799756141437966, "grad_norm": 0.3280806541442871, "learning_rate": 0.00039294157030174783, "loss": 4.2169, "step": 1307 }, { "epoch": 0.4183173759219653, "grad_norm": 0.31104522943496704, "learning_rate": 0.00039264597035602807, "loss": 4.1463, "step": 1308 }, { "epoch": 0.4186371904295509, "grad_norm": 0.34534013271331787, "learning_rate": 0.0003923502709458672, "loss": 4.2092, "step": 1309 }, { "epoch": 0.41895700493713645, "grad_norm": 0.32108622789382935, "learning_rate": 0.00039205447238872706, "loss": 4.1816, "step": 1310 }, { "epoch": 0.41927681944472206, "grad_norm": 0.3265717029571533, "learning_rate": 0.0003917585750021763, "loss": 4.1989, "step": 1311 }, { "epoch": 0.4195966339523077, "grad_norm": 0.3503040075302124, "learning_rate": 0.0003914625791038893, "loss": 4.1861, "step": 1312 }, { "epoch": 0.4199164484598933, "grad_norm": 0.36474478244781494, "learning_rate": 0.00039116648501164665, "loss": 4.1248, "step": 1313 }, { "epoch": 0.42023626296747885, "grad_norm": 0.33244428038597107, "learning_rate": 0.0003908702930433338, "loss": 4.181, "step": 1314 }, { "epoch": 0.42055607747506446, "grad_norm": 0.3336569666862488, "learning_rate": 0.0003905740035169417, "loss": 4.1559, "step": 1315 }, { "epoch": 0.4208758919826501, "grad_norm": 0.3262631893157959, "learning_rate": 0.00039027761675056595, "loss": 4.1413, "step": 1316 }, { "epoch": 0.4211957064902357, "grad_norm": 0.33067336678504944, "learning_rate": 0.0003899811330624065, "loss": 4.1378, "step": 1317 }, { "epoch": 0.42151552099782125, "grad_norm": 0.34728142619132996, "learning_rate": 0.0003896845527707673, "loss": 4.1591, "step": 1318 }, { "epoch": 0.42183533550540686, "grad_norm": 0.5357210636138916, "learning_rate": 0.00038938787619405616, "loss": 4.2273, "step": 1319 }, { "epoch": 0.4221551500129925, "grad_norm": 0.5904281735420227, "learning_rate": 0.00038909110365078413, "loss": 4.1553, "step": 1320 }, { "epoch": 0.4224749645205781, "grad_norm": 0.39853817224502563, "learning_rate": 0.00038879423545956534, "loss": 4.1125, "step": 1321 }, { "epoch": 0.42279477902816365, "grad_norm": 0.40623781085014343, "learning_rate": 0.00038849727193911664, "loss": 4.1922, "step": 1322 }, { "epoch": 0.42311459353574926, "grad_norm": 0.6153369545936584, "learning_rate": 0.0003882002134082571, "loss": 4.1908, "step": 1323 }, { "epoch": 0.4234344080433349, "grad_norm": 0.4433586001396179, "learning_rate": 0.000387903060185908, "loss": 4.17, "step": 1324 }, { "epoch": 0.4237542225509205, "grad_norm": 0.38687431812286377, "learning_rate": 0.00038760581259109214, "loss": 4.1879, "step": 1325 }, { "epoch": 0.42407403705850605, "grad_norm": 0.5282825231552124, "learning_rate": 0.0003873084709429336, "loss": 4.2238, "step": 1326 }, { "epoch": 0.42439385156609166, "grad_norm": 0.4060686230659485, "learning_rate": 0.00038701103556065754, "loss": 4.2484, "step": 1327 }, { "epoch": 0.4247136660736773, "grad_norm": 0.37743812799453735, "learning_rate": 0.0003867135067635898, "loss": 4.1963, "step": 1328 }, { "epoch": 0.4250334805812629, "grad_norm": 0.33393755555152893, "learning_rate": 0.0003864158848711562, "loss": 4.1817, "step": 1329 }, { "epoch": 0.42535329508884845, "grad_norm": 0.662140965461731, "learning_rate": 0.000386118170202883, "loss": 4.1647, "step": 1330 }, { "epoch": 0.42567310959643406, "grad_norm": 0.3202478289604187, "learning_rate": 0.00038582036307839557, "loss": 4.2276, "step": 1331 }, { "epoch": 0.4259929241040197, "grad_norm": 0.4697231650352478, "learning_rate": 0.00038552246381741884, "loss": 4.1306, "step": 1332 }, { "epoch": 0.4263127386116053, "grad_norm": 0.3259800672531128, "learning_rate": 0.0003852244727397766, "loss": 4.2903, "step": 1333 }, { "epoch": 0.42663255311919085, "grad_norm": 0.3587746024131775, "learning_rate": 0.00038492639016539116, "loss": 4.2133, "step": 1334 }, { "epoch": 0.42695236762677646, "grad_norm": 0.4351878762245178, "learning_rate": 0.0003846282164142831, "loss": 4.2692, "step": 1335 }, { "epoch": 0.4272721821343621, "grad_norm": 0.37234535813331604, "learning_rate": 0.00038432995180657094, "loss": 4.1341, "step": 1336 }, { "epoch": 0.4275919966419477, "grad_norm": 0.3554999530315399, "learning_rate": 0.00038403159666247063, "loss": 4.1538, "step": 1337 }, { "epoch": 0.42791181114953325, "grad_norm": 0.37163516879081726, "learning_rate": 0.0003837331513022954, "loss": 4.1935, "step": 1338 }, { "epoch": 0.42823162565711886, "grad_norm": 0.36984384059906006, "learning_rate": 0.0003834346160464553, "loss": 4.2113, "step": 1339 }, { "epoch": 0.4285514401647045, "grad_norm": 0.4967077374458313, "learning_rate": 0.0003831359912154569, "loss": 4.1756, "step": 1340 }, { "epoch": 0.4288712546722901, "grad_norm": 0.41509532928466797, "learning_rate": 0.0003828372771299029, "loss": 4.0993, "step": 1341 }, { "epoch": 0.42919106917987565, "grad_norm": 0.5049270987510681, "learning_rate": 0.00038253847411049194, "loss": 4.079, "step": 1342 }, { "epoch": 0.42951088368746126, "grad_norm": 0.7024607062339783, "learning_rate": 0.000382239582478018, "loss": 4.2543, "step": 1343 }, { "epoch": 0.4298306981950469, "grad_norm": 0.4005550146102905, "learning_rate": 0.00038194060255337026, "loss": 4.2566, "step": 1344 }, { "epoch": 0.4301505127026325, "grad_norm": 0.4691022038459778, "learning_rate": 0.0003816415346575327, "loss": 4.1658, "step": 1345 }, { "epoch": 0.43047032721021805, "grad_norm": 0.6480712294578552, "learning_rate": 0.0003813423791115838, "loss": 4.0673, "step": 1346 }, { "epoch": 0.43079014171780367, "grad_norm": 0.5240592956542969, "learning_rate": 0.00038104313623669604, "loss": 4.2306, "step": 1347 }, { "epoch": 0.4311099562253893, "grad_norm": 0.3090254068374634, "learning_rate": 0.0003807438063541356, "loss": 4.1958, "step": 1348 }, { "epoch": 0.4314297707329749, "grad_norm": 0.5625685453414917, "learning_rate": 0.00038044438978526235, "loss": 4.2393, "step": 1349 }, { "epoch": 0.43174958524056045, "grad_norm": 0.4118185341358185, "learning_rate": 0.0003801448868515287, "loss": 4.1558, "step": 1350 }, { "epoch": 0.43206939974814607, "grad_norm": 0.36767861247062683, "learning_rate": 0.00037984529787448047, "loss": 4.1872, "step": 1351 }, { "epoch": 0.4323892142557317, "grad_norm": 0.33349353075027466, "learning_rate": 0.0003795456231757554, "loss": 4.1896, "step": 1352 }, { "epoch": 0.4327090287633173, "grad_norm": 0.304738849401474, "learning_rate": 0.0003792458630770833, "loss": 4.1663, "step": 1353 }, { "epoch": 0.43302884327090285, "grad_norm": 0.34616366028785706, "learning_rate": 0.00037894601790028576, "loss": 4.2542, "step": 1354 }, { "epoch": 0.43334865777848847, "grad_norm": 0.6568637490272522, "learning_rate": 0.0003786460879672756, "loss": 4.1556, "step": 1355 }, { "epoch": 0.4336684722860741, "grad_norm": 0.6700954437255859, "learning_rate": 0.0003783460736000569, "loss": 4.0721, "step": 1356 }, { "epoch": 0.4339882867936597, "grad_norm": 0.4154500663280487, "learning_rate": 0.0003780459751207241, "loss": 4.1548, "step": 1357 }, { "epoch": 0.43430810130124525, "grad_norm": 0.572571337223053, "learning_rate": 0.0003777457928514619, "loss": 4.2181, "step": 1358 }, { "epoch": 0.43462791580883087, "grad_norm": 0.47688862681388855, "learning_rate": 0.0003774455271145454, "loss": 4.1083, "step": 1359 }, { "epoch": 0.4349477303164165, "grad_norm": 0.33533376455307007, "learning_rate": 0.0003771451782323388, "loss": 4.1551, "step": 1360 }, { "epoch": 0.4352675448240021, "grad_norm": 0.3115862011909485, "learning_rate": 0.0003768447465272959, "loss": 4.106, "step": 1361 }, { "epoch": 0.43558735933158765, "grad_norm": 0.3317044675350189, "learning_rate": 0.0003765442323219591, "loss": 4.1614, "step": 1362 }, { "epoch": 0.43590717383917327, "grad_norm": 0.36808913946151733, "learning_rate": 0.00037624363593895976, "loss": 4.1857, "step": 1363 }, { "epoch": 0.4362269883467589, "grad_norm": 0.377441942691803, "learning_rate": 0.00037594295770101716, "loss": 4.1788, "step": 1364 }, { "epoch": 0.4365468028543445, "grad_norm": 0.33686643838882446, "learning_rate": 0.0003756421979309387, "loss": 4.1971, "step": 1365 }, { "epoch": 0.43686661736193005, "grad_norm": 0.3213209807872772, "learning_rate": 0.00037534135695161904, "loss": 4.1215, "step": 1366 }, { "epoch": 0.43718643186951567, "grad_norm": 0.39843055605888367, "learning_rate": 0.0003750404350860402, "loss": 4.1651, "step": 1367 }, { "epoch": 0.4375062463771013, "grad_norm": 0.5910980105400085, "learning_rate": 0.00037473943265727114, "loss": 4.2175, "step": 1368 }, { "epoch": 0.4378260608846869, "grad_norm": 0.4348961412906647, "learning_rate": 0.000374438349988467, "loss": 4.1866, "step": 1369 }, { "epoch": 0.43814587539227245, "grad_norm": 0.41352027654647827, "learning_rate": 0.00037413718740286935, "loss": 4.0934, "step": 1370 }, { "epoch": 0.43846568989985807, "grad_norm": 0.3208502233028412, "learning_rate": 0.00037383594522380546, "loss": 4.1758, "step": 1371 }, { "epoch": 0.4387855044074437, "grad_norm": 0.3612997829914093, "learning_rate": 0.00037353462377468806, "loss": 4.2138, "step": 1372 }, { "epoch": 0.4391053189150293, "grad_norm": 0.35409602522850037, "learning_rate": 0.0003732332233790149, "loss": 4.0984, "step": 1373 }, { "epoch": 0.4394251334226149, "grad_norm": 0.3285612463951111, "learning_rate": 0.00037293174436036855, "loss": 4.3041, "step": 1374 }, { "epoch": 0.43974494793020047, "grad_norm": 0.4354185163974762, "learning_rate": 0.000372630187042416, "loss": 4.2465, "step": 1375 }, { "epoch": 0.4400647624377861, "grad_norm": 0.45516887307167053, "learning_rate": 0.0003723285517489084, "loss": 4.1564, "step": 1376 }, { "epoch": 0.4403845769453717, "grad_norm": 0.35248568654060364, "learning_rate": 0.0003720268388036805, "loss": 4.1354, "step": 1377 }, { "epoch": 0.4407043914529573, "grad_norm": 0.5774437189102173, "learning_rate": 0.0003717250485306503, "loss": 4.2308, "step": 1378 }, { "epoch": 0.44102420596054287, "grad_norm": 0.5937883257865906, "learning_rate": 0.00037142318125381915, "loss": 4.2045, "step": 1379 }, { "epoch": 0.4413440204681285, "grad_norm": 0.4784785211086273, "learning_rate": 0.0003711212372972706, "loss": 4.155, "step": 1380 }, { "epoch": 0.4416638349757141, "grad_norm": 0.8106619715690613, "learning_rate": 0.000370819216985171, "loss": 4.2335, "step": 1381 }, { "epoch": 0.4419836494832997, "grad_norm": 0.35943347215652466, "learning_rate": 0.0003705171206417685, "loss": 4.1474, "step": 1382 }, { "epoch": 0.44230346399088527, "grad_norm": 0.5781442523002625, "learning_rate": 0.0003702149485913926, "loss": 4.0904, "step": 1383 }, { "epoch": 0.4426232784984709, "grad_norm": 0.41128629446029663, "learning_rate": 0.0003699127011584546, "loss": 4.2005, "step": 1384 }, { "epoch": 0.4429430930060565, "grad_norm": 0.3602639138698578, "learning_rate": 0.0003696103786674463, "loss": 4.1694, "step": 1385 }, { "epoch": 0.4432629075136421, "grad_norm": 0.3897869288921356, "learning_rate": 0.0003693079814429403, "loss": 4.206, "step": 1386 }, { "epoch": 0.44358272202122767, "grad_norm": 0.4483744204044342, "learning_rate": 0.00036900550980958934, "loss": 4.0887, "step": 1387 }, { "epoch": 0.4439025365288133, "grad_norm": 0.34620460867881775, "learning_rate": 0.000368702964092126, "loss": 4.0832, "step": 1388 }, { "epoch": 0.4442223510363989, "grad_norm": 0.3690264821052551, "learning_rate": 0.0003684003446153627, "loss": 4.1426, "step": 1389 }, { "epoch": 0.4445421655439845, "grad_norm": 0.3391917645931244, "learning_rate": 0.0003680976517041905, "loss": 4.072, "step": 1390 }, { "epoch": 0.44486198005157007, "grad_norm": 0.7859227061271667, "learning_rate": 0.00036779488568358, "loss": 4.0881, "step": 1391 }, { "epoch": 0.4451817945591557, "grad_norm": 0.36248093843460083, "learning_rate": 0.00036749204687857955, "loss": 4.0893, "step": 1392 }, { "epoch": 0.4455016090667413, "grad_norm": 0.3668447434902191, "learning_rate": 0.00036718913561431613, "loss": 4.1506, "step": 1393 }, { "epoch": 0.4458214235743269, "grad_norm": 0.4075997471809387, "learning_rate": 0.0003668861522159945, "loss": 4.0668, "step": 1394 }, { "epoch": 0.4461412380819125, "grad_norm": 0.5935477018356323, "learning_rate": 0.00036658309700889655, "loss": 4.1256, "step": 1395 }, { "epoch": 0.4464610525894981, "grad_norm": 0.48834285140037537, "learning_rate": 0.0003662799703183817, "loss": 4.1729, "step": 1396 }, { "epoch": 0.4467808670970837, "grad_norm": 0.3952053487300873, "learning_rate": 0.00036597677246988564, "loss": 4.1451, "step": 1397 }, { "epoch": 0.4471006816046693, "grad_norm": 0.34440720081329346, "learning_rate": 0.00036567350378892074, "loss": 4.1913, "step": 1398 }, { "epoch": 0.4474204961122549, "grad_norm": 0.4927520751953125, "learning_rate": 0.00036537016460107545, "loss": 4.1667, "step": 1399 }, { "epoch": 0.4477403106198405, "grad_norm": 0.38067302107810974, "learning_rate": 0.00036506675523201385, "loss": 4.2262, "step": 1400 }, { "epoch": 0.4477403106198405, "eval_runtime": 42.0071, "eval_samples_per_second": 45.159, "eval_steps_per_second": 11.308, "step": 1400 }, { "epoch": 0.4480601251274261, "grad_norm": 0.4869115352630615, "learning_rate": 0.0003647632760074751, "loss": 4.1271, "step": 1401 }, { "epoch": 0.4483799396350117, "grad_norm": 0.31394392251968384, "learning_rate": 0.0003644597272532739, "loss": 4.2053, "step": 1402 }, { "epoch": 0.4486997541425973, "grad_norm": 0.3356005847454071, "learning_rate": 0.00036415610929529913, "loss": 4.1842, "step": 1403 }, { "epoch": 0.4490195686501829, "grad_norm": 0.32470637559890747, "learning_rate": 0.00036385242245951427, "loss": 4.2052, "step": 1404 }, { "epoch": 0.4493393831577685, "grad_norm": 0.5195385813713074, "learning_rate": 0.0003635486670719564, "loss": 4.1925, "step": 1405 }, { "epoch": 0.4496591976653541, "grad_norm": 0.2910522520542145, "learning_rate": 0.0003632448434587366, "loss": 4.1326, "step": 1406 }, { "epoch": 0.4499790121729397, "grad_norm": 0.39860472083091736, "learning_rate": 0.00036294095194603905, "loss": 4.1354, "step": 1407 }, { "epoch": 0.4502988266805253, "grad_norm": 0.35632720589637756, "learning_rate": 0.00036263699286012056, "loss": 4.165, "step": 1408 }, { "epoch": 0.4506186411881109, "grad_norm": 0.34689879417419434, "learning_rate": 0.0003623329665273108, "loss": 4.0851, "step": 1409 }, { "epoch": 0.4509384556956965, "grad_norm": 0.36565715074539185, "learning_rate": 0.00036202887327401167, "loss": 4.1472, "step": 1410 }, { "epoch": 0.4512582702032821, "grad_norm": 0.36994320154190063, "learning_rate": 0.00036172471342669663, "loss": 4.1935, "step": 1411 }, { "epoch": 0.4515780847108677, "grad_norm": 0.39877843856811523, "learning_rate": 0.000361420487311911, "loss": 4.1451, "step": 1412 }, { "epoch": 0.4518978992184533, "grad_norm": 0.37688520550727844, "learning_rate": 0.0003611161952562707, "loss": 4.1469, "step": 1413 }, { "epoch": 0.4522177137260389, "grad_norm": 0.32469335198402405, "learning_rate": 0.00036081183758646313, "loss": 4.1529, "step": 1414 }, { "epoch": 0.4525375282336245, "grad_norm": 0.5636565685272217, "learning_rate": 0.00036050741462924563, "loss": 4.1937, "step": 1415 }, { "epoch": 0.4528573427412101, "grad_norm": 0.35731878876686096, "learning_rate": 0.0003602029267114457, "loss": 4.1568, "step": 1416 }, { "epoch": 0.4531771572487957, "grad_norm": 0.32796329259872437, "learning_rate": 0.000359898374159961, "loss": 4.1478, "step": 1417 }, { "epoch": 0.4534969717563813, "grad_norm": 0.3077724575996399, "learning_rate": 0.0003595937573017579, "loss": 4.2, "step": 1418 }, { "epoch": 0.4538167862639669, "grad_norm": 0.38929975032806396, "learning_rate": 0.00035928907646387234, "loss": 4.1748, "step": 1419 }, { "epoch": 0.4541366007715525, "grad_norm": 0.37046897411346436, "learning_rate": 0.00035898433197340874, "loss": 4.1845, "step": 1420 }, { "epoch": 0.4544564152791381, "grad_norm": 0.577949583530426, "learning_rate": 0.0003586795241575398, "loss": 4.1968, "step": 1421 }, { "epoch": 0.4547762297867237, "grad_norm": 0.41907408833503723, "learning_rate": 0.00035837465334350637, "loss": 4.1916, "step": 1422 }, { "epoch": 0.4550960442943093, "grad_norm": 0.3775298297405243, "learning_rate": 0.0003580697198586169, "loss": 4.1683, "step": 1423 }, { "epoch": 0.4554158588018949, "grad_norm": 0.39509111642837524, "learning_rate": 0.0003577647240302471, "loss": 4.2644, "step": 1424 }, { "epoch": 0.4557356733094805, "grad_norm": 0.37813782691955566, "learning_rate": 0.0003574596661858396, "loss": 4.111, "step": 1425 }, { "epoch": 0.4560554878170661, "grad_norm": 0.373153954744339, "learning_rate": 0.00035715454665290343, "loss": 4.133, "step": 1426 }, { "epoch": 0.4563753023246517, "grad_norm": 0.39696401357650757, "learning_rate": 0.0003568493657590142, "loss": 4.1195, "step": 1427 }, { "epoch": 0.4566951168322373, "grad_norm": 0.37225645780563354, "learning_rate": 0.0003565441238318131, "loss": 4.2039, "step": 1428 }, { "epoch": 0.4570149313398229, "grad_norm": 0.36852118372917175, "learning_rate": 0.000356238821199007, "loss": 4.069, "step": 1429 }, { "epoch": 0.4573347458474085, "grad_norm": 0.34762415289878845, "learning_rate": 0.000355933458188368, "loss": 4.1935, "step": 1430 }, { "epoch": 0.4576545603549941, "grad_norm": 0.5953834056854248, "learning_rate": 0.00035562803512773284, "loss": 4.1333, "step": 1431 }, { "epoch": 0.4579743748625797, "grad_norm": 0.35744547843933105, "learning_rate": 0.00035532255234500284, "loss": 4.1208, "step": 1432 }, { "epoch": 0.4582941893701653, "grad_norm": 0.3656679391860962, "learning_rate": 0.0003550170101681434, "loss": 4.1018, "step": 1433 }, { "epoch": 0.4586140038777509, "grad_norm": 0.334159791469574, "learning_rate": 0.00035471140892518366, "loss": 4.1339, "step": 1434 }, { "epoch": 0.4589338183853365, "grad_norm": 0.32746434211730957, "learning_rate": 0.0003544057489442164, "loss": 4.1337, "step": 1435 }, { "epoch": 0.4592536328929221, "grad_norm": 0.3329735994338989, "learning_rate": 0.0003541000305533971, "loss": 4.2315, "step": 1436 }, { "epoch": 0.4595734474005077, "grad_norm": 0.4345363676548004, "learning_rate": 0.00035379425408094416, "loss": 4.2204, "step": 1437 }, { "epoch": 0.4598932619080933, "grad_norm": 0.4318109452724457, "learning_rate": 0.00035348841985513834, "loss": 4.1554, "step": 1438 }, { "epoch": 0.4602130764156789, "grad_norm": 0.35080626606941223, "learning_rate": 0.00035318252820432236, "loss": 4.1392, "step": 1439 }, { "epoch": 0.4605328909232645, "grad_norm": 0.32207825779914856, "learning_rate": 0.00035287657945690045, "loss": 4.135, "step": 1440 }, { "epoch": 0.4608527054308501, "grad_norm": 0.40435412526130676, "learning_rate": 0.0003525705739413385, "loss": 4.1811, "step": 1441 }, { "epoch": 0.4611725199384357, "grad_norm": 0.4196210503578186, "learning_rate": 0.000352264511986163, "loss": 4.1047, "step": 1442 }, { "epoch": 0.46149233444602134, "grad_norm": 0.36664676666259766, "learning_rate": 0.0003519583939199613, "loss": 4.0995, "step": 1443 }, { "epoch": 0.4618121489536069, "grad_norm": 0.6406135559082031, "learning_rate": 0.00035165222007138076, "loss": 4.1712, "step": 1444 }, { "epoch": 0.4621319634611925, "grad_norm": 0.3827049434185028, "learning_rate": 0.00035134599076912866, "loss": 4.142, "step": 1445 }, { "epoch": 0.4624517779687781, "grad_norm": 0.32631683349609375, "learning_rate": 0.00035103970634197193, "loss": 4.1598, "step": 1446 }, { "epoch": 0.46277159247636374, "grad_norm": 0.5294780731201172, "learning_rate": 0.00035073336711873666, "loss": 4.2318, "step": 1447 }, { "epoch": 0.4630914069839493, "grad_norm": 0.36057692766189575, "learning_rate": 0.00035042697342830783, "loss": 4.1732, "step": 1448 }, { "epoch": 0.4634112214915349, "grad_norm": 0.41221415996551514, "learning_rate": 0.00035012052559962853, "loss": 4.1658, "step": 1449 }, { "epoch": 0.4637310359991205, "grad_norm": 0.3409416377544403, "learning_rate": 0.0003498140239617005, "loss": 4.1707, "step": 1450 }, { "epoch": 0.46405085050670614, "grad_norm": 0.3248681426048279, "learning_rate": 0.0003495074688435829, "loss": 4.1657, "step": 1451 }, { "epoch": 0.4643706650142917, "grad_norm": 0.3181304931640625, "learning_rate": 0.00034920086057439243, "loss": 4.1739, "step": 1452 }, { "epoch": 0.4646904795218773, "grad_norm": 0.32246774435043335, "learning_rate": 0.0003488941994833028, "loss": 4.083, "step": 1453 }, { "epoch": 0.4650102940294629, "grad_norm": 0.33430513739585876, "learning_rate": 0.00034858748589954437, "loss": 4.1026, "step": 1454 }, { "epoch": 0.46533010853704854, "grad_norm": 0.40877765417099, "learning_rate": 0.0003482807201524042, "loss": 4.1484, "step": 1455 }, { "epoch": 0.4656499230446341, "grad_norm": 0.3767591714859009, "learning_rate": 0.00034797390257122486, "loss": 4.1051, "step": 1456 }, { "epoch": 0.4659697375522197, "grad_norm": 0.3127965033054352, "learning_rate": 0.0003476670334854049, "loss": 4.1839, "step": 1457 }, { "epoch": 0.4662895520598053, "grad_norm": 0.3863343596458435, "learning_rate": 0.00034736011322439796, "loss": 4.1525, "step": 1458 }, { "epoch": 0.46660936656739094, "grad_norm": 0.352515310049057, "learning_rate": 0.0003470531421177128, "loss": 4.1112, "step": 1459 }, { "epoch": 0.4669291810749765, "grad_norm": 0.3956506848335266, "learning_rate": 0.00034674612049491276, "loss": 4.1297, "step": 1460 }, { "epoch": 0.4672489955825621, "grad_norm": 0.3826676607131958, "learning_rate": 0.0003464390486856153, "loss": 4.1716, "step": 1461 }, { "epoch": 0.4675688100901477, "grad_norm": 0.3917238116264343, "learning_rate": 0.0003461319270194919, "loss": 4.1042, "step": 1462 }, { "epoch": 0.46788862459773334, "grad_norm": 0.49299493432044983, "learning_rate": 0.0003458247558262672, "loss": 4.119, "step": 1463 }, { "epoch": 0.4682084391053189, "grad_norm": 0.433088481426239, "learning_rate": 0.0003455175354357195, "loss": 4.2126, "step": 1464 }, { "epoch": 0.4685282536129045, "grad_norm": 0.3167847692966461, "learning_rate": 0.0003452102661776798, "loss": 4.0001, "step": 1465 }, { "epoch": 0.4688480681204901, "grad_norm": 0.39964962005615234, "learning_rate": 0.0003449029483820313, "loss": 4.0599, "step": 1466 }, { "epoch": 0.46916788262807574, "grad_norm": 0.4375894367694855, "learning_rate": 0.00034459558237870955, "loss": 4.1411, "step": 1467 }, { "epoch": 0.4694876971356613, "grad_norm": 0.31997978687286377, "learning_rate": 0.00034428816849770173, "loss": 4.1452, "step": 1468 }, { "epoch": 0.4698075116432469, "grad_norm": 0.4670328199863434, "learning_rate": 0.00034398070706904657, "loss": 4.1169, "step": 1469 }, { "epoch": 0.4701273261508325, "grad_norm": 0.3502720892429352, "learning_rate": 0.0003436731984228336, "loss": 4.0849, "step": 1470 }, { "epoch": 0.47044714065841814, "grad_norm": 0.32370302081108093, "learning_rate": 0.00034336564288920334, "loss": 4.0169, "step": 1471 }, { "epoch": 0.4707669551660037, "grad_norm": 0.8601170182228088, "learning_rate": 0.0003430580407983465, "loss": 4.1213, "step": 1472 }, { "epoch": 0.4710867696735893, "grad_norm": 0.5899140238761902, "learning_rate": 0.00034275039248050384, "loss": 4.0764, "step": 1473 }, { "epoch": 0.4714065841811749, "grad_norm": 0.3963443338871002, "learning_rate": 0.00034244269826596543, "loss": 4.1167, "step": 1474 }, { "epoch": 0.47172639868876054, "grad_norm": 0.3760378360748291, "learning_rate": 0.0003421349584850711, "loss": 4.1512, "step": 1475 }, { "epoch": 0.4720462131963461, "grad_norm": 0.4537976384162903, "learning_rate": 0.0003418271734682093, "loss": 4.1094, "step": 1476 }, { "epoch": 0.4723660277039317, "grad_norm": 0.5556796193122864, "learning_rate": 0.00034151934354581715, "loss": 4.18, "step": 1477 }, { "epoch": 0.4726858422115173, "grad_norm": 0.37939462065696716, "learning_rate": 0.00034121146904837995, "loss": 4.1603, "step": 1478 }, { "epoch": 0.47300565671910294, "grad_norm": 0.38446247577667236, "learning_rate": 0.00034090355030643083, "loss": 4.107, "step": 1479 }, { "epoch": 0.4733254712266885, "grad_norm": 0.37731459736824036, "learning_rate": 0.00034059558765055047, "loss": 4.1664, "step": 1480 }, { "epoch": 0.4736452857342741, "grad_norm": 0.5014836192131042, "learning_rate": 0.0003402875814113666, "loss": 4.0636, "step": 1481 }, { "epoch": 0.4739651002418597, "grad_norm": 0.549540102481842, "learning_rate": 0.00033997953191955383, "loss": 4.0481, "step": 1482 }, { "epoch": 0.47428491474944534, "grad_norm": 0.3488246202468872, "learning_rate": 0.0003396714395058333, "loss": 4.0931, "step": 1483 }, { "epoch": 0.4746047292570309, "grad_norm": 0.37121206521987915, "learning_rate": 0.00033936330450097193, "loss": 4.0934, "step": 1484 }, { "epoch": 0.4749245437646165, "grad_norm": 0.5817061066627502, "learning_rate": 0.0003390551272357829, "loss": 4.1365, "step": 1485 }, { "epoch": 0.4752443582722021, "grad_norm": 0.3825298845767975, "learning_rate": 0.00033874690804112397, "loss": 4.1189, "step": 1486 }, { "epoch": 0.47556417277978774, "grad_norm": 0.43237248063087463, "learning_rate": 0.00033843864724789866, "loss": 4.114, "step": 1487 }, { "epoch": 0.4758839872873733, "grad_norm": 0.38237112760543823, "learning_rate": 0.00033813034518705463, "loss": 4.1563, "step": 1488 }, { "epoch": 0.4762038017949589, "grad_norm": 0.31087061762809753, "learning_rate": 0.00033782200218958433, "loss": 4.1455, "step": 1489 }, { "epoch": 0.47652361630254453, "grad_norm": 0.3619230091571808, "learning_rate": 0.00033751361858652375, "loss": 4.1041, "step": 1490 }, { "epoch": 0.47684343081013014, "grad_norm": 0.3648468255996704, "learning_rate": 0.0003372051947089526, "loss": 4.1729, "step": 1491 }, { "epoch": 0.4771632453177157, "grad_norm": 0.39360687136650085, "learning_rate": 0.0003368967308879939, "loss": 4.1986, "step": 1492 }, { "epoch": 0.4774830598253013, "grad_norm": 0.35672345757484436, "learning_rate": 0.0003365882274548135, "loss": 4.0632, "step": 1493 }, { "epoch": 0.47780287433288693, "grad_norm": 0.41038697957992554, "learning_rate": 0.00033627968474061966, "loss": 4.0924, "step": 1494 }, { "epoch": 0.47812268884047254, "grad_norm": 0.34004780650138855, "learning_rate": 0.0003359711030766631, "loss": 4.1671, "step": 1495 }, { "epoch": 0.4784425033480581, "grad_norm": 0.38863426446914673, "learning_rate": 0.0003356624827942361, "loss": 4.1077, "step": 1496 }, { "epoch": 0.4787623178556437, "grad_norm": 0.3457399308681488, "learning_rate": 0.00033535382422467255, "loss": 4.0423, "step": 1497 }, { "epoch": 0.47908213236322933, "grad_norm": 0.3752861022949219, "learning_rate": 0.0003350451276993473, "loss": 4.1592, "step": 1498 }, { "epoch": 0.47940194687081494, "grad_norm": 0.3314865827560425, "learning_rate": 0.000334736393549676, "loss": 4.0649, "step": 1499 }, { "epoch": 0.4797217613784005, "grad_norm": 0.4653809070587158, "learning_rate": 0.00033442762210711483, "loss": 4.0766, "step": 1500 }, { "epoch": 0.4797217613784005, "eval_runtime": 42.9713, "eval_samples_per_second": 44.146, "eval_steps_per_second": 11.054, "step": 1500 }, { "epoch": 0.4800415758859861, "grad_norm": 0.4948757290840149, "learning_rate": 0.0003341188137031599, "loss": 4.1, "step": 1501 }, { "epoch": 0.48036139039357173, "grad_norm": 0.43844202160835266, "learning_rate": 0.0003338099686693469, "loss": 4.1702, "step": 1502 }, { "epoch": 0.48068120490115734, "grad_norm": 0.38527727127075195, "learning_rate": 0.00033350108733725103, "loss": 4.0891, "step": 1503 }, { "epoch": 0.4810010194087429, "grad_norm": 0.3630102276802063, "learning_rate": 0.00033319217003848644, "loss": 4.1548, "step": 1504 }, { "epoch": 0.4813208339163285, "grad_norm": 0.3592187464237213, "learning_rate": 0.0003328832171047057, "loss": 4.1025, "step": 1505 }, { "epoch": 0.48164064842391413, "grad_norm": 0.3607748746871948, "learning_rate": 0.0003325742288675998, "loss": 4.1096, "step": 1506 }, { "epoch": 0.48196046293149974, "grad_norm": 0.5581963062286377, "learning_rate": 0.0003322652056588976, "loss": 4.1531, "step": 1507 }, { "epoch": 0.4822802774390853, "grad_norm": 0.34117671847343445, "learning_rate": 0.0003319561478103656, "loss": 4.16, "step": 1508 }, { "epoch": 0.4826000919466709, "grad_norm": 0.344590961933136, "learning_rate": 0.00033164705565380737, "loss": 4.0816, "step": 1509 }, { "epoch": 0.48291990645425653, "grad_norm": 0.5942979454994202, "learning_rate": 0.00033133792952106327, "loss": 4.1014, "step": 1510 }, { "epoch": 0.48323972096184215, "grad_norm": 0.5177531242370605, "learning_rate": 0.0003310287697440102, "loss": 4.0342, "step": 1511 }, { "epoch": 0.48355953546942776, "grad_norm": 0.3519006073474884, "learning_rate": 0.0003307195766545612, "loss": 4.0419, "step": 1512 }, { "epoch": 0.4838793499770133, "grad_norm": 0.622458279132843, "learning_rate": 0.00033041035058466525, "loss": 4.1797, "step": 1513 }, { "epoch": 0.48419916448459893, "grad_norm": 0.33985814452171326, "learning_rate": 0.00033010109186630625, "loss": 4.0562, "step": 1514 }, { "epoch": 0.48451897899218455, "grad_norm": 0.3592928349971771, "learning_rate": 0.00032979180083150366, "loss": 4.0741, "step": 1515 }, { "epoch": 0.48483879349977016, "grad_norm": 0.6950055360794067, "learning_rate": 0.00032948247781231134, "loss": 4.126, "step": 1516 }, { "epoch": 0.4851586080073557, "grad_norm": 0.44427594542503357, "learning_rate": 0.0003291731231408175, "loss": 4.118, "step": 1517 }, { "epoch": 0.48547842251494133, "grad_norm": 0.3500032424926758, "learning_rate": 0.00032886373714914455, "loss": 4.123, "step": 1518 }, { "epoch": 0.48579823702252695, "grad_norm": 0.3127995729446411, "learning_rate": 0.00032855432016944835, "loss": 4.0496, "step": 1519 }, { "epoch": 0.48611805153011256, "grad_norm": 0.4494898021221161, "learning_rate": 0.000328244872533918, "loss": 4.1302, "step": 1520 }, { "epoch": 0.4864378660376981, "grad_norm": 0.3141591250896454, "learning_rate": 0.00032793539457477564, "loss": 4.0679, "step": 1521 }, { "epoch": 0.48675768054528373, "grad_norm": 0.447311669588089, "learning_rate": 0.00032762588662427585, "loss": 4.0039, "step": 1522 }, { "epoch": 0.48707749505286935, "grad_norm": 0.3813198208808899, "learning_rate": 0.0003273163490147054, "loss": 4.0099, "step": 1523 }, { "epoch": 0.48739730956045496, "grad_norm": 0.4869134724140167, "learning_rate": 0.0003270067820783831, "loss": 4.0944, "step": 1524 }, { "epoch": 0.4877171240680405, "grad_norm": 0.356149286031723, "learning_rate": 0.0003266971861476589, "loss": 4.1428, "step": 1525 }, { "epoch": 0.48803693857562613, "grad_norm": 0.33206453919410706, "learning_rate": 0.00032638756155491436, "loss": 4.0902, "step": 1526 }, { "epoch": 0.48835675308321175, "grad_norm": 0.7001564502716064, "learning_rate": 0.0003260779086325612, "loss": 4.106, "step": 1527 }, { "epoch": 0.48867656759079736, "grad_norm": 0.335132896900177, "learning_rate": 0.0003257682277130422, "loss": 4.0664, "step": 1528 }, { "epoch": 0.4889963820983829, "grad_norm": 0.3467666506767273, "learning_rate": 0.0003254585191288297, "loss": 4.0964, "step": 1529 }, { "epoch": 0.48931619660596853, "grad_norm": 0.48314183950424194, "learning_rate": 0.0003251487832124259, "loss": 4.0103, "step": 1530 }, { "epoch": 0.48963601111355415, "grad_norm": 0.4692327082157135, "learning_rate": 0.00032483902029636257, "loss": 4.1113, "step": 1531 }, { "epoch": 0.48995582562113976, "grad_norm": 0.5636443495750427, "learning_rate": 0.00032452923071320006, "loss": 4.1355, "step": 1532 }, { "epoch": 0.4902756401287253, "grad_norm": 0.6990973949432373, "learning_rate": 0.00032421941479552767, "loss": 4.1273, "step": 1533 }, { "epoch": 0.49059545463631093, "grad_norm": 0.44104430079460144, "learning_rate": 0.00032390957287596275, "loss": 4.153, "step": 1534 }, { "epoch": 0.49091526914389655, "grad_norm": 0.3496834337711334, "learning_rate": 0.0003235997052871508, "loss": 4.1292, "step": 1535 }, { "epoch": 0.49123508365148216, "grad_norm": 0.4113143980503082, "learning_rate": 0.00032328981236176465, "loss": 4.0417, "step": 1536 }, { "epoch": 0.4915548981590677, "grad_norm": 0.33827435970306396, "learning_rate": 0.00032297989443250445, "loss": 4.0919, "step": 1537 }, { "epoch": 0.49187471266665334, "grad_norm": 0.31342950463294983, "learning_rate": 0.0003226699518320973, "loss": 4.1291, "step": 1538 }, { "epoch": 0.49219452717423895, "grad_norm": 0.3942566215991974, "learning_rate": 0.0003223599848932964, "loss": 4.0714, "step": 1539 }, { "epoch": 0.49251434168182456, "grad_norm": 0.30204451084136963, "learning_rate": 0.0003220499939488817, "loss": 4.0417, "step": 1540 }, { "epoch": 0.4928341561894101, "grad_norm": 0.4029451608657837, "learning_rate": 0.0003217399793316583, "loss": 4.1639, "step": 1541 }, { "epoch": 0.49315397069699574, "grad_norm": 0.36016273498535156, "learning_rate": 0.00032142994137445693, "loss": 4.0426, "step": 1542 }, { "epoch": 0.49347378520458135, "grad_norm": 0.4433692693710327, "learning_rate": 0.0003211198804101337, "loss": 4.0998, "step": 1543 }, { "epoch": 0.49379359971216696, "grad_norm": 0.5803028345108032, "learning_rate": 0.000320809796771569, "loss": 4.0622, "step": 1544 }, { "epoch": 0.4941134142197525, "grad_norm": 0.32752785086631775, "learning_rate": 0.00032049969079166765, "loss": 4.0054, "step": 1545 }, { "epoch": 0.49443322872733814, "grad_norm": 0.3414701223373413, "learning_rate": 0.0003201895628033587, "loss": 4.1581, "step": 1546 }, { "epoch": 0.49475304323492375, "grad_norm": 0.31249287724494934, "learning_rate": 0.00031987941313959433, "loss": 4.1529, "step": 1547 }, { "epoch": 0.49507285774250936, "grad_norm": 0.681825578212738, "learning_rate": 0.0003195692421333506, "loss": 4.0889, "step": 1548 }, { "epoch": 0.4953926722500949, "grad_norm": 0.35818034410476685, "learning_rate": 0.0003192590501176261, "loss": 4.0724, "step": 1549 }, { "epoch": 0.49571248675768054, "grad_norm": 0.435090571641922, "learning_rate": 0.0003189488374254421, "loss": 4.0859, "step": 1550 }, { "epoch": 0.49603230126526615, "grad_norm": 0.4104984998703003, "learning_rate": 0.00031863860438984193, "loss": 4.148, "step": 1551 }, { "epoch": 0.49635211577285177, "grad_norm": 0.42864561080932617, "learning_rate": 0.00031832835134389093, "loss": 4.1541, "step": 1552 }, { "epoch": 0.4966719302804373, "grad_norm": 0.46398380398750305, "learning_rate": 0.0003180180786206759, "loss": 4.0462, "step": 1553 }, { "epoch": 0.49699174478802294, "grad_norm": 0.3906078040599823, "learning_rate": 0.0003177077865533046, "loss": 4.0608, "step": 1554 }, { "epoch": 0.49731155929560855, "grad_norm": 0.3377833664417267, "learning_rate": 0.00031739747547490584, "loss": 4.0764, "step": 1555 }, { "epoch": 0.49763137380319417, "grad_norm": 0.3614610731601715, "learning_rate": 0.0003170871457186286, "loss": 4.1298, "step": 1556 }, { "epoch": 0.4979511883107797, "grad_norm": 0.3353988528251648, "learning_rate": 0.0003167767976176419, "loss": 3.9492, "step": 1557 }, { "epoch": 0.49827100281836534, "grad_norm": 0.46879708766937256, "learning_rate": 0.0003164664315051347, "loss": 4.0959, "step": 1558 }, { "epoch": 0.49859081732595095, "grad_norm": 0.4320794939994812, "learning_rate": 0.00031615604771431514, "loss": 4.053, "step": 1559 }, { "epoch": 0.49891063183353657, "grad_norm": 0.4326510727405548, "learning_rate": 0.00031584564657841015, "loss": 4.1258, "step": 1560 }, { "epoch": 0.4992304463411221, "grad_norm": 0.42747175693511963, "learning_rate": 0.0003155352284306657, "loss": 4.0548, "step": 1561 }, { "epoch": 0.49955026084870774, "grad_norm": 0.5194921493530273, "learning_rate": 0.00031522479360434567, "loss": 4.0666, "step": 1562 }, { "epoch": 0.49987007535629335, "grad_norm": 0.34342098236083984, "learning_rate": 0.00031491434243273214, "loss": 4.1897, "step": 1563 }, { "epoch": 0.5001898898638789, "grad_norm": 0.3505987524986267, "learning_rate": 0.00031460387524912437, "loss": 4.1167, "step": 1564 }, { "epoch": 0.5005097043714646, "grad_norm": 0.40554648637771606, "learning_rate": 0.0003142933923868391, "loss": 3.9827, "step": 1565 }, { "epoch": 0.5008295188790501, "grad_norm": 0.5192351341247559, "learning_rate": 0.00031398289417920976, "loss": 4.099, "step": 1566 }, { "epoch": 0.5011493333866357, "grad_norm": 0.3549719452857971, "learning_rate": 0.00031367238095958644, "loss": 4.1459, "step": 1567 }, { "epoch": 0.5014691478942214, "grad_norm": 0.33742159605026245, "learning_rate": 0.00031336185306133523, "loss": 3.9932, "step": 1568 }, { "epoch": 0.5017889624018069, "grad_norm": 0.3752286434173584, "learning_rate": 0.0003130513108178378, "loss": 4.043, "step": 1569 }, { "epoch": 0.5021087769093926, "grad_norm": 0.5240476727485657, "learning_rate": 0.0003127407545624915, "loss": 4.2195, "step": 1570 }, { "epoch": 0.5024285914169782, "grad_norm": 0.39175838232040405, "learning_rate": 0.0003124301846287085, "loss": 4.0877, "step": 1571 }, { "epoch": 0.5027484059245637, "grad_norm": 0.395557701587677, "learning_rate": 0.00031211960134991596, "loss": 4.091, "step": 1572 }, { "epoch": 0.5030682204321494, "grad_norm": 0.5906341671943665, "learning_rate": 0.00031180900505955496, "loss": 4.1541, "step": 1573 }, { "epoch": 0.5033880349397349, "grad_norm": 0.32791221141815186, "learning_rate": 0.000311498396091081, "loss": 4.1287, "step": 1574 }, { "epoch": 0.5037078494473205, "grad_norm": 0.4331429600715637, "learning_rate": 0.00031118777477796275, "loss": 4.0703, "step": 1575 }, { "epoch": 0.5040276639549062, "grad_norm": 0.44842204451560974, "learning_rate": 0.0003108771414536825, "loss": 4.1031, "step": 1576 }, { "epoch": 0.5043474784624917, "grad_norm": 0.4451039731502533, "learning_rate": 0.0003105664964517351, "loss": 4.0531, "step": 1577 }, { "epoch": 0.5046672929700774, "grad_norm": 0.3688308894634247, "learning_rate": 0.0003102558401056282, "loss": 4.0338, "step": 1578 }, { "epoch": 0.504987107477663, "grad_norm": 0.3220631778240204, "learning_rate": 0.00030994517274888155, "loss": 4.0877, "step": 1579 }, { "epoch": 0.5053069219852485, "grad_norm": 0.40773797035217285, "learning_rate": 0.00030963449471502674, "loss": 4.1061, "step": 1580 }, { "epoch": 0.5056267364928342, "grad_norm": 0.39932629466056824, "learning_rate": 0.0003093238063376068, "loss": 4.0941, "step": 1581 }, { "epoch": 0.5059465510004197, "grad_norm": 0.3180277347564697, "learning_rate": 0.00030901310795017567, "loss": 4.135, "step": 1582 }, { "epoch": 0.5062663655080054, "grad_norm": 0.3557443916797638, "learning_rate": 0.00030870239988629844, "loss": 4.0923, "step": 1583 }, { "epoch": 0.506586180015591, "grad_norm": 0.36635780334472656, "learning_rate": 0.0003083916824795503, "loss": 4.1085, "step": 1584 }, { "epoch": 0.5069059945231765, "grad_norm": 0.39818745851516724, "learning_rate": 0.0003080809560635165, "loss": 4.0002, "step": 1585 }, { "epoch": 0.5072258090307622, "grad_norm": 0.5297528505325317, "learning_rate": 0.0003077702209717921, "loss": 4.1021, "step": 1586 }, { "epoch": 0.5075456235383478, "grad_norm": 0.4554721713066101, "learning_rate": 0.0003074594775379812, "loss": 4.0474, "step": 1587 }, { "epoch": 0.5078654380459333, "grad_norm": 0.3332599997520447, "learning_rate": 0.00030714872609569733, "loss": 4.0538, "step": 1588 }, { "epoch": 0.508185252553519, "grad_norm": 0.35154786705970764, "learning_rate": 0.0003068379669785622, "loss": 4.0741, "step": 1589 }, { "epoch": 0.5085050670611045, "grad_norm": 0.3290943503379822, "learning_rate": 0.0003065272005202056, "loss": 4.054, "step": 1590 }, { "epoch": 0.5088248815686902, "grad_norm": 0.43877410888671875, "learning_rate": 0.00030621642705426586, "loss": 4.103, "step": 1591 }, { "epoch": 0.5091446960762758, "grad_norm": 0.38104188442230225, "learning_rate": 0.0003059056469143884, "loss": 4.0351, "step": 1592 }, { "epoch": 0.5094645105838613, "grad_norm": 0.3215627074241638, "learning_rate": 0.0003055948604342257, "loss": 4.0292, "step": 1593 }, { "epoch": 0.509784325091447, "grad_norm": 0.6582420468330383, "learning_rate": 0.0003052840679474373, "loss": 3.9827, "step": 1594 }, { "epoch": 0.5101041395990326, "grad_norm": 0.30473238229751587, "learning_rate": 0.0003049732697876891, "loss": 4.1003, "step": 1595 }, { "epoch": 0.5104239541066181, "grad_norm": 0.34233206510543823, "learning_rate": 0.000304662466288653, "loss": 4.1198, "step": 1596 }, { "epoch": 0.5107437686142038, "grad_norm": 0.37094613909721375, "learning_rate": 0.000304351657784007, "loss": 4.0349, "step": 1597 }, { "epoch": 0.5110635831217893, "grad_norm": 0.3599124252796173, "learning_rate": 0.0003040408446074339, "loss": 4.1672, "step": 1598 }, { "epoch": 0.511383397629375, "grad_norm": 0.3312455713748932, "learning_rate": 0.000303730027092622, "loss": 4.0407, "step": 1599 }, { "epoch": 0.5117032121369606, "grad_norm": 0.4842011332511902, "learning_rate": 0.00030341920557326385, "loss": 4.0657, "step": 1600 }, { "epoch": 0.5117032121369606, "eval_runtime": 42.6206, "eval_samples_per_second": 44.509, "eval_steps_per_second": 11.145, "step": 1600 }, { "epoch": 0.5120230266445461, "grad_norm": 0.31443530321121216, "learning_rate": 0.0003031083803830567, "loss": 4.0517, "step": 1601 }, { "epoch": 0.5123428411521318, "grad_norm": 0.6406258940696716, "learning_rate": 0.0003027975518557016, "loss": 4.1028, "step": 1602 }, { "epoch": 0.5126626556597174, "grad_norm": 0.32266587018966675, "learning_rate": 0.00030248672032490295, "loss": 4.0963, "step": 1603 }, { "epoch": 0.5129824701673029, "grad_norm": 0.3912874758243561, "learning_rate": 0.0003021758861243688, "loss": 4.0878, "step": 1604 }, { "epoch": 0.5133022846748886, "grad_norm": 0.31555572152137756, "learning_rate": 0.0003018650495878096, "loss": 4.0794, "step": 1605 }, { "epoch": 0.5136220991824741, "grad_norm": 0.3191808760166168, "learning_rate": 0.0003015542110489387, "loss": 4.0301, "step": 1606 }, { "epoch": 0.5139419136900598, "grad_norm": 0.3191812336444855, "learning_rate": 0.00030124337084147144, "loss": 4.0614, "step": 1607 }, { "epoch": 0.5142617281976454, "grad_norm": 0.45402586460113525, "learning_rate": 0.0003009325292991247, "loss": 4.0647, "step": 1608 }, { "epoch": 0.5145815427052309, "grad_norm": 0.3726907968521118, "learning_rate": 0.0003006216867556175, "loss": 4.1078, "step": 1609 }, { "epoch": 0.5149013572128166, "grad_norm": 0.32805368304252625, "learning_rate": 0.00030031084354466904, "loss": 4.0728, "step": 1610 }, { "epoch": 0.5152211717204022, "grad_norm": 0.3323350250720978, "learning_rate": 0.0003, "loss": 4.1404, "step": 1611 }, { "epoch": 0.5155409862279877, "grad_norm": 0.3396187126636505, "learning_rate": 0.00029968915645533085, "loss": 4.0735, "step": 1612 }, { "epoch": 0.5158608007355734, "grad_norm": 0.3361773192882538, "learning_rate": 0.0002993783132443825, "loss": 4.0472, "step": 1613 }, { "epoch": 0.516180615243159, "grad_norm": 0.33853742480278015, "learning_rate": 0.0002990674707008752, "loss": 4.0639, "step": 1614 }, { "epoch": 0.5165004297507446, "grad_norm": 0.42280757427215576, "learning_rate": 0.0002987566291585286, "loss": 4.0895, "step": 1615 }, { "epoch": 0.5168202442583302, "grad_norm": 0.3496019244194031, "learning_rate": 0.00029844578895106127, "loss": 4.0313, "step": 1616 }, { "epoch": 0.5171400587659157, "grad_norm": 0.31136554479599, "learning_rate": 0.0002981349504121904, "loss": 3.998, "step": 1617 }, { "epoch": 0.5174598732735014, "grad_norm": 0.33187514543533325, "learning_rate": 0.0002978241138756312, "loss": 4.0164, "step": 1618 }, { "epoch": 0.517779687781087, "grad_norm": 0.45364049077033997, "learning_rate": 0.00029751327967509695, "loss": 4.0767, "step": 1619 }, { "epoch": 0.5180995022886725, "grad_norm": 0.3755611479282379, "learning_rate": 0.0002972024481442984, "loss": 4.0475, "step": 1620 }, { "epoch": 0.5184193167962582, "grad_norm": 0.37028712034225464, "learning_rate": 0.00029689161961694323, "loss": 4.0342, "step": 1621 }, { "epoch": 0.5187391313038437, "grad_norm": 0.3561839461326599, "learning_rate": 0.00029658079442673616, "loss": 4.0952, "step": 1622 }, { "epoch": 0.5190589458114294, "grad_norm": 0.32721537351608276, "learning_rate": 0.000296269972907378, "loss": 4.0327, "step": 1623 }, { "epoch": 0.519378760319015, "grad_norm": 0.37354496121406555, "learning_rate": 0.00029595915539256605, "loss": 4.1092, "step": 1624 }, { "epoch": 0.5196985748266005, "grad_norm": 0.4460698664188385, "learning_rate": 0.000295648342215993, "loss": 4.0615, "step": 1625 }, { "epoch": 0.5200183893341862, "grad_norm": 0.4962615370750427, "learning_rate": 0.0002953375337113468, "loss": 4.0444, "step": 1626 }, { "epoch": 0.5203382038417718, "grad_norm": 0.48865246772766113, "learning_rate": 0.00029502673021231096, "loss": 4.0543, "step": 1627 }, { "epoch": 0.5206580183493573, "grad_norm": 0.5612025260925293, "learning_rate": 0.0002947159320525627, "loss": 4.0833, "step": 1628 }, { "epoch": 0.520977832856943, "grad_norm": 0.32466867566108704, "learning_rate": 0.0002944051395657744, "loss": 3.9949, "step": 1629 }, { "epoch": 0.5212976473645285, "grad_norm": 0.4937516450881958, "learning_rate": 0.0002940943530856116, "loss": 4.075, "step": 1630 }, { "epoch": 0.5216174618721142, "grad_norm": 0.533191978931427, "learning_rate": 0.00029378357294573403, "loss": 4.0165, "step": 1631 }, { "epoch": 0.5219372763796998, "grad_norm": 0.4194160997867584, "learning_rate": 0.0002934727994797944, "loss": 4.1115, "step": 1632 }, { "epoch": 0.5222570908872853, "grad_norm": 0.3668864667415619, "learning_rate": 0.0002931620330214378, "loss": 4.0874, "step": 1633 }, { "epoch": 0.522576905394871, "grad_norm": 0.5022832155227661, "learning_rate": 0.00029285127390430273, "loss": 4.0451, "step": 1634 }, { "epoch": 0.5228967199024566, "grad_norm": 0.3530194163322449, "learning_rate": 0.00029254052246201873, "loss": 4.125, "step": 1635 }, { "epoch": 0.5232165344100421, "grad_norm": 0.4069964587688446, "learning_rate": 0.00029222977902820785, "loss": 4.1147, "step": 1636 }, { "epoch": 0.5235363489176278, "grad_norm": 0.32398703694343567, "learning_rate": 0.0002919190439364835, "loss": 4.0335, "step": 1637 }, { "epoch": 0.5238561634252134, "grad_norm": 0.4013730585575104, "learning_rate": 0.00029160831752044966, "loss": 4.1049, "step": 1638 }, { "epoch": 0.524175977932799, "grad_norm": 0.3667943477630615, "learning_rate": 0.00029129760011370156, "loss": 3.9907, "step": 1639 }, { "epoch": 0.5244957924403846, "grad_norm": 0.4426942765712738, "learning_rate": 0.00029098689204982433, "loss": 4.0806, "step": 1640 }, { "epoch": 0.5248156069479701, "grad_norm": 0.36928027868270874, "learning_rate": 0.00029067619366239327, "loss": 4.0464, "step": 1641 }, { "epoch": 0.5251354214555558, "grad_norm": 0.3222687244415283, "learning_rate": 0.00029036550528497326, "loss": 4.0586, "step": 1642 }, { "epoch": 0.5254552359631414, "grad_norm": 0.32106471061706543, "learning_rate": 0.0002900548272511183, "loss": 4.1192, "step": 1643 }, { "epoch": 0.5257750504707269, "grad_norm": 0.3290542960166931, "learning_rate": 0.00028974415989437176, "loss": 4.0667, "step": 1644 }, { "epoch": 0.5260948649783126, "grad_norm": 0.35136616230010986, "learning_rate": 0.0002894335035482649, "loss": 4.1077, "step": 1645 }, { "epoch": 0.5264146794858982, "grad_norm": 0.41029417514801025, "learning_rate": 0.00028912285854631754, "loss": 4.0127, "step": 1646 }, { "epoch": 0.5267344939934838, "grad_norm": 0.3016681671142578, "learning_rate": 0.0002888122252220372, "loss": 4.0609, "step": 1647 }, { "epoch": 0.5270543085010694, "grad_norm": 0.36644431948661804, "learning_rate": 0.00028850160390891895, "loss": 4.1058, "step": 1648 }, { "epoch": 0.5273741230086549, "grad_norm": 0.3869048058986664, "learning_rate": 0.000288190994940445, "loss": 4.0179, "step": 1649 }, { "epoch": 0.5276939375162406, "grad_norm": 0.3634847402572632, "learning_rate": 0.00028788039865008404, "loss": 4.1166, "step": 1650 }, { "epoch": 0.5280137520238262, "grad_norm": 0.3887310326099396, "learning_rate": 0.0002875698153712915, "loss": 4.0161, "step": 1651 }, { "epoch": 0.5283335665314118, "grad_norm": 0.37465617060661316, "learning_rate": 0.0002872592454375086, "loss": 4.0383, "step": 1652 }, { "epoch": 0.5286533810389974, "grad_norm": 0.3881850242614746, "learning_rate": 0.00028694868918216227, "loss": 4.0616, "step": 1653 }, { "epoch": 0.528973195546583, "grad_norm": 0.5985110998153687, "learning_rate": 0.0002866381469386648, "loss": 4.0874, "step": 1654 }, { "epoch": 0.5292930100541686, "grad_norm": 0.44514885544776917, "learning_rate": 0.0002863276190404135, "loss": 4.0218, "step": 1655 }, { "epoch": 0.5296128245617542, "grad_norm": 0.5675281882286072, "learning_rate": 0.0002860171058207902, "loss": 4.1017, "step": 1656 }, { "epoch": 0.5299326390693397, "grad_norm": 0.3356822729110718, "learning_rate": 0.0002857066076131609, "loss": 4.0572, "step": 1657 }, { "epoch": 0.5302524535769254, "grad_norm": 0.5394580960273743, "learning_rate": 0.00028539612475087563, "loss": 4.0329, "step": 1658 }, { "epoch": 0.530572268084511, "grad_norm": 0.7685129046440125, "learning_rate": 0.0002850856575672679, "loss": 4.0363, "step": 1659 }, { "epoch": 0.5308920825920966, "grad_norm": 0.3382013142108917, "learning_rate": 0.0002847752063956543, "loss": 4.0467, "step": 1660 }, { "epoch": 0.5312118970996822, "grad_norm": 0.41088518500328064, "learning_rate": 0.00028446477156933425, "loss": 3.9888, "step": 1661 }, { "epoch": 0.5315317116072678, "grad_norm": 0.7157890200614929, "learning_rate": 0.0002841543534215898, "loss": 4.1127, "step": 1662 }, { "epoch": 0.5318515261148534, "grad_norm": 0.39408162236213684, "learning_rate": 0.0002838439522856849, "loss": 4.0687, "step": 1663 }, { "epoch": 0.532171340622439, "grad_norm": 0.6037306785583496, "learning_rate": 0.00028353356849486526, "loss": 4.0626, "step": 1664 }, { "epoch": 0.5324911551300245, "grad_norm": 0.6773415803909302, "learning_rate": 0.00028322320238235814, "loss": 3.9787, "step": 1665 }, { "epoch": 0.5328109696376102, "grad_norm": 0.36819398403167725, "learning_rate": 0.00028291285428137146, "loss": 4.1131, "step": 1666 }, { "epoch": 0.5331307841451958, "grad_norm": 0.40404972434043884, "learning_rate": 0.0002826025245250941, "loss": 4.0551, "step": 1667 }, { "epoch": 0.5334505986527814, "grad_norm": 0.526303768157959, "learning_rate": 0.00028229221344669534, "loss": 4.0899, "step": 1668 }, { "epoch": 0.533770413160367, "grad_norm": 0.36476826667785645, "learning_rate": 0.0002819819213793241, "loss": 4.0847, "step": 1669 }, { "epoch": 0.5340902276679526, "grad_norm": 0.3811258375644684, "learning_rate": 0.00028167164865610907, "loss": 4.0743, "step": 1670 }, { "epoch": 0.5344100421755382, "grad_norm": 0.39980533719062805, "learning_rate": 0.00028136139561015807, "loss": 3.9995, "step": 1671 }, { "epoch": 0.5347298566831238, "grad_norm": 0.39294004440307617, "learning_rate": 0.00028105116257455786, "loss": 3.9854, "step": 1672 }, { "epoch": 0.5350496711907093, "grad_norm": 0.3292320966720581, "learning_rate": 0.00028074094988237385, "loss": 4.0632, "step": 1673 }, { "epoch": 0.535369485698295, "grad_norm": 0.388158917427063, "learning_rate": 0.00028043075786664934, "loss": 4.0764, "step": 1674 }, { "epoch": 0.5356893002058806, "grad_norm": 0.3017561137676239, "learning_rate": 0.0002801205868604057, "loss": 3.9542, "step": 1675 }, { "epoch": 0.5360091147134662, "grad_norm": 0.38703247904777527, "learning_rate": 0.0002798104371966414, "loss": 4.0427, "step": 1676 }, { "epoch": 0.5363289292210518, "grad_norm": 0.36992308497428894, "learning_rate": 0.0002795003092083324, "loss": 4.1174, "step": 1677 }, { "epoch": 0.5366487437286374, "grad_norm": 0.546577513217926, "learning_rate": 0.000279190203228431, "loss": 4.0147, "step": 1678 }, { "epoch": 0.536968558236223, "grad_norm": 0.3498661518096924, "learning_rate": 0.00027888011958986623, "loss": 4.1561, "step": 1679 }, { "epoch": 0.5372883727438086, "grad_norm": 0.3992694616317749, "learning_rate": 0.00027857005862554307, "loss": 4.0285, "step": 1680 }, { "epoch": 0.5376081872513941, "grad_norm": 0.90511554479599, "learning_rate": 0.00027826002066834167, "loss": 4.1299, "step": 1681 }, { "epoch": 0.5379280017589798, "grad_norm": 0.3943385183811188, "learning_rate": 0.0002779500060511184, "loss": 4.0105, "step": 1682 }, { "epoch": 0.5382478162665654, "grad_norm": 0.41083163022994995, "learning_rate": 0.00027764001510670354, "loss": 3.9245, "step": 1683 }, { "epoch": 0.538567630774151, "grad_norm": 0.6444272994995117, "learning_rate": 0.00027733004816790267, "loss": 4.0668, "step": 1684 }, { "epoch": 0.5388874452817366, "grad_norm": 0.4096246361732483, "learning_rate": 0.00027702010556749556, "loss": 4.0423, "step": 1685 }, { "epoch": 0.5392072597893222, "grad_norm": 0.33811822533607483, "learning_rate": 0.0002767101876382353, "loss": 4.061, "step": 1686 }, { "epoch": 0.5395270742969078, "grad_norm": 0.3735474944114685, "learning_rate": 0.00027640029471284923, "loss": 4.0152, "step": 1687 }, { "epoch": 0.5398468888044934, "grad_norm": 0.36716386675834656, "learning_rate": 0.00027609042712403725, "loss": 4.0819, "step": 1688 }, { "epoch": 0.5401667033120789, "grad_norm": 0.41999539732933044, "learning_rate": 0.0002757805852044724, "loss": 4.0657, "step": 1689 }, { "epoch": 0.5404865178196646, "grad_norm": 0.35667943954467773, "learning_rate": 0.00027547076928679994, "loss": 4.0403, "step": 1690 }, { "epoch": 0.5408063323272502, "grad_norm": 0.656845211982727, "learning_rate": 0.0002751609797036373, "loss": 4.0227, "step": 1691 }, { "epoch": 0.5411261468348358, "grad_norm": 0.4996317923069, "learning_rate": 0.00027485121678757406, "loss": 4.09, "step": 1692 }, { "epoch": 0.5414459613424214, "grad_norm": 0.33114397525787354, "learning_rate": 0.0002745414808711703, "loss": 4.0208, "step": 1693 }, { "epoch": 0.541765775850007, "grad_norm": 0.35478827357292175, "learning_rate": 0.00027423177228695785, "loss": 4.002, "step": 1694 }, { "epoch": 0.5420855903575926, "grad_norm": 0.454840749502182, "learning_rate": 0.00027392209136743875, "loss": 3.985, "step": 1695 }, { "epoch": 0.5424054048651782, "grad_norm": 0.42486342787742615, "learning_rate": 0.0002736124384450857, "loss": 4.0289, "step": 1696 }, { "epoch": 0.5427252193727637, "grad_norm": 0.40213125944137573, "learning_rate": 0.0002733028138523411, "loss": 4.0017, "step": 1697 }, { "epoch": 0.5430450338803494, "grad_norm": 0.35876110196113586, "learning_rate": 0.0002729932179216169, "loss": 3.9766, "step": 1698 }, { "epoch": 0.543364848387935, "grad_norm": 0.34585532546043396, "learning_rate": 0.0002726836509852946, "loss": 4.0055, "step": 1699 }, { "epoch": 0.5436846628955206, "grad_norm": 0.32652515172958374, "learning_rate": 0.0002723741133757242, "loss": 4.0335, "step": 1700 }, { "epoch": 0.5436846628955206, "eval_runtime": 43.061, "eval_samples_per_second": 44.054, "eval_steps_per_second": 11.031, "step": 1700 }, { "epoch": 0.5440044774031062, "grad_norm": 0.5089649558067322, "learning_rate": 0.0002720646054252244, "loss": 4.0781, "step": 1701 }, { "epoch": 0.5443242919106918, "grad_norm": 0.37917423248291016, "learning_rate": 0.000271755127466082, "loss": 4.0729, "step": 1702 }, { "epoch": 0.5446441064182774, "grad_norm": 0.49459511041641235, "learning_rate": 0.0002714456798305516, "loss": 4.0783, "step": 1703 }, { "epoch": 0.544963920925863, "grad_norm": 0.42468175292015076, "learning_rate": 0.0002711362628508554, "loss": 3.9748, "step": 1704 }, { "epoch": 0.5452837354334485, "grad_norm": 0.3681325316429138, "learning_rate": 0.0002708268768591825, "loss": 4.085, "step": 1705 }, { "epoch": 0.5456035499410342, "grad_norm": 0.3825188875198364, "learning_rate": 0.0002705175221876887, "loss": 4.0337, "step": 1706 }, { "epoch": 0.5459233644486198, "grad_norm": 0.36943691968917847, "learning_rate": 0.00027020819916849634, "loss": 4.0276, "step": 1707 }, { "epoch": 0.5462431789562054, "grad_norm": 0.4450071454048157, "learning_rate": 0.0002698989081336937, "loss": 4.023, "step": 1708 }, { "epoch": 0.546562993463791, "grad_norm": 0.32940152287483215, "learning_rate": 0.00026958964941533475, "loss": 4.0164, "step": 1709 }, { "epoch": 0.5468828079713766, "grad_norm": 0.4260207712650299, "learning_rate": 0.00026928042334543867, "loss": 4.0055, "step": 1710 }, { "epoch": 0.5472026224789622, "grad_norm": 0.5239726901054382, "learning_rate": 0.0002689712302559898, "loss": 4.1212, "step": 1711 }, { "epoch": 0.5475224369865478, "grad_norm": 0.3462755084037781, "learning_rate": 0.0002686620704789367, "loss": 4.0522, "step": 1712 }, { "epoch": 0.5478422514941333, "grad_norm": 0.45866432785987854, "learning_rate": 0.00026835294434619263, "loss": 4.0515, "step": 1713 }, { "epoch": 0.548162066001719, "grad_norm": 0.42668798565864563, "learning_rate": 0.0002680438521896343, "loss": 4.0564, "step": 1714 }, { "epoch": 0.5484818805093046, "grad_norm": 0.388555109500885, "learning_rate": 0.0002677347943411023, "loss": 4.0777, "step": 1715 }, { "epoch": 0.5488016950168902, "grad_norm": 0.45258060097694397, "learning_rate": 0.0002674257711324002, "loss": 4.012, "step": 1716 }, { "epoch": 0.5491215095244758, "grad_norm": 0.5278245806694031, "learning_rate": 0.0002671167828952943, "loss": 4.0515, "step": 1717 }, { "epoch": 0.5494413240320614, "grad_norm": 0.6190354228019714, "learning_rate": 0.0002668078299615136, "loss": 4.0427, "step": 1718 }, { "epoch": 0.549761138539647, "grad_norm": 0.38841569423675537, "learning_rate": 0.0002664989126627489, "loss": 4.1325, "step": 1719 }, { "epoch": 0.5500809530472326, "grad_norm": 0.47609779238700867, "learning_rate": 0.00026619003133065305, "loss": 4.122, "step": 1720 }, { "epoch": 0.5504007675548183, "grad_norm": 0.44085171818733215, "learning_rate": 0.0002658811862968401, "loss": 4.0301, "step": 1721 }, { "epoch": 0.5507205820624038, "grad_norm": 0.5716449022293091, "learning_rate": 0.0002655723778928851, "loss": 4.0235, "step": 1722 }, { "epoch": 0.5510403965699894, "grad_norm": 0.4146965742111206, "learning_rate": 0.00026526360645032405, "loss": 4.0532, "step": 1723 }, { "epoch": 0.551360211077575, "grad_norm": 0.43997400999069214, "learning_rate": 0.0002649548723006527, "loss": 4.1418, "step": 1724 }, { "epoch": 0.5516800255851606, "grad_norm": 0.34734827280044556, "learning_rate": 0.00026464617577532756, "loss": 4.1796, "step": 1725 }, { "epoch": 0.5519998400927462, "grad_norm": 0.4285658001899719, "learning_rate": 0.00026433751720576385, "loss": 3.986, "step": 1726 }, { "epoch": 0.5523196546003318, "grad_norm": 0.3476516008377075, "learning_rate": 0.00026402889692333684, "loss": 4.0018, "step": 1727 }, { "epoch": 0.5526394691079174, "grad_norm": 0.39595460891723633, "learning_rate": 0.00026372031525938034, "loss": 3.9432, "step": 1728 }, { "epoch": 0.5529592836155031, "grad_norm": 0.5140640139579773, "learning_rate": 0.0002634117725451865, "loss": 4.0044, "step": 1729 }, { "epoch": 0.5532790981230886, "grad_norm": 0.3805321156978607, "learning_rate": 0.00026310326911200616, "loss": 4.0336, "step": 1730 }, { "epoch": 0.5535989126306742, "grad_norm": 0.6569865345954895, "learning_rate": 0.0002627948052910474, "loss": 4.1356, "step": 1731 }, { "epoch": 0.5539187271382598, "grad_norm": 0.5452926754951477, "learning_rate": 0.00026248638141347614, "loss": 4.0345, "step": 1732 }, { "epoch": 0.5542385416458454, "grad_norm": 0.352804958820343, "learning_rate": 0.00026217799781041567, "loss": 4.1062, "step": 1733 }, { "epoch": 0.554558356153431, "grad_norm": 0.3797714412212372, "learning_rate": 0.00026186965481294526, "loss": 3.9511, "step": 1734 }, { "epoch": 0.5548781706610166, "grad_norm": 0.43765488266944885, "learning_rate": 0.0002615613527521014, "loss": 4.0472, "step": 1735 }, { "epoch": 0.5551979851686022, "grad_norm": 0.4260343611240387, "learning_rate": 0.00026125309195887603, "loss": 4.0278, "step": 1736 }, { "epoch": 0.5555177996761879, "grad_norm": 0.3735271394252777, "learning_rate": 0.0002609448727642172, "loss": 3.9373, "step": 1737 }, { "epoch": 0.5558376141837734, "grad_norm": 0.4758170545101166, "learning_rate": 0.000260636695499028, "loss": 4.0541, "step": 1738 }, { "epoch": 0.556157428691359, "grad_norm": 0.31774604320526123, "learning_rate": 0.00026032856049416664, "loss": 3.9905, "step": 1739 }, { "epoch": 0.5564772431989446, "grad_norm": 0.458478718996048, "learning_rate": 0.00026002046808044617, "loss": 4.0608, "step": 1740 }, { "epoch": 0.5567970577065302, "grad_norm": 0.32627275586128235, "learning_rate": 0.0002597124185886334, "loss": 4.0963, "step": 1741 }, { "epoch": 0.5571168722141158, "grad_norm": 0.4891500174999237, "learning_rate": 0.0002594044123494496, "loss": 4.0229, "step": 1742 }, { "epoch": 0.5574366867217014, "grad_norm": 0.3926384449005127, "learning_rate": 0.00025909644969356917, "loss": 4.0057, "step": 1743 }, { "epoch": 0.557756501229287, "grad_norm": 0.3676303029060364, "learning_rate": 0.00025878853095162, "loss": 4.0332, "step": 1744 }, { "epoch": 0.5580763157368727, "grad_norm": 0.3462071716785431, "learning_rate": 0.00025848065645418285, "loss": 3.9714, "step": 1745 }, { "epoch": 0.5583961302444582, "grad_norm": 0.42904749512672424, "learning_rate": 0.00025817282653179064, "loss": 4.1009, "step": 1746 }, { "epoch": 0.5587159447520438, "grad_norm": 0.6535913944244385, "learning_rate": 0.0002578650415149289, "loss": 4.0261, "step": 1747 }, { "epoch": 0.5590357592596295, "grad_norm": 0.4931611120700836, "learning_rate": 0.00025755730173403457, "loss": 3.9667, "step": 1748 }, { "epoch": 0.559355573767215, "grad_norm": 0.3824300169944763, "learning_rate": 0.0002572496075194963, "loss": 3.9736, "step": 1749 }, { "epoch": 0.5596753882748006, "grad_norm": 0.4904285669326782, "learning_rate": 0.00025694195920165344, "loss": 4.1195, "step": 1750 }, { "epoch": 0.5599952027823862, "grad_norm": 0.37303683161735535, "learning_rate": 0.0002566343571107966, "loss": 4.0203, "step": 1751 }, { "epoch": 0.5603150172899718, "grad_norm": 0.3547070026397705, "learning_rate": 0.0002563268015771664, "loss": 4.0531, "step": 1752 }, { "epoch": 0.5606348317975575, "grad_norm": 0.3359990119934082, "learning_rate": 0.00025601929293095344, "loss": 4.022, "step": 1753 }, { "epoch": 0.560954646305143, "grad_norm": 0.5377110838890076, "learning_rate": 0.00025571183150229827, "loss": 3.9716, "step": 1754 }, { "epoch": 0.5612744608127286, "grad_norm": 0.34876373410224915, "learning_rate": 0.00025540441762129045, "loss": 3.9787, "step": 1755 }, { "epoch": 0.5615942753203143, "grad_norm": 0.3653636872768402, "learning_rate": 0.00025509705161796866, "loss": 4.0329, "step": 1756 }, { "epoch": 0.5619140898278998, "grad_norm": 0.33364200592041016, "learning_rate": 0.0002547897338223202, "loss": 3.9363, "step": 1757 }, { "epoch": 0.5622339043354854, "grad_norm": 0.6702976226806641, "learning_rate": 0.0002544824645642804, "loss": 4.0403, "step": 1758 }, { "epoch": 0.562553718843071, "grad_norm": 0.3349170386791229, "learning_rate": 0.00025417524417373276, "loss": 3.9859, "step": 1759 }, { "epoch": 0.5628735333506566, "grad_norm": 0.3668507933616638, "learning_rate": 0.00025386807298050817, "loss": 4.0717, "step": 1760 }, { "epoch": 0.5631933478582423, "grad_norm": 0.3946221172809601, "learning_rate": 0.00025356095131438464, "loss": 4.0408, "step": 1761 }, { "epoch": 0.5635131623658278, "grad_norm": 0.35808873176574707, "learning_rate": 0.0002532538795050872, "loss": 4.0958, "step": 1762 }, { "epoch": 0.5638329768734134, "grad_norm": 0.3439721167087555, "learning_rate": 0.0002529468578822871, "loss": 4.0423, "step": 1763 }, { "epoch": 0.564152791380999, "grad_norm": 0.4603897035121918, "learning_rate": 0.00025263988677560204, "loss": 3.9542, "step": 1764 }, { "epoch": 0.5644726058885846, "grad_norm": 0.3750258982181549, "learning_rate": 0.0002523329665145951, "loss": 3.9531, "step": 1765 }, { "epoch": 0.5647924203961702, "grad_norm": 0.3267868459224701, "learning_rate": 0.00025202609742877515, "loss": 4.0192, "step": 1766 }, { "epoch": 0.5651122349037558, "grad_norm": 0.34897565841674805, "learning_rate": 0.0002517192798475958, "loss": 3.8793, "step": 1767 }, { "epoch": 0.5654320494113414, "grad_norm": 0.4219346046447754, "learning_rate": 0.00025141251410045547, "loss": 3.9628, "step": 1768 }, { "epoch": 0.5657518639189271, "grad_norm": 0.33135056495666504, "learning_rate": 0.0002511058005166972, "loss": 3.9099, "step": 1769 }, { "epoch": 0.5660716784265126, "grad_norm": 0.40127331018447876, "learning_rate": 0.0002507991394256075, "loss": 3.9843, "step": 1770 }, { "epoch": 0.5663914929340982, "grad_norm": 0.4966872036457062, "learning_rate": 0.00025049253115641713, "loss": 4.0473, "step": 1771 }, { "epoch": 0.5667113074416839, "grad_norm": 0.498229056596756, "learning_rate": 0.00025018597603829944, "loss": 4.0608, "step": 1772 }, { "epoch": 0.5670311219492694, "grad_norm": 0.36300697922706604, "learning_rate": 0.0002498794744003715, "loss": 3.9855, "step": 1773 }, { "epoch": 0.567350936456855, "grad_norm": 0.4380260705947876, "learning_rate": 0.0002495730265716922, "loss": 3.9949, "step": 1774 }, { "epoch": 0.5676707509644406, "grad_norm": 0.3299658000469208, "learning_rate": 0.00024926663288126323, "loss": 4.032, "step": 1775 }, { "epoch": 0.5679905654720262, "grad_norm": 0.39703038334846497, "learning_rate": 0.00024896029365802807, "loss": 3.9584, "step": 1776 }, { "epoch": 0.5683103799796119, "grad_norm": 0.4423528015613556, "learning_rate": 0.0002486540092308713, "loss": 4.0724, "step": 1777 }, { "epoch": 0.5686301944871974, "grad_norm": 0.3460763096809387, "learning_rate": 0.00024834777992861935, "loss": 3.9382, "step": 1778 }, { "epoch": 0.568950008994783, "grad_norm": 0.3504014015197754, "learning_rate": 0.0002480416060800387, "loss": 4.0114, "step": 1779 }, { "epoch": 0.5692698235023687, "grad_norm": 0.4190087616443634, "learning_rate": 0.0002477354880138369, "loss": 4.0119, "step": 1780 }, { "epoch": 0.5695896380099542, "grad_norm": 0.34340980648994446, "learning_rate": 0.0002474294260586615, "loss": 3.9894, "step": 1781 }, { "epoch": 0.5699094525175398, "grad_norm": 0.3749614655971527, "learning_rate": 0.00024712342054309945, "loss": 4.0422, "step": 1782 }, { "epoch": 0.5702292670251254, "grad_norm": 0.33671990036964417, "learning_rate": 0.00024681747179567775, "loss": 4.0081, "step": 1783 }, { "epoch": 0.570549081532711, "grad_norm": 0.30939245223999023, "learning_rate": 0.0002465115801448617, "loss": 4.0095, "step": 1784 }, { "epoch": 0.5708688960402967, "grad_norm": 0.4347824454307556, "learning_rate": 0.0002462057459190559, "loss": 4.0279, "step": 1785 }, { "epoch": 0.5711887105478822, "grad_norm": 0.3796108067035675, "learning_rate": 0.0002458999694466029, "loss": 4.0277, "step": 1786 }, { "epoch": 0.5715085250554678, "grad_norm": 0.4084206223487854, "learning_rate": 0.0002455942510557836, "loss": 4.0319, "step": 1787 }, { "epoch": 0.5718283395630535, "grad_norm": 0.6607809662818909, "learning_rate": 0.0002452885910748163, "loss": 4.0093, "step": 1788 }, { "epoch": 0.572148154070639, "grad_norm": 0.3705383837223053, "learning_rate": 0.0002449829898318566, "loss": 3.9736, "step": 1789 }, { "epoch": 0.5724679685782247, "grad_norm": 0.5855865478515625, "learning_rate": 0.0002446774476549972, "loss": 4.0316, "step": 1790 }, { "epoch": 0.5727877830858102, "grad_norm": 0.8142665028572083, "learning_rate": 0.00024437196487226716, "loss": 3.9636, "step": 1791 }, { "epoch": 0.5731075975933958, "grad_norm": 0.47535109519958496, "learning_rate": 0.00024406654181163197, "loss": 3.9903, "step": 1792 }, { "epoch": 0.5734274121009815, "grad_norm": 0.3583545684814453, "learning_rate": 0.000243761178800993, "loss": 4.0187, "step": 1793 }, { "epoch": 0.573747226608567, "grad_norm": 0.3775182366371155, "learning_rate": 0.00024345587616818692, "loss": 3.9304, "step": 1794 }, { "epoch": 0.5740670411161526, "grad_norm": 0.5332264304161072, "learning_rate": 0.00024315063424098585, "loss": 4.0016, "step": 1795 }, { "epoch": 0.5743868556237383, "grad_norm": 0.3671213984489441, "learning_rate": 0.00024284545334709657, "loss": 4.1651, "step": 1796 }, { "epoch": 0.5747066701313238, "grad_norm": 0.36971262097358704, "learning_rate": 0.00024254033381416047, "loss": 3.9375, "step": 1797 }, { "epoch": 0.5750264846389095, "grad_norm": 0.35038304328918457, "learning_rate": 0.00024223527596975284, "loss": 3.9945, "step": 1798 }, { "epoch": 0.575346299146495, "grad_norm": 0.35673201084136963, "learning_rate": 0.000241930280141383, "loss": 4.0694, "step": 1799 }, { "epoch": 0.5756661136540806, "grad_norm": 0.3971046209335327, "learning_rate": 0.00024162534665649358, "loss": 4.0741, "step": 1800 }, { "epoch": 0.5756661136540806, "eval_runtime": 43.0172, "eval_samples_per_second": 44.099, "eval_steps_per_second": 11.042, "step": 1800 }, { "epoch": 0.5759859281616663, "grad_norm": 0.35878297686576843, "learning_rate": 0.0002413204758424602, "loss": 3.9403, "step": 1801 }, { "epoch": 0.5763057426692518, "grad_norm": 0.4249137043952942, "learning_rate": 0.00024101566802659137, "loss": 4.0707, "step": 1802 }, { "epoch": 0.5766255571768374, "grad_norm": 0.34513869881629944, "learning_rate": 0.0002407109235361277, "loss": 3.9796, "step": 1803 }, { "epoch": 0.5769453716844231, "grad_norm": 0.30753403902053833, "learning_rate": 0.0002404062426982421, "loss": 4.0451, "step": 1804 }, { "epoch": 0.5772651861920086, "grad_norm": 0.36476585268974304, "learning_rate": 0.00024010162584003905, "loss": 4.0069, "step": 1805 }, { "epoch": 0.5775850006995943, "grad_norm": 0.43495407700538635, "learning_rate": 0.0002397970732885542, "loss": 4.1122, "step": 1806 }, { "epoch": 0.5779048152071798, "grad_norm": 0.34263455867767334, "learning_rate": 0.0002394925853707544, "loss": 3.9604, "step": 1807 }, { "epoch": 0.5782246297147654, "grad_norm": 0.35898175835609436, "learning_rate": 0.00023918816241353684, "loss": 4.0403, "step": 1808 }, { "epoch": 0.5785444442223511, "grad_norm": 0.48811063170433044, "learning_rate": 0.0002388838047437293, "loss": 3.9834, "step": 1809 }, { "epoch": 0.5788642587299366, "grad_norm": 0.36961546540260315, "learning_rate": 0.00023857951268808905, "loss": 4.0012, "step": 1810 }, { "epoch": 0.5791840732375222, "grad_norm": 0.41611772775650024, "learning_rate": 0.00023827528657330331, "loss": 4.1022, "step": 1811 }, { "epoch": 0.5795038877451079, "grad_norm": 0.639523983001709, "learning_rate": 0.00023797112672598833, "loss": 3.9937, "step": 1812 }, { "epoch": 0.5798237022526934, "grad_norm": 0.37891748547554016, "learning_rate": 0.0002376670334726891, "loss": 3.9609, "step": 1813 }, { "epoch": 0.5801435167602791, "grad_norm": 0.37956759333610535, "learning_rate": 0.00023736300713987946, "loss": 4.0736, "step": 1814 }, { "epoch": 0.5804633312678646, "grad_norm": 0.3285279870033264, "learning_rate": 0.00023705904805396095, "loss": 4.005, "step": 1815 }, { "epoch": 0.5807831457754502, "grad_norm": 0.3209591209888458, "learning_rate": 0.00023675515654126327, "loss": 4.0346, "step": 1816 }, { "epoch": 0.5811029602830359, "grad_norm": 0.3819675147533417, "learning_rate": 0.00023645133292804352, "loss": 4.0181, "step": 1817 }, { "epoch": 0.5814227747906214, "grad_norm": 0.29155775904655457, "learning_rate": 0.0002361475775404857, "loss": 3.9717, "step": 1818 }, { "epoch": 0.581742589298207, "grad_norm": 0.43003252148628235, "learning_rate": 0.00023584389070470087, "loss": 4.0475, "step": 1819 }, { "epoch": 0.5820624038057927, "grad_norm": 0.3241746425628662, "learning_rate": 0.000235540272746726, "loss": 4.0187, "step": 1820 }, { "epoch": 0.5823822183133782, "grad_norm": 0.5025460124015808, "learning_rate": 0.00023523672399252492, "loss": 3.9924, "step": 1821 }, { "epoch": 0.5827020328209639, "grad_norm": 0.36268341541290283, "learning_rate": 0.00023493324476798618, "loss": 4.0278, "step": 1822 }, { "epoch": 0.5830218473285494, "grad_norm": 0.7405848503112793, "learning_rate": 0.0002346298353989245, "loss": 4.0008, "step": 1823 }, { "epoch": 0.583341661836135, "grad_norm": 0.36367058753967285, "learning_rate": 0.00023432649621107928, "loss": 4.1009, "step": 1824 }, { "epoch": 0.5836614763437207, "grad_norm": 0.5475504398345947, "learning_rate": 0.00023402322753011433, "loss": 3.9081, "step": 1825 }, { "epoch": 0.5839812908513062, "grad_norm": 0.481734961271286, "learning_rate": 0.0002337200296816184, "loss": 3.9764, "step": 1826 }, { "epoch": 0.5843011053588918, "grad_norm": 0.42002663016319275, "learning_rate": 0.00023341690299110338, "loss": 4.0349, "step": 1827 }, { "epoch": 0.5846209198664775, "grad_norm": 0.4293695092201233, "learning_rate": 0.0002331138477840054, "loss": 3.9444, "step": 1828 }, { "epoch": 0.584940734374063, "grad_norm": 0.36801302433013916, "learning_rate": 0.00023281086438568384, "loss": 3.9778, "step": 1829 }, { "epoch": 0.5852605488816487, "grad_norm": 0.3421482741832733, "learning_rate": 0.0002325079531214204, "loss": 4.0107, "step": 1830 }, { "epoch": 0.5855803633892342, "grad_norm": 0.588682234287262, "learning_rate": 0.00023220511431642008, "loss": 3.9668, "step": 1831 }, { "epoch": 0.5859001778968198, "grad_norm": 0.48187145590782166, "learning_rate": 0.00023190234829580943, "loss": 4.0049, "step": 1832 }, { "epoch": 0.5862199924044055, "grad_norm": 0.3539397418498993, "learning_rate": 0.00023159965538463738, "loss": 3.9405, "step": 1833 }, { "epoch": 0.586539806911991, "grad_norm": 0.39426591992378235, "learning_rate": 0.00023129703590787394, "loss": 3.9606, "step": 1834 }, { "epoch": 0.5868596214195766, "grad_norm": 0.6450797915458679, "learning_rate": 0.0002309944901904107, "loss": 3.979, "step": 1835 }, { "epoch": 0.5871794359271623, "grad_norm": 0.32639405131340027, "learning_rate": 0.00023069201855705973, "loss": 3.9814, "step": 1836 }, { "epoch": 0.5874992504347478, "grad_norm": 0.30487409234046936, "learning_rate": 0.00023038962133255366, "loss": 3.9883, "step": 1837 }, { "epoch": 0.5878190649423335, "grad_norm": 0.35221052169799805, "learning_rate": 0.00023008729884154542, "loss": 3.9622, "step": 1838 }, { "epoch": 0.588138879449919, "grad_norm": 0.39746445417404175, "learning_rate": 0.00022978505140860736, "loss": 4.0674, "step": 1839 }, { "epoch": 0.5884586939575046, "grad_norm": 0.4461354911327362, "learning_rate": 0.00022948287935823153, "loss": 4.0229, "step": 1840 }, { "epoch": 0.5887785084650903, "grad_norm": 0.34972959756851196, "learning_rate": 0.00022918078301482897, "loss": 3.9762, "step": 1841 }, { "epoch": 0.5890983229726758, "grad_norm": 0.35696741938591003, "learning_rate": 0.00022887876270272938, "loss": 3.999, "step": 1842 }, { "epoch": 0.5894181374802614, "grad_norm": 0.5749049186706543, "learning_rate": 0.0002285768187461809, "loss": 4.0098, "step": 1843 }, { "epoch": 0.5897379519878471, "grad_norm": 0.446810781955719, "learning_rate": 0.00022827495146934964, "loss": 3.9631, "step": 1844 }, { "epoch": 0.5900577664954326, "grad_norm": 0.521059513092041, "learning_rate": 0.00022797316119631952, "loss": 4.0207, "step": 1845 }, { "epoch": 0.5903775810030183, "grad_norm": 0.38042277097702026, "learning_rate": 0.00022767144825109153, "loss": 3.9619, "step": 1846 }, { "epoch": 0.5906973955106039, "grad_norm": 0.46804195642471313, "learning_rate": 0.00022736981295758393, "loss": 3.998, "step": 1847 }, { "epoch": 0.5910172100181894, "grad_norm": 0.5972468852996826, "learning_rate": 0.00022706825563963148, "loss": 4.0689, "step": 1848 }, { "epoch": 0.5913370245257751, "grad_norm": 0.6922116875648499, "learning_rate": 0.00022676677662098512, "loss": 4.0082, "step": 1849 }, { "epoch": 0.5916568390333606, "grad_norm": 0.32202717661857605, "learning_rate": 0.00022646537622531197, "loss": 3.9861, "step": 1850 }, { "epoch": 0.5919766535409462, "grad_norm": 0.39114245772361755, "learning_rate": 0.00022616405477619448, "loss": 4.044, "step": 1851 }, { "epoch": 0.5922964680485319, "grad_norm": 0.7816163897514343, "learning_rate": 0.00022586281259713055, "loss": 3.9538, "step": 1852 }, { "epoch": 0.5926162825561174, "grad_norm": 0.4304044246673584, "learning_rate": 0.00022556165001153295, "loss": 4.0388, "step": 1853 }, { "epoch": 0.5929360970637031, "grad_norm": 0.42333152890205383, "learning_rate": 0.0002252605673427288, "loss": 4.0625, "step": 1854 }, { "epoch": 0.5932559115712887, "grad_norm": 0.5886488556861877, "learning_rate": 0.0002249595649139597, "loss": 4.0114, "step": 1855 }, { "epoch": 0.5935757260788742, "grad_norm": 0.4643212854862213, "learning_rate": 0.0002246586430483809, "loss": 3.9493, "step": 1856 }, { "epoch": 0.5938955405864599, "grad_norm": 0.5130184292793274, "learning_rate": 0.00022435780206906132, "loss": 4.098, "step": 1857 }, { "epoch": 0.5942153550940454, "grad_norm": 0.4009779691696167, "learning_rate": 0.00022405704229898278, "loss": 4.0598, "step": 1858 }, { "epoch": 0.5945351696016311, "grad_norm": 0.4298734664916992, "learning_rate": 0.00022375636406104022, "loss": 4.001, "step": 1859 }, { "epoch": 0.5948549841092167, "grad_norm": 0.5448179841041565, "learning_rate": 0.00022345576767804085, "loss": 3.9552, "step": 1860 }, { "epoch": 0.5951747986168022, "grad_norm": 0.47339797019958496, "learning_rate": 0.00022315525347270412, "loss": 4.0071, "step": 1861 }, { "epoch": 0.5954946131243879, "grad_norm": 0.32078489661216736, "learning_rate": 0.00022285482176766122, "loss": 3.9741, "step": 1862 }, { "epoch": 0.5958144276319735, "grad_norm": 0.4228768050670624, "learning_rate": 0.00022255447288545453, "loss": 3.9889, "step": 1863 }, { "epoch": 0.596134242139559, "grad_norm": 0.42705073952674866, "learning_rate": 0.00022225420714853798, "loss": 3.9992, "step": 1864 }, { "epoch": 0.5964540566471447, "grad_norm": 0.3589797616004944, "learning_rate": 0.00022195402487927592, "loss": 3.9625, "step": 1865 }, { "epoch": 0.5967738711547302, "grad_norm": 0.36698251962661743, "learning_rate": 0.00022165392639994307, "loss": 3.9551, "step": 1866 }, { "epoch": 0.5970936856623159, "grad_norm": 0.6020881533622742, "learning_rate": 0.00022135391203272441, "loss": 3.9381, "step": 1867 }, { "epoch": 0.5974135001699015, "grad_norm": 0.37951937317848206, "learning_rate": 0.00022105398209971424, "loss": 3.9843, "step": 1868 }, { "epoch": 0.597733314677487, "grad_norm": 0.39198434352874756, "learning_rate": 0.00022075413692291678, "loss": 3.942, "step": 1869 }, { "epoch": 0.5980531291850727, "grad_norm": 0.49255192279815674, "learning_rate": 0.00022045437682424458, "loss": 3.9896, "step": 1870 }, { "epoch": 0.5983729436926583, "grad_norm": 0.3883727490901947, "learning_rate": 0.00022015470212551942, "loss": 3.9985, "step": 1871 }, { "epoch": 0.5986927582002438, "grad_norm": 0.38822776079177856, "learning_rate": 0.00021985511314847128, "loss": 4.0096, "step": 1872 }, { "epoch": 0.5990125727078295, "grad_norm": 0.36566266417503357, "learning_rate": 0.00021955561021473765, "loss": 4.0449, "step": 1873 }, { "epoch": 0.599332387215415, "grad_norm": 0.377751886844635, "learning_rate": 0.0002192561936458644, "loss": 3.9627, "step": 1874 }, { "epoch": 0.5996522017230007, "grad_norm": 0.47996005415916443, "learning_rate": 0.00021895686376330396, "loss": 3.8961, "step": 1875 }, { "epoch": 0.5999720162305863, "grad_norm": 0.48906296491622925, "learning_rate": 0.00021865762088841607, "loss": 3.8754, "step": 1876 }, { "epoch": 0.6002918307381718, "grad_norm": 0.5492601990699768, "learning_rate": 0.00021835846534246726, "loss": 3.9596, "step": 1877 }, { "epoch": 0.6006116452457575, "grad_norm": 0.6935892105102539, "learning_rate": 0.00021805939744662964, "loss": 3.9897, "step": 1878 }, { "epoch": 0.6009314597533431, "grad_norm": 0.4750279188156128, "learning_rate": 0.00021776041752198202, "loss": 3.9803, "step": 1879 }, { "epoch": 0.6012512742609286, "grad_norm": 0.3231257200241089, "learning_rate": 0.00021746152588950809, "loss": 4.0187, "step": 1880 }, { "epoch": 0.6015710887685143, "grad_norm": 0.6179749965667725, "learning_rate": 0.00021716272287009713, "loss": 4.0469, "step": 1881 }, { "epoch": 0.6018909032760998, "grad_norm": 0.32085660099983215, "learning_rate": 0.00021686400878454312, "loss": 3.8693, "step": 1882 }, { "epoch": 0.6022107177836855, "grad_norm": 0.35324013233184814, "learning_rate": 0.0002165653839535447, "loss": 3.9056, "step": 1883 }, { "epoch": 0.6025305322912711, "grad_norm": 0.462796151638031, "learning_rate": 0.00021626684869770462, "loss": 3.9113, "step": 1884 }, { "epoch": 0.6028503467988566, "grad_norm": 0.3689422011375427, "learning_rate": 0.00021596840333752934, "loss": 3.979, "step": 1885 }, { "epoch": 0.6031701613064423, "grad_norm": 0.4045153260231018, "learning_rate": 0.00021567004819342907, "loss": 4.0896, "step": 1886 }, { "epoch": 0.6034899758140279, "grad_norm": 0.4475997984409332, "learning_rate": 0.00021537178358571686, "loss": 3.9436, "step": 1887 }, { "epoch": 0.6038097903216134, "grad_norm": 0.41908904910087585, "learning_rate": 0.00021507360983460882, "loss": 3.9893, "step": 1888 }, { "epoch": 0.6041296048291991, "grad_norm": 0.34321165084838867, "learning_rate": 0.0002147755272602234, "loss": 3.8931, "step": 1889 }, { "epoch": 0.6044494193367846, "grad_norm": 0.43174245953559875, "learning_rate": 0.00021447753618258116, "loss": 4.0089, "step": 1890 }, { "epoch": 0.6047692338443703, "grad_norm": 0.5203201174736023, "learning_rate": 0.00021417963692160448, "loss": 3.9071, "step": 1891 }, { "epoch": 0.6050890483519559, "grad_norm": 0.35405370593070984, "learning_rate": 0.00021388182979711703, "loss": 3.8814, "step": 1892 }, { "epoch": 0.6054088628595414, "grad_norm": 0.33903661370277405, "learning_rate": 0.0002135841151288438, "loss": 4.042, "step": 1893 }, { "epoch": 0.6057286773671271, "grad_norm": 0.38405701518058777, "learning_rate": 0.00021328649323641022, "loss": 3.8933, "step": 1894 }, { "epoch": 0.6060484918747127, "grad_norm": 0.3316797912120819, "learning_rate": 0.00021298896443934238, "loss": 3.9368, "step": 1895 }, { "epoch": 0.6063683063822982, "grad_norm": 0.3724839687347412, "learning_rate": 0.00021269152905706637, "loss": 3.9718, "step": 1896 }, { "epoch": 0.6066881208898839, "grad_norm": 0.3574998676776886, "learning_rate": 0.00021239418740890786, "loss": 3.9995, "step": 1897 }, { "epoch": 0.6070079353974694, "grad_norm": 0.31199875473976135, "learning_rate": 0.000212096939814092, "loss": 3.9926, "step": 1898 }, { "epoch": 0.6073277499050551, "grad_norm": 0.3758678734302521, "learning_rate": 0.00021179978659174284, "loss": 3.9481, "step": 1899 }, { "epoch": 0.6076475644126407, "grad_norm": 0.30696552991867065, "learning_rate": 0.00021150272806088333, "loss": 3.9129, "step": 1900 }, { "epoch": 0.6076475644126407, "eval_runtime": 43.2683, "eval_samples_per_second": 43.843, "eval_steps_per_second": 10.978, "step": 1900 }, { "epoch": 0.6079673789202262, "grad_norm": 0.35300207138061523, "learning_rate": 0.00021120576454043463, "loss": 3.92, "step": 1901 }, { "epoch": 0.6082871934278119, "grad_norm": 0.38618525862693787, "learning_rate": 0.00021090889634921585, "loss": 4.0717, "step": 1902 }, { "epoch": 0.6086070079353975, "grad_norm": 0.30580955743789673, "learning_rate": 0.00021061212380594382, "loss": 4.0204, "step": 1903 }, { "epoch": 0.608926822442983, "grad_norm": 0.3796938955783844, "learning_rate": 0.00021031544722923266, "loss": 4.114, "step": 1904 }, { "epoch": 0.6092466369505687, "grad_norm": 0.5832591652870178, "learning_rate": 0.0002100188669375935, "loss": 4.0147, "step": 1905 }, { "epoch": 0.6095664514581542, "grad_norm": 0.41533592343330383, "learning_rate": 0.000209722383249434, "loss": 3.8837, "step": 1906 }, { "epoch": 0.6098862659657399, "grad_norm": 0.34099969267845154, "learning_rate": 0.0002094259964830582, "loss": 3.9282, "step": 1907 }, { "epoch": 0.6102060804733255, "grad_norm": 0.30617910623550415, "learning_rate": 0.0002091297069566662, "loss": 3.8526, "step": 1908 }, { "epoch": 0.610525894980911, "grad_norm": 0.4214378595352173, "learning_rate": 0.00020883351498835335, "loss": 4.0334, "step": 1909 }, { "epoch": 0.6108457094884967, "grad_norm": 0.3440532386302948, "learning_rate": 0.00020853742089611067, "loss": 4.0642, "step": 1910 }, { "epoch": 0.6111655239960823, "grad_norm": 0.3563397526741028, "learning_rate": 0.00020824142499782368, "loss": 3.9886, "step": 1911 }, { "epoch": 0.6114853385036678, "grad_norm": 0.4378513693809509, "learning_rate": 0.00020794552761127283, "loss": 3.8707, "step": 1912 }, { "epoch": 0.6118051530112535, "grad_norm": 0.4734366238117218, "learning_rate": 0.0002076497290541328, "loss": 3.9897, "step": 1913 }, { "epoch": 0.612124967518839, "grad_norm": 0.3813638985157013, "learning_rate": 0.0002073540296439719, "loss": 3.9138, "step": 1914 }, { "epoch": 0.6124447820264247, "grad_norm": 0.3876492977142334, "learning_rate": 0.00020705842969825225, "loss": 4.0656, "step": 1915 }, { "epoch": 0.6127645965340103, "grad_norm": 0.3468707799911499, "learning_rate": 0.00020676292953432886, "loss": 3.9298, "step": 1916 }, { "epoch": 0.6130844110415958, "grad_norm": 0.47034353017807007, "learning_rate": 0.00020646752946945016, "loss": 3.962, "step": 1917 }, { "epoch": 0.6134042255491815, "grad_norm": 0.6423379182815552, "learning_rate": 0.00020617222982075646, "loss": 3.9515, "step": 1918 }, { "epoch": 0.6137240400567671, "grad_norm": 0.35965994000434875, "learning_rate": 0.0002058770309052808, "loss": 3.9774, "step": 1919 }, { "epoch": 0.6140438545643526, "grad_norm": 0.43699392676353455, "learning_rate": 0.00020558193303994797, "loss": 3.9562, "step": 1920 }, { "epoch": 0.6143636690719383, "grad_norm": 0.5094754099845886, "learning_rate": 0.0002052869365415738, "loss": 4.0034, "step": 1921 }, { "epoch": 0.6146834835795238, "grad_norm": 0.398531973361969, "learning_rate": 0.00020499204172686616, "loss": 3.9658, "step": 1922 }, { "epoch": 0.6150032980871095, "grad_norm": 0.4287668466567993, "learning_rate": 0.00020469724891242281, "loss": 3.9648, "step": 1923 }, { "epoch": 0.6153231125946951, "grad_norm": 0.31665459275245667, "learning_rate": 0.00020440255841473252, "loss": 3.9317, "step": 1924 }, { "epoch": 0.6156429271022806, "grad_norm": 0.3121630549430847, "learning_rate": 0.0002041079705501745, "loss": 3.9603, "step": 1925 }, { "epoch": 0.6159627416098663, "grad_norm": 0.47175371646881104, "learning_rate": 0.00020381348563501694, "loss": 3.9162, "step": 1926 }, { "epoch": 0.6162825561174519, "grad_norm": 0.40410250425338745, "learning_rate": 0.00020351910398541835, "loss": 4.0769, "step": 1927 }, { "epoch": 0.6166023706250375, "grad_norm": 0.3510867953300476, "learning_rate": 0.00020322482591742576, "loss": 3.9773, "step": 1928 }, { "epoch": 0.6169221851326231, "grad_norm": 0.33025795221328735, "learning_rate": 0.0002029306517469754, "loss": 3.9686, "step": 1929 }, { "epoch": 0.6172419996402086, "grad_norm": 0.4340962767601013, "learning_rate": 0.00020263658178989162, "loss": 3.9738, "step": 1930 }, { "epoch": 0.6175618141477943, "grad_norm": 0.40323495864868164, "learning_rate": 0.0002023426163618872, "loss": 3.9579, "step": 1931 }, { "epoch": 0.6178816286553799, "grad_norm": 0.3533179759979248, "learning_rate": 0.00020204875577856256, "loss": 3.986, "step": 1932 }, { "epoch": 0.6182014431629654, "grad_norm": 0.32608500123023987, "learning_rate": 0.00020175500035540545, "loss": 3.9581, "step": 1933 }, { "epoch": 0.6185212576705511, "grad_norm": 0.40661561489105225, "learning_rate": 0.00020146135040779097, "loss": 3.8989, "step": 1934 }, { "epoch": 0.6188410721781367, "grad_norm": 0.4673665165901184, "learning_rate": 0.0002011678062509807, "loss": 3.9875, "step": 1935 }, { "epoch": 0.6191608866857223, "grad_norm": 0.3668428063392639, "learning_rate": 0.00020087436820012287, "loss": 4.0006, "step": 1936 }, { "epoch": 0.6194807011933079, "grad_norm": 0.32912349700927734, "learning_rate": 0.0002005810365702517, "loss": 3.9378, "step": 1937 }, { "epoch": 0.6198005157008935, "grad_norm": 0.37547996640205383, "learning_rate": 0.00020028781167628714, "loss": 3.9863, "step": 1938 }, { "epoch": 0.6201203302084791, "grad_norm": 0.46071314811706543, "learning_rate": 0.0001999946938330346, "loss": 3.9293, "step": 1939 }, { "epoch": 0.6204401447160647, "grad_norm": 0.3896280825138092, "learning_rate": 0.0001997016833551845, "loss": 3.98, "step": 1940 }, { "epoch": 0.6207599592236502, "grad_norm": 0.5212989449501038, "learning_rate": 0.00019940878055731208, "loss": 3.9141, "step": 1941 }, { "epoch": 0.6210797737312359, "grad_norm": 0.34885451197624207, "learning_rate": 0.00019911598575387683, "loss": 3.8735, "step": 1942 }, { "epoch": 0.6213995882388215, "grad_norm": 0.5809696912765503, "learning_rate": 0.00019882329925922245, "loss": 4.0095, "step": 1943 }, { "epoch": 0.6217194027464071, "grad_norm": 0.3279409408569336, "learning_rate": 0.00019853072138757637, "loss": 3.9806, "step": 1944 }, { "epoch": 0.6220392172539927, "grad_norm": 0.38503119349479675, "learning_rate": 0.00019823825245304918, "loss": 3.9956, "step": 1945 }, { "epoch": 0.6223590317615783, "grad_norm": 0.3609060049057007, "learning_rate": 0.00019794589276963482, "loss": 4.0285, "step": 1946 }, { "epoch": 0.6226788462691639, "grad_norm": 0.6297301650047302, "learning_rate": 0.00019765364265120962, "loss": 4.0027, "step": 1947 }, { "epoch": 0.6229986607767495, "grad_norm": 0.31925275921821594, "learning_rate": 0.00019736150241153258, "loss": 3.9263, "step": 1948 }, { "epoch": 0.623318475284335, "grad_norm": 0.3062548339366913, "learning_rate": 0.0001970694723642446, "loss": 3.9068, "step": 1949 }, { "epoch": 0.6236382897919207, "grad_norm": 0.3654589056968689, "learning_rate": 0.00019677755282286822, "loss": 3.9539, "step": 1950 }, { "epoch": 0.6239581042995063, "grad_norm": 0.4468153715133667, "learning_rate": 0.00019648574410080743, "loss": 3.9544, "step": 1951 }, { "epoch": 0.6242779188070919, "grad_norm": 0.37895843386650085, "learning_rate": 0.00019619404651134717, "loss": 3.9683, "step": 1952 }, { "epoch": 0.6245977333146775, "grad_norm": 0.3288390636444092, "learning_rate": 0.0001959024603676532, "loss": 4.065, "step": 1953 }, { "epoch": 0.624917547822263, "grad_norm": 0.41918325424194336, "learning_rate": 0.00019561098598277145, "loss": 4.0903, "step": 1954 }, { "epoch": 0.6252373623298487, "grad_norm": 0.3803117573261261, "learning_rate": 0.000195319623669628, "loss": 3.9824, "step": 1955 }, { "epoch": 0.6255571768374343, "grad_norm": 0.40122026205062866, "learning_rate": 0.00019502837374102866, "loss": 3.9613, "step": 1956 }, { "epoch": 0.6258769913450198, "grad_norm": 0.2980599105358124, "learning_rate": 0.00019473723650965832, "loss": 4.0305, "step": 1957 }, { "epoch": 0.6261968058526055, "grad_norm": 0.34777259826660156, "learning_rate": 0.0001944462122880813, "loss": 4.0232, "step": 1958 }, { "epoch": 0.6265166203601911, "grad_norm": 0.3668854832649231, "learning_rate": 0.00019415530138874, "loss": 3.9499, "step": 1959 }, { "epoch": 0.6268364348677767, "grad_norm": 0.3651364743709564, "learning_rate": 0.0001938645041239558, "loss": 3.9566, "step": 1960 }, { "epoch": 0.6271562493753623, "grad_norm": 0.3600693345069885, "learning_rate": 0.00019357382080592773, "loss": 3.9996, "step": 1961 }, { "epoch": 0.6274760638829479, "grad_norm": 0.3154357671737671, "learning_rate": 0.00019328325174673247, "loss": 4.0012, "step": 1962 }, { "epoch": 0.6277958783905335, "grad_norm": 0.45288151502609253, "learning_rate": 0.0001929927972583242, "loss": 4.0444, "step": 1963 }, { "epoch": 0.6281156928981191, "grad_norm": 0.42163434624671936, "learning_rate": 0.00019270245765253382, "loss": 3.9947, "step": 1964 }, { "epoch": 0.6284355074057046, "grad_norm": 0.3588712811470032, "learning_rate": 0.0001924122332410694, "loss": 4.025, "step": 1965 }, { "epoch": 0.6287553219132903, "grad_norm": 0.34556859731674194, "learning_rate": 0.00019212212433551465, "loss": 4.0219, "step": 1966 }, { "epoch": 0.6290751364208759, "grad_norm": 0.8043129444122314, "learning_rate": 0.00019183213124732986, "loss": 4.0168, "step": 1967 }, { "epoch": 0.6293949509284615, "grad_norm": 0.3868224024772644, "learning_rate": 0.0001915422542878508, "loss": 3.8946, "step": 1968 }, { "epoch": 0.6297147654360471, "grad_norm": 0.3401508331298828, "learning_rate": 0.00019125249376828824, "loss": 3.9929, "step": 1969 }, { "epoch": 0.6300345799436327, "grad_norm": 0.394098699092865, "learning_rate": 0.00019096284999972862, "loss": 3.8699, "step": 1970 }, { "epoch": 0.6303543944512183, "grad_norm": 0.5570782423019409, "learning_rate": 0.00019067332329313226, "loss": 3.9064, "step": 1971 }, { "epoch": 0.6306742089588039, "grad_norm": 0.431003600358963, "learning_rate": 0.0001903839139593343, "loss": 3.8768, "step": 1972 }, { "epoch": 0.6309940234663894, "grad_norm": 0.35499903559684753, "learning_rate": 0.00019009462230904398, "loss": 3.9818, "step": 1973 }, { "epoch": 0.6313138379739751, "grad_norm": 0.4086095690727234, "learning_rate": 0.0001898054486528436, "loss": 4.0557, "step": 1974 }, { "epoch": 0.6316336524815607, "grad_norm": 0.42284929752349854, "learning_rate": 0.00018951639330118953, "loss": 3.9529, "step": 1975 }, { "epoch": 0.6319534669891463, "grad_norm": 0.3404867649078369, "learning_rate": 0.0001892274565644104, "loss": 3.9013, "step": 1976 }, { "epoch": 0.6322732814967319, "grad_norm": 0.40925267338752747, "learning_rate": 0.000188938638752708, "loss": 3.85, "step": 1977 }, { "epoch": 0.6325930960043175, "grad_norm": 0.4649677574634552, "learning_rate": 0.00018864994017615624, "loss": 3.8934, "step": 1978 }, { "epoch": 0.6329129105119031, "grad_norm": 0.3123490810394287, "learning_rate": 0.0001883613611447011, "loss": 4.0644, "step": 1979 }, { "epoch": 0.6332327250194887, "grad_norm": 0.5454819202423096, "learning_rate": 0.00018807290196816022, "loss": 3.8929, "step": 1980 }, { "epoch": 0.6335525395270742, "grad_norm": 0.5105277299880981, "learning_rate": 0.00018778456295622239, "loss": 3.9106, "step": 1981 }, { "epoch": 0.6338723540346599, "grad_norm": 0.4525469243526459, "learning_rate": 0.00018749634441844764, "loss": 3.9497, "step": 1982 }, { "epoch": 0.6341921685422455, "grad_norm": 0.6078768968582153, "learning_rate": 0.00018720824666426647, "loss": 3.949, "step": 1983 }, { "epoch": 0.6345119830498311, "grad_norm": 0.3176713287830353, "learning_rate": 0.00018692027000297986, "loss": 4.0347, "step": 1984 }, { "epoch": 0.6348317975574167, "grad_norm": 0.5481239557266235, "learning_rate": 0.0001866324147437587, "loss": 4.0158, "step": 1985 }, { "epoch": 0.6351516120650023, "grad_norm": 0.40704241394996643, "learning_rate": 0.00018634468119564342, "loss": 3.9663, "step": 1986 }, { "epoch": 0.6354714265725879, "grad_norm": 0.5176820158958435, "learning_rate": 0.00018605706966754408, "loss": 3.9023, "step": 1987 }, { "epoch": 0.6357912410801735, "grad_norm": 0.34254997968673706, "learning_rate": 0.00018576958046823944, "loss": 3.91, "step": 1988 }, { "epoch": 0.636111055587759, "grad_norm": 0.32999110221862793, "learning_rate": 0.0001854822139063772, "loss": 3.9975, "step": 1989 }, { "epoch": 0.6364308700953447, "grad_norm": 0.3165121376514435, "learning_rate": 0.00018519497029047307, "loss": 3.9748, "step": 1990 }, { "epoch": 0.6367506846029303, "grad_norm": 0.39868029952049255, "learning_rate": 0.00018490784992891107, "loss": 4.0046, "step": 1991 }, { "epoch": 0.6370704991105159, "grad_norm": 0.5520655512809753, "learning_rate": 0.00018462085312994278, "loss": 3.9415, "step": 1992 }, { "epoch": 0.6373903136181015, "grad_norm": 0.3837994337081909, "learning_rate": 0.0001843339802016871, "loss": 4.0628, "step": 1993 }, { "epoch": 0.6377101281256871, "grad_norm": 0.5525141954421997, "learning_rate": 0.00018404723145212993, "loss": 3.958, "step": 1994 }, { "epoch": 0.6380299426332727, "grad_norm": 0.5434145331382751, "learning_rate": 0.00018376060718912392, "loss": 3.9739, "step": 1995 }, { "epoch": 0.6383497571408583, "grad_norm": 0.3779720366001129, "learning_rate": 0.00018347410772038807, "loss": 3.9301, "step": 1996 }, { "epoch": 0.638669571648444, "grad_norm": 0.40796390175819397, "learning_rate": 0.00018318773335350723, "loss": 3.9271, "step": 1997 }, { "epoch": 0.6389893861560295, "grad_norm": 0.3307484984397888, "learning_rate": 0.0001829014843959322, "loss": 3.9492, "step": 1998 }, { "epoch": 0.6393092006636151, "grad_norm": 0.40682584047317505, "learning_rate": 0.00018261536115497904, "loss": 3.976, "step": 1999 }, { "epoch": 0.6396290151712007, "grad_norm": 0.6171014904975891, "learning_rate": 0.0001823293639378287, "loss": 3.9583, "step": 2000 }, { "epoch": 0.6396290151712007, "eval_runtime": 42.2351, "eval_samples_per_second": 44.915, "eval_steps_per_second": 11.247, "step": 2000 }, { "epoch": 0.6399488296787863, "grad_norm": 0.379080206155777, "learning_rate": 0.00018204349305152707, "loss": 3.8831, "step": 2001 }, { "epoch": 0.6402686441863719, "grad_norm": 0.4027690589427948, "learning_rate": 0.00018175774880298422, "loss": 4.0538, "step": 2002 }, { "epoch": 0.6405884586939575, "grad_norm": 0.37698525190353394, "learning_rate": 0.0001814721314989743, "loss": 4.0556, "step": 2003 }, { "epoch": 0.6409082732015431, "grad_norm": 0.36136868596076965, "learning_rate": 0.0001811866414461354, "loss": 3.9806, "step": 2004 }, { "epoch": 0.6412280877091288, "grad_norm": 0.4031491279602051, "learning_rate": 0.00018090127895096855, "loss": 3.8947, "step": 2005 }, { "epoch": 0.6415479022167143, "grad_norm": 0.32587480545043945, "learning_rate": 0.0001806160443198383, "loss": 3.9628, "step": 2006 }, { "epoch": 0.6418677167242999, "grad_norm": 0.5907444953918457, "learning_rate": 0.00018033093785897145, "loss": 3.8942, "step": 2007 }, { "epoch": 0.6421875312318855, "grad_norm": 0.3496485948562622, "learning_rate": 0.00018004595987445782, "loss": 3.9956, "step": 2008 }, { "epoch": 0.6425073457394711, "grad_norm": 0.38146597146987915, "learning_rate": 0.00017976111067224854, "loss": 4.0234, "step": 2009 }, { "epoch": 0.6428271602470567, "grad_norm": 0.4207388460636139, "learning_rate": 0.00017947639055815713, "loss": 4.0033, "step": 2010 }, { "epoch": 0.6431469747546423, "grad_norm": 0.3919213116168976, "learning_rate": 0.00017919179983785828, "loss": 3.9818, "step": 2011 }, { "epoch": 0.6434667892622279, "grad_norm": 0.630363941192627, "learning_rate": 0.00017890733881688754, "loss": 4.0245, "step": 2012 }, { "epoch": 0.6437866037698136, "grad_norm": 0.38853439688682556, "learning_rate": 0.0001786230078006417, "loss": 3.8987, "step": 2013 }, { "epoch": 0.6441064182773991, "grad_norm": 0.3755444586277008, "learning_rate": 0.00017833880709437752, "loss": 3.9348, "step": 2014 }, { "epoch": 0.6444262327849847, "grad_norm": 0.583441436290741, "learning_rate": 0.00017805473700321193, "loss": 3.8866, "step": 2015 }, { "epoch": 0.6447460472925703, "grad_norm": 0.3632233440876007, "learning_rate": 0.00017777079783212215, "loss": 4.0422, "step": 2016 }, { "epoch": 0.6450658618001559, "grad_norm": 0.3999553918838501, "learning_rate": 0.00017748698988594394, "loss": 3.9764, "step": 2017 }, { "epoch": 0.6453856763077415, "grad_norm": 0.4404124617576599, "learning_rate": 0.00017720331346937317, "loss": 3.9468, "step": 2018 }, { "epoch": 0.6457054908153271, "grad_norm": 0.3417743444442749, "learning_rate": 0.0001769197688869636, "loss": 3.8738, "step": 2019 }, { "epoch": 0.6460253053229127, "grad_norm": 0.4049082100391388, "learning_rate": 0.0001766363564431281, "loss": 3.926, "step": 2020 }, { "epoch": 0.6463451198304984, "grad_norm": 0.32640039920806885, "learning_rate": 0.00017635307644213726, "loss": 3.9475, "step": 2021 }, { "epoch": 0.6466649343380839, "grad_norm": 0.37244564294815063, "learning_rate": 0.00017606992918811976, "loss": 3.9252, "step": 2022 }, { "epoch": 0.6469847488456695, "grad_norm": 0.32260367274284363, "learning_rate": 0.00017578691498506177, "loss": 3.9296, "step": 2023 }, { "epoch": 0.6473045633532551, "grad_norm": 0.36949023604393005, "learning_rate": 0.00017550403413680625, "loss": 4.0026, "step": 2024 }, { "epoch": 0.6476243778608407, "grad_norm": 0.348063588142395, "learning_rate": 0.00017522128694705347, "loss": 3.9053, "step": 2025 }, { "epoch": 0.6479441923684263, "grad_norm": 0.37847042083740234, "learning_rate": 0.0001749386737193598, "loss": 3.9311, "step": 2026 }, { "epoch": 0.6482640068760119, "grad_norm": 0.378100723028183, "learning_rate": 0.00017465619475713813, "loss": 3.9791, "step": 2027 }, { "epoch": 0.6485838213835975, "grad_norm": 0.3911798596382141, "learning_rate": 0.00017437385036365695, "loss": 3.9783, "step": 2028 }, { "epoch": 0.6489036358911832, "grad_norm": 0.3561522960662842, "learning_rate": 0.00017409164084204037, "loss": 3.922, "step": 2029 }, { "epoch": 0.6492234503987687, "grad_norm": 0.48556751012802124, "learning_rate": 0.00017380956649526785, "loss": 3.9425, "step": 2030 }, { "epoch": 0.6495432649063543, "grad_norm": 0.4286404848098755, "learning_rate": 0.00017352762762617334, "loss": 3.8998, "step": 2031 }, { "epoch": 0.64986307941394, "grad_norm": 0.34082135558128357, "learning_rate": 0.00017324582453744577, "loss": 3.9163, "step": 2032 }, { "epoch": 0.6501828939215255, "grad_norm": 0.37657931447029114, "learning_rate": 0.00017296415753162786, "loss": 3.9381, "step": 2033 }, { "epoch": 0.6505027084291111, "grad_norm": 0.3365090489387512, "learning_rate": 0.00017268262691111675, "loss": 4.0002, "step": 2034 }, { "epoch": 0.6508225229366967, "grad_norm": 0.46130597591400146, "learning_rate": 0.0001724012329781629, "loss": 3.9406, "step": 2035 }, { "epoch": 0.6511423374442823, "grad_norm": 0.32992061972618103, "learning_rate": 0.0001721199760348698, "loss": 3.957, "step": 2036 }, { "epoch": 0.651462151951868, "grad_norm": 0.4717605412006378, "learning_rate": 0.00017183885638319426, "loss": 3.9436, "step": 2037 }, { "epoch": 0.6517819664594535, "grad_norm": 0.6148199439048767, "learning_rate": 0.00017155787432494529, "loss": 3.9693, "step": 2038 }, { "epoch": 0.6521017809670391, "grad_norm": 0.37032511830329895, "learning_rate": 0.00017127703016178445, "loss": 3.986, "step": 2039 }, { "epoch": 0.6524215954746247, "grad_norm": 0.3708939254283905, "learning_rate": 0.00017099632419522552, "loss": 4.0123, "step": 2040 }, { "epoch": 0.6527414099822103, "grad_norm": 0.459658145904541, "learning_rate": 0.00017071575672663325, "loss": 3.8974, "step": 2041 }, { "epoch": 0.6530612244897959, "grad_norm": 0.5088686943054199, "learning_rate": 0.0001704353280572243, "loss": 4.0609, "step": 2042 }, { "epoch": 0.6533810389973815, "grad_norm": 0.32678094506263733, "learning_rate": 0.0001701550384880658, "loss": 3.9332, "step": 2043 }, { "epoch": 0.6537008535049671, "grad_norm": 0.3500315845012665, "learning_rate": 0.00016987488832007593, "loss": 3.9701, "step": 2044 }, { "epoch": 0.6540206680125528, "grad_norm": 0.3899858295917511, "learning_rate": 0.00016959487785402313, "loss": 3.938, "step": 2045 }, { "epoch": 0.6543404825201383, "grad_norm": 0.39684924483299255, "learning_rate": 0.00016931500739052576, "loss": 3.9793, "step": 2046 }, { "epoch": 0.6546602970277239, "grad_norm": 0.37010735273361206, "learning_rate": 0.00016903527723005206, "loss": 4.0185, "step": 2047 }, { "epoch": 0.6549801115353096, "grad_norm": 0.3881669342517853, "learning_rate": 0.0001687556876729193, "loss": 4.0019, "step": 2048 }, { "epoch": 0.6552999260428951, "grad_norm": 0.3419054448604584, "learning_rate": 0.00016847623901929408, "loss": 3.8872, "step": 2049 }, { "epoch": 0.6556197405504807, "grad_norm": 0.4082156717777252, "learning_rate": 0.00016819693156919167, "loss": 3.8758, "step": 2050 }, { "epoch": 0.6559395550580663, "grad_norm": 0.31883829832077026, "learning_rate": 0.00016791776562247572, "loss": 3.8571, "step": 2051 }, { "epoch": 0.6562593695656519, "grad_norm": 0.4540899693965912, "learning_rate": 0.0001676387414788581, "loss": 3.8177, "step": 2052 }, { "epoch": 0.6565791840732376, "grad_norm": 0.43127772212028503, "learning_rate": 0.00016735985943789808, "loss": 3.9628, "step": 2053 }, { "epoch": 0.6568989985808231, "grad_norm": 0.43607109785079956, "learning_rate": 0.0001670811197990027, "loss": 3.8107, "step": 2054 }, { "epoch": 0.6572188130884087, "grad_norm": 0.38606831431388855, "learning_rate": 0.000166802522861426, "loss": 3.85, "step": 2055 }, { "epoch": 0.6575386275959944, "grad_norm": 0.48399317264556885, "learning_rate": 0.00016652406892426902, "loss": 3.9495, "step": 2056 }, { "epoch": 0.6578584421035799, "grad_norm": 0.342654824256897, "learning_rate": 0.00016624575828647878, "loss": 3.9177, "step": 2057 }, { "epoch": 0.6581782566111655, "grad_norm": 0.35340890288352966, "learning_rate": 0.0001659675912468489, "loss": 3.9794, "step": 2058 }, { "epoch": 0.6584980711187511, "grad_norm": 0.3230057954788208, "learning_rate": 0.00016568956810401867, "loss": 3.8937, "step": 2059 }, { "epoch": 0.6588178856263367, "grad_norm": 0.3592866361141205, "learning_rate": 0.00016541168915647298, "loss": 3.8658, "step": 2060 }, { "epoch": 0.6591377001339224, "grad_norm": 0.34195631742477417, "learning_rate": 0.00016513395470254194, "loss": 3.9296, "step": 2061 }, { "epoch": 0.6594575146415079, "grad_norm": 0.45823630690574646, "learning_rate": 0.00016485636504040015, "loss": 3.9372, "step": 2062 }, { "epoch": 0.6597773291490935, "grad_norm": 0.3437504768371582, "learning_rate": 0.00016457892046806727, "loss": 3.8805, "step": 2063 }, { "epoch": 0.6600971436566792, "grad_norm": 0.3809511959552765, "learning_rate": 0.00016430162128340693, "loss": 3.8593, "step": 2064 }, { "epoch": 0.6604169581642647, "grad_norm": 0.371191143989563, "learning_rate": 0.0001640244677841267, "loss": 3.953, "step": 2065 }, { "epoch": 0.6607367726718504, "grad_norm": 0.4661886990070343, "learning_rate": 0.00016374746026777794, "loss": 3.9225, "step": 2066 }, { "epoch": 0.6610565871794359, "grad_norm": 0.3599169850349426, "learning_rate": 0.0001634705990317548, "loss": 3.9287, "step": 2067 }, { "epoch": 0.6613764016870215, "grad_norm": 0.43606647849082947, "learning_rate": 0.00016319388437329482, "loss": 3.9329, "step": 2068 }, { "epoch": 0.6616962161946072, "grad_norm": 0.29583272337913513, "learning_rate": 0.00016291731658947808, "loss": 3.887, "step": 2069 }, { "epoch": 0.6620160307021927, "grad_norm": 0.4337099492549896, "learning_rate": 0.0001626408959772269, "loss": 3.9662, "step": 2070 }, { "epoch": 0.6623358452097783, "grad_norm": 0.5836841464042664, "learning_rate": 0.00016236462283330578, "loss": 3.8729, "step": 2071 }, { "epoch": 0.662655659717364, "grad_norm": 0.3051029145717621, "learning_rate": 0.0001620884974543205, "loss": 3.9652, "step": 2072 }, { "epoch": 0.6629754742249495, "grad_norm": 0.31746339797973633, "learning_rate": 0.00016181252013671858, "loss": 3.9314, "step": 2073 }, { "epoch": 0.6632952887325352, "grad_norm": 0.3851584792137146, "learning_rate": 0.00016153669117678848, "loss": 3.9733, "step": 2074 }, { "epoch": 0.6636151032401207, "grad_norm": 0.3936731517314911, "learning_rate": 0.00016126101087065933, "loss": 3.9989, "step": 2075 }, { "epoch": 0.6639349177477063, "grad_norm": 0.4133431911468506, "learning_rate": 0.00016098547951430082, "loss": 3.9508, "step": 2076 }, { "epoch": 0.664254732255292, "grad_norm": 0.30565547943115234, "learning_rate": 0.00016071009740352237, "loss": 3.97, "step": 2077 }, { "epoch": 0.6645745467628775, "grad_norm": 0.33455324172973633, "learning_rate": 0.0001604348648339736, "loss": 3.8934, "step": 2078 }, { "epoch": 0.6648943612704631, "grad_norm": 0.36230698227882385, "learning_rate": 0.0001601597821011431, "loss": 3.9974, "step": 2079 }, { "epoch": 0.6652141757780488, "grad_norm": 0.3556344211101532, "learning_rate": 0.0001598848495003593, "loss": 3.8799, "step": 2080 }, { "epoch": 0.6655339902856343, "grad_norm": 0.35150814056396484, "learning_rate": 0.00015961006732678873, "loss": 3.9555, "step": 2081 }, { "epoch": 0.66585380479322, "grad_norm": 0.3764215111732483, "learning_rate": 0.00015933543587543682, "loss": 3.9014, "step": 2082 }, { "epoch": 0.6661736193008055, "grad_norm": 0.3252947926521301, "learning_rate": 0.0001590609554411472, "loss": 3.9993, "step": 2083 }, { "epoch": 0.6664934338083911, "grad_norm": 0.3725816309452057, "learning_rate": 0.0001587866263186009, "loss": 3.9159, "step": 2084 }, { "epoch": 0.6668132483159768, "grad_norm": 0.34076380729675293, "learning_rate": 0.0001585124488023173, "loss": 4.0098, "step": 2085 }, { "epoch": 0.6671330628235623, "grad_norm": 0.40748488903045654, "learning_rate": 0.00015823842318665233, "loss": 3.8034, "step": 2086 }, { "epoch": 0.6674528773311479, "grad_norm": 0.33367976546287537, "learning_rate": 0.00015796454976579901, "loss": 3.9897, "step": 2087 }, { "epoch": 0.6677726918387336, "grad_norm": 0.3090580403804779, "learning_rate": 0.00015769082883378737, "loss": 3.886, "step": 2088 }, { "epoch": 0.6680925063463191, "grad_norm": 0.3293338716030121, "learning_rate": 0.00015741726068448293, "loss": 3.8969, "step": 2089 }, { "epoch": 0.6684123208539048, "grad_norm": 0.3851467967033386, "learning_rate": 0.0001571438456115881, "loss": 3.9018, "step": 2090 }, { "epoch": 0.6687321353614903, "grad_norm": 0.35412824153900146, "learning_rate": 0.0001568705839086402, "loss": 3.9896, "step": 2091 }, { "epoch": 0.6690519498690759, "grad_norm": 0.4424075484275818, "learning_rate": 0.00015659747586901243, "loss": 3.9959, "step": 2092 }, { "epoch": 0.6693717643766616, "grad_norm": 0.36159658432006836, "learning_rate": 0.00015632452178591252, "loss": 3.9189, "step": 2093 }, { "epoch": 0.6696915788842471, "grad_norm": 0.318804532289505, "learning_rate": 0.00015605172195238314, "loss": 3.9192, "step": 2094 }, { "epoch": 0.6700113933918327, "grad_norm": 0.3826238512992859, "learning_rate": 0.00015577907666130178, "loss": 4.0211, "step": 2095 }, { "epoch": 0.6703312078994184, "grad_norm": 0.5648853778839111, "learning_rate": 0.00015550658620537932, "loss": 4.0116, "step": 2096 }, { "epoch": 0.6706510224070039, "grad_norm": 0.44683822989463806, "learning_rate": 0.0001552342508771608, "loss": 3.8176, "step": 2097 }, { "epoch": 0.6709708369145896, "grad_norm": 0.33599159121513367, "learning_rate": 0.00015496207096902457, "loss": 3.8934, "step": 2098 }, { "epoch": 0.6712906514221751, "grad_norm": 0.3600747585296631, "learning_rate": 0.00015469004677318214, "loss": 3.9369, "step": 2099 }, { "epoch": 0.6716104659297607, "grad_norm": 0.49049103260040283, "learning_rate": 0.000154418178581678, "loss": 3.9469, "step": 2100 }, { "epoch": 0.6716104659297607, "eval_runtime": 41.0371, "eval_samples_per_second": 46.226, "eval_steps_per_second": 11.575, "step": 2100 }, { "epoch": 0.6719302804373464, "grad_norm": 0.6127731800079346, "learning_rate": 0.00015414646668638897, "loss": 3.879, "step": 2101 }, { "epoch": 0.6722500949449319, "grad_norm": 0.3398420512676239, "learning_rate": 0.00015387491137902428, "loss": 3.9192, "step": 2102 }, { "epoch": 0.6725699094525175, "grad_norm": 0.4781002998352051, "learning_rate": 0.00015360351295112468, "loss": 3.8637, "step": 2103 }, { "epoch": 0.6728897239601032, "grad_norm": 0.44633492827415466, "learning_rate": 0.00015333227169406284, "loss": 3.9597, "step": 2104 }, { "epoch": 0.6732095384676887, "grad_norm": 0.3925057053565979, "learning_rate": 0.0001530611878990426, "loss": 3.8983, "step": 2105 }, { "epoch": 0.6735293529752744, "grad_norm": 0.3493632674217224, "learning_rate": 0.00015279026185709865, "loss": 3.8492, "step": 2106 }, { "epoch": 0.67384916748286, "grad_norm": 0.338154673576355, "learning_rate": 0.0001525194938590966, "loss": 3.9895, "step": 2107 }, { "epoch": 0.6741689819904455, "grad_norm": 0.3570291996002197, "learning_rate": 0.0001522488841957319, "loss": 3.9494, "step": 2108 }, { "epoch": 0.6744887964980312, "grad_norm": 0.3930520713329315, "learning_rate": 0.00015197843315753034, "loss": 4.0252, "step": 2109 }, { "epoch": 0.6748086110056167, "grad_norm": 0.3284169137477875, "learning_rate": 0.00015170814103484747, "loss": 3.9398, "step": 2110 }, { "epoch": 0.6751284255132023, "grad_norm": 0.41097426414489746, "learning_rate": 0.00015143800811786805, "loss": 3.9079, "step": 2111 }, { "epoch": 0.675448240020788, "grad_norm": 0.38808372616767883, "learning_rate": 0.00015116803469660616, "loss": 3.9701, "step": 2112 }, { "epoch": 0.6757680545283735, "grad_norm": 0.5045790076255798, "learning_rate": 0.00015089822106090418, "loss": 3.9765, "step": 2113 }, { "epoch": 0.6760878690359592, "grad_norm": 0.427201509475708, "learning_rate": 0.00015062856750043343, "loss": 3.8885, "step": 2114 }, { "epoch": 0.6764076835435447, "grad_norm": 0.3949994146823883, "learning_rate": 0.00015035907430469304, "loss": 3.9412, "step": 2115 }, { "epoch": 0.6767274980511303, "grad_norm": 0.36145153641700745, "learning_rate": 0.00015008974176301031, "loss": 3.9604, "step": 2116 }, { "epoch": 0.677047312558716, "grad_norm": 0.4452461004257202, "learning_rate": 0.00014982057016453969, "loss": 3.9275, "step": 2117 }, { "epoch": 0.6773671270663015, "grad_norm": 0.4273628890514374, "learning_rate": 0.00014955155979826302, "loss": 3.93, "step": 2118 }, { "epoch": 0.6776869415738871, "grad_norm": 0.3132435977458954, "learning_rate": 0.00014928271095298912, "loss": 3.8008, "step": 2119 }, { "epoch": 0.6780067560814728, "grad_norm": 0.3293551206588745, "learning_rate": 0.00014901402391735328, "loss": 3.9471, "step": 2120 }, { "epoch": 0.6783265705890583, "grad_norm": 0.34885716438293457, "learning_rate": 0.00014874549897981725, "loss": 3.8886, "step": 2121 }, { "epoch": 0.678646385096644, "grad_norm": 0.6618058085441589, "learning_rate": 0.00014847713642866835, "loss": 3.9977, "step": 2122 }, { "epoch": 0.6789661996042295, "grad_norm": 0.4187517464160919, "learning_rate": 0.00014820893655201998, "loss": 3.9031, "step": 2123 }, { "epoch": 0.6792860141118151, "grad_norm": 0.3431651294231415, "learning_rate": 0.0001479408996378107, "loss": 3.8949, "step": 2124 }, { "epoch": 0.6796058286194008, "grad_norm": 0.3684055805206299, "learning_rate": 0.00014767302597380418, "loss": 4.0188, "step": 2125 }, { "epoch": 0.6799256431269863, "grad_norm": 0.464478462934494, "learning_rate": 0.0001474053158475889, "loss": 3.9005, "step": 2126 }, { "epoch": 0.6802454576345719, "grad_norm": 0.44516369700431824, "learning_rate": 0.00014713776954657743, "loss": 3.9087, "step": 2127 }, { "epoch": 0.6805652721421576, "grad_norm": 0.4766101539134979, "learning_rate": 0.00014687038735800693, "loss": 3.9896, "step": 2128 }, { "epoch": 0.6808850866497431, "grad_norm": 0.37514054775238037, "learning_rate": 0.0001466031695689378, "loss": 3.908, "step": 2129 }, { "epoch": 0.6812049011573288, "grad_norm": 0.4369240403175354, "learning_rate": 0.0001463361164662546, "loss": 3.8942, "step": 2130 }, { "epoch": 0.6815247156649143, "grad_norm": 0.5984254479408264, "learning_rate": 0.00014606922833666476, "loss": 3.9269, "step": 2131 }, { "epoch": 0.6818445301724999, "grad_norm": 0.3306334614753723, "learning_rate": 0.00014580250546669836, "loss": 3.9577, "step": 2132 }, { "epoch": 0.6821643446800856, "grad_norm": 0.40419507026672363, "learning_rate": 0.0001455359481427085, "loss": 3.9854, "step": 2133 }, { "epoch": 0.6824841591876711, "grad_norm": 0.33096417784690857, "learning_rate": 0.00014526955665087013, "loss": 3.8751, "step": 2134 }, { "epoch": 0.6828039736952568, "grad_norm": 0.3930308520793915, "learning_rate": 0.00014500333127718035, "loss": 3.9765, "step": 2135 }, { "epoch": 0.6831237882028424, "grad_norm": 0.4103385806083679, "learning_rate": 0.00014473727230745833, "loss": 3.9125, "step": 2136 }, { "epoch": 0.6834436027104279, "grad_norm": 0.32777532935142517, "learning_rate": 0.0001444713800273438, "loss": 3.969, "step": 2137 }, { "epoch": 0.6837634172180136, "grad_norm": 0.4517122507095337, "learning_rate": 0.0001442056547222982, "loss": 3.9749, "step": 2138 }, { "epoch": 0.6840832317255992, "grad_norm": 0.3686842620372772, "learning_rate": 0.0001439400966776032, "loss": 3.9288, "step": 2139 }, { "epoch": 0.6844030462331847, "grad_norm": 0.3947031497955322, "learning_rate": 0.00014367470617836117, "loss": 4.0324, "step": 2140 }, { "epoch": 0.6847228607407704, "grad_norm": 0.47459638118743896, "learning_rate": 0.00014340948350949467, "loss": 4.0363, "step": 2141 }, { "epoch": 0.6850426752483559, "grad_norm": 0.3568572700023651, "learning_rate": 0.00014314442895574595, "loss": 3.9646, "step": 2142 }, { "epoch": 0.6853624897559416, "grad_norm": 0.5114895701408386, "learning_rate": 0.00014287954280167695, "loss": 3.8598, "step": 2143 }, { "epoch": 0.6856823042635272, "grad_norm": 0.6017412543296814, "learning_rate": 0.00014261482533166832, "loss": 3.7981, "step": 2144 }, { "epoch": 0.6860021187711127, "grad_norm": 0.48608461022377014, "learning_rate": 0.0001423502768299202, "loss": 3.8874, "step": 2145 }, { "epoch": 0.6863219332786984, "grad_norm": 0.5386192202568054, "learning_rate": 0.00014208589758045098, "loss": 3.9477, "step": 2146 }, { "epoch": 0.686641747786284, "grad_norm": 0.4803098738193512, "learning_rate": 0.00014182168786709755, "loss": 3.912, "step": 2147 }, { "epoch": 0.6869615622938695, "grad_norm": 0.506546676158905, "learning_rate": 0.00014155764797351472, "loss": 4.0117, "step": 2148 }, { "epoch": 0.6872813768014552, "grad_norm": 0.5137774348258972, "learning_rate": 0.0001412937781831747, "loss": 4.0251, "step": 2149 }, { "epoch": 0.6876011913090407, "grad_norm": 0.3707796633243561, "learning_rate": 0.0001410300787793675, "loss": 3.9329, "step": 2150 }, { "epoch": 0.6879210058166264, "grad_norm": 0.6940580010414124, "learning_rate": 0.00014076655004519997, "loss": 3.9045, "step": 2151 }, { "epoch": 0.688240820324212, "grad_norm": 0.5341334939002991, "learning_rate": 0.00014050319226359593, "loss": 4.0082, "step": 2152 }, { "epoch": 0.6885606348317975, "grad_norm": 0.30568039417266846, "learning_rate": 0.00014024000571729526, "loss": 3.8895, "step": 2153 }, { "epoch": 0.6888804493393832, "grad_norm": 0.2981771230697632, "learning_rate": 0.00013997699068885443, "loss": 3.8915, "step": 2154 }, { "epoch": 0.6892002638469688, "grad_norm": 0.46084609627723694, "learning_rate": 0.00013971414746064554, "loss": 3.9449, "step": 2155 }, { "epoch": 0.6895200783545543, "grad_norm": 0.3918108344078064, "learning_rate": 0.00013945147631485634, "loss": 3.8658, "step": 2156 }, { "epoch": 0.68983989286214, "grad_norm": 0.35267341136932373, "learning_rate": 0.00013918897753348991, "loss": 3.878, "step": 2157 }, { "epoch": 0.6901597073697255, "grad_norm": 0.3140313923358917, "learning_rate": 0.00013892665139836392, "loss": 3.8886, "step": 2158 }, { "epoch": 0.6904795218773112, "grad_norm": 0.30501848459243774, "learning_rate": 0.0001386644981911111, "loss": 3.9902, "step": 2159 }, { "epoch": 0.6907993363848968, "grad_norm": 0.39345335960388184, "learning_rate": 0.00013840251819317832, "loss": 3.9088, "step": 2160 }, { "epoch": 0.6911191508924823, "grad_norm": 0.4248923361301422, "learning_rate": 0.00013814071168582654, "loss": 3.8892, "step": 2161 }, { "epoch": 0.691438965400068, "grad_norm": 0.3433879017829895, "learning_rate": 0.00013787907895013054, "loss": 3.8927, "step": 2162 }, { "epoch": 0.6917587799076536, "grad_norm": 0.4030303955078125, "learning_rate": 0.0001376176202669783, "loss": 3.8597, "step": 2163 }, { "epoch": 0.6920785944152391, "grad_norm": 0.3271911144256592, "learning_rate": 0.00013735633591707117, "loss": 3.9599, "step": 2164 }, { "epoch": 0.6923984089228248, "grad_norm": 0.3707104027271271, "learning_rate": 0.00013709522618092328, "loss": 3.864, "step": 2165 }, { "epoch": 0.6927182234304103, "grad_norm": 0.34690529108047485, "learning_rate": 0.00013683429133886122, "loss": 3.8954, "step": 2166 }, { "epoch": 0.693038037937996, "grad_norm": 0.41822192072868347, "learning_rate": 0.00013657353167102401, "loss": 3.9738, "step": 2167 }, { "epoch": 0.6933578524455816, "grad_norm": 0.3262981176376343, "learning_rate": 0.00013631294745736227, "loss": 3.9278, "step": 2168 }, { "epoch": 0.6936776669531671, "grad_norm": 0.3289790451526642, "learning_rate": 0.0001360525389776385, "loss": 3.8595, "step": 2169 }, { "epoch": 0.6939974814607528, "grad_norm": 0.381346732378006, "learning_rate": 0.00013579230651142654, "loss": 3.9149, "step": 2170 }, { "epoch": 0.6943172959683384, "grad_norm": 0.3445601463317871, "learning_rate": 0.00013553225033811114, "loss": 3.9363, "step": 2171 }, { "epoch": 0.6946371104759239, "grad_norm": 0.41692402958869934, "learning_rate": 0.00013527237073688797, "loss": 4.0257, "step": 2172 }, { "epoch": 0.6949569249835096, "grad_norm": 0.3924713730812073, "learning_rate": 0.00013501266798676283, "loss": 3.9543, "step": 2173 }, { "epoch": 0.6952767394910951, "grad_norm": 0.33074504137039185, "learning_rate": 0.000134753142366552, "loss": 3.9175, "step": 2174 }, { "epoch": 0.6955965539986808, "grad_norm": 0.32100334763526917, "learning_rate": 0.0001344937941548811, "loss": 3.9623, "step": 2175 }, { "epoch": 0.6959163685062664, "grad_norm": 0.35004085302352905, "learning_rate": 0.00013423462363018604, "loss": 3.8106, "step": 2176 }, { "epoch": 0.6962361830138519, "grad_norm": 0.46313849091529846, "learning_rate": 0.00013397563107071125, "loss": 3.905, "step": 2177 }, { "epoch": 0.6965559975214376, "grad_norm": 0.42949020862579346, "learning_rate": 0.0001337168167545104, "loss": 4.0118, "step": 2178 }, { "epoch": 0.6968758120290232, "grad_norm": 0.3588680326938629, "learning_rate": 0.000133458180959446, "loss": 3.9537, "step": 2179 }, { "epoch": 0.6971956265366087, "grad_norm": 0.3486484885215759, "learning_rate": 0.00013319972396318828, "loss": 3.908, "step": 2180 }, { "epoch": 0.6975154410441944, "grad_norm": 0.3724646270275116, "learning_rate": 0.00013294144604321633, "loss": 3.9627, "step": 2181 }, { "epoch": 0.6978352555517799, "grad_norm": 0.3343456983566284, "learning_rate": 0.00013268334747681626, "loss": 3.8584, "step": 2182 }, { "epoch": 0.6981550700593656, "grad_norm": 0.5656236410140991, "learning_rate": 0.0001324254285410821, "loss": 3.9563, "step": 2183 }, { "epoch": 0.6984748845669512, "grad_norm": 0.4718714952468872, "learning_rate": 0.0001321676895129149, "loss": 3.9122, "step": 2184 }, { "epoch": 0.6987946990745367, "grad_norm": 0.32784634828567505, "learning_rate": 0.0001319101306690222, "loss": 3.8895, "step": 2185 }, { "epoch": 0.6991145135821224, "grad_norm": 0.3506825268268585, "learning_rate": 0.0001316527522859189, "loss": 3.9374, "step": 2186 }, { "epoch": 0.699434328089708, "grad_norm": 0.298984169960022, "learning_rate": 0.00013139555463992527, "loss": 3.8519, "step": 2187 }, { "epoch": 0.6997541425972935, "grad_norm": 0.48135343194007874, "learning_rate": 0.00013113853800716824, "loss": 3.9041, "step": 2188 }, { "epoch": 0.7000739571048792, "grad_norm": 0.44690045714378357, "learning_rate": 0.00013088170266357986, "loss": 3.9453, "step": 2189 }, { "epoch": 0.7003937716124647, "grad_norm": 0.38618242740631104, "learning_rate": 0.00013062504888489788, "loss": 3.9348, "step": 2190 }, { "epoch": 0.7007135861200504, "grad_norm": 0.5404916405677795, "learning_rate": 0.0001303685769466651, "loss": 4.0107, "step": 2191 }, { "epoch": 0.701033400627636, "grad_norm": 0.6092861294746399, "learning_rate": 0.00013011228712422898, "loss": 3.9074, "step": 2192 }, { "epoch": 0.7013532151352215, "grad_norm": 0.49827849864959717, "learning_rate": 0.0001298561796927417, "loss": 4.0187, "step": 2193 }, { "epoch": 0.7016730296428072, "grad_norm": 0.32609912753105164, "learning_rate": 0.00012960025492715914, "loss": 3.9344, "step": 2194 }, { "epoch": 0.7019928441503928, "grad_norm": 0.37241941690444946, "learning_rate": 0.0001293445131022416, "loss": 3.8965, "step": 2195 }, { "epoch": 0.7023126586579783, "grad_norm": 0.314900666475296, "learning_rate": 0.00012908895449255262, "loss": 3.8302, "step": 2196 }, { "epoch": 0.702632473165564, "grad_norm": 0.341109961271286, "learning_rate": 0.0001288335793724592, "loss": 3.963, "step": 2197 }, { "epoch": 0.7029522876731495, "grad_norm": 0.4221562445163727, "learning_rate": 0.00012857838801613153, "loss": 3.923, "step": 2198 }, { "epoch": 0.7032721021807352, "grad_norm": 0.5216532945632935, "learning_rate": 0.000128323380697542, "loss": 3.764, "step": 2199 }, { "epoch": 0.7035919166883208, "grad_norm": 0.5267884731292725, "learning_rate": 0.0001280685576904658, "loss": 3.9321, "step": 2200 }, { "epoch": 0.7035919166883208, "eval_runtime": 43.2604, "eval_samples_per_second": 43.851, "eval_steps_per_second": 10.98, "step": 2200 }, { "epoch": 0.7039117311959063, "grad_norm": 0.34158656001091003, "learning_rate": 0.0001278139192684802, "loss": 3.9104, "step": 2201 }, { "epoch": 0.704231545703492, "grad_norm": 0.3815334439277649, "learning_rate": 0.00012755946570496427, "loss": 3.9014, "step": 2202 }, { "epoch": 0.7045513602110776, "grad_norm": 0.38654825091362, "learning_rate": 0.0001273051972730987, "loss": 3.8325, "step": 2203 }, { "epoch": 0.7048711747186632, "grad_norm": 0.4327349364757538, "learning_rate": 0.00012705111424586512, "loss": 3.9616, "step": 2204 }, { "epoch": 0.7051909892262488, "grad_norm": 0.4033554196357727, "learning_rate": 0.00012679721689604642, "loss": 3.9325, "step": 2205 }, { "epoch": 0.7055108037338343, "grad_norm": 0.33580973744392395, "learning_rate": 0.00012654350549622605, "loss": 3.9443, "step": 2206 }, { "epoch": 0.70583061824142, "grad_norm": 0.45306217670440674, "learning_rate": 0.00012628998031878784, "loss": 3.8437, "step": 2207 }, { "epoch": 0.7061504327490056, "grad_norm": 0.348193883895874, "learning_rate": 0.00012603664163591573, "loss": 3.9793, "step": 2208 }, { "epoch": 0.7064702472565911, "grad_norm": 0.34074461460113525, "learning_rate": 0.00012578348971959324, "loss": 3.8667, "step": 2209 }, { "epoch": 0.7067900617641768, "grad_norm": 0.324503093957901, "learning_rate": 0.0001255305248416036, "loss": 3.8424, "step": 2210 }, { "epoch": 0.7071098762717624, "grad_norm": 0.4301024079322815, "learning_rate": 0.0001252777472735291, "loss": 3.9466, "step": 2211 }, { "epoch": 0.707429690779348, "grad_norm": 0.3237893879413605, "learning_rate": 0.00012502515728675124, "loss": 3.9057, "step": 2212 }, { "epoch": 0.7077495052869336, "grad_norm": 0.3247719705104828, "learning_rate": 0.00012477275515244951, "loss": 3.932, "step": 2213 }, { "epoch": 0.7080693197945191, "grad_norm": 0.36865073442459106, "learning_rate": 0.00012452054114160232, "loss": 3.9571, "step": 2214 }, { "epoch": 0.7083891343021048, "grad_norm": 0.3585265278816223, "learning_rate": 0.00012426851552498584, "loss": 3.9117, "step": 2215 }, { "epoch": 0.7087089488096904, "grad_norm": 0.4705827534198761, "learning_rate": 0.00012401667857317406, "loss": 3.9052, "step": 2216 }, { "epoch": 0.7090287633172759, "grad_norm": 0.35736343264579773, "learning_rate": 0.0001237650305565385, "loss": 4.0187, "step": 2217 }, { "epoch": 0.7093485778248616, "grad_norm": 0.3399057686328888, "learning_rate": 0.00012351357174524745, "loss": 3.9818, "step": 2218 }, { "epoch": 0.7096683923324472, "grad_norm": 0.4361019730567932, "learning_rate": 0.00012326230240926653, "loss": 3.9809, "step": 2219 }, { "epoch": 0.7099882068400328, "grad_norm": 0.3767326772212982, "learning_rate": 0.00012301122281835772, "loss": 3.9454, "step": 2220 }, { "epoch": 0.7103080213476184, "grad_norm": 0.3601025342941284, "learning_rate": 0.00012276033324207935, "loss": 3.7672, "step": 2221 }, { "epoch": 0.710627835855204, "grad_norm": 0.34189024567604065, "learning_rate": 0.00012250963394978584, "loss": 3.9546, "step": 2222 }, { "epoch": 0.7109476503627896, "grad_norm": 0.3055751621723175, "learning_rate": 0.00012225912521062702, "loss": 3.9749, "step": 2223 }, { "epoch": 0.7112674648703752, "grad_norm": 0.37792590260505676, "learning_rate": 0.00012200880729354847, "loss": 3.9635, "step": 2224 }, { "epoch": 0.7115872793779607, "grad_norm": 0.4625628888607025, "learning_rate": 0.0001217586804672905, "loss": 3.8821, "step": 2225 }, { "epoch": 0.7119070938855464, "grad_norm": 0.35062894225120544, "learning_rate": 0.0001215087450003889, "loss": 4.0088, "step": 2226 }, { "epoch": 0.712226908393132, "grad_norm": 0.37986505031585693, "learning_rate": 0.00012125900116117357, "loss": 3.7118, "step": 2227 }, { "epoch": 0.7125467229007176, "grad_norm": 0.47540727257728577, "learning_rate": 0.0001210094492177686, "loss": 3.8778, "step": 2228 }, { "epoch": 0.7128665374083032, "grad_norm": 0.3193809390068054, "learning_rate": 0.00012076008943809238, "loss": 3.9178, "step": 2229 }, { "epoch": 0.7131863519158887, "grad_norm": 0.49913179874420166, "learning_rate": 0.00012051092208985671, "loss": 3.9391, "step": 2230 }, { "epoch": 0.7135061664234744, "grad_norm": 0.40152502059936523, "learning_rate": 0.00012026194744056684, "loss": 3.8833, "step": 2231 }, { "epoch": 0.71382598093106, "grad_norm": 0.4507232904434204, "learning_rate": 0.00012001316575752159, "loss": 3.9177, "step": 2232 }, { "epoch": 0.7141457954386455, "grad_norm": 0.3616706132888794, "learning_rate": 0.00011976457730781191, "loss": 3.96, "step": 2233 }, { "epoch": 0.7144656099462312, "grad_norm": 0.346789687871933, "learning_rate": 0.00011951618235832183, "loss": 3.8765, "step": 2234 }, { "epoch": 0.7147854244538168, "grad_norm": 0.3568446934223175, "learning_rate": 0.00011926798117572722, "loss": 4.0083, "step": 2235 }, { "epoch": 0.7151052389614024, "grad_norm": 0.3346664011478424, "learning_rate": 0.00011901997402649629, "loss": 3.93, "step": 2236 }, { "epoch": 0.715425053468988, "grad_norm": 0.34739163517951965, "learning_rate": 0.00011877216117688875, "loss": 3.9084, "step": 2237 }, { "epoch": 0.7157448679765736, "grad_norm": 0.3533191680908203, "learning_rate": 0.00011852454289295575, "loss": 3.8385, "step": 2238 }, { "epoch": 0.7160646824841592, "grad_norm": 0.5475122928619385, "learning_rate": 0.00011827711944053962, "loss": 3.852, "step": 2239 }, { "epoch": 0.7163844969917448, "grad_norm": 0.30915728211402893, "learning_rate": 0.00011802989108527331, "loss": 3.9887, "step": 2240 }, { "epoch": 0.7167043114993303, "grad_norm": 0.3982253968715668, "learning_rate": 0.00011778285809258052, "loss": 3.9105, "step": 2241 }, { "epoch": 0.717024126006916, "grad_norm": 0.4199460744857788, "learning_rate": 0.00011753602072767514, "loss": 3.929, "step": 2242 }, { "epoch": 0.7173439405145016, "grad_norm": 0.4522932767868042, "learning_rate": 0.00011728937925556107, "loss": 3.8438, "step": 2243 }, { "epoch": 0.7176637550220872, "grad_norm": 0.6459726095199585, "learning_rate": 0.00011704293394103194, "loss": 3.8697, "step": 2244 }, { "epoch": 0.7179835695296728, "grad_norm": 0.6647776365280151, "learning_rate": 0.00011679668504867051, "loss": 3.9786, "step": 2245 }, { "epoch": 0.7183033840372584, "grad_norm": 0.3584718406200409, "learning_rate": 0.00011655063284284901, "loss": 3.8488, "step": 2246 }, { "epoch": 0.718623198544844, "grad_norm": 0.49823465943336487, "learning_rate": 0.0001163047775877283, "loss": 3.8595, "step": 2247 }, { "epoch": 0.7189430130524296, "grad_norm": 0.4789009392261505, "learning_rate": 0.00011605911954725802, "loss": 3.8669, "step": 2248 }, { "epoch": 0.7192628275600151, "grad_norm": 0.43157705664634705, "learning_rate": 0.00011581365898517567, "loss": 3.7743, "step": 2249 }, { "epoch": 0.7195826420676008, "grad_norm": 0.7434427738189697, "learning_rate": 0.0001155683961650071, "loss": 3.9365, "step": 2250 }, { "epoch": 0.7199024565751864, "grad_norm": 0.3248288631439209, "learning_rate": 0.00011532333135006579, "loss": 3.8418, "step": 2251 }, { "epoch": 0.720222271082772, "grad_norm": 0.3763604164123535, "learning_rate": 0.00011507846480345255, "loss": 3.9434, "step": 2252 }, { "epoch": 0.7205420855903576, "grad_norm": 0.3775944411754608, "learning_rate": 0.00011483379678805551, "loss": 3.8144, "step": 2253 }, { "epoch": 0.7208619000979432, "grad_norm": 0.5250409245491028, "learning_rate": 0.00011458932756654938, "loss": 3.9064, "step": 2254 }, { "epoch": 0.7211817146055288, "grad_norm": 0.3435560464859009, "learning_rate": 0.00011434505740139558, "loss": 3.8758, "step": 2255 }, { "epoch": 0.7215015291131144, "grad_norm": 0.32786789536476135, "learning_rate": 0.00011410098655484194, "loss": 3.8758, "step": 2256 }, { "epoch": 0.7218213436206999, "grad_norm": 0.3942683935165405, "learning_rate": 0.00011385711528892216, "loss": 3.8991, "step": 2257 }, { "epoch": 0.7221411581282856, "grad_norm": 0.4304429292678833, "learning_rate": 0.00011361344386545585, "loss": 3.9732, "step": 2258 }, { "epoch": 0.7224609726358712, "grad_norm": 0.3767305016517639, "learning_rate": 0.00011336997254604769, "loss": 3.8725, "step": 2259 }, { "epoch": 0.7227807871434568, "grad_norm": 0.40955471992492676, "learning_rate": 0.0001131267015920879, "loss": 3.9209, "step": 2260 }, { "epoch": 0.7231006016510424, "grad_norm": 0.3473454415798187, "learning_rate": 0.0001128836312647514, "loss": 3.8335, "step": 2261 }, { "epoch": 0.723420416158628, "grad_norm": 0.39475351572036743, "learning_rate": 0.00011264076182499787, "loss": 3.9802, "step": 2262 }, { "epoch": 0.7237402306662136, "grad_norm": 0.46817660331726074, "learning_rate": 0.00011239809353357127, "loss": 3.9751, "step": 2263 }, { "epoch": 0.7240600451737992, "grad_norm": 0.3184073865413666, "learning_rate": 0.00011215562665099941, "loss": 3.7843, "step": 2264 }, { "epoch": 0.7243798596813849, "grad_norm": 0.5899235010147095, "learning_rate": 0.00011191336143759417, "loss": 3.9219, "step": 2265 }, { "epoch": 0.7246996741889704, "grad_norm": 0.3494570851325989, "learning_rate": 0.00011167129815345048, "loss": 3.9111, "step": 2266 }, { "epoch": 0.725019488696556, "grad_norm": 0.3236294388771057, "learning_rate": 0.0001114294370584471, "loss": 3.8425, "step": 2267 }, { "epoch": 0.7253393032041416, "grad_norm": 0.438624769449234, "learning_rate": 0.00011118777841224534, "loss": 3.9389, "step": 2268 }, { "epoch": 0.7256591177117272, "grad_norm": 0.33403080701828003, "learning_rate": 0.00011094632247428907, "loss": 3.8045, "step": 2269 }, { "epoch": 0.7259789322193128, "grad_norm": 0.3179053068161011, "learning_rate": 0.00011070506950380483, "loss": 3.8384, "step": 2270 }, { "epoch": 0.7262987467268984, "grad_norm": 0.47933289408683777, "learning_rate": 0.0001104640197598008, "loss": 3.8943, "step": 2271 }, { "epoch": 0.726618561234484, "grad_norm": 0.468839555978775, "learning_rate": 0.00011022317350106774, "loss": 3.8996, "step": 2272 }, { "epoch": 0.7269383757420697, "grad_norm": 0.31900113821029663, "learning_rate": 0.00010998253098617707, "loss": 3.9385, "step": 2273 }, { "epoch": 0.7272581902496552, "grad_norm": 0.4792587459087372, "learning_rate": 0.00010974209247348211, "loss": 3.8824, "step": 2274 }, { "epoch": 0.7275780047572408, "grad_norm": 0.3987066149711609, "learning_rate": 0.00010950185822111697, "loss": 3.8361, "step": 2275 }, { "epoch": 0.7278978192648264, "grad_norm": 0.4270268380641937, "learning_rate": 0.00010926182848699613, "loss": 3.8872, "step": 2276 }, { "epoch": 0.728217633772412, "grad_norm": 0.5016852021217346, "learning_rate": 0.00010902200352881522, "loss": 4.0529, "step": 2277 }, { "epoch": 0.7285374482799976, "grad_norm": 0.3930438756942749, "learning_rate": 0.00010878238360404934, "loss": 3.8746, "step": 2278 }, { "epoch": 0.7288572627875832, "grad_norm": 0.5877540111541748, "learning_rate": 0.00010854296896995379, "loss": 4.0049, "step": 2279 }, { "epoch": 0.7291770772951688, "grad_norm": 0.513640820980072, "learning_rate": 0.00010830375988356354, "loss": 3.8858, "step": 2280 }, { "epoch": 0.7294968918027545, "grad_norm": 0.5378215312957764, "learning_rate": 0.00010806475660169243, "loss": 3.8633, "step": 2281 }, { "epoch": 0.72981670631034, "grad_norm": 0.39620062708854675, "learning_rate": 0.00010782595938093417, "loss": 3.9657, "step": 2282 }, { "epoch": 0.7301365208179256, "grad_norm": 0.4462153911590576, "learning_rate": 0.00010758736847766033, "loss": 3.8706, "step": 2283 }, { "epoch": 0.7304563353255112, "grad_norm": 0.43002307415008545, "learning_rate": 0.00010734898414802169, "loss": 3.8629, "step": 2284 }, { "epoch": 0.7307761498330968, "grad_norm": 0.42244669795036316, "learning_rate": 0.00010711080664794676, "loss": 3.9204, "step": 2285 }, { "epoch": 0.7310959643406824, "grad_norm": 0.5252935290336609, "learning_rate": 0.00010687283623314225, "loss": 3.8745, "step": 2286 }, { "epoch": 0.731415778848268, "grad_norm": 0.392553448677063, "learning_rate": 0.00010663507315909255, "loss": 3.9157, "step": 2287 }, { "epoch": 0.7317355933558536, "grad_norm": 0.3386857807636261, "learning_rate": 0.00010639751768105936, "loss": 3.8901, "step": 2288 }, { "epoch": 0.7320554078634393, "grad_norm": 0.38901957869529724, "learning_rate": 0.00010616017005408167, "loss": 3.8844, "step": 2289 }, { "epoch": 0.7323752223710248, "grad_norm": 0.4061199724674225, "learning_rate": 0.00010592303053297499, "loss": 3.8884, "step": 2290 }, { "epoch": 0.7326950368786104, "grad_norm": 0.41093528270721436, "learning_rate": 0.00010568609937233168, "loss": 3.8815, "step": 2291 }, { "epoch": 0.733014851386196, "grad_norm": 0.5832483768463135, "learning_rate": 0.00010544937682652035, "loss": 3.9001, "step": 2292 }, { "epoch": 0.7333346658937816, "grad_norm": 0.3580489754676819, "learning_rate": 0.00010521286314968567, "loss": 3.8637, "step": 2293 }, { "epoch": 0.7336544804013672, "grad_norm": 0.3755680322647095, "learning_rate": 0.00010497655859574809, "loss": 3.8853, "step": 2294 }, { "epoch": 0.7339742949089528, "grad_norm": 0.38433733582496643, "learning_rate": 0.00010474046341840329, "loss": 3.9131, "step": 2295 }, { "epoch": 0.7342941094165384, "grad_norm": 0.4288310408592224, "learning_rate": 0.00010450457787112246, "loss": 3.9103, "step": 2296 }, { "epoch": 0.7346139239241241, "grad_norm": 0.3748302161693573, "learning_rate": 0.00010426890220715164, "loss": 3.8531, "step": 2297 }, { "epoch": 0.7349337384317096, "grad_norm": 0.33175036311149597, "learning_rate": 0.00010403343667951149, "loss": 3.7888, "step": 2298 }, { "epoch": 0.7352535529392952, "grad_norm": 0.4560914933681488, "learning_rate": 0.00010379818154099724, "loss": 3.9013, "step": 2299 }, { "epoch": 0.7355733674468808, "grad_norm": 0.4435993432998657, "learning_rate": 0.00010356313704417794, "loss": 3.8924, "step": 2300 }, { "epoch": 0.7355733674468808, "eval_runtime": 43.2856, "eval_samples_per_second": 43.825, "eval_steps_per_second": 10.974, "step": 2300 }, { "epoch": 0.7358931819544664, "grad_norm": 0.37959930300712585, "learning_rate": 0.0001033283034413967, "loss": 3.8052, "step": 2301 }, { "epoch": 0.736212996462052, "grad_norm": 0.4684748947620392, "learning_rate": 0.00010309368098477025, "loss": 3.9475, "step": 2302 }, { "epoch": 0.7365328109696376, "grad_norm": 0.3512060046195984, "learning_rate": 0.00010285926992618855, "loss": 3.851, "step": 2303 }, { "epoch": 0.7368526254772232, "grad_norm": 0.39180752635002136, "learning_rate": 0.0001026250705173147, "loss": 3.8624, "step": 2304 }, { "epoch": 0.7371724399848089, "grad_norm": 0.45222827792167664, "learning_rate": 0.00010239108300958432, "loss": 3.9378, "step": 2305 }, { "epoch": 0.7374922544923944, "grad_norm": 0.6518957614898682, "learning_rate": 0.00010215730765420579, "loss": 3.9193, "step": 2306 }, { "epoch": 0.73781206899998, "grad_norm": 0.365378201007843, "learning_rate": 0.00010192374470215969, "loss": 3.9094, "step": 2307 }, { "epoch": 0.7381318835075656, "grad_norm": 0.4570569694042206, "learning_rate": 0.00010169039440419855, "loss": 3.9398, "step": 2308 }, { "epoch": 0.7384516980151512, "grad_norm": 0.3978157937526703, "learning_rate": 0.00010145725701084643, "loss": 3.8431, "step": 2309 }, { "epoch": 0.7387715125227368, "grad_norm": 0.3495570123195648, "learning_rate": 0.000101224332772399, "loss": 3.7936, "step": 2310 }, { "epoch": 0.7390913270303224, "grad_norm": 0.4211837649345398, "learning_rate": 0.00010099162193892303, "loss": 3.944, "step": 2311 }, { "epoch": 0.739411141537908, "grad_norm": 0.5703482627868652, "learning_rate": 0.00010075912476025623, "loss": 3.9754, "step": 2312 }, { "epoch": 0.7397309560454937, "grad_norm": 0.3191204369068146, "learning_rate": 0.000100526841486007, "loss": 3.8252, "step": 2313 }, { "epoch": 0.7400507705530792, "grad_norm": 0.41751861572265625, "learning_rate": 0.00010029477236555372, "loss": 3.9062, "step": 2314 }, { "epoch": 0.7403705850606648, "grad_norm": 0.41126886010169983, "learning_rate": 0.00010006291764804523, "loss": 3.9022, "step": 2315 }, { "epoch": 0.7406903995682504, "grad_norm": 0.7578606605529785, "learning_rate": 9.98312775824001e-05, "loss": 3.8212, "step": 2316 }, { "epoch": 0.741010214075836, "grad_norm": 0.4711100161075592, "learning_rate": 9.959985241730641e-05, "loss": 3.9019, "step": 2317 }, { "epoch": 0.7413300285834216, "grad_norm": 0.3300808072090149, "learning_rate": 9.936864240122164e-05, "loss": 3.8553, "step": 2318 }, { "epoch": 0.7416498430910072, "grad_norm": 0.4489166736602783, "learning_rate": 9.913764778237196e-05, "loss": 3.8475, "step": 2319 }, { "epoch": 0.7419696575985928, "grad_norm": 0.4848867952823639, "learning_rate": 9.890686880875274e-05, "loss": 3.9862, "step": 2320 }, { "epoch": 0.7422894721061785, "grad_norm": 0.41174688935279846, "learning_rate": 9.86763057281273e-05, "loss": 3.8462, "step": 2321 }, { "epoch": 0.742609286613764, "grad_norm": 0.5664379000663757, "learning_rate": 9.844595878802778e-05, "loss": 3.8221, "step": 2322 }, { "epoch": 0.7429291011213496, "grad_norm": 0.46127867698669434, "learning_rate": 9.821582823575398e-05, "loss": 3.9355, "step": 2323 }, { "epoch": 0.7432489156289352, "grad_norm": 0.40406090021133423, "learning_rate": 9.79859143183732e-05, "loss": 3.9704, "step": 2324 }, { "epoch": 0.7435687301365208, "grad_norm": 0.5955961346626282, "learning_rate": 9.77562172827205e-05, "loss": 3.8976, "step": 2325 }, { "epoch": 0.7438885446441064, "grad_norm": 0.49023935198783875, "learning_rate": 9.752673737539779e-05, "loss": 3.9182, "step": 2326 }, { "epoch": 0.744208359151692, "grad_norm": 0.6680769324302673, "learning_rate": 9.729747484277402e-05, "loss": 3.8453, "step": 2327 }, { "epoch": 0.7445281736592776, "grad_norm": 0.405144065618515, "learning_rate": 9.706842993098503e-05, "loss": 3.9012, "step": 2328 }, { "epoch": 0.7448479881668633, "grad_norm": 0.37068724632263184, "learning_rate": 9.683960288593249e-05, "loss": 3.8806, "step": 2329 }, { "epoch": 0.7451678026744488, "grad_norm": 0.33892303705215454, "learning_rate": 9.661099395328463e-05, "loss": 3.8323, "step": 2330 }, { "epoch": 0.7454876171820344, "grad_norm": 0.3691041171550751, "learning_rate": 9.638260337847513e-05, "loss": 3.8524, "step": 2331 }, { "epoch": 0.74580743168962, "grad_norm": 0.34266188740730286, "learning_rate": 9.615443140670357e-05, "loss": 3.873, "step": 2332 }, { "epoch": 0.7461272461972056, "grad_norm": 0.3864574134349823, "learning_rate": 9.592647828293468e-05, "loss": 3.8554, "step": 2333 }, { "epoch": 0.7464470607047913, "grad_norm": 0.337662935256958, "learning_rate": 9.569874425189827e-05, "loss": 3.809, "step": 2334 }, { "epoch": 0.7467668752123768, "grad_norm": 0.3487156331539154, "learning_rate": 9.547122955808902e-05, "loss": 3.9723, "step": 2335 }, { "epoch": 0.7470866897199624, "grad_norm": 0.3953099548816681, "learning_rate": 9.524393444576585e-05, "loss": 3.9275, "step": 2336 }, { "epoch": 0.7474065042275481, "grad_norm": 0.46731966733932495, "learning_rate": 9.501685915895218e-05, "loss": 3.874, "step": 2337 }, { "epoch": 0.7477263187351336, "grad_norm": 0.39451658725738525, "learning_rate": 9.479000394143543e-05, "loss": 3.8745, "step": 2338 }, { "epoch": 0.7480461332427192, "grad_norm": 0.3589484691619873, "learning_rate": 9.456336903676666e-05, "loss": 3.9342, "step": 2339 }, { "epoch": 0.7483659477503048, "grad_norm": 0.3433530330657959, "learning_rate": 9.433695468826055e-05, "loss": 3.936, "step": 2340 }, { "epoch": 0.7486857622578904, "grad_norm": 0.38511332869529724, "learning_rate": 9.411076113899465e-05, "loss": 3.925, "step": 2341 }, { "epoch": 0.7490055767654761, "grad_norm": 0.3879907429218292, "learning_rate": 9.388478863180982e-05, "loss": 3.9169, "step": 2342 }, { "epoch": 0.7493253912730616, "grad_norm": 0.4785348176956177, "learning_rate": 9.365903740930947e-05, "loss": 3.9337, "step": 2343 }, { "epoch": 0.7496452057806472, "grad_norm": 0.3361395597457886, "learning_rate": 9.343350771385957e-05, "loss": 3.9195, "step": 2344 }, { "epoch": 0.7499650202882329, "grad_norm": 0.5303536057472229, "learning_rate": 9.320819978758787e-05, "loss": 3.9016, "step": 2345 }, { "epoch": 0.7502848347958184, "grad_norm": 0.6289753317832947, "learning_rate": 9.298311387238449e-05, "loss": 3.9492, "step": 2346 }, { "epoch": 0.750604649303404, "grad_norm": 0.378821462392807, "learning_rate": 9.275825020990092e-05, "loss": 3.9222, "step": 2347 }, { "epoch": 0.7509244638109897, "grad_norm": 0.4651988744735718, "learning_rate": 9.25336090415502e-05, "loss": 3.8093, "step": 2348 }, { "epoch": 0.7512442783185752, "grad_norm": 0.4249071776866913, "learning_rate": 9.230919060850645e-05, "loss": 3.9818, "step": 2349 }, { "epoch": 0.7515640928261609, "grad_norm": 0.3876030445098877, "learning_rate": 9.208499515170451e-05, "loss": 3.8498, "step": 2350 }, { "epoch": 0.7518839073337464, "grad_norm": 0.34562650322914124, "learning_rate": 9.186102291184003e-05, "loss": 3.9847, "step": 2351 }, { "epoch": 0.752203721841332, "grad_norm": 0.3939934968948364, "learning_rate": 9.163727412936895e-05, "loss": 3.8954, "step": 2352 }, { "epoch": 0.7525235363489177, "grad_norm": 0.41945981979370117, "learning_rate": 9.141374904450733e-05, "loss": 3.9567, "step": 2353 }, { "epoch": 0.7528433508565032, "grad_norm": 0.35306331515312195, "learning_rate": 9.119044789723108e-05, "loss": 3.8116, "step": 2354 }, { "epoch": 0.7531631653640888, "grad_norm": 0.37440225481987, "learning_rate": 9.09673709272755e-05, "loss": 3.9799, "step": 2355 }, { "epoch": 0.7534829798716745, "grad_norm": 0.35472750663757324, "learning_rate": 9.07445183741355e-05, "loss": 3.8025, "step": 2356 }, { "epoch": 0.75380279437926, "grad_norm": 0.4324037730693817, "learning_rate": 9.052189047706484e-05, "loss": 3.8328, "step": 2357 }, { "epoch": 0.7541226088868457, "grad_norm": 0.41722384095191956, "learning_rate": 9.029948747507627e-05, "loss": 3.8342, "step": 2358 }, { "epoch": 0.7544424233944312, "grad_norm": 0.34890711307525635, "learning_rate": 9.0077309606941e-05, "loss": 3.9523, "step": 2359 }, { "epoch": 0.7547622379020168, "grad_norm": 0.39528024196624756, "learning_rate": 8.985535711118844e-05, "loss": 3.9106, "step": 2360 }, { "epoch": 0.7550820524096025, "grad_norm": 0.3370823860168457, "learning_rate": 8.963363022610623e-05, "loss": 3.7718, "step": 2361 }, { "epoch": 0.755401866917188, "grad_norm": 0.4401865005493164, "learning_rate": 8.941212918973952e-05, "loss": 3.9373, "step": 2362 }, { "epoch": 0.7557216814247736, "grad_norm": 0.3218029737472534, "learning_rate": 8.919085423989135e-05, "loss": 3.8504, "step": 2363 }, { "epoch": 0.7560414959323593, "grad_norm": 0.42808905243873596, "learning_rate": 8.896980561412196e-05, "loss": 3.9032, "step": 2364 }, { "epoch": 0.7563613104399448, "grad_norm": 0.31665682792663574, "learning_rate": 8.874898354974821e-05, "loss": 3.8844, "step": 2365 }, { "epoch": 0.7566811249475305, "grad_norm": 0.43175074458122253, "learning_rate": 8.85283882838443e-05, "loss": 3.8736, "step": 2366 }, { "epoch": 0.757000939455116, "grad_norm": 0.4223661720752716, "learning_rate": 8.830802005324031e-05, "loss": 3.8354, "step": 2367 }, { "epoch": 0.7573207539627016, "grad_norm": 0.36371466517448425, "learning_rate": 8.808787909452334e-05, "loss": 3.8422, "step": 2368 }, { "epoch": 0.7576405684702873, "grad_norm": 0.3233981728553772, "learning_rate": 8.786796564403575e-05, "loss": 3.833, "step": 2369 }, { "epoch": 0.7579603829778728, "grad_norm": 0.3819316625595093, "learning_rate": 8.764827993787613e-05, "loss": 3.9023, "step": 2370 }, { "epoch": 0.7582801974854584, "grad_norm": 0.3288232684135437, "learning_rate": 8.742882221189844e-05, "loss": 3.9125, "step": 2371 }, { "epoch": 0.758600011993044, "grad_norm": 0.32690754532814026, "learning_rate": 8.720959270171162e-05, "loss": 3.86, "step": 2372 }, { "epoch": 0.7589198265006296, "grad_norm": 0.4671561121940613, "learning_rate": 8.699059164268015e-05, "loss": 3.9122, "step": 2373 }, { "epoch": 0.7592396410082153, "grad_norm": 0.43894633650779724, "learning_rate": 8.677181926992271e-05, "loss": 3.8501, "step": 2374 }, { "epoch": 0.7595594555158008, "grad_norm": 0.31778839230537415, "learning_rate": 8.655327581831279e-05, "loss": 3.8808, "step": 2375 }, { "epoch": 0.7598792700233864, "grad_norm": 0.35313156247138977, "learning_rate": 8.633496152247784e-05, "loss": 3.9376, "step": 2376 }, { "epoch": 0.7601990845309721, "grad_norm": 0.3833259046077728, "learning_rate": 8.611687661679945e-05, "loss": 3.9112, "step": 2377 }, { "epoch": 0.7605188990385576, "grad_norm": 0.4065259099006653, "learning_rate": 8.589902133541323e-05, "loss": 3.9001, "step": 2378 }, { "epoch": 0.7608387135461432, "grad_norm": 0.48490288853645325, "learning_rate": 8.568139591220764e-05, "loss": 3.7938, "step": 2379 }, { "epoch": 0.7611585280537289, "grad_norm": 0.32246172428131104, "learning_rate": 8.546400058082492e-05, "loss": 3.8631, "step": 2380 }, { "epoch": 0.7614783425613144, "grad_norm": 0.35473573207855225, "learning_rate": 8.524683557465987e-05, "loss": 3.9317, "step": 2381 }, { "epoch": 0.7617981570689001, "grad_norm": 0.3305418789386749, "learning_rate": 8.502990112686028e-05, "loss": 3.9866, "step": 2382 }, { "epoch": 0.7621179715764856, "grad_norm": 0.37989288568496704, "learning_rate": 8.481319747032635e-05, "loss": 3.9698, "step": 2383 }, { "epoch": 0.7624377860840712, "grad_norm": 0.34510353207588196, "learning_rate": 8.459672483771046e-05, "loss": 3.9204, "step": 2384 }, { "epoch": 0.7627576005916569, "grad_norm": 0.5202625393867493, "learning_rate": 8.438048346141713e-05, "loss": 3.8394, "step": 2385 }, { "epoch": 0.7630774150992424, "grad_norm": 0.3357793390750885, "learning_rate": 8.416447357360224e-05, "loss": 3.967, "step": 2386 }, { "epoch": 0.763397229606828, "grad_norm": 0.3380637466907501, "learning_rate": 8.394869540617347e-05, "loss": 3.9005, "step": 2387 }, { "epoch": 0.7637170441144137, "grad_norm": 0.3185747265815735, "learning_rate": 8.373314919078964e-05, "loss": 3.8386, "step": 2388 }, { "epoch": 0.7640368586219992, "grad_norm": 0.4482046365737915, "learning_rate": 8.35178351588605e-05, "loss": 3.8186, "step": 2389 }, { "epoch": 0.7643566731295849, "grad_norm": 0.3133537173271179, "learning_rate": 8.330275354154672e-05, "loss": 3.8734, "step": 2390 }, { "epoch": 0.7646764876371704, "grad_norm": 0.3837348520755768, "learning_rate": 8.308790456975905e-05, "loss": 3.8952, "step": 2391 }, { "epoch": 0.764996302144756, "grad_norm": 0.4393792450428009, "learning_rate": 8.28732884741588e-05, "loss": 3.9404, "step": 2392 }, { "epoch": 0.7653161166523417, "grad_norm": 0.3395466208457947, "learning_rate": 8.265890548515723e-05, "loss": 3.8355, "step": 2393 }, { "epoch": 0.7656359311599272, "grad_norm": 0.3615204095840454, "learning_rate": 8.244475583291522e-05, "loss": 3.8146, "step": 2394 }, { "epoch": 0.7659557456675128, "grad_norm": 0.3259068727493286, "learning_rate": 8.223083974734336e-05, "loss": 3.9298, "step": 2395 }, { "epoch": 0.7662755601750985, "grad_norm": 0.42036378383636475, "learning_rate": 8.201715745810112e-05, "loss": 3.9477, "step": 2396 }, { "epoch": 0.766595374682684, "grad_norm": 0.36269089579582214, "learning_rate": 8.180370919459728e-05, "loss": 3.8494, "step": 2397 }, { "epoch": 0.7669151891902697, "grad_norm": 0.4476913511753082, "learning_rate": 8.159049518598924e-05, "loss": 3.8371, "step": 2398 }, { "epoch": 0.7672350036978552, "grad_norm": 0.6153945922851562, "learning_rate": 8.137751566118306e-05, "loss": 3.8649, "step": 2399 }, { "epoch": 0.7675548182054408, "grad_norm": 0.37216612696647644, "learning_rate": 8.11647708488327e-05, "loss": 3.9751, "step": 2400 }, { "epoch": 0.7675548182054408, "eval_runtime": 42.8968, "eval_samples_per_second": 44.222, "eval_steps_per_second": 11.073, "step": 2400 }, { "epoch": 0.7678746327130265, "grad_norm": 0.3690406382083893, "learning_rate": 8.09522609773405e-05, "loss": 3.8533, "step": 2401 }, { "epoch": 0.768194447220612, "grad_norm": 0.3896867334842682, "learning_rate": 8.073998627485641e-05, "loss": 3.9098, "step": 2402 }, { "epoch": 0.7685142617281977, "grad_norm": 0.41072574257850647, "learning_rate": 8.052794696927796e-05, "loss": 3.8065, "step": 2403 }, { "epoch": 0.7688340762357833, "grad_norm": 0.4070708155632019, "learning_rate": 8.031614328824998e-05, "loss": 3.8663, "step": 2404 }, { "epoch": 0.7691538907433688, "grad_norm": 0.35022106766700745, "learning_rate": 8.010457545916408e-05, "loss": 3.9001, "step": 2405 }, { "epoch": 0.7694737052509545, "grad_norm": 0.349926233291626, "learning_rate": 7.989324370915899e-05, "loss": 3.9131, "step": 2406 }, { "epoch": 0.76979351975854, "grad_norm": 0.44366300106048584, "learning_rate": 7.968214826511987e-05, "loss": 3.9189, "step": 2407 }, { "epoch": 0.7701133342661256, "grad_norm": 0.38078024983406067, "learning_rate": 7.947128935367813e-05, "loss": 3.939, "step": 2408 }, { "epoch": 0.7704331487737113, "grad_norm": 0.41515371203422546, "learning_rate": 7.926066720121134e-05, "loss": 3.8553, "step": 2409 }, { "epoch": 0.7707529632812968, "grad_norm": 0.4976687431335449, "learning_rate": 7.905028203384269e-05, "loss": 3.8681, "step": 2410 }, { "epoch": 0.7710727777888825, "grad_norm": 0.4131871163845062, "learning_rate": 7.884013407744129e-05, "loss": 3.9089, "step": 2411 }, { "epoch": 0.7713925922964681, "grad_norm": 0.34952712059020996, "learning_rate": 7.863022355762101e-05, "loss": 3.9638, "step": 2412 }, { "epoch": 0.7717124068040536, "grad_norm": 0.3402501344680786, "learning_rate": 7.842055069974149e-05, "loss": 3.8968, "step": 2413 }, { "epoch": 0.7720322213116393, "grad_norm": 0.31344491243362427, "learning_rate": 7.82111157289069e-05, "loss": 3.9142, "step": 2414 }, { "epoch": 0.7723520358192248, "grad_norm": 0.405819296836853, "learning_rate": 7.800191886996578e-05, "loss": 3.7485, "step": 2415 }, { "epoch": 0.7726718503268104, "grad_norm": 0.3349445164203644, "learning_rate": 7.779296034751152e-05, "loss": 3.8098, "step": 2416 }, { "epoch": 0.7729916648343961, "grad_norm": 0.3116932213306427, "learning_rate": 7.75842403858811e-05, "loss": 3.9339, "step": 2417 }, { "epoch": 0.7733114793419816, "grad_norm": 0.32616016268730164, "learning_rate": 7.737575920915574e-05, "loss": 3.8854, "step": 2418 }, { "epoch": 0.7736312938495673, "grad_norm": 0.4334855377674103, "learning_rate": 7.716751704116042e-05, "loss": 3.8649, "step": 2419 }, { "epoch": 0.7739511083571529, "grad_norm": 0.3935568332672119, "learning_rate": 7.695951410546311e-05, "loss": 3.9974, "step": 2420 }, { "epoch": 0.7742709228647384, "grad_norm": 0.35519447922706604, "learning_rate": 7.67517506253753e-05, "loss": 3.9095, "step": 2421 }, { "epoch": 0.7745907373723241, "grad_norm": 0.4638981819152832, "learning_rate": 7.654422682395106e-05, "loss": 3.8587, "step": 2422 }, { "epoch": 0.7749105518799096, "grad_norm": 0.3826427757740021, "learning_rate": 7.633694292398745e-05, "loss": 3.8206, "step": 2423 }, { "epoch": 0.7752303663874952, "grad_norm": 0.38036826252937317, "learning_rate": 7.612989914802383e-05, "loss": 3.8814, "step": 2424 }, { "epoch": 0.7755501808950809, "grad_norm": 0.4507428705692291, "learning_rate": 7.592309571834179e-05, "loss": 3.8536, "step": 2425 }, { "epoch": 0.7758699954026664, "grad_norm": 0.4660930335521698, "learning_rate": 7.5716532856965e-05, "loss": 3.8875, "step": 2426 }, { "epoch": 0.7761898099102521, "grad_norm": 0.32899385690689087, "learning_rate": 7.551021078565857e-05, "loss": 3.8662, "step": 2427 }, { "epoch": 0.7765096244178377, "grad_norm": 0.3710196614265442, "learning_rate": 7.530412972592928e-05, "loss": 3.933, "step": 2428 }, { "epoch": 0.7768294389254232, "grad_norm": 0.34057101607322693, "learning_rate": 7.509828989902525e-05, "loss": 3.9073, "step": 2429 }, { "epoch": 0.7771492534330089, "grad_norm": 0.4555116295814514, "learning_rate": 7.489269152593543e-05, "loss": 3.8637, "step": 2430 }, { "epoch": 0.7774690679405944, "grad_norm": 0.457455575466156, "learning_rate": 7.468733482738976e-05, "loss": 3.9844, "step": 2431 }, { "epoch": 0.77778888244818, "grad_norm": 0.349774569272995, "learning_rate": 7.44822200238584e-05, "loss": 3.8457, "step": 2432 }, { "epoch": 0.7781086969557657, "grad_norm": 0.3690963387489319, "learning_rate": 7.42773473355521e-05, "loss": 3.9307, "step": 2433 }, { "epoch": 0.7784285114633512, "grad_norm": 0.48311102390289307, "learning_rate": 7.407271698242155e-05, "loss": 3.8764, "step": 2434 }, { "epoch": 0.7787483259709369, "grad_norm": 0.44709160923957825, "learning_rate": 7.386832918415741e-05, "loss": 3.8647, "step": 2435 }, { "epoch": 0.7790681404785225, "grad_norm": 0.6309023499488831, "learning_rate": 7.366418416018963e-05, "loss": 3.8398, "step": 2436 }, { "epoch": 0.779387954986108, "grad_norm": 0.6202185750007629, "learning_rate": 7.346028212968778e-05, "loss": 3.8686, "step": 2437 }, { "epoch": 0.7797077694936937, "grad_norm": 0.40240392088890076, "learning_rate": 7.325662331156049e-05, "loss": 3.8328, "step": 2438 }, { "epoch": 0.7800275840012793, "grad_norm": 0.4376048147678375, "learning_rate": 7.305320792445532e-05, "loss": 3.8069, "step": 2439 }, { "epoch": 0.7803473985088648, "grad_norm": 0.4918051064014435, "learning_rate": 7.285003618675842e-05, "loss": 3.8587, "step": 2440 }, { "epoch": 0.7806672130164505, "grad_norm": 0.4197878837585449, "learning_rate": 7.264710831659426e-05, "loss": 3.8206, "step": 2441 }, { "epoch": 0.780987027524036, "grad_norm": 0.4253217279911041, "learning_rate": 7.24444245318257e-05, "loss": 3.8304, "step": 2442 }, { "epoch": 0.7813068420316217, "grad_norm": 0.3384673297405243, "learning_rate": 7.224198505005344e-05, "loss": 3.9098, "step": 2443 }, { "epoch": 0.7816266565392073, "grad_norm": 0.37927570939064026, "learning_rate": 7.203979008861588e-05, "loss": 3.9125, "step": 2444 }, { "epoch": 0.7819464710467928, "grad_norm": 0.3123345673084259, "learning_rate": 7.183783986458906e-05, "loss": 3.8893, "step": 2445 }, { "epoch": 0.7822662855543785, "grad_norm": 0.3696351945400238, "learning_rate": 7.163613459478595e-05, "loss": 3.9389, "step": 2446 }, { "epoch": 0.782586100061964, "grad_norm": 0.4006158411502838, "learning_rate": 7.143467449575682e-05, "loss": 3.8294, "step": 2447 }, { "epoch": 0.7829059145695496, "grad_norm": 0.48442721366882324, "learning_rate": 7.12334597837887e-05, "loss": 3.923, "step": 2448 }, { "epoch": 0.7832257290771353, "grad_norm": 0.3486553728580475, "learning_rate": 7.103249067490502e-05, "loss": 3.9101, "step": 2449 }, { "epoch": 0.7835455435847208, "grad_norm": 0.3334793746471405, "learning_rate": 7.083176738486578e-05, "loss": 3.7951, "step": 2450 }, { "epoch": 0.7838653580923065, "grad_norm": 0.42709240317344666, "learning_rate": 7.063129012916671e-05, "loss": 3.8341, "step": 2451 }, { "epoch": 0.7841851725998921, "grad_norm": 0.5859894752502441, "learning_rate": 7.04310591230397e-05, "loss": 3.8438, "step": 2452 }, { "epoch": 0.7845049871074776, "grad_norm": 0.5821248292922974, "learning_rate": 7.023107458145214e-05, "loss": 3.9563, "step": 2453 }, { "epoch": 0.7848248016150633, "grad_norm": 0.359943151473999, "learning_rate": 7.003133671910688e-05, "loss": 3.913, "step": 2454 }, { "epoch": 0.7851446161226489, "grad_norm": 0.35467836260795593, "learning_rate": 6.983184575044199e-05, "loss": 3.8667, "step": 2455 }, { "epoch": 0.7854644306302344, "grad_norm": 0.32859066128730774, "learning_rate": 6.963260188963016e-05, "loss": 3.8721, "step": 2456 }, { "epoch": 0.7857842451378201, "grad_norm": 0.3248803913593292, "learning_rate": 6.943360535057926e-05, "loss": 3.9093, "step": 2457 }, { "epoch": 0.7861040596454056, "grad_norm": 0.3260408341884613, "learning_rate": 6.923485634693109e-05, "loss": 3.8813, "step": 2458 }, { "epoch": 0.7864238741529913, "grad_norm": 0.3499215543270111, "learning_rate": 6.903635509206234e-05, "loss": 3.9124, "step": 2459 }, { "epoch": 0.7867436886605769, "grad_norm": 0.35315051674842834, "learning_rate": 6.883810179908315e-05, "loss": 3.8492, "step": 2460 }, { "epoch": 0.7870635031681624, "grad_norm": 0.5216625928878784, "learning_rate": 6.86400966808377e-05, "loss": 3.8841, "step": 2461 }, { "epoch": 0.7873833176757481, "grad_norm": 0.39670202136039734, "learning_rate": 6.844233994990382e-05, "loss": 3.8772, "step": 2462 }, { "epoch": 0.7877031321833337, "grad_norm": 0.5957048535346985, "learning_rate": 6.824483181859231e-05, "loss": 3.8784, "step": 2463 }, { "epoch": 0.7880229466909192, "grad_norm": 0.48852774500846863, "learning_rate": 6.804757249894762e-05, "loss": 3.8589, "step": 2464 }, { "epoch": 0.7883427611985049, "grad_norm": 0.35292282700538635, "learning_rate": 6.785056220274658e-05, "loss": 3.8253, "step": 2465 }, { "epoch": 0.7886625757060904, "grad_norm": 0.31522834300994873, "learning_rate": 6.765380114149887e-05, "loss": 3.8948, "step": 2466 }, { "epoch": 0.7889823902136761, "grad_norm": 0.46899184584617615, "learning_rate": 6.745728952644675e-05, "loss": 3.8924, "step": 2467 }, { "epoch": 0.7893022047212617, "grad_norm": 0.5264912843704224, "learning_rate": 6.726102756856422e-05, "loss": 3.9043, "step": 2468 }, { "epoch": 0.7896220192288472, "grad_norm": 0.38859331607818604, "learning_rate": 6.706501547855787e-05, "loss": 3.9187, "step": 2469 }, { "epoch": 0.7899418337364329, "grad_norm": 0.45980527997016907, "learning_rate": 6.686925346686544e-05, "loss": 3.8275, "step": 2470 }, { "epoch": 0.7902616482440185, "grad_norm": 0.31803491711616516, "learning_rate": 6.667374174365667e-05, "loss": 3.9077, "step": 2471 }, { "epoch": 0.7905814627516041, "grad_norm": 0.3065436780452728, "learning_rate": 6.647848051883217e-05, "loss": 3.8783, "step": 2472 }, { "epoch": 0.7909012772591897, "grad_norm": 0.32231515645980835, "learning_rate": 6.628347000202381e-05, "loss": 3.8563, "step": 2473 }, { "epoch": 0.7912210917667752, "grad_norm": 0.36974066495895386, "learning_rate": 6.608871040259457e-05, "loss": 3.8537, "step": 2474 }, { "epoch": 0.7915409062743609, "grad_norm": 0.3763801157474518, "learning_rate": 6.589420192963754e-05, "loss": 3.8524, "step": 2475 }, { "epoch": 0.7918607207819465, "grad_norm": 0.32826054096221924, "learning_rate": 6.56999447919766e-05, "loss": 3.9308, "step": 2476 }, { "epoch": 0.792180535289532, "grad_norm": 0.32793721556663513, "learning_rate": 6.550593919816545e-05, "loss": 3.8584, "step": 2477 }, { "epoch": 0.7925003497971177, "grad_norm": 0.5547561049461365, "learning_rate": 6.531218535648807e-05, "loss": 3.8365, "step": 2478 }, { "epoch": 0.7928201643047033, "grad_norm": 0.3614239990711212, "learning_rate": 6.511868347495793e-05, "loss": 3.8677, "step": 2479 }, { "epoch": 0.7931399788122889, "grad_norm": 0.344993531703949, "learning_rate": 6.492543376131817e-05, "loss": 3.9299, "step": 2480 }, { "epoch": 0.7934597933198745, "grad_norm": 0.45573487877845764, "learning_rate": 6.473243642304114e-05, "loss": 3.9056, "step": 2481 }, { "epoch": 0.79377960782746, "grad_norm": 0.5808489322662354, "learning_rate": 6.453969166732808e-05, "loss": 3.883, "step": 2482 }, { "epoch": 0.7940994223350457, "grad_norm": 0.3459409177303314, "learning_rate": 6.434719970110923e-05, "loss": 3.7777, "step": 2483 }, { "epoch": 0.7944192368426313, "grad_norm": 0.31603914499282837, "learning_rate": 6.415496073104344e-05, "loss": 3.9505, "step": 2484 }, { "epoch": 0.7947390513502168, "grad_norm": 0.6193767786026001, "learning_rate": 6.396297496351791e-05, "loss": 3.9338, "step": 2485 }, { "epoch": 0.7950588658578025, "grad_norm": 0.30562475323677063, "learning_rate": 6.377124260464804e-05, "loss": 3.8896, "step": 2486 }, { "epoch": 0.7953786803653881, "grad_norm": 0.3781988322734833, "learning_rate": 6.357976386027697e-05, "loss": 3.925, "step": 2487 }, { "epoch": 0.7956984948729737, "grad_norm": 0.5499515533447266, "learning_rate": 6.338853893597584e-05, "loss": 3.8785, "step": 2488 }, { "epoch": 0.7960183093805593, "grad_norm": 0.3819548785686493, "learning_rate": 6.319756803704311e-05, "loss": 3.9063, "step": 2489 }, { "epoch": 0.7963381238881448, "grad_norm": 0.5077492594718933, "learning_rate": 6.300685136850458e-05, "loss": 3.8836, "step": 2490 }, { "epoch": 0.7966579383957305, "grad_norm": 0.361968994140625, "learning_rate": 6.281638913511324e-05, "loss": 3.8747, "step": 2491 }, { "epoch": 0.7969777529033161, "grad_norm": 0.3381747007369995, "learning_rate": 6.262618154134858e-05, "loss": 3.7955, "step": 2492 }, { "epoch": 0.7972975674109016, "grad_norm": 0.3091414272785187, "learning_rate": 6.2436228791417e-05, "loss": 3.8996, "step": 2493 }, { "epoch": 0.7976173819184873, "grad_norm": 0.45440101623535156, "learning_rate": 6.224653108925122e-05, "loss": 3.8613, "step": 2494 }, { "epoch": 0.7979371964260729, "grad_norm": 0.3720913231372833, "learning_rate": 6.205708863851019e-05, "loss": 3.8168, "step": 2495 }, { "epoch": 0.7982570109336585, "grad_norm": 0.3557816445827484, "learning_rate": 6.186790164257866e-05, "loss": 3.8916, "step": 2496 }, { "epoch": 0.7985768254412441, "grad_norm": 0.43525996804237366, "learning_rate": 6.167897030456725e-05, "loss": 3.9149, "step": 2497 }, { "epoch": 0.7988966399488296, "grad_norm": 0.4283919036388397, "learning_rate": 6.149029482731211e-05, "loss": 3.8477, "step": 2498 }, { "epoch": 0.7992164544564153, "grad_norm": 0.5837275981903076, "learning_rate": 6.13018754133747e-05, "loss": 3.9017, "step": 2499 }, { "epoch": 0.7995362689640009, "grad_norm": 0.3566829264163971, "learning_rate": 6.111371226504162e-05, "loss": 3.8702, "step": 2500 }, { "epoch": 0.7995362689640009, "eval_runtime": 42.1432, "eval_samples_per_second": 45.013, "eval_steps_per_second": 11.271, "step": 2500 }, { "epoch": 0.7998560834715864, "grad_norm": 0.312110960483551, "learning_rate": 6.092580558432409e-05, "loss": 3.9102, "step": 2501 }, { "epoch": 0.8001758979791721, "grad_norm": 0.3782331645488739, "learning_rate": 6.073815557295827e-05, "loss": 3.8686, "step": 2502 }, { "epoch": 0.8004957124867577, "grad_norm": 0.41211917996406555, "learning_rate": 6.055076243240467e-05, "loss": 3.8291, "step": 2503 }, { "epoch": 0.8008155269943433, "grad_norm": 0.3206890821456909, "learning_rate": 6.036362636384798e-05, "loss": 3.8646, "step": 2504 }, { "epoch": 0.8011353415019289, "grad_norm": 0.3144303560256958, "learning_rate": 6.017674756819705e-05, "loss": 3.8277, "step": 2505 }, { "epoch": 0.8014551560095144, "grad_norm": 0.3633415400981903, "learning_rate": 5.9990126246084204e-05, "loss": 3.8277, "step": 2506 }, { "epoch": 0.8017749705171001, "grad_norm": 0.35377538204193115, "learning_rate": 5.9803762597865745e-05, "loss": 3.7857, "step": 2507 }, { "epoch": 0.8020947850246857, "grad_norm": 0.332643061876297, "learning_rate": 5.96176568236209e-05, "loss": 3.8642, "step": 2508 }, { "epoch": 0.8024145995322712, "grad_norm": 0.31508487462997437, "learning_rate": 5.9431809123152465e-05, "loss": 3.8788, "step": 2509 }, { "epoch": 0.8027344140398569, "grad_norm": 0.4109523594379425, "learning_rate": 5.924621969598604e-05, "loss": 3.9638, "step": 2510 }, { "epoch": 0.8030542285474425, "grad_norm": 0.5165453553199768, "learning_rate": 5.906088874136968e-05, "loss": 3.9204, "step": 2511 }, { "epoch": 0.8033740430550281, "grad_norm": 0.4532223343849182, "learning_rate": 5.887581645827436e-05, "loss": 3.8923, "step": 2512 }, { "epoch": 0.8036938575626137, "grad_norm": 0.3330877125263214, "learning_rate": 5.869100304539297e-05, "loss": 3.8619, "step": 2513 }, { "epoch": 0.8040136720701992, "grad_norm": 0.39762070775032043, "learning_rate": 5.850644870114063e-05, "loss": 3.7813, "step": 2514 }, { "epoch": 0.8043334865777849, "grad_norm": 0.30016523599624634, "learning_rate": 5.832215362365458e-05, "loss": 3.8957, "step": 2515 }, { "epoch": 0.8046533010853705, "grad_norm": 0.34443408250808716, "learning_rate": 5.813811801079325e-05, "loss": 3.9398, "step": 2516 }, { "epoch": 0.804973115592956, "grad_norm": 0.40481120347976685, "learning_rate": 5.795434206013685e-05, "loss": 3.8685, "step": 2517 }, { "epoch": 0.8052929301005417, "grad_norm": 0.43617337942123413, "learning_rate": 5.77708259689866e-05, "loss": 3.9, "step": 2518 }, { "epoch": 0.8056127446081273, "grad_norm": 0.4165572226047516, "learning_rate": 5.7587569934364896e-05, "loss": 3.949, "step": 2519 }, { "epoch": 0.8059325591157129, "grad_norm": 0.6039724946022034, "learning_rate": 5.740457415301486e-05, "loss": 3.9069, "step": 2520 }, { "epoch": 0.8062523736232985, "grad_norm": 0.3261542320251465, "learning_rate": 5.72218388214002e-05, "loss": 3.8726, "step": 2521 }, { "epoch": 0.806572188130884, "grad_norm": 0.33693626523017883, "learning_rate": 5.703936413570519e-05, "loss": 3.8784, "step": 2522 }, { "epoch": 0.8068920026384697, "grad_norm": 0.38988664746284485, "learning_rate": 5.6857150291833884e-05, "loss": 3.9222, "step": 2523 }, { "epoch": 0.8072118171460553, "grad_norm": 0.4214157462120056, "learning_rate": 5.667519748541064e-05, "loss": 3.8417, "step": 2524 }, { "epoch": 0.8075316316536408, "grad_norm": 0.4479750692844391, "learning_rate": 5.649350591177946e-05, "loss": 3.7938, "step": 2525 }, { "epoch": 0.8078514461612265, "grad_norm": 0.3286908268928528, "learning_rate": 5.6312075766003876e-05, "loss": 3.9259, "step": 2526 }, { "epoch": 0.8081712606688121, "grad_norm": 0.44016754627227783, "learning_rate": 5.613090724286681e-05, "loss": 3.8099, "step": 2527 }, { "epoch": 0.8084910751763977, "grad_norm": 0.3701966404914856, "learning_rate": 5.595000053687014e-05, "loss": 3.8982, "step": 2528 }, { "epoch": 0.8088108896839833, "grad_norm": 0.4387320280075073, "learning_rate": 5.576935584223482e-05, "loss": 3.8553, "step": 2529 }, { "epoch": 0.8091307041915689, "grad_norm": 0.37879040837287903, "learning_rate": 5.55889733529005e-05, "loss": 3.8876, "step": 2530 }, { "epoch": 0.8094505186991545, "grad_norm": 0.34753432869911194, "learning_rate": 5.540885326252531e-05, "loss": 4.0035, "step": 2531 }, { "epoch": 0.8097703332067401, "grad_norm": 0.6233555674552917, "learning_rate": 5.5228995764485564e-05, "loss": 3.815, "step": 2532 }, { "epoch": 0.8100901477143256, "grad_norm": 0.41114088892936707, "learning_rate": 5.5049401051875765e-05, "loss": 3.9239, "step": 2533 }, { "epoch": 0.8104099622219113, "grad_norm": 0.4441838562488556, "learning_rate": 5.487006931750828e-05, "loss": 3.8124, "step": 2534 }, { "epoch": 0.8107297767294969, "grad_norm": 0.3657591938972473, "learning_rate": 5.469100075391314e-05, "loss": 3.9034, "step": 2535 }, { "epoch": 0.8110495912370825, "grad_norm": 0.7792800068855286, "learning_rate": 5.451219555333792e-05, "loss": 3.8457, "step": 2536 }, { "epoch": 0.8113694057446681, "grad_norm": 0.5676858425140381, "learning_rate": 5.4333653907747174e-05, "loss": 3.7783, "step": 2537 }, { "epoch": 0.8116892202522537, "grad_norm": 0.5630744099617004, "learning_rate": 5.4155376008822805e-05, "loss": 3.8631, "step": 2538 }, { "epoch": 0.8120090347598393, "grad_norm": 0.4195505380630493, "learning_rate": 5.397736204796337e-05, "loss": 3.8889, "step": 2539 }, { "epoch": 0.8123288492674249, "grad_norm": 0.35054275393486023, "learning_rate": 5.37996122162842e-05, "loss": 3.8777, "step": 2540 }, { "epoch": 0.8126486637750105, "grad_norm": 0.40650415420532227, "learning_rate": 5.362212670461706e-05, "loss": 3.8892, "step": 2541 }, { "epoch": 0.8129684782825961, "grad_norm": 0.4843025803565979, "learning_rate": 5.3444905703509687e-05, "loss": 3.8217, "step": 2542 }, { "epoch": 0.8132882927901817, "grad_norm": 0.480148047208786, "learning_rate": 5.3267949403226104e-05, "loss": 3.8874, "step": 2543 }, { "epoch": 0.8136081072977673, "grad_norm": 0.42070749402046204, "learning_rate": 5.3091257993746115e-05, "loss": 3.8749, "step": 2544 }, { "epoch": 0.8139279218053529, "grad_norm": 0.34799161553382874, "learning_rate": 5.2914831664765045e-05, "loss": 3.8415, "step": 2545 }, { "epoch": 0.8142477363129385, "grad_norm": 0.632991373538971, "learning_rate": 5.2738670605693814e-05, "loss": 3.9054, "step": 2546 }, { "epoch": 0.8145675508205241, "grad_norm": 0.34581393003463745, "learning_rate": 5.256277500565823e-05, "loss": 3.8705, "step": 2547 }, { "epoch": 0.8148873653281097, "grad_norm": 0.460650771856308, "learning_rate": 5.238714505349938e-05, "loss": 3.8688, "step": 2548 }, { "epoch": 0.8152071798356954, "grad_norm": 0.540112316608429, "learning_rate": 5.221178093777303e-05, "loss": 3.919, "step": 2549 }, { "epoch": 0.8155269943432809, "grad_norm": 0.3159160017967224, "learning_rate": 5.2036682846749645e-05, "loss": 3.861, "step": 2550 }, { "epoch": 0.8158468088508665, "grad_norm": 0.3432331383228302, "learning_rate": 5.186185096841402e-05, "loss": 3.9117, "step": 2551 }, { "epoch": 0.8161666233584521, "grad_norm": 0.32781413197517395, "learning_rate": 5.168728549046508e-05, "loss": 3.8743, "step": 2552 }, { "epoch": 0.8164864378660377, "grad_norm": 0.41665518283843994, "learning_rate": 5.151298660031587e-05, "loss": 3.885, "step": 2553 }, { "epoch": 0.8168062523736233, "grad_norm": 0.4601728618144989, "learning_rate": 5.133895448509299e-05, "loss": 3.8488, "step": 2554 }, { "epoch": 0.8171260668812089, "grad_norm": 0.37514132261276245, "learning_rate": 5.116518933163709e-05, "loss": 3.8293, "step": 2555 }, { "epoch": 0.8174458813887945, "grad_norm": 0.6124889850616455, "learning_rate": 5.099169132650173e-05, "loss": 3.8677, "step": 2556 }, { "epoch": 0.8177656958963802, "grad_norm": 0.4033961594104767, "learning_rate": 5.0818460655953894e-05, "loss": 3.8061, "step": 2557 }, { "epoch": 0.8180855104039657, "grad_norm": 0.3679426312446594, "learning_rate": 5.0645497505973633e-05, "loss": 3.8286, "step": 2558 }, { "epoch": 0.8184053249115513, "grad_norm": 0.32691076397895813, "learning_rate": 5.0472802062253426e-05, "loss": 3.841, "step": 2559 }, { "epoch": 0.8187251394191369, "grad_norm": 0.3308683931827545, "learning_rate": 5.0300374510198904e-05, "loss": 3.8306, "step": 2560 }, { "epoch": 0.8190449539267225, "grad_norm": 0.3351989984512329, "learning_rate": 5.012821503492755e-05, "loss": 3.8423, "step": 2561 }, { "epoch": 0.819364768434308, "grad_norm": 0.3582146167755127, "learning_rate": 4.9956323821269326e-05, "loss": 3.861, "step": 2562 }, { "epoch": 0.8196845829418937, "grad_norm": 0.33505481481552124, "learning_rate": 4.978470105376627e-05, "loss": 3.8932, "step": 2563 }, { "epoch": 0.8200043974494793, "grad_norm": 0.5673553943634033, "learning_rate": 4.961334691667177e-05, "loss": 3.835, "step": 2564 }, { "epoch": 0.820324211957065, "grad_norm": 0.4847067594528198, "learning_rate": 4.9442261593951496e-05, "loss": 3.8877, "step": 2565 }, { "epoch": 0.8206440264646505, "grad_norm": 0.43251875042915344, "learning_rate": 4.9271445269281884e-05, "loss": 3.9338, "step": 2566 }, { "epoch": 0.8209638409722361, "grad_norm": 0.32604730129241943, "learning_rate": 4.910089812605098e-05, "loss": 3.8378, "step": 2567 }, { "epoch": 0.8212836554798217, "grad_norm": 0.30937349796295166, "learning_rate": 4.893062034735758e-05, "loss": 3.8141, "step": 2568 }, { "epoch": 0.8216034699874073, "grad_norm": 0.5576289296150208, "learning_rate": 4.8760612116011464e-05, "loss": 3.863, "step": 2569 }, { "epoch": 0.8219232844949929, "grad_norm": 0.3234955966472626, "learning_rate": 4.8590873614532956e-05, "loss": 3.8531, "step": 2570 }, { "epoch": 0.8222430990025785, "grad_norm": 0.43384894728660583, "learning_rate": 4.842140502515282e-05, "loss": 3.8406, "step": 2571 }, { "epoch": 0.8225629135101641, "grad_norm": 0.31922826170921326, "learning_rate": 4.825220652981211e-05, "loss": 3.8745, "step": 2572 }, { "epoch": 0.8228827280177498, "grad_norm": 0.45489072799682617, "learning_rate": 4.80832783101617e-05, "loss": 3.8488, "step": 2573 }, { "epoch": 0.8232025425253353, "grad_norm": 0.4145664870738983, "learning_rate": 4.7914620547562475e-05, "loss": 3.8304, "step": 2574 }, { "epoch": 0.8235223570329209, "grad_norm": 0.3373119831085205, "learning_rate": 4.7746233423084965e-05, "loss": 3.7842, "step": 2575 }, { "epoch": 0.8238421715405065, "grad_norm": 0.31247204542160034, "learning_rate": 4.757811711750903e-05, "loss": 3.8768, "step": 2576 }, { "epoch": 0.8241619860480921, "grad_norm": 0.3563995659351349, "learning_rate": 4.741027181132392e-05, "loss": 3.9256, "step": 2577 }, { "epoch": 0.8244818005556777, "grad_norm": 0.35994648933410645, "learning_rate": 4.724269768472776e-05, "loss": 3.8505, "step": 2578 }, { "epoch": 0.8248016150632633, "grad_norm": 0.45338165760040283, "learning_rate": 4.707539491762767e-05, "loss": 3.9676, "step": 2579 }, { "epoch": 0.8251214295708489, "grad_norm": 0.44423708319664, "learning_rate": 4.690836368963945e-05, "loss": 3.9729, "step": 2580 }, { "epoch": 0.8254412440784346, "grad_norm": 0.39437204599380493, "learning_rate": 4.674160418008728e-05, "loss": 3.8225, "step": 2581 }, { "epoch": 0.8257610585860201, "grad_norm": 0.34230560064315796, "learning_rate": 4.657511656800381e-05, "loss": 3.8986, "step": 2582 }, { "epoch": 0.8260808730936057, "grad_norm": 0.5127937197685242, "learning_rate": 4.6408901032129476e-05, "loss": 3.8866, "step": 2583 }, { "epoch": 0.8264006876011913, "grad_norm": 0.33529824018478394, "learning_rate": 4.624295775091282e-05, "loss": 3.7341, "step": 2584 }, { "epoch": 0.8267205021087769, "grad_norm": 0.37846505641937256, "learning_rate": 4.6077286902510144e-05, "loss": 3.8486, "step": 2585 }, { "epoch": 0.8270403166163625, "grad_norm": 0.32476457953453064, "learning_rate": 4.591188866478513e-05, "loss": 3.8257, "step": 2586 }, { "epoch": 0.8273601311239481, "grad_norm": 0.3404695987701416, "learning_rate": 4.574676321530891e-05, "loss": 3.8415, "step": 2587 }, { "epoch": 0.8276799456315337, "grad_norm": 0.41689202189445496, "learning_rate": 4.558191073135957e-05, "loss": 3.9473, "step": 2588 }, { "epoch": 0.8279997601391194, "grad_norm": 0.3202640414237976, "learning_rate": 4.541733138992231e-05, "loss": 3.8853, "step": 2589 }, { "epoch": 0.8283195746467049, "grad_norm": 0.3372405469417572, "learning_rate": 4.525302536768901e-05, "loss": 3.9324, "step": 2590 }, { "epoch": 0.8286393891542905, "grad_norm": 0.5137133598327637, "learning_rate": 4.5088992841058214e-05, "loss": 3.8809, "step": 2591 }, { "epoch": 0.8289592036618761, "grad_norm": 0.3560962975025177, "learning_rate": 4.4925233986134614e-05, "loss": 3.9017, "step": 2592 }, { "epoch": 0.8292790181694617, "grad_norm": 0.3489327132701874, "learning_rate": 4.4761748978729305e-05, "loss": 3.7883, "step": 2593 }, { "epoch": 0.8295988326770473, "grad_norm": 0.3608783781528473, "learning_rate": 4.4598537994359297e-05, "loss": 3.7683, "step": 2594 }, { "epoch": 0.8299186471846329, "grad_norm": 0.4558559060096741, "learning_rate": 4.443560120824748e-05, "loss": 3.774, "step": 2595 }, { "epoch": 0.8302384616922185, "grad_norm": 0.6595021486282349, "learning_rate": 4.427293879532231e-05, "loss": 3.8313, "step": 2596 }, { "epoch": 0.8305582761998042, "grad_norm": 0.37782156467437744, "learning_rate": 4.411055093021758e-05, "loss": 3.7941, "step": 2597 }, { "epoch": 0.8308780907073897, "grad_norm": 0.312266081571579, "learning_rate": 4.394843778727247e-05, "loss": 3.8508, "step": 2598 }, { "epoch": 0.8311979052149753, "grad_norm": 0.5805156230926514, "learning_rate": 4.3786599540531164e-05, "loss": 3.8628, "step": 2599 }, { "epoch": 0.8315177197225609, "grad_norm": 0.3274054527282715, "learning_rate": 4.362503636374277e-05, "loss": 3.9278, "step": 2600 }, { "epoch": 0.8315177197225609, "eval_runtime": 42.3295, "eval_samples_per_second": 44.815, "eval_steps_per_second": 11.221, "step": 2600 }, { "epoch": 0.8318375342301465, "grad_norm": 0.38668861985206604, "learning_rate": 4.346374843036104e-05, "loss": 3.7948, "step": 2601 }, { "epoch": 0.8321573487377321, "grad_norm": 0.32462695240974426, "learning_rate": 4.3302735913544174e-05, "loss": 3.8289, "step": 2602 }, { "epoch": 0.8324771632453177, "grad_norm": 0.3686949908733368, "learning_rate": 4.314199898615481e-05, "loss": 3.843, "step": 2603 }, { "epoch": 0.8327969777529033, "grad_norm": 0.35174691677093506, "learning_rate": 4.298153782075946e-05, "loss": 3.8362, "step": 2604 }, { "epoch": 0.833116792260489, "grad_norm": 0.3854190707206726, "learning_rate": 4.2821352589628944e-05, "loss": 3.8622, "step": 2605 }, { "epoch": 0.8334366067680745, "grad_norm": 0.4495396614074707, "learning_rate": 4.26614434647377e-05, "loss": 3.9206, "step": 2606 }, { "epoch": 0.8337564212756601, "grad_norm": 0.34063953161239624, "learning_rate": 4.25018106177635e-05, "loss": 3.8748, "step": 2607 }, { "epoch": 0.8340762357832457, "grad_norm": 0.5710259079933167, "learning_rate": 4.2342454220087855e-05, "loss": 3.7799, "step": 2608 }, { "epoch": 0.8343960502908313, "grad_norm": 0.3465093672275543, "learning_rate": 4.21833744427952e-05, "loss": 3.8394, "step": 2609 }, { "epoch": 0.834715864798417, "grad_norm": 0.329547643661499, "learning_rate": 4.202457145667311e-05, "loss": 3.8885, "step": 2610 }, { "epoch": 0.8350356793060025, "grad_norm": 0.3053514361381531, "learning_rate": 4.1866045432212214e-05, "loss": 3.9205, "step": 2611 }, { "epoch": 0.8353554938135881, "grad_norm": 0.6778898239135742, "learning_rate": 4.1707796539605385e-05, "loss": 3.8961, "step": 2612 }, { "epoch": 0.8356753083211738, "grad_norm": 0.3170827627182007, "learning_rate": 4.154982494874829e-05, "loss": 3.7874, "step": 2613 }, { "epoch": 0.8359951228287593, "grad_norm": 0.4287985563278198, "learning_rate": 4.139213082923862e-05, "loss": 3.8709, "step": 2614 }, { "epoch": 0.8363149373363449, "grad_norm": 0.4338605999946594, "learning_rate": 4.12347143503764e-05, "loss": 3.8713, "step": 2615 }, { "epoch": 0.8366347518439305, "grad_norm": 0.4319593906402588, "learning_rate": 4.107757568116352e-05, "loss": 3.872, "step": 2616 }, { "epoch": 0.8369545663515161, "grad_norm": 0.8693323731422424, "learning_rate": 4.092071499030355e-05, "loss": 3.8593, "step": 2617 }, { "epoch": 0.8372743808591018, "grad_norm": 0.4352510869503021, "learning_rate": 4.076413244620177e-05, "loss": 3.8332, "step": 2618 }, { "epoch": 0.8375941953666873, "grad_norm": 0.3398313820362091, "learning_rate": 4.060782821696458e-05, "loss": 3.8017, "step": 2619 }, { "epoch": 0.8379140098742729, "grad_norm": 0.31383174657821655, "learning_rate": 4.0451802470399805e-05, "loss": 3.8659, "step": 2620 }, { "epoch": 0.8382338243818586, "grad_norm": 0.4128970205783844, "learning_rate": 4.029605537401623e-05, "loss": 3.8422, "step": 2621 }, { "epoch": 0.8385536388894441, "grad_norm": 0.35981470346450806, "learning_rate": 4.01405870950235e-05, "loss": 3.8998, "step": 2622 }, { "epoch": 0.8388734533970297, "grad_norm": 0.3180490732192993, "learning_rate": 3.9985397800331965e-05, "loss": 3.8183, "step": 2623 }, { "epoch": 0.8391932679046153, "grad_norm": 0.773688793182373, "learning_rate": 3.983048765655225e-05, "loss": 3.8327, "step": 2624 }, { "epoch": 0.8395130824122009, "grad_norm": 0.36455658078193665, "learning_rate": 3.9675856829995513e-05, "loss": 3.8228, "step": 2625 }, { "epoch": 0.8398328969197866, "grad_norm": 0.41970229148864746, "learning_rate": 3.95215054866729e-05, "loss": 3.8726, "step": 2626 }, { "epoch": 0.8401527114273721, "grad_norm": 0.33635053038597107, "learning_rate": 3.936743379229572e-05, "loss": 3.8139, "step": 2627 }, { "epoch": 0.8404725259349577, "grad_norm": 0.3619576394557953, "learning_rate": 3.921364191227466e-05, "loss": 3.8695, "step": 2628 }, { "epoch": 0.8407923404425434, "grad_norm": 0.3273950219154358, "learning_rate": 3.9060130011720345e-05, "loss": 3.9328, "step": 2629 }, { "epoch": 0.8411121549501289, "grad_norm": 0.5261261463165283, "learning_rate": 3.890689825544271e-05, "loss": 3.9041, "step": 2630 }, { "epoch": 0.8414319694577145, "grad_norm": 0.48524612188339233, "learning_rate": 3.875394680795092e-05, "loss": 3.8303, "step": 2631 }, { "epoch": 0.8417517839653001, "grad_norm": 0.3665068447589874, "learning_rate": 3.8601275833453224e-05, "loss": 3.8435, "step": 2632 }, { "epoch": 0.8420715984728857, "grad_norm": 0.4098714292049408, "learning_rate": 3.844888549585662e-05, "loss": 3.8826, "step": 2633 }, { "epoch": 0.8423914129804714, "grad_norm": 0.3876728415489197, "learning_rate": 3.829677595876699e-05, "loss": 3.8039, "step": 2634 }, { "epoch": 0.8427112274880569, "grad_norm": 0.38134297728538513, "learning_rate": 3.814494738548871e-05, "loss": 3.8409, "step": 2635 }, { "epoch": 0.8430310419956425, "grad_norm": 0.475462406873703, "learning_rate": 3.799339993902446e-05, "loss": 3.7894, "step": 2636 }, { "epoch": 0.8433508565032282, "grad_norm": 0.46903789043426514, "learning_rate": 3.784213378207522e-05, "loss": 3.8972, "step": 2637 }, { "epoch": 0.8436706710108137, "grad_norm": 0.43271490931510925, "learning_rate": 3.769114907703973e-05, "loss": 3.8883, "step": 2638 }, { "epoch": 0.8439904855183993, "grad_norm": 0.35547372698783875, "learning_rate": 3.7540445986014845e-05, "loss": 3.6872, "step": 2639 }, { "epoch": 0.844310300025985, "grad_norm": 0.3307410478591919, "learning_rate": 3.739002467079488e-05, "loss": 3.8959, "step": 2640 }, { "epoch": 0.8446301145335705, "grad_norm": 0.3443170189857483, "learning_rate": 3.723988529287176e-05, "loss": 3.8707, "step": 2641 }, { "epoch": 0.8449499290411562, "grad_norm": 0.3743644654750824, "learning_rate": 3.709002801343478e-05, "loss": 3.858, "step": 2642 }, { "epoch": 0.8452697435487417, "grad_norm": 0.3544786870479584, "learning_rate": 3.6940452993370105e-05, "loss": 3.8926, "step": 2643 }, { "epoch": 0.8455895580563273, "grad_norm": 0.3395989239215851, "learning_rate": 3.679116039326115e-05, "loss": 3.8185, "step": 2644 }, { "epoch": 0.845909372563913, "grad_norm": 0.4053204357624054, "learning_rate": 3.664215037338785e-05, "loss": 3.861, "step": 2645 }, { "epoch": 0.8462291870714985, "grad_norm": 0.42001065611839294, "learning_rate": 3.6493423093727084e-05, "loss": 3.8889, "step": 2646 }, { "epoch": 0.8465490015790841, "grad_norm": 0.3583778142929077, "learning_rate": 3.634497871395207e-05, "loss": 3.9315, "step": 2647 }, { "epoch": 0.8468688160866698, "grad_norm": 0.3603445887565613, "learning_rate": 3.6196817393432085e-05, "loss": 3.8625, "step": 2648 }, { "epoch": 0.8471886305942553, "grad_norm": 0.3926777243614197, "learning_rate": 3.604893929123284e-05, "loss": 3.884, "step": 2649 }, { "epoch": 0.847508445101841, "grad_norm": 0.5067328214645386, "learning_rate": 3.590134456611562e-05, "loss": 3.872, "step": 2650 }, { "epoch": 0.8478282596094265, "grad_norm": 0.32091429829597473, "learning_rate": 3.5754033376537947e-05, "loss": 3.9113, "step": 2651 }, { "epoch": 0.8481480741170121, "grad_norm": 0.3884349763393402, "learning_rate": 3.560700588065252e-05, "loss": 3.8805, "step": 2652 }, { "epoch": 0.8484678886245978, "grad_norm": 0.6649706363677979, "learning_rate": 3.5460262236307657e-05, "loss": 3.8442, "step": 2653 }, { "epoch": 0.8487877031321833, "grad_norm": 0.34632837772369385, "learning_rate": 3.531380260104698e-05, "loss": 3.864, "step": 2654 }, { "epoch": 0.8491075176397689, "grad_norm": 0.3906109929084778, "learning_rate": 3.516762713210891e-05, "loss": 3.8961, "step": 2655 }, { "epoch": 0.8494273321473546, "grad_norm": 0.31094592809677124, "learning_rate": 3.502173598642728e-05, "loss": 3.892, "step": 2656 }, { "epoch": 0.8497471466549401, "grad_norm": 0.3360503017902374, "learning_rate": 3.4876129320630196e-05, "loss": 3.7529, "step": 2657 }, { "epoch": 0.8500669611625258, "grad_norm": 0.4324783980846405, "learning_rate": 3.473080729104062e-05, "loss": 3.9322, "step": 2658 }, { "epoch": 0.8503867756701113, "grad_norm": 0.4330647885799408, "learning_rate": 3.4585770053675876e-05, "loss": 3.8923, "step": 2659 }, { "epoch": 0.8507065901776969, "grad_norm": 0.3172549903392792, "learning_rate": 3.444101776424738e-05, "loss": 3.8603, "step": 2660 }, { "epoch": 0.8510264046852826, "grad_norm": 0.31120914220809937, "learning_rate": 3.429655057816099e-05, "loss": 3.8364, "step": 2661 }, { "epoch": 0.8513462191928681, "grad_norm": 0.4264627993106842, "learning_rate": 3.415236865051606e-05, "loss": 3.813, "step": 2662 }, { "epoch": 0.8516660337004537, "grad_norm": 0.36595046520233154, "learning_rate": 3.4008472136106046e-05, "loss": 3.86, "step": 2663 }, { "epoch": 0.8519858482080394, "grad_norm": 0.390438050031662, "learning_rate": 3.3864861189417636e-05, "loss": 3.9378, "step": 2664 }, { "epoch": 0.8523056627156249, "grad_norm": 0.4973544478416443, "learning_rate": 3.3721535964631195e-05, "loss": 3.8318, "step": 2665 }, { "epoch": 0.8526254772232106, "grad_norm": 0.31964460015296936, "learning_rate": 3.3578496615620307e-05, "loss": 3.8727, "step": 2666 }, { "epoch": 0.8529452917307961, "grad_norm": 0.36366310715675354, "learning_rate": 3.343574329595157e-05, "loss": 3.9245, "step": 2667 }, { "epoch": 0.8532651062383817, "grad_norm": 0.31549423933029175, "learning_rate": 3.329327615888461e-05, "loss": 3.8596, "step": 2668 }, { "epoch": 0.8535849207459674, "grad_norm": 0.3658754825592041, "learning_rate": 3.315109535737155e-05, "loss": 3.8518, "step": 2669 }, { "epoch": 0.8539047352535529, "grad_norm": 0.3329384922981262, "learning_rate": 3.300920104405739e-05, "loss": 3.8201, "step": 2670 }, { "epoch": 0.8542245497611385, "grad_norm": 0.33013859391212463, "learning_rate": 3.2867593371279434e-05, "loss": 3.8899, "step": 2671 }, { "epoch": 0.8545443642687242, "grad_norm": 0.43552127480506897, "learning_rate": 3.272627249106724e-05, "loss": 3.7791, "step": 2672 }, { "epoch": 0.8548641787763097, "grad_norm": 0.3143914043903351, "learning_rate": 3.258523855514258e-05, "loss": 3.7631, "step": 2673 }, { "epoch": 0.8551839932838954, "grad_norm": 0.3055509030818939, "learning_rate": 3.244449171491896e-05, "loss": 3.8645, "step": 2674 }, { "epoch": 0.8555038077914809, "grad_norm": 0.328490287065506, "learning_rate": 3.230403212150179e-05, "loss": 3.9306, "step": 2675 }, { "epoch": 0.8558236222990665, "grad_norm": 0.32438135147094727, "learning_rate": 3.216385992568813e-05, "loss": 3.8919, "step": 2676 }, { "epoch": 0.8561434368066522, "grad_norm": 0.4711686372756958, "learning_rate": 3.202397527796637e-05, "loss": 3.7867, "step": 2677 }, { "epoch": 0.8564632513142377, "grad_norm": 0.3341000974178314, "learning_rate": 3.188437832851639e-05, "loss": 3.8648, "step": 2678 }, { "epoch": 0.8567830658218234, "grad_norm": 0.4218962490558624, "learning_rate": 3.1745069227208894e-05, "loss": 3.8354, "step": 2679 }, { "epoch": 0.857102880329409, "grad_norm": 0.3630712926387787, "learning_rate": 3.160604812360579e-05, "loss": 3.8605, "step": 2680 }, { "epoch": 0.8574226948369945, "grad_norm": 0.3077477216720581, "learning_rate": 3.146731516695974e-05, "loss": 3.7771, "step": 2681 }, { "epoch": 0.8577425093445802, "grad_norm": 0.34068411588668823, "learning_rate": 3.1328870506214044e-05, "loss": 3.8322, "step": 2682 }, { "epoch": 0.8580623238521657, "grad_norm": 0.37170499563217163, "learning_rate": 3.119071429000254e-05, "loss": 3.932, "step": 2683 }, { "epoch": 0.8583821383597513, "grad_norm": 0.3384850323200226, "learning_rate": 3.105284666664918e-05, "loss": 3.811, "step": 2684 }, { "epoch": 0.858701952867337, "grad_norm": 0.42808955907821655, "learning_rate": 3.091526778416833e-05, "loss": 3.8283, "step": 2685 }, { "epoch": 0.8590217673749225, "grad_norm": 0.3682732582092285, "learning_rate": 3.077797779026428e-05, "loss": 3.8919, "step": 2686 }, { "epoch": 0.8593415818825082, "grad_norm": 0.3175056278705597, "learning_rate": 3.064097683233121e-05, "loss": 3.8593, "step": 2687 }, { "epoch": 0.8596613963900938, "grad_norm": 0.3698948621749878, "learning_rate": 3.0504265057452815e-05, "loss": 3.777, "step": 2688 }, { "epoch": 0.8599812108976793, "grad_norm": 0.3754269480705261, "learning_rate": 3.036784261240255e-05, "loss": 3.697, "step": 2689 }, { "epoch": 0.860301025405265, "grad_norm": 0.34182238578796387, "learning_rate": 3.0231709643643086e-05, "loss": 3.876, "step": 2690 }, { "epoch": 0.8606208399128505, "grad_norm": 0.30060431361198425, "learning_rate": 3.0095866297326455e-05, "loss": 3.865, "step": 2691 }, { "epoch": 0.8609406544204361, "grad_norm": 0.39089590311050415, "learning_rate": 2.996031271929369e-05, "loss": 3.8657, "step": 2692 }, { "epoch": 0.8612604689280218, "grad_norm": 0.3527035117149353, "learning_rate": 2.982504905507461e-05, "loss": 3.812, "step": 2693 }, { "epoch": 0.8615802834356073, "grad_norm": 0.3286183476448059, "learning_rate": 2.969007544988793e-05, "loss": 3.7504, "step": 2694 }, { "epoch": 0.861900097943193, "grad_norm": 0.3596310615539551, "learning_rate": 2.9555392048640924e-05, "loss": 3.8344, "step": 2695 }, { "epoch": 0.8622199124507786, "grad_norm": 0.33346909284591675, "learning_rate": 2.9420998995929267e-05, "loss": 3.8686, "step": 2696 }, { "epoch": 0.8625397269583641, "grad_norm": 0.3616090714931488, "learning_rate": 2.9286896436037076e-05, "loss": 3.8445, "step": 2697 }, { "epoch": 0.8628595414659498, "grad_norm": 0.5143676400184631, "learning_rate": 2.9153084512936285e-05, "loss": 3.8708, "step": 2698 }, { "epoch": 0.8631793559735353, "grad_norm": 0.3566453754901886, "learning_rate": 2.9019563370287112e-05, "loss": 3.8471, "step": 2699 }, { "epoch": 0.8634991704811209, "grad_norm": 0.34008604288101196, "learning_rate": 2.8886333151437292e-05, "loss": 3.8023, "step": 2700 }, { "epoch": 0.8634991704811209, "eval_runtime": 49.6627, "eval_samples_per_second": 38.198, "eval_steps_per_second": 9.565, "step": 2700 }, { "epoch": 0.8638189849887066, "grad_norm": 0.4346884489059448, "learning_rate": 2.875339399942257e-05, "loss": 3.8634, "step": 2701 }, { "epoch": 0.8641387994962921, "grad_norm": 0.35016992688179016, "learning_rate": 2.862074605696605e-05, "loss": 3.8224, "step": 2702 }, { "epoch": 0.8644586140038778, "grad_norm": 0.3892556130886078, "learning_rate": 2.848838946647801e-05, "loss": 3.8829, "step": 2703 }, { "epoch": 0.8647784285114634, "grad_norm": 0.32547521591186523, "learning_rate": 2.835632437005626e-05, "loss": 3.7488, "step": 2704 }, { "epoch": 0.8650982430190489, "grad_norm": 0.36034688353538513, "learning_rate": 2.8224550909485344e-05, "loss": 3.9326, "step": 2705 }, { "epoch": 0.8654180575266346, "grad_norm": 0.3451097011566162, "learning_rate": 2.8093069226236865e-05, "loss": 3.8656, "step": 2706 }, { "epoch": 0.8657378720342201, "grad_norm": 0.36633050441741943, "learning_rate": 2.796187946146937e-05, "loss": 3.8692, "step": 2707 }, { "epoch": 0.8660576865418057, "grad_norm": 0.3605062961578369, "learning_rate": 2.7830981756027636e-05, "loss": 3.7493, "step": 2708 }, { "epoch": 0.8663775010493914, "grad_norm": 0.36099496483802795, "learning_rate": 2.7700376250443147e-05, "loss": 3.8151, "step": 2709 }, { "epoch": 0.8666973155569769, "grad_norm": 0.32434943318367004, "learning_rate": 2.757006308493347e-05, "loss": 3.8458, "step": 2710 }, { "epoch": 0.8670171300645626, "grad_norm": 0.4042629599571228, "learning_rate": 2.7440042399402496e-05, "loss": 3.8472, "step": 2711 }, { "epoch": 0.8673369445721482, "grad_norm": 0.3322801887989044, "learning_rate": 2.7310314333440097e-05, "loss": 3.7731, "step": 2712 }, { "epoch": 0.8676567590797337, "grad_norm": 0.47325199842453003, "learning_rate": 2.7180879026321866e-05, "loss": 3.8468, "step": 2713 }, { "epoch": 0.8679765735873194, "grad_norm": 0.40187668800354004, "learning_rate": 2.7051736617009277e-05, "loss": 3.8171, "step": 2714 }, { "epoch": 0.868296388094905, "grad_norm": 0.40191495418548584, "learning_rate": 2.6922887244149126e-05, "loss": 3.8843, "step": 2715 }, { "epoch": 0.8686162026024905, "grad_norm": 0.3628566563129425, "learning_rate": 2.6794331046073724e-05, "loss": 3.7605, "step": 2716 }, { "epoch": 0.8689360171100762, "grad_norm": 0.3545006811618805, "learning_rate": 2.6666068160800702e-05, "loss": 3.8956, "step": 2717 }, { "epoch": 0.8692558316176617, "grad_norm": 0.597759485244751, "learning_rate": 2.6538098726032675e-05, "loss": 3.8292, "step": 2718 }, { "epoch": 0.8695756461252474, "grad_norm": 0.4432847797870636, "learning_rate": 2.6410422879157313e-05, "loss": 3.8457, "step": 2719 }, { "epoch": 0.869895460632833, "grad_norm": 0.44418999552726746, "learning_rate": 2.628304075724693e-05, "loss": 3.7724, "step": 2720 }, { "epoch": 0.8702152751404185, "grad_norm": 0.6845078468322754, "learning_rate": 2.6155952497058643e-05, "loss": 3.8254, "step": 2721 }, { "epoch": 0.8705350896480042, "grad_norm": 0.5458826422691345, "learning_rate": 2.6029158235033997e-05, "loss": 3.9395, "step": 2722 }, { "epoch": 0.8708549041555897, "grad_norm": 0.42053529620170593, "learning_rate": 2.5902658107299078e-05, "loss": 3.9047, "step": 2723 }, { "epoch": 0.8711747186631753, "grad_norm": 0.5469644665718079, "learning_rate": 2.5776452249663847e-05, "loss": 3.8479, "step": 2724 }, { "epoch": 0.871494533170761, "grad_norm": 0.3658195436000824, "learning_rate": 2.5650540797622687e-05, "loss": 3.9469, "step": 2725 }, { "epoch": 0.8718143476783465, "grad_norm": 0.38809964060783386, "learning_rate": 2.5524923886353697e-05, "loss": 3.8007, "step": 2726 }, { "epoch": 0.8721341621859322, "grad_norm": 0.39968934655189514, "learning_rate": 2.5399601650718838e-05, "loss": 3.9012, "step": 2727 }, { "epoch": 0.8724539766935178, "grad_norm": 0.3871692717075348, "learning_rate": 2.5274574225263776e-05, "loss": 3.8421, "step": 2728 }, { "epoch": 0.8727737912011033, "grad_norm": 0.30253735184669495, "learning_rate": 2.5149841744217415e-05, "loss": 3.7973, "step": 2729 }, { "epoch": 0.873093605708689, "grad_norm": 0.3134574890136719, "learning_rate": 2.5025404341492327e-05, "loss": 3.8285, "step": 2730 }, { "epoch": 0.8734134202162745, "grad_norm": 0.4455544054508209, "learning_rate": 2.4901262150684055e-05, "loss": 3.8621, "step": 2731 }, { "epoch": 0.8737332347238601, "grad_norm": 0.3873212933540344, "learning_rate": 2.4777415305071346e-05, "loss": 3.8921, "step": 2732 }, { "epoch": 0.8740530492314458, "grad_norm": 0.3558337688446045, "learning_rate": 2.4653863937615813e-05, "loss": 3.8654, "step": 2733 }, { "epoch": 0.8743728637390313, "grad_norm": 0.3835333287715912, "learning_rate": 2.4530608180961786e-05, "loss": 3.8493, "step": 2734 }, { "epoch": 0.874692678246617, "grad_norm": 0.48255854845046997, "learning_rate": 2.440764816743631e-05, "loss": 3.8517, "step": 2735 }, { "epoch": 0.8750124927542026, "grad_norm": 0.3131025433540344, "learning_rate": 2.428498402904889e-05, "loss": 3.8396, "step": 2736 }, { "epoch": 0.8753323072617881, "grad_norm": 0.37108540534973145, "learning_rate": 2.416261589749139e-05, "loss": 3.905, "step": 2737 }, { "epoch": 0.8756521217693738, "grad_norm": 0.3304899334907532, "learning_rate": 2.4040543904137942e-05, "loss": 3.8441, "step": 2738 }, { "epoch": 0.8759719362769594, "grad_norm": 0.5130865573883057, "learning_rate": 2.391876818004452e-05, "loss": 3.8469, "step": 2739 }, { "epoch": 0.8762917507845449, "grad_norm": 0.42325326800346375, "learning_rate": 2.3797288855949382e-05, "loss": 3.7202, "step": 2740 }, { "epoch": 0.8766115652921306, "grad_norm": 0.512284517288208, "learning_rate": 2.3676106062272126e-05, "loss": 3.875, "step": 2741 }, { "epoch": 0.8769313797997161, "grad_norm": 0.4337746500968933, "learning_rate": 2.3555219929114454e-05, "loss": 3.8827, "step": 2742 }, { "epoch": 0.8772511943073018, "grad_norm": 0.3801739513874054, "learning_rate": 2.343463058625932e-05, "loss": 3.8288, "step": 2743 }, { "epoch": 0.8775710088148874, "grad_norm": 0.4057891368865967, "learning_rate": 2.331433816317102e-05, "loss": 3.9245, "step": 2744 }, { "epoch": 0.8778908233224729, "grad_norm": 0.4014319181442261, "learning_rate": 2.3194342788995257e-05, "loss": 3.7374, "step": 2745 }, { "epoch": 0.8782106378300586, "grad_norm": 0.3144015073776245, "learning_rate": 2.307464459255851e-05, "loss": 3.8471, "step": 2746 }, { "epoch": 0.8785304523376442, "grad_norm": 0.3924254775047302, "learning_rate": 2.2955243702368652e-05, "loss": 3.8287, "step": 2747 }, { "epoch": 0.8788502668452298, "grad_norm": 0.343808650970459, "learning_rate": 2.2836140246613977e-05, "loss": 3.8551, "step": 2748 }, { "epoch": 0.8791700813528154, "grad_norm": 0.44950705766677856, "learning_rate": 2.271733435316363e-05, "loss": 3.8791, "step": 2749 }, { "epoch": 0.8794898958604009, "grad_norm": 0.3195308446884155, "learning_rate": 2.2598826149567352e-05, "loss": 3.9094, "step": 2750 }, { "epoch": 0.8798097103679866, "grad_norm": 0.434033066034317, "learning_rate": 2.2480615763055032e-05, "loss": 3.8111, "step": 2751 }, { "epoch": 0.8801295248755722, "grad_norm": 0.35549700260162354, "learning_rate": 2.2362703320537156e-05, "loss": 3.8203, "step": 2752 }, { "epoch": 0.8804493393831577, "grad_norm": 0.343838632106781, "learning_rate": 2.2245088948604095e-05, "loss": 3.8644, "step": 2753 }, { "epoch": 0.8807691538907434, "grad_norm": 0.306664377450943, "learning_rate": 2.2127772773526342e-05, "loss": 3.8344, "step": 2754 }, { "epoch": 0.881088968398329, "grad_norm": 0.7272199392318726, "learning_rate": 2.201075492125415e-05, "loss": 3.8132, "step": 2755 }, { "epoch": 0.8814087829059146, "grad_norm": 0.37542179226875305, "learning_rate": 2.1894035517417486e-05, "loss": 3.8819, "step": 2756 }, { "epoch": 0.8817285974135002, "grad_norm": 0.37869417667388916, "learning_rate": 2.1777614687326116e-05, "loss": 3.794, "step": 2757 }, { "epoch": 0.8820484119210857, "grad_norm": 0.39466193318367004, "learning_rate": 2.166149255596896e-05, "loss": 3.8313, "step": 2758 }, { "epoch": 0.8823682264286714, "grad_norm": 0.4193485975265503, "learning_rate": 2.154566924801453e-05, "loss": 3.8369, "step": 2759 }, { "epoch": 0.882688040936257, "grad_norm": 0.35964250564575195, "learning_rate": 2.1430144887810218e-05, "loss": 3.8953, "step": 2760 }, { "epoch": 0.8830078554438425, "grad_norm": 0.3282972276210785, "learning_rate": 2.131491959938275e-05, "loss": 3.8548, "step": 2761 }, { "epoch": 0.8833276699514282, "grad_norm": 0.3078734874725342, "learning_rate": 2.119999350643764e-05, "loss": 3.805, "step": 2762 }, { "epoch": 0.8836474844590138, "grad_norm": 0.4858565032482147, "learning_rate": 2.108536673235922e-05, "loss": 3.7995, "step": 2763 }, { "epoch": 0.8839672989665994, "grad_norm": 0.3983120918273926, "learning_rate": 2.0971039400210453e-05, "loss": 3.8797, "step": 2764 }, { "epoch": 0.884287113474185, "grad_norm": 0.4818762540817261, "learning_rate": 2.0857011632732755e-05, "loss": 3.8312, "step": 2765 }, { "epoch": 0.8846069279817705, "grad_norm": 0.35248318314552307, "learning_rate": 2.0743283552346067e-05, "loss": 3.8521, "step": 2766 }, { "epoch": 0.8849267424893562, "grad_norm": 0.3448288142681122, "learning_rate": 2.062985528114852e-05, "loss": 3.8034, "step": 2767 }, { "epoch": 0.8852465569969418, "grad_norm": 0.3956616520881653, "learning_rate": 2.0516726940916372e-05, "loss": 3.7699, "step": 2768 }, { "epoch": 0.8855663715045273, "grad_norm": 0.31437134742736816, "learning_rate": 2.0403898653103867e-05, "loss": 3.9804, "step": 2769 }, { "epoch": 0.885886186012113, "grad_norm": 0.42421358823776245, "learning_rate": 2.029137053884311e-05, "loss": 3.9169, "step": 2770 }, { "epoch": 0.8862060005196986, "grad_norm": 0.4843742549419403, "learning_rate": 2.0179142718943964e-05, "loss": 3.8826, "step": 2771 }, { "epoch": 0.8865258150272842, "grad_norm": 0.36626988649368286, "learning_rate": 2.006721531389388e-05, "loss": 3.9106, "step": 2772 }, { "epoch": 0.8868456295348698, "grad_norm": 0.400566428899765, "learning_rate": 1.9955588443857807e-05, "loss": 3.8464, "step": 2773 }, { "epoch": 0.8871654440424553, "grad_norm": 0.3801354467868805, "learning_rate": 1.9844262228678077e-05, "loss": 3.8415, "step": 2774 }, { "epoch": 0.887485258550041, "grad_norm": 0.43907976150512695, "learning_rate": 1.9733236787874053e-05, "loss": 3.8366, "step": 2775 }, { "epoch": 0.8878050730576266, "grad_norm": 0.6770468354225159, "learning_rate": 1.9622512240642386e-05, "loss": 3.8555, "step": 2776 }, { "epoch": 0.8881248875652121, "grad_norm": 0.5378474593162537, "learning_rate": 1.9512088705856654e-05, "loss": 3.8746, "step": 2777 }, { "epoch": 0.8884447020727978, "grad_norm": 0.4067249894142151, "learning_rate": 1.9401966302067262e-05, "loss": 3.8531, "step": 2778 }, { "epoch": 0.8887645165803834, "grad_norm": 0.39134106040000916, "learning_rate": 1.9292145147501204e-05, "loss": 3.8279, "step": 2779 }, { "epoch": 0.889084331087969, "grad_norm": 0.32504501938819885, "learning_rate": 1.91826253600622e-05, "loss": 3.8013, "step": 2780 }, { "epoch": 0.8894041455955546, "grad_norm": 0.40928134322166443, "learning_rate": 1.907340705733036e-05, "loss": 3.8473, "step": 2781 }, { "epoch": 0.8897239601031401, "grad_norm": 0.3515646755695343, "learning_rate": 1.8964490356562155e-05, "loss": 3.8698, "step": 2782 }, { "epoch": 0.8900437746107258, "grad_norm": 0.4661243259906769, "learning_rate": 1.8855875374690288e-05, "loss": 3.803, "step": 2783 }, { "epoch": 0.8903635891183114, "grad_norm": 0.5454912781715393, "learning_rate": 1.8747562228323344e-05, "loss": 3.8188, "step": 2784 }, { "epoch": 0.8906834036258969, "grad_norm": 0.3561480939388275, "learning_rate": 1.863955103374607e-05, "loss": 3.8392, "step": 2785 }, { "epoch": 0.8910032181334826, "grad_norm": 0.5022075772285461, "learning_rate": 1.8531841906918976e-05, "loss": 3.8153, "step": 2786 }, { "epoch": 0.8913230326410682, "grad_norm": 0.3213578462600708, "learning_rate": 1.8424434963478262e-05, "loss": 3.8519, "step": 2787 }, { "epoch": 0.8916428471486538, "grad_norm": 0.3137240707874298, "learning_rate": 1.8317330318735757e-05, "loss": 3.8666, "step": 2788 }, { "epoch": 0.8919626616562394, "grad_norm": 0.3906106650829315, "learning_rate": 1.8210528087678577e-05, "loss": 3.845, "step": 2789 }, { "epoch": 0.892282476163825, "grad_norm": 0.5237786173820496, "learning_rate": 1.810402838496937e-05, "loss": 3.8299, "step": 2790 }, { "epoch": 0.8926022906714106, "grad_norm": 0.5011598467826843, "learning_rate": 1.799783132494581e-05, "loss": 3.7989, "step": 2791 }, { "epoch": 0.8929221051789962, "grad_norm": 0.37820762395858765, "learning_rate": 1.789193702162086e-05, "loss": 3.9592, "step": 2792 }, { "epoch": 0.8932419196865817, "grad_norm": 0.4449549913406372, "learning_rate": 1.7786345588682317e-05, "loss": 3.8861, "step": 2793 }, { "epoch": 0.8935617341941674, "grad_norm": 0.3762252926826477, "learning_rate": 1.7681057139492792e-05, "loss": 3.8822, "step": 2794 }, { "epoch": 0.893881548701753, "grad_norm": 0.35808414220809937, "learning_rate": 1.7576071787089672e-05, "loss": 3.8303, "step": 2795 }, { "epoch": 0.8942013632093386, "grad_norm": 0.37280550599098206, "learning_rate": 1.7471389644184897e-05, "loss": 3.9491, "step": 2796 }, { "epoch": 0.8945211777169242, "grad_norm": 0.5637244582176208, "learning_rate": 1.7367010823164862e-05, "loss": 3.9094, "step": 2797 }, { "epoch": 0.8948409922245097, "grad_norm": 0.301642507314682, "learning_rate": 1.726293543609053e-05, "loss": 3.7891, "step": 2798 }, { "epoch": 0.8951608067320954, "grad_norm": 0.505777895450592, "learning_rate": 1.7159163594696756e-05, "loss": 3.8159, "step": 2799 }, { "epoch": 0.895480621239681, "grad_norm": 0.30109620094299316, "learning_rate": 1.7055695410392823e-05, "loss": 3.7732, "step": 2800 }, { "epoch": 0.895480621239681, "eval_runtime": 52.9552, "eval_samples_per_second": 35.823, "eval_steps_per_second": 8.97, "step": 2800 }, { "epoch": 0.8958004357472665, "grad_norm": 0.3829757869243622, "learning_rate": 1.695253099426177e-05, "loss": 3.7576, "step": 2801 }, { "epoch": 0.8961202502548522, "grad_norm": 0.36284857988357544, "learning_rate": 1.6849670457060605e-05, "loss": 3.8252, "step": 2802 }, { "epoch": 0.8964400647624378, "grad_norm": 0.3593968152999878, "learning_rate": 1.6747113909220155e-05, "loss": 3.8364, "step": 2803 }, { "epoch": 0.8967598792700234, "grad_norm": 0.4475672245025635, "learning_rate": 1.6644861460844782e-05, "loss": 3.7346, "step": 2804 }, { "epoch": 0.897079693777609, "grad_norm": 0.5313023328781128, "learning_rate": 1.6542913221712506e-05, "loss": 3.9256, "step": 2805 }, { "epoch": 0.8973995082851945, "grad_norm": 0.4179762303829193, "learning_rate": 1.6441269301274572e-05, "loss": 3.8418, "step": 2806 }, { "epoch": 0.8977193227927802, "grad_norm": 0.36792925000190735, "learning_rate": 1.633992980865556e-05, "loss": 3.869, "step": 2807 }, { "epoch": 0.8980391373003658, "grad_norm": 0.3127034306526184, "learning_rate": 1.6238894852653338e-05, "loss": 3.835, "step": 2808 }, { "epoch": 0.8983589518079513, "grad_norm": 0.3414289057254791, "learning_rate": 1.6138164541738674e-05, "loss": 3.9081, "step": 2809 }, { "epoch": 0.898678766315537, "grad_norm": 0.5243842601776123, "learning_rate": 1.6037738984055425e-05, "loss": 3.811, "step": 2810 }, { "epoch": 0.8989985808231226, "grad_norm": 0.3705567419528961, "learning_rate": 1.5937618287420052e-05, "loss": 3.8422, "step": 2811 }, { "epoch": 0.8993183953307082, "grad_norm": 0.3383541703224182, "learning_rate": 1.583780255932193e-05, "loss": 3.8347, "step": 2812 }, { "epoch": 0.8996382098382938, "grad_norm": 0.34444403648376465, "learning_rate": 1.5738291906922883e-05, "loss": 3.8078, "step": 2813 }, { "epoch": 0.8999580243458793, "grad_norm": 0.33358678221702576, "learning_rate": 1.5639086437057314e-05, "loss": 3.7918, "step": 2814 }, { "epoch": 0.900277838853465, "grad_norm": 0.32898664474487305, "learning_rate": 1.5540186256231823e-05, "loss": 3.8704, "step": 2815 }, { "epoch": 0.9005976533610506, "grad_norm": 0.34698760509490967, "learning_rate": 1.5441591470625414e-05, "loss": 3.8094, "step": 2816 }, { "epoch": 0.9009174678686362, "grad_norm": 0.5104339718818665, "learning_rate": 1.534330218608918e-05, "loss": 3.9008, "step": 2817 }, { "epoch": 0.9012372823762218, "grad_norm": 0.34266233444213867, "learning_rate": 1.5245318508146175e-05, "loss": 3.8251, "step": 2818 }, { "epoch": 0.9015570968838074, "grad_norm": 0.3544078469276428, "learning_rate": 1.5147640541991424e-05, "loss": 3.9217, "step": 2819 }, { "epoch": 0.901876911391393, "grad_norm": 0.32537633180618286, "learning_rate": 1.5050268392491639e-05, "loss": 3.9132, "step": 2820 }, { "epoch": 0.9021967258989786, "grad_norm": 0.5211623311042786, "learning_rate": 1.4953202164185297e-05, "loss": 3.9396, "step": 2821 }, { "epoch": 0.9025165404065641, "grad_norm": 0.41474851965904236, "learning_rate": 1.4856441961282472e-05, "loss": 3.9117, "step": 2822 }, { "epoch": 0.9028363549141498, "grad_norm": 0.32742059230804443, "learning_rate": 1.4759987887664537e-05, "loss": 3.7874, "step": 2823 }, { "epoch": 0.9031561694217354, "grad_norm": 0.3288312256336212, "learning_rate": 1.4663840046884423e-05, "loss": 3.7849, "step": 2824 }, { "epoch": 0.903475983929321, "grad_norm": 0.3334638774394989, "learning_rate": 1.456799854216606e-05, "loss": 3.7763, "step": 2825 }, { "epoch": 0.9037957984369066, "grad_norm": 0.37792477011680603, "learning_rate": 1.447246347640464e-05, "loss": 3.8625, "step": 2826 }, { "epoch": 0.9041156129444922, "grad_norm": 0.38779416680336, "learning_rate": 1.437723495216635e-05, "loss": 3.7314, "step": 2827 }, { "epoch": 0.9044354274520778, "grad_norm": 0.3234004080295563, "learning_rate": 1.4282313071688211e-05, "loss": 3.9192, "step": 2828 }, { "epoch": 0.9047552419596634, "grad_norm": 0.3537444472312927, "learning_rate": 1.4187697936878172e-05, "loss": 3.7959, "step": 2829 }, { "epoch": 0.905075056467249, "grad_norm": 0.4011874496936798, "learning_rate": 1.4093389649314613e-05, "loss": 3.8791, "step": 2830 }, { "epoch": 0.9053948709748346, "grad_norm": 0.3776912987232208, "learning_rate": 1.399938831024674e-05, "loss": 3.9807, "step": 2831 }, { "epoch": 0.9057146854824202, "grad_norm": 0.3704184591770172, "learning_rate": 1.3905694020594093e-05, "loss": 3.9448, "step": 2832 }, { "epoch": 0.9060344999900058, "grad_norm": 0.31775835156440735, "learning_rate": 1.3812306880946577e-05, "loss": 3.7701, "step": 2833 }, { "epoch": 0.9063543144975914, "grad_norm": 0.3736931085586548, "learning_rate": 1.3719226991564392e-05, "loss": 3.8121, "step": 2834 }, { "epoch": 0.906674129005177, "grad_norm": 0.327279657125473, "learning_rate": 1.3626454452377734e-05, "loss": 3.8883, "step": 2835 }, { "epoch": 0.9069939435127626, "grad_norm": 0.3093405067920685, "learning_rate": 1.3533989362987063e-05, "loss": 3.9212, "step": 2836 }, { "epoch": 0.9073137580203482, "grad_norm": 0.3085209131240845, "learning_rate": 1.3441831822662441e-05, "loss": 3.8462, "step": 2837 }, { "epoch": 0.9076335725279338, "grad_norm": 0.39359065890312195, "learning_rate": 1.3349981930344156e-05, "loss": 3.7751, "step": 2838 }, { "epoch": 0.9079533870355194, "grad_norm": 0.39021480083465576, "learning_rate": 1.3258439784641795e-05, "loss": 3.8775, "step": 2839 }, { "epoch": 0.908273201543105, "grad_norm": 0.33121538162231445, "learning_rate": 1.3167205483834842e-05, "loss": 3.7862, "step": 2840 }, { "epoch": 0.9085930160506906, "grad_norm": 0.3666438162326813, "learning_rate": 1.307627912587218e-05, "loss": 3.819, "step": 2841 }, { "epoch": 0.9089128305582762, "grad_norm": 0.3102065324783325, "learning_rate": 1.2985660808371955e-05, "loss": 3.9266, "step": 2842 }, { "epoch": 0.9092326450658618, "grad_norm": 0.3597174882888794, "learning_rate": 1.2895350628621882e-05, "loss": 3.7819, "step": 2843 }, { "epoch": 0.9095524595734474, "grad_norm": 0.34017184376716614, "learning_rate": 1.2805348683578598e-05, "loss": 3.7964, "step": 2844 }, { "epoch": 0.909872274081033, "grad_norm": 0.36802756786346436, "learning_rate": 1.271565506986798e-05, "loss": 3.7846, "step": 2845 }, { "epoch": 0.9101920885886186, "grad_norm": 0.34145769476890564, "learning_rate": 1.2626269883784834e-05, "loss": 3.8026, "step": 2846 }, { "epoch": 0.9105119030962042, "grad_norm": 0.32813650369644165, "learning_rate": 1.2537193221292763e-05, "loss": 3.8747, "step": 2847 }, { "epoch": 0.9108317176037898, "grad_norm": 0.37596747279167175, "learning_rate": 1.2448425178024302e-05, "loss": 3.8774, "step": 2848 }, { "epoch": 0.9111515321113755, "grad_norm": 0.33264321088790894, "learning_rate": 1.2359965849280518e-05, "loss": 3.8502, "step": 2849 }, { "epoch": 0.911471346618961, "grad_norm": 0.3156581521034241, "learning_rate": 1.2271815330031076e-05, "loss": 3.882, "step": 2850 }, { "epoch": 0.9117911611265466, "grad_norm": 0.3950587213039398, "learning_rate": 1.218397371491414e-05, "loss": 3.8256, "step": 2851 }, { "epoch": 0.9121109756341322, "grad_norm": 0.4216480851173401, "learning_rate": 1.2096441098236108e-05, "loss": 3.8416, "step": 2852 }, { "epoch": 0.9124307901417178, "grad_norm": 0.40821999311447144, "learning_rate": 1.2009217573971907e-05, "loss": 3.8601, "step": 2853 }, { "epoch": 0.9127506046493034, "grad_norm": 0.36346718668937683, "learning_rate": 1.1922303235764363e-05, "loss": 3.8867, "step": 2854 }, { "epoch": 0.913070419156889, "grad_norm": 0.47768259048461914, "learning_rate": 1.1835698176924468e-05, "loss": 3.8476, "step": 2855 }, { "epoch": 0.9133902336644746, "grad_norm": 0.47484543919563293, "learning_rate": 1.1749402490431148e-05, "loss": 3.84, "step": 2856 }, { "epoch": 0.9137100481720603, "grad_norm": 0.36377885937690735, "learning_rate": 1.1663416268931192e-05, "loss": 3.8385, "step": 2857 }, { "epoch": 0.9140298626796458, "grad_norm": 0.3305024802684784, "learning_rate": 1.1577739604739155e-05, "loss": 3.8427, "step": 2858 }, { "epoch": 0.9143496771872314, "grad_norm": 0.3563801944255829, "learning_rate": 1.1492372589837261e-05, "loss": 3.8438, "step": 2859 }, { "epoch": 0.914669491694817, "grad_norm": 0.33061423897743225, "learning_rate": 1.1407315315875365e-05, "loss": 3.8782, "step": 2860 }, { "epoch": 0.9149893062024026, "grad_norm": 0.35521742701530457, "learning_rate": 1.1322567874170552e-05, "loss": 3.8355, "step": 2861 }, { "epoch": 0.9153091207099882, "grad_norm": 0.3163262605667114, "learning_rate": 1.1238130355707509e-05, "loss": 3.7897, "step": 2862 }, { "epoch": 0.9156289352175738, "grad_norm": 0.3818413019180298, "learning_rate": 1.1154002851138122e-05, "loss": 3.8162, "step": 2863 }, { "epoch": 0.9159487497251594, "grad_norm": 0.38127657771110535, "learning_rate": 1.107018545078141e-05, "loss": 3.7691, "step": 2864 }, { "epoch": 0.916268564232745, "grad_norm": 0.3562355935573578, "learning_rate": 1.0986678244623526e-05, "loss": 3.802, "step": 2865 }, { "epoch": 0.9165883787403306, "grad_norm": 0.363773912191391, "learning_rate": 1.0903481322317486e-05, "loss": 3.8814, "step": 2866 }, { "epoch": 0.9169081932479162, "grad_norm": 0.42210477590560913, "learning_rate": 1.0820594773183278e-05, "loss": 3.7959, "step": 2867 }, { "epoch": 0.9172280077555018, "grad_norm": 0.3285976052284241, "learning_rate": 1.0738018686207683e-05, "loss": 3.8378, "step": 2868 }, { "epoch": 0.9175478222630874, "grad_norm": 0.4005603790283203, "learning_rate": 1.0655753150044155e-05, "loss": 3.7046, "step": 2869 }, { "epoch": 0.917867636770673, "grad_norm": 0.3804391622543335, "learning_rate": 1.0573798253012778e-05, "loss": 3.8179, "step": 2870 }, { "epoch": 0.9181874512782586, "grad_norm": 0.38206663727760315, "learning_rate": 1.0492154083099968e-05, "loss": 3.8841, "step": 2871 }, { "epoch": 0.9185072657858442, "grad_norm": 0.34211817383766174, "learning_rate": 1.0410820727958712e-05, "loss": 3.8722, "step": 2872 }, { "epoch": 0.9188270802934299, "grad_norm": 0.32601305842399597, "learning_rate": 1.0329798274908297e-05, "loss": 3.8693, "step": 2873 }, { "epoch": 0.9191468948010154, "grad_norm": 0.4372338354587555, "learning_rate": 1.0249086810934204e-05, "loss": 3.85, "step": 2874 }, { "epoch": 0.919466709308601, "grad_norm": 0.408493310213089, "learning_rate": 1.0168686422687921e-05, "loss": 3.8124, "step": 2875 }, { "epoch": 0.9197865238161866, "grad_norm": 0.3873545229434967, "learning_rate": 1.008859719648717e-05, "loss": 3.7881, "step": 2876 }, { "epoch": 0.9201063383237722, "grad_norm": 0.3421507477760315, "learning_rate": 1.0008819218315434e-05, "loss": 3.8616, "step": 2877 }, { "epoch": 0.9204261528313578, "grad_norm": 0.3607138991355896, "learning_rate": 9.929352573822203e-06, "loss": 3.7697, "step": 2878 }, { "epoch": 0.9207459673389434, "grad_norm": 0.4271627366542816, "learning_rate": 9.850197348322597e-06, "loss": 3.8241, "step": 2879 }, { "epoch": 0.921065781846529, "grad_norm": 0.31965309381484985, "learning_rate": 9.771353626797373e-06, "loss": 3.8219, "step": 2880 }, { "epoch": 0.9213855963541147, "grad_norm": 0.3279765844345093, "learning_rate": 9.692821493892988e-06, "loss": 3.9066, "step": 2881 }, { "epoch": 0.9217054108617002, "grad_norm": 0.42092078924179077, "learning_rate": 9.614601033921266e-06, "loss": 3.7628, "step": 2882 }, { "epoch": 0.9220252253692858, "grad_norm": 0.34555935859680176, "learning_rate": 9.536692330859497e-06, "loss": 3.9168, "step": 2883 }, { "epoch": 0.9223450398768714, "grad_norm": 0.38723963499069214, "learning_rate": 9.459095468350241e-06, "loss": 3.8204, "step": 2884 }, { "epoch": 0.922664854384457, "grad_norm": 0.3473418951034546, "learning_rate": 9.381810529701228e-06, "loss": 3.8639, "step": 2885 }, { "epoch": 0.9229846688920427, "grad_norm": 0.32579275965690613, "learning_rate": 9.30483759788535e-06, "loss": 3.893, "step": 2886 }, { "epoch": 0.9233044833996282, "grad_norm": 0.4575331509113312, "learning_rate": 9.228176755540506e-06, "loss": 3.8638, "step": 2887 }, { "epoch": 0.9236242979072138, "grad_norm": 0.3727882206439972, "learning_rate": 9.151828084969593e-06, "loss": 3.8482, "step": 2888 }, { "epoch": 0.9239441124147995, "grad_norm": 0.4113883376121521, "learning_rate": 9.075791668140308e-06, "loss": 3.9036, "step": 2889 }, { "epoch": 0.924263926922385, "grad_norm": 0.42329221963882446, "learning_rate": 9.000067586685089e-06, "loss": 3.8058, "step": 2890 }, { "epoch": 0.9245837414299706, "grad_norm": 0.4567498564720154, "learning_rate": 8.924655921901135e-06, "loss": 3.8317, "step": 2891 }, { "epoch": 0.9249035559375562, "grad_norm": 0.35736435651779175, "learning_rate": 8.849556754750153e-06, "loss": 3.8462, "step": 2892 }, { "epoch": 0.9252233704451418, "grad_norm": 0.7284477353096008, "learning_rate": 8.774770165858347e-06, "loss": 3.8679, "step": 2893 }, { "epoch": 0.9255431849527275, "grad_norm": 0.4010378122329712, "learning_rate": 8.70029623551649e-06, "loss": 3.8371, "step": 2894 }, { "epoch": 0.925862999460313, "grad_norm": 0.36940640211105347, "learning_rate": 8.626135043679495e-06, "loss": 3.8271, "step": 2895 }, { "epoch": 0.9261828139678986, "grad_norm": 0.4148366153240204, "learning_rate": 8.552286669966635e-06, "loss": 3.7586, "step": 2896 }, { "epoch": 0.9265026284754843, "grad_norm": 0.3730383813381195, "learning_rate": 8.47875119366126e-06, "loss": 3.867, "step": 2897 }, { "epoch": 0.9268224429830698, "grad_norm": 0.3152139186859131, "learning_rate": 8.405528693710883e-06, "loss": 3.8764, "step": 2898 }, { "epoch": 0.9271422574906554, "grad_norm": 0.3949700593948364, "learning_rate": 8.332619248726957e-06, "loss": 3.8988, "step": 2899 }, { "epoch": 0.927462071998241, "grad_norm": 0.3136104643344879, "learning_rate": 8.260022936984833e-06, "loss": 3.83, "step": 2900 }, { "epoch": 0.927462071998241, "eval_runtime": 49.9895, "eval_samples_per_second": 37.948, "eval_steps_per_second": 9.502, "step": 2900 }, { "epoch": 0.9277818865058266, "grad_norm": 0.32620909810066223, "learning_rate": 8.187739836423734e-06, "loss": 3.8494, "step": 2901 }, { "epoch": 0.9281017010134123, "grad_norm": 0.3751737177371979, "learning_rate": 8.115770024646518e-06, "loss": 3.8191, "step": 2902 }, { "epoch": 0.9284215155209978, "grad_norm": 0.30418094992637634, "learning_rate": 8.044113578919842e-06, "loss": 3.9349, "step": 2903 }, { "epoch": 0.9287413300285834, "grad_norm": 0.34678685665130615, "learning_rate": 7.97277057617377e-06, "loss": 3.8738, "step": 2904 }, { "epoch": 0.9290611445361691, "grad_norm": 0.48156625032424927, "learning_rate": 7.901741093002002e-06, "loss": 3.7991, "step": 2905 }, { "epoch": 0.9293809590437546, "grad_norm": 0.2981961965560913, "learning_rate": 7.8310252056616e-06, "loss": 3.8475, "step": 2906 }, { "epoch": 0.9297007735513402, "grad_norm": 0.31321606040000916, "learning_rate": 7.760622990072873e-06, "loss": 3.8855, "step": 2907 }, { "epoch": 0.9300205880589258, "grad_norm": 0.31777769327163696, "learning_rate": 7.690534521819458e-06, "loss": 3.8242, "step": 2908 }, { "epoch": 0.9303404025665114, "grad_norm": 0.3483651280403137, "learning_rate": 7.6207598761481305e-06, "loss": 3.7806, "step": 2909 }, { "epoch": 0.9306602170740971, "grad_norm": 0.39319679141044617, "learning_rate": 7.5512991279687684e-06, "loss": 3.8205, "step": 2910 }, { "epoch": 0.9309800315816826, "grad_norm": 0.4952144920825958, "learning_rate": 7.482152351854187e-06, "loss": 3.8145, "step": 2911 }, { "epoch": 0.9312998460892682, "grad_norm": 0.49825191497802734, "learning_rate": 7.413319622040137e-06, "loss": 3.7801, "step": 2912 }, { "epoch": 0.9316196605968539, "grad_norm": 0.31667929887771606, "learning_rate": 7.344801012425306e-06, "loss": 3.8218, "step": 2913 }, { "epoch": 0.9319394751044394, "grad_norm": 0.40096330642700195, "learning_rate": 7.276596596571016e-06, "loss": 3.8339, "step": 2914 }, { "epoch": 0.932259289612025, "grad_norm": 0.42563605308532715, "learning_rate": 7.208706447701395e-06, "loss": 3.861, "step": 2915 }, { "epoch": 0.9325791041196106, "grad_norm": 0.37720730900764465, "learning_rate": 7.141130638703041e-06, "loss": 3.8087, "step": 2916 }, { "epoch": 0.9328989186271962, "grad_norm": 0.33922338485717773, "learning_rate": 7.073869242125152e-06, "loss": 3.9268, "step": 2917 }, { "epoch": 0.9332187331347819, "grad_norm": 0.37561967968940735, "learning_rate": 7.006922330179398e-06, "loss": 3.8881, "step": 2918 }, { "epoch": 0.9335385476423674, "grad_norm": 0.34245485067367554, "learning_rate": 6.940289974739754e-06, "loss": 3.7238, "step": 2919 }, { "epoch": 0.933858362149953, "grad_norm": 0.3230489194393158, "learning_rate": 6.8739722473425295e-06, "loss": 3.7531, "step": 2920 }, { "epoch": 0.9341781766575387, "grad_norm": 0.3259875476360321, "learning_rate": 6.807969219186271e-06, "loss": 3.817, "step": 2921 }, { "epoch": 0.9344979911651242, "grad_norm": 0.3754819631576538, "learning_rate": 6.742280961131563e-06, "loss": 3.9332, "step": 2922 }, { "epoch": 0.9348178056727098, "grad_norm": 0.3073190748691559, "learning_rate": 6.676907543701227e-06, "loss": 3.8572, "step": 2923 }, { "epoch": 0.9351376201802954, "grad_norm": 0.34672433137893677, "learning_rate": 6.611849037079886e-06, "loss": 3.8637, "step": 2924 }, { "epoch": 0.935457434687881, "grad_norm": 0.3417993485927582, "learning_rate": 6.5471055111142035e-06, "loss": 3.8077, "step": 2925 }, { "epoch": 0.9357772491954667, "grad_norm": 0.609849214553833, "learning_rate": 6.4826770353126115e-06, "loss": 3.8212, "step": 2926 }, { "epoch": 0.9360970637030522, "grad_norm": 0.417621910572052, "learning_rate": 6.418563678845379e-06, "loss": 3.8181, "step": 2927 }, { "epoch": 0.9364168782106378, "grad_norm": 0.37244054675102234, "learning_rate": 6.354765510544346e-06, "loss": 3.8812, "step": 2928 }, { "epoch": 0.9367366927182235, "grad_norm": 0.28731822967529297, "learning_rate": 6.291282598903091e-06, "loss": 3.8218, "step": 2929 }, { "epoch": 0.937056507225809, "grad_norm": 0.3922556936740875, "learning_rate": 6.228115012076729e-06, "loss": 3.9596, "step": 2930 }, { "epoch": 0.9373763217333946, "grad_norm": 0.343537837266922, "learning_rate": 6.165262817881678e-06, "loss": 3.8232, "step": 2931 }, { "epoch": 0.9376961362409802, "grad_norm": 0.33651161193847656, "learning_rate": 6.102726083795961e-06, "loss": 3.845, "step": 2932 }, { "epoch": 0.9380159507485658, "grad_norm": 0.4260159134864807, "learning_rate": 6.040504876958741e-06, "loss": 3.8422, "step": 2933 }, { "epoch": 0.9383357652561515, "grad_norm": 0.36133843660354614, "learning_rate": 5.978599264170614e-06, "loss": 3.8306, "step": 2934 }, { "epoch": 0.938655579763737, "grad_norm": 0.3132469356060028, "learning_rate": 5.917009311893217e-06, "loss": 3.8144, "step": 2935 }, { "epoch": 0.9389753942713226, "grad_norm": 0.32899340987205505, "learning_rate": 5.855735086249358e-06, "loss": 3.8552, "step": 2936 }, { "epoch": 0.9392952087789083, "grad_norm": 0.40264222025871277, "learning_rate": 5.794776653022881e-06, "loss": 3.8203, "step": 2937 }, { "epoch": 0.9396150232864938, "grad_norm": 0.6008609533309937, "learning_rate": 5.7341340776585035e-06, "loss": 3.8182, "step": 2938 }, { "epoch": 0.9399348377940794, "grad_norm": 0.3235461115837097, "learning_rate": 5.673807425262045e-06, "loss": 3.8476, "step": 2939 }, { "epoch": 0.940254652301665, "grad_norm": 0.35002022981643677, "learning_rate": 5.613796760599898e-06, "loss": 3.772, "step": 2940 }, { "epoch": 0.9405744668092506, "grad_norm": 0.38841116428375244, "learning_rate": 5.554102148099393e-06, "loss": 3.9363, "step": 2941 }, { "epoch": 0.9408942813168363, "grad_norm": 0.40948817133903503, "learning_rate": 5.494723651848532e-06, "loss": 3.919, "step": 2942 }, { "epoch": 0.9412140958244218, "grad_norm": 0.5074942111968994, "learning_rate": 5.435661335595753e-06, "loss": 3.8352, "step": 2943 }, { "epoch": 0.9415339103320074, "grad_norm": 0.33767393231391907, "learning_rate": 5.376915262750369e-06, "loss": 3.9075, "step": 2944 }, { "epoch": 0.9418537248395931, "grad_norm": 0.4783247113227844, "learning_rate": 5.3184854963818305e-06, "loss": 3.7858, "step": 2945 }, { "epoch": 0.9421735393471786, "grad_norm": 0.34749627113342285, "learning_rate": 5.260372099220289e-06, "loss": 3.8649, "step": 2946 }, { "epoch": 0.9424933538547642, "grad_norm": 0.31855520606040955, "learning_rate": 5.202575133656039e-06, "loss": 3.9171, "step": 2947 }, { "epoch": 0.9428131683623499, "grad_norm": 0.31797823309898376, "learning_rate": 5.145094661739746e-06, "loss": 3.8342, "step": 2948 }, { "epoch": 0.9431329828699354, "grad_norm": 0.3786947429180145, "learning_rate": 5.087930745182278e-06, "loss": 3.856, "step": 2949 }, { "epoch": 0.9434527973775211, "grad_norm": 0.34037601947784424, "learning_rate": 5.031083445354644e-06, "loss": 3.8433, "step": 2950 }, { "epoch": 0.9437726118851066, "grad_norm": 0.3460736572742462, "learning_rate": 4.9745528232879915e-06, "loss": 3.81, "step": 2951 }, { "epoch": 0.9440924263926922, "grad_norm": 0.4227440357208252, "learning_rate": 4.918338939673372e-06, "loss": 3.9432, "step": 2952 }, { "epoch": 0.9444122409002779, "grad_norm": 0.40755754709243774, "learning_rate": 4.862441854861809e-06, "loss": 3.865, "step": 2953 }, { "epoch": 0.9447320554078634, "grad_norm": 0.4231763780117035, "learning_rate": 4.806861628864333e-06, "loss": 3.9071, "step": 2954 }, { "epoch": 0.9450518699154491, "grad_norm": 0.34026941657066345, "learning_rate": 4.751598321351679e-06, "loss": 3.8805, "step": 2955 }, { "epoch": 0.9453716844230347, "grad_norm": 0.3763134479522705, "learning_rate": 4.6966519916543875e-06, "loss": 3.802, "step": 2956 }, { "epoch": 0.9456914989306202, "grad_norm": 0.32251763343811035, "learning_rate": 4.642022698762638e-06, "loss": 3.8018, "step": 2957 }, { "epoch": 0.9460113134382059, "grad_norm": 0.38308969140052795, "learning_rate": 4.5877105013262805e-06, "loss": 3.7695, "step": 2958 }, { "epoch": 0.9463311279457914, "grad_norm": 0.3079821467399597, "learning_rate": 4.533715457654741e-06, "loss": 3.8824, "step": 2959 }, { "epoch": 0.946650942453377, "grad_norm": 0.4104771018028259, "learning_rate": 4.480037625716981e-06, "loss": 3.7921, "step": 2960 }, { "epoch": 0.9469707569609627, "grad_norm": 0.37571239471435547, "learning_rate": 4.4266770631413374e-06, "loss": 3.8874, "step": 2961 }, { "epoch": 0.9472905714685482, "grad_norm": 0.35999199748039246, "learning_rate": 4.373633827215517e-06, "loss": 3.9927, "step": 2962 }, { "epoch": 0.9476103859761339, "grad_norm": 0.36155542731285095, "learning_rate": 4.3209079748866e-06, "loss": 3.7778, "step": 2963 }, { "epoch": 0.9479302004837195, "grad_norm": 0.3367122411727905, "learning_rate": 4.268499562760907e-06, "loss": 3.8607, "step": 2964 }, { "epoch": 0.948250014991305, "grad_norm": 0.32927775382995605, "learning_rate": 4.216408647103997e-06, "loss": 3.8713, "step": 2965 }, { "epoch": 0.9485698294988907, "grad_norm": 0.39797112345695496, "learning_rate": 4.164635283840468e-06, "loss": 3.8595, "step": 2966 }, { "epoch": 0.9488896440064762, "grad_norm": 0.3358381688594818, "learning_rate": 4.113179528554089e-06, "loss": 3.8629, "step": 2967 }, { "epoch": 0.9492094585140618, "grad_norm": 0.3226361870765686, "learning_rate": 4.062041436487573e-06, "loss": 3.8626, "step": 2968 }, { "epoch": 0.9495292730216475, "grad_norm": 0.3047392666339874, "learning_rate": 4.011221062542636e-06, "loss": 3.7564, "step": 2969 }, { "epoch": 0.949849087529233, "grad_norm": 0.4025292992591858, "learning_rate": 3.9607184612799325e-06, "loss": 3.8724, "step": 2970 }, { "epoch": 0.9501689020368187, "grad_norm": 0.3585968613624573, "learning_rate": 3.910533686918826e-06, "loss": 3.8774, "step": 2971 }, { "epoch": 0.9504887165444043, "grad_norm": 0.3131186068058014, "learning_rate": 3.860666793337585e-06, "loss": 3.8145, "step": 2972 }, { "epoch": 0.9508085310519898, "grad_norm": 0.4556500315666199, "learning_rate": 3.811117834073152e-06, "loss": 3.8048, "step": 2973 }, { "epoch": 0.9511283455595755, "grad_norm": 0.3522467017173767, "learning_rate": 3.761886862321173e-06, "loss": 3.8226, "step": 2974 }, { "epoch": 0.951448160067161, "grad_norm": 0.44986897706985474, "learning_rate": 3.7129739309358362e-06, "loss": 3.798, "step": 2975 }, { "epoch": 0.9517679745747466, "grad_norm": 0.39320096373558044, "learning_rate": 3.664379092429903e-06, "loss": 3.8119, "step": 2976 }, { "epoch": 0.9520877890823323, "grad_norm": 0.377770334482193, "learning_rate": 3.6161023989747075e-06, "loss": 3.9129, "step": 2977 }, { "epoch": 0.9524076035899178, "grad_norm": 0.36752569675445557, "learning_rate": 3.5681439023999224e-06, "loss": 3.8372, "step": 2978 }, { "epoch": 0.9527274180975035, "grad_norm": 0.32876789569854736, "learning_rate": 3.5205036541936626e-06, "loss": 3.8526, "step": 2979 }, { "epoch": 0.9530472326050891, "grad_norm": 0.33512139320373535, "learning_rate": 3.4731817055023812e-06, "loss": 3.8257, "step": 2980 }, { "epoch": 0.9533670471126746, "grad_norm": 0.3675519824028015, "learning_rate": 3.4261781071307393e-06, "loss": 3.8223, "step": 2981 }, { "epoch": 0.9536868616202603, "grad_norm": 0.3107050061225891, "learning_rate": 3.3794929095417034e-06, "loss": 3.8696, "step": 2982 }, { "epoch": 0.9540066761278458, "grad_norm": 0.6994888186454773, "learning_rate": 3.3331261628563145e-06, "loss": 3.7948, "step": 2983 }, { "epoch": 0.9543264906354314, "grad_norm": 0.3883497416973114, "learning_rate": 3.2870779168538196e-06, "loss": 3.8204, "step": 2984 }, { "epoch": 0.9546463051430171, "grad_norm": 0.3863656222820282, "learning_rate": 3.2413482209714737e-06, "loss": 3.779, "step": 2985 }, { "epoch": 0.9549661196506026, "grad_norm": 0.33559244871139526, "learning_rate": 3.195937124304504e-06, "loss": 3.8896, "step": 2986 }, { "epoch": 0.9552859341581883, "grad_norm": 0.35432329773902893, "learning_rate": 3.150844675606212e-06, "loss": 3.7534, "step": 2987 }, { "epoch": 0.9556057486657739, "grad_norm": 0.39576801657676697, "learning_rate": 3.10607092328764e-06, "loss": 3.8389, "step": 2988 }, { "epoch": 0.9559255631733594, "grad_norm": 0.3919737637042999, "learning_rate": 3.0616159154177366e-06, "loss": 3.8149, "step": 2989 }, { "epoch": 0.9562453776809451, "grad_norm": 0.3739626407623291, "learning_rate": 3.0174796997233908e-06, "loss": 3.8717, "step": 2990 }, { "epoch": 0.9565651921885306, "grad_norm": 0.3722630739212036, "learning_rate": 2.973662323588999e-06, "loss": 3.8096, "step": 2991 }, { "epoch": 0.9568850066961162, "grad_norm": 0.3538426458835602, "learning_rate": 2.930163834056831e-06, "loss": 3.9325, "step": 2992 }, { "epoch": 0.9572048212037019, "grad_norm": 0.3393118381500244, "learning_rate": 2.8869842778266983e-06, "loss": 3.893, "step": 2993 }, { "epoch": 0.9575246357112874, "grad_norm": 0.4572709798812866, "learning_rate": 2.844123701256051e-06, "loss": 3.9051, "step": 2994 }, { "epoch": 0.9578444502188731, "grad_norm": 0.3282608389854431, "learning_rate": 2.801582150359882e-06, "loss": 3.7759, "step": 2995 }, { "epoch": 0.9581642647264587, "grad_norm": 0.4027000367641449, "learning_rate": 2.7593596708106904e-06, "loss": 3.7417, "step": 2996 }, { "epoch": 0.9584840792340442, "grad_norm": 0.3149329125881195, "learning_rate": 2.717456307938415e-06, "loss": 3.8386, "step": 2997 }, { "epoch": 0.9588038937416299, "grad_norm": 0.3250699043273926, "learning_rate": 2.6758721067303367e-06, "loss": 3.886, "step": 2998 }, { "epoch": 0.9591237082492154, "grad_norm": 0.3039015531539917, "learning_rate": 2.634607111831177e-06, "loss": 3.8364, "step": 2999 }, { "epoch": 0.959443522756801, "grad_norm": 0.33608779311180115, "learning_rate": 2.5936613675428985e-06, "loss": 3.792, "step": 3000 }, { "epoch": 0.959443522756801, "eval_runtime": 51.687, "eval_samples_per_second": 36.702, "eval_steps_per_second": 9.19, "step": 3000 }, { "epoch": 0.9597633372643867, "grad_norm": 0.41396185755729675, "learning_rate": 2.5530349178247033e-06, "loss": 3.8596, "step": 3001 }, { "epoch": 0.9600831517719722, "grad_norm": 0.33457860350608826, "learning_rate": 2.512727806293069e-06, "loss": 3.8754, "step": 3002 }, { "epoch": 0.9604029662795579, "grad_norm": 0.36531147360801697, "learning_rate": 2.4727400762215798e-06, "loss": 3.7873, "step": 3003 }, { "epoch": 0.9607227807871435, "grad_norm": 0.38964322209358215, "learning_rate": 2.4330717705409287e-06, "loss": 3.7481, "step": 3004 }, { "epoch": 0.961042595294729, "grad_norm": 0.4089541733264923, "learning_rate": 2.393722931838882e-06, "loss": 3.8792, "step": 3005 }, { "epoch": 0.9613624098023147, "grad_norm": 0.4337267279624939, "learning_rate": 2.3546936023603134e-06, "loss": 3.9024, "step": 3006 }, { "epoch": 0.9616822243099002, "grad_norm": 0.4201469123363495, "learning_rate": 2.315983824006906e-06, "loss": 3.888, "step": 3007 }, { "epoch": 0.9620020388174858, "grad_norm": 0.36949995160102844, "learning_rate": 2.277593638337416e-06, "loss": 3.8194, "step": 3008 }, { "epoch": 0.9623218533250715, "grad_norm": 0.337776780128479, "learning_rate": 2.2395230865674075e-06, "loss": 3.8213, "step": 3009 }, { "epoch": 0.962641667832657, "grad_norm": 0.35747015476226807, "learning_rate": 2.201772209569319e-06, "loss": 3.8493, "step": 3010 }, { "epoch": 0.9629614823402427, "grad_norm": 0.3196185231208801, "learning_rate": 2.164341047872398e-06, "loss": 3.8886, "step": 3011 }, { "epoch": 0.9632812968478283, "grad_norm": 0.33058109879493713, "learning_rate": 2.127229641662598e-06, "loss": 3.8166, "step": 3012 }, { "epoch": 0.9636011113554138, "grad_norm": 0.34766048192977905, "learning_rate": 2.0904380307826483e-06, "loss": 3.8286, "step": 3013 }, { "epoch": 0.9639209258629995, "grad_norm": 0.3611506521701813, "learning_rate": 2.053966254731887e-06, "loss": 3.8307, "step": 3014 }, { "epoch": 0.964240740370585, "grad_norm": 0.39147886633872986, "learning_rate": 2.0178143526663248e-06, "loss": 3.9441, "step": 3015 }, { "epoch": 0.9645605548781706, "grad_norm": 0.3689884841442108, "learning_rate": 1.981982363398549e-06, "loss": 3.9108, "step": 3016 }, { "epoch": 0.9648803693857563, "grad_norm": 0.33246079087257385, "learning_rate": 1.9464703253976533e-06, "loss": 3.7631, "step": 3017 }, { "epoch": 0.9652001838933418, "grad_norm": 0.3382446765899658, "learning_rate": 1.911278276789241e-06, "loss": 3.7968, "step": 3018 }, { "epoch": 0.9655199984009275, "grad_norm": 0.5021396279335022, "learning_rate": 1.8764062553554227e-06, "loss": 3.739, "step": 3019 }, { "epoch": 0.9658398129085131, "grad_norm": 0.3148387670516968, "learning_rate": 1.8418542985347174e-06, "loss": 3.92, "step": 3020 }, { "epoch": 0.9661596274160986, "grad_norm": 0.41547971963882446, "learning_rate": 1.8076224434219523e-06, "loss": 3.7999, "step": 3021 }, { "epoch": 0.9664794419236843, "grad_norm": 0.5084079504013062, "learning_rate": 1.773710726768396e-06, "loss": 3.8218, "step": 3022 }, { "epoch": 0.9667992564312698, "grad_norm": 0.4002964198589325, "learning_rate": 1.7401191849815255e-06, "loss": 3.8712, "step": 3023 }, { "epoch": 0.9671190709388555, "grad_norm": 0.3130205273628235, "learning_rate": 1.7068478541251263e-06, "loss": 3.8616, "step": 3024 }, { "epoch": 0.9674388854464411, "grad_norm": 0.37930676341056824, "learning_rate": 1.673896769919192e-06, "loss": 3.8008, "step": 3025 }, { "epoch": 0.9677586999540266, "grad_norm": 0.31064632534980774, "learning_rate": 1.6412659677399908e-06, "loss": 3.855, "step": 3026 }, { "epoch": 0.9680785144616123, "grad_norm": 0.4104081988334656, "learning_rate": 1.608955482619767e-06, "loss": 3.8593, "step": 3027 }, { "epoch": 0.9683983289691979, "grad_norm": 0.3251653015613556, "learning_rate": 1.5769653492470057e-06, "loss": 3.8259, "step": 3028 }, { "epoch": 0.9687181434767834, "grad_norm": 0.3425891399383545, "learning_rate": 1.5452956019661678e-06, "loss": 3.8136, "step": 3029 }, { "epoch": 0.9690379579843691, "grad_norm": 0.4805048108100891, "learning_rate": 1.5139462747778885e-06, "loss": 3.8373, "step": 3030 }, { "epoch": 0.9693577724919547, "grad_norm": 0.36025211215019226, "learning_rate": 1.4829174013386126e-06, "loss": 3.816, "step": 3031 }, { "epoch": 0.9696775869995403, "grad_norm": 0.34036335349082947, "learning_rate": 1.4522090149609256e-06, "loss": 3.831, "step": 3032 }, { "epoch": 0.9699974015071259, "grad_norm": 0.3695327341556549, "learning_rate": 1.4218211486132558e-06, "loss": 3.8908, "step": 3033 }, { "epoch": 0.9703172160147114, "grad_norm": 0.31533119082450867, "learning_rate": 1.3917538349198731e-06, "loss": 3.8586, "step": 3034 }, { "epoch": 0.9706370305222971, "grad_norm": 0.32432880997657776, "learning_rate": 1.3620071061609894e-06, "loss": 3.8027, "step": 3035 }, { "epoch": 0.9709568450298827, "grad_norm": 0.40443745255470276, "learning_rate": 1.332580994272625e-06, "loss": 3.7668, "step": 3036 }, { "epoch": 0.9712766595374682, "grad_norm": 0.3637508153915405, "learning_rate": 1.3034755308465428e-06, "loss": 3.9281, "step": 3037 }, { "epoch": 0.9715964740450539, "grad_norm": 0.43956559896469116, "learning_rate": 1.2746907471302803e-06, "loss": 3.8238, "step": 3038 }, { "epoch": 0.9719162885526395, "grad_norm": 0.3777841627597809, "learning_rate": 1.2462266740270843e-06, "loss": 3.8142, "step": 3039 }, { "epoch": 0.9722361030602251, "grad_norm": 0.3405802845954895, "learning_rate": 1.2180833420959436e-06, "loss": 3.8904, "step": 3040 }, { "epoch": 0.9725559175678107, "grad_norm": 0.327116459608078, "learning_rate": 1.190260781551422e-06, "loss": 3.8132, "step": 3041 }, { "epoch": 0.9728757320753962, "grad_norm": 0.31746113300323486, "learning_rate": 1.1627590222637594e-06, "loss": 3.8479, "step": 3042 }, { "epoch": 0.9731955465829819, "grad_norm": 0.3245218098163605, "learning_rate": 1.1355780937587378e-06, "loss": 3.8363, "step": 3043 }, { "epoch": 0.9735153610905675, "grad_norm": 0.3749473989009857, "learning_rate": 1.1087180252177475e-06, "loss": 3.7869, "step": 3044 }, { "epoch": 0.973835175598153, "grad_norm": 0.6619917750358582, "learning_rate": 1.0821788454776548e-06, "loss": 3.8939, "step": 3045 }, { "epoch": 0.9741549901057387, "grad_norm": 0.3483913540840149, "learning_rate": 1.0559605830308682e-06, "loss": 3.8447, "step": 3046 }, { "epoch": 0.9744748046133243, "grad_norm": 0.5605376958847046, "learning_rate": 1.030063266025205e-06, "loss": 3.8284, "step": 3047 }, { "epoch": 0.9747946191209099, "grad_norm": 0.35739341378211975, "learning_rate": 1.0044869222639917e-06, "loss": 3.8944, "step": 3048 }, { "epoch": 0.9751144336284955, "grad_norm": 0.34825676679611206, "learning_rate": 9.79231579205897e-07, "loss": 3.8178, "step": 3049 }, { "epoch": 0.975434248136081, "grad_norm": 0.3362109065055847, "learning_rate": 9.54297263964965e-07, "loss": 3.7137, "step": 3050 }, { "epoch": 0.9757540626436667, "grad_norm": 0.3090246319770813, "learning_rate": 9.29684003310649e-07, "loss": 3.9184, "step": 3051 }, { "epoch": 0.9760738771512523, "grad_norm": 0.4439792037010193, "learning_rate": 9.053918236676116e-07, "loss": 3.9061, "step": 3052 }, { "epoch": 0.9763936916588378, "grad_norm": 0.4872496724128723, "learning_rate": 8.814207511159243e-07, "loss": 3.8985, "step": 3053 }, { "epoch": 0.9767135061664235, "grad_norm": 0.3381938934326172, "learning_rate": 8.577708113908011e-07, "loss": 3.9371, "step": 3054 }, { "epoch": 0.977033320674009, "grad_norm": 0.3490566611289978, "learning_rate": 8.344420298827981e-07, "loss": 3.8793, "step": 3055 }, { "epoch": 0.9773531351815947, "grad_norm": 0.33192285895347595, "learning_rate": 8.114344316376143e-07, "loss": 3.9263, "step": 3056 }, { "epoch": 0.9776729496891803, "grad_norm": 0.38641390204429626, "learning_rate": 7.887480413561243e-07, "loss": 3.8854, "step": 3057 }, { "epoch": 0.9779927641967658, "grad_norm": 0.32289981842041016, "learning_rate": 7.663828833943786e-07, "loss": 3.8398, "step": 3058 }, { "epoch": 0.9783125787043515, "grad_norm": 0.36082369089126587, "learning_rate": 7.443389817635371e-07, "loss": 3.8446, "step": 3059 }, { "epoch": 0.9786323932119371, "grad_norm": 0.337561696767807, "learning_rate": 7.226163601298685e-07, "loss": 3.8269, "step": 3060 }, { "epoch": 0.9789522077195226, "grad_norm": 0.3223322927951813, "learning_rate": 7.01215041814751e-07, "loss": 3.9162, "step": 3061 }, { "epoch": 0.9792720222271083, "grad_norm": 0.35244524478912354, "learning_rate": 6.801350497945391e-07, "loss": 3.864, "step": 3062 }, { "epoch": 0.9795918367346939, "grad_norm": 0.4898795485496521, "learning_rate": 6.593764067006624e-07, "loss": 3.7289, "step": 3063 }, { "epoch": 0.9799116512422795, "grad_norm": 0.3561013638973236, "learning_rate": 6.389391348195272e-07, "loss": 3.8887, "step": 3064 }, { "epoch": 0.9802314657498651, "grad_norm": 0.3641793131828308, "learning_rate": 6.188232560925155e-07, "loss": 3.7782, "step": 3065 }, { "epoch": 0.9805512802574506, "grad_norm": 0.2989286184310913, "learning_rate": 5.990287921160186e-07, "loss": 3.8795, "step": 3066 }, { "epoch": 0.9808710947650363, "grad_norm": 0.3340880870819092, "learning_rate": 5.79555764141304e-07, "loss": 3.816, "step": 3067 }, { "epoch": 0.9811909092726219, "grad_norm": 0.487589955329895, "learning_rate": 5.604041930745485e-07, "loss": 3.8868, "step": 3068 }, { "epoch": 0.9815107237802074, "grad_norm": 0.36624425649642944, "learning_rate": 5.415740994768048e-07, "loss": 3.9517, "step": 3069 }, { "epoch": 0.9818305382877931, "grad_norm": 0.3947308361530304, "learning_rate": 5.230655035640352e-07, "loss": 3.81, "step": 3070 }, { "epoch": 0.9821503527953787, "grad_norm": 0.5169360637664795, "learning_rate": 5.048784252069782e-07, "loss": 3.869, "step": 3071 }, { "epoch": 0.9824701673029643, "grad_norm": 0.42947855591773987, "learning_rate": 4.870128839312815e-07, "loss": 3.8745, "step": 3072 }, { "epoch": 0.9827899818105499, "grad_norm": 0.3206599950790405, "learning_rate": 4.6946889891726903e-07, "loss": 3.8055, "step": 3073 }, { "epoch": 0.9831097963181354, "grad_norm": 0.4280754327774048, "learning_rate": 4.5224648900017424e-07, "loss": 3.8546, "step": 3074 }, { "epoch": 0.9834296108257211, "grad_norm": 0.4488411843776703, "learning_rate": 4.353456726699067e-07, "loss": 3.8407, "step": 3075 }, { "epoch": 0.9837494253333067, "grad_norm": 0.4452347159385681, "learning_rate": 4.1876646807111893e-07, "loss": 3.7843, "step": 3076 }, { "epoch": 0.9840692398408922, "grad_norm": 0.37414631247520447, "learning_rate": 4.025088930031728e-07, "loss": 3.7777, "step": 3077 }, { "epoch": 0.9843890543484779, "grad_norm": 0.30453184247016907, "learning_rate": 3.8657296492023984e-07, "loss": 3.7954, "step": 3078 }, { "epoch": 0.9847088688560635, "grad_norm": 0.31287437677383423, "learning_rate": 3.709587009309678e-07, "loss": 3.7747, "step": 3079 }, { "epoch": 0.9850286833636491, "grad_norm": 0.3087519705295563, "learning_rate": 3.5566611779888066e-07, "loss": 3.7772, "step": 3080 }, { "epoch": 0.9853484978712347, "grad_norm": 0.3278191387653351, "learning_rate": 3.406952319420453e-07, "loss": 3.6379, "step": 3081 }, { "epoch": 0.9856683123788202, "grad_norm": 0.42179006338119507, "learning_rate": 3.260460594330716e-07, "loss": 3.8773, "step": 3082 }, { "epoch": 0.9859881268864059, "grad_norm": 0.31324389576911926, "learning_rate": 3.1171861599937896e-07, "loss": 3.9099, "step": 3083 }, { "epoch": 0.9863079413939915, "grad_norm": 0.3378654420375824, "learning_rate": 2.9771291702279655e-07, "loss": 3.9229, "step": 3084 }, { "epoch": 0.986627755901577, "grad_norm": 0.3062484562397003, "learning_rate": 2.840289775398297e-07, "loss": 3.8824, "step": 3085 }, { "epoch": 0.9869475704091627, "grad_norm": 0.3244476020336151, "learning_rate": 2.7066681224149344e-07, "loss": 3.8655, "step": 3086 }, { "epoch": 0.9872673849167483, "grad_norm": 0.31210950016975403, "learning_rate": 2.5762643547337924e-07, "loss": 3.8447, "step": 3087 }, { "epoch": 0.9875871994243339, "grad_norm": 0.34307006001472473, "learning_rate": 2.4490786123562144e-07, "loss": 3.8307, "step": 3088 }, { "epoch": 0.9879070139319195, "grad_norm": 0.44219186902046204, "learning_rate": 2.3251110318283083e-07, "loss": 3.7925, "step": 3089 }, { "epoch": 0.988226828439505, "grad_norm": 0.33055782318115234, "learning_rate": 2.204361746241279e-07, "loss": 3.8632, "step": 3090 }, { "epoch": 0.9885466429470907, "grad_norm": 0.4885416626930237, "learning_rate": 2.0868308852310943e-07, "loss": 3.811, "step": 3091 }, { "epoch": 0.9888664574546763, "grad_norm": 0.3083757162094116, "learning_rate": 1.9725185749784879e-07, "loss": 3.8244, "step": 3092 }, { "epoch": 0.9891862719622619, "grad_norm": 0.34024709463119507, "learning_rate": 1.861424938208955e-07, "loss": 3.7605, "step": 3093 }, { "epoch": 0.9895060864698475, "grad_norm": 0.4302571713924408, "learning_rate": 1.753550094192424e-07, "loss": 3.8423, "step": 3094 }, { "epoch": 0.9898259009774331, "grad_norm": 0.3134204149246216, "learning_rate": 1.6488941587429193e-07, "loss": 3.8345, "step": 3095 }, { "epoch": 0.9901457154850187, "grad_norm": 0.3406553268432617, "learning_rate": 1.547457244218564e-07, "loss": 3.8085, "step": 3096 }, { "epoch": 0.9904655299926043, "grad_norm": 0.42396512627601624, "learning_rate": 1.4492394595219115e-07, "loss": 3.7839, "step": 3097 }, { "epoch": 0.9907853445001898, "grad_norm": 0.39441990852355957, "learning_rate": 1.3542409100992802e-07, "loss": 3.8271, "step": 3098 }, { "epoch": 0.9911051590077755, "grad_norm": 0.3295893967151642, "learning_rate": 1.2624616979407532e-07, "loss": 3.9037, "step": 3099 }, { "epoch": 0.9914249735153611, "grad_norm": 0.3320423364639282, "learning_rate": 1.1739019215801781e-07, "loss": 3.899, "step": 3100 }, { "epoch": 0.9914249735153611, "eval_runtime": 50.6501, "eval_samples_per_second": 37.453, "eval_steps_per_second": 9.378, "step": 3100 }, { "epoch": 0.9917447880229467, "grad_norm": 0.34247344732284546, "learning_rate": 1.0885616760951676e-07, "loss": 3.7877, "step": 3101 }, { "epoch": 0.9920646025305323, "grad_norm": 0.47989964485168457, "learning_rate": 1.0064410531067657e-07, "loss": 3.8206, "step": 3102 }, { "epoch": 0.9923844170381179, "grad_norm": 0.3649783432483673, "learning_rate": 9.27540140779448e-08, "loss": 3.9293, "step": 3103 }, { "epoch": 0.9927042315457035, "grad_norm": 0.3627043664455414, "learning_rate": 8.51859023821122e-08, "loss": 3.8537, "step": 3104 }, { "epoch": 0.9930240460532891, "grad_norm": 0.31545159220695496, "learning_rate": 7.793977834824605e-08, "loss": 3.8517, "step": 3105 }, { "epoch": 0.9933438605608746, "grad_norm": 0.3834705054759979, "learning_rate": 7.101564975579011e-08, "loss": 3.8426, "step": 3106 }, { "epoch": 0.9936636750684603, "grad_norm": 0.44169601798057556, "learning_rate": 6.441352403849798e-08, "loss": 3.8617, "step": 3107 }, { "epoch": 0.9939834895760459, "grad_norm": 0.4339190423488617, "learning_rate": 5.813340828429991e-08, "loss": 3.838, "step": 3108 }, { "epoch": 0.9943033040836315, "grad_norm": 0.3510226905345917, "learning_rate": 5.217530923560254e-08, "loss": 3.7939, "step": 3109 }, { "epoch": 0.9946231185912171, "grad_norm": 0.32680079340934753, "learning_rate": 4.6539233288955816e-08, "loss": 3.8582, "step": 3110 }, { "epoch": 0.9949429330988027, "grad_norm": 0.36098724603652954, "learning_rate": 4.122518649525286e-08, "loss": 3.8123, "step": 3111 }, { "epoch": 0.9952627476063883, "grad_norm": 0.3424474000930786, "learning_rate": 3.623317455959673e-08, "loss": 3.8691, "step": 3112 }, { "epoch": 0.9955825621139739, "grad_norm": 0.5065611600875854, "learning_rate": 3.156320284146696e-08, "loss": 3.7082, "step": 3113 }, { "epoch": 0.9959023766215594, "grad_norm": 0.36753055453300476, "learning_rate": 2.7215276354486393e-08, "loss": 3.8286, "step": 3114 }, { "epoch": 0.9962221911291451, "grad_norm": 0.374754935503006, "learning_rate": 2.3189399766587735e-08, "loss": 3.7989, "step": 3115 }, { "epoch": 0.9965420056367307, "grad_norm": 0.3200516104698181, "learning_rate": 1.948557739994694e-08, "loss": 3.9148, "step": 3116 }, { "epoch": 0.9968618201443163, "grad_norm": 0.34902477264404297, "learning_rate": 1.6103813230949892e-08, "loss": 3.808, "step": 3117 }, { "epoch": 0.9971816346519019, "grad_norm": 0.30570390820503235, "learning_rate": 1.3044110890292336e-08, "loss": 3.8949, "step": 3118 }, { "epoch": 0.9975014491594875, "grad_norm": 0.3514823317527771, "learning_rate": 1.0306473662813341e-08, "loss": 3.8389, "step": 3119 }, { "epoch": 0.9978212636670731, "grad_norm": 0.4388292133808136, "learning_rate": 7.89090448766183e-09, "loss": 3.8029, "step": 3120 }, { "epoch": 0.9981410781746587, "grad_norm": 0.43854770064353943, "learning_rate": 5.7974059581633595e-09, "loss": 3.88, "step": 3121 }, { "epoch": 0.9984608926822442, "grad_norm": 0.3822016716003418, "learning_rate": 4.02598032192003e-09, "loss": 3.8692, "step": 3122 }, { "epoch": 0.9987807071898299, "grad_norm": 0.31966227293014526, "learning_rate": 2.5766294807438858e-09, "loss": 3.7572, "step": 3123 }, { "epoch": 0.9991005216974155, "grad_norm": 0.46040087938308716, "learning_rate": 1.4493549905902902e-09, "loss": 3.8576, "step": 3124 }, { "epoch": 0.9994203362050011, "grad_norm": 0.2964051067829132, "learning_rate": 6.441580617577713e-10, "loss": 3.8289, "step": 3125 }, { "epoch": 0.9997401507125867, "grad_norm": 0.5804516673088074, "learning_rate": 1.6103955865487407e-10, "loss": 3.8162, "step": 3126 }, { "epoch": 0.9997401507125867, "step": 3126, "total_flos": 2.622826277901435e+17, "train_loss": 4.408677676360117, "train_runtime": 28395.2688, "train_samples_per_second": 7.047, "train_steps_per_second": 0.11 }, { "epoch": 0.9997401507125867, "eval_runtime": 55.4062, "eval_samples_per_second": 34.238, "eval_steps_per_second": 8.573, "step": 3126 } ], "logging_steps": 1, "max_steps": 3126, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.622826277901435e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }