{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999310276874569, "eval_steps": 400, "global_step": 11415, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005255033336617729, "grad_norm": null, "learning_rate": 0.0, "loss": 9.6506, "step": 2 }, { "epoch": 0.0010510066673235458, "grad_norm": 37.969757080078125, "learning_rate": 9.999123882950763e-05, "loss": 9.7103, "step": 4 }, { "epoch": 0.0015765100009853187, "grad_norm": 24.103715896606445, "learning_rate": 9.997371648852288e-05, "loss": 7.2578, "step": 6 }, { "epoch": 0.0021020133346470915, "grad_norm": 9.479035377502441, "learning_rate": 9.995619414753812e-05, "loss": 6.5928, "step": 8 }, { "epoch": 0.0026275166683088647, "grad_norm": 9.890775680541992, "learning_rate": 9.993867180655336e-05, "loss": 6.2815, "step": 10 }, { "epoch": 0.0031530200019706375, "grad_norm": 8.57809829711914, "learning_rate": 9.99211494655686e-05, "loss": 6.1121, "step": 12 }, { "epoch": 0.0036785233356324103, "grad_norm": 7.994872093200684, "learning_rate": 9.990362712458385e-05, "loss": 5.9755, "step": 14 }, { "epoch": 0.004204026669294183, "grad_norm": 7.026456356048584, "learning_rate": 9.988610478359909e-05, "loss": 5.8224, "step": 16 }, { "epoch": 0.004729530002955957, "grad_norm": 5.809737682342529, "learning_rate": 9.986858244261433e-05, "loss": 5.7188, "step": 18 }, { "epoch": 0.0052550333366177295, "grad_norm": 6.866558074951172, "learning_rate": 9.985106010162958e-05, "loss": 5.6471, "step": 20 }, { "epoch": 0.005780536670279502, "grad_norm": 6.097131252288818, "learning_rate": 9.983353776064483e-05, "loss": 5.5565, "step": 22 }, { "epoch": 0.006306040003941275, "grad_norm": 6.54796838760376, "learning_rate": 9.981601541966006e-05, "loss": 5.5451, "step": 24 }, { "epoch": 0.006831543337603048, "grad_norm": 5.477590560913086, "learning_rate": 9.979849307867531e-05, "loss": 5.4577, "step": 26 }, { "epoch": 0.0073570466712648205, 
"grad_norm": 5.943256378173828, "learning_rate": 9.978097073769056e-05, "loss": 5.4583, "step": 28 }, { "epoch": 0.007882550004926594, "grad_norm": 4.521725654602051, "learning_rate": 9.97634483967058e-05, "loss": 5.4021, "step": 30 }, { "epoch": 0.008408053338588366, "grad_norm": 4.905904293060303, "learning_rate": 9.974592605572105e-05, "loss": 5.3635, "step": 32 }, { "epoch": 0.00893355667225014, "grad_norm": 5.947779655456543, "learning_rate": 9.97284037147363e-05, "loss": 5.3016, "step": 34 }, { "epoch": 0.009459060005911913, "grad_norm": 4.351555824279785, "learning_rate": 9.971088137375154e-05, "loss": 5.3334, "step": 36 }, { "epoch": 0.009984563339573685, "grad_norm": 5.503723621368408, "learning_rate": 9.969335903276678e-05, "loss": 5.2841, "step": 38 }, { "epoch": 0.010510066673235459, "grad_norm": 4.78935432434082, "learning_rate": 9.967583669178203e-05, "loss": 5.2653, "step": 40 }, { "epoch": 0.01103557000689723, "grad_norm": 3.698417901992798, "learning_rate": 9.965831435079726e-05, "loss": 5.2492, "step": 42 }, { "epoch": 0.011561073340559004, "grad_norm": 4.703723907470703, "learning_rate": 9.964079200981251e-05, "loss": 5.2477, "step": 44 }, { "epoch": 0.012086576674220776, "grad_norm": 5.180492401123047, "learning_rate": 9.962326966882776e-05, "loss": 5.1948, "step": 46 }, { "epoch": 0.01261208000788255, "grad_norm": 4.127843856811523, "learning_rate": 9.960574732784301e-05, "loss": 5.1441, "step": 48 }, { "epoch": 0.013137583341544324, "grad_norm": 3.8251705169677734, "learning_rate": 9.958822498685824e-05, "loss": 5.2034, "step": 50 }, { "epoch": 0.013663086675206096, "grad_norm": 3.3737661838531494, "learning_rate": 9.957070264587349e-05, "loss": 5.1664, "step": 52 }, { "epoch": 0.01418859000886787, "grad_norm": 4.652468681335449, "learning_rate": 9.955318030488874e-05, "loss": 5.1389, "step": 54 }, { "epoch": 0.014714093342529641, "grad_norm": 4.0360307693481445, "learning_rate": 9.953565796390398e-05, "loss": 5.11, "step": 56 }, { "epoch": 
0.015239596676191415, "grad_norm": 4.159426689147949, "learning_rate": 9.951813562291923e-05, "loss": 5.1039, "step": 58 }, { "epoch": 0.01576510000985319, "grad_norm": 3.732241153717041, "learning_rate": 9.950061328193448e-05, "loss": 5.1089, "step": 60 }, { "epoch": 0.01629060334351496, "grad_norm": 4.215841293334961, "learning_rate": 9.948309094094971e-05, "loss": 5.078, "step": 62 }, { "epoch": 0.016816106677176732, "grad_norm": 3.332622528076172, "learning_rate": 9.946556859996496e-05, "loss": 5.0974, "step": 64 }, { "epoch": 0.017341610010838508, "grad_norm": 4.1641764640808105, "learning_rate": 9.944804625898021e-05, "loss": 5.029, "step": 66 }, { "epoch": 0.01786711334450028, "grad_norm": 3.6099252700805664, "learning_rate": 9.943052391799544e-05, "loss": 5.0712, "step": 68 }, { "epoch": 0.01839261667816205, "grad_norm": 3.314837694168091, "learning_rate": 9.941300157701069e-05, "loss": 5.0361, "step": 70 }, { "epoch": 0.018918120011823827, "grad_norm": 3.7522711753845215, "learning_rate": 9.939547923602594e-05, "loss": 5.0595, "step": 72 }, { "epoch": 0.0194436233454856, "grad_norm": 3.479856491088867, "learning_rate": 9.937795689504118e-05, "loss": 5.0458, "step": 74 }, { "epoch": 0.01996912667914737, "grad_norm": 4.363260269165039, "learning_rate": 9.936043455405642e-05, "loss": 5.0355, "step": 76 }, { "epoch": 0.020494630012809142, "grad_norm": 3.0860562324523926, "learning_rate": 9.934291221307167e-05, "loss": 5.0104, "step": 78 }, { "epoch": 0.021020133346470918, "grad_norm": 3.978928327560425, "learning_rate": 9.932538987208691e-05, "loss": 5.0099, "step": 80 }, { "epoch": 0.02154563668013269, "grad_norm": 3.7484583854675293, "learning_rate": 9.930786753110216e-05, "loss": 4.964, "step": 82 }, { "epoch": 0.02207114001379446, "grad_norm": 2.865388870239258, "learning_rate": 9.929034519011741e-05, "loss": 4.9169, "step": 84 }, { "epoch": 0.022596643347456237, "grad_norm": 2.627261161804199, "learning_rate": 9.927282284913266e-05, "loss": 4.9835, 
"step": 86 }, { "epoch": 0.02312214668111801, "grad_norm": 3.3558332920074463, "learning_rate": 9.925530050814789e-05, "loss": 4.9638, "step": 88 }, { "epoch": 0.02364765001477978, "grad_norm": 3.5611279010772705, "learning_rate": 9.923777816716314e-05, "loss": 4.9904, "step": 90 }, { "epoch": 0.024173153348441553, "grad_norm": 2.5689175128936768, "learning_rate": 9.922025582617839e-05, "loss": 4.9498, "step": 92 }, { "epoch": 0.024698656682103328, "grad_norm": 3.4009926319122314, "learning_rate": 9.920273348519362e-05, "loss": 4.9395, "step": 94 }, { "epoch": 0.0252241600157651, "grad_norm": 3.381573438644409, "learning_rate": 9.918521114420887e-05, "loss": 4.9174, "step": 96 }, { "epoch": 0.025749663349426872, "grad_norm": 3.074411153793335, "learning_rate": 9.916768880322411e-05, "loss": 4.936, "step": 98 }, { "epoch": 0.026275166683088647, "grad_norm": 2.972590923309326, "learning_rate": 9.915016646223936e-05, "loss": 4.9532, "step": 100 }, { "epoch": 0.02680067001675042, "grad_norm": 3.233518362045288, "learning_rate": 9.91326441212546e-05, "loss": 4.9296, "step": 102 }, { "epoch": 0.02732617335041219, "grad_norm": 3.6069324016571045, "learning_rate": 9.911512178026984e-05, "loss": 4.9269, "step": 104 }, { "epoch": 0.027851676684073963, "grad_norm": 3.7742021083831787, "learning_rate": 9.909759943928509e-05, "loss": 4.9561, "step": 106 }, { "epoch": 0.02837718001773574, "grad_norm": 3.146205186843872, "learning_rate": 9.908007709830034e-05, "loss": 4.919, "step": 108 }, { "epoch": 0.02890268335139751, "grad_norm": 3.027519702911377, "learning_rate": 9.906255475731559e-05, "loss": 4.9097, "step": 110 }, { "epoch": 0.029428186685059282, "grad_norm": 3.5012547969818115, "learning_rate": 9.904503241633083e-05, "loss": 4.8517, "step": 112 }, { "epoch": 0.029953690018721058, "grad_norm": 3.740164041519165, "learning_rate": 9.902751007534607e-05, "loss": 4.9077, "step": 114 }, { "epoch": 0.03047919335238283, "grad_norm": 2.845717430114746, "learning_rate": 
9.900998773436132e-05, "loss": 4.8609, "step": 116 }, { "epoch": 0.0310046966860446, "grad_norm": 3.504624128341675, "learning_rate": 9.899246539337655e-05, "loss": 4.8487, "step": 118 }, { "epoch": 0.03153020001970638, "grad_norm": 3.5355591773986816, "learning_rate": 9.89749430523918e-05, "loss": 4.8256, "step": 120 }, { "epoch": 0.032055703353368145, "grad_norm": 3.782735824584961, "learning_rate": 9.895742071140705e-05, "loss": 4.8548, "step": 122 }, { "epoch": 0.03258120668702992, "grad_norm": 2.17950701713562, "learning_rate": 9.893989837042229e-05, "loss": 4.835, "step": 124 }, { "epoch": 0.033106710020691696, "grad_norm": 2.494934558868408, "learning_rate": 9.892237602943754e-05, "loss": 4.818, "step": 126 }, { "epoch": 0.033632213354353464, "grad_norm": 2.9806673526763916, "learning_rate": 9.890485368845279e-05, "loss": 4.8288, "step": 128 }, { "epoch": 0.03415771668801524, "grad_norm": 2.8178086280822754, "learning_rate": 9.888733134746802e-05, "loss": 4.748, "step": 130 }, { "epoch": 0.034683220021677015, "grad_norm": 3.51326847076416, "learning_rate": 9.886980900648327e-05, "loss": 4.8393, "step": 132 }, { "epoch": 0.035208723355338784, "grad_norm": 2.438596248626709, "learning_rate": 9.885228666549852e-05, "loss": 4.7637, "step": 134 }, { "epoch": 0.03573422668900056, "grad_norm": 2.5073726177215576, "learning_rate": 9.883476432451376e-05, "loss": 4.6837, "step": 136 }, { "epoch": 0.036259730022662334, "grad_norm": 2.8833799362182617, "learning_rate": 9.881724198352901e-05, "loss": 4.684, "step": 138 }, { "epoch": 0.0367852333563241, "grad_norm": 2.8142380714416504, "learning_rate": 9.879971964254426e-05, "loss": 4.7231, "step": 140 }, { "epoch": 0.03731073668998588, "grad_norm": 2.427053451538086, "learning_rate": 9.87821973015595e-05, "loss": 4.6758, "step": 142 }, { "epoch": 0.03783624002364765, "grad_norm": 2.8985769748687744, "learning_rate": 9.876467496057473e-05, "loss": 4.6855, "step": 144 }, { "epoch": 0.03836174335730942, "grad_norm": 
2.8868801593780518, "learning_rate": 9.874715261958998e-05, "loss": 4.6766, "step": 146 }, { "epoch": 0.0388872466909712, "grad_norm": 3.419677495956421, "learning_rate": 9.872963027860522e-05, "loss": 4.6921, "step": 148 }, { "epoch": 0.039412750024632966, "grad_norm": 2.744260311126709, "learning_rate": 9.871210793762047e-05, "loss": 4.6349, "step": 150 }, { "epoch": 0.03993825335829474, "grad_norm": 2.3687593936920166, "learning_rate": 9.869458559663572e-05, "loss": 4.6261, "step": 152 }, { "epoch": 0.040463756691956516, "grad_norm": 2.6554672718048096, "learning_rate": 9.867706325565097e-05, "loss": 4.6846, "step": 154 }, { "epoch": 0.040989260025618285, "grad_norm": 2.260415554046631, "learning_rate": 9.86595409146662e-05, "loss": 4.6151, "step": 156 }, { "epoch": 0.04151476335928006, "grad_norm": 2.6216397285461426, "learning_rate": 9.864201857368145e-05, "loss": 4.6074, "step": 158 }, { "epoch": 0.042040266692941836, "grad_norm": 3.158665180206299, "learning_rate": 9.86244962326967e-05, "loss": 4.6406, "step": 160 }, { "epoch": 0.042565770026603604, "grad_norm": 2.724135398864746, "learning_rate": 9.860697389171194e-05, "loss": 4.6772, "step": 162 }, { "epoch": 0.04309127336026538, "grad_norm": 2.316377878189087, "learning_rate": 9.858945155072719e-05, "loss": 4.6242, "step": 164 }, { "epoch": 0.043616776693927155, "grad_norm": 2.3119211196899414, "learning_rate": 9.857192920974244e-05, "loss": 4.6446, "step": 166 }, { "epoch": 0.04414228002758892, "grad_norm": 2.0346710681915283, "learning_rate": 9.855440686875767e-05, "loss": 4.6524, "step": 168 }, { "epoch": 0.0446677833612507, "grad_norm": 2.668606996536255, "learning_rate": 9.85368845277729e-05, "loss": 4.6406, "step": 170 }, { "epoch": 0.045193286694912474, "grad_norm": 2.1967880725860596, "learning_rate": 9.851936218678815e-05, "loss": 4.6166, "step": 172 }, { "epoch": 0.04571879002857424, "grad_norm": 2.469740867614746, "learning_rate": 9.85018398458034e-05, "loss": 4.6125, "step": 174 }, { "epoch": 
0.04624429336223602, "grad_norm": 2.4009954929351807, "learning_rate": 9.848431750481865e-05, "loss": 4.55, "step": 176 }, { "epoch": 0.046769796695897786, "grad_norm": 2.1962878704071045, "learning_rate": 9.84667951638339e-05, "loss": 4.5814, "step": 178 }, { "epoch": 0.04729530002955956, "grad_norm": 1.8299018144607544, "learning_rate": 9.844927282284914e-05, "loss": 4.6099, "step": 180 }, { "epoch": 0.04782080336322134, "grad_norm": 1.863736867904663, "learning_rate": 9.843175048186438e-05, "loss": 4.6149, "step": 182 }, { "epoch": 0.048346306696883105, "grad_norm": 2.5451714992523193, "learning_rate": 9.841422814087962e-05, "loss": 4.6395, "step": 184 }, { "epoch": 0.04887181003054488, "grad_norm": 2.0755879878997803, "learning_rate": 9.839670579989487e-05, "loss": 4.5834, "step": 186 }, { "epoch": 0.049397313364206656, "grad_norm": 2.239759922027588, "learning_rate": 9.837918345891012e-05, "loss": 4.5895, "step": 188 }, { "epoch": 0.049922816697868425, "grad_norm": 1.8995624780654907, "learning_rate": 9.836166111792537e-05, "loss": 4.5773, "step": 190 }, { "epoch": 0.0504483200315302, "grad_norm": 2.3484208583831787, "learning_rate": 9.834413877694061e-05, "loss": 4.5279, "step": 192 }, { "epoch": 0.050973823365191975, "grad_norm": 1.9540201425552368, "learning_rate": 9.832661643595585e-05, "loss": 4.5255, "step": 194 }, { "epoch": 0.051499326698853744, "grad_norm": 2.366356134414673, "learning_rate": 9.830909409497108e-05, "loss": 4.5644, "step": 196 }, { "epoch": 0.05202483003251552, "grad_norm": 2.0400447845458984, "learning_rate": 9.829157175398633e-05, "loss": 4.5732, "step": 198 }, { "epoch": 0.052550333366177295, "grad_norm": 1.7304773330688477, "learning_rate": 9.827404941300158e-05, "loss": 4.5429, "step": 200 }, { "epoch": 0.05307583669983906, "grad_norm": 1.7541052103042603, "learning_rate": 9.825652707201683e-05, "loss": 4.5754, "step": 202 }, { "epoch": 0.05360134003350084, "grad_norm": 2.0837032794952393, "learning_rate": 9.823900473103207e-05, 
"loss": 4.5185, "step": 204 }, { "epoch": 0.054126843367162614, "grad_norm": 2.540789842605591, "learning_rate": 9.822148239004732e-05, "loss": 4.5905, "step": 206 }, { "epoch": 0.05465234670082438, "grad_norm": 2.091892719268799, "learning_rate": 9.820396004906255e-05, "loss": 4.5456, "step": 208 }, { "epoch": 0.05517785003448616, "grad_norm": 2.489567995071411, "learning_rate": 9.81864377080778e-05, "loss": 4.5526, "step": 210 }, { "epoch": 0.055703353368147926, "grad_norm": 1.9146480560302734, "learning_rate": 9.816891536709305e-05, "loss": 4.5311, "step": 212 }, { "epoch": 0.0562288567018097, "grad_norm": 1.778203010559082, "learning_rate": 9.81513930261083e-05, "loss": 4.5732, "step": 214 }, { "epoch": 0.05675436003547148, "grad_norm": 1.8997642993927002, "learning_rate": 9.813387068512355e-05, "loss": 4.5201, "step": 216 }, { "epoch": 0.057279863369133245, "grad_norm": 1.726938247680664, "learning_rate": 9.811634834413879e-05, "loss": 4.5382, "step": 218 }, { "epoch": 0.05780536670279502, "grad_norm": 1.638175129890442, "learning_rate": 9.809882600315403e-05, "loss": 4.5165, "step": 220 }, { "epoch": 0.058330870036456796, "grad_norm": 1.854191541671753, "learning_rate": 9.808130366216926e-05, "loss": 4.527, "step": 222 }, { "epoch": 0.058856373370118564, "grad_norm": 1.579681634902954, "learning_rate": 9.806378132118451e-05, "loss": 4.5336, "step": 224 }, { "epoch": 0.05938187670378034, "grad_norm": 1.7318247556686401, "learning_rate": 9.804625898019976e-05, "loss": 4.5638, "step": 226 }, { "epoch": 0.059907380037442115, "grad_norm": 1.5140520334243774, "learning_rate": 9.8028736639215e-05, "loss": 4.5181, "step": 228 }, { "epoch": 0.060432883371103883, "grad_norm": 1.5837546586990356, "learning_rate": 9.801121429823025e-05, "loss": 4.5799, "step": 230 }, { "epoch": 0.06095838670476566, "grad_norm": 1.5691721439361572, "learning_rate": 9.79936919572455e-05, "loss": 4.5141, "step": 232 }, { "epoch": 0.061483890038427434, "grad_norm": 1.7556579113006592, 
"learning_rate": 9.797616961626073e-05, "loss": 4.4748, "step": 234 }, { "epoch": 0.0620093933720892, "grad_norm": 1.7606879472732544, "learning_rate": 9.795864727527598e-05, "loss": 4.5171, "step": 236 }, { "epoch": 0.06253489670575098, "grad_norm": 1.7246081829071045, "learning_rate": 9.794112493429123e-05, "loss": 4.5275, "step": 238 }, { "epoch": 0.06306040003941275, "grad_norm": 1.550188660621643, "learning_rate": 9.792360259330648e-05, "loss": 4.5323, "step": 240 }, { "epoch": 0.06358590337307453, "grad_norm": 1.5160279273986816, "learning_rate": 9.790608025232172e-05, "loss": 4.4913, "step": 242 }, { "epoch": 0.06411140670673629, "grad_norm": 1.6685302257537842, "learning_rate": 9.788855791133697e-05, "loss": 4.5145, "step": 244 }, { "epoch": 0.06463691004039807, "grad_norm": 2.3153581619262695, "learning_rate": 9.78710355703522e-05, "loss": 4.5513, "step": 246 }, { "epoch": 0.06516241337405984, "grad_norm": 1.6441559791564941, "learning_rate": 9.785351322936744e-05, "loss": 4.5015, "step": 248 }, { "epoch": 0.06568791670772162, "grad_norm": 1.3852648735046387, "learning_rate": 9.783599088838269e-05, "loss": 4.5032, "step": 250 }, { "epoch": 0.06621342004138339, "grad_norm": 1.788515567779541, "learning_rate": 9.781846854739793e-05, "loss": 4.5156, "step": 252 }, { "epoch": 0.06673892337504517, "grad_norm": 1.7113053798675537, "learning_rate": 9.780094620641318e-05, "loss": 4.5557, "step": 254 }, { "epoch": 0.06726442670870693, "grad_norm": 1.761032223701477, "learning_rate": 9.778342386542843e-05, "loss": 4.4853, "step": 256 }, { "epoch": 0.0677899300423687, "grad_norm": 1.4864990711212158, "learning_rate": 9.776590152444368e-05, "loss": 4.4645, "step": 258 }, { "epoch": 0.06831543337603048, "grad_norm": 1.4883724451065063, "learning_rate": 9.774837918345891e-05, "loss": 4.4943, "step": 260 }, { "epoch": 0.06884093670969225, "grad_norm": 1.5285605192184448, "learning_rate": 9.773085684247416e-05, "loss": 4.5006, "step": 262 }, { "epoch": 
0.06936644004335403, "grad_norm": 1.6827439069747925, "learning_rate": 9.77133345014894e-05, "loss": 4.5073, "step": 264 }, { "epoch": 0.06989194337701579, "grad_norm": 1.7509275674819946, "learning_rate": 9.769581216050465e-05, "loss": 4.4839, "step": 266 }, { "epoch": 0.07041744671067757, "grad_norm": 1.6838834285736084, "learning_rate": 9.76782898195199e-05, "loss": 4.4676, "step": 268 }, { "epoch": 0.07094295004433934, "grad_norm": 1.4589577913284302, "learning_rate": 9.766076747853515e-05, "loss": 4.4783, "step": 270 }, { "epoch": 0.07146845337800112, "grad_norm": 1.6401913166046143, "learning_rate": 9.764324513755038e-05, "loss": 4.5118, "step": 272 }, { "epoch": 0.0719939567116629, "grad_norm": 1.4094103574752808, "learning_rate": 9.762572279656562e-05, "loss": 4.4846, "step": 274 }, { "epoch": 0.07251946004532467, "grad_norm": 1.4403023719787598, "learning_rate": 9.760820045558086e-05, "loss": 4.5305, "step": 276 }, { "epoch": 0.07304496337898643, "grad_norm": 1.4050440788269043, "learning_rate": 9.759067811459611e-05, "loss": 4.4903, "step": 278 }, { "epoch": 0.0735704667126482, "grad_norm": 1.3854197263717651, "learning_rate": 9.757315577361136e-05, "loss": 4.4835, "step": 280 }, { "epoch": 0.07409597004630998, "grad_norm": 1.139122724533081, "learning_rate": 9.75556334326266e-05, "loss": 4.4159, "step": 282 }, { "epoch": 0.07462147337997176, "grad_norm": 1.4378740787506104, "learning_rate": 9.753811109164185e-05, "loss": 4.4704, "step": 284 }, { "epoch": 0.07514697671363353, "grad_norm": 1.3609644174575806, "learning_rate": 9.752058875065709e-05, "loss": 4.5008, "step": 286 }, { "epoch": 0.0756724800472953, "grad_norm": 1.47963547706604, "learning_rate": 9.750306640967234e-05, "loss": 4.4637, "step": 288 }, { "epoch": 0.07619798338095707, "grad_norm": 1.395162582397461, "learning_rate": 9.748554406868758e-05, "loss": 4.5029, "step": 290 }, { "epoch": 0.07672348671461884, "grad_norm": 1.4480257034301758, "learning_rate": 9.746802172770283e-05, "loss": 
4.4865, "step": 292 }, { "epoch": 0.07724899004828062, "grad_norm": 1.483083963394165, "learning_rate": 9.745049938671808e-05, "loss": 4.4217, "step": 294 }, { "epoch": 0.0777744933819424, "grad_norm": 1.5140736103057861, "learning_rate": 9.743297704573331e-05, "loss": 4.4926, "step": 296 }, { "epoch": 0.07829999671560417, "grad_norm": 1.405802607536316, "learning_rate": 9.741545470474856e-05, "loss": 4.5095, "step": 298 }, { "epoch": 0.07882550004926593, "grad_norm": 1.2400410175323486, "learning_rate": 9.73979323637638e-05, "loss": 4.4361, "step": 300 }, { "epoch": 0.0793510033829277, "grad_norm": 1.3110464811325073, "learning_rate": 9.738041002277904e-05, "loss": 4.4201, "step": 302 }, { "epoch": 0.07987650671658948, "grad_norm": 1.0086743831634521, "learning_rate": 9.736288768179429e-05, "loss": 4.4472, "step": 304 }, { "epoch": 0.08040201005025126, "grad_norm": 1.599827527999878, "learning_rate": 9.734536534080954e-05, "loss": 4.4677, "step": 306 }, { "epoch": 0.08092751338391303, "grad_norm": 1.5571688413619995, "learning_rate": 9.732784299982478e-05, "loss": 4.4515, "step": 308 }, { "epoch": 0.08145301671757481, "grad_norm": 1.2003650665283203, "learning_rate": 9.731032065884003e-05, "loss": 4.4676, "step": 310 }, { "epoch": 0.08197852005123657, "grad_norm": 1.3154256343841553, "learning_rate": 9.729279831785527e-05, "loss": 4.4477, "step": 312 }, { "epoch": 0.08250402338489835, "grad_norm": 1.140539526939392, "learning_rate": 9.727527597687051e-05, "loss": 4.4554, "step": 314 }, { "epoch": 0.08302952671856012, "grad_norm": 1.212310791015625, "learning_rate": 9.725775363588576e-05, "loss": 4.4369, "step": 316 }, { "epoch": 0.0835550300522219, "grad_norm": 1.1583181619644165, "learning_rate": 9.724023129490101e-05, "loss": 4.4275, "step": 318 }, { "epoch": 0.08408053338588367, "grad_norm": 1.1005992889404297, "learning_rate": 9.722270895391626e-05, "loss": 4.4473, "step": 320 }, { "epoch": 0.08460603671954543, "grad_norm": 1.1062551736831665, "learning_rate": 
9.720518661293149e-05, "loss": 4.4534, "step": 322 }, { "epoch": 0.08513154005320721, "grad_norm": 1.1605561971664429, "learning_rate": 9.718766427194674e-05, "loss": 4.4747, "step": 324 }, { "epoch": 0.08565704338686898, "grad_norm": 1.5627479553222656, "learning_rate": 9.717014193096197e-05, "loss": 4.44, "step": 326 }, { "epoch": 0.08618254672053076, "grad_norm": 1.4984396696090698, "learning_rate": 9.715261958997722e-05, "loss": 4.3958, "step": 328 }, { "epoch": 0.08670805005419253, "grad_norm": 1.3127459287643433, "learning_rate": 9.713509724899247e-05, "loss": 4.4325, "step": 330 }, { "epoch": 0.08723355338785431, "grad_norm": 1.1719914674758911, "learning_rate": 9.711757490800771e-05, "loss": 4.4459, "step": 332 }, { "epoch": 0.08775905672151607, "grad_norm": 1.2020385265350342, "learning_rate": 9.710005256702296e-05, "loss": 4.4546, "step": 334 }, { "epoch": 0.08828456005517785, "grad_norm": 0.9500870108604431, "learning_rate": 9.708253022603821e-05, "loss": 4.4354, "step": 336 }, { "epoch": 0.08881006338883962, "grad_norm": 1.1271554231643677, "learning_rate": 9.706500788505344e-05, "loss": 4.4566, "step": 338 }, { "epoch": 0.0893355667225014, "grad_norm": 1.2026797533035278, "learning_rate": 9.704748554406869e-05, "loss": 4.4256, "step": 340 }, { "epoch": 0.08986107005616317, "grad_norm": 1.0629955530166626, "learning_rate": 9.702996320308394e-05, "loss": 4.4387, "step": 342 }, { "epoch": 0.09038657338982495, "grad_norm": 1.162955403327942, "learning_rate": 9.701244086209919e-05, "loss": 4.4111, "step": 344 }, { "epoch": 0.09091207672348671, "grad_norm": 0.9643715620040894, "learning_rate": 9.699491852111443e-05, "loss": 4.4102, "step": 346 }, { "epoch": 0.09143758005714848, "grad_norm": 0.963996946811676, "learning_rate": 9.697739618012967e-05, "loss": 4.4129, "step": 348 }, { "epoch": 0.09196308339081026, "grad_norm": 1.2173408269882202, "learning_rate": 9.695987383914491e-05, "loss": 4.41, "step": 350 }, { "epoch": 0.09248858672447204, "grad_norm": 
1.1584279537200928, "learning_rate": 9.694235149816015e-05, "loss": 4.432, "step": 352 }, { "epoch": 0.09301409005813381, "grad_norm": 1.3441529273986816, "learning_rate": 9.69248291571754e-05, "loss": 4.3815, "step": 354 }, { "epoch": 0.09353959339179557, "grad_norm": 0.9360788464546204, "learning_rate": 9.690730681619064e-05, "loss": 4.425, "step": 356 }, { "epoch": 0.09406509672545735, "grad_norm": 1.1219030618667603, "learning_rate": 9.688978447520589e-05, "loss": 4.3961, "step": 358 }, { "epoch": 0.09459060005911912, "grad_norm": 1.0091334581375122, "learning_rate": 9.687226213422114e-05, "loss": 4.352, "step": 360 }, { "epoch": 0.0951161033927809, "grad_norm": 1.127103567123413, "learning_rate": 9.685473979323639e-05, "loss": 4.381, "step": 362 }, { "epoch": 0.09564160672644267, "grad_norm": 0.9449039101600647, "learning_rate": 9.683721745225162e-05, "loss": 4.4109, "step": 364 }, { "epoch": 0.09616711006010445, "grad_norm": 0.9935988783836365, "learning_rate": 9.681969511126687e-05, "loss": 4.4066, "step": 366 }, { "epoch": 0.09669261339376621, "grad_norm": 0.9146400094032288, "learning_rate": 9.680217277028212e-05, "loss": 4.3494, "step": 368 }, { "epoch": 0.09721811672742799, "grad_norm": 0.987327516078949, "learning_rate": 9.678465042929736e-05, "loss": 4.4436, "step": 370 }, { "epoch": 0.09774362006108976, "grad_norm": 1.0952600240707397, "learning_rate": 9.676712808831261e-05, "loss": 4.4694, "step": 372 }, { "epoch": 0.09826912339475154, "grad_norm": 1.185031533241272, "learning_rate": 9.674960574732785e-05, "loss": 4.4129, "step": 374 }, { "epoch": 0.09879462672841331, "grad_norm": 0.9882381558418274, "learning_rate": 9.673208340634309e-05, "loss": 4.3561, "step": 376 }, { "epoch": 0.09932013006207509, "grad_norm": 1.1043294668197632, "learning_rate": 9.671456106535834e-05, "loss": 4.4386, "step": 378 }, { "epoch": 0.09984563339573685, "grad_norm": 1.2278759479522705, "learning_rate": 9.669703872437357e-05, "loss": 4.3522, "step": 380 }, { "epoch": 
0.10037113672939862, "grad_norm": 1.009266972541809, "learning_rate": 9.667951638338882e-05, "loss": 4.4211, "step": 382 }, { "epoch": 0.1008966400630604, "grad_norm": 0.8734738230705261, "learning_rate": 9.666199404240407e-05, "loss": 4.3453, "step": 384 }, { "epoch": 0.10142214339672218, "grad_norm": 0.9534622430801392, "learning_rate": 9.664447170141932e-05, "loss": 4.3906, "step": 386 }, { "epoch": 0.10194764673038395, "grad_norm": 0.990115225315094, "learning_rate": 9.662694936043456e-05, "loss": 4.3737, "step": 388 }, { "epoch": 0.10247315006404571, "grad_norm": 0.9969471096992493, "learning_rate": 9.660942701944981e-05, "loss": 4.3926, "step": 390 }, { "epoch": 0.10299865339770749, "grad_norm": 0.8525747656822205, "learning_rate": 9.659190467846505e-05, "loss": 4.3938, "step": 392 }, { "epoch": 0.10352415673136926, "grad_norm": 0.8276899456977844, "learning_rate": 9.65743823374803e-05, "loss": 4.3986, "step": 394 }, { "epoch": 0.10404966006503104, "grad_norm": 0.9172145128250122, "learning_rate": 9.655685999649554e-05, "loss": 4.3396, "step": 396 }, { "epoch": 0.10457516339869281, "grad_norm": 0.8582144975662231, "learning_rate": 9.653933765551078e-05, "loss": 4.3517, "step": 398 }, { "epoch": 0.10510066673235459, "grad_norm": 0.9213413000106812, "learning_rate": 9.652181531452602e-05, "loss": 4.3727, "step": 400 }, { "epoch": 0.10510066673235459, "eval_loss": 4.323671340942383, "eval_runtime": 464.4067, "eval_samples_per_second": 262.246, "eval_steps_per_second": 8.195, "step": 400 }, { "epoch": 0.10562617006601635, "grad_norm": 0.813126266002655, "learning_rate": 9.650429297354127e-05, "loss": 4.4168, "step": 402 }, { "epoch": 0.10615167339967813, "grad_norm": 0.8609282374382019, "learning_rate": 9.648677063255652e-05, "loss": 4.3829, "step": 404 }, { "epoch": 0.1066771767333399, "grad_norm": 0.9762548804283142, "learning_rate": 9.646924829157175e-05, "loss": 4.3688, "step": 406 }, { "epoch": 0.10720268006700168, "grad_norm": 0.9275674819946289, 
"learning_rate": 9.6451725950587e-05, "loss": 4.3896, "step": 408 }, { "epoch": 0.10772818340066345, "grad_norm": 0.9462507367134094, "learning_rate": 9.643420360960225e-05, "loss": 4.3632, "step": 410 }, { "epoch": 0.10825368673432523, "grad_norm": 0.8897784948348999, "learning_rate": 9.64166812686175e-05, "loss": 4.3426, "step": 412 }, { "epoch": 0.10877919006798699, "grad_norm": 0.991764485836029, "learning_rate": 9.639915892763274e-05, "loss": 4.3795, "step": 414 }, { "epoch": 0.10930469340164876, "grad_norm": 0.8135252594947815, "learning_rate": 9.638163658664799e-05, "loss": 4.4081, "step": 416 }, { "epoch": 0.10983019673531054, "grad_norm": 0.8907259106636047, "learning_rate": 9.636411424566322e-05, "loss": 4.3491, "step": 418 }, { "epoch": 0.11035570006897231, "grad_norm": 0.9635459780693054, "learning_rate": 9.634659190467847e-05, "loss": 4.341, "step": 420 }, { "epoch": 0.11088120340263409, "grad_norm": 0.8131710290908813, "learning_rate": 9.632906956369372e-05, "loss": 4.3996, "step": 422 }, { "epoch": 0.11140670673629585, "grad_norm": 0.973064124584198, "learning_rate": 9.631154722270895e-05, "loss": 4.2993, "step": 424 }, { "epoch": 0.11193221006995763, "grad_norm": 0.7931177020072937, "learning_rate": 9.62940248817242e-05, "loss": 4.3233, "step": 426 }, { "epoch": 0.1124577134036194, "grad_norm": 0.7838175892829895, "learning_rate": 9.627650254073945e-05, "loss": 4.364, "step": 428 }, { "epoch": 0.11298321673728118, "grad_norm": 0.8624389171600342, "learning_rate": 9.62589801997547e-05, "loss": 4.3434, "step": 430 }, { "epoch": 0.11350872007094295, "grad_norm": 0.8050585389137268, "learning_rate": 9.624145785876993e-05, "loss": 4.365, "step": 432 }, { "epoch": 0.11403422340460473, "grad_norm": 0.8698369860649109, "learning_rate": 9.622393551778518e-05, "loss": 4.3527, "step": 434 }, { "epoch": 0.11455972673826649, "grad_norm": 0.9632725715637207, "learning_rate": 9.620641317680042e-05, "loss": 4.3234, "step": 436 }, { "epoch": 0.11508523007192827, 
"grad_norm": 0.9318451881408691, "learning_rate": 9.618889083581567e-05, "loss": 4.3141, "step": 438 }, { "epoch": 0.11561073340559004, "grad_norm": 0.8633777499198914, "learning_rate": 9.617136849483092e-05, "loss": 4.3902, "step": 440 }, { "epoch": 0.11613623673925182, "grad_norm": 0.7846489548683167, "learning_rate": 9.615384615384617e-05, "loss": 4.3897, "step": 442 }, { "epoch": 0.11666174007291359, "grad_norm": 0.7871001958847046, "learning_rate": 9.61363238128614e-05, "loss": 4.3204, "step": 444 }, { "epoch": 0.11718724340657537, "grad_norm": 0.8608377575874329, "learning_rate": 9.611880147187665e-05, "loss": 4.3337, "step": 446 }, { "epoch": 0.11771274674023713, "grad_norm": 1.106196641921997, "learning_rate": 9.61012791308919e-05, "loss": 4.3458, "step": 448 }, { "epoch": 0.1182382500738989, "grad_norm": 0.929239809513092, "learning_rate": 9.608375678990713e-05, "loss": 4.3178, "step": 450 }, { "epoch": 0.11876375340756068, "grad_norm": 0.9668716788291931, "learning_rate": 9.606623444892238e-05, "loss": 4.3389, "step": 452 }, { "epoch": 0.11928925674122245, "grad_norm": 0.9082039594650269, "learning_rate": 9.604871210793763e-05, "loss": 4.3256, "step": 454 }, { "epoch": 0.11981476007488423, "grad_norm": 0.9413854479789734, "learning_rate": 9.603118976695287e-05, "loss": 4.3427, "step": 456 }, { "epoch": 0.12034026340854599, "grad_norm": 0.85140061378479, "learning_rate": 9.601366742596811e-05, "loss": 4.3336, "step": 458 }, { "epoch": 0.12086576674220777, "grad_norm": 0.8141552209854126, "learning_rate": 9.599614508498335e-05, "loss": 4.3413, "step": 460 }, { "epoch": 0.12139127007586954, "grad_norm": 0.766933798789978, "learning_rate": 9.59786227439986e-05, "loss": 4.331, "step": 462 }, { "epoch": 0.12191677340953132, "grad_norm": 0.7736857533454895, "learning_rate": 9.596110040301385e-05, "loss": 4.3881, "step": 464 }, { "epoch": 0.1224422767431931, "grad_norm": 0.7159366607666016, "learning_rate": 9.59435780620291e-05, "loss": 4.3759, "step": 466 }, { 
"epoch": 0.12296778007685487, "grad_norm": 0.8450105786323547, "learning_rate": 9.592605572104434e-05, "loss": 4.3295, "step": 468 }, { "epoch": 0.12349328341051663, "grad_norm": 0.9408858418464661, "learning_rate": 9.590853338005958e-05, "loss": 4.3351, "step": 470 }, { "epoch": 0.1240187867441784, "grad_norm": 1.0368094444274902, "learning_rate": 9.589101103907483e-05, "loss": 4.3093, "step": 472 }, { "epoch": 0.12454429007784018, "grad_norm": 0.995236873626709, "learning_rate": 9.587348869809007e-05, "loss": 4.3521, "step": 474 }, { "epoch": 0.12506979341150196, "grad_norm": 0.7337868213653564, "learning_rate": 9.585596635710531e-05, "loss": 4.3452, "step": 476 }, { "epoch": 0.12559529674516373, "grad_norm": 0.9112853407859802, "learning_rate": 9.583844401612056e-05, "loss": 4.35, "step": 478 }, { "epoch": 0.1261208000788255, "grad_norm": 0.9311347007751465, "learning_rate": 9.58209216751358e-05, "loss": 4.3337, "step": 480 }, { "epoch": 0.12664630341248728, "grad_norm": 0.7674478888511658, "learning_rate": 9.580339933415105e-05, "loss": 4.317, "step": 482 }, { "epoch": 0.12717180674614906, "grad_norm": 0.9330013394355774, "learning_rate": 9.578587699316628e-05, "loss": 4.3132, "step": 484 }, { "epoch": 0.1276973100798108, "grad_norm": 0.9824528694152832, "learning_rate": 9.576835465218153e-05, "loss": 4.3326, "step": 486 }, { "epoch": 0.12822281341347258, "grad_norm": 0.8478718996047974, "learning_rate": 9.575083231119678e-05, "loss": 4.2661, "step": 488 }, { "epoch": 0.12874831674713436, "grad_norm": 0.8402353525161743, "learning_rate": 9.573330997021203e-05, "loss": 4.3381, "step": 490 }, { "epoch": 0.12927382008079613, "grad_norm": 0.8112274408340454, "learning_rate": 9.571578762922728e-05, "loss": 4.3129, "step": 492 }, { "epoch": 0.1297993234144579, "grad_norm": 0.9338264465332031, "learning_rate": 9.569826528824252e-05, "loss": 4.2888, "step": 494 }, { "epoch": 0.13032482674811968, "grad_norm": 0.8397476673126221, "learning_rate": 9.568074294725776e-05, 
"loss": 4.3135, "step": 496 }, { "epoch": 0.13085033008178146, "grad_norm": 0.8148236870765686, "learning_rate": 9.5663220606273e-05, "loss": 4.3372, "step": 498 }, { "epoch": 0.13137583341544323, "grad_norm": 0.9035194516181946, "learning_rate": 9.564569826528824e-05, "loss": 4.2888, "step": 500 }, { "epoch": 0.131901336749105, "grad_norm": 0.8043197989463806, "learning_rate": 9.562817592430349e-05, "loss": 4.3227, "step": 502 }, { "epoch": 0.13242684008276678, "grad_norm": 0.8552638292312622, "learning_rate": 9.561065358331873e-05, "loss": 4.3043, "step": 504 }, { "epoch": 0.13295234341642856, "grad_norm": 1.0246176719665527, "learning_rate": 9.559313124233398e-05, "loss": 4.283, "step": 506 }, { "epoch": 0.13347784675009033, "grad_norm": 0.840494692325592, "learning_rate": 9.557560890134923e-05, "loss": 4.279, "step": 508 }, { "epoch": 0.13400335008375208, "grad_norm": 0.920755922794342, "learning_rate": 9.555808656036446e-05, "loss": 4.3415, "step": 510 }, { "epoch": 0.13452885341741386, "grad_norm": 0.9088913798332214, "learning_rate": 9.554056421937971e-05, "loss": 4.2931, "step": 512 }, { "epoch": 0.13505435675107563, "grad_norm": 0.8052839636802673, "learning_rate": 9.552304187839496e-05, "loss": 4.284, "step": 514 }, { "epoch": 0.1355798600847374, "grad_norm": 0.84073406457901, "learning_rate": 9.55055195374102e-05, "loss": 4.2628, "step": 516 }, { "epoch": 0.13610536341839918, "grad_norm": 0.9609066247940063, "learning_rate": 9.548799719642545e-05, "loss": 4.3385, "step": 518 }, { "epoch": 0.13663086675206096, "grad_norm": 1.0310906171798706, "learning_rate": 9.54704748554407e-05, "loss": 4.3026, "step": 520 }, { "epoch": 0.13715637008572273, "grad_norm": 0.8873734474182129, "learning_rate": 9.545295251445593e-05, "loss": 4.3267, "step": 522 }, { "epoch": 0.1376818734193845, "grad_norm": 0.7936632633209229, "learning_rate": 9.543543017347118e-05, "loss": 4.3387, "step": 524 }, { "epoch": 0.13820737675304628, "grad_norm": 0.8136696815490723, 
"learning_rate": 9.541790783248642e-05, "loss": 4.2735, "step": 526 }, { "epoch": 0.13873288008670806, "grad_norm": 0.8784980773925781, "learning_rate": 9.540038549150166e-05, "loss": 4.3, "step": 528 }, { "epoch": 0.13925838342036984, "grad_norm": 0.9588335752487183, "learning_rate": 9.538286315051691e-05, "loss": 4.2889, "step": 530 }, { "epoch": 0.13978388675403158, "grad_norm": 0.8407328128814697, "learning_rate": 9.536534080953216e-05, "loss": 4.3484, "step": 532 }, { "epoch": 0.14030939008769336, "grad_norm": 0.7581162452697754, "learning_rate": 9.53478184685474e-05, "loss": 4.3209, "step": 534 }, { "epoch": 0.14083489342135513, "grad_norm": 0.8761801719665527, "learning_rate": 9.533029612756264e-05, "loss": 4.2945, "step": 536 }, { "epoch": 0.1413603967550169, "grad_norm": 0.9985829591751099, "learning_rate": 9.531277378657789e-05, "loss": 4.298, "step": 538 }, { "epoch": 0.14188590008867868, "grad_norm": 1.0523216724395752, "learning_rate": 9.529525144559314e-05, "loss": 4.3131, "step": 540 }, { "epoch": 0.14241140342234046, "grad_norm": 0.8014693260192871, "learning_rate": 9.527772910460838e-05, "loss": 4.3133, "step": 542 }, { "epoch": 0.14293690675600224, "grad_norm": 0.7733297348022461, "learning_rate": 9.526020676362363e-05, "loss": 4.3402, "step": 544 }, { "epoch": 0.143462410089664, "grad_norm": 0.8763622045516968, "learning_rate": 9.524268442263888e-05, "loss": 4.3093, "step": 546 }, { "epoch": 0.1439879134233258, "grad_norm": 0.8544076085090637, "learning_rate": 9.522516208165411e-05, "loss": 4.3203, "step": 548 }, { "epoch": 0.14451341675698756, "grad_norm": 0.7923790216445923, "learning_rate": 9.520763974066936e-05, "loss": 4.3074, "step": 550 }, { "epoch": 0.14503892009064934, "grad_norm": 1.0110771656036377, "learning_rate": 9.51901173996846e-05, "loss": 4.2345, "step": 552 }, { "epoch": 0.14556442342431108, "grad_norm": 0.8644735217094421, "learning_rate": 9.517259505869984e-05, "loss": 4.2825, "step": 554 }, { "epoch": 0.14608992675797286, 
"grad_norm": 0.7749819159507751, "learning_rate": 9.515507271771509e-05, "loss": 4.286, "step": 556 }, { "epoch": 0.14661543009163464, "grad_norm": 0.8563172221183777, "learning_rate": 9.513755037673034e-05, "loss": 4.321, "step": 558 }, { "epoch": 0.1471409334252964, "grad_norm": 0.8272625803947449, "learning_rate": 9.512002803574558e-05, "loss": 4.2839, "step": 560 }, { "epoch": 0.14766643675895819, "grad_norm": 0.8743438720703125, "learning_rate": 9.510250569476082e-05, "loss": 4.2689, "step": 562 }, { "epoch": 0.14819194009261996, "grad_norm": 0.898620069026947, "learning_rate": 9.508498335377607e-05, "loss": 4.2745, "step": 564 }, { "epoch": 0.14871744342628174, "grad_norm": 0.848181426525116, "learning_rate": 9.506746101279131e-05, "loss": 4.2848, "step": 566 }, { "epoch": 0.1492429467599435, "grad_norm": 0.9381906390190125, "learning_rate": 9.504993867180656e-05, "loss": 4.2636, "step": 568 }, { "epoch": 0.1497684500936053, "grad_norm": 0.8950619101524353, "learning_rate": 9.503241633082181e-05, "loss": 4.2992, "step": 570 }, { "epoch": 0.15029395342726706, "grad_norm": 0.9357477426528931, "learning_rate": 9.501489398983706e-05, "loss": 4.3064, "step": 572 }, { "epoch": 0.15081945676092884, "grad_norm": 0.9902758002281189, "learning_rate": 9.499737164885229e-05, "loss": 4.3064, "step": 574 }, { "epoch": 0.1513449600945906, "grad_norm": 0.932582437992096, "learning_rate": 9.497984930786754e-05, "loss": 4.2873, "step": 576 }, { "epoch": 0.15187046342825236, "grad_norm": 0.8543179035186768, "learning_rate": 9.496232696688277e-05, "loss": 4.2392, "step": 578 }, { "epoch": 0.15239596676191414, "grad_norm": 0.87977135181427, "learning_rate": 9.494480462589802e-05, "loss": 4.3193, "step": 580 }, { "epoch": 0.1529214700955759, "grad_norm": 0.8900327682495117, "learning_rate": 9.492728228491327e-05, "loss": 4.3148, "step": 582 }, { "epoch": 0.1534469734292377, "grad_norm": 0.8193830251693726, "learning_rate": 9.490975994392851e-05, "loss": 4.2968, "step": 584 }, { 
"epoch": 0.15397247676289946, "grad_norm": 0.8425807356834412, "learning_rate": 9.489223760294376e-05, "loss": 4.2737, "step": 586 }, { "epoch": 0.15449798009656124, "grad_norm": 1.0273034572601318, "learning_rate": 9.4874715261959e-05, "loss": 4.2643, "step": 588 }, { "epoch": 0.155023483430223, "grad_norm": 0.967189610004425, "learning_rate": 9.485719292097424e-05, "loss": 4.2582, "step": 590 }, { "epoch": 0.1555489867638848, "grad_norm": 0.9653576612472534, "learning_rate": 9.483967057998949e-05, "loss": 4.2647, "step": 592 }, { "epoch": 0.15607449009754656, "grad_norm": 1.009203553199768, "learning_rate": 9.482214823900474e-05, "loss": 4.2463, "step": 594 }, { "epoch": 0.15659999343120834, "grad_norm": 1.0033949613571167, "learning_rate": 9.480462589801999e-05, "loss": 4.2799, "step": 596 }, { "epoch": 0.15712549676487012, "grad_norm": 1.1340419054031372, "learning_rate": 9.478710355703523e-05, "loss": 4.228, "step": 598 }, { "epoch": 0.15765100009853186, "grad_norm": 0.8829800486564636, "learning_rate": 9.476958121605047e-05, "loss": 4.2653, "step": 600 }, { "epoch": 0.15817650343219364, "grad_norm": 1.14998197555542, "learning_rate": 9.47520588750657e-05, "loss": 4.3043, "step": 602 }, { "epoch": 0.1587020067658554, "grad_norm": 1.1028305292129517, "learning_rate": 9.473453653408095e-05, "loss": 4.2706, "step": 604 }, { "epoch": 0.1592275100995172, "grad_norm": 0.9737115502357483, "learning_rate": 9.47170141930962e-05, "loss": 4.2686, "step": 606 }, { "epoch": 0.15975301343317896, "grad_norm": 1.3182507753372192, "learning_rate": 9.469949185211144e-05, "loss": 4.2957, "step": 608 }, { "epoch": 0.16027851676684074, "grad_norm": 0.9065335392951965, "learning_rate": 9.468196951112669e-05, "loss": 4.2532, "step": 610 }, { "epoch": 0.16080402010050251, "grad_norm": 1.3520152568817139, "learning_rate": 9.466444717014194e-05, "loss": 4.2405, "step": 612 }, { "epoch": 0.1613295234341643, "grad_norm": 0.8745186924934387, "learning_rate": 9.464692482915717e-05, "loss": 
4.2717, "step": 614 }, { "epoch": 0.16185502676782607, "grad_norm": 0.9047099947929382, "learning_rate": 9.462940248817242e-05, "loss": 4.2633, "step": 616 }, { "epoch": 0.16238053010148784, "grad_norm": 0.8175111413002014, "learning_rate": 9.461188014718767e-05, "loss": 4.281, "step": 618 }, { "epoch": 0.16290603343514962, "grad_norm": 0.8885545134544373, "learning_rate": 9.459435780620292e-05, "loss": 4.2489, "step": 620 }, { "epoch": 0.16343153676881136, "grad_norm": 0.8279030323028564, "learning_rate": 9.457683546521816e-05, "loss": 4.2595, "step": 622 }, { "epoch": 0.16395704010247314, "grad_norm": 0.7932460904121399, "learning_rate": 9.455931312423341e-05, "loss": 4.2485, "step": 624 }, { "epoch": 0.16448254343613491, "grad_norm": 0.8510848879814148, "learning_rate": 9.454179078324864e-05, "loss": 4.2638, "step": 626 }, { "epoch": 0.1650080467697967, "grad_norm": 1.2163007259368896, "learning_rate": 9.452426844226389e-05, "loss": 4.2885, "step": 628 }, { "epoch": 0.16553355010345847, "grad_norm": 0.907429575920105, "learning_rate": 9.450674610127913e-05, "loss": 4.2589, "step": 630 }, { "epoch": 0.16605905343712024, "grad_norm": 0.910266637802124, "learning_rate": 9.448922376029437e-05, "loss": 4.2905, "step": 632 }, { "epoch": 0.16658455677078202, "grad_norm": 0.9601488709449768, "learning_rate": 9.447170141930962e-05, "loss": 4.2897, "step": 634 }, { "epoch": 0.1671100601044438, "grad_norm": 0.936589777469635, "learning_rate": 9.445417907832487e-05, "loss": 4.2335, "step": 636 }, { "epoch": 0.16763556343810557, "grad_norm": 0.8298571705818176, "learning_rate": 9.443665673734012e-05, "loss": 4.2048, "step": 638 }, { "epoch": 0.16816106677176734, "grad_norm": 0.8749000430107117, "learning_rate": 9.441913439635536e-05, "loss": 4.2624, "step": 640 }, { "epoch": 0.16868657010542912, "grad_norm": 0.8492471575737, "learning_rate": 9.44016120553706e-05, "loss": 4.2885, "step": 642 }, { "epoch": 0.16921207343909087, "grad_norm": 0.8286552429199219, "learning_rate": 
9.438408971438585e-05, "loss": 4.2598, "step": 644 }, { "epoch": 0.16973757677275264, "grad_norm": 0.7817071676254272, "learning_rate": 9.43665673734011e-05, "loss": 4.2647, "step": 646 }, { "epoch": 0.17026308010641442, "grad_norm": 0.795383095741272, "learning_rate": 9.434904503241634e-05, "loss": 4.2571, "step": 648 }, { "epoch": 0.1707885834400762, "grad_norm": 1.0703294277191162, "learning_rate": 9.433152269143159e-05, "loss": 4.2676, "step": 650 }, { "epoch": 0.17131408677373797, "grad_norm": 0.9871751666069031, "learning_rate": 9.431400035044684e-05, "loss": 4.3052, "step": 652 }, { "epoch": 0.17183959010739974, "grad_norm": 0.8267862200737, "learning_rate": 9.429647800946207e-05, "loss": 4.2451, "step": 654 }, { "epoch": 0.17236509344106152, "grad_norm": 0.9759035110473633, "learning_rate": 9.42789556684773e-05, "loss": 4.2516, "step": 656 }, { "epoch": 0.1728905967747233, "grad_norm": 0.8905848860740662, "learning_rate": 9.426143332749255e-05, "loss": 4.2287, "step": 658 }, { "epoch": 0.17341610010838507, "grad_norm": 0.9217715859413147, "learning_rate": 9.42439109865078e-05, "loss": 4.2334, "step": 660 }, { "epoch": 0.17394160344204684, "grad_norm": 1.1035958528518677, "learning_rate": 9.422638864552305e-05, "loss": 4.2477, "step": 662 }, { "epoch": 0.17446710677570862, "grad_norm": 0.9161078333854675, "learning_rate": 9.42088663045383e-05, "loss": 4.2256, "step": 664 }, { "epoch": 0.1749926101093704, "grad_norm": 1.0139251947402954, "learning_rate": 9.419134396355354e-05, "loss": 4.2746, "step": 666 }, { "epoch": 0.17551811344303214, "grad_norm": 0.9318559765815735, "learning_rate": 9.417382162256878e-05, "loss": 4.2028, "step": 668 }, { "epoch": 0.17604361677669392, "grad_norm": 1.0254356861114502, "learning_rate": 9.415629928158402e-05, "loss": 4.2593, "step": 670 }, { "epoch": 0.1765691201103557, "grad_norm": 1.164018988609314, "learning_rate": 9.413877694059927e-05, "loss": 4.2203, "step": 672 }, { "epoch": 0.17709462344401747, "grad_norm": 
0.9168299436569214, "learning_rate": 9.412125459961452e-05, "loss": 4.2363, "step": 674 }, { "epoch": 0.17762012677767924, "grad_norm": 1.0169556140899658, "learning_rate": 9.410373225862977e-05, "loss": 4.2229, "step": 676 }, { "epoch": 0.17814563011134102, "grad_norm": 1.065355896949768, "learning_rate": 9.408620991764501e-05, "loss": 4.2208, "step": 678 }, { "epoch": 0.1786711334450028, "grad_norm": 0.9870384931564331, "learning_rate": 9.406868757666025e-05, "loss": 4.2474, "step": 680 }, { "epoch": 0.17919663677866457, "grad_norm": 1.1554317474365234, "learning_rate": 9.405116523567548e-05, "loss": 4.2563, "step": 682 }, { "epoch": 0.17972214011232635, "grad_norm": 1.5406105518341064, "learning_rate": 9.403364289469073e-05, "loss": 4.2521, "step": 684 }, { "epoch": 0.18024764344598812, "grad_norm": 1.0204381942749023, "learning_rate": 9.401612055370598e-05, "loss": 4.284, "step": 686 }, { "epoch": 0.1807731467796499, "grad_norm": 1.1895354986190796, "learning_rate": 9.399859821272122e-05, "loss": 4.2439, "step": 688 }, { "epoch": 0.18129865011331164, "grad_norm": 0.9624170064926147, "learning_rate": 9.398107587173647e-05, "loss": 4.2083, "step": 690 }, { "epoch": 0.18182415344697342, "grad_norm": 0.987634539604187, "learning_rate": 9.396355353075172e-05, "loss": 4.2183, "step": 692 }, { "epoch": 0.1823496567806352, "grad_norm": 1.1291043758392334, "learning_rate": 9.394603118976695e-05, "loss": 4.2287, "step": 694 }, { "epoch": 0.18287516011429697, "grad_norm": 0.8966813087463379, "learning_rate": 9.39285088487822e-05, "loss": 4.1979, "step": 696 }, { "epoch": 0.18340066344795874, "grad_norm": 0.9052355289459229, "learning_rate": 9.391098650779745e-05, "loss": 4.2507, "step": 698 }, { "epoch": 0.18392616678162052, "grad_norm": 0.912899374961853, "learning_rate": 9.38934641668127e-05, "loss": 4.2001, "step": 700 }, { "epoch": 0.1844516701152823, "grad_norm": 1.0003747940063477, "learning_rate": 9.387594182582794e-05, "loss": 4.2114, "step": 702 }, { "epoch": 
0.18497717344894407, "grad_norm": 0.9614863991737366, "learning_rate": 9.385841948484318e-05, "loss": 4.2166, "step": 704 }, { "epoch": 0.18550267678260585, "grad_norm": 1.0388106107711792, "learning_rate": 9.384089714385843e-05, "loss": 4.2686, "step": 706 }, { "epoch": 0.18602818011626762, "grad_norm": 1.080675721168518, "learning_rate": 9.382337480287366e-05, "loss": 4.3008, "step": 708 }, { "epoch": 0.1865536834499294, "grad_norm": 1.2461402416229248, "learning_rate": 9.380585246188891e-05, "loss": 4.2769, "step": 710 }, { "epoch": 0.18707918678359114, "grad_norm": 0.9676238298416138, "learning_rate": 9.378833012090415e-05, "loss": 4.2849, "step": 712 }, { "epoch": 0.18760469011725292, "grad_norm": 0.905734121799469, "learning_rate": 9.37708077799194e-05, "loss": 4.2485, "step": 714 }, { "epoch": 0.1881301934509147, "grad_norm": 0.8259114623069763, "learning_rate": 9.375328543893465e-05, "loss": 4.2153, "step": 716 }, { "epoch": 0.18865569678457647, "grad_norm": 0.938637912273407, "learning_rate": 9.37357630979499e-05, "loss": 4.289, "step": 718 }, { "epoch": 0.18918120011823825, "grad_norm": 1.0711158514022827, "learning_rate": 9.371824075696513e-05, "loss": 4.2388, "step": 720 }, { "epoch": 0.18970670345190002, "grad_norm": 0.9799172282218933, "learning_rate": 9.370071841598038e-05, "loss": 4.2323, "step": 722 }, { "epoch": 0.1902322067855618, "grad_norm": 0.9656129479408264, "learning_rate": 9.368319607499563e-05, "loss": 4.1815, "step": 724 }, { "epoch": 0.19075771011922357, "grad_norm": 1.1559302806854248, "learning_rate": 9.366567373401087e-05, "loss": 4.2163, "step": 726 }, { "epoch": 0.19128321345288535, "grad_norm": 0.933671772480011, "learning_rate": 9.364815139302612e-05, "loss": 4.2208, "step": 728 }, { "epoch": 0.19180871678654712, "grad_norm": 1.159047245979309, "learning_rate": 9.363062905204136e-05, "loss": 4.2604, "step": 730 }, { "epoch": 0.1923342201202089, "grad_norm": 1.2142375707626343, "learning_rate": 9.36131067110566e-05, "loss": 4.209, 
"step": 732 }, { "epoch": 0.19285972345387067, "grad_norm": 1.104740023612976, "learning_rate": 9.359558437007184e-05, "loss": 4.2, "step": 734 }, { "epoch": 0.19338522678753242, "grad_norm": 1.0690265893936157, "learning_rate": 9.357806202908708e-05, "loss": 4.2394, "step": 736 }, { "epoch": 0.1939107301211942, "grad_norm": 0.9487683773040771, "learning_rate": 9.356053968810233e-05, "loss": 4.1891, "step": 738 }, { "epoch": 0.19443623345485597, "grad_norm": 1.0458849668502808, "learning_rate": 9.354301734711758e-05, "loss": 4.24, "step": 740 }, { "epoch": 0.19496173678851775, "grad_norm": 1.189351201057434, "learning_rate": 9.352549500613283e-05, "loss": 4.1955, "step": 742 }, { "epoch": 0.19548724012217952, "grad_norm": 1.0025452375411987, "learning_rate": 9.350797266514808e-05, "loss": 4.2261, "step": 744 }, { "epoch": 0.1960127434558413, "grad_norm": 1.004149079322815, "learning_rate": 9.349045032416331e-05, "loss": 4.2616, "step": 746 }, { "epoch": 0.19653824678950307, "grad_norm": 1.034237027168274, "learning_rate": 9.347292798317856e-05, "loss": 4.2472, "step": 748 }, { "epoch": 0.19706375012316485, "grad_norm": 0.8545398712158203, "learning_rate": 9.34554056421938e-05, "loss": 4.2177, "step": 750 }, { "epoch": 0.19758925345682662, "grad_norm": 0.970920205116272, "learning_rate": 9.343788330120905e-05, "loss": 4.2288, "step": 752 }, { "epoch": 0.1981147567904884, "grad_norm": 0.9060124158859253, "learning_rate": 9.34203609602243e-05, "loss": 4.2747, "step": 754 }, { "epoch": 0.19864026012415018, "grad_norm": 0.9495584964752197, "learning_rate": 9.340283861923953e-05, "loss": 4.2124, "step": 756 }, { "epoch": 0.19916576345781192, "grad_norm": 1.0117506980895996, "learning_rate": 9.338531627825478e-05, "loss": 4.2345, "step": 758 }, { "epoch": 0.1996912667914737, "grad_norm": 1.1771867275238037, "learning_rate": 9.336779393727001e-05, "loss": 4.2508, "step": 760 }, { "epoch": 0.20021677012513547, "grad_norm": 1.2082531452178955, "learning_rate": 
9.335027159628526e-05, "loss": 4.2797, "step": 762 }, { "epoch": 0.20074227345879725, "grad_norm": 1.0689860582351685, "learning_rate": 9.333274925530051e-05, "loss": 4.1988, "step": 764 }, { "epoch": 0.20126777679245902, "grad_norm": 1.2013009786605835, "learning_rate": 9.331522691431576e-05, "loss": 4.2373, "step": 766 }, { "epoch": 0.2017932801261208, "grad_norm": 0.9715994596481323, "learning_rate": 9.3297704573331e-05, "loss": 4.2338, "step": 768 }, { "epoch": 0.20231878345978258, "grad_norm": 1.1095867156982422, "learning_rate": 9.328018223234625e-05, "loss": 4.1865, "step": 770 }, { "epoch": 0.20284428679344435, "grad_norm": 0.9889922738075256, "learning_rate": 9.326265989136149e-05, "loss": 4.2322, "step": 772 }, { "epoch": 0.20336979012710613, "grad_norm": 1.049436092376709, "learning_rate": 9.324513755037673e-05, "loss": 4.1422, "step": 774 }, { "epoch": 0.2038952934607679, "grad_norm": 1.210349202156067, "learning_rate": 9.322761520939198e-05, "loss": 4.3001, "step": 776 }, { "epoch": 0.20442079679442968, "grad_norm": 0.7830674648284912, "learning_rate": 9.321009286840723e-05, "loss": 4.2102, "step": 778 }, { "epoch": 0.20494630012809142, "grad_norm": 1.1593890190124512, "learning_rate": 9.319257052742248e-05, "loss": 4.2442, "step": 780 }, { "epoch": 0.2054718034617532, "grad_norm": 0.9361159205436707, "learning_rate": 9.317504818643771e-05, "loss": 4.2606, "step": 782 }, { "epoch": 0.20599730679541498, "grad_norm": 1.0915993452072144, "learning_rate": 9.315752584545296e-05, "loss": 4.1703, "step": 784 }, { "epoch": 0.20652281012907675, "grad_norm": 0.9425475001335144, "learning_rate": 9.314000350446819e-05, "loss": 4.1767, "step": 786 }, { "epoch": 0.20704831346273853, "grad_norm": 0.9038797616958618, "learning_rate": 9.312248116348344e-05, "loss": 4.2341, "step": 788 }, { "epoch": 0.2075738167964003, "grad_norm": 0.9676367044448853, "learning_rate": 9.310495882249869e-05, "loss": 4.18, "step": 790 }, { "epoch": 0.20809932013006208, "grad_norm": 
1.0088634490966797, "learning_rate": 9.308743648151394e-05, "loss": 4.2168, "step": 792 }, { "epoch": 0.20862482346372385, "grad_norm": 0.9227249622344971, "learning_rate": 9.306991414052918e-05, "loss": 4.1734, "step": 794 }, { "epoch": 0.20915032679738563, "grad_norm": 1.0162461996078491, "learning_rate": 9.305239179954443e-05, "loss": 4.1948, "step": 796 }, { "epoch": 0.2096758301310474, "grad_norm": 0.9532257914543152, "learning_rate": 9.303486945855966e-05, "loss": 4.236, "step": 798 }, { "epoch": 0.21020133346470918, "grad_norm": 0.979803740978241, "learning_rate": 9.301734711757491e-05, "loss": 4.1849, "step": 800 }, { "epoch": 0.21020133346470918, "eval_loss": 4.153889179229736, "eval_runtime": 464.6366, "eval_samples_per_second": 262.117, "eval_steps_per_second": 8.191, "step": 800 }, { "epoch": 0.21072683679837095, "grad_norm": 1.134989857673645, "learning_rate": 9.299982477659016e-05, "loss": 4.1635, "step": 802 }, { "epoch": 0.2112523401320327, "grad_norm": 1.0039291381835938, "learning_rate": 9.298230243560541e-05, "loss": 4.2594, "step": 804 }, { "epoch": 0.21177784346569448, "grad_norm": 0.8763834834098816, "learning_rate": 9.296478009462064e-05, "loss": 4.2166, "step": 806 }, { "epoch": 0.21230334679935625, "grad_norm": 1.0064553022384644, "learning_rate": 9.294725775363589e-05, "loss": 4.2107, "step": 808 }, { "epoch": 0.21282885013301803, "grad_norm": 0.9566521048545837, "learning_rate": 9.292973541265114e-05, "loss": 4.2018, "step": 810 }, { "epoch": 0.2133543534666798, "grad_norm": 0.9895017147064209, "learning_rate": 9.291221307166637e-05, "loss": 4.2032, "step": 812 }, { "epoch": 0.21387985680034158, "grad_norm": 0.8688331246376038, "learning_rate": 9.289469073068162e-05, "loss": 4.184, "step": 814 }, { "epoch": 0.21440536013400335, "grad_norm": 1.0605552196502686, "learning_rate": 9.287716838969687e-05, "loss": 4.201, "step": 816 }, { "epoch": 0.21493086346766513, "grad_norm": 0.9318816661834717, "learning_rate": 9.285964604871211e-05, 
"loss": 4.1967, "step": 818 }, { "epoch": 0.2154563668013269, "grad_norm": 1.022118330001831, "learning_rate": 9.284212370772736e-05, "loss": 4.2504, "step": 820 }, { "epoch": 0.21598187013498868, "grad_norm": 0.9688105583190918, "learning_rate": 9.282460136674261e-05, "loss": 4.2442, "step": 822 }, { "epoch": 0.21650737346865045, "grad_norm": 1.0767052173614502, "learning_rate": 9.280707902575784e-05, "loss": 4.1522, "step": 824 }, { "epoch": 0.2170328768023122, "grad_norm": 1.114383578300476, "learning_rate": 9.278955668477309e-05, "loss": 4.1923, "step": 826 }, { "epoch": 0.21755838013597398, "grad_norm": 0.9999129176139832, "learning_rate": 9.277203434378834e-05, "loss": 4.2046, "step": 828 }, { "epoch": 0.21808388346963575, "grad_norm": 0.8999491930007935, "learning_rate": 9.275451200280358e-05, "loss": 4.2022, "step": 830 }, { "epoch": 0.21860938680329753, "grad_norm": 1.0829931497573853, "learning_rate": 9.273698966181882e-05, "loss": 4.1798, "step": 832 }, { "epoch": 0.2191348901369593, "grad_norm": 0.8846607804298401, "learning_rate": 9.271946732083407e-05, "loss": 4.1823, "step": 834 }, { "epoch": 0.21966039347062108, "grad_norm": 0.9241704940795898, "learning_rate": 9.270194497984931e-05, "loss": 4.2042, "step": 836 }, { "epoch": 0.22018589680428285, "grad_norm": 0.873506486415863, "learning_rate": 9.268442263886455e-05, "loss": 4.2098, "step": 838 }, { "epoch": 0.22071140013794463, "grad_norm": 0.9637856483459473, "learning_rate": 9.26669002978798e-05, "loss": 4.2359, "step": 840 }, { "epoch": 0.2212369034716064, "grad_norm": 0.9835045337677002, "learning_rate": 9.264937795689504e-05, "loss": 4.2072, "step": 842 }, { "epoch": 0.22176240680526818, "grad_norm": 1.0108649730682373, "learning_rate": 9.263185561591029e-05, "loss": 4.2229, "step": 844 }, { "epoch": 0.22228791013892996, "grad_norm": 1.172702431678772, "learning_rate": 9.261433327492554e-05, "loss": 4.1913, "step": 846 }, { "epoch": 0.2228134134725917, "grad_norm": 1.07243013381958, 
"learning_rate": 9.259681093394079e-05, "loss": 4.2167, "step": 848 }, { "epoch": 0.22333891680625348, "grad_norm": 1.20272696018219, "learning_rate": 9.257928859295602e-05, "loss": 4.2007, "step": 850 }, { "epoch": 0.22386442013991525, "grad_norm": 1.091685175895691, "learning_rate": 9.256176625197127e-05, "loss": 4.1863, "step": 852 }, { "epoch": 0.22438992347357703, "grad_norm": 1.0167911052703857, "learning_rate": 9.254424391098651e-05, "loss": 4.1526, "step": 854 }, { "epoch": 0.2249154268072388, "grad_norm": 1.2932296991348267, "learning_rate": 9.252672157000176e-05, "loss": 4.1453, "step": 856 }, { "epoch": 0.22544093014090058, "grad_norm": 1.119771122932434, "learning_rate": 9.2509199229017e-05, "loss": 4.1752, "step": 858 }, { "epoch": 0.22596643347456236, "grad_norm": 1.1617517471313477, "learning_rate": 9.249167688803224e-05, "loss": 4.1858, "step": 860 }, { "epoch": 0.22649193680822413, "grad_norm": 1.1158087253570557, "learning_rate": 9.247415454704749e-05, "loss": 4.1651, "step": 862 }, { "epoch": 0.2270174401418859, "grad_norm": 1.190997838973999, "learning_rate": 9.245663220606273e-05, "loss": 4.1887, "step": 864 }, { "epoch": 0.22754294347554768, "grad_norm": 0.8594352006912231, "learning_rate": 9.243910986507797e-05, "loss": 4.2354, "step": 866 }, { "epoch": 0.22806844680920946, "grad_norm": 0.829155445098877, "learning_rate": 9.242158752409322e-05, "loss": 4.2437, "step": 868 }, { "epoch": 0.2285939501428712, "grad_norm": 0.9194639921188354, "learning_rate": 9.240406518310847e-05, "loss": 4.1545, "step": 870 }, { "epoch": 0.22911945347653298, "grad_norm": 1.0732256174087524, "learning_rate": 9.238654284212372e-05, "loss": 4.193, "step": 872 }, { "epoch": 0.22964495681019476, "grad_norm": 0.8682575821876526, "learning_rate": 9.236902050113896e-05, "loss": 4.1676, "step": 874 }, { "epoch": 0.23017046014385653, "grad_norm": 1.0200450420379639, "learning_rate": 9.23514981601542e-05, "loss": 4.1864, "step": 876 }, { "epoch": 0.2306959634775183, 
"grad_norm": 1.0348173379898071, "learning_rate": 9.233397581916944e-05, "loss": 4.1793, "step": 878 }, { "epoch": 0.23122146681118008, "grad_norm": 1.0631320476531982, "learning_rate": 9.231645347818469e-05, "loss": 4.2207, "step": 880 }, { "epoch": 0.23174697014484186, "grad_norm": 0.9892516136169434, "learning_rate": 9.229893113719993e-05, "loss": 4.1645, "step": 882 }, { "epoch": 0.23227247347850363, "grad_norm": 0.9504501223564148, "learning_rate": 9.228140879621517e-05, "loss": 4.2239, "step": 884 }, { "epoch": 0.2327979768121654, "grad_norm": 0.941244900226593, "learning_rate": 9.226388645523042e-05, "loss": 4.1562, "step": 886 }, { "epoch": 0.23332348014582718, "grad_norm": 1.0678659677505493, "learning_rate": 9.224636411424567e-05, "loss": 4.2083, "step": 888 }, { "epoch": 0.23384898347948896, "grad_norm": 1.0697544813156128, "learning_rate": 9.222884177326092e-05, "loss": 4.2134, "step": 890 }, { "epoch": 0.23437448681315073, "grad_norm": 0.9326673746109009, "learning_rate": 9.221131943227615e-05, "loss": 4.1684, "step": 892 }, { "epoch": 0.23489999014681248, "grad_norm": 1.085872769355774, "learning_rate": 9.21937970912914e-05, "loss": 4.15, "step": 894 }, { "epoch": 0.23542549348047426, "grad_norm": 0.9281808733940125, "learning_rate": 9.217627475030665e-05, "loss": 4.1774, "step": 896 }, { "epoch": 0.23595099681413603, "grad_norm": 0.9127162098884583, "learning_rate": 9.21587524093219e-05, "loss": 4.1959, "step": 898 }, { "epoch": 0.2364765001477978, "grad_norm": 0.9778500199317932, "learning_rate": 9.214123006833714e-05, "loss": 4.2188, "step": 900 }, { "epoch": 0.23700200348145958, "grad_norm": 0.883875846862793, "learning_rate": 9.212370772735239e-05, "loss": 4.2124, "step": 902 }, { "epoch": 0.23752750681512136, "grad_norm": 1.013342022895813, "learning_rate": 9.210618538636762e-05, "loss": 4.1381, "step": 904 }, { "epoch": 0.23805301014878313, "grad_norm": 0.9834499359130859, "learning_rate": 9.208866304538287e-05, "loss": 4.231, "step": 906 }, { 
"epoch": 0.2385785134824449, "grad_norm": 0.9580435156822205, "learning_rate": 9.20711407043981e-05, "loss": 4.2067, "step": 908 }, { "epoch": 0.23910401681610668, "grad_norm": 1.129984974861145, "learning_rate": 9.205361836341335e-05, "loss": 4.1916, "step": 910 }, { "epoch": 0.23962952014976846, "grad_norm": 0.8870227336883545, "learning_rate": 9.20360960224286e-05, "loss": 4.1851, "step": 912 }, { "epoch": 0.24015502348343024, "grad_norm": 1.2790063619613647, "learning_rate": 9.201857368144385e-05, "loss": 4.1732, "step": 914 }, { "epoch": 0.24068052681709198, "grad_norm": 0.92209792137146, "learning_rate": 9.20010513404591e-05, "loss": 4.2006, "step": 916 }, { "epoch": 0.24120603015075376, "grad_norm": 1.010841727256775, "learning_rate": 9.198352899947433e-05, "loss": 4.1729, "step": 918 }, { "epoch": 0.24173153348441553, "grad_norm": 0.85013347864151, "learning_rate": 9.196600665848958e-05, "loss": 4.2045, "step": 920 }, { "epoch": 0.2422570368180773, "grad_norm": 1.341628909111023, "learning_rate": 9.194848431750482e-05, "loss": 4.1893, "step": 922 }, { "epoch": 0.24278254015173908, "grad_norm": 1.0640243291854858, "learning_rate": 9.193096197652007e-05, "loss": 4.1636, "step": 924 }, { "epoch": 0.24330804348540086, "grad_norm": 1.1869163513183594, "learning_rate": 9.191343963553532e-05, "loss": 4.1947, "step": 926 }, { "epoch": 0.24383354681906264, "grad_norm": 1.0969324111938477, "learning_rate": 9.189591729455057e-05, "loss": 4.1483, "step": 928 }, { "epoch": 0.2443590501527244, "grad_norm": 0.9260280132293701, "learning_rate": 9.18783949535658e-05, "loss": 4.1621, "step": 930 }, { "epoch": 0.2448845534863862, "grad_norm": 1.5303243398666382, "learning_rate": 9.186087261258105e-05, "loss": 4.2022, "step": 932 }, { "epoch": 0.24541005682004796, "grad_norm": 0.9254022836685181, "learning_rate": 9.184335027159628e-05, "loss": 4.1248, "step": 934 }, { "epoch": 0.24593556015370974, "grad_norm": 1.1392782926559448, "learning_rate": 9.182582793061153e-05, "loss": 
4.1415, "step": 936 }, { "epoch": 0.24646106348737148, "grad_norm": 1.0129998922348022, "learning_rate": 9.180830558962678e-05, "loss": 4.1574, "step": 938 }, { "epoch": 0.24698656682103326, "grad_norm": 1.1144006252288818, "learning_rate": 9.179078324864202e-05, "loss": 4.17, "step": 940 }, { "epoch": 0.24751207015469504, "grad_norm": 1.2086069583892822, "learning_rate": 9.177326090765727e-05, "loss": 4.1527, "step": 942 }, { "epoch": 0.2480375734883568, "grad_norm": 1.0316379070281982, "learning_rate": 9.17557385666725e-05, "loss": 4.1946, "step": 944 }, { "epoch": 0.24856307682201859, "grad_norm": 1.3506675958633423, "learning_rate": 9.173821622568775e-05, "loss": 4.1664, "step": 946 }, { "epoch": 0.24908858015568036, "grad_norm": 0.9351489543914795, "learning_rate": 9.1720693884703e-05, "loss": 4.1353, "step": 948 }, { "epoch": 0.24961408348934214, "grad_norm": 1.010267734527588, "learning_rate": 9.170317154371825e-05, "loss": 4.1591, "step": 950 }, { "epoch": 0.2501395868230039, "grad_norm": 0.9747198224067688, "learning_rate": 9.16856492027335e-05, "loss": 4.16, "step": 952 }, { "epoch": 0.25066509015666566, "grad_norm": 1.190571904182434, "learning_rate": 9.166812686174874e-05, "loss": 4.1786, "step": 954 }, { "epoch": 0.25119059349032746, "grad_norm": 1.1121963262557983, "learning_rate": 9.165060452076398e-05, "loss": 4.214, "step": 956 }, { "epoch": 0.2517160968239892, "grad_norm": 1.0191761255264282, "learning_rate": 9.163308217977923e-05, "loss": 4.091, "step": 958 }, { "epoch": 0.252241600157651, "grad_norm": 1.211373209953308, "learning_rate": 9.161555983879446e-05, "loss": 4.1691, "step": 960 }, { "epoch": 0.25276710349131276, "grad_norm": 1.146796464920044, "learning_rate": 9.159803749780971e-05, "loss": 4.1601, "step": 962 }, { "epoch": 0.25329260682497456, "grad_norm": 0.9757868051528931, "learning_rate": 9.158051515682495e-05, "loss": 4.2334, "step": 964 }, { "epoch": 0.2538181101586363, "grad_norm": 1.2072404623031616, "learning_rate": 
9.15629928158402e-05, "loss": 4.1519, "step": 966 }, { "epoch": 0.2543436134922981, "grad_norm": 1.1799613237380981, "learning_rate": 9.154547047485545e-05, "loss": 4.1626, "step": 968 }, { "epoch": 0.25486911682595986, "grad_norm": 0.9668821692466736, "learning_rate": 9.152794813387068e-05, "loss": 4.1555, "step": 970 }, { "epoch": 0.2553946201596216, "grad_norm": 1.0472767353057861, "learning_rate": 9.151042579288593e-05, "loss": 4.1916, "step": 972 }, { "epoch": 0.2559201234932834, "grad_norm": 1.1745355129241943, "learning_rate": 9.149290345190118e-05, "loss": 4.1678, "step": 974 }, { "epoch": 0.25644562682694516, "grad_norm": 1.2124180793762207, "learning_rate": 9.147538111091643e-05, "loss": 4.1421, "step": 976 }, { "epoch": 0.25697113016060696, "grad_norm": 0.9439253807067871, "learning_rate": 9.145785876993167e-05, "loss": 4.1716, "step": 978 }, { "epoch": 0.2574966334942687, "grad_norm": 1.0484727621078491, "learning_rate": 9.144033642894692e-05, "loss": 4.1998, "step": 980 }, { "epoch": 0.2580221368279305, "grad_norm": 1.1602739095687866, "learning_rate": 9.142281408796216e-05, "loss": 4.108, "step": 982 }, { "epoch": 0.25854764016159226, "grad_norm": 1.1963294744491577, "learning_rate": 9.140529174697739e-05, "loss": 4.2143, "step": 984 }, { "epoch": 0.25907314349525407, "grad_norm": 0.9893843531608582, "learning_rate": 9.138776940599264e-05, "loss": 4.2016, "step": 986 }, { "epoch": 0.2595986468289158, "grad_norm": 1.1895357370376587, "learning_rate": 9.137024706500788e-05, "loss": 4.1831, "step": 988 }, { "epoch": 0.2601241501625776, "grad_norm": 0.9686254262924194, "learning_rate": 9.135272472402313e-05, "loss": 4.1225, "step": 990 }, { "epoch": 0.26064965349623936, "grad_norm": 1.1365811824798584, "learning_rate": 9.133520238303838e-05, "loss": 4.1569, "step": 992 }, { "epoch": 0.26117515682990117, "grad_norm": 1.0534486770629883, "learning_rate": 9.131768004205363e-05, "loss": 4.1959, "step": 994 }, { "epoch": 0.2617006601635629, "grad_norm": 
0.9037180542945862, "learning_rate": 9.130015770106886e-05, "loss": 4.1495, "step": 996 }, { "epoch": 0.26222616349722466, "grad_norm": 1.1592471599578857, "learning_rate": 9.128263536008411e-05, "loss": 4.2057, "step": 998 }, { "epoch": 0.26275166683088647, "grad_norm": 1.0708454847335815, "learning_rate": 9.126511301909936e-05, "loss": 4.1927, "step": 1000 }, { "epoch": 0.2632771701645482, "grad_norm": 1.233514428138733, "learning_rate": 9.12475906781146e-05, "loss": 4.0928, "step": 1002 }, { "epoch": 0.26380267349821, "grad_norm": 1.1946560144424438, "learning_rate": 9.123006833712985e-05, "loss": 4.1635, "step": 1004 }, { "epoch": 0.26432817683187176, "grad_norm": 1.0209877490997314, "learning_rate": 9.12125459961451e-05, "loss": 4.11, "step": 1006 }, { "epoch": 0.26485368016553357, "grad_norm": 1.1710538864135742, "learning_rate": 9.119502365516033e-05, "loss": 4.1392, "step": 1008 }, { "epoch": 0.2653791834991953, "grad_norm": 1.1850839853286743, "learning_rate": 9.117750131417557e-05, "loss": 4.142, "step": 1010 }, { "epoch": 0.2659046868328571, "grad_norm": 1.1082483530044556, "learning_rate": 9.115997897319081e-05, "loss": 4.1544, "step": 1012 }, { "epoch": 0.26643019016651887, "grad_norm": 1.022905707359314, "learning_rate": 9.114245663220606e-05, "loss": 4.1488, "step": 1014 }, { "epoch": 0.26695569350018067, "grad_norm": 1.2282233238220215, "learning_rate": 9.112493429122131e-05, "loss": 4.1807, "step": 1016 }, { "epoch": 0.2674811968338424, "grad_norm": 1.0894320011138916, "learning_rate": 9.110741195023656e-05, "loss": 4.0915, "step": 1018 }, { "epoch": 0.26800670016750416, "grad_norm": 1.1942992210388184, "learning_rate": 9.10898896092518e-05, "loss": 4.1999, "step": 1020 }, { "epoch": 0.26853220350116597, "grad_norm": 0.946590781211853, "learning_rate": 9.107236726826704e-05, "loss": 4.1098, "step": 1022 }, { "epoch": 0.2690577068348277, "grad_norm": 1.08231520652771, "learning_rate": 9.105484492728229e-05, "loss": 4.2151, "step": 1024 }, { "epoch": 
0.2695832101684895, "grad_norm": 0.9118293523788452, "learning_rate": 9.103732258629753e-05, "loss": 4.1333, "step": 1026 }, { "epoch": 0.27010871350215127, "grad_norm": 1.1627392768859863, "learning_rate": 9.101980024531278e-05, "loss": 4.1482, "step": 1028 }, { "epoch": 0.27063421683581307, "grad_norm": 0.9562456011772156, "learning_rate": 9.100227790432803e-05, "loss": 4.1552, "step": 1030 }, { "epoch": 0.2711597201694748, "grad_norm": 1.090744137763977, "learning_rate": 9.098475556334328e-05, "loss": 4.1644, "step": 1032 }, { "epoch": 0.2716852235031366, "grad_norm": 0.9701551795005798, "learning_rate": 9.096723322235851e-05, "loss": 4.111, "step": 1034 }, { "epoch": 0.27221072683679837, "grad_norm": 1.5099138021469116, "learning_rate": 9.094971088137374e-05, "loss": 4.1806, "step": 1036 }, { "epoch": 0.27273623017046017, "grad_norm": 1.094212293624878, "learning_rate": 9.093218854038899e-05, "loss": 4.1769, "step": 1038 }, { "epoch": 0.2732617335041219, "grad_norm": 1.100304126739502, "learning_rate": 9.091466619940424e-05, "loss": 4.1893, "step": 1040 }, { "epoch": 0.27378723683778367, "grad_norm": 1.2607030868530273, "learning_rate": 9.089714385841949e-05, "loss": 4.1755, "step": 1042 }, { "epoch": 0.27431274017144547, "grad_norm": 1.2708219289779663, "learning_rate": 9.087962151743474e-05, "loss": 4.1732, "step": 1044 }, { "epoch": 0.2748382435051072, "grad_norm": 1.1459835767745972, "learning_rate": 9.086209917644998e-05, "loss": 4.1507, "step": 1046 }, { "epoch": 0.275363746838769, "grad_norm": 1.2392457723617554, "learning_rate": 9.084457683546522e-05, "loss": 4.1468, "step": 1048 }, { "epoch": 0.27588925017243077, "grad_norm": 0.9618298411369324, "learning_rate": 9.082705449448046e-05, "loss": 4.1553, "step": 1050 }, { "epoch": 0.27641475350609257, "grad_norm": 1.1533738374710083, "learning_rate": 9.080953215349571e-05, "loss": 4.1741, "step": 1052 }, { "epoch": 0.2769402568397543, "grad_norm": 1.4546704292297363, "learning_rate": 9.079200981251096e-05, 
"loss": 4.1016, "step": 1054 }, { "epoch": 0.2774657601734161, "grad_norm": 1.028242826461792, "learning_rate": 9.077448747152621e-05, "loss": 4.1731, "step": 1056 }, { "epoch": 0.27799126350707787, "grad_norm": 1.216888427734375, "learning_rate": 9.075696513054145e-05, "loss": 4.1411, "step": 1058 }, { "epoch": 0.27851676684073967, "grad_norm": 1.0497368574142456, "learning_rate": 9.073944278955669e-05, "loss": 4.1392, "step": 1060 }, { "epoch": 0.2790422701744014, "grad_norm": 1.0181561708450317, "learning_rate": 9.072192044857192e-05, "loss": 4.128, "step": 1062 }, { "epoch": 0.27956777350806317, "grad_norm": 0.961370050907135, "learning_rate": 9.070439810758717e-05, "loss": 4.1396, "step": 1064 }, { "epoch": 0.28009327684172497, "grad_norm": 1.2265616655349731, "learning_rate": 9.068687576660242e-05, "loss": 4.1579, "step": 1066 }, { "epoch": 0.2806187801753867, "grad_norm": 1.2476699352264404, "learning_rate": 9.066935342561767e-05, "loss": 4.1725, "step": 1068 }, { "epoch": 0.2811442835090485, "grad_norm": 1.1935466527938843, "learning_rate": 9.065183108463291e-05, "loss": 4.1636, "step": 1070 }, { "epoch": 0.28166978684271027, "grad_norm": 1.1502715349197388, "learning_rate": 9.063430874364816e-05, "loss": 4.1493, "step": 1072 }, { "epoch": 0.28219529017637207, "grad_norm": 1.00157630443573, "learning_rate": 9.06167864026634e-05, "loss": 4.1371, "step": 1074 }, { "epoch": 0.2827207935100338, "grad_norm": 1.3991929292678833, "learning_rate": 9.059926406167864e-05, "loss": 4.2056, "step": 1076 }, { "epoch": 0.2832462968436956, "grad_norm": 1.3524880409240723, "learning_rate": 9.058174172069389e-05, "loss": 4.1221, "step": 1078 }, { "epoch": 0.28377180017735737, "grad_norm": 0.9978928565979004, "learning_rate": 9.056421937970914e-05, "loss": 4.1233, "step": 1080 }, { "epoch": 0.2842973035110192, "grad_norm": 0.8633115887641907, "learning_rate": 9.054669703872438e-05, "loss": 4.1879, "step": 1082 }, { "epoch": 0.2848228068446809, "grad_norm": 0.9556880593299866, 
"learning_rate": 9.052917469773963e-05, "loss": 4.2208, "step": 1084 }, { "epoch": 0.28534831017834267, "grad_norm": 1.0791780948638916, "learning_rate": 9.051165235675487e-05, "loss": 4.1653, "step": 1086 }, { "epoch": 0.28587381351200447, "grad_norm": 1.2553349733352661, "learning_rate": 9.04941300157701e-05, "loss": 4.129, "step": 1088 }, { "epoch": 0.2863993168456662, "grad_norm": 1.2013320922851562, "learning_rate": 9.047660767478535e-05, "loss": 4.1602, "step": 1090 }, { "epoch": 0.286924820179328, "grad_norm": 1.3966493606567383, "learning_rate": 9.04590853338006e-05, "loss": 4.1519, "step": 1092 }, { "epoch": 0.28745032351298977, "grad_norm": 1.030616044998169, "learning_rate": 9.044156299281584e-05, "loss": 4.1552, "step": 1094 }, { "epoch": 0.2879758268466516, "grad_norm": 1.1960362195968628, "learning_rate": 9.042404065183109e-05, "loss": 4.1156, "step": 1096 }, { "epoch": 0.2885013301803133, "grad_norm": 1.0187939405441284, "learning_rate": 9.040651831084634e-05, "loss": 4.1133, "step": 1098 }, { "epoch": 0.2890268335139751, "grad_norm": 1.3078755140304565, "learning_rate": 9.038899596986157e-05, "loss": 4.1912, "step": 1100 }, { "epoch": 0.28955233684763687, "grad_norm": 0.8451811075210571, "learning_rate": 9.037147362887682e-05, "loss": 4.1097, "step": 1102 }, { "epoch": 0.2900778401812987, "grad_norm": 1.0018359422683716, "learning_rate": 9.035395128789207e-05, "loss": 4.1197, "step": 1104 }, { "epoch": 0.2906033435149604, "grad_norm": 0.9376197457313538, "learning_rate": 9.033642894690731e-05, "loss": 4.1695, "step": 1106 }, { "epoch": 0.29112884684862217, "grad_norm": 0.9326140284538269, "learning_rate": 9.031890660592256e-05, "loss": 4.0942, "step": 1108 }, { "epoch": 0.291654350182284, "grad_norm": 1.0732053518295288, "learning_rate": 9.030138426493781e-05, "loss": 4.1547, "step": 1110 }, { "epoch": 0.2921798535159457, "grad_norm": 1.2791393995285034, "learning_rate": 9.028386192395304e-05, "loss": 4.1093, "step": 1112 }, { "epoch": 
0.2927053568496075, "grad_norm": 0.9891042709350586, "learning_rate": 9.026633958296828e-05, "loss": 4.1924, "step": 1114 }, { "epoch": 0.29323086018326927, "grad_norm": 1.3346772193908691, "learning_rate": 9.024881724198353e-05, "loss": 4.1296, "step": 1116 }, { "epoch": 0.2937563635169311, "grad_norm": 0.9858383536338806, "learning_rate": 9.023129490099877e-05, "loss": 4.129, "step": 1118 }, { "epoch": 0.2942818668505928, "grad_norm": 1.1703115701675415, "learning_rate": 9.021377256001402e-05, "loss": 4.1664, "step": 1120 }, { "epoch": 0.2948073701842546, "grad_norm": 1.2127907276153564, "learning_rate": 9.019625021902927e-05, "loss": 4.1905, "step": 1122 }, { "epoch": 0.29533287351791637, "grad_norm": 1.1919876337051392, "learning_rate": 9.017872787804452e-05, "loss": 4.1022, "step": 1124 }, { "epoch": 0.2958583768515782, "grad_norm": 1.015348196029663, "learning_rate": 9.016120553705975e-05, "loss": 4.0752, "step": 1126 }, { "epoch": 0.2963838801852399, "grad_norm": 0.9550849795341492, "learning_rate": 9.0143683196075e-05, "loss": 4.1181, "step": 1128 }, { "epoch": 0.29690938351890167, "grad_norm": 1.081204891204834, "learning_rate": 9.012616085509024e-05, "loss": 4.1504, "step": 1130 }, { "epoch": 0.2974348868525635, "grad_norm": 1.3727375268936157, "learning_rate": 9.010863851410549e-05, "loss": 4.1469, "step": 1132 }, { "epoch": 0.2979603901862252, "grad_norm": 1.2950149774551392, "learning_rate": 9.009111617312074e-05, "loss": 4.0913, "step": 1134 }, { "epoch": 0.298485893519887, "grad_norm": 1.017348289489746, "learning_rate": 9.007359383213599e-05, "loss": 4.1446, "step": 1136 }, { "epoch": 0.29901139685354877, "grad_norm": 1.1251317262649536, "learning_rate": 9.005607149115122e-05, "loss": 4.1324, "step": 1138 }, { "epoch": 0.2995369001872106, "grad_norm": 1.0848757028579712, "learning_rate": 9.003854915016647e-05, "loss": 4.1465, "step": 1140 }, { "epoch": 0.3000624035208723, "grad_norm": 0.9154160022735596, "learning_rate": 9.00210268091817e-05, 
"loss": 4.1347, "step": 1142 }, { "epoch": 0.3005879068545341, "grad_norm": 0.9990867972373962, "learning_rate": 9.000350446819695e-05, "loss": 4.1418, "step": 1144 }, { "epoch": 0.3011134101881959, "grad_norm": 0.9951852560043335, "learning_rate": 8.99859821272122e-05, "loss": 4.0869, "step": 1146 }, { "epoch": 0.3016389135218577, "grad_norm": 0.967040479183197, "learning_rate": 8.996845978622745e-05, "loss": 4.1108, "step": 1148 }, { "epoch": 0.3021644168555194, "grad_norm": 1.0775066614151, "learning_rate": 8.99509374452427e-05, "loss": 4.1528, "step": 1150 }, { "epoch": 0.3026899201891812, "grad_norm": 0.926328718662262, "learning_rate": 8.993341510425794e-05, "loss": 4.104, "step": 1152 }, { "epoch": 0.303215423522843, "grad_norm": 1.0798577070236206, "learning_rate": 8.991589276327317e-05, "loss": 4.1289, "step": 1154 }, { "epoch": 0.3037409268565047, "grad_norm": 0.9451173543930054, "learning_rate": 8.989837042228842e-05, "loss": 4.1509, "step": 1156 }, { "epoch": 0.3042664301901665, "grad_norm": 1.1517791748046875, "learning_rate": 8.988084808130367e-05, "loss": 4.1534, "step": 1158 }, { "epoch": 0.3047919335238283, "grad_norm": 1.2816054821014404, "learning_rate": 8.986332574031892e-05, "loss": 4.1138, "step": 1160 }, { "epoch": 0.3053174368574901, "grad_norm": 1.1231276988983154, "learning_rate": 8.984580339933417e-05, "loss": 4.1446, "step": 1162 }, { "epoch": 0.3058429401911518, "grad_norm": 1.0346784591674805, "learning_rate": 8.98282810583494e-05, "loss": 4.1439, "step": 1164 }, { "epoch": 0.3063684435248136, "grad_norm": 1.3358701467514038, "learning_rate": 8.981075871736465e-05, "loss": 4.1501, "step": 1166 }, { "epoch": 0.3068939468584754, "grad_norm": 1.0350964069366455, "learning_rate": 8.979323637637988e-05, "loss": 4.0711, "step": 1168 }, { "epoch": 0.3074194501921372, "grad_norm": 1.1273199319839478, "learning_rate": 8.977571403539513e-05, "loss": 4.1152, "step": 1170 }, { "epoch": 0.3079449535257989, "grad_norm": 1.1725157499313354, 
"learning_rate": 8.975819169441038e-05, "loss": 4.073, "step": 1172 }, { "epoch": 0.30847045685946073, "grad_norm": 0.921409010887146, "learning_rate": 8.974066935342562e-05, "loss": 4.1325, "step": 1174 }, { "epoch": 0.3089959601931225, "grad_norm": 1.0092064142227173, "learning_rate": 8.972314701244087e-05, "loss": 4.1111, "step": 1176 }, { "epoch": 0.3095214635267842, "grad_norm": 1.1221214532852173, "learning_rate": 8.970562467145612e-05, "loss": 4.1453, "step": 1178 }, { "epoch": 0.310046966860446, "grad_norm": 1.3515424728393555, "learning_rate": 8.968810233047135e-05, "loss": 4.1455, "step": 1180 }, { "epoch": 0.3105724701941078, "grad_norm": 1.1687883138656616, "learning_rate": 8.96705799894866e-05, "loss": 4.139, "step": 1182 }, { "epoch": 0.3110979735277696, "grad_norm": 1.0609285831451416, "learning_rate": 8.965305764850185e-05, "loss": 4.1096, "step": 1184 }, { "epoch": 0.3116234768614313, "grad_norm": 1.121606469154358, "learning_rate": 8.96355353075171e-05, "loss": 4.1238, "step": 1186 }, { "epoch": 0.31214898019509313, "grad_norm": 1.0141186714172363, "learning_rate": 8.961801296653233e-05, "loss": 4.1036, "step": 1188 }, { "epoch": 0.3126744835287549, "grad_norm": 1.0657929182052612, "learning_rate": 8.960049062554758e-05, "loss": 4.0814, "step": 1190 }, { "epoch": 0.3131999868624167, "grad_norm": 1.1390215158462524, "learning_rate": 8.958296828456282e-05, "loss": 4.1281, "step": 1192 }, { "epoch": 0.3137254901960784, "grad_norm": 0.9116215705871582, "learning_rate": 8.956544594357806e-05, "loss": 4.1122, "step": 1194 }, { "epoch": 0.31425099352974023, "grad_norm": 1.0679601430892944, "learning_rate": 8.95479236025933e-05, "loss": 4.1467, "step": 1196 }, { "epoch": 0.314776496863402, "grad_norm": 1.051199197769165, "learning_rate": 8.953040126160855e-05, "loss": 4.1396, "step": 1198 }, { "epoch": 0.3153020001970637, "grad_norm": 1.1230065822601318, "learning_rate": 8.95128789206238e-05, "loss": 4.1369, "step": 1200 }, { "epoch": 0.3153020001970637, 
"eval_loss": 4.084097862243652, "eval_runtime": 464.6434, "eval_samples_per_second": 262.113, "eval_steps_per_second": 8.191, "step": 1200 }, { "epoch": 0.31582750353072553, "grad_norm": 1.1806052923202515, "learning_rate": 8.949535657963905e-05, "loss": 4.1462, "step": 1202 }, { "epoch": 0.3163530068643873, "grad_norm": 1.0441542863845825, "learning_rate": 8.94778342386543e-05, "loss": 4.1154, "step": 1204 }, { "epoch": 0.3168785101980491, "grad_norm": 1.0149189233779907, "learning_rate": 8.946031189766953e-05, "loss": 4.1162, "step": 1206 }, { "epoch": 0.3174040135317108, "grad_norm": 1.0076645612716675, "learning_rate": 8.944278955668478e-05, "loss": 4.0969, "step": 1208 }, { "epoch": 0.31792951686537263, "grad_norm": 1.380466341972351, "learning_rate": 8.942526721570003e-05, "loss": 4.1983, "step": 1210 }, { "epoch": 0.3184550201990344, "grad_norm": 1.0471292734146118, "learning_rate": 8.940774487471527e-05, "loss": 4.1261, "step": 1212 }, { "epoch": 0.3189805235326962, "grad_norm": 1.2716357707977295, "learning_rate": 8.939022253373051e-05, "loss": 4.1015, "step": 1214 }, { "epoch": 0.31950602686635793, "grad_norm": 1.2458291053771973, "learning_rate": 8.937270019274575e-05, "loss": 4.1035, "step": 1216 }, { "epoch": 0.32003153020001973, "grad_norm": 1.0957000255584717, "learning_rate": 8.9355177851761e-05, "loss": 4.1216, "step": 1218 }, { "epoch": 0.3205570335336815, "grad_norm": 1.1473244428634644, "learning_rate": 8.933765551077624e-05, "loss": 4.1432, "step": 1220 }, { "epoch": 0.3210825368673432, "grad_norm": 1.028734564781189, "learning_rate": 8.932013316979148e-05, "loss": 4.1375, "step": 1222 }, { "epoch": 0.32160804020100503, "grad_norm": 1.0124138593673706, "learning_rate": 8.930261082880673e-05, "loss": 4.1403, "step": 1224 }, { "epoch": 0.3221335435346668, "grad_norm": 0.9268117547035217, "learning_rate": 8.928508848782198e-05, "loss": 4.1335, "step": 1226 }, { "epoch": 0.3226590468683286, "grad_norm": 1.5620018243789673, "learning_rate": 
8.926756614683723e-05, "loss": 4.1025, "step": 1228 }, { "epoch": 0.32318455020199033, "grad_norm": 1.0583161115646362, "learning_rate": 8.925004380585247e-05, "loss": 4.1212, "step": 1230 }, { "epoch": 0.32371005353565213, "grad_norm": 1.3555305004119873, "learning_rate": 8.923252146486771e-05, "loss": 4.1123, "step": 1232 }, { "epoch": 0.3242355568693139, "grad_norm": 1.2149893045425415, "learning_rate": 8.921499912388296e-05, "loss": 4.087, "step": 1234 }, { "epoch": 0.3247610602029757, "grad_norm": 1.0165433883666992, "learning_rate": 8.91974767828982e-05, "loss": 4.1624, "step": 1236 }, { "epoch": 0.32528656353663743, "grad_norm": 1.2030507326126099, "learning_rate": 8.917995444191345e-05, "loss": 4.1265, "step": 1238 }, { "epoch": 0.32581206687029923, "grad_norm": 1.0473111867904663, "learning_rate": 8.916243210092868e-05, "loss": 4.1253, "step": 1240 }, { "epoch": 0.326337570203961, "grad_norm": 1.3173843622207642, "learning_rate": 8.914490975994393e-05, "loss": 4.133, "step": 1242 }, { "epoch": 0.32686307353762273, "grad_norm": 1.1272777318954468, "learning_rate": 8.912738741895918e-05, "loss": 4.0932, "step": 1244 }, { "epoch": 0.32738857687128453, "grad_norm": 1.1983803510665894, "learning_rate": 8.910986507797441e-05, "loss": 4.1112, "step": 1246 }, { "epoch": 0.3279140802049463, "grad_norm": 1.1434625387191772, "learning_rate": 8.909234273698966e-05, "loss": 4.1286, "step": 1248 }, { "epoch": 0.3284395835386081, "grad_norm": 1.0072258710861206, "learning_rate": 8.907482039600491e-05, "loss": 4.1013, "step": 1250 }, { "epoch": 0.32896508687226983, "grad_norm": 1.1848386526107788, "learning_rate": 8.905729805502016e-05, "loss": 4.1458, "step": 1252 }, { "epoch": 0.32949059020593163, "grad_norm": 1.0572404861450195, "learning_rate": 8.90397757140354e-05, "loss": 4.103, "step": 1254 }, { "epoch": 0.3300160935395934, "grad_norm": 0.8860021233558655, "learning_rate": 8.902225337305065e-05, "loss": 4.0959, "step": 1256 }, { "epoch": 0.3305415968732552, 
"grad_norm": 1.34895920753479, "learning_rate": 8.900473103206589e-05, "loss": 4.1297, "step": 1258 }, { "epoch": 0.33106710020691693, "grad_norm": 1.010286808013916, "learning_rate": 8.898720869108113e-05, "loss": 4.1187, "step": 1260 }, { "epoch": 0.33159260354057873, "grad_norm": 1.0417863130569458, "learning_rate": 8.896968635009638e-05, "loss": 4.1122, "step": 1262 }, { "epoch": 0.3321181068742405, "grad_norm": 1.117486596107483, "learning_rate": 8.895216400911163e-05, "loss": 4.1209, "step": 1264 }, { "epoch": 0.33264361020790223, "grad_norm": 0.9882538914680481, "learning_rate": 8.893464166812686e-05, "loss": 4.1013, "step": 1266 }, { "epoch": 0.33316911354156403, "grad_norm": 1.1124893426895142, "learning_rate": 8.891711932714211e-05, "loss": 4.1254, "step": 1268 }, { "epoch": 0.3336946168752258, "grad_norm": 0.9390459060668945, "learning_rate": 8.889959698615736e-05, "loss": 4.1111, "step": 1270 }, { "epoch": 0.3342201202088876, "grad_norm": 1.0176641941070557, "learning_rate": 8.888207464517259e-05, "loss": 4.1198, "step": 1272 }, { "epoch": 0.33474562354254933, "grad_norm": 1.0470962524414062, "learning_rate": 8.886455230418784e-05, "loss": 4.1043, "step": 1274 }, { "epoch": 0.33527112687621113, "grad_norm": 1.0495022535324097, "learning_rate": 8.884702996320309e-05, "loss": 4.1464, "step": 1276 }, { "epoch": 0.3357966302098729, "grad_norm": 1.2079837322235107, "learning_rate": 8.882950762221833e-05, "loss": 4.1223, "step": 1278 }, { "epoch": 0.3363221335435347, "grad_norm": 0.8855212926864624, "learning_rate": 8.881198528123358e-05, "loss": 4.1361, "step": 1280 }, { "epoch": 0.33684763687719643, "grad_norm": 1.0201783180236816, "learning_rate": 8.879446294024883e-05, "loss": 4.1291, "step": 1282 }, { "epoch": 0.33737314021085824, "grad_norm": 1.0831118822097778, "learning_rate": 8.877694059926406e-05, "loss": 4.0707, "step": 1284 }, { "epoch": 0.33789864354452, "grad_norm": 1.065210223197937, "learning_rate": 8.875941825827931e-05, "loss": 4.1465, 
"step": 1286 }, { "epoch": 0.33842414687818173, "grad_norm": 1.0698130130767822, "learning_rate": 8.874189591729456e-05, "loss": 4.1149, "step": 1288 }, { "epoch": 0.33894965021184353, "grad_norm": 0.9620767831802368, "learning_rate": 8.872437357630979e-05, "loss": 4.1058, "step": 1290 }, { "epoch": 0.3394751535455053, "grad_norm": 1.1384581327438354, "learning_rate": 8.870685123532504e-05, "loss": 4.0791, "step": 1292 }, { "epoch": 0.3400006568791671, "grad_norm": 1.0505400896072388, "learning_rate": 8.868932889434029e-05, "loss": 4.1071, "step": 1294 }, { "epoch": 0.34052616021282883, "grad_norm": 1.0416316986083984, "learning_rate": 8.867180655335554e-05, "loss": 4.0953, "step": 1296 }, { "epoch": 0.34105166354649064, "grad_norm": 0.9876034259796143, "learning_rate": 8.865428421237077e-05, "loss": 4.0997, "step": 1298 }, { "epoch": 0.3415771668801524, "grad_norm": 1.07735276222229, "learning_rate": 8.863676187138602e-05, "loss": 4.1554, "step": 1300 }, { "epoch": 0.3421026702138142, "grad_norm": 1.0448873043060303, "learning_rate": 8.861923953040126e-05, "loss": 4.1551, "step": 1302 }, { "epoch": 0.34262817354747593, "grad_norm": 0.9699362516403198, "learning_rate": 8.860171718941651e-05, "loss": 4.0937, "step": 1304 }, { "epoch": 0.34315367688113774, "grad_norm": 1.0039384365081787, "learning_rate": 8.858419484843176e-05, "loss": 4.1033, "step": 1306 }, { "epoch": 0.3436791802147995, "grad_norm": 1.1278984546661377, "learning_rate": 8.856667250744701e-05, "loss": 4.1108, "step": 1308 }, { "epoch": 0.3442046835484613, "grad_norm": 1.0225863456726074, "learning_rate": 8.854915016646224e-05, "loss": 4.0923, "step": 1310 }, { "epoch": 0.34473018688212304, "grad_norm": 0.9759097099304199, "learning_rate": 8.853162782547749e-05, "loss": 4.0944, "step": 1312 }, { "epoch": 0.3452556902157848, "grad_norm": 1.089032769203186, "learning_rate": 8.851410548449274e-05, "loss": 4.0962, "step": 1314 }, { "epoch": 0.3457811935494466, "grad_norm": 0.9909473657608032, 
"learning_rate": 8.849658314350797e-05, "loss": 4.0987, "step": 1316 }, { "epoch": 0.34630669688310833, "grad_norm": 1.146744966506958, "learning_rate": 8.847906080252322e-05, "loss": 4.0749, "step": 1318 }, { "epoch": 0.34683220021677014, "grad_norm": 1.010433554649353, "learning_rate": 8.846153846153847e-05, "loss": 4.1137, "step": 1320 }, { "epoch": 0.3473577035504319, "grad_norm": 1.1270643472671509, "learning_rate": 8.844401612055371e-05, "loss": 4.0623, "step": 1322 }, { "epoch": 0.3478832068840937, "grad_norm": 1.1532455682754517, "learning_rate": 8.842649377956895e-05, "loss": 4.1347, "step": 1324 }, { "epoch": 0.34840871021775544, "grad_norm": 1.0283042192459106, "learning_rate": 8.84089714385842e-05, "loss": 4.0505, "step": 1326 }, { "epoch": 0.34893421355141724, "grad_norm": 0.9454820156097412, "learning_rate": 8.839144909759944e-05, "loss": 4.0695, "step": 1328 }, { "epoch": 0.349459716885079, "grad_norm": 1.1341617107391357, "learning_rate": 8.837392675661469e-05, "loss": 4.1284, "step": 1330 }, { "epoch": 0.3499852202187408, "grad_norm": 0.9082289934158325, "learning_rate": 8.835640441562994e-05, "loss": 4.1176, "step": 1332 }, { "epoch": 0.35051072355240254, "grad_norm": 1.3507771492004395, "learning_rate": 8.833888207464518e-05, "loss": 4.0804, "step": 1334 }, { "epoch": 0.3510362268860643, "grad_norm": 1.0751078128814697, "learning_rate": 8.832135973366042e-05, "loss": 4.0509, "step": 1336 }, { "epoch": 0.3515617302197261, "grad_norm": 1.1190617084503174, "learning_rate": 8.830383739267567e-05, "loss": 4.1331, "step": 1338 }, { "epoch": 0.35208723355338784, "grad_norm": 1.2387303113937378, "learning_rate": 8.828631505169091e-05, "loss": 4.0727, "step": 1340 }, { "epoch": 0.35261273688704964, "grad_norm": 1.2572470903396606, "learning_rate": 8.826879271070615e-05, "loss": 4.0855, "step": 1342 }, { "epoch": 0.3531382402207114, "grad_norm": 1.181637167930603, "learning_rate": 8.82512703697214e-05, "loss": 4.108, "step": 1344 }, { "epoch": 
0.3536637435543732, "grad_norm": 1.0532227754592896, "learning_rate": 8.823374802873664e-05, "loss": 4.0752, "step": 1346 }, { "epoch": 0.35418924688803494, "grad_norm": 1.0992045402526855, "learning_rate": 8.821622568775189e-05, "loss": 4.0746, "step": 1348 }, { "epoch": 0.35471475022169674, "grad_norm": 1.0857479572296143, "learning_rate": 8.819870334676712e-05, "loss": 4.1136, "step": 1350 }, { "epoch": 0.3552402535553585, "grad_norm": 0.9370406866073608, "learning_rate": 8.818118100578237e-05, "loss": 4.1372, "step": 1352 }, { "epoch": 0.3557657568890203, "grad_norm": 1.1188218593597412, "learning_rate": 8.816365866479762e-05, "loss": 4.1047, "step": 1354 }, { "epoch": 0.35629126022268204, "grad_norm": 1.173396110534668, "learning_rate": 8.814613632381287e-05, "loss": 4.1077, "step": 1356 }, { "epoch": 0.3568167635563438, "grad_norm": 1.1859370470046997, "learning_rate": 8.812861398282811e-05, "loss": 4.0747, "step": 1358 }, { "epoch": 0.3573422668900056, "grad_norm": 0.9493948817253113, "learning_rate": 8.811109164184336e-05, "loss": 4.0781, "step": 1360 }, { "epoch": 0.35786777022366734, "grad_norm": 0.9884160757064819, "learning_rate": 8.80935693008586e-05, "loss": 4.0761, "step": 1362 }, { "epoch": 0.35839327355732914, "grad_norm": 0.8795813322067261, "learning_rate": 8.807604695987384e-05, "loss": 4.0919, "step": 1364 }, { "epoch": 0.3589187768909909, "grad_norm": 0.9356005787849426, "learning_rate": 8.805852461888909e-05, "loss": 4.0673, "step": 1366 }, { "epoch": 0.3594442802246527, "grad_norm": 1.0025309324264526, "learning_rate": 8.804100227790433e-05, "loss": 4.1276, "step": 1368 }, { "epoch": 0.35996978355831444, "grad_norm": 1.1137665510177612, "learning_rate": 8.802347993691957e-05, "loss": 4.0841, "step": 1370 }, { "epoch": 0.36049528689197624, "grad_norm": 1.1181511878967285, "learning_rate": 8.800595759593482e-05, "loss": 4.069, "step": 1372 }, { "epoch": 0.361020790225638, "grad_norm": 1.0211069583892822, "learning_rate": 8.798843525495007e-05, 
"loss": 4.1285, "step": 1374 }, { "epoch": 0.3615462935592998, "grad_norm": 1.0622150897979736, "learning_rate": 8.79709129139653e-05, "loss": 4.043, "step": 1376 }, { "epoch": 0.36207179689296154, "grad_norm": 1.0479551553726196, "learning_rate": 8.795339057298055e-05, "loss": 4.0721, "step": 1378 }, { "epoch": 0.3625973002266233, "grad_norm": 1.3599224090576172, "learning_rate": 8.79358682319958e-05, "loss": 4.0501, "step": 1380 }, { "epoch": 0.3631228035602851, "grad_norm": 1.228256106376648, "learning_rate": 8.791834589101104e-05, "loss": 4.1599, "step": 1382 }, { "epoch": 0.36364830689394684, "grad_norm": 1.0864194631576538, "learning_rate": 8.790082355002629e-05, "loss": 4.0723, "step": 1384 }, { "epoch": 0.36417381022760864, "grad_norm": 1.1955984830856323, "learning_rate": 8.788330120904154e-05, "loss": 4.0837, "step": 1386 }, { "epoch": 0.3646993135612704, "grad_norm": 1.103770136833191, "learning_rate": 8.786577886805677e-05, "loss": 4.1264, "step": 1388 }, { "epoch": 0.3652248168949322, "grad_norm": 1.129332184791565, "learning_rate": 8.784825652707202e-05, "loss": 4.1083, "step": 1390 }, { "epoch": 0.36575032022859394, "grad_norm": 1.1752461194992065, "learning_rate": 8.783073418608726e-05, "loss": 4.1051, "step": 1392 }, { "epoch": 0.36627582356225574, "grad_norm": 1.1222906112670898, "learning_rate": 8.78132118451025e-05, "loss": 4.0753, "step": 1394 }, { "epoch": 0.3668013268959175, "grad_norm": 1.0724331140518188, "learning_rate": 8.779568950411775e-05, "loss": 4.079, "step": 1396 }, { "epoch": 0.3673268302295793, "grad_norm": 1.4286153316497803, "learning_rate": 8.7778167163133e-05, "loss": 4.073, "step": 1398 }, { "epoch": 0.36785233356324104, "grad_norm": 0.9650706648826599, "learning_rate": 8.776064482214825e-05, "loss": 4.1275, "step": 1400 }, { "epoch": 0.3683778368969028, "grad_norm": 1.2482645511627197, "learning_rate": 8.77431224811635e-05, "loss": 4.0662, "step": 1402 }, { "epoch": 0.3689033402305646, "grad_norm": 1.079986572265625, 
"learning_rate": 8.772560014017873e-05, "loss": 4.1133, "step": 1404 }, { "epoch": 0.36942884356422634, "grad_norm": 1.1633272171020508, "learning_rate": 8.770807779919397e-05, "loss": 4.13, "step": 1406 }, { "epoch": 0.36995434689788814, "grad_norm": 1.0702005624771118, "learning_rate": 8.769055545820922e-05, "loss": 4.0619, "step": 1408 }, { "epoch": 0.3704798502315499, "grad_norm": 1.0377660989761353, "learning_rate": 8.767303311722447e-05, "loss": 4.1283, "step": 1410 }, { "epoch": 0.3710053535652117, "grad_norm": 1.1670407056808472, "learning_rate": 8.765551077623972e-05, "loss": 4.1819, "step": 1412 }, { "epoch": 0.37153085689887344, "grad_norm": 1.0752537250518799, "learning_rate": 8.763798843525497e-05, "loss": 4.0608, "step": 1414 }, { "epoch": 0.37205636023253524, "grad_norm": 1.1705405712127686, "learning_rate": 8.76204660942702e-05, "loss": 4.1048, "step": 1416 }, { "epoch": 0.372581863566197, "grad_norm": 1.242299199104309, "learning_rate": 8.760294375328543e-05, "loss": 4.0697, "step": 1418 }, { "epoch": 0.3731073668998588, "grad_norm": 1.1357474327087402, "learning_rate": 8.758542141230068e-05, "loss": 4.0627, "step": 1420 }, { "epoch": 0.37363287023352054, "grad_norm": 0.9191403985023499, "learning_rate": 8.756789907131593e-05, "loss": 4.1312, "step": 1422 }, { "epoch": 0.3741583735671823, "grad_norm": 1.1778554916381836, "learning_rate": 8.755037673033118e-05, "loss": 4.0557, "step": 1424 }, { "epoch": 0.3746838769008441, "grad_norm": 1.1147055625915527, "learning_rate": 8.753285438934642e-05, "loss": 4.1192, "step": 1426 }, { "epoch": 0.37520938023450584, "grad_norm": 0.8952246308326721, "learning_rate": 8.751533204836167e-05, "loss": 4.0817, "step": 1428 }, { "epoch": 0.37573488356816764, "grad_norm": 1.0158376693725586, "learning_rate": 8.74978097073769e-05, "loss": 4.1189, "step": 1430 }, { "epoch": 0.3762603869018294, "grad_norm": 1.377648115158081, "learning_rate": 8.748028736639215e-05, "loss": 4.0943, "step": 1432 }, { "epoch": 
0.3767858902354912, "grad_norm": 1.0235607624053955, "learning_rate": 8.74627650254074e-05, "loss": 4.1062, "step": 1434 }, { "epoch": 0.37731139356915294, "grad_norm": 0.9491191506385803, "learning_rate": 8.744524268442265e-05, "loss": 4.0809, "step": 1436 }, { "epoch": 0.37783689690281475, "grad_norm": 0.9439339637756348, "learning_rate": 8.74277203434379e-05, "loss": 4.1236, "step": 1438 }, { "epoch": 0.3783624002364765, "grad_norm": 0.9380881786346436, "learning_rate": 8.741019800245314e-05, "loss": 4.0939, "step": 1440 }, { "epoch": 0.3788879035701383, "grad_norm": 1.071476936340332, "learning_rate": 8.739267566146838e-05, "loss": 4.11, "step": 1442 }, { "epoch": 0.37941340690380004, "grad_norm": 0.9841758608818054, "learning_rate": 8.737515332048361e-05, "loss": 4.0754, "step": 1444 }, { "epoch": 0.37993891023746185, "grad_norm": 1.2554810047149658, "learning_rate": 8.735763097949886e-05, "loss": 4.0707, "step": 1446 }, { "epoch": 0.3804644135711236, "grad_norm": 0.9026429057121277, "learning_rate": 8.73401086385141e-05, "loss": 4.0933, "step": 1448 }, { "epoch": 0.38098991690478534, "grad_norm": 0.9642812013626099, "learning_rate": 8.732258629752935e-05, "loss": 4.0805, "step": 1450 }, { "epoch": 0.38151542023844714, "grad_norm": 0.9809693694114685, "learning_rate": 8.73050639565446e-05, "loss": 4.0991, "step": 1452 }, { "epoch": 0.3820409235721089, "grad_norm": 0.8850809931755066, "learning_rate": 8.728754161555985e-05, "loss": 4.105, "step": 1454 }, { "epoch": 0.3825664269057707, "grad_norm": 0.9109047651290894, "learning_rate": 8.727001927457508e-05, "loss": 4.1377, "step": 1456 }, { "epoch": 0.38309193023943244, "grad_norm": 0.9586990475654602, "learning_rate": 8.725249693359033e-05, "loss": 4.1245, "step": 1458 }, { "epoch": 0.38361743357309425, "grad_norm": 0.9680961966514587, "learning_rate": 8.723497459260558e-05, "loss": 4.0937, "step": 1460 }, { "epoch": 0.384142936906756, "grad_norm": 1.145357608795166, "learning_rate": 8.721745225162083e-05, 
"loss": 4.096, "step": 1462 }, { "epoch": 0.3846684402404178, "grad_norm": 0.9385539293289185, "learning_rate": 8.719992991063607e-05, "loss": 4.0857, "step": 1464 }, { "epoch": 0.38519394357407954, "grad_norm": 1.0277316570281982, "learning_rate": 8.718240756965132e-05, "loss": 4.1071, "step": 1466 }, { "epoch": 0.38571944690774135, "grad_norm": 1.1784234046936035, "learning_rate": 8.716488522866655e-05, "loss": 4.1359, "step": 1468 }, { "epoch": 0.3862449502414031, "grad_norm": 1.0855989456176758, "learning_rate": 8.714736288768179e-05, "loss": 4.128, "step": 1470 }, { "epoch": 0.38677045357506484, "grad_norm": 1.027363896369934, "learning_rate": 8.712984054669704e-05, "loss": 4.1474, "step": 1472 }, { "epoch": 0.38729595690872665, "grad_norm": 1.3894141912460327, "learning_rate": 8.711231820571228e-05, "loss": 4.1009, "step": 1474 }, { "epoch": 0.3878214602423884, "grad_norm": 1.0460965633392334, "learning_rate": 8.709479586472753e-05, "loss": 4.105, "step": 1476 }, { "epoch": 0.3883469635760502, "grad_norm": 1.1291357278823853, "learning_rate": 8.707727352374278e-05, "loss": 4.0568, "step": 1478 }, { "epoch": 0.38887246690971194, "grad_norm": 1.268921136856079, "learning_rate": 8.705975118275803e-05, "loss": 4.1139, "step": 1480 }, { "epoch": 0.38939797024337375, "grad_norm": 1.0260307788848877, "learning_rate": 8.704222884177326e-05, "loss": 4.0564, "step": 1482 }, { "epoch": 0.3899234735770355, "grad_norm": 0.9651431441307068, "learning_rate": 8.702470650078851e-05, "loss": 4.1014, "step": 1484 }, { "epoch": 0.3904489769106973, "grad_norm": 1.2107367515563965, "learning_rate": 8.700718415980376e-05, "loss": 4.0584, "step": 1486 }, { "epoch": 0.39097448024435905, "grad_norm": 0.9229625463485718, "learning_rate": 8.6989661818819e-05, "loss": 4.0458, "step": 1488 }, { "epoch": 0.39149998357802085, "grad_norm": 0.9460687637329102, "learning_rate": 8.697213947783425e-05, "loss": 4.0504, "step": 1490 }, { "epoch": 0.3920254869116826, "grad_norm": 
1.1883207559585571, "learning_rate": 8.69546171368495e-05, "loss": 4.0472, "step": 1492 }, { "epoch": 0.39255099024534434, "grad_norm": 1.0315344333648682, "learning_rate": 8.693709479586473e-05, "loss": 4.0445, "step": 1494 }, { "epoch": 0.39307649357900615, "grad_norm": 0.9775876402854919, "learning_rate": 8.691957245487997e-05, "loss": 4.0841, "step": 1496 }, { "epoch": 0.3936019969126679, "grad_norm": 0.8928582072257996, "learning_rate": 8.690205011389521e-05, "loss": 4.0524, "step": 1498 }, { "epoch": 0.3941275002463297, "grad_norm": 0.9832913279533386, "learning_rate": 8.688452777291046e-05, "loss": 4.0938, "step": 1500 }, { "epoch": 0.39465300357999145, "grad_norm": 1.1828914880752563, "learning_rate": 8.686700543192571e-05, "loss": 4.0768, "step": 1502 }, { "epoch": 0.39517850691365325, "grad_norm": 1.0799012184143066, "learning_rate": 8.684948309094096e-05, "loss": 4.0797, "step": 1504 }, { "epoch": 0.395704010247315, "grad_norm": 1.1886405944824219, "learning_rate": 8.68319607499562e-05, "loss": 4.0867, "step": 1506 }, { "epoch": 0.3962295135809768, "grad_norm": 0.9912604689598083, "learning_rate": 8.681443840897144e-05, "loss": 4.0961, "step": 1508 }, { "epoch": 0.39675501691463855, "grad_norm": 1.237928867340088, "learning_rate": 8.679691606798669e-05, "loss": 4.0775, "step": 1510 }, { "epoch": 0.39728052024830035, "grad_norm": 1.0307836532592773, "learning_rate": 8.677939372700193e-05, "loss": 4.0851, "step": 1512 }, { "epoch": 0.3978060235819621, "grad_norm": 0.9411685466766357, "learning_rate": 8.676187138601718e-05, "loss": 4.0986, "step": 1514 }, { "epoch": 0.39833152691562385, "grad_norm": 1.0608259439468384, "learning_rate": 8.674434904503243e-05, "loss": 4.0797, "step": 1516 }, { "epoch": 0.39885703024928565, "grad_norm": 0.9626899361610413, "learning_rate": 8.672682670404768e-05, "loss": 4.0324, "step": 1518 }, { "epoch": 0.3993825335829474, "grad_norm": 1.2333307266235352, "learning_rate": 8.670930436306291e-05, "loss": 4.0393, "step": 1520 }, 
{ "epoch": 0.3999080369166092, "grad_norm": 1.4501585960388184, "learning_rate": 8.669178202207814e-05, "loss": 4.0696, "step": 1522 }, { "epoch": 0.40043354025027095, "grad_norm": 1.172493577003479, "learning_rate": 8.667425968109339e-05, "loss": 4.0804, "step": 1524 }, { "epoch": 0.40095904358393275, "grad_norm": 1.1191751956939697, "learning_rate": 8.665673734010864e-05, "loss": 4.0858, "step": 1526 }, { "epoch": 0.4014845469175945, "grad_norm": 1.3972909450531006, "learning_rate": 8.663921499912389e-05, "loss": 4.0699, "step": 1528 }, { "epoch": 0.4020100502512563, "grad_norm": 1.2228561639785767, "learning_rate": 8.662169265813913e-05, "loss": 4.046, "step": 1530 }, { "epoch": 0.40253555358491805, "grad_norm": 1.1553021669387817, "learning_rate": 8.660417031715438e-05, "loss": 4.0625, "step": 1532 }, { "epoch": 0.40306105691857985, "grad_norm": 1.0474298000335693, "learning_rate": 8.658664797616962e-05, "loss": 4.0689, "step": 1534 }, { "epoch": 0.4035865602522416, "grad_norm": 0.9654756784439087, "learning_rate": 8.656912563518486e-05, "loss": 4.0956, "step": 1536 }, { "epoch": 0.40411206358590335, "grad_norm": 1.0488797426223755, "learning_rate": 8.655160329420011e-05, "loss": 4.0465, "step": 1538 }, { "epoch": 0.40463756691956515, "grad_norm": 1.0860599279403687, "learning_rate": 8.653408095321536e-05, "loss": 4.0415, "step": 1540 }, { "epoch": 0.4051630702532269, "grad_norm": 1.4597686529159546, "learning_rate": 8.65165586122306e-05, "loss": 4.0717, "step": 1542 }, { "epoch": 0.4056885735868887, "grad_norm": 1.0490556955337524, "learning_rate": 8.649903627124585e-05, "loss": 4.066, "step": 1544 }, { "epoch": 0.40621407692055045, "grad_norm": 0.9629617929458618, "learning_rate": 8.648151393026109e-05, "loss": 4.016, "step": 1546 }, { "epoch": 0.40673958025421225, "grad_norm": 1.1787109375, "learning_rate": 8.646399158927632e-05, "loss": 4.102, "step": 1548 }, { "epoch": 0.407265083587874, "grad_norm": 1.277995228767395, "learning_rate": 
8.644646924829157e-05, "loss": 4.0778, "step": 1550 }, { "epoch": 0.4077905869215358, "grad_norm": 1.1067969799041748, "learning_rate": 8.642894690730682e-05, "loss": 4.0367, "step": 1552 }, { "epoch": 0.40831609025519755, "grad_norm": 1.4447568655014038, "learning_rate": 8.641142456632206e-05, "loss": 4.1297, "step": 1554 }, { "epoch": 0.40884159358885935, "grad_norm": 1.0413399934768677, "learning_rate": 8.639390222533731e-05, "loss": 4.0356, "step": 1556 }, { "epoch": 0.4093670969225211, "grad_norm": 0.9793872237205505, "learning_rate": 8.637637988435256e-05, "loss": 4.0997, "step": 1558 }, { "epoch": 0.40989260025618285, "grad_norm": 1.0590181350708008, "learning_rate": 8.63588575433678e-05, "loss": 4.063, "step": 1560 }, { "epoch": 0.41041810358984465, "grad_norm": 1.2400062084197998, "learning_rate": 8.634133520238304e-05, "loss": 4.0991, "step": 1562 }, { "epoch": 0.4109436069235064, "grad_norm": 1.0311602354049683, "learning_rate": 8.632381286139829e-05, "loss": 4.1004, "step": 1564 }, { "epoch": 0.4114691102571682, "grad_norm": 1.1502728462219238, "learning_rate": 8.630629052041354e-05, "loss": 4.0507, "step": 1566 }, { "epoch": 0.41199461359082995, "grad_norm": 1.0739549398422241, "learning_rate": 8.628876817942878e-05, "loss": 4.0881, "step": 1568 }, { "epoch": 0.41252011692449175, "grad_norm": 0.9908766150474548, "learning_rate": 8.627124583844402e-05, "loss": 4.0572, "step": 1570 }, { "epoch": 0.4130456202581535, "grad_norm": 1.190178632736206, "learning_rate": 8.625372349745927e-05, "loss": 4.1303, "step": 1572 }, { "epoch": 0.4135711235918153, "grad_norm": 0.9081438183784485, "learning_rate": 8.62362011564745e-05, "loss": 4.1133, "step": 1574 }, { "epoch": 0.41409662692547705, "grad_norm": 0.9896316528320312, "learning_rate": 8.621867881548975e-05, "loss": 4.0971, "step": 1576 }, { "epoch": 0.41462213025913885, "grad_norm": 1.0361907482147217, "learning_rate": 8.6201156474505e-05, "loss": 4.0798, "step": 1578 }, { "epoch": 0.4151476335928006, 
"grad_norm": 1.2782325744628906, "learning_rate": 8.618363413352024e-05, "loss": 4.0612, "step": 1580 }, { "epoch": 0.41567313692646235, "grad_norm": 1.0914652347564697, "learning_rate": 8.616611179253549e-05, "loss": 4.0783, "step": 1582 }, { "epoch": 0.41619864026012415, "grad_norm": 1.1218914985656738, "learning_rate": 8.614858945155074e-05, "loss": 4.1004, "step": 1584 }, { "epoch": 0.4167241435937859, "grad_norm": 0.9536086916923523, "learning_rate": 8.613106711056597e-05, "loss": 4.0901, "step": 1586 }, { "epoch": 0.4172496469274477, "grad_norm": 1.114506721496582, "learning_rate": 8.611354476958122e-05, "loss": 4.0853, "step": 1588 }, { "epoch": 0.41777515026110945, "grad_norm": 0.9834117293357849, "learning_rate": 8.609602242859647e-05, "loss": 4.0206, "step": 1590 }, { "epoch": 0.41830065359477125, "grad_norm": 1.3030213117599487, "learning_rate": 8.607850008761171e-05, "loss": 4.1143, "step": 1592 }, { "epoch": 0.418826156928433, "grad_norm": 1.0820945501327515, "learning_rate": 8.606097774662696e-05, "loss": 4.0705, "step": 1594 }, { "epoch": 0.4193516602620948, "grad_norm": 1.3783637285232544, "learning_rate": 8.60434554056422e-05, "loss": 4.0843, "step": 1596 }, { "epoch": 0.41987716359575655, "grad_norm": 1.0626778602600098, "learning_rate": 8.602593306465744e-05, "loss": 4.0744, "step": 1598 }, { "epoch": 0.42040266692941836, "grad_norm": 1.1349996328353882, "learning_rate": 8.600841072367268e-05, "loss": 4.0574, "step": 1600 }, { "epoch": 0.42040266692941836, "eval_loss": 4.016514778137207, "eval_runtime": 464.4488, "eval_samples_per_second": 262.223, "eval_steps_per_second": 8.195, "step": 1600 }, { "epoch": 0.4209281702630801, "grad_norm": 1.4154913425445557, "learning_rate": 8.599088838268792e-05, "loss": 4.1106, "step": 1602 }, { "epoch": 0.4214536735967419, "grad_norm": 1.1178895235061646, "learning_rate": 8.597336604170317e-05, "loss": 4.0941, "step": 1604 }, { "epoch": 0.42197917693040365, "grad_norm": 1.292711853981018, "learning_rate": 
8.595584370071842e-05, "loss": 4.0498, "step": 1606 }, { "epoch": 0.4225046802640654, "grad_norm": 0.9515407681465149, "learning_rate": 8.593832135973367e-05, "loss": 4.0829, "step": 1608 }, { "epoch": 0.4230301835977272, "grad_norm": 1.2529016733169556, "learning_rate": 8.592079901874891e-05, "loss": 4.0718, "step": 1610 }, { "epoch": 0.42355568693138895, "grad_norm": 1.3942055702209473, "learning_rate": 8.590327667776415e-05, "loss": 4.0583, "step": 1612 }, { "epoch": 0.42408119026505076, "grad_norm": 1.2902196645736694, "learning_rate": 8.58857543367794e-05, "loss": 4.0752, "step": 1614 }, { "epoch": 0.4246066935987125, "grad_norm": 1.0781664848327637, "learning_rate": 8.586823199579464e-05, "loss": 4.0685, "step": 1616 }, { "epoch": 0.4251321969323743, "grad_norm": 1.153396725654602, "learning_rate": 8.585070965480989e-05, "loss": 4.0599, "step": 1618 }, { "epoch": 0.42565770026603605, "grad_norm": 1.1469669342041016, "learning_rate": 8.583318731382514e-05, "loss": 4.1026, "step": 1620 }, { "epoch": 0.42618320359969786, "grad_norm": 1.3448874950408936, "learning_rate": 8.581566497284037e-05, "loss": 4.0894, "step": 1622 }, { "epoch": 0.4267087069333596, "grad_norm": 1.5000888109207153, "learning_rate": 8.579814263185562e-05, "loss": 4.0971, "step": 1624 }, { "epoch": 0.4272342102670214, "grad_norm": 0.9904858469963074, "learning_rate": 8.578062029087085e-05, "loss": 4.0795, "step": 1626 }, { "epoch": 0.42775971360068316, "grad_norm": 1.3167495727539062, "learning_rate": 8.57630979498861e-05, "loss": 4.0798, "step": 1628 }, { "epoch": 0.4282852169343449, "grad_norm": 1.1937294006347656, "learning_rate": 8.574557560890135e-05, "loss": 4.0613, "step": 1630 }, { "epoch": 0.4288107202680067, "grad_norm": 1.2655911445617676, "learning_rate": 8.57280532679166e-05, "loss": 4.0366, "step": 1632 }, { "epoch": 0.42933622360166845, "grad_norm": 1.2438817024230957, "learning_rate": 8.571053092693184e-05, "loss": 4.0244, "step": 1634 }, { "epoch": 0.42986172693533026, 
"grad_norm": 0.8966429829597473, "learning_rate": 8.569300858594709e-05, "loss": 4.0223, "step": 1636 }, { "epoch": 0.430387230268992, "grad_norm": 1.0923736095428467, "learning_rate": 8.567548624496233e-05, "loss": 4.1005, "step": 1638 }, { "epoch": 0.4309127336026538, "grad_norm": 1.0901503562927246, "learning_rate": 8.565796390397757e-05, "loss": 4.0182, "step": 1640 }, { "epoch": 0.43143823693631556, "grad_norm": 1.0520310401916504, "learning_rate": 8.564044156299282e-05, "loss": 4.0924, "step": 1642 }, { "epoch": 0.43196374026997736, "grad_norm": 1.0936928987503052, "learning_rate": 8.562291922200807e-05, "loss": 4.0644, "step": 1644 }, { "epoch": 0.4324892436036391, "grad_norm": 1.0344078540802002, "learning_rate": 8.560539688102332e-05, "loss": 4.0628, "step": 1646 }, { "epoch": 0.4330147469373009, "grad_norm": 1.074208378791809, "learning_rate": 8.558787454003855e-05, "loss": 4.0376, "step": 1648 }, { "epoch": 0.43354025027096266, "grad_norm": 1.0647521018981934, "learning_rate": 8.55703521990538e-05, "loss": 4.0616, "step": 1650 }, { "epoch": 0.4340657536046244, "grad_norm": 1.2965573072433472, "learning_rate": 8.555282985806905e-05, "loss": 4.1087, "step": 1652 }, { "epoch": 0.4345912569382862, "grad_norm": 1.087363600730896, "learning_rate": 8.553530751708428e-05, "loss": 3.9931, "step": 1654 }, { "epoch": 0.43511676027194796, "grad_norm": 0.9833258986473083, "learning_rate": 8.551778517609953e-05, "loss": 4.0505, "step": 1656 }, { "epoch": 0.43564226360560976, "grad_norm": 1.0560259819030762, "learning_rate": 8.550026283511477e-05, "loss": 4.0242, "step": 1658 }, { "epoch": 0.4361677669392715, "grad_norm": 0.9070264101028442, "learning_rate": 8.548274049413002e-05, "loss": 4.0708, "step": 1660 }, { "epoch": 0.4366932702729333, "grad_norm": 0.9511452317237854, "learning_rate": 8.546521815314527e-05, "loss": 4.0047, "step": 1662 }, { "epoch": 0.43721877360659506, "grad_norm": 1.115578293800354, "learning_rate": 8.544769581216052e-05, "loss": 4.1004, 
"step": 1664 }, { "epoch": 0.43774427694025686, "grad_norm": 1.0156495571136475, "learning_rate": 8.543017347117575e-05, "loss": 4.0561, "step": 1666 }, { "epoch": 0.4382697802739186, "grad_norm": 1.1741002798080444, "learning_rate": 8.5412651130191e-05, "loss": 4.0617, "step": 1668 }, { "epoch": 0.4387952836075804, "grad_norm": 0.8858454823493958, "learning_rate": 8.539512878920625e-05, "loss": 4.0369, "step": 1670 }, { "epoch": 0.43932078694124216, "grad_norm": 1.0727438926696777, "learning_rate": 8.537760644822148e-05, "loss": 4.1121, "step": 1672 }, { "epoch": 0.4398462902749039, "grad_norm": 1.4730733633041382, "learning_rate": 8.536008410723673e-05, "loss": 4.0571, "step": 1674 }, { "epoch": 0.4403717936085657, "grad_norm": 1.1257169246673584, "learning_rate": 8.534256176625198e-05, "loss": 4.0574, "step": 1676 }, { "epoch": 0.44089729694222746, "grad_norm": 1.387749433517456, "learning_rate": 8.532503942526722e-05, "loss": 4.1052, "step": 1678 }, { "epoch": 0.44142280027588926, "grad_norm": 0.86592036485672, "learning_rate": 8.530751708428246e-05, "loss": 4.0564, "step": 1680 }, { "epoch": 0.441948303609551, "grad_norm": 0.8364253640174866, "learning_rate": 8.52899947432977e-05, "loss": 4.0566, "step": 1682 }, { "epoch": 0.4424738069432128, "grad_norm": 0.9764265418052673, "learning_rate": 8.527247240231295e-05, "loss": 4.0501, "step": 1684 }, { "epoch": 0.44299931027687456, "grad_norm": 0.9818626046180725, "learning_rate": 8.52549500613282e-05, "loss": 4.0614, "step": 1686 }, { "epoch": 0.44352481361053636, "grad_norm": 1.1309095621109009, "learning_rate": 8.523742772034345e-05, "loss": 4.0219, "step": 1688 }, { "epoch": 0.4440503169441981, "grad_norm": 1.0226690769195557, "learning_rate": 8.52199053793587e-05, "loss": 4.0304, "step": 1690 }, { "epoch": 0.4445758202778599, "grad_norm": 1.1330664157867432, "learning_rate": 8.520238303837393e-05, "loss": 4.0238, "step": 1692 }, { "epoch": 0.44510132361152166, "grad_norm": 1.0397083759307861, "learning_rate": 
8.518486069738918e-05, "loss": 4.0318, "step": 1694 }, { "epoch": 0.4456268269451834, "grad_norm": 0.9553044438362122, "learning_rate": 8.516733835640442e-05, "loss": 4.0654, "step": 1696 }, { "epoch": 0.4461523302788452, "grad_norm": 1.1285008192062378, "learning_rate": 8.514981601541966e-05, "loss": 4.0598, "step": 1698 }, { "epoch": 0.44667783361250696, "grad_norm": 1.1313588619232178, "learning_rate": 8.51322936744349e-05, "loss": 4.0225, "step": 1700 }, { "epoch": 0.44720333694616876, "grad_norm": 1.0267915725708008, "learning_rate": 8.511477133345015e-05, "loss": 4.1059, "step": 1702 }, { "epoch": 0.4477288402798305, "grad_norm": 1.1800528764724731, "learning_rate": 8.50972489924654e-05, "loss": 4.0713, "step": 1704 }, { "epoch": 0.4482543436134923, "grad_norm": 1.021100640296936, "learning_rate": 8.507972665148064e-05, "loss": 4.0667, "step": 1706 }, { "epoch": 0.44877984694715406, "grad_norm": 1.0350342988967896, "learning_rate": 8.506220431049588e-05, "loss": 4.0597, "step": 1708 }, { "epoch": 0.44930535028081586, "grad_norm": 1.0944093465805054, "learning_rate": 8.504468196951113e-05, "loss": 4.0804, "step": 1710 }, { "epoch": 0.4498308536144776, "grad_norm": 1.0474380254745483, "learning_rate": 8.502715962852638e-05, "loss": 4.0612, "step": 1712 }, { "epoch": 0.4503563569481394, "grad_norm": 1.008052110671997, "learning_rate": 8.500963728754163e-05, "loss": 4.0207, "step": 1714 }, { "epoch": 0.45088186028180116, "grad_norm": 1.000032901763916, "learning_rate": 8.499211494655687e-05, "loss": 4.0572, "step": 1716 }, { "epoch": 0.4514073636154629, "grad_norm": 0.9902980327606201, "learning_rate": 8.497459260557211e-05, "loss": 4.0767, "step": 1718 }, { "epoch": 0.4519328669491247, "grad_norm": 1.1393933296203613, "learning_rate": 8.495707026458735e-05, "loss": 4.077, "step": 1720 }, { "epoch": 0.45245837028278646, "grad_norm": 1.0182420015335083, "learning_rate": 8.49395479236026e-05, "loss": 3.9983, "step": 1722 }, { "epoch": 0.45298387361644826, 
"grad_norm": 1.2237807512283325, "learning_rate": 8.492202558261784e-05, "loss": 4.0578, "step": 1724 }, { "epoch": 0.45350937695011, "grad_norm": 1.3458858728408813, "learning_rate": 8.490450324163308e-05, "loss": 4.0742, "step": 1726 }, { "epoch": 0.4540348802837718, "grad_norm": 1.0266990661621094, "learning_rate": 8.488698090064833e-05, "loss": 4.0573, "step": 1728 }, { "epoch": 0.45456038361743356, "grad_norm": 1.003554344177246, "learning_rate": 8.486945855966358e-05, "loss": 4.0501, "step": 1730 }, { "epoch": 0.45508588695109536, "grad_norm": 1.1404519081115723, "learning_rate": 8.485193621867881e-05, "loss": 4.055, "step": 1732 }, { "epoch": 0.4556113902847571, "grad_norm": 1.0809496641159058, "learning_rate": 8.483441387769406e-05, "loss": 4.0604, "step": 1734 }, { "epoch": 0.4561368936184189, "grad_norm": 0.9890826940536499, "learning_rate": 8.481689153670931e-05, "loss": 4.0709, "step": 1736 }, { "epoch": 0.45666239695208066, "grad_norm": 0.9811969995498657, "learning_rate": 8.479936919572456e-05, "loss": 4.0899, "step": 1738 }, { "epoch": 0.4571879002857424, "grad_norm": 0.9258866310119629, "learning_rate": 8.47818468547398e-05, "loss": 4.0134, "step": 1740 }, { "epoch": 0.4577134036194042, "grad_norm": 1.261148452758789, "learning_rate": 8.476432451375505e-05, "loss": 4.0452, "step": 1742 }, { "epoch": 0.45823890695306596, "grad_norm": 0.9911594986915588, "learning_rate": 8.474680217277028e-05, "loss": 4.0295, "step": 1744 }, { "epoch": 0.45876441028672776, "grad_norm": 0.8593732118606567, "learning_rate": 8.472927983178553e-05, "loss": 4.0623, "step": 1746 }, { "epoch": 0.4592899136203895, "grad_norm": 1.3807445764541626, "learning_rate": 8.471175749080078e-05, "loss": 4.095, "step": 1748 }, { "epoch": 0.4598154169540513, "grad_norm": 0.9687548875808716, "learning_rate": 8.469423514981601e-05, "loss": 4.0255, "step": 1750 }, { "epoch": 0.46034092028771306, "grad_norm": 1.2159545421600342, "learning_rate": 8.467671280883126e-05, "loss": 3.9835, "step": 
1752 }, { "epoch": 0.46086642362137487, "grad_norm": 1.2598481178283691, "learning_rate": 8.465919046784651e-05, "loss": 4.0066, "step": 1754 }, { "epoch": 0.4613919269550366, "grad_norm": 1.0157935619354248, "learning_rate": 8.464166812686176e-05, "loss": 4.0183, "step": 1756 }, { "epoch": 0.4619174302886984, "grad_norm": 1.0953752994537354, "learning_rate": 8.462414578587699e-05, "loss": 4.0221, "step": 1758 }, { "epoch": 0.46244293362236016, "grad_norm": 0.9985252618789673, "learning_rate": 8.460662344489224e-05, "loss": 4.0428, "step": 1760 }, { "epoch": 0.46296843695602197, "grad_norm": 0.9567633271217346, "learning_rate": 8.458910110390749e-05, "loss": 4.0639, "step": 1762 }, { "epoch": 0.4634939402896837, "grad_norm": 0.9036881327629089, "learning_rate": 8.457157876292273e-05, "loss": 4.0513, "step": 1764 }, { "epoch": 0.46401944362334546, "grad_norm": 1.1164054870605469, "learning_rate": 8.455405642193798e-05, "loss": 4.0839, "step": 1766 }, { "epoch": 0.46454494695700727, "grad_norm": 1.0270805358886719, "learning_rate": 8.453653408095323e-05, "loss": 4.009, "step": 1768 }, { "epoch": 0.465070450290669, "grad_norm": 1.1856820583343506, "learning_rate": 8.451901173996846e-05, "loss": 4.0387, "step": 1770 }, { "epoch": 0.4655959536243308, "grad_norm": 1.0524976253509521, "learning_rate": 8.450148939898371e-05, "loss": 4.0448, "step": 1772 }, { "epoch": 0.46612145695799256, "grad_norm": 1.3269898891448975, "learning_rate": 8.448396705799894e-05, "loss": 4.059, "step": 1774 }, { "epoch": 0.46664696029165437, "grad_norm": 1.3503835201263428, "learning_rate": 8.446644471701419e-05, "loss": 4.0537, "step": 1776 }, { "epoch": 0.4671724636253161, "grad_norm": 1.1619043350219727, "learning_rate": 8.444892237602944e-05, "loss": 4.0063, "step": 1778 }, { "epoch": 0.4676979669589779, "grad_norm": 1.20862877368927, "learning_rate": 8.443140003504469e-05, "loss": 4.0681, "step": 1780 }, { "epoch": 0.46822347029263967, "grad_norm": 1.189637541770935, "learning_rate": 
8.441387769405993e-05, "loss": 4.0502, "step": 1782 }, { "epoch": 0.46874897362630147, "grad_norm": 1.0598156452178955, "learning_rate": 8.439635535307517e-05, "loss": 4.0834, "step": 1784 }, { "epoch": 0.4692744769599632, "grad_norm": 0.9177694320678711, "learning_rate": 8.437883301209042e-05, "loss": 4.0271, "step": 1786 }, { "epoch": 0.46979998029362496, "grad_norm": 1.1182200908660889, "learning_rate": 8.436131067110566e-05, "loss": 4.0901, "step": 1788 }, { "epoch": 0.47032548362728677, "grad_norm": 1.2071590423583984, "learning_rate": 8.434378833012091e-05, "loss": 4.0244, "step": 1790 }, { "epoch": 0.4708509869609485, "grad_norm": 1.2733187675476074, "learning_rate": 8.432626598913616e-05, "loss": 4.0547, "step": 1792 }, { "epoch": 0.4713764902946103, "grad_norm": 0.9854167103767395, "learning_rate": 8.43087436481514e-05, "loss": 4.0552, "step": 1794 }, { "epoch": 0.47190199362827207, "grad_norm": 1.0186132192611694, "learning_rate": 8.429122130716664e-05, "loss": 4.0431, "step": 1796 }, { "epoch": 0.47242749696193387, "grad_norm": 1.0224884748458862, "learning_rate": 8.427369896618189e-05, "loss": 4.0456, "step": 1798 }, { "epoch": 0.4729530002955956, "grad_norm": 1.049132227897644, "learning_rate": 8.425617662519712e-05, "loss": 4.0285, "step": 1800 }, { "epoch": 0.4734785036292574, "grad_norm": 1.0882220268249512, "learning_rate": 8.423865428421237e-05, "loss": 4.0261, "step": 1802 }, { "epoch": 0.47400400696291917, "grad_norm": 1.0918564796447754, "learning_rate": 8.422113194322762e-05, "loss": 4.0108, "step": 1804 }, { "epoch": 0.47452951029658097, "grad_norm": 1.0556299686431885, "learning_rate": 8.420360960224286e-05, "loss": 4.015, "step": 1806 }, { "epoch": 0.4750550136302427, "grad_norm": 1.0790144205093384, "learning_rate": 8.418608726125811e-05, "loss": 4.0241, "step": 1808 }, { "epoch": 0.47558051696390446, "grad_norm": 1.199230432510376, "learning_rate": 8.416856492027335e-05, "loss": 4.0683, "step": 1810 }, { "epoch": 0.47610602029756627, 
"grad_norm": 1.049172282218933, "learning_rate": 8.41510425792886e-05, "loss": 4.1102, "step": 1812 }, { "epoch": 0.476631523631228, "grad_norm": 1.2184536457061768, "learning_rate": 8.413352023830384e-05, "loss": 4.0273, "step": 1814 }, { "epoch": 0.4771570269648898, "grad_norm": 1.0186938047409058, "learning_rate": 8.411599789731909e-05, "loss": 4.0806, "step": 1816 }, { "epoch": 0.47768253029855157, "grad_norm": 1.1421421766281128, "learning_rate": 8.409847555633434e-05, "loss": 4.0535, "step": 1818 }, { "epoch": 0.47820803363221337, "grad_norm": 0.9979041814804077, "learning_rate": 8.408095321534958e-05, "loss": 4.0319, "step": 1820 }, { "epoch": 0.4787335369658751, "grad_norm": 1.0474679470062256, "learning_rate": 8.406343087436482e-05, "loss": 4.113, "step": 1822 }, { "epoch": 0.4792590402995369, "grad_norm": 1.0851263999938965, "learning_rate": 8.404590853338007e-05, "loss": 4.0172, "step": 1824 }, { "epoch": 0.47978454363319867, "grad_norm": 1.0941178798675537, "learning_rate": 8.40283861923953e-05, "loss": 4.0436, "step": 1826 }, { "epoch": 0.48031004696686047, "grad_norm": 1.2451874017715454, "learning_rate": 8.401086385141055e-05, "loss": 4.0425, "step": 1828 }, { "epoch": 0.4808355503005222, "grad_norm": 1.0262019634246826, "learning_rate": 8.39933415104258e-05, "loss": 4.0356, "step": 1830 }, { "epoch": 0.48136105363418397, "grad_norm": 1.1029359102249146, "learning_rate": 8.397581916944104e-05, "loss": 4.1061, "step": 1832 }, { "epoch": 0.48188655696784577, "grad_norm": 0.9775404930114746, "learning_rate": 8.395829682845629e-05, "loss": 4.0574, "step": 1834 }, { "epoch": 0.4824120603015075, "grad_norm": 1.094001293182373, "learning_rate": 8.394077448747152e-05, "loss": 4.0385, "step": 1836 }, { "epoch": 0.4829375636351693, "grad_norm": 1.0775823593139648, "learning_rate": 8.392325214648677e-05, "loss": 4.0522, "step": 1838 }, { "epoch": 0.48346306696883107, "grad_norm": 0.989027738571167, "learning_rate": 8.390572980550202e-05, "loss": 4.0677, "step": 
1840 }, { "epoch": 0.48398857030249287, "grad_norm": 0.9148473143577576, "learning_rate": 8.388820746451727e-05, "loss": 4.0489, "step": 1842 }, { "epoch": 0.4845140736361546, "grad_norm": 1.1394762992858887, "learning_rate": 8.387068512353251e-05, "loss": 4.0068, "step": 1844 }, { "epoch": 0.4850395769698164, "grad_norm": 1.0647790431976318, "learning_rate": 8.385316278254776e-05, "loss": 4.0029, "step": 1846 }, { "epoch": 0.48556508030347817, "grad_norm": 1.0168123245239258, "learning_rate": 8.3835640441563e-05, "loss": 4.0345, "step": 1848 }, { "epoch": 0.48609058363714, "grad_norm": 1.066354513168335, "learning_rate": 8.381811810057824e-05, "loss": 4.0213, "step": 1850 }, { "epoch": 0.4866160869708017, "grad_norm": 1.1315792798995972, "learning_rate": 8.380059575959348e-05, "loss": 4.0298, "step": 1852 }, { "epoch": 0.48714159030446347, "grad_norm": 1.167419672012329, "learning_rate": 8.378307341860872e-05, "loss": 3.9821, "step": 1854 }, { "epoch": 0.48766709363812527, "grad_norm": 1.1397274732589722, "learning_rate": 8.376555107762397e-05, "loss": 4.0067, "step": 1856 }, { "epoch": 0.488192596971787, "grad_norm": 1.1780146360397339, "learning_rate": 8.374802873663922e-05, "loss": 4.0503, "step": 1858 }, { "epoch": 0.4887181003054488, "grad_norm": 1.3302661180496216, "learning_rate": 8.373050639565447e-05, "loss": 4.0077, "step": 1860 }, { "epoch": 0.48924360363911057, "grad_norm": 1.0713132619857788, "learning_rate": 8.37129840546697e-05, "loss": 3.9925, "step": 1862 }, { "epoch": 0.4897691069727724, "grad_norm": 1.1408880949020386, "learning_rate": 8.369546171368495e-05, "loss": 4.0511, "step": 1864 }, { "epoch": 0.4902946103064341, "grad_norm": 1.094518780708313, "learning_rate": 8.36779393727002e-05, "loss": 3.9868, "step": 1866 }, { "epoch": 0.4908201136400959, "grad_norm": 1.0943959951400757, "learning_rate": 8.366041703171544e-05, "loss": 4.0299, "step": 1868 }, { "epoch": 0.49134561697375767, "grad_norm": 1.184888243675232, "learning_rate": 
8.364289469073069e-05, "loss": 4.0265, "step": 1870 }, { "epoch": 0.4918711203074195, "grad_norm": 0.9974071979522705, "learning_rate": 8.362537234974594e-05, "loss": 4.0272, "step": 1872 }, { "epoch": 0.4923966236410812, "grad_norm": 1.0674171447753906, "learning_rate": 8.360785000876117e-05, "loss": 4.0596, "step": 1874 }, { "epoch": 0.49292212697474297, "grad_norm": 1.0133895874023438, "learning_rate": 8.359032766777641e-05, "loss": 4.058, "step": 1876 }, { "epoch": 0.49344763030840477, "grad_norm": 1.1249886751174927, "learning_rate": 8.357280532679165e-05, "loss": 4.0355, "step": 1878 }, { "epoch": 0.4939731336420665, "grad_norm": 0.9206785559654236, "learning_rate": 8.35552829858069e-05, "loss": 4.0619, "step": 1880 }, { "epoch": 0.4944986369757283, "grad_norm": 1.169454574584961, "learning_rate": 8.353776064482215e-05, "loss": 4.0396, "step": 1882 }, { "epoch": 0.49502414030939007, "grad_norm": 1.106631875038147, "learning_rate": 8.35202383038374e-05, "loss": 4.0363, "step": 1884 }, { "epoch": 0.4955496436430519, "grad_norm": 1.2055917978286743, "learning_rate": 8.350271596285264e-05, "loss": 4.0475, "step": 1886 }, { "epoch": 0.4960751469767136, "grad_norm": 0.9267309904098511, "learning_rate": 8.348519362186788e-05, "loss": 4.0314, "step": 1888 }, { "epoch": 0.4966006503103754, "grad_norm": 0.9528260231018066, "learning_rate": 8.346767128088313e-05, "loss": 4.0459, "step": 1890 }, { "epoch": 0.49712615364403717, "grad_norm": 1.060927391052246, "learning_rate": 8.345014893989837e-05, "loss": 4.1096, "step": 1892 }, { "epoch": 0.497651656977699, "grad_norm": 1.2961064577102661, "learning_rate": 8.343262659891362e-05, "loss": 4.0269, "step": 1894 }, { "epoch": 0.4981771603113607, "grad_norm": 1.0860316753387451, "learning_rate": 8.341510425792887e-05, "loss": 4.0458, "step": 1896 }, { "epoch": 0.49870266364502247, "grad_norm": 1.4097607135772705, "learning_rate": 8.339758191694412e-05, "loss": 4.0213, "step": 1898 }, { "epoch": 0.4992281669786843, 
"grad_norm": 1.0954804420471191, "learning_rate": 8.338005957595935e-05, "loss": 4.0488, "step": 1900 }, { "epoch": 0.499753670312346, "grad_norm": 1.02521550655365, "learning_rate": 8.33625372349746e-05, "loss": 4.0136, "step": 1902 }, { "epoch": 0.5002791736460078, "grad_norm": 0.8976112604141235, "learning_rate": 8.334501489398983e-05, "loss": 4.0215, "step": 1904 }, { "epoch": 0.5008046769796696, "grad_norm": 1.2523068189620972, "learning_rate": 8.332749255300508e-05, "loss": 4.0361, "step": 1906 }, { "epoch": 0.5013301803133313, "grad_norm": 1.0542731285095215, "learning_rate": 8.330997021202033e-05, "loss": 4.027, "step": 1908 }, { "epoch": 0.5018556836469932, "grad_norm": 1.0470690727233887, "learning_rate": 8.329244787103557e-05, "loss": 4.0087, "step": 1910 }, { "epoch": 0.5023811869806549, "grad_norm": 1.0792596340179443, "learning_rate": 8.327492553005082e-05, "loss": 4.0801, "step": 1912 }, { "epoch": 0.5029066903143167, "grad_norm": 1.1124752759933472, "learning_rate": 8.325740318906607e-05, "loss": 4.0537, "step": 1914 }, { "epoch": 0.5034321936479784, "grad_norm": 1.2260900735855103, "learning_rate": 8.32398808480813e-05, "loss": 4.0591, "step": 1916 }, { "epoch": 0.5039576969816403, "grad_norm": 0.9523537158966064, "learning_rate": 8.322235850709655e-05, "loss": 4.0949, "step": 1918 }, { "epoch": 0.504483200315302, "grad_norm": 1.0982946157455444, "learning_rate": 8.32048361661118e-05, "loss": 4.0337, "step": 1920 }, { "epoch": 0.5050087036489638, "grad_norm": 0.9355713129043579, "learning_rate": 8.318731382512705e-05, "loss": 4.0488, "step": 1922 }, { "epoch": 0.5055342069826255, "grad_norm": 1.2182650566101074, "learning_rate": 8.31697914841423e-05, "loss": 4.0456, "step": 1924 }, { "epoch": 0.5060597103162873, "grad_norm": 1.0426530838012695, "learning_rate": 8.315226914315754e-05, "loss": 4.0332, "step": 1926 }, { "epoch": 0.5065852136499491, "grad_norm": 1.093074083328247, "learning_rate": 8.313474680217278e-05, "loss": 4.0151, "step": 1928 }, 
{ "epoch": 0.5071107169836109, "grad_norm": 1.0201054811477661, "learning_rate": 8.311722446118801e-05, "loss": 3.9915, "step": 1930 }, { "epoch": 0.5076362203172726, "grad_norm": 1.23635733127594, "learning_rate": 8.309970212020326e-05, "loss": 4.0266, "step": 1932 }, { "epoch": 0.5081617236509344, "grad_norm": 1.0443339347839355, "learning_rate": 8.30821797792185e-05, "loss": 4.0297, "step": 1934 }, { "epoch": 0.5086872269845962, "grad_norm": 1.261130690574646, "learning_rate": 8.306465743823375e-05, "loss": 4.0554, "step": 1936 }, { "epoch": 0.509212730318258, "grad_norm": 1.0700048208236694, "learning_rate": 8.3047135097249e-05, "loss": 4.0338, "step": 1938 }, { "epoch": 0.5097382336519197, "grad_norm": 1.1626553535461426, "learning_rate": 8.302961275626425e-05, "loss": 4.0353, "step": 1940 }, { "epoch": 0.5102637369855815, "grad_norm": 1.3702563047409058, "learning_rate": 8.301209041527948e-05, "loss": 4.0115, "step": 1942 }, { "epoch": 0.5107892403192432, "grad_norm": 0.9823038578033447, "learning_rate": 8.299456807429473e-05, "loss": 4.0293, "step": 1944 }, { "epoch": 0.5113147436529051, "grad_norm": 1.184643030166626, "learning_rate": 8.297704573330998e-05, "loss": 3.9832, "step": 1946 }, { "epoch": 0.5118402469865668, "grad_norm": 1.0356690883636475, "learning_rate": 8.295952339232522e-05, "loss": 4.0286, "step": 1948 }, { "epoch": 0.5123657503202286, "grad_norm": 0.9067436456680298, "learning_rate": 8.294200105134047e-05, "loss": 3.9689, "step": 1950 }, { "epoch": 0.5128912536538903, "grad_norm": 0.9896895289421082, "learning_rate": 8.292447871035572e-05, "loss": 4.0265, "step": 1952 }, { "epoch": 0.5134167569875522, "grad_norm": 0.9950932860374451, "learning_rate": 8.290695636937095e-05, "loss": 4.0239, "step": 1954 }, { "epoch": 0.5139422603212139, "grad_norm": 1.1776306629180908, "learning_rate": 8.288943402838619e-05, "loss": 4.0333, "step": 1956 }, { "epoch": 0.5144677636548757, "grad_norm": 1.046035885810852, "learning_rate": 8.287191168740144e-05, 
"loss": 3.9936, "step": 1958 }, { "epoch": 0.5149932669885374, "grad_norm": 1.1232341527938843, "learning_rate": 8.285438934641668e-05, "loss": 3.9806, "step": 1960 }, { "epoch": 0.5155187703221993, "grad_norm": 0.9925497770309448, "learning_rate": 8.283686700543193e-05, "loss": 4.0051, "step": 1962 }, { "epoch": 0.516044273655861, "grad_norm": 1.3231794834136963, "learning_rate": 8.281934466444718e-05, "loss": 4.0581, "step": 1964 }, { "epoch": 0.5165697769895228, "grad_norm": 1.1579099893569946, "learning_rate": 8.280182232346243e-05, "loss": 4.0024, "step": 1966 }, { "epoch": 0.5170952803231845, "grad_norm": 0.9150693416595459, "learning_rate": 8.278429998247766e-05, "loss": 4.0001, "step": 1968 }, { "epoch": 0.5176207836568463, "grad_norm": 1.2032802104949951, "learning_rate": 8.276677764149291e-05, "loss": 4.0195, "step": 1970 }, { "epoch": 0.5181462869905081, "grad_norm": 0.9698496460914612, "learning_rate": 8.274925530050815e-05, "loss": 4.0293, "step": 1972 }, { "epoch": 0.5186717903241699, "grad_norm": 1.1908624172210693, "learning_rate": 8.27317329595234e-05, "loss": 4.0449, "step": 1974 }, { "epoch": 0.5191972936578316, "grad_norm": 1.1393076181411743, "learning_rate": 8.271421061853865e-05, "loss": 3.9754, "step": 1976 }, { "epoch": 0.5197227969914934, "grad_norm": 1.2156882286071777, "learning_rate": 8.269668827755388e-05, "loss": 4.0512, "step": 1978 }, { "epoch": 0.5202483003251552, "grad_norm": 1.09844172000885, "learning_rate": 8.267916593656913e-05, "loss": 4.0774, "step": 1980 }, { "epoch": 0.520773803658817, "grad_norm": 1.157374382019043, "learning_rate": 8.266164359558437e-05, "loss": 4.0353, "step": 1982 }, { "epoch": 0.5212993069924787, "grad_norm": 1.0127466917037964, "learning_rate": 8.264412125459961e-05, "loss": 4.0044, "step": 1984 }, { "epoch": 0.5218248103261405, "grad_norm": 1.484802484512329, "learning_rate": 8.262659891361486e-05, "loss": 4.0465, "step": 1986 }, { "epoch": 0.5223503136598023, "grad_norm": 1.3299627304077148, 
"learning_rate": 8.260907657263011e-05, "loss": 4.0123, "step": 1988 }, { "epoch": 0.5228758169934641, "grad_norm": 1.1253503561019897, "learning_rate": 8.259155423164536e-05, "loss": 4.059, "step": 1990 }, { "epoch": 0.5234013203271258, "grad_norm": 1.0858798027038574, "learning_rate": 8.25740318906606e-05, "loss": 4.0227, "step": 1992 }, { "epoch": 0.5239268236607876, "grad_norm": 1.1092426776885986, "learning_rate": 8.255650954967584e-05, "loss": 4.1028, "step": 1994 }, { "epoch": 0.5244523269944493, "grad_norm": 1.24538254737854, "learning_rate": 8.253898720869108e-05, "loss": 4.0331, "step": 1996 }, { "epoch": 0.5249778303281112, "grad_norm": 1.0742114782333374, "learning_rate": 8.252146486770633e-05, "loss": 4.0165, "step": 1998 }, { "epoch": 0.5255033336617729, "grad_norm": 0.950673520565033, "learning_rate": 8.250394252672158e-05, "loss": 4.0123, "step": 2000 }, { "epoch": 0.5255033336617729, "eval_loss": 3.9716429710388184, "eval_runtime": 478.8342, "eval_samples_per_second": 254.345, "eval_steps_per_second": 7.948, "step": 2000 }, { "epoch": 0.5260288369954347, "grad_norm": 1.0853930711746216, "learning_rate": 8.248642018573683e-05, "loss": 4.0322, "step": 2002 }, { "epoch": 0.5265543403290964, "grad_norm": 1.0149612426757812, "learning_rate": 8.246889784475206e-05, "loss": 4.0084, "step": 2004 }, { "epoch": 0.5270798436627583, "grad_norm": 1.267917275428772, "learning_rate": 8.245137550376731e-05, "loss": 4.032, "step": 2006 }, { "epoch": 0.52760534699642, "grad_norm": 1.0529532432556152, "learning_rate": 8.243385316278254e-05, "loss": 4.0601, "step": 2008 }, { "epoch": 0.5281308503300818, "grad_norm": 1.032828450202942, "learning_rate": 8.241633082179779e-05, "loss": 4.0228, "step": 2010 }, { "epoch": 0.5286563536637435, "grad_norm": 1.172132968902588, "learning_rate": 8.239880848081304e-05, "loss": 3.998, "step": 2012 }, { "epoch": 0.5291818569974053, "grad_norm": 0.9776386618614197, "learning_rate": 8.238128613982829e-05, "loss": 4.0394, "step": 2014 
}, { "epoch": 0.5297073603310671, "grad_norm": 1.0237400531768799, "learning_rate": 8.236376379884353e-05, "loss": 4.0442, "step": 2016 }, { "epoch": 0.5302328636647289, "grad_norm": 0.9288114905357361, "learning_rate": 8.234624145785878e-05, "loss": 4.0181, "step": 2018 }, { "epoch": 0.5307583669983906, "grad_norm": 1.1143784523010254, "learning_rate": 8.232871911687401e-05, "loss": 4.0445, "step": 2020 }, { "epoch": 0.5312838703320524, "grad_norm": 0.9036015868186951, "learning_rate": 8.231119677588926e-05, "loss": 4.0199, "step": 2022 }, { "epoch": 0.5318093736657142, "grad_norm": 0.9095367193222046, "learning_rate": 8.229367443490451e-05, "loss": 4.0182, "step": 2024 }, { "epoch": 0.532334876999376, "grad_norm": 1.0532479286193848, "learning_rate": 8.227615209391976e-05, "loss": 4.0383, "step": 2026 }, { "epoch": 0.5328603803330377, "grad_norm": 0.9810823202133179, "learning_rate": 8.2258629752935e-05, "loss": 4.024, "step": 2028 }, { "epoch": 0.5333858836666995, "grad_norm": 1.0345335006713867, "learning_rate": 8.224110741195024e-05, "loss": 4.0198, "step": 2030 }, { "epoch": 0.5339113870003613, "grad_norm": 1.3674103021621704, "learning_rate": 8.222358507096549e-05, "loss": 4.0177, "step": 2032 }, { "epoch": 0.5344368903340231, "grad_norm": 0.9938297867774963, "learning_rate": 8.220606272998072e-05, "loss": 4.0224, "step": 2034 }, { "epoch": 0.5349623936676848, "grad_norm": 1.1028859615325928, "learning_rate": 8.218854038899597e-05, "loss": 3.9482, "step": 2036 }, { "epoch": 0.5354878970013466, "grad_norm": 0.93757164478302, "learning_rate": 8.217101804801122e-05, "loss": 4.007, "step": 2038 }, { "epoch": 0.5360134003350083, "grad_norm": 0.8286834955215454, "learning_rate": 8.215349570702646e-05, "loss": 4.0464, "step": 2040 }, { "epoch": 0.5365389036686702, "grad_norm": 0.9312530755996704, "learning_rate": 8.213597336604171e-05, "loss": 4.0368, "step": 2042 }, { "epoch": 0.5370644070023319, "grad_norm": 1.1937620639801025, "learning_rate": 
8.211845102505696e-05, "loss": 4.0075, "step": 2044 }, { "epoch": 0.5375899103359937, "grad_norm": 0.9114108085632324, "learning_rate": 8.210092868407219e-05, "loss": 4.0293, "step": 2046 }, { "epoch": 0.5381154136696554, "grad_norm": 0.8943130373954773, "learning_rate": 8.208340634308744e-05, "loss": 3.9685, "step": 2048 }, { "epoch": 0.5386409170033173, "grad_norm": 1.0412880182266235, "learning_rate": 8.206588400210269e-05, "loss": 4.0582, "step": 2050 }, { "epoch": 0.539166420336979, "grad_norm": 0.9598733186721802, "learning_rate": 8.204836166111794e-05, "loss": 3.9898, "step": 2052 }, { "epoch": 0.5396919236706408, "grad_norm": 0.9655455350875854, "learning_rate": 8.203083932013317e-05, "loss": 3.9867, "step": 2054 }, { "epoch": 0.5402174270043025, "grad_norm": 0.9302845001220703, "learning_rate": 8.201331697914842e-05, "loss": 4.0109, "step": 2056 }, { "epoch": 0.5407429303379643, "grad_norm": 0.9652458429336548, "learning_rate": 8.199579463816366e-05, "loss": 3.9898, "step": 2058 }, { "epoch": 0.5412684336716261, "grad_norm": 0.887883186340332, "learning_rate": 8.19782722971789e-05, "loss": 4.0055, "step": 2060 }, { "epoch": 0.5417939370052879, "grad_norm": 0.9993941783905029, "learning_rate": 8.196074995619415e-05, "loss": 3.9712, "step": 2062 }, { "epoch": 0.5423194403389496, "grad_norm": 0.8853415250778198, "learning_rate": 8.19432276152094e-05, "loss": 4.0202, "step": 2064 }, { "epoch": 0.5428449436726114, "grad_norm": 1.0737260580062866, "learning_rate": 8.192570527422464e-05, "loss": 3.9955, "step": 2066 }, { "epoch": 0.5433704470062732, "grad_norm": 0.9146938323974609, "learning_rate": 8.190818293323989e-05, "loss": 4.0714, "step": 2068 }, { "epoch": 0.543895950339935, "grad_norm": 0.9432989954948425, "learning_rate": 8.189066059225514e-05, "loss": 4.0478, "step": 2070 }, { "epoch": 0.5444214536735967, "grad_norm": 1.051314353942871, "learning_rate": 8.187313825127037e-05, "loss": 3.9655, "step": 2072 }, { "epoch": 0.5449469570072585, "grad_norm": 
1.0668617486953735, "learning_rate": 8.185561591028562e-05, "loss": 4.0364, "step": 2074 }, { "epoch": 0.5454724603409203, "grad_norm": 1.2679084539413452, "learning_rate": 8.183809356930087e-05, "loss": 4.012, "step": 2076 }, { "epoch": 0.5459979636745821, "grad_norm": 0.9845985174179077, "learning_rate": 8.182057122831611e-05, "loss": 4.0055, "step": 2078 }, { "epoch": 0.5465234670082438, "grad_norm": 1.0778881311416626, "learning_rate": 8.180304888733135e-05, "loss": 4.0436, "step": 2080 }, { "epoch": 0.5470489703419056, "grad_norm": 1.1152747869491577, "learning_rate": 8.17855265463466e-05, "loss": 3.9916, "step": 2082 }, { "epoch": 0.5475744736755673, "grad_norm": 1.20831298828125, "learning_rate": 8.176800420536184e-05, "loss": 4.0207, "step": 2084 }, { "epoch": 0.5480999770092292, "grad_norm": 1.4499107599258423, "learning_rate": 8.175048186437708e-05, "loss": 4.0551, "step": 2086 }, { "epoch": 0.5486254803428909, "grad_norm": 1.1653656959533691, "learning_rate": 8.173295952339232e-05, "loss": 3.995, "step": 2088 }, { "epoch": 0.5491509836765527, "grad_norm": 1.1139881610870361, "learning_rate": 8.171543718240757e-05, "loss": 4.0014, "step": 2090 }, { "epoch": 0.5496764870102144, "grad_norm": 0.8757941722869873, "learning_rate": 8.169791484142282e-05, "loss": 4.0001, "step": 2092 }, { "epoch": 0.5502019903438763, "grad_norm": 0.9843849539756775, "learning_rate": 8.168039250043807e-05, "loss": 3.9966, "step": 2094 }, { "epoch": 0.550727493677538, "grad_norm": 0.8887428045272827, "learning_rate": 8.166287015945331e-05, "loss": 4.0475, "step": 2096 }, { "epoch": 0.5512529970111998, "grad_norm": 0.9970255494117737, "learning_rate": 8.164534781846855e-05, "loss": 4.0077, "step": 2098 }, { "epoch": 0.5517785003448615, "grad_norm": 1.1570550203323364, "learning_rate": 8.16278254774838e-05, "loss": 4.0283, "step": 2100 }, { "epoch": 0.5523040036785233, "grad_norm": 1.1024932861328125, "learning_rate": 8.161030313649904e-05, "loss": 4.0342, "step": 2102 }, { "epoch": 
0.5528295070121851, "grad_norm": 0.9216394424438477, "learning_rate": 8.159278079551429e-05, "loss": 4.0027, "step": 2104 }, { "epoch": 0.5533550103458469, "grad_norm": 1.0783919095993042, "learning_rate": 8.157525845452952e-05, "loss": 4.0533, "step": 2106 }, { "epoch": 0.5538805136795086, "grad_norm": 0.9461116790771484, "learning_rate": 8.155773611354477e-05, "loss": 4.0244, "step": 2108 }, { "epoch": 0.5544060170131704, "grad_norm": 1.0073761940002441, "learning_rate": 8.154021377256002e-05, "loss": 4.0338, "step": 2110 }, { "epoch": 0.5549315203468322, "grad_norm": 0.9284674525260925, "learning_rate": 8.152269143157525e-05, "loss": 4.0213, "step": 2112 }, { "epoch": 0.555457023680494, "grad_norm": 1.1598211526870728, "learning_rate": 8.15051690905905e-05, "loss": 3.9655, "step": 2114 }, { "epoch": 0.5559825270141557, "grad_norm": 1.043800711631775, "learning_rate": 8.148764674960575e-05, "loss": 4.0319, "step": 2116 }, { "epoch": 0.5565080303478175, "grad_norm": 0.9038809537887573, "learning_rate": 8.1470124408621e-05, "loss": 3.9904, "step": 2118 }, { "epoch": 0.5570335336814793, "grad_norm": 1.0074305534362793, "learning_rate": 8.145260206763624e-05, "loss": 4.01, "step": 2120 }, { "epoch": 0.5575590370151411, "grad_norm": 0.9808998107910156, "learning_rate": 8.143507972665149e-05, "loss": 4.0583, "step": 2122 }, { "epoch": 0.5580845403488028, "grad_norm": 0.9064949750900269, "learning_rate": 8.141755738566673e-05, "loss": 4.0337, "step": 2124 }, { "epoch": 0.5586100436824646, "grad_norm": 0.9302887916564941, "learning_rate": 8.140003504468197e-05, "loss": 3.9728, "step": 2126 }, { "epoch": 0.5591355470161263, "grad_norm": 1.0245585441589355, "learning_rate": 8.138251270369722e-05, "loss": 3.9974, "step": 2128 }, { "epoch": 0.5596610503497882, "grad_norm": 0.946938693523407, "learning_rate": 8.136499036271247e-05, "loss": 4.0164, "step": 2130 }, { "epoch": 0.5601865536834499, "grad_norm": 0.9066144227981567, "learning_rate": 8.13474680217277e-05, "loss": 
4.014, "step": 2132 }, { "epoch": 0.5607120570171117, "grad_norm": 0.9954712986946106, "learning_rate": 8.132994568074295e-05, "loss": 4.0084, "step": 2134 }, { "epoch": 0.5612375603507734, "grad_norm": 0.8025904893875122, "learning_rate": 8.13124233397582e-05, "loss": 4.0563, "step": 2136 }, { "epoch": 0.5617630636844353, "grad_norm": 1.1543691158294678, "learning_rate": 8.129490099877343e-05, "loss": 4.0058, "step": 2138 }, { "epoch": 0.562288567018097, "grad_norm": 1.0630464553833008, "learning_rate": 8.127737865778868e-05, "loss": 4.0194, "step": 2140 }, { "epoch": 0.5628140703517588, "grad_norm": 0.9283933043479919, "learning_rate": 8.125985631680393e-05, "loss": 3.9781, "step": 2142 }, { "epoch": 0.5633395736854205, "grad_norm": 1.007786512374878, "learning_rate": 8.124233397581917e-05, "loss": 4.0039, "step": 2144 }, { "epoch": 0.5638650770190824, "grad_norm": 0.9337747097015381, "learning_rate": 8.122481163483442e-05, "loss": 3.9915, "step": 2146 }, { "epoch": 0.5643905803527441, "grad_norm": 1.0909299850463867, "learning_rate": 8.120728929384967e-05, "loss": 4.0364, "step": 2148 }, { "epoch": 0.5649160836864059, "grad_norm": 1.0672916173934937, "learning_rate": 8.11897669528649e-05, "loss": 4.0221, "step": 2150 }, { "epoch": 0.5654415870200676, "grad_norm": 0.8512747287750244, "learning_rate": 8.117224461188015e-05, "loss": 4.069, "step": 2152 }, { "epoch": 0.5659670903537294, "grad_norm": 1.1954847574234009, "learning_rate": 8.11547222708954e-05, "loss": 3.9911, "step": 2154 }, { "epoch": 0.5664925936873912, "grad_norm": 0.9831007719039917, "learning_rate": 8.113719992991063e-05, "loss": 4.0574, "step": 2156 }, { "epoch": 0.567018097021053, "grad_norm": 0.8921142816543579, "learning_rate": 8.111967758892588e-05, "loss": 4.0272, "step": 2158 }, { "epoch": 0.5675436003547147, "grad_norm": 1.0555696487426758, "learning_rate": 8.110215524794113e-05, "loss": 3.9943, "step": 2160 }, { "epoch": 0.5680691036883765, "grad_norm": 0.9853285551071167, 
"learning_rate": 8.108463290695637e-05, "loss": 3.9745, "step": 2162 }, { "epoch": 0.5685946070220383, "grad_norm": 1.3033417463302612, "learning_rate": 8.106711056597162e-05, "loss": 4.002, "step": 2164 }, { "epoch": 0.5691201103557001, "grad_norm": 1.050309658050537, "learning_rate": 8.104958822498686e-05, "loss": 3.9832, "step": 2166 }, { "epoch": 0.5696456136893618, "grad_norm": 1.3063619136810303, "learning_rate": 8.10320658840021e-05, "loss": 3.9936, "step": 2168 }, { "epoch": 0.5701711170230236, "grad_norm": 0.8233873844146729, "learning_rate": 8.101454354301735e-05, "loss": 4.0063, "step": 2170 }, { "epoch": 0.5706966203566853, "grad_norm": 1.0360580682754517, "learning_rate": 8.09970212020326e-05, "loss": 3.9551, "step": 2172 }, { "epoch": 0.5712221236903472, "grad_norm": 0.8834353685379028, "learning_rate": 8.097949886104785e-05, "loss": 4.0013, "step": 2174 }, { "epoch": 0.5717476270240089, "grad_norm": 0.9395789504051208, "learning_rate": 8.09619765200631e-05, "loss": 4.023, "step": 2176 }, { "epoch": 0.5722731303576707, "grad_norm": 1.0187454223632812, "learning_rate": 8.094445417907833e-05, "loss": 4.0409, "step": 2178 }, { "epoch": 0.5727986336913324, "grad_norm": 0.9762563705444336, "learning_rate": 8.092693183809358e-05, "loss": 4.0307, "step": 2180 }, { "epoch": 0.5733241370249943, "grad_norm": 0.8479059338569641, "learning_rate": 8.090940949710881e-05, "loss": 4.0072, "step": 2182 }, { "epoch": 0.573849640358656, "grad_norm": 0.9563126564025879, "learning_rate": 8.089188715612406e-05, "loss": 3.973, "step": 2184 }, { "epoch": 0.5743751436923178, "grad_norm": 0.9131817817687988, "learning_rate": 8.08743648151393e-05, "loss": 3.9255, "step": 2186 }, { "epoch": 0.5749006470259795, "grad_norm": 0.8804376721382141, "learning_rate": 8.085684247415455e-05, "loss": 3.9948, "step": 2188 }, { "epoch": 0.5754261503596414, "grad_norm": 1.2335556745529175, "learning_rate": 8.08393201331698e-05, "loss": 4.0545, "step": 2190 }, { "epoch": 0.5759516536933031, 
"grad_norm": 1.1221791505813599, "learning_rate": 8.082179779218503e-05, "loss": 4.0168, "step": 2192 }, { "epoch": 0.5764771570269649, "grad_norm": 1.3346920013427734, "learning_rate": 8.080427545120028e-05, "loss": 3.9815, "step": 2194 }, { "epoch": 0.5770026603606266, "grad_norm": 0.9278746247291565, "learning_rate": 8.078675311021553e-05, "loss": 3.9754, "step": 2196 }, { "epoch": 0.5775281636942884, "grad_norm": 1.0014820098876953, "learning_rate": 8.076923076923078e-05, "loss": 3.9917, "step": 2198 }, { "epoch": 0.5780536670279502, "grad_norm": 0.981235682964325, "learning_rate": 8.075170842824602e-05, "loss": 3.9448, "step": 2200 }, { "epoch": 0.578579170361612, "grad_norm": 1.0179733037948608, "learning_rate": 8.073418608726127e-05, "loss": 3.9985, "step": 2202 }, { "epoch": 0.5791046736952737, "grad_norm": 0.936255931854248, "learning_rate": 8.07166637462765e-05, "loss": 4.0347, "step": 2204 }, { "epoch": 0.5796301770289355, "grad_norm": 1.0361779928207397, "learning_rate": 8.069914140529175e-05, "loss": 3.9886, "step": 2206 }, { "epoch": 0.5801556803625973, "grad_norm": 0.9133824110031128, "learning_rate": 8.068161906430699e-05, "loss": 3.9785, "step": 2208 }, { "epoch": 0.5806811836962591, "grad_norm": 0.9091584086418152, "learning_rate": 8.066409672332224e-05, "loss": 4.0211, "step": 2210 }, { "epoch": 0.5812066870299208, "grad_norm": 1.0066263675689697, "learning_rate": 8.064657438233748e-05, "loss": 3.9528, "step": 2212 }, { "epoch": 0.5817321903635826, "grad_norm": 0.8723448514938354, "learning_rate": 8.062905204135273e-05, "loss": 3.9623, "step": 2214 }, { "epoch": 0.5822576936972443, "grad_norm": 1.1297575235366821, "learning_rate": 8.061152970036798e-05, "loss": 4.0417, "step": 2216 }, { "epoch": 0.5827831970309062, "grad_norm": 1.0618151426315308, "learning_rate": 8.059400735938321e-05, "loss": 4.071, "step": 2218 }, { "epoch": 0.583308700364568, "grad_norm": 1.141478419303894, "learning_rate": 8.057648501839846e-05, "loss": 3.9865, "step": 2220 
}, { "epoch": 0.5838342036982297, "grad_norm": 0.9279476404190063, "learning_rate": 8.055896267741371e-05, "loss": 3.9755, "step": 2222 }, { "epoch": 0.5843597070318914, "grad_norm": 1.0134791135787964, "learning_rate": 8.054144033642895e-05, "loss": 4.016, "step": 2224 }, { "epoch": 0.5848852103655533, "grad_norm": 0.9677004218101501, "learning_rate": 8.05239179954442e-05, "loss": 4.0333, "step": 2226 }, { "epoch": 0.585410713699215, "grad_norm": 1.083963394165039, "learning_rate": 8.050639565445945e-05, "loss": 4.0292, "step": 2228 }, { "epoch": 0.5859362170328768, "grad_norm": 0.9336953163146973, "learning_rate": 8.048887331347468e-05, "loss": 4.0181, "step": 2230 }, { "epoch": 0.5864617203665385, "grad_norm": 0.8827482461929321, "learning_rate": 8.047135097248993e-05, "loss": 3.9839, "step": 2232 }, { "epoch": 0.5869872237002004, "grad_norm": 0.9447222948074341, "learning_rate": 8.045382863150517e-05, "loss": 4.0414, "step": 2234 }, { "epoch": 0.5875127270338621, "grad_norm": 0.9160022139549255, "learning_rate": 8.043630629052041e-05, "loss": 4.0161, "step": 2236 }, { "epoch": 0.5880382303675239, "grad_norm": 0.8835806846618652, "learning_rate": 8.041878394953566e-05, "loss": 3.9878, "step": 2238 }, { "epoch": 0.5885637337011856, "grad_norm": 0.8925127387046814, "learning_rate": 8.040126160855091e-05, "loss": 3.9887, "step": 2240 }, { "epoch": 0.5890892370348474, "grad_norm": 0.9460300803184509, "learning_rate": 8.038373926756616e-05, "loss": 4.0349, "step": 2242 }, { "epoch": 0.5896147403685092, "grad_norm": 0.8593032956123352, "learning_rate": 8.036621692658139e-05, "loss": 4.0258, "step": 2244 }, { "epoch": 0.590140243702171, "grad_norm": 0.922136664390564, "learning_rate": 8.034869458559664e-05, "loss": 4.0031, "step": 2246 }, { "epoch": 0.5906657470358327, "grad_norm": 0.9428873062133789, "learning_rate": 8.033117224461188e-05, "loss": 4.0194, "step": 2248 }, { "epoch": 0.5911912503694945, "grad_norm": 0.9693458080291748, "learning_rate": 
8.031364990362713e-05, "loss": 3.9447, "step": 2250 }, { "epoch": 0.5917167537031564, "grad_norm": 1.0728312730789185, "learning_rate": 8.029612756264238e-05, "loss": 4.0304, "step": 2252 }, { "epoch": 0.5922422570368181, "grad_norm": 1.1566743850708008, "learning_rate": 8.027860522165763e-05, "loss": 3.9736, "step": 2254 }, { "epoch": 0.5927677603704798, "grad_norm": 0.9907905459403992, "learning_rate": 8.026108288067286e-05, "loss": 3.9483, "step": 2256 }, { "epoch": 0.5932932637041416, "grad_norm": 1.0294867753982544, "learning_rate": 8.02435605396881e-05, "loss": 4.0444, "step": 2258 }, { "epoch": 0.5938187670378033, "grad_norm": 0.9118397235870361, "learning_rate": 8.022603819870334e-05, "loss": 4.0132, "step": 2260 }, { "epoch": 0.5943442703714652, "grad_norm": 1.136412262916565, "learning_rate": 8.020851585771859e-05, "loss": 4.0197, "step": 2262 }, { "epoch": 0.594869773705127, "grad_norm": 1.3454824686050415, "learning_rate": 8.019099351673384e-05, "loss": 3.985, "step": 2264 }, { "epoch": 0.5953952770387887, "grad_norm": 0.8937453627586365, "learning_rate": 8.017347117574909e-05, "loss": 4.0267, "step": 2266 }, { "epoch": 0.5959207803724504, "grad_norm": 0.8828874826431274, "learning_rate": 8.015594883476433e-05, "loss": 4.066, "step": 2268 }, { "epoch": 0.5964462837061123, "grad_norm": 0.8990727663040161, "learning_rate": 8.013842649377957e-05, "loss": 3.9782, "step": 2270 }, { "epoch": 0.596971787039774, "grad_norm": 0.9934741258621216, "learning_rate": 8.012090415279481e-05, "loss": 3.98, "step": 2272 }, { "epoch": 0.5974972903734358, "grad_norm": 1.241985559463501, "learning_rate": 8.010338181181006e-05, "loss": 3.9886, "step": 2274 }, { "epoch": 0.5980227937070975, "grad_norm": 0.986441433429718, "learning_rate": 8.008585947082531e-05, "loss": 4.0658, "step": 2276 }, { "epoch": 0.5985482970407594, "grad_norm": 1.0386943817138672, "learning_rate": 8.006833712984056e-05, "loss": 3.9657, "step": 2278 }, { "epoch": 0.5990738003744212, "grad_norm": 
0.8961172699928284, "learning_rate": 8.00508147888558e-05, "loss": 4.0273, "step": 2280 }, { "epoch": 0.5995993037080829, "grad_norm": 1.013377070426941, "learning_rate": 8.003329244787104e-05, "loss": 3.9915, "step": 2282 }, { "epoch": 0.6001248070417446, "grad_norm": 0.9038441777229309, "learning_rate": 8.001577010688627e-05, "loss": 3.9488, "step": 2284 }, { "epoch": 0.6006503103754064, "grad_norm": 0.9897975921630859, "learning_rate": 7.999824776590152e-05, "loss": 3.974, "step": 2286 }, { "epoch": 0.6011758137090683, "grad_norm": 0.8691598773002625, "learning_rate": 7.998072542491677e-05, "loss": 3.9484, "step": 2288 }, { "epoch": 0.60170131704273, "grad_norm": 1.1366537809371948, "learning_rate": 7.996320308393202e-05, "loss": 4.0092, "step": 2290 }, { "epoch": 0.6022268203763917, "grad_norm": 0.9541658163070679, "learning_rate": 7.994568074294726e-05, "loss": 4.0208, "step": 2292 }, { "epoch": 0.6027523237100535, "grad_norm": 0.9372681379318237, "learning_rate": 7.992815840196251e-05, "loss": 4.0315, "step": 2294 }, { "epoch": 0.6032778270437154, "grad_norm": 0.9746453762054443, "learning_rate": 7.991063606097774e-05, "loss": 3.9406, "step": 2296 }, { "epoch": 0.6038033303773771, "grad_norm": 1.098426103591919, "learning_rate": 7.989311371999299e-05, "loss": 3.96, "step": 2298 }, { "epoch": 0.6043288337110388, "grad_norm": 0.981225848197937, "learning_rate": 7.987559137900824e-05, "loss": 3.9238, "step": 2300 }, { "epoch": 0.6048543370447006, "grad_norm": 1.0758087635040283, "learning_rate": 7.985806903802349e-05, "loss": 3.9589, "step": 2302 }, { "epoch": 0.6053798403783625, "grad_norm": 0.9756500124931335, "learning_rate": 7.984054669703873e-05, "loss": 4.0465, "step": 2304 }, { "epoch": 0.6059053437120242, "grad_norm": 0.9800392389297485, "learning_rate": 7.982302435605398e-05, "loss": 3.9551, "step": 2306 }, { "epoch": 0.606430847045686, "grad_norm": 1.0872782468795776, "learning_rate": 7.980550201506922e-05, "loss": 3.9954, "step": 2308 }, { "epoch": 
0.6069563503793477, "grad_norm": 0.9080469012260437, "learning_rate": 7.978797967408445e-05, "loss": 4.007, "step": 2310 }, { "epoch": 0.6074818537130094, "grad_norm": 0.9582008123397827, "learning_rate": 7.97704573330997e-05, "loss": 3.9565, "step": 2312 }, { "epoch": 0.6080073570466713, "grad_norm": 0.9006783366203308, "learning_rate": 7.975293499211495e-05, "loss": 4.0371, "step": 2314 }, { "epoch": 0.608532860380333, "grad_norm": 0.8905913829803467, "learning_rate": 7.97354126511302e-05, "loss": 3.9442, "step": 2316 }, { "epoch": 0.6090583637139948, "grad_norm": 0.864420473575592, "learning_rate": 7.971789031014544e-05, "loss": 4.0007, "step": 2318 }, { "epoch": 0.6095838670476565, "grad_norm": 0.9698134064674377, "learning_rate": 7.970036796916069e-05, "loss": 3.9337, "step": 2320 }, { "epoch": 0.6101093703813184, "grad_norm": 0.9633287787437439, "learning_rate": 7.968284562817592e-05, "loss": 3.9868, "step": 2322 }, { "epoch": 0.6106348737149802, "grad_norm": 0.9664962887763977, "learning_rate": 7.966532328719117e-05, "loss": 3.9996, "step": 2324 }, { "epoch": 0.6111603770486419, "grad_norm": 0.9269763827323914, "learning_rate": 7.964780094620642e-05, "loss": 4.0137, "step": 2326 }, { "epoch": 0.6116858803823036, "grad_norm": 1.0116597414016724, "learning_rate": 7.963027860522167e-05, "loss": 4.0029, "step": 2328 }, { "epoch": 0.6122113837159654, "grad_norm": 1.0242760181427002, "learning_rate": 7.961275626423691e-05, "loss": 3.9959, "step": 2330 }, { "epoch": 0.6127368870496273, "grad_norm": 0.8526800274848938, "learning_rate": 7.959523392325216e-05, "loss": 4.0106, "step": 2332 }, { "epoch": 0.613262390383289, "grad_norm": 1.0614292621612549, "learning_rate": 7.95777115822674e-05, "loss": 4.0424, "step": 2334 }, { "epoch": 0.6137878937169507, "grad_norm": 0.8463445901870728, "learning_rate": 7.956018924128263e-05, "loss": 3.9791, "step": 2336 }, { "epoch": 0.6143133970506125, "grad_norm": 1.1134741306304932, "learning_rate": 7.954266690029788e-05, "loss": 
3.958, "step": 2338 }, { "epoch": 0.6148389003842744, "grad_norm": 1.0754562616348267, "learning_rate": 7.952514455931312e-05, "loss": 4.0275, "step": 2340 }, { "epoch": 0.6153644037179361, "grad_norm": 0.9124601483345032, "learning_rate": 7.950762221832837e-05, "loss": 4.0256, "step": 2342 }, { "epoch": 0.6158899070515979, "grad_norm": 1.4143973588943481, "learning_rate": 7.949009987734362e-05, "loss": 3.9379, "step": 2344 }, { "epoch": 0.6164154103852596, "grad_norm": 1.0514856576919556, "learning_rate": 7.947257753635887e-05, "loss": 4.0496, "step": 2346 }, { "epoch": 0.6169409137189215, "grad_norm": 0.859139621257782, "learning_rate": 7.94550551953741e-05, "loss": 3.9639, "step": 2348 }, { "epoch": 0.6174664170525832, "grad_norm": 1.0306580066680908, "learning_rate": 7.943753285438935e-05, "loss": 4.0308, "step": 2350 }, { "epoch": 0.617991920386245, "grad_norm": 0.8120926022529602, "learning_rate": 7.94200105134046e-05, "loss": 4.0111, "step": 2352 }, { "epoch": 0.6185174237199067, "grad_norm": 1.101628065109253, "learning_rate": 7.940248817241984e-05, "loss": 3.9496, "step": 2354 }, { "epoch": 0.6190429270535684, "grad_norm": 0.8194558620452881, "learning_rate": 7.938496583143509e-05, "loss": 4.0022, "step": 2356 }, { "epoch": 0.6195684303872303, "grad_norm": 0.951854407787323, "learning_rate": 7.936744349045034e-05, "loss": 3.9754, "step": 2358 }, { "epoch": 0.620093933720892, "grad_norm": 0.8281410932540894, "learning_rate": 7.934992114946557e-05, "loss": 3.9553, "step": 2360 }, { "epoch": 0.6206194370545538, "grad_norm": 0.9923112988471985, "learning_rate": 7.93323988084808e-05, "loss": 3.958, "step": 2362 }, { "epoch": 0.6211449403882155, "grad_norm": 0.9411230087280273, "learning_rate": 7.931487646749605e-05, "loss": 4.0171, "step": 2364 }, { "epoch": 0.6216704437218774, "grad_norm": 0.98564612865448, "learning_rate": 7.92973541265113e-05, "loss": 3.9296, "step": 2366 }, { "epoch": 0.6221959470555392, "grad_norm": 0.9742181301116943, "learning_rate": 
7.927983178552655e-05, "loss": 4.0039, "step": 2368 }, { "epoch": 0.6227214503892009, "grad_norm": 0.912119448184967, "learning_rate": 7.92623094445418e-05, "loss": 3.9815, "step": 2370 }, { "epoch": 0.6232469537228627, "grad_norm": 1.036596417427063, "learning_rate": 7.924478710355704e-05, "loss": 4.0025, "step": 2372 }, { "epoch": 0.6237724570565244, "grad_norm": 0.9650595188140869, "learning_rate": 7.922726476257228e-05, "loss": 3.95, "step": 2374 }, { "epoch": 0.6242979603901863, "grad_norm": 0.9106220602989197, "learning_rate": 7.920974242158753e-05, "loss": 4.0145, "step": 2376 }, { "epoch": 0.624823463723848, "grad_norm": 0.9039183855056763, "learning_rate": 7.919222008060277e-05, "loss": 3.9871, "step": 2378 }, { "epoch": 0.6253489670575098, "grad_norm": 0.9161286354064941, "learning_rate": 7.917469773961802e-05, "loss": 3.9358, "step": 2380 }, { "epoch": 0.6258744703911715, "grad_norm": 0.9155192971229553, "learning_rate": 7.915717539863327e-05, "loss": 3.9659, "step": 2382 }, { "epoch": 0.6263999737248334, "grad_norm": 0.8665133714675903, "learning_rate": 7.913965305764852e-05, "loss": 3.9434, "step": 2384 }, { "epoch": 0.6269254770584951, "grad_norm": 0.9369966983795166, "learning_rate": 7.912213071666375e-05, "loss": 3.9772, "step": 2386 }, { "epoch": 0.6274509803921569, "grad_norm": 0.9370551109313965, "learning_rate": 7.910460837567898e-05, "loss": 4.0094, "step": 2388 }, { "epoch": 0.6279764837258186, "grad_norm": 0.8143720030784607, "learning_rate": 7.908708603469423e-05, "loss": 3.9893, "step": 2390 }, { "epoch": 0.6285019870594805, "grad_norm": 0.9256435036659241, "learning_rate": 7.906956369370948e-05, "loss": 4.0127, "step": 2392 }, { "epoch": 0.6290274903931422, "grad_norm": 0.8944156169891357, "learning_rate": 7.905204135272473e-05, "loss": 3.9844, "step": 2394 }, { "epoch": 0.629552993726804, "grad_norm": 0.9650312066078186, "learning_rate": 7.903451901173997e-05, "loss": 3.993, "step": 2396 }, { "epoch": 0.6300784970604657, "grad_norm": 
0.8487775325775146, "learning_rate": 7.901699667075522e-05, "loss": 3.9708, "step": 2398 }, { "epoch": 0.6306040003941275, "grad_norm": 0.975088357925415, "learning_rate": 7.899947432977046e-05, "loss": 3.9874, "step": 2400 }, { "epoch": 0.6306040003941275, "eval_loss": 3.935922384262085, "eval_runtime": 485.3478, "eval_samples_per_second": 250.931, "eval_steps_per_second": 7.842, "step": 2400 }, { "epoch": 0.6311295037277893, "grad_norm": 0.903324544429779, "learning_rate": 7.89819519887857e-05, "loss": 4.0082, "step": 2402 }, { "epoch": 0.6316550070614511, "grad_norm": 0.9841864109039307, "learning_rate": 7.896442964780095e-05, "loss": 4.0219, "step": 2404 }, { "epoch": 0.6321805103951128, "grad_norm": 0.8466092944145203, "learning_rate": 7.89469073068162e-05, "loss": 3.9398, "step": 2406 }, { "epoch": 0.6327060137287746, "grad_norm": 0.9567834138870239, "learning_rate": 7.892938496583145e-05, "loss": 3.9473, "step": 2408 }, { "epoch": 0.6332315170624364, "grad_norm": 1.0650795698165894, "learning_rate": 7.891186262484669e-05, "loss": 3.987, "step": 2410 }, { "epoch": 0.6337570203960982, "grad_norm": 0.916459321975708, "learning_rate": 7.889434028386193e-05, "loss": 4.0072, "step": 2412 }, { "epoch": 0.6342825237297599, "grad_norm": 0.8857781887054443, "learning_rate": 7.887681794287717e-05, "loss": 4.0423, "step": 2414 }, { "epoch": 0.6348080270634217, "grad_norm": 1.0256000757217407, "learning_rate": 7.885929560189241e-05, "loss": 3.9628, "step": 2416 }, { "epoch": 0.6353335303970834, "grad_norm": 0.7993658185005188, "learning_rate": 7.884177326090766e-05, "loss": 3.988, "step": 2418 }, { "epoch": 0.6358590337307453, "grad_norm": 0.9537171125411987, "learning_rate": 7.88242509199229e-05, "loss": 4.01, "step": 2420 }, { "epoch": 0.636384537064407, "grad_norm": 0.9659585356712341, "learning_rate": 7.880672857893815e-05, "loss": 3.9836, "step": 2422 }, { "epoch": 0.6369100403980688, "grad_norm": 0.9807218313217163, "learning_rate": 7.87892062379534e-05, "loss": 
4.007, "step": 2424 }, { "epoch": 0.6374355437317305, "grad_norm": 0.9743635058403015, "learning_rate": 7.877168389696865e-05, "loss": 3.9786, "step": 2426 }, { "epoch": 0.6379610470653924, "grad_norm": 1.0024867057800293, "learning_rate": 7.875416155598388e-05, "loss": 4.005, "step": 2428 }, { "epoch": 0.6384865503990541, "grad_norm": 1.0909353494644165, "learning_rate": 7.873663921499913e-05, "loss": 4.0201, "step": 2430 }, { "epoch": 0.6390120537327159, "grad_norm": 1.1109079122543335, "learning_rate": 7.871911687401438e-05, "loss": 4.0159, "step": 2432 }, { "epoch": 0.6395375570663776, "grad_norm": 0.9947724938392639, "learning_rate": 7.870159453302962e-05, "loss": 3.9849, "step": 2434 }, { "epoch": 0.6400630604000395, "grad_norm": 1.0718566179275513, "learning_rate": 7.868407219204487e-05, "loss": 3.9981, "step": 2436 }, { "epoch": 0.6405885637337012, "grad_norm": 0.9501470327377319, "learning_rate": 7.86665498510601e-05, "loss": 3.9512, "step": 2438 }, { "epoch": 0.641114067067363, "grad_norm": 0.8307205438613892, "learning_rate": 7.864902751007535e-05, "loss": 3.9567, "step": 2440 }, { "epoch": 0.6416395704010247, "grad_norm": 0.8976757526397705, "learning_rate": 7.863150516909059e-05, "loss": 4.0078, "step": 2442 }, { "epoch": 0.6421650737346865, "grad_norm": 0.9467252492904663, "learning_rate": 7.861398282810583e-05, "loss": 4.041, "step": 2444 }, { "epoch": 0.6426905770683483, "grad_norm": 1.186718463897705, "learning_rate": 7.859646048712108e-05, "loss": 3.9838, "step": 2446 }, { "epoch": 0.6432160804020101, "grad_norm": 0.9179478287696838, "learning_rate": 7.857893814613633e-05, "loss": 3.9329, "step": 2448 }, { "epoch": 0.6437415837356718, "grad_norm": 1.029965877532959, "learning_rate": 7.856141580515158e-05, "loss": 4.0165, "step": 2450 }, { "epoch": 0.6442670870693336, "grad_norm": 0.8846809267997742, "learning_rate": 7.854389346416682e-05, "loss": 4.0354, "step": 2452 }, { "epoch": 0.6447925904029954, "grad_norm": 0.9284564256668091, 
"learning_rate": 7.852637112318206e-05, "loss": 3.9478, "step": 2454 }, { "epoch": 0.6453180937366572, "grad_norm": 0.869635820388794, "learning_rate": 7.85088487821973e-05, "loss": 4.0127, "step": 2456 }, { "epoch": 0.6458435970703189, "grad_norm": 1.0596611499786377, "learning_rate": 7.849132644121255e-05, "loss": 3.9773, "step": 2458 }, { "epoch": 0.6463691004039807, "grad_norm": 0.9975203275680542, "learning_rate": 7.84738041002278e-05, "loss": 3.9382, "step": 2460 }, { "epoch": 0.6468946037376425, "grad_norm": 0.9549726843833923, "learning_rate": 7.845628175924303e-05, "loss": 3.9918, "step": 2462 }, { "epoch": 0.6474201070713043, "grad_norm": 0.8911670446395874, "learning_rate": 7.843875941825828e-05, "loss": 3.9698, "step": 2464 }, { "epoch": 0.647945610404966, "grad_norm": 0.9614885449409485, "learning_rate": 7.842123707727353e-05, "loss": 4.0215, "step": 2466 }, { "epoch": 0.6484711137386278, "grad_norm": 0.9461268782615662, "learning_rate": 7.840371473628876e-05, "loss": 3.9916, "step": 2468 }, { "epoch": 0.6489966170722895, "grad_norm": 0.9601477384567261, "learning_rate": 7.838619239530401e-05, "loss": 3.9886, "step": 2470 }, { "epoch": 0.6495221204059514, "grad_norm": 0.7785865664482117, "learning_rate": 7.836867005431926e-05, "loss": 3.9964, "step": 2472 }, { "epoch": 0.6500476237396131, "grad_norm": 1.0971733331680298, "learning_rate": 7.835114771333451e-05, "loss": 3.9847, "step": 2474 }, { "epoch": 0.6505731270732749, "grad_norm": 0.9541909694671631, "learning_rate": 7.833362537234975e-05, "loss": 3.9732, "step": 2476 }, { "epoch": 0.6510986304069366, "grad_norm": 0.9917490482330322, "learning_rate": 7.8316103031365e-05, "loss": 4.041, "step": 2478 }, { "epoch": 0.6516241337405985, "grad_norm": 0.9501031041145325, "learning_rate": 7.829858069038024e-05, "loss": 4.0043, "step": 2480 }, { "epoch": 0.6521496370742602, "grad_norm": 0.8852449059486389, "learning_rate": 7.828105834939548e-05, "loss": 3.9436, "step": 2482 }, { "epoch": 0.652675140407922, 
"grad_norm": 1.0195696353912354, "learning_rate": 7.826353600841073e-05, "loss": 3.9997, "step": 2484 }, { "epoch": 0.6532006437415837, "grad_norm": 0.9401587247848511, "learning_rate": 7.824601366742598e-05, "loss": 3.9725, "step": 2486 }, { "epoch": 0.6537261470752455, "grad_norm": 0.8441434502601624, "learning_rate": 7.822849132644121e-05, "loss": 3.9692, "step": 2488 }, { "epoch": 0.6542516504089073, "grad_norm": 0.9912038445472717, "learning_rate": 7.821096898545646e-05, "loss": 3.9151, "step": 2490 }, { "epoch": 0.6547771537425691, "grad_norm": 1.4427993297576904, "learning_rate": 7.819344664447171e-05, "loss": 3.9739, "step": 2492 }, { "epoch": 0.6553026570762308, "grad_norm": 0.9082440733909607, "learning_rate": 7.817592430348694e-05, "loss": 3.948, "step": 2494 }, { "epoch": 0.6558281604098926, "grad_norm": 1.0606833696365356, "learning_rate": 7.815840196250219e-05, "loss": 3.9508, "step": 2496 }, { "epoch": 0.6563536637435544, "grad_norm": 0.8990094065666199, "learning_rate": 7.814087962151744e-05, "loss": 3.951, "step": 2498 }, { "epoch": 0.6568791670772162, "grad_norm": 0.8774867653846741, "learning_rate": 7.812335728053268e-05, "loss": 4.0201, "step": 2500 }, { "epoch": 0.6574046704108779, "grad_norm": 0.9492168426513672, "learning_rate": 7.810583493954793e-05, "loss": 3.9914, "step": 2502 }, { "epoch": 0.6579301737445397, "grad_norm": 1.0110276937484741, "learning_rate": 7.808831259856318e-05, "loss": 3.9824, "step": 2504 }, { "epoch": 0.6584556770782015, "grad_norm": 1.11878502368927, "learning_rate": 7.807079025757841e-05, "loss": 3.9317, "step": 2506 }, { "epoch": 0.6589811804118633, "grad_norm": 1.1290756464004517, "learning_rate": 7.805326791659366e-05, "loss": 4.0182, "step": 2508 }, { "epoch": 0.659506683745525, "grad_norm": 0.8934504985809326, "learning_rate": 7.803574557560891e-05, "loss": 3.9992, "step": 2510 }, { "epoch": 0.6600321870791868, "grad_norm": 0.9680683016777039, "learning_rate": 7.801822323462416e-05, "loss": 3.9823, "step": 
2512 }, { "epoch": 0.6605576904128485, "grad_norm": 0.7583855390548706, "learning_rate": 7.800070089363939e-05, "loss": 3.9544, "step": 2514 }, { "epoch": 0.6610831937465104, "grad_norm": 0.7969910502433777, "learning_rate": 7.798317855265464e-05, "loss": 3.9653, "step": 2516 }, { "epoch": 0.6616086970801721, "grad_norm": 0.9186200499534607, "learning_rate": 7.796565621166989e-05, "loss": 4.0032, "step": 2518 }, { "epoch": 0.6621342004138339, "grad_norm": 0.8746758699417114, "learning_rate": 7.794813387068512e-05, "loss": 3.9631, "step": 2520 }, { "epoch": 0.6626597037474956, "grad_norm": 0.9777311682701111, "learning_rate": 7.793061152970037e-05, "loss": 3.9716, "step": 2522 }, { "epoch": 0.6631852070811575, "grad_norm": 0.9752228856086731, "learning_rate": 7.791308918871561e-05, "loss": 3.9769, "step": 2524 }, { "epoch": 0.6637107104148192, "grad_norm": 0.9165812134742737, "learning_rate": 7.789556684773086e-05, "loss": 4.0201, "step": 2526 }, { "epoch": 0.664236213748481, "grad_norm": 1.0126044750213623, "learning_rate": 7.787804450674611e-05, "loss": 3.9259, "step": 2528 }, { "epoch": 0.6647617170821427, "grad_norm": 0.9292515516281128, "learning_rate": 7.786052216576136e-05, "loss": 3.9836, "step": 2530 }, { "epoch": 0.6652872204158045, "grad_norm": 1.0550158023834229, "learning_rate": 7.784299982477659e-05, "loss": 3.9112, "step": 2532 }, { "epoch": 0.6658127237494663, "grad_norm": 0.9967262744903564, "learning_rate": 7.782547748379184e-05, "loss": 3.9826, "step": 2534 }, { "epoch": 0.6663382270831281, "grad_norm": 0.9763745069503784, "learning_rate": 7.780795514280709e-05, "loss": 3.9683, "step": 2536 }, { "epoch": 0.6668637304167898, "grad_norm": 0.9886252880096436, "learning_rate": 7.779043280182233e-05, "loss": 3.9551, "step": 2538 }, { "epoch": 0.6673892337504516, "grad_norm": 0.9396386742591858, "learning_rate": 7.777291046083757e-05, "loss": 3.9447, "step": 2540 }, { "epoch": 0.6679147370841134, "grad_norm": 1.036364197731018, "learning_rate": 
7.775538811985282e-05, "loss": 3.9562, "step": 2542 }, { "epoch": 0.6684402404177752, "grad_norm": 0.9738422632217407, "learning_rate": 7.773786577886806e-05, "loss": 3.9524, "step": 2544 }, { "epoch": 0.6689657437514369, "grad_norm": 0.8481252193450928, "learning_rate": 7.77203434378833e-05, "loss": 3.9279, "step": 2546 }, { "epoch": 0.6694912470850987, "grad_norm": 1.000169038772583, "learning_rate": 7.770282109689854e-05, "loss": 4.0043, "step": 2548 }, { "epoch": 0.6700167504187605, "grad_norm": 0.9505040049552917, "learning_rate": 7.768529875591379e-05, "loss": 3.96, "step": 2550 }, { "epoch": 0.6705422537524223, "grad_norm": 0.840177059173584, "learning_rate": 7.766777641492904e-05, "loss": 3.9933, "step": 2552 }, { "epoch": 0.671067757086084, "grad_norm": 0.7664931416511536, "learning_rate": 7.765025407394429e-05, "loss": 3.987, "step": 2554 }, { "epoch": 0.6715932604197458, "grad_norm": 0.8426335453987122, "learning_rate": 7.763273173295953e-05, "loss": 4.0279, "step": 2556 }, { "epoch": 0.6721187637534075, "grad_norm": 0.8907631635665894, "learning_rate": 7.761520939197477e-05, "loss": 3.9598, "step": 2558 }, { "epoch": 0.6726442670870694, "grad_norm": 1.0607668161392212, "learning_rate": 7.759768705099002e-05, "loss": 4.0039, "step": 2560 }, { "epoch": 0.6731697704207311, "grad_norm": 0.9832420349121094, "learning_rate": 7.758016471000526e-05, "loss": 3.989, "step": 2562 }, { "epoch": 0.6736952737543929, "grad_norm": 0.9804418683052063, "learning_rate": 7.75626423690205e-05, "loss": 4.0003, "step": 2564 }, { "epoch": 0.6742207770880546, "grad_norm": 1.2076867818832397, "learning_rate": 7.754512002803575e-05, "loss": 3.988, "step": 2566 }, { "epoch": 0.6747462804217165, "grad_norm": 1.0381132364273071, "learning_rate": 7.752759768705099e-05, "loss": 3.9671, "step": 2568 }, { "epoch": 0.6752717837553782, "grad_norm": 0.8960400819778442, "learning_rate": 7.751007534606624e-05, "loss": 3.9767, "step": 2570 }, { "epoch": 0.67579728708904, "grad_norm": 
1.080571174621582, "learning_rate": 7.749255300508147e-05, "loss": 3.992, "step": 2572 }, { "epoch": 0.6763227904227017, "grad_norm": 1.0236200094223022, "learning_rate": 7.747503066409672e-05, "loss": 3.9893, "step": 2574 }, { "epoch": 0.6768482937563635, "grad_norm": 0.8540567755699158, "learning_rate": 7.745750832311197e-05, "loss": 3.9677, "step": 2576 }, { "epoch": 0.6773737970900253, "grad_norm": 1.0379787683486938, "learning_rate": 7.743998598212722e-05, "loss": 3.9903, "step": 2578 }, { "epoch": 0.6778993004236871, "grad_norm": 0.839572548866272, "learning_rate": 7.742246364114247e-05, "loss": 4.0236, "step": 2580 }, { "epoch": 0.6784248037573488, "grad_norm": 0.8604666590690613, "learning_rate": 7.740494130015771e-05, "loss": 3.9409, "step": 2582 }, { "epoch": 0.6789503070910106, "grad_norm": 0.9099580645561218, "learning_rate": 7.738741895917295e-05, "loss": 3.9641, "step": 2584 }, { "epoch": 0.6794758104246724, "grad_norm": 0.8638170957565308, "learning_rate": 7.73698966181882e-05, "loss": 3.9562, "step": 2586 }, { "epoch": 0.6800013137583342, "grad_norm": 0.9043103456497192, "learning_rate": 7.735237427720344e-05, "loss": 3.9892, "step": 2588 }, { "epoch": 0.6805268170919959, "grad_norm": 0.9269018173217773, "learning_rate": 7.733485193621868e-05, "loss": 3.9923, "step": 2590 }, { "epoch": 0.6810523204256577, "grad_norm": 0.8372640609741211, "learning_rate": 7.731732959523392e-05, "loss": 4.0156, "step": 2592 }, { "epoch": 0.6815778237593195, "grad_norm": 0.9861947894096375, "learning_rate": 7.729980725424917e-05, "loss": 3.9955, "step": 2594 }, { "epoch": 0.6821033270929813, "grad_norm": 0.9948751330375671, "learning_rate": 7.728228491326442e-05, "loss": 3.9882, "step": 2596 }, { "epoch": 0.682628830426643, "grad_norm": 0.9911062717437744, "learning_rate": 7.726476257227965e-05, "loss": 3.9671, "step": 2598 }, { "epoch": 0.6831543337603048, "grad_norm": 0.8879866600036621, "learning_rate": 7.72472402312949e-05, "loss": 3.9642, "step": 2600 }, { 
"epoch": 0.6836798370939665, "grad_norm": 0.9679906368255615, "learning_rate": 7.722971789031015e-05, "loss": 3.962, "step": 2602 }, { "epoch": 0.6842053404276284, "grad_norm": 0.8334205150604248, "learning_rate": 7.72121955493254e-05, "loss": 3.9736, "step": 2604 }, { "epoch": 0.6847308437612901, "grad_norm": 0.8924968242645264, "learning_rate": 7.719467320834064e-05, "loss": 3.9519, "step": 2606 }, { "epoch": 0.6852563470949519, "grad_norm": 0.9406372904777527, "learning_rate": 7.717715086735589e-05, "loss": 3.9702, "step": 2608 }, { "epoch": 0.6857818504286136, "grad_norm": 0.7975469827651978, "learning_rate": 7.715962852637112e-05, "loss": 3.9719, "step": 2610 }, { "epoch": 0.6863073537622755, "grad_norm": 0.9133123755455017, "learning_rate": 7.714210618538637e-05, "loss": 3.9829, "step": 2612 }, { "epoch": 0.6868328570959372, "grad_norm": 0.8396740555763245, "learning_rate": 7.712458384440162e-05, "loss": 3.9995, "step": 2614 }, { "epoch": 0.687358360429599, "grad_norm": 0.9757854342460632, "learning_rate": 7.710706150341685e-05, "loss": 3.9524, "step": 2616 }, { "epoch": 0.6878838637632607, "grad_norm": 0.8964901566505432, "learning_rate": 7.70895391624321e-05, "loss": 3.9956, "step": 2618 }, { "epoch": 0.6884093670969226, "grad_norm": 0.9891788959503174, "learning_rate": 7.707201682144735e-05, "loss": 3.9667, "step": 2620 }, { "epoch": 0.6889348704305843, "grad_norm": 0.9349448084831238, "learning_rate": 7.70544944804626e-05, "loss": 3.9661, "step": 2622 }, { "epoch": 0.6894603737642461, "grad_norm": 0.8198410272598267, "learning_rate": 7.703697213947783e-05, "loss": 3.9465, "step": 2624 }, { "epoch": 0.6899858770979078, "grad_norm": 0.9551869630813599, "learning_rate": 7.701944979849308e-05, "loss": 3.9661, "step": 2626 }, { "epoch": 0.6905113804315696, "grad_norm": 1.2609916925430298, "learning_rate": 7.700192745750833e-05, "loss": 3.972, "step": 2628 }, { "epoch": 0.6910368837652314, "grad_norm": 0.9373360872268677, "learning_rate": 7.698440511652357e-05, 
"loss": 3.9669, "step": 2630 }, { "epoch": 0.6915623870988932, "grad_norm": 1.1616547107696533, "learning_rate": 7.696688277553882e-05, "loss": 3.9379, "step": 2632 }, { "epoch": 0.6920878904325549, "grad_norm": 0.8292416334152222, "learning_rate": 7.694936043455407e-05, "loss": 3.9437, "step": 2634 }, { "epoch": 0.6926133937662167, "grad_norm": 1.0372940301895142, "learning_rate": 7.69318380935693e-05, "loss": 3.9864, "step": 2636 }, { "epoch": 0.6931388970998785, "grad_norm": 0.9068877100944519, "learning_rate": 7.691431575258455e-05, "loss": 3.9539, "step": 2638 }, { "epoch": 0.6936644004335403, "grad_norm": 0.9246941804885864, "learning_rate": 7.689679341159978e-05, "loss": 3.9895, "step": 2640 }, { "epoch": 0.694189903767202, "grad_norm": 0.8411228656768799, "learning_rate": 7.687927107061503e-05, "loss": 3.9848, "step": 2642 }, { "epoch": 0.6947154071008638, "grad_norm": 0.9195095300674438, "learning_rate": 7.686174872963028e-05, "loss": 4.0011, "step": 2644 }, { "epoch": 0.6952409104345255, "grad_norm": 0.9249100685119629, "learning_rate": 7.684422638864553e-05, "loss": 3.9568, "step": 2646 }, { "epoch": 0.6957664137681874, "grad_norm": 0.9122694134712219, "learning_rate": 7.682670404766077e-05, "loss": 3.9679, "step": 2648 }, { "epoch": 0.6962919171018491, "grad_norm": 1.1443372964859009, "learning_rate": 7.680918170667601e-05, "loss": 4.0156, "step": 2650 }, { "epoch": 0.6968174204355109, "grad_norm": 1.0019484758377075, "learning_rate": 7.679165936569126e-05, "loss": 3.9455, "step": 2652 }, { "epoch": 0.6973429237691726, "grad_norm": 1.11434006690979, "learning_rate": 7.67741370247065e-05, "loss": 3.9408, "step": 2654 }, { "epoch": 0.6978684271028345, "grad_norm": 0.9584097862243652, "learning_rate": 7.675661468372175e-05, "loss": 3.9806, "step": 2656 }, { "epoch": 0.6983939304364962, "grad_norm": 0.9332491159439087, "learning_rate": 7.6739092342737e-05, "loss": 3.9671, "step": 2658 }, { "epoch": 0.698919433770158, "grad_norm": 0.8306555151939392, 
"learning_rate": 7.672157000175225e-05, "loss": 4.0184, "step": 2660 }, { "epoch": 0.6994449371038197, "grad_norm": 0.9298374056816101, "learning_rate": 7.670404766076748e-05, "loss": 4.0064, "step": 2662 }, { "epoch": 0.6999704404374816, "grad_norm": 0.8325909972190857, "learning_rate": 7.668652531978273e-05, "loss": 3.9654, "step": 2664 }, { "epoch": 0.7004959437711433, "grad_norm": 1.1498817205429077, "learning_rate": 7.666900297879796e-05, "loss": 4.0104, "step": 2666 }, { "epoch": 0.7010214471048051, "grad_norm": 0.8374989032745361, "learning_rate": 7.665148063781321e-05, "loss": 3.9988, "step": 2668 }, { "epoch": 0.7015469504384668, "grad_norm": 0.9182053208351135, "learning_rate": 7.663395829682846e-05, "loss": 3.964, "step": 2670 }, { "epoch": 0.7020724537721286, "grad_norm": 1.0188909769058228, "learning_rate": 7.66164359558437e-05, "loss": 3.9848, "step": 2672 }, { "epoch": 0.7025979571057904, "grad_norm": 0.9034000039100647, "learning_rate": 7.659891361485895e-05, "loss": 3.9747, "step": 2674 }, { "epoch": 0.7031234604394522, "grad_norm": 1.2692492008209229, "learning_rate": 7.65813912738742e-05, "loss": 3.9406, "step": 2676 }, { "epoch": 0.7036489637731139, "grad_norm": 1.1862064599990845, "learning_rate": 7.656386893288943e-05, "loss": 3.9652, "step": 2678 }, { "epoch": 0.7041744671067757, "grad_norm": 0.9371146559715271, "learning_rate": 7.654634659190468e-05, "loss": 3.9503, "step": 2680 }, { "epoch": 0.7046999704404375, "grad_norm": 1.209226131439209, "learning_rate": 7.652882425091993e-05, "loss": 3.9475, "step": 2682 }, { "epoch": 0.7052254737740993, "grad_norm": 0.9184825420379639, "learning_rate": 7.651130190993518e-05, "loss": 3.9421, "step": 2684 }, { "epoch": 0.705750977107761, "grad_norm": 0.9726893305778503, "learning_rate": 7.649377956895042e-05, "loss": 3.9548, "step": 2686 }, { "epoch": 0.7062764804414228, "grad_norm": 0.8479759097099304, "learning_rate": 7.647625722796567e-05, "loss": 3.9753, "step": 2688 }, { "epoch": 
0.7068019837750845, "grad_norm": 1.0404555797576904, "learning_rate": 7.64587348869809e-05, "loss": 3.9853, "step": 2690 }, { "epoch": 0.7073274871087464, "grad_norm": 1.0414960384368896, "learning_rate": 7.644121254599614e-05, "loss": 3.9354, "step": 2692 }, { "epoch": 0.7078529904424081, "grad_norm": 0.9660995006561279, "learning_rate": 7.642369020501139e-05, "loss": 3.9606, "step": 2694 }, { "epoch": 0.7083784937760699, "grad_norm": 1.0886223316192627, "learning_rate": 7.640616786402663e-05, "loss": 3.9842, "step": 2696 }, { "epoch": 0.7089039971097316, "grad_norm": 0.825875997543335, "learning_rate": 7.638864552304188e-05, "loss": 3.9475, "step": 2698 }, { "epoch": 0.7094295004433935, "grad_norm": 0.9850082397460938, "learning_rate": 7.637112318205713e-05, "loss": 3.9799, "step": 2700 }, { "epoch": 0.7099550037770552, "grad_norm": 0.9581945538520813, "learning_rate": 7.635360084107238e-05, "loss": 3.9812, "step": 2702 }, { "epoch": 0.710480507110717, "grad_norm": 0.8776742219924927, "learning_rate": 7.633607850008761e-05, "loss": 4.0108, "step": 2704 }, { "epoch": 0.7110060104443787, "grad_norm": 0.9603926539421082, "learning_rate": 7.631855615910286e-05, "loss": 4.0139, "step": 2706 }, { "epoch": 0.7115315137780406, "grad_norm": 0.9462234973907471, "learning_rate": 7.63010338181181e-05, "loss": 3.9896, "step": 2708 }, { "epoch": 0.7120570171117023, "grad_norm": 1.0107725858688354, "learning_rate": 7.628351147713335e-05, "loss": 3.9664, "step": 2710 }, { "epoch": 0.7125825204453641, "grad_norm": 0.8773449063301086, "learning_rate": 7.62659891361486e-05, "loss": 4.0175, "step": 2712 }, { "epoch": 0.7131080237790258, "grad_norm": 0.9536393284797668, "learning_rate": 7.624846679516385e-05, "loss": 3.9693, "step": 2714 }, { "epoch": 0.7136335271126876, "grad_norm": 0.9716771841049194, "learning_rate": 7.623094445417908e-05, "loss": 3.9305, "step": 2716 }, { "epoch": 0.7141590304463494, "grad_norm": 0.9568358063697815, "learning_rate": 7.621342211319432e-05, "loss": 
4.0081, "step": 2718 }, { "epoch": 0.7146845337800112, "grad_norm": 1.0787606239318848, "learning_rate": 7.619589977220956e-05, "loss": 4.0097, "step": 2720 }, { "epoch": 0.7152100371136729, "grad_norm": 0.9969055652618408, "learning_rate": 7.617837743122481e-05, "loss": 3.9276, "step": 2722 }, { "epoch": 0.7157355404473347, "grad_norm": 0.871742844581604, "learning_rate": 7.616085509024006e-05, "loss": 3.9378, "step": 2724 }, { "epoch": 0.7162610437809965, "grad_norm": 1.0618207454681396, "learning_rate": 7.61433327492553e-05, "loss": 3.9905, "step": 2726 }, { "epoch": 0.7167865471146583, "grad_norm": 0.8698229193687439, "learning_rate": 7.612581040827055e-05, "loss": 3.9871, "step": 2728 }, { "epoch": 0.71731205044832, "grad_norm": 0.9829655289649963, "learning_rate": 7.610828806728579e-05, "loss": 3.956, "step": 2730 }, { "epoch": 0.7178375537819818, "grad_norm": 1.004198431968689, "learning_rate": 7.609076572630104e-05, "loss": 3.9719, "step": 2732 }, { "epoch": 0.7183630571156436, "grad_norm": 1.1582940816879272, "learning_rate": 7.607324338531628e-05, "loss": 3.9463, "step": 2734 }, { "epoch": 0.7188885604493054, "grad_norm": 1.1018438339233398, "learning_rate": 7.605572104433153e-05, "loss": 3.9536, "step": 2736 }, { "epoch": 0.7194140637829671, "grad_norm": 0.8488019704818726, "learning_rate": 7.603819870334678e-05, "loss": 3.9546, "step": 2738 }, { "epoch": 0.7199395671166289, "grad_norm": 0.9990813136100769, "learning_rate": 7.602067636236203e-05, "loss": 3.9381, "step": 2740 }, { "epoch": 0.7204650704502906, "grad_norm": 0.9754452705383301, "learning_rate": 7.600315402137726e-05, "loss": 3.9427, "step": 2742 }, { "epoch": 0.7209905737839525, "grad_norm": 0.8679227828979492, "learning_rate": 7.59856316803925e-05, "loss": 3.959, "step": 2744 }, { "epoch": 0.7215160771176142, "grad_norm": 0.873285174369812, "learning_rate": 7.596810933940774e-05, "loss": 3.9492, "step": 2746 }, { "epoch": 0.722041580451276, "grad_norm": 0.8717808723449707, "learning_rate": 
7.595058699842299e-05, "loss": 3.9333, "step": 2748 }, { "epoch": 0.7225670837849377, "grad_norm": 0.8706353306770325, "learning_rate": 7.593306465743824e-05, "loss": 3.9668, "step": 2750 }, { "epoch": 0.7230925871185996, "grad_norm": 0.9973061084747314, "learning_rate": 7.591554231645348e-05, "loss": 3.9865, "step": 2752 }, { "epoch": 0.7236180904522613, "grad_norm": 0.8771011829376221, "learning_rate": 7.589801997546873e-05, "loss": 3.973, "step": 2754 }, { "epoch": 0.7241435937859231, "grad_norm": 0.8952291011810303, "learning_rate": 7.588049763448397e-05, "loss": 3.9252, "step": 2756 }, { "epoch": 0.7246690971195848, "grad_norm": 0.832181990146637, "learning_rate": 7.586297529349921e-05, "loss": 3.9345, "step": 2758 }, { "epoch": 0.7251946004532466, "grad_norm": 0.956555187702179, "learning_rate": 7.584545295251446e-05, "loss": 3.9665, "step": 2760 }, { "epoch": 0.7257201037869084, "grad_norm": 0.969571590423584, "learning_rate": 7.582793061152971e-05, "loss": 4.0018, "step": 2762 }, { "epoch": 0.7262456071205702, "grad_norm": 0.9067625999450684, "learning_rate": 7.581040827054496e-05, "loss": 3.9346, "step": 2764 }, { "epoch": 0.7267711104542319, "grad_norm": 0.8608770370483398, "learning_rate": 7.57928859295602e-05, "loss": 3.9949, "step": 2766 }, { "epoch": 0.7272966137878937, "grad_norm": 0.8524954319000244, "learning_rate": 7.577536358857544e-05, "loss": 3.9771, "step": 2768 }, { "epoch": 0.7278221171215555, "grad_norm": 1.039624810218811, "learning_rate": 7.575784124759067e-05, "loss": 4.0046, "step": 2770 }, { "epoch": 0.7283476204552173, "grad_norm": 0.8703776001930237, "learning_rate": 7.574031890660592e-05, "loss": 3.9777, "step": 2772 }, { "epoch": 0.728873123788879, "grad_norm": 1.0916212797164917, "learning_rate": 7.572279656562117e-05, "loss": 3.9627, "step": 2774 }, { "epoch": 0.7293986271225408, "grad_norm": 0.7967639565467834, "learning_rate": 7.570527422463641e-05, "loss": 3.9423, "step": 2776 }, { "epoch": 0.7299241304562026, "grad_norm": 
0.9283713102340698, "learning_rate": 7.568775188365166e-05, "loss": 3.9268, "step": 2778 }, { "epoch": 0.7304496337898644, "grad_norm": 0.8241966962814331, "learning_rate": 7.567022954266691e-05, "loss": 4.0042, "step": 2780 }, { "epoch": 0.7309751371235261, "grad_norm": 0.9157766103744507, "learning_rate": 7.565270720168214e-05, "loss": 3.9557, "step": 2782 }, { "epoch": 0.7315006404571879, "grad_norm": 0.8863827586174011, "learning_rate": 7.563518486069739e-05, "loss": 3.9811, "step": 2784 }, { "epoch": 0.7320261437908496, "grad_norm": 0.8942471146583557, "learning_rate": 7.561766251971264e-05, "loss": 3.9544, "step": 2786 }, { "epoch": 0.7325516471245115, "grad_norm": 0.8443163633346558, "learning_rate": 7.560014017872789e-05, "loss": 3.9699, "step": 2788 }, { "epoch": 0.7330771504581732, "grad_norm": 0.7878749370574951, "learning_rate": 7.558261783774313e-05, "loss": 3.9673, "step": 2790 }, { "epoch": 0.733602653791835, "grad_norm": 0.8778356313705444, "learning_rate": 7.556509549675838e-05, "loss": 3.9507, "step": 2792 }, { "epoch": 0.7341281571254967, "grad_norm": 0.9457665681838989, "learning_rate": 7.554757315577362e-05, "loss": 4.0049, "step": 2794 }, { "epoch": 0.7346536604591586, "grad_norm": 0.8581964373588562, "learning_rate": 7.553005081478885e-05, "loss": 3.9548, "step": 2796 }, { "epoch": 0.7351791637928203, "grad_norm": 0.9654092788696289, "learning_rate": 7.55125284738041e-05, "loss": 3.9435, "step": 2798 }, { "epoch": 0.7357046671264821, "grad_norm": 0.9047033190727234, "learning_rate": 7.549500613281934e-05, "loss": 3.9459, "step": 2800 }, { "epoch": 0.7357046671264821, "eval_loss": 3.9110937118530273, "eval_runtime": 464.8042, "eval_samples_per_second": 262.022, "eval_steps_per_second": 8.188, "step": 2800 }, { "epoch": 0.7362301704601438, "grad_norm": 0.8206807971000671, "learning_rate": 7.547748379183459e-05, "loss": 3.9575, "step": 2802 }, { "epoch": 0.7367556737938056, "grad_norm": 0.9366137385368347, "learning_rate": 7.545996145084984e-05, 
"loss": 3.996, "step": 2804 }, { "epoch": 0.7372811771274674, "grad_norm": 0.8182359337806702, "learning_rate": 7.544243910986509e-05, "loss": 3.9824, "step": 2806 }, { "epoch": 0.7378066804611292, "grad_norm": 0.8680740594863892, "learning_rate": 7.542491676888032e-05, "loss": 3.9396, "step": 2808 }, { "epoch": 0.7383321837947909, "grad_norm": 0.902900755405426, "learning_rate": 7.540739442789557e-05, "loss": 3.9286, "step": 2810 }, { "epoch": 0.7388576871284527, "grad_norm": 1.0591987371444702, "learning_rate": 7.538987208691082e-05, "loss": 3.9531, "step": 2812 }, { "epoch": 0.7393831904621145, "grad_norm": 0.942427396774292, "learning_rate": 7.537234974592606e-05, "loss": 3.9687, "step": 2814 }, { "epoch": 0.7399086937957763, "grad_norm": 0.8563567996025085, "learning_rate": 7.535482740494131e-05, "loss": 3.9552, "step": 2816 }, { "epoch": 0.740434197129438, "grad_norm": 0.8842030167579651, "learning_rate": 7.533730506395656e-05, "loss": 3.9741, "step": 2818 }, { "epoch": 0.7409597004630998, "grad_norm": 0.8618088960647583, "learning_rate": 7.531978272297179e-05, "loss": 3.9324, "step": 2820 }, { "epoch": 0.7414852037967616, "grad_norm": 0.9721809029579163, "learning_rate": 7.530226038198703e-05, "loss": 3.9669, "step": 2822 }, { "epoch": 0.7420107071304234, "grad_norm": 0.8166691660881042, "learning_rate": 7.528473804100227e-05, "loss": 3.9277, "step": 2824 }, { "epoch": 0.7425362104640851, "grad_norm": 1.053092360496521, "learning_rate": 7.526721570001752e-05, "loss": 3.97, "step": 2826 }, { "epoch": 0.7430617137977469, "grad_norm": 0.9154080748558044, "learning_rate": 7.524969335903277e-05, "loss": 3.95, "step": 2828 }, { "epoch": 0.7435872171314086, "grad_norm": 0.8160024881362915, "learning_rate": 7.523217101804802e-05, "loss": 3.9376, "step": 2830 }, { "epoch": 0.7441127204650705, "grad_norm": 0.8830057382583618, "learning_rate": 7.521464867706326e-05, "loss": 3.9811, "step": 2832 }, { "epoch": 0.7446382237987322, "grad_norm": 1.019036054611206, 
"learning_rate": 7.51971263360785e-05, "loss": 3.9411, "step": 2834 }, { "epoch": 0.745163727132394, "grad_norm": 0.8228240013122559, "learning_rate": 7.517960399509375e-05, "loss": 3.9594, "step": 2836 }, { "epoch": 0.7456892304660557, "grad_norm": 1.167412519454956, "learning_rate": 7.5162081654109e-05, "loss": 3.9307, "step": 2838 }, { "epoch": 0.7462147337997176, "grad_norm": 1.018781304359436, "learning_rate": 7.514455931312424e-05, "loss": 3.9849, "step": 2840 }, { "epoch": 0.7467402371333793, "grad_norm": 0.9486522078514099, "learning_rate": 7.512703697213949e-05, "loss": 4.0144, "step": 2842 }, { "epoch": 0.7472657404670411, "grad_norm": 0.8708224296569824, "learning_rate": 7.510951463115472e-05, "loss": 3.972, "step": 2844 }, { "epoch": 0.7477912438007028, "grad_norm": 0.8615313172340393, "learning_rate": 7.509199229016997e-05, "loss": 4.0089, "step": 2846 }, { "epoch": 0.7483167471343646, "grad_norm": 0.9585216641426086, "learning_rate": 7.50744699491852e-05, "loss": 4.0114, "step": 2848 }, { "epoch": 0.7488422504680264, "grad_norm": 0.9481785297393799, "learning_rate": 7.505694760820045e-05, "loss": 3.9713, "step": 2850 }, { "epoch": 0.7493677538016882, "grad_norm": 0.9055920839309692, "learning_rate": 7.50394252672157e-05, "loss": 3.9517, "step": 2852 }, { "epoch": 0.7498932571353499, "grad_norm": 1.0092451572418213, "learning_rate": 7.502190292623095e-05, "loss": 3.9905, "step": 2854 }, { "epoch": 0.7504187604690117, "grad_norm": 0.8346351981163025, "learning_rate": 7.50043805852462e-05, "loss": 3.9166, "step": 2856 }, { "epoch": 0.7509442638026735, "grad_norm": 1.0473018884658813, "learning_rate": 7.498685824426144e-05, "loss": 3.9389, "step": 2858 }, { "epoch": 0.7514697671363353, "grad_norm": 0.8580000400543213, "learning_rate": 7.496933590327668e-05, "loss": 3.9228, "step": 2860 }, { "epoch": 0.751995270469997, "grad_norm": 1.110920786857605, "learning_rate": 7.495181356229192e-05, "loss": 3.9941, "step": 2862 }, { "epoch": 0.7525207738036588, 
"grad_norm": 0.9773488640785217, "learning_rate": 7.493429122130717e-05, "loss": 3.9726, "step": 2864 }, { "epoch": 0.7530462771373206, "grad_norm": 0.8099828958511353, "learning_rate": 7.491676888032242e-05, "loss": 3.9176, "step": 2866 }, { "epoch": 0.7535717804709824, "grad_norm": 0.8451085090637207, "learning_rate": 7.489924653933767e-05, "loss": 3.9941, "step": 2868 }, { "epoch": 0.7540972838046441, "grad_norm": 0.8165507316589355, "learning_rate": 7.48817241983529e-05, "loss": 3.9249, "step": 2870 }, { "epoch": 0.7546227871383059, "grad_norm": 0.8639516830444336, "learning_rate": 7.486420185736815e-05, "loss": 3.956, "step": 2872 }, { "epoch": 0.7551482904719676, "grad_norm": 0.8562615513801575, "learning_rate": 7.484667951638338e-05, "loss": 3.9331, "step": 2874 }, { "epoch": 0.7556737938056295, "grad_norm": 0.8129862546920776, "learning_rate": 7.482915717539863e-05, "loss": 3.9528, "step": 2876 }, { "epoch": 0.7561992971392912, "grad_norm": 0.8422499299049377, "learning_rate": 7.481163483441388e-05, "loss": 3.9752, "step": 2878 }, { "epoch": 0.756724800472953, "grad_norm": 0.7934624552726746, "learning_rate": 7.479411249342913e-05, "loss": 3.9306, "step": 2880 }, { "epoch": 0.7572503038066147, "grad_norm": 0.8922916054725647, "learning_rate": 7.477659015244437e-05, "loss": 3.9219, "step": 2882 }, { "epoch": 0.7577758071402766, "grad_norm": 0.8188332319259644, "learning_rate": 7.475906781145962e-05, "loss": 3.9043, "step": 2884 }, { "epoch": 0.7583013104739383, "grad_norm": 0.8114766478538513, "learning_rate": 7.474154547047485e-05, "loss": 3.9343, "step": 2886 }, { "epoch": 0.7588268138076001, "grad_norm": 0.9019129276275635, "learning_rate": 7.47240231294901e-05, "loss": 3.9996, "step": 2888 }, { "epoch": 0.7593523171412618, "grad_norm": 0.9008287787437439, "learning_rate": 7.470650078850535e-05, "loss": 3.9266, "step": 2890 }, { "epoch": 0.7598778204749237, "grad_norm": 0.9981153011322021, "learning_rate": 7.46889784475206e-05, "loss": 3.9248, "step": 
2892 }, { "epoch": 0.7604033238085854, "grad_norm": 0.9118956327438354, "learning_rate": 7.467145610653584e-05, "loss": 3.953, "step": 2894 }, { "epoch": 0.7609288271422472, "grad_norm": 1.0773121118545532, "learning_rate": 7.465393376555108e-05, "loss": 3.9247, "step": 2896 }, { "epoch": 0.7614543304759089, "grad_norm": 0.868709146976471, "learning_rate": 7.463641142456633e-05, "loss": 4.0084, "step": 2898 }, { "epoch": 0.7619798338095707, "grad_norm": 0.9161157011985779, "learning_rate": 7.461888908358156e-05, "loss": 3.9689, "step": 2900 }, { "epoch": 0.7625053371432325, "grad_norm": 0.8519930243492126, "learning_rate": 7.460136674259681e-05, "loss": 3.9573, "step": 2902 }, { "epoch": 0.7630308404768943, "grad_norm": 0.8285689949989319, "learning_rate": 7.458384440161206e-05, "loss": 3.9088, "step": 2904 }, { "epoch": 0.763556343810556, "grad_norm": 0.8663350343704224, "learning_rate": 7.45663220606273e-05, "loss": 3.9601, "step": 2906 }, { "epoch": 0.7640818471442178, "grad_norm": 0.7357189059257507, "learning_rate": 7.454879971964255e-05, "loss": 3.9417, "step": 2908 }, { "epoch": 0.7646073504778796, "grad_norm": 0.8532646298408508, "learning_rate": 7.45312773786578e-05, "loss": 3.9017, "step": 2910 }, { "epoch": 0.7651328538115414, "grad_norm": 0.8424131274223328, "learning_rate": 7.451375503767303e-05, "loss": 3.951, "step": 2912 }, { "epoch": 0.7656583571452031, "grad_norm": 0.8325870633125305, "learning_rate": 7.449623269668828e-05, "loss": 3.941, "step": 2914 }, { "epoch": 0.7661838604788649, "grad_norm": 0.7691123485565186, "learning_rate": 7.447871035570353e-05, "loss": 3.9454, "step": 2916 }, { "epoch": 0.7667093638125266, "grad_norm": 0.800471842288971, "learning_rate": 7.446118801471877e-05, "loss": 3.9357, "step": 2918 }, { "epoch": 0.7672348671461885, "grad_norm": 0.8014011979103088, "learning_rate": 7.444366567373402e-05, "loss": 4.0231, "step": 2920 }, { "epoch": 0.7677603704798502, "grad_norm": 0.8439620137214661, "learning_rate": 
7.442614333274926e-05, "loss": 3.9411, "step": 2922 }, { "epoch": 0.768285873813512, "grad_norm": 0.9512550830841064, "learning_rate": 7.44086209917645e-05, "loss": 3.9664, "step": 2924 }, { "epoch": 0.7688113771471737, "grad_norm": 0.9486682415008545, "learning_rate": 7.439109865077975e-05, "loss": 3.8703, "step": 2926 }, { "epoch": 0.7693368804808356, "grad_norm": 1.0245190858840942, "learning_rate": 7.437357630979499e-05, "loss": 3.9441, "step": 2928 }, { "epoch": 0.7698623838144973, "grad_norm": 0.9086085557937622, "learning_rate": 7.435605396881023e-05, "loss": 3.942, "step": 2930 }, { "epoch": 0.7703878871481591, "grad_norm": 0.8598043918609619, "learning_rate": 7.433853162782548e-05, "loss": 3.9535, "step": 2932 }, { "epoch": 0.7709133904818208, "grad_norm": 0.9137352705001831, "learning_rate": 7.432100928684073e-05, "loss": 3.9106, "step": 2934 }, { "epoch": 0.7714388938154827, "grad_norm": 1.0996919870376587, "learning_rate": 7.430348694585598e-05, "loss": 4.0168, "step": 2936 }, { "epoch": 0.7719643971491444, "grad_norm": 0.8783286809921265, "learning_rate": 7.428596460487122e-05, "loss": 3.9657, "step": 2938 }, { "epoch": 0.7724899004828062, "grad_norm": 1.0403074026107788, "learning_rate": 7.426844226388646e-05, "loss": 4.0059, "step": 2940 }, { "epoch": 0.7730154038164679, "grad_norm": 0.9424353837966919, "learning_rate": 7.42509199229017e-05, "loss": 3.9474, "step": 2942 }, { "epoch": 0.7735409071501297, "grad_norm": 0.9421131014823914, "learning_rate": 7.423339758191695e-05, "loss": 3.962, "step": 2944 }, { "epoch": 0.7740664104837915, "grad_norm": 0.8906638622283936, "learning_rate": 7.421587524093219e-05, "loss": 3.9379, "step": 2946 }, { "epoch": 0.7745919138174533, "grad_norm": 0.9124142527580261, "learning_rate": 7.419835289994743e-05, "loss": 3.8953, "step": 2948 }, { "epoch": 0.775117417151115, "grad_norm": 0.9165842533111572, "learning_rate": 7.418083055896268e-05, "loss": 3.9532, "step": 2950 }, { "epoch": 0.7756429204847768, "grad_norm": 
0.8612256050109863, "learning_rate": 7.416330821797793e-05, "loss": 3.9324, "step": 2952 }, { "epoch": 0.7761684238184386, "grad_norm": 0.9833328127861023, "learning_rate": 7.414578587699316e-05, "loss": 3.9629, "step": 2954 }, { "epoch": 0.7766939271521004, "grad_norm": 0.8044130802154541, "learning_rate": 7.412826353600841e-05, "loss": 3.9801, "step": 2956 }, { "epoch": 0.7772194304857621, "grad_norm": 0.9419910311698914, "learning_rate": 7.411074119502366e-05, "loss": 3.9052, "step": 2958 }, { "epoch": 0.7777449338194239, "grad_norm": 0.868776798248291, "learning_rate": 7.40932188540389e-05, "loss": 3.9654, "step": 2960 }, { "epoch": 0.7782704371530856, "grad_norm": 0.9859121441841125, "learning_rate": 7.407569651305415e-05, "loss": 3.9423, "step": 2962 }, { "epoch": 0.7787959404867475, "grad_norm": 0.8608348369598389, "learning_rate": 7.40581741720694e-05, "loss": 3.975, "step": 2964 }, { "epoch": 0.7793214438204092, "grad_norm": 0.9920139908790588, "learning_rate": 7.404065183108463e-05, "loss": 3.9518, "step": 2966 }, { "epoch": 0.779846947154071, "grad_norm": 0.8238610625267029, "learning_rate": 7.402312949009988e-05, "loss": 3.9114, "step": 2968 }, { "epoch": 0.7803724504877327, "grad_norm": 0.7924317121505737, "learning_rate": 7.400560714911513e-05, "loss": 3.9023, "step": 2970 }, { "epoch": 0.7808979538213946, "grad_norm": 0.9084011316299438, "learning_rate": 7.398808480813036e-05, "loss": 3.9458, "step": 2972 }, { "epoch": 0.7814234571550563, "grad_norm": 0.935979425907135, "learning_rate": 7.397056246714561e-05, "loss": 3.9737, "step": 2974 }, { "epoch": 0.7819489604887181, "grad_norm": 0.9353903532028198, "learning_rate": 7.395304012616086e-05, "loss": 3.914, "step": 2976 }, { "epoch": 0.7824744638223798, "grad_norm": 0.8916248679161072, "learning_rate": 7.39355177851761e-05, "loss": 3.9693, "step": 2978 }, { "epoch": 0.7829999671560417, "grad_norm": 1.0279054641723633, "learning_rate": 7.391799544419134e-05, "loss": 3.9443, "step": 2980 }, { "epoch": 
0.7835254704897034, "grad_norm": 0.9135000705718994, "learning_rate": 7.390047310320659e-05, "loss": 3.9448, "step": 2982 }, { "epoch": 0.7840509738233652, "grad_norm": 0.8835594058036804, "learning_rate": 7.388295076222184e-05, "loss": 3.9526, "step": 2984 }, { "epoch": 0.7845764771570269, "grad_norm": 0.8380985260009766, "learning_rate": 7.386542842123708e-05, "loss": 3.9448, "step": 2986 }, { "epoch": 0.7851019804906887, "grad_norm": 1.135197639465332, "learning_rate": 7.384790608025233e-05, "loss": 3.9085, "step": 2988 }, { "epoch": 0.7856274838243505, "grad_norm": 0.9773169159889221, "learning_rate": 7.383038373926758e-05, "loss": 3.9203, "step": 2990 }, { "epoch": 0.7861529871580123, "grad_norm": 0.7570453882217407, "learning_rate": 7.381286139828281e-05, "loss": 3.9464, "step": 2992 }, { "epoch": 0.786678490491674, "grad_norm": 0.8980574011802673, "learning_rate": 7.379533905729806e-05, "loss": 3.9317, "step": 2994 }, { "epoch": 0.7872039938253358, "grad_norm": 1.181261658668518, "learning_rate": 7.377781671631331e-05, "loss": 3.9426, "step": 2996 }, { "epoch": 0.7877294971589976, "grad_norm": 0.8125550150871277, "learning_rate": 7.376029437532854e-05, "loss": 3.9634, "step": 2998 }, { "epoch": 0.7882550004926594, "grad_norm": 0.7798334956169128, "learning_rate": 7.374277203434379e-05, "loss": 3.943, "step": 3000 }, { "epoch": 0.7887805038263211, "grad_norm": 0.8521909117698669, "learning_rate": 7.372524969335904e-05, "loss": 3.9578, "step": 3002 }, { "epoch": 0.7893060071599829, "grad_norm": 0.9647176861763, "learning_rate": 7.370772735237428e-05, "loss": 3.9473, "step": 3004 }, { "epoch": 0.7898315104936446, "grad_norm": 0.8578081727027893, "learning_rate": 7.369020501138952e-05, "loss": 3.9621, "step": 3006 }, { "epoch": 0.7903570138273065, "grad_norm": 0.9992386102676392, "learning_rate": 7.367268267040477e-05, "loss": 3.9313, "step": 3008 }, { "epoch": 0.7908825171609682, "grad_norm": 0.907853364944458, "learning_rate": 7.365516032942001e-05, "loss": 
3.9423, "step": 3010 }, { "epoch": 0.79140802049463, "grad_norm": 0.9531067609786987, "learning_rate": 7.363763798843526e-05, "loss": 3.9536, "step": 3012 }, { "epoch": 0.7919335238282917, "grad_norm": 0.9060195684432983, "learning_rate": 7.362011564745051e-05, "loss": 3.9382, "step": 3014 }, { "epoch": 0.7924590271619536, "grad_norm": 0.8415207266807556, "learning_rate": 7.360259330646576e-05, "loss": 3.9742, "step": 3016 }, { "epoch": 0.7929845304956153, "grad_norm": 0.8454105257987976, "learning_rate": 7.358507096548099e-05, "loss": 3.9301, "step": 3018 }, { "epoch": 0.7935100338292771, "grad_norm": 0.7673818469047546, "learning_rate": 7.356754862449624e-05, "loss": 3.9519, "step": 3020 }, { "epoch": 0.7940355371629388, "grad_norm": 0.8769925832748413, "learning_rate": 7.355002628351149e-05, "loss": 3.9514, "step": 3022 }, { "epoch": 0.7945610404966007, "grad_norm": 0.9674590229988098, "learning_rate": 7.353250394252672e-05, "loss": 3.945, "step": 3024 }, { "epoch": 0.7950865438302624, "grad_norm": 1.2287559509277344, "learning_rate": 7.351498160154197e-05, "loss": 3.9626, "step": 3026 }, { "epoch": 0.7956120471639242, "grad_norm": 0.9756744503974915, "learning_rate": 7.349745926055721e-05, "loss": 3.9372, "step": 3028 }, { "epoch": 0.7961375504975859, "grad_norm": 0.9817586541175842, "learning_rate": 7.347993691957246e-05, "loss": 3.895, "step": 3030 }, { "epoch": 0.7966630538312477, "grad_norm": 0.8866286277770996, "learning_rate": 7.34624145785877e-05, "loss": 3.892, "step": 3032 }, { "epoch": 0.7971885571649096, "grad_norm": 0.8575455546379089, "learning_rate": 7.344489223760294e-05, "loss": 3.9543, "step": 3034 }, { "epoch": 0.7977140604985713, "grad_norm": 1.1008391380310059, "learning_rate": 7.342736989661819e-05, "loss": 3.9621, "step": 3036 }, { "epoch": 0.798239563832233, "grad_norm": 0.8179908394813538, "learning_rate": 7.340984755563344e-05, "loss": 3.9143, "step": 3038 }, { "epoch": 0.7987650671658948, "grad_norm": 0.8950192928314209, 
"learning_rate": 7.339232521464869e-05, "loss": 3.9085, "step": 3040 }, { "epoch": 0.7992905704995567, "grad_norm": 0.8385193943977356, "learning_rate": 7.337480287366393e-05, "loss": 3.9359, "step": 3042 }, { "epoch": 0.7998160738332184, "grad_norm": 0.8626218438148499, "learning_rate": 7.335728053267917e-05, "loss": 3.9637, "step": 3044 }, { "epoch": 0.8003415771668801, "grad_norm": 0.8214182257652283, "learning_rate": 7.333975819169442e-05, "loss": 3.9091, "step": 3046 }, { "epoch": 0.8008670805005419, "grad_norm": 0.867668628692627, "learning_rate": 7.332223585070965e-05, "loss": 3.92, "step": 3048 }, { "epoch": 0.8013925838342038, "grad_norm": 0.8035020232200623, "learning_rate": 7.33047135097249e-05, "loss": 3.9464, "step": 3050 }, { "epoch": 0.8019180871678655, "grad_norm": 1.0953487157821655, "learning_rate": 7.328719116874014e-05, "loss": 3.94, "step": 3052 }, { "epoch": 0.8024435905015272, "grad_norm": 0.7501306533813477, "learning_rate": 7.326966882775539e-05, "loss": 3.985, "step": 3054 }, { "epoch": 0.802969093835189, "grad_norm": 0.8052178025245667, "learning_rate": 7.325214648677064e-05, "loss": 3.9273, "step": 3056 }, { "epoch": 0.8034945971688507, "grad_norm": 0.811393141746521, "learning_rate": 7.323462414578587e-05, "loss": 3.9617, "step": 3058 }, { "epoch": 0.8040201005025126, "grad_norm": 0.8806167244911194, "learning_rate": 7.321710180480112e-05, "loss": 3.9052, "step": 3060 }, { "epoch": 0.8045456038361744, "grad_norm": 0.8324558138847351, "learning_rate": 7.319957946381637e-05, "loss": 3.9702, "step": 3062 }, { "epoch": 0.8050711071698361, "grad_norm": 0.8404276371002197, "learning_rate": 7.318205712283162e-05, "loss": 3.893, "step": 3064 }, { "epoch": 0.8055966105034978, "grad_norm": 0.8447362184524536, "learning_rate": 7.316453478184686e-05, "loss": 3.9553, "step": 3066 }, { "epoch": 0.8061221138371597, "grad_norm": 0.8744348287582397, "learning_rate": 7.314701244086211e-05, "loss": 3.9907, "step": 3068 }, { "epoch": 0.8066476171708215, 
"grad_norm": 0.9429835081100464, "learning_rate": 7.312949009987735e-05, "loss": 3.8616, "step": 3070 }, { "epoch": 0.8071731205044832, "grad_norm": 0.7926254868507385, "learning_rate": 7.311196775889259e-05, "loss": 3.879, "step": 3072 }, { "epoch": 0.807698623838145, "grad_norm": 0.796306312084198, "learning_rate": 7.309444541790783e-05, "loss": 3.9243, "step": 3074 }, { "epoch": 0.8082241271718067, "grad_norm": 0.9853757619857788, "learning_rate": 7.307692307692307e-05, "loss": 4.0182, "step": 3076 }, { "epoch": 0.8087496305054686, "grad_norm": 0.8455913662910461, "learning_rate": 7.305940073593832e-05, "loss": 3.9511, "step": 3078 }, { "epoch": 0.8092751338391303, "grad_norm": 1.0423041582107544, "learning_rate": 7.304187839495357e-05, "loss": 3.9868, "step": 3080 }, { "epoch": 0.809800637172792, "grad_norm": 0.8681489825248718, "learning_rate": 7.302435605396882e-05, "loss": 3.9364, "step": 3082 }, { "epoch": 0.8103261405064538, "grad_norm": 0.8622909784317017, "learning_rate": 7.300683371298405e-05, "loss": 3.9439, "step": 3084 }, { "epoch": 0.8108516438401157, "grad_norm": 0.7866640686988831, "learning_rate": 7.29893113719993e-05, "loss": 3.9464, "step": 3086 }, { "epoch": 0.8113771471737774, "grad_norm": 0.8052477240562439, "learning_rate": 7.297178903101455e-05, "loss": 3.9748, "step": 3088 }, { "epoch": 0.8119026505074391, "grad_norm": 0.8624947667121887, "learning_rate": 7.29542666900298e-05, "loss": 3.9325, "step": 3090 }, { "epoch": 0.8124281538411009, "grad_norm": 0.8868775963783264, "learning_rate": 7.293674434904504e-05, "loss": 3.9304, "step": 3092 }, { "epoch": 0.8129536571747628, "grad_norm": 0.8466366529464722, "learning_rate": 7.291922200806029e-05, "loss": 3.9454, "step": 3094 }, { "epoch": 0.8134791605084245, "grad_norm": 1.0121136903762817, "learning_rate": 7.290169966707552e-05, "loss": 3.9265, "step": 3096 }, { "epoch": 0.8140046638420863, "grad_norm": 0.9327197670936584, "learning_rate": 7.288417732609077e-05, "loss": 3.9407, "step": 3098 
}, { "epoch": 0.814530167175748, "grad_norm": 0.9295117855072021, "learning_rate": 7.2866654985106e-05, "loss": 3.9856, "step": 3100 }, { "epoch": 0.8150556705094097, "grad_norm": 0.927067756652832, "learning_rate": 7.284913264412125e-05, "loss": 4.0139, "step": 3102 }, { "epoch": 0.8155811738430716, "grad_norm": 0.8182541728019714, "learning_rate": 7.28316103031365e-05, "loss": 3.8781, "step": 3104 }, { "epoch": 0.8161066771767334, "grad_norm": 0.8844822645187378, "learning_rate": 7.281408796215175e-05, "loss": 3.8792, "step": 3106 }, { "epoch": 0.8166321805103951, "grad_norm": 0.9709361791610718, "learning_rate": 7.2796565621167e-05, "loss": 3.9422, "step": 3108 }, { "epoch": 0.8171576838440568, "grad_norm": 0.8035185933113098, "learning_rate": 7.277904328018223e-05, "loss": 3.9375, "step": 3110 }, { "epoch": 0.8176831871777187, "grad_norm": 0.922577977180481, "learning_rate": 7.276152093919748e-05, "loss": 3.9297, "step": 3112 }, { "epoch": 0.8182086905113805, "grad_norm": 0.9715713858604431, "learning_rate": 7.274399859821272e-05, "loss": 3.913, "step": 3114 }, { "epoch": 0.8187341938450422, "grad_norm": 0.9624879360198975, "learning_rate": 7.272647625722797e-05, "loss": 3.9084, "step": 3116 }, { "epoch": 0.819259697178704, "grad_norm": 0.9769260883331299, "learning_rate": 7.270895391624322e-05, "loss": 3.9483, "step": 3118 }, { "epoch": 0.8197852005123657, "grad_norm": 0.9527453780174255, "learning_rate": 7.269143157525847e-05, "loss": 3.9573, "step": 3120 }, { "epoch": 0.8203107038460276, "grad_norm": 1.0209273099899292, "learning_rate": 7.26739092342737e-05, "loss": 3.9435, "step": 3122 }, { "epoch": 0.8208362071796893, "grad_norm": 0.990148663520813, "learning_rate": 7.265638689328895e-05, "loss": 3.9477, "step": 3124 }, { "epoch": 0.821361710513351, "grad_norm": 0.8259176015853882, "learning_rate": 7.263886455230418e-05, "loss": 3.9601, "step": 3126 }, { "epoch": 0.8218872138470128, "grad_norm": 0.9739744067192078, "learning_rate": 7.262134221131943e-05, 
"loss": 3.9035, "step": 3128 }, { "epoch": 0.8224127171806747, "grad_norm": 0.931696355342865, "learning_rate": 7.260381987033468e-05, "loss": 3.9447, "step": 3130 }, { "epoch": 0.8229382205143364, "grad_norm": 0.8424457311630249, "learning_rate": 7.258629752934993e-05, "loss": 3.9821, "step": 3132 }, { "epoch": 0.8234637238479982, "grad_norm": 0.8795027732849121, "learning_rate": 7.256877518836517e-05, "loss": 3.9779, "step": 3134 }, { "epoch": 0.8239892271816599, "grad_norm": 0.927527129650116, "learning_rate": 7.25512528473804e-05, "loss": 3.9295, "step": 3136 }, { "epoch": 0.8245147305153218, "grad_norm": 0.8437166213989258, "learning_rate": 7.253373050639565e-05, "loss": 3.9357, "step": 3138 }, { "epoch": 0.8250402338489835, "grad_norm": 0.7399552464485168, "learning_rate": 7.25162081654109e-05, "loss": 3.9251, "step": 3140 }, { "epoch": 0.8255657371826453, "grad_norm": 0.8255283832550049, "learning_rate": 7.249868582442615e-05, "loss": 3.9204, "step": 3142 }, { "epoch": 0.826091240516307, "grad_norm": 0.790286660194397, "learning_rate": 7.24811634834414e-05, "loss": 3.9198, "step": 3144 }, { "epoch": 0.8266167438499687, "grad_norm": 0.758497953414917, "learning_rate": 7.246364114245664e-05, "loss": 3.9032, "step": 3146 }, { "epoch": 0.8271422471836306, "grad_norm": 0.807555615901947, "learning_rate": 7.244611880147188e-05, "loss": 3.9283, "step": 3148 }, { "epoch": 0.8276677505172924, "grad_norm": 0.9071643948554993, "learning_rate": 7.242859646048711e-05, "loss": 3.9623, "step": 3150 }, { "epoch": 0.8281932538509541, "grad_norm": 1.083069086074829, "learning_rate": 7.241107411950236e-05, "loss": 3.9425, "step": 3152 }, { "epoch": 0.8287187571846159, "grad_norm": 0.8988904356956482, "learning_rate": 7.239355177851761e-05, "loss": 3.9709, "step": 3154 }, { "epoch": 0.8292442605182777, "grad_norm": 0.8508176207542419, "learning_rate": 7.237602943753286e-05, "loss": 3.9615, "step": 3156 }, { "epoch": 0.8297697638519395, "grad_norm": 0.9259406924247742, 
"learning_rate": 7.23585070965481e-05, "loss": 3.9624, "step": 3158 }, { "epoch": 0.8302952671856012, "grad_norm": 0.9048233032226562, "learning_rate": 7.234098475556335e-05, "loss": 3.9756, "step": 3160 }, { "epoch": 0.830820770519263, "grad_norm": 0.9635868072509766, "learning_rate": 7.232346241457858e-05, "loss": 3.9281, "step": 3162 }, { "epoch": 0.8313462738529247, "grad_norm": 0.9304730296134949, "learning_rate": 7.230594007359383e-05, "loss": 3.9282, "step": 3164 }, { "epoch": 0.8318717771865866, "grad_norm": 0.8457497954368591, "learning_rate": 7.228841773260908e-05, "loss": 3.9711, "step": 3166 }, { "epoch": 0.8323972805202483, "grad_norm": 0.839465856552124, "learning_rate": 7.227089539162433e-05, "loss": 3.9172, "step": 3168 }, { "epoch": 0.83292278385391, "grad_norm": 1.0180386304855347, "learning_rate": 7.225337305063957e-05, "loss": 3.9659, "step": 3170 }, { "epoch": 0.8334482871875718, "grad_norm": 0.8106426000595093, "learning_rate": 7.223585070965482e-05, "loss": 3.9216, "step": 3172 }, { "epoch": 0.8339737905212337, "grad_norm": 0.955028235912323, "learning_rate": 7.221832836867006e-05, "loss": 3.9457, "step": 3174 }, { "epoch": 0.8344992938548954, "grad_norm": 0.9382230043411255, "learning_rate": 7.22008060276853e-05, "loss": 3.9302, "step": 3176 }, { "epoch": 0.8350247971885572, "grad_norm": 1.0239802598953247, "learning_rate": 7.218328368670054e-05, "loss": 3.9337, "step": 3178 }, { "epoch": 0.8355503005222189, "grad_norm": 0.8869624733924866, "learning_rate": 7.216576134571579e-05, "loss": 3.9551, "step": 3180 }, { "epoch": 0.8360758038558808, "grad_norm": 0.8521376252174377, "learning_rate": 7.214823900473103e-05, "loss": 3.9406, "step": 3182 }, { "epoch": 0.8366013071895425, "grad_norm": 0.8971220254898071, "learning_rate": 7.213071666374628e-05, "loss": 3.9095, "step": 3184 }, { "epoch": 0.8371268105232043, "grad_norm": 0.8850120306015015, "learning_rate": 7.211319432276153e-05, "loss": 3.8963, "step": 3186 }, { "epoch": 0.837652313856866, 
"grad_norm": 0.9425832033157349, "learning_rate": 7.209567198177678e-05, "loss": 3.9411, "step": 3188 }, { "epoch": 0.8381778171905278, "grad_norm": 0.9427008032798767, "learning_rate": 7.207814964079201e-05, "loss": 3.8855, "step": 3190 }, { "epoch": 0.8387033205241896, "grad_norm": 0.8230452537536621, "learning_rate": 7.206062729980726e-05, "loss": 3.9066, "step": 3192 }, { "epoch": 0.8392288238578514, "grad_norm": 1.0843548774719238, "learning_rate": 7.20431049588225e-05, "loss": 3.9552, "step": 3194 }, { "epoch": 0.8397543271915131, "grad_norm": 0.9672249555587769, "learning_rate": 7.202558261783775e-05, "loss": 3.9846, "step": 3196 }, { "epoch": 0.8402798305251749, "grad_norm": 1.0270726680755615, "learning_rate": 7.2008060276853e-05, "loss": 3.9106, "step": 3198 }, { "epoch": 0.8408053338588367, "grad_norm": 0.8849105834960938, "learning_rate": 7.199053793586825e-05, "loss": 3.9004, "step": 3200 }, { "epoch": 0.8408053338588367, "eval_loss": 3.886536121368408, "eval_runtime": 864.0984, "eval_samples_per_second": 140.943, "eval_steps_per_second": 4.405, "step": 3200 }, { "epoch": 0.8413308371924985, "grad_norm": 1.0466121435165405, "learning_rate": 7.197301559488348e-05, "loss": 3.9733, "step": 3202 }, { "epoch": 0.8418563405261602, "grad_norm": 0.9569512009620667, "learning_rate": 7.195549325389872e-05, "loss": 3.9058, "step": 3204 }, { "epoch": 0.842381843859822, "grad_norm": 0.8777816295623779, "learning_rate": 7.193797091291396e-05, "loss": 3.9524, "step": 3206 }, { "epoch": 0.8429073471934838, "grad_norm": 0.9770000576972961, "learning_rate": 7.192044857192921e-05, "loss": 3.9101, "step": 3208 }, { "epoch": 0.8434328505271456, "grad_norm": 0.877210259437561, "learning_rate": 7.190292623094446e-05, "loss": 3.9421, "step": 3210 }, { "epoch": 0.8439583538608073, "grad_norm": 0.8630277514457703, "learning_rate": 7.18854038899597e-05, "loss": 3.9448, "step": 3212 }, { "epoch": 0.8444838571944691, "grad_norm": 0.8615143299102783, "learning_rate": 
7.186788154897495e-05, "loss": 3.9421, "step": 3214 }, { "epoch": 0.8450093605281308, "grad_norm": 0.8666689991950989, "learning_rate": 7.185035920799019e-05, "loss": 3.9532, "step": 3216 }, { "epoch": 0.8455348638617927, "grad_norm": 0.7709742188453674, "learning_rate": 7.183283686700543e-05, "loss": 3.8648, "step": 3218 }, { "epoch": 0.8460603671954544, "grad_norm": 0.8389413356781006, "learning_rate": 7.181531452602068e-05, "loss": 3.9717, "step": 3220 }, { "epoch": 0.8465858705291162, "grad_norm": 0.8544262051582336, "learning_rate": 7.179779218503593e-05, "loss": 3.9006, "step": 3222 }, { "epoch": 0.8471113738627779, "grad_norm": 0.9318450093269348, "learning_rate": 7.178026984405118e-05, "loss": 3.9238, "step": 3224 }, { "epoch": 0.8476368771964398, "grad_norm": 1.1516430377960205, "learning_rate": 7.176274750306641e-05, "loss": 3.9586, "step": 3226 }, { "epoch": 0.8481623805301015, "grad_norm": 0.9018006324768066, "learning_rate": 7.174522516208166e-05, "loss": 3.9605, "step": 3228 }, { "epoch": 0.8486878838637633, "grad_norm": 1.065131664276123, "learning_rate": 7.172770282109689e-05, "loss": 3.9473, "step": 3230 }, { "epoch": 0.849213387197425, "grad_norm": 0.8698140382766724, "learning_rate": 7.171018048011214e-05, "loss": 3.8981, "step": 3232 }, { "epoch": 0.8497388905310868, "grad_norm": 0.9699116349220276, "learning_rate": 7.169265813912739e-05, "loss": 3.9643, "step": 3234 }, { "epoch": 0.8502643938647486, "grad_norm": 1.061144232749939, "learning_rate": 7.167513579814264e-05, "loss": 3.9515, "step": 3236 }, { "epoch": 0.8507898971984104, "grad_norm": 1.0403856039047241, "learning_rate": 7.165761345715788e-05, "loss": 3.9919, "step": 3238 }, { "epoch": 0.8513154005320721, "grad_norm": 0.8435994386672974, "learning_rate": 7.164009111617313e-05, "loss": 3.9314, "step": 3240 }, { "epoch": 0.8518409038657339, "grad_norm": 0.8563053607940674, "learning_rate": 7.162256877518836e-05, "loss": 3.9863, "step": 3242 }, { "epoch": 0.8523664071993957, "grad_norm": 
0.9026196002960205, "learning_rate": 7.160504643420361e-05, "loss": 3.9381, "step": 3244 }, { "epoch": 0.8528919105330575, "grad_norm": 0.9929929375648499, "learning_rate": 7.158752409321886e-05, "loss": 3.916, "step": 3246 }, { "epoch": 0.8534174138667192, "grad_norm": 0.8123306632041931, "learning_rate": 7.157000175223411e-05, "loss": 3.9617, "step": 3248 }, { "epoch": 0.853942917200381, "grad_norm": 0.8718085885047913, "learning_rate": 7.155247941124936e-05, "loss": 3.9674, "step": 3250 }, { "epoch": 0.8544684205340428, "grad_norm": 1.0405092239379883, "learning_rate": 7.153495707026459e-05, "loss": 3.9314, "step": 3252 }, { "epoch": 0.8549939238677046, "grad_norm": 0.8671479225158691, "learning_rate": 7.151743472927984e-05, "loss": 3.9453, "step": 3254 }, { "epoch": 0.8555194272013663, "grad_norm": 0.9385197162628174, "learning_rate": 7.149991238829507e-05, "loss": 3.9317, "step": 3256 }, { "epoch": 0.8560449305350281, "grad_norm": 0.858830988407135, "learning_rate": 7.148239004731032e-05, "loss": 3.9034, "step": 3258 }, { "epoch": 0.8565704338686898, "grad_norm": 0.9038625359535217, "learning_rate": 7.146486770632557e-05, "loss": 3.9646, "step": 3260 }, { "epoch": 0.8570959372023517, "grad_norm": 0.8343058824539185, "learning_rate": 7.144734536534081e-05, "loss": 3.9275, "step": 3262 }, { "epoch": 0.8576214405360134, "grad_norm": 0.8572068214416504, "learning_rate": 7.142982302435606e-05, "loss": 3.9318, "step": 3264 }, { "epoch": 0.8581469438696752, "grad_norm": 0.8012566566467285, "learning_rate": 7.141230068337131e-05, "loss": 3.905, "step": 3266 }, { "epoch": 0.8586724472033369, "grad_norm": 0.9250853061676025, "learning_rate": 7.139477834238654e-05, "loss": 3.9393, "step": 3268 }, { "epoch": 0.8591979505369988, "grad_norm": 0.8421234488487244, "learning_rate": 7.137725600140179e-05, "loss": 3.9482, "step": 3270 }, { "epoch": 0.8597234538706605, "grad_norm": 0.8613210916519165, "learning_rate": 7.135973366041704e-05, "loss": 3.8968, "step": 3272 }, { 
"epoch": 0.8602489572043223, "grad_norm": 0.8278005719184875, "learning_rate": 7.134221131943229e-05, "loss": 3.9471, "step": 3274 }, { "epoch": 0.860774460537984, "grad_norm": 0.7866396307945251, "learning_rate": 7.132468897844753e-05, "loss": 3.9502, "step": 3276 }, { "epoch": 0.8612999638716458, "grad_norm": 1.0152380466461182, "learning_rate": 7.130716663746277e-05, "loss": 3.9319, "step": 3278 }, { "epoch": 0.8618254672053076, "grad_norm": 0.804505467414856, "learning_rate": 7.128964429647801e-05, "loss": 3.9787, "step": 3280 }, { "epoch": 0.8623509705389694, "grad_norm": 0.8667430877685547, "learning_rate": 7.127212195549325e-05, "loss": 3.9274, "step": 3282 }, { "epoch": 0.8628764738726311, "grad_norm": 0.9690466523170471, "learning_rate": 7.12545996145085e-05, "loss": 3.9571, "step": 3284 }, { "epoch": 0.8634019772062929, "grad_norm": 0.8078598976135254, "learning_rate": 7.123707727352374e-05, "loss": 3.9067, "step": 3286 }, { "epoch": 0.8639274805399547, "grad_norm": 0.9516221880912781, "learning_rate": 7.121955493253899e-05, "loss": 3.9282, "step": 3288 }, { "epoch": 0.8644529838736165, "grad_norm": 0.8735782504081726, "learning_rate": 7.120203259155424e-05, "loss": 3.9487, "step": 3290 }, { "epoch": 0.8649784872072782, "grad_norm": 0.8112698793411255, "learning_rate": 7.118451025056949e-05, "loss": 3.957, "step": 3292 }, { "epoch": 0.86550399054094, "grad_norm": 0.847469687461853, "learning_rate": 7.116698790958472e-05, "loss": 3.9722, "step": 3294 }, { "epoch": 0.8660294938746018, "grad_norm": 0.8048822283744812, "learning_rate": 7.114946556859997e-05, "loss": 3.8646, "step": 3296 }, { "epoch": 0.8665549972082636, "grad_norm": 0.801766037940979, "learning_rate": 7.113194322761522e-05, "loss": 3.8619, "step": 3298 }, { "epoch": 0.8670805005419253, "grad_norm": 0.8708166480064392, "learning_rate": 7.111442088663046e-05, "loss": 3.8881, "step": 3300 }, { "epoch": 0.8676060038755871, "grad_norm": 0.8877458572387695, "learning_rate": 7.109689854564571e-05, 
"loss": 3.9455, "step": 3302 }, { "epoch": 0.8681315072092488, "grad_norm": 0.8392919898033142, "learning_rate": 7.107937620466094e-05, "loss": 3.8937, "step": 3304 }, { "epoch": 0.8686570105429107, "grad_norm": 0.8310826420783997, "learning_rate": 7.106185386367619e-05, "loss": 3.9071, "step": 3306 }, { "epoch": 0.8691825138765724, "grad_norm": 0.8675073385238647, "learning_rate": 7.104433152269143e-05, "loss": 3.9263, "step": 3308 }, { "epoch": 0.8697080172102342, "grad_norm": 0.8747515082359314, "learning_rate": 7.102680918170667e-05, "loss": 3.9361, "step": 3310 }, { "epoch": 0.8702335205438959, "grad_norm": 0.9788192510604858, "learning_rate": 7.100928684072192e-05, "loss": 3.976, "step": 3312 }, { "epoch": 0.8707590238775578, "grad_norm": 1.0401703119277954, "learning_rate": 7.099176449973717e-05, "loss": 3.9545, "step": 3314 }, { "epoch": 0.8712845272112195, "grad_norm": 0.8920219540596008, "learning_rate": 7.097424215875242e-05, "loss": 3.9288, "step": 3316 }, { "epoch": 0.8718100305448813, "grad_norm": 1.102508783340454, "learning_rate": 7.095671981776766e-05, "loss": 3.9225, "step": 3318 }, { "epoch": 0.872335533878543, "grad_norm": 0.9349856376647949, "learning_rate": 7.09391974767829e-05, "loss": 3.8898, "step": 3320 }, { "epoch": 0.8728610372122048, "grad_norm": 0.9802332520484924, "learning_rate": 7.092167513579815e-05, "loss": 3.9523, "step": 3322 }, { "epoch": 0.8733865405458666, "grad_norm": 0.9279443025588989, "learning_rate": 7.090415279481339e-05, "loss": 3.9406, "step": 3324 }, { "epoch": 0.8739120438795284, "grad_norm": 0.9254508018493652, "learning_rate": 7.088663045382864e-05, "loss": 3.9129, "step": 3326 }, { "epoch": 0.8744375472131901, "grad_norm": 0.8901885747909546, "learning_rate": 7.086910811284387e-05, "loss": 3.9894, "step": 3328 }, { "epoch": 0.8749630505468519, "grad_norm": 0.9766623973846436, "learning_rate": 7.085158577185912e-05, "loss": 3.9126, "step": 3330 }, { "epoch": 0.8754885538805137, "grad_norm": 0.8547890186309814, 
"learning_rate": 7.083406343087437e-05, "loss": 3.9732, "step": 3332 }, { "epoch": 0.8760140572141755, "grad_norm": 0.9389622807502747, "learning_rate": 7.08165410898896e-05, "loss": 3.9247, "step": 3334 }, { "epoch": 0.8765395605478372, "grad_norm": 1.08566153049469, "learning_rate": 7.079901874890485e-05, "loss": 3.9307, "step": 3336 }, { "epoch": 0.877065063881499, "grad_norm": 0.9304254055023193, "learning_rate": 7.07814964079201e-05, "loss": 3.9038, "step": 3338 }, { "epoch": 0.8775905672151608, "grad_norm": 0.9074569940567017, "learning_rate": 7.076397406693535e-05, "loss": 3.9007, "step": 3340 }, { "epoch": 0.8781160705488226, "grad_norm": 1.0591316223144531, "learning_rate": 7.07464517259506e-05, "loss": 3.8963, "step": 3342 }, { "epoch": 0.8786415738824843, "grad_norm": 0.8704319596290588, "learning_rate": 7.072892938496584e-05, "loss": 3.9527, "step": 3344 }, { "epoch": 0.8791670772161461, "grad_norm": 0.8054947853088379, "learning_rate": 7.071140704398108e-05, "loss": 3.939, "step": 3346 }, { "epoch": 0.8796925805498078, "grad_norm": 0.8404005765914917, "learning_rate": 7.069388470299632e-05, "loss": 3.9254, "step": 3348 }, { "epoch": 0.8802180838834697, "grad_norm": 0.8139736652374268, "learning_rate": 7.067636236201157e-05, "loss": 3.8842, "step": 3350 }, { "epoch": 0.8807435872171314, "grad_norm": 0.892755389213562, "learning_rate": 7.065884002102682e-05, "loss": 3.9925, "step": 3352 }, { "epoch": 0.8812690905507932, "grad_norm": 1.038904070854187, "learning_rate": 7.064131768004205e-05, "loss": 3.9448, "step": 3354 }, { "epoch": 0.8817945938844549, "grad_norm": 0.798176646232605, "learning_rate": 7.06237953390573e-05, "loss": 3.9244, "step": 3356 }, { "epoch": 0.8823200972181168, "grad_norm": 1.0364841222763062, "learning_rate": 7.060627299807255e-05, "loss": 3.9306, "step": 3358 }, { "epoch": 0.8828456005517785, "grad_norm": 0.9165228605270386, "learning_rate": 7.058875065708778e-05, "loss": 3.9531, "step": 3360 }, { "epoch": 0.8833711038854403, 
"grad_norm": 0.9911144971847534, "learning_rate": 7.057122831610303e-05, "loss": 3.9173, "step": 3362 }, { "epoch": 0.883896607219102, "grad_norm": 0.8411067128181458, "learning_rate": 7.055370597511828e-05, "loss": 3.9404, "step": 3364 }, { "epoch": 0.8844221105527639, "grad_norm": 0.8446078896522522, "learning_rate": 7.053618363413352e-05, "loss": 3.9442, "step": 3366 }, { "epoch": 0.8849476138864256, "grad_norm": 0.8802130818367004, "learning_rate": 7.051866129314877e-05, "loss": 3.8875, "step": 3368 }, { "epoch": 0.8854731172200874, "grad_norm": 0.8149612545967102, "learning_rate": 7.050113895216402e-05, "loss": 3.9157, "step": 3370 }, { "epoch": 0.8859986205537491, "grad_norm": 0.868574857711792, "learning_rate": 7.048361661117925e-05, "loss": 3.9384, "step": 3372 }, { "epoch": 0.8865241238874109, "grad_norm": 0.9337440729141235, "learning_rate": 7.04660942701945e-05, "loss": 3.9232, "step": 3374 }, { "epoch": 0.8870496272210727, "grad_norm": 0.8533630967140198, "learning_rate": 7.044857192920975e-05, "loss": 3.9608, "step": 3376 }, { "epoch": 0.8875751305547345, "grad_norm": 0.9128209948539734, "learning_rate": 7.0431049588225e-05, "loss": 3.915, "step": 3378 }, { "epoch": 0.8881006338883962, "grad_norm": 0.904403805732727, "learning_rate": 7.041352724724023e-05, "loss": 3.8981, "step": 3380 }, { "epoch": 0.888626137222058, "grad_norm": 0.9596070647239685, "learning_rate": 7.039600490625548e-05, "loss": 3.8996, "step": 3382 }, { "epoch": 0.8891516405557198, "grad_norm": 0.9046396613121033, "learning_rate": 7.037848256527073e-05, "loss": 3.9723, "step": 3384 }, { "epoch": 0.8896771438893816, "grad_norm": 1.0256905555725098, "learning_rate": 7.036096022428596e-05, "loss": 3.9435, "step": 3386 }, { "epoch": 0.8902026472230433, "grad_norm": 0.9532716274261475, "learning_rate": 7.03434378833012e-05, "loss": 3.9189, "step": 3388 }, { "epoch": 0.8907281505567051, "grad_norm": 0.8965052366256714, "learning_rate": 7.032591554231645e-05, "loss": 3.8766, "step": 3390 }, 
{ "epoch": 0.8912536538903668, "grad_norm": 0.8485153913497925, "learning_rate": 7.03083932013317e-05, "loss": 3.966, "step": 3392 }, { "epoch": 0.8917791572240287, "grad_norm": 0.8398302793502808, "learning_rate": 7.029087086034695e-05, "loss": 3.9221, "step": 3394 }, { "epoch": 0.8923046605576904, "grad_norm": 1.0401352643966675, "learning_rate": 7.02733485193622e-05, "loss": 3.9742, "step": 3396 }, { "epoch": 0.8928301638913522, "grad_norm": 0.7917698621749878, "learning_rate": 7.025582617837743e-05, "loss": 3.9458, "step": 3398 }, { "epoch": 0.8933556672250139, "grad_norm": 0.828471839427948, "learning_rate": 7.023830383739268e-05, "loss": 3.9048, "step": 3400 }, { "epoch": 0.8938811705586758, "grad_norm": 0.8189290165901184, "learning_rate": 7.022078149640793e-05, "loss": 3.9105, "step": 3402 }, { "epoch": 0.8944066738923375, "grad_norm": 0.7652946710586548, "learning_rate": 7.020325915542317e-05, "loss": 3.9355, "step": 3404 }, { "epoch": 0.8949321772259993, "grad_norm": 1.0772185325622559, "learning_rate": 7.018573681443841e-05, "loss": 3.8931, "step": 3406 }, { "epoch": 0.895457680559661, "grad_norm": 0.8732746839523315, "learning_rate": 7.016821447345366e-05, "loss": 3.9213, "step": 3408 }, { "epoch": 0.8959831838933229, "grad_norm": 0.9796597361564636, "learning_rate": 7.01506921324689e-05, "loss": 3.9585, "step": 3410 }, { "epoch": 0.8965086872269846, "grad_norm": 0.9846336245536804, "learning_rate": 7.013316979148414e-05, "loss": 3.9158, "step": 3412 }, { "epoch": 0.8970341905606464, "grad_norm": 0.9459385275840759, "learning_rate": 7.011564745049938e-05, "loss": 3.8855, "step": 3414 }, { "epoch": 0.8975596938943081, "grad_norm": 0.9014521837234497, "learning_rate": 7.009812510951463e-05, "loss": 3.8915, "step": 3416 }, { "epoch": 0.8980851972279699, "grad_norm": 1.0051113367080688, "learning_rate": 7.008060276852988e-05, "loss": 3.9146, "step": 3418 }, { "epoch": 0.8986107005616317, "grad_norm": 0.9282394647598267, "learning_rate": 
7.006308042754513e-05, "loss": 3.9612, "step": 3420 }, { "epoch": 0.8991362038952935, "grad_norm": 0.9045000672340393, "learning_rate": 7.004555808656037e-05, "loss": 3.9069, "step": 3422 }, { "epoch": 0.8996617072289552, "grad_norm": 0.9043731689453125, "learning_rate": 7.002803574557561e-05, "loss": 3.9497, "step": 3424 }, { "epoch": 0.900187210562617, "grad_norm": 0.9173302054405212, "learning_rate": 7.001051340459086e-05, "loss": 3.9112, "step": 3426 }, { "epoch": 0.9007127138962788, "grad_norm": 0.9110281467437744, "learning_rate": 6.99929910636061e-05, "loss": 3.8806, "step": 3428 }, { "epoch": 0.9012382172299406, "grad_norm": 0.9528982043266296, "learning_rate": 6.997546872262134e-05, "loss": 3.9421, "step": 3430 }, { "epoch": 0.9017637205636023, "grad_norm": 1.010575532913208, "learning_rate": 6.995794638163659e-05, "loss": 3.9024, "step": 3432 }, { "epoch": 0.9022892238972641, "grad_norm": 0.8852566480636597, "learning_rate": 6.994042404065183e-05, "loss": 3.8706, "step": 3434 }, { "epoch": 0.9028147272309258, "grad_norm": 0.9048908948898315, "learning_rate": 6.992290169966708e-05, "loss": 3.9311, "step": 3436 }, { "epoch": 0.9033402305645877, "grad_norm": 1.042065143585205, "learning_rate": 6.990537935868233e-05, "loss": 4.046, "step": 3438 }, { "epoch": 0.9038657338982494, "grad_norm": 0.801839292049408, "learning_rate": 6.988785701769756e-05, "loss": 3.9291, "step": 3440 }, { "epoch": 0.9043912372319112, "grad_norm": 0.9133000373840332, "learning_rate": 6.987033467671281e-05, "loss": 3.8926, "step": 3442 }, { "epoch": 0.9049167405655729, "grad_norm": 0.7363576889038086, "learning_rate": 6.985281233572806e-05, "loss": 3.9714, "step": 3444 }, { "epoch": 0.9054422438992348, "grad_norm": 0.823438286781311, "learning_rate": 6.98352899947433e-05, "loss": 3.9031, "step": 3446 }, { "epoch": 0.9059677472328965, "grad_norm": 0.8830239772796631, "learning_rate": 6.981776765375855e-05, "loss": 3.9174, "step": 3448 }, { "epoch": 0.9064932505665583, "grad_norm": 
0.8695173263549805, "learning_rate": 6.98002453127738e-05, "loss": 3.927, "step": 3450 }, { "epoch": 0.90701875390022, "grad_norm": 0.7776471972465515, "learning_rate": 6.978272297178903e-05, "loss": 3.9414, "step": 3452 }, { "epoch": 0.9075442572338819, "grad_norm": 0.9110491871833801, "learning_rate": 6.976520063080428e-05, "loss": 3.8999, "step": 3454 }, { "epoch": 0.9080697605675436, "grad_norm": 0.9248384237289429, "learning_rate": 6.974767828981952e-05, "loss": 3.943, "step": 3456 }, { "epoch": 0.9085952639012054, "grad_norm": 0.886202335357666, "learning_rate": 6.973015594883476e-05, "loss": 3.9081, "step": 3458 }, { "epoch": 0.9091207672348671, "grad_norm": 0.9277637004852295, "learning_rate": 6.971263360785001e-05, "loss": 3.9427, "step": 3460 }, { "epoch": 0.9096462705685289, "grad_norm": 0.9681371450424194, "learning_rate": 6.969511126686526e-05, "loss": 3.9147, "step": 3462 }, { "epoch": 0.9101717739021907, "grad_norm": 0.9273000955581665, "learning_rate": 6.96775889258805e-05, "loss": 3.8846, "step": 3464 }, { "epoch": 0.9106972772358525, "grad_norm": 0.8650227785110474, "learning_rate": 6.966006658489574e-05, "loss": 3.9185, "step": 3466 }, { "epoch": 0.9112227805695142, "grad_norm": 0.8240212798118591, "learning_rate": 6.964254424391099e-05, "loss": 3.8828, "step": 3468 }, { "epoch": 0.911748283903176, "grad_norm": 0.846308171749115, "learning_rate": 6.962502190292623e-05, "loss": 3.9477, "step": 3470 }, { "epoch": 0.9122737872368378, "grad_norm": 0.8405637741088867, "learning_rate": 6.960749956194148e-05, "loss": 3.9245, "step": 3472 }, { "epoch": 0.9127992905704996, "grad_norm": 0.800810694694519, "learning_rate": 6.958997722095673e-05, "loss": 3.9525, "step": 3474 }, { "epoch": 0.9133247939041613, "grad_norm": 0.9002047181129456, "learning_rate": 6.957245487997198e-05, "loss": 3.8996, "step": 3476 }, { "epoch": 0.9138502972378231, "grad_norm": 0.9654659628868103, "learning_rate": 6.955493253898721e-05, "loss": 3.9549, "step": 3478 }, { "epoch": 
0.9143758005714848, "grad_norm": 1.0132460594177246, "learning_rate": 6.953741019800246e-05, "loss": 3.9047, "step": 3480 }, { "epoch": 0.9149013039051467, "grad_norm": 0.8853533864021301, "learning_rate": 6.951988785701769e-05, "loss": 3.8819, "step": 3482 }, { "epoch": 0.9154268072388084, "grad_norm": 0.9070357084274292, "learning_rate": 6.950236551603294e-05, "loss": 3.8664, "step": 3484 }, { "epoch": 0.9159523105724702, "grad_norm": 0.8579594492912292, "learning_rate": 6.948484317504819e-05, "loss": 3.9032, "step": 3486 }, { "epoch": 0.9164778139061319, "grad_norm": 0.9389886856079102, "learning_rate": 6.946732083406344e-05, "loss": 3.9097, "step": 3488 }, { "epoch": 0.9170033172397938, "grad_norm": 0.9680403470993042, "learning_rate": 6.944979849307868e-05, "loss": 3.8884, "step": 3490 }, { "epoch": 0.9175288205734555, "grad_norm": 1.0357543230056763, "learning_rate": 6.943227615209392e-05, "loss": 3.9027, "step": 3492 }, { "epoch": 0.9180543239071173, "grad_norm": 0.8705071806907654, "learning_rate": 6.941475381110916e-05, "loss": 3.8859, "step": 3494 }, { "epoch": 0.918579827240779, "grad_norm": 0.9946110844612122, "learning_rate": 6.939723147012441e-05, "loss": 3.9258, "step": 3496 }, { "epoch": 0.9191053305744409, "grad_norm": 0.8200335502624512, "learning_rate": 6.937970912913966e-05, "loss": 3.9161, "step": 3498 }, { "epoch": 0.9196308339081026, "grad_norm": 1.0109559297561646, "learning_rate": 6.936218678815491e-05, "loss": 3.8905, "step": 3500 }, { "epoch": 0.9201563372417644, "grad_norm": 0.8505614399909973, "learning_rate": 6.934466444717016e-05, "loss": 3.9348, "step": 3502 }, { "epoch": 0.9206818405754261, "grad_norm": 0.7488939762115479, "learning_rate": 6.932714210618539e-05, "loss": 3.9232, "step": 3504 }, { "epoch": 0.9212073439090879, "grad_norm": 0.9005396962165833, "learning_rate": 6.930961976520064e-05, "loss": 3.9257, "step": 3506 }, { "epoch": 0.9217328472427497, "grad_norm": 0.7843775749206543, "learning_rate": 6.929209742421587e-05, 
"loss": 3.8824, "step": 3508 }, { "epoch": 0.9222583505764115, "grad_norm": 1.0249260663986206, "learning_rate": 6.927457508323112e-05, "loss": 4.0006, "step": 3510 }, { "epoch": 0.9227838539100732, "grad_norm": 0.8444844484329224, "learning_rate": 6.925705274224637e-05, "loss": 3.8823, "step": 3512 }, { "epoch": 0.923309357243735, "grad_norm": 0.9872987866401672, "learning_rate": 6.923953040126161e-05, "loss": 3.8295, "step": 3514 }, { "epoch": 0.9238348605773968, "grad_norm": 0.7945454120635986, "learning_rate": 6.922200806027686e-05, "loss": 3.9529, "step": 3516 }, { "epoch": 0.9243603639110586, "grad_norm": 0.8952515721321106, "learning_rate": 6.92044857192921e-05, "loss": 3.9375, "step": 3518 }, { "epoch": 0.9248858672447203, "grad_norm": 0.8539207577705383, "learning_rate": 6.918696337830734e-05, "loss": 3.9179, "step": 3520 }, { "epoch": 0.9254113705783821, "grad_norm": 0.9214663505554199, "learning_rate": 6.916944103732259e-05, "loss": 3.8809, "step": 3522 }, { "epoch": 0.9259368739120439, "grad_norm": 0.867942214012146, "learning_rate": 6.915191869633784e-05, "loss": 3.8709, "step": 3524 }, { "epoch": 0.9264623772457057, "grad_norm": 0.9308571815490723, "learning_rate": 6.913439635535309e-05, "loss": 3.8887, "step": 3526 }, { "epoch": 0.9269878805793674, "grad_norm": 0.9459753632545471, "learning_rate": 6.911687401436833e-05, "loss": 3.9122, "step": 3528 }, { "epoch": 0.9275133839130292, "grad_norm": 0.9179220795631409, "learning_rate": 6.909935167338357e-05, "loss": 3.8859, "step": 3530 }, { "epoch": 0.9280388872466909, "grad_norm": 0.9898721575737, "learning_rate": 6.90818293323988e-05, "loss": 3.8793, "step": 3532 }, { "epoch": 0.9285643905803528, "grad_norm": 0.9851050972938538, "learning_rate": 6.906430699141405e-05, "loss": 3.9326, "step": 3534 }, { "epoch": 0.9290898939140145, "grad_norm": 0.8228738903999329, "learning_rate": 6.90467846504293e-05, "loss": 3.9316, "step": 3536 }, { "epoch": 0.9296153972476763, "grad_norm": 1.0194953680038452, 
"learning_rate": 6.902926230944454e-05, "loss": 3.8637, "step": 3538 }, { "epoch": 0.930140900581338, "grad_norm": 0.8430651426315308, "learning_rate": 6.901173996845979e-05, "loss": 3.9312, "step": 3540 }, { "epoch": 0.9306664039149999, "grad_norm": 0.9290679097175598, "learning_rate": 6.899421762747504e-05, "loss": 3.9144, "step": 3542 }, { "epoch": 0.9311919072486616, "grad_norm": 1.0065464973449707, "learning_rate": 6.897669528649027e-05, "loss": 3.913, "step": 3544 }, { "epoch": 0.9317174105823234, "grad_norm": 0.8166496157646179, "learning_rate": 6.895917294550552e-05, "loss": 3.9161, "step": 3546 }, { "epoch": 0.9322429139159851, "grad_norm": 0.929343044757843, "learning_rate": 6.894165060452077e-05, "loss": 3.9351, "step": 3548 }, { "epoch": 0.9327684172496469, "grad_norm": 0.9479802250862122, "learning_rate": 6.892412826353602e-05, "loss": 3.9077, "step": 3550 }, { "epoch": 0.9332939205833087, "grad_norm": 0.8740216493606567, "learning_rate": 6.890660592255126e-05, "loss": 3.9047, "step": 3552 }, { "epoch": 0.9338194239169705, "grad_norm": 1.0723867416381836, "learning_rate": 6.888908358156651e-05, "loss": 3.8599, "step": 3554 }, { "epoch": 0.9343449272506322, "grad_norm": 1.0262950658798218, "learning_rate": 6.887156124058174e-05, "loss": 3.9028, "step": 3556 }, { "epoch": 0.934870430584294, "grad_norm": 0.9340352416038513, "learning_rate": 6.885403889959698e-05, "loss": 3.9348, "step": 3558 }, { "epoch": 0.9353959339179558, "grad_norm": 0.8115987777709961, "learning_rate": 6.883651655861223e-05, "loss": 3.9191, "step": 3560 }, { "epoch": 0.9359214372516176, "grad_norm": 1.0442367792129517, "learning_rate": 6.881899421762747e-05, "loss": 3.8585, "step": 3562 }, { "epoch": 0.9364469405852793, "grad_norm": 0.8228043913841248, "learning_rate": 6.880147187664272e-05, "loss": 3.9055, "step": 3564 }, { "epoch": 0.9369724439189411, "grad_norm": 0.9084616899490356, "learning_rate": 6.878394953565797e-05, "loss": 3.9057, "step": 3566 }, { "epoch": 
0.9374979472526029, "grad_norm": 0.982569694519043, "learning_rate": 6.876642719467322e-05, "loss": 3.9106, "step": 3568 }, { "epoch": 0.9380234505862647, "grad_norm": 0.8334186673164368, "learning_rate": 6.874890485368845e-05, "loss": 3.8605, "step": 3570 }, { "epoch": 0.9385489539199264, "grad_norm": 0.9210214018821716, "learning_rate": 6.87313825127037e-05, "loss": 3.8877, "step": 3572 }, { "epoch": 0.9390744572535882, "grad_norm": 0.8922497034072876, "learning_rate": 6.871386017171895e-05, "loss": 3.9125, "step": 3574 }, { "epoch": 0.9395999605872499, "grad_norm": 0.9049947261810303, "learning_rate": 6.869633783073419e-05, "loss": 3.9201, "step": 3576 }, { "epoch": 0.9401254639209118, "grad_norm": 0.8910399079322815, "learning_rate": 6.867881548974944e-05, "loss": 3.9037, "step": 3578 }, { "epoch": 0.9406509672545735, "grad_norm": 1.0027482509613037, "learning_rate": 6.866129314876469e-05, "loss": 3.9756, "step": 3580 }, { "epoch": 0.9411764705882353, "grad_norm": 0.8373531699180603, "learning_rate": 6.864377080777992e-05, "loss": 3.96, "step": 3582 }, { "epoch": 0.941701973921897, "grad_norm": 0.7858461141586304, "learning_rate": 6.862624846679516e-05, "loss": 3.9348, "step": 3584 }, { "epoch": 0.9422274772555589, "grad_norm": 0.8853874206542969, "learning_rate": 6.86087261258104e-05, "loss": 3.8939, "step": 3586 }, { "epoch": 0.9427529805892206, "grad_norm": 0.9715880751609802, "learning_rate": 6.859120378482565e-05, "loss": 3.9269, "step": 3588 }, { "epoch": 0.9432784839228824, "grad_norm": 0.8926761746406555, "learning_rate": 6.85736814438409e-05, "loss": 3.8856, "step": 3590 }, { "epoch": 0.9438039872565441, "grad_norm": 0.9122655391693115, "learning_rate": 6.855615910285615e-05, "loss": 3.8772, "step": 3592 }, { "epoch": 0.9443294905902059, "grad_norm": 0.8253449201583862, "learning_rate": 6.85386367618714e-05, "loss": 3.9681, "step": 3594 }, { "epoch": 0.9448549939238677, "grad_norm": 0.8136500120162964, "learning_rate": 6.852111442088663e-05, "loss": 
3.944, "step": 3596 }, { "epoch": 0.9453804972575295, "grad_norm": 0.9325180649757385, "learning_rate": 6.850359207990188e-05, "loss": 3.8964, "step": 3598 }, { "epoch": 0.9459060005911912, "grad_norm": 1.007025122642517, "learning_rate": 6.848606973891712e-05, "loss": 3.9493, "step": 3600 }, { "epoch": 0.9459060005911912, "eval_loss": 3.858748197555542, "eval_runtime": 565.9083, "eval_samples_per_second": 215.21, "eval_steps_per_second": 6.725, "step": 3600 }, { "epoch": 0.946431503924853, "grad_norm": 0.8543709516525269, "learning_rate": 6.846854739793237e-05, "loss": 3.912, "step": 3602 }, { "epoch": 0.9469570072585148, "grad_norm": 0.7813833355903625, "learning_rate": 6.845102505694762e-05, "loss": 3.9312, "step": 3604 }, { "epoch": 0.9474825105921766, "grad_norm": 0.761431097984314, "learning_rate": 6.843350271596287e-05, "loss": 3.9139, "step": 3606 }, { "epoch": 0.9480080139258383, "grad_norm": 0.8889532089233398, "learning_rate": 6.84159803749781e-05, "loss": 3.8691, "step": 3608 }, { "epoch": 0.9485335172595001, "grad_norm": 0.8141577243804932, "learning_rate": 6.839845803399333e-05, "loss": 3.9172, "step": 3610 }, { "epoch": 0.9490590205931619, "grad_norm": 0.8708415627479553, "learning_rate": 6.838093569300858e-05, "loss": 3.8683, "step": 3612 }, { "epoch": 0.9495845239268237, "grad_norm": 0.8277978897094727, "learning_rate": 6.836341335202383e-05, "loss": 3.9413, "step": 3614 }, { "epoch": 0.9501100272604854, "grad_norm": 0.8648923635482788, "learning_rate": 6.834589101103908e-05, "loss": 3.8989, "step": 3616 }, { "epoch": 0.9506355305941472, "grad_norm": 0.8293460011482239, "learning_rate": 6.832836867005432e-05, "loss": 3.8912, "step": 3618 }, { "epoch": 0.9511610339278089, "grad_norm": 1.079303503036499, "learning_rate": 6.831084632906957e-05, "loss": 3.8595, "step": 3620 }, { "epoch": 0.9516865372614708, "grad_norm": 0.8873153924942017, "learning_rate": 6.82933239880848e-05, "loss": 3.9106, "step": 3622 }, { "epoch": 0.9522120405951325, "grad_norm": 
0.9375338554382324, "learning_rate": 6.827580164710005e-05, "loss": 3.9357, "step": 3624 }, { "epoch": 0.9527375439287943, "grad_norm": 0.9206287860870361, "learning_rate": 6.82582793061153e-05, "loss": 3.9107, "step": 3626 }, { "epoch": 0.953263047262456, "grad_norm": 0.8726155757904053, "learning_rate": 6.824075696513055e-05, "loss": 3.8375, "step": 3628 }, { "epoch": 0.9537885505961179, "grad_norm": 1.0177313089370728, "learning_rate": 6.82232346241458e-05, "loss": 3.9064, "step": 3630 }, { "epoch": 0.9543140539297796, "grad_norm": 0.8020273447036743, "learning_rate": 6.820571228316104e-05, "loss": 3.9038, "step": 3632 }, { "epoch": 0.9548395572634414, "grad_norm": 0.913148045539856, "learning_rate": 6.818818994217628e-05, "loss": 3.913, "step": 3634 }, { "epoch": 0.9553650605971031, "grad_norm": 0.9545018672943115, "learning_rate": 6.817066760119151e-05, "loss": 3.9122, "step": 3636 }, { "epoch": 0.9558905639307649, "grad_norm": 0.8851977586746216, "learning_rate": 6.815314526020676e-05, "loss": 3.8572, "step": 3638 }, { "epoch": 0.9564160672644267, "grad_norm": 0.8675808310508728, "learning_rate": 6.8135622919222e-05, "loss": 3.8866, "step": 3640 }, { "epoch": 0.9569415705980885, "grad_norm": 0.9044348001480103, "learning_rate": 6.811810057823725e-05, "loss": 3.9368, "step": 3642 }, { "epoch": 0.9574670739317502, "grad_norm": 0.8391221761703491, "learning_rate": 6.81005782372525e-05, "loss": 3.9112, "step": 3644 }, { "epoch": 0.957992577265412, "grad_norm": 0.8398281931877136, "learning_rate": 6.808305589626775e-05, "loss": 3.8706, "step": 3646 }, { "epoch": 0.9585180805990738, "grad_norm": 0.7941089272499084, "learning_rate": 6.806553355528298e-05, "loss": 3.9126, "step": 3648 }, { "epoch": 0.9590435839327356, "grad_norm": 0.7988334894180298, "learning_rate": 6.804801121429823e-05, "loss": 3.8697, "step": 3650 }, { "epoch": 0.9595690872663973, "grad_norm": 0.861297070980072, "learning_rate": 6.803048887331348e-05, "loss": 3.9067, "step": 3652 }, { "epoch": 
0.9600945906000591, "grad_norm": 0.885358989238739, "learning_rate": 6.801296653232873e-05, "loss": 3.9029, "step": 3654 }, { "epoch": 0.9606200939337209, "grad_norm": 0.8571000099182129, "learning_rate": 6.799544419134397e-05, "loss": 3.8661, "step": 3656 }, { "epoch": 0.9611455972673827, "grad_norm": 0.9009043574333191, "learning_rate": 6.797792185035922e-05, "loss": 3.9256, "step": 3658 }, { "epoch": 0.9616711006010444, "grad_norm": 0.8639697432518005, "learning_rate": 6.796039950937446e-05, "loss": 3.8444, "step": 3660 }, { "epoch": 0.9621966039347062, "grad_norm": 1.0304346084594727, "learning_rate": 6.794287716838969e-05, "loss": 3.94, "step": 3662 }, { "epoch": 0.9627221072683679, "grad_norm": 0.7830595970153809, "learning_rate": 6.792535482740494e-05, "loss": 3.9564, "step": 3664 }, { "epoch": 0.9632476106020298, "grad_norm": 0.9229221940040588, "learning_rate": 6.790783248642018e-05, "loss": 3.9349, "step": 3666 }, { "epoch": 0.9637731139356915, "grad_norm": 0.880426824092865, "learning_rate": 6.789031014543543e-05, "loss": 3.9349, "step": 3668 }, { "epoch": 0.9642986172693533, "grad_norm": 0.881723165512085, "learning_rate": 6.787278780445068e-05, "loss": 3.9094, "step": 3670 }, { "epoch": 0.964824120603015, "grad_norm": 0.8703471422195435, "learning_rate": 6.785526546346593e-05, "loss": 3.9176, "step": 3672 }, { "epoch": 0.9653496239366769, "grad_norm": 0.8280903697013855, "learning_rate": 6.783774312248116e-05, "loss": 3.8837, "step": 3674 }, { "epoch": 0.9658751272703386, "grad_norm": 0.9504658579826355, "learning_rate": 6.782022078149641e-05, "loss": 3.9032, "step": 3676 }, { "epoch": 0.9664006306040004, "grad_norm": 0.7995091676712036, "learning_rate": 6.780269844051166e-05, "loss": 3.9024, "step": 3678 }, { "epoch": 0.9669261339376621, "grad_norm": 0.9818856716156006, "learning_rate": 6.77851760995269e-05, "loss": 3.9099, "step": 3680 }, { "epoch": 0.967451637271324, "grad_norm": 0.9180545806884766, "learning_rate": 6.776765375854215e-05, "loss": 
3.9348, "step": 3682 }, { "epoch": 0.9679771406049857, "grad_norm": 0.8541598320007324, "learning_rate": 6.77501314175574e-05, "loss": 3.8935, "step": 3684 }, { "epoch": 0.9685026439386475, "grad_norm": 0.9697319865226746, "learning_rate": 6.773260907657263e-05, "loss": 3.8266, "step": 3686 }, { "epoch": 0.9690281472723092, "grad_norm": 0.8518965244293213, "learning_rate": 6.771508673558788e-05, "loss": 3.9008, "step": 3688 }, { "epoch": 0.969553650605971, "grad_norm": 0.8743928670883179, "learning_rate": 6.769756439460311e-05, "loss": 3.9446, "step": 3690 }, { "epoch": 0.9700791539396328, "grad_norm": 1.0322755575180054, "learning_rate": 6.768004205361836e-05, "loss": 3.9332, "step": 3692 }, { "epoch": 0.9706046572732946, "grad_norm": 0.8484731316566467, "learning_rate": 6.766251971263361e-05, "loss": 3.9155, "step": 3694 }, { "epoch": 0.9711301606069563, "grad_norm": 0.9075736999511719, "learning_rate": 6.764499737164886e-05, "loss": 3.9186, "step": 3696 }, { "epoch": 0.9716556639406181, "grad_norm": 0.9127398729324341, "learning_rate": 6.76274750306641e-05, "loss": 3.9024, "step": 3698 }, { "epoch": 0.97218116727428, "grad_norm": 0.8073276281356812, "learning_rate": 6.760995268967935e-05, "loss": 3.8639, "step": 3700 }, { "epoch": 0.9727066706079417, "grad_norm": 0.8933922648429871, "learning_rate": 6.759243034869459e-05, "loss": 3.8947, "step": 3702 }, { "epoch": 0.9732321739416034, "grad_norm": 0.8752590417861938, "learning_rate": 6.757490800770983e-05, "loss": 3.9459, "step": 3704 }, { "epoch": 0.9737576772752652, "grad_norm": 0.8586541414260864, "learning_rate": 6.755738566672508e-05, "loss": 3.9032, "step": 3706 }, { "epoch": 0.9742831806089269, "grad_norm": 0.955687940120697, "learning_rate": 6.753986332574033e-05, "loss": 3.907, "step": 3708 }, { "epoch": 0.9748086839425888, "grad_norm": 0.8558814525604248, "learning_rate": 6.752234098475558e-05, "loss": 3.9411, "step": 3710 }, { "epoch": 0.9753341872762505, "grad_norm": 0.9419000148773193, 
"learning_rate": 6.750481864377081e-05, "loss": 3.9205, "step": 3712 }, { "epoch": 0.9758596906099123, "grad_norm": 0.7861726880073547, "learning_rate": 6.748729630278606e-05, "loss": 3.9045, "step": 3714 }, { "epoch": 0.976385193943574, "grad_norm": 0.9036528468132019, "learning_rate": 6.746977396180129e-05, "loss": 3.9194, "step": 3716 }, { "epoch": 0.9769106972772359, "grad_norm": 0.788662314414978, "learning_rate": 6.745225162081654e-05, "loss": 3.8856, "step": 3718 }, { "epoch": 0.9774362006108976, "grad_norm": 0.8471680879592896, "learning_rate": 6.743472927983179e-05, "loss": 3.9158, "step": 3720 }, { "epoch": 0.9779617039445594, "grad_norm": 1.0542877912521362, "learning_rate": 6.741720693884703e-05, "loss": 3.9049, "step": 3722 }, { "epoch": 0.9784872072782211, "grad_norm": 0.8509339094161987, "learning_rate": 6.739968459786228e-05, "loss": 3.88, "step": 3724 }, { "epoch": 0.979012710611883, "grad_norm": 0.8660452365875244, "learning_rate": 6.738216225687753e-05, "loss": 3.8885, "step": 3726 }, { "epoch": 0.9795382139455447, "grad_norm": 0.9621448516845703, "learning_rate": 6.736463991589276e-05, "loss": 3.8941, "step": 3728 }, { "epoch": 0.9800637172792065, "grad_norm": 0.7974744439125061, "learning_rate": 6.734711757490801e-05, "loss": 3.8601, "step": 3730 }, { "epoch": 0.9805892206128682, "grad_norm": 0.8622754216194153, "learning_rate": 6.732959523392326e-05, "loss": 3.9109, "step": 3732 }, { "epoch": 0.98111472394653, "grad_norm": 0.8975734710693359, "learning_rate": 6.73120728929385e-05, "loss": 3.9135, "step": 3734 }, { "epoch": 0.9816402272801918, "grad_norm": 0.852653443813324, "learning_rate": 6.729455055195374e-05, "loss": 3.8929, "step": 3736 }, { "epoch": 0.9821657306138536, "grad_norm": 0.741301417350769, "learning_rate": 6.727702821096899e-05, "loss": 3.8769, "step": 3738 }, { "epoch": 0.9826912339475153, "grad_norm": 0.8955078721046448, "learning_rate": 6.725950586998424e-05, "loss": 3.8743, "step": 3740 }, { "epoch": 0.9832167372811771, 
"grad_norm": 0.954612135887146, "learning_rate": 6.724198352899947e-05, "loss": 3.895, "step": 3742 }, { "epoch": 0.983742240614839, "grad_norm": 0.8938519954681396, "learning_rate": 6.722446118801472e-05, "loss": 3.8754, "step": 3744 }, { "epoch": 0.9842677439485007, "grad_norm": 1.0525662899017334, "learning_rate": 6.720693884702996e-05, "loss": 3.8844, "step": 3746 }, { "epoch": 0.9847932472821624, "grad_norm": 0.7719287872314453, "learning_rate": 6.718941650604521e-05, "loss": 3.8935, "step": 3748 }, { "epoch": 0.9853187506158242, "grad_norm": 0.8309032917022705, "learning_rate": 6.717189416506046e-05, "loss": 3.8974, "step": 3750 }, { "epoch": 0.9858442539494859, "grad_norm": 0.7771923542022705, "learning_rate": 6.715437182407571e-05, "loss": 3.9051, "step": 3752 }, { "epoch": 0.9863697572831478, "grad_norm": 0.968725860118866, "learning_rate": 6.713684948309094e-05, "loss": 3.9164, "step": 3754 }, { "epoch": 0.9868952606168095, "grad_norm": 0.8064023852348328, "learning_rate": 6.711932714210619e-05, "loss": 3.9078, "step": 3756 }, { "epoch": 0.9874207639504713, "grad_norm": 1.0726840496063232, "learning_rate": 6.710180480112144e-05, "loss": 3.8749, "step": 3758 }, { "epoch": 0.987946267284133, "grad_norm": 0.8268805742263794, "learning_rate": 6.708428246013668e-05, "loss": 3.911, "step": 3760 }, { "epoch": 0.9884717706177949, "grad_norm": 0.9227132201194763, "learning_rate": 6.706676011915192e-05, "loss": 3.8719, "step": 3762 }, { "epoch": 0.9889972739514566, "grad_norm": 0.878226637840271, "learning_rate": 6.704923777816717e-05, "loss": 3.8901, "step": 3764 }, { "epoch": 0.9895227772851184, "grad_norm": 0.8103342652320862, "learning_rate": 6.703171543718241e-05, "loss": 3.9036, "step": 3766 }, { "epoch": 0.9900482806187801, "grad_norm": 0.8260455131530762, "learning_rate": 6.701419309619765e-05, "loss": 3.8434, "step": 3768 }, { "epoch": 0.990573783952442, "grad_norm": 0.8837816715240479, "learning_rate": 6.69966707552129e-05, "loss": 3.8585, "step": 3770 }, 
{ "epoch": 0.9910992872861037, "grad_norm": 0.7529637813568115, "learning_rate": 6.697914841422814e-05, "loss": 3.9568, "step": 3772 }, { "epoch": 0.9916247906197655, "grad_norm": 0.7796488404273987, "learning_rate": 6.696162607324339e-05, "loss": 3.9205, "step": 3774 }, { "epoch": 0.9921502939534272, "grad_norm": 0.8511204719543457, "learning_rate": 6.694410373225864e-05, "loss": 3.9365, "step": 3776 }, { "epoch": 0.992675797287089, "grad_norm": 0.8204172253608704, "learning_rate": 6.692658139127389e-05, "loss": 3.8895, "step": 3778 }, { "epoch": 0.9932013006207508, "grad_norm": 0.926581084728241, "learning_rate": 6.690905905028912e-05, "loss": 3.9522, "step": 3780 }, { "epoch": 0.9937268039544126, "grad_norm": 0.8706080913543701, "learning_rate": 6.689153670930437e-05, "loss": 3.9036, "step": 3782 }, { "epoch": 0.9942523072880743, "grad_norm": 0.9458873867988586, "learning_rate": 6.687401436831961e-05, "loss": 3.9226, "step": 3784 }, { "epoch": 0.9947778106217361, "grad_norm": 0.945679247379303, "learning_rate": 6.685649202733486e-05, "loss": 3.8944, "step": 3786 }, { "epoch": 0.995303313955398, "grad_norm": 0.8382325768470764, "learning_rate": 6.68389696863501e-05, "loss": 3.8808, "step": 3788 }, { "epoch": 0.9958288172890597, "grad_norm": 0.882784366607666, "learning_rate": 6.682144734536534e-05, "loss": 3.9167, "step": 3790 }, { "epoch": 0.9963543206227214, "grad_norm": 0.7749800682067871, "learning_rate": 6.680392500438059e-05, "loss": 3.8945, "step": 3792 }, { "epoch": 0.9968798239563832, "grad_norm": 0.8743687272071838, "learning_rate": 6.678640266339583e-05, "loss": 3.9633, "step": 3794 }, { "epoch": 0.9974053272900449, "grad_norm": 0.8476532697677612, "learning_rate": 6.676888032241107e-05, "loss": 3.9477, "step": 3796 }, { "epoch": 0.9979308306237068, "grad_norm": 0.7881777286529541, "learning_rate": 6.675135798142632e-05, "loss": 3.9221, "step": 3798 }, { "epoch": 0.9984563339573685, "grad_norm": 0.8368692994117737, "learning_rate": 
6.673383564044157e-05, "loss": 3.9113, "step": 3800 }, { "epoch": 0.9989818372910303, "grad_norm": 0.8076598644256592, "learning_rate": 6.671631329945682e-05, "loss": 3.8972, "step": 3802 }, { "epoch": 0.999507340624692, "grad_norm": 0.8020215630531311, "learning_rate": 6.669879095847206e-05, "loss": 3.9113, "step": 3804 }, { "epoch": 1.000032843958354, "grad_norm": 0.9062731266021729, "learning_rate": 6.66812686174873e-05, "loss": 3.8801, "step": 3806 }, { "epoch": 1.0005583472920156, "grad_norm": 0.914361298084259, "learning_rate": 6.666374627650254e-05, "loss": 3.8586, "step": 3808 }, { "epoch": 1.0010838506256774, "grad_norm": 0.8308265805244446, "learning_rate": 6.664622393551779e-05, "loss": 3.805, "step": 3810 }, { "epoch": 1.0016093539593391, "grad_norm": 0.8808644413948059, "learning_rate": 6.662870159453303e-05, "loss": 3.8036, "step": 3812 }, { "epoch": 1.002134857293001, "grad_norm": 0.7458781003952026, "learning_rate": 6.661117925354827e-05, "loss": 3.8846, "step": 3814 }, { "epoch": 1.0026603606266626, "grad_norm": 0.9244504570960999, "learning_rate": 6.659365691256352e-05, "loss": 3.8967, "step": 3816 }, { "epoch": 1.0031858639603246, "grad_norm": 0.8910004496574402, "learning_rate": 6.657613457157877e-05, "loss": 3.8824, "step": 3818 }, { "epoch": 1.0037113672939864, "grad_norm": 1.0594176054000854, "learning_rate": 6.6558612230594e-05, "loss": 3.8945, "step": 3820 }, { "epoch": 1.004236870627648, "grad_norm": 0.9436342120170593, "learning_rate": 6.654108988960925e-05, "loss": 3.8743, "step": 3822 }, { "epoch": 1.0047623739613099, "grad_norm": 0.8509497046470642, "learning_rate": 6.65235675486245e-05, "loss": 3.9225, "step": 3824 }, { "epoch": 1.0052878772949716, "grad_norm": 0.9058356285095215, "learning_rate": 6.650604520763975e-05, "loss": 3.8641, "step": 3826 }, { "epoch": 1.0058133806286333, "grad_norm": 0.8306275010108948, "learning_rate": 6.648852286665499e-05, "loss": 3.8288, "step": 3828 }, { "epoch": 1.006338883962295, "grad_norm": 
0.949256956577301, "learning_rate": 6.647100052567024e-05, "loss": 3.8995, "step": 3830 }, { "epoch": 1.0068643872959568, "grad_norm": 0.8137875199317932, "learning_rate": 6.645347818468547e-05, "loss": 3.8338, "step": 3832 }, { "epoch": 1.0073898906296186, "grad_norm": 0.9271281957626343, "learning_rate": 6.643595584370072e-05, "loss": 3.8543, "step": 3834 }, { "epoch": 1.0079153939632806, "grad_norm": 0.8100757598876953, "learning_rate": 6.641843350271597e-05, "loss": 3.8495, "step": 3836 }, { "epoch": 1.0084408972969423, "grad_norm": 0.823035478591919, "learning_rate": 6.64009111617312e-05, "loss": 3.8825, "step": 3838 }, { "epoch": 1.008966400630604, "grad_norm": 0.8110647797584534, "learning_rate": 6.638338882074645e-05, "loss": 3.8888, "step": 3840 }, { "epoch": 1.0094919039642658, "grad_norm": 0.9150046706199646, "learning_rate": 6.63658664797617e-05, "loss": 3.8861, "step": 3842 }, { "epoch": 1.0100174072979275, "grad_norm": 0.836549699306488, "learning_rate": 6.634834413877695e-05, "loss": 3.8318, "step": 3844 }, { "epoch": 1.0105429106315893, "grad_norm": 0.828356921672821, "learning_rate": 6.633082179779218e-05, "loss": 3.887, "step": 3846 }, { "epoch": 1.011068413965251, "grad_norm": 1.0422781705856323, "learning_rate": 6.631329945680743e-05, "loss": 3.8529, "step": 3848 }, { "epoch": 1.0115939172989128, "grad_norm": 0.9840695858001709, "learning_rate": 6.629577711582268e-05, "loss": 3.8943, "step": 3850 }, { "epoch": 1.0121194206325745, "grad_norm": 0.8625655770301819, "learning_rate": 6.627825477483792e-05, "loss": 3.8771, "step": 3852 }, { "epoch": 1.0126449239662365, "grad_norm": 0.8041239380836487, "learning_rate": 6.626073243385317e-05, "loss": 3.8326, "step": 3854 }, { "epoch": 1.0131704272998983, "grad_norm": 0.9397164583206177, "learning_rate": 6.624321009286842e-05, "loss": 3.877, "step": 3856 }, { "epoch": 1.01369593063356, "grad_norm": 0.9606198668479919, "learning_rate": 6.622568775188365e-05, "loss": 3.8815, "step": 3858 }, { "epoch": 
1.0142214339672218, "grad_norm": 0.9005401134490967, "learning_rate": 6.62081654108989e-05, "loss": 3.8605, "step": 3860 }, { "epoch": 1.0147469373008835, "grad_norm": 0.8581411242485046, "learning_rate": 6.619064306991415e-05, "loss": 3.8624, "step": 3862 }, { "epoch": 1.0152724406345452, "grad_norm": 0.7873390913009644, "learning_rate": 6.617312072892938e-05, "loss": 3.8841, "step": 3864 }, { "epoch": 1.015797943968207, "grad_norm": 0.9921785593032837, "learning_rate": 6.615559838794463e-05, "loss": 3.8387, "step": 3866 }, { "epoch": 1.0163234473018687, "grad_norm": 0.9543824791908264, "learning_rate": 6.613807604695988e-05, "loss": 3.9137, "step": 3868 }, { "epoch": 1.0168489506355305, "grad_norm": 0.839214563369751, "learning_rate": 6.612055370597512e-05, "loss": 3.8735, "step": 3870 }, { "epoch": 1.0173744539691925, "grad_norm": 1.1795284748077393, "learning_rate": 6.610303136499036e-05, "loss": 3.8969, "step": 3872 }, { "epoch": 1.0178999573028542, "grad_norm": 0.9348719716072083, "learning_rate": 6.60855090240056e-05, "loss": 3.8553, "step": 3874 }, { "epoch": 1.018425460636516, "grad_norm": 0.8174589276313782, "learning_rate": 6.606798668302085e-05, "loss": 3.8583, "step": 3876 }, { "epoch": 1.0189509639701777, "grad_norm": 0.8982873558998108, "learning_rate": 6.60504643420361e-05, "loss": 3.844, "step": 3878 }, { "epoch": 1.0194764673038395, "grad_norm": 0.9340962767601013, "learning_rate": 6.603294200105135e-05, "loss": 3.8539, "step": 3880 }, { "epoch": 1.0200019706375012, "grad_norm": 0.9119390845298767, "learning_rate": 6.60154196600666e-05, "loss": 3.8103, "step": 3882 }, { "epoch": 1.020527473971163, "grad_norm": 1.0119855403900146, "learning_rate": 6.599789731908183e-05, "loss": 3.8826, "step": 3884 }, { "epoch": 1.0210529773048247, "grad_norm": 0.8974456191062927, "learning_rate": 6.598037497809708e-05, "loss": 3.8193, "step": 3886 }, { "epoch": 1.0215784806384864, "grad_norm": 0.9113464951515198, "learning_rate": 6.596285263711233e-05, "loss": 
3.8498, "step": 3888 }, { "epoch": 1.0221039839721484, "grad_norm": 0.8172871470451355, "learning_rate": 6.594533029612756e-05, "loss": 3.8068, "step": 3890 }, { "epoch": 1.0226294873058102, "grad_norm": 0.9064889550209045, "learning_rate": 6.59278079551428e-05, "loss": 3.8291, "step": 3892 }, { "epoch": 1.023154990639472, "grad_norm": 0.9843708872795105, "learning_rate": 6.591028561415805e-05, "loss": 3.8747, "step": 3894 }, { "epoch": 1.0236804939731337, "grad_norm": 0.902087390422821, "learning_rate": 6.58927632731733e-05, "loss": 3.8404, "step": 3896 }, { "epoch": 1.0242059973067954, "grad_norm": 0.8679825663566589, "learning_rate": 6.587524093218854e-05, "loss": 3.8703, "step": 3898 }, { "epoch": 1.0247315006404571, "grad_norm": 0.811574399471283, "learning_rate": 6.585771859120378e-05, "loss": 3.8963, "step": 3900 }, { "epoch": 1.025257003974119, "grad_norm": 0.9086605310440063, "learning_rate": 6.584019625021903e-05, "loss": 3.8548, "step": 3902 }, { "epoch": 1.0257825073077806, "grad_norm": 0.9455679655075073, "learning_rate": 6.582267390923428e-05, "loss": 3.8587, "step": 3904 }, { "epoch": 1.0263080106414426, "grad_norm": 0.9683346748352051, "learning_rate": 6.580515156824953e-05, "loss": 3.7974, "step": 3906 }, { "epoch": 1.0268335139751044, "grad_norm": 0.8888550400733948, "learning_rate": 6.578762922726477e-05, "loss": 3.8301, "step": 3908 }, { "epoch": 1.027359017308766, "grad_norm": 1.0487765073776245, "learning_rate": 6.577010688628001e-05, "loss": 3.8517, "step": 3910 }, { "epoch": 1.0278845206424279, "grad_norm": 0.8408461213111877, "learning_rate": 6.575258454529526e-05, "loss": 3.8718, "step": 3912 }, { "epoch": 1.0284100239760896, "grad_norm": 0.9295628666877747, "learning_rate": 6.573506220431049e-05, "loss": 3.8334, "step": 3914 }, { "epoch": 1.0289355273097514, "grad_norm": 0.8336082696914673, "learning_rate": 6.571753986332574e-05, "loss": 3.8027, "step": 3916 }, { "epoch": 1.029461030643413, "grad_norm": 0.8898026943206787, 
"learning_rate": 6.570001752234098e-05, "loss": 3.8319, "step": 3918 }, { "epoch": 1.0299865339770748, "grad_norm": 0.9203651547431946, "learning_rate": 6.568249518135623e-05, "loss": 3.8682, "step": 3920 }, { "epoch": 1.0305120373107366, "grad_norm": 0.8968489766120911, "learning_rate": 6.566497284037148e-05, "loss": 3.8777, "step": 3922 }, { "epoch": 1.0310375406443986, "grad_norm": 0.9172600507736206, "learning_rate": 6.564745049938671e-05, "loss": 3.7887, "step": 3924 }, { "epoch": 1.0315630439780603, "grad_norm": 0.9517332911491394, "learning_rate": 6.562992815840196e-05, "loss": 3.8508, "step": 3926 }, { "epoch": 1.032088547311722, "grad_norm": 0.7458018064498901, "learning_rate": 6.561240581741721e-05, "loss": 3.8367, "step": 3928 }, { "epoch": 1.0326140506453838, "grad_norm": 0.8443341851234436, "learning_rate": 6.559488347643246e-05, "loss": 3.8844, "step": 3930 }, { "epoch": 1.0331395539790456, "grad_norm": 0.8137369751930237, "learning_rate": 6.55773611354477e-05, "loss": 3.8326, "step": 3932 }, { "epoch": 1.0336650573127073, "grad_norm": 0.8455327749252319, "learning_rate": 6.555983879446295e-05, "loss": 3.8747, "step": 3934 }, { "epoch": 1.034190560646369, "grad_norm": 0.9861891269683838, "learning_rate": 6.554231645347819e-05, "loss": 3.9066, "step": 3936 }, { "epoch": 1.0347160639800308, "grad_norm": 0.890238344669342, "learning_rate": 6.552479411249343e-05, "loss": 3.8023, "step": 3938 }, { "epoch": 1.0352415673136925, "grad_norm": 0.8128176331520081, "learning_rate": 6.550727177150867e-05, "loss": 3.85, "step": 3940 }, { "epoch": 1.0357670706473545, "grad_norm": 0.941335141658783, "learning_rate": 6.548974943052391e-05, "loss": 3.8244, "step": 3942 }, { "epoch": 1.0362925739810163, "grad_norm": 0.98939448595047, "learning_rate": 6.547222708953916e-05, "loss": 3.8751, "step": 3944 }, { "epoch": 1.036818077314678, "grad_norm": 0.9169302582740784, "learning_rate": 6.545470474855441e-05, "loss": 3.8788, "step": 3946 }, { "epoch": 1.0373435806483398, 
"grad_norm": 0.902184009552002, "learning_rate": 6.543718240756966e-05, "loss": 3.8368, "step": 3948 }, { "epoch": 1.0378690839820015, "grad_norm": 1.003171682357788, "learning_rate": 6.54196600665849e-05, "loss": 3.8348, "step": 3950 }, { "epoch": 1.0383945873156633, "grad_norm": 0.8802980780601501, "learning_rate": 6.540213772560014e-05, "loss": 3.899, "step": 3952 }, { "epoch": 1.038920090649325, "grad_norm": 0.8725629448890686, "learning_rate": 6.538461538461539e-05, "loss": 3.8443, "step": 3954 }, { "epoch": 1.0394455939829867, "grad_norm": 0.8711289763450623, "learning_rate": 6.536709304363063e-05, "loss": 3.8257, "step": 3956 }, { "epoch": 1.0399710973166485, "grad_norm": 0.8182446360588074, "learning_rate": 6.534957070264588e-05, "loss": 3.8247, "step": 3958 }, { "epoch": 1.0404966006503105, "grad_norm": 0.9122081398963928, "learning_rate": 6.533204836166113e-05, "loss": 3.8588, "step": 3960 }, { "epoch": 1.0410221039839722, "grad_norm": 0.8065180778503418, "learning_rate": 6.531452602067638e-05, "loss": 3.8632, "step": 3962 }, { "epoch": 1.041547607317634, "grad_norm": 0.8800660371780396, "learning_rate": 6.529700367969161e-05, "loss": 3.8164, "step": 3964 }, { "epoch": 1.0420731106512957, "grad_norm": 0.9254799485206604, "learning_rate": 6.527948133870684e-05, "loss": 3.8642, "step": 3966 }, { "epoch": 1.0425986139849575, "grad_norm": 0.966559648513794, "learning_rate": 6.526195899772209e-05, "loss": 3.8258, "step": 3968 }, { "epoch": 1.0431241173186192, "grad_norm": 0.8392562866210938, "learning_rate": 6.524443665673734e-05, "loss": 3.8541, "step": 3970 }, { "epoch": 1.043649620652281, "grad_norm": 1.0574942827224731, "learning_rate": 6.522691431575259e-05, "loss": 3.8331, "step": 3972 }, { "epoch": 1.0441751239859427, "grad_norm": 1.0327550172805786, "learning_rate": 6.520939197476783e-05, "loss": 3.843, "step": 3974 }, { "epoch": 1.0447006273196044, "grad_norm": 0.8649450540542603, "learning_rate": 6.519186963378308e-05, "loss": 3.8215, "step": 3976 }, 
{ "epoch": 1.0452261306532664, "grad_norm": 0.8453118801116943, "learning_rate": 6.517434729279832e-05, "loss": 3.8492, "step": 3978 }, { "epoch": 1.0457516339869282, "grad_norm": 0.9099166989326477, "learning_rate": 6.515682495181356e-05, "loss": 3.8405, "step": 3980 }, { "epoch": 1.04627713732059, "grad_norm": 0.8245223760604858, "learning_rate": 6.513930261082881e-05, "loss": 3.8334, "step": 3982 }, { "epoch": 1.0468026406542517, "grad_norm": 0.8605002164840698, "learning_rate": 6.512178026984406e-05, "loss": 3.8648, "step": 3984 }, { "epoch": 1.0473281439879134, "grad_norm": 0.8193170428276062, "learning_rate": 6.51042579288593e-05, "loss": 3.8563, "step": 3986 }, { "epoch": 1.0478536473215752, "grad_norm": 0.805557131767273, "learning_rate": 6.508673558787455e-05, "loss": 3.8448, "step": 3988 }, { "epoch": 1.048379150655237, "grad_norm": 0.8168925046920776, "learning_rate": 6.506921324688979e-05, "loss": 3.8715, "step": 3990 }, { "epoch": 1.0489046539888986, "grad_norm": 0.7736477255821228, "learning_rate": 6.505169090590502e-05, "loss": 3.8635, "step": 3992 }, { "epoch": 1.0494301573225606, "grad_norm": 0.902656614780426, "learning_rate": 6.503416856492027e-05, "loss": 3.872, "step": 3994 }, { "epoch": 1.0499556606562224, "grad_norm": 0.9031539559364319, "learning_rate": 6.501664622393552e-05, "loss": 3.8381, "step": 3996 }, { "epoch": 1.0504811639898841, "grad_norm": 0.9271982312202454, "learning_rate": 6.499912388295076e-05, "loss": 3.817, "step": 3998 }, { "epoch": 1.0510066673235459, "grad_norm": 0.8856391310691833, "learning_rate": 6.498160154196601e-05, "loss": 3.8599, "step": 4000 }, { "epoch": 1.0510066673235459, "eval_loss": 3.8426928520202637, "eval_runtime": 464.8442, "eval_samples_per_second": 262.0, "eval_steps_per_second": 8.188, "step": 4000 }, { "epoch": 1.0515321706572076, "grad_norm": 0.9939782619476318, "learning_rate": 6.496407920098126e-05, "loss": 3.8143, "step": 4002 }, { "epoch": 1.0520576739908694, "grad_norm": 0.8333626389503479, 
"learning_rate": 6.49465568599965e-05, "loss": 3.8547, "step": 4004 }, { "epoch": 1.052583177324531, "grad_norm": 0.977675199508667, "learning_rate": 6.492903451901174e-05, "loss": 3.8804, "step": 4006 }, { "epoch": 1.0531086806581929, "grad_norm": 0.895599901676178, "learning_rate": 6.491151217802699e-05, "loss": 3.8225, "step": 4008 }, { "epoch": 1.0536341839918546, "grad_norm": 1.0155105590820312, "learning_rate": 6.489398983704224e-05, "loss": 3.8978, "step": 4010 }, { "epoch": 1.0541596873255166, "grad_norm": 0.9863442778587341, "learning_rate": 6.487646749605748e-05, "loss": 3.795, "step": 4012 }, { "epoch": 1.0546851906591783, "grad_norm": 0.7992003560066223, "learning_rate": 6.485894515507273e-05, "loss": 3.8246, "step": 4014 }, { "epoch": 1.05521069399284, "grad_norm": 0.7765113711357117, "learning_rate": 6.484142281408797e-05, "loss": 3.8659, "step": 4016 }, { "epoch": 1.0557361973265018, "grad_norm": 0.9398106932640076, "learning_rate": 6.48239004731032e-05, "loss": 3.8669, "step": 4018 }, { "epoch": 1.0562617006601636, "grad_norm": 0.8791581392288208, "learning_rate": 6.480637813211845e-05, "loss": 3.8063, "step": 4020 }, { "epoch": 1.0567872039938253, "grad_norm": 0.8028295040130615, "learning_rate": 6.47888557911337e-05, "loss": 3.7787, "step": 4022 }, { "epoch": 1.057312707327487, "grad_norm": 0.8878529667854309, "learning_rate": 6.477133345014894e-05, "loss": 3.8707, "step": 4024 }, { "epoch": 1.0578382106611488, "grad_norm": 1.0250942707061768, "learning_rate": 6.475381110916419e-05, "loss": 3.8501, "step": 4026 }, { "epoch": 1.0583637139948106, "grad_norm": 0.9061123728752136, "learning_rate": 6.473628876817944e-05, "loss": 3.8601, "step": 4028 }, { "epoch": 1.0588892173284725, "grad_norm": 0.8102031350135803, "learning_rate": 6.471876642719467e-05, "loss": 3.8807, "step": 4030 }, { "epoch": 1.0594147206621343, "grad_norm": 0.8829103112220764, "learning_rate": 6.470124408620992e-05, "loss": 3.8242, "step": 4032 }, { "epoch": 1.059940223995796, 
"grad_norm": 0.890247642993927, "learning_rate": 6.468372174522517e-05, "loss": 3.811, "step": 4034 }, { "epoch": 1.0604657273294578, "grad_norm": 0.8437374830245972, "learning_rate": 6.466619940424041e-05, "loss": 3.8326, "step": 4036 }, { "epoch": 1.0609912306631195, "grad_norm": 0.9033440351486206, "learning_rate": 6.464867706325566e-05, "loss": 3.8717, "step": 4038 }, { "epoch": 1.0615167339967813, "grad_norm": 0.8358924984931946, "learning_rate": 6.463115472227091e-05, "loss": 3.8506, "step": 4040 }, { "epoch": 1.062042237330443, "grad_norm": 0.9294697046279907, "learning_rate": 6.461363238128614e-05, "loss": 3.8911, "step": 4042 }, { "epoch": 1.0625677406641048, "grad_norm": 0.7996838688850403, "learning_rate": 6.459611004030138e-05, "loss": 3.8126, "step": 4044 }, { "epoch": 1.0630932439977667, "grad_norm": 0.9733649492263794, "learning_rate": 6.457858769931663e-05, "loss": 3.8388, "step": 4046 }, { "epoch": 1.0636187473314285, "grad_norm": 1.039107084274292, "learning_rate": 6.456106535833187e-05, "loss": 3.8284, "step": 4048 }, { "epoch": 1.0641442506650902, "grad_norm": 0.8756958842277527, "learning_rate": 6.454354301734712e-05, "loss": 3.9218, "step": 4050 }, { "epoch": 1.064669753998752, "grad_norm": 0.9694856405258179, "learning_rate": 6.452602067636237e-05, "loss": 3.8674, "step": 4052 }, { "epoch": 1.0651952573324137, "grad_norm": 0.8886772990226746, "learning_rate": 6.450849833537762e-05, "loss": 3.8201, "step": 4054 }, { "epoch": 1.0657207606660755, "grad_norm": 0.8049248456954956, "learning_rate": 6.449097599439285e-05, "loss": 3.8816, "step": 4056 }, { "epoch": 1.0662462639997372, "grad_norm": 0.9415490031242371, "learning_rate": 6.44734536534081e-05, "loss": 3.8678, "step": 4058 }, { "epoch": 1.066771767333399, "grad_norm": 0.8531544804573059, "learning_rate": 6.445593131242334e-05, "loss": 3.8432, "step": 4060 }, { "epoch": 1.0672972706670607, "grad_norm": 0.8535398244857788, "learning_rate": 6.443840897143859e-05, "loss": 3.8306, "step": 4062 
}, { "epoch": 1.0678227740007227, "grad_norm": 0.8916287422180176, "learning_rate": 6.442088663045384e-05, "loss": 3.8904, "step": 4064 }, { "epoch": 1.0683482773343844, "grad_norm": 0.9007816314697266, "learning_rate": 6.440336428946909e-05, "loss": 3.841, "step": 4066 }, { "epoch": 1.0688737806680462, "grad_norm": 0.9287604689598083, "learning_rate": 6.438584194848432e-05, "loss": 3.8248, "step": 4068 }, { "epoch": 1.069399284001708, "grad_norm": 0.8469907641410828, "learning_rate": 6.436831960749956e-05, "loss": 3.832, "step": 4070 }, { "epoch": 1.0699247873353697, "grad_norm": 0.92658931016922, "learning_rate": 6.43507972665148e-05, "loss": 3.8475, "step": 4072 }, { "epoch": 1.0704502906690314, "grad_norm": 0.7670734524726868, "learning_rate": 6.433327492553005e-05, "loss": 3.7983, "step": 4074 }, { "epoch": 1.0709757940026932, "grad_norm": 0.8031631112098694, "learning_rate": 6.43157525845453e-05, "loss": 3.8062, "step": 4076 }, { "epoch": 1.071501297336355, "grad_norm": 0.9783763289451599, "learning_rate": 6.429823024356055e-05, "loss": 3.8436, "step": 4078 }, { "epoch": 1.0720268006700167, "grad_norm": 0.9198837876319885, "learning_rate": 6.428070790257579e-05, "loss": 3.829, "step": 4080 }, { "epoch": 1.0725523040036786, "grad_norm": 0.9614832401275635, "learning_rate": 6.426318556159103e-05, "loss": 3.8238, "step": 4082 }, { "epoch": 1.0730778073373404, "grad_norm": 0.8214021921157837, "learning_rate": 6.424566322060627e-05, "loss": 3.8512, "step": 4084 }, { "epoch": 1.0736033106710021, "grad_norm": 0.7459424734115601, "learning_rate": 6.422814087962152e-05, "loss": 3.8469, "step": 4086 }, { "epoch": 1.0741288140046639, "grad_norm": 0.7654638886451721, "learning_rate": 6.421061853863677e-05, "loss": 3.7965, "step": 4088 }, { "epoch": 1.0746543173383256, "grad_norm": 0.8567413091659546, "learning_rate": 6.419309619765202e-05, "loss": 3.848, "step": 4090 }, { "epoch": 1.0751798206719874, "grad_norm": 0.7573654651641846, "learning_rate": 
6.417557385666726e-05, "loss": 3.8564, "step": 4092 }, { "epoch": 1.075705324005649, "grad_norm": 0.9066358804702759, "learning_rate": 6.41580515156825e-05, "loss": 3.8254, "step": 4094 }, { "epoch": 1.0762308273393109, "grad_norm": 0.8160527944564819, "learning_rate": 6.414052917469773e-05, "loss": 3.8339, "step": 4096 }, { "epoch": 1.0767563306729726, "grad_norm": 0.762395441532135, "learning_rate": 6.412300683371298e-05, "loss": 3.7905, "step": 4098 }, { "epoch": 1.0772818340066346, "grad_norm": 0.8505592942237854, "learning_rate": 6.410548449272823e-05, "loss": 3.8601, "step": 4100 }, { "epoch": 1.0778073373402963, "grad_norm": 0.8492096662521362, "learning_rate": 6.408796215174348e-05, "loss": 3.8326, "step": 4102 }, { "epoch": 1.078332840673958, "grad_norm": 0.8845439553260803, "learning_rate": 6.407043981075872e-05, "loss": 3.8599, "step": 4104 }, { "epoch": 1.0788583440076198, "grad_norm": 0.7818361520767212, "learning_rate": 6.405291746977397e-05, "loss": 3.8053, "step": 4106 }, { "epoch": 1.0793838473412816, "grad_norm": 0.7327656149864197, "learning_rate": 6.40353951287892e-05, "loss": 3.8627, "step": 4108 }, { "epoch": 1.0799093506749433, "grad_norm": 0.9555023312568665, "learning_rate": 6.401787278780445e-05, "loss": 3.8763, "step": 4110 }, { "epoch": 1.080434854008605, "grad_norm": 0.8695909380912781, "learning_rate": 6.40003504468197e-05, "loss": 3.8555, "step": 4112 }, { "epoch": 1.0809603573422668, "grad_norm": 0.8913126587867737, "learning_rate": 6.398282810583495e-05, "loss": 3.8289, "step": 4114 }, { "epoch": 1.0814858606759286, "grad_norm": 0.8994353413581848, "learning_rate": 6.39653057648502e-05, "loss": 3.8294, "step": 4116 }, { "epoch": 1.0820113640095905, "grad_norm": 0.9057719111442566, "learning_rate": 6.394778342386543e-05, "loss": 3.9014, "step": 4118 }, { "epoch": 1.0825368673432523, "grad_norm": 0.7565765976905823, "learning_rate": 6.393026108288068e-05, "loss": 3.8061, "step": 4120 }, { "epoch": 1.083062370676914, "grad_norm": 
0.8439077138900757, "learning_rate": 6.391273874189591e-05, "loss": 3.8239, "step": 4122 }, { "epoch": 1.0835878740105758, "grad_norm": 0.9600284695625305, "learning_rate": 6.389521640091116e-05, "loss": 3.8233, "step": 4124 }, { "epoch": 1.0841133773442375, "grad_norm": 0.8094576001167297, "learning_rate": 6.38776940599264e-05, "loss": 3.8166, "step": 4126 }, { "epoch": 1.0846388806778993, "grad_norm": 0.8222635984420776, "learning_rate": 6.386017171894165e-05, "loss": 3.8765, "step": 4128 }, { "epoch": 1.085164384011561, "grad_norm": 0.7892642617225647, "learning_rate": 6.38426493779569e-05, "loss": 3.7781, "step": 4130 }, { "epoch": 1.0856898873452228, "grad_norm": 0.8776226043701172, "learning_rate": 6.382512703697215e-05, "loss": 3.8506, "step": 4132 }, { "epoch": 1.0862153906788845, "grad_norm": 0.7815094590187073, "learning_rate": 6.380760469598738e-05, "loss": 3.8719, "step": 4134 }, { "epoch": 1.0867408940125465, "grad_norm": 0.8802372217178345, "learning_rate": 6.379008235500263e-05, "loss": 3.8371, "step": 4136 }, { "epoch": 1.0872663973462082, "grad_norm": 0.9378541111946106, "learning_rate": 6.377256001401788e-05, "loss": 3.8219, "step": 4138 }, { "epoch": 1.08779190067987, "grad_norm": 0.8962987065315247, "learning_rate": 6.375503767303312e-05, "loss": 3.8422, "step": 4140 }, { "epoch": 1.0883174040135317, "grad_norm": 0.9124652743339539, "learning_rate": 6.373751533204837e-05, "loss": 3.8756, "step": 4142 }, { "epoch": 1.0888429073471935, "grad_norm": 0.8171470165252686, "learning_rate": 6.37199929910636e-05, "loss": 3.8224, "step": 4144 }, { "epoch": 1.0893684106808552, "grad_norm": 0.812147855758667, "learning_rate": 6.370247065007885e-05, "loss": 3.818, "step": 4146 }, { "epoch": 1.089893914014517, "grad_norm": 0.8782671093940735, "learning_rate": 6.368494830909409e-05, "loss": 3.844, "step": 4148 }, { "epoch": 1.0904194173481787, "grad_norm": 1.0579992532730103, "learning_rate": 6.366742596810934e-05, "loss": 3.8555, "step": 4150 }, { "epoch": 
1.0909449206818405, "grad_norm": 0.8519566059112549, "learning_rate": 6.364990362712458e-05, "loss": 3.8382, "step": 4152 }, { "epoch": 1.0914704240155024, "grad_norm": 0.8776350617408752, "learning_rate": 6.363238128613983e-05, "loss": 3.8449, "step": 4154 }, { "epoch": 1.0919959273491642, "grad_norm": 0.9119934439659119, "learning_rate": 6.361485894515508e-05, "loss": 3.8376, "step": 4156 }, { "epoch": 1.092521430682826, "grad_norm": 0.8797875642776489, "learning_rate": 6.359733660417033e-05, "loss": 3.8327, "step": 4158 }, { "epoch": 1.0930469340164877, "grad_norm": 0.7698355317115784, "learning_rate": 6.357981426318556e-05, "loss": 3.8082, "step": 4160 }, { "epoch": 1.0935724373501494, "grad_norm": 0.7504047155380249, "learning_rate": 6.356229192220081e-05, "loss": 3.8256, "step": 4162 }, { "epoch": 1.0940979406838112, "grad_norm": 0.8217814564704895, "learning_rate": 6.354476958121606e-05, "loss": 3.819, "step": 4164 }, { "epoch": 1.094623444017473, "grad_norm": 0.8430846929550171, "learning_rate": 6.35272472402313e-05, "loss": 3.8529, "step": 4166 }, { "epoch": 1.0951489473511347, "grad_norm": 0.7589781284332275, "learning_rate": 6.350972489924655e-05, "loss": 3.845, "step": 4168 }, { "epoch": 1.0956744506847966, "grad_norm": 0.9010476469993591, "learning_rate": 6.349220255826178e-05, "loss": 3.8493, "step": 4170 }, { "epoch": 1.0961999540184584, "grad_norm": 0.7424849271774292, "learning_rate": 6.347468021727703e-05, "loss": 3.8074, "step": 4172 }, { "epoch": 1.0967254573521201, "grad_norm": 0.8957710266113281, "learning_rate": 6.345715787629227e-05, "loss": 3.8392, "step": 4174 }, { "epoch": 1.0972509606857819, "grad_norm": 0.8367264270782471, "learning_rate": 6.343963553530751e-05, "loss": 3.8441, "step": 4176 }, { "epoch": 1.0977764640194436, "grad_norm": 0.7793827056884766, "learning_rate": 6.342211319432276e-05, "loss": 3.8944, "step": 4178 }, { "epoch": 1.0983019673531054, "grad_norm": 0.8232257962226868, "learning_rate": 6.340459085333801e-05, "loss": 
3.8715, "step": 4180 }, { "epoch": 1.0988274706867671, "grad_norm": 0.7441130876541138, "learning_rate": 6.338706851235326e-05, "loss": 3.863, "step": 4182 }, { "epoch": 1.0993529740204289, "grad_norm": 0.8245807886123657, "learning_rate": 6.33695461713685e-05, "loss": 3.8568, "step": 4184 }, { "epoch": 1.0998784773540906, "grad_norm": 0.8235321044921875, "learning_rate": 6.335202383038374e-05, "loss": 3.8773, "step": 4186 }, { "epoch": 1.1004039806877526, "grad_norm": 0.7852765917778015, "learning_rate": 6.333450148939899e-05, "loss": 3.8489, "step": 4188 }, { "epoch": 1.1009294840214143, "grad_norm": 0.9607143402099609, "learning_rate": 6.331697914841423e-05, "loss": 3.912, "step": 4190 }, { "epoch": 1.101454987355076, "grad_norm": 0.9211600422859192, "learning_rate": 6.329945680742948e-05, "loss": 3.8655, "step": 4192 }, { "epoch": 1.1019804906887378, "grad_norm": 0.9310218691825867, "learning_rate": 6.328193446644473e-05, "loss": 3.8122, "step": 4194 }, { "epoch": 1.1025059940223996, "grad_norm": 0.918931245803833, "learning_rate": 6.326441212545996e-05, "loss": 3.7672, "step": 4196 }, { "epoch": 1.1030314973560613, "grad_norm": 0.8389474153518677, "learning_rate": 6.324688978447521e-05, "loss": 3.8836, "step": 4198 }, { "epoch": 1.103557000689723, "grad_norm": 0.9582043290138245, "learning_rate": 6.322936744349046e-05, "loss": 3.8169, "step": 4200 }, { "epoch": 1.1040825040233848, "grad_norm": 0.8230003118515015, "learning_rate": 6.321184510250569e-05, "loss": 3.8615, "step": 4202 }, { "epoch": 1.1046080073570468, "grad_norm": 0.7851263880729675, "learning_rate": 6.319432276152094e-05, "loss": 3.8574, "step": 4204 }, { "epoch": 1.1051335106907085, "grad_norm": 0.8246819972991943, "learning_rate": 6.317680042053619e-05, "loss": 3.8442, "step": 4206 }, { "epoch": 1.1056590140243703, "grad_norm": 0.8993629217147827, "learning_rate": 6.315927807955143e-05, "loss": 3.8243, "step": 4208 }, { "epoch": 1.106184517358032, "grad_norm": 0.8947849869728088, 
"learning_rate": 6.314175573856668e-05, "loss": 3.8724, "step": 4210 }, { "epoch": 1.1067100206916938, "grad_norm": 0.8660596013069153, "learning_rate": 6.312423339758193e-05, "loss": 3.8276, "step": 4212 }, { "epoch": 1.1072355240253555, "grad_norm": 0.8686069250106812, "learning_rate": 6.310671105659716e-05, "loss": 3.8274, "step": 4214 }, { "epoch": 1.1077610273590173, "grad_norm": 0.8472962379455566, "learning_rate": 6.308918871561241e-05, "loss": 3.8574, "step": 4216 }, { "epoch": 1.108286530692679, "grad_norm": 0.9575037360191345, "learning_rate": 6.307166637462766e-05, "loss": 3.8601, "step": 4218 }, { "epoch": 1.1088120340263408, "grad_norm": 0.8052129745483398, "learning_rate": 6.305414403364289e-05, "loss": 3.869, "step": 4220 }, { "epoch": 1.1093375373600027, "grad_norm": 0.8038951754570007, "learning_rate": 6.303662169265814e-05, "loss": 3.8336, "step": 4222 }, { "epoch": 1.1098630406936645, "grad_norm": 0.8999599814414978, "learning_rate": 6.301909935167339e-05, "loss": 3.8245, "step": 4224 }, { "epoch": 1.1103885440273262, "grad_norm": 0.9106491208076477, "learning_rate": 6.300157701068863e-05, "loss": 3.8229, "step": 4226 }, { "epoch": 1.110914047360988, "grad_norm": 0.911315381526947, "learning_rate": 6.298405466970387e-05, "loss": 3.8604, "step": 4228 }, { "epoch": 1.1114395506946497, "grad_norm": 0.8988795876502991, "learning_rate": 6.296653232871912e-05, "loss": 3.8925, "step": 4230 }, { "epoch": 1.1119650540283115, "grad_norm": 0.8902851939201355, "learning_rate": 6.294900998773436e-05, "loss": 3.9004, "step": 4232 }, { "epoch": 1.1124905573619732, "grad_norm": 0.8081946969032288, "learning_rate": 6.293148764674961e-05, "loss": 3.7783, "step": 4234 }, { "epoch": 1.113016060695635, "grad_norm": 0.8276877999305725, "learning_rate": 6.291396530576486e-05, "loss": 3.8688, "step": 4236 }, { "epoch": 1.1135415640292967, "grad_norm": 0.8183386325836182, "learning_rate": 6.28964429647801e-05, "loss": 3.8685, "step": 4238 }, { "epoch": 
1.1140670673629587, "grad_norm": 0.7076096534729004, "learning_rate": 6.287892062379534e-05, "loss": 3.8718, "step": 4240 }, { "epoch": 1.1145925706966204, "grad_norm": 0.9460376501083374, "learning_rate": 6.286139828281059e-05, "loss": 3.8547, "step": 4242 }, { "epoch": 1.1151180740302822, "grad_norm": 0.9228415489196777, "learning_rate": 6.284387594182584e-05, "loss": 3.8555, "step": 4244 }, { "epoch": 1.115643577363944, "grad_norm": 0.8436475396156311, "learning_rate": 6.282635360084107e-05, "loss": 3.8074, "step": 4246 }, { "epoch": 1.1161690806976057, "grad_norm": 0.848069965839386, "learning_rate": 6.280883125985632e-05, "loss": 3.8208, "step": 4248 }, { "epoch": 1.1166945840312674, "grad_norm": 0.8181529641151428, "learning_rate": 6.279130891887156e-05, "loss": 3.8424, "step": 4250 }, { "epoch": 1.1172200873649292, "grad_norm": 0.8878666758537292, "learning_rate": 6.277378657788681e-05, "loss": 3.8621, "step": 4252 }, { "epoch": 1.117745590698591, "grad_norm": 0.7899048924446106, "learning_rate": 6.275626423690205e-05, "loss": 3.8294, "step": 4254 }, { "epoch": 1.1182710940322527, "grad_norm": 0.8600915670394897, "learning_rate": 6.27387418959173e-05, "loss": 3.8802, "step": 4256 }, { "epoch": 1.1187965973659146, "grad_norm": 0.9167618751525879, "learning_rate": 6.272121955493254e-05, "loss": 3.8791, "step": 4258 }, { "epoch": 1.1193221006995764, "grad_norm": 0.8310935497283936, "learning_rate": 6.270369721394779e-05, "loss": 3.8695, "step": 4260 }, { "epoch": 1.1198476040332381, "grad_norm": 0.9307309985160828, "learning_rate": 6.268617487296304e-05, "loss": 3.854, "step": 4262 }, { "epoch": 1.1203731073668999, "grad_norm": 0.7989665865898132, "learning_rate": 6.266865253197828e-05, "loss": 3.8844, "step": 4264 }, { "epoch": 1.1208986107005616, "grad_norm": 0.8966204524040222, "learning_rate": 6.265113019099352e-05, "loss": 3.8115, "step": 4266 }, { "epoch": 1.1214241140342234, "grad_norm": 0.8263187408447266, "learning_rate": 6.263360785000877e-05, "loss": 
3.8327, "step": 4268 }, { "epoch": 1.1219496173678851, "grad_norm": 0.8118863701820374, "learning_rate": 6.261608550902401e-05, "loss": 3.839, "step": 4270 }, { "epoch": 1.1224751207015469, "grad_norm": 0.7685942649841309, "learning_rate": 6.259856316803925e-05, "loss": 3.8925, "step": 4272 }, { "epoch": 1.1230006240352086, "grad_norm": 0.7944725155830383, "learning_rate": 6.25810408270545e-05, "loss": 3.7566, "step": 4274 }, { "epoch": 1.1235261273688706, "grad_norm": 0.8328652381896973, "learning_rate": 6.256351848606974e-05, "loss": 3.8217, "step": 4276 }, { "epoch": 1.1240516307025323, "grad_norm": 0.9219997525215149, "learning_rate": 6.254599614508499e-05, "loss": 3.8389, "step": 4278 }, { "epoch": 1.124577134036194, "grad_norm": 0.8672502636909485, "learning_rate": 6.252847380410022e-05, "loss": 3.8458, "step": 4280 }, { "epoch": 1.1251026373698558, "grad_norm": 0.9215973615646362, "learning_rate": 6.251095146311547e-05, "loss": 3.8496, "step": 4282 }, { "epoch": 1.1256281407035176, "grad_norm": 0.8999152779579163, "learning_rate": 6.249342912213072e-05, "loss": 3.8295, "step": 4284 }, { "epoch": 1.1261536440371793, "grad_norm": 0.8210328221321106, "learning_rate": 6.247590678114597e-05, "loss": 3.8457, "step": 4286 }, { "epoch": 1.126679147370841, "grad_norm": 0.8853736519813538, "learning_rate": 6.245838444016121e-05, "loss": 3.8409, "step": 4288 }, { "epoch": 1.1272046507045028, "grad_norm": 0.8199600577354431, "learning_rate": 6.244086209917646e-05, "loss": 3.8631, "step": 4290 }, { "epoch": 1.1277301540381646, "grad_norm": 0.979916512966156, "learning_rate": 6.24233397581917e-05, "loss": 3.7928, "step": 4292 }, { "epoch": 1.1282556573718265, "grad_norm": 0.9016053676605225, "learning_rate": 6.240581741720694e-05, "loss": 3.8497, "step": 4294 }, { "epoch": 1.1287811607054883, "grad_norm": 0.8602111339569092, "learning_rate": 6.238829507622218e-05, "loss": 3.8075, "step": 4296 }, { "epoch": 1.12930666403915, "grad_norm": 0.8108006715774536, 
"learning_rate": 6.237077273523742e-05, "loss": 3.8135, "step": 4298 }, { "epoch": 1.1298321673728118, "grad_norm": 0.7535247206687927, "learning_rate": 6.235325039425267e-05, "loss": 3.8392, "step": 4300 }, { "epoch": 1.1303576707064735, "grad_norm": 0.9031094312667847, "learning_rate": 6.233572805326792e-05, "loss": 3.8316, "step": 4302 }, { "epoch": 1.1308831740401353, "grad_norm": 0.9947889447212219, "learning_rate": 6.231820571228317e-05, "loss": 3.796, "step": 4304 }, { "epoch": 1.131408677373797, "grad_norm": 0.820604681968689, "learning_rate": 6.23006833712984e-05, "loss": 3.8823, "step": 4306 }, { "epoch": 1.1319341807074588, "grad_norm": 0.8597801327705383, "learning_rate": 6.228316103031365e-05, "loss": 3.8264, "step": 4308 }, { "epoch": 1.1324596840411205, "grad_norm": 0.8266255855560303, "learning_rate": 6.22656386893289e-05, "loss": 3.8379, "step": 4310 }, { "epoch": 1.1329851873747825, "grad_norm": 0.8989235758781433, "learning_rate": 6.224811634834414e-05, "loss": 3.7949, "step": 4312 }, { "epoch": 1.1335106907084442, "grad_norm": 0.8044531941413879, "learning_rate": 6.223059400735939e-05, "loss": 3.8241, "step": 4314 }, { "epoch": 1.134036194042106, "grad_norm": 0.8895105719566345, "learning_rate": 6.221307166637464e-05, "loss": 3.8643, "step": 4316 }, { "epoch": 1.1345616973757677, "grad_norm": 0.9155497550964355, "learning_rate": 6.219554932538987e-05, "loss": 3.8074, "step": 4318 }, { "epoch": 1.1350872007094295, "grad_norm": 0.8126175999641418, "learning_rate": 6.217802698440512e-05, "loss": 3.8154, "step": 4320 }, { "epoch": 1.1356127040430912, "grad_norm": 0.8092813491821289, "learning_rate": 6.216050464342036e-05, "loss": 3.8982, "step": 4322 }, { "epoch": 1.136138207376753, "grad_norm": 0.8003079295158386, "learning_rate": 6.21429823024356e-05, "loss": 3.8155, "step": 4324 }, { "epoch": 1.136663710710415, "grad_norm": 0.8982368111610413, "learning_rate": 6.212545996145085e-05, "loss": 3.8645, "step": 4326 }, { "epoch": 1.1371892140440765, 
"grad_norm": 0.846672773361206, "learning_rate": 6.21079376204661e-05, "loss": 3.826, "step": 4328 }, { "epoch": 1.1377147173777384, "grad_norm": 0.7611628770828247, "learning_rate": 6.209041527948135e-05, "loss": 3.8319, "step": 4330 }, { "epoch": 1.1382402207114002, "grad_norm": 0.8182021379470825, "learning_rate": 6.207289293849658e-05, "loss": 3.8584, "step": 4332 }, { "epoch": 1.138765724045062, "grad_norm": 0.9516531229019165, "learning_rate": 6.205537059751183e-05, "loss": 3.8194, "step": 4334 }, { "epoch": 1.1392912273787237, "grad_norm": 0.8329264521598816, "learning_rate": 6.203784825652707e-05, "loss": 3.8458, "step": 4336 }, { "epoch": 1.1398167307123854, "grad_norm": 0.7663933038711548, "learning_rate": 6.202032591554232e-05, "loss": 3.7954, "step": 4338 }, { "epoch": 1.1403422340460472, "grad_norm": 0.7843668460845947, "learning_rate": 6.200280357455757e-05, "loss": 3.8495, "step": 4340 }, { "epoch": 1.140867737379709, "grad_norm": 0.8237520456314087, "learning_rate": 6.198528123357282e-05, "loss": 3.8169, "step": 4342 }, { "epoch": 1.141393240713371, "grad_norm": 0.8381744027137756, "learning_rate": 6.196775889258805e-05, "loss": 3.8384, "step": 4344 }, { "epoch": 1.1419187440470326, "grad_norm": 0.8540369272232056, "learning_rate": 6.19502365516033e-05, "loss": 3.805, "step": 4346 }, { "epoch": 1.1424442473806944, "grad_norm": 0.8862741589546204, "learning_rate": 6.193271421061853e-05, "loss": 3.8725, "step": 4348 }, { "epoch": 1.1429697507143561, "grad_norm": 0.7567266821861267, "learning_rate": 6.191519186963378e-05, "loss": 3.8526, "step": 4350 }, { "epoch": 1.1434952540480179, "grad_norm": 0.7865324020385742, "learning_rate": 6.189766952864903e-05, "loss": 3.8177, "step": 4352 }, { "epoch": 1.1440207573816796, "grad_norm": 0.7979964017868042, "learning_rate": 6.188014718766428e-05, "loss": 3.7927, "step": 4354 }, { "epoch": 1.1445462607153414, "grad_norm": 0.8127397894859314, "learning_rate": 6.186262484667952e-05, "loss": 3.8138, "step": 4356 
}, { "epoch": 1.1450717640490031, "grad_norm": 0.8036642670631409, "learning_rate": 6.184510250569476e-05, "loss": 3.8134, "step": 4358 }, { "epoch": 1.1455972673826649, "grad_norm": 0.865053117275238, "learning_rate": 6.182758016471e-05, "loss": 3.8624, "step": 4360 }, { "epoch": 1.1461227707163268, "grad_norm": 0.7527674436569214, "learning_rate": 6.181005782372525e-05, "loss": 3.7867, "step": 4362 }, { "epoch": 1.1466482740499886, "grad_norm": 0.8269213438034058, "learning_rate": 6.17925354827405e-05, "loss": 3.8352, "step": 4364 }, { "epoch": 1.1471737773836503, "grad_norm": 0.924232006072998, "learning_rate": 6.177501314175575e-05, "loss": 3.8401, "step": 4366 }, { "epoch": 1.147699280717312, "grad_norm": 0.8294438123703003, "learning_rate": 6.1757490800771e-05, "loss": 3.8377, "step": 4368 }, { "epoch": 1.1482247840509738, "grad_norm": 0.8743914365768433, "learning_rate": 6.173996845978623e-05, "loss": 3.8569, "step": 4370 }, { "epoch": 1.1487502873846356, "grad_norm": 0.8120692372322083, "learning_rate": 6.172244611880148e-05, "loss": 3.8347, "step": 4372 }, { "epoch": 1.1492757907182973, "grad_norm": 0.8651371598243713, "learning_rate": 6.170492377781671e-05, "loss": 3.8424, "step": 4374 }, { "epoch": 1.149801294051959, "grad_norm": 0.8301262855529785, "learning_rate": 6.168740143683196e-05, "loss": 3.8641, "step": 4376 }, { "epoch": 1.1503267973856208, "grad_norm": 0.8834009766578674, "learning_rate": 6.16698790958472e-05, "loss": 3.8435, "step": 4378 }, { "epoch": 1.1508523007192828, "grad_norm": 0.795360267162323, "learning_rate": 6.165235675486245e-05, "loss": 3.8322, "step": 4380 }, { "epoch": 1.1513778040529445, "grad_norm": 0.8103901743888855, "learning_rate": 6.16348344138777e-05, "loss": 3.836, "step": 4382 }, { "epoch": 1.1519033073866063, "grad_norm": 0.8875183463096619, "learning_rate": 6.161731207289293e-05, "loss": 3.8418, "step": 4384 }, { "epoch": 1.152428810720268, "grad_norm": 0.8239988088607788, "learning_rate": 6.159978973190818e-05, 
"loss": 3.8153, "step": 4386 }, { "epoch": 1.1529543140539298, "grad_norm": 0.8566784262657166, "learning_rate": 6.158226739092343e-05, "loss": 3.8316, "step": 4388 }, { "epoch": 1.1534798173875915, "grad_norm": 0.784461498260498, "learning_rate": 6.156474504993868e-05, "loss": 3.7871, "step": 4390 }, { "epoch": 1.1540053207212533, "grad_norm": 0.88035649061203, "learning_rate": 6.154722270895392e-05, "loss": 3.8041, "step": 4392 }, { "epoch": 1.154530824054915, "grad_norm": 0.8237919211387634, "learning_rate": 6.152970036796917e-05, "loss": 3.8221, "step": 4394 }, { "epoch": 1.1550563273885768, "grad_norm": 0.8261709809303284, "learning_rate": 6.15121780269844e-05, "loss": 3.7895, "step": 4396 }, { "epoch": 1.1555818307222387, "grad_norm": 0.8304365277290344, "learning_rate": 6.149465568599964e-05, "loss": 3.8431, "step": 4398 }, { "epoch": 1.1561073340559005, "grad_norm": 0.7977885007858276, "learning_rate": 6.147713334501489e-05, "loss": 3.8753, "step": 4400 }, { "epoch": 1.1561073340559005, "eval_loss": 3.8281021118164062, "eval_runtime": 464.6384, "eval_samples_per_second": 262.116, "eval_steps_per_second": 8.191, "step": 4400 }, { "epoch": 1.1566328373895622, "grad_norm": 0.8793357014656067, "learning_rate": 6.145961100403014e-05, "loss": 3.8398, "step": 4402 }, { "epoch": 1.157158340723224, "grad_norm": 0.7825198173522949, "learning_rate": 6.144208866304538e-05, "loss": 3.8635, "step": 4404 }, { "epoch": 1.1576838440568857, "grad_norm": 0.9685678482055664, "learning_rate": 6.142456632206063e-05, "loss": 3.8062, "step": 4406 }, { "epoch": 1.1582093473905475, "grad_norm": 0.8687357306480408, "learning_rate": 6.140704398107588e-05, "loss": 3.7651, "step": 4408 }, { "epoch": 1.1587348507242092, "grad_norm": 0.8410065770149231, "learning_rate": 6.138952164009111e-05, "loss": 3.8208, "step": 4410 }, { "epoch": 1.159260354057871, "grad_norm": 0.8195879459381104, "learning_rate": 6.137199929910636e-05, "loss": 3.8149, "step": 4412 }, { "epoch": 1.1597858573915327, 
"grad_norm": 0.8508980870246887, "learning_rate": 6.135447695812161e-05, "loss": 3.8665, "step": 4414 }, { "epoch": 1.1603113607251947, "grad_norm": 0.8329206109046936, "learning_rate": 6.133695461713686e-05, "loss": 3.9068, "step": 4416 }, { "epoch": 1.1608368640588564, "grad_norm": 0.8265892863273621, "learning_rate": 6.13194322761521e-05, "loss": 3.8456, "step": 4418 }, { "epoch": 1.1613623673925182, "grad_norm": 0.9034810066223145, "learning_rate": 6.130190993516735e-05, "loss": 3.796, "step": 4420 }, { "epoch": 1.16188787072618, "grad_norm": 0.8956938982009888, "learning_rate": 6.128438759418258e-05, "loss": 3.8589, "step": 4422 }, { "epoch": 1.1624133740598417, "grad_norm": 0.8347784280776978, "learning_rate": 6.126686525319782e-05, "loss": 3.8555, "step": 4424 }, { "epoch": 1.1629388773935034, "grad_norm": 0.8115684390068054, "learning_rate": 6.124934291221307e-05, "loss": 3.8223, "step": 4426 }, { "epoch": 1.1634643807271652, "grad_norm": 0.7731747031211853, "learning_rate": 6.123182057122831e-05, "loss": 3.8838, "step": 4428 }, { "epoch": 1.163989884060827, "grad_norm": 0.8618146777153015, "learning_rate": 6.121429823024356e-05, "loss": 3.8032, "step": 4430 }, { "epoch": 1.1645153873944887, "grad_norm": 0.8151043653488159, "learning_rate": 6.119677588925881e-05, "loss": 3.8038, "step": 4432 }, { "epoch": 1.1650408907281506, "grad_norm": 0.8538967370986938, "learning_rate": 6.117925354827406e-05, "loss": 3.828, "step": 4434 }, { "epoch": 1.1655663940618124, "grad_norm": 0.8989549279212952, "learning_rate": 6.116173120728929e-05, "loss": 3.7956, "step": 4436 }, { "epoch": 1.1660918973954741, "grad_norm": 0.8290076851844788, "learning_rate": 6.114420886630454e-05, "loss": 3.8262, "step": 4438 }, { "epoch": 1.166617400729136, "grad_norm": 0.8761680722236633, "learning_rate": 6.112668652531979e-05, "loss": 3.8247, "step": 4440 }, { "epoch": 1.1671429040627976, "grad_norm": 0.7954673767089844, "learning_rate": 6.110916418433503e-05, "loss": 3.8238, "step": 4442 
}, { "epoch": 1.1676684073964594, "grad_norm": 0.8560276627540588, "learning_rate": 6.109164184335028e-05, "loss": 3.8558, "step": 4444 }, { "epoch": 1.1681939107301211, "grad_norm": 0.822073221206665, "learning_rate": 6.107411950236553e-05, "loss": 3.805, "step": 4446 }, { "epoch": 1.1687194140637829, "grad_norm": 0.9220890402793884, "learning_rate": 6.105659716138076e-05, "loss": 3.858, "step": 4448 }, { "epoch": 1.1692449173974446, "grad_norm": 0.9192367792129517, "learning_rate": 6.103907482039601e-05, "loss": 3.8636, "step": 4450 }, { "epoch": 1.1697704207311066, "grad_norm": 0.9575397968292236, "learning_rate": 6.102155247941125e-05, "loss": 3.8352, "step": 4452 }, { "epoch": 1.1702959240647683, "grad_norm": 0.8960955142974854, "learning_rate": 6.100403013842649e-05, "loss": 3.8632, "step": 4454 }, { "epoch": 1.17082142739843, "grad_norm": 0.7400020956993103, "learning_rate": 6.098650779744174e-05, "loss": 3.8335, "step": 4456 }, { "epoch": 1.1713469307320918, "grad_norm": 0.876659631729126, "learning_rate": 6.0968985456456986e-05, "loss": 3.8365, "step": 4458 }, { "epoch": 1.1718724340657536, "grad_norm": 0.863422691822052, "learning_rate": 6.095146311547223e-05, "loss": 3.8496, "step": 4460 }, { "epoch": 1.1723979373994153, "grad_norm": 0.8732176423072815, "learning_rate": 6.0933940774487474e-05, "loss": 3.8194, "step": 4462 }, { "epoch": 1.172923440733077, "grad_norm": 0.8554118871688843, "learning_rate": 6.091641843350272e-05, "loss": 3.8614, "step": 4464 }, { "epoch": 1.1734489440667388, "grad_norm": 0.8475131988525391, "learning_rate": 6.089889609251796e-05, "loss": 3.7998, "step": 4466 }, { "epoch": 1.1739744474004006, "grad_norm": 0.8838691711425781, "learning_rate": 6.088137375153321e-05, "loss": 3.8374, "step": 4468 }, { "epoch": 1.1744999507340625, "grad_norm": 0.8078128695487976, "learning_rate": 6.086385141054846e-05, "loss": 3.8504, "step": 4470 }, { "epoch": 1.1750254540677243, "grad_norm": 0.8854727745056152, "learning_rate": 
6.08463290695637e-05, "loss": 3.8074, "step": 4472 }, { "epoch": 1.175550957401386, "grad_norm": 0.9233645796775818, "learning_rate": 6.0828806728578946e-05, "loss": 3.8387, "step": 4474 }, { "epoch": 1.1760764607350478, "grad_norm": 0.956678032875061, "learning_rate": 6.081128438759418e-05, "loss": 3.8459, "step": 4476 }, { "epoch": 1.1766019640687095, "grad_norm": 0.9831765294075012, "learning_rate": 6.079376204660943e-05, "loss": 3.8431, "step": 4478 }, { "epoch": 1.1771274674023713, "grad_norm": 0.8765532970428467, "learning_rate": 6.077623970562467e-05, "loss": 3.8454, "step": 4480 }, { "epoch": 1.177652970736033, "grad_norm": 0.7546501755714417, "learning_rate": 6.0758717364639916e-05, "loss": 3.7982, "step": 4482 }, { "epoch": 1.178178474069695, "grad_norm": 0.987852156162262, "learning_rate": 6.0741195023655164e-05, "loss": 3.8628, "step": 4484 }, { "epoch": 1.1787039774033565, "grad_norm": 0.8436736464500427, "learning_rate": 6.0723672682670405e-05, "loss": 3.8185, "step": 4486 }, { "epoch": 1.1792294807370185, "grad_norm": 0.9957793354988098, "learning_rate": 6.070615034168565e-05, "loss": 3.8158, "step": 4488 }, { "epoch": 1.1797549840706802, "grad_norm": 0.8174615502357483, "learning_rate": 6.06886280007009e-05, "loss": 3.8325, "step": 4490 }, { "epoch": 1.180280487404342, "grad_norm": 0.9613240361213684, "learning_rate": 6.067110565971614e-05, "loss": 3.7869, "step": 4492 }, { "epoch": 1.1808059907380037, "grad_norm": 0.8189112544059753, "learning_rate": 6.065358331873139e-05, "loss": 3.826, "step": 4494 }, { "epoch": 1.1813314940716655, "grad_norm": 0.8850697875022888, "learning_rate": 6.0636060977746636e-05, "loss": 3.7911, "step": 4496 }, { "epoch": 1.1818569974053272, "grad_norm": 0.7163515090942383, "learning_rate": 6.0618538636761876e-05, "loss": 3.8307, "step": 4498 }, { "epoch": 1.182382500738989, "grad_norm": 0.7237195372581482, "learning_rate": 6.060101629577711e-05, "loss": 3.8077, "step": 4500 }, { "epoch": 1.182908004072651, "grad_norm": 
0.7953745722770691, "learning_rate": 6.058349395479236e-05, "loss": 3.8606, "step": 4502 }, { "epoch": 1.1834335074063127, "grad_norm": 0.807701587677002, "learning_rate": 6.0565971613807606e-05, "loss": 3.847, "step": 4504 }, { "epoch": 1.1839590107399744, "grad_norm": 0.7601436376571655, "learning_rate": 6.0548449272822846e-05, "loss": 3.8355, "step": 4506 }, { "epoch": 1.1844845140736362, "grad_norm": 0.8282936811447144, "learning_rate": 6.0530926931838094e-05, "loss": 3.7965, "step": 4508 }, { "epoch": 1.185010017407298, "grad_norm": 0.7251214385032654, "learning_rate": 6.051340459085334e-05, "loss": 3.837, "step": 4510 }, { "epoch": 1.1855355207409597, "grad_norm": 0.8676434755325317, "learning_rate": 6.049588224986858e-05, "loss": 3.8209, "step": 4512 }, { "epoch": 1.1860610240746214, "grad_norm": 0.8372693657875061, "learning_rate": 6.047835990888383e-05, "loss": 3.8888, "step": 4514 }, { "epoch": 1.1865865274082832, "grad_norm": 0.8137038350105286, "learning_rate": 6.046083756789908e-05, "loss": 3.8347, "step": 4516 }, { "epoch": 1.187112030741945, "grad_norm": 0.8567807674407959, "learning_rate": 6.044331522691432e-05, "loss": 3.85, "step": 4518 }, { "epoch": 1.187637534075607, "grad_norm": 0.804301917552948, "learning_rate": 6.0425792885929566e-05, "loss": 3.824, "step": 4520 }, { "epoch": 1.1881630374092687, "grad_norm": 0.8487984538078308, "learning_rate": 6.040827054494481e-05, "loss": 3.8691, "step": 4522 }, { "epoch": 1.1886885407429304, "grad_norm": 0.9305403828620911, "learning_rate": 6.0390748203960054e-05, "loss": 3.8239, "step": 4524 }, { "epoch": 1.1892140440765921, "grad_norm": 0.8882975578308105, "learning_rate": 6.037322586297529e-05, "loss": 3.8581, "step": 4526 }, { "epoch": 1.189739547410254, "grad_norm": 0.8517704010009766, "learning_rate": 6.0355703521990536e-05, "loss": 3.8733, "step": 4528 }, { "epoch": 1.1902650507439156, "grad_norm": 0.8403952121734619, "learning_rate": 6.033818118100578e-05, "loss": 3.8078, "step": 4530 }, { 
"epoch": 1.1907905540775774, "grad_norm": 0.7444343566894531, "learning_rate": 6.0320658840021024e-05, "loss": 3.8386, "step": 4532 }, { "epoch": 1.1913160574112391, "grad_norm": 0.9642484784126282, "learning_rate": 6.030313649903627e-05, "loss": 3.8629, "step": 4534 }, { "epoch": 1.1918415607449009, "grad_norm": 0.9164651036262512, "learning_rate": 6.028561415805152e-05, "loss": 3.8159, "step": 4536 }, { "epoch": 1.1923670640785629, "grad_norm": 0.8565340638160706, "learning_rate": 6.026809181706676e-05, "loss": 3.8013, "step": 4538 }, { "epoch": 1.1928925674122246, "grad_norm": 0.8455927968025208, "learning_rate": 6.025056947608201e-05, "loss": 3.8279, "step": 4540 }, { "epoch": 1.1934180707458864, "grad_norm": 0.8433894515037537, "learning_rate": 6.0233047135097255e-05, "loss": 3.8471, "step": 4542 }, { "epoch": 1.193943574079548, "grad_norm": 0.7884666919708252, "learning_rate": 6.0215524794112496e-05, "loss": 3.8335, "step": 4544 }, { "epoch": 1.1944690774132098, "grad_norm": 0.8237798810005188, "learning_rate": 6.019800245312774e-05, "loss": 3.8422, "step": 4546 }, { "epoch": 1.1949945807468716, "grad_norm": 0.7411485314369202, "learning_rate": 6.018048011214299e-05, "loss": 3.7605, "step": 4548 }, { "epoch": 1.1955200840805333, "grad_norm": 0.995162844657898, "learning_rate": 6.016295777115823e-05, "loss": 3.8384, "step": 4550 }, { "epoch": 1.196045587414195, "grad_norm": 0.8390908241271973, "learning_rate": 6.0145435430173466e-05, "loss": 3.8305, "step": 4552 }, { "epoch": 1.1965710907478568, "grad_norm": 0.9400625228881836, "learning_rate": 6.012791308918871e-05, "loss": 3.898, "step": 4554 }, { "epoch": 1.1970965940815188, "grad_norm": 0.740731418132782, "learning_rate": 6.011039074820396e-05, "loss": 3.7635, "step": 4556 }, { "epoch": 1.1976220974151806, "grad_norm": 0.8050929307937622, "learning_rate": 6.00928684072192e-05, "loss": 3.8424, "step": 4558 }, { "epoch": 1.1981476007488423, "grad_norm": 0.831756591796875, "learning_rate": 
6.007534606623445e-05, "loss": 3.84, "step": 4560 }, { "epoch": 1.198673104082504, "grad_norm": 0.8968485593795776, "learning_rate": 6.00578237252497e-05, "loss": 3.8356, "step": 4562 }, { "epoch": 1.1991986074161658, "grad_norm": 0.780127227306366, "learning_rate": 6.004030138426494e-05, "loss": 3.861, "step": 4564 }, { "epoch": 1.1997241107498275, "grad_norm": 0.7984629273414612, "learning_rate": 6.0022779043280185e-05, "loss": 3.8682, "step": 4566 }, { "epoch": 1.2002496140834893, "grad_norm": 0.8548808693885803, "learning_rate": 6.000525670229543e-05, "loss": 3.8327, "step": 4568 }, { "epoch": 1.200775117417151, "grad_norm": 0.7696551084518433, "learning_rate": 5.9987734361310673e-05, "loss": 3.7867, "step": 4570 }, { "epoch": 1.2013006207508128, "grad_norm": 0.7880712747573853, "learning_rate": 5.997021202032592e-05, "loss": 3.8071, "step": 4572 }, { "epoch": 1.2018261240844748, "grad_norm": 0.8237267136573792, "learning_rate": 5.995268967934117e-05, "loss": 3.8334, "step": 4574 }, { "epoch": 1.2023516274181365, "grad_norm": 0.8293731808662415, "learning_rate": 5.993516733835641e-05, "loss": 3.8443, "step": 4576 }, { "epoch": 1.2028771307517983, "grad_norm": 0.8168049454689026, "learning_rate": 5.991764499737165e-05, "loss": 3.8333, "step": 4578 }, { "epoch": 1.20340263408546, "grad_norm": 0.8100857138633728, "learning_rate": 5.990012265638689e-05, "loss": 3.7969, "step": 4580 }, { "epoch": 1.2039281374191217, "grad_norm": 0.747393786907196, "learning_rate": 5.988260031540214e-05, "loss": 3.8605, "step": 4582 }, { "epoch": 1.2044536407527835, "grad_norm": 0.9425659775733948, "learning_rate": 5.9865077974417386e-05, "loss": 3.7909, "step": 4584 }, { "epoch": 1.2049791440864452, "grad_norm": 0.7911994457244873, "learning_rate": 5.984755563343263e-05, "loss": 3.862, "step": 4586 }, { "epoch": 1.205504647420107, "grad_norm": 0.80666583776474, "learning_rate": 5.9830033292447874e-05, "loss": 3.8212, "step": 4588 }, { "epoch": 1.2060301507537687, "grad_norm": 
0.8904913663864136, "learning_rate": 5.981251095146312e-05, "loss": 3.8815, "step": 4590 }, { "epoch": 1.2065556540874307, "grad_norm": 0.8009604811668396, "learning_rate": 5.979498861047836e-05, "loss": 3.8132, "step": 4592 }, { "epoch": 1.2070811574210925, "grad_norm": 0.78388911485672, "learning_rate": 5.977746626949361e-05, "loss": 3.8533, "step": 4594 }, { "epoch": 1.2076066607547542, "grad_norm": 0.8679212331771851, "learning_rate": 5.975994392850886e-05, "loss": 3.8337, "step": 4596 }, { "epoch": 1.208132164088416, "grad_norm": 0.9170918464660645, "learning_rate": 5.97424215875241e-05, "loss": 3.8893, "step": 4598 }, { "epoch": 1.2086576674220777, "grad_norm": 1.0380994081497192, "learning_rate": 5.9724899246539346e-05, "loss": 3.8264, "step": 4600 }, { "epoch": 1.2091831707557394, "grad_norm": 0.9447364807128906, "learning_rate": 5.970737690555458e-05, "loss": 3.7778, "step": 4602 }, { "epoch": 1.2097086740894012, "grad_norm": 0.7449600100517273, "learning_rate": 5.968985456456983e-05, "loss": 3.8278, "step": 4604 }, { "epoch": 1.210234177423063, "grad_norm": 0.781938374042511, "learning_rate": 5.967233222358507e-05, "loss": 3.8831, "step": 4606 }, { "epoch": 1.2107596807567247, "grad_norm": 0.8206570148468018, "learning_rate": 5.9654809882600316e-05, "loss": 3.8062, "step": 4608 }, { "epoch": 1.2112851840903867, "grad_norm": 0.881847620010376, "learning_rate": 5.9637287541615564e-05, "loss": 3.8237, "step": 4610 }, { "epoch": 1.2118106874240484, "grad_norm": 0.7928425073623657, "learning_rate": 5.9619765200630805e-05, "loss": 3.7865, "step": 4612 }, { "epoch": 1.2123361907577102, "grad_norm": 0.8957486152648926, "learning_rate": 5.960224285964605e-05, "loss": 3.7745, "step": 4614 }, { "epoch": 1.212861694091372, "grad_norm": 0.8255022168159485, "learning_rate": 5.95847205186613e-05, "loss": 3.8108, "step": 4616 }, { "epoch": 1.2133871974250336, "grad_norm": 0.7924718260765076, "learning_rate": 5.956719817767654e-05, "loss": 3.847, "step": 4618 }, { 
"epoch": 1.2139127007586954, "grad_norm": 0.813430905342102, "learning_rate": 5.954967583669179e-05, "loss": 3.7946, "step": 4620 }, { "epoch": 1.2144382040923571, "grad_norm": 0.7615558505058289, "learning_rate": 5.9532153495707036e-05, "loss": 3.7978, "step": 4622 }, { "epoch": 1.214963707426019, "grad_norm": 1.1061402559280396, "learning_rate": 5.9514631154722276e-05, "loss": 3.8221, "step": 4624 }, { "epoch": 1.2154892107596806, "grad_norm": 0.9035989046096802, "learning_rate": 5.9497108813737524e-05, "loss": 3.7553, "step": 4626 }, { "epoch": 1.2160147140933426, "grad_norm": 0.9477826952934265, "learning_rate": 5.947958647275276e-05, "loss": 3.8288, "step": 4628 }, { "epoch": 1.2165402174270044, "grad_norm": 0.8806244730949402, "learning_rate": 5.9462064131768005e-05, "loss": 3.8498, "step": 4630 }, { "epoch": 1.217065720760666, "grad_norm": 0.7685719728469849, "learning_rate": 5.9444541790783246e-05, "loss": 3.783, "step": 4632 }, { "epoch": 1.2175912240943279, "grad_norm": 0.8722796440124512, "learning_rate": 5.9427019449798494e-05, "loss": 3.7858, "step": 4634 }, { "epoch": 1.2181167274279896, "grad_norm": 0.7536334991455078, "learning_rate": 5.940949710881374e-05, "loss": 3.8571, "step": 4636 }, { "epoch": 1.2186422307616513, "grad_norm": 0.7708075046539307, "learning_rate": 5.939197476782898e-05, "loss": 3.8637, "step": 4638 }, { "epoch": 1.219167734095313, "grad_norm": 0.8800306916236877, "learning_rate": 5.937445242684423e-05, "loss": 3.8473, "step": 4640 }, { "epoch": 1.219693237428975, "grad_norm": 0.7647257447242737, "learning_rate": 5.935693008585948e-05, "loss": 3.8876, "step": 4642 }, { "epoch": 1.2202187407626366, "grad_norm": 0.835477352142334, "learning_rate": 5.933940774487472e-05, "loss": 3.8647, "step": 4644 }, { "epoch": 1.2207442440962986, "grad_norm": 0.7777186036109924, "learning_rate": 5.9321885403889966e-05, "loss": 3.8393, "step": 4646 }, { "epoch": 1.2212697474299603, "grad_norm": 0.921582818031311, "learning_rate": 
5.930436306290521e-05, "loss": 3.9074, "step": 4648 }, { "epoch": 1.221795250763622, "grad_norm": 0.9928010702133179, "learning_rate": 5.9286840721920454e-05, "loss": 3.8615, "step": 4650 }, { "epoch": 1.2223207540972838, "grad_norm": 0.8341577649116516, "learning_rate": 5.92693183809357e-05, "loss": 3.8124, "step": 4652 }, { "epoch": 1.2228462574309455, "grad_norm": 0.8430451154708862, "learning_rate": 5.9251796039950936e-05, "loss": 3.8051, "step": 4654 }, { "epoch": 1.2233717607646073, "grad_norm": 0.8255882859230042, "learning_rate": 5.923427369896618e-05, "loss": 3.8133, "step": 4656 }, { "epoch": 1.223897264098269, "grad_norm": 0.8252696394920349, "learning_rate": 5.9216751357981424e-05, "loss": 3.813, "step": 4658 }, { "epoch": 1.224422767431931, "grad_norm": 0.7778747081756592, "learning_rate": 5.919922901699667e-05, "loss": 3.8457, "step": 4660 }, { "epoch": 1.2249482707655928, "grad_norm": 0.808528482913971, "learning_rate": 5.918170667601192e-05, "loss": 3.8369, "step": 4662 }, { "epoch": 1.2254737740992545, "grad_norm": 0.8318771719932556, "learning_rate": 5.916418433502716e-05, "loss": 3.846, "step": 4664 }, { "epoch": 1.2259992774329163, "grad_norm": 0.8828108310699463, "learning_rate": 5.914666199404241e-05, "loss": 3.8025, "step": 4666 }, { "epoch": 1.226524780766578, "grad_norm": 0.913719892501831, "learning_rate": 5.9129139653057655e-05, "loss": 3.8297, "step": 4668 }, { "epoch": 1.2270502841002398, "grad_norm": 0.7871080636978149, "learning_rate": 5.9111617312072896e-05, "loss": 3.8278, "step": 4670 }, { "epoch": 1.2275757874339015, "grad_norm": 0.852122962474823, "learning_rate": 5.909409497108814e-05, "loss": 3.8467, "step": 4672 }, { "epoch": 1.2281012907675632, "grad_norm": 0.8182508945465088, "learning_rate": 5.907657263010339e-05, "loss": 3.8416, "step": 4674 }, { "epoch": 1.228626794101225, "grad_norm": 0.9025081992149353, "learning_rate": 5.905905028911863e-05, "loss": 3.786, "step": 4676 }, { "epoch": 1.229152297434887, "grad_norm": 
0.7802767753601074, "learning_rate": 5.904152794813388e-05, "loss": 3.8678, "step": 4678 }, { "epoch": 1.2296778007685487, "grad_norm": 0.8001668453216553, "learning_rate": 5.902400560714911e-05, "loss": 3.8189, "step": 4680 }, { "epoch": 1.2302033041022105, "grad_norm": 0.8476680517196655, "learning_rate": 5.900648326616436e-05, "loss": 3.8242, "step": 4682 }, { "epoch": 1.2307288074358722, "grad_norm": 0.7736347317695618, "learning_rate": 5.89889609251796e-05, "loss": 3.7974, "step": 4684 }, { "epoch": 1.231254310769534, "grad_norm": 0.9442012310028076, "learning_rate": 5.897143858419485e-05, "loss": 3.8669, "step": 4686 }, { "epoch": 1.2317798141031957, "grad_norm": 0.7933773994445801, "learning_rate": 5.89539162432101e-05, "loss": 3.8838, "step": 4688 }, { "epoch": 1.2323053174368575, "grad_norm": 0.9719600081443787, "learning_rate": 5.893639390222534e-05, "loss": 3.8074, "step": 4690 }, { "epoch": 1.2328308207705192, "grad_norm": 0.9064353108406067, "learning_rate": 5.8918871561240585e-05, "loss": 3.8629, "step": 4692 }, { "epoch": 1.233356324104181, "grad_norm": 0.8661221265792847, "learning_rate": 5.890134922025583e-05, "loss": 3.7918, "step": 4694 }, { "epoch": 1.233881827437843, "grad_norm": 0.8771145939826965, "learning_rate": 5.8883826879271073e-05, "loss": 3.8053, "step": 4696 }, { "epoch": 1.2344073307715047, "grad_norm": 0.8592641353607178, "learning_rate": 5.886630453828632e-05, "loss": 3.8399, "step": 4698 }, { "epoch": 1.2349328341051664, "grad_norm": 0.9277732968330383, "learning_rate": 5.884878219730157e-05, "loss": 3.8376, "step": 4700 }, { "epoch": 1.2354583374388282, "grad_norm": 0.9190550446510315, "learning_rate": 5.883125985631681e-05, "loss": 3.8262, "step": 4702 }, { "epoch": 1.23598384077249, "grad_norm": 0.9815654754638672, "learning_rate": 5.881373751533204e-05, "loss": 3.8737, "step": 4704 }, { "epoch": 1.2365093441061517, "grad_norm": 0.9522235989570618, "learning_rate": 5.879621517434729e-05, "loss": 3.8374, "step": 4706 }, { 
"epoch": 1.2370348474398134, "grad_norm": 1.0559568405151367, "learning_rate": 5.877869283336254e-05, "loss": 3.7885, "step": 4708 }, { "epoch": 1.2375603507734751, "grad_norm": 0.8934410810470581, "learning_rate": 5.876117049237778e-05, "loss": 3.8268, "step": 4710 }, { "epoch": 1.238085854107137, "grad_norm": 0.7990152835845947, "learning_rate": 5.874364815139303e-05, "loss": 3.8111, "step": 4712 }, { "epoch": 1.2386113574407989, "grad_norm": 0.7838634848594666, "learning_rate": 5.8726125810408274e-05, "loss": 3.785, "step": 4714 }, { "epoch": 1.2391368607744606, "grad_norm": 0.7326494455337524, "learning_rate": 5.8708603469423515e-05, "loss": 3.865, "step": 4716 }, { "epoch": 1.2396623641081224, "grad_norm": 0.7984856963157654, "learning_rate": 5.869108112843876e-05, "loss": 3.8195, "step": 4718 }, { "epoch": 1.240187867441784, "grad_norm": 0.8375762701034546, "learning_rate": 5.867355878745401e-05, "loss": 3.8043, "step": 4720 }, { "epoch": 1.2407133707754459, "grad_norm": 0.7163873314857483, "learning_rate": 5.865603644646925e-05, "loss": 3.8293, "step": 4722 }, { "epoch": 1.2412388741091076, "grad_norm": 0.754542887210846, "learning_rate": 5.86385141054845e-05, "loss": 3.7922, "step": 4724 }, { "epoch": 1.2417643774427694, "grad_norm": 0.7986310124397278, "learning_rate": 5.8620991764499746e-05, "loss": 3.8311, "step": 4726 }, { "epoch": 1.242289880776431, "grad_norm": 0.813909649848938, "learning_rate": 5.860346942351499e-05, "loss": 3.7979, "step": 4728 }, { "epoch": 1.2428153841100928, "grad_norm": 0.9866297245025635, "learning_rate": 5.858594708253022e-05, "loss": 3.8555, "step": 4730 }, { "epoch": 1.2433408874437548, "grad_norm": 0.9526288509368896, "learning_rate": 5.856842474154547e-05, "loss": 3.8406, "step": 4732 }, { "epoch": 1.2438663907774166, "grad_norm": 0.7564043402671814, "learning_rate": 5.8550902400560716e-05, "loss": 3.8387, "step": 4734 }, { "epoch": 1.2443918941110783, "grad_norm": 0.8527357578277588, "learning_rate": 
5.853338005957596e-05, "loss": 3.86, "step": 4736 }, { "epoch": 1.24491739744474, "grad_norm": 0.8298249840736389, "learning_rate": 5.8515857718591205e-05, "loss": 3.8133, "step": 4738 }, { "epoch": 1.2454429007784018, "grad_norm": 0.8185314536094666, "learning_rate": 5.849833537760645e-05, "loss": 3.8107, "step": 4740 }, { "epoch": 1.2459684041120636, "grad_norm": 0.830650806427002, "learning_rate": 5.848081303662169e-05, "loss": 3.8292, "step": 4742 }, { "epoch": 1.2464939074457253, "grad_norm": 0.8352764248847961, "learning_rate": 5.846329069563694e-05, "loss": 3.8185, "step": 4744 }, { "epoch": 1.247019410779387, "grad_norm": 0.856539249420166, "learning_rate": 5.844576835465219e-05, "loss": 3.7957, "step": 4746 }, { "epoch": 1.2475449141130488, "grad_norm": 0.7818179726600647, "learning_rate": 5.842824601366743e-05, "loss": 3.791, "step": 4748 }, { "epoch": 1.2480704174467108, "grad_norm": 0.7755522131919861, "learning_rate": 5.8410723672682676e-05, "loss": 3.7862, "step": 4750 }, { "epoch": 1.2485959207803725, "grad_norm": 0.816373348236084, "learning_rate": 5.8393201331697924e-05, "loss": 3.8279, "step": 4752 }, { "epoch": 1.2491214241140343, "grad_norm": 0.7656562328338623, "learning_rate": 5.8375678990713165e-05, "loss": 3.8085, "step": 4754 }, { "epoch": 1.249646927447696, "grad_norm": 0.7738822102546692, "learning_rate": 5.83581566497284e-05, "loss": 3.7401, "step": 4756 }, { "epoch": 1.2501724307813578, "grad_norm": 0.8885272741317749, "learning_rate": 5.8340634308743646e-05, "loss": 3.8761, "step": 4758 }, { "epoch": 1.2506979341150195, "grad_norm": 0.776105523109436, "learning_rate": 5.8323111967758894e-05, "loss": 3.8647, "step": 4760 }, { "epoch": 1.2512234374486813, "grad_norm": 0.8581206202507019, "learning_rate": 5.8305589626774135e-05, "loss": 3.797, "step": 4762 }, { "epoch": 1.2517489407823432, "grad_norm": 0.8294602036476135, "learning_rate": 5.828806728578938e-05, "loss": 3.8307, "step": 4764 }, { "epoch": 1.2522744441160047, "grad_norm": 
0.695077657699585, "learning_rate": 5.827054494480463e-05, "loss": 3.8172, "step": 4766 }, { "epoch": 1.2527999474496667, "grad_norm": 0.6957995295524597, "learning_rate": 5.825302260381987e-05, "loss": 3.8779, "step": 4768 }, { "epoch": 1.2533254507833285, "grad_norm": 0.8192042708396912, "learning_rate": 5.823550026283512e-05, "loss": 3.8207, "step": 4770 }, { "epoch": 1.2538509541169902, "grad_norm": 0.8627820611000061, "learning_rate": 5.8217977921850366e-05, "loss": 3.8598, "step": 4772 }, { "epoch": 1.254376457450652, "grad_norm": 0.7867237329483032, "learning_rate": 5.8200455580865606e-05, "loss": 3.8243, "step": 4774 }, { "epoch": 1.2549019607843137, "grad_norm": 0.9573601484298706, "learning_rate": 5.8182933239880854e-05, "loss": 3.805, "step": 4776 }, { "epoch": 1.2554274641179755, "grad_norm": 0.7832624316215515, "learning_rate": 5.81654108988961e-05, "loss": 3.823, "step": 4778 }, { "epoch": 1.2559529674516372, "grad_norm": 0.8482010364532471, "learning_rate": 5.814788855791134e-05, "loss": 3.8442, "step": 4780 }, { "epoch": 1.2564784707852992, "grad_norm": 0.7806518077850342, "learning_rate": 5.8130366216926576e-05, "loss": 3.8267, "step": 4782 }, { "epoch": 1.2570039741189607, "grad_norm": 0.8091909289360046, "learning_rate": 5.8112843875941824e-05, "loss": 3.8387, "step": 4784 }, { "epoch": 1.2575294774526227, "grad_norm": 0.7041559219360352, "learning_rate": 5.809532153495707e-05, "loss": 3.8775, "step": 4786 }, { "epoch": 1.2580549807862844, "grad_norm": 0.7782458066940308, "learning_rate": 5.807779919397231e-05, "loss": 3.7563, "step": 4788 }, { "epoch": 1.2585804841199462, "grad_norm": 0.8227478265762329, "learning_rate": 5.806027685298756e-05, "loss": 3.8399, "step": 4790 }, { "epoch": 1.259105987453608, "grad_norm": 0.7922804355621338, "learning_rate": 5.804275451200281e-05, "loss": 3.8188, "step": 4792 }, { "epoch": 1.2596314907872697, "grad_norm": 0.7533599734306335, "learning_rate": 5.802523217101805e-05, "loss": 3.8306, "step": 4794 }, { 
"epoch": 1.2601569941209314, "grad_norm": 0.7531624436378479, "learning_rate": 5.8007709830033296e-05, "loss": 3.7585, "step": 4796 }, { "epoch": 1.2606824974545932, "grad_norm": 0.7023552656173706, "learning_rate": 5.799018748904854e-05, "loss": 3.8102, "step": 4798 }, { "epoch": 1.2612080007882551, "grad_norm": 0.9299274682998657, "learning_rate": 5.7972665148063784e-05, "loss": 3.8352, "step": 4800 }, { "epoch": 1.2612080007882551, "eval_loss": 3.815621852874756, "eval_runtime": 464.8416, "eval_samples_per_second": 262.001, "eval_steps_per_second": 8.188, "step": 4800 }, { "epoch": 1.2617335041219166, "grad_norm": 0.7283897399902344, "learning_rate": 5.795514280707903e-05, "loss": 3.7156, "step": 4802 }, { "epoch": 1.2622590074555786, "grad_norm": 0.7579646706581116, "learning_rate": 5.793762046609428e-05, "loss": 3.8527, "step": 4804 }, { "epoch": 1.2627845107892404, "grad_norm": 0.793414831161499, "learning_rate": 5.792009812510951e-05, "loss": 3.8638, "step": 4806 }, { "epoch": 1.2633100141229021, "grad_norm": 0.7895532846450806, "learning_rate": 5.7902575784124754e-05, "loss": 3.8222, "step": 4808 }, { "epoch": 1.2638355174565639, "grad_norm": 0.932525098323822, "learning_rate": 5.788505344314e-05, "loss": 3.8329, "step": 4810 }, { "epoch": 1.2643610207902256, "grad_norm": 0.8070063591003418, "learning_rate": 5.786753110215525e-05, "loss": 3.7896, "step": 4812 }, { "epoch": 1.2648865241238874, "grad_norm": 0.8257557153701782, "learning_rate": 5.785000876117049e-05, "loss": 3.8108, "step": 4814 }, { "epoch": 1.265412027457549, "grad_norm": 0.8749845027923584, "learning_rate": 5.783248642018574e-05, "loss": 3.7938, "step": 4816 }, { "epoch": 1.265937530791211, "grad_norm": 0.7835293412208557, "learning_rate": 5.7814964079200985e-05, "loss": 3.848, "step": 4818 }, { "epoch": 1.2664630341248726, "grad_norm": 0.8186714053153992, "learning_rate": 5.7797441738216226e-05, "loss": 3.8216, "step": 4820 }, { "epoch": 1.2669885374585346, "grad_norm": 0.8285131454467773, 
"learning_rate": 5.7779919397231473e-05, "loss": 3.7841, "step": 4822 }, { "epoch": 1.2675140407921963, "grad_norm": 0.8492038249969482, "learning_rate": 5.776239705624672e-05, "loss": 3.8279, "step": 4824 }, { "epoch": 1.268039544125858, "grad_norm": 0.9538490176200867, "learning_rate": 5.774487471526196e-05, "loss": 3.7901, "step": 4826 }, { "epoch": 1.2685650474595198, "grad_norm": 0.8137161135673523, "learning_rate": 5.772735237427721e-05, "loss": 3.8183, "step": 4828 }, { "epoch": 1.2690905507931816, "grad_norm": 0.8362091183662415, "learning_rate": 5.770983003329246e-05, "loss": 3.7675, "step": 4830 }, { "epoch": 1.2696160541268433, "grad_norm": 0.7327852845191956, "learning_rate": 5.769230769230769e-05, "loss": 3.7881, "step": 4832 }, { "epoch": 1.270141557460505, "grad_norm": 0.7991304993629456, "learning_rate": 5.767478535132294e-05, "loss": 3.8041, "step": 4834 }, { "epoch": 1.270667060794167, "grad_norm": 0.8098263144493103, "learning_rate": 5.765726301033818e-05, "loss": 3.8208, "step": 4836 }, { "epoch": 1.2711925641278286, "grad_norm": 0.7998114824295044, "learning_rate": 5.763974066935343e-05, "loss": 3.8388, "step": 4838 }, { "epoch": 1.2717180674614905, "grad_norm": 0.741936206817627, "learning_rate": 5.7622218328368674e-05, "loss": 3.825, "step": 4840 }, { "epoch": 1.2722435707951523, "grad_norm": 0.7386989593505859, "learning_rate": 5.7604695987383915e-05, "loss": 3.8872, "step": 4842 }, { "epoch": 1.272769074128814, "grad_norm": 0.9294496774673462, "learning_rate": 5.758717364639916e-05, "loss": 3.7987, "step": 4844 }, { "epoch": 1.2732945774624758, "grad_norm": 0.8167703151702881, "learning_rate": 5.756965130541441e-05, "loss": 3.822, "step": 4846 }, { "epoch": 1.2738200807961375, "grad_norm": 0.8492014408111572, "learning_rate": 5.755212896442965e-05, "loss": 3.8059, "step": 4848 }, { "epoch": 1.2743455841297993, "grad_norm": 0.9531571865081787, "learning_rate": 5.75346066234449e-05, "loss": 3.8178, "step": 4850 }, { "epoch": 
1.274871087463461, "grad_norm": 0.8431397676467896, "learning_rate": 5.7517084282460146e-05, "loss": 3.821, "step": 4852 }, { "epoch": 1.275396590797123, "grad_norm": 0.8921079635620117, "learning_rate": 5.749956194147539e-05, "loss": 3.8239, "step": 4854 }, { "epoch": 1.2759220941307847, "grad_norm": 0.7637450695037842, "learning_rate": 5.7482039600490635e-05, "loss": 3.8376, "step": 4856 }, { "epoch": 1.2764475974644465, "grad_norm": 0.7083262205123901, "learning_rate": 5.746451725950587e-05, "loss": 3.8373, "step": 4858 }, { "epoch": 1.2769731007981082, "grad_norm": 0.9225767850875854, "learning_rate": 5.7446994918521116e-05, "loss": 3.8122, "step": 4860 }, { "epoch": 1.27749860413177, "grad_norm": 0.9182360172271729, "learning_rate": 5.742947257753636e-05, "loss": 3.8388, "step": 4862 }, { "epoch": 1.2780241074654317, "grad_norm": 0.9593896865844727, "learning_rate": 5.7411950236551604e-05, "loss": 3.8184, "step": 4864 }, { "epoch": 1.2785496107990935, "grad_norm": 0.8032509684562683, "learning_rate": 5.739442789556685e-05, "loss": 3.7965, "step": 4866 }, { "epoch": 1.2790751141327552, "grad_norm": 0.8466126918792725, "learning_rate": 5.737690555458209e-05, "loss": 3.7872, "step": 4868 }, { "epoch": 1.279600617466417, "grad_norm": 0.74239581823349, "learning_rate": 5.735938321359734e-05, "loss": 3.851, "step": 4870 }, { "epoch": 1.280126120800079, "grad_norm": 0.8125692009925842, "learning_rate": 5.734186087261259e-05, "loss": 3.8007, "step": 4872 }, { "epoch": 1.2806516241337407, "grad_norm": 0.8895838856697083, "learning_rate": 5.732433853162783e-05, "loss": 3.8141, "step": 4874 }, { "epoch": 1.2811771274674024, "grad_norm": 0.9129476547241211, "learning_rate": 5.7306816190643076e-05, "loss": 3.8059, "step": 4876 }, { "epoch": 1.2817026308010642, "grad_norm": 0.940992534160614, "learning_rate": 5.7289293849658324e-05, "loss": 3.826, "step": 4878 }, { "epoch": 1.282228134134726, "grad_norm": 0.8674393892288208, "learning_rate": 5.7271771508673565e-05, "loss": 
3.7689, "step": 4880 }, { "epoch": 1.2827536374683877, "grad_norm": 0.7965447902679443, "learning_rate": 5.72542491676888e-05, "loss": 3.8612, "step": 4882 }, { "epoch": 1.2832791408020494, "grad_norm": 0.8145177364349365, "learning_rate": 5.7236726826704046e-05, "loss": 3.8048, "step": 4884 }, { "epoch": 1.2838046441357112, "grad_norm": 0.916142463684082, "learning_rate": 5.7219204485719294e-05, "loss": 3.8247, "step": 4886 }, { "epoch": 1.284330147469373, "grad_norm": 0.7588464617729187, "learning_rate": 5.7201682144734535e-05, "loss": 3.7722, "step": 4888 }, { "epoch": 1.2848556508030349, "grad_norm": 0.8667325377464294, "learning_rate": 5.718415980374978e-05, "loss": 3.8314, "step": 4890 }, { "epoch": 1.2853811541366966, "grad_norm": 0.7473271489143372, "learning_rate": 5.716663746276503e-05, "loss": 3.8114, "step": 4892 }, { "epoch": 1.2859066574703584, "grad_norm": 0.8576788902282715, "learning_rate": 5.714911512178027e-05, "loss": 3.8227, "step": 4894 }, { "epoch": 1.2864321608040201, "grad_norm": 0.7443214058876038, "learning_rate": 5.713159278079552e-05, "loss": 3.8576, "step": 4896 }, { "epoch": 1.2869576641376819, "grad_norm": 0.7720398902893066, "learning_rate": 5.7114070439810766e-05, "loss": 3.7603, "step": 4898 }, { "epoch": 1.2874831674713436, "grad_norm": 0.7857139110565186, "learning_rate": 5.7096548098826006e-05, "loss": 3.8325, "step": 4900 }, { "epoch": 1.2880086708050054, "grad_norm": 0.9008180499076843, "learning_rate": 5.7079025757841254e-05, "loss": 3.8301, "step": 4902 }, { "epoch": 1.288534174138667, "grad_norm": 0.7941035032272339, "learning_rate": 5.70615034168565e-05, "loss": 3.8062, "step": 4904 }, { "epoch": 1.2890596774723289, "grad_norm": 0.846415102481842, "learning_rate": 5.704398107587174e-05, "loss": 3.873, "step": 4906 }, { "epoch": 1.2895851808059908, "grad_norm": 0.7939431071281433, "learning_rate": 5.7026458734886976e-05, "loss": 3.8161, "step": 4908 }, { "epoch": 1.2901106841396526, "grad_norm": 0.7762883305549622, 
"learning_rate": 5.7008936393902224e-05, "loss": 3.7996, "step": 4910 }, { "epoch": 1.2906361874733143, "grad_norm": 0.8503370881080627, "learning_rate": 5.699141405291747e-05, "loss": 3.7938, "step": 4912 }, { "epoch": 1.291161690806976, "grad_norm": 0.7604640126228333, "learning_rate": 5.697389171193271e-05, "loss": 3.8347, "step": 4914 }, { "epoch": 1.2916871941406378, "grad_norm": 0.8480825424194336, "learning_rate": 5.695636937094796e-05, "loss": 3.8197, "step": 4916 }, { "epoch": 1.2922126974742996, "grad_norm": 0.792389452457428, "learning_rate": 5.693884702996321e-05, "loss": 3.8281, "step": 4918 }, { "epoch": 1.2927382008079613, "grad_norm": 0.9226304888725281, "learning_rate": 5.692132468897845e-05, "loss": 3.7965, "step": 4920 }, { "epoch": 1.2932637041416233, "grad_norm": 0.8486925363540649, "learning_rate": 5.6903802347993696e-05, "loss": 3.8531, "step": 4922 }, { "epoch": 1.2937892074752848, "grad_norm": 0.8411705493927002, "learning_rate": 5.688628000700894e-05, "loss": 3.842, "step": 4924 }, { "epoch": 1.2943147108089468, "grad_norm": 0.7795764207839966, "learning_rate": 5.6868757666024184e-05, "loss": 3.8526, "step": 4926 }, { "epoch": 1.2948402141426085, "grad_norm": 0.7855992317199707, "learning_rate": 5.685123532503943e-05, "loss": 3.7823, "step": 4928 }, { "epoch": 1.2953657174762703, "grad_norm": 0.7932481169700623, "learning_rate": 5.683371298405468e-05, "loss": 3.7696, "step": 4930 }, { "epoch": 1.295891220809932, "grad_norm": 0.8926216959953308, "learning_rate": 5.681619064306992e-05, "loss": 3.8213, "step": 4932 }, { "epoch": 1.2964167241435938, "grad_norm": 0.82773357629776, "learning_rate": 5.6798668302085154e-05, "loss": 3.8518, "step": 4934 }, { "epoch": 1.2969422274772555, "grad_norm": 0.7400639057159424, "learning_rate": 5.67811459611004e-05, "loss": 3.8525, "step": 4936 }, { "epoch": 1.2974677308109173, "grad_norm": 0.7922859787940979, "learning_rate": 5.676362362011565e-05, "loss": 3.8141, "step": 4938 }, { "epoch": 
1.2979932341445792, "grad_norm": 0.8616368174552917, "learning_rate": 5.674610127913089e-05, "loss": 3.7996, "step": 4940 }, { "epoch": 1.2985187374782408, "grad_norm": 0.8677448034286499, "learning_rate": 5.672857893814614e-05, "loss": 3.8555, "step": 4942 }, { "epoch": 1.2990442408119027, "grad_norm": 0.8619234561920166, "learning_rate": 5.6711056597161385e-05, "loss": 3.8017, "step": 4944 }, { "epoch": 1.2995697441455645, "grad_norm": 0.8504317402839661, "learning_rate": 5.6693534256176626e-05, "loss": 3.87, "step": 4946 }, { "epoch": 1.3000952474792262, "grad_norm": 0.9580060839653015, "learning_rate": 5.6676011915191873e-05, "loss": 3.844, "step": 4948 }, { "epoch": 1.300620750812888, "grad_norm": 0.869465172290802, "learning_rate": 5.665848957420712e-05, "loss": 3.8862, "step": 4950 }, { "epoch": 1.3011462541465497, "grad_norm": 0.8783652186393738, "learning_rate": 5.664096723322236e-05, "loss": 3.8412, "step": 4952 }, { "epoch": 1.3016717574802115, "grad_norm": 0.8640948534011841, "learning_rate": 5.662344489223761e-05, "loss": 3.8867, "step": 4954 }, { "epoch": 1.3021972608138732, "grad_norm": 0.7903831601142883, "learning_rate": 5.660592255125286e-05, "loss": 3.8088, "step": 4956 }, { "epoch": 1.3027227641475352, "grad_norm": 0.7904569506645203, "learning_rate": 5.65884002102681e-05, "loss": 3.8179, "step": 4958 }, { "epoch": 1.3032482674811967, "grad_norm": 0.8061838150024414, "learning_rate": 5.657087786928333e-05, "loss": 3.7982, "step": 4960 }, { "epoch": 1.3037737708148587, "grad_norm": 0.7473665475845337, "learning_rate": 5.655335552829858e-05, "loss": 3.8299, "step": 4962 }, { "epoch": 1.3042992741485204, "grad_norm": 0.803360641002655, "learning_rate": 5.653583318731383e-05, "loss": 3.8546, "step": 4964 }, { "epoch": 1.3048247774821822, "grad_norm": 0.854046106338501, "learning_rate": 5.651831084632907e-05, "loss": 3.852, "step": 4966 }, { "epoch": 1.305350280815844, "grad_norm": 0.781623125076294, "learning_rate": 5.6500788505344315e-05, "loss": 
3.8049, "step": 4968 }, { "epoch": 1.3058757841495057, "grad_norm": 0.7662547826766968, "learning_rate": 5.648326616435956e-05, "loss": 3.8399, "step": 4970 }, { "epoch": 1.3064012874831674, "grad_norm": 0.8249528408050537, "learning_rate": 5.6465743823374803e-05, "loss": 3.7754, "step": 4972 }, { "epoch": 1.3069267908168292, "grad_norm": 0.8255861401557922, "learning_rate": 5.644822148239005e-05, "loss": 3.7532, "step": 4974 }, { "epoch": 1.3074522941504911, "grad_norm": 0.9999202489852905, "learning_rate": 5.64306991414053e-05, "loss": 3.8473, "step": 4976 }, { "epoch": 1.3079777974841527, "grad_norm": 0.8483130931854248, "learning_rate": 5.641317680042054e-05, "loss": 3.8436, "step": 4978 }, { "epoch": 1.3085033008178146, "grad_norm": 0.9395752549171448, "learning_rate": 5.639565445943579e-05, "loss": 3.8413, "step": 4980 }, { "epoch": 1.3090288041514764, "grad_norm": 0.9028636813163757, "learning_rate": 5.6378132118451035e-05, "loss": 3.8689, "step": 4982 }, { "epoch": 1.3095543074851381, "grad_norm": 0.8689616322517395, "learning_rate": 5.636060977746627e-05, "loss": 3.8213, "step": 4984 }, { "epoch": 1.3100798108187999, "grad_norm": 0.7681336998939514, "learning_rate": 5.634308743648151e-05, "loss": 3.8416, "step": 4986 }, { "epoch": 1.3106053141524616, "grad_norm": 0.8596954941749573, "learning_rate": 5.632556509549676e-05, "loss": 3.8163, "step": 4988 }, { "epoch": 1.3111308174861234, "grad_norm": 0.8067328929901123, "learning_rate": 5.6308042754512004e-05, "loss": 3.8218, "step": 4990 }, { "epoch": 1.3116563208197851, "grad_norm": 0.7493451237678528, "learning_rate": 5.6290520413527245e-05, "loss": 3.8269, "step": 4992 }, { "epoch": 1.312181824153447, "grad_norm": 0.8057706952095032, "learning_rate": 5.627299807254249e-05, "loss": 3.8235, "step": 4994 }, { "epoch": 1.3127073274871086, "grad_norm": 0.7880367636680603, "learning_rate": 5.625547573155774e-05, "loss": 3.8215, "step": 4996 }, { "epoch": 1.3132328308207706, "grad_norm": 0.856469452381134, 
"learning_rate": 5.623795339057298e-05, "loss": 3.8246, "step": 4998 }, { "epoch": 1.3137583341544323, "grad_norm": 0.7680920362472534, "learning_rate": 5.622043104958823e-05, "loss": 3.8239, "step": 5000 }, { "epoch": 1.314283837488094, "grad_norm": 0.9171178936958313, "learning_rate": 5.6202908708603476e-05, "loss": 3.8516, "step": 5002 }, { "epoch": 1.3148093408217558, "grad_norm": 0.7451171278953552, "learning_rate": 5.618538636761872e-05, "loss": 3.8198, "step": 5004 }, { "epoch": 1.3153348441554176, "grad_norm": 0.8866268992424011, "learning_rate": 5.6167864026633965e-05, "loss": 3.788, "step": 5006 }, { "epoch": 1.3158603474890793, "grad_norm": 0.8023896217346191, "learning_rate": 5.615034168564921e-05, "loss": 3.8073, "step": 5008 }, { "epoch": 1.316385850822741, "grad_norm": 0.802408754825592, "learning_rate": 5.6132819344664446e-05, "loss": 3.8253, "step": 5010 }, { "epoch": 1.316911354156403, "grad_norm": 0.8069076538085938, "learning_rate": 5.611529700367969e-05, "loss": 3.8204, "step": 5012 }, { "epoch": 1.3174368574900648, "grad_norm": 0.8441058397293091, "learning_rate": 5.6097774662694935e-05, "loss": 3.7998, "step": 5014 }, { "epoch": 1.3179623608237265, "grad_norm": 0.8114290237426758, "learning_rate": 5.608025232171018e-05, "loss": 3.8161, "step": 5016 }, { "epoch": 1.3184878641573883, "grad_norm": 0.8113745450973511, "learning_rate": 5.606272998072542e-05, "loss": 3.8029, "step": 5018 }, { "epoch": 1.31901336749105, "grad_norm": 0.8709697723388672, "learning_rate": 5.604520763974067e-05, "loss": 3.8378, "step": 5020 }, { "epoch": 1.3195388708247118, "grad_norm": 0.7455194592475891, "learning_rate": 5.602768529875592e-05, "loss": 3.8314, "step": 5022 }, { "epoch": 1.3200643741583735, "grad_norm": 0.7950137853622437, "learning_rate": 5.601016295777116e-05, "loss": 3.8144, "step": 5024 }, { "epoch": 1.3205898774920353, "grad_norm": 0.8166969418525696, "learning_rate": 5.5992640616786406e-05, "loss": 3.8287, "step": 5026 }, { "epoch": 
1.321115380825697, "grad_norm": 0.8612956404685974, "learning_rate": 5.5975118275801654e-05, "loss": 3.8175, "step": 5028 }, { "epoch": 1.321640884159359, "grad_norm": 0.8698770999908447, "learning_rate": 5.5957595934816895e-05, "loss": 3.822, "step": 5030 }, { "epoch": 1.3221663874930207, "grad_norm": 0.8098792433738708, "learning_rate": 5.594007359383214e-05, "loss": 3.7899, "step": 5032 }, { "epoch": 1.3226918908266825, "grad_norm": 0.7321463823318481, "learning_rate": 5.592255125284739e-05, "loss": 3.8223, "step": 5034 }, { "epoch": 1.3232173941603442, "grad_norm": 0.7723495960235596, "learning_rate": 5.5905028911862624e-05, "loss": 3.7899, "step": 5036 }, { "epoch": 1.323742897494006, "grad_norm": 0.8084952235221863, "learning_rate": 5.5887506570877865e-05, "loss": 3.7893, "step": 5038 }, { "epoch": 1.3242684008276677, "grad_norm": 0.7907637357711792, "learning_rate": 5.586998422989311e-05, "loss": 3.8744, "step": 5040 }, { "epoch": 1.3247939041613295, "grad_norm": 0.9078933596611023, "learning_rate": 5.585246188890836e-05, "loss": 3.8403, "step": 5042 }, { "epoch": 1.3253194074949912, "grad_norm": 0.7198517918586731, "learning_rate": 5.58349395479236e-05, "loss": 3.8245, "step": 5044 }, { "epoch": 1.325844910828653, "grad_norm": 0.7972646355628967, "learning_rate": 5.581741720693885e-05, "loss": 3.8073, "step": 5046 }, { "epoch": 1.326370414162315, "grad_norm": 0.8190456032752991, "learning_rate": 5.5799894865954096e-05, "loss": 3.8267, "step": 5048 }, { "epoch": 1.3268959174959767, "grad_norm": 0.7910116910934448, "learning_rate": 5.5782372524969336e-05, "loss": 3.7864, "step": 5050 }, { "epoch": 1.3274214208296384, "grad_norm": 0.7728725671768188, "learning_rate": 5.5764850183984584e-05, "loss": 3.8516, "step": 5052 }, { "epoch": 1.3279469241633002, "grad_norm": 0.8211483359336853, "learning_rate": 5.574732784299983e-05, "loss": 3.8315, "step": 5054 }, { "epoch": 1.328472427496962, "grad_norm": 0.8004226088523865, "learning_rate": 5.572980550201507e-05, 
"loss": 3.8025, "step": 5056 }, { "epoch": 1.3289979308306237, "grad_norm": 0.8564558029174805, "learning_rate": 5.571228316103032e-05, "loss": 3.8036, "step": 5058 }, { "epoch": 1.3295234341642854, "grad_norm": 0.7553214430809021, "learning_rate": 5.569476082004557e-05, "loss": 3.8081, "step": 5060 }, { "epoch": 1.3300489374979472, "grad_norm": 0.7896230816841125, "learning_rate": 5.56772384790608e-05, "loss": 3.8124, "step": 5062 }, { "epoch": 1.330574440831609, "grad_norm": 0.8280134201049805, "learning_rate": 5.565971613807604e-05, "loss": 3.8693, "step": 5064 }, { "epoch": 1.3310999441652709, "grad_norm": 0.7645221948623657, "learning_rate": 5.564219379709129e-05, "loss": 3.7743, "step": 5066 }, { "epoch": 1.3316254474989326, "grad_norm": 0.8998798727989197, "learning_rate": 5.562467145610654e-05, "loss": 3.8075, "step": 5068 }, { "epoch": 1.3321509508325944, "grad_norm": 0.8117035031318665, "learning_rate": 5.560714911512178e-05, "loss": 3.8117, "step": 5070 }, { "epoch": 1.3326764541662561, "grad_norm": 0.8567090630531311, "learning_rate": 5.5589626774137026e-05, "loss": 3.8558, "step": 5072 }, { "epoch": 1.3332019574999179, "grad_norm": 0.7705716490745544, "learning_rate": 5.557210443315227e-05, "loss": 3.7974, "step": 5074 }, { "epoch": 1.3337274608335796, "grad_norm": 0.8391891121864319, "learning_rate": 5.5554582092167514e-05, "loss": 3.7639, "step": 5076 }, { "epoch": 1.3342529641672414, "grad_norm": 0.8007798194885254, "learning_rate": 5.553705975118276e-05, "loss": 3.8037, "step": 5078 }, { "epoch": 1.3347784675009033, "grad_norm": 0.8497852087020874, "learning_rate": 5.551953741019801e-05, "loss": 3.8273, "step": 5080 }, { "epoch": 1.3353039708345649, "grad_norm": 0.8474531769752502, "learning_rate": 5.550201506921325e-05, "loss": 3.8298, "step": 5082 }, { "epoch": 1.3358294741682268, "grad_norm": 0.8806976675987244, "learning_rate": 5.54844927282285e-05, "loss": 3.8223, "step": 5084 }, { "epoch": 1.3363549775018886, "grad_norm": 0.933415949344635, 
"learning_rate": 5.546697038724373e-05, "loss": 3.8457, "step": 5086 }, { "epoch": 1.3368804808355503, "grad_norm": 0.7601410150527954, "learning_rate": 5.544944804625898e-05, "loss": 3.8077, "step": 5088 }, { "epoch": 1.337405984169212, "grad_norm": 1.0036529302597046, "learning_rate": 5.543192570527423e-05, "loss": 3.8673, "step": 5090 }, { "epoch": 1.3379314875028738, "grad_norm": 0.8353005051612854, "learning_rate": 5.541440336428947e-05, "loss": 3.818, "step": 5092 }, { "epoch": 1.3384569908365356, "grad_norm": 0.7500554323196411, "learning_rate": 5.5396881023304715e-05, "loss": 3.8234, "step": 5094 }, { "epoch": 1.3389824941701973, "grad_norm": 0.8158127069473267, "learning_rate": 5.537935868231996e-05, "loss": 3.7986, "step": 5096 }, { "epoch": 1.3395079975038593, "grad_norm": 0.8061633110046387, "learning_rate": 5.5361836341335203e-05, "loss": 3.797, "step": 5098 }, { "epoch": 1.3400335008375208, "grad_norm": 0.7927965521812439, "learning_rate": 5.534431400035045e-05, "loss": 3.8069, "step": 5100 }, { "epoch": 1.3405590041711828, "grad_norm": 0.8440743088722229, "learning_rate": 5.53267916593657e-05, "loss": 3.8297, "step": 5102 }, { "epoch": 1.3410845075048445, "grad_norm": 0.7487958669662476, "learning_rate": 5.530926931838094e-05, "loss": 3.844, "step": 5104 }, { "epoch": 1.3416100108385063, "grad_norm": 0.752913236618042, "learning_rate": 5.529174697739619e-05, "loss": 3.7964, "step": 5106 }, { "epoch": 1.342135514172168, "grad_norm": 0.8316803574562073, "learning_rate": 5.5274224636411435e-05, "loss": 3.812, "step": 5108 }, { "epoch": 1.3426610175058298, "grad_norm": 0.9004027247428894, "learning_rate": 5.5256702295426675e-05, "loss": 3.8191, "step": 5110 }, { "epoch": 1.3431865208394915, "grad_norm": 0.7597434520721436, "learning_rate": 5.523917995444191e-05, "loss": 3.8317, "step": 5112 }, { "epoch": 1.3437120241731533, "grad_norm": 0.8378864526748657, "learning_rate": 5.522165761345716e-05, "loss": 3.7955, "step": 5114 }, { "epoch": 
1.3442375275068152, "grad_norm": 0.8600500226020813, "learning_rate": 5.5204135272472404e-05, "loss": 3.8072, "step": 5116 }, { "epoch": 1.3447630308404768, "grad_norm": 1.008056402206421, "learning_rate": 5.5186612931487645e-05, "loss": 3.7589, "step": 5118 }, { "epoch": 1.3452885341741387, "grad_norm": 0.8243805170059204, "learning_rate": 5.516909059050289e-05, "loss": 3.8356, "step": 5120 }, { "epoch": 1.3458140375078005, "grad_norm": 0.7942237854003906, "learning_rate": 5.515156824951814e-05, "loss": 3.8476, "step": 5122 }, { "epoch": 1.3463395408414622, "grad_norm": 0.7435653805732727, "learning_rate": 5.513404590853338e-05, "loss": 3.8353, "step": 5124 }, { "epoch": 1.346865044175124, "grad_norm": 0.7649374604225159, "learning_rate": 5.511652356754863e-05, "loss": 3.8526, "step": 5126 }, { "epoch": 1.3473905475087857, "grad_norm": 0.9717674255371094, "learning_rate": 5.5099001226563876e-05, "loss": 3.8185, "step": 5128 }, { "epoch": 1.3479160508424475, "grad_norm": 0.7661210298538208, "learning_rate": 5.508147888557912e-05, "loss": 3.7968, "step": 5130 }, { "epoch": 1.3484415541761092, "grad_norm": 0.735448956489563, "learning_rate": 5.5063956544594365e-05, "loss": 3.8129, "step": 5132 }, { "epoch": 1.3489670575097712, "grad_norm": 0.8127278685569763, "learning_rate": 5.504643420360961e-05, "loss": 3.7897, "step": 5134 }, { "epoch": 1.3494925608434327, "grad_norm": 0.8536175489425659, "learning_rate": 5.502891186262485e-05, "loss": 3.8215, "step": 5136 }, { "epoch": 1.3500180641770947, "grad_norm": 0.7509797215461731, "learning_rate": 5.501138952164009e-05, "loss": 3.7912, "step": 5138 }, { "epoch": 1.3505435675107564, "grad_norm": 0.7415160536766052, "learning_rate": 5.4993867180655335e-05, "loss": 3.8203, "step": 5140 }, { "epoch": 1.3510690708444182, "grad_norm": 0.7461737990379333, "learning_rate": 5.497634483967058e-05, "loss": 3.8347, "step": 5142 }, { "epoch": 1.35159457417808, "grad_norm": 0.8944548964500427, "learning_rate": 5.495882249868582e-05, 
"loss": 3.7875, "step": 5144 }, { "epoch": 1.3521200775117417, "grad_norm": 0.8421058058738708, "learning_rate": 5.494130015770107e-05, "loss": 3.8034, "step": 5146 }, { "epoch": 1.3526455808454034, "grad_norm": 0.891548216342926, "learning_rate": 5.492377781671632e-05, "loss": 3.8198, "step": 5148 }, { "epoch": 1.3531710841790652, "grad_norm": 0.7774540185928345, "learning_rate": 5.490625547573156e-05, "loss": 3.8593, "step": 5150 }, { "epoch": 1.3536965875127271, "grad_norm": 0.8546125292778015, "learning_rate": 5.4888733134746806e-05, "loss": 3.8423, "step": 5152 }, { "epoch": 1.3542220908463887, "grad_norm": 0.8170947432518005, "learning_rate": 5.4871210793762054e-05, "loss": 3.8444, "step": 5154 }, { "epoch": 1.3547475941800506, "grad_norm": 0.7152564525604248, "learning_rate": 5.4853688452777295e-05, "loss": 3.795, "step": 5156 }, { "epoch": 1.3552730975137124, "grad_norm": 0.803226113319397, "learning_rate": 5.483616611179254e-05, "loss": 3.8622, "step": 5158 }, { "epoch": 1.3557986008473741, "grad_norm": 0.8349176645278931, "learning_rate": 5.481864377080779e-05, "loss": 3.7833, "step": 5160 }, { "epoch": 1.3563241041810359, "grad_norm": 0.7387887239456177, "learning_rate": 5.480112142982303e-05, "loss": 3.8413, "step": 5162 }, { "epoch": 1.3568496075146976, "grad_norm": 0.8702726364135742, "learning_rate": 5.4783599088838265e-05, "loss": 3.7902, "step": 5164 }, { "epoch": 1.3573751108483594, "grad_norm": 0.8416242003440857, "learning_rate": 5.476607674785351e-05, "loss": 3.8411, "step": 5166 }, { "epoch": 1.3579006141820211, "grad_norm": 0.8182898759841919, "learning_rate": 5.474855440686876e-05, "loss": 3.8201, "step": 5168 }, { "epoch": 1.358426117515683, "grad_norm": 0.778207540512085, "learning_rate": 5.4731032065884e-05, "loss": 3.8726, "step": 5170 }, { "epoch": 1.3589516208493448, "grad_norm": 0.7476890683174133, "learning_rate": 5.471350972489925e-05, "loss": 3.8066, "step": 5172 }, { "epoch": 1.3594771241830066, "grad_norm": 0.8105747699737549, 
"learning_rate": 5.4695987383914496e-05, "loss": 3.8372, "step": 5174 }, { "epoch": 1.3600026275166683, "grad_norm": 0.7809747457504272, "learning_rate": 5.4678465042929736e-05, "loss": 3.8063, "step": 5176 }, { "epoch": 1.36052813085033, "grad_norm": 0.7533472776412964, "learning_rate": 5.4660942701944984e-05, "loss": 3.7733, "step": 5178 }, { "epoch": 1.3610536341839918, "grad_norm": 0.8890501856803894, "learning_rate": 5.464342036096023e-05, "loss": 3.7871, "step": 5180 }, { "epoch": 1.3615791375176536, "grad_norm": 0.799198567867279, "learning_rate": 5.462589801997547e-05, "loss": 3.8174, "step": 5182 }, { "epoch": 1.3621046408513153, "grad_norm": 0.9579505920410156, "learning_rate": 5.460837567899072e-05, "loss": 3.8308, "step": 5184 }, { "epoch": 1.362630144184977, "grad_norm": 0.8550505042076111, "learning_rate": 5.459085333800597e-05, "loss": 3.8281, "step": 5186 }, { "epoch": 1.363155647518639, "grad_norm": 0.7896988391876221, "learning_rate": 5.45733309970212e-05, "loss": 3.8094, "step": 5188 }, { "epoch": 1.3636811508523008, "grad_norm": 0.7914839386940002, "learning_rate": 5.455580865603644e-05, "loss": 3.8713, "step": 5190 }, { "epoch": 1.3642066541859625, "grad_norm": 0.9492586255073547, "learning_rate": 5.453828631505169e-05, "loss": 3.8049, "step": 5192 }, { "epoch": 1.3647321575196243, "grad_norm": 0.8159365057945251, "learning_rate": 5.452076397406694e-05, "loss": 3.7911, "step": 5194 }, { "epoch": 1.365257660853286, "grad_norm": 1.1914432048797607, "learning_rate": 5.450324163308218e-05, "loss": 3.8003, "step": 5196 }, { "epoch": 1.3657831641869478, "grad_norm": 0.9266898036003113, "learning_rate": 5.4485719292097426e-05, "loss": 3.8294, "step": 5198 }, { "epoch": 1.3663086675206095, "grad_norm": 0.8943628668785095, "learning_rate": 5.446819695111267e-05, "loss": 3.8194, "step": 5200 }, { "epoch": 1.3663086675206095, "eval_loss": 3.8006579875946045, "eval_runtime": 464.752, "eval_samples_per_second": 262.052, "eval_steps_per_second": 8.189, 
"step": 5200 }, { "epoch": 1.3668341708542713, "grad_norm": 0.8772167563438416, "learning_rate": 5.4450674610127914e-05, "loss": 3.7898, "step": 5202 }, { "epoch": 1.367359674187933, "grad_norm": 0.8282027244567871, "learning_rate": 5.443315226914316e-05, "loss": 3.881, "step": 5204 }, { "epoch": 1.367885177521595, "grad_norm": 0.9068541526794434, "learning_rate": 5.441562992815841e-05, "loss": 3.7771, "step": 5206 }, { "epoch": 1.3684106808552567, "grad_norm": 0.8028010725975037, "learning_rate": 5.439810758717365e-05, "loss": 3.8846, "step": 5208 }, { "epoch": 1.3689361841889185, "grad_norm": 0.9210859537124634, "learning_rate": 5.43805852461889e-05, "loss": 3.8519, "step": 5210 }, { "epoch": 1.3694616875225802, "grad_norm": 0.7756946086883545, "learning_rate": 5.4363062905204145e-05, "loss": 3.8267, "step": 5212 }, { "epoch": 1.369987190856242, "grad_norm": 0.7361545562744141, "learning_rate": 5.434554056421938e-05, "loss": 3.7396, "step": 5214 }, { "epoch": 1.3705126941899037, "grad_norm": 0.8202505111694336, "learning_rate": 5.432801822323462e-05, "loss": 3.8232, "step": 5216 }, { "epoch": 1.3710381975235655, "grad_norm": 0.8023722767829895, "learning_rate": 5.431049588224987e-05, "loss": 3.8779, "step": 5218 }, { "epoch": 1.3715637008572272, "grad_norm": 0.7746407985687256, "learning_rate": 5.4292973541265115e-05, "loss": 3.852, "step": 5220 }, { "epoch": 1.372089204190889, "grad_norm": 0.8674836158752441, "learning_rate": 5.4275451200280356e-05, "loss": 3.7532, "step": 5222 }, { "epoch": 1.372614707524551, "grad_norm": 0.8205435276031494, "learning_rate": 5.4257928859295603e-05, "loss": 3.8126, "step": 5224 }, { "epoch": 1.3731402108582127, "grad_norm": 0.7946842908859253, "learning_rate": 5.424040651831085e-05, "loss": 3.8018, "step": 5226 }, { "epoch": 1.3736657141918744, "grad_norm": 0.7436930537223816, "learning_rate": 5.422288417732609e-05, "loss": 3.8315, "step": 5228 }, { "epoch": 1.3741912175255362, "grad_norm": 0.8136159181594849, "learning_rate": 
5.420536183634134e-05, "loss": 3.81, "step": 5230 }, { "epoch": 1.374716720859198, "grad_norm": 0.8810936808586121, "learning_rate": 5.418783949535659e-05, "loss": 3.8464, "step": 5232 }, { "epoch": 1.3752422241928597, "grad_norm": 0.8336628675460815, "learning_rate": 5.417031715437183e-05, "loss": 3.8716, "step": 5234 }, { "epoch": 1.3757677275265214, "grad_norm": 0.7666690349578857, "learning_rate": 5.4152794813387075e-05, "loss": 3.8309, "step": 5236 }, { "epoch": 1.3762932308601834, "grad_norm": 0.9719875454902649, "learning_rate": 5.413527247240232e-05, "loss": 3.8075, "step": 5238 }, { "epoch": 1.376818734193845, "grad_norm": 0.8522115349769592, "learning_rate": 5.411775013141756e-05, "loss": 3.8147, "step": 5240 }, { "epoch": 1.377344237527507, "grad_norm": 0.7516109943389893, "learning_rate": 5.41002277904328e-05, "loss": 3.7811, "step": 5242 }, { "epoch": 1.3778697408611686, "grad_norm": 0.831661581993103, "learning_rate": 5.4082705449448045e-05, "loss": 3.7739, "step": 5244 }, { "epoch": 1.3783952441948304, "grad_norm": 0.7653560042381287, "learning_rate": 5.406518310846329e-05, "loss": 3.748, "step": 5246 }, { "epoch": 1.3789207475284921, "grad_norm": 0.7629295587539673, "learning_rate": 5.4047660767478534e-05, "loss": 3.8191, "step": 5248 }, { "epoch": 1.3794462508621539, "grad_norm": 0.8465219736099243, "learning_rate": 5.403013842649378e-05, "loss": 3.8333, "step": 5250 }, { "epoch": 1.3799717541958156, "grad_norm": 0.8008516430854797, "learning_rate": 5.401261608550903e-05, "loss": 3.8695, "step": 5252 }, { "epoch": 1.3804972575294774, "grad_norm": 0.6928293108940125, "learning_rate": 5.399509374452427e-05, "loss": 3.7809, "step": 5254 }, { "epoch": 1.3810227608631394, "grad_norm": 0.8912468552589417, "learning_rate": 5.397757140353952e-05, "loss": 3.8371, "step": 5256 }, { "epoch": 1.3815482641968009, "grad_norm": 0.8200249075889587, "learning_rate": 5.3960049062554765e-05, "loss": 3.8307, "step": 5258 }, { "epoch": 1.3820737675304628, "grad_norm": 
0.8227986097335815, "learning_rate": 5.3942526721570005e-05, "loss": 3.7807, "step": 5260 }, { "epoch": 1.3825992708641246, "grad_norm": 0.7350572347640991, "learning_rate": 5.392500438058525e-05, "loss": 3.7992, "step": 5262 }, { "epoch": 1.3831247741977863, "grad_norm": 0.8738299012184143, "learning_rate": 5.39074820396005e-05, "loss": 3.7964, "step": 5264 }, { "epoch": 1.383650277531448, "grad_norm": 0.7572607398033142, "learning_rate": 5.3889959698615735e-05, "loss": 3.7952, "step": 5266 }, { "epoch": 1.3841757808651098, "grad_norm": 0.7477497458457947, "learning_rate": 5.3872437357630975e-05, "loss": 3.8782, "step": 5268 }, { "epoch": 1.3847012841987716, "grad_norm": 0.8792385458946228, "learning_rate": 5.385491501664622e-05, "loss": 3.7761, "step": 5270 }, { "epoch": 1.3852267875324333, "grad_norm": 0.8313059210777283, "learning_rate": 5.383739267566147e-05, "loss": 3.8431, "step": 5272 }, { "epoch": 1.3857522908660953, "grad_norm": 0.8091100454330444, "learning_rate": 5.381987033467671e-05, "loss": 3.8204, "step": 5274 }, { "epoch": 1.3862777941997568, "grad_norm": 0.8150626420974731, "learning_rate": 5.380234799369196e-05, "loss": 3.7763, "step": 5276 }, { "epoch": 1.3868032975334188, "grad_norm": 0.8336070775985718, "learning_rate": 5.3784825652707206e-05, "loss": 3.7885, "step": 5278 }, { "epoch": 1.3873288008670805, "grad_norm": 0.73738694190979, "learning_rate": 5.376730331172245e-05, "loss": 3.8504, "step": 5280 }, { "epoch": 1.3878543042007423, "grad_norm": 0.8337908387184143, "learning_rate": 5.3749780970737695e-05, "loss": 3.8103, "step": 5282 }, { "epoch": 1.388379807534404, "grad_norm": 0.9081438183784485, "learning_rate": 5.373225862975294e-05, "loss": 3.8013, "step": 5284 }, { "epoch": 1.3889053108680658, "grad_norm": 0.9830188751220703, "learning_rate": 5.371473628876818e-05, "loss": 3.7819, "step": 5286 }, { "epoch": 1.3894308142017275, "grad_norm": 0.7558450102806091, "learning_rate": 5.369721394778343e-05, "loss": 3.8121, "step": 5288 }, { 
"epoch": 1.3899563175353893, "grad_norm": 0.8368049263954163, "learning_rate": 5.3679691606798665e-05, "loss": 3.8501, "step": 5290 }, { "epoch": 1.3904818208690513, "grad_norm": 0.8347126841545105, "learning_rate": 5.366216926581391e-05, "loss": 3.8684, "step": 5292 }, { "epoch": 1.3910073242027128, "grad_norm": 0.7855224609375, "learning_rate": 5.364464692482915e-05, "loss": 3.8524, "step": 5294 }, { "epoch": 1.3915328275363748, "grad_norm": 0.7268229722976685, "learning_rate": 5.36271245838444e-05, "loss": 3.8637, "step": 5296 }, { "epoch": 1.3920583308700365, "grad_norm": 0.8562330007553101, "learning_rate": 5.360960224285965e-05, "loss": 3.7963, "step": 5298 }, { "epoch": 1.3925838342036982, "grad_norm": 0.8070107698440552, "learning_rate": 5.359207990187489e-05, "loss": 3.7991, "step": 5300 }, { "epoch": 1.39310933753736, "grad_norm": 0.7816234230995178, "learning_rate": 5.3574557560890136e-05, "loss": 3.8288, "step": 5302 }, { "epoch": 1.3936348408710217, "grad_norm": 0.8066199421882629, "learning_rate": 5.3557035219905384e-05, "loss": 3.7914, "step": 5304 }, { "epoch": 1.3941603442046835, "grad_norm": 0.8445290923118591, "learning_rate": 5.3539512878920625e-05, "loss": 3.7572, "step": 5306 }, { "epoch": 1.3946858475383452, "grad_norm": 0.854292094707489, "learning_rate": 5.352199053793587e-05, "loss": 3.8251, "step": 5308 }, { "epoch": 1.3952113508720072, "grad_norm": 0.7660707831382751, "learning_rate": 5.350446819695112e-05, "loss": 3.7898, "step": 5310 }, { "epoch": 1.3957368542056687, "grad_norm": 0.8153312802314758, "learning_rate": 5.348694585596636e-05, "loss": 3.8102, "step": 5312 }, { "epoch": 1.3962623575393307, "grad_norm": 0.826738178730011, "learning_rate": 5.346942351498161e-05, "loss": 3.8833, "step": 5314 }, { "epoch": 1.3967878608729924, "grad_norm": 0.8752331137657166, "learning_rate": 5.345190117399684e-05, "loss": 3.8245, "step": 5316 }, { "epoch": 1.3973133642066542, "grad_norm": 0.9647976756095886, "learning_rate": 
5.343437883301209e-05, "loss": 3.8271, "step": 5318 }, { "epoch": 1.397838867540316, "grad_norm": 0.8223600387573242, "learning_rate": 5.341685649202733e-05, "loss": 3.788, "step": 5320 }, { "epoch": 1.3983643708739777, "grad_norm": 0.9395845532417297, "learning_rate": 5.339933415104258e-05, "loss": 3.8348, "step": 5322 }, { "epoch": 1.3988898742076394, "grad_norm": 0.8632817268371582, "learning_rate": 5.3381811810057826e-05, "loss": 3.817, "step": 5324 }, { "epoch": 1.3994153775413012, "grad_norm": 0.7990444302558899, "learning_rate": 5.3364289469073067e-05, "loss": 3.8097, "step": 5326 }, { "epoch": 1.3999408808749632, "grad_norm": 0.8228181004524231, "learning_rate": 5.3346767128088314e-05, "loss": 3.8024, "step": 5328 }, { "epoch": 1.400466384208625, "grad_norm": 0.7700957655906677, "learning_rate": 5.332924478710356e-05, "loss": 3.7676, "step": 5330 }, { "epoch": 1.4009918875422867, "grad_norm": 0.893653154373169, "learning_rate": 5.33117224461188e-05, "loss": 3.7698, "step": 5332 }, { "epoch": 1.4015173908759484, "grad_norm": 0.8447924852371216, "learning_rate": 5.329420010513405e-05, "loss": 3.7908, "step": 5334 }, { "epoch": 1.4020428942096101, "grad_norm": 0.7929844856262207, "learning_rate": 5.32766777641493e-05, "loss": 3.7817, "step": 5336 }, { "epoch": 1.402568397543272, "grad_norm": 0.8445258140563965, "learning_rate": 5.325915542316454e-05, "loss": 3.8462, "step": 5338 }, { "epoch": 1.4030939008769336, "grad_norm": 0.7757857441902161, "learning_rate": 5.3241633082179786e-05, "loss": 3.7776, "step": 5340 }, { "epoch": 1.4036194042105954, "grad_norm": 0.7645360827445984, "learning_rate": 5.322411074119502e-05, "loss": 3.8191, "step": 5342 }, { "epoch": 1.4041449075442571, "grad_norm": 0.8456705212593079, "learning_rate": 5.320658840021027e-05, "loss": 3.7834, "step": 5344 }, { "epoch": 1.404670410877919, "grad_norm": 0.8268540501594543, "learning_rate": 5.3189066059225515e-05, "loss": 3.8807, "step": 5346 }, { "epoch": 1.4051959142115809, "grad_norm": 
0.7606985569000244, "learning_rate": 5.3171543718240756e-05, "loss": 3.8084, "step": 5348 }, { "epoch": 1.4057214175452426, "grad_norm": 0.7785517573356628, "learning_rate": 5.3154021377256003e-05, "loss": 3.8178, "step": 5350 }, { "epoch": 1.4062469208789043, "grad_norm": 0.826353132724762, "learning_rate": 5.313649903627125e-05, "loss": 3.822, "step": 5352 }, { "epoch": 1.406772424212566, "grad_norm": 0.8289223313331604, "learning_rate": 5.311897669528649e-05, "loss": 3.7515, "step": 5354 }, { "epoch": 1.4072979275462278, "grad_norm": 0.7275674939155579, "learning_rate": 5.310145435430174e-05, "loss": 3.8438, "step": 5356 }, { "epoch": 1.4078234308798896, "grad_norm": 0.803577721118927, "learning_rate": 5.308393201331699e-05, "loss": 3.8376, "step": 5358 }, { "epoch": 1.4083489342135513, "grad_norm": 0.7433082461357117, "learning_rate": 5.306640967233223e-05, "loss": 3.8559, "step": 5360 }, { "epoch": 1.408874437547213, "grad_norm": 0.7552867531776428, "learning_rate": 5.3048887331347475e-05, "loss": 3.7839, "step": 5362 }, { "epoch": 1.409399940880875, "grad_norm": 0.8351026773452759, "learning_rate": 5.303136499036272e-05, "loss": 3.867, "step": 5364 }, { "epoch": 1.4099254442145368, "grad_norm": 0.8231925368309021, "learning_rate": 5.3013842649377964e-05, "loss": 3.8202, "step": 5366 }, { "epoch": 1.4104509475481986, "grad_norm": 0.829059362411499, "learning_rate": 5.29963203083932e-05, "loss": 3.8095, "step": 5368 }, { "epoch": 1.4109764508818603, "grad_norm": 0.7670896649360657, "learning_rate": 5.2978797967408445e-05, "loss": 3.7836, "step": 5370 }, { "epoch": 1.411501954215522, "grad_norm": 0.8551681637763977, "learning_rate": 5.296127562642369e-05, "loss": 3.8088, "step": 5372 }, { "epoch": 1.4120274575491838, "grad_norm": 0.8082032799720764, "learning_rate": 5.2943753285438934e-05, "loss": 3.771, "step": 5374 }, { "epoch": 1.4125529608828455, "grad_norm": 0.7840303182601929, "learning_rate": 5.292623094445418e-05, "loss": 3.8082, "step": 5376 }, { 
"epoch": 1.4130784642165073, "grad_norm": 0.8529895544052124, "learning_rate": 5.290870860346943e-05, "loss": 3.7724, "step": 5378 }, { "epoch": 1.413603967550169, "grad_norm": 0.8248435854911804, "learning_rate": 5.289118626248467e-05, "loss": 3.8005, "step": 5380 }, { "epoch": 1.414129470883831, "grad_norm": 0.8124716281890869, "learning_rate": 5.287366392149992e-05, "loss": 3.807, "step": 5382 }, { "epoch": 1.4146549742174928, "grad_norm": 0.9543192982673645, "learning_rate": 5.2856141580515165e-05, "loss": 3.7884, "step": 5384 }, { "epoch": 1.4151804775511545, "grad_norm": 0.8204013705253601, "learning_rate": 5.2838619239530405e-05, "loss": 3.7863, "step": 5386 }, { "epoch": 1.4157059808848163, "grad_norm": 0.8300648927688599, "learning_rate": 5.282109689854565e-05, "loss": 3.8661, "step": 5388 }, { "epoch": 1.416231484218478, "grad_norm": 0.8752180933952332, "learning_rate": 5.28035745575609e-05, "loss": 3.8551, "step": 5390 }, { "epoch": 1.4167569875521397, "grad_norm": 0.8593193292617798, "learning_rate": 5.2786052216576134e-05, "loss": 3.8063, "step": 5392 }, { "epoch": 1.4172824908858015, "grad_norm": 0.8264945149421692, "learning_rate": 5.2768529875591375e-05, "loss": 3.8049, "step": 5394 }, { "epoch": 1.4178079942194635, "grad_norm": 0.8230407238006592, "learning_rate": 5.275100753460662e-05, "loss": 3.7632, "step": 5396 }, { "epoch": 1.418333497553125, "grad_norm": 0.9782278537750244, "learning_rate": 5.273348519362187e-05, "loss": 3.819, "step": 5398 }, { "epoch": 1.418859000886787, "grad_norm": 0.8258353471755981, "learning_rate": 5.271596285263711e-05, "loss": 3.7991, "step": 5400 }, { "epoch": 1.4193845042204487, "grad_norm": 0.8113495111465454, "learning_rate": 5.269844051165236e-05, "loss": 3.8158, "step": 5402 }, { "epoch": 1.4199100075541105, "grad_norm": 0.7955632209777832, "learning_rate": 5.2680918170667606e-05, "loss": 3.8222, "step": 5404 }, { "epoch": 1.4204355108877722, "grad_norm": 1.0060091018676758, "learning_rate": 
5.266339582968285e-05, "loss": 3.8627, "step": 5406 }, { "epoch": 1.420961014221434, "grad_norm": 0.8857936263084412, "learning_rate": 5.2645873488698095e-05, "loss": 3.7927, "step": 5408 }, { "epoch": 1.4214865175550957, "grad_norm": 0.8246011734008789, "learning_rate": 5.262835114771334e-05, "loss": 3.7959, "step": 5410 }, { "epoch": 1.4220120208887574, "grad_norm": 0.7787899374961853, "learning_rate": 5.261082880672858e-05, "loss": 3.8192, "step": 5412 }, { "epoch": 1.4225375242224194, "grad_norm": 0.8777243494987488, "learning_rate": 5.259330646574383e-05, "loss": 3.8458, "step": 5414 }, { "epoch": 1.423063027556081, "grad_norm": 0.813235342502594, "learning_rate": 5.257578412475908e-05, "loss": 3.7907, "step": 5416 }, { "epoch": 1.423588530889743, "grad_norm": 0.858550488948822, "learning_rate": 5.255826178377431e-05, "loss": 3.7984, "step": 5418 }, { "epoch": 1.4241140342234047, "grad_norm": 0.8101441860198975, "learning_rate": 5.254073944278955e-05, "loss": 3.8302, "step": 5420 }, { "epoch": 1.4246395375570664, "grad_norm": 0.8259087800979614, "learning_rate": 5.25232171018048e-05, "loss": 3.7844, "step": 5422 }, { "epoch": 1.4251650408907282, "grad_norm": 0.8048234581947327, "learning_rate": 5.250569476082005e-05, "loss": 3.8096, "step": 5424 }, { "epoch": 1.42569054422439, "grad_norm": 0.7219498157501221, "learning_rate": 5.248817241983529e-05, "loss": 3.7963, "step": 5426 }, { "epoch": 1.4262160475580516, "grad_norm": 0.8999656438827515, "learning_rate": 5.2470650078850536e-05, "loss": 3.8356, "step": 5428 }, { "epoch": 1.4267415508917134, "grad_norm": 0.9249022006988525, "learning_rate": 5.2453127737865784e-05, "loss": 3.7845, "step": 5430 }, { "epoch": 1.4272670542253754, "grad_norm": 0.8567255735397339, "learning_rate": 5.2435605396881025e-05, "loss": 3.8261, "step": 5432 }, { "epoch": 1.427792557559037, "grad_norm": 0.8123960494995117, "learning_rate": 5.241808305589627e-05, "loss": 3.8092, "step": 5434 }, { "epoch": 1.4283180608926989, "grad_norm": 
0.9137079119682312, "learning_rate": 5.240056071491152e-05, "loss": 3.809, "step": 5436 }, { "epoch": 1.4288435642263606, "grad_norm": 0.7289377450942993, "learning_rate": 5.238303837392676e-05, "loss": 3.8238, "step": 5438 }, { "epoch": 1.4293690675600224, "grad_norm": 0.8465163707733154, "learning_rate": 5.236551603294201e-05, "loss": 3.8135, "step": 5440 }, { "epoch": 1.429894570893684, "grad_norm": 0.8069952726364136, "learning_rate": 5.2347993691957256e-05, "loss": 3.8122, "step": 5442 }, { "epoch": 1.4304200742273459, "grad_norm": 0.8606716990470886, "learning_rate": 5.233047135097249e-05, "loss": 3.7856, "step": 5444 }, { "epoch": 1.4309455775610076, "grad_norm": 0.7576964497566223, "learning_rate": 5.231294900998773e-05, "loss": 3.8332, "step": 5446 }, { "epoch": 1.4314710808946693, "grad_norm": 0.8836169242858887, "learning_rate": 5.229542666900298e-05, "loss": 3.8172, "step": 5448 }, { "epoch": 1.4319965842283313, "grad_norm": 0.8972988128662109, "learning_rate": 5.2277904328018226e-05, "loss": 3.7996, "step": 5450 }, { "epoch": 1.4325220875619928, "grad_norm": 0.8390241861343384, "learning_rate": 5.2260381987033467e-05, "loss": 3.8059, "step": 5452 }, { "epoch": 1.4330475908956548, "grad_norm": 0.7485369443893433, "learning_rate": 5.2242859646048714e-05, "loss": 3.8255, "step": 5454 }, { "epoch": 1.4335730942293166, "grad_norm": 0.8171684741973877, "learning_rate": 5.222533730506396e-05, "loss": 3.8331, "step": 5456 }, { "epoch": 1.4340985975629783, "grad_norm": 0.7745270133018494, "learning_rate": 5.22078149640792e-05, "loss": 3.8422, "step": 5458 }, { "epoch": 1.43462410089664, "grad_norm": 0.8034564852714539, "learning_rate": 5.219029262309445e-05, "loss": 3.8275, "step": 5460 }, { "epoch": 1.4351496042303018, "grad_norm": 0.7698829174041748, "learning_rate": 5.21727702821097e-05, "loss": 3.7656, "step": 5462 }, { "epoch": 1.4356751075639635, "grad_norm": 0.8491415977478027, "learning_rate": 5.215524794112494e-05, "loss": 3.828, "step": 5464 }, { 
"epoch": 1.4362006108976253, "grad_norm": 0.7913371324539185, "learning_rate": 5.2137725600140186e-05, "loss": 3.798, "step": 5466 }, { "epoch": 1.4367261142312873, "grad_norm": 0.777247965335846, "learning_rate": 5.212020325915542e-05, "loss": 3.8002, "step": 5468 }, { "epoch": 1.4372516175649488, "grad_norm": 0.8189447522163391, "learning_rate": 5.210268091817067e-05, "loss": 3.7649, "step": 5470 }, { "epoch": 1.4377771208986108, "grad_norm": 0.7614515423774719, "learning_rate": 5.208515857718591e-05, "loss": 3.7993, "step": 5472 }, { "epoch": 1.4383026242322725, "grad_norm": 0.8353219032287598, "learning_rate": 5.2067636236201156e-05, "loss": 3.8619, "step": 5474 }, { "epoch": 1.4388281275659343, "grad_norm": 0.7375175356864929, "learning_rate": 5.2050113895216403e-05, "loss": 3.7864, "step": 5476 }, { "epoch": 1.439353630899596, "grad_norm": 0.8103005886077881, "learning_rate": 5.2032591554231644e-05, "loss": 3.819, "step": 5478 }, { "epoch": 1.4398791342332578, "grad_norm": 0.8226494193077087, "learning_rate": 5.201506921324689e-05, "loss": 3.7652, "step": 5480 }, { "epoch": 1.4404046375669195, "grad_norm": 0.8519133925437927, "learning_rate": 5.199754687226214e-05, "loss": 3.7911, "step": 5482 }, { "epoch": 1.4409301409005812, "grad_norm": 0.8055192828178406, "learning_rate": 5.198002453127738e-05, "loss": 3.7628, "step": 5484 }, { "epoch": 1.4414556442342432, "grad_norm": 0.760606050491333, "learning_rate": 5.196250219029263e-05, "loss": 3.8021, "step": 5486 }, { "epoch": 1.441981147567905, "grad_norm": 0.9848291873931885, "learning_rate": 5.1944979849307875e-05, "loss": 3.8202, "step": 5488 }, { "epoch": 1.4425066509015667, "grad_norm": 0.8020250201225281, "learning_rate": 5.1927457508323116e-05, "loss": 3.8527, "step": 5490 }, { "epoch": 1.4430321542352285, "grad_norm": 0.8161712884902954, "learning_rate": 5.1909935167338364e-05, "loss": 3.783, "step": 5492 }, { "epoch": 1.4435576575688902, "grad_norm": 0.7724359631538391, "learning_rate": 
5.18924128263536e-05, "loss": 3.7991, "step": 5494 }, { "epoch": 1.444083160902552, "grad_norm": 0.8599538803100586, "learning_rate": 5.1874890485368845e-05, "loss": 3.8139, "step": 5496 }, { "epoch": 1.4446086642362137, "grad_norm": 0.8076629042625427, "learning_rate": 5.1857368144384086e-05, "loss": 3.8527, "step": 5498 }, { "epoch": 1.4451341675698754, "grad_norm": 0.985328733921051, "learning_rate": 5.1839845803399334e-05, "loss": 3.8268, "step": 5500 }, { "epoch": 1.4456596709035372, "grad_norm": 0.8373243808746338, "learning_rate": 5.182232346241458e-05, "loss": 3.8613, "step": 5502 }, { "epoch": 1.4461851742371992, "grad_norm": 0.9131022691726685, "learning_rate": 5.180480112142982e-05, "loss": 3.8005, "step": 5504 }, { "epoch": 1.446710677570861, "grad_norm": 0.8112351298332214, "learning_rate": 5.178727878044507e-05, "loss": 3.8035, "step": 5506 }, { "epoch": 1.4472361809045227, "grad_norm": 0.981235682964325, "learning_rate": 5.176975643946032e-05, "loss": 3.7447, "step": 5508 }, { "epoch": 1.4477616842381844, "grad_norm": 0.795303463935852, "learning_rate": 5.175223409847556e-05, "loss": 3.7816, "step": 5510 }, { "epoch": 1.4482871875718462, "grad_norm": 0.780414879322052, "learning_rate": 5.1734711757490805e-05, "loss": 3.8053, "step": 5512 }, { "epoch": 1.448812690905508, "grad_norm": 0.7531141042709351, "learning_rate": 5.171718941650605e-05, "loss": 3.7891, "step": 5514 }, { "epoch": 1.4493381942391697, "grad_norm": 0.7829126119613647, "learning_rate": 5.1699667075521294e-05, "loss": 3.8343, "step": 5516 }, { "epoch": 1.4498636975728314, "grad_norm": 0.7330493927001953, "learning_rate": 5.168214473453654e-05, "loss": 3.7577, "step": 5518 }, { "epoch": 1.4503892009064931, "grad_norm": 0.9643624424934387, "learning_rate": 5.1664622393551775e-05, "loss": 3.8422, "step": 5520 }, { "epoch": 1.4509147042401551, "grad_norm": 0.8860618472099304, "learning_rate": 5.164710005256702e-05, "loss": 3.8184, "step": 5522 }, { "epoch": 1.4514402075738169, 
"grad_norm": 0.7112120389938354, "learning_rate": 5.1629577711582264e-05, "loss": 3.785, "step": 5524 }, { "epoch": 1.4519657109074786, "grad_norm": 0.8662072420120239, "learning_rate": 5.161205537059751e-05, "loss": 3.7718, "step": 5526 }, { "epoch": 1.4524912142411404, "grad_norm": 0.7539570331573486, "learning_rate": 5.159453302961276e-05, "loss": 3.7747, "step": 5528 }, { "epoch": 1.453016717574802, "grad_norm": 0.9109886288642883, "learning_rate": 5.1577010688628e-05, "loss": 3.8723, "step": 5530 }, { "epoch": 1.4535422209084639, "grad_norm": 0.7160374522209167, "learning_rate": 5.155948834764325e-05, "loss": 3.829, "step": 5532 }, { "epoch": 1.4540677242421256, "grad_norm": 0.8228909373283386, "learning_rate": 5.1541966006658495e-05, "loss": 3.8256, "step": 5534 }, { "epoch": 1.4545932275757874, "grad_norm": 0.7639895081520081, "learning_rate": 5.1524443665673735e-05, "loss": 3.8432, "step": 5536 }, { "epoch": 1.455118730909449, "grad_norm": 0.8514497876167297, "learning_rate": 5.150692132468898e-05, "loss": 3.8121, "step": 5538 }, { "epoch": 1.455644234243111, "grad_norm": 0.7883850932121277, "learning_rate": 5.148939898370423e-05, "loss": 3.7407, "step": 5540 }, { "epoch": 1.4561697375767728, "grad_norm": 0.7939605712890625, "learning_rate": 5.147187664271947e-05, "loss": 3.8386, "step": 5542 }, { "epoch": 1.4566952409104346, "grad_norm": 0.8942492604255676, "learning_rate": 5.145435430173472e-05, "loss": 3.7854, "step": 5544 }, { "epoch": 1.4572207442440963, "grad_norm": 0.7837711572647095, "learning_rate": 5.143683196074995e-05, "loss": 3.7978, "step": 5546 }, { "epoch": 1.457746247577758, "grad_norm": 0.8352466225624084, "learning_rate": 5.14193096197652e-05, "loss": 3.7953, "step": 5548 }, { "epoch": 1.4582717509114198, "grad_norm": 0.9002737998962402, "learning_rate": 5.140178727878044e-05, "loss": 3.7801, "step": 5550 }, { "epoch": 1.4587972542450816, "grad_norm": 0.8742265701293945, "learning_rate": 5.138426493779569e-05, "loss": 3.8207, "step": 5552 
}, { "epoch": 1.4593227575787435, "grad_norm": 0.7376540303230286, "learning_rate": 5.1366742596810936e-05, "loss": 3.758, "step": 5554 }, { "epoch": 1.459848260912405, "grad_norm": 0.8591023087501526, "learning_rate": 5.134922025582618e-05, "loss": 3.7624, "step": 5556 }, { "epoch": 1.460373764246067, "grad_norm": 0.893268883228302, "learning_rate": 5.1331697914841425e-05, "loss": 3.8218, "step": 5558 }, { "epoch": 1.4608992675797288, "grad_norm": 0.7790848612785339, "learning_rate": 5.131417557385667e-05, "loss": 3.8497, "step": 5560 }, { "epoch": 1.4614247709133905, "grad_norm": 0.8138498663902283, "learning_rate": 5.129665323287191e-05, "loss": 3.7921, "step": 5562 }, { "epoch": 1.4619502742470523, "grad_norm": 0.8949870467185974, "learning_rate": 5.127913089188716e-05, "loss": 3.7874, "step": 5564 }, { "epoch": 1.462475777580714, "grad_norm": 0.9230346083641052, "learning_rate": 5.126160855090241e-05, "loss": 3.8283, "step": 5566 }, { "epoch": 1.4630012809143758, "grad_norm": 0.747728168964386, "learning_rate": 5.124408620991765e-05, "loss": 3.8575, "step": 5568 }, { "epoch": 1.4635267842480375, "grad_norm": 0.7606027722358704, "learning_rate": 5.122656386893288e-05, "loss": 3.7813, "step": 5570 }, { "epoch": 1.4640522875816995, "grad_norm": 0.8524044752120972, "learning_rate": 5.120904152794813e-05, "loss": 3.7557, "step": 5572 }, { "epoch": 1.464577790915361, "grad_norm": 0.8756555914878845, "learning_rate": 5.119151918696338e-05, "loss": 3.7707, "step": 5574 }, { "epoch": 1.465103294249023, "grad_norm": 0.8148065805435181, "learning_rate": 5.117399684597862e-05, "loss": 3.8523, "step": 5576 }, { "epoch": 1.4656287975826847, "grad_norm": 0.8769358396530151, "learning_rate": 5.1156474504993867e-05, "loss": 3.7591, "step": 5578 }, { "epoch": 1.4661543009163465, "grad_norm": 0.8040145635604858, "learning_rate": 5.1138952164009114e-05, "loss": 3.8036, "step": 5580 }, { "epoch": 1.4666798042500082, "grad_norm": 0.8731288313865662, "learning_rate": 
5.1121429823024355e-05, "loss": 3.7805, "step": 5582 }, { "epoch": 1.46720530758367, "grad_norm": 0.7683684825897217, "learning_rate": 5.11039074820396e-05, "loss": 3.8479, "step": 5584 }, { "epoch": 1.4677308109173317, "grad_norm": 0.7817125916481018, "learning_rate": 5.108638514105485e-05, "loss": 3.7999, "step": 5586 }, { "epoch": 1.4682563142509935, "grad_norm": 0.8041476607322693, "learning_rate": 5.10688628000701e-05, "loss": 3.8369, "step": 5588 }, { "epoch": 1.4687818175846554, "grad_norm": 0.7490681409835815, "learning_rate": 5.105134045908534e-05, "loss": 3.7661, "step": 5590 }, { "epoch": 1.469307320918317, "grad_norm": 0.8240200877189636, "learning_rate": 5.1033818118100586e-05, "loss": 3.8335, "step": 5592 }, { "epoch": 1.469832824251979, "grad_norm": 0.9831581711769104, "learning_rate": 5.1016295777115833e-05, "loss": 3.7964, "step": 5594 }, { "epoch": 1.4703583275856407, "grad_norm": 0.7661347985267639, "learning_rate": 5.099877343613107e-05, "loss": 3.856, "step": 5596 }, { "epoch": 1.4708838309193024, "grad_norm": 0.768456757068634, "learning_rate": 5.098125109514631e-05, "loss": 3.7938, "step": 5598 }, { "epoch": 1.4714093342529642, "grad_norm": 0.8076183795928955, "learning_rate": 5.0963728754161556e-05, "loss": 3.8334, "step": 5600 }, { "epoch": 1.4714093342529642, "eval_loss": 3.7884368896484375, "eval_runtime": 464.7146, "eval_samples_per_second": 262.073, "eval_steps_per_second": 8.19, "step": 5600 }, { "epoch": 1.471934837586626, "grad_norm": 0.8743784427642822, "learning_rate": 5.09462064131768e-05, "loss": 3.7653, "step": 5602 }, { "epoch": 1.4724603409202877, "grad_norm": 0.7425801753997803, "learning_rate": 5.0928684072192044e-05, "loss": 3.7671, "step": 5604 }, { "epoch": 1.4729858442539494, "grad_norm": 0.885154128074646, "learning_rate": 5.091116173120729e-05, "loss": 3.8033, "step": 5606 }, { "epoch": 1.4735113475876114, "grad_norm": 0.726342499256134, "learning_rate": 5.089363939022254e-05, "loss": 3.7954, "step": 5608 }, { "epoch": 
1.474036850921273, "grad_norm": 0.8145415782928467, "learning_rate": 5.087611704923778e-05, "loss": 3.8211, "step": 5610 }, { "epoch": 1.4745623542549349, "grad_norm": 0.8458108305931091, "learning_rate": 5.085859470825303e-05, "loss": 3.816, "step": 5612 }, { "epoch": 1.4750878575885966, "grad_norm": 0.7960172295570374, "learning_rate": 5.0841072367268275e-05, "loss": 3.8747, "step": 5614 }, { "epoch": 1.4756133609222584, "grad_norm": 0.8095530271530151, "learning_rate": 5.0823550026283516e-05, "loss": 3.8058, "step": 5616 }, { "epoch": 1.4761388642559201, "grad_norm": 0.8585817813873291, "learning_rate": 5.0806027685298764e-05, "loss": 3.8842, "step": 5618 }, { "epoch": 1.4766643675895819, "grad_norm": 0.8184722661972046, "learning_rate": 5.078850534431401e-05, "loss": 3.8554, "step": 5620 }, { "epoch": 1.4771898709232436, "grad_norm": 1.0432604551315308, "learning_rate": 5.0770983003329245e-05, "loss": 3.8082, "step": 5622 }, { "epoch": 1.4777153742569054, "grad_norm": 0.8225014209747314, "learning_rate": 5.0753460662344486e-05, "loss": 3.7836, "step": 5624 }, { "epoch": 1.4782408775905673, "grad_norm": 0.7736078500747681, "learning_rate": 5.0735938321359733e-05, "loss": 3.7529, "step": 5626 }, { "epoch": 1.4787663809242289, "grad_norm": 0.9180154204368591, "learning_rate": 5.071841598037498e-05, "loss": 3.856, "step": 5628 }, { "epoch": 1.4792918842578908, "grad_norm": 0.812774658203125, "learning_rate": 5.070089363939022e-05, "loss": 3.7906, "step": 5630 }, { "epoch": 1.4798173875915526, "grad_norm": 0.9165852665901184, "learning_rate": 5.068337129840547e-05, "loss": 3.8204, "step": 5632 }, { "epoch": 1.4803428909252143, "grad_norm": 0.9148765206336975, "learning_rate": 5.066584895742072e-05, "loss": 3.7464, "step": 5634 }, { "epoch": 1.480868394258876, "grad_norm": 0.7550075650215149, "learning_rate": 5.064832661643596e-05, "loss": 3.7789, "step": 5636 }, { "epoch": 1.4813938975925378, "grad_norm": 1.0190728902816772, "learning_rate": 5.0630804275451205e-05, 
"loss": 3.7841, "step": 5638 }, { "epoch": 1.4819194009261996, "grad_norm": 0.8609932065010071, "learning_rate": 5.061328193446645e-05, "loss": 3.7749, "step": 5640 }, { "epoch": 1.4824449042598613, "grad_norm": 0.9729079008102417, "learning_rate": 5.0595759593481694e-05, "loss": 3.7851, "step": 5642 }, { "epoch": 1.4829704075935233, "grad_norm": 0.8408140540122986, "learning_rate": 5.057823725249694e-05, "loss": 3.8525, "step": 5644 }, { "epoch": 1.483495910927185, "grad_norm": 0.9662366509437561, "learning_rate": 5.056071491151219e-05, "loss": 3.8095, "step": 5646 }, { "epoch": 1.4840214142608468, "grad_norm": 0.8091290593147278, "learning_rate": 5.054319257052742e-05, "loss": 3.8438, "step": 5648 }, { "epoch": 1.4845469175945085, "grad_norm": 0.8156009912490845, "learning_rate": 5.0525670229542664e-05, "loss": 3.7552, "step": 5650 }, { "epoch": 1.4850724209281703, "grad_norm": 0.793692946434021, "learning_rate": 5.050814788855791e-05, "loss": 3.7819, "step": 5652 }, { "epoch": 1.485597924261832, "grad_norm": 0.7606692910194397, "learning_rate": 5.049062554757316e-05, "loss": 3.8142, "step": 5654 }, { "epoch": 1.4861234275954938, "grad_norm": 0.7728522419929504, "learning_rate": 5.04731032065884e-05, "loss": 3.8325, "step": 5656 }, { "epoch": 1.4866489309291555, "grad_norm": 0.7678856253623962, "learning_rate": 5.045558086560365e-05, "loss": 3.7893, "step": 5658 }, { "epoch": 1.4871744342628173, "grad_norm": 0.9088488221168518, "learning_rate": 5.0438058524618895e-05, "loss": 3.8318, "step": 5660 }, { "epoch": 1.4876999375964792, "grad_norm": 0.7907689809799194, "learning_rate": 5.0420536183634135e-05, "loss": 3.8333, "step": 5662 }, { "epoch": 1.488225440930141, "grad_norm": 0.7841881513595581, "learning_rate": 5.040301384264938e-05, "loss": 3.7822, "step": 5664 }, { "epoch": 1.4887509442638027, "grad_norm": 0.8573900461196899, "learning_rate": 5.038549150166463e-05, "loss": 3.7724, "step": 5666 }, { "epoch": 1.4892764475974645, "grad_norm": 0.7801864743232727, 
"learning_rate": 5.036796916067987e-05, "loss": 3.7916, "step": 5668 }, { "epoch": 1.4898019509311262, "grad_norm": 0.7908510565757751, "learning_rate": 5.035044681969512e-05, "loss": 3.7936, "step": 5670 }, { "epoch": 1.490327454264788, "grad_norm": 0.7904651761054993, "learning_rate": 5.033292447871035e-05, "loss": 3.7933, "step": 5672 }, { "epoch": 1.4908529575984497, "grad_norm": 0.760500967502594, "learning_rate": 5.03154021377256e-05, "loss": 3.84, "step": 5674 }, { "epoch": 1.4913784609321115, "grad_norm": 0.8796125650405884, "learning_rate": 5.029787979674084e-05, "loss": 3.7885, "step": 5676 }, { "epoch": 1.4919039642657732, "grad_norm": 0.8023958206176758, "learning_rate": 5.028035745575609e-05, "loss": 3.7916, "step": 5678 }, { "epoch": 1.4924294675994352, "grad_norm": 0.8676835894584656, "learning_rate": 5.0262835114771336e-05, "loss": 3.8146, "step": 5680 }, { "epoch": 1.492954970933097, "grad_norm": 0.8643673658370972, "learning_rate": 5.024531277378658e-05, "loss": 3.8623, "step": 5682 }, { "epoch": 1.4934804742667587, "grad_norm": 0.8135111331939697, "learning_rate": 5.0227790432801825e-05, "loss": 3.8012, "step": 5684 }, { "epoch": 1.4940059776004204, "grad_norm": 0.7471112608909607, "learning_rate": 5.021026809181707e-05, "loss": 3.8051, "step": 5686 }, { "epoch": 1.4945314809340822, "grad_norm": 0.8078750371932983, "learning_rate": 5.019274575083231e-05, "loss": 3.7694, "step": 5688 }, { "epoch": 1.495056984267744, "grad_norm": 0.8777804374694824, "learning_rate": 5.017522340984756e-05, "loss": 3.804, "step": 5690 }, { "epoch": 1.4955824876014057, "grad_norm": 0.7878788113594055, "learning_rate": 5.015770106886281e-05, "loss": 3.7806, "step": 5692 }, { "epoch": 1.4961079909350674, "grad_norm": 0.788495659828186, "learning_rate": 5.014017872787805e-05, "loss": 3.8329, "step": 5694 }, { "epoch": 1.4966334942687292, "grad_norm": 0.8134931325912476, "learning_rate": 5.0122656386893297e-05, "loss": 3.811, "step": 5696 }, { "epoch": 1.4971589976023911, 
"grad_norm": 0.7978057265281677, "learning_rate": 5.010513404590853e-05, "loss": 3.7772, "step": 5698 }, { "epoch": 1.4976845009360529, "grad_norm": 0.7149028182029724, "learning_rate": 5.008761170492378e-05, "loss": 3.7934, "step": 5700 }, { "epoch": 1.4982100042697146, "grad_norm": 0.7647327780723572, "learning_rate": 5.007008936393902e-05, "loss": 3.7619, "step": 5702 }, { "epoch": 1.4987355076033764, "grad_norm": 0.9027557373046875, "learning_rate": 5.0052567022954266e-05, "loss": 3.7787, "step": 5704 }, { "epoch": 1.4992610109370381, "grad_norm": 0.8233598470687866, "learning_rate": 5.0035044681969514e-05, "loss": 3.8401, "step": 5706 }, { "epoch": 1.4997865142706999, "grad_norm": 0.832959771156311, "learning_rate": 5.0017522340984755e-05, "loss": 3.7658, "step": 5708 }, { "epoch": 1.5003120176043616, "grad_norm": 0.8526417016983032, "learning_rate": 5e-05, "loss": 3.7752, "step": 5710 }, { "epoch": 1.5008375209380236, "grad_norm": 0.7825552225112915, "learning_rate": 4.998247765901525e-05, "loss": 3.8267, "step": 5712 }, { "epoch": 1.501363024271685, "grad_norm": 0.8736358880996704, "learning_rate": 4.996495531803049e-05, "loss": 3.8062, "step": 5714 }, { "epoch": 1.501888527605347, "grad_norm": 0.7811210751533508, "learning_rate": 4.994743297704574e-05, "loss": 3.8004, "step": 5716 }, { "epoch": 1.5024140309390086, "grad_norm": 0.8214970231056213, "learning_rate": 4.992991063606098e-05, "loss": 3.7933, "step": 5718 }, { "epoch": 1.5029395342726706, "grad_norm": 0.9351893663406372, "learning_rate": 4.991238829507623e-05, "loss": 3.8522, "step": 5720 }, { "epoch": 1.5034650376063323, "grad_norm": 0.7767342329025269, "learning_rate": 4.989486595409147e-05, "loss": 3.7666, "step": 5722 }, { "epoch": 1.503990540939994, "grad_norm": 0.8351171612739563, "learning_rate": 4.9877343613106715e-05, "loss": 3.7827, "step": 5724 }, { "epoch": 1.5045160442736558, "grad_norm": 0.8139747381210327, "learning_rate": 4.985982127212196e-05, "loss": 3.7775, "step": 5726 }, { 
"epoch": 1.5050415476073176, "grad_norm": 0.7029263973236084, "learning_rate": 4.98422989311372e-05, "loss": 3.7932, "step": 5728 }, { "epoch": 1.5055670509409795, "grad_norm": 0.8008304238319397, "learning_rate": 4.9824776590152444e-05, "loss": 3.7918, "step": 5730 }, { "epoch": 1.506092554274641, "grad_norm": 0.8330174088478088, "learning_rate": 4.980725424916769e-05, "loss": 3.7801, "step": 5732 }, { "epoch": 1.506618057608303, "grad_norm": 0.8875184059143066, "learning_rate": 4.978973190818293e-05, "loss": 3.7442, "step": 5734 }, { "epoch": 1.5071435609419648, "grad_norm": 0.7884337902069092, "learning_rate": 4.977220956719818e-05, "loss": 3.7881, "step": 5736 }, { "epoch": 1.5076690642756265, "grad_norm": 0.7335008382797241, "learning_rate": 4.975468722621343e-05, "loss": 3.7739, "step": 5738 }, { "epoch": 1.5081945676092883, "grad_norm": 0.8242607116699219, "learning_rate": 4.973716488522867e-05, "loss": 3.8012, "step": 5740 }, { "epoch": 1.50872007094295, "grad_norm": 0.8189392685890198, "learning_rate": 4.971964254424391e-05, "loss": 3.7916, "step": 5742 }, { "epoch": 1.5092455742766118, "grad_norm": 0.999525249004364, "learning_rate": 4.970212020325916e-05, "loss": 3.8089, "step": 5744 }, { "epoch": 1.5097710776102735, "grad_norm": 0.7375800609588623, "learning_rate": 4.9684597862274404e-05, "loss": 3.8276, "step": 5746 }, { "epoch": 1.5102965809439355, "grad_norm": 0.9924771189689636, "learning_rate": 4.9667075521289645e-05, "loss": 3.8315, "step": 5748 }, { "epoch": 1.510822084277597, "grad_norm": 0.7034995555877686, "learning_rate": 4.964955318030489e-05, "loss": 3.8089, "step": 5750 }, { "epoch": 1.511347587611259, "grad_norm": 0.8415871858596802, "learning_rate": 4.963203083932014e-05, "loss": 3.7669, "step": 5752 }, { "epoch": 1.5118730909449207, "grad_norm": 0.7329029440879822, "learning_rate": 4.961450849833538e-05, "loss": 3.8212, "step": 5754 }, { "epoch": 1.5123985942785825, "grad_norm": 0.7459883093833923, "learning_rate": 
4.959698615735062e-05, "loss": 3.7818, "step": 5756 }, { "epoch": 1.5129240976122442, "grad_norm": 0.8561874628067017, "learning_rate": 4.957946381636587e-05, "loss": 3.7516, "step": 5758 }, { "epoch": 1.513449600945906, "grad_norm": 0.7641480565071106, "learning_rate": 4.956194147538111e-05, "loss": 3.8144, "step": 5760 }, { "epoch": 1.5139751042795677, "grad_norm": 0.8569730520248413, "learning_rate": 4.954441913439636e-05, "loss": 3.8341, "step": 5762 }, { "epoch": 1.5145006076132295, "grad_norm": 0.8350128531455994, "learning_rate": 4.9526896793411605e-05, "loss": 3.8812, "step": 5764 }, { "epoch": 1.5150261109468914, "grad_norm": 0.8057486414909363, "learning_rate": 4.9509374452426846e-05, "loss": 3.8171, "step": 5766 }, { "epoch": 1.515551614280553, "grad_norm": 0.8782981038093567, "learning_rate": 4.949185211144209e-05, "loss": 3.8043, "step": 5768 }, { "epoch": 1.516077117614215, "grad_norm": 0.839695930480957, "learning_rate": 4.9474329770457334e-05, "loss": 3.7713, "step": 5770 }, { "epoch": 1.5166026209478767, "grad_norm": 0.8945126533508301, "learning_rate": 4.945680742947258e-05, "loss": 3.8272, "step": 5772 }, { "epoch": 1.5171281242815384, "grad_norm": 0.8364235162734985, "learning_rate": 4.943928508848782e-05, "loss": 3.7876, "step": 5774 }, { "epoch": 1.5176536276152002, "grad_norm": 0.8406170606613159, "learning_rate": 4.942176274750307e-05, "loss": 3.8429, "step": 5776 }, { "epoch": 1.518179130948862, "grad_norm": 0.8335036635398865, "learning_rate": 4.940424040651832e-05, "loss": 3.8309, "step": 5778 }, { "epoch": 1.518704634282524, "grad_norm": 0.8886866569519043, "learning_rate": 4.938671806553356e-05, "loss": 3.8211, "step": 5780 }, { "epoch": 1.5192301376161854, "grad_norm": 0.7038913369178772, "learning_rate": 4.93691957245488e-05, "loss": 3.7778, "step": 5782 }, { "epoch": 1.5197556409498474, "grad_norm": 0.9237792491912842, "learning_rate": 4.935167338356405e-05, "loss": 3.7788, "step": 5784 }, { "epoch": 1.520281144283509, "grad_norm": 
0.8196488618850708, "learning_rate": 4.933415104257929e-05, "loss": 3.7933, "step": 5786 }, { "epoch": 1.5208066476171709, "grad_norm": 0.9380713701248169, "learning_rate": 4.9316628701594535e-05, "loss": 3.8392, "step": 5788 }, { "epoch": 1.5213321509508326, "grad_norm": 1.0269041061401367, "learning_rate": 4.929910636060978e-05, "loss": 3.823, "step": 5790 }, { "epoch": 1.5218576542844944, "grad_norm": 0.7585693001747131, "learning_rate": 4.9281584019625024e-05, "loss": 3.8483, "step": 5792 }, { "epoch": 1.5223831576181561, "grad_norm": 0.7462103962898254, "learning_rate": 4.9264061678640265e-05, "loss": 3.8108, "step": 5794 }, { "epoch": 1.5229086609518179, "grad_norm": 0.8181387782096863, "learning_rate": 4.924653933765551e-05, "loss": 3.7945, "step": 5796 }, { "epoch": 1.5234341642854798, "grad_norm": 0.7723410129547119, "learning_rate": 4.922901699667076e-05, "loss": 3.7831, "step": 5798 }, { "epoch": 1.5239596676191414, "grad_norm": 0.8784385919570923, "learning_rate": 4.9211494655686e-05, "loss": 3.8179, "step": 5800 }, { "epoch": 1.5244851709528033, "grad_norm": 0.853900134563446, "learning_rate": 4.919397231470125e-05, "loss": 3.8117, "step": 5802 }, { "epoch": 1.5250106742864649, "grad_norm": 0.9072641134262085, "learning_rate": 4.9176449973716496e-05, "loss": 3.7525, "step": 5804 }, { "epoch": 1.5255361776201268, "grad_norm": 0.9070600867271423, "learning_rate": 4.915892763273173e-05, "loss": 3.8403, "step": 5806 }, { "epoch": 1.5260616809537886, "grad_norm": 0.7871101498603821, "learning_rate": 4.914140529174698e-05, "loss": 3.8279, "step": 5808 }, { "epoch": 1.5265871842874503, "grad_norm": 0.8213281631469727, "learning_rate": 4.9123882950762225e-05, "loss": 3.786, "step": 5810 }, { "epoch": 1.527112687621112, "grad_norm": 0.7918179631233215, "learning_rate": 4.9106360609777465e-05, "loss": 3.7481, "step": 5812 }, { "epoch": 1.5276381909547738, "grad_norm": 0.8386132717132568, "learning_rate": 4.908883826879271e-05, "loss": 3.7651, "step": 5814 }, { 
"epoch": 1.5281636942884358, "grad_norm": 0.8915697336196899, "learning_rate": 4.907131592780796e-05, "loss": 3.8158, "step": 5816 }, { "epoch": 1.5286891976220973, "grad_norm": 0.7526552677154541, "learning_rate": 4.90537935868232e-05, "loss": 3.7887, "step": 5818 }, { "epoch": 1.5292147009557593, "grad_norm": 0.7683132290840149, "learning_rate": 4.903627124583844e-05, "loss": 3.7302, "step": 5820 }, { "epoch": 1.5297402042894208, "grad_norm": 0.8463097810745239, "learning_rate": 4.901874890485369e-05, "loss": 3.8288, "step": 5822 }, { "epoch": 1.5302657076230828, "grad_norm": 0.8176122903823853, "learning_rate": 4.900122656386894e-05, "loss": 3.8312, "step": 5824 }, { "epoch": 1.5307912109567445, "grad_norm": 0.8250756859779358, "learning_rate": 4.898370422288418e-05, "loss": 3.7654, "step": 5826 }, { "epoch": 1.5313167142904063, "grad_norm": 0.8898065090179443, "learning_rate": 4.8966181881899426e-05, "loss": 3.8109, "step": 5828 }, { "epoch": 1.531842217624068, "grad_norm": 0.7898335456848145, "learning_rate": 4.894865954091467e-05, "loss": 3.8141, "step": 5830 }, { "epoch": 1.5323677209577298, "grad_norm": 0.8787068724632263, "learning_rate": 4.893113719992991e-05, "loss": 3.7725, "step": 5832 }, { "epoch": 1.5328932242913917, "grad_norm": 0.8658038973808289, "learning_rate": 4.8913614858945155e-05, "loss": 3.8193, "step": 5834 }, { "epoch": 1.5334187276250533, "grad_norm": 0.7439196109771729, "learning_rate": 4.88960925179604e-05, "loss": 3.7906, "step": 5836 }, { "epoch": 1.5339442309587152, "grad_norm": 0.7960675358772278, "learning_rate": 4.887857017697565e-05, "loss": 3.7958, "step": 5838 }, { "epoch": 1.5344697342923768, "grad_norm": 0.7764498591423035, "learning_rate": 4.886104783599089e-05, "loss": 3.797, "step": 5840 }, { "epoch": 1.5349952376260387, "grad_norm": 0.9149618744850159, "learning_rate": 4.884352549500614e-05, "loss": 3.8356, "step": 5842 }, { "epoch": 1.5355207409597005, "grad_norm": 0.8216813206672668, "learning_rate": 
4.882600315402138e-05, "loss": 3.8607, "step": 5844 }, { "epoch": 1.5360462442933622, "grad_norm": 0.872950553894043, "learning_rate": 4.880848081303662e-05, "loss": 3.7818, "step": 5846 }, { "epoch": 1.536571747627024, "grad_norm": 0.8343134522438049, "learning_rate": 4.879095847205187e-05, "loss": 3.7636, "step": 5848 }, { "epoch": 1.5370972509606857, "grad_norm": 0.8001225590705872, "learning_rate": 4.8773436131067115e-05, "loss": 3.7821, "step": 5850 }, { "epoch": 1.5376227542943477, "grad_norm": 0.8232811093330383, "learning_rate": 4.8755913790082356e-05, "loss": 3.8048, "step": 5852 }, { "epoch": 1.5381482576280092, "grad_norm": 1.0725347995758057, "learning_rate": 4.87383914490976e-05, "loss": 3.8066, "step": 5854 }, { "epoch": 1.5386737609616712, "grad_norm": 0.8735494017601013, "learning_rate": 4.872086910811285e-05, "loss": 3.7463, "step": 5856 }, { "epoch": 1.5391992642953327, "grad_norm": 1.0102304220199585, "learning_rate": 4.870334676712809e-05, "loss": 3.7754, "step": 5858 }, { "epoch": 1.5397247676289947, "grad_norm": 0.8088173866271973, "learning_rate": 4.868582442614333e-05, "loss": 3.7302, "step": 5860 }, { "epoch": 1.5402502709626564, "grad_norm": 0.7691337466239929, "learning_rate": 4.866830208515858e-05, "loss": 3.7664, "step": 5862 }, { "epoch": 1.5407757742963182, "grad_norm": 0.8820752501487732, "learning_rate": 4.865077974417383e-05, "loss": 3.8225, "step": 5864 }, { "epoch": 1.54130127762998, "grad_norm": 0.8896738886833191, "learning_rate": 4.863325740318907e-05, "loss": 3.7765, "step": 5866 }, { "epoch": 1.5418267809636417, "grad_norm": 0.8754159212112427, "learning_rate": 4.8615735062204316e-05, "loss": 3.7976, "step": 5868 }, { "epoch": 1.5423522842973036, "grad_norm": 0.9071822166442871, "learning_rate": 4.859821272121956e-05, "loss": 3.8508, "step": 5870 }, { "epoch": 1.5428777876309652, "grad_norm": 0.8777666091918945, "learning_rate": 4.85806903802348e-05, "loss": 3.7541, "step": 5872 }, { "epoch": 1.5434032909646271, "grad_norm": 
0.7940821051597595, "learning_rate": 4.8563168039250045e-05, "loss": 3.8088, "step": 5874 }, { "epoch": 1.5439287942982887, "grad_norm": 0.8013737201690674, "learning_rate": 4.854564569826529e-05, "loss": 3.7998, "step": 5876 }, { "epoch": 1.5444542976319506, "grad_norm": 0.7513417601585388, "learning_rate": 4.8528123357280533e-05, "loss": 3.7931, "step": 5878 }, { "epoch": 1.5449798009656124, "grad_norm": 0.9275540113449097, "learning_rate": 4.851060101629578e-05, "loss": 3.8068, "step": 5880 }, { "epoch": 1.5455053042992741, "grad_norm": 0.7647203207015991, "learning_rate": 4.849307867531102e-05, "loss": 3.7834, "step": 5882 }, { "epoch": 1.5460308076329359, "grad_norm": 0.8482243418693542, "learning_rate": 4.847555633432627e-05, "loss": 3.8046, "step": 5884 }, { "epoch": 1.5465563109665976, "grad_norm": 0.7866527438163757, "learning_rate": 4.845803399334151e-05, "loss": 3.7756, "step": 5886 }, { "epoch": 1.5470818143002596, "grad_norm": 0.8967549204826355, "learning_rate": 4.844051165235676e-05, "loss": 3.8398, "step": 5888 }, { "epoch": 1.5476073176339211, "grad_norm": 0.8690900802612305, "learning_rate": 4.8422989311372005e-05, "loss": 3.7644, "step": 5890 }, { "epoch": 1.548132820967583, "grad_norm": 0.7294548749923706, "learning_rate": 4.8405466970387246e-05, "loss": 3.7975, "step": 5892 }, { "epoch": 1.5486583243012448, "grad_norm": 0.7357770800590515, "learning_rate": 4.8387944629402494e-05, "loss": 3.8286, "step": 5894 }, { "epoch": 1.5491838276349066, "grad_norm": 0.7917661070823669, "learning_rate": 4.8370422288417734e-05, "loss": 3.7828, "step": 5896 }, { "epoch": 1.5497093309685683, "grad_norm": 0.8546766042709351, "learning_rate": 4.8352899947432975e-05, "loss": 3.8032, "step": 5898 }, { "epoch": 1.55023483430223, "grad_norm": 0.8752919435501099, "learning_rate": 4.833537760644822e-05, "loss": 3.828, "step": 5900 }, { "epoch": 1.5507603376358918, "grad_norm": 0.7997423410415649, "learning_rate": 4.831785526546347e-05, "loss": 3.8514, "step": 5902 }, 
{ "epoch": 1.5512858409695536, "grad_norm": 0.9060472249984741, "learning_rate": 4.830033292447871e-05, "loss": 3.7698, "step": 5904 }, { "epoch": 1.5518113443032155, "grad_norm": 0.7657392621040344, "learning_rate": 4.828281058349396e-05, "loss": 3.8073, "step": 5906 }, { "epoch": 1.552336847636877, "grad_norm": 0.8406884074211121, "learning_rate": 4.82652882425092e-05, "loss": 3.7767, "step": 5908 }, { "epoch": 1.552862350970539, "grad_norm": 0.9174937009811401, "learning_rate": 4.824776590152445e-05, "loss": 3.7948, "step": 5910 }, { "epoch": 1.5533878543042008, "grad_norm": 0.8100804686546326, "learning_rate": 4.823024356053969e-05, "loss": 3.8069, "step": 5912 }, { "epoch": 1.5539133576378625, "grad_norm": 0.7440721988677979, "learning_rate": 4.8212721219554935e-05, "loss": 3.7796, "step": 5914 }, { "epoch": 1.5544388609715243, "grad_norm": 0.7421438694000244, "learning_rate": 4.819519887857018e-05, "loss": 3.7366, "step": 5916 }, { "epoch": 1.554964364305186, "grad_norm": 0.7974204421043396, "learning_rate": 4.8177676537585424e-05, "loss": 3.7689, "step": 5918 }, { "epoch": 1.5554898676388478, "grad_norm": 0.7706868648529053, "learning_rate": 4.816015419660067e-05, "loss": 3.8191, "step": 5920 }, { "epoch": 1.5560153709725095, "grad_norm": 0.9241908192634583, "learning_rate": 4.814263185561591e-05, "loss": 3.8157, "step": 5922 }, { "epoch": 1.5565408743061715, "grad_norm": 0.7510288953781128, "learning_rate": 4.812510951463115e-05, "loss": 3.7978, "step": 5924 }, { "epoch": 1.557066377639833, "grad_norm": 0.8646233081817627, "learning_rate": 4.81075871736464e-05, "loss": 3.7349, "step": 5926 }, { "epoch": 1.557591880973495, "grad_norm": 1.007100224494934, "learning_rate": 4.809006483266165e-05, "loss": 3.8228, "step": 5928 }, { "epoch": 1.5581173843071567, "grad_norm": 0.8325265049934387, "learning_rate": 4.807254249167689e-05, "loss": 3.7908, "step": 5930 }, { "epoch": 1.5586428876408185, "grad_norm": 0.7148762941360474, "learning_rate": 
4.8055020150692136e-05, "loss": 3.7982, "step": 5932 }, { "epoch": 1.5591683909744802, "grad_norm": 0.740319013595581, "learning_rate": 4.803749780970738e-05, "loss": 3.8204, "step": 5934 }, { "epoch": 1.559693894308142, "grad_norm": 0.8821399211883545, "learning_rate": 4.8019975468722625e-05, "loss": 3.7395, "step": 5936 }, { "epoch": 1.560219397641804, "grad_norm": 0.8112403750419617, "learning_rate": 4.8002453127737865e-05, "loss": 3.8528, "step": 5938 }, { "epoch": 1.5607449009754655, "grad_norm": 0.9307340383529663, "learning_rate": 4.798493078675311e-05, "loss": 3.8352, "step": 5940 }, { "epoch": 1.5612704043091274, "grad_norm": 0.8245285749435425, "learning_rate": 4.796740844576836e-05, "loss": 3.8012, "step": 5942 }, { "epoch": 1.561795907642789, "grad_norm": 0.8340120911598206, "learning_rate": 4.79498861047836e-05, "loss": 3.7958, "step": 5944 }, { "epoch": 1.562321410976451, "grad_norm": 0.8080868721008301, "learning_rate": 4.793236376379884e-05, "loss": 3.729, "step": 5946 }, { "epoch": 1.5628469143101127, "grad_norm": 0.8628913760185242, "learning_rate": 4.791484142281409e-05, "loss": 3.7928, "step": 5948 }, { "epoch": 1.5633724176437744, "grad_norm": 0.8633995056152344, "learning_rate": 4.789731908182933e-05, "loss": 3.821, "step": 5950 }, { "epoch": 1.5638979209774362, "grad_norm": 0.8742152452468872, "learning_rate": 4.787979674084458e-05, "loss": 3.8147, "step": 5952 }, { "epoch": 1.564423424311098, "grad_norm": 0.7905998826026917, "learning_rate": 4.7862274399859826e-05, "loss": 3.7701, "step": 5954 }, { "epoch": 1.56494892764476, "grad_norm": 0.9115804433822632, "learning_rate": 4.7844752058875066e-05, "loss": 3.7542, "step": 5956 }, { "epoch": 1.5654744309784214, "grad_norm": 0.8164551854133606, "learning_rate": 4.7827229717890314e-05, "loss": 3.8016, "step": 5958 }, { "epoch": 1.5659999343120834, "grad_norm": 1.0302069187164307, "learning_rate": 4.7809707376905555e-05, "loss": 3.8265, "step": 5960 }, { "epoch": 1.566525437645745, "grad_norm": 
0.8925058841705322, "learning_rate": 4.77921850359208e-05, "loss": 3.7767, "step": 5962 }, { "epoch": 1.567050940979407, "grad_norm": 0.8168877959251404, "learning_rate": 4.777466269493604e-05, "loss": 3.8044, "step": 5964 }, { "epoch": 1.5675764443130686, "grad_norm": 0.82443767786026, "learning_rate": 4.775714035395129e-05, "loss": 3.7768, "step": 5966 }, { "epoch": 1.5681019476467304, "grad_norm": 0.8865484595298767, "learning_rate": 4.773961801296654e-05, "loss": 3.7395, "step": 5968 }, { "epoch": 1.5686274509803921, "grad_norm": 0.8249469995498657, "learning_rate": 4.772209567198178e-05, "loss": 3.8382, "step": 5970 }, { "epoch": 1.5691529543140539, "grad_norm": 0.7853021025657654, "learning_rate": 4.770457333099702e-05, "loss": 3.7899, "step": 5972 }, { "epoch": 1.5696784576477159, "grad_norm": 0.8331713080406189, "learning_rate": 4.768705099001227e-05, "loss": 3.8106, "step": 5974 }, { "epoch": 1.5702039609813774, "grad_norm": 0.872168242931366, "learning_rate": 4.7669528649027515e-05, "loss": 3.8008, "step": 5976 }, { "epoch": 1.5707294643150393, "grad_norm": 0.7450631260871887, "learning_rate": 4.7652006308042756e-05, "loss": 3.7981, "step": 5978 }, { "epoch": 1.5712549676487009, "grad_norm": 0.7997294664382935, "learning_rate": 4.7634483967058e-05, "loss": 3.8628, "step": 5980 }, { "epoch": 1.5717804709823628, "grad_norm": 0.9652913212776184, "learning_rate": 4.761696162607325e-05, "loss": 3.7848, "step": 5982 }, { "epoch": 1.5723059743160246, "grad_norm": 0.744698166847229, "learning_rate": 4.7599439285088485e-05, "loss": 3.8334, "step": 5984 }, { "epoch": 1.5728314776496863, "grad_norm": 0.8468793630599976, "learning_rate": 4.758191694410373e-05, "loss": 3.7704, "step": 5986 }, { "epoch": 1.573356980983348, "grad_norm": 0.7813863158226013, "learning_rate": 4.756439460311898e-05, "loss": 3.756, "step": 5988 }, { "epoch": 1.5738824843170098, "grad_norm": 0.8227751851081848, "learning_rate": 4.754687226213422e-05, "loss": 3.7797, "step": 5990 }, { "epoch": 
1.5744079876506718, "grad_norm": 0.7578281164169312, "learning_rate": 4.752934992114947e-05, "loss": 3.8489, "step": 5992 }, { "epoch": 1.5749334909843333, "grad_norm": 0.8207914233207703, "learning_rate": 4.7511827580164716e-05, "loss": 3.7362, "step": 5994 }, { "epoch": 1.5754589943179953, "grad_norm": 0.7909433245658875, "learning_rate": 4.749430523917996e-05, "loss": 3.7942, "step": 5996 }, { "epoch": 1.5759844976516568, "grad_norm": 0.7229717373847961, "learning_rate": 4.74767828981952e-05, "loss": 3.8289, "step": 5998 }, { "epoch": 1.5765100009853188, "grad_norm": 0.788748562335968, "learning_rate": 4.7459260557210445e-05, "loss": 3.8279, "step": 6000 }, { "epoch": 1.5765100009853188, "eval_loss": 3.781409978866577, "eval_runtime": 464.5386, "eval_samples_per_second": 262.172, "eval_steps_per_second": 8.193, "step": 6000 }, { "epoch": 1.5770355043189805, "grad_norm": 0.7751213908195496, "learning_rate": 4.744173821622569e-05, "loss": 3.7718, "step": 6002 }, { "epoch": 1.5775610076526423, "grad_norm": 0.7600667476654053, "learning_rate": 4.7424215875240933e-05, "loss": 3.7494, "step": 6004 }, { "epoch": 1.578086510986304, "grad_norm": 0.7804160714149475, "learning_rate": 4.740669353425618e-05, "loss": 3.8258, "step": 6006 }, { "epoch": 1.5786120143199658, "grad_norm": 0.7669584155082703, "learning_rate": 4.738917119327143e-05, "loss": 3.7706, "step": 6008 }, { "epoch": 1.5791375176536278, "grad_norm": 0.8464110493659973, "learning_rate": 4.737164885228666e-05, "loss": 3.8494, "step": 6010 }, { "epoch": 1.5796630209872893, "grad_norm": 0.8135186433792114, "learning_rate": 4.735412651130191e-05, "loss": 3.7468, "step": 6012 }, { "epoch": 1.5801885243209512, "grad_norm": 0.8280919790267944, "learning_rate": 4.733660417031716e-05, "loss": 3.7761, "step": 6014 }, { "epoch": 1.5807140276546128, "grad_norm": 0.8261246085166931, "learning_rate": 4.73190818293324e-05, "loss": 3.8054, "step": 6016 }, { "epoch": 1.5812395309882747, "grad_norm": 0.7896277904510498, 
"learning_rate": 4.7301559488347646e-05, "loss": 3.8001, "step": 6018 }, { "epoch": 1.5817650343219365, "grad_norm": 0.8395569324493408, "learning_rate": 4.7284037147362894e-05, "loss": 3.8047, "step": 6020 }, { "epoch": 1.5822905376555982, "grad_norm": 0.7667027115821838, "learning_rate": 4.7266514806378134e-05, "loss": 3.791, "step": 6022 }, { "epoch": 1.58281604098926, "grad_norm": 0.8295708298683167, "learning_rate": 4.7248992465393375e-05, "loss": 3.7998, "step": 6024 }, { "epoch": 1.5833415443229217, "grad_norm": 0.8699784874916077, "learning_rate": 4.723147012440862e-05, "loss": 3.7823, "step": 6026 }, { "epoch": 1.5838670476565837, "grad_norm": 0.7432299256324768, "learning_rate": 4.721394778342387e-05, "loss": 3.8634, "step": 6028 }, { "epoch": 1.5843925509902452, "grad_norm": 0.8317083120346069, "learning_rate": 4.719642544243911e-05, "loss": 3.8031, "step": 6030 }, { "epoch": 1.5849180543239072, "grad_norm": 0.8250396847724915, "learning_rate": 4.717890310145436e-05, "loss": 3.8189, "step": 6032 }, { "epoch": 1.5854435576575687, "grad_norm": 0.8903437256813049, "learning_rate": 4.7161380760469606e-05, "loss": 3.7894, "step": 6034 }, { "epoch": 1.5859690609912307, "grad_norm": 0.8001671433448792, "learning_rate": 4.714385841948484e-05, "loss": 3.7871, "step": 6036 }, { "epoch": 1.5864945643248924, "grad_norm": 0.7661740779876709, "learning_rate": 4.712633607850009e-05, "loss": 3.8466, "step": 6038 }, { "epoch": 1.5870200676585542, "grad_norm": 0.8338112831115723, "learning_rate": 4.7108813737515335e-05, "loss": 3.8014, "step": 6040 }, { "epoch": 1.587545570992216, "grad_norm": 0.7802762389183044, "learning_rate": 4.7091291396530576e-05, "loss": 3.7747, "step": 6042 }, { "epoch": 1.5880710743258777, "grad_norm": 0.7693320512771606, "learning_rate": 4.7073769055545824e-05, "loss": 3.7712, "step": 6044 }, { "epoch": 1.5885965776595397, "grad_norm": 0.8809871673583984, "learning_rate": 4.705624671456107e-05, "loss": 3.8025, "step": 6046 }, { "epoch": 
1.5891220809932012, "grad_norm": 0.8652351498603821, "learning_rate": 4.703872437357631e-05, "loss": 3.7735, "step": 6048 }, { "epoch": 1.5896475843268632, "grad_norm": 0.8676387071609497, "learning_rate": 4.702120203259155e-05, "loss": 3.8143, "step": 6050 }, { "epoch": 1.590173087660525, "grad_norm": 0.8622456192970276, "learning_rate": 4.70036796916068e-05, "loss": 3.7661, "step": 6052 }, { "epoch": 1.5906985909941866, "grad_norm": 0.8983504772186279, "learning_rate": 4.698615735062205e-05, "loss": 3.8034, "step": 6054 }, { "epoch": 1.5912240943278484, "grad_norm": 0.859561026096344, "learning_rate": 4.696863500963729e-05, "loss": 3.8245, "step": 6056 }, { "epoch": 1.5917495976615101, "grad_norm": 0.8469770550727844, "learning_rate": 4.6951112668652536e-05, "loss": 3.755, "step": 6058 }, { "epoch": 1.5922751009951719, "grad_norm": 0.7490195035934448, "learning_rate": 4.6933590327667784e-05, "loss": 3.8128, "step": 6060 }, { "epoch": 1.5928006043288336, "grad_norm": 0.8199207186698914, "learning_rate": 4.691606798668302e-05, "loss": 3.7599, "step": 6062 }, { "epoch": 1.5933261076624956, "grad_norm": 0.7421783208847046, "learning_rate": 4.6898545645698265e-05, "loss": 3.7657, "step": 6064 }, { "epoch": 1.5938516109961571, "grad_norm": 0.9042335748672485, "learning_rate": 4.688102330471351e-05, "loss": 3.7893, "step": 6066 }, { "epoch": 1.594377114329819, "grad_norm": 0.7951528429985046, "learning_rate": 4.6863500963728754e-05, "loss": 3.7869, "step": 6068 }, { "epoch": 1.5949026176634808, "grad_norm": 0.9142905473709106, "learning_rate": 4.6845978622744e-05, "loss": 3.8014, "step": 6070 }, { "epoch": 1.5954281209971426, "grad_norm": 0.8294028639793396, "learning_rate": 4.682845628175925e-05, "loss": 3.8154, "step": 6072 }, { "epoch": 1.5959536243308043, "grad_norm": 0.807110607624054, "learning_rate": 4.681093394077449e-05, "loss": 3.8261, "step": 6074 }, { "epoch": 1.596479127664466, "grad_norm": 0.8487638831138611, "learning_rate": 4.679341159978973e-05, "loss": 
3.8069, "step": 6076 }, { "epoch": 1.597004630998128, "grad_norm": 0.8180763721466064, "learning_rate": 4.677588925880498e-05, "loss": 3.7558, "step": 6078 }, { "epoch": 1.5975301343317896, "grad_norm": 0.7851226925849915, "learning_rate": 4.6758366917820226e-05, "loss": 3.7957, "step": 6080 }, { "epoch": 1.5980556376654516, "grad_norm": 0.8417772054672241, "learning_rate": 4.6740844576835466e-05, "loss": 3.8408, "step": 6082 }, { "epoch": 1.598581140999113, "grad_norm": 0.74514240026474, "learning_rate": 4.6723322235850714e-05, "loss": 3.7259, "step": 6084 }, { "epoch": 1.599106644332775, "grad_norm": 0.7422042489051819, "learning_rate": 4.6705799894865955e-05, "loss": 3.788, "step": 6086 }, { "epoch": 1.5996321476664368, "grad_norm": 0.9796526432037354, "learning_rate": 4.66882775538812e-05, "loss": 3.7722, "step": 6088 }, { "epoch": 1.6001576510000985, "grad_norm": 0.759756863117218, "learning_rate": 4.667075521289644e-05, "loss": 3.7667, "step": 6090 }, { "epoch": 1.6006831543337603, "grad_norm": 0.8089520335197449, "learning_rate": 4.665323287191169e-05, "loss": 3.7872, "step": 6092 }, { "epoch": 1.601208657667422, "grad_norm": 0.86346435546875, "learning_rate": 4.663571053092694e-05, "loss": 3.7662, "step": 6094 }, { "epoch": 1.601734161001084, "grad_norm": 0.7257112264633179, "learning_rate": 4.661818818994218e-05, "loss": 3.8591, "step": 6096 }, { "epoch": 1.6022596643347455, "grad_norm": 0.9100852012634277, "learning_rate": 4.6600665848957427e-05, "loss": 3.8123, "step": 6098 }, { "epoch": 1.6027851676684075, "grad_norm": 0.6923310160636902, "learning_rate": 4.658314350797267e-05, "loss": 3.8138, "step": 6100 }, { "epoch": 1.603310671002069, "grad_norm": 0.7713332176208496, "learning_rate": 4.656562116698791e-05, "loss": 3.7767, "step": 6102 }, { "epoch": 1.603836174335731, "grad_norm": 0.7710595726966858, "learning_rate": 4.6548098826003156e-05, "loss": 3.7531, "step": 6104 }, { "epoch": 1.6043616776693927, "grad_norm": 0.8642397522926331, 
"learning_rate": 4.65305764850184e-05, "loss": 3.7848, "step": 6106 }, { "epoch": 1.6048871810030545, "grad_norm": 0.8171465396881104, "learning_rate": 4.6513054144033644e-05, "loss": 3.8244, "step": 6108 }, { "epoch": 1.6054126843367162, "grad_norm": 0.7192990779876709, "learning_rate": 4.649553180304889e-05, "loss": 3.7904, "step": 6110 }, { "epoch": 1.605938187670378, "grad_norm": 0.7560930252075195, "learning_rate": 4.647800946206413e-05, "loss": 3.8203, "step": 6112 }, { "epoch": 1.60646369100404, "grad_norm": 0.8945010900497437, "learning_rate": 4.646048712107938e-05, "loss": 3.795, "step": 6114 }, { "epoch": 1.6069891943377015, "grad_norm": 0.8219373226165771, "learning_rate": 4.644296478009462e-05, "loss": 3.8385, "step": 6116 }, { "epoch": 1.6075146976713635, "grad_norm": 0.830161988735199, "learning_rate": 4.642544243910987e-05, "loss": 3.8081, "step": 6118 }, { "epoch": 1.608040201005025, "grad_norm": 0.7727769613265991, "learning_rate": 4.6407920098125116e-05, "loss": 3.7946, "step": 6120 }, { "epoch": 1.608565704338687, "grad_norm": 0.8283730149269104, "learning_rate": 4.639039775714036e-05, "loss": 3.7603, "step": 6122 }, { "epoch": 1.6090912076723487, "grad_norm": 0.7790741920471191, "learning_rate": 4.6372875416155604e-05, "loss": 3.8775, "step": 6124 }, { "epoch": 1.6096167110060104, "grad_norm": 0.8905379772186279, "learning_rate": 4.6355353075170845e-05, "loss": 3.8072, "step": 6126 }, { "epoch": 1.6101422143396722, "grad_norm": 0.7609342932701111, "learning_rate": 4.6337830734186086e-05, "loss": 3.7846, "step": 6128 }, { "epoch": 1.610667717673334, "grad_norm": 0.8833467960357666, "learning_rate": 4.6320308393201333e-05, "loss": 3.8212, "step": 6130 }, { "epoch": 1.611193221006996, "grad_norm": 0.8252526521682739, "learning_rate": 4.630278605221658e-05, "loss": 3.8119, "step": 6132 }, { "epoch": 1.6117187243406574, "grad_norm": 0.8258031010627747, "learning_rate": 4.628526371123182e-05, "loss": 3.8187, "step": 6134 }, { "epoch": 
1.6122442276743194, "grad_norm": 0.9074541330337524, "learning_rate": 4.626774137024707e-05, "loss": 3.7933, "step": 6136 }, { "epoch": 1.612769731007981, "grad_norm": 0.8687145709991455, "learning_rate": 4.625021902926231e-05, "loss": 3.8301, "step": 6138 }, { "epoch": 1.613295234341643, "grad_norm": 0.8402942419052124, "learning_rate": 4.623269668827756e-05, "loss": 3.7798, "step": 6140 }, { "epoch": 1.6138207376753047, "grad_norm": 0.7498704195022583, "learning_rate": 4.62151743472928e-05, "loss": 3.793, "step": 6142 }, { "epoch": 1.6143462410089664, "grad_norm": 0.9345159530639648, "learning_rate": 4.6197652006308046e-05, "loss": 3.7789, "step": 6144 }, { "epoch": 1.6148717443426281, "grad_norm": 0.7398616671562195, "learning_rate": 4.6180129665323294e-05, "loss": 3.7846, "step": 6146 }, { "epoch": 1.61539724767629, "grad_norm": 1.0125060081481934, "learning_rate": 4.6162607324338534e-05, "loss": 3.8323, "step": 6148 }, { "epoch": 1.6159227510099519, "grad_norm": 0.9239723682403564, "learning_rate": 4.6145084983353775e-05, "loss": 3.774, "step": 6150 }, { "epoch": 1.6164482543436134, "grad_norm": 0.7836284637451172, "learning_rate": 4.612756264236902e-05, "loss": 3.7921, "step": 6152 }, { "epoch": 1.6169737576772754, "grad_norm": 0.7916969656944275, "learning_rate": 4.6110040301384263e-05, "loss": 3.8057, "step": 6154 }, { "epoch": 1.6174992610109369, "grad_norm": 0.7857925295829773, "learning_rate": 4.609251796039951e-05, "loss": 3.7601, "step": 6156 }, { "epoch": 1.6180247643445989, "grad_norm": 0.8935374021530151, "learning_rate": 4.607499561941476e-05, "loss": 3.7498, "step": 6158 }, { "epoch": 1.6185502676782606, "grad_norm": 0.7509841918945312, "learning_rate": 4.605747327843e-05, "loss": 3.735, "step": 6160 }, { "epoch": 1.6190757710119223, "grad_norm": 0.7284303307533264, "learning_rate": 4.603995093744525e-05, "loss": 3.7786, "step": 6162 }, { "epoch": 1.619601274345584, "grad_norm": 0.86061692237854, "learning_rate": 4.602242859646049e-05, "loss": 
3.802, "step": 6164 }, { "epoch": 1.6201267776792458, "grad_norm": 0.7793019413948059, "learning_rate": 4.6004906255475735e-05, "loss": 3.83, "step": 6166 }, { "epoch": 1.6206522810129078, "grad_norm": 0.794456422328949, "learning_rate": 4.5987383914490976e-05, "loss": 3.7871, "step": 6168 }, { "epoch": 1.6211777843465693, "grad_norm": 0.7947176694869995, "learning_rate": 4.5969861573506224e-05, "loss": 3.7997, "step": 6170 }, { "epoch": 1.6217032876802313, "grad_norm": 0.7425632476806641, "learning_rate": 4.595233923252147e-05, "loss": 3.8377, "step": 6172 }, { "epoch": 1.6222287910138928, "grad_norm": 0.8934128284454346, "learning_rate": 4.593481689153671e-05, "loss": 3.7542, "step": 6174 }, { "epoch": 1.6227542943475548, "grad_norm": 0.8235152363777161, "learning_rate": 4.591729455055195e-05, "loss": 3.738, "step": 6176 }, { "epoch": 1.6232797976812166, "grad_norm": 0.947952926158905, "learning_rate": 4.58997722095672e-05, "loss": 3.8586, "step": 6178 }, { "epoch": 1.6238053010148783, "grad_norm": 0.8540204167366028, "learning_rate": 4.588224986858244e-05, "loss": 3.7712, "step": 6180 }, { "epoch": 1.62433080434854, "grad_norm": 0.8388408422470093, "learning_rate": 4.586472752759769e-05, "loss": 3.7717, "step": 6182 }, { "epoch": 1.6248563076822018, "grad_norm": 0.8024148941040039, "learning_rate": 4.5847205186612936e-05, "loss": 3.7821, "step": 6184 }, { "epoch": 1.6253818110158638, "grad_norm": 0.7375851273536682, "learning_rate": 4.582968284562818e-05, "loss": 3.7423, "step": 6186 }, { "epoch": 1.6259073143495253, "grad_norm": 0.8270559310913086, "learning_rate": 4.581216050464342e-05, "loss": 3.8015, "step": 6188 }, { "epoch": 1.6264328176831873, "grad_norm": 0.8300509452819824, "learning_rate": 4.5794638163658665e-05, "loss": 3.7334, "step": 6190 }, { "epoch": 1.6269583210168488, "grad_norm": 0.8057745099067688, "learning_rate": 4.577711582267391e-05, "loss": 3.8273, "step": 6192 }, { "epoch": 1.6274838243505108, "grad_norm": 0.8263880014419556, 
"learning_rate": 4.5759593481689154e-05, "loss": 3.8109, "step": 6194 }, { "epoch": 1.6280093276841725, "grad_norm": 0.7634310126304626, "learning_rate": 4.57420711407044e-05, "loss": 3.7877, "step": 6196 }, { "epoch": 1.6285348310178343, "grad_norm": 0.8033174872398376, "learning_rate": 4.572454879971965e-05, "loss": 3.8215, "step": 6198 }, { "epoch": 1.629060334351496, "grad_norm": 0.7724741101264954, "learning_rate": 4.570702645873489e-05, "loss": 3.7792, "step": 6200 }, { "epoch": 1.6295858376851577, "grad_norm": 0.8119155764579773, "learning_rate": 4.568950411775013e-05, "loss": 3.7865, "step": 6202 }, { "epoch": 1.6301113410188197, "grad_norm": 0.8747258186340332, "learning_rate": 4.567198177676538e-05, "loss": 3.7907, "step": 6204 }, { "epoch": 1.6306368443524812, "grad_norm": 0.8532177805900574, "learning_rate": 4.565445943578062e-05, "loss": 3.7808, "step": 6206 }, { "epoch": 1.6311623476861432, "grad_norm": 0.7574437260627747, "learning_rate": 4.5636937094795866e-05, "loss": 3.7598, "step": 6208 }, { "epoch": 1.631687851019805, "grad_norm": 0.8304572105407715, "learning_rate": 4.5619414753811114e-05, "loss": 3.7578, "step": 6210 }, { "epoch": 1.6322133543534667, "grad_norm": 0.7524347305297852, "learning_rate": 4.5601892412826355e-05, "loss": 3.7815, "step": 6212 }, { "epoch": 1.6327388576871285, "grad_norm": 0.8259950876235962, "learning_rate": 4.5584370071841596e-05, "loss": 3.7707, "step": 6214 }, { "epoch": 1.6332643610207902, "grad_norm": 0.932498574256897, "learning_rate": 4.556684773085684e-05, "loss": 3.7519, "step": 6216 }, { "epoch": 1.633789864354452, "grad_norm": 0.775809109210968, "learning_rate": 4.554932538987209e-05, "loss": 3.7859, "step": 6218 }, { "epoch": 1.6343153676881137, "grad_norm": 0.7972005009651184, "learning_rate": 4.553180304888733e-05, "loss": 3.876, "step": 6220 }, { "epoch": 1.6348408710217757, "grad_norm": 0.8758171796798706, "learning_rate": 4.551428070790258e-05, "loss": 3.7498, "step": 6222 }, { "epoch": 
1.6353663743554372, "grad_norm": 0.8626033663749695, "learning_rate": 4.5496758366917827e-05, "loss": 3.7393, "step": 6224 }, { "epoch": 1.6358918776890992, "grad_norm": 0.8017551302909851, "learning_rate": 4.547923602593307e-05, "loss": 3.7945, "step": 6226 }, { "epoch": 1.636417381022761, "grad_norm": 0.7138306498527527, "learning_rate": 4.546171368494831e-05, "loss": 3.7874, "step": 6228 }, { "epoch": 1.6369428843564227, "grad_norm": 0.7752797603607178, "learning_rate": 4.5444191343963556e-05, "loss": 3.8124, "step": 6230 }, { "epoch": 1.6374683876900844, "grad_norm": 0.8086948394775391, "learning_rate": 4.54266690029788e-05, "loss": 3.7933, "step": 6232 }, { "epoch": 1.6379938910237462, "grad_norm": 0.7098448276519775, "learning_rate": 4.5409146661994044e-05, "loss": 3.7896, "step": 6234 }, { "epoch": 1.6385193943574081, "grad_norm": 0.7922423481941223, "learning_rate": 4.539162432100929e-05, "loss": 3.8457, "step": 6236 }, { "epoch": 1.6390448976910696, "grad_norm": 0.8686381578445435, "learning_rate": 4.537410198002454e-05, "loss": 3.8442, "step": 6238 }, { "epoch": 1.6395704010247316, "grad_norm": 0.964167594909668, "learning_rate": 4.535657963903977e-05, "loss": 3.7755, "step": 6240 }, { "epoch": 1.6400959043583931, "grad_norm": 0.8562595844268799, "learning_rate": 4.533905729805502e-05, "loss": 3.8055, "step": 6242 }, { "epoch": 1.6406214076920551, "grad_norm": 0.8212277293205261, "learning_rate": 4.532153495707027e-05, "loss": 3.7948, "step": 6244 }, { "epoch": 1.6411469110257169, "grad_norm": 0.8664219379425049, "learning_rate": 4.530401261608551e-05, "loss": 3.7702, "step": 6246 }, { "epoch": 1.6416724143593786, "grad_norm": 0.8329643607139587, "learning_rate": 4.528649027510076e-05, "loss": 3.7723, "step": 6248 }, { "epoch": 1.6421979176930404, "grad_norm": 0.8167641758918762, "learning_rate": 4.5268967934116004e-05, "loss": 3.7843, "step": 6250 }, { "epoch": 1.642723421026702, "grad_norm": 0.87490314245224, "learning_rate": 4.5251445593131245e-05, 
"loss": 3.7926, "step": 6252 }, { "epoch": 1.643248924360364, "grad_norm": 0.9539375305175781, "learning_rate": 4.5233923252146486e-05, "loss": 3.7828, "step": 6254 }, { "epoch": 1.6437744276940256, "grad_norm": 0.8068154454231262, "learning_rate": 4.521640091116173e-05, "loss": 3.7509, "step": 6256 }, { "epoch": 1.6442999310276876, "grad_norm": 0.8105972409248352, "learning_rate": 4.519887857017698e-05, "loss": 3.766, "step": 6258 }, { "epoch": 1.644825434361349, "grad_norm": 0.8490138053894043, "learning_rate": 4.518135622919222e-05, "loss": 3.7775, "step": 6260 }, { "epoch": 1.645350937695011, "grad_norm": 0.7221581339836121, "learning_rate": 4.516383388820747e-05, "loss": 3.7925, "step": 6262 }, { "epoch": 1.6458764410286728, "grad_norm": 0.8765234351158142, "learning_rate": 4.514631154722272e-05, "loss": 3.7303, "step": 6264 }, { "epoch": 1.6464019443623346, "grad_norm": 0.8632600903511047, "learning_rate": 4.512878920623795e-05, "loss": 3.8394, "step": 6266 }, { "epoch": 1.6469274476959963, "grad_norm": 0.7999917268753052, "learning_rate": 4.51112668652532e-05, "loss": 3.8005, "step": 6268 }, { "epoch": 1.647452951029658, "grad_norm": 0.8040006160736084, "learning_rate": 4.5093744524268446e-05, "loss": 3.7904, "step": 6270 }, { "epoch": 1.64797845436332, "grad_norm": 0.8725202083587646, "learning_rate": 4.507622218328369e-05, "loss": 3.8023, "step": 6272 }, { "epoch": 1.6485039576969815, "grad_norm": 0.9212425351142883, "learning_rate": 4.5058699842298934e-05, "loss": 3.8056, "step": 6274 }, { "epoch": 1.6490294610306435, "grad_norm": 0.7302550077438354, "learning_rate": 4.504117750131418e-05, "loss": 3.7289, "step": 6276 }, { "epoch": 1.649554964364305, "grad_norm": 0.8196876049041748, "learning_rate": 4.502365516032942e-05, "loss": 3.7954, "step": 6278 }, { "epoch": 1.650080467697967, "grad_norm": 0.9638647437095642, "learning_rate": 4.5006132819344663e-05, "loss": 3.7513, "step": 6280 }, { "epoch": 1.6506059710316288, "grad_norm": 0.8434025049209595, 
"learning_rate": 4.498861047835991e-05, "loss": 3.7487, "step": 6282 }, { "epoch": 1.6511314743652905, "grad_norm": 0.8132114410400391, "learning_rate": 4.497108813737516e-05, "loss": 3.8431, "step": 6284 }, { "epoch": 1.6516569776989523, "grad_norm": 0.8278979659080505, "learning_rate": 4.49535657963904e-05, "loss": 3.769, "step": 6286 }, { "epoch": 1.652182481032614, "grad_norm": 0.8986967206001282, "learning_rate": 4.493604345540565e-05, "loss": 3.8307, "step": 6288 }, { "epoch": 1.652707984366276, "grad_norm": 0.8792144656181335, "learning_rate": 4.491852111442089e-05, "loss": 3.7832, "step": 6290 }, { "epoch": 1.6532334876999375, "grad_norm": 0.8274497985839844, "learning_rate": 4.490099877343613e-05, "loss": 3.7921, "step": 6292 }, { "epoch": 1.6537589910335995, "grad_norm": 0.8616675138473511, "learning_rate": 4.4883476432451376e-05, "loss": 3.806, "step": 6294 }, { "epoch": 1.654284494367261, "grad_norm": 0.8646924495697021, "learning_rate": 4.4865954091466624e-05, "loss": 3.818, "step": 6296 }, { "epoch": 1.654809997700923, "grad_norm": 0.8729041218757629, "learning_rate": 4.4848431750481864e-05, "loss": 3.7459, "step": 6298 }, { "epoch": 1.6553355010345847, "grad_norm": 0.7661137580871582, "learning_rate": 4.483090940949711e-05, "loss": 3.7605, "step": 6300 }, { "epoch": 1.6558610043682465, "grad_norm": 0.9590991139411926, "learning_rate": 4.481338706851236e-05, "loss": 3.7759, "step": 6302 }, { "epoch": 1.6563865077019082, "grad_norm": 0.8448615670204163, "learning_rate": 4.47958647275276e-05, "loss": 3.8321, "step": 6304 }, { "epoch": 1.65691201103557, "grad_norm": 0.8737562894821167, "learning_rate": 4.477834238654284e-05, "loss": 3.7546, "step": 6306 }, { "epoch": 1.657437514369232, "grad_norm": 0.8639950752258301, "learning_rate": 4.476082004555809e-05, "loss": 3.8061, "step": 6308 }, { "epoch": 1.6579630177028934, "grad_norm": 0.8835740685462952, "learning_rate": 4.4743297704573336e-05, "loss": 3.8027, "step": 6310 }, { "epoch": 1.6584885210365554, 
"grad_norm": 0.8180535435676575, "learning_rate": 4.472577536358858e-05, "loss": 3.7959, "step": 6312 }, { "epoch": 1.659014024370217, "grad_norm": 0.8756471276283264, "learning_rate": 4.4708253022603825e-05, "loss": 3.8317, "step": 6314 }, { "epoch": 1.659539527703879, "grad_norm": 0.7994133234024048, "learning_rate": 4.4690730681619065e-05, "loss": 3.7869, "step": 6316 }, { "epoch": 1.6600650310375407, "grad_norm": 0.8069018125534058, "learning_rate": 4.4673208340634306e-05, "loss": 3.8613, "step": 6318 }, { "epoch": 1.6605905343712024, "grad_norm": 0.735649049282074, "learning_rate": 4.4655685999649554e-05, "loss": 3.7609, "step": 6320 }, { "epoch": 1.6611160377048642, "grad_norm": 0.8070588111877441, "learning_rate": 4.46381636586648e-05, "loss": 3.8282, "step": 6322 }, { "epoch": 1.661641541038526, "grad_norm": 0.8298254609107971, "learning_rate": 4.462064131768004e-05, "loss": 3.8165, "step": 6324 }, { "epoch": 1.6621670443721879, "grad_norm": 0.8140451908111572, "learning_rate": 4.460311897669529e-05, "loss": 3.7774, "step": 6326 }, { "epoch": 1.6626925477058494, "grad_norm": 0.8447144627571106, "learning_rate": 4.458559663571053e-05, "loss": 3.7556, "step": 6328 }, { "epoch": 1.6632180510395114, "grad_norm": 0.7852143049240112, "learning_rate": 4.456807429472578e-05, "loss": 3.7573, "step": 6330 }, { "epoch": 1.663743554373173, "grad_norm": 0.9404044151306152, "learning_rate": 4.455055195374102e-05, "loss": 3.7762, "step": 6332 }, { "epoch": 1.6642690577068349, "grad_norm": 0.8048365712165833, "learning_rate": 4.4533029612756266e-05, "loss": 3.8123, "step": 6334 }, { "epoch": 1.6647945610404966, "grad_norm": 0.8518590927124023, "learning_rate": 4.4515507271771514e-05, "loss": 3.7441, "step": 6336 }, { "epoch": 1.6653200643741584, "grad_norm": 0.8763555884361267, "learning_rate": 4.4497984930786755e-05, "loss": 3.7836, "step": 6338 }, { "epoch": 1.66584556770782, "grad_norm": 0.9310688972473145, "learning_rate": 4.4480462589802e-05, "loss": 3.7887, "step": 
6340 }, { "epoch": 1.6663710710414819, "grad_norm": 0.9398099184036255, "learning_rate": 4.446294024881724e-05, "loss": 3.7855, "step": 6342 }, { "epoch": 1.6668965743751438, "grad_norm": 0.7764840126037598, "learning_rate": 4.444541790783249e-05, "loss": 3.7519, "step": 6344 }, { "epoch": 1.6674220777088054, "grad_norm": 0.7139030694961548, "learning_rate": 4.442789556684773e-05, "loss": 3.8255, "step": 6346 }, { "epoch": 1.6679475810424673, "grad_norm": 0.7756190299987793, "learning_rate": 4.441037322586298e-05, "loss": 3.724, "step": 6348 }, { "epoch": 1.6684730843761288, "grad_norm": 0.8379669189453125, "learning_rate": 4.4392850884878227e-05, "loss": 3.7906, "step": 6350 }, { "epoch": 1.6689985877097908, "grad_norm": 0.844746470451355, "learning_rate": 4.437532854389347e-05, "loss": 3.8227, "step": 6352 }, { "epoch": 1.6695240910434526, "grad_norm": 0.8059738278388977, "learning_rate": 4.435780620290871e-05, "loss": 3.8087, "step": 6354 }, { "epoch": 1.6700495943771143, "grad_norm": 0.7614632844924927, "learning_rate": 4.4340283861923956e-05, "loss": 3.7978, "step": 6356 }, { "epoch": 1.670575097710776, "grad_norm": 0.7964781522750854, "learning_rate": 4.4322761520939196e-05, "loss": 3.7782, "step": 6358 }, { "epoch": 1.6711006010444378, "grad_norm": 0.8545102477073669, "learning_rate": 4.4305239179954444e-05, "loss": 3.7932, "step": 6360 }, { "epoch": 1.6716261043780998, "grad_norm": 0.8616874814033508, "learning_rate": 4.428771683896969e-05, "loss": 3.8053, "step": 6362 }, { "epoch": 1.6721516077117613, "grad_norm": 0.8300471901893616, "learning_rate": 4.427019449798493e-05, "loss": 3.7555, "step": 6364 }, { "epoch": 1.6726771110454233, "grad_norm": 0.8139323592185974, "learning_rate": 4.425267215700018e-05, "loss": 3.754, "step": 6366 }, { "epoch": 1.673202614379085, "grad_norm": 0.8437221050262451, "learning_rate": 4.423514981601542e-05, "loss": 3.7731, "step": 6368 }, { "epoch": 1.6737281177127468, "grad_norm": 0.7688350081443787, "learning_rate": 
4.421762747503067e-05, "loss": 3.7351, "step": 6370 }, { "epoch": 1.6742536210464085, "grad_norm": 0.859720766544342, "learning_rate": 4.420010513404591e-05, "loss": 3.8007, "step": 6372 }, { "epoch": 1.6747791243800703, "grad_norm": 0.8670925498008728, "learning_rate": 4.418258279306116e-05, "loss": 3.7801, "step": 6374 }, { "epoch": 1.675304627713732, "grad_norm": 0.8284136652946472, "learning_rate": 4.4165060452076404e-05, "loss": 3.7605, "step": 6376 }, { "epoch": 1.6758301310473938, "grad_norm": 0.7939130067825317, "learning_rate": 4.4147538111091645e-05, "loss": 3.7777, "step": 6378 }, { "epoch": 1.6763556343810557, "grad_norm": 0.7672622203826904, "learning_rate": 4.4130015770106886e-05, "loss": 3.7789, "step": 6380 }, { "epoch": 1.6768811377147173, "grad_norm": 0.7524271607398987, "learning_rate": 4.411249342912213e-05, "loss": 3.8272, "step": 6382 }, { "epoch": 1.6774066410483792, "grad_norm": 0.7848139405250549, "learning_rate": 4.4094971088137374e-05, "loss": 3.8113, "step": 6384 }, { "epoch": 1.677932144382041, "grad_norm": 0.7559573650360107, "learning_rate": 4.407744874715262e-05, "loss": 3.7991, "step": 6386 }, { "epoch": 1.6784576477157027, "grad_norm": 0.7925426959991455, "learning_rate": 4.405992640616787e-05, "loss": 3.7885, "step": 6388 }, { "epoch": 1.6789831510493645, "grad_norm": 0.74395751953125, "learning_rate": 4.404240406518311e-05, "loss": 3.8356, "step": 6390 }, { "epoch": 1.6795086543830262, "grad_norm": 0.7897252440452576, "learning_rate": 4.402488172419835e-05, "loss": 3.8164, "step": 6392 }, { "epoch": 1.6800341577166882, "grad_norm": 0.7318233251571655, "learning_rate": 4.40073593832136e-05, "loss": 3.8072, "step": 6394 }, { "epoch": 1.6805596610503497, "grad_norm": 0.8634235858917236, "learning_rate": 4.3989837042228846e-05, "loss": 3.8408, "step": 6396 }, { "epoch": 1.6810851643840117, "grad_norm": 0.79674232006073, "learning_rate": 4.397231470124409e-05, "loss": 3.76, "step": 6398 }, { "epoch": 1.6816106677176732, "grad_norm": 
0.8952026963233948, "learning_rate": 4.3954792360259334e-05, "loss": 3.8233, "step": 6400 }, { "epoch": 1.6816106677176732, "eval_loss": 3.772761821746826, "eval_runtime": 464.6531, "eval_samples_per_second": 262.107, "eval_steps_per_second": 8.191, "step": 6400 }, { "epoch": 1.6821361710513352, "grad_norm": 0.8046278357505798, "learning_rate": 4.393727001927458e-05, "loss": 3.7445, "step": 6402 }, { "epoch": 1.682661674384997, "grad_norm": 0.7326026558876038, "learning_rate": 4.391974767828982e-05, "loss": 3.8118, "step": 6404 }, { "epoch": 1.6831871777186587, "grad_norm": 0.7497954368591309, "learning_rate": 4.3902225337305063e-05, "loss": 3.7979, "step": 6406 }, { "epoch": 1.6837126810523204, "grad_norm": 0.8473307490348816, "learning_rate": 4.388470299632031e-05, "loss": 3.7913, "step": 6408 }, { "epoch": 1.6842381843859822, "grad_norm": 0.7269306778907776, "learning_rate": 4.386718065533555e-05, "loss": 3.7843, "step": 6410 }, { "epoch": 1.6847636877196441, "grad_norm": 0.7177811861038208, "learning_rate": 4.38496583143508e-05, "loss": 3.7793, "step": 6412 }, { "epoch": 1.6852891910533057, "grad_norm": 0.7353260517120361, "learning_rate": 4.383213597336605e-05, "loss": 3.812, "step": 6414 }, { "epoch": 1.6858146943869676, "grad_norm": 0.8406370282173157, "learning_rate": 4.381461363238129e-05, "loss": 3.7479, "step": 6416 }, { "epoch": 1.6863401977206292, "grad_norm": 0.7413485646247864, "learning_rate": 4.379709129139653e-05, "loss": 3.7769, "step": 6418 }, { "epoch": 1.6868657010542911, "grad_norm": 0.7455936670303345, "learning_rate": 4.3779568950411776e-05, "loss": 3.8226, "step": 6420 }, { "epoch": 1.6873912043879529, "grad_norm": 0.7663591504096985, "learning_rate": 4.3762046609427024e-05, "loss": 3.7842, "step": 6422 }, { "epoch": 1.6879167077216146, "grad_norm": 0.7867127060890198, "learning_rate": 4.3744524268442264e-05, "loss": 3.7606, "step": 6424 }, { "epoch": 1.6884422110552764, "grad_norm": 0.7951709628105164, "learning_rate": 
4.372700192745751e-05, "loss": 3.7885, "step": 6426 }, { "epoch": 1.6889677143889381, "grad_norm": 0.7497739195823669, "learning_rate": 4.370947958647276e-05, "loss": 3.7847, "step": 6428 }, { "epoch": 1.6894932177226, "grad_norm": 0.8234437108039856, "learning_rate": 4.3691957245487994e-05, "loss": 3.8055, "step": 6430 }, { "epoch": 1.6900187210562616, "grad_norm": 0.8232117295265198, "learning_rate": 4.367443490450324e-05, "loss": 3.7737, "step": 6432 }, { "epoch": 1.6905442243899236, "grad_norm": 0.8082376718521118, "learning_rate": 4.365691256351849e-05, "loss": 3.7449, "step": 6434 }, { "epoch": 1.691069727723585, "grad_norm": 0.7992094159126282, "learning_rate": 4.363939022253373e-05, "loss": 3.7672, "step": 6436 }, { "epoch": 1.691595231057247, "grad_norm": 0.6737250685691833, "learning_rate": 4.362186788154898e-05, "loss": 3.7633, "step": 6438 }, { "epoch": 1.6921207343909088, "grad_norm": 0.7734641432762146, "learning_rate": 4.3604345540564225e-05, "loss": 3.7854, "step": 6440 }, { "epoch": 1.6926462377245706, "grad_norm": 0.7552005648612976, "learning_rate": 4.3586823199579465e-05, "loss": 3.7806, "step": 6442 }, { "epoch": 1.6931717410582323, "grad_norm": 0.7278103232383728, "learning_rate": 4.3569300858594706e-05, "loss": 3.7338, "step": 6444 }, { "epoch": 1.693697244391894, "grad_norm": 0.7479393482208252, "learning_rate": 4.3551778517609954e-05, "loss": 3.8031, "step": 6446 }, { "epoch": 1.694222747725556, "grad_norm": 0.8282245993614197, "learning_rate": 4.35342561766252e-05, "loss": 3.8145, "step": 6448 }, { "epoch": 1.6947482510592176, "grad_norm": 0.7542039155960083, "learning_rate": 4.351673383564044e-05, "loss": 3.75, "step": 6450 }, { "epoch": 1.6952737543928795, "grad_norm": 0.896307110786438, "learning_rate": 4.349921149465569e-05, "loss": 3.7836, "step": 6452 }, { "epoch": 1.695799257726541, "grad_norm": 0.6820362210273743, "learning_rate": 4.348168915367094e-05, "loss": 3.7956, "step": 6454 }, { "epoch": 1.696324761060203, "grad_norm": 
0.8764208555221558, "learning_rate": 4.346416681268617e-05, "loss": 3.7557, "step": 6456 }, { "epoch": 1.6968502643938648, "grad_norm": 0.7216442823410034, "learning_rate": 4.344664447170142e-05, "loss": 3.7902, "step": 6458 }, { "epoch": 1.6973757677275265, "grad_norm": 0.7959151268005371, "learning_rate": 4.3429122130716666e-05, "loss": 3.7703, "step": 6460 }, { "epoch": 1.6979012710611883, "grad_norm": 0.7445836663246155, "learning_rate": 4.341159978973191e-05, "loss": 3.7935, "step": 6462 }, { "epoch": 1.69842677439485, "grad_norm": 0.7805032730102539, "learning_rate": 4.3394077448747155e-05, "loss": 3.7666, "step": 6464 }, { "epoch": 1.698952277728512, "grad_norm": 0.8053767681121826, "learning_rate": 4.33765551077624e-05, "loss": 3.7852, "step": 6466 }, { "epoch": 1.6994777810621735, "grad_norm": 0.9757397770881653, "learning_rate": 4.335903276677764e-05, "loss": 3.8459, "step": 6468 }, { "epoch": 1.7000032843958355, "grad_norm": 0.83754563331604, "learning_rate": 4.3341510425792884e-05, "loss": 3.7268, "step": 6470 }, { "epoch": 1.700528787729497, "grad_norm": 0.9387725591659546, "learning_rate": 4.332398808480813e-05, "loss": 3.7572, "step": 6472 }, { "epoch": 1.701054291063159, "grad_norm": 0.6741616129875183, "learning_rate": 4.330646574382338e-05, "loss": 3.7273, "step": 6474 }, { "epoch": 1.7015797943968207, "grad_norm": 0.7936663627624512, "learning_rate": 4.328894340283862e-05, "loss": 3.766, "step": 6476 }, { "epoch": 1.7021052977304825, "grad_norm": 0.7706166505813599, "learning_rate": 4.327142106185387e-05, "loss": 3.8091, "step": 6478 }, { "epoch": 1.7026308010641442, "grad_norm": 0.8617799878120422, "learning_rate": 4.3253898720869115e-05, "loss": 3.7783, "step": 6480 }, { "epoch": 1.703156304397806, "grad_norm": 0.7131545543670654, "learning_rate": 4.3236376379884356e-05, "loss": 3.7083, "step": 6482 }, { "epoch": 1.703681807731468, "grad_norm": 0.8867887854576111, "learning_rate": 4.3218854038899596e-05, "loss": 3.7802, "step": 6484 }, { 
"epoch": 1.7042073110651295, "grad_norm": 0.8407551050186157, "learning_rate": 4.3201331697914844e-05, "loss": 3.772, "step": 6486 }, { "epoch": 1.7047328143987914, "grad_norm": 0.7551054954528809, "learning_rate": 4.318380935693009e-05, "loss": 3.8291, "step": 6488 }, { "epoch": 1.705258317732453, "grad_norm": 0.7414567470550537, "learning_rate": 4.316628701594533e-05, "loss": 3.7563, "step": 6490 }, { "epoch": 1.705783821066115, "grad_norm": 0.8178098797798157, "learning_rate": 4.314876467496058e-05, "loss": 3.7387, "step": 6492 }, { "epoch": 1.7063093243997767, "grad_norm": 0.8120384812355042, "learning_rate": 4.313124233397582e-05, "loss": 3.7407, "step": 6494 }, { "epoch": 1.7068348277334384, "grad_norm": 0.7072115540504456, "learning_rate": 4.311371999299106e-05, "loss": 3.8137, "step": 6496 }, { "epoch": 1.7073603310671002, "grad_norm": 0.8112834095954895, "learning_rate": 4.309619765200631e-05, "loss": 3.8356, "step": 6498 }, { "epoch": 1.707885834400762, "grad_norm": 0.8253066539764404, "learning_rate": 4.307867531102156e-05, "loss": 3.8297, "step": 6500 }, { "epoch": 1.7084113377344239, "grad_norm": 0.6890792846679688, "learning_rate": 4.30611529700368e-05, "loss": 3.7851, "step": 6502 }, { "epoch": 1.7089368410680854, "grad_norm": 0.7514976859092712, "learning_rate": 4.3043630629052045e-05, "loss": 3.7627, "step": 6504 }, { "epoch": 1.7094623444017474, "grad_norm": 0.6856784224510193, "learning_rate": 4.302610828806729e-05, "loss": 3.7687, "step": 6506 }, { "epoch": 1.709987847735409, "grad_norm": 0.9328150749206543, "learning_rate": 4.300858594708253e-05, "loss": 3.7783, "step": 6508 }, { "epoch": 1.7105133510690709, "grad_norm": 0.7332636117935181, "learning_rate": 4.2991063606097774e-05, "loss": 3.7396, "step": 6510 }, { "epoch": 1.7110388544027326, "grad_norm": 0.6774783134460449, "learning_rate": 4.297354126511302e-05, "loss": 3.8249, "step": 6512 }, { "epoch": 1.7115643577363944, "grad_norm": 0.8204286694526672, "learning_rate": 
4.295601892412827e-05, "loss": 3.7452, "step": 6514 }, { "epoch": 1.7120898610700561, "grad_norm": 0.7154591679573059, "learning_rate": 4.293849658314351e-05, "loss": 3.7888, "step": 6516 }, { "epoch": 1.7126153644037179, "grad_norm": 0.8504859805107117, "learning_rate": 4.292097424215876e-05, "loss": 3.7344, "step": 6518 }, { "epoch": 1.7131408677373798, "grad_norm": 0.7546803951263428, "learning_rate": 4.2903451901174e-05, "loss": 3.8104, "step": 6520 }, { "epoch": 1.7136663710710414, "grad_norm": 0.8606143593788147, "learning_rate": 4.288592956018924e-05, "loss": 3.8241, "step": 6522 }, { "epoch": 1.7141918744047033, "grad_norm": 0.8826828002929688, "learning_rate": 4.286840721920449e-05, "loss": 3.7628, "step": 6524 }, { "epoch": 1.714717377738365, "grad_norm": 0.7505471110343933, "learning_rate": 4.2850884878219734e-05, "loss": 3.7443, "step": 6526 }, { "epoch": 1.7152428810720268, "grad_norm": 0.7197864651679993, "learning_rate": 4.2833362537234975e-05, "loss": 3.7684, "step": 6528 }, { "epoch": 1.7157683844056886, "grad_norm": 0.818454921245575, "learning_rate": 4.281584019625022e-05, "loss": 3.8003, "step": 6530 }, { "epoch": 1.7162938877393503, "grad_norm": 0.8245639801025391, "learning_rate": 4.2798317855265463e-05, "loss": 3.7711, "step": 6532 }, { "epoch": 1.716819391073012, "grad_norm": 0.7353935241699219, "learning_rate": 4.278079551428071e-05, "loss": 3.771, "step": 6534 }, { "epoch": 1.7173448944066738, "grad_norm": 0.7047330737113953, "learning_rate": 4.276327317329595e-05, "loss": 3.7339, "step": 6536 }, { "epoch": 1.7178703977403358, "grad_norm": 0.7107115387916565, "learning_rate": 4.27457508323112e-05, "loss": 3.7712, "step": 6538 }, { "epoch": 1.7183959010739973, "grad_norm": 1.0404963493347168, "learning_rate": 4.272822849132645e-05, "loss": 3.7896, "step": 6540 }, { "epoch": 1.7189214044076593, "grad_norm": 0.8043042421340942, "learning_rate": 4.271070615034169e-05, "loss": 3.798, "step": 6542 }, { "epoch": 1.719446907741321, "grad_norm": 
0.7385002374649048, "learning_rate": 4.2693183809356935e-05, "loss": 3.7546, "step": 6544 }, { "epoch": 1.7199724110749828, "grad_norm": 0.7554582953453064, "learning_rate": 4.2675661468372176e-05, "loss": 3.7938, "step": 6546 }, { "epoch": 1.7204979144086445, "grad_norm": 0.7309147715568542, "learning_rate": 4.265813912738742e-05, "loss": 3.7362, "step": 6548 }, { "epoch": 1.7210234177423063, "grad_norm": 0.7382279634475708, "learning_rate": 4.2640616786402664e-05, "loss": 3.8186, "step": 6550 }, { "epoch": 1.7215489210759682, "grad_norm": 0.7357345223426819, "learning_rate": 4.262309444541791e-05, "loss": 3.7979, "step": 6552 }, { "epoch": 1.7220744244096298, "grad_norm": 0.7502026557922363, "learning_rate": 4.260557210443315e-05, "loss": 3.788, "step": 6554 }, { "epoch": 1.7225999277432917, "grad_norm": 0.6964012980461121, "learning_rate": 4.25880497634484e-05, "loss": 3.8012, "step": 6556 }, { "epoch": 1.7231254310769533, "grad_norm": 0.7638494372367859, "learning_rate": 4.257052742246364e-05, "loss": 3.7926, "step": 6558 }, { "epoch": 1.7236509344106152, "grad_norm": 0.7803455591201782, "learning_rate": 4.255300508147889e-05, "loss": 3.7587, "step": 6560 }, { "epoch": 1.724176437744277, "grad_norm": 0.8090100288391113, "learning_rate": 4.253548274049413e-05, "loss": 3.7257, "step": 6562 }, { "epoch": 1.7247019410779387, "grad_norm": 0.800485372543335, "learning_rate": 4.251796039950938e-05, "loss": 3.7842, "step": 6564 }, { "epoch": 1.7252274444116005, "grad_norm": 0.823236882686615, "learning_rate": 4.2500438058524625e-05, "loss": 3.7832, "step": 6566 }, { "epoch": 1.7257529477452622, "grad_norm": 0.8292329907417297, "learning_rate": 4.2482915717539865e-05, "loss": 3.7329, "step": 6568 }, { "epoch": 1.7262784510789242, "grad_norm": 0.7737250328063965, "learning_rate": 4.2465393376555106e-05, "loss": 3.7642, "step": 6570 }, { "epoch": 1.7268039544125857, "grad_norm": 0.759537935256958, "learning_rate": 4.2447871035570354e-05, "loss": 3.7608, "step": 6572 }, { 
"epoch": 1.7273294577462477, "grad_norm": 0.8189439177513123, "learning_rate": 4.2430348694585594e-05, "loss": 3.8286, "step": 6574 }, { "epoch": 1.7278549610799092, "grad_norm": 0.7559624314308167, "learning_rate": 4.241282635360084e-05, "loss": 3.7836, "step": 6576 }, { "epoch": 1.7283804644135712, "grad_norm": 0.8311588764190674, "learning_rate": 4.239530401261609e-05, "loss": 3.7589, "step": 6578 }, { "epoch": 1.728905967747233, "grad_norm": 0.7955225110054016, "learning_rate": 4.237778167163133e-05, "loss": 3.7344, "step": 6580 }, { "epoch": 1.7294314710808947, "grad_norm": 0.7628048062324524, "learning_rate": 4.236025933064658e-05, "loss": 3.7378, "step": 6582 }, { "epoch": 1.7299569744145564, "grad_norm": 0.7677053809165955, "learning_rate": 4.234273698966182e-05, "loss": 3.7278, "step": 6584 }, { "epoch": 1.7304824777482182, "grad_norm": 0.8405075669288635, "learning_rate": 4.2325214648677066e-05, "loss": 3.7083, "step": 6586 }, { "epoch": 1.7310079810818801, "grad_norm": 0.7650002837181091, "learning_rate": 4.230769230769231e-05, "loss": 3.7826, "step": 6588 }, { "epoch": 1.7315334844155417, "grad_norm": 0.728012204170227, "learning_rate": 4.2290169966707555e-05, "loss": 3.792, "step": 6590 }, { "epoch": 1.7320589877492036, "grad_norm": 0.7633234262466431, "learning_rate": 4.22726476257228e-05, "loss": 3.7926, "step": 6592 }, { "epoch": 1.7325844910828652, "grad_norm": 0.8281554579734802, "learning_rate": 4.225512528473804e-05, "loss": 3.8079, "step": 6594 }, { "epoch": 1.7331099944165271, "grad_norm": 0.7774648666381836, "learning_rate": 4.2237602943753284e-05, "loss": 3.7721, "step": 6596 }, { "epoch": 1.7336354977501889, "grad_norm": 0.8404257893562317, "learning_rate": 4.222008060276853e-05, "loss": 3.7717, "step": 6598 }, { "epoch": 1.7341610010838506, "grad_norm": 0.83598393201828, "learning_rate": 4.220255826178378e-05, "loss": 3.7898, "step": 6600 }, { "epoch": 1.7346865044175124, "grad_norm": 0.8040544986724854, "learning_rate": 
4.218503592079902e-05, "loss": 3.7891, "step": 6602 }, { "epoch": 1.7352120077511741, "grad_norm": 0.9085627794265747, "learning_rate": 4.216751357981427e-05, "loss": 3.795, "step": 6604 }, { "epoch": 1.735737511084836, "grad_norm": 1.0422183275222778, "learning_rate": 4.2149991238829515e-05, "loss": 3.7772, "step": 6606 }, { "epoch": 1.7362630144184976, "grad_norm": 0.8696152567863464, "learning_rate": 4.2132468897844756e-05, "loss": 3.7953, "step": 6608 }, { "epoch": 1.7367885177521596, "grad_norm": 0.8573576211929321, "learning_rate": 4.2114946556859996e-05, "loss": 3.7552, "step": 6610 }, { "epoch": 1.7373140210858211, "grad_norm": 0.7740167379379272, "learning_rate": 4.2097424215875244e-05, "loss": 3.7733, "step": 6612 }, { "epoch": 1.737839524419483, "grad_norm": 0.7338257431983948, "learning_rate": 4.2079901874890485e-05, "loss": 3.7941, "step": 6614 }, { "epoch": 1.7383650277531448, "grad_norm": 0.7642579078674316, "learning_rate": 4.206237953390573e-05, "loss": 3.786, "step": 6616 }, { "epoch": 1.7388905310868066, "grad_norm": 0.7649243474006653, "learning_rate": 4.204485719292098e-05, "loss": 3.7733, "step": 6618 }, { "epoch": 1.7394160344204683, "grad_norm": 0.7070594429969788, "learning_rate": 4.202733485193622e-05, "loss": 3.7668, "step": 6620 }, { "epoch": 1.73994153775413, "grad_norm": 0.7779952883720398, "learning_rate": 4.200981251095146e-05, "loss": 3.7742, "step": 6622 }, { "epoch": 1.740467041087792, "grad_norm": 0.812491774559021, "learning_rate": 4.199229016996671e-05, "loss": 3.8504, "step": 6624 }, { "epoch": 1.7409925444214536, "grad_norm": 0.7944936752319336, "learning_rate": 4.1974767828981957e-05, "loss": 3.7755, "step": 6626 }, { "epoch": 1.7415180477551155, "grad_norm": 0.8286548256874084, "learning_rate": 4.19572454879972e-05, "loss": 3.7681, "step": 6628 }, { "epoch": 1.742043551088777, "grad_norm": 0.8547718524932861, "learning_rate": 4.1939723147012445e-05, "loss": 3.8111, "step": 6630 }, { "epoch": 1.742569054422439, "grad_norm": 
0.8291828632354736, "learning_rate": 4.192220080602769e-05, "loss": 3.7881, "step": 6632 }, { "epoch": 1.7430945577561008, "grad_norm": 0.7945041060447693, "learning_rate": 4.1904678465042927e-05, "loss": 3.8652, "step": 6634 }, { "epoch": 1.7436200610897625, "grad_norm": 0.882715106010437, "learning_rate": 4.1887156124058174e-05, "loss": 3.7974, "step": 6636 }, { "epoch": 1.7441455644234243, "grad_norm": 0.7813427448272705, "learning_rate": 4.186963378307342e-05, "loss": 3.8036, "step": 6638 }, { "epoch": 1.744671067757086, "grad_norm": 0.7583516240119934, "learning_rate": 4.185211144208866e-05, "loss": 3.795, "step": 6640 }, { "epoch": 1.745196571090748, "grad_norm": 0.8384671807289124, "learning_rate": 4.183458910110391e-05, "loss": 3.7424, "step": 6642 }, { "epoch": 1.7457220744244095, "grad_norm": 0.7922006845474243, "learning_rate": 4.181706676011916e-05, "loss": 3.824, "step": 6644 }, { "epoch": 1.7462475777580715, "grad_norm": 0.8393129110336304, "learning_rate": 4.17995444191344e-05, "loss": 3.7494, "step": 6646 }, { "epoch": 1.746773081091733, "grad_norm": 0.7422060370445251, "learning_rate": 4.178202207814964e-05, "loss": 3.8019, "step": 6648 }, { "epoch": 1.747298584425395, "grad_norm": 0.709226667881012, "learning_rate": 4.176449973716489e-05, "loss": 3.8235, "step": 6650 }, { "epoch": 1.7478240877590567, "grad_norm": 0.6980586647987366, "learning_rate": 4.1746977396180134e-05, "loss": 3.7471, "step": 6652 }, { "epoch": 1.7483495910927185, "grad_norm": 0.7958494424819946, "learning_rate": 4.1729455055195375e-05, "loss": 3.7719, "step": 6654 }, { "epoch": 1.7488750944263802, "grad_norm": 0.790981650352478, "learning_rate": 4.171193271421062e-05, "loss": 3.7457, "step": 6656 }, { "epoch": 1.749400597760042, "grad_norm": 0.8228979706764221, "learning_rate": 4.169441037322587e-05, "loss": 3.7831, "step": 6658 }, { "epoch": 1.749926101093704, "grad_norm": 0.8200069665908813, "learning_rate": 4.1676888032241104e-05, "loss": 3.7693, "step": 6660 }, { "epoch": 
1.7504516044273655, "grad_norm": 0.9066669940948486, "learning_rate": 4.165936569125635e-05, "loss": 3.813, "step": 6662 }, { "epoch": 1.7509771077610274, "grad_norm": 1.030377745628357, "learning_rate": 4.16418433502716e-05, "loss": 3.7415, "step": 6664 }, { "epoch": 1.751502611094689, "grad_norm": 0.9227124452590942, "learning_rate": 4.162432100928684e-05, "loss": 3.7638, "step": 6666 }, { "epoch": 1.752028114428351, "grad_norm": 0.7931029200553894, "learning_rate": 4.160679866830209e-05, "loss": 3.7514, "step": 6668 }, { "epoch": 1.7525536177620127, "grad_norm": 0.9260777235031128, "learning_rate": 4.1589276327317335e-05, "loss": 3.7539, "step": 6670 }, { "epoch": 1.7530791210956744, "grad_norm": 0.7904630303382874, "learning_rate": 4.1571753986332576e-05, "loss": 3.7331, "step": 6672 }, { "epoch": 1.7536046244293362, "grad_norm": 0.8276814222335815, "learning_rate": 4.155423164534782e-05, "loss": 3.7567, "step": 6674 }, { "epoch": 1.754130127762998, "grad_norm": 0.859998881816864, "learning_rate": 4.1536709304363064e-05, "loss": 3.7406, "step": 6676 }, { "epoch": 1.75465563109666, "grad_norm": 0.8199462890625, "learning_rate": 4.151918696337831e-05, "loss": 3.7478, "step": 6678 }, { "epoch": 1.7551811344303214, "grad_norm": 0.8189437985420227, "learning_rate": 4.150166462239355e-05, "loss": 3.8283, "step": 6680 }, { "epoch": 1.7557066377639834, "grad_norm": 0.8108528852462769, "learning_rate": 4.14841422814088e-05, "loss": 3.8091, "step": 6682 }, { "epoch": 1.7562321410976451, "grad_norm": 0.7877610325813293, "learning_rate": 4.146661994042405e-05, "loss": 3.7996, "step": 6684 }, { "epoch": 1.7567576444313069, "grad_norm": 0.6935058832168579, "learning_rate": 4.144909759943928e-05, "loss": 3.7615, "step": 6686 }, { "epoch": 1.7572831477649686, "grad_norm": 0.883411705493927, "learning_rate": 4.143157525845453e-05, "loss": 3.8285, "step": 6688 }, { "epoch": 1.7578086510986304, "grad_norm": 0.8788272738456726, "learning_rate": 4.141405291746978e-05, "loss": 
3.7918, "step": 6690 }, { "epoch": 1.7583341544322921, "grad_norm": 0.7628341913223267, "learning_rate": 4.139653057648502e-05, "loss": 3.7442, "step": 6692 }, { "epoch": 1.7588596577659539, "grad_norm": 0.814853847026825, "learning_rate": 4.1379008235500265e-05, "loss": 3.7869, "step": 6694 }, { "epoch": 1.7593851610996158, "grad_norm": 0.8793486952781677, "learning_rate": 4.136148589451551e-05, "loss": 3.7731, "step": 6696 }, { "epoch": 1.7599106644332774, "grad_norm": 0.7328564524650574, "learning_rate": 4.1343963553530754e-05, "loss": 3.7545, "step": 6698 }, { "epoch": 1.7604361677669393, "grad_norm": 0.7564105987548828, "learning_rate": 4.1326441212545994e-05, "loss": 3.7672, "step": 6700 }, { "epoch": 1.760961671100601, "grad_norm": 0.845607578754425, "learning_rate": 4.130891887156124e-05, "loss": 3.8456, "step": 6702 }, { "epoch": 1.7614871744342628, "grad_norm": 0.7705516219139099, "learning_rate": 4.129139653057649e-05, "loss": 3.7649, "step": 6704 }, { "epoch": 1.7620126777679246, "grad_norm": 0.8907022476196289, "learning_rate": 4.127387418959173e-05, "loss": 3.8194, "step": 6706 }, { "epoch": 1.7625381811015863, "grad_norm": 0.8002586960792542, "learning_rate": 4.125635184860698e-05, "loss": 3.7977, "step": 6708 }, { "epoch": 1.7630636844352483, "grad_norm": 0.7800107598304749, "learning_rate": 4.123882950762222e-05, "loss": 3.7711, "step": 6710 }, { "epoch": 1.7635891877689098, "grad_norm": 0.8601124882698059, "learning_rate": 4.122130716663746e-05, "loss": 3.7913, "step": 6712 }, { "epoch": 1.7641146911025718, "grad_norm": 0.8352698683738708, "learning_rate": 4.120378482565271e-05, "loss": 3.7148, "step": 6714 }, { "epoch": 1.7646401944362333, "grad_norm": 0.7829784154891968, "learning_rate": 4.1186262484667955e-05, "loss": 3.8214, "step": 6716 }, { "epoch": 1.7651656977698953, "grad_norm": 0.8296517133712769, "learning_rate": 4.1168740143683195e-05, "loss": 3.7722, "step": 6718 }, { "epoch": 1.765691201103557, "grad_norm": 0.7930396199226379, 
"learning_rate": 4.115121780269844e-05, "loss": 3.7895, "step": 6720 }, { "epoch": 1.7662167044372188, "grad_norm": 0.7626187205314636, "learning_rate": 4.113369546171369e-05, "loss": 3.7647, "step": 6722 }, { "epoch": 1.7667422077708805, "grad_norm": 0.9078258872032166, "learning_rate": 4.111617312072893e-05, "loss": 3.7614, "step": 6724 }, { "epoch": 1.7672677111045423, "grad_norm": 0.7944594621658325, "learning_rate": 4.109865077974417e-05, "loss": 3.833, "step": 6726 }, { "epoch": 1.7677932144382043, "grad_norm": 0.7695104479789734, "learning_rate": 4.108112843875942e-05, "loss": 3.7807, "step": 6728 }, { "epoch": 1.7683187177718658, "grad_norm": 0.8172870874404907, "learning_rate": 4.106360609777467e-05, "loss": 3.781, "step": 6730 }, { "epoch": 1.7688442211055277, "grad_norm": 0.7900203466415405, "learning_rate": 4.104608375678991e-05, "loss": 3.7441, "step": 6732 }, { "epoch": 1.7693697244391893, "grad_norm": 1.0718574523925781, "learning_rate": 4.1028561415805156e-05, "loss": 3.7541, "step": 6734 }, { "epoch": 1.7698952277728512, "grad_norm": 0.9704225659370422, "learning_rate": 4.1011039074820396e-05, "loss": 3.8248, "step": 6736 }, { "epoch": 1.770420731106513, "grad_norm": 0.8183091282844543, "learning_rate": 4.0993516733835644e-05, "loss": 3.7961, "step": 6738 }, { "epoch": 1.7709462344401747, "grad_norm": 0.8536092042922974, "learning_rate": 4.0975994392850885e-05, "loss": 3.831, "step": 6740 }, { "epoch": 1.7714717377738365, "grad_norm": 0.79820317029953, "learning_rate": 4.095847205186613e-05, "loss": 3.8255, "step": 6742 }, { "epoch": 1.7719972411074982, "grad_norm": 0.8466473817825317, "learning_rate": 4.094094971088138e-05, "loss": 3.7768, "step": 6744 }, { "epoch": 1.7725227444411602, "grad_norm": 0.8120173215866089, "learning_rate": 4.092342736989662e-05, "loss": 3.8189, "step": 6746 }, { "epoch": 1.7730482477748217, "grad_norm": 0.8570998907089233, "learning_rate": 4.090590502891187e-05, "loss": 3.793, "step": 6748 }, { "epoch": 
1.7735737511084837, "grad_norm": 0.7813621163368225, "learning_rate": 4.088838268792711e-05, "loss": 3.7739, "step": 6750 }, { "epoch": 1.7740992544421452, "grad_norm": 0.8926020860671997, "learning_rate": 4.087086034694235e-05, "loss": 3.7842, "step": 6752 }, { "epoch": 1.7746247577758072, "grad_norm": 0.8642773032188416, "learning_rate": 4.08533380059576e-05, "loss": 3.8004, "step": 6754 }, { "epoch": 1.775150261109469, "grad_norm": 0.8771176338195801, "learning_rate": 4.0835815664972845e-05, "loss": 3.7407, "step": 6756 }, { "epoch": 1.7756757644431307, "grad_norm": 0.8767798542976379, "learning_rate": 4.0818293323988086e-05, "loss": 3.7675, "step": 6758 }, { "epoch": 1.7762012677767924, "grad_norm": 0.7885094285011292, "learning_rate": 4.080077098300333e-05, "loss": 3.8047, "step": 6760 }, { "epoch": 1.7767267711104542, "grad_norm": 0.70883709192276, "learning_rate": 4.0783248642018574e-05, "loss": 3.808, "step": 6762 }, { "epoch": 1.7772522744441162, "grad_norm": 0.9619625806808472, "learning_rate": 4.076572630103382e-05, "loss": 3.7857, "step": 6764 }, { "epoch": 1.7777777777777777, "grad_norm": 0.8603535294532776, "learning_rate": 4.074820396004906e-05, "loss": 3.767, "step": 6766 }, { "epoch": 1.7783032811114396, "grad_norm": 0.8111237287521362, "learning_rate": 4.073068161906431e-05, "loss": 3.8262, "step": 6768 }, { "epoch": 1.7788287844451012, "grad_norm": 0.7609623074531555, "learning_rate": 4.071315927807956e-05, "loss": 3.7966, "step": 6770 }, { "epoch": 1.7793542877787631, "grad_norm": 0.8881996273994446, "learning_rate": 4.06956369370948e-05, "loss": 3.7925, "step": 6772 }, { "epoch": 1.779879791112425, "grad_norm": 0.9231682419776917, "learning_rate": 4.067811459611004e-05, "loss": 3.7366, "step": 6774 }, { "epoch": 1.7804052944460866, "grad_norm": 0.7926375865936279, "learning_rate": 4.066059225512529e-05, "loss": 3.8124, "step": 6776 }, { "epoch": 1.7809307977797484, "grad_norm": 0.8643268346786499, "learning_rate": 4.064306991414053e-05, "loss": 
3.7591, "step": 6778 }, { "epoch": 1.7814563011134101, "grad_norm": 0.7792268395423889, "learning_rate": 4.0625547573155775e-05, "loss": 3.822, "step": 6780 }, { "epoch": 1.781981804447072, "grad_norm": 0.8378827571868896, "learning_rate": 4.060802523217102e-05, "loss": 3.8032, "step": 6782 }, { "epoch": 1.7825073077807336, "grad_norm": 0.8528345823287964, "learning_rate": 4.059050289118626e-05, "loss": 3.7493, "step": 6784 }, { "epoch": 1.7830328111143956, "grad_norm": 0.8127536177635193, "learning_rate": 4.057298055020151e-05, "loss": 3.7587, "step": 6786 }, { "epoch": 1.7835583144480571, "grad_norm": 0.9386539459228516, "learning_rate": 4.055545820921675e-05, "loss": 3.7199, "step": 6788 }, { "epoch": 1.784083817781719, "grad_norm": 0.7639591097831726, "learning_rate": 4.0537935868232e-05, "loss": 3.7806, "step": 6790 }, { "epoch": 1.7846093211153808, "grad_norm": 0.8406015634536743, "learning_rate": 4.052041352724724e-05, "loss": 3.7559, "step": 6792 }, { "epoch": 1.7851348244490426, "grad_norm": 0.7880905270576477, "learning_rate": 4.050289118626249e-05, "loss": 3.8243, "step": 6794 }, { "epoch": 1.7856603277827043, "grad_norm": 0.8461362719535828, "learning_rate": 4.0485368845277735e-05, "loss": 3.8402, "step": 6796 }, { "epoch": 1.786185831116366, "grad_norm": 0.9586267471313477, "learning_rate": 4.0467846504292976e-05, "loss": 3.7325, "step": 6798 }, { "epoch": 1.786711334450028, "grad_norm": 0.7644850611686707, "learning_rate": 4.045032416330822e-05, "loss": 3.7066, "step": 6800 }, { "epoch": 1.786711334450028, "eval_loss": 3.762897491455078, "eval_runtime": 464.7273, "eval_samples_per_second": 262.066, "eval_steps_per_second": 8.19, "step": 6800 }, { "epoch": 1.7872368377836896, "grad_norm": 0.7776674628257751, "learning_rate": 4.0432801822323464e-05, "loss": 3.7261, "step": 6802 }, { "epoch": 1.7877623411173515, "grad_norm": 0.8307031393051147, "learning_rate": 4.0415279481338705e-05, "loss": 3.7821, "step": 6804 }, { "epoch": 1.788287844451013, 
"grad_norm": 0.8042975068092346, "learning_rate": 4.039775714035395e-05, "loss": 3.7935, "step": 6806 }, { "epoch": 1.788813347784675, "grad_norm": 0.7953324317932129, "learning_rate": 4.03802347993692e-05, "loss": 3.7861, "step": 6808 }, { "epoch": 1.7893388511183368, "grad_norm": 0.7461639642715454, "learning_rate": 4.036271245838444e-05, "loss": 3.7792, "step": 6810 }, { "epoch": 1.7898643544519985, "grad_norm": 0.7623462080955505, "learning_rate": 4.034519011739968e-05, "loss": 3.7721, "step": 6812 }, { "epoch": 1.7903898577856603, "grad_norm": 0.7534228563308716, "learning_rate": 4.032766777641493e-05, "loss": 3.7886, "step": 6814 }, { "epoch": 1.790915361119322, "grad_norm": 0.8644856810569763, "learning_rate": 4.031014543543018e-05, "loss": 3.7798, "step": 6816 }, { "epoch": 1.791440864452984, "grad_norm": 0.9150791168212891, "learning_rate": 4.029262309444542e-05, "loss": 3.7748, "step": 6818 }, { "epoch": 1.7919663677866455, "grad_norm": 0.7881348133087158, "learning_rate": 4.0275100753460665e-05, "loss": 3.77, "step": 6820 }, { "epoch": 1.7924918711203075, "grad_norm": 0.8126546144485474, "learning_rate": 4.025757841247591e-05, "loss": 3.7411, "step": 6822 }, { "epoch": 1.793017374453969, "grad_norm": 0.8163486123085022, "learning_rate": 4.0240056071491154e-05, "loss": 3.7886, "step": 6824 }, { "epoch": 1.793542877787631, "grad_norm": 0.8193070292472839, "learning_rate": 4.0222533730506394e-05, "loss": 3.7641, "step": 6826 }, { "epoch": 1.7940683811212927, "grad_norm": 0.8468549847602844, "learning_rate": 4.020501138952164e-05, "loss": 3.7306, "step": 6828 }, { "epoch": 1.7945938844549545, "grad_norm": 0.7572709918022156, "learning_rate": 4.018748904853688e-05, "loss": 3.7715, "step": 6830 }, { "epoch": 1.7951193877886162, "grad_norm": 0.7869740128517151, "learning_rate": 4.016996670755213e-05, "loss": 3.7673, "step": 6832 }, { "epoch": 1.795644891122278, "grad_norm": 0.8349559903144836, "learning_rate": 4.015244436656738e-05, "loss": 3.7441, "step": 6834 
}, { "epoch": 1.79617039445594, "grad_norm": 0.7526381611824036, "learning_rate": 4.013492202558262e-05, "loss": 3.7876, "step": 6836 }, { "epoch": 1.7966958977896015, "grad_norm": 0.8823502063751221, "learning_rate": 4.011739968459786e-05, "loss": 3.7672, "step": 6838 }, { "epoch": 1.7972214011232635, "grad_norm": 0.8246528506278992, "learning_rate": 4.009987734361311e-05, "loss": 3.7542, "step": 6840 }, { "epoch": 1.7977469044569252, "grad_norm": 0.9025899171829224, "learning_rate": 4.0082355002628355e-05, "loss": 3.7956, "step": 6842 }, { "epoch": 1.798272407790587, "grad_norm": 0.7570231556892395, "learning_rate": 4.0064832661643595e-05, "loss": 3.8308, "step": 6844 }, { "epoch": 1.7987979111242487, "grad_norm": 0.7741662859916687, "learning_rate": 4.004731032065884e-05, "loss": 3.7565, "step": 6846 }, { "epoch": 1.7993234144579104, "grad_norm": 0.7700703740119934, "learning_rate": 4.002978797967409e-05, "loss": 3.8171, "step": 6848 }, { "epoch": 1.7998489177915722, "grad_norm": 0.8525515198707581, "learning_rate": 4.001226563868933e-05, "loss": 3.7571, "step": 6850 }, { "epoch": 1.800374421125234, "grad_norm": 0.7596049904823303, "learning_rate": 3.999474329770457e-05, "loss": 3.7793, "step": 6852 }, { "epoch": 1.800899924458896, "grad_norm": 0.8376478552818298, "learning_rate": 3.997722095671982e-05, "loss": 3.7635, "step": 6854 }, { "epoch": 1.8014254277925574, "grad_norm": 0.9050460457801819, "learning_rate": 3.995969861573507e-05, "loss": 3.7853, "step": 6856 }, { "epoch": 1.8019509311262194, "grad_norm": 0.8534972667694092, "learning_rate": 3.994217627475031e-05, "loss": 3.7555, "step": 6858 }, { "epoch": 1.8024764344598811, "grad_norm": 0.8024195432662964, "learning_rate": 3.9924653933765556e-05, "loss": 3.7984, "step": 6860 }, { "epoch": 1.803001937793543, "grad_norm": 0.6906440258026123, "learning_rate": 3.99071315927808e-05, "loss": 3.7826, "step": 6862 }, { "epoch": 1.8035274411272046, "grad_norm": 0.8288623690605164, "learning_rate": 
3.988960925179604e-05, "loss": 3.7345, "step": 6864 }, { "epoch": 1.8040529444608664, "grad_norm": 0.8112103343009949, "learning_rate": 3.9872086910811285e-05, "loss": 3.7919, "step": 6866 }, { "epoch": 1.8045784477945284, "grad_norm": 0.8490801453590393, "learning_rate": 3.985456456982653e-05, "loss": 3.7437, "step": 6868 }, { "epoch": 1.8051039511281899, "grad_norm": 0.8272614479064941, "learning_rate": 3.983704222884177e-05, "loss": 3.779, "step": 6870 }, { "epoch": 1.8056294544618519, "grad_norm": 0.845544695854187, "learning_rate": 3.981951988785702e-05, "loss": 3.7931, "step": 6872 }, { "epoch": 1.8061549577955134, "grad_norm": 0.7968992590904236, "learning_rate": 3.980199754687227e-05, "loss": 3.7337, "step": 6874 }, { "epoch": 1.8066804611291754, "grad_norm": 0.8595684170722961, "learning_rate": 3.978447520588751e-05, "loss": 3.7528, "step": 6876 }, { "epoch": 1.807205964462837, "grad_norm": 0.7786855101585388, "learning_rate": 3.976695286490275e-05, "loss": 3.8034, "step": 6878 }, { "epoch": 1.8077314677964988, "grad_norm": 0.8265178799629211, "learning_rate": 3.9749430523918e-05, "loss": 3.7916, "step": 6880 }, { "epoch": 1.8082569711301606, "grad_norm": 0.8367806673049927, "learning_rate": 3.9731908182933245e-05, "loss": 3.7871, "step": 6882 }, { "epoch": 1.8087824744638223, "grad_norm": 0.7865480780601501, "learning_rate": 3.9714385841948486e-05, "loss": 3.7507, "step": 6884 }, { "epoch": 1.8093079777974843, "grad_norm": 0.8354262709617615, "learning_rate": 3.969686350096373e-05, "loss": 3.7499, "step": 6886 }, { "epoch": 1.8098334811311458, "grad_norm": 0.7767797708511353, "learning_rate": 3.967934115997898e-05, "loss": 3.7916, "step": 6888 }, { "epoch": 1.8103589844648078, "grad_norm": 0.9347259998321533, "learning_rate": 3.9661818818994215e-05, "loss": 3.7303, "step": 6890 }, { "epoch": 1.8108844877984693, "grad_norm": 0.7582933902740479, "learning_rate": 3.964429647800946e-05, "loss": 3.7984, "step": 6892 }, { "epoch": 1.8114099911321313, 
"grad_norm": 0.7930318117141724, "learning_rate": 3.962677413702471e-05, "loss": 3.7647, "step": 6894 }, { "epoch": 1.811935494465793, "grad_norm": 0.8934944272041321, "learning_rate": 3.960925179603995e-05, "loss": 3.7628, "step": 6896 }, { "epoch": 1.8124609977994548, "grad_norm": 0.7359493374824524, "learning_rate": 3.95917294550552e-05, "loss": 3.7642, "step": 6898 }, { "epoch": 1.8129865011331165, "grad_norm": 0.8254148960113525, "learning_rate": 3.9574207114070446e-05, "loss": 3.7368, "step": 6900 }, { "epoch": 1.8135120044667783, "grad_norm": 0.702992856502533, "learning_rate": 3.955668477308569e-05, "loss": 3.7838, "step": 6902 }, { "epoch": 1.8140375078004403, "grad_norm": 0.7914687395095825, "learning_rate": 3.953916243210093e-05, "loss": 3.7582, "step": 6904 }, { "epoch": 1.8145630111341018, "grad_norm": 0.8423899412155151, "learning_rate": 3.9521640091116175e-05, "loss": 3.7489, "step": 6906 }, { "epoch": 1.8150885144677638, "grad_norm": 0.698235034942627, "learning_rate": 3.950411775013142e-05, "loss": 3.7296, "step": 6908 }, { "epoch": 1.8156140178014253, "grad_norm": 0.7397322058677673, "learning_rate": 3.948659540914666e-05, "loss": 3.7973, "step": 6910 }, { "epoch": 1.8161395211350873, "grad_norm": 0.9255777597427368, "learning_rate": 3.946907306816191e-05, "loss": 3.7773, "step": 6912 }, { "epoch": 1.816665024468749, "grad_norm": 0.7510200142860413, "learning_rate": 3.945155072717715e-05, "loss": 3.7551, "step": 6914 }, { "epoch": 1.8171905278024107, "grad_norm": 0.8148588538169861, "learning_rate": 3.943402838619239e-05, "loss": 3.7796, "step": 6916 }, { "epoch": 1.8177160311360725, "grad_norm": 0.7270950078964233, "learning_rate": 3.941650604520764e-05, "loss": 3.7615, "step": 6918 }, { "epoch": 1.8182415344697342, "grad_norm": 0.940708339214325, "learning_rate": 3.939898370422289e-05, "loss": 3.737, "step": 6920 }, { "epoch": 1.8187670378033962, "grad_norm": 0.8404219150543213, "learning_rate": 3.938146136323813e-05, "loss": 3.7884, "step": 
6922 }, { "epoch": 1.8192925411370577, "grad_norm": 0.7498236894607544, "learning_rate": 3.9363939022253376e-05, "loss": 3.7735, "step": 6924 }, { "epoch": 1.8198180444707197, "grad_norm": 0.7856084704399109, "learning_rate": 3.9346416681268624e-05, "loss": 3.8212, "step": 6926 }, { "epoch": 1.8203435478043812, "grad_norm": 0.7542836666107178, "learning_rate": 3.9328894340283864e-05, "loss": 3.7662, "step": 6928 }, { "epoch": 1.8208690511380432, "grad_norm": 0.8267679214477539, "learning_rate": 3.9311371999299105e-05, "loss": 3.8584, "step": 6930 }, { "epoch": 1.821394554471705, "grad_norm": 0.7070279121398926, "learning_rate": 3.929384965831435e-05, "loss": 3.7768, "step": 6932 }, { "epoch": 1.8219200578053667, "grad_norm": 0.7621411681175232, "learning_rate": 3.92763273173296e-05, "loss": 3.7612, "step": 6934 }, { "epoch": 1.8224455611390284, "grad_norm": 0.8179396390914917, "learning_rate": 3.925880497634484e-05, "loss": 3.7938, "step": 6936 }, { "epoch": 1.8229710644726902, "grad_norm": 1.029765248298645, "learning_rate": 3.924128263536009e-05, "loss": 3.7465, "step": 6938 }, { "epoch": 1.8234965678063522, "grad_norm": 0.7536749839782715, "learning_rate": 3.922376029437533e-05, "loss": 3.7517, "step": 6940 }, { "epoch": 1.8240220711400137, "grad_norm": 0.7844004034996033, "learning_rate": 3.920623795339057e-05, "loss": 3.7937, "step": 6942 }, { "epoch": 1.8245475744736757, "grad_norm": 0.789651095867157, "learning_rate": 3.918871561240582e-05, "loss": 3.7768, "step": 6944 }, { "epoch": 1.8250730778073372, "grad_norm": 0.7462711334228516, "learning_rate": 3.9171193271421065e-05, "loss": 3.7775, "step": 6946 }, { "epoch": 1.8255985811409992, "grad_norm": 0.7539663314819336, "learning_rate": 3.9153670930436306e-05, "loss": 3.73, "step": 6948 }, { "epoch": 1.826124084474661, "grad_norm": 0.8040952086448669, "learning_rate": 3.9136148589451554e-05, "loss": 3.8197, "step": 6950 }, { "epoch": 1.8266495878083227, "grad_norm": 0.7603068947792053, "learning_rate": 
3.91186262484668e-05, "loss": 3.7794, "step": 6952 }, { "epoch": 1.8271750911419844, "grad_norm": 0.7606468796730042, "learning_rate": 3.910110390748204e-05, "loss": 3.7535, "step": 6954 }, { "epoch": 1.8277005944756461, "grad_norm": 0.801288366317749, "learning_rate": 3.908358156649728e-05, "loss": 3.8111, "step": 6956 }, { "epoch": 1.8282260978093081, "grad_norm": 0.7987909913063049, "learning_rate": 3.906605922551253e-05, "loss": 3.7272, "step": 6958 }, { "epoch": 1.8287516011429696, "grad_norm": 0.7113742232322693, "learning_rate": 3.904853688452778e-05, "loss": 3.7422, "step": 6960 }, { "epoch": 1.8292771044766316, "grad_norm": 0.8882097601890564, "learning_rate": 3.903101454354302e-05, "loss": 3.7076, "step": 6962 }, { "epoch": 1.8298026078102931, "grad_norm": 0.7231417894363403, "learning_rate": 3.9013492202558266e-05, "loss": 3.7759, "step": 6964 }, { "epoch": 1.830328111143955, "grad_norm": 0.774130642414093, "learning_rate": 3.899596986157351e-05, "loss": 3.8378, "step": 6966 }, { "epoch": 1.8308536144776169, "grad_norm": 0.7747817039489746, "learning_rate": 3.897844752058875e-05, "loss": 3.7583, "step": 6968 }, { "epoch": 1.8313791178112786, "grad_norm": 0.7536869645118713, "learning_rate": 3.8960925179603995e-05, "loss": 3.7812, "step": 6970 }, { "epoch": 1.8319046211449403, "grad_norm": 0.8985234498977661, "learning_rate": 3.894340283861924e-05, "loss": 3.8095, "step": 6972 }, { "epoch": 1.832430124478602, "grad_norm": 0.807640016078949, "learning_rate": 3.8925880497634484e-05, "loss": 3.805, "step": 6974 }, { "epoch": 1.832955627812264, "grad_norm": 0.8222396373748779, "learning_rate": 3.890835815664973e-05, "loss": 3.696, "step": 6976 }, { "epoch": 1.8334811311459256, "grad_norm": 0.8360228538513184, "learning_rate": 3.889083581566497e-05, "loss": 3.7876, "step": 6978 }, { "epoch": 1.8340066344795876, "grad_norm": 0.8886296153068542, "learning_rate": 3.887331347468022e-05, "loss": 3.761, "step": 6980 }, { "epoch": 1.834532137813249, "grad_norm": 
0.8245058059692383, "learning_rate": 3.885579113369546e-05, "loss": 3.7863, "step": 6982 }, { "epoch": 1.835057641146911, "grad_norm": 0.9058637619018555, "learning_rate": 3.883826879271071e-05, "loss": 3.8234, "step": 6984 }, { "epoch": 1.8355831444805728, "grad_norm": 0.7474458813667297, "learning_rate": 3.8820746451725956e-05, "loss": 3.7265, "step": 6986 }, { "epoch": 1.8361086478142346, "grad_norm": 0.8486431241035461, "learning_rate": 3.8803224110741196e-05, "loss": 3.7661, "step": 6988 }, { "epoch": 1.8366341511478963, "grad_norm": 0.7955511212348938, "learning_rate": 3.8785701769756444e-05, "loss": 3.7621, "step": 6990 }, { "epoch": 1.837159654481558, "grad_norm": 0.8351337909698486, "learning_rate": 3.8768179428771685e-05, "loss": 3.7713, "step": 6992 }, { "epoch": 1.83768515781522, "grad_norm": 1.043115258216858, "learning_rate": 3.875065708778693e-05, "loss": 3.7786, "step": 6994 }, { "epoch": 1.8382106611488815, "grad_norm": 0.9047161936759949, "learning_rate": 3.873313474680217e-05, "loss": 3.733, "step": 6996 }, { "epoch": 1.8387361644825435, "grad_norm": 0.8050552606582642, "learning_rate": 3.871561240581742e-05, "loss": 3.7457, "step": 6998 }, { "epoch": 1.8392616678162053, "grad_norm": 0.7686793804168701, "learning_rate": 3.869809006483267e-05, "loss": 3.6882, "step": 7000 }, { "epoch": 1.839787171149867, "grad_norm": 0.9280879497528076, "learning_rate": 3.868056772384791e-05, "loss": 3.7868, "step": 7002 }, { "epoch": 1.8403126744835288, "grad_norm": 0.7892923355102539, "learning_rate": 3.866304538286315e-05, "loss": 3.7194, "step": 7004 }, { "epoch": 1.8408381778171905, "grad_norm": 0.7244144082069397, "learning_rate": 3.86455230418784e-05, "loss": 3.7997, "step": 7006 }, { "epoch": 1.8413636811508522, "grad_norm": 0.7875443696975708, "learning_rate": 3.862800070089364e-05, "loss": 3.7997, "step": 7008 }, { "epoch": 1.841889184484514, "grad_norm": 0.9401888847351074, "learning_rate": 3.8610478359908886e-05, "loss": 3.7853, "step": 7010 }, { 
"epoch": 1.842414687818176, "grad_norm": 0.8290198445320129, "learning_rate": 3.859295601892413e-05, "loss": 3.7284, "step": 7012 }, { "epoch": 1.8429401911518375, "grad_norm": 0.943585216999054, "learning_rate": 3.8575433677939374e-05, "loss": 3.7317, "step": 7014 }, { "epoch": 1.8434656944854995, "grad_norm": 0.7412745952606201, "learning_rate": 3.8557911336954615e-05, "loss": 3.8035, "step": 7016 }, { "epoch": 1.8439911978191612, "grad_norm": 0.7825527191162109, "learning_rate": 3.854038899596986e-05, "loss": 3.8335, "step": 7018 }, { "epoch": 1.844516701152823, "grad_norm": 0.8610004782676697, "learning_rate": 3.852286665498511e-05, "loss": 3.786, "step": 7020 }, { "epoch": 1.8450422044864847, "grad_norm": 0.8845333456993103, "learning_rate": 3.850534431400035e-05, "loss": 3.7777, "step": 7022 }, { "epoch": 1.8455677078201465, "grad_norm": 0.8659244179725647, "learning_rate": 3.84878219730156e-05, "loss": 3.799, "step": 7024 }, { "epoch": 1.8460932111538084, "grad_norm": 0.7839697003364563, "learning_rate": 3.8470299632030846e-05, "loss": 3.7035, "step": 7026 }, { "epoch": 1.84661871448747, "grad_norm": 0.8702651262283325, "learning_rate": 3.845277729104609e-05, "loss": 3.771, "step": 7028 }, { "epoch": 1.847144217821132, "grad_norm": 0.8462900519371033, "learning_rate": 3.843525495006133e-05, "loss": 3.7696, "step": 7030 }, { "epoch": 1.8476697211547934, "grad_norm": 0.7593976259231567, "learning_rate": 3.8417732609076575e-05, "loss": 3.7592, "step": 7032 }, { "epoch": 1.8481952244884554, "grad_norm": 0.750741183757782, "learning_rate": 3.8400210268091816e-05, "loss": 3.8042, "step": 7034 }, { "epoch": 1.8487207278221172, "grad_norm": 0.8348686099052429, "learning_rate": 3.838268792710706e-05, "loss": 3.7642, "step": 7036 }, { "epoch": 1.849246231155779, "grad_norm": 0.7786337733268738, "learning_rate": 3.836516558612231e-05, "loss": 3.7699, "step": 7038 }, { "epoch": 1.8497717344894407, "grad_norm": 0.7430334091186523, "learning_rate": 3.834764324513755e-05, 
"loss": 3.8033, "step": 7040 }, { "epoch": 1.8502972378231024, "grad_norm": 0.8263676762580872, "learning_rate": 3.833012090415279e-05, "loss": 3.7714, "step": 7042 }, { "epoch": 1.8508227411567644, "grad_norm": 0.7179944515228271, "learning_rate": 3.831259856316804e-05, "loss": 3.7378, "step": 7044 }, { "epoch": 1.851348244490426, "grad_norm": 0.784455418586731, "learning_rate": 3.829507622218329e-05, "loss": 3.7895, "step": 7046 }, { "epoch": 1.8518737478240879, "grad_norm": 0.7715687155723572, "learning_rate": 3.827755388119853e-05, "loss": 3.7866, "step": 7048 }, { "epoch": 1.8523992511577494, "grad_norm": 0.712004542350769, "learning_rate": 3.8260031540213776e-05, "loss": 3.7641, "step": 7050 }, { "epoch": 1.8529247544914114, "grad_norm": 0.7935072779655457, "learning_rate": 3.8242509199229024e-05, "loss": 3.7826, "step": 7052 }, { "epoch": 1.853450257825073, "grad_norm": 0.7749385833740234, "learning_rate": 3.822498685824426e-05, "loss": 3.8146, "step": 7054 }, { "epoch": 1.8539757611587349, "grad_norm": 0.8610692620277405, "learning_rate": 3.8207464517259505e-05, "loss": 3.7695, "step": 7056 }, { "epoch": 1.8545012644923966, "grad_norm": 0.8118829727172852, "learning_rate": 3.818994217627475e-05, "loss": 3.7725, "step": 7058 }, { "epoch": 1.8550267678260584, "grad_norm": 0.8353779315948486, "learning_rate": 3.8172419835289993e-05, "loss": 3.8203, "step": 7060 }, { "epoch": 1.8555522711597203, "grad_norm": 0.9004368782043457, "learning_rate": 3.815489749430524e-05, "loss": 3.822, "step": 7062 }, { "epoch": 1.8560777744933818, "grad_norm": 0.8338342905044556, "learning_rate": 3.813737515332049e-05, "loss": 3.7301, "step": 7064 }, { "epoch": 1.8566032778270438, "grad_norm": 0.8245241641998291, "learning_rate": 3.811985281233573e-05, "loss": 3.778, "step": 7066 }, { "epoch": 1.8571287811607053, "grad_norm": 0.9144248962402344, "learning_rate": 3.810233047135097e-05, "loss": 3.7893, "step": 7068 }, { "epoch": 1.8576542844943673, "grad_norm": 0.8430649638175964, 
"learning_rate": 3.808480813036622e-05, "loss": 3.8324, "step": 7070 }, { "epoch": 1.858179787828029, "grad_norm": 0.8142551183700562, "learning_rate": 3.8067285789381465e-05, "loss": 3.7758, "step": 7072 }, { "epoch": 1.8587052911616908, "grad_norm": 0.8191378712654114, "learning_rate": 3.8049763448396706e-05, "loss": 3.7601, "step": 7074 }, { "epoch": 1.8592307944953526, "grad_norm": 1.1114091873168945, "learning_rate": 3.8032241107411954e-05, "loss": 3.8123, "step": 7076 }, { "epoch": 1.8597562978290143, "grad_norm": 0.9464265704154968, "learning_rate": 3.80147187664272e-05, "loss": 3.7445, "step": 7078 }, { "epoch": 1.8602818011626763, "grad_norm": 0.8616015315055847, "learning_rate": 3.7997196425442435e-05, "loss": 3.8004, "step": 7080 }, { "epoch": 1.8608073044963378, "grad_norm": 0.7472968101501465, "learning_rate": 3.797967408445768e-05, "loss": 3.7184, "step": 7082 }, { "epoch": 1.8613328078299998, "grad_norm": 0.8536667823791504, "learning_rate": 3.796215174347293e-05, "loss": 3.7312, "step": 7084 }, { "epoch": 1.8618583111636613, "grad_norm": 0.786357581615448, "learning_rate": 3.794462940248817e-05, "loss": 3.762, "step": 7086 }, { "epoch": 1.8623838144973233, "grad_norm": 0.8294268846511841, "learning_rate": 3.792710706150342e-05, "loss": 3.8037, "step": 7088 }, { "epoch": 1.862909317830985, "grad_norm": 0.795151948928833, "learning_rate": 3.7909584720518666e-05, "loss": 3.7343, "step": 7090 }, { "epoch": 1.8634348211646468, "grad_norm": 0.9068311452865601, "learning_rate": 3.789206237953391e-05, "loss": 3.741, "step": 7092 }, { "epoch": 1.8639603244983085, "grad_norm": 0.8108829259872437, "learning_rate": 3.787454003854915e-05, "loss": 3.7755, "step": 7094 }, { "epoch": 1.8644858278319703, "grad_norm": 0.8444769978523254, "learning_rate": 3.7857017697564395e-05, "loss": 3.7311, "step": 7096 }, { "epoch": 1.8650113311656322, "grad_norm": 1.005103588104248, "learning_rate": 3.783949535657964e-05, "loss": 3.7825, "step": 7098 }, { "epoch": 
1.8655368344992938, "grad_norm": 0.7985761165618896, "learning_rate": 3.7821973015594884e-05, "loss": 3.8122, "step": 7100 }, { "epoch": 1.8660623378329557, "grad_norm": 0.8081100583076477, "learning_rate": 3.780445067461013e-05, "loss": 3.8036, "step": 7102 }, { "epoch": 1.8665878411666172, "grad_norm": 0.8697844743728638, "learning_rate": 3.778692833362538e-05, "loss": 3.7635, "step": 7104 }, { "epoch": 1.8671133445002792, "grad_norm": 0.8827864527702332, "learning_rate": 3.776940599264062e-05, "loss": 3.7338, "step": 7106 }, { "epoch": 1.867638847833941, "grad_norm": 0.8029345870018005, "learning_rate": 3.775188365165586e-05, "loss": 3.7181, "step": 7108 }, { "epoch": 1.8681643511676027, "grad_norm": 0.7628492712974548, "learning_rate": 3.773436131067111e-05, "loss": 3.7703, "step": 7110 }, { "epoch": 1.8686898545012645, "grad_norm": 0.7883694171905518, "learning_rate": 3.7716838969686356e-05, "loss": 3.7411, "step": 7112 }, { "epoch": 1.8692153578349262, "grad_norm": 0.7872506380081177, "learning_rate": 3.7699316628701596e-05, "loss": 3.751, "step": 7114 }, { "epoch": 1.8697408611685882, "grad_norm": 0.8143070340156555, "learning_rate": 3.7681794287716844e-05, "loss": 3.7631, "step": 7116 }, { "epoch": 1.8702663645022497, "grad_norm": 0.7886106371879578, "learning_rate": 3.7664271946732085e-05, "loss": 3.7264, "step": 7118 }, { "epoch": 1.8707918678359117, "grad_norm": 0.8022585511207581, "learning_rate": 3.7646749605747325e-05, "loss": 3.756, "step": 7120 }, { "epoch": 1.8713173711695732, "grad_norm": 0.7891400456428528, "learning_rate": 3.762922726476257e-05, "loss": 3.7629, "step": 7122 }, { "epoch": 1.8718428745032352, "grad_norm": 0.7349154353141785, "learning_rate": 3.761170492377782e-05, "loss": 3.7769, "step": 7124 }, { "epoch": 1.872368377836897, "grad_norm": 0.8339977860450745, "learning_rate": 3.759418258279306e-05, "loss": 3.7514, "step": 7126 }, { "epoch": 1.8728938811705587, "grad_norm": 0.7589173316955566, "learning_rate": 3.757666024180831e-05, 
"loss": 3.7675, "step": 7128 }, { "epoch": 1.8734193845042204, "grad_norm": 1.1766217947006226, "learning_rate": 3.7559137900823557e-05, "loss": 3.7822, "step": 7130 }, { "epoch": 1.8739448878378822, "grad_norm": 0.7856595516204834, "learning_rate": 3.75416155598388e-05, "loss": 3.8015, "step": 7132 }, { "epoch": 1.8744703911715441, "grad_norm": 0.7380483150482178, "learning_rate": 3.752409321885404e-05, "loss": 3.7818, "step": 7134 }, { "epoch": 1.8749958945052057, "grad_norm": 0.8165022134780884, "learning_rate": 3.7506570877869286e-05, "loss": 3.7374, "step": 7136 }, { "epoch": 1.8755213978388676, "grad_norm": 0.7303951978683472, "learning_rate": 3.748904853688453e-05, "loss": 3.7706, "step": 7138 }, { "epoch": 1.8760469011725294, "grad_norm": 0.8233087062835693, "learning_rate": 3.7471526195899774e-05, "loss": 3.7977, "step": 7140 }, { "epoch": 1.8765724045061911, "grad_norm": 0.8598429560661316, "learning_rate": 3.745400385491502e-05, "loss": 3.7224, "step": 7142 }, { "epoch": 1.8770979078398529, "grad_norm": 0.7967079877853394, "learning_rate": 3.743648151393026e-05, "loss": 3.7372, "step": 7144 }, { "epoch": 1.8776234111735146, "grad_norm": 0.8271150588989258, "learning_rate": 3.74189591729455e-05, "loss": 3.7345, "step": 7146 }, { "epoch": 1.8781489145071764, "grad_norm": 0.8150941133499146, "learning_rate": 3.740143683196075e-05, "loss": 3.7841, "step": 7148 }, { "epoch": 1.878674417840838, "grad_norm": 0.7777132987976074, "learning_rate": 3.7383914490976e-05, "loss": 3.7858, "step": 7150 }, { "epoch": 1.8791999211745, "grad_norm": 0.7489246129989624, "learning_rate": 3.736639214999124e-05, "loss": 3.7048, "step": 7152 }, { "epoch": 1.8797254245081616, "grad_norm": 0.8273763060569763, "learning_rate": 3.734886980900649e-05, "loss": 3.729, "step": 7154 }, { "epoch": 1.8802509278418236, "grad_norm": 0.8422088027000427, "learning_rate": 3.733134746802173e-05, "loss": 3.7914, "step": 7156 }, { "epoch": 1.8807764311754853, "grad_norm": 0.7503764629364014, 
"learning_rate": 3.7313825127036975e-05, "loss": 3.8013, "step": 7158 }, { "epoch": 1.881301934509147, "grad_norm": 0.8116047382354736, "learning_rate": 3.7296302786052216e-05, "loss": 3.7622, "step": 7160 }, { "epoch": 1.8818274378428088, "grad_norm": 0.8417380452156067, "learning_rate": 3.727878044506746e-05, "loss": 3.7799, "step": 7162 }, { "epoch": 1.8823529411764706, "grad_norm": 0.7540464997291565, "learning_rate": 3.726125810408271e-05, "loss": 3.7667, "step": 7164 }, { "epoch": 1.8828784445101323, "grad_norm": 0.7369726896286011, "learning_rate": 3.724373576309795e-05, "loss": 3.7591, "step": 7166 }, { "epoch": 1.883403947843794, "grad_norm": 0.818386435508728, "learning_rate": 3.72262134221132e-05, "loss": 3.7607, "step": 7168 }, { "epoch": 1.883929451177456, "grad_norm": 0.8985819816589355, "learning_rate": 3.720869108112844e-05, "loss": 3.7771, "step": 7170 }, { "epoch": 1.8844549545111176, "grad_norm": 0.8328900337219238, "learning_rate": 3.719116874014368e-05, "loss": 3.748, "step": 7172 }, { "epoch": 1.8849804578447795, "grad_norm": 0.820939838886261, "learning_rate": 3.717364639915893e-05, "loss": 3.795, "step": 7174 }, { "epoch": 1.8855059611784413, "grad_norm": 0.8815848231315613, "learning_rate": 3.7156124058174176e-05, "loss": 3.7587, "step": 7176 }, { "epoch": 1.886031464512103, "grad_norm": 0.8702841401100159, "learning_rate": 3.713860171718942e-05, "loss": 3.7112, "step": 7178 }, { "epoch": 1.8865569678457648, "grad_norm": 0.8567859530448914, "learning_rate": 3.7121079376204664e-05, "loss": 3.7304, "step": 7180 }, { "epoch": 1.8870824711794265, "grad_norm": 0.7805899977684021, "learning_rate": 3.7103557035219905e-05, "loss": 3.7195, "step": 7182 }, { "epoch": 1.8876079745130885, "grad_norm": 0.7664670944213867, "learning_rate": 3.708603469423515e-05, "loss": 3.7451, "step": 7184 }, { "epoch": 1.88813347784675, "grad_norm": 0.7236804962158203, "learning_rate": 3.7068512353250393e-05, "loss": 3.7997, "step": 7186 }, { "epoch": 
1.888658981180412, "grad_norm": 0.7603566646575928, "learning_rate": 3.705099001226564e-05, "loss": 3.7422, "step": 7188 }, { "epoch": 1.8891844845140735, "grad_norm": 0.7712535262107849, "learning_rate": 3.703346767128089e-05, "loss": 3.7324, "step": 7190 }, { "epoch": 1.8897099878477355, "grad_norm": 0.7514092922210693, "learning_rate": 3.701594533029613e-05, "loss": 3.8147, "step": 7192 }, { "epoch": 1.8902354911813972, "grad_norm": 0.8574222326278687, "learning_rate": 3.699842298931138e-05, "loss": 3.7431, "step": 7194 }, { "epoch": 1.890760994515059, "grad_norm": 0.8488388657569885, "learning_rate": 3.698090064832662e-05, "loss": 3.8158, "step": 7196 }, { "epoch": 1.8912864978487207, "grad_norm": 0.8958685994148254, "learning_rate": 3.696337830734186e-05, "loss": 3.772, "step": 7198 }, { "epoch": 1.8918120011823825, "grad_norm": 0.8519763350486755, "learning_rate": 3.6945855966357106e-05, "loss": 3.7625, "step": 7200 }, { "epoch": 1.8918120011823825, "eval_loss": 3.7543933391571045, "eval_runtime": 464.7532, "eval_samples_per_second": 262.051, "eval_steps_per_second": 8.189, "step": 7200 }, { "epoch": 1.8923375045160444, "grad_norm": 0.7797552943229675, "learning_rate": 3.6928333625372354e-05, "loss": 3.7935, "step": 7202 }, { "epoch": 1.892863007849706, "grad_norm": 0.8318517804145813, "learning_rate": 3.6910811284387594e-05, "loss": 3.759, "step": 7204 }, { "epoch": 1.893388511183368, "grad_norm": 0.7832900285720825, "learning_rate": 3.689328894340284e-05, "loss": 3.7798, "step": 7206 }, { "epoch": 1.8939140145170295, "grad_norm": 0.8736194968223572, "learning_rate": 3.687576660241808e-05, "loss": 3.7516, "step": 7208 }, { "epoch": 1.8944395178506914, "grad_norm": 0.7930798530578613, "learning_rate": 3.685824426143333e-05, "loss": 3.7664, "step": 7210 }, { "epoch": 1.8949650211843532, "grad_norm": 0.9596141576766968, "learning_rate": 3.684072192044857e-05, "loss": 3.7723, "step": 7212 }, { "epoch": 1.895490524518015, "grad_norm": 0.8516499400138855, 
"learning_rate": 3.682319957946382e-05, "loss": 3.8124, "step": 7214 }, { "epoch": 1.8960160278516767, "grad_norm": 0.812749445438385, "learning_rate": 3.6805677238479066e-05, "loss": 3.7703, "step": 7216 }, { "epoch": 1.8965415311853384, "grad_norm": 0.8009101748466492, "learning_rate": 3.678815489749431e-05, "loss": 3.7415, "step": 7218 }, { "epoch": 1.8970670345190004, "grad_norm": 0.8103194236755371, "learning_rate": 3.677063255650955e-05, "loss": 3.7781, "step": 7220 }, { "epoch": 1.897592537852662, "grad_norm": 0.7919992804527283, "learning_rate": 3.6753110215524795e-05, "loss": 3.7647, "step": 7222 }, { "epoch": 1.8981180411863239, "grad_norm": 0.8658493757247925, "learning_rate": 3.6735587874540036e-05, "loss": 3.8248, "step": 7224 }, { "epoch": 1.8986435445199854, "grad_norm": 0.7933748960494995, "learning_rate": 3.6718065533555284e-05, "loss": 3.8323, "step": 7226 }, { "epoch": 1.8991690478536474, "grad_norm": 0.9019044041633606, "learning_rate": 3.670054319257053e-05, "loss": 3.7797, "step": 7228 }, { "epoch": 1.8996945511873091, "grad_norm": 0.816685140132904, "learning_rate": 3.668302085158577e-05, "loss": 3.7323, "step": 7230 }, { "epoch": 1.9002200545209709, "grad_norm": 0.8297595381736755, "learning_rate": 3.666549851060102e-05, "loss": 3.7424, "step": 7232 }, { "epoch": 1.9007455578546326, "grad_norm": 0.937410831451416, "learning_rate": 3.664797616961626e-05, "loss": 3.783, "step": 7234 }, { "epoch": 1.9012710611882944, "grad_norm": 0.8015429973602295, "learning_rate": 3.663045382863151e-05, "loss": 3.8309, "step": 7236 }, { "epoch": 1.9017965645219563, "grad_norm": 0.8416038155555725, "learning_rate": 3.661293148764675e-05, "loss": 3.7813, "step": 7238 }, { "epoch": 1.9023220678556179, "grad_norm": 0.8654730916023254, "learning_rate": 3.6595409146661996e-05, "loss": 3.7737, "step": 7240 }, { "epoch": 1.9028475711892798, "grad_norm": 0.9184932708740234, "learning_rate": 3.6577886805677244e-05, "loss": 3.7752, "step": 7242 }, { "epoch": 
1.9033730745229414, "grad_norm": 0.8490609526634216, "learning_rate": 3.6560364464692485e-05, "loss": 3.7967, "step": 7244 }, { "epoch": 1.9038985778566033, "grad_norm": 0.8382405638694763, "learning_rate": 3.6542842123707725e-05, "loss": 3.7711, "step": 7246 }, { "epoch": 1.904424081190265, "grad_norm": 0.8822634220123291, "learning_rate": 3.652531978272297e-05, "loss": 3.7272, "step": 7248 }, { "epoch": 1.9049495845239268, "grad_norm": 0.8320958614349365, "learning_rate": 3.650779744173822e-05, "loss": 3.7967, "step": 7250 }, { "epoch": 1.9054750878575886, "grad_norm": 0.9033867120742798, "learning_rate": 3.649027510075346e-05, "loss": 3.6989, "step": 7252 }, { "epoch": 1.9060005911912503, "grad_norm": 0.8607004880905151, "learning_rate": 3.647275275976871e-05, "loss": 3.8033, "step": 7254 }, { "epoch": 1.9065260945249123, "grad_norm": 0.8222968578338623, "learning_rate": 3.6455230418783956e-05, "loss": 3.7442, "step": 7256 }, { "epoch": 1.9070515978585738, "grad_norm": 0.808159589767456, "learning_rate": 3.643770807779919e-05, "loss": 3.7799, "step": 7258 }, { "epoch": 1.9075771011922358, "grad_norm": 0.8401079773902893, "learning_rate": 3.642018573681444e-05, "loss": 3.7487, "step": 7260 }, { "epoch": 1.9081026045258973, "grad_norm": 0.8267903923988342, "learning_rate": 3.6402663395829686e-05, "loss": 3.7949, "step": 7262 }, { "epoch": 1.9086281078595593, "grad_norm": 1.0254801511764526, "learning_rate": 3.6385141054844926e-05, "loss": 3.7767, "step": 7264 }, { "epoch": 1.909153611193221, "grad_norm": 0.8202869892120361, "learning_rate": 3.6367618713860174e-05, "loss": 3.7503, "step": 7266 }, { "epoch": 1.9096791145268828, "grad_norm": 0.8672094345092773, "learning_rate": 3.635009637287542e-05, "loss": 3.7806, "step": 7268 }, { "epoch": 1.9102046178605445, "grad_norm": 0.835763156414032, "learning_rate": 3.633257403189066e-05, "loss": 3.7155, "step": 7270 }, { "epoch": 1.9107301211942063, "grad_norm": 0.7753351330757141, "learning_rate": 3.63150516909059e-05, 
"loss": 3.735, "step": 7272 }, { "epoch": 1.9112556245278682, "grad_norm": 0.811896562576294, "learning_rate": 3.629752934992115e-05, "loss": 3.8282, "step": 7274 }, { "epoch": 1.9117811278615298, "grad_norm": 0.7885233759880066, "learning_rate": 3.62800070089364e-05, "loss": 3.7214, "step": 7276 }, { "epoch": 1.9123066311951917, "grad_norm": 0.8838382959365845, "learning_rate": 3.626248466795164e-05, "loss": 3.7572, "step": 7278 }, { "epoch": 1.9128321345288533, "grad_norm": 0.8656865358352661, "learning_rate": 3.6244962326966887e-05, "loss": 3.782, "step": 7280 }, { "epoch": 1.9133576378625152, "grad_norm": 0.7675895094871521, "learning_rate": 3.6227439985982134e-05, "loss": 3.7695, "step": 7282 }, { "epoch": 1.913883141196177, "grad_norm": 0.8407493233680725, "learning_rate": 3.620991764499737e-05, "loss": 3.8142, "step": 7284 }, { "epoch": 1.9144086445298387, "grad_norm": 0.8503941297531128, "learning_rate": 3.6192395304012616e-05, "loss": 3.8226, "step": 7286 }, { "epoch": 1.9149341478635005, "grad_norm": 0.7693426609039307, "learning_rate": 3.617487296302786e-05, "loss": 3.7943, "step": 7288 }, { "epoch": 1.9154596511971622, "grad_norm": 0.8471866250038147, "learning_rate": 3.6157350622043104e-05, "loss": 3.719, "step": 7290 }, { "epoch": 1.9159851545308242, "grad_norm": 0.7711345553398132, "learning_rate": 3.613982828105835e-05, "loss": 3.766, "step": 7292 }, { "epoch": 1.9165106578644857, "grad_norm": 0.7115740776062012, "learning_rate": 3.61223059400736e-05, "loss": 3.8128, "step": 7294 }, { "epoch": 1.9170361611981477, "grad_norm": 0.8912208080291748, "learning_rate": 3.610478359908884e-05, "loss": 3.7825, "step": 7296 }, { "epoch": 1.9175616645318094, "grad_norm": 0.7602376937866211, "learning_rate": 3.608726125810408e-05, "loss": 3.7291, "step": 7298 }, { "epoch": 1.9180871678654712, "grad_norm": 0.8779393434524536, "learning_rate": 3.606973891711933e-05, "loss": 3.8175, "step": 7300 }, { "epoch": 1.918612671199133, "grad_norm": 0.8252623677253723, 
"learning_rate": 3.6052216576134576e-05, "loss": 3.754, "step": 7302 }, { "epoch": 1.9191381745327947, "grad_norm": 0.7199923396110535, "learning_rate": 3.603469423514982e-05, "loss": 3.8379, "step": 7304 }, { "epoch": 1.9196636778664564, "grad_norm": 0.8135952353477478, "learning_rate": 3.6017171894165064e-05, "loss": 3.7485, "step": 7306 }, { "epoch": 1.9201891812001182, "grad_norm": 0.8265425562858582, "learning_rate": 3.599964955318031e-05, "loss": 3.7832, "step": 7308 }, { "epoch": 1.9207146845337801, "grad_norm": 0.790228009223938, "learning_rate": 3.5982127212195546e-05, "loss": 3.7491, "step": 7310 }, { "epoch": 1.9212401878674417, "grad_norm": 0.9497824311256409, "learning_rate": 3.5964604871210793e-05, "loss": 3.7731, "step": 7312 }, { "epoch": 1.9217656912011036, "grad_norm": 0.8333897590637207, "learning_rate": 3.594708253022604e-05, "loss": 3.7575, "step": 7314 }, { "epoch": 1.9222911945347654, "grad_norm": 0.724471390247345, "learning_rate": 3.592956018924128e-05, "loss": 3.7506, "step": 7316 }, { "epoch": 1.9228166978684271, "grad_norm": 0.796096920967102, "learning_rate": 3.591203784825653e-05, "loss": 3.7515, "step": 7318 }, { "epoch": 1.9233422012020889, "grad_norm": 0.8611120581626892, "learning_rate": 3.589451550727178e-05, "loss": 3.8087, "step": 7320 }, { "epoch": 1.9238677045357506, "grad_norm": 0.7614686489105225, "learning_rate": 3.587699316628702e-05, "loss": 3.7776, "step": 7322 }, { "epoch": 1.9243932078694124, "grad_norm": 0.8461688756942749, "learning_rate": 3.585947082530226e-05, "loss": 3.7666, "step": 7324 }, { "epoch": 1.9249187112030741, "grad_norm": 0.7563474774360657, "learning_rate": 3.5841948484317506e-05, "loss": 3.775, "step": 7326 }, { "epoch": 1.925444214536736, "grad_norm": 0.7871602773666382, "learning_rate": 3.5824426143332754e-05, "loss": 3.7598, "step": 7328 }, { "epoch": 1.9259697178703976, "grad_norm": 0.746545672416687, "learning_rate": 3.5806903802347994e-05, "loss": 3.7751, "step": 7330 }, { "epoch": 
1.9264952212040596, "grad_norm": 0.8497496843338013, "learning_rate": 3.578938146136324e-05, "loss": 3.7374, "step": 7332 }, { "epoch": 1.9270207245377213, "grad_norm": 0.8453095555305481, "learning_rate": 3.577185912037849e-05, "loss": 3.7579, "step": 7334 }, { "epoch": 1.927546227871383, "grad_norm": 0.8349859714508057, "learning_rate": 3.5754336779393724e-05, "loss": 3.749, "step": 7336 }, { "epoch": 1.9280717312050448, "grad_norm": 0.8511797785758972, "learning_rate": 3.573681443840897e-05, "loss": 3.7349, "step": 7338 }, { "epoch": 1.9285972345387066, "grad_norm": 0.8310844898223877, "learning_rate": 3.571929209742422e-05, "loss": 3.7354, "step": 7340 }, { "epoch": 1.9291227378723685, "grad_norm": 0.8500331044197083, "learning_rate": 3.570176975643946e-05, "loss": 3.7362, "step": 7342 }, { "epoch": 1.92964824120603, "grad_norm": 0.8055227994918823, "learning_rate": 3.568424741545471e-05, "loss": 3.8018, "step": 7344 }, { "epoch": 1.930173744539692, "grad_norm": 0.9162694215774536, "learning_rate": 3.5666725074469955e-05, "loss": 3.7771, "step": 7346 }, { "epoch": 1.9306992478733536, "grad_norm": 0.865263044834137, "learning_rate": 3.5649202733485195e-05, "loss": 3.7645, "step": 7348 }, { "epoch": 1.9312247512070155, "grad_norm": 0.8559818267822266, "learning_rate": 3.5631680392500436e-05, "loss": 3.788, "step": 7350 }, { "epoch": 1.9317502545406773, "grad_norm": 0.8365180492401123, "learning_rate": 3.5614158051515684e-05, "loss": 3.7548, "step": 7352 }, { "epoch": 1.932275757874339, "grad_norm": 0.8124203681945801, "learning_rate": 3.559663571053093e-05, "loss": 3.7357, "step": 7354 }, { "epoch": 1.9328012612080008, "grad_norm": 0.8887948989868164, "learning_rate": 3.557911336954617e-05, "loss": 3.809, "step": 7356 }, { "epoch": 1.9333267645416625, "grad_norm": 0.7813817262649536, "learning_rate": 3.556159102856142e-05, "loss": 3.7744, "step": 7358 }, { "epoch": 1.9338522678753245, "grad_norm": 0.7807870507240295, "learning_rate": 3.554406868757666e-05, 
"loss": 3.7207, "step": 7360 }, { "epoch": 1.934377771208986, "grad_norm": 0.843121349811554, "learning_rate": 3.552654634659191e-05, "loss": 3.7506, "step": 7362 }, { "epoch": 1.934903274542648, "grad_norm": 0.8367806077003479, "learning_rate": 3.550902400560715e-05, "loss": 3.7693, "step": 7364 }, { "epoch": 1.9354287778763095, "grad_norm": 0.8156090974807739, "learning_rate": 3.5491501664622396e-05, "loss": 3.8068, "step": 7366 }, { "epoch": 1.9359542812099715, "grad_norm": 0.8017002940177917, "learning_rate": 3.5473979323637644e-05, "loss": 3.7063, "step": 7368 }, { "epoch": 1.9364797845436332, "grad_norm": 0.8074295520782471, "learning_rate": 3.5456456982652885e-05, "loss": 3.7625, "step": 7370 }, { "epoch": 1.937005287877295, "grad_norm": 0.8606386184692383, "learning_rate": 3.543893464166813e-05, "loss": 3.7543, "step": 7372 }, { "epoch": 1.9375307912109567, "grad_norm": 0.782391369342804, "learning_rate": 3.542141230068337e-05, "loss": 3.7696, "step": 7374 }, { "epoch": 1.9380562945446185, "grad_norm": 0.8216257095336914, "learning_rate": 3.5403889959698614e-05, "loss": 3.7789, "step": 7376 }, { "epoch": 1.9385817978782804, "grad_norm": 0.73822420835495, "learning_rate": 3.538636761871386e-05, "loss": 3.7642, "step": 7378 }, { "epoch": 1.939107301211942, "grad_norm": 0.7938769459724426, "learning_rate": 3.536884527772911e-05, "loss": 3.6983, "step": 7380 }, { "epoch": 1.939632804545604, "grad_norm": 0.9111276865005493, "learning_rate": 3.535132293674435e-05, "loss": 3.7748, "step": 7382 }, { "epoch": 1.9401583078792655, "grad_norm": 0.8689764738082886, "learning_rate": 3.53338005957596e-05, "loss": 3.7353, "step": 7384 }, { "epoch": 1.9406838112129274, "grad_norm": 0.833054780960083, "learning_rate": 3.531627825477484e-05, "loss": 3.8111, "step": 7386 }, { "epoch": 1.9412093145465892, "grad_norm": 0.8502274751663208, "learning_rate": 3.5298755913790086e-05, "loss": 3.7366, "step": 7388 }, { "epoch": 1.941734817880251, "grad_norm": 0.7662941813468933, 
"learning_rate": 3.5281233572805326e-05, "loss": 3.7546, "step": 7390 }, { "epoch": 1.9422603212139127, "grad_norm": 0.8249865174293518, "learning_rate": 3.5263711231820574e-05, "loss": 3.7441, "step": 7392 }, { "epoch": 1.9427858245475744, "grad_norm": 0.9211465120315552, "learning_rate": 3.524618889083582e-05, "loss": 3.7916, "step": 7394 }, { "epoch": 1.9433113278812364, "grad_norm": 0.7873339653015137, "learning_rate": 3.522866654985106e-05, "loss": 3.8231, "step": 7396 }, { "epoch": 1.943836831214898, "grad_norm": 0.8018662333488464, "learning_rate": 3.52111442088663e-05, "loss": 3.7328, "step": 7398 }, { "epoch": 1.94436233454856, "grad_norm": 0.8127778172492981, "learning_rate": 3.519362186788155e-05, "loss": 3.7695, "step": 7400 }, { "epoch": 1.9448878378822214, "grad_norm": 0.7739711999893188, "learning_rate": 3.517609952689679e-05, "loss": 3.7845, "step": 7402 }, { "epoch": 1.9454133412158834, "grad_norm": 0.8602389693260193, "learning_rate": 3.515857718591204e-05, "loss": 3.7943, "step": 7404 }, { "epoch": 1.9459388445495451, "grad_norm": 0.9560822248458862, "learning_rate": 3.5141054844927287e-05, "loss": 3.8193, "step": 7406 }, { "epoch": 1.9464643478832069, "grad_norm": 0.7726610898971558, "learning_rate": 3.512353250394253e-05, "loss": 3.75, "step": 7408 }, { "epoch": 1.9469898512168686, "grad_norm": 0.8106672763824463, "learning_rate": 3.5106010162957775e-05, "loss": 3.7442, "step": 7410 }, { "epoch": 1.9475153545505304, "grad_norm": 0.8665913343429565, "learning_rate": 3.5088487821973016e-05, "loss": 3.7646, "step": 7412 }, { "epoch": 1.9480408578841923, "grad_norm": 0.7971773743629456, "learning_rate": 3.507096548098826e-05, "loss": 3.7533, "step": 7414 }, { "epoch": 1.9485663612178539, "grad_norm": 0.8165305852890015, "learning_rate": 3.5053443140003504e-05, "loss": 3.7495, "step": 7416 }, { "epoch": 1.9490918645515158, "grad_norm": 0.7828363180160522, "learning_rate": 3.503592079901875e-05, "loss": 3.8213, "step": 7418 }, { "epoch": 
1.9496173678851774, "grad_norm": 0.7464872002601624, "learning_rate": 3.5018398458034e-05, "loss": 3.7727, "step": 7420 }, { "epoch": 1.9501428712188393, "grad_norm": 0.8454654812812805, "learning_rate": 3.500087611704924e-05, "loss": 3.7481, "step": 7422 }, { "epoch": 1.950668374552501, "grad_norm": 0.8494216799736023, "learning_rate": 3.498335377606448e-05, "loss": 3.7344, "step": 7424 }, { "epoch": 1.9511938778861628, "grad_norm": 0.8140987753868103, "learning_rate": 3.496583143507973e-05, "loss": 3.7464, "step": 7426 }, { "epoch": 1.9517193812198246, "grad_norm": 0.8646356463432312, "learning_rate": 3.494830909409497e-05, "loss": 3.7965, "step": 7428 }, { "epoch": 1.9522448845534863, "grad_norm": 0.8085780143737793, "learning_rate": 3.493078675311022e-05, "loss": 3.8131, "step": 7430 }, { "epoch": 1.9527703878871483, "grad_norm": 0.767582356929779, "learning_rate": 3.4913264412125464e-05, "loss": 3.7676, "step": 7432 }, { "epoch": 1.9532958912208098, "grad_norm": 0.9825206995010376, "learning_rate": 3.4895742071140705e-05, "loss": 3.7547, "step": 7434 }, { "epoch": 1.9538213945544718, "grad_norm": 0.9007595777511597, "learning_rate": 3.487821973015595e-05, "loss": 3.7784, "step": 7436 }, { "epoch": 1.9543468978881333, "grad_norm": 0.8712841272354126, "learning_rate": 3.486069738917119e-05, "loss": 3.7593, "step": 7438 }, { "epoch": 1.9548724012217953, "grad_norm": 0.8860572576522827, "learning_rate": 3.484317504818644e-05, "loss": 3.8204, "step": 7440 }, { "epoch": 1.955397904555457, "grad_norm": 0.7809417247772217, "learning_rate": 3.482565270720168e-05, "loss": 3.736, "step": 7442 }, { "epoch": 1.9559234078891188, "grad_norm": 0.7812906503677368, "learning_rate": 3.480813036621693e-05, "loss": 3.775, "step": 7444 }, { "epoch": 1.9564489112227805, "grad_norm": 0.8853201866149902, "learning_rate": 3.479060802523218e-05, "loss": 3.7922, "step": 7446 }, { "epoch": 1.9569744145564423, "grad_norm": 0.7846458554267883, "learning_rate": 3.477308568424742e-05, "loss": 
3.7756, "step": 7448 }, { "epoch": 1.9574999178901042, "grad_norm": 0.8613007068634033, "learning_rate": 3.475556334326266e-05, "loss": 3.7137, "step": 7450 }, { "epoch": 1.9580254212237658, "grad_norm": 0.8726389408111572, "learning_rate": 3.4738041002277906e-05, "loss": 3.7907, "step": 7452 }, { "epoch": 1.9585509245574277, "grad_norm": 0.8869491219520569, "learning_rate": 3.472051866129315e-05, "loss": 3.7878, "step": 7454 }, { "epoch": 1.9590764278910895, "grad_norm": 0.9047183394432068, "learning_rate": 3.4702996320308394e-05, "loss": 3.7069, "step": 7456 }, { "epoch": 1.9596019312247512, "grad_norm": 0.9288968443870544, "learning_rate": 3.468547397932364e-05, "loss": 3.8252, "step": 7458 }, { "epoch": 1.960127434558413, "grad_norm": 0.7946211099624634, "learning_rate": 3.466795163833888e-05, "loss": 3.7874, "step": 7460 }, { "epoch": 1.9606529378920747, "grad_norm": 0.8276675939559937, "learning_rate": 3.4650429297354123e-05, "loss": 3.7222, "step": 7462 }, { "epoch": 1.9611784412257365, "grad_norm": 0.7864258289337158, "learning_rate": 3.463290695636937e-05, "loss": 3.7681, "step": 7464 }, { "epoch": 1.9617039445593982, "grad_norm": 0.8005943894386292, "learning_rate": 3.461538461538462e-05, "loss": 3.7392, "step": 7466 }, { "epoch": 1.9622294478930602, "grad_norm": 0.7805814743041992, "learning_rate": 3.459786227439986e-05, "loss": 3.7671, "step": 7468 }, { "epoch": 1.9627549512267217, "grad_norm": 0.7442143559455872, "learning_rate": 3.458033993341511e-05, "loss": 3.7563, "step": 7470 }, { "epoch": 1.9632804545603837, "grad_norm": 0.8245219588279724, "learning_rate": 3.4562817592430355e-05, "loss": 3.7829, "step": 7472 }, { "epoch": 1.9638059578940454, "grad_norm": 0.9409242868423462, "learning_rate": 3.4545295251445595e-05, "loss": 3.7266, "step": 7474 }, { "epoch": 1.9643314612277072, "grad_norm": 0.7863244414329529, "learning_rate": 3.4527772910460836e-05, "loss": 3.7789, "step": 7476 }, { "epoch": 1.964856964561369, "grad_norm": 0.7903650999069214, 
"learning_rate": 3.4510250569476084e-05, "loss": 3.722, "step": 7478 }, { "epoch": 1.9653824678950307, "grad_norm": 0.8328583240509033, "learning_rate": 3.4492728228491324e-05, "loss": 3.7406, "step": 7480 }, { "epoch": 1.9659079712286924, "grad_norm": 0.8077432513237, "learning_rate": 3.447520588750657e-05, "loss": 3.7705, "step": 7482 }, { "epoch": 1.9664334745623542, "grad_norm": 0.8435444831848145, "learning_rate": 3.445768354652182e-05, "loss": 3.7657, "step": 7484 }, { "epoch": 1.9669589778960161, "grad_norm": 0.9052886366844177, "learning_rate": 3.444016120553706e-05, "loss": 3.7814, "step": 7486 }, { "epoch": 1.9674844812296777, "grad_norm": 0.8775551319122314, "learning_rate": 3.44226388645523e-05, "loss": 3.7966, "step": 7488 }, { "epoch": 1.9680099845633396, "grad_norm": 1.0277775526046753, "learning_rate": 3.440511652356755e-05, "loss": 3.7953, "step": 7490 }, { "epoch": 1.9685354878970014, "grad_norm": 0.7745800614356995, "learning_rate": 3.4387594182582796e-05, "loss": 3.78, "step": 7492 }, { "epoch": 1.9690609912306631, "grad_norm": 0.8163340091705322, "learning_rate": 3.437007184159804e-05, "loss": 3.7746, "step": 7494 }, { "epoch": 1.9695864945643249, "grad_norm": 0.8705493807792664, "learning_rate": 3.4352549500613285e-05, "loss": 3.764, "step": 7496 }, { "epoch": 1.9701119978979866, "grad_norm": 0.8812320232391357, "learning_rate": 3.433502715962853e-05, "loss": 3.7867, "step": 7498 }, { "epoch": 1.9706375012316486, "grad_norm": 0.7739870548248291, "learning_rate": 3.431750481864377e-05, "loss": 3.7607, "step": 7500 }, { "epoch": 1.9711630045653101, "grad_norm": 0.9248141050338745, "learning_rate": 3.4299982477659014e-05, "loss": 3.7859, "step": 7502 }, { "epoch": 1.971688507898972, "grad_norm": 0.8008308410644531, "learning_rate": 3.428246013667426e-05, "loss": 3.7559, "step": 7504 }, { "epoch": 1.9722140112326336, "grad_norm": 0.8028952479362488, "learning_rate": 3.426493779568951e-05, "loss": 3.8025, "step": 7506 }, { "epoch": 
1.9727395145662956, "grad_norm": 0.7700552940368652, "learning_rate": 3.424741545470475e-05, "loss": 3.7517, "step": 7508 }, { "epoch": 1.9732650178999573, "grad_norm": 0.9207766056060791, "learning_rate": 3.422989311372e-05, "loss": 3.7376, "step": 7510 }, { "epoch": 1.973790521233619, "grad_norm": 0.8478928804397583, "learning_rate": 3.4212370772735245e-05, "loss": 3.7221, "step": 7512 }, { "epoch": 1.9743160245672808, "grad_norm": 0.8757862448692322, "learning_rate": 3.419484843175048e-05, "loss": 3.7958, "step": 7514 }, { "epoch": 1.9748415279009426, "grad_norm": 0.7681872248649597, "learning_rate": 3.4177326090765726e-05, "loss": 3.7693, "step": 7516 }, { "epoch": 1.9753670312346046, "grad_norm": 0.9216195940971375, "learning_rate": 3.4159803749780974e-05, "loss": 3.8247, "step": 7518 }, { "epoch": 1.975892534568266, "grad_norm": 0.8362264037132263, "learning_rate": 3.4142281408796215e-05, "loss": 3.7431, "step": 7520 }, { "epoch": 1.976418037901928, "grad_norm": 0.8072685599327087, "learning_rate": 3.412475906781146e-05, "loss": 3.827, "step": 7522 }, { "epoch": 1.9769435412355896, "grad_norm": 0.9767225980758667, "learning_rate": 3.410723672682671e-05, "loss": 3.7688, "step": 7524 }, { "epoch": 1.9774690445692515, "grad_norm": 0.776461124420166, "learning_rate": 3.408971438584195e-05, "loss": 3.8475, "step": 7526 }, { "epoch": 1.9779945479029133, "grad_norm": 0.7826116681098938, "learning_rate": 3.407219204485719e-05, "loss": 3.7276, "step": 7528 }, { "epoch": 1.978520051236575, "grad_norm": 0.8296884298324585, "learning_rate": 3.405466970387244e-05, "loss": 3.7377, "step": 7530 }, { "epoch": 1.9790455545702368, "grad_norm": 0.7869312763214111, "learning_rate": 3.4037147362887687e-05, "loss": 3.743, "step": 7532 }, { "epoch": 1.9795710579038985, "grad_norm": 0.7890320420265198, "learning_rate": 3.401962502190293e-05, "loss": 3.7264, "step": 7534 }, { "epoch": 1.9800965612375605, "grad_norm": 0.81013423204422, "learning_rate": 3.4002102680918175e-05, "loss": 
3.7178, "step": 7536 }, { "epoch": 1.980622064571222, "grad_norm": 0.831632137298584, "learning_rate": 3.398458033993342e-05, "loss": 3.7451, "step": 7538 }, { "epoch": 1.981147567904884, "grad_norm": 0.8130840063095093, "learning_rate": 3.3967057998948656e-05, "loss": 3.7775, "step": 7540 }, { "epoch": 1.9816730712385455, "grad_norm": 0.8577526807785034, "learning_rate": 3.3949535657963904e-05, "loss": 3.8009, "step": 7542 }, { "epoch": 1.9821985745722075, "grad_norm": 0.8200308084487915, "learning_rate": 3.393201331697915e-05, "loss": 3.7159, "step": 7544 }, { "epoch": 1.9827240779058692, "grad_norm": 0.8229500651359558, "learning_rate": 3.391449097599439e-05, "loss": 3.7775, "step": 7546 }, { "epoch": 1.983249581239531, "grad_norm": 0.8909658193588257, "learning_rate": 3.389696863500964e-05, "loss": 3.7486, "step": 7548 }, { "epoch": 1.9837750845731927, "grad_norm": 0.7989559173583984, "learning_rate": 3.387944629402489e-05, "loss": 3.7605, "step": 7550 }, { "epoch": 1.9843005879068545, "grad_norm": 0.8056257963180542, "learning_rate": 3.386192395304013e-05, "loss": 3.7259, "step": 7552 }, { "epoch": 1.9848260912405165, "grad_norm": 0.8427444100379944, "learning_rate": 3.384440161205537e-05, "loss": 3.7431, "step": 7554 }, { "epoch": 1.985351594574178, "grad_norm": 0.8639296889305115, "learning_rate": 3.382687927107062e-05, "loss": 3.8301, "step": 7556 }, { "epoch": 1.98587709790784, "grad_norm": 0.8454062938690186, "learning_rate": 3.3809356930085864e-05, "loss": 3.7436, "step": 7558 }, { "epoch": 1.9864026012415015, "grad_norm": 0.9488638639450073, "learning_rate": 3.3791834589101105e-05, "loss": 3.7751, "step": 7560 }, { "epoch": 1.9869281045751634, "grad_norm": 0.7634671330451965, "learning_rate": 3.377431224811635e-05, "loss": 3.7705, "step": 7562 }, { "epoch": 1.9874536079088252, "grad_norm": 0.8726993799209595, "learning_rate": 3.375678990713159e-05, "loss": 3.7459, "step": 7564 }, { "epoch": 1.987979111242487, "grad_norm": 0.806239902973175, 
"learning_rate": 3.3739267566146834e-05, "loss": 3.7789, "step": 7566 }, { "epoch": 1.9885046145761487, "grad_norm": 0.9013625979423523, "learning_rate": 3.372174522516208e-05, "loss": 3.7915, "step": 7568 }, { "epoch": 1.9890301179098104, "grad_norm": 0.893684983253479, "learning_rate": 3.370422288417733e-05, "loss": 3.7446, "step": 7570 }, { "epoch": 1.9895556212434724, "grad_norm": 0.8194506168365479, "learning_rate": 3.368670054319257e-05, "loss": 3.7284, "step": 7572 }, { "epoch": 1.990081124577134, "grad_norm": 0.8154358267784119, "learning_rate": 3.366917820220782e-05, "loss": 3.7352, "step": 7574 }, { "epoch": 1.990606627910796, "grad_norm": 0.9577004313468933, "learning_rate": 3.3651655861223065e-05, "loss": 3.7178, "step": 7576 }, { "epoch": 1.9911321312444574, "grad_norm": 0.8824792504310608, "learning_rate": 3.3634133520238306e-05, "loss": 3.7651, "step": 7578 }, { "epoch": 1.9916576345781194, "grad_norm": 0.7990145087242126, "learning_rate": 3.361661117925355e-05, "loss": 3.7157, "step": 7580 }, { "epoch": 1.9921831379117811, "grad_norm": 0.8435682058334351, "learning_rate": 3.3599088838268794e-05, "loss": 3.7588, "step": 7582 }, { "epoch": 1.992708641245443, "grad_norm": 0.8943558931350708, "learning_rate": 3.358156649728404e-05, "loss": 3.7788, "step": 7584 }, { "epoch": 1.9932341445791046, "grad_norm": 0.8135433197021484, "learning_rate": 3.356404415629928e-05, "loss": 3.7418, "step": 7586 }, { "epoch": 1.9937596479127664, "grad_norm": 0.9406774044036865, "learning_rate": 3.354652181531453e-05, "loss": 3.7502, "step": 7588 }, { "epoch": 1.9942851512464284, "grad_norm": 1.1299089193344116, "learning_rate": 3.352899947432977e-05, "loss": 3.8287, "step": 7590 }, { "epoch": 1.9948106545800899, "grad_norm": 0.877992570400238, "learning_rate": 3.351147713334501e-05, "loss": 3.7768, "step": 7592 }, { "epoch": 1.9953361579137519, "grad_norm": 0.8695041537284851, "learning_rate": 3.349395479236026e-05, "loss": 3.8014, "step": 7594 }, { "epoch": 
1.9958616612474134, "grad_norm": 0.7310254573822021, "learning_rate": 3.347643245137551e-05, "loss": 3.7408, "step": 7596 }, { "epoch": 1.9963871645810753, "grad_norm": 0.8746644258499146, "learning_rate": 3.345891011039075e-05, "loss": 3.8116, "step": 7598 }, { "epoch": 1.996912667914737, "grad_norm": 0.913506269454956, "learning_rate": 3.3441387769405995e-05, "loss": 3.7696, "step": 7600 }, { "epoch": 1.996912667914737, "eval_loss": 3.7464826107025146, "eval_runtime": 464.712, "eval_samples_per_second": 262.074, "eval_steps_per_second": 8.19, "step": 7600 }, { "epoch": 1.9974381712483988, "grad_norm": 0.908257782459259, "learning_rate": 3.3423865428421236e-05, "loss": 3.8089, "step": 7602 }, { "epoch": 1.9979636745820606, "grad_norm": 0.9790870547294617, "learning_rate": 3.3406343087436484e-05, "loss": 3.7726, "step": 7604 }, { "epoch": 1.9984891779157223, "grad_norm": 0.8051719069480896, "learning_rate": 3.3388820746451724e-05, "loss": 3.7622, "step": 7606 }, { "epoch": 1.9990146812493843, "grad_norm": 0.779961884021759, "learning_rate": 3.337129840546697e-05, "loss": 3.7386, "step": 7608 }, { "epoch": 1.9995401845830458, "grad_norm": 0.912300705909729, "learning_rate": 3.335377606448222e-05, "loss": 3.7653, "step": 7610 }, { "epoch": 2.000065687916708, "grad_norm": 0.8058048486709595, "learning_rate": 3.333625372349746e-05, "loss": 3.776, "step": 7612 }, { "epoch": 2.0005911912503693, "grad_norm": 0.8229011297225952, "learning_rate": 3.331873138251271e-05, "loss": 3.714, "step": 7614 }, { "epoch": 2.0011166945840313, "grad_norm": 0.7919926643371582, "learning_rate": 3.330120904152795e-05, "loss": 3.7107, "step": 7616 }, { "epoch": 2.001642197917693, "grad_norm": 0.8786574602127075, "learning_rate": 3.3283686700543196e-05, "loss": 3.7117, "step": 7618 }, { "epoch": 2.002167701251355, "grad_norm": 0.8600201606750488, "learning_rate": 3.326616435955844e-05, "loss": 3.7314, "step": 7620 }, { "epoch": 2.0026932045850168, "grad_norm": 0.8143011331558228, 
"learning_rate": 3.3248642018573685e-05, "loss": 3.752, "step": 7622 }, { "epoch": 2.0032187079186783, "grad_norm": 0.9235674142837524, "learning_rate": 3.323111967758893e-05, "loss": 3.7032, "step": 7624 }, { "epoch": 2.0037442112523403, "grad_norm": 0.7629499435424805, "learning_rate": 3.321359733660417e-05, "loss": 3.7089, "step": 7626 }, { "epoch": 2.004269714586002, "grad_norm": 0.774708092212677, "learning_rate": 3.3196074995619414e-05, "loss": 3.7434, "step": 7628 }, { "epoch": 2.0047952179196638, "grad_norm": 0.7808578014373779, "learning_rate": 3.317855265463466e-05, "loss": 3.6682, "step": 7630 }, { "epoch": 2.0053207212533253, "grad_norm": 0.8245192170143127, "learning_rate": 3.31610303136499e-05, "loss": 3.6818, "step": 7632 }, { "epoch": 2.0058462245869872, "grad_norm": 0.8461399674415588, "learning_rate": 3.314350797266515e-05, "loss": 3.65, "step": 7634 }, { "epoch": 2.006371727920649, "grad_norm": 0.9061393141746521, "learning_rate": 3.31259856316804e-05, "loss": 3.7744, "step": 7636 }, { "epoch": 2.0068972312543107, "grad_norm": 0.8837909698486328, "learning_rate": 3.310846329069564e-05, "loss": 3.743, "step": 7638 }, { "epoch": 2.0074227345879727, "grad_norm": 0.8618489503860474, "learning_rate": 3.309094094971088e-05, "loss": 3.7759, "step": 7640 }, { "epoch": 2.0079482379216342, "grad_norm": 0.797562301158905, "learning_rate": 3.3073418608726126e-05, "loss": 3.7318, "step": 7642 }, { "epoch": 2.008473741255296, "grad_norm": 0.9352880716323853, "learning_rate": 3.3055896267741374e-05, "loss": 3.7454, "step": 7644 }, { "epoch": 2.0089992445889577, "grad_norm": 0.8399193286895752, "learning_rate": 3.3038373926756615e-05, "loss": 3.7558, "step": 7646 }, { "epoch": 2.0095247479226197, "grad_norm": 0.8283659815788269, "learning_rate": 3.302085158577186e-05, "loss": 3.7147, "step": 7648 }, { "epoch": 2.0100502512562812, "grad_norm": 0.8366743326187134, "learning_rate": 3.300332924478711e-05, "loss": 3.726, "step": 7650 }, { "epoch": 2.010575754589943, 
"grad_norm": 0.9376285076141357, "learning_rate": 3.298580690380235e-05, "loss": 3.7188, "step": 7652 }, { "epoch": 2.011101257923605, "grad_norm": 0.8527957201004028, "learning_rate": 3.296828456281759e-05, "loss": 3.7176, "step": 7654 }, { "epoch": 2.0116267612572667, "grad_norm": 0.8235294818878174, "learning_rate": 3.295076222183284e-05, "loss": 3.7225, "step": 7656 }, { "epoch": 2.0121522645909287, "grad_norm": 0.8207552433013916, "learning_rate": 3.293323988084808e-05, "loss": 3.7557, "step": 7658 }, { "epoch": 2.01267776792459, "grad_norm": 0.8280589580535889, "learning_rate": 3.291571753986333e-05, "loss": 3.6905, "step": 7660 }, { "epoch": 2.013203271258252, "grad_norm": 0.9820939898490906, "learning_rate": 3.2898195198878575e-05, "loss": 3.7672, "step": 7662 }, { "epoch": 2.0137287745919137, "grad_norm": 0.9328712224960327, "learning_rate": 3.2880672857893816e-05, "loss": 3.6703, "step": 7664 }, { "epoch": 2.0142542779255757, "grad_norm": 0.8100330829620361, "learning_rate": 3.2863150516909056e-05, "loss": 3.6821, "step": 7666 }, { "epoch": 2.014779781259237, "grad_norm": 0.8750255703926086, "learning_rate": 3.2845628175924304e-05, "loss": 3.678, "step": 7668 }, { "epoch": 2.015305284592899, "grad_norm": 0.8235081434249878, "learning_rate": 3.282810583493955e-05, "loss": 3.7399, "step": 7670 }, { "epoch": 2.015830787926561, "grad_norm": 0.8793992400169373, "learning_rate": 3.281058349395479e-05, "loss": 3.7574, "step": 7672 }, { "epoch": 2.0163562912602226, "grad_norm": 0.8520685434341431, "learning_rate": 3.279306115297004e-05, "loss": 3.724, "step": 7674 }, { "epoch": 2.0168817945938846, "grad_norm": 0.8779662847518921, "learning_rate": 3.277553881198529e-05, "loss": 3.6858, "step": 7676 }, { "epoch": 2.017407297927546, "grad_norm": 0.8015046715736389, "learning_rate": 3.275801647100053e-05, "loss": 3.7051, "step": 7678 }, { "epoch": 2.017932801261208, "grad_norm": 0.8490996360778809, "learning_rate": 3.274049413001577e-05, "loss": 3.644, "step": 7680 
}, { "epoch": 2.0184583045948696, "grad_norm": 0.8947710394859314, "learning_rate": 3.272297178903102e-05, "loss": 3.7482, "step": 7682 }, { "epoch": 2.0189838079285316, "grad_norm": 0.9008524417877197, "learning_rate": 3.270544944804626e-05, "loss": 3.6784, "step": 7684 }, { "epoch": 2.019509311262193, "grad_norm": 0.9033370614051819, "learning_rate": 3.2687927107061505e-05, "loss": 3.7685, "step": 7686 }, { "epoch": 2.020034814595855, "grad_norm": 0.8272547721862793, "learning_rate": 3.267040476607675e-05, "loss": 3.7452, "step": 7688 }, { "epoch": 2.020560317929517, "grad_norm": 0.8265538215637207, "learning_rate": 3.265288242509199e-05, "loss": 3.7148, "step": 7690 }, { "epoch": 2.0210858212631786, "grad_norm": 0.8633683323860168, "learning_rate": 3.2635360084107234e-05, "loss": 3.6817, "step": 7692 }, { "epoch": 2.0216113245968406, "grad_norm": 0.8482974171638489, "learning_rate": 3.261783774312248e-05, "loss": 3.6914, "step": 7694 }, { "epoch": 2.022136827930502, "grad_norm": 0.7655878067016602, "learning_rate": 3.260031540213773e-05, "loss": 3.6814, "step": 7696 }, { "epoch": 2.022662331264164, "grad_norm": 0.9384876489639282, "learning_rate": 3.258279306115297e-05, "loss": 3.7153, "step": 7698 }, { "epoch": 2.0231878345978256, "grad_norm": 0.8684605360031128, "learning_rate": 3.256527072016822e-05, "loss": 3.7594, "step": 7700 }, { "epoch": 2.0237133379314876, "grad_norm": 0.8827585577964783, "learning_rate": 3.2547748379183465e-05, "loss": 3.736, "step": 7702 }, { "epoch": 2.024238841265149, "grad_norm": 0.8089144825935364, "learning_rate": 3.25302260381987e-05, "loss": 3.6734, "step": 7704 }, { "epoch": 2.024764344598811, "grad_norm": 0.8658406734466553, "learning_rate": 3.251270369721395e-05, "loss": 3.7319, "step": 7706 }, { "epoch": 2.025289847932473, "grad_norm": 0.8451574444770813, "learning_rate": 3.2495181356229194e-05, "loss": 3.6541, "step": 7708 }, { "epoch": 2.0258153512661345, "grad_norm": 0.8675154447555542, "learning_rate": 
3.2477659015244435e-05, "loss": 3.7202, "step": 7710 }, { "epoch": 2.0263408545997965, "grad_norm": 1.0107613801956177, "learning_rate": 3.246013667425968e-05, "loss": 3.6994, "step": 7712 }, { "epoch": 2.026866357933458, "grad_norm": 0.7929757833480835, "learning_rate": 3.244261433327493e-05, "loss": 3.6963, "step": 7714 }, { "epoch": 2.02739186126712, "grad_norm": 0.9062192440032959, "learning_rate": 3.242509199229017e-05, "loss": 3.6964, "step": 7716 }, { "epoch": 2.0279173646007815, "grad_norm": 0.849919319152832, "learning_rate": 3.240756965130541e-05, "loss": 3.7254, "step": 7718 }, { "epoch": 2.0284428679344435, "grad_norm": 1.0017273426055908, "learning_rate": 3.239004731032066e-05, "loss": 3.6715, "step": 7720 }, { "epoch": 2.028968371268105, "grad_norm": 0.8590096235275269, "learning_rate": 3.237252496933591e-05, "loss": 3.7411, "step": 7722 }, { "epoch": 2.029493874601767, "grad_norm": 0.8463311195373535, "learning_rate": 3.235500262835115e-05, "loss": 3.7024, "step": 7724 }, { "epoch": 2.030019377935429, "grad_norm": 0.9526839256286621, "learning_rate": 3.2337480287366395e-05, "loss": 3.6609, "step": 7726 }, { "epoch": 2.0305448812690905, "grad_norm": 0.9120495915412903, "learning_rate": 3.231995794638164e-05, "loss": 3.6682, "step": 7728 }, { "epoch": 2.0310703846027525, "grad_norm": 0.8557276725769043, "learning_rate": 3.230243560539688e-05, "loss": 3.6987, "step": 7730 }, { "epoch": 2.031595887936414, "grad_norm": 0.8610514402389526, "learning_rate": 3.2284913264412124e-05, "loss": 3.7133, "step": 7732 }, { "epoch": 2.032121391270076, "grad_norm": 0.8406181931495667, "learning_rate": 3.226739092342737e-05, "loss": 3.6825, "step": 7734 }, { "epoch": 2.0326468946037375, "grad_norm": 0.7695955634117126, "learning_rate": 3.224986858244261e-05, "loss": 3.6655, "step": 7736 }, { "epoch": 2.0331723979373995, "grad_norm": 1.019102931022644, "learning_rate": 3.223234624145786e-05, "loss": 3.7489, "step": 7738 }, { "epoch": 2.033697901271061, "grad_norm": 
0.8832061290740967, "learning_rate": 3.221482390047311e-05, "loss": 3.6713, "step": 7740 }, { "epoch": 2.034223404604723, "grad_norm": 0.7926838994026184, "learning_rate": 3.219730155948835e-05, "loss": 3.6842, "step": 7742 }, { "epoch": 2.034748907938385, "grad_norm": 0.751427173614502, "learning_rate": 3.217977921850359e-05, "loss": 3.7408, "step": 7744 }, { "epoch": 2.0352744112720464, "grad_norm": 0.8421383500099182, "learning_rate": 3.216225687751884e-05, "loss": 3.6945, "step": 7746 }, { "epoch": 2.0357999146057084, "grad_norm": 0.8444478511810303, "learning_rate": 3.2144734536534085e-05, "loss": 3.7218, "step": 7748 }, { "epoch": 2.03632541793937, "grad_norm": 0.7943613529205322, "learning_rate": 3.2127212195549325e-05, "loss": 3.7555, "step": 7750 }, { "epoch": 2.036850921273032, "grad_norm": 0.7983903288841248, "learning_rate": 3.210968985456457e-05, "loss": 3.6908, "step": 7752 }, { "epoch": 2.0373764246066934, "grad_norm": 0.8231672048568726, "learning_rate": 3.209216751357982e-05, "loss": 3.7401, "step": 7754 }, { "epoch": 2.0379019279403554, "grad_norm": 0.8598976135253906, "learning_rate": 3.207464517259506e-05, "loss": 3.7409, "step": 7756 }, { "epoch": 2.038427431274017, "grad_norm": 0.8572884202003479, "learning_rate": 3.20571228316103e-05, "loss": 3.7388, "step": 7758 }, { "epoch": 2.038952934607679, "grad_norm": 0.9576873779296875, "learning_rate": 3.203960049062555e-05, "loss": 3.7254, "step": 7760 }, { "epoch": 2.039478437941341, "grad_norm": 0.7776781320571899, "learning_rate": 3.20220781496408e-05, "loss": 3.733, "step": 7762 }, { "epoch": 2.0400039412750024, "grad_norm": 0.9597026109695435, "learning_rate": 3.200455580865604e-05, "loss": 3.7139, "step": 7764 }, { "epoch": 2.0405294446086644, "grad_norm": 0.9220765829086304, "learning_rate": 3.1987033467671286e-05, "loss": 3.7304, "step": 7766 }, { "epoch": 2.041054947942326, "grad_norm": 0.8913367986679077, "learning_rate": 3.1969511126686526e-05, "loss": 3.723, "step": 7768 }, { "epoch": 
2.041580451275988, "grad_norm": 0.8705502152442932, "learning_rate": 3.195198878570177e-05, "loss": 3.7463, "step": 7770 }, { "epoch": 2.0421059546096494, "grad_norm": 0.8724271059036255, "learning_rate": 3.1934466444717015e-05, "loss": 3.7211, "step": 7772 }, { "epoch": 2.0426314579433114, "grad_norm": 0.8624586462974548, "learning_rate": 3.191694410373226e-05, "loss": 3.6949, "step": 7774 }, { "epoch": 2.043156961276973, "grad_norm": 0.751998245716095, "learning_rate": 3.18994217627475e-05, "loss": 3.7212, "step": 7776 }, { "epoch": 2.043682464610635, "grad_norm": 0.767927885055542, "learning_rate": 3.188189942176275e-05, "loss": 3.7012, "step": 7778 }, { "epoch": 2.044207967944297, "grad_norm": 0.7946444153785706, "learning_rate": 3.1864377080778e-05, "loss": 3.7578, "step": 7780 }, { "epoch": 2.0447334712779583, "grad_norm": 0.8751301169395447, "learning_rate": 3.184685473979324e-05, "loss": 3.723, "step": 7782 }, { "epoch": 2.0452589746116203, "grad_norm": 1.00210702419281, "learning_rate": 3.182933239880848e-05, "loss": 3.7066, "step": 7784 }, { "epoch": 2.045784477945282, "grad_norm": 0.9263893365859985, "learning_rate": 3.181181005782373e-05, "loss": 3.7988, "step": 7786 }, { "epoch": 2.046309981278944, "grad_norm": 0.9263496994972229, "learning_rate": 3.1794287716838975e-05, "loss": 3.7512, "step": 7788 }, { "epoch": 2.0468354846126053, "grad_norm": 0.8781086802482605, "learning_rate": 3.1776765375854216e-05, "loss": 3.7136, "step": 7790 }, { "epoch": 2.0473609879462673, "grad_norm": 0.9278703331947327, "learning_rate": 3.175924303486946e-05, "loss": 3.7138, "step": 7792 }, { "epoch": 2.047886491279929, "grad_norm": 0.7922840714454651, "learning_rate": 3.1741720693884704e-05, "loss": 3.7246, "step": 7794 }, { "epoch": 2.048411994613591, "grad_norm": 0.7835402488708496, "learning_rate": 3.1724198352899945e-05, "loss": 3.7151, "step": 7796 }, { "epoch": 2.0489374979472528, "grad_norm": 0.9653986692428589, "learning_rate": 3.170667601191519e-05, "loss": 
3.7284, "step": 7798 }, { "epoch": 2.0494630012809143, "grad_norm": 0.7894474864006042, "learning_rate": 3.168915367093044e-05, "loss": 3.6868, "step": 7800 }, { "epoch": 2.0499885046145763, "grad_norm": 0.816857635974884, "learning_rate": 3.167163132994568e-05, "loss": 3.6938, "step": 7802 }, { "epoch": 2.050514007948238, "grad_norm": 0.7978635430335999, "learning_rate": 3.165410898896093e-05, "loss": 3.7077, "step": 7804 }, { "epoch": 2.0510395112818998, "grad_norm": 0.8305758237838745, "learning_rate": 3.163658664797617e-05, "loss": 3.7154, "step": 7806 }, { "epoch": 2.0515650146155613, "grad_norm": 0.9105134010314941, "learning_rate": 3.1619064306991417e-05, "loss": 3.6941, "step": 7808 }, { "epoch": 2.0520905179492233, "grad_norm": 0.8640368580818176, "learning_rate": 3.160154196600666e-05, "loss": 3.7685, "step": 7810 }, { "epoch": 2.0526160212828852, "grad_norm": 1.0461289882659912, "learning_rate": 3.1584019625021905e-05, "loss": 3.7132, "step": 7812 }, { "epoch": 2.0531415246165468, "grad_norm": 0.9310597777366638, "learning_rate": 3.156649728403715e-05, "loss": 3.73, "step": 7814 }, { "epoch": 2.0536670279502087, "grad_norm": 0.9307753443717957, "learning_rate": 3.154897494305239e-05, "loss": 3.6886, "step": 7816 }, { "epoch": 2.0541925312838702, "grad_norm": 0.8503215909004211, "learning_rate": 3.153145260206764e-05, "loss": 3.7119, "step": 7818 }, { "epoch": 2.054718034617532, "grad_norm": 0.967221736907959, "learning_rate": 3.151393026108288e-05, "loss": 3.7505, "step": 7820 }, { "epoch": 2.0552435379511937, "grad_norm": 0.918215811252594, "learning_rate": 3.149640792009812e-05, "loss": 3.697, "step": 7822 }, { "epoch": 2.0557690412848557, "grad_norm": 0.9645288586616516, "learning_rate": 3.147888557911337e-05, "loss": 3.7089, "step": 7824 }, { "epoch": 2.0562945446185172, "grad_norm": 0.8381349444389343, "learning_rate": 3.146136323812862e-05, "loss": 3.733, "step": 7826 }, { "epoch": 2.056820047952179, "grad_norm": 0.8863093852996826, 
"learning_rate": 3.144384089714386e-05, "loss": 3.767, "step": 7828 }, { "epoch": 2.057345551285841, "grad_norm": 0.7980983853340149, "learning_rate": 3.1426318556159106e-05, "loss": 3.711, "step": 7830 }, { "epoch": 2.0578710546195027, "grad_norm": 0.8290464282035828, "learning_rate": 3.140879621517435e-05, "loss": 3.7365, "step": 7832 }, { "epoch": 2.0583965579531647, "grad_norm": 0.9343044757843018, "learning_rate": 3.1391273874189594e-05, "loss": 3.678, "step": 7834 }, { "epoch": 2.058922061286826, "grad_norm": 0.8570297360420227, "learning_rate": 3.1373751533204835e-05, "loss": 3.7545, "step": 7836 }, { "epoch": 2.059447564620488, "grad_norm": 0.8505580425262451, "learning_rate": 3.135622919222008e-05, "loss": 3.6719, "step": 7838 }, { "epoch": 2.0599730679541497, "grad_norm": 0.8643679618835449, "learning_rate": 3.133870685123533e-05, "loss": 3.6961, "step": 7840 }, { "epoch": 2.0604985712878117, "grad_norm": 0.8901540637016296, "learning_rate": 3.132118451025057e-05, "loss": 3.7178, "step": 7842 }, { "epoch": 2.061024074621473, "grad_norm": 0.9316046833992004, "learning_rate": 3.130366216926581e-05, "loss": 3.7621, "step": 7844 }, { "epoch": 2.061549577955135, "grad_norm": 0.8963480591773987, "learning_rate": 3.128613982828106e-05, "loss": 3.7131, "step": 7846 }, { "epoch": 2.062075081288797, "grad_norm": 0.8267208337783813, "learning_rate": 3.12686174872963e-05, "loss": 3.72, "step": 7848 }, { "epoch": 2.0626005846224587, "grad_norm": 0.8865072131156921, "learning_rate": 3.125109514631155e-05, "loss": 3.671, "step": 7850 }, { "epoch": 2.0631260879561206, "grad_norm": 0.9005493521690369, "learning_rate": 3.1233572805326795e-05, "loss": 3.7356, "step": 7852 }, { "epoch": 2.063651591289782, "grad_norm": 0.897911787033081, "learning_rate": 3.1216050464342036e-05, "loss": 3.7667, "step": 7854 }, { "epoch": 2.064177094623444, "grad_norm": 0.809798538684845, "learning_rate": 3.1198528123357284e-05, "loss": 3.6736, "step": 7856 }, { "epoch": 2.0647025979571056, 
"grad_norm": 0.8680092096328735, "learning_rate": 3.1181005782372524e-05, "loss": 3.7155, "step": 7858 }, { "epoch": 2.0652281012907676, "grad_norm": 0.8670902252197266, "learning_rate": 3.116348344138777e-05, "loss": 3.67, "step": 7860 }, { "epoch": 2.065753604624429, "grad_norm": 0.8158410787582397, "learning_rate": 3.114596110040301e-05, "loss": 3.6795, "step": 7862 }, { "epoch": 2.066279107958091, "grad_norm": 0.8688318133354187, "learning_rate": 3.112843875941826e-05, "loss": 3.701, "step": 7864 }, { "epoch": 2.066804611291753, "grad_norm": 0.9021475911140442, "learning_rate": 3.111091641843351e-05, "loss": 3.7618, "step": 7866 }, { "epoch": 2.0673301146254146, "grad_norm": 0.8342397212982178, "learning_rate": 3.109339407744875e-05, "loss": 3.7618, "step": 7868 }, { "epoch": 2.0678556179590766, "grad_norm": 0.8684660196304321, "learning_rate": 3.107587173646399e-05, "loss": 3.7137, "step": 7870 }, { "epoch": 2.068381121292738, "grad_norm": 0.7759368419647217, "learning_rate": 3.105834939547924e-05, "loss": 3.7361, "step": 7872 }, { "epoch": 2.0689066246264, "grad_norm": 0.9256752133369446, "learning_rate": 3.1040827054494485e-05, "loss": 3.6848, "step": 7874 }, { "epoch": 2.0694321279600616, "grad_norm": 0.9292667508125305, "learning_rate": 3.1023304713509725e-05, "loss": 3.7401, "step": 7876 }, { "epoch": 2.0699576312937236, "grad_norm": 0.843914270401001, "learning_rate": 3.100578237252497e-05, "loss": 3.7266, "step": 7878 }, { "epoch": 2.070483134627385, "grad_norm": 1.0183846950531006, "learning_rate": 3.098826003154022e-05, "loss": 3.6782, "step": 7880 }, { "epoch": 2.071008637961047, "grad_norm": 0.8519551753997803, "learning_rate": 3.0970737690555454e-05, "loss": 3.7259, "step": 7882 }, { "epoch": 2.071534141294709, "grad_norm": 0.9065747261047363, "learning_rate": 3.09532153495707e-05, "loss": 3.7226, "step": 7884 }, { "epoch": 2.0720596446283706, "grad_norm": 0.8864527344703674, "learning_rate": 3.093569300858595e-05, "loss": 3.7225, "step": 7886 }, { 
"epoch": 2.0725851479620325, "grad_norm": 0.8941075205802917, "learning_rate": 3.091817066760119e-05, "loss": 3.7611, "step": 7888 }, { "epoch": 2.073110651295694, "grad_norm": 0.8134872317314148, "learning_rate": 3.090064832661644e-05, "loss": 3.7348, "step": 7890 }, { "epoch": 2.073636154629356, "grad_norm": 0.8548062443733215, "learning_rate": 3.0883125985631686e-05, "loss": 3.7449, "step": 7892 }, { "epoch": 2.0741616579630175, "grad_norm": 0.9113209843635559, "learning_rate": 3.0865603644646926e-05, "loss": 3.7101, "step": 7894 }, { "epoch": 2.0746871612966795, "grad_norm": 0.9435298442840576, "learning_rate": 3.084808130366217e-05, "loss": 3.5854, "step": 7896 }, { "epoch": 2.075212664630341, "grad_norm": 0.8446658849716187, "learning_rate": 3.0830558962677415e-05, "loss": 3.6763, "step": 7898 }, { "epoch": 2.075738167964003, "grad_norm": 0.8436275124549866, "learning_rate": 3.081303662169266e-05, "loss": 3.7276, "step": 7900 }, { "epoch": 2.076263671297665, "grad_norm": 0.8463113307952881, "learning_rate": 3.07955142807079e-05, "loss": 3.7078, "step": 7902 }, { "epoch": 2.0767891746313265, "grad_norm": 0.8026580214500427, "learning_rate": 3.077799193972315e-05, "loss": 3.6873, "step": 7904 }, { "epoch": 2.0773146779649885, "grad_norm": 0.7935935258865356, "learning_rate": 3.07604695987384e-05, "loss": 3.6982, "step": 7906 }, { "epoch": 2.07784018129865, "grad_norm": 0.8102018237113953, "learning_rate": 3.074294725775363e-05, "loss": 3.7052, "step": 7908 }, { "epoch": 2.078365684632312, "grad_norm": 0.7911702990531921, "learning_rate": 3.072542491676888e-05, "loss": 3.692, "step": 7910 }, { "epoch": 2.0788911879659735, "grad_norm": 0.7889243364334106, "learning_rate": 3.070790257578413e-05, "loss": 3.7333, "step": 7912 }, { "epoch": 2.0794166912996355, "grad_norm": 0.8327273726463318, "learning_rate": 3.069038023479937e-05, "loss": 3.7081, "step": 7914 }, { "epoch": 2.079942194633297, "grad_norm": 0.8066157102584839, "learning_rate": 3.0672857893814616e-05, 
"loss": 3.7349, "step": 7916 }, { "epoch": 2.080467697966959, "grad_norm": 0.8386849164962769, "learning_rate": 3.065533555282986e-05, "loss": 3.7316, "step": 7918 }, { "epoch": 2.080993201300621, "grad_norm": 0.7839257717132568, "learning_rate": 3.0637813211845104e-05, "loss": 3.738, "step": 7920 }, { "epoch": 2.0815187046342825, "grad_norm": 0.869803249835968, "learning_rate": 3.0620290870860345e-05, "loss": 3.7191, "step": 7922 }, { "epoch": 2.0820442079679444, "grad_norm": 0.8376153707504272, "learning_rate": 3.060276852987559e-05, "loss": 3.6775, "step": 7924 }, { "epoch": 2.082569711301606, "grad_norm": 0.9468572735786438, "learning_rate": 3.058524618889084e-05, "loss": 3.7437, "step": 7926 }, { "epoch": 2.083095214635268, "grad_norm": 0.8215572237968445, "learning_rate": 3.056772384790608e-05, "loss": 3.7339, "step": 7928 }, { "epoch": 2.0836207179689294, "grad_norm": 0.9146531224250793, "learning_rate": 3.055020150692133e-05, "loss": 3.6699, "step": 7930 }, { "epoch": 2.0841462213025914, "grad_norm": 0.9182584285736084, "learning_rate": 3.0532679165936576e-05, "loss": 3.6871, "step": 7932 }, { "epoch": 2.0846717246362534, "grad_norm": 0.7846736907958984, "learning_rate": 3.0515156824951813e-05, "loss": 3.6755, "step": 7934 }, { "epoch": 2.085197227969915, "grad_norm": 0.9802265167236328, "learning_rate": 3.0497634483967057e-05, "loss": 3.7323, "step": 7936 }, { "epoch": 2.085722731303577, "grad_norm": 0.8102672696113586, "learning_rate": 3.0480112142982305e-05, "loss": 3.6965, "step": 7938 }, { "epoch": 2.0862482346372384, "grad_norm": 0.8579999804496765, "learning_rate": 3.046258980199755e-05, "loss": 3.7209, "step": 7940 }, { "epoch": 2.0867737379709004, "grad_norm": 0.9104049205780029, "learning_rate": 3.0445067461012793e-05, "loss": 3.6899, "step": 7942 }, { "epoch": 2.087299241304562, "grad_norm": 0.8753128051757812, "learning_rate": 3.042754512002804e-05, "loss": 3.7409, "step": 7944 }, { "epoch": 2.087824744638224, "grad_norm": 0.8137463927268982, 
"learning_rate": 3.0410022779043278e-05, "loss": 3.7303, "step": 7946 }, { "epoch": 2.0883502479718854, "grad_norm": 0.8873860836029053, "learning_rate": 3.0392500438058526e-05, "loss": 3.7211, "step": 7948 }, { "epoch": 2.0888757513055474, "grad_norm": 0.8379363417625427, "learning_rate": 3.037497809707377e-05, "loss": 3.68, "step": 7950 }, { "epoch": 2.089401254639209, "grad_norm": 0.946060061454773, "learning_rate": 3.0357455756089014e-05, "loss": 3.6951, "step": 7952 }, { "epoch": 2.089926757972871, "grad_norm": 0.8697885274887085, "learning_rate": 3.0339933415104262e-05, "loss": 3.6907, "step": 7954 }, { "epoch": 2.090452261306533, "grad_norm": 0.7566511034965515, "learning_rate": 3.0322411074119506e-05, "loss": 3.6966, "step": 7956 }, { "epoch": 2.0909777646401944, "grad_norm": 0.9558420777320862, "learning_rate": 3.030488873313475e-05, "loss": 3.7703, "step": 7958 }, { "epoch": 2.0915032679738563, "grad_norm": 0.8636420965194702, "learning_rate": 3.028736639214999e-05, "loss": 3.7045, "step": 7960 }, { "epoch": 2.092028771307518, "grad_norm": 0.785110592842102, "learning_rate": 3.0269844051165235e-05, "loss": 3.7222, "step": 7962 }, { "epoch": 2.09255427464118, "grad_norm": 0.7816142439842224, "learning_rate": 3.0252321710180483e-05, "loss": 3.6989, "step": 7964 }, { "epoch": 2.0930797779748413, "grad_norm": 0.8499987125396729, "learning_rate": 3.0234799369195727e-05, "loss": 3.7155, "step": 7966 }, { "epoch": 2.0936052813085033, "grad_norm": 0.8120269179344177, "learning_rate": 3.021727702821097e-05, "loss": 3.6693, "step": 7968 }, { "epoch": 2.0941307846421653, "grad_norm": 0.7930347323417664, "learning_rate": 3.019975468722622e-05, "loss": 3.7516, "step": 7970 }, { "epoch": 2.094656287975827, "grad_norm": 0.9570354223251343, "learning_rate": 3.0182232346241456e-05, "loss": 3.6829, "step": 7972 }, { "epoch": 2.095181791309489, "grad_norm": 0.8867546916007996, "learning_rate": 3.0164710005256703e-05, "loss": 3.7466, "step": 7974 }, { "epoch": 
2.0957072946431503, "grad_norm": 0.8941977024078369, "learning_rate": 3.0147187664271948e-05, "loss": 3.7224, "step": 7976 }, { "epoch": 2.0962327979768123, "grad_norm": 0.9007421731948853, "learning_rate": 3.0129665323287192e-05, "loss": 3.7149, "step": 7978 }, { "epoch": 2.096758301310474, "grad_norm": 0.815540075302124, "learning_rate": 3.011214298230244e-05, "loss": 3.7284, "step": 7980 }, { "epoch": 2.0972838046441358, "grad_norm": 0.8763511776924133, "learning_rate": 3.0094620641317684e-05, "loss": 3.7341, "step": 7982 }, { "epoch": 2.0978093079777973, "grad_norm": 0.9335790872573853, "learning_rate": 3.0077098300332924e-05, "loss": 3.7404, "step": 7984 }, { "epoch": 2.0983348113114593, "grad_norm": 1.0340864658355713, "learning_rate": 3.005957595934817e-05, "loss": 3.658, "step": 7986 }, { "epoch": 2.0988603146451212, "grad_norm": 0.8303037285804749, "learning_rate": 3.0042053618363413e-05, "loss": 3.7287, "step": 7988 }, { "epoch": 2.0993858179787828, "grad_norm": 0.8116858601570129, "learning_rate": 3.002453127737866e-05, "loss": 3.7089, "step": 7990 }, { "epoch": 2.0999113213124447, "grad_norm": 0.8934987187385559, "learning_rate": 3.0007008936393904e-05, "loss": 3.6926, "step": 7992 }, { "epoch": 2.1004368246461063, "grad_norm": 0.9415079355239868, "learning_rate": 2.998948659540915e-05, "loss": 3.7121, "step": 7994 }, { "epoch": 2.1009623279797682, "grad_norm": 0.7913405895233154, "learning_rate": 2.9971964254424396e-05, "loss": 3.6851, "step": 7996 }, { "epoch": 2.1014878313134298, "grad_norm": 0.9660059809684753, "learning_rate": 2.9954441913439634e-05, "loss": 3.7475, "step": 7998 }, { "epoch": 2.1020133346470917, "grad_norm": 1.063310146331787, "learning_rate": 2.993691957245488e-05, "loss": 3.7041, "step": 8000 }, { "epoch": 2.1020133346470917, "eval_loss": 3.7423858642578125, "eval_runtime": 464.6302, "eval_samples_per_second": 262.12, "eval_steps_per_second": 8.191, "step": 8000 }, { "epoch": 2.1025388379807533, "grad_norm": 0.8171399235725403, 
"learning_rate": 2.9919397231470125e-05, "loss": 3.7464, "step": 8002 }, { "epoch": 2.103064341314415, "grad_norm": 1.0198076963424683, "learning_rate": 2.990187489048537e-05, "loss": 3.7708, "step": 8004 }, { "epoch": 2.103589844648077, "grad_norm": 1.030068039894104, "learning_rate": 2.9884352549500617e-05, "loss": 3.7604, "step": 8006 }, { "epoch": 2.1041153479817387, "grad_norm": 0.8224868178367615, "learning_rate": 2.986683020851586e-05, "loss": 3.7086, "step": 8008 }, { "epoch": 2.1046408513154007, "grad_norm": 0.8771683573722839, "learning_rate": 2.9849307867531102e-05, "loss": 3.6822, "step": 8010 }, { "epoch": 2.105166354649062, "grad_norm": 0.8575516939163208, "learning_rate": 2.9831785526546346e-05, "loss": 3.6783, "step": 8012 }, { "epoch": 2.105691857982724, "grad_norm": 0.9346481561660767, "learning_rate": 2.981426318556159e-05, "loss": 3.7599, "step": 8014 }, { "epoch": 2.1062173613163857, "grad_norm": 0.878915548324585, "learning_rate": 2.9796740844576838e-05, "loss": 3.799, "step": 8016 }, { "epoch": 2.1067428646500477, "grad_norm": 0.8294042348861694, "learning_rate": 2.9779218503592082e-05, "loss": 3.7432, "step": 8018 }, { "epoch": 2.107268367983709, "grad_norm": 0.892238199710846, "learning_rate": 2.9761696162607326e-05, "loss": 3.75, "step": 8020 }, { "epoch": 2.107793871317371, "grad_norm": 0.893293559551239, "learning_rate": 2.9744173821622574e-05, "loss": 3.728, "step": 8022 }, { "epoch": 2.108319374651033, "grad_norm": 0.839036226272583, "learning_rate": 2.972665148063781e-05, "loss": 3.7231, "step": 8024 }, { "epoch": 2.1088448779846947, "grad_norm": 0.957718551158905, "learning_rate": 2.970912913965306e-05, "loss": 3.7413, "step": 8026 }, { "epoch": 2.1093703813183566, "grad_norm": 0.7994686365127563, "learning_rate": 2.9691606798668303e-05, "loss": 3.767, "step": 8028 }, { "epoch": 2.109895884652018, "grad_norm": 0.8649013638496399, "learning_rate": 2.9674084457683547e-05, "loss": 3.694, "step": 8030 }, { "epoch": 2.11042138798568, 
"grad_norm": 0.9908792972564697, "learning_rate": 2.9656562116698795e-05, "loss": 3.7089, "step": 8032 }, { "epoch": 2.1109468913193417, "grad_norm": 0.8847101330757141, "learning_rate": 2.963903977571404e-05, "loss": 3.7115, "step": 8034 }, { "epoch": 2.1114723946530036, "grad_norm": 0.8398132920265198, "learning_rate": 2.962151743472928e-05, "loss": 3.7252, "step": 8036 }, { "epoch": 2.111997897986665, "grad_norm": 0.9740709066390991, "learning_rate": 2.9603995093744524e-05, "loss": 3.6366, "step": 8038 }, { "epoch": 2.112523401320327, "grad_norm": 0.8128932118415833, "learning_rate": 2.9586472752759768e-05, "loss": 3.7157, "step": 8040 }, { "epoch": 2.113048904653989, "grad_norm": 0.8274446129798889, "learning_rate": 2.9568950411775016e-05, "loss": 3.6626, "step": 8042 }, { "epoch": 2.1135744079876506, "grad_norm": 0.9031032919883728, "learning_rate": 2.955142807079026e-05, "loss": 3.7165, "step": 8044 }, { "epoch": 2.1140999113213126, "grad_norm": 0.9569668173789978, "learning_rate": 2.9533905729805504e-05, "loss": 3.7486, "step": 8046 }, { "epoch": 2.114625414654974, "grad_norm": 0.8916709423065186, "learning_rate": 2.9516383388820745e-05, "loss": 3.7165, "step": 8048 }, { "epoch": 2.115150917988636, "grad_norm": 0.9503262639045715, "learning_rate": 2.949886104783599e-05, "loss": 3.6976, "step": 8050 }, { "epoch": 2.1156764213222976, "grad_norm": 0.9636269807815552, "learning_rate": 2.9481338706851236e-05, "loss": 3.7478, "step": 8052 }, { "epoch": 2.1162019246559596, "grad_norm": 0.8181533813476562, "learning_rate": 2.946381636586648e-05, "loss": 3.6916, "step": 8054 }, { "epoch": 2.116727427989621, "grad_norm": 0.8951836824417114, "learning_rate": 2.9446294024881725e-05, "loss": 3.6859, "step": 8056 }, { "epoch": 2.117252931323283, "grad_norm": 0.9942171573638916, "learning_rate": 2.9428771683896972e-05, "loss": 3.705, "step": 8058 }, { "epoch": 2.117778434656945, "grad_norm": 0.828910231590271, "learning_rate": 2.9411249342912217e-05, "loss": 3.7232, 
"step": 8060 }, { "epoch": 2.1183039379906066, "grad_norm": 0.8869870901107788, "learning_rate": 2.9393727001927457e-05, "loss": 3.7663, "step": 8062 }, { "epoch": 2.1188294413242685, "grad_norm": 0.9113820791244507, "learning_rate": 2.93762046609427e-05, "loss": 3.6265, "step": 8064 }, { "epoch": 2.11935494465793, "grad_norm": 0.8693059682846069, "learning_rate": 2.935868231995795e-05, "loss": 3.6672, "step": 8066 }, { "epoch": 2.119880447991592, "grad_norm": 0.7781542539596558, "learning_rate": 2.9341159978973193e-05, "loss": 3.7371, "step": 8068 }, { "epoch": 2.1204059513252536, "grad_norm": 0.8837724328041077, "learning_rate": 2.9323637637988437e-05, "loss": 3.6777, "step": 8070 }, { "epoch": 2.1209314546589155, "grad_norm": 0.8289134502410889, "learning_rate": 2.9306115297003685e-05, "loss": 3.6859, "step": 8072 }, { "epoch": 2.121456957992577, "grad_norm": 0.9057675004005432, "learning_rate": 2.9288592956018922e-05, "loss": 3.6344, "step": 8074 }, { "epoch": 2.121982461326239, "grad_norm": 1.0070940256118774, "learning_rate": 2.927107061503417e-05, "loss": 3.726, "step": 8076 }, { "epoch": 2.122507964659901, "grad_norm": 1.0047520399093628, "learning_rate": 2.9253548274049414e-05, "loss": 3.6863, "step": 8078 }, { "epoch": 2.1230334679935625, "grad_norm": 0.7771101593971252, "learning_rate": 2.923602593306466e-05, "loss": 3.7184, "step": 8080 }, { "epoch": 2.1235589713272245, "grad_norm": 0.8569795489311218, "learning_rate": 2.9218503592079906e-05, "loss": 3.7365, "step": 8082 }, { "epoch": 2.124084474660886, "grad_norm": 0.8309317231178284, "learning_rate": 2.920098125109515e-05, "loss": 3.7362, "step": 8084 }, { "epoch": 2.124609977994548, "grad_norm": 0.9353446960449219, "learning_rate": 2.918345891011039e-05, "loss": 3.6994, "step": 8086 }, { "epoch": 2.1251354813282095, "grad_norm": 0.9829643964767456, "learning_rate": 2.9165936569125635e-05, "loss": 3.718, "step": 8088 }, { "epoch": 2.1256609846618715, "grad_norm": 0.827942430973053, "learning_rate": 
2.914841422814088e-05, "loss": 3.674, "step": 8090 }, { "epoch": 2.1261864879955334, "grad_norm": 0.8085716962814331, "learning_rate": 2.9130891887156127e-05, "loss": 3.6925, "step": 8092 }, { "epoch": 2.126711991329195, "grad_norm": 0.965057373046875, "learning_rate": 2.911336954617137e-05, "loss": 3.6729, "step": 8094 }, { "epoch": 2.127237494662857, "grad_norm": 0.8604949712753296, "learning_rate": 2.9095847205186615e-05, "loss": 3.7076, "step": 8096 }, { "epoch": 2.1277629979965185, "grad_norm": 0.8146266937255859, "learning_rate": 2.9078324864201863e-05, "loss": 3.7198, "step": 8098 }, { "epoch": 2.1282885013301804, "grad_norm": 0.8545982241630554, "learning_rate": 2.90608025232171e-05, "loss": 3.701, "step": 8100 }, { "epoch": 2.128814004663842, "grad_norm": 0.9009379148483276, "learning_rate": 2.9043280182232348e-05, "loss": 3.7198, "step": 8102 }, { "epoch": 2.129339507997504, "grad_norm": 0.9085600972175598, "learning_rate": 2.9025757841247592e-05, "loss": 3.7253, "step": 8104 }, { "epoch": 2.1298650113311655, "grad_norm": 0.8901732563972473, "learning_rate": 2.9008235500262836e-05, "loss": 3.7419, "step": 8106 }, { "epoch": 2.1303905146648274, "grad_norm": 0.8988015055656433, "learning_rate": 2.8990713159278084e-05, "loss": 3.712, "step": 8108 }, { "epoch": 2.130916017998489, "grad_norm": 0.926435112953186, "learning_rate": 2.8973190818293328e-05, "loss": 3.7312, "step": 8110 }, { "epoch": 2.131441521332151, "grad_norm": 0.8323521018028259, "learning_rate": 2.895566847730857e-05, "loss": 3.7172, "step": 8112 }, { "epoch": 2.131967024665813, "grad_norm": 0.8864949941635132, "learning_rate": 2.8938146136323813e-05, "loss": 3.7631, "step": 8114 }, { "epoch": 2.1324925279994744, "grad_norm": 1.0221319198608398, "learning_rate": 2.8920623795339057e-05, "loss": 3.7159, "step": 8116 }, { "epoch": 2.1330180313331364, "grad_norm": 0.7902336716651917, "learning_rate": 2.8903101454354304e-05, "loss": 3.722, "step": 8118 }, { "epoch": 2.133543534666798, "grad_norm": 
0.842967689037323, "learning_rate": 2.888557911336955e-05, "loss": 3.7146, "step": 8120 }, { "epoch": 2.13406903800046, "grad_norm": 0.908761739730835, "learning_rate": 2.8868056772384793e-05, "loss": 3.7325, "step": 8122 }, { "epoch": 2.1345945413341214, "grad_norm": 0.8153826594352722, "learning_rate": 2.885053443140004e-05, "loss": 3.7253, "step": 8124 }, { "epoch": 2.1351200446677834, "grad_norm": 0.8623270988464355, "learning_rate": 2.8833012090415278e-05, "loss": 3.7108, "step": 8126 }, { "epoch": 2.1356455480014453, "grad_norm": 0.9044506549835205, "learning_rate": 2.8815489749430525e-05, "loss": 3.67, "step": 8128 }, { "epoch": 2.136171051335107, "grad_norm": 0.8039859533309937, "learning_rate": 2.879796740844577e-05, "loss": 3.7496, "step": 8130 }, { "epoch": 2.136696554668769, "grad_norm": 0.8961197733879089, "learning_rate": 2.8780445067461014e-05, "loss": 3.7489, "step": 8132 }, { "epoch": 2.1372220580024304, "grad_norm": 1.0735282897949219, "learning_rate": 2.876292272647626e-05, "loss": 3.701, "step": 8134 }, { "epoch": 2.1377475613360923, "grad_norm": 0.913138747215271, "learning_rate": 2.8745400385491505e-05, "loss": 3.6913, "step": 8136 }, { "epoch": 2.138273064669754, "grad_norm": 0.8296342492103577, "learning_rate": 2.8727878044506746e-05, "loss": 3.6832, "step": 8138 }, { "epoch": 2.138798568003416, "grad_norm": 0.9883130192756653, "learning_rate": 2.871035570352199e-05, "loss": 3.7384, "step": 8140 }, { "epoch": 2.1393240713370774, "grad_norm": 0.9652523398399353, "learning_rate": 2.8692833362537235e-05, "loss": 3.7352, "step": 8142 }, { "epoch": 2.1398495746707393, "grad_norm": 0.8789937496185303, "learning_rate": 2.8675311021552482e-05, "loss": 3.6947, "step": 8144 }, { "epoch": 2.140375078004401, "grad_norm": 0.9040631651878357, "learning_rate": 2.8657788680567726e-05, "loss": 3.7107, "step": 8146 }, { "epoch": 2.140900581338063, "grad_norm": 0.7769992351531982, "learning_rate": 2.864026633958297e-05, "loss": 3.7096, "step": 8148 }, { 
"epoch": 2.141426084671725, "grad_norm": 0.8929970860481262, "learning_rate": 2.862274399859821e-05, "loss": 3.7586, "step": 8150 }, { "epoch": 2.1419515880053863, "grad_norm": 0.8977286219596863, "learning_rate": 2.8605221657613455e-05, "loss": 3.6923, "step": 8152 }, { "epoch": 2.1424770913390483, "grad_norm": 0.8879009485244751, "learning_rate": 2.8587699316628703e-05, "loss": 3.728, "step": 8154 }, { "epoch": 2.14300259467271, "grad_norm": 0.8811193704605103, "learning_rate": 2.8570176975643947e-05, "loss": 3.707, "step": 8156 }, { "epoch": 2.143528098006372, "grad_norm": 0.863518238067627, "learning_rate": 2.855265463465919e-05, "loss": 3.7251, "step": 8158 }, { "epoch": 2.1440536013400333, "grad_norm": 0.8561248779296875, "learning_rate": 2.853513229367444e-05, "loss": 3.751, "step": 8160 }, { "epoch": 2.1445791046736953, "grad_norm": 0.8905676603317261, "learning_rate": 2.8517609952689683e-05, "loss": 3.7484, "step": 8162 }, { "epoch": 2.1451046080073572, "grad_norm": 0.808931291103363, "learning_rate": 2.8500087611704924e-05, "loss": 3.6547, "step": 8164 }, { "epoch": 2.1456301113410188, "grad_norm": 0.8565650582313538, "learning_rate": 2.8482565270720168e-05, "loss": 3.733, "step": 8166 }, { "epoch": 2.1461556146746807, "grad_norm": 0.8526059985160828, "learning_rate": 2.8465042929735412e-05, "loss": 3.7476, "step": 8168 }, { "epoch": 2.1466811180083423, "grad_norm": 1.0110206604003906, "learning_rate": 2.844752058875066e-05, "loss": 3.7208, "step": 8170 }, { "epoch": 2.1472066213420042, "grad_norm": 0.7523288130760193, "learning_rate": 2.8429998247765904e-05, "loss": 3.7327, "step": 8172 }, { "epoch": 2.1477321246756658, "grad_norm": 0.9221066236495972, "learning_rate": 2.8412475906781148e-05, "loss": 3.6856, "step": 8174 }, { "epoch": 2.1482576280093277, "grad_norm": 0.9174444079399109, "learning_rate": 2.839495356579639e-05, "loss": 3.7224, "step": 8176 }, { "epoch": 2.1487831313429893, "grad_norm": 0.8743519186973572, "learning_rate": 
2.8377431224811633e-05, "loss": 3.731, "step": 8178 }, { "epoch": 2.1493086346766512, "grad_norm": 0.8638477325439453, "learning_rate": 2.835990888382688e-05, "loss": 3.6863, "step": 8180 }, { "epoch": 2.149834138010313, "grad_norm": 0.8693050146102905, "learning_rate": 2.8342386542842125e-05, "loss": 3.702, "step": 8182 }, { "epoch": 2.1503596413439747, "grad_norm": 0.9244679808616638, "learning_rate": 2.832486420185737e-05, "loss": 3.7264, "step": 8184 }, { "epoch": 2.1508851446776367, "grad_norm": 0.8042910695075989, "learning_rate": 2.8307341860872617e-05, "loss": 3.7354, "step": 8186 }, { "epoch": 2.151410648011298, "grad_norm": 0.8861294984817505, "learning_rate": 2.8289819519887857e-05, "loss": 3.755, "step": 8188 }, { "epoch": 2.15193615134496, "grad_norm": 1.157036304473877, "learning_rate": 2.82722971789031e-05, "loss": 3.6945, "step": 8190 }, { "epoch": 2.1524616546786217, "grad_norm": 1.0095903873443604, "learning_rate": 2.8254774837918346e-05, "loss": 3.6751, "step": 8192 }, { "epoch": 2.1529871580122837, "grad_norm": 0.8900673985481262, "learning_rate": 2.8237252496933593e-05, "loss": 3.7323, "step": 8194 }, { "epoch": 2.153512661345945, "grad_norm": 0.9441550374031067, "learning_rate": 2.8219730155948837e-05, "loss": 3.7069, "step": 8196 }, { "epoch": 2.154038164679607, "grad_norm": 0.9793975353240967, "learning_rate": 2.820220781496408e-05, "loss": 3.6774, "step": 8198 }, { "epoch": 2.154563668013269, "grad_norm": 0.8286717534065247, "learning_rate": 2.818468547397933e-05, "loss": 3.7123, "step": 8200 }, { "epoch": 2.1550891713469307, "grad_norm": 0.9078981876373291, "learning_rate": 2.8167163132994567e-05, "loss": 3.7129, "step": 8202 }, { "epoch": 2.1556146746805926, "grad_norm": 0.8118013143539429, "learning_rate": 2.8149640792009814e-05, "loss": 3.7594, "step": 8204 }, { "epoch": 2.156140178014254, "grad_norm": 0.825851321220398, "learning_rate": 2.8132118451025058e-05, "loss": 3.64, "step": 8206 }, { "epoch": 2.156665681347916, "grad_norm": 
0.8422316312789917, "learning_rate": 2.8114596110040302e-05, "loss": 3.6855, "step": 8208 }, { "epoch": 2.1571911846815777, "grad_norm": 0.8039098381996155, "learning_rate": 2.809707376905555e-05, "loss": 3.6942, "step": 8210 }, { "epoch": 2.1577166880152396, "grad_norm": 0.8266885876655579, "learning_rate": 2.8079551428070794e-05, "loss": 3.7411, "step": 8212 }, { "epoch": 2.158242191348901, "grad_norm": 0.9916283488273621, "learning_rate": 2.8062029087086035e-05, "loss": 3.683, "step": 8214 }, { "epoch": 2.158767694682563, "grad_norm": 0.9340961575508118, "learning_rate": 2.804450674610128e-05, "loss": 3.7506, "step": 8216 }, { "epoch": 2.159293198016225, "grad_norm": 0.88029545545578, "learning_rate": 2.8026984405116523e-05, "loss": 3.7387, "step": 8218 }, { "epoch": 2.1598187013498866, "grad_norm": 0.8614386916160583, "learning_rate": 2.800946206413177e-05, "loss": 3.7434, "step": 8220 }, { "epoch": 2.1603442046835486, "grad_norm": 0.8241007328033447, "learning_rate": 2.7991939723147015e-05, "loss": 3.7233, "step": 8222 }, { "epoch": 2.16086970801721, "grad_norm": 0.8200369477272034, "learning_rate": 2.797441738216226e-05, "loss": 3.7074, "step": 8224 }, { "epoch": 2.161395211350872, "grad_norm": 0.8160844445228577, "learning_rate": 2.79568950411775e-05, "loss": 3.7452, "step": 8226 }, { "epoch": 2.1619207146845336, "grad_norm": 0.8997820615768433, "learning_rate": 2.7939372700192744e-05, "loss": 3.734, "step": 8228 }, { "epoch": 2.1624462180181956, "grad_norm": 0.8075968623161316, "learning_rate": 2.7921850359207992e-05, "loss": 3.6422, "step": 8230 }, { "epoch": 2.162971721351857, "grad_norm": 0.860609769821167, "learning_rate": 2.7904328018223236e-05, "loss": 3.6416, "step": 8232 }, { "epoch": 2.163497224685519, "grad_norm": 0.9949994683265686, "learning_rate": 2.788680567723848e-05, "loss": 3.7238, "step": 8234 }, { "epoch": 2.164022728019181, "grad_norm": 0.830839991569519, "learning_rate": 2.7869283336253728e-05, "loss": 3.7289, "step": 8236 }, { "epoch": 
2.1645482313528426, "grad_norm": 0.8570488691329956, "learning_rate": 2.7851760995268972e-05, "loss": 3.6752, "step": 8238 }, { "epoch": 2.1650737346865045, "grad_norm": 1.0423349142074585, "learning_rate": 2.7834238654284213e-05, "loss": 3.7033, "step": 8240 }, { "epoch": 2.165599238020166, "grad_norm": 0.8739895820617676, "learning_rate": 2.7816716313299457e-05, "loss": 3.7253, "step": 8242 }, { "epoch": 2.166124741353828, "grad_norm": 0.7925859093666077, "learning_rate": 2.77991939723147e-05, "loss": 3.6673, "step": 8244 }, { "epoch": 2.1666502446874896, "grad_norm": 0.8085572123527527, "learning_rate": 2.778167163132995e-05, "loss": 3.7663, "step": 8246 }, { "epoch": 2.1671757480211515, "grad_norm": 0.8906912207603455, "learning_rate": 2.7764149290345193e-05, "loss": 3.7174, "step": 8248 }, { "epoch": 2.1677012513548135, "grad_norm": 0.9953876733779907, "learning_rate": 2.7746626949360437e-05, "loss": 3.7152, "step": 8250 }, { "epoch": 2.168226754688475, "grad_norm": 0.9806913733482361, "learning_rate": 2.7729104608375678e-05, "loss": 3.7217, "step": 8252 }, { "epoch": 2.168752258022137, "grad_norm": 0.8729687333106995, "learning_rate": 2.7711582267390922e-05, "loss": 3.7173, "step": 8254 }, { "epoch": 2.1692777613557985, "grad_norm": 0.9722044467926025, "learning_rate": 2.769405992640617e-05, "loss": 3.681, "step": 8256 }, { "epoch": 2.1698032646894605, "grad_norm": 0.770526647567749, "learning_rate": 2.7676537585421414e-05, "loss": 3.6945, "step": 8258 }, { "epoch": 2.170328768023122, "grad_norm": 0.8750258088111877, "learning_rate": 2.7659015244436658e-05, "loss": 3.7058, "step": 8260 }, { "epoch": 2.170854271356784, "grad_norm": 0.8104822635650635, "learning_rate": 2.7641492903451905e-05, "loss": 3.7001, "step": 8262 }, { "epoch": 2.1713797746904455, "grad_norm": 0.7934516072273254, "learning_rate": 2.762397056246715e-05, "loss": 3.6809, "step": 8264 }, { "epoch": 2.1719052780241075, "grad_norm": 0.9304160475730896, "learning_rate": 2.760644822148239e-05, 
"loss": 3.7086, "step": 8266 }, { "epoch": 2.172430781357769, "grad_norm": 0.9324445128440857, "learning_rate": 2.7588925880497635e-05, "loss": 3.7502, "step": 8268 }, { "epoch": 2.172956284691431, "grad_norm": 0.8598816990852356, "learning_rate": 2.757140353951288e-05, "loss": 3.711, "step": 8270 }, { "epoch": 2.173481788025093, "grad_norm": 0.9330804944038391, "learning_rate": 2.7553881198528126e-05, "loss": 3.6944, "step": 8272 }, { "epoch": 2.1740072913587545, "grad_norm": 0.8977974653244019, "learning_rate": 2.753635885754337e-05, "loss": 3.759, "step": 8274 }, { "epoch": 2.1745327946924164, "grad_norm": 0.8748917579650879, "learning_rate": 2.7518836516558615e-05, "loss": 3.6693, "step": 8276 }, { "epoch": 2.175058298026078, "grad_norm": 0.8060845732688904, "learning_rate": 2.7501314175573855e-05, "loss": 3.6414, "step": 8278 }, { "epoch": 2.17558380135974, "grad_norm": 0.8171510100364685, "learning_rate": 2.74837918345891e-05, "loss": 3.6654, "step": 8280 }, { "epoch": 2.1761093046934015, "grad_norm": 0.8705804347991943, "learning_rate": 2.7466269493604347e-05, "loss": 3.6805, "step": 8282 }, { "epoch": 2.1766348080270634, "grad_norm": 0.8060354590415955, "learning_rate": 2.744874715261959e-05, "loss": 3.7444, "step": 8284 }, { "epoch": 2.1771603113607254, "grad_norm": 0.8791294693946838, "learning_rate": 2.7431224811634835e-05, "loss": 3.6696, "step": 8286 }, { "epoch": 2.177685814694387, "grad_norm": 0.8544911742210388, "learning_rate": 2.7413702470650083e-05, "loss": 3.732, "step": 8288 }, { "epoch": 2.178211318028049, "grad_norm": 0.8673406839370728, "learning_rate": 2.739618012966532e-05, "loss": 3.6714, "step": 8290 }, { "epoch": 2.1787368213617104, "grad_norm": 0.8730568885803223, "learning_rate": 2.7378657788680568e-05, "loss": 3.6927, "step": 8292 }, { "epoch": 2.1792623246953724, "grad_norm": 0.771119236946106, "learning_rate": 2.7361135447695812e-05, "loss": 3.7261, "step": 8294 }, { "epoch": 2.179787828029034, "grad_norm": 0.9189099669456482, 
"learning_rate": 2.7343613106711056e-05, "loss": 3.67, "step": 8296 }, { "epoch": 2.180313331362696, "grad_norm": 0.8554127812385559, "learning_rate": 2.7326090765726304e-05, "loss": 3.764, "step": 8298 }, { "epoch": 2.1808388346963574, "grad_norm": 0.8397378325462341, "learning_rate": 2.7308568424741548e-05, "loss": 3.6777, "step": 8300 }, { "epoch": 2.1813643380300194, "grad_norm": 0.9806804656982422, "learning_rate": 2.7291046083756792e-05, "loss": 3.7051, "step": 8302 }, { "epoch": 2.181889841363681, "grad_norm": 0.8403685688972473, "learning_rate": 2.7273523742772033e-05, "loss": 3.706, "step": 8304 }, { "epoch": 2.182415344697343, "grad_norm": 0.8347755074501038, "learning_rate": 2.7256001401787277e-05, "loss": 3.7061, "step": 8306 }, { "epoch": 2.182940848031005, "grad_norm": 0.8190112113952637, "learning_rate": 2.7238479060802525e-05, "loss": 3.6871, "step": 8308 }, { "epoch": 2.1834663513646664, "grad_norm": 0.9176834225654602, "learning_rate": 2.722095671981777e-05, "loss": 3.685, "step": 8310 }, { "epoch": 2.1839918546983283, "grad_norm": 0.9380425810813904, "learning_rate": 2.7203434378833013e-05, "loss": 3.7187, "step": 8312 }, { "epoch": 2.18451735803199, "grad_norm": 0.9739280343055725, "learning_rate": 2.718591203784826e-05, "loss": 3.685, "step": 8314 }, { "epoch": 2.185042861365652, "grad_norm": 0.830197274684906, "learning_rate": 2.71683896968635e-05, "loss": 3.7202, "step": 8316 }, { "epoch": 2.1855683646993134, "grad_norm": 0.894064724445343, "learning_rate": 2.7150867355878746e-05, "loss": 3.7089, "step": 8318 }, { "epoch": 2.1860938680329753, "grad_norm": 0.759191632270813, "learning_rate": 2.713334501489399e-05, "loss": 3.7429, "step": 8320 }, { "epoch": 2.1866193713666373, "grad_norm": 0.8307719230651855, "learning_rate": 2.7115822673909237e-05, "loss": 3.7054, "step": 8322 }, { "epoch": 2.187144874700299, "grad_norm": 0.9706530570983887, "learning_rate": 2.709830033292448e-05, "loss": 3.7093, "step": 8324 }, { "epoch": 2.187670378033961, 
"grad_norm": 0.8437049984931946, "learning_rate": 2.7080777991939726e-05, "loss": 3.6908, "step": 8326 }, { "epoch": 2.1881958813676223, "grad_norm": 0.881845235824585, "learning_rate": 2.7063255650954967e-05, "loss": 3.7227, "step": 8328 }, { "epoch": 2.1887213847012843, "grad_norm": 0.9014148116111755, "learning_rate": 2.704573330997021e-05, "loss": 3.7185, "step": 8330 }, { "epoch": 2.189246888034946, "grad_norm": 1.031626582145691, "learning_rate": 2.7028210968985458e-05, "loss": 3.7225, "step": 8332 }, { "epoch": 2.189772391368608, "grad_norm": 0.8598271012306213, "learning_rate": 2.7010688628000702e-05, "loss": 3.723, "step": 8334 }, { "epoch": 2.1902978947022693, "grad_norm": 0.8904011845588684, "learning_rate": 2.6993166287015947e-05, "loss": 3.6972, "step": 8336 }, { "epoch": 2.1908233980359313, "grad_norm": 0.8493955731391907, "learning_rate": 2.6975643946031194e-05, "loss": 3.685, "step": 8338 }, { "epoch": 2.1913489013695933, "grad_norm": 0.8471643924713135, "learning_rate": 2.695812160504644e-05, "loss": 3.7, "step": 8340 }, { "epoch": 2.191874404703255, "grad_norm": 0.8612148761749268, "learning_rate": 2.694059926406168e-05, "loss": 3.7291, "step": 8342 }, { "epoch": 2.1923999080369168, "grad_norm": 0.9028595685958862, "learning_rate": 2.6923076923076923e-05, "loss": 3.7938, "step": 8344 }, { "epoch": 2.1929254113705783, "grad_norm": 0.8955885171890259, "learning_rate": 2.6905554582092167e-05, "loss": 3.7277, "step": 8346 }, { "epoch": 2.1934509147042403, "grad_norm": 0.8155354261398315, "learning_rate": 2.6888032241107415e-05, "loss": 3.7084, "step": 8348 }, { "epoch": 2.1939764180379018, "grad_norm": 0.8326452970504761, "learning_rate": 2.687050990012266e-05, "loss": 3.6575, "step": 8350 }, { "epoch": 2.1945019213715637, "grad_norm": 0.8597757816314697, "learning_rate": 2.6852987559137903e-05, "loss": 3.6563, "step": 8352 }, { "epoch": 2.1950274247052253, "grad_norm": 0.8363339304924011, "learning_rate": 2.6835465218153144e-05, "loss": 3.6969, 
"step": 8354 }, { "epoch": 2.1955529280388872, "grad_norm": 0.9134012460708618, "learning_rate": 2.681794287716839e-05, "loss": 3.714, "step": 8356 }, { "epoch": 2.196078431372549, "grad_norm": 0.8481467366218567, "learning_rate": 2.6800420536183636e-05, "loss": 3.6767, "step": 8358 }, { "epoch": 2.1966039347062107, "grad_norm": 0.840603768825531, "learning_rate": 2.678289819519888e-05, "loss": 3.7062, "step": 8360 }, { "epoch": 2.1971294380398727, "grad_norm": 0.9756871461868286, "learning_rate": 2.6765375854214124e-05, "loss": 3.6803, "step": 8362 }, { "epoch": 2.1976549413735342, "grad_norm": 0.901891827583313, "learning_rate": 2.6747853513229372e-05, "loss": 3.6983, "step": 8364 }, { "epoch": 2.198180444707196, "grad_norm": 0.8879050016403198, "learning_rate": 2.6730331172244616e-05, "loss": 3.6724, "step": 8366 }, { "epoch": 2.1987059480408577, "grad_norm": 0.8660027980804443, "learning_rate": 2.6712808831259857e-05, "loss": 3.7215, "step": 8368 }, { "epoch": 2.1992314513745197, "grad_norm": 0.8812180757522583, "learning_rate": 2.66952864902751e-05, "loss": 3.6882, "step": 8370 }, { "epoch": 2.1997569547081812, "grad_norm": 0.8492754697799683, "learning_rate": 2.6677764149290345e-05, "loss": 3.7159, "step": 8372 }, { "epoch": 2.200282458041843, "grad_norm": 0.8163833022117615, "learning_rate": 2.6660241808305593e-05, "loss": 3.696, "step": 8374 }, { "epoch": 2.200807961375505, "grad_norm": 0.9204570651054382, "learning_rate": 2.6642719467320837e-05, "loss": 3.6936, "step": 8376 }, { "epoch": 2.2013334647091667, "grad_norm": 0.805826723575592, "learning_rate": 2.662519712633608e-05, "loss": 3.689, "step": 8378 }, { "epoch": 2.2018589680428287, "grad_norm": 0.8026923537254333, "learning_rate": 2.6607674785351322e-05, "loss": 3.7149, "step": 8380 }, { "epoch": 2.20238447137649, "grad_norm": 0.9795794486999512, "learning_rate": 2.6590152444366566e-05, "loss": 3.7136, "step": 8382 }, { "epoch": 2.202909974710152, "grad_norm": 0.8493843674659729, "learning_rate": 
2.6572630103381814e-05, "loss": 3.7165, "step": 8384 }, { "epoch": 2.2034354780438137, "grad_norm": 0.8286498785018921, "learning_rate": 2.6555107762397058e-05, "loss": 3.6931, "step": 8386 }, { "epoch": 2.2039609813774756, "grad_norm": 0.8751018047332764, "learning_rate": 2.6537585421412302e-05, "loss": 3.6849, "step": 8388 }, { "epoch": 2.204486484711137, "grad_norm": 0.976808488368988, "learning_rate": 2.652006308042755e-05, "loss": 3.755, "step": 8390 }, { "epoch": 2.205011988044799, "grad_norm": 0.9292572140693665, "learning_rate": 2.6502540739442787e-05, "loss": 3.6987, "step": 8392 }, { "epoch": 2.205537491378461, "grad_norm": 0.8824182748794556, "learning_rate": 2.6485018398458034e-05, "loss": 3.6543, "step": 8394 }, { "epoch": 2.2060629947121226, "grad_norm": 0.9273900389671326, "learning_rate": 2.646749605747328e-05, "loss": 3.7089, "step": 8396 }, { "epoch": 2.2065884980457846, "grad_norm": 0.8452262878417969, "learning_rate": 2.6449973716488523e-05, "loss": 3.7729, "step": 8398 }, { "epoch": 2.207114001379446, "grad_norm": 0.9353644847869873, "learning_rate": 2.643245137550377e-05, "loss": 3.7223, "step": 8400 }, { "epoch": 2.207114001379446, "eval_loss": 3.7369163036346436, "eval_runtime": 464.6931, "eval_samples_per_second": 262.085, "eval_steps_per_second": 8.19, "step": 8400 }, { "epoch": 2.207639504713108, "grad_norm": 1.0380263328552246, "learning_rate": 2.6414929034519015e-05, "loss": 3.7315, "step": 8402 }, { "epoch": 2.2081650080467696, "grad_norm": 0.9022584557533264, "learning_rate": 2.639740669353426e-05, "loss": 3.6836, "step": 8404 }, { "epoch": 2.2086905113804316, "grad_norm": 0.8685585260391235, "learning_rate": 2.63798843525495e-05, "loss": 3.6622, "step": 8406 }, { "epoch": 2.2092160147140936, "grad_norm": 1.0113734006881714, "learning_rate": 2.6362362011564744e-05, "loss": 3.8056, "step": 8408 }, { "epoch": 2.209741518047755, "grad_norm": 0.872237503528595, "learning_rate": 2.634483967057999e-05, "loss": 3.6955, "step": 8410 }, { 
"epoch": 2.210267021381417, "grad_norm": 0.9971697330474854, "learning_rate": 2.6327317329595235e-05, "loss": 3.7151, "step": 8412 }, { "epoch": 2.2107925247150786, "grad_norm": 1.1076695919036865, "learning_rate": 2.630979498861048e-05, "loss": 3.6827, "step": 8414 }, { "epoch": 2.2113180280487406, "grad_norm": 0.8869040608406067, "learning_rate": 2.6292272647625727e-05, "loss": 3.7165, "step": 8416 }, { "epoch": 2.211843531382402, "grad_norm": 0.8861736059188843, "learning_rate": 2.6274750306640965e-05, "loss": 3.706, "step": 8418 }, { "epoch": 2.212369034716064, "grad_norm": 0.8684143424034119, "learning_rate": 2.6257227965656212e-05, "loss": 3.6831, "step": 8420 }, { "epoch": 2.2128945380497256, "grad_norm": 0.9816427826881409, "learning_rate": 2.6239705624671456e-05, "loss": 3.6928, "step": 8422 }, { "epoch": 2.2134200413833875, "grad_norm": 0.9428025484085083, "learning_rate": 2.62221832836867e-05, "loss": 3.7444, "step": 8424 }, { "epoch": 2.213945544717049, "grad_norm": 0.8661747574806213, "learning_rate": 2.6204660942701948e-05, "loss": 3.6719, "step": 8426 }, { "epoch": 2.214471048050711, "grad_norm": 0.8142666220664978, "learning_rate": 2.6187138601717192e-05, "loss": 3.6543, "step": 8428 }, { "epoch": 2.214996551384373, "grad_norm": 0.8258982300758362, "learning_rate": 2.6169616260732433e-05, "loss": 3.7401, "step": 8430 }, { "epoch": 2.2155220547180345, "grad_norm": 0.9033029079437256, "learning_rate": 2.6152093919747677e-05, "loss": 3.7064, "step": 8432 }, { "epoch": 2.2160475580516965, "grad_norm": 0.9616712331771851, "learning_rate": 2.613457157876292e-05, "loss": 3.7161, "step": 8434 }, { "epoch": 2.216573061385358, "grad_norm": 0.8698147535324097, "learning_rate": 2.611704923777817e-05, "loss": 3.7679, "step": 8436 }, { "epoch": 2.21709856471902, "grad_norm": 0.821465253829956, "learning_rate": 2.6099526896793413e-05, "loss": 3.7223, "step": 8438 }, { "epoch": 2.2176240680526815, "grad_norm": 0.8450705409049988, "learning_rate": 
2.608200455580866e-05, "loss": 3.72, "step": 8440 }, { "epoch": 2.2181495713863435, "grad_norm": 0.8919811844825745, "learning_rate": 2.6064482214823905e-05, "loss": 3.6829, "step": 8442 }, { "epoch": 2.2186750747200055, "grad_norm": 0.8154439926147461, "learning_rate": 2.6046959873839146e-05, "loss": 3.6885, "step": 8444 }, { "epoch": 2.219200578053667, "grad_norm": 0.8560050129890442, "learning_rate": 2.602943753285439e-05, "loss": 3.7428, "step": 8446 }, { "epoch": 2.219726081387329, "grad_norm": 0.9104583859443665, "learning_rate": 2.6011915191869634e-05, "loss": 3.7278, "step": 8448 }, { "epoch": 2.2202515847209905, "grad_norm": 0.8519176840782166, "learning_rate": 2.599439285088488e-05, "loss": 3.7057, "step": 8450 }, { "epoch": 2.2207770880546525, "grad_norm": 0.8362381458282471, "learning_rate": 2.5976870509900126e-05, "loss": 3.7144, "step": 8452 }, { "epoch": 2.221302591388314, "grad_norm": 0.8191369771957397, "learning_rate": 2.595934816891537e-05, "loss": 3.7135, "step": 8454 }, { "epoch": 2.221828094721976, "grad_norm": 0.8976404666900635, "learning_rate": 2.594182582793061e-05, "loss": 3.675, "step": 8456 }, { "epoch": 2.2223535980556375, "grad_norm": 0.8438371419906616, "learning_rate": 2.5924303486945855e-05, "loss": 3.6603, "step": 8458 }, { "epoch": 2.2228791013892994, "grad_norm": 0.8291351795196533, "learning_rate": 2.5906781145961102e-05, "loss": 3.757, "step": 8460 }, { "epoch": 2.223404604722961, "grad_norm": 0.8354798555374146, "learning_rate": 2.5889258804976347e-05, "loss": 3.6344, "step": 8462 }, { "epoch": 2.223930108056623, "grad_norm": 0.8646947741508484, "learning_rate": 2.587173646399159e-05, "loss": 3.7367, "step": 8464 }, { "epoch": 2.224455611390285, "grad_norm": 0.8674966096878052, "learning_rate": 2.585421412300684e-05, "loss": 3.7358, "step": 8466 }, { "epoch": 2.2249811147239464, "grad_norm": 0.8617560863494873, "learning_rate": 2.5836691782022076e-05, "loss": 3.7177, "step": 8468 }, { "epoch": 2.2255066180576084, "grad_norm": 
0.8881665468215942, "learning_rate": 2.5819169441037323e-05, "loss": 3.7525, "step": 8470 }, { "epoch": 2.22603212139127, "grad_norm": 0.8350577354431152, "learning_rate": 2.5801647100052567e-05, "loss": 3.6481, "step": 8472 }, { "epoch": 2.226557624724932, "grad_norm": 0.8961943984031677, "learning_rate": 2.578412475906781e-05, "loss": 3.6423, "step": 8474 }, { "epoch": 2.2270831280585934, "grad_norm": 0.9159841537475586, "learning_rate": 2.576660241808306e-05, "loss": 3.7078, "step": 8476 }, { "epoch": 2.2276086313922554, "grad_norm": 0.8731995224952698, "learning_rate": 2.5749080077098303e-05, "loss": 3.6713, "step": 8478 }, { "epoch": 2.2281341347259174, "grad_norm": 0.8071151971817017, "learning_rate": 2.5731557736113548e-05, "loss": 3.7504, "step": 8480 }, { "epoch": 2.228659638059579, "grad_norm": 0.9571130871772766, "learning_rate": 2.571403539512879e-05, "loss": 3.7614, "step": 8482 }, { "epoch": 2.229185141393241, "grad_norm": 0.791068971157074, "learning_rate": 2.5696513054144033e-05, "loss": 3.6718, "step": 8484 }, { "epoch": 2.2297106447269024, "grad_norm": 0.8968987464904785, "learning_rate": 2.567899071315928e-05, "loss": 3.7248, "step": 8486 }, { "epoch": 2.2302361480605644, "grad_norm": 1.0467941761016846, "learning_rate": 2.5661468372174524e-05, "loss": 3.7186, "step": 8488 }, { "epoch": 2.230761651394226, "grad_norm": 0.9190542101860046, "learning_rate": 2.564394603118977e-05, "loss": 3.7347, "step": 8490 }, { "epoch": 2.231287154727888, "grad_norm": 0.8116027116775513, "learning_rate": 2.5626423690205016e-05, "loss": 3.7459, "step": 8492 }, { "epoch": 2.2318126580615494, "grad_norm": 0.8531833291053772, "learning_rate": 2.5608901349220253e-05, "loss": 3.8051, "step": 8494 }, { "epoch": 2.2323381613952114, "grad_norm": 0.8604549169540405, "learning_rate": 2.55913790082355e-05, "loss": 3.7104, "step": 8496 }, { "epoch": 2.2328636647288733, "grad_norm": 0.7991697788238525, "learning_rate": 2.5573856667250745e-05, "loss": 3.722, "step": 8498 }, { 
"epoch": 2.233389168062535, "grad_norm": 0.9225679636001587, "learning_rate": 2.555633432626599e-05, "loss": 3.7581, "step": 8500 }, { "epoch": 2.233914671396197, "grad_norm": 1.0546472072601318, "learning_rate": 2.5538811985281237e-05, "loss": 3.715, "step": 8502 }, { "epoch": 2.2344401747298583, "grad_norm": 0.866346538066864, "learning_rate": 2.552128964429648e-05, "loss": 3.7209, "step": 8504 }, { "epoch": 2.2349656780635203, "grad_norm": 0.98794025182724, "learning_rate": 2.5503767303311725e-05, "loss": 3.7652, "step": 8506 }, { "epoch": 2.235491181397182, "grad_norm": 1.006956696510315, "learning_rate": 2.5486244962326966e-05, "loss": 3.6759, "step": 8508 }, { "epoch": 2.236016684730844, "grad_norm": 0.839747428894043, "learning_rate": 2.546872262134221e-05, "loss": 3.6653, "step": 8510 }, { "epoch": 2.2365421880645053, "grad_norm": 0.910836935043335, "learning_rate": 2.5451200280357458e-05, "loss": 3.7192, "step": 8512 }, { "epoch": 2.2370676913981673, "grad_norm": 0.8600544929504395, "learning_rate": 2.5433677939372702e-05, "loss": 3.7145, "step": 8514 }, { "epoch": 2.2375931947318293, "grad_norm": 0.9046633243560791, "learning_rate": 2.5416155598387946e-05, "loss": 3.7019, "step": 8516 }, { "epoch": 2.238118698065491, "grad_norm": 0.9135326147079468, "learning_rate": 2.5398633257403194e-05, "loss": 3.705, "step": 8518 }, { "epoch": 2.2386442013991528, "grad_norm": 0.943377673625946, "learning_rate": 2.538111091641843e-05, "loss": 3.6996, "step": 8520 }, { "epoch": 2.2391697047328143, "grad_norm": 0.9357112646102905, "learning_rate": 2.536358857543368e-05, "loss": 3.7141, "step": 8522 }, { "epoch": 2.2396952080664763, "grad_norm": 1.0889650583267212, "learning_rate": 2.5346066234448923e-05, "loss": 3.7536, "step": 8524 }, { "epoch": 2.240220711400138, "grad_norm": 0.8524461984634399, "learning_rate": 2.5328543893464167e-05, "loss": 3.7026, "step": 8526 }, { "epoch": 2.2407462147337998, "grad_norm": 0.9983847141265869, "learning_rate": 
2.5311021552479415e-05, "loss": 3.7065, "step": 8528 }, { "epoch": 2.2412717180674613, "grad_norm": 0.8339743614196777, "learning_rate": 2.529349921149466e-05, "loss": 3.6893, "step": 8530 }, { "epoch": 2.2417972214011233, "grad_norm": 0.8762292265892029, "learning_rate": 2.52759768705099e-05, "loss": 3.7407, "step": 8532 }, { "epoch": 2.242322724734785, "grad_norm": 0.8839744925498962, "learning_rate": 2.5258454529525144e-05, "loss": 3.7029, "step": 8534 }, { "epoch": 2.2428482280684467, "grad_norm": 0.9230059385299683, "learning_rate": 2.5240932188540388e-05, "loss": 3.6972, "step": 8536 }, { "epoch": 2.2433737314021087, "grad_norm": 0.8523740172386169, "learning_rate": 2.5223409847555635e-05, "loss": 3.7345, "step": 8538 }, { "epoch": 2.2438992347357702, "grad_norm": 0.828881025314331, "learning_rate": 2.520588750657088e-05, "loss": 3.6661, "step": 8540 }, { "epoch": 2.244424738069432, "grad_norm": 0.8837599158287048, "learning_rate": 2.5188365165586124e-05, "loss": 3.7191, "step": 8542 }, { "epoch": 2.2449502414030937, "grad_norm": 0.8558138012886047, "learning_rate": 2.517084282460137e-05, "loss": 3.7385, "step": 8544 }, { "epoch": 2.2454757447367557, "grad_norm": 0.8277314901351929, "learning_rate": 2.515332048361661e-05, "loss": 3.6647, "step": 8546 }, { "epoch": 2.2460012480704172, "grad_norm": 0.8076902031898499, "learning_rate": 2.5135798142631856e-05, "loss": 3.6676, "step": 8548 }, { "epoch": 2.246526751404079, "grad_norm": 0.8539445996284485, "learning_rate": 2.51182758016471e-05, "loss": 3.6662, "step": 8550 }, { "epoch": 2.247052254737741, "grad_norm": 0.8796378374099731, "learning_rate": 2.5100753460662345e-05, "loss": 3.6814, "step": 8552 }, { "epoch": 2.2475777580714027, "grad_norm": 1.0512335300445557, "learning_rate": 2.5083231119677592e-05, "loss": 3.6802, "step": 8554 }, { "epoch": 2.2481032614050647, "grad_norm": 0.9111982583999634, "learning_rate": 2.5065708778692836e-05, "loss": 3.7392, "step": 8556 }, { "epoch": 2.248628764738726, 
"grad_norm": 0.8824375867843628, "learning_rate": 2.5048186437708077e-05, "loss": 3.6898, "step": 8558 }, { "epoch": 2.249154268072388, "grad_norm": 0.9274206161499023, "learning_rate": 2.503066409672332e-05, "loss": 3.7252, "step": 8560 }, { "epoch": 2.2496797714060497, "grad_norm": 0.8798378109931946, "learning_rate": 2.5013141755738566e-05, "loss": 3.6753, "step": 8562 }, { "epoch": 2.2502052747397117, "grad_norm": 0.9110990762710571, "learning_rate": 2.4995619414753813e-05, "loss": 3.7253, "step": 8564 }, { "epoch": 2.2507307780733736, "grad_norm": 0.9343361258506775, "learning_rate": 2.4978097073769057e-05, "loss": 3.7518, "step": 8566 }, { "epoch": 2.251256281407035, "grad_norm": 1.101361870765686, "learning_rate": 2.49605747327843e-05, "loss": 3.6584, "step": 8568 }, { "epoch": 2.251781784740697, "grad_norm": 0.9552696943283081, "learning_rate": 2.4943052391799546e-05, "loss": 3.6888, "step": 8570 }, { "epoch": 2.2523072880743586, "grad_norm": 0.950276255607605, "learning_rate": 2.492553005081479e-05, "loss": 3.7265, "step": 8572 }, { "epoch": 2.2528327914080206, "grad_norm": 0.8842815160751343, "learning_rate": 2.4908007709830034e-05, "loss": 3.6716, "step": 8574 }, { "epoch": 2.253358294741682, "grad_norm": 0.9771357178688049, "learning_rate": 2.4890485368845278e-05, "loss": 3.6968, "step": 8576 }, { "epoch": 2.253883798075344, "grad_norm": 0.9456753134727478, "learning_rate": 2.4872963027860526e-05, "loss": 3.7163, "step": 8578 }, { "epoch": 2.2544093014090056, "grad_norm": 0.8846527338027954, "learning_rate": 2.4855440686875766e-05, "loss": 3.7565, "step": 8580 }, { "epoch": 2.2549348047426676, "grad_norm": 0.8905049562454224, "learning_rate": 2.483791834589101e-05, "loss": 3.7071, "step": 8582 }, { "epoch": 2.255460308076329, "grad_norm": 0.9422726631164551, "learning_rate": 2.4820396004906258e-05, "loss": 3.7147, "step": 8584 }, { "epoch": 2.255985811409991, "grad_norm": 0.8958803415298462, "learning_rate": 2.48028736639215e-05, "loss": 3.6829, "step": 
8586 }, { "epoch": 2.256511314743653, "grad_norm": 0.9214187264442444, "learning_rate": 2.4785351322936747e-05, "loss": 3.7085, "step": 8588 }, { "epoch": 2.2570368180773146, "grad_norm": 0.8664852380752563, "learning_rate": 2.476782898195199e-05, "loss": 3.7096, "step": 8590 }, { "epoch": 2.2575623214109766, "grad_norm": 0.9428418278694153, "learning_rate": 2.4750306640967235e-05, "loss": 3.7168, "step": 8592 }, { "epoch": 2.258087824744638, "grad_norm": 0.8150144815444946, "learning_rate": 2.473278429998248e-05, "loss": 3.6832, "step": 8594 }, { "epoch": 2.2586133280783, "grad_norm": 0.8549310564994812, "learning_rate": 2.4715261958997723e-05, "loss": 3.6662, "step": 8596 }, { "epoch": 2.2591388314119616, "grad_norm": 0.9776257276535034, "learning_rate": 2.4697739618012967e-05, "loss": 3.7456, "step": 8598 }, { "epoch": 2.2596643347456236, "grad_norm": 0.9063823819160461, "learning_rate": 2.468021727702821e-05, "loss": 3.698, "step": 8600 }, { "epoch": 2.2601898380792855, "grad_norm": 0.9458752274513245, "learning_rate": 2.4662694936043456e-05, "loss": 3.6811, "step": 8602 }, { "epoch": 2.260715341412947, "grad_norm": 0.8988263010978699, "learning_rate": 2.4645172595058703e-05, "loss": 3.6702, "step": 8604 }, { "epoch": 2.261240844746609, "grad_norm": 0.8277187347412109, "learning_rate": 2.4627650254073944e-05, "loss": 3.6806, "step": 8606 }, { "epoch": 2.2617663480802706, "grad_norm": 0.8975244164466858, "learning_rate": 2.461012791308919e-05, "loss": 3.7154, "step": 8608 }, { "epoch": 2.2622918514139325, "grad_norm": 0.8533080816268921, "learning_rate": 2.4592605572104436e-05, "loss": 3.7146, "step": 8610 }, { "epoch": 2.262817354747594, "grad_norm": 0.8882051706314087, "learning_rate": 2.4575083231119677e-05, "loss": 3.7516, "step": 8612 }, { "epoch": 2.263342858081256, "grad_norm": 0.8997262120246887, "learning_rate": 2.4557560890134924e-05, "loss": 3.7438, "step": 8614 }, { "epoch": 2.2638683614149175, "grad_norm": 0.9674894213676453, "learning_rate": 
2.454003854915017e-05, "loss": 3.7292, "step": 8616 }, { "epoch": 2.2643938647485795, "grad_norm": 1.0004075765609741, "learning_rate": 2.452251620816541e-05, "loss": 3.7124, "step": 8618 }, { "epoch": 2.264919368082241, "grad_norm": 0.9079523086547852, "learning_rate": 2.4504993867180657e-05, "loss": 3.7352, "step": 8620 }, { "epoch": 2.265444871415903, "grad_norm": 0.8861836791038513, "learning_rate": 2.44874715261959e-05, "loss": 3.695, "step": 8622 }, { "epoch": 2.265970374749565, "grad_norm": 0.9691295623779297, "learning_rate": 2.4469949185211145e-05, "loss": 3.7164, "step": 8624 }, { "epoch": 2.2664958780832265, "grad_norm": 0.8321598768234253, "learning_rate": 2.445242684422639e-05, "loss": 3.7157, "step": 8626 }, { "epoch": 2.2670213814168885, "grad_norm": 1.0298664569854736, "learning_rate": 2.4434904503241633e-05, "loss": 3.7411, "step": 8628 }, { "epoch": 2.26754688475055, "grad_norm": 0.8731899261474609, "learning_rate": 2.441738216225688e-05, "loss": 3.7057, "step": 8630 }, { "epoch": 2.268072388084212, "grad_norm": 0.8733834624290466, "learning_rate": 2.4399859821272122e-05, "loss": 3.7375, "step": 8632 }, { "epoch": 2.2685978914178735, "grad_norm": 1.0555015802383423, "learning_rate": 2.438233748028737e-05, "loss": 3.7027, "step": 8634 }, { "epoch": 2.2691233947515355, "grad_norm": 1.0883374214172363, "learning_rate": 2.4364815139302614e-05, "loss": 3.7108, "step": 8636 }, { "epoch": 2.2696488980851974, "grad_norm": 0.9576637744903564, "learning_rate": 2.4347292798317854e-05, "loss": 3.7461, "step": 8638 }, { "epoch": 2.270174401418859, "grad_norm": 1.0004829168319702, "learning_rate": 2.4329770457333102e-05, "loss": 3.6733, "step": 8640 }, { "epoch": 2.270699904752521, "grad_norm": 0.9054736495018005, "learning_rate": 2.4312248116348346e-05, "loss": 3.65, "step": 8642 }, { "epoch": 2.2712254080861825, "grad_norm": 0.8914501070976257, "learning_rate": 2.429472577536359e-05, "loss": 3.7312, "step": 8644 }, { "epoch": 2.2717509114198444, "grad_norm": 
0.8763524293899536, "learning_rate": 2.4277203434378834e-05, "loss": 3.6591, "step": 8646 }, { "epoch": 2.272276414753506, "grad_norm": 0.8364145755767822, "learning_rate": 2.425968109339408e-05, "loss": 3.6937, "step": 8648 }, { "epoch": 2.272801918087168, "grad_norm": 0.8976448774337769, "learning_rate": 2.4242158752409323e-05, "loss": 3.7575, "step": 8650 }, { "epoch": 2.27332742142083, "grad_norm": 0.9814063310623169, "learning_rate": 2.4224636411424567e-05, "loss": 3.6467, "step": 8652 }, { "epoch": 2.2738529247544914, "grad_norm": 0.8381695747375488, "learning_rate": 2.420711407043981e-05, "loss": 3.7119, "step": 8654 }, { "epoch": 2.274378428088153, "grad_norm": 0.9326244592666626, "learning_rate": 2.4189591729455055e-05, "loss": 3.6899, "step": 8656 }, { "epoch": 2.274903931421815, "grad_norm": 0.9281357526779175, "learning_rate": 2.41720693884703e-05, "loss": 3.7225, "step": 8658 }, { "epoch": 2.275429434755477, "grad_norm": 0.8739488124847412, "learning_rate": 2.4154547047485547e-05, "loss": 3.7329, "step": 8660 }, { "epoch": 2.2759549380891384, "grad_norm": 0.8692238330841064, "learning_rate": 2.413702470650079e-05, "loss": 3.6772, "step": 8662 }, { "epoch": 2.2764804414228004, "grad_norm": 0.8703108429908752, "learning_rate": 2.4119502365516032e-05, "loss": 3.7793, "step": 8664 }, { "epoch": 2.277005944756462, "grad_norm": 0.8865333199501038, "learning_rate": 2.410198002453128e-05, "loss": 3.7483, "step": 8666 }, { "epoch": 2.277531448090124, "grad_norm": 0.8464983701705933, "learning_rate": 2.4084457683546524e-05, "loss": 3.6926, "step": 8668 }, { "epoch": 2.2780569514237854, "grad_norm": 0.9374005198478699, "learning_rate": 2.4066935342561768e-05, "loss": 3.697, "step": 8670 }, { "epoch": 2.2785824547574474, "grad_norm": 0.8761271238327026, "learning_rate": 2.4049413001577012e-05, "loss": 3.6246, "step": 8672 }, { "epoch": 2.2791079580911093, "grad_norm": 0.9366564154624939, "learning_rate": 2.4031890660592256e-05, "loss": 3.737, "step": 8674 }, { 
"epoch": 2.279633461424771, "grad_norm": 0.8893771767616272, "learning_rate": 2.40143683196075e-05, "loss": 3.6974, "step": 8676 }, { "epoch": 2.280158964758433, "grad_norm": 0.8568159937858582, "learning_rate": 2.3996845978622745e-05, "loss": 3.7495, "step": 8678 }, { "epoch": 2.2806844680920944, "grad_norm": 0.9094693064689636, "learning_rate": 2.397932363763799e-05, "loss": 3.7141, "step": 8680 }, { "epoch": 2.2812099714257563, "grad_norm": 0.9162114858627319, "learning_rate": 2.3961801296653233e-05, "loss": 3.7525, "step": 8682 }, { "epoch": 2.281735474759418, "grad_norm": 0.9137269854545593, "learning_rate": 2.3944278955668477e-05, "loss": 3.6518, "step": 8684 }, { "epoch": 2.28226097809308, "grad_norm": 0.806143045425415, "learning_rate": 2.3926756614683725e-05, "loss": 3.7309, "step": 8686 }, { "epoch": 2.282786481426742, "grad_norm": 0.9611552357673645, "learning_rate": 2.3909234273698966e-05, "loss": 3.727, "step": 8688 }, { "epoch": 2.2833119847604033, "grad_norm": 0.8936959505081177, "learning_rate": 2.3891711932714213e-05, "loss": 3.7073, "step": 8690 }, { "epoch": 2.2838374880940653, "grad_norm": 1.0026473999023438, "learning_rate": 2.3874189591729457e-05, "loss": 3.7287, "step": 8692 }, { "epoch": 2.284362991427727, "grad_norm": 0.9841282367706299, "learning_rate": 2.38566672507447e-05, "loss": 3.7369, "step": 8694 }, { "epoch": 2.2848884947613888, "grad_norm": 0.9384371638298035, "learning_rate": 2.3839144909759946e-05, "loss": 3.6782, "step": 8696 }, { "epoch": 2.2854139980950503, "grad_norm": 0.943792462348938, "learning_rate": 2.382162256877519e-05, "loss": 3.7197, "step": 8698 }, { "epoch": 2.2859395014287123, "grad_norm": 0.8338215351104736, "learning_rate": 2.3804100227790434e-05, "loss": 3.7497, "step": 8700 }, { "epoch": 2.286465004762374, "grad_norm": 0.7805085182189941, "learning_rate": 2.3786577886805678e-05, "loss": 3.7076, "step": 8702 }, { "epoch": 2.2869905080960358, "grad_norm": 0.8572628498077393, "learning_rate": 
2.3769055545820922e-05, "loss": 3.7391, "step": 8704 }, { "epoch": 2.2875160114296973, "grad_norm": 0.9712094664573669, "learning_rate": 2.375153320483617e-05, "loss": 3.7065, "step": 8706 }, { "epoch": 2.2880415147633593, "grad_norm": 0.8590351939201355, "learning_rate": 2.373401086385141e-05, "loss": 3.7046, "step": 8708 }, { "epoch": 2.2885670180970212, "grad_norm": 0.7928573489189148, "learning_rate": 2.3716488522866655e-05, "loss": 3.6521, "step": 8710 }, { "epoch": 2.2890925214306828, "grad_norm": 0.8730065822601318, "learning_rate": 2.3698966181881902e-05, "loss": 3.7012, "step": 8712 }, { "epoch": 2.2896180247643447, "grad_norm": 0.8622931241989136, "learning_rate": 2.3681443840897143e-05, "loss": 3.7211, "step": 8714 }, { "epoch": 2.2901435280980063, "grad_norm": 0.8519995808601379, "learning_rate": 2.366392149991239e-05, "loss": 3.7357, "step": 8716 }, { "epoch": 2.2906690314316682, "grad_norm": 0.8260533213615417, "learning_rate": 2.3646399158927635e-05, "loss": 3.6943, "step": 8718 }, { "epoch": 2.2911945347653297, "grad_norm": 0.8369278907775879, "learning_rate": 2.3628876817942876e-05, "loss": 3.718, "step": 8720 }, { "epoch": 2.2917200380989917, "grad_norm": 0.8493014574050903, "learning_rate": 2.3611354476958123e-05, "loss": 3.729, "step": 8722 }, { "epoch": 2.2922455414326537, "grad_norm": 1.0115028619766235, "learning_rate": 2.3593832135973367e-05, "loss": 3.6872, "step": 8724 }, { "epoch": 2.292771044766315, "grad_norm": 0.8927851319313049, "learning_rate": 2.357630979498861e-05, "loss": 3.714, "step": 8726 }, { "epoch": 2.293296548099977, "grad_norm": 0.8980404138565063, "learning_rate": 2.3558787454003856e-05, "loss": 3.7314, "step": 8728 }, { "epoch": 2.2938220514336387, "grad_norm": 0.7765140533447266, "learning_rate": 2.35412651130191e-05, "loss": 3.6938, "step": 8730 }, { "epoch": 2.2943475547673007, "grad_norm": 0.887175440788269, "learning_rate": 2.3523742772034348e-05, "loss": 3.723, "step": 8732 }, { "epoch": 2.294873058100962, 
"grad_norm": 0.7863097190856934, "learning_rate": 2.350622043104959e-05, "loss": 3.6983, "step": 8734 }, { "epoch": 2.295398561434624, "grad_norm": 0.8254992961883545, "learning_rate": 2.3488698090064832e-05, "loss": 3.6416, "step": 8736 }, { "epoch": 2.2959240647682857, "grad_norm": 0.918328583240509, "learning_rate": 2.347117574908008e-05, "loss": 3.6745, "step": 8738 }, { "epoch": 2.2964495681019477, "grad_norm": 0.8571835160255432, "learning_rate": 2.345365340809532e-05, "loss": 3.7306, "step": 8740 }, { "epoch": 2.296975071435609, "grad_norm": 0.8740397095680237, "learning_rate": 2.343613106711057e-05, "loss": 3.7056, "step": 8742 }, { "epoch": 2.297500574769271, "grad_norm": 0.875098466873169, "learning_rate": 2.3418608726125813e-05, "loss": 3.7071, "step": 8744 }, { "epoch": 2.298026078102933, "grad_norm": 1.0460782051086426, "learning_rate": 2.3401086385141053e-05, "loss": 3.7799, "step": 8746 }, { "epoch": 2.2985515814365947, "grad_norm": 0.8156836032867432, "learning_rate": 2.33835640441563e-05, "loss": 3.6658, "step": 8748 }, { "epoch": 2.2990770847702566, "grad_norm": 0.8365504145622253, "learning_rate": 2.3366041703171545e-05, "loss": 3.7407, "step": 8750 }, { "epoch": 2.299602588103918, "grad_norm": 0.8822233080863953, "learning_rate": 2.334851936218679e-05, "loss": 3.7327, "step": 8752 }, { "epoch": 2.30012809143758, "grad_norm": 0.7872963547706604, "learning_rate": 2.3330997021202033e-05, "loss": 3.7549, "step": 8754 }, { "epoch": 2.3006535947712417, "grad_norm": 0.9420776963233948, "learning_rate": 2.3313474680217278e-05, "loss": 3.7102, "step": 8756 }, { "epoch": 2.3011790981049036, "grad_norm": 0.8744617700576782, "learning_rate": 2.3295952339232522e-05, "loss": 3.7421, "step": 8758 }, { "epoch": 2.3017046014385656, "grad_norm": 0.8450965285301208, "learning_rate": 2.3278429998247766e-05, "loss": 3.7108, "step": 8760 }, { "epoch": 2.302230104772227, "grad_norm": 0.8318673372268677, "learning_rate": 2.3260907657263014e-05, "loss": 3.7243, "step": 
8762 }, { "epoch": 2.302755608105889, "grad_norm": 0.8435103297233582, "learning_rate": 2.3243385316278258e-05, "loss": 3.7455, "step": 8764 }, { "epoch": 2.3032811114395506, "grad_norm": 0.9342592358589172, "learning_rate": 2.32258629752935e-05, "loss": 3.7153, "step": 8766 }, { "epoch": 2.3038066147732126, "grad_norm": 0.869910717010498, "learning_rate": 2.3208340634308746e-05, "loss": 3.652, "step": 8768 }, { "epoch": 2.304332118106874, "grad_norm": 0.9758911728858948, "learning_rate": 2.319081829332399e-05, "loss": 3.7345, "step": 8770 }, { "epoch": 2.304857621440536, "grad_norm": 0.8466728329658508, "learning_rate": 2.3173295952339234e-05, "loss": 3.7332, "step": 8772 }, { "epoch": 2.3053831247741976, "grad_norm": 0.8723781704902649, "learning_rate": 2.315577361135448e-05, "loss": 3.6988, "step": 8774 }, { "epoch": 2.3059086281078596, "grad_norm": 0.9677135944366455, "learning_rate": 2.3138251270369723e-05, "loss": 3.7826, "step": 8776 }, { "epoch": 2.306434131441521, "grad_norm": 0.9998757243156433, "learning_rate": 2.3120728929384967e-05, "loss": 3.7082, "step": 8778 }, { "epoch": 2.306959634775183, "grad_norm": 0.8178179264068604, "learning_rate": 2.310320658840021e-05, "loss": 3.71, "step": 8780 }, { "epoch": 2.307485138108845, "grad_norm": 0.9065977334976196, "learning_rate": 2.3085684247415455e-05, "loss": 3.7125, "step": 8782 }, { "epoch": 2.3080106414425066, "grad_norm": 0.8715322017669678, "learning_rate": 2.30681619064307e-05, "loss": 3.6833, "step": 8784 }, { "epoch": 2.3085361447761685, "grad_norm": 0.8769671320915222, "learning_rate": 2.3050639565445944e-05, "loss": 3.7081, "step": 8786 }, { "epoch": 2.30906164810983, "grad_norm": 0.8854623436927795, "learning_rate": 2.303311722446119e-05, "loss": 3.7357, "step": 8788 }, { "epoch": 2.309587151443492, "grad_norm": 0.8152974247932434, "learning_rate": 2.3015594883476432e-05, "loss": 3.746, "step": 8790 }, { "epoch": 2.3101126547771536, "grad_norm": 0.9321800470352173, "learning_rate": 
2.2998072542491676e-05, "loss": 3.7285, "step": 8792 }, { "epoch": 2.3106381581108155, "grad_norm": 0.9432794451713562, "learning_rate": 2.2980550201506924e-05, "loss": 3.7465, "step": 8794 }, { "epoch": 2.3111636614444775, "grad_norm": 1.045098066329956, "learning_rate": 2.2963027860522165e-05, "loss": 3.7115, "step": 8796 }, { "epoch": 2.311689164778139, "grad_norm": 0.9110353589057922, "learning_rate": 2.2945505519537412e-05, "loss": 3.6965, "step": 8798 }, { "epoch": 2.312214668111801, "grad_norm": 0.9219549894332886, "learning_rate": 2.2927983178552656e-05, "loss": 3.7445, "step": 8800 }, { "epoch": 2.312214668111801, "eval_loss": 3.7362277507781982, "eval_runtime": 464.619, "eval_samples_per_second": 262.127, "eval_steps_per_second": 8.192, "step": 8800 }, { "epoch": 2.3127401714454625, "grad_norm": 0.9357133507728577, "learning_rate": 2.29104608375679e-05, "loss": 3.6814, "step": 8802 }, { "epoch": 2.3132656747791245, "grad_norm": 0.8975182175636292, "learning_rate": 2.2892938496583145e-05, "loss": 3.7303, "step": 8804 }, { "epoch": 2.313791178112786, "grad_norm": 0.9018714427947998, "learning_rate": 2.287541615559839e-05, "loss": 3.7276, "step": 8806 }, { "epoch": 2.314316681446448, "grad_norm": 0.9610909819602966, "learning_rate": 2.2857893814613633e-05, "loss": 3.6892, "step": 8808 }, { "epoch": 2.31484218478011, "grad_norm": 0.916802167892456, "learning_rate": 2.2840371473628877e-05, "loss": 3.7001, "step": 8810 }, { "epoch": 2.3153676881137715, "grad_norm": 1.0030097961425781, "learning_rate": 2.282284913264412e-05, "loss": 3.6839, "step": 8812 }, { "epoch": 2.315893191447433, "grad_norm": 0.7817686796188354, "learning_rate": 2.280532679165937e-05, "loss": 3.6308, "step": 8814 }, { "epoch": 2.316418694781095, "grad_norm": 0.9806745648384094, "learning_rate": 2.278780445067461e-05, "loss": 3.675, "step": 8816 }, { "epoch": 2.316944198114757, "grad_norm": 0.8491572141647339, "learning_rate": 2.2770282109689857e-05, "loss": 3.6592, "step": 8818 }, { 
"epoch": 2.3174697014484185, "grad_norm": 0.843999981880188, "learning_rate": 2.27527597687051e-05, "loss": 3.6859, "step": 8820 }, { "epoch": 2.3179952047820804, "grad_norm": 0.8718085289001465, "learning_rate": 2.2735237427720342e-05, "loss": 3.7359, "step": 8822 }, { "epoch": 2.318520708115742, "grad_norm": 0.8860839605331421, "learning_rate": 2.271771508673559e-05, "loss": 3.7061, "step": 8824 }, { "epoch": 2.319046211449404, "grad_norm": 0.9317547678947449, "learning_rate": 2.2700192745750834e-05, "loss": 3.7004, "step": 8826 }, { "epoch": 2.3195717147830655, "grad_norm": 0.8051880598068237, "learning_rate": 2.2682670404766078e-05, "loss": 3.6983, "step": 8828 }, { "epoch": 2.3200972181167274, "grad_norm": 0.8509401082992554, "learning_rate": 2.2665148063781322e-05, "loss": 3.7132, "step": 8830 }, { "epoch": 2.3206227214503894, "grad_norm": 1.0351371765136719, "learning_rate": 2.2647625722796566e-05, "loss": 3.7076, "step": 8832 }, { "epoch": 2.321148224784051, "grad_norm": 0.8928101062774658, "learning_rate": 2.2630103381811814e-05, "loss": 3.7166, "step": 8834 }, { "epoch": 2.321673728117713, "grad_norm": 0.8659232258796692, "learning_rate": 2.2612581040827055e-05, "loss": 3.6846, "step": 8836 }, { "epoch": 2.3221992314513744, "grad_norm": 0.9483094215393066, "learning_rate": 2.25950586998423e-05, "loss": 3.7224, "step": 8838 }, { "epoch": 2.3227247347850364, "grad_norm": 0.992347240447998, "learning_rate": 2.2577536358857547e-05, "loss": 3.6774, "step": 8840 }, { "epoch": 2.323250238118698, "grad_norm": 0.8817299604415894, "learning_rate": 2.2560014017872787e-05, "loss": 3.6911, "step": 8842 }, { "epoch": 2.32377574145236, "grad_norm": 0.8838405013084412, "learning_rate": 2.2542491676888035e-05, "loss": 3.7365, "step": 8844 }, { "epoch": 2.324301244786022, "grad_norm": 0.8945544958114624, "learning_rate": 2.252496933590328e-05, "loss": 3.7238, "step": 8846 }, { "epoch": 2.3248267481196834, "grad_norm": 0.9404112696647644, "learning_rate": 
2.250744699491852e-05, "loss": 3.7156, "step": 8848 }, { "epoch": 2.3253522514533453, "grad_norm": 0.8692668080329895, "learning_rate": 2.2489924653933767e-05, "loss": 3.6915, "step": 8850 }, { "epoch": 2.325877754787007, "grad_norm": 0.9358903169631958, "learning_rate": 2.247240231294901e-05, "loss": 3.779, "step": 8852 }, { "epoch": 2.326403258120669, "grad_norm": 0.9518730044364929, "learning_rate": 2.2454879971964256e-05, "loss": 3.7878, "step": 8854 }, { "epoch": 2.3269287614543304, "grad_norm": 0.8953529596328735, "learning_rate": 2.24373576309795e-05, "loss": 3.698, "step": 8856 }, { "epoch": 2.3274542647879923, "grad_norm": 0.9498826265335083, "learning_rate": 2.2419835289994744e-05, "loss": 3.6606, "step": 8858 }, { "epoch": 2.327979768121654, "grad_norm": 0.9115431308746338, "learning_rate": 2.2402312949009988e-05, "loss": 3.683, "step": 8860 }, { "epoch": 2.328505271455316, "grad_norm": 1.1074012517929077, "learning_rate": 2.2384790608025232e-05, "loss": 3.6651, "step": 8862 }, { "epoch": 2.3290307747889774, "grad_norm": 1.1252479553222656, "learning_rate": 2.2367268267040477e-05, "loss": 3.7013, "step": 8864 }, { "epoch": 2.3295562781226393, "grad_norm": 0.8951299786567688, "learning_rate": 2.2349745926055724e-05, "loss": 3.7213, "step": 8866 }, { "epoch": 2.3300817814563013, "grad_norm": 0.8286572098731995, "learning_rate": 2.2332223585070965e-05, "loss": 3.6754, "step": 8868 }, { "epoch": 2.330607284789963, "grad_norm": 0.9197611212730408, "learning_rate": 2.2314701244086213e-05, "loss": 3.6477, "step": 8870 }, { "epoch": 2.331132788123625, "grad_norm": 0.9111952781677246, "learning_rate": 2.2297178903101457e-05, "loss": 3.6785, "step": 8872 }, { "epoch": 2.3316582914572863, "grad_norm": 0.9090267419815063, "learning_rate": 2.2279656562116698e-05, "loss": 3.7352, "step": 8874 }, { "epoch": 2.3321837947909483, "grad_norm": 0.7954097986221313, "learning_rate": 2.2262134221131945e-05, "loss": 3.6559, "step": 8876 }, { "epoch": 2.33270929812461, 
"grad_norm": 0.8289228081703186, "learning_rate": 2.224461188014719e-05, "loss": 3.7441, "step": 8878 }, { "epoch": 2.333234801458272, "grad_norm": 0.926857054233551, "learning_rate": 2.2227089539162433e-05, "loss": 3.7458, "step": 8880 }, { "epoch": 2.3337603047919337, "grad_norm": 0.9446771740913391, "learning_rate": 2.2209567198177678e-05, "loss": 3.7441, "step": 8882 }, { "epoch": 2.3342858081255953, "grad_norm": 0.9127171039581299, "learning_rate": 2.2192044857192922e-05, "loss": 3.7033, "step": 8884 }, { "epoch": 2.3348113114592572, "grad_norm": 0.982109546661377, "learning_rate": 2.2174522516208166e-05, "loss": 3.7421, "step": 8886 }, { "epoch": 2.3353368147929188, "grad_norm": 0.9113138318061829, "learning_rate": 2.215700017522341e-05, "loss": 3.6859, "step": 8888 }, { "epoch": 2.3358623181265807, "grad_norm": 0.9164025187492371, "learning_rate": 2.2139477834238658e-05, "loss": 3.7374, "step": 8890 }, { "epoch": 2.3363878214602423, "grad_norm": 1.031886339187622, "learning_rate": 2.21219554932539e-05, "loss": 3.6938, "step": 8892 }, { "epoch": 2.3369133247939042, "grad_norm": 0.8642853498458862, "learning_rate": 2.2104433152269143e-05, "loss": 3.747, "step": 8894 }, { "epoch": 2.3374388281275658, "grad_norm": 0.7967892289161682, "learning_rate": 2.208691081128439e-05, "loss": 3.6916, "step": 8896 }, { "epoch": 2.3379643314612277, "grad_norm": 0.846596360206604, "learning_rate": 2.206938847029963e-05, "loss": 3.7014, "step": 8898 }, { "epoch": 2.3384898347948893, "grad_norm": 0.8374075293540955, "learning_rate": 2.205186612931488e-05, "loss": 3.7485, "step": 8900 }, { "epoch": 2.3390153381285512, "grad_norm": 0.9856224656105042, "learning_rate": 2.2034343788330123e-05, "loss": 3.6465, "step": 8902 }, { "epoch": 2.339540841462213, "grad_norm": 0.857501208782196, "learning_rate": 2.2016821447345367e-05, "loss": 3.6824, "step": 8904 }, { "epoch": 2.3400663447958747, "grad_norm": 0.9109804630279541, "learning_rate": 2.199929910636061e-05, "loss": 3.6996, "step": 
8906 }, { "epoch": 2.3405918481295367, "grad_norm": 0.8643996119499207, "learning_rate": 2.1981776765375855e-05, "loss": 3.7068, "step": 8908 }, { "epoch": 2.341117351463198, "grad_norm": 0.8783021569252014, "learning_rate": 2.19642544243911e-05, "loss": 3.7171, "step": 8910 }, { "epoch": 2.34164285479686, "grad_norm": 0.9301751255989075, "learning_rate": 2.1946732083406344e-05, "loss": 3.7074, "step": 8912 }, { "epoch": 2.3421683581305217, "grad_norm": 0.9219371676445007, "learning_rate": 2.1929209742421588e-05, "loss": 3.6576, "step": 8914 }, { "epoch": 2.3426938614641837, "grad_norm": 0.9757001996040344, "learning_rate": 2.1911687401436835e-05, "loss": 3.7536, "step": 8916 }, { "epoch": 2.3432193647978456, "grad_norm": 0.8444511294364929, "learning_rate": 2.1894165060452076e-05, "loss": 3.6755, "step": 8918 }, { "epoch": 2.343744868131507, "grad_norm": 0.9015287756919861, "learning_rate": 2.187664271946732e-05, "loss": 3.7018, "step": 8920 }, { "epoch": 2.344270371465169, "grad_norm": 0.9522020816802979, "learning_rate": 2.1859120378482568e-05, "loss": 3.7175, "step": 8922 }, { "epoch": 2.3447958747988307, "grad_norm": 0.9523583054542542, "learning_rate": 2.184159803749781e-05, "loss": 3.7253, "step": 8924 }, { "epoch": 2.3453213781324926, "grad_norm": 0.8414160013198853, "learning_rate": 2.1824075696513056e-05, "loss": 3.6971, "step": 8926 }, { "epoch": 2.345846881466154, "grad_norm": 1.0933947563171387, "learning_rate": 2.18065533555283e-05, "loss": 3.6929, "step": 8928 }, { "epoch": 2.346372384799816, "grad_norm": 0.8766870498657227, "learning_rate": 2.178903101454354e-05, "loss": 3.7109, "step": 8930 }, { "epoch": 2.3468978881334777, "grad_norm": 0.8263593316078186, "learning_rate": 2.177150867355879e-05, "loss": 3.7016, "step": 8932 }, { "epoch": 2.3474233914671396, "grad_norm": 0.8389445543289185, "learning_rate": 2.1753986332574033e-05, "loss": 3.7252, "step": 8934 }, { "epoch": 2.347948894800801, "grad_norm": 0.8768371343612671, "learning_rate": 
2.1736463991589277e-05, "loss": 3.7318, "step": 8936 }, { "epoch": 2.348474398134463, "grad_norm": 0.8977451324462891, "learning_rate": 2.171894165060452e-05, "loss": 3.6621, "step": 8938 }, { "epoch": 2.348999901468125, "grad_norm": 0.9002242088317871, "learning_rate": 2.1701419309619765e-05, "loss": 3.6767, "step": 8940 }, { "epoch": 2.3495254048017866, "grad_norm": 0.969774067401886, "learning_rate": 2.1683896968635013e-05, "loss": 3.6662, "step": 8942 }, { "epoch": 2.3500509081354486, "grad_norm": 0.8876912593841553, "learning_rate": 2.1666374627650254e-05, "loss": 3.6829, "step": 8944 }, { "epoch": 2.35057641146911, "grad_norm": 0.9282439351081848, "learning_rate": 2.16488522866655e-05, "loss": 3.7243, "step": 8946 }, { "epoch": 2.351101914802772, "grad_norm": 0.9565227031707764, "learning_rate": 2.1631329945680746e-05, "loss": 3.6829, "step": 8948 }, { "epoch": 2.3516274181364336, "grad_norm": 0.8561133742332458, "learning_rate": 2.1613807604695986e-05, "loss": 3.7017, "step": 8950 }, { "epoch": 2.3521529214700956, "grad_norm": 0.9465528130531311, "learning_rate": 2.1596285263711234e-05, "loss": 3.7133, "step": 8952 }, { "epoch": 2.3526784248037576, "grad_norm": 0.8057554960250854, "learning_rate": 2.1578762922726478e-05, "loss": 3.7108, "step": 8954 }, { "epoch": 2.353203928137419, "grad_norm": 0.8881667852401733, "learning_rate": 2.1561240581741722e-05, "loss": 3.6956, "step": 8956 }, { "epoch": 2.353729431471081, "grad_norm": 0.8457796573638916, "learning_rate": 2.1543718240756966e-05, "loss": 3.7087, "step": 8958 }, { "epoch": 2.3542549348047426, "grad_norm": 0.8687591552734375, "learning_rate": 2.152619589977221e-05, "loss": 3.7228, "step": 8960 }, { "epoch": 2.3547804381384045, "grad_norm": 0.9628992676734924, "learning_rate": 2.1508673558787455e-05, "loss": 3.695, "step": 8962 }, { "epoch": 2.355305941472066, "grad_norm": 0.8704159259796143, "learning_rate": 2.14911512178027e-05, "loss": 3.737, "step": 8964 }, { "epoch": 2.355831444805728, "grad_norm": 
0.9290614724159241, "learning_rate": 2.1473628876817943e-05, "loss": 3.6805, "step": 8966 }, { "epoch": 2.35635694813939, "grad_norm": 0.8577322363853455, "learning_rate": 2.1456106535833187e-05, "loss": 3.7098, "step": 8968 }, { "epoch": 2.3568824514730515, "grad_norm": 0.8530275821685791, "learning_rate": 2.143858419484843e-05, "loss": 3.7172, "step": 8970 }, { "epoch": 2.357407954806713, "grad_norm": 0.8706390857696533, "learning_rate": 2.142106185386368e-05, "loss": 3.7196, "step": 8972 }, { "epoch": 2.357933458140375, "grad_norm": 0.8249920606613159, "learning_rate": 2.1403539512878923e-05, "loss": 3.7217, "step": 8974 }, { "epoch": 2.358458961474037, "grad_norm": 0.8352787494659424, "learning_rate": 2.1386017171894164e-05, "loss": 3.6958, "step": 8976 }, { "epoch": 2.3589844648076985, "grad_norm": 0.8960438966751099, "learning_rate": 2.136849483090941e-05, "loss": 3.7212, "step": 8978 }, { "epoch": 2.3595099681413605, "grad_norm": 0.860775887966156, "learning_rate": 2.1350972489924656e-05, "loss": 3.7018, "step": 8980 }, { "epoch": 2.360035471475022, "grad_norm": 0.8539716005325317, "learning_rate": 2.13334501489399e-05, "loss": 3.7098, "step": 8982 }, { "epoch": 2.360560974808684, "grad_norm": 0.9208155870437622, "learning_rate": 2.1315927807955144e-05, "loss": 3.6678, "step": 8984 }, { "epoch": 2.3610864781423455, "grad_norm": 0.8761558532714844, "learning_rate": 2.1298405466970388e-05, "loss": 3.7106, "step": 8986 }, { "epoch": 2.3616119814760075, "grad_norm": 0.8468874096870422, "learning_rate": 2.1280883125985632e-05, "loss": 3.713, "step": 8988 }, { "epoch": 2.3621374848096695, "grad_norm": 0.9031239748001099, "learning_rate": 2.1263360785000877e-05, "loss": 3.6887, "step": 8990 }, { "epoch": 2.362662988143331, "grad_norm": 0.869745671749115, "learning_rate": 2.124583844401612e-05, "loss": 3.7563, "step": 8992 }, { "epoch": 2.363188491476993, "grad_norm": 0.9052104949951172, "learning_rate": 2.1228316103031365e-05, "loss": 3.688, "step": 8994 }, { 
"epoch": 2.3637139948106545, "grad_norm": 0.9475054740905762, "learning_rate": 2.121079376204661e-05, "loss": 3.7028, "step": 8996 }, { "epoch": 2.3642394981443164, "grad_norm": 1.0135228633880615, "learning_rate": 2.1193271421061857e-05, "loss": 3.707, "step": 8998 }, { "epoch": 2.364765001477978, "grad_norm": 0.8683255314826965, "learning_rate": 2.1175749080077097e-05, "loss": 3.7022, "step": 9000 }, { "epoch": 2.36529050481164, "grad_norm": 0.8016043305397034, "learning_rate": 2.115822673909234e-05, "loss": 3.6748, "step": 9002 }, { "epoch": 2.365816008145302, "grad_norm": 0.8381335735321045, "learning_rate": 2.114070439810759e-05, "loss": 3.6785, "step": 9004 }, { "epoch": 2.3663415114789634, "grad_norm": 0.9039446711540222, "learning_rate": 2.1123182057122833e-05, "loss": 3.6889, "step": 9006 }, { "epoch": 2.3668670148126254, "grad_norm": 0.8575254678726196, "learning_rate": 2.1105659716138078e-05, "loss": 3.6558, "step": 9008 }, { "epoch": 2.367392518146287, "grad_norm": 0.8315747380256653, "learning_rate": 2.1088137375153322e-05, "loss": 3.7187, "step": 9010 }, { "epoch": 2.367918021479949, "grad_norm": 0.8694199919700623, "learning_rate": 2.1070615034168566e-05, "loss": 3.7236, "step": 9012 }, { "epoch": 2.3684435248136104, "grad_norm": 0.8315359950065613, "learning_rate": 2.105309269318381e-05, "loss": 3.7546, "step": 9014 }, { "epoch": 2.3689690281472724, "grad_norm": 0.7875381112098694, "learning_rate": 2.1035570352199054e-05, "loss": 3.6988, "step": 9016 }, { "epoch": 2.369494531480934, "grad_norm": 0.8493250608444214, "learning_rate": 2.1018048011214302e-05, "loss": 3.709, "step": 9018 }, { "epoch": 2.370020034814596, "grad_norm": 0.8876301050186157, "learning_rate": 2.1000525670229543e-05, "loss": 3.6657, "step": 9020 }, { "epoch": 2.3705455381482574, "grad_norm": 0.8552539944648743, "learning_rate": 2.0983003329244787e-05, "loss": 3.7022, "step": 9022 }, { "epoch": 2.3710710414819194, "grad_norm": 0.8855098485946655, "learning_rate": 
2.0965480988260034e-05, "loss": 3.7057, "step": 9024 }, { "epoch": 2.3715965448155814, "grad_norm": 0.913451611995697, "learning_rate": 2.0947958647275275e-05, "loss": 3.6827, "step": 9026 }, { "epoch": 2.372122048149243, "grad_norm": 0.9736340045928955, "learning_rate": 2.0930436306290523e-05, "loss": 3.6672, "step": 9028 }, { "epoch": 2.372647551482905, "grad_norm": 0.7933185696601868, "learning_rate": 2.0912913965305767e-05, "loss": 3.7289, "step": 9030 }, { "epoch": 2.3731730548165664, "grad_norm": 0.8782123327255249, "learning_rate": 2.0895391624321008e-05, "loss": 3.7242, "step": 9032 }, { "epoch": 2.3736985581502283, "grad_norm": 0.984894871711731, "learning_rate": 2.0877869283336255e-05, "loss": 3.7178, "step": 9034 }, { "epoch": 2.37422406148389, "grad_norm": 0.8953860402107239, "learning_rate": 2.08603469423515e-05, "loss": 3.7531, "step": 9036 }, { "epoch": 2.374749564817552, "grad_norm": 0.9733373522758484, "learning_rate": 2.0842824601366744e-05, "loss": 3.716, "step": 9038 }, { "epoch": 2.375275068151214, "grad_norm": 0.9560746550559998, "learning_rate": 2.0825302260381988e-05, "loss": 3.6931, "step": 9040 }, { "epoch": 2.3758005714848753, "grad_norm": 0.867391049861908, "learning_rate": 2.0807779919397232e-05, "loss": 3.6883, "step": 9042 }, { "epoch": 2.3763260748185373, "grad_norm": 0.8943777680397034, "learning_rate": 2.079025757841248e-05, "loss": 3.6814, "step": 9044 }, { "epoch": 2.376851578152199, "grad_norm": 0.9280069470405579, "learning_rate": 2.077273523742772e-05, "loss": 3.6919, "step": 9046 }, { "epoch": 2.377377081485861, "grad_norm": 0.916554868221283, "learning_rate": 2.0755212896442964e-05, "loss": 3.6726, "step": 9048 }, { "epoch": 2.3779025848195223, "grad_norm": 0.835424542427063, "learning_rate": 2.0737690555458212e-05, "loss": 3.6888, "step": 9050 }, { "epoch": 2.3784280881531843, "grad_norm": 0.9506660103797913, "learning_rate": 2.0720168214473453e-05, "loss": 3.7406, "step": 9052 }, { "epoch": 2.378953591486846, "grad_norm": 
0.8699432611465454, "learning_rate": 2.07026458734887e-05, "loss": 3.7389, "step": 9054 }, { "epoch": 2.379479094820508, "grad_norm": 0.9201357364654541, "learning_rate": 2.0685123532503945e-05, "loss": 3.7165, "step": 9056 }, { "epoch": 2.3800045981541693, "grad_norm": 0.89146488904953, "learning_rate": 2.0667601191519185e-05, "loss": 3.7143, "step": 9058 }, { "epoch": 2.3805301014878313, "grad_norm": 0.886868417263031, "learning_rate": 2.0650078850534433e-05, "loss": 3.6706, "step": 9060 }, { "epoch": 2.3810556048214933, "grad_norm": 0.9445696473121643, "learning_rate": 2.0632556509549677e-05, "loss": 3.7115, "step": 9062 }, { "epoch": 2.381581108155155, "grad_norm": 0.9284603595733643, "learning_rate": 2.061503416856492e-05, "loss": 3.7167, "step": 9064 }, { "epoch": 2.3821066114888167, "grad_norm": 0.8704180717468262, "learning_rate": 2.0597511827580165e-05, "loss": 3.6829, "step": 9066 }, { "epoch": 2.3826321148224783, "grad_norm": 0.8867454528808594, "learning_rate": 2.057998948659541e-05, "loss": 3.7767, "step": 9068 }, { "epoch": 2.3831576181561402, "grad_norm": 0.8056176900863647, "learning_rate": 2.0562467145610654e-05, "loss": 3.719, "step": 9070 }, { "epoch": 2.3836831214898018, "grad_norm": 0.8236806988716125, "learning_rate": 2.0544944804625898e-05, "loss": 3.716, "step": 9072 }, { "epoch": 2.3842086248234637, "grad_norm": 0.8335738182067871, "learning_rate": 2.0527422463641146e-05, "loss": 3.6926, "step": 9074 }, { "epoch": 2.3847341281571257, "grad_norm": 0.8668922781944275, "learning_rate": 2.050990012265639e-05, "loss": 3.7483, "step": 9076 }, { "epoch": 2.3852596314907872, "grad_norm": 0.8714184761047363, "learning_rate": 2.049237778167163e-05, "loss": 3.7245, "step": 9078 }, { "epoch": 2.385785134824449, "grad_norm": 0.9334730505943298, "learning_rate": 2.0474855440686878e-05, "loss": 3.7045, "step": 9080 }, { "epoch": 2.3863106381581107, "grad_norm": 0.8643117547035217, "learning_rate": 2.0457333099702122e-05, "loss": 3.7301, "step": 9082 }, { 
"epoch": 2.3868361414917727, "grad_norm": 0.9075177907943726, "learning_rate": 2.0439810758717366e-05, "loss": 3.7939, "step": 9084 }, { "epoch": 2.3873616448254342, "grad_norm": 0.9756799936294556, "learning_rate": 2.042228841773261e-05, "loss": 3.724, "step": 9086 }, { "epoch": 2.387887148159096, "grad_norm": 0.9238368272781372, "learning_rate": 2.0404766076747855e-05, "loss": 3.7021, "step": 9088 }, { "epoch": 2.3884126514927577, "grad_norm": 1.0033292770385742, "learning_rate": 2.03872437357631e-05, "loss": 3.7728, "step": 9090 }, { "epoch": 2.3889381548264197, "grad_norm": 0.8289501070976257, "learning_rate": 2.0369721394778343e-05, "loss": 3.7241, "step": 9092 }, { "epoch": 2.389463658160081, "grad_norm": 0.7826811671257019, "learning_rate": 2.0352199053793587e-05, "loss": 3.6687, "step": 9094 }, { "epoch": 2.389989161493743, "grad_norm": 0.8547013401985168, "learning_rate": 2.033467671280883e-05, "loss": 3.7161, "step": 9096 }, { "epoch": 2.390514664827405, "grad_norm": 0.8499249219894409, "learning_rate": 2.0317154371824076e-05, "loss": 3.6892, "step": 9098 }, { "epoch": 2.3910401681610667, "grad_norm": 0.9533870220184326, "learning_rate": 2.0299632030839323e-05, "loss": 3.7555, "step": 9100 }, { "epoch": 2.3915656714947287, "grad_norm": 0.8455145955085754, "learning_rate": 2.0282109689854564e-05, "loss": 3.6856, "step": 9102 }, { "epoch": 2.39209117482839, "grad_norm": 1.0259391069412231, "learning_rate": 2.0264587348869808e-05, "loss": 3.7108, "step": 9104 }, { "epoch": 2.392616678162052, "grad_norm": 1.0154415369033813, "learning_rate": 2.0247065007885056e-05, "loss": 3.6908, "step": 9106 }, { "epoch": 2.3931421814957137, "grad_norm": 0.8875961899757385, "learning_rate": 2.02295426669003e-05, "loss": 3.7267, "step": 9108 }, { "epoch": 2.3936676848293756, "grad_norm": 0.8273681998252869, "learning_rate": 2.0212020325915544e-05, "loss": 3.7123, "step": 9110 }, { "epoch": 2.3941931881630376, "grad_norm": 0.917645275592804, "learning_rate": 
2.0194497984930788e-05, "loss": 3.7442, "step": 9112 }, { "epoch": 2.394718691496699, "grad_norm": 1.024390459060669, "learning_rate": 2.0176975643946032e-05, "loss": 3.7071, "step": 9114 }, { "epoch": 2.395244194830361, "grad_norm": 0.8851872682571411, "learning_rate": 2.0159453302961277e-05, "loss": 3.7206, "step": 9116 }, { "epoch": 2.3957696981640226, "grad_norm": 0.9306411743164062, "learning_rate": 2.014193096197652e-05, "loss": 3.7111, "step": 9118 }, { "epoch": 2.3962952014976846, "grad_norm": 0.9374563097953796, "learning_rate": 2.0124408620991765e-05, "loss": 3.7177, "step": 9120 }, { "epoch": 2.396820704831346, "grad_norm": 1.0320545434951782, "learning_rate": 2.010688628000701e-05, "loss": 3.7349, "step": 9122 }, { "epoch": 2.397346208165008, "grad_norm": 0.8388657569885254, "learning_rate": 2.0089363939022253e-05, "loss": 3.744, "step": 9124 }, { "epoch": 2.39787171149867, "grad_norm": 0.9789794683456421, "learning_rate": 2.00718415980375e-05, "loss": 3.6817, "step": 9126 }, { "epoch": 2.3983972148323316, "grad_norm": 0.9346893429756165, "learning_rate": 2.005431925705274e-05, "loss": 3.6967, "step": 9128 }, { "epoch": 2.398922718165993, "grad_norm": 0.887564480304718, "learning_rate": 2.0036796916067986e-05, "loss": 3.6993, "step": 9130 }, { "epoch": 2.399448221499655, "grad_norm": 0.8972368836402893, "learning_rate": 2.0019274575083233e-05, "loss": 3.7219, "step": 9132 }, { "epoch": 2.399973724833317, "grad_norm": 0.8357917666435242, "learning_rate": 2.0001752234098474e-05, "loss": 3.7199, "step": 9134 }, { "epoch": 2.4004992281669786, "grad_norm": 0.8946535587310791, "learning_rate": 1.9984229893113722e-05, "loss": 3.7113, "step": 9136 }, { "epoch": 2.4010247315006406, "grad_norm": 0.9001441597938538, "learning_rate": 1.9966707552128966e-05, "loss": 3.713, "step": 9138 }, { "epoch": 2.401550234834302, "grad_norm": 0.9303774833679199, "learning_rate": 1.994918521114421e-05, "loss": 3.6841, "step": 9140 }, { "epoch": 2.402075738167964, "grad_norm": 
0.9445380568504333, "learning_rate": 1.9931662870159454e-05, "loss": 3.7039, "step": 9142 }, { "epoch": 2.4026012415016256, "grad_norm": 0.9168652296066284, "learning_rate": 1.99141405291747e-05, "loss": 3.6864, "step": 9144 }, { "epoch": 2.4031267448352875, "grad_norm": 0.9425753355026245, "learning_rate": 1.9896618188189946e-05, "loss": 3.7008, "step": 9146 }, { "epoch": 2.4036522481689495, "grad_norm": 0.9130763411521912, "learning_rate": 1.9879095847205187e-05, "loss": 3.6939, "step": 9148 }, { "epoch": 2.404177751502611, "grad_norm": 1.0015203952789307, "learning_rate": 1.986157350622043e-05, "loss": 3.6804, "step": 9150 }, { "epoch": 2.404703254836273, "grad_norm": 0.8112961053848267, "learning_rate": 1.984405116523568e-05, "loss": 3.689, "step": 9152 }, { "epoch": 2.4052287581699345, "grad_norm": 0.9593541026115417, "learning_rate": 1.982652882425092e-05, "loss": 3.7171, "step": 9154 }, { "epoch": 2.4057542615035965, "grad_norm": 0.9145869612693787, "learning_rate": 1.9809006483266167e-05, "loss": 3.7206, "step": 9156 }, { "epoch": 2.406279764837258, "grad_norm": 0.9422110319137573, "learning_rate": 1.979148414228141e-05, "loss": 3.6944, "step": 9158 }, { "epoch": 2.40680526817092, "grad_norm": 0.9152317047119141, "learning_rate": 1.9773961801296652e-05, "loss": 3.706, "step": 9160 }, { "epoch": 2.407330771504582, "grad_norm": 1.0030242204666138, "learning_rate": 1.97564394603119e-05, "loss": 3.6979, "step": 9162 }, { "epoch": 2.4078562748382435, "grad_norm": 0.9075245261192322, "learning_rate": 1.9738917119327144e-05, "loss": 3.7172, "step": 9164 }, { "epoch": 2.4083817781719055, "grad_norm": 1.0057417154312134, "learning_rate": 1.9721394778342388e-05, "loss": 3.7021, "step": 9166 }, { "epoch": 2.408907281505567, "grad_norm": 0.8683124780654907, "learning_rate": 1.9703872437357632e-05, "loss": 3.709, "step": 9168 }, { "epoch": 2.409432784839229, "grad_norm": 0.8837906718254089, "learning_rate": 1.9686350096372876e-05, "loss": 3.6986, "step": 9170 }, { 
"epoch": 2.4099582881728905, "grad_norm": 0.8515884876251221, "learning_rate": 1.966882775538812e-05, "loss": 3.6901, "step": 9172 }, { "epoch": 2.4104837915065525, "grad_norm": 0.9771558046340942, "learning_rate": 1.9651305414403364e-05, "loss": 3.6998, "step": 9174 }, { "epoch": 2.411009294840214, "grad_norm": 0.935565710067749, "learning_rate": 1.963378307341861e-05, "loss": 3.6473, "step": 9176 }, { "epoch": 2.411534798173876, "grad_norm": 0.9242103695869446, "learning_rate": 1.9616260732433856e-05, "loss": 3.701, "step": 9178 }, { "epoch": 2.4120603015075375, "grad_norm": 0.9451702237129211, "learning_rate": 1.9598738391449097e-05, "loss": 3.7083, "step": 9180 }, { "epoch": 2.4125858048411994, "grad_norm": 0.8610627055168152, "learning_rate": 1.9581216050464345e-05, "loss": 3.739, "step": 9182 }, { "epoch": 2.4131113081748614, "grad_norm": 0.8812553286552429, "learning_rate": 1.956369370947959e-05, "loss": 3.7113, "step": 9184 }, { "epoch": 2.413636811508523, "grad_norm": 0.9310605525970459, "learning_rate": 1.954617136849483e-05, "loss": 3.6941, "step": 9186 }, { "epoch": 2.414162314842185, "grad_norm": 0.8492050766944885, "learning_rate": 1.9528649027510077e-05, "loss": 3.7026, "step": 9188 }, { "epoch": 2.4146878181758464, "grad_norm": 0.9194096326828003, "learning_rate": 1.951112668652532e-05, "loss": 3.6834, "step": 9190 }, { "epoch": 2.4152133215095084, "grad_norm": 1.0702786445617676, "learning_rate": 1.9493604345540565e-05, "loss": 3.743, "step": 9192 }, { "epoch": 2.41573882484317, "grad_norm": 0.9570682644844055, "learning_rate": 1.947608200455581e-05, "loss": 3.7004, "step": 9194 }, { "epoch": 2.416264328176832, "grad_norm": 0.9402709603309631, "learning_rate": 1.9458559663571054e-05, "loss": 3.7073, "step": 9196 }, { "epoch": 2.416789831510494, "grad_norm": 0.8023794293403625, "learning_rate": 1.9441037322586298e-05, "loss": 3.6977, "step": 9198 }, { "epoch": 2.4173153348441554, "grad_norm": 0.8740757703781128, "learning_rate": 
1.9423514981601542e-05, "loss": 3.7023, "step": 9200 }, { "epoch": 2.4173153348441554, "eval_loss": 3.729130268096924, "eval_runtime": 464.7503, "eval_samples_per_second": 262.053, "eval_steps_per_second": 8.189, "step": 9200 }, { "epoch": 2.4178408381778174, "grad_norm": 0.8122054934501648, "learning_rate": 1.940599264061679e-05, "loss": 3.6598, "step": 9202 }, { "epoch": 2.418366341511479, "grad_norm": 1.0686085224151611, "learning_rate": 1.938847029963203e-05, "loss": 3.7118, "step": 9204 }, { "epoch": 2.418891844845141, "grad_norm": 0.9301474094390869, "learning_rate": 1.9370947958647275e-05, "loss": 3.729, "step": 9206 }, { "epoch": 2.4194173481788024, "grad_norm": 0.8331457376480103, "learning_rate": 1.9353425617662522e-05, "loss": 3.6896, "step": 9208 }, { "epoch": 2.4199428515124644, "grad_norm": 0.9412609338760376, "learning_rate": 1.9335903276677763e-05, "loss": 3.664, "step": 9210 }, { "epoch": 2.420468354846126, "grad_norm": 0.8637613654136658, "learning_rate": 1.931838093569301e-05, "loss": 3.7528, "step": 9212 }, { "epoch": 2.420993858179788, "grad_norm": 0.9215237498283386, "learning_rate": 1.9300858594708255e-05, "loss": 3.6887, "step": 9214 }, { "epoch": 2.4215193615134494, "grad_norm": 0.8485170602798462, "learning_rate": 1.92833362537235e-05, "loss": 3.6496, "step": 9216 }, { "epoch": 2.4220448648471113, "grad_norm": 0.8983384966850281, "learning_rate": 1.9265813912738743e-05, "loss": 3.7431, "step": 9218 }, { "epoch": 2.4225703681807733, "grad_norm": 0.8831188082695007, "learning_rate": 1.9248291571753987e-05, "loss": 3.7006, "step": 9220 }, { "epoch": 2.423095871514435, "grad_norm": 0.8800916075706482, "learning_rate": 1.923076923076923e-05, "loss": 3.699, "step": 9222 }, { "epoch": 2.423621374848097, "grad_norm": 0.8508179783821106, "learning_rate": 1.9213246889784476e-05, "loss": 3.7003, "step": 9224 }, { "epoch": 2.4241468781817583, "grad_norm": 0.9210959076881409, "learning_rate": 1.919572454879972e-05, "loss": 3.7083, "step": 9226 }, { 
"epoch": 2.4246723815154203, "grad_norm": 0.9311825037002563, "learning_rate": 1.9178202207814967e-05, "loss": 3.6815, "step": 9228 }, { "epoch": 2.425197884849082, "grad_norm": 0.8876466751098633, "learning_rate": 1.9160679866830208e-05, "loss": 3.6801, "step": 9230 }, { "epoch": 2.425723388182744, "grad_norm": 0.8136204481124878, "learning_rate": 1.9143157525845452e-05, "loss": 3.7045, "step": 9232 }, { "epoch": 2.4262488915164058, "grad_norm": 0.940893292427063, "learning_rate": 1.91256351848607e-05, "loss": 3.7384, "step": 9234 }, { "epoch": 2.4267743948500673, "grad_norm": 0.9973317384719849, "learning_rate": 1.910811284387594e-05, "loss": 3.6916, "step": 9236 }, { "epoch": 2.4272998981837293, "grad_norm": 0.8603576421737671, "learning_rate": 1.9090590502891188e-05, "loss": 3.7309, "step": 9238 }, { "epoch": 2.427825401517391, "grad_norm": 1.0133614540100098, "learning_rate": 1.9073068161906432e-05, "loss": 3.7175, "step": 9240 }, { "epoch": 2.4283509048510528, "grad_norm": 0.9175349473953247, "learning_rate": 1.9055545820921673e-05, "loss": 3.6703, "step": 9242 }, { "epoch": 2.4288764081847143, "grad_norm": 0.9077808856964111, "learning_rate": 1.903802347993692e-05, "loss": 3.6874, "step": 9244 }, { "epoch": 2.4294019115183763, "grad_norm": 0.9305202960968018, "learning_rate": 1.9020501138952165e-05, "loss": 3.6799, "step": 9246 }, { "epoch": 2.429927414852038, "grad_norm": 0.9223439693450928, "learning_rate": 1.900297879796741e-05, "loss": 3.6774, "step": 9248 }, { "epoch": 2.4304529181856998, "grad_norm": 0.8300948143005371, "learning_rate": 1.8985456456982653e-05, "loss": 3.6805, "step": 9250 }, { "epoch": 2.4309784215193613, "grad_norm": 0.8801530599594116, "learning_rate": 1.8967934115997897e-05, "loss": 3.6921, "step": 9252 }, { "epoch": 2.4315039248530232, "grad_norm": 1.01650071144104, "learning_rate": 1.8950411775013145e-05, "loss": 3.7234, "step": 9254 }, { "epoch": 2.432029428186685, "grad_norm": 1.045246958732605, "learning_rate": 
1.8932889434028386e-05, "loss": 3.6894, "step": 9256 }, { "epoch": 2.4325549315203467, "grad_norm": 0.8541380167007446, "learning_rate": 1.891536709304363e-05, "loss": 3.7325, "step": 9258 }, { "epoch": 2.4330804348540087, "grad_norm": 0.924011766910553, "learning_rate": 1.8897844752058878e-05, "loss": 3.7408, "step": 9260 }, { "epoch": 2.4336059381876702, "grad_norm": 0.8632286787033081, "learning_rate": 1.888032241107412e-05, "loss": 3.6729, "step": 9262 }, { "epoch": 2.434131441521332, "grad_norm": 0.9902201890945435, "learning_rate": 1.8862800070089366e-05, "loss": 3.6459, "step": 9264 }, { "epoch": 2.4346569448549937, "grad_norm": 0.9814377427101135, "learning_rate": 1.884527772910461e-05, "loss": 3.7153, "step": 9266 }, { "epoch": 2.4351824481886557, "grad_norm": 0.9199929237365723, "learning_rate": 1.8827755388119854e-05, "loss": 3.7324, "step": 9268 }, { "epoch": 2.4357079515223177, "grad_norm": 0.8515920042991638, "learning_rate": 1.88102330471351e-05, "loss": 3.737, "step": 9270 }, { "epoch": 2.436233454855979, "grad_norm": 0.9813836812973022, "learning_rate": 1.8792710706150343e-05, "loss": 3.6705, "step": 9272 }, { "epoch": 2.436758958189641, "grad_norm": 0.9926552772521973, "learning_rate": 1.8775188365165587e-05, "loss": 3.6546, "step": 9274 }, { "epoch": 2.4372844615233027, "grad_norm": 0.9672073125839233, "learning_rate": 1.875766602418083e-05, "loss": 3.6461, "step": 9276 }, { "epoch": 2.4378099648569647, "grad_norm": 0.8570666909217834, "learning_rate": 1.8740143683196075e-05, "loss": 3.6787, "step": 9278 }, { "epoch": 2.438335468190626, "grad_norm": 0.861189067363739, "learning_rate": 1.8722621342211323e-05, "loss": 3.6927, "step": 9280 }, { "epoch": 2.438860971524288, "grad_norm": 0.9494595527648926, "learning_rate": 1.8705099001226563e-05, "loss": 3.7242, "step": 9282 }, { "epoch": 2.43938647485795, "grad_norm": 0.9480484127998352, "learning_rate": 1.868757666024181e-05, "loss": 3.7899, "step": 9284 }, { "epoch": 2.4399119781916117, 
"grad_norm": 0.9670366644859314, "learning_rate": 1.8670054319257055e-05, "loss": 3.7801, "step": 9286 }, { "epoch": 2.440437481525273, "grad_norm": 0.8790748119354248, "learning_rate": 1.8652531978272296e-05, "loss": 3.7238, "step": 9288 }, { "epoch": 2.440962984858935, "grad_norm": 0.8515949249267578, "learning_rate": 1.8635009637287544e-05, "loss": 3.6848, "step": 9290 }, { "epoch": 2.441488488192597, "grad_norm": 0.9477934241294861, "learning_rate": 1.8617487296302788e-05, "loss": 3.6399, "step": 9292 }, { "epoch": 2.4420139915262586, "grad_norm": 0.8941419124603271, "learning_rate": 1.8599964955318032e-05, "loss": 3.6449, "step": 9294 }, { "epoch": 2.4425394948599206, "grad_norm": 0.9117879867553711, "learning_rate": 1.8582442614333276e-05, "loss": 3.6887, "step": 9296 }, { "epoch": 2.443064998193582, "grad_norm": 0.8619703650474548, "learning_rate": 1.856492027334852e-05, "loss": 3.7145, "step": 9298 }, { "epoch": 2.443590501527244, "grad_norm": 0.8432109951972961, "learning_rate": 1.8547397932363764e-05, "loss": 3.696, "step": 9300 }, { "epoch": 2.4441160048609056, "grad_norm": 0.9274951219558716, "learning_rate": 1.852987559137901e-05, "loss": 3.6624, "step": 9302 }, { "epoch": 2.4446415081945676, "grad_norm": 0.9362090826034546, "learning_rate": 1.8512353250394253e-05, "loss": 3.7108, "step": 9304 }, { "epoch": 2.4451670115282296, "grad_norm": 0.91484534740448, "learning_rate": 1.8494830909409497e-05, "loss": 3.7299, "step": 9306 }, { "epoch": 2.445692514861891, "grad_norm": 0.9838431477546692, "learning_rate": 1.847730856842474e-05, "loss": 3.7116, "step": 9308 }, { "epoch": 2.446218018195553, "grad_norm": 0.9245963096618652, "learning_rate": 1.845978622743999e-05, "loss": 3.6958, "step": 9310 }, { "epoch": 2.4467435215292146, "grad_norm": 0.9894992709159851, "learning_rate": 1.844226388645523e-05, "loss": 3.6933, "step": 9312 }, { "epoch": 2.4472690248628766, "grad_norm": 1.001935601234436, "learning_rate": 1.8424741545470474e-05, "loss": 3.7023, "step": 
9314 }, { "epoch": 2.447794528196538, "grad_norm": 0.9048618078231812, "learning_rate": 1.840721920448572e-05, "loss": 3.76, "step": 9316 }, { "epoch": 2.4483200315302, "grad_norm": 0.8393222093582153, "learning_rate": 1.8389696863500965e-05, "loss": 3.7114, "step": 9318 }, { "epoch": 2.448845534863862, "grad_norm": 0.8902108073234558, "learning_rate": 1.837217452251621e-05, "loss": 3.718, "step": 9320 }, { "epoch": 2.4493710381975236, "grad_norm": 0.925727903842926, "learning_rate": 1.8354652181531454e-05, "loss": 3.6872, "step": 9322 }, { "epoch": 2.4498965415311855, "grad_norm": 0.9918540716171265, "learning_rate": 1.8337129840546698e-05, "loss": 3.6533, "step": 9324 }, { "epoch": 2.450422044864847, "grad_norm": 0.9479742050170898, "learning_rate": 1.8319607499561942e-05, "loss": 3.7334, "step": 9326 }, { "epoch": 2.450947548198509, "grad_norm": 0.8896299600601196, "learning_rate": 1.8302085158577186e-05, "loss": 3.7394, "step": 9328 }, { "epoch": 2.4514730515321705, "grad_norm": 0.8845251202583313, "learning_rate": 1.8284562817592434e-05, "loss": 3.7536, "step": 9330 }, { "epoch": 2.4519985548658325, "grad_norm": 0.9163066744804382, "learning_rate": 1.8267040476607675e-05, "loss": 3.6713, "step": 9332 }, { "epoch": 2.452524058199494, "grad_norm": 0.8856566548347473, "learning_rate": 1.824951813562292e-05, "loss": 3.6635, "step": 9334 }, { "epoch": 2.453049561533156, "grad_norm": 0.8616898655891418, "learning_rate": 1.8231995794638166e-05, "loss": 3.6432, "step": 9336 }, { "epoch": 2.4535750648668175, "grad_norm": 0.8796603679656982, "learning_rate": 1.8214473453653407e-05, "loss": 3.7219, "step": 9338 }, { "epoch": 2.4541005682004795, "grad_norm": 0.8854755759239197, "learning_rate": 1.8196951112668655e-05, "loss": 3.6888, "step": 9340 }, { "epoch": 2.4546260715341415, "grad_norm": 0.9495604634284973, "learning_rate": 1.81794287716839e-05, "loss": 3.7212, "step": 9342 }, { "epoch": 2.455151574867803, "grad_norm": 0.9036046862602234, "learning_rate": 
1.816190643069914e-05, "loss": 3.6927, "step": 9344 }, { "epoch": 2.455677078201465, "grad_norm": 0.8509160280227661, "learning_rate": 1.8144384089714387e-05, "loss": 3.7396, "step": 9346 }, { "epoch": 2.4562025815351265, "grad_norm": 0.9178687334060669, "learning_rate": 1.812686174872963e-05, "loss": 3.6832, "step": 9348 }, { "epoch": 2.4567280848687885, "grad_norm": 0.8983644247055054, "learning_rate": 1.8109339407744876e-05, "loss": 3.7015, "step": 9350 }, { "epoch": 2.45725358820245, "grad_norm": 0.9883037805557251, "learning_rate": 1.809181706676012e-05, "loss": 3.7279, "step": 9352 }, { "epoch": 2.457779091536112, "grad_norm": 0.9779164791107178, "learning_rate": 1.8074294725775364e-05, "loss": 3.7083, "step": 9354 }, { "epoch": 2.458304594869774, "grad_norm": 0.8504135608673096, "learning_rate": 1.805677238479061e-05, "loss": 3.6491, "step": 9356 }, { "epoch": 2.4588300982034355, "grad_norm": 0.966069221496582, "learning_rate": 1.8039250043805852e-05, "loss": 3.7168, "step": 9358 }, { "epoch": 2.4593556015370974, "grad_norm": 0.8940038084983826, "learning_rate": 1.8021727702821096e-05, "loss": 3.6744, "step": 9360 }, { "epoch": 2.459881104870759, "grad_norm": 0.9818271994590759, "learning_rate": 1.8004205361836344e-05, "loss": 3.7222, "step": 9362 }, { "epoch": 2.460406608204421, "grad_norm": 0.9859958291053772, "learning_rate": 1.7986683020851585e-05, "loss": 3.7228, "step": 9364 }, { "epoch": 2.4609321115380824, "grad_norm": 0.8950693011283875, "learning_rate": 1.7969160679866832e-05, "loss": 3.6785, "step": 9366 }, { "epoch": 2.4614576148717444, "grad_norm": 0.9335705041885376, "learning_rate": 1.7951638338882077e-05, "loss": 3.767, "step": 9368 }, { "epoch": 2.461983118205406, "grad_norm": 0.8293125629425049, "learning_rate": 1.7934115997897317e-05, "loss": 3.6876, "step": 9370 }, { "epoch": 2.462508621539068, "grad_norm": 1.0057166814804077, "learning_rate": 1.7916593656912565e-05, "loss": 3.6511, "step": 9372 }, { "epoch": 2.4630341248727294, 
"grad_norm": 0.9249845147132874, "learning_rate": 1.789907131592781e-05, "loss": 3.6475, "step": 9374 }, { "epoch": 2.4635596282063914, "grad_norm": 0.9017667770385742, "learning_rate": 1.7881548974943053e-05, "loss": 3.6776, "step": 9376 }, { "epoch": 2.4640851315400534, "grad_norm": 0.9827386736869812, "learning_rate": 1.7864026633958297e-05, "loss": 3.7121, "step": 9378 }, { "epoch": 2.464610634873715, "grad_norm": 0.8538251519203186, "learning_rate": 1.784650429297354e-05, "loss": 3.6937, "step": 9380 }, { "epoch": 2.465136138207377, "grad_norm": 0.9371615648269653, "learning_rate": 1.7828981951988786e-05, "loss": 3.6851, "step": 9382 }, { "epoch": 2.4656616415410384, "grad_norm": 0.857011616230011, "learning_rate": 1.781145961100403e-05, "loss": 3.6547, "step": 9384 }, { "epoch": 2.4661871448747004, "grad_norm": 0.9119160175323486, "learning_rate": 1.7793937270019274e-05, "loss": 3.6994, "step": 9386 }, { "epoch": 2.466712648208362, "grad_norm": 0.8457512855529785, "learning_rate": 1.777641492903452e-05, "loss": 3.6967, "step": 9388 }, { "epoch": 2.467238151542024, "grad_norm": 0.8444269895553589, "learning_rate": 1.7758892588049762e-05, "loss": 3.7001, "step": 9390 }, { "epoch": 2.467763654875686, "grad_norm": 1.0008835792541504, "learning_rate": 1.774137024706501e-05, "loss": 3.729, "step": 9392 }, { "epoch": 2.4682891582093474, "grad_norm": 0.860206127166748, "learning_rate": 1.7723847906080254e-05, "loss": 3.6934, "step": 9394 }, { "epoch": 2.4688146615430093, "grad_norm": 0.9042295217514038, "learning_rate": 1.77063255650955e-05, "loss": 3.6839, "step": 9396 }, { "epoch": 2.469340164876671, "grad_norm": 0.8755154609680176, "learning_rate": 1.7688803224110743e-05, "loss": 3.6815, "step": 9398 }, { "epoch": 2.469865668210333, "grad_norm": 0.9178483486175537, "learning_rate": 1.7671280883125987e-05, "loss": 3.7137, "step": 9400 }, { "epoch": 2.4703911715439943, "grad_norm": 0.8683056831359863, "learning_rate": 1.765375854214123e-05, "loss": 3.7374, "step": 
9402 }, { "epoch": 2.4709166748776563, "grad_norm": 0.8983325362205505, "learning_rate": 1.7636236201156475e-05, "loss": 3.7419, "step": 9404 }, { "epoch": 2.471442178211318, "grad_norm": 0.8718848824501038, "learning_rate": 1.761871386017172e-05, "loss": 3.6762, "step": 9406 }, { "epoch": 2.47196768154498, "grad_norm": 0.8565696477890015, "learning_rate": 1.7601191519186963e-05, "loss": 3.65, "step": 9408 }, { "epoch": 2.4724931848786413, "grad_norm": 0.8900579810142517, "learning_rate": 1.7583669178202208e-05, "loss": 3.7001, "step": 9410 }, { "epoch": 2.4730186882123033, "grad_norm": 0.9369829893112183, "learning_rate": 1.7566146837217455e-05, "loss": 3.6755, "step": 9412 }, { "epoch": 2.4735441915459653, "grad_norm": 0.9546351432800293, "learning_rate": 1.7548624496232696e-05, "loss": 3.7606, "step": 9414 }, { "epoch": 2.474069694879627, "grad_norm": 0.9960475564002991, "learning_rate": 1.753110215524794e-05, "loss": 3.6937, "step": 9416 }, { "epoch": 2.4745951982132888, "grad_norm": 0.9156344532966614, "learning_rate": 1.7513579814263188e-05, "loss": 3.6888, "step": 9418 }, { "epoch": 2.4751207015469503, "grad_norm": 0.9597721695899963, "learning_rate": 1.7496057473278432e-05, "loss": 3.7297, "step": 9420 }, { "epoch": 2.4756462048806123, "grad_norm": 0.8666523098945618, "learning_rate": 1.7478535132293676e-05, "loss": 3.7105, "step": 9422 }, { "epoch": 2.476171708214274, "grad_norm": 0.8895410895347595, "learning_rate": 1.746101279130892e-05, "loss": 3.6904, "step": 9424 }, { "epoch": 2.4766972115479358, "grad_norm": 0.9899903535842896, "learning_rate": 1.7443490450324164e-05, "loss": 3.7189, "step": 9426 }, { "epoch": 2.4772227148815977, "grad_norm": 0.8885858654975891, "learning_rate": 1.742596810933941e-05, "loss": 3.7657, "step": 9428 }, { "epoch": 2.4777482182152593, "grad_norm": 0.9012154936790466, "learning_rate": 1.7408445768354653e-05, "loss": 3.6723, "step": 9430 }, { "epoch": 2.4782737215489212, "grad_norm": 0.8837555050849915, "learning_rate": 
1.7390923427369897e-05, "loss": 3.7229, "step": 9432 }, { "epoch": 2.4787992248825828, "grad_norm": 0.8254130482673645, "learning_rate": 1.737340108638514e-05, "loss": 3.666, "step": 9434 }, { "epoch": 2.4793247282162447, "grad_norm": 1.0695559978485107, "learning_rate": 1.7355878745400385e-05, "loss": 3.6837, "step": 9436 }, { "epoch": 2.4798502315499062, "grad_norm": 0.8122057318687439, "learning_rate": 1.7338356404415633e-05, "loss": 3.6983, "step": 9438 }, { "epoch": 2.480375734883568, "grad_norm": 1.0163662433624268, "learning_rate": 1.7320834063430874e-05, "loss": 3.6871, "step": 9440 }, { "epoch": 2.48090123821723, "grad_norm": 0.9224966764450073, "learning_rate": 1.7303311722446118e-05, "loss": 3.6997, "step": 9442 }, { "epoch": 2.4814267415508917, "grad_norm": 0.849280059337616, "learning_rate": 1.7285789381461365e-05, "loss": 3.6935, "step": 9444 }, { "epoch": 2.4819522448845532, "grad_norm": 0.9726102948188782, "learning_rate": 1.7268267040476606e-05, "loss": 3.6733, "step": 9446 }, { "epoch": 2.482477748218215, "grad_norm": 0.9058801531791687, "learning_rate": 1.7250744699491854e-05, "loss": 3.6609, "step": 9448 }, { "epoch": 2.483003251551877, "grad_norm": 0.8440490961074829, "learning_rate": 1.7233222358507098e-05, "loss": 3.6769, "step": 9450 }, { "epoch": 2.4835287548855387, "grad_norm": 0.9359525442123413, "learning_rate": 1.7215700017522342e-05, "loss": 3.6668, "step": 9452 }, { "epoch": 2.4840542582192007, "grad_norm": 0.8639597296714783, "learning_rate": 1.7198177676537586e-05, "loss": 3.6136, "step": 9454 }, { "epoch": 2.484579761552862, "grad_norm": 0.8849471211433411, "learning_rate": 1.718065533555283e-05, "loss": 3.6818, "step": 9456 }, { "epoch": 2.485105264886524, "grad_norm": 0.9176455736160278, "learning_rate": 1.7163132994568078e-05, "loss": 3.6907, "step": 9458 }, { "epoch": 2.4856307682201857, "grad_norm": 0.9266950488090515, "learning_rate": 1.714561065358332e-05, "loss": 3.6748, "step": 9460 }, { "epoch": 2.4861562715538477, 
"grad_norm": 0.9394159317016602, "learning_rate": 1.7128088312598563e-05, "loss": 3.677, "step": 9462 }, { "epoch": 2.4866817748875096, "grad_norm": 0.9257224202156067, "learning_rate": 1.711056597161381e-05, "loss": 3.7133, "step": 9464 }, { "epoch": 2.487207278221171, "grad_norm": 0.9602984189987183, "learning_rate": 1.709304363062905e-05, "loss": 3.7318, "step": 9466 }, { "epoch": 2.487732781554833, "grad_norm": 0.9133790135383606, "learning_rate": 1.70755212896443e-05, "loss": 3.6622, "step": 9468 }, { "epoch": 2.4882582848884947, "grad_norm": 0.8766359090805054, "learning_rate": 1.7057998948659543e-05, "loss": 3.6872, "step": 9470 }, { "epoch": 2.4887837882221566, "grad_norm": 0.9875863790512085, "learning_rate": 1.7040476607674784e-05, "loss": 3.7332, "step": 9472 }, { "epoch": 2.489309291555818, "grad_norm": 1.0584850311279297, "learning_rate": 1.702295426669003e-05, "loss": 3.7122, "step": 9474 }, { "epoch": 2.48983479488948, "grad_norm": 0.9185423254966736, "learning_rate": 1.7005431925705276e-05, "loss": 3.7, "step": 9476 }, { "epoch": 2.490360298223142, "grad_norm": 1.0839126110076904, "learning_rate": 1.698790958472052e-05, "loss": 3.725, "step": 9478 }, { "epoch": 2.4908858015568036, "grad_norm": 0.8820853233337402, "learning_rate": 1.6970387243735764e-05, "loss": 3.6952, "step": 9480 }, { "epoch": 2.4914113048904656, "grad_norm": 1.0151525735855103, "learning_rate": 1.6952864902751008e-05, "loss": 3.7123, "step": 9482 }, { "epoch": 2.491936808224127, "grad_norm": 1.133479356765747, "learning_rate": 1.6935342561766252e-05, "loss": 3.6434, "step": 9484 }, { "epoch": 2.492462311557789, "grad_norm": 0.9787766933441162, "learning_rate": 1.6917820220781496e-05, "loss": 3.6845, "step": 9486 }, { "epoch": 2.4929878148914506, "grad_norm": 0.9592704176902771, "learning_rate": 1.690029787979674e-05, "loss": 3.7154, "step": 9488 }, { "epoch": 2.4935133182251126, "grad_norm": 0.9156347513198853, "learning_rate": 1.6882775538811988e-05, "loss": 3.701, "step": 9490 
}, { "epoch": 2.494038821558774, "grad_norm": 0.8780881762504578, "learning_rate": 1.686525319782723e-05, "loss": 3.7039, "step": 9492 }, { "epoch": 2.494564324892436, "grad_norm": 0.8596661686897278, "learning_rate": 1.6847730856842477e-05, "loss": 3.6732, "step": 9494 }, { "epoch": 2.4950898282260976, "grad_norm": 1.005890130996704, "learning_rate": 1.683020851585772e-05, "loss": 3.7262, "step": 9496 }, { "epoch": 2.4956153315597596, "grad_norm": 0.8638002276420593, "learning_rate": 1.681268617487296e-05, "loss": 3.6877, "step": 9498 }, { "epoch": 2.4961408348934215, "grad_norm": 0.8647028207778931, "learning_rate": 1.679516383388821e-05, "loss": 3.6882, "step": 9500 }, { "epoch": 2.496666338227083, "grad_norm": 0.9320617914199829, "learning_rate": 1.6777641492903453e-05, "loss": 3.7276, "step": 9502 }, { "epoch": 2.497191841560745, "grad_norm": 1.0194247961044312, "learning_rate": 1.6760119151918697e-05, "loss": 3.6913, "step": 9504 }, { "epoch": 2.4977173448944066, "grad_norm": 0.8777153491973877, "learning_rate": 1.674259681093394e-05, "loss": 3.6814, "step": 9506 }, { "epoch": 2.4982428482280685, "grad_norm": 0.9306923747062683, "learning_rate": 1.6725074469949186e-05, "loss": 3.7356, "step": 9508 }, { "epoch": 2.49876835156173, "grad_norm": 0.8997367024421692, "learning_rate": 1.670755212896443e-05, "loss": 3.6817, "step": 9510 }, { "epoch": 2.499293854895392, "grad_norm": 0.915833592414856, "learning_rate": 1.6690029787979674e-05, "loss": 3.7598, "step": 9512 }, { "epoch": 2.499819358229054, "grad_norm": 0.8572295308113098, "learning_rate": 1.6672507446994918e-05, "loss": 3.6841, "step": 9514 }, { "epoch": 2.5003448615627155, "grad_norm": 0.9579532146453857, "learning_rate": 1.6654985106010162e-05, "loss": 3.6799, "step": 9516 }, { "epoch": 2.500870364896377, "grad_norm": 0.8320680260658264, "learning_rate": 1.6637462765025407e-05, "loss": 3.6966, "step": 9518 }, { "epoch": 2.501395868230039, "grad_norm": 0.8987518548965454, "learning_rate": 
1.6619940424040654e-05, "loss": 3.6752, "step": 9520 }, { "epoch": 2.501921371563701, "grad_norm": 0.9812500476837158, "learning_rate": 1.66024180830559e-05, "loss": 3.6894, "step": 9522 }, { "epoch": 2.5024468748973625, "grad_norm": 0.8646553158760071, "learning_rate": 1.6584895742071143e-05, "loss": 3.7071, "step": 9524 }, { "epoch": 2.5029723782310245, "grad_norm": 1.0309051275253296, "learning_rate": 1.6567373401086387e-05, "loss": 3.7094, "step": 9526 }, { "epoch": 2.5034978815646864, "grad_norm": 1.0596919059753418, "learning_rate": 1.654985106010163e-05, "loss": 3.644, "step": 9528 }, { "epoch": 2.504023384898348, "grad_norm": 0.9405785799026489, "learning_rate": 1.6532328719116875e-05, "loss": 3.6755, "step": 9530 }, { "epoch": 2.5045488882320095, "grad_norm": 0.9220551252365112, "learning_rate": 1.651480637813212e-05, "loss": 3.718, "step": 9532 }, { "epoch": 2.5050743915656715, "grad_norm": 0.9931498169898987, "learning_rate": 1.6497284037147363e-05, "loss": 3.7233, "step": 9534 }, { "epoch": 2.5055998948993334, "grad_norm": 0.8918694853782654, "learning_rate": 1.6479761696162608e-05, "loss": 3.7383, "step": 9536 }, { "epoch": 2.506125398232995, "grad_norm": 0.8692755699157715, "learning_rate": 1.6462239355177852e-05, "loss": 3.6824, "step": 9538 }, { "epoch": 2.506650901566657, "grad_norm": 0.8515539765357971, "learning_rate": 1.64447170141931e-05, "loss": 3.7508, "step": 9540 }, { "epoch": 2.5071764049003185, "grad_norm": 0.9064961075782776, "learning_rate": 1.642719467320834e-05, "loss": 3.7001, "step": 9542 }, { "epoch": 2.5077019082339804, "grad_norm": 0.9601085186004639, "learning_rate": 1.6409672332223584e-05, "loss": 3.6738, "step": 9544 }, { "epoch": 2.508227411567642, "grad_norm": 0.8916100263595581, "learning_rate": 1.6392149991238832e-05, "loss": 3.7117, "step": 9546 }, { "epoch": 2.508752914901304, "grad_norm": 0.9225867390632629, "learning_rate": 1.6374627650254073e-05, "loss": 3.6875, "step": 9548 }, { "epoch": 2.509278418234966, 
"grad_norm": 0.9159874320030212, "learning_rate": 1.635710530926932e-05, "loss": 3.6925, "step": 9550 }, { "epoch": 2.5098039215686274, "grad_norm": 0.937225341796875, "learning_rate": 1.6339582968284564e-05, "loss": 3.7386, "step": 9552 }, { "epoch": 2.5103294249022894, "grad_norm": 1.1530628204345703, "learning_rate": 1.6322060627299805e-05, "loss": 3.7021, "step": 9554 }, { "epoch": 2.510854928235951, "grad_norm": 0.9560359120368958, "learning_rate": 1.6304538286315053e-05, "loss": 3.7538, "step": 9556 }, { "epoch": 2.511380431569613, "grad_norm": 0.9009852409362793, "learning_rate": 1.6287015945330297e-05, "loss": 3.7447, "step": 9558 }, { "epoch": 2.5119059349032744, "grad_norm": 0.928413450717926, "learning_rate": 1.626949360434554e-05, "loss": 3.7127, "step": 9560 }, { "epoch": 2.5124314382369364, "grad_norm": 0.8790431022644043, "learning_rate": 1.6251971263360785e-05, "loss": 3.6903, "step": 9562 }, { "epoch": 2.5129569415705983, "grad_norm": 0.8588958978652954, "learning_rate": 1.623444892237603e-05, "loss": 3.689, "step": 9564 }, { "epoch": 2.51348244490426, "grad_norm": 0.9791207313537598, "learning_rate": 1.6216926581391277e-05, "loss": 3.6528, "step": 9566 }, { "epoch": 2.5140079482379214, "grad_norm": 0.9527714252471924, "learning_rate": 1.6199404240406518e-05, "loss": 3.7098, "step": 9568 }, { "epoch": 2.5145334515715834, "grad_norm": 0.9902021288871765, "learning_rate": 1.6181881899421762e-05, "loss": 3.6915, "step": 9570 }, { "epoch": 2.5150589549052453, "grad_norm": 1.0812225341796875, "learning_rate": 1.616435955843701e-05, "loss": 3.7006, "step": 9572 }, { "epoch": 2.515584458238907, "grad_norm": 1.0665171146392822, "learning_rate": 1.614683721745225e-05, "loss": 3.7014, "step": 9574 }, { "epoch": 2.516109961572569, "grad_norm": 0.8948612809181213, "learning_rate": 1.6129314876467498e-05, "loss": 3.6954, "step": 9576 }, { "epoch": 2.5166354649062304, "grad_norm": 0.9153848886489868, "learning_rate": 1.6111792535482742e-05, "loss": 3.675, 
"step": 9578 }, { "epoch": 2.5171609682398923, "grad_norm": 0.8915504813194275, "learning_rate": 1.6094270194497986e-05, "loss": 3.6988, "step": 9580 }, { "epoch": 2.517686471573554, "grad_norm": 0.91746586561203, "learning_rate": 1.607674785351323e-05, "loss": 3.6764, "step": 9582 }, { "epoch": 2.518211974907216, "grad_norm": 0.9000945091247559, "learning_rate": 1.6059225512528475e-05, "loss": 3.7381, "step": 9584 }, { "epoch": 2.518737478240878, "grad_norm": 0.8409349918365479, "learning_rate": 1.604170317154372e-05, "loss": 3.7515, "step": 9586 }, { "epoch": 2.5192629815745393, "grad_norm": 0.9453901052474976, "learning_rate": 1.6024180830558963e-05, "loss": 3.6865, "step": 9588 }, { "epoch": 2.5197884849082013, "grad_norm": 0.891927182674408, "learning_rate": 1.6006658489574207e-05, "loss": 3.6692, "step": 9590 }, { "epoch": 2.520313988241863, "grad_norm": 1.0071780681610107, "learning_rate": 1.5989136148589455e-05, "loss": 3.7218, "step": 9592 }, { "epoch": 2.520839491575525, "grad_norm": 0.9297993779182434, "learning_rate": 1.5971613807604695e-05, "loss": 3.6275, "step": 9594 }, { "epoch": 2.5213649949091863, "grad_norm": 0.8910967111587524, "learning_rate": 1.5954091466619943e-05, "loss": 3.6987, "step": 9596 }, { "epoch": 2.5218904982428483, "grad_norm": 1.0898264646530151, "learning_rate": 1.5936569125635187e-05, "loss": 3.6967, "step": 9598 }, { "epoch": 2.5224160015765102, "grad_norm": 0.9633069038391113, "learning_rate": 1.5919046784650428e-05, "loss": 3.6926, "step": 9600 }, { "epoch": 2.5224160015765102, "eval_loss": 3.7283613681793213, "eval_runtime": 464.7232, "eval_samples_per_second": 262.068, "eval_steps_per_second": 8.19, "step": 9600 }, { "epoch": 2.5229415049101718, "grad_norm": 1.0056136846542358, "learning_rate": 1.5901524443665676e-05, "loss": 3.6994, "step": 9602 }, { "epoch": 2.5234670082438333, "grad_norm": 0.984417736530304, "learning_rate": 1.588400210268092e-05, "loss": 3.6747, "step": 9604 }, { "epoch": 2.5239925115774953, 
"grad_norm": 0.9440913796424866, "learning_rate": 1.5866479761696164e-05, "loss": 3.6884, "step": 9606 }, { "epoch": 2.5245180149111572, "grad_norm": 0.8997028470039368, "learning_rate": 1.5848957420711408e-05, "loss": 3.701, "step": 9608 }, { "epoch": 2.5250435182448188, "grad_norm": 0.8244013786315918, "learning_rate": 1.5831435079726652e-05, "loss": 3.7027, "step": 9610 }, { "epoch": 2.5255690215784807, "grad_norm": 0.8649292588233948, "learning_rate": 1.5813912738741896e-05, "loss": 3.6598, "step": 9612 }, { "epoch": 2.5260945249121423, "grad_norm": 0.958624541759491, "learning_rate": 1.579639039775714e-05, "loss": 3.6766, "step": 9614 }, { "epoch": 2.5266200282458042, "grad_norm": 0.9044724106788635, "learning_rate": 1.5778868056772385e-05, "loss": 3.7211, "step": 9616 }, { "epoch": 2.5271455315794658, "grad_norm": 0.9338341951370239, "learning_rate": 1.576134571578763e-05, "loss": 3.6957, "step": 9618 }, { "epoch": 2.5276710349131277, "grad_norm": 1.0202126502990723, "learning_rate": 1.5743823374802873e-05, "loss": 3.7505, "step": 9620 }, { "epoch": 2.5281965382467897, "grad_norm": 0.9157519936561584, "learning_rate": 1.572630103381812e-05, "loss": 3.6452, "step": 9622 }, { "epoch": 2.528722041580451, "grad_norm": 0.8289288282394409, "learning_rate": 1.5708778692833365e-05, "loss": 3.745, "step": 9624 }, { "epoch": 2.529247544914113, "grad_norm": 0.8515139222145081, "learning_rate": 1.5691256351848606e-05, "loss": 3.7185, "step": 9626 }, { "epoch": 2.5297730482477747, "grad_norm": 1.0447688102722168, "learning_rate": 1.5673734010863853e-05, "loss": 3.6847, "step": 9628 }, { "epoch": 2.5302985515814367, "grad_norm": 0.8450469374656677, "learning_rate": 1.5656211669879097e-05, "loss": 3.6469, "step": 9630 }, { "epoch": 2.530824054915098, "grad_norm": 1.0305235385894775, "learning_rate": 1.563868932889434e-05, "loss": 3.7117, "step": 9632 }, { "epoch": 2.53134955824876, "grad_norm": 0.9950250387191772, "learning_rate": 1.5621166987909586e-05, "loss": 3.6707, 
"step": 9634 }, { "epoch": 2.531875061582422, "grad_norm": 0.8760108947753906, "learning_rate": 1.560364464692483e-05, "loss": 3.6788, "step": 9636 }, { "epoch": 2.5324005649160837, "grad_norm": 0.8308132290840149, "learning_rate": 1.5586122305940074e-05, "loss": 3.7052, "step": 9638 }, { "epoch": 2.532926068249745, "grad_norm": 0.9407414197921753, "learning_rate": 1.5568599964955318e-05, "loss": 3.6569, "step": 9640 }, { "epoch": 2.533451571583407, "grad_norm": 0.9584619998931885, "learning_rate": 1.5551077623970562e-05, "loss": 3.6921, "step": 9642 }, { "epoch": 2.533977074917069, "grad_norm": 0.9499197602272034, "learning_rate": 1.5533555282985807e-05, "loss": 3.7239, "step": 9644 }, { "epoch": 2.5345025782507307, "grad_norm": 1.0083240270614624, "learning_rate": 1.551603294200105e-05, "loss": 3.7094, "step": 9646 }, { "epoch": 2.5350280815843926, "grad_norm": 0.9542543292045593, "learning_rate": 1.54985106010163e-05, "loss": 3.7695, "step": 9648 }, { "epoch": 2.535553584918054, "grad_norm": 0.9464656114578247, "learning_rate": 1.548098826003154e-05, "loss": 3.6892, "step": 9650 }, { "epoch": 2.536079088251716, "grad_norm": 0.8702993392944336, "learning_rate": 1.5463465919046787e-05, "loss": 3.7028, "step": 9652 }, { "epoch": 2.5366045915853777, "grad_norm": 0.9980103969573975, "learning_rate": 1.544594357806203e-05, "loss": 3.6854, "step": 9654 }, { "epoch": 2.5371300949190396, "grad_norm": 0.8537274599075317, "learning_rate": 1.542842123707727e-05, "loss": 3.7417, "step": 9656 }, { "epoch": 2.5376555982527016, "grad_norm": 0.9545750617980957, "learning_rate": 1.541089889609252e-05, "loss": 3.6688, "step": 9658 }, { "epoch": 2.538181101586363, "grad_norm": 0.88575279712677, "learning_rate": 1.5393376555107763e-05, "loss": 3.7281, "step": 9660 }, { "epoch": 2.538706604920025, "grad_norm": 0.9079550504684448, "learning_rate": 1.5375854214123008e-05, "loss": 3.6574, "step": 9662 }, { "epoch": 2.5392321082536866, "grad_norm": 0.9652597308158875, "learning_rate": 
1.5358331873138252e-05, "loss": 3.677, "step": 9664 }, { "epoch": 2.5397576115873486, "grad_norm": 1.0523381233215332, "learning_rate": 1.5340809532153496e-05, "loss": 3.7117, "step": 9666 }, { "epoch": 2.54028311492101, "grad_norm": 0.966945469379425, "learning_rate": 1.5323287191168743e-05, "loss": 3.684, "step": 9668 }, { "epoch": 2.540808618254672, "grad_norm": 0.944392204284668, "learning_rate": 1.5305764850183984e-05, "loss": 3.6646, "step": 9670 }, { "epoch": 2.541334121588334, "grad_norm": 0.993937075138092, "learning_rate": 1.528824250919923e-05, "loss": 3.7046, "step": 9672 }, { "epoch": 2.5418596249219956, "grad_norm": 0.9931727051734924, "learning_rate": 1.5270720168214476e-05, "loss": 3.7052, "step": 9674 }, { "epoch": 2.542385128255657, "grad_norm": 0.9998050928115845, "learning_rate": 1.5253197827229718e-05, "loss": 3.7229, "step": 9676 }, { "epoch": 2.542910631589319, "grad_norm": 0.8878262639045715, "learning_rate": 1.5235675486244963e-05, "loss": 3.7408, "step": 9678 }, { "epoch": 2.543436134922981, "grad_norm": 1.1593143939971924, "learning_rate": 1.5218153145260209e-05, "loss": 3.7337, "step": 9680 }, { "epoch": 2.5439616382566426, "grad_norm": 0.9361653923988342, "learning_rate": 1.5200630804275451e-05, "loss": 3.7481, "step": 9682 }, { "epoch": 2.5444871415903045, "grad_norm": 0.8468132615089417, "learning_rate": 1.5183108463290697e-05, "loss": 3.6471, "step": 9684 }, { "epoch": 2.5450126449239665, "grad_norm": 0.9614148139953613, "learning_rate": 1.5165586122305941e-05, "loss": 3.6571, "step": 9686 }, { "epoch": 2.545538148257628, "grad_norm": 0.8998286724090576, "learning_rate": 1.5148063781321184e-05, "loss": 3.7096, "step": 9688 }, { "epoch": 2.5460636515912896, "grad_norm": 0.9508447647094727, "learning_rate": 1.513054144033643e-05, "loss": 3.7348, "step": 9690 }, { "epoch": 2.5465891549249515, "grad_norm": 0.9000710844993591, "learning_rate": 1.5113019099351675e-05, "loss": 3.6705, "step": 9692 }, { "epoch": 2.5471146582586135, 
"grad_norm": 0.9576178193092346, "learning_rate": 1.509549675836692e-05, "loss": 3.7219, "step": 9694 }, { "epoch": 2.547640161592275, "grad_norm": 0.9227121472358704, "learning_rate": 1.5077974417382162e-05, "loss": 3.7008, "step": 9696 }, { "epoch": 2.548165664925937, "grad_norm": 0.931967556476593, "learning_rate": 1.5060452076397408e-05, "loss": 3.716, "step": 9698 }, { "epoch": 2.5486911682595985, "grad_norm": 1.0434114933013916, "learning_rate": 1.5042929735412654e-05, "loss": 3.745, "step": 9700 }, { "epoch": 2.5492166715932605, "grad_norm": 1.024099588394165, "learning_rate": 1.5025407394427896e-05, "loss": 3.6373, "step": 9702 }, { "epoch": 2.549742174926922, "grad_norm": 0.9442965984344482, "learning_rate": 1.500788505344314e-05, "loss": 3.6961, "step": 9704 }, { "epoch": 2.550267678260584, "grad_norm": 0.9233967065811157, "learning_rate": 1.4990362712458386e-05, "loss": 3.7125, "step": 9706 }, { "epoch": 2.550793181594246, "grad_norm": 1.01677405834198, "learning_rate": 1.4972840371473629e-05, "loss": 3.6594, "step": 9708 }, { "epoch": 2.5513186849279075, "grad_norm": 0.9840818643569946, "learning_rate": 1.4955318030488875e-05, "loss": 3.7031, "step": 9710 }, { "epoch": 2.5518441882615694, "grad_norm": 0.953614354133606, "learning_rate": 1.4937795689504119e-05, "loss": 3.7147, "step": 9712 }, { "epoch": 2.552369691595231, "grad_norm": 0.8345870971679688, "learning_rate": 1.4920273348519361e-05, "loss": 3.6679, "step": 9714 }, { "epoch": 2.552895194928893, "grad_norm": 0.8510845303535461, "learning_rate": 1.4902751007534607e-05, "loss": 3.7009, "step": 9716 }, { "epoch": 2.5534206982625545, "grad_norm": 0.9137921929359436, "learning_rate": 1.4885228666549853e-05, "loss": 3.7012, "step": 9718 }, { "epoch": 2.5539462015962164, "grad_norm": 0.8081386685371399, "learning_rate": 1.4867706325565095e-05, "loss": 3.7516, "step": 9720 }, { "epoch": 2.5544717049298784, "grad_norm": 0.9193587899208069, "learning_rate": 1.485018398458034e-05, "loss": 3.697, "step": 
9722 }, { "epoch": 2.55499720826354, "grad_norm": 0.924156904220581, "learning_rate": 1.4832661643595585e-05, "loss": 3.6801, "step": 9724 }, { "epoch": 2.5555227115972015, "grad_norm": 0.9076676368713379, "learning_rate": 1.4815139302610828e-05, "loss": 3.6973, "step": 9726 }, { "epoch": 2.5560482149308634, "grad_norm": 0.8992161154747009, "learning_rate": 1.4797616961626074e-05, "loss": 3.7459, "step": 9728 }, { "epoch": 2.5565737182645254, "grad_norm": 0.9557196497917175, "learning_rate": 1.4780094620641318e-05, "loss": 3.6582, "step": 9730 }, { "epoch": 2.557099221598187, "grad_norm": 1.076859712600708, "learning_rate": 1.4762572279656564e-05, "loss": 3.7356, "step": 9732 }, { "epoch": 2.557624724931849, "grad_norm": 0.849002480506897, "learning_rate": 1.4745049938671806e-05, "loss": 3.6785, "step": 9734 }, { "epoch": 2.5581502282655104, "grad_norm": 0.9006468653678894, "learning_rate": 1.4727527597687052e-05, "loss": 3.669, "step": 9736 }, { "epoch": 2.5586757315991724, "grad_norm": 0.8962019085884094, "learning_rate": 1.4710005256702296e-05, "loss": 3.6798, "step": 9738 }, { "epoch": 2.559201234932834, "grad_norm": 0.950803816318512, "learning_rate": 1.469248291571754e-05, "loss": 3.6685, "step": 9740 }, { "epoch": 2.559726738266496, "grad_norm": 0.8375423550605774, "learning_rate": 1.4674960574732785e-05, "loss": 3.6623, "step": 9742 }, { "epoch": 2.560252241600158, "grad_norm": 0.8519186973571777, "learning_rate": 1.465743823374803e-05, "loss": 3.7052, "step": 9744 }, { "epoch": 2.5607777449338194, "grad_norm": 0.9090122580528259, "learning_rate": 1.4639915892763273e-05, "loss": 3.72, "step": 9746 }, { "epoch": 2.5613032482674813, "grad_norm": 0.8473878502845764, "learning_rate": 1.4622393551778519e-05, "loss": 3.7494, "step": 9748 }, { "epoch": 2.561828751601143, "grad_norm": 1.022227168083191, "learning_rate": 1.4604871210793763e-05, "loss": 3.6844, "step": 9750 }, { "epoch": 2.562354254934805, "grad_norm": 0.9569261074066162, "learning_rate": 
1.4587348869809006e-05, "loss": 3.6916, "step": 9752 }, { "epoch": 2.5628797582684664, "grad_norm": 0.8973060250282288, "learning_rate": 1.4569826528824251e-05, "loss": 3.7134, "step": 9754 }, { "epoch": 2.5634052616021283, "grad_norm": 0.8476235866546631, "learning_rate": 1.4552304187839497e-05, "loss": 3.7145, "step": 9756 }, { "epoch": 2.5639307649357903, "grad_norm": 1.08950674533844, "learning_rate": 1.453478184685474e-05, "loss": 3.6839, "step": 9758 }, { "epoch": 2.564456268269452, "grad_norm": 0.9377020597457886, "learning_rate": 1.4517259505869984e-05, "loss": 3.6744, "step": 9760 }, { "epoch": 2.5649817716031134, "grad_norm": 0.9350590109825134, "learning_rate": 1.449973716488523e-05, "loss": 3.6793, "step": 9762 }, { "epoch": 2.5655072749367753, "grad_norm": 1.0033185482025146, "learning_rate": 1.4482214823900476e-05, "loss": 3.6747, "step": 9764 }, { "epoch": 2.5660327782704373, "grad_norm": 1.0114998817443848, "learning_rate": 1.4464692482915718e-05, "loss": 3.6893, "step": 9766 }, { "epoch": 2.566558281604099, "grad_norm": 1.0133854150772095, "learning_rate": 1.4447170141930962e-05, "loss": 3.6958, "step": 9768 }, { "epoch": 2.567083784937761, "grad_norm": 0.8981649875640869, "learning_rate": 1.4429647800946208e-05, "loss": 3.6061, "step": 9770 }, { "epoch": 2.5676092882714223, "grad_norm": 0.9347666501998901, "learning_rate": 1.441212545996145e-05, "loss": 3.6646, "step": 9772 }, { "epoch": 2.5681347916050843, "grad_norm": 0.9668712615966797, "learning_rate": 1.4394603118976697e-05, "loss": 3.7269, "step": 9774 }, { "epoch": 2.568660294938746, "grad_norm": 0.8796441555023193, "learning_rate": 1.437708077799194e-05, "loss": 3.6773, "step": 9776 }, { "epoch": 2.569185798272408, "grad_norm": 1.007232666015625, "learning_rate": 1.4359558437007183e-05, "loss": 3.721, "step": 9778 }, { "epoch": 2.5697113016060698, "grad_norm": 0.8891357779502869, "learning_rate": 1.434203609602243e-05, "loss": 3.6917, "step": 9780 }, { "epoch": 2.5702368049397313, 
"grad_norm": 0.936434805393219, "learning_rate": 1.4324513755037675e-05, "loss": 3.6519, "step": 9782 }, { "epoch": 2.5707623082733932, "grad_norm": 0.9008485078811646, "learning_rate": 1.4306991414052918e-05, "loss": 3.6977, "step": 9784 }, { "epoch": 2.5712878116070548, "grad_norm": 0.9572525024414062, "learning_rate": 1.4289469073068162e-05, "loss": 3.7134, "step": 9786 }, { "epoch": 2.5718133149407167, "grad_norm": 1.0709177255630493, "learning_rate": 1.4271946732083408e-05, "loss": 3.6734, "step": 9788 }, { "epoch": 2.5723388182743783, "grad_norm": 0.9051386117935181, "learning_rate": 1.425442439109865e-05, "loss": 3.7299, "step": 9790 }, { "epoch": 2.5728643216080402, "grad_norm": 0.8479617238044739, "learning_rate": 1.4236902050113896e-05, "loss": 3.7536, "step": 9792 }, { "epoch": 2.573389824941702, "grad_norm": 0.9002686142921448, "learning_rate": 1.421937970912914e-05, "loss": 3.6259, "step": 9794 }, { "epoch": 2.5739153282753637, "grad_norm": 0.9239231944084167, "learning_rate": 1.4201857368144383e-05, "loss": 3.6878, "step": 9796 }, { "epoch": 2.5744408316090253, "grad_norm": 0.9403279423713684, "learning_rate": 1.4184335027159628e-05, "loss": 3.7295, "step": 9798 }, { "epoch": 2.5749663349426872, "grad_norm": 0.9229673743247986, "learning_rate": 1.4166812686174874e-05, "loss": 3.7219, "step": 9800 }, { "epoch": 2.575491838276349, "grad_norm": 0.872694194316864, "learning_rate": 1.4149290345190118e-05, "loss": 3.69, "step": 9802 }, { "epoch": 2.5760173416100107, "grad_norm": 0.8557255864143372, "learning_rate": 1.4131768004205363e-05, "loss": 3.6832, "step": 9804 }, { "epoch": 2.5765428449436727, "grad_norm": 0.9544682502746582, "learning_rate": 1.4114245663220607e-05, "loss": 3.7129, "step": 9806 }, { "epoch": 2.577068348277334, "grad_norm": 0.9487378001213074, "learning_rate": 1.4096723322235853e-05, "loss": 3.736, "step": 9808 }, { "epoch": 2.577593851610996, "grad_norm": 0.9955893158912659, "learning_rate": 1.4079200981251095e-05, "loss": 3.6776, 
"step": 9810 }, { "epoch": 2.5781193549446577, "grad_norm": 0.931347131729126, "learning_rate": 1.4061678640266341e-05, "loss": 3.6962, "step": 9812 }, { "epoch": 2.5786448582783197, "grad_norm": 1.1097899675369263, "learning_rate": 1.4044156299281585e-05, "loss": 3.7002, "step": 9814 }, { "epoch": 2.5791703616119817, "grad_norm": 0.9313715696334839, "learning_rate": 1.4026633958296828e-05, "loss": 3.7342, "step": 9816 }, { "epoch": 2.579695864945643, "grad_norm": 0.8298256993293762, "learning_rate": 1.4009111617312074e-05, "loss": 3.7487, "step": 9818 }, { "epoch": 2.580221368279305, "grad_norm": 0.9580847024917603, "learning_rate": 1.399158927632732e-05, "loss": 3.7427, "step": 9820 }, { "epoch": 2.5807468716129667, "grad_norm": 0.9100271463394165, "learning_rate": 1.3974066935342562e-05, "loss": 3.7491, "step": 9822 }, { "epoch": 2.5812723749466286, "grad_norm": 0.8563130497932434, "learning_rate": 1.3956544594357806e-05, "loss": 3.6626, "step": 9824 }, { "epoch": 2.58179787828029, "grad_norm": 0.8639830350875854, "learning_rate": 1.3939022253373052e-05, "loss": 3.6962, "step": 9826 }, { "epoch": 2.582323381613952, "grad_norm": 0.8693371415138245, "learning_rate": 1.3921499912388294e-05, "loss": 3.7249, "step": 9828 }, { "epoch": 2.582848884947614, "grad_norm": 0.9555044770240784, "learning_rate": 1.390397757140354e-05, "loss": 3.7229, "step": 9830 }, { "epoch": 2.5833743882812756, "grad_norm": 0.9275398254394531, "learning_rate": 1.3886455230418784e-05, "loss": 3.6657, "step": 9832 }, { "epoch": 2.583899891614937, "grad_norm": 0.9449527263641357, "learning_rate": 1.386893288943403e-05, "loss": 3.6916, "step": 9834 }, { "epoch": 2.584425394948599, "grad_norm": 0.9332678914070129, "learning_rate": 1.3851410548449273e-05, "loss": 3.7364, "step": 9836 }, { "epoch": 2.584950898282261, "grad_norm": 0.8300040364265442, "learning_rate": 1.3833888207464519e-05, "loss": 3.7009, "step": 9838 }, { "epoch": 2.5854764016159226, "grad_norm": 0.8330608606338501, 
"learning_rate": 1.3816365866479763e-05, "loss": 3.731, "step": 9840 }, { "epoch": 2.5860019049495846, "grad_norm": 0.9651172757148743, "learning_rate": 1.3798843525495005e-05, "loss": 3.6941, "step": 9842 }, { "epoch": 2.5865274082832466, "grad_norm": 0.9613242149353027, "learning_rate": 1.3781321184510251e-05, "loss": 3.7184, "step": 9844 }, { "epoch": 2.587052911616908, "grad_norm": 0.9677256345748901, "learning_rate": 1.3763798843525497e-05, "loss": 3.7056, "step": 9846 }, { "epoch": 2.5875784149505696, "grad_norm": 0.9581161737442017, "learning_rate": 1.374627650254074e-05, "loss": 3.6912, "step": 9848 }, { "epoch": 2.5881039182842316, "grad_norm": 0.9369687438011169, "learning_rate": 1.3728754161555984e-05, "loss": 3.6821, "step": 9850 }, { "epoch": 2.5886294216178936, "grad_norm": 0.9458882212638855, "learning_rate": 1.371123182057123e-05, "loss": 3.6823, "step": 9852 }, { "epoch": 2.589154924951555, "grad_norm": 0.9050375819206238, "learning_rate": 1.3693709479586472e-05, "loss": 3.6878, "step": 9854 }, { "epoch": 2.589680428285217, "grad_norm": 0.8497835397720337, "learning_rate": 1.3676187138601718e-05, "loss": 3.6576, "step": 9856 }, { "epoch": 2.5902059316188786, "grad_norm": 0.8511841297149658, "learning_rate": 1.3658664797616962e-05, "loss": 3.6343, "step": 9858 }, { "epoch": 2.5907314349525405, "grad_norm": 0.9865280389785767, "learning_rate": 1.3641142456632205e-05, "loss": 3.7445, "step": 9860 }, { "epoch": 2.591256938286202, "grad_norm": 0.9034981727600098, "learning_rate": 1.362362011564745e-05, "loss": 3.7155, "step": 9862 }, { "epoch": 2.591782441619864, "grad_norm": 0.9750564694404602, "learning_rate": 1.3606097774662696e-05, "loss": 3.7064, "step": 9864 }, { "epoch": 2.592307944953526, "grad_norm": 0.9126054644584656, "learning_rate": 1.358857543367794e-05, "loss": 3.6742, "step": 9866 }, { "epoch": 2.5928334482871875, "grad_norm": 0.9354395270347595, "learning_rate": 1.3571053092693185e-05, "loss": 3.643, "step": 9868 }, { "epoch": 
2.5933589516208495, "grad_norm": 0.9333736300468445, "learning_rate": 1.3553530751708429e-05, "loss": 3.6699, "step": 9870 }, { "epoch": 2.593884454954511, "grad_norm": 0.8490795493125916, "learning_rate": 1.3536008410723675e-05, "loss": 3.6656, "step": 9872 }, { "epoch": 2.594409958288173, "grad_norm": 0.9665597081184387, "learning_rate": 1.3518486069738917e-05, "loss": 3.6435, "step": 9874 }, { "epoch": 2.5949354616218345, "grad_norm": 0.9001298546791077, "learning_rate": 1.3500963728754163e-05, "loss": 3.6756, "step": 9876 }, { "epoch": 2.5954609649554965, "grad_norm": 0.9630308151245117, "learning_rate": 1.3483441387769407e-05, "loss": 3.7367, "step": 9878 }, { "epoch": 2.5959864682891585, "grad_norm": 1.0364958047866821, "learning_rate": 1.346591904678465e-05, "loss": 3.6891, "step": 9880 }, { "epoch": 2.59651197162282, "grad_norm": 1.1207743883132935, "learning_rate": 1.3448396705799896e-05, "loss": 3.6652, "step": 9882 }, { "epoch": 2.5970374749564815, "grad_norm": 1.0567642450332642, "learning_rate": 1.3430874364815142e-05, "loss": 3.7287, "step": 9884 }, { "epoch": 2.5975629782901435, "grad_norm": 0.8688778281211853, "learning_rate": 1.3413352023830384e-05, "loss": 3.7402, "step": 9886 }, { "epoch": 2.5980884816238055, "grad_norm": 0.9735795259475708, "learning_rate": 1.3395829682845628e-05, "loss": 3.6802, "step": 9888 }, { "epoch": 2.598613984957467, "grad_norm": 0.9442024230957031, "learning_rate": 1.3378307341860874e-05, "loss": 3.7112, "step": 9890 }, { "epoch": 2.599139488291129, "grad_norm": 0.9781617522239685, "learning_rate": 1.3360785000876117e-05, "loss": 3.7377, "step": 9892 }, { "epoch": 2.5996649916247905, "grad_norm": 0.9825936555862427, "learning_rate": 1.3343262659891362e-05, "loss": 3.7721, "step": 9894 }, { "epoch": 2.6001904949584524, "grad_norm": 0.9733375310897827, "learning_rate": 1.3325740318906607e-05, "loss": 3.6657, "step": 9896 }, { "epoch": 2.600715998292114, "grad_norm": 0.8697971105575562, "learning_rate": 
1.3308217977921849e-05, "loss": 3.7346, "step": 9898 }, { "epoch": 2.601241501625776, "grad_norm": 0.8562219142913818, "learning_rate": 1.3290695636937095e-05, "loss": 3.7166, "step": 9900 }, { "epoch": 2.601767004959438, "grad_norm": 0.8206565380096436, "learning_rate": 1.327317329595234e-05, "loss": 3.7062, "step": 9902 }, { "epoch": 2.6022925082930994, "grad_norm": 0.9460352659225464, "learning_rate": 1.3255650954967585e-05, "loss": 3.6621, "step": 9904 }, { "epoch": 2.6028180116267614, "grad_norm": 0.9408266544342041, "learning_rate": 1.3238128613982827e-05, "loss": 3.7195, "step": 9906 }, { "epoch": 2.603343514960423, "grad_norm": 0.9788419008255005, "learning_rate": 1.3220606272998073e-05, "loss": 3.7196, "step": 9908 }, { "epoch": 2.603869018294085, "grad_norm": 0.9472336173057556, "learning_rate": 1.320308393201332e-05, "loss": 3.7095, "step": 9910 }, { "epoch": 2.6043945216277464, "grad_norm": 0.9212231636047363, "learning_rate": 1.3185561591028562e-05, "loss": 3.7008, "step": 9912 }, { "epoch": 2.6049200249614084, "grad_norm": 0.8527092337608337, "learning_rate": 1.3168039250043806e-05, "loss": 3.6921, "step": 9914 }, { "epoch": 2.6054455282950704, "grad_norm": 0.8440414667129517, "learning_rate": 1.3150516909059052e-05, "loss": 3.7453, "step": 9916 }, { "epoch": 2.605971031628732, "grad_norm": 0.9634268283843994, "learning_rate": 1.3132994568074294e-05, "loss": 3.6982, "step": 9918 }, { "epoch": 2.6064965349623934, "grad_norm": 0.9141463041305542, "learning_rate": 1.311547222708954e-05, "loss": 3.6943, "step": 9920 }, { "epoch": 2.6070220382960554, "grad_norm": 0.9585528373718262, "learning_rate": 1.3097949886104784e-05, "loss": 3.6703, "step": 9922 }, { "epoch": 2.6075475416297174, "grad_norm": 0.9491535425186157, "learning_rate": 1.3080427545120027e-05, "loss": 3.7159, "step": 9924 }, { "epoch": 2.608073044963379, "grad_norm": 1.0575439929962158, "learning_rate": 1.3062905204135273e-05, "loss": 3.7073, "step": 9926 }, { "epoch": 2.608598548297041, 
"grad_norm": 0.945391833782196, "learning_rate": 1.3045382863150518e-05, "loss": 3.6644, "step": 9928 }, { "epoch": 2.6091240516307024, "grad_norm": 0.8976839184761047, "learning_rate": 1.3027860522165761e-05, "loss": 3.7023, "step": 9930 }, { "epoch": 2.6096495549643643, "grad_norm": 0.8691785335540771, "learning_rate": 1.3010338181181007e-05, "loss": 3.6546, "step": 9932 }, { "epoch": 2.610175058298026, "grad_norm": 0.8283367156982422, "learning_rate": 1.2992815840196251e-05, "loss": 3.6713, "step": 9934 }, { "epoch": 2.610700561631688, "grad_norm": 0.9940893650054932, "learning_rate": 1.2975293499211497e-05, "loss": 3.7379, "step": 9936 }, { "epoch": 2.61122606496535, "grad_norm": 1.0004079341888428, "learning_rate": 1.295777115822674e-05, "loss": 3.7189, "step": 9938 }, { "epoch": 2.6117515682990113, "grad_norm": 0.96534264087677, "learning_rate": 1.2940248817241985e-05, "loss": 3.7239, "step": 9940 }, { "epoch": 2.6122770716326733, "grad_norm": 0.9618280529975891, "learning_rate": 1.292272647625723e-05, "loss": 3.6762, "step": 9942 }, { "epoch": 2.612802574966335, "grad_norm": 1.0115779638290405, "learning_rate": 1.2905204135272472e-05, "loss": 3.7234, "step": 9944 }, { "epoch": 2.613328078299997, "grad_norm": 1.0110681056976318, "learning_rate": 1.2887681794287718e-05, "loss": 3.68, "step": 9946 }, { "epoch": 2.6138535816336583, "grad_norm": 1.0338388681411743, "learning_rate": 1.2870159453302964e-05, "loss": 3.7265, "step": 9948 }, { "epoch": 2.6143790849673203, "grad_norm": 1.0503895282745361, "learning_rate": 1.2852637112318206e-05, "loss": 3.7153, "step": 9950 }, { "epoch": 2.6149045883009823, "grad_norm": 1.0262867212295532, "learning_rate": 1.283511477133345e-05, "loss": 3.7207, "step": 9952 }, { "epoch": 2.615430091634644, "grad_norm": 0.9356533288955688, "learning_rate": 1.2817592430348696e-05, "loss": 3.6529, "step": 9954 }, { "epoch": 2.6159555949683053, "grad_norm": 0.9604508280754089, "learning_rate": 1.2800070089363939e-05, "loss": 3.7309, 
"step": 9956 }, { "epoch": 2.6164810983019673, "grad_norm": 0.8855718970298767, "learning_rate": 1.2782547748379184e-05, "loss": 3.6916, "step": 9958 }, { "epoch": 2.6170066016356293, "grad_norm": 1.0859256982803345, "learning_rate": 1.2765025407394429e-05, "loss": 3.7039, "step": 9960 }, { "epoch": 2.617532104969291, "grad_norm": 0.9343173503875732, "learning_rate": 1.2747503066409671e-05, "loss": 3.7205, "step": 9962 }, { "epoch": 2.6180576083029528, "grad_norm": 0.977101743221283, "learning_rate": 1.2729980725424917e-05, "loss": 3.6842, "step": 9964 }, { "epoch": 2.6185831116366143, "grad_norm": 0.8996689915657043, "learning_rate": 1.2712458384440163e-05, "loss": 3.6533, "step": 9966 }, { "epoch": 2.6191086149702762, "grad_norm": 0.8608483076095581, "learning_rate": 1.2694936043455405e-05, "loss": 3.7159, "step": 9968 }, { "epoch": 2.6196341183039378, "grad_norm": 1.0180528163909912, "learning_rate": 1.267741370247065e-05, "loss": 3.6975, "step": 9970 }, { "epoch": 2.6201596216375997, "grad_norm": 0.8830701112747192, "learning_rate": 1.2659891361485895e-05, "loss": 3.6911, "step": 9972 }, { "epoch": 2.6206851249712617, "grad_norm": 0.8659114837646484, "learning_rate": 1.2642369020501141e-05, "loss": 3.6491, "step": 9974 }, { "epoch": 2.6212106283049232, "grad_norm": 0.8639798164367676, "learning_rate": 1.2624846679516384e-05, "loss": 3.6466, "step": 9976 }, { "epoch": 2.621736131638585, "grad_norm": 0.8889381289482117, "learning_rate": 1.2607324338531628e-05, "loss": 3.7302, "step": 9978 }, { "epoch": 2.6222616349722467, "grad_norm": 0.9180617332458496, "learning_rate": 1.2589801997546874e-05, "loss": 3.7416, "step": 9980 }, { "epoch": 2.6227871383059087, "grad_norm": 0.9649747014045715, "learning_rate": 1.2572279656562116e-05, "loss": 3.7508, "step": 9982 }, { "epoch": 2.6233126416395702, "grad_norm": 1.0610945224761963, "learning_rate": 1.2554757315577362e-05, "loss": 3.6974, "step": 9984 }, { "epoch": 2.623838144973232, "grad_norm": 0.9198793768882751, 
"learning_rate": 1.2537234974592606e-05, "loss": 3.673, "step": 9986 }, { "epoch": 2.624363648306894, "grad_norm": 0.9472860097885132, "learning_rate": 1.2519712633607849e-05, "loss": 3.7398, "step": 9988 }, { "epoch": 2.6248891516405557, "grad_norm": 0.9854425191879272, "learning_rate": 1.2502190292623095e-05, "loss": 3.6941, "step": 9990 }, { "epoch": 2.625414654974217, "grad_norm": 0.8107910752296448, "learning_rate": 1.2484667951638339e-05, "loss": 3.6838, "step": 9992 }, { "epoch": 2.625940158307879, "grad_norm": 0.8729326725006104, "learning_rate": 1.2467145610653585e-05, "loss": 3.7013, "step": 9994 }, { "epoch": 2.626465661641541, "grad_norm": 0.9053599238395691, "learning_rate": 1.2449623269668829e-05, "loss": 3.7551, "step": 9996 }, { "epoch": 2.6269911649752027, "grad_norm": 0.9456164240837097, "learning_rate": 1.2432100928684073e-05, "loss": 3.6729, "step": 9998 }, { "epoch": 2.6275166683088647, "grad_norm": 0.9853535890579224, "learning_rate": 1.2414578587699317e-05, "loss": 3.7196, "step": 10000 }, { "epoch": 2.6275166683088647, "eval_loss": 3.72115159034729, "eval_runtime": 464.6781, "eval_samples_per_second": 262.093, "eval_steps_per_second": 8.191, "step": 10000 }, { "epoch": 2.6280421716425266, "grad_norm": 0.8679309487342834, "learning_rate": 1.2397056246714561e-05, "loss": 3.7142, "step": 10002 }, { "epoch": 2.628567674976188, "grad_norm": 0.8945440053939819, "learning_rate": 1.2379533905729807e-05, "loss": 3.6767, "step": 10004 }, { "epoch": 2.6290931783098497, "grad_norm": 0.9856880903244019, "learning_rate": 1.2370772735237428e-05, "loss": 3.6367, "step": 10006 }, { "epoch": 2.6296186816435116, "grad_norm": 1.0343891382217407, "learning_rate": 1.2353250394252674e-05, "loss": 3.6695, "step": 10008 }, { "epoch": 2.6301441849771736, "grad_norm": 0.8288314938545227, "learning_rate": 1.2335728053267918e-05, "loss": 3.6775, "step": 10010 }, { "epoch": 2.630669688310835, "grad_norm": 0.9320821166038513, "learning_rate": 1.231820571228316e-05, 
"loss": 3.6962, "step": 10012 }, { "epoch": 2.631195191644497, "grad_norm": 1.0198960304260254, "learning_rate": 1.2300683371298406e-05, "loss": 3.6508, "step": 10014 }, { "epoch": 2.6317206949781586, "grad_norm": 0.9961224794387817, "learning_rate": 1.228316103031365e-05, "loss": 3.7048, "step": 10016 }, { "epoch": 2.6322461983118206, "grad_norm": 0.8792359232902527, "learning_rate": 1.2265638689328896e-05, "loss": 3.6709, "step": 10018 }, { "epoch": 2.632771701645482, "grad_norm": 0.9493117332458496, "learning_rate": 1.2248116348344139e-05, "loss": 3.6625, "step": 10020 }, { "epoch": 2.633297204979144, "grad_norm": 0.907128095626831, "learning_rate": 1.2230594007359383e-05, "loss": 3.7432, "step": 10022 }, { "epoch": 2.633822708312806, "grad_norm": 1.0525599718093872, "learning_rate": 1.2213071666374629e-05, "loss": 3.7059, "step": 10024 }, { "epoch": 2.6343482116464676, "grad_norm": 0.9839680790901184, "learning_rate": 1.2195549325389873e-05, "loss": 3.6905, "step": 10026 }, { "epoch": 2.6348737149801296, "grad_norm": 1.048746943473816, "learning_rate": 1.2178026984405117e-05, "loss": 3.7013, "step": 10028 }, { "epoch": 2.635399218313791, "grad_norm": 0.9262139201164246, "learning_rate": 1.2160504643420361e-05, "loss": 3.702, "step": 10030 }, { "epoch": 2.635924721647453, "grad_norm": 0.8844172358512878, "learning_rate": 1.2142982302435605e-05, "loss": 3.7133, "step": 10032 }, { "epoch": 2.6364502249811146, "grad_norm": 0.9536716341972351, "learning_rate": 1.2125459961450851e-05, "loss": 3.7065, "step": 10034 }, { "epoch": 2.6369757283147766, "grad_norm": 0.83958899974823, "learning_rate": 1.2107937620466095e-05, "loss": 3.7041, "step": 10036 }, { "epoch": 2.6375012316484385, "grad_norm": 0.9618171453475952, "learning_rate": 1.209041527948134e-05, "loss": 3.6632, "step": 10038 }, { "epoch": 2.6380267349821, "grad_norm": 0.9158703684806824, "learning_rate": 1.2072892938496584e-05, "loss": 3.683, "step": 10040 }, { "epoch": 2.6385522383157616, "grad_norm": 
1.0614454746246338, "learning_rate": 1.2055370597511828e-05, "loss": 3.6909, "step": 10042 }, { "epoch": 2.6390777416494235, "grad_norm": 0.9133266806602478, "learning_rate": 1.2037848256527072e-05, "loss": 3.6949, "step": 10044 }, { "epoch": 2.6396032449830855, "grad_norm": 0.9436039328575134, "learning_rate": 1.2020325915542318e-05, "loss": 3.7053, "step": 10046 }, { "epoch": 2.640128748316747, "grad_norm": 1.0347496271133423, "learning_rate": 1.200280357455756e-05, "loss": 3.6546, "step": 10048 }, { "epoch": 2.640654251650409, "grad_norm": 0.9012948274612427, "learning_rate": 1.1985281233572806e-05, "loss": 3.6744, "step": 10050 }, { "epoch": 2.6411797549840705, "grad_norm": 0.9978151917457581, "learning_rate": 1.196775889258805e-05, "loss": 3.7588, "step": 10052 }, { "epoch": 2.6417052583177325, "grad_norm": 0.9468878507614136, "learning_rate": 1.1950236551603295e-05, "loss": 3.6549, "step": 10054 }, { "epoch": 2.642230761651394, "grad_norm": 0.9160395860671997, "learning_rate": 1.193271421061854e-05, "loss": 3.6612, "step": 10056 }, { "epoch": 2.642756264985056, "grad_norm": 0.9914864301681519, "learning_rate": 1.1915191869633783e-05, "loss": 3.6879, "step": 10058 }, { "epoch": 2.643281768318718, "grad_norm": 0.8442057967185974, "learning_rate": 1.1897669528649027e-05, "loss": 3.6661, "step": 10060 }, { "epoch": 2.6438072716523795, "grad_norm": 0.9715619683265686, "learning_rate": 1.1880147187664273e-05, "loss": 3.6419, "step": 10062 }, { "epoch": 2.6443327749860415, "grad_norm": 0.9587726593017578, "learning_rate": 1.1862624846679517e-05, "loss": 3.7156, "step": 10064 }, { "epoch": 2.644858278319703, "grad_norm": 0.9682067036628723, "learning_rate": 1.1845102505694761e-05, "loss": 3.6633, "step": 10066 }, { "epoch": 2.645383781653365, "grad_norm": 0.9370916485786438, "learning_rate": 1.1827580164710006e-05, "loss": 3.7076, "step": 10068 }, { "epoch": 2.6459092849870265, "grad_norm": 0.8465678691864014, "learning_rate": 1.181005782372525e-05, "loss": 3.6788, 
"step": 10070 }, { "epoch": 2.6464347883206885, "grad_norm": 0.8462715744972229, "learning_rate": 1.1792535482740496e-05, "loss": 3.6765, "step": 10072 }, { "epoch": 2.6469602916543504, "grad_norm": 1.03636634349823, "learning_rate": 1.177501314175574e-05, "loss": 3.7125, "step": 10074 }, { "epoch": 2.647485794988012, "grad_norm": 0.8843904137611389, "learning_rate": 1.1757490800770982e-05, "loss": 3.7208, "step": 10076 }, { "epoch": 2.6480112983216735, "grad_norm": 1.036474347114563, "learning_rate": 1.1739968459786228e-05, "loss": 3.687, "step": 10078 }, { "epoch": 2.6485368016553354, "grad_norm": 0.9376391768455505, "learning_rate": 1.1722446118801472e-05, "loss": 3.7164, "step": 10080 }, { "epoch": 2.6490623049889974, "grad_norm": 1.0007855892181396, "learning_rate": 1.1704923777816718e-05, "loss": 3.6547, "step": 10082 }, { "epoch": 2.649587808322659, "grad_norm": 0.9307159185409546, "learning_rate": 1.168740143683196e-05, "loss": 3.7204, "step": 10084 }, { "epoch": 2.650113311656321, "grad_norm": 0.8868526220321655, "learning_rate": 1.1669879095847205e-05, "loss": 3.6709, "step": 10086 }, { "epoch": 2.6506388149899824, "grad_norm": 0.9151231646537781, "learning_rate": 1.165235675486245e-05, "loss": 3.6801, "step": 10088 }, { "epoch": 2.6511643183236444, "grad_norm": 0.9036390781402588, "learning_rate": 1.1634834413877695e-05, "loss": 3.7259, "step": 10090 }, { "epoch": 2.651689821657306, "grad_norm": 0.9958227276802063, "learning_rate": 1.1617312072892939e-05, "loss": 3.687, "step": 10092 }, { "epoch": 2.652215324990968, "grad_norm": 0.867935299873352, "learning_rate": 1.1599789731908183e-05, "loss": 3.7176, "step": 10094 }, { "epoch": 2.65274082832463, "grad_norm": 0.8560516834259033, "learning_rate": 1.1582267390923427e-05, "loss": 3.7481, "step": 10096 }, { "epoch": 2.6532663316582914, "grad_norm": 0.9936392903327942, "learning_rate": 1.1564745049938672e-05, "loss": 3.7045, "step": 10098 }, { "epoch": 2.6537918349919534, "grad_norm": 0.865986704826355, 
"learning_rate": 1.1547222708953917e-05, "loss": 3.7016, "step": 10100 }, { "epoch": 2.654317338325615, "grad_norm": 0.8805081844329834, "learning_rate": 1.1529700367969162e-05, "loss": 3.7384, "step": 10102 }, { "epoch": 2.654842841659277, "grad_norm": 0.9348127245903015, "learning_rate": 1.1512178026984406e-05, "loss": 3.6834, "step": 10104 }, { "epoch": 2.6553683449929384, "grad_norm": 0.9048742651939392, "learning_rate": 1.149465568599965e-05, "loss": 3.6764, "step": 10106 }, { "epoch": 2.6558938483266004, "grad_norm": 0.9185057282447815, "learning_rate": 1.1477133345014894e-05, "loss": 3.6772, "step": 10108 }, { "epoch": 2.6564193516602623, "grad_norm": 0.965516209602356, "learning_rate": 1.145961100403014e-05, "loss": 3.6726, "step": 10110 }, { "epoch": 2.656944854993924, "grad_norm": 0.9283637404441833, "learning_rate": 1.1442088663045382e-05, "loss": 3.7116, "step": 10112 }, { "epoch": 2.6574703583275854, "grad_norm": 0.8420938849449158, "learning_rate": 1.1424566322060627e-05, "loss": 3.6734, "step": 10114 }, { "epoch": 2.6579958616612473, "grad_norm": 0.9245956540107727, "learning_rate": 1.1407043981075873e-05, "loss": 3.6676, "step": 10116 }, { "epoch": 2.6585213649949093, "grad_norm": 1.0574558973312378, "learning_rate": 1.1389521640091117e-05, "loss": 3.7179, "step": 10118 }, { "epoch": 2.659046868328571, "grad_norm": 0.993865430355072, "learning_rate": 1.1371999299106363e-05, "loss": 3.6883, "step": 10120 }, { "epoch": 2.659572371662233, "grad_norm": 0.9160903096199036, "learning_rate": 1.1354476958121605e-05, "loss": 3.6875, "step": 10122 }, { "epoch": 2.6600978749958943, "grad_norm": 0.9720809459686279, "learning_rate": 1.133695461713685e-05, "loss": 3.6945, "step": 10124 }, { "epoch": 2.6606233783295563, "grad_norm": 0.9858656525611877, "learning_rate": 1.1319432276152095e-05, "loss": 3.6732, "step": 10126 }, { "epoch": 2.661148881663218, "grad_norm": 0.9938924312591553, "learning_rate": 1.130190993516734e-05, "loss": 3.7096, "step": 10128 }, { 
"epoch": 2.66167438499688, "grad_norm": 0.9017789363861084, "learning_rate": 1.1284387594182583e-05, "loss": 3.6915, "step": 10130 }, { "epoch": 2.6621998883305418, "grad_norm": 1.017029881477356, "learning_rate": 1.1266865253197828e-05, "loss": 3.6957, "step": 10132 }, { "epoch": 2.6627253916642033, "grad_norm": 0.8993884325027466, "learning_rate": 1.1249342912213072e-05, "loss": 3.684, "step": 10134 }, { "epoch": 2.6632508949978653, "grad_norm": 0.9343595504760742, "learning_rate": 1.1231820571228318e-05, "loss": 3.6297, "step": 10136 }, { "epoch": 2.663776398331527, "grad_norm": 0.9336422681808472, "learning_rate": 1.1214298230243562e-05, "loss": 3.7181, "step": 10138 }, { "epoch": 2.6643019016651888, "grad_norm": 0.9886291027069092, "learning_rate": 1.1196775889258804e-05, "loss": 3.7462, "step": 10140 }, { "epoch": 2.6648274049988503, "grad_norm": 0.9055659174919128, "learning_rate": 1.117925354827405e-05, "loss": 3.6665, "step": 10142 }, { "epoch": 2.6653529083325123, "grad_norm": 0.8874049186706543, "learning_rate": 1.1161731207289294e-05, "loss": 3.6863, "step": 10144 }, { "epoch": 2.6658784116661742, "grad_norm": 0.9376559257507324, "learning_rate": 1.1144208866304539e-05, "loss": 3.7215, "step": 10146 }, { "epoch": 2.6664039149998358, "grad_norm": 0.899238109588623, "learning_rate": 1.1126686525319783e-05, "loss": 3.7156, "step": 10148 }, { "epoch": 2.6669294183334973, "grad_norm": 0.9557496905326843, "learning_rate": 1.1109164184335027e-05, "loss": 3.6721, "step": 10150 }, { "epoch": 2.6674549216671593, "grad_norm": 1.0139877796173096, "learning_rate": 1.1091641843350273e-05, "loss": 3.6738, "step": 10152 }, { "epoch": 2.667980425000821, "grad_norm": 0.9075080752372742, "learning_rate": 1.1074119502365517e-05, "loss": 3.6648, "step": 10154 }, { "epoch": 2.6685059283344827, "grad_norm": 0.9366153478622437, "learning_rate": 1.1056597161380761e-05, "loss": 3.687, "step": 10156 }, { "epoch": 2.6690314316681447, "grad_norm": 0.8861843347549438, 
"learning_rate": 1.1039074820396005e-05, "loss": 3.7349, "step": 10158 }, { "epoch": 2.6695569350018067, "grad_norm": 0.9086836576461792, "learning_rate": 1.102155247941125e-05, "loss": 3.6979, "step": 10160 }, { "epoch": 2.670082438335468, "grad_norm": 0.9736440181732178, "learning_rate": 1.1004030138426494e-05, "loss": 3.7279, "step": 10162 }, { "epoch": 2.6706079416691297, "grad_norm": 0.8877704739570618, "learning_rate": 1.098650779744174e-05, "loss": 3.7209, "step": 10164 }, { "epoch": 2.6711334450027917, "grad_norm": 0.8780971169471741, "learning_rate": 1.0968985456456984e-05, "loss": 3.7269, "step": 10166 }, { "epoch": 2.6716589483364537, "grad_norm": 0.9103610515594482, "learning_rate": 1.0951463115472228e-05, "loss": 3.6757, "step": 10168 }, { "epoch": 2.672184451670115, "grad_norm": 0.886658251285553, "learning_rate": 1.0933940774487472e-05, "loss": 3.6582, "step": 10170 }, { "epoch": 2.672709955003777, "grad_norm": 0.8961069583892822, "learning_rate": 1.0916418433502716e-05, "loss": 3.6982, "step": 10172 }, { "epoch": 2.6732354583374387, "grad_norm": 0.9464656710624695, "learning_rate": 1.0898896092517962e-05, "loss": 3.7079, "step": 10174 }, { "epoch": 2.6737609616711007, "grad_norm": 0.9361804127693176, "learning_rate": 1.0881373751533205e-05, "loss": 3.7035, "step": 10176 }, { "epoch": 2.674286465004762, "grad_norm": 0.9802885055541992, "learning_rate": 1.0863851410548449e-05, "loss": 3.7062, "step": 10178 }, { "epoch": 2.674811968338424, "grad_norm": 0.8911750316619873, "learning_rate": 1.0846329069563695e-05, "loss": 3.6954, "step": 10180 }, { "epoch": 2.675337471672086, "grad_norm": 0.8446317315101624, "learning_rate": 1.0828806728578939e-05, "loss": 3.6545, "step": 10182 }, { "epoch": 2.6758629750057477, "grad_norm": 1.0007728338241577, "learning_rate": 1.0811284387594183e-05, "loss": 3.6721, "step": 10184 }, { "epoch": 2.6763884783394096, "grad_norm": 0.9842635989189148, "learning_rate": 1.0793762046609427e-05, "loss": 3.6865, "step": 10186 }, { 
"epoch": 2.676913981673071, "grad_norm": 0.8757641911506653, "learning_rate": 1.0776239705624671e-05, "loss": 3.7184, "step": 10188 }, { "epoch": 2.677439485006733, "grad_norm": 0.922672688961029, "learning_rate": 1.0758717364639917e-05, "loss": 3.7382, "step": 10190 }, { "epoch": 2.6779649883403946, "grad_norm": 0.9654558300971985, "learning_rate": 1.0741195023655161e-05, "loss": 3.6891, "step": 10192 }, { "epoch": 2.6784904916740566, "grad_norm": 1.0114730596542358, "learning_rate": 1.0723672682670406e-05, "loss": 3.7476, "step": 10194 }, { "epoch": 2.6790159950077186, "grad_norm": 0.9443972110748291, "learning_rate": 1.070615034168565e-05, "loss": 3.6701, "step": 10196 }, { "epoch": 2.67954149834138, "grad_norm": 0.8853787183761597, "learning_rate": 1.0688628000700894e-05, "loss": 3.6629, "step": 10198 }, { "epoch": 2.6800670016750416, "grad_norm": 0.8798631429672241, "learning_rate": 1.0671105659716138e-05, "loss": 3.7219, "step": 10200 }, { "epoch": 2.6805925050087036, "grad_norm": 0.9124144911766052, "learning_rate": 1.0653583318731384e-05, "loss": 3.7051, "step": 10202 }, { "epoch": 2.6811180083423656, "grad_norm": 0.8644458055496216, "learning_rate": 1.0636060977746626e-05, "loss": 3.7032, "step": 10204 }, { "epoch": 2.681643511676027, "grad_norm": 0.9625971913337708, "learning_rate": 1.0618538636761872e-05, "loss": 3.6799, "step": 10206 }, { "epoch": 2.682169015009689, "grad_norm": 1.0161288976669312, "learning_rate": 1.0601016295777116e-05, "loss": 3.6845, "step": 10208 }, { "epoch": 2.6826945183433506, "grad_norm": 0.8555217981338501, "learning_rate": 1.058349395479236e-05, "loss": 3.6936, "step": 10210 }, { "epoch": 2.6832200216770126, "grad_norm": 0.9446887373924255, "learning_rate": 1.0565971613807605e-05, "loss": 3.6737, "step": 10212 }, { "epoch": 2.683745525010674, "grad_norm": 0.8425707221031189, "learning_rate": 1.0548449272822849e-05, "loss": 3.7008, "step": 10214 }, { "epoch": 2.684271028344336, "grad_norm": 0.9931196570396423, "learning_rate": 
1.0530926931838093e-05, "loss": 3.708, "step": 10216 }, { "epoch": 2.684796531677998, "grad_norm": 0.8752531409263611, "learning_rate": 1.0513404590853339e-05, "loss": 3.7297, "step": 10218 }, { "epoch": 2.6853220350116596, "grad_norm": 0.8659684658050537, "learning_rate": 1.0495882249868583e-05, "loss": 3.7326, "step": 10220 }, { "epoch": 2.6858475383453215, "grad_norm": 0.92374187707901, "learning_rate": 1.0478359908883827e-05, "loss": 3.6806, "step": 10222 }, { "epoch": 2.686373041678983, "grad_norm": 0.9552136659622192, "learning_rate": 1.0460837567899072e-05, "loss": 3.6865, "step": 10224 }, { "epoch": 2.686898545012645, "grad_norm": 0.895277202129364, "learning_rate": 1.0443315226914316e-05, "loss": 3.6673, "step": 10226 }, { "epoch": 2.6874240483463065, "grad_norm": 0.8524205684661865, "learning_rate": 1.0425792885929562e-05, "loss": 3.657, "step": 10228 }, { "epoch": 2.6879495516799685, "grad_norm": 1.0449833869934082, "learning_rate": 1.0408270544944806e-05, "loss": 3.6703, "step": 10230 }, { "epoch": 2.6884750550136305, "grad_norm": 0.864609956741333, "learning_rate": 1.0390748203960048e-05, "loss": 3.6814, "step": 10232 }, { "epoch": 2.689000558347292, "grad_norm": 0.923702597618103, "learning_rate": 1.0373225862975294e-05, "loss": 3.6578, "step": 10234 }, { "epoch": 2.6895260616809535, "grad_norm": 1.1111561059951782, "learning_rate": 1.0355703521990538e-05, "loss": 3.6859, "step": 10236 }, { "epoch": 2.6900515650146155, "grad_norm": 0.9622791409492493, "learning_rate": 1.0338181181005784e-05, "loss": 3.631, "step": 10238 }, { "epoch": 2.6905770683482775, "grad_norm": 0.9890488386154175, "learning_rate": 1.0320658840021027e-05, "loss": 3.652, "step": 10240 }, { "epoch": 2.691102571681939, "grad_norm": 0.9375513195991516, "learning_rate": 1.030313649903627e-05, "loss": 3.7006, "step": 10242 }, { "epoch": 2.691628075015601, "grad_norm": 0.9777855277061462, "learning_rate": 1.0285614158051517e-05, "loss": 3.6906, "step": 10244 }, { "epoch": 
2.6921535783492625, "grad_norm": 0.9447195529937744, "learning_rate": 1.0268091817066761e-05, "loss": 3.6751, "step": 10246 }, { "epoch": 2.6926790816829245, "grad_norm": 1.0049893856048584, "learning_rate": 1.0250569476082005e-05, "loss": 3.6782, "step": 10248 }, { "epoch": 2.693204585016586, "grad_norm": 0.8750045895576477, "learning_rate": 1.023304713509725e-05, "loss": 3.6724, "step": 10250 }, { "epoch": 2.693730088350248, "grad_norm": 0.9818208813667297, "learning_rate": 1.0215524794112493e-05, "loss": 3.7102, "step": 10252 }, { "epoch": 2.69425559168391, "grad_norm": 0.8788564205169678, "learning_rate": 1.0198002453127738e-05, "loss": 3.6766, "step": 10254 }, { "epoch": 2.6947810950175715, "grad_norm": 0.8795196413993835, "learning_rate": 1.0180480112142983e-05, "loss": 3.7176, "step": 10256 }, { "epoch": 2.6953065983512334, "grad_norm": 0.8845745921134949, "learning_rate": 1.0162957771158228e-05, "loss": 3.7009, "step": 10258 }, { "epoch": 2.695832101684895, "grad_norm": 0.9882046580314636, "learning_rate": 1.0145435430173472e-05, "loss": 3.6327, "step": 10260 }, { "epoch": 2.696357605018557, "grad_norm": 1.0080952644348145, "learning_rate": 1.0127913089188716e-05, "loss": 3.6775, "step": 10262 }, { "epoch": 2.6968831083522184, "grad_norm": 0.9175651669502258, "learning_rate": 1.011039074820396e-05, "loss": 3.6795, "step": 10264 }, { "epoch": 2.6974086116858804, "grad_norm": 0.9124990701675415, "learning_rate": 1.0092868407219206e-05, "loss": 3.7061, "step": 10266 }, { "epoch": 2.6979341150195424, "grad_norm": 0.955845832824707, "learning_rate": 1.0075346066234448e-05, "loss": 3.7028, "step": 10268 }, { "epoch": 2.698459618353204, "grad_norm": 1.0689771175384521, "learning_rate": 1.0057823725249693e-05, "loss": 3.7199, "step": 10270 }, { "epoch": 2.6989851216868654, "grad_norm": 0.9086296558380127, "learning_rate": 1.0040301384264939e-05, "loss": 3.7157, "step": 10272 }, { "epoch": 2.6995106250205274, "grad_norm": 0.9811961650848389, "learning_rate": 
1.0022779043280183e-05, "loss": 3.6732, "step": 10274 }, { "epoch": 2.7000361283541894, "grad_norm": 0.9093260169029236, "learning_rate": 1.0005256702295427e-05, "loss": 3.7368, "step": 10276 }, { "epoch": 2.700561631687851, "grad_norm": 0.9529862403869629, "learning_rate": 9.987734361310671e-06, "loss": 3.6416, "step": 10278 }, { "epoch": 2.701087135021513, "grad_norm": 1.0003858804702759, "learning_rate": 9.970212020325915e-06, "loss": 3.698, "step": 10280 }, { "epoch": 2.7016126383551744, "grad_norm": 1.025040626525879, "learning_rate": 9.952689679341161e-06, "loss": 3.7183, "step": 10282 }, { "epoch": 2.7021381416888364, "grad_norm": 0.9975308179855347, "learning_rate": 9.935167338356405e-06, "loss": 3.7198, "step": 10284 }, { "epoch": 2.702663645022498, "grad_norm": 1.12396240234375, "learning_rate": 9.91764499737165e-06, "loss": 3.6706, "step": 10286 }, { "epoch": 2.70318914835616, "grad_norm": 0.9954589009284973, "learning_rate": 9.900122656386894e-06, "loss": 3.6974, "step": 10288 }, { "epoch": 2.703714651689822, "grad_norm": 1.1256235837936401, "learning_rate": 9.882600315402138e-06, "loss": 3.595, "step": 10290 }, { "epoch": 2.7042401550234834, "grad_norm": 1.085795283317566, "learning_rate": 9.865077974417384e-06, "loss": 3.7171, "step": 10292 }, { "epoch": 2.7047656583571453, "grad_norm": 1.1149314641952515, "learning_rate": 9.847555633432628e-06, "loss": 3.7309, "step": 10294 }, { "epoch": 2.705291161690807, "grad_norm": 0.998899519443512, "learning_rate": 9.83003329244787e-06, "loss": 3.653, "step": 10296 }, { "epoch": 2.705816665024469, "grad_norm": 0.8884541988372803, "learning_rate": 9.812510951463116e-06, "loss": 3.682, "step": 10298 }, { "epoch": 2.7063421683581304, "grad_norm": 0.9578174948692322, "learning_rate": 9.79498861047836e-06, "loss": 3.7439, "step": 10300 }, { "epoch": 2.7068676716917923, "grad_norm": 0.9565279483795166, "learning_rate": 9.777466269493605e-06, "loss": 3.6521, "step": 10302 }, { "epoch": 2.7073931750254543, "grad_norm": 
0.9345782995223999, "learning_rate": 9.759943928508849e-06, "loss": 3.6895, "step": 10304 }, { "epoch": 2.707918678359116, "grad_norm": 0.9053789377212524, "learning_rate": 9.742421587524093e-06, "loss": 3.704, "step": 10306 }, { "epoch": 2.7084441816927773, "grad_norm": 0.853901207447052, "learning_rate": 9.724899246539339e-06, "loss": 3.6901, "step": 10308 }, { "epoch": 2.7089696850264393, "grad_norm": 0.9180251359939575, "learning_rate": 9.707376905554583e-06, "loss": 3.6667, "step": 10310 }, { "epoch": 2.7094951883601013, "grad_norm": 0.9428471326828003, "learning_rate": 9.689854564569827e-06, "loss": 3.6885, "step": 10312 }, { "epoch": 2.710020691693763, "grad_norm": 0.9138020277023315, "learning_rate": 9.672332223585071e-06, "loss": 3.6964, "step": 10314 }, { "epoch": 2.7105461950274248, "grad_norm": 0.8406910300254822, "learning_rate": 9.654809882600315e-06, "loss": 3.6929, "step": 10316 }, { "epoch": 2.7110716983610867, "grad_norm": 0.90681391954422, "learning_rate": 9.63728754161556e-06, "loss": 3.6277, "step": 10318 }, { "epoch": 2.7115972016947483, "grad_norm": 1.0094361305236816, "learning_rate": 9.619765200630806e-06, "loss": 3.7191, "step": 10320 }, { "epoch": 2.71212270502841, "grad_norm": 0.8878645300865173, "learning_rate": 9.60224285964605e-06, "loss": 3.7185, "step": 10322 }, { "epoch": 2.7126482083620718, "grad_norm": 0.840975284576416, "learning_rate": 9.584720518661294e-06, "loss": 3.6948, "step": 10324 }, { "epoch": 2.7131737116957337, "grad_norm": 0.9520653486251831, "learning_rate": 9.567198177676538e-06, "loss": 3.7308, "step": 10326 }, { "epoch": 2.7136992150293953, "grad_norm": 0.8802115321159363, "learning_rate": 9.549675836691782e-06, "loss": 3.7122, "step": 10328 }, { "epoch": 2.7142247183630572, "grad_norm": 0.9791972041130066, "learning_rate": 9.532153495707028e-06, "loss": 3.675, "step": 10330 }, { "epoch": 2.7147502216967188, "grad_norm": 0.8100082278251648, "learning_rate": 9.51463115472227e-06, "loss": 3.6654, "step": 10332 }, { 
"epoch": 2.7152757250303807, "grad_norm": 0.9094449877738953, "learning_rate": 9.497108813737515e-06, "loss": 3.7027, "step": 10334 }, { "epoch": 2.7158012283640423, "grad_norm": 0.9098424315452576, "learning_rate": 9.47958647275276e-06, "loss": 3.6958, "step": 10336 }, { "epoch": 2.7163267316977042, "grad_norm": 0.8630164861679077, "learning_rate": 9.462064131768005e-06, "loss": 3.6915, "step": 10338 }, { "epoch": 2.716852235031366, "grad_norm": 0.9861599206924438, "learning_rate": 9.444541790783249e-06, "loss": 3.6804, "step": 10340 }, { "epoch": 2.7173777383650277, "grad_norm": 0.9525960683822632, "learning_rate": 9.427019449798493e-06, "loss": 3.7325, "step": 10342 }, { "epoch": 2.7179032416986897, "grad_norm": 1.0947703123092651, "learning_rate": 9.409497108813737e-06, "loss": 3.7244, "step": 10344 }, { "epoch": 2.718428745032351, "grad_norm": 0.9338451623916626, "learning_rate": 9.391974767828983e-06, "loss": 3.7182, "step": 10346 }, { "epoch": 2.718954248366013, "grad_norm": 0.8449535965919495, "learning_rate": 9.374452426844227e-06, "loss": 3.6626, "step": 10348 }, { "epoch": 2.7194797516996747, "grad_norm": 0.9386139512062073, "learning_rate": 9.356930085859472e-06, "loss": 3.6935, "step": 10350 }, { "epoch": 2.7200052550333367, "grad_norm": 0.9401842951774597, "learning_rate": 9.339407744874716e-06, "loss": 3.6901, "step": 10352 }, { "epoch": 2.7205307583669986, "grad_norm": 0.8934599161148071, "learning_rate": 9.32188540388996e-06, "loss": 3.6832, "step": 10354 }, { "epoch": 2.72105626170066, "grad_norm": 0.9976478815078735, "learning_rate": 9.304363062905204e-06, "loss": 3.6145, "step": 10356 }, { "epoch": 2.7215817650343217, "grad_norm": 1.0098203420639038, "learning_rate": 9.28684072192045e-06, "loss": 3.6472, "step": 10358 }, { "epoch": 2.7221072683679837, "grad_norm": 0.8953266739845276, "learning_rate": 9.269318380935692e-06, "loss": 3.688, "step": 10360 }, { "epoch": 2.7226327717016456, "grad_norm": 1.0188887119293213, "learning_rate": 
9.251796039950938e-06, "loss": 3.6956, "step": 10362 }, { "epoch": 2.723158275035307, "grad_norm": 0.9129080176353455, "learning_rate": 9.234273698966182e-06, "loss": 3.6597, "step": 10364 }, { "epoch": 2.723683778368969, "grad_norm": 0.934730052947998, "learning_rate": 9.216751357981427e-06, "loss": 3.7035, "step": 10366 }, { "epoch": 2.7242092817026307, "grad_norm": 0.8798181414604187, "learning_rate": 9.19922901699667e-06, "loss": 3.6551, "step": 10368 }, { "epoch": 2.7247347850362926, "grad_norm": 0.8759210109710693, "learning_rate": 9.181706676011915e-06, "loss": 3.7134, "step": 10370 }, { "epoch": 2.725260288369954, "grad_norm": 1.0057364702224731, "learning_rate": 9.16418433502716e-06, "loss": 3.7008, "step": 10372 }, { "epoch": 2.725785791703616, "grad_norm": 1.0119644403457642, "learning_rate": 9.146661994042405e-06, "loss": 3.6805, "step": 10374 }, { "epoch": 2.726311295037278, "grad_norm": 0.9974101781845093, "learning_rate": 9.12913965305765e-06, "loss": 3.7423, "step": 10376 }, { "epoch": 2.7268367983709396, "grad_norm": 0.8972420692443848, "learning_rate": 9.111617312072893e-06, "loss": 3.6749, "step": 10378 }, { "epoch": 2.7273623017046016, "grad_norm": 0.9838777184486389, "learning_rate": 9.094094971088138e-06, "loss": 3.6897, "step": 10380 }, { "epoch": 2.727887805038263, "grad_norm": 0.9511499404907227, "learning_rate": 9.076572630103382e-06, "loss": 3.6842, "step": 10382 }, { "epoch": 2.728413308371925, "grad_norm": 1.0296194553375244, "learning_rate": 9.059050289118628e-06, "loss": 3.7325, "step": 10384 }, { "epoch": 2.7289388117055866, "grad_norm": 1.0245790481567383, "learning_rate": 9.041527948133872e-06, "loss": 3.7054, "step": 10386 }, { "epoch": 2.7294643150392486, "grad_norm": 0.8936903476715088, "learning_rate": 9.024005607149114e-06, "loss": 3.7072, "step": 10388 }, { "epoch": 2.7299898183729105, "grad_norm": 0.9772065281867981, "learning_rate": 9.00648326616436e-06, "loss": 3.6934, "step": 10390 }, { "epoch": 2.730515321706572, 
"grad_norm": 0.9194059371948242, "learning_rate": 8.988960925179604e-06, "loss": 3.6698, "step": 10392 }, { "epoch": 2.7310408250402336, "grad_norm": 1.0301440954208374, "learning_rate": 8.97143858419485e-06, "loss": 3.7358, "step": 10394 }, { "epoch": 2.7315663283738956, "grad_norm": 1.025964379310608, "learning_rate": 8.953916243210093e-06, "loss": 3.6821, "step": 10396 }, { "epoch": 2.7320918317075575, "grad_norm": 0.8833057880401611, "learning_rate": 8.936393902225337e-06, "loss": 3.7484, "step": 10398 }, { "epoch": 2.732617335041219, "grad_norm": 1.0555613040924072, "learning_rate": 8.918871561240583e-06, "loss": 3.7469, "step": 10400 }, { "epoch": 2.732617335041219, "eval_loss": 3.718264579772949, "eval_runtime": 464.6322, "eval_samples_per_second": 262.119, "eval_steps_per_second": 8.191, "step": 10400 }, { "epoch": 2.733142838374881, "grad_norm": 0.8992906212806702, "learning_rate": 8.901349220255827e-06, "loss": 3.6384, "step": 10402 }, { "epoch": 2.7336683417085426, "grad_norm": 1.0340425968170166, "learning_rate": 8.883826879271071e-06, "loss": 3.6815, "step": 10404 }, { "epoch": 2.7341938450422045, "grad_norm": 0.8972797393798828, "learning_rate": 8.866304538286315e-06, "loss": 3.6972, "step": 10406 }, { "epoch": 2.734719348375866, "grad_norm": 0.8652665019035339, "learning_rate": 8.84878219730156e-06, "loss": 3.6923, "step": 10408 }, { "epoch": 2.735244851709528, "grad_norm": 0.8630313873291016, "learning_rate": 8.831259856316805e-06, "loss": 3.6045, "step": 10410 }, { "epoch": 2.73577035504319, "grad_norm": 1.0630383491516113, "learning_rate": 8.81373751533205e-06, "loss": 3.682, "step": 10412 }, { "epoch": 2.7362958583768515, "grad_norm": 0.9464606046676636, "learning_rate": 8.796215174347294e-06, "loss": 3.7361, "step": 10414 }, { "epoch": 2.7368213617105135, "grad_norm": 0.9356257915496826, "learning_rate": 8.778692833362538e-06, "loss": 3.6708, "step": 10416 }, { "epoch": 2.737346865044175, "grad_norm": 0.9037308096885681, "learning_rate": 
8.761170492377782e-06, "loss": 3.7029, "step": 10418 }, { "epoch": 2.737872368377837, "grad_norm": 0.9763436317443848, "learning_rate": 8.743648151393026e-06, "loss": 3.7644, "step": 10420 }, { "epoch": 2.7383978717114985, "grad_norm": 0.8362236618995667, "learning_rate": 8.726125810408272e-06, "loss": 3.6891, "step": 10422 }, { "epoch": 2.7389233750451605, "grad_norm": 0.9601640701293945, "learning_rate": 8.708603469423514e-06, "loss": 3.685, "step": 10424 }, { "epoch": 2.7394488783788224, "grad_norm": 0.8942646980285645, "learning_rate": 8.691081128438759e-06, "loss": 3.6627, "step": 10426 }, { "epoch": 2.739974381712484, "grad_norm": 0.9389867186546326, "learning_rate": 8.673558787454005e-06, "loss": 3.6954, "step": 10428 }, { "epoch": 2.7404998850461455, "grad_norm": 1.0026203393936157, "learning_rate": 8.656036446469249e-06, "loss": 3.7271, "step": 10430 }, { "epoch": 2.7410253883798075, "grad_norm": 0.9660333395004272, "learning_rate": 8.638514105484493e-06, "loss": 3.6808, "step": 10432 }, { "epoch": 2.7415508917134694, "grad_norm": 0.9159166216850281, "learning_rate": 8.620991764499737e-06, "loss": 3.6835, "step": 10434 }, { "epoch": 2.742076395047131, "grad_norm": 0.9186549186706543, "learning_rate": 8.603469423514981e-06, "loss": 3.7263, "step": 10436 }, { "epoch": 2.742601898380793, "grad_norm": 0.8614769577980042, "learning_rate": 8.585947082530227e-06, "loss": 3.7043, "step": 10438 }, { "epoch": 2.7431274017144545, "grad_norm": 0.9674546122550964, "learning_rate": 8.568424741545471e-06, "loss": 3.6846, "step": 10440 }, { "epoch": 2.7436529050481164, "grad_norm": 0.9408636689186096, "learning_rate": 8.550902400560715e-06, "loss": 3.6561, "step": 10442 }, { "epoch": 2.744178408381778, "grad_norm": 1.0182478427886963, "learning_rate": 8.53338005957596e-06, "loss": 3.6996, "step": 10444 }, { "epoch": 2.74470391171544, "grad_norm": 0.9795921444892883, "learning_rate": 8.515857718591204e-06, "loss": 3.7237, "step": 10446 }, { "epoch": 2.745229415049102, 
"grad_norm": 1.0365352630615234, "learning_rate": 8.49833537760645e-06, "loss": 3.735, "step": 10448 }, { "epoch": 2.7457549183827634, "grad_norm": 0.87960284948349, "learning_rate": 8.480813036621694e-06, "loss": 3.7087, "step": 10450 }, { "epoch": 2.7462804217164254, "grad_norm": 0.9708579182624817, "learning_rate": 8.463290695636936e-06, "loss": 3.684, "step": 10452 }, { "epoch": 2.746805925050087, "grad_norm": 0.9409327507019043, "learning_rate": 8.445768354652182e-06, "loss": 3.7103, "step": 10454 }, { "epoch": 2.747331428383749, "grad_norm": 0.9574362635612488, "learning_rate": 8.428246013667426e-06, "loss": 3.6789, "step": 10456 }, { "epoch": 2.7478569317174104, "grad_norm": 0.8949871063232422, "learning_rate": 8.41072367268267e-06, "loss": 3.6998, "step": 10458 }, { "epoch": 2.7483824350510724, "grad_norm": 0.9507491588592529, "learning_rate": 8.393201331697915e-06, "loss": 3.6794, "step": 10460 }, { "epoch": 2.7489079383847344, "grad_norm": 0.9129090309143066, "learning_rate": 8.375678990713159e-06, "loss": 3.703, "step": 10462 }, { "epoch": 2.749433441718396, "grad_norm": 0.909673273563385, "learning_rate": 8.358156649728405e-06, "loss": 3.7034, "step": 10464 }, { "epoch": 2.7499589450520574, "grad_norm": 1.0164530277252197, "learning_rate": 8.340634308743649e-06, "loss": 3.691, "step": 10466 }, { "epoch": 2.7504844483857194, "grad_norm": 1.083743691444397, "learning_rate": 8.323111967758893e-06, "loss": 3.6389, "step": 10468 }, { "epoch": 2.7510099517193813, "grad_norm": 1.116892695426941, "learning_rate": 8.305589626774137e-06, "loss": 3.6833, "step": 10470 }, { "epoch": 2.751535455053043, "grad_norm": 0.9110134243965149, "learning_rate": 8.288067285789381e-06, "loss": 3.6959, "step": 10472 }, { "epoch": 2.752060958386705, "grad_norm": 0.9126675724983215, "learning_rate": 8.270544944804626e-06, "loss": 3.708, "step": 10474 }, { "epoch": 2.752586461720367, "grad_norm": 0.890480101108551, "learning_rate": 8.253022603819872e-06, "loss": 3.7166, "step": 
10476 }, { "epoch": 2.7531119650540283, "grad_norm": 0.8414918780326843, "learning_rate": 8.235500262835116e-06, "loss": 3.702, "step": 10478 }, { "epoch": 2.75363746838769, "grad_norm": 0.9383008480072021, "learning_rate": 8.21797792185036e-06, "loss": 3.7178, "step": 10480 }, { "epoch": 2.754162971721352, "grad_norm": 0.9135539531707764, "learning_rate": 8.200455580865604e-06, "loss": 3.7416, "step": 10482 }, { "epoch": 2.754688475055014, "grad_norm": 0.9344625473022461, "learning_rate": 8.182933239880848e-06, "loss": 3.6857, "step": 10484 }, { "epoch": 2.7552139783886753, "grad_norm": 0.9261985421180725, "learning_rate": 8.165410898896094e-06, "loss": 3.6913, "step": 10486 }, { "epoch": 2.7557394817223373, "grad_norm": 0.9904360771179199, "learning_rate": 8.147888557911337e-06, "loss": 3.6916, "step": 10488 }, { "epoch": 2.756264985055999, "grad_norm": 0.8813837766647339, "learning_rate": 8.13036621692658e-06, "loss": 3.6897, "step": 10490 }, { "epoch": 2.756790488389661, "grad_norm": 0.8893033862113953, "learning_rate": 8.112843875941827e-06, "loss": 3.7117, "step": 10492 }, { "epoch": 2.7573159917233223, "grad_norm": 1.043723225593567, "learning_rate": 8.09532153495707e-06, "loss": 3.6768, "step": 10494 }, { "epoch": 2.7578414950569843, "grad_norm": 0.8646298050880432, "learning_rate": 8.077799193972315e-06, "loss": 3.6583, "step": 10496 }, { "epoch": 2.7583669983906463, "grad_norm": 0.9461015462875366, "learning_rate": 8.060276852987559e-06, "loss": 3.6887, "step": 10498 }, { "epoch": 2.7588925017243078, "grad_norm": 0.9077832698822021, "learning_rate": 8.042754512002803e-06, "loss": 3.7339, "step": 10500 }, { "epoch": 2.7594180050579697, "grad_norm": 0.9936925172805786, "learning_rate": 8.02523217101805e-06, "loss": 3.7096, "step": 10502 }, { "epoch": 2.7599435083916313, "grad_norm": 0.929649293422699, "learning_rate": 8.007709830033293e-06, "loss": 3.6804, "step": 10504 }, { "epoch": 2.7604690117252932, "grad_norm": 1.0525895357131958, "learning_rate": 
7.990187489048538e-06, "loss": 3.6884, "step": 10506 }, { "epoch": 2.7609945150589548, "grad_norm": 0.9672771692276001, "learning_rate": 7.972665148063782e-06, "loss": 3.7088, "step": 10508 }, { "epoch": 2.7615200183926167, "grad_norm": 0.8831276893615723, "learning_rate": 7.955142807079026e-06, "loss": 3.7028, "step": 10510 }, { "epoch": 2.7620455217262787, "grad_norm": 0.8846613168716431, "learning_rate": 7.93762046609427e-06, "loss": 3.6788, "step": 10512 }, { "epoch": 2.7625710250599402, "grad_norm": 0.9128745794296265, "learning_rate": 7.920098125109516e-06, "loss": 3.6652, "step": 10514 }, { "epoch": 2.7630965283936018, "grad_norm": 0.9938401579856873, "learning_rate": 7.902575784124758e-06, "loss": 3.6672, "step": 10516 }, { "epoch": 2.7636220317272637, "grad_norm": 1.0020172595977783, "learning_rate": 7.885053443140004e-06, "loss": 3.663, "step": 10518 }, { "epoch": 2.7641475350609257, "grad_norm": 1.0027137994766235, "learning_rate": 7.867531102155248e-06, "loss": 3.6905, "step": 10520 }, { "epoch": 2.7646730383945872, "grad_norm": 0.8807597160339355, "learning_rate": 7.850008761170493e-06, "loss": 3.6878, "step": 10522 }, { "epoch": 2.765198541728249, "grad_norm": 0.9070327877998352, "learning_rate": 7.832486420185737e-06, "loss": 3.6593, "step": 10524 }, { "epoch": 2.7657240450619107, "grad_norm": 0.9636040925979614, "learning_rate": 7.814964079200981e-06, "loss": 3.6864, "step": 10526 }, { "epoch": 2.7662495483955727, "grad_norm": 0.9460773468017578, "learning_rate": 7.797441738216225e-06, "loss": 3.6465, "step": 10528 }, { "epoch": 2.766775051729234, "grad_norm": 1.029199481010437, "learning_rate": 7.779919397231471e-06, "loss": 3.6907, "step": 10530 }, { "epoch": 2.767300555062896, "grad_norm": 0.8084407448768616, "learning_rate": 7.762397056246715e-06, "loss": 3.7446, "step": 10532 }, { "epoch": 2.767826058396558, "grad_norm": 1.0017296075820923, "learning_rate": 7.74487471526196e-06, "loss": 3.7009, "step": 10534 }, { "epoch": 2.7683515617302197, 
"grad_norm": 0.9304461479187012, "learning_rate": 7.727352374277204e-06, "loss": 3.6963, "step": 10536 }, { "epoch": 2.7688770650638816, "grad_norm": 0.8858668208122253, "learning_rate": 7.709830033292448e-06, "loss": 3.6499, "step": 10538 }, { "epoch": 2.769402568397543, "grad_norm": 0.997984766960144, "learning_rate": 7.692307692307694e-06, "loss": 3.7041, "step": 10540 }, { "epoch": 2.769928071731205, "grad_norm": 0.925199568271637, "learning_rate": 7.674785351322938e-06, "loss": 3.6989, "step": 10542 }, { "epoch": 2.7704535750648667, "grad_norm": 0.9500083923339844, "learning_rate": 7.65726301033818e-06, "loss": 3.7071, "step": 10544 }, { "epoch": 2.7709790783985286, "grad_norm": 1.0044326782226562, "learning_rate": 7.639740669353426e-06, "loss": 3.7019, "step": 10546 }, { "epoch": 2.7715045817321906, "grad_norm": 0.8868717551231384, "learning_rate": 7.62221832836867e-06, "loss": 3.6706, "step": 10548 }, { "epoch": 2.772030085065852, "grad_norm": 1.0620219707489014, "learning_rate": 7.604695987383915e-06, "loss": 3.6836, "step": 10550 }, { "epoch": 2.7725555883995137, "grad_norm": 0.9205913543701172, "learning_rate": 7.5871736463991595e-06, "loss": 3.6594, "step": 10552 }, { "epoch": 2.7730810917331756, "grad_norm": 0.9297944903373718, "learning_rate": 7.569651305414404e-06, "loss": 3.7221, "step": 10554 }, { "epoch": 2.7736065950668376, "grad_norm": 0.9039854407310486, "learning_rate": 7.552128964429649e-06, "loss": 3.6538, "step": 10556 }, { "epoch": 2.774132098400499, "grad_norm": 0.937041699886322, "learning_rate": 7.534606623444893e-06, "loss": 3.6981, "step": 10558 }, { "epoch": 2.774657601734161, "grad_norm": 0.9394468069076538, "learning_rate": 7.517084282460136e-06, "loss": 3.6959, "step": 10560 }, { "epoch": 2.7751831050678226, "grad_norm": 0.9900912046432495, "learning_rate": 7.499561941475382e-06, "loss": 3.676, "step": 10562 }, { "epoch": 2.7757086084014846, "grad_norm": 0.8952953815460205, "learning_rate": 7.482039600490625e-06, "loss": 3.7224, 
"step": 10564 }, { "epoch": 2.776234111735146, "grad_norm": 0.9043331146240234, "learning_rate": 7.464517259505871e-06, "loss": 3.6671, "step": 10566 }, { "epoch": 2.776759615068808, "grad_norm": 0.8505910038948059, "learning_rate": 7.446994918521115e-06, "loss": 3.7226, "step": 10568 }, { "epoch": 2.77728511840247, "grad_norm": 1.018817663192749, "learning_rate": 7.429472577536359e-06, "loss": 3.6966, "step": 10570 }, { "epoch": 2.7778106217361316, "grad_norm": 0.9539979696273804, "learning_rate": 7.411950236551604e-06, "loss": 3.6674, "step": 10572 }, { "epoch": 2.7783361250697935, "grad_norm": 1.0019242763519287, "learning_rate": 7.394427895566848e-06, "loss": 3.6681, "step": 10574 }, { "epoch": 2.778861628403455, "grad_norm": 0.8997854590415955, "learning_rate": 7.376905554582092e-06, "loss": 3.647, "step": 10576 }, { "epoch": 2.779387131737117, "grad_norm": 0.9208241701126099, "learning_rate": 7.359383213597337e-06, "loss": 3.6642, "step": 10578 }, { "epoch": 2.7799126350707786, "grad_norm": 1.0638606548309326, "learning_rate": 7.341860872612581e-06, "loss": 3.6528, "step": 10580 }, { "epoch": 2.7804381384044405, "grad_norm": 0.9308770895004272, "learning_rate": 7.324338531627826e-06, "loss": 3.7118, "step": 10582 }, { "epoch": 2.7809636417381025, "grad_norm": 0.8999438285827637, "learning_rate": 7.3068161906430705e-06, "loss": 3.6806, "step": 10584 }, { "epoch": 2.781489145071764, "grad_norm": 1.0477861166000366, "learning_rate": 7.289293849658315e-06, "loss": 3.6805, "step": 10586 }, { "epoch": 2.7820146484054256, "grad_norm": 1.0441722869873047, "learning_rate": 7.27177150867356e-06, "loss": 3.6817, "step": 10588 }, { "epoch": 2.7825401517390875, "grad_norm": 0.9956772923469543, "learning_rate": 7.254249167688804e-06, "loss": 3.7605, "step": 10590 }, { "epoch": 2.7830656550727495, "grad_norm": 0.9155500531196594, "learning_rate": 7.236726826704047e-06, "loss": 3.7388, "step": 10592 }, { "epoch": 2.783591158406411, "grad_norm": 0.9037885665893555, 
"learning_rate": 7.219204485719293e-06, "loss": 3.6999, "step": 10594 }, { "epoch": 2.784116661740073, "grad_norm": 0.9271174073219299, "learning_rate": 7.201682144734536e-06, "loss": 3.6918, "step": 10596 }, { "epoch": 2.7846421650737345, "grad_norm": 0.9393383860588074, "learning_rate": 7.184159803749781e-06, "loss": 3.6937, "step": 10598 }, { "epoch": 2.7851676684073965, "grad_norm": 0.9259424805641174, "learning_rate": 7.166637462765026e-06, "loss": 3.7215, "step": 10600 }, { "epoch": 2.785693171741058, "grad_norm": 0.879030704498291, "learning_rate": 7.14911512178027e-06, "loss": 3.6717, "step": 10602 }, { "epoch": 2.78621867507472, "grad_norm": 0.9400699734687805, "learning_rate": 7.131592780795515e-06, "loss": 3.7045, "step": 10604 }, { "epoch": 2.786744178408382, "grad_norm": 1.0086450576782227, "learning_rate": 7.114070439810759e-06, "loss": 3.686, "step": 10606 }, { "epoch": 2.7872696817420435, "grad_norm": 0.9999946355819702, "learning_rate": 7.096548098826003e-06, "loss": 3.6927, "step": 10608 }, { "epoch": 2.7877951850757055, "grad_norm": 0.8920970559120178, "learning_rate": 7.079025757841248e-06, "loss": 3.7045, "step": 10610 }, { "epoch": 2.788320688409367, "grad_norm": 0.8328579664230347, "learning_rate": 7.061503416856492e-06, "loss": 3.6719, "step": 10612 }, { "epoch": 2.788846191743029, "grad_norm": 0.9335033297538757, "learning_rate": 7.0439810758717365e-06, "loss": 3.7093, "step": 10614 }, { "epoch": 2.7893716950766905, "grad_norm": 1.0206258296966553, "learning_rate": 7.0264587348869816e-06, "loss": 3.6964, "step": 10616 }, { "epoch": 2.7898971984103524, "grad_norm": 0.9346045255661011, "learning_rate": 7.008936393902226e-06, "loss": 3.7321, "step": 10618 }, { "epoch": 2.7904227017440144, "grad_norm": 1.0318585634231567, "learning_rate": 6.991414052917471e-06, "loss": 3.705, "step": 10620 }, { "epoch": 2.790948205077676, "grad_norm": 0.9875546097755432, "learning_rate": 6.973891711932715e-06, "loss": 3.7407, "step": 10622 }, { "epoch": 
2.7914737084113375, "grad_norm": 0.958601176738739, "learning_rate": 6.956369370947958e-06, "loss": 3.7281, "step": 10624 }, { "epoch": 2.7919992117449994, "grad_norm": 0.9005158543586731, "learning_rate": 6.938847029963204e-06, "loss": 3.7295, "step": 10626 }, { "epoch": 2.7925247150786614, "grad_norm": 0.897090494632721, "learning_rate": 6.9213246889784475e-06, "loss": 3.6619, "step": 10628 }, { "epoch": 2.793050218412323, "grad_norm": 0.942797839641571, "learning_rate": 6.903802347993692e-06, "loss": 3.7067, "step": 10630 }, { "epoch": 2.793575721745985, "grad_norm": 0.9048266410827637, "learning_rate": 6.886280007008937e-06, "loss": 3.6406, "step": 10632 }, { "epoch": 2.794101225079647, "grad_norm": 0.8717251420021057, "learning_rate": 6.868757666024181e-06, "loss": 3.6543, "step": 10634 }, { "epoch": 2.7946267284133084, "grad_norm": 0.9905277490615845, "learning_rate": 6.851235325039426e-06, "loss": 3.6617, "step": 10636 }, { "epoch": 2.79515223174697, "grad_norm": 0.9529818892478943, "learning_rate": 6.83371298405467e-06, "loss": 3.7043, "step": 10638 }, { "epoch": 2.795677735080632, "grad_norm": 0.9622431993484497, "learning_rate": 6.816190643069914e-06, "loss": 3.709, "step": 10640 }, { "epoch": 2.796203238414294, "grad_norm": 0.9122003316879272, "learning_rate": 6.798668302085159e-06, "loss": 3.6766, "step": 10642 }, { "epoch": 2.7967287417479554, "grad_norm": 0.9148066639900208, "learning_rate": 6.781145961100403e-06, "loss": 3.6758, "step": 10644 }, { "epoch": 2.7972542450816174, "grad_norm": 0.9577843546867371, "learning_rate": 6.763623620115648e-06, "loss": 3.6897, "step": 10646 }, { "epoch": 2.797779748415279, "grad_norm": 0.9283396601676941, "learning_rate": 6.746101279130893e-06, "loss": 3.7357, "step": 10648 }, { "epoch": 2.798305251748941, "grad_norm": 1.057370901107788, "learning_rate": 6.728578938146137e-06, "loss": 3.6408, "step": 10650 }, { "epoch": 2.7988307550826024, "grad_norm": 0.9322560429573059, "learning_rate": 6.711056597161382e-06, 
"loss": 3.7098, "step": 10652 }, { "epoch": 2.7993562584162643, "grad_norm": 0.9132063984870911, "learning_rate": 6.693534256176626e-06, "loss": 3.7256, "step": 10654 }, { "epoch": 2.7998817617499263, "grad_norm": 0.8617981672286987, "learning_rate": 6.676011915191869e-06, "loss": 3.7094, "step": 10656 }, { "epoch": 2.800407265083588, "grad_norm": 0.981582760810852, "learning_rate": 6.658489574207115e-06, "loss": 3.6796, "step": 10658 }, { "epoch": 2.80093276841725, "grad_norm": 0.9398915767669678, "learning_rate": 6.6409672332223585e-06, "loss": 3.6972, "step": 10660 }, { "epoch": 2.8014582717509113, "grad_norm": 0.9628801345825195, "learning_rate": 6.623444892237603e-06, "loss": 3.7263, "step": 10662 }, { "epoch": 2.8019837750845733, "grad_norm": 0.9358119964599609, "learning_rate": 6.605922551252848e-06, "loss": 3.6712, "step": 10664 }, { "epoch": 2.802509278418235, "grad_norm": 0.940066397190094, "learning_rate": 6.588400210268092e-06, "loss": 3.6814, "step": 10666 }, { "epoch": 2.803034781751897, "grad_norm": 0.9244860410690308, "learning_rate": 6.570877869283337e-06, "loss": 3.7129, "step": 10668 }, { "epoch": 2.8035602850855588, "grad_norm": 0.8681151270866394, "learning_rate": 6.553355528298581e-06, "loss": 3.6907, "step": 10670 }, { "epoch": 2.8040857884192203, "grad_norm": 0.9513642191886902, "learning_rate": 6.535833187313825e-06, "loss": 3.7218, "step": 10672 }, { "epoch": 2.804611291752882, "grad_norm": 1.0363315343856812, "learning_rate": 6.51831084632907e-06, "loss": 3.6983, "step": 10674 }, { "epoch": 2.805136795086544, "grad_norm": 0.940280556678772, "learning_rate": 6.5007885053443144e-06, "loss": 3.6205, "step": 10676 }, { "epoch": 2.8056622984202058, "grad_norm": 1.0001384019851685, "learning_rate": 6.483266164359559e-06, "loss": 3.6841, "step": 10678 }, { "epoch": 2.8061878017538673, "grad_norm": 0.8739604949951172, "learning_rate": 6.465743823374804e-06, "loss": 3.6373, "step": 10680 }, { "epoch": 2.8067133050875293, "grad_norm": 
0.9779549241065979, "learning_rate": 6.448221482390048e-06, "loss": 3.6862, "step": 10682 }, { "epoch": 2.807238808421191, "grad_norm": 0.9959965348243713, "learning_rate": 6.430699141405291e-06, "loss": 3.6727, "step": 10684 }, { "epoch": 2.8077643117548527, "grad_norm": 0.8921753764152527, "learning_rate": 6.413176800420537e-06, "loss": 3.7012, "step": 10686 }, { "epoch": 2.8082898150885143, "grad_norm": 0.9149478077888489, "learning_rate": 6.39565445943578e-06, "loss": 3.6824, "step": 10688 }, { "epoch": 2.8088153184221762, "grad_norm": 0.8898903131484985, "learning_rate": 6.378132118451026e-06, "loss": 3.6877, "step": 10690 }, { "epoch": 2.809340821755838, "grad_norm": 0.9133793711662292, "learning_rate": 6.3606097774662695e-06, "loss": 3.7102, "step": 10692 }, { "epoch": 2.8098663250894997, "grad_norm": 0.913329541683197, "learning_rate": 6.343087436481514e-06, "loss": 3.6118, "step": 10694 }, { "epoch": 2.8103918284231617, "grad_norm": 0.9509990215301514, "learning_rate": 6.325565095496759e-06, "loss": 3.7, "step": 10696 }, { "epoch": 2.8109173317568232, "grad_norm": 0.9556706547737122, "learning_rate": 6.308042754512003e-06, "loss": 3.6657, "step": 10698 }, { "epoch": 2.811442835090485, "grad_norm": 1.1498034000396729, "learning_rate": 6.290520413527247e-06, "loss": 3.6811, "step": 10700 }, { "epoch": 2.8119683384241467, "grad_norm": 0.8901938796043396, "learning_rate": 6.272998072542492e-06, "loss": 3.712, "step": 10702 }, { "epoch": 2.8124938417578087, "grad_norm": 0.9944669604301453, "learning_rate": 6.255475731557736e-06, "loss": 3.7099, "step": 10704 }, { "epoch": 2.8130193450914707, "grad_norm": 0.9460535049438477, "learning_rate": 6.2379533905729805e-06, "loss": 3.7071, "step": 10706 }, { "epoch": 2.813544848425132, "grad_norm": 1.0432227849960327, "learning_rate": 6.2204310495882255e-06, "loss": 3.7038, "step": 10708 }, { "epoch": 2.8140703517587937, "grad_norm": 0.9246929287910461, "learning_rate": 6.20290870860347e-06, "loss": 3.6452, "step": 10710 
}, { "epoch": 2.8145958550924557, "grad_norm": 0.9930467009544373, "learning_rate": 6.185386367618714e-06, "loss": 3.6751, "step": 10712 }, { "epoch": 2.8151213584261177, "grad_norm": 0.9714385271072388, "learning_rate": 6.167864026633959e-06, "loss": 3.7284, "step": 10714 }, { "epoch": 2.815646861759779, "grad_norm": 0.8828871846199036, "learning_rate": 6.150341685649203e-06, "loss": 3.6825, "step": 10716 }, { "epoch": 2.816172365093441, "grad_norm": 0.8510504961013794, "learning_rate": 6.132819344664448e-06, "loss": 3.7013, "step": 10718 }, { "epoch": 2.8166978684271027, "grad_norm": 0.9256433248519897, "learning_rate": 6.115297003679691e-06, "loss": 3.6801, "step": 10720 }, { "epoch": 2.8172233717607646, "grad_norm": 0.9146392941474915, "learning_rate": 6.097774662694936e-06, "loss": 3.6587, "step": 10722 }, { "epoch": 2.817748875094426, "grad_norm": 0.8591855764389038, "learning_rate": 6.0802523217101806e-06, "loss": 3.7139, "step": 10724 }, { "epoch": 2.818274378428088, "grad_norm": 1.0373576879501343, "learning_rate": 6.062729980725426e-06, "loss": 3.6039, "step": 10726 }, { "epoch": 2.81879988176175, "grad_norm": 0.8862190842628479, "learning_rate": 6.04520763974067e-06, "loss": 3.747, "step": 10728 }, { "epoch": 2.8193253850954116, "grad_norm": 0.9901747107505798, "learning_rate": 6.027685298755914e-06, "loss": 3.7075, "step": 10730 }, { "epoch": 2.8198508884290736, "grad_norm": 0.9271067380905151, "learning_rate": 6.010162957771159e-06, "loss": 3.674, "step": 10732 }, { "epoch": 2.820376391762735, "grad_norm": 0.9895631670951843, "learning_rate": 5.992640616786403e-06, "loss": 3.6831, "step": 10734 }, { "epoch": 2.820901895096397, "grad_norm": 0.9250300526618958, "learning_rate": 5.975118275801647e-06, "loss": 3.6673, "step": 10736 }, { "epoch": 2.8214273984300586, "grad_norm": 0.9638060927391052, "learning_rate": 5.9575959348168915e-06, "loss": 3.7026, "step": 10738 }, { "epoch": 2.8219529017637206, "grad_norm": 1.0276234149932861, "learning_rate": 
5.9400735938321365e-06, "loss": 3.7336, "step": 10740 }, { "epoch": 2.8224784050973826, "grad_norm": 0.9443963170051575, "learning_rate": 5.922551252847381e-06, "loss": 3.6711, "step": 10742 }, { "epoch": 2.823003908431044, "grad_norm": 1.0369791984558105, "learning_rate": 5.905028911862625e-06, "loss": 3.6994, "step": 10744 }, { "epoch": 2.8235294117647056, "grad_norm": 0.9663823843002319, "learning_rate": 5.88750657087787e-06, "loss": 3.7438, "step": 10746 }, { "epoch": 2.8240549150983676, "grad_norm": 0.9476085901260376, "learning_rate": 5.869984229893114e-06, "loss": 3.7194, "step": 10748 }, { "epoch": 2.8245804184320296, "grad_norm": 1.0503820180892944, "learning_rate": 5.852461888908359e-06, "loss": 3.6903, "step": 10750 }, { "epoch": 2.825105921765691, "grad_norm": 0.942451536655426, "learning_rate": 5.834939547923602e-06, "loss": 3.7013, "step": 10752 }, { "epoch": 2.825631425099353, "grad_norm": 0.9429349303245544, "learning_rate": 5.8174172069388474e-06, "loss": 3.6822, "step": 10754 }, { "epoch": 2.8261569284330146, "grad_norm": 0.8687843680381775, "learning_rate": 5.799894865954092e-06, "loss": 3.6931, "step": 10756 }, { "epoch": 2.8266824317666766, "grad_norm": 0.9230526089668274, "learning_rate": 5.782372524969336e-06, "loss": 3.698, "step": 10758 }, { "epoch": 2.827207935100338, "grad_norm": 1.0038299560546875, "learning_rate": 5.764850183984581e-06, "loss": 3.6801, "step": 10760 }, { "epoch": 2.827733438434, "grad_norm": 1.0024610757827759, "learning_rate": 5.747327842999825e-06, "loss": 3.6991, "step": 10762 }, { "epoch": 2.828258941767662, "grad_norm": 0.8841503262519836, "learning_rate": 5.72980550201507e-06, "loss": 3.69, "step": 10764 }, { "epoch": 2.8287844451013235, "grad_norm": 0.9271451830863953, "learning_rate": 5.712283161030313e-06, "loss": 3.6622, "step": 10766 }, { "epoch": 2.8293099484349855, "grad_norm": 0.953376054763794, "learning_rate": 5.694760820045558e-06, "loss": 3.7042, "step": 10768 }, { "epoch": 2.829835451768647, 
"grad_norm": 0.9500717520713806, "learning_rate": 5.6772384790608025e-06, "loss": 3.7164, "step": 10770 }, { "epoch": 2.830360955102309, "grad_norm": 0.9307326078414917, "learning_rate": 5.6597161380760476e-06, "loss": 3.6374, "step": 10772 }, { "epoch": 2.8308864584359705, "grad_norm": 0.8439242839813232, "learning_rate": 5.642193797091292e-06, "loss": 3.6874, "step": 10774 }, { "epoch": 2.8314119617696325, "grad_norm": 0.9148164987564087, "learning_rate": 5.624671456106536e-06, "loss": 3.697, "step": 10776 }, { "epoch": 2.8319374651032945, "grad_norm": 0.9776525497436523, "learning_rate": 5.607149115121781e-06, "loss": 3.643, "step": 10778 }, { "epoch": 2.832462968436956, "grad_norm": 0.9302638173103333, "learning_rate": 5.589626774137025e-06, "loss": 3.7231, "step": 10780 }, { "epoch": 2.8329884717706175, "grad_norm": 0.9986618757247925, "learning_rate": 5.572104433152269e-06, "loss": 3.6882, "step": 10782 }, { "epoch": 2.8335139751042795, "grad_norm": 0.9710880517959595, "learning_rate": 5.5545820921675135e-06, "loss": 3.6621, "step": 10784 }, { "epoch": 2.8340394784379415, "grad_norm": 0.9772099852561951, "learning_rate": 5.5370597511827585e-06, "loss": 3.6821, "step": 10786 }, { "epoch": 2.834564981771603, "grad_norm": 0.8547819256782532, "learning_rate": 5.519537410198003e-06, "loss": 3.6917, "step": 10788 }, { "epoch": 2.835090485105265, "grad_norm": 0.8911169767379761, "learning_rate": 5.502015069213247e-06, "loss": 3.7187, "step": 10790 }, { "epoch": 2.835615988438927, "grad_norm": 0.9701933264732361, "learning_rate": 5.484492728228492e-06, "loss": 3.7355, "step": 10792 }, { "epoch": 2.8361414917725885, "grad_norm": 0.9122721552848816, "learning_rate": 5.466970387243736e-06, "loss": 3.6423, "step": 10794 }, { "epoch": 2.83666699510625, "grad_norm": 0.9325200915336609, "learning_rate": 5.449448046258981e-06, "loss": 3.6662, "step": 10796 }, { "epoch": 2.837192498439912, "grad_norm": 0.8535316586494446, "learning_rate": 5.431925705274224e-06, "loss": 
3.6954, "step": 10798 }, { "epoch": 2.837718001773574, "grad_norm": 0.8857448101043701, "learning_rate": 5.414403364289469e-06, "loss": 3.7082, "step": 10800 }, { "epoch": 2.837718001773574, "eval_loss": 3.7158594131469727, "eval_runtime": 464.6965, "eval_samples_per_second": 262.083, "eval_steps_per_second": 8.19, "step": 10800 }, { "epoch": 2.8382435051072354, "grad_norm": 1.1003066301345825, "learning_rate": 5.3968810233047136e-06, "loss": 3.7201, "step": 10802 }, { "epoch": 2.8387690084408974, "grad_norm": 0.8598603010177612, "learning_rate": 5.379358682319959e-06, "loss": 3.7039, "step": 10804 }, { "epoch": 2.839294511774559, "grad_norm": 0.8617976903915405, "learning_rate": 5.361836341335203e-06, "loss": 3.6562, "step": 10806 }, { "epoch": 2.839820015108221, "grad_norm": 0.9475199580192566, "learning_rate": 5.344314000350447e-06, "loss": 3.7142, "step": 10808 }, { "epoch": 2.8403455184418824, "grad_norm": 0.8837513327598572, "learning_rate": 5.326791659365692e-06, "loss": 3.6816, "step": 10810 }, { "epoch": 2.8408710217755444, "grad_norm": 0.9724593162536621, "learning_rate": 5.309269318380936e-06, "loss": 3.7072, "step": 10812 }, { "epoch": 2.8413965251092064, "grad_norm": 1.257886528968811, "learning_rate": 5.29174697739618e-06, "loss": 3.7707, "step": 10814 }, { "epoch": 2.841922028442868, "grad_norm": 0.9215511679649353, "learning_rate": 5.2742246364114245e-06, "loss": 3.7202, "step": 10816 }, { "epoch": 2.84244753177653, "grad_norm": 0.9608336091041565, "learning_rate": 5.2567022954266695e-06, "loss": 3.6655, "step": 10818 }, { "epoch": 2.8429730351101914, "grad_norm": 0.8913887739181519, "learning_rate": 5.239179954441914e-06, "loss": 3.6721, "step": 10820 }, { "epoch": 2.8434985384438534, "grad_norm": 0.8981270790100098, "learning_rate": 5.221657613457158e-06, "loss": 3.6766, "step": 10822 }, { "epoch": 2.844024041777515, "grad_norm": 1.0001236200332642, "learning_rate": 5.204135272472403e-06, "loss": 3.7043, "step": 10824 }, { "epoch": 
2.844549545111177, "grad_norm": 0.9271770715713501, "learning_rate": 5.186612931487647e-06, "loss": 3.6971, "step": 10826 }, { "epoch": 2.845075048444839, "grad_norm": 0.8821377754211426, "learning_rate": 5.169090590502892e-06, "loss": 3.7164, "step": 10828 }, { "epoch": 2.8456005517785004, "grad_norm": 0.897068977355957, "learning_rate": 5.151568249518135e-06, "loss": 3.6786, "step": 10830 }, { "epoch": 2.846126055112162, "grad_norm": 0.9630810618400574, "learning_rate": 5.1340459085333804e-06, "loss": 3.7563, "step": 10832 }, { "epoch": 2.846651558445824, "grad_norm": 0.8859772086143494, "learning_rate": 5.116523567548625e-06, "loss": 3.711, "step": 10834 }, { "epoch": 2.847177061779486, "grad_norm": 0.9252563714981079, "learning_rate": 5.099001226563869e-06, "loss": 3.689, "step": 10836 }, { "epoch": 2.8477025651131473, "grad_norm": 1.0779794454574585, "learning_rate": 5.081478885579114e-06, "loss": 3.6916, "step": 10838 }, { "epoch": 2.8482280684468093, "grad_norm": 0.9410510063171387, "learning_rate": 5.063956544594358e-06, "loss": 3.7128, "step": 10840 }, { "epoch": 2.848753571780471, "grad_norm": 0.9560712575912476, "learning_rate": 5.046434203609603e-06, "loss": 3.6514, "step": 10842 }, { "epoch": 2.849279075114133, "grad_norm": 0.9997665286064148, "learning_rate": 5.028911862624846e-06, "loss": 3.7021, "step": 10844 }, { "epoch": 2.8498045784477943, "grad_norm": 0.8683488368988037, "learning_rate": 5.011389521640091e-06, "loss": 3.7212, "step": 10846 }, { "epoch": 2.8503300817814563, "grad_norm": 0.8917067050933838, "learning_rate": 4.9938671806553355e-06, "loss": 3.7115, "step": 10848 }, { "epoch": 2.8508555851151183, "grad_norm": 0.8837410807609558, "learning_rate": 4.9763448396705806e-06, "loss": 3.6447, "step": 10850 }, { "epoch": 2.85138108844878, "grad_norm": 0.945685625076294, "learning_rate": 4.958822498685825e-06, "loss": 3.674, "step": 10852 }, { "epoch": 2.8519065917824418, "grad_norm": 0.9955798387527466, "learning_rate": 4.941300157701069e-06, 
"loss": 3.6689, "step": 10854 }, { "epoch": 2.8524320951161033, "grad_norm": 0.9028317928314209, "learning_rate": 4.923777816716314e-06, "loss": 3.7079, "step": 10856 }, { "epoch": 2.8529575984497653, "grad_norm": 0.928165853023529, "learning_rate": 4.906255475731558e-06, "loss": 3.6977, "step": 10858 }, { "epoch": 2.853483101783427, "grad_norm": 0.9817658066749573, "learning_rate": 4.888733134746802e-06, "loss": 3.6474, "step": 10860 }, { "epoch": 2.8540086051170888, "grad_norm": 1.0130159854888916, "learning_rate": 4.8712107937620465e-06, "loss": 3.6728, "step": 10862 }, { "epoch": 2.8545341084507507, "grad_norm": 0.9946650266647339, "learning_rate": 4.8536884527772915e-06, "loss": 3.7063, "step": 10864 }, { "epoch": 2.8550596117844123, "grad_norm": 0.9599988460540771, "learning_rate": 4.836166111792536e-06, "loss": 3.7211, "step": 10866 }, { "epoch": 2.855585115118074, "grad_norm": 0.9873018860816956, "learning_rate": 4.81864377080778e-06, "loss": 3.7177, "step": 10868 }, { "epoch": 2.8561106184517357, "grad_norm": 0.9300923943519592, "learning_rate": 4.801121429823025e-06, "loss": 3.6976, "step": 10870 }, { "epoch": 2.8566361217853977, "grad_norm": 0.9566983580589294, "learning_rate": 4.783599088838269e-06, "loss": 3.6763, "step": 10872 }, { "epoch": 2.8571616251190592, "grad_norm": 1.0043896436691284, "learning_rate": 4.766076747853514e-06, "loss": 3.6642, "step": 10874 }, { "epoch": 2.857687128452721, "grad_norm": 0.9414486289024353, "learning_rate": 4.748554406868757e-06, "loss": 3.6937, "step": 10876 }, { "epoch": 2.8582126317863827, "grad_norm": 0.9772343039512634, "learning_rate": 4.731032065884002e-06, "loss": 3.688, "step": 10878 }, { "epoch": 2.8587381351200447, "grad_norm": 1.0734013319015503, "learning_rate": 4.7135097248992466e-06, "loss": 3.6914, "step": 10880 }, { "epoch": 2.8592636384537062, "grad_norm": 0.9821857810020447, "learning_rate": 4.695987383914492e-06, "loss": 3.6579, "step": 10882 }, { "epoch": 2.859789141787368, "grad_norm": 
0.9421532154083252, "learning_rate": 4.678465042929736e-06, "loss": 3.7213, "step": 10884 }, { "epoch": 2.86031464512103, "grad_norm": 0.9092543721199036, "learning_rate": 4.66094270194498e-06, "loss": 3.7081, "step": 10886 }, { "epoch": 2.8608401484546917, "grad_norm": 0.9118620157241821, "learning_rate": 4.643420360960225e-06, "loss": 3.6921, "step": 10888 }, { "epoch": 2.8613656517883537, "grad_norm": 0.8882892727851868, "learning_rate": 4.625898019975469e-06, "loss": 3.6289, "step": 10890 }, { "epoch": 2.861891155122015, "grad_norm": 0.9889925718307495, "learning_rate": 4.608375678990713e-06, "loss": 3.6902, "step": 10892 }, { "epoch": 2.862416658455677, "grad_norm": 1.0989121198654175, "learning_rate": 4.5908533380059575e-06, "loss": 3.7309, "step": 10894 }, { "epoch": 2.8629421617893387, "grad_norm": 0.9515022039413452, "learning_rate": 4.5733309970212025e-06, "loss": 3.6953, "step": 10896 }, { "epoch": 2.8634676651230007, "grad_norm": 1.0235265493392944, "learning_rate": 4.555808656036447e-06, "loss": 3.7261, "step": 10898 }, { "epoch": 2.8639931684566626, "grad_norm": 0.9855708479881287, "learning_rate": 4.538286315051691e-06, "loss": 3.7174, "step": 10900 }, { "epoch": 2.864518671790324, "grad_norm": 1.0045709609985352, "learning_rate": 4.520763974066936e-06, "loss": 3.6545, "step": 10902 }, { "epoch": 2.8650441751239857, "grad_norm": 0.9816344976425171, "learning_rate": 4.50324163308218e-06, "loss": 3.6783, "step": 10904 }, { "epoch": 2.8655696784576477, "grad_norm": 0.8862611651420593, "learning_rate": 4.485719292097425e-06, "loss": 3.7002, "step": 10906 }, { "epoch": 2.8660951817913096, "grad_norm": 0.9849883317947388, "learning_rate": 4.468196951112668e-06, "loss": 3.6634, "step": 10908 }, { "epoch": 2.866620685124971, "grad_norm": 0.8383504748344421, "learning_rate": 4.4506746101279134e-06, "loss": 3.75, "step": 10910 }, { "epoch": 2.867146188458633, "grad_norm": 0.9959409236907959, "learning_rate": 4.433152269143158e-06, "loss": 3.7166, "step": 10912 
}, { "epoch": 2.8676716917922946, "grad_norm": 0.8706647753715515, "learning_rate": 4.415629928158403e-06, "loss": 3.7358, "step": 10914 }, { "epoch": 2.8681971951259566, "grad_norm": 0.9207481741905212, "learning_rate": 4.398107587173647e-06, "loss": 3.7101, "step": 10916 }, { "epoch": 2.868722698459618, "grad_norm": 0.9666288495063782, "learning_rate": 4.380585246188891e-06, "loss": 3.6708, "step": 10918 }, { "epoch": 2.86924820179328, "grad_norm": 0.9739460945129395, "learning_rate": 4.363062905204136e-06, "loss": 3.736, "step": 10920 }, { "epoch": 2.869773705126942, "grad_norm": 0.9712940454483032, "learning_rate": 4.345540564219379e-06, "loss": 3.7195, "step": 10922 }, { "epoch": 2.8702992084606036, "grad_norm": 0.9851842522621155, "learning_rate": 4.328018223234624e-06, "loss": 3.6748, "step": 10924 }, { "epoch": 2.8708247117942656, "grad_norm": 1.013093113899231, "learning_rate": 4.3104958822498685e-06, "loss": 3.7046, "step": 10926 }, { "epoch": 2.871350215127927, "grad_norm": 0.8916319012641907, "learning_rate": 4.2929735412651136e-06, "loss": 3.6517, "step": 10928 }, { "epoch": 2.871875718461589, "grad_norm": 0.963735044002533, "learning_rate": 4.275451200280358e-06, "loss": 3.7019, "step": 10930 }, { "epoch": 2.8724012217952506, "grad_norm": 0.8814280033111572, "learning_rate": 4.257928859295602e-06, "loss": 3.6904, "step": 10932 }, { "epoch": 2.8729267251289126, "grad_norm": 0.9448356628417969, "learning_rate": 4.240406518310847e-06, "loss": 3.7254, "step": 10934 }, { "epoch": 2.8734522284625745, "grad_norm": 1.0681278705596924, "learning_rate": 4.222884177326091e-06, "loss": 3.7517, "step": 10936 }, { "epoch": 2.873977731796236, "grad_norm": 0.941702127456665, "learning_rate": 4.205361836341335e-06, "loss": 3.6999, "step": 10938 }, { "epoch": 2.8745032351298976, "grad_norm": 0.8864479064941406, "learning_rate": 4.1878394953565794e-06, "loss": 3.6905, "step": 10940 }, { "epoch": 2.8750287384635596, "grad_norm": 0.9883190989494324, "learning_rate": 
4.1703171543718245e-06, "loss": 3.612, "step": 10942 }, { "epoch": 2.8755542417972215, "grad_norm": 0.8960925936698914, "learning_rate": 4.152794813387069e-06, "loss": 3.6639, "step": 10944 }, { "epoch": 2.876079745130883, "grad_norm": 0.9147754311561584, "learning_rate": 4.135272472402313e-06, "loss": 3.6893, "step": 10946 }, { "epoch": 2.876605248464545, "grad_norm": 0.9707074165344238, "learning_rate": 4.117750131417558e-06, "loss": 3.6246, "step": 10948 }, { "epoch": 2.877130751798207, "grad_norm": 0.9609418511390686, "learning_rate": 4.100227790432802e-06, "loss": 3.7197, "step": 10950 }, { "epoch": 2.8776562551318685, "grad_norm": 0.9103037714958191, "learning_rate": 4.082705449448047e-06, "loss": 3.6814, "step": 10952 }, { "epoch": 2.87818175846553, "grad_norm": 0.9278436899185181, "learning_rate": 4.06518310846329e-06, "loss": 3.7244, "step": 10954 }, { "epoch": 2.878707261799192, "grad_norm": 0.9761267304420471, "learning_rate": 4.047660767478535e-06, "loss": 3.6657, "step": 10956 }, { "epoch": 2.879232765132854, "grad_norm": 0.890954315662384, "learning_rate": 4.0301384264937796e-06, "loss": 3.7207, "step": 10958 }, { "epoch": 2.8797582684665155, "grad_norm": 1.0090843439102173, "learning_rate": 4.012616085509025e-06, "loss": 3.7222, "step": 10960 }, { "epoch": 2.8802837718001775, "grad_norm": 0.9379556775093079, "learning_rate": 3.995093744524269e-06, "loss": 3.6711, "step": 10962 }, { "epoch": 2.880809275133839, "grad_norm": 1.0087153911590576, "learning_rate": 3.977571403539513e-06, "loss": 3.738, "step": 10964 }, { "epoch": 2.881334778467501, "grad_norm": 0.8867523670196533, "learning_rate": 3.960049062554758e-06, "loss": 3.7196, "step": 10966 }, { "epoch": 2.8818602818011625, "grad_norm": 1.0239083766937256, "learning_rate": 3.942526721570002e-06, "loss": 3.6592, "step": 10968 }, { "epoch": 2.8823857851348245, "grad_norm": 0.8795169591903687, "learning_rate": 3.925004380585246e-06, "loss": 3.7355, "step": 10970 }, { "epoch": 2.8829112884684864, 
"grad_norm": 0.9446698427200317, "learning_rate": 3.9074820396004905e-06, "loss": 3.7184, "step": 10972 }, { "epoch": 2.883436791802148, "grad_norm": 0.8848180770874023, "learning_rate": 3.8899596986157355e-06, "loss": 3.7113, "step": 10974 }, { "epoch": 2.88396229513581, "grad_norm": 0.9883624911308289, "learning_rate": 3.87243735763098e-06, "loss": 3.7085, "step": 10976 }, { "epoch": 2.8844877984694715, "grad_norm": 0.8623301386833191, "learning_rate": 3.854915016646224e-06, "loss": 3.7121, "step": 10978 }, { "epoch": 2.8850133018031334, "grad_norm": 0.9402971863746643, "learning_rate": 3.837392675661469e-06, "loss": 3.6833, "step": 10980 }, { "epoch": 2.885538805136795, "grad_norm": 0.8486554026603699, "learning_rate": 3.819870334676713e-06, "loss": 3.6829, "step": 10982 }, { "epoch": 2.886064308470457, "grad_norm": 0.9161088466644287, "learning_rate": 3.8023479936919577e-06, "loss": 3.6693, "step": 10984 }, { "epoch": 2.886589811804119, "grad_norm": 1.0315738916397095, "learning_rate": 3.784825652707202e-06, "loss": 3.6938, "step": 10986 }, { "epoch": 2.8871153151377804, "grad_norm": 0.9065717458724976, "learning_rate": 3.7673033117224464e-06, "loss": 3.6818, "step": 10988 }, { "epoch": 2.887640818471442, "grad_norm": 1.0692954063415527, "learning_rate": 3.749780970737691e-06, "loss": 3.7185, "step": 10990 }, { "epoch": 2.888166321805104, "grad_norm": 1.012778639793396, "learning_rate": 3.7322586297529356e-06, "loss": 3.6504, "step": 10992 }, { "epoch": 2.888691825138766, "grad_norm": 0.9652107954025269, "learning_rate": 3.7147362887681794e-06, "loss": 3.6583, "step": 10994 }, { "epoch": 2.8892173284724274, "grad_norm": 0.8812058568000793, "learning_rate": 3.697213947783424e-06, "loss": 3.6801, "step": 10996 }, { "epoch": 2.8897428318060894, "grad_norm": 0.9809557199478149, "learning_rate": 3.6796916067986686e-06, "loss": 3.6782, "step": 10998 }, { "epoch": 2.890268335139751, "grad_norm": 0.9080129265785217, "learning_rate": 3.662169265813913e-06, "loss": 
3.6936, "step": 11000 }, { "epoch": 2.890793838473413, "grad_norm": 0.8859163522720337, "learning_rate": 3.6446469248291574e-06, "loss": 3.6815, "step": 11002 }, { "epoch": 2.8913193418070744, "grad_norm": 0.8996262550354004, "learning_rate": 3.627124583844402e-06, "loss": 3.6891, "step": 11004 }, { "epoch": 2.8918448451407364, "grad_norm": 0.9644185900688171, "learning_rate": 3.6096022428596465e-06, "loss": 3.6953, "step": 11006 }, { "epoch": 2.8923703484743983, "grad_norm": 0.9043797254562378, "learning_rate": 3.5920799018748903e-06, "loss": 3.6825, "step": 11008 }, { "epoch": 2.89289585180806, "grad_norm": 0.9506582617759705, "learning_rate": 3.574557560890135e-06, "loss": 3.6809, "step": 11010 }, { "epoch": 2.893421355141722, "grad_norm": 0.9020773768424988, "learning_rate": 3.5570352199053795e-06, "loss": 3.6631, "step": 11012 }, { "epoch": 2.8939468584753834, "grad_norm": 0.9416654706001282, "learning_rate": 3.539512878920624e-06, "loss": 3.6564, "step": 11014 }, { "epoch": 2.8944723618090453, "grad_norm": 0.911668062210083, "learning_rate": 3.5219905379358683e-06, "loss": 3.69, "step": 11016 }, { "epoch": 2.894997865142707, "grad_norm": 1.017141580581665, "learning_rate": 3.504468196951113e-06, "loss": 3.7421, "step": 11018 }, { "epoch": 2.895523368476369, "grad_norm": 0.940240204334259, "learning_rate": 3.4869458559663575e-06, "loss": 3.6703, "step": 11020 }, { "epoch": 2.896048871810031, "grad_norm": 1.0129897594451904, "learning_rate": 3.469423514981602e-06, "loss": 3.7185, "step": 11022 }, { "epoch": 2.8965743751436923, "grad_norm": 0.9287675023078918, "learning_rate": 3.451901173996846e-06, "loss": 3.6555, "step": 11024 }, { "epoch": 2.897099878477354, "grad_norm": 0.9498471617698669, "learning_rate": 3.4343788330120904e-06, "loss": 3.7172, "step": 11026 }, { "epoch": 2.897625381811016, "grad_norm": 0.9724360704421997, "learning_rate": 3.416856492027335e-06, "loss": 3.7101, "step": 11028 }, { "epoch": 2.898150885144678, "grad_norm": 0.8908922672271729, 
"learning_rate": 3.3993341510425796e-06, "loss": 3.6779, "step": 11030 }, { "epoch": 2.8986763884783393, "grad_norm": 0.9132648706436157, "learning_rate": 3.381811810057824e-06, "loss": 3.6988, "step": 11032 }, { "epoch": 2.8992018918120013, "grad_norm": 1.0216341018676758, "learning_rate": 3.3642894690730684e-06, "loss": 3.7341, "step": 11034 }, { "epoch": 2.899727395145663, "grad_norm": 0.9806554913520813, "learning_rate": 3.346767128088313e-06, "loss": 3.7242, "step": 11036 }, { "epoch": 2.9002528984793248, "grad_norm": 0.9597538709640503, "learning_rate": 3.3292447871035576e-06, "loss": 3.6761, "step": 11038 }, { "epoch": 2.9007784018129863, "grad_norm": 0.9767919778823853, "learning_rate": 3.3117224461188013e-06, "loss": 3.7025, "step": 11040 }, { "epoch": 2.9013039051466483, "grad_norm": 0.9696051478385925, "learning_rate": 3.294200105134046e-06, "loss": 3.7026, "step": 11042 }, { "epoch": 2.9018294084803102, "grad_norm": 0.9304065704345703, "learning_rate": 3.2766777641492905e-06, "loss": 3.6532, "step": 11044 }, { "epoch": 2.9023549118139718, "grad_norm": 1.0828412771224976, "learning_rate": 3.259155423164535e-06, "loss": 3.6921, "step": 11046 }, { "epoch": 2.9028804151476337, "grad_norm": 0.9074031710624695, "learning_rate": 3.2416330821797793e-06, "loss": 3.6938, "step": 11048 }, { "epoch": 2.9034059184812953, "grad_norm": 0.9412509799003601, "learning_rate": 3.224110741195024e-06, "loss": 3.724, "step": 11050 }, { "epoch": 2.9039314218149572, "grad_norm": 0.9260215163230896, "learning_rate": 3.2065884002102685e-06, "loss": 3.6904, "step": 11052 }, { "epoch": 2.9044569251486188, "grad_norm": 0.9957168102264404, "learning_rate": 3.189066059225513e-06, "loss": 3.6641, "step": 11054 }, { "epoch": 2.9049824284822807, "grad_norm": 0.9675821661949158, "learning_rate": 3.171543718240757e-06, "loss": 3.6771, "step": 11056 }, { "epoch": 2.9055079318159427, "grad_norm": 1.0789614915847778, "learning_rate": 3.1540213772560015e-06, "loss": 3.7045, "step": 11058 }, { 
"epoch": 2.906033435149604, "grad_norm": 0.929978609085083, "learning_rate": 3.136499036271246e-06, "loss": 3.6868, "step": 11060 }, { "epoch": 2.9065589384832657, "grad_norm": 0.9885504245758057, "learning_rate": 3.1189766952864902e-06, "loss": 3.7217, "step": 11062 }, { "epoch": 2.9070844418169277, "grad_norm": 0.9003853797912598, "learning_rate": 3.101454354301735e-06, "loss": 3.6171, "step": 11064 }, { "epoch": 2.9076099451505897, "grad_norm": 0.8987219333648682, "learning_rate": 3.0839320133169794e-06, "loss": 3.6576, "step": 11066 }, { "epoch": 2.908135448484251, "grad_norm": 0.9357712864875793, "learning_rate": 3.066409672332224e-06, "loss": 3.7199, "step": 11068 }, { "epoch": 2.908660951817913, "grad_norm": 1.0559148788452148, "learning_rate": 3.048887331347468e-06, "loss": 3.7294, "step": 11070 }, { "epoch": 2.9091864551515747, "grad_norm": 0.9609713554382324, "learning_rate": 3.031364990362713e-06, "loss": 3.7019, "step": 11072 }, { "epoch": 2.9097119584852367, "grad_norm": 0.9683553576469421, "learning_rate": 3.013842649377957e-06, "loss": 3.7247, "step": 11074 }, { "epoch": 2.910237461818898, "grad_norm": 1.036950707435608, "learning_rate": 2.9963203083932016e-06, "loss": 3.6806, "step": 11076 }, { "epoch": 2.91076296515256, "grad_norm": 0.9290165305137634, "learning_rate": 2.9787979674084457e-06, "loss": 3.6914, "step": 11078 }, { "epoch": 2.911288468486222, "grad_norm": 0.8758551478385925, "learning_rate": 2.9612756264236903e-06, "loss": 3.7056, "step": 11080 }, { "epoch": 2.9118139718198837, "grad_norm": 1.0238415002822876, "learning_rate": 2.943753285438935e-06, "loss": 3.6593, "step": 11082 }, { "epoch": 2.9123394751535456, "grad_norm": 0.9615103006362915, "learning_rate": 2.9262309444541795e-06, "loss": 3.6704, "step": 11084 }, { "epoch": 2.912864978487207, "grad_norm": 0.904151976108551, "learning_rate": 2.9087086034694237e-06, "loss": 3.5995, "step": 11086 }, { "epoch": 2.913390481820869, "grad_norm": 0.9898989796638489, "learning_rate": 
2.891186262484668e-06, "loss": 3.6581, "step": 11088 }, { "epoch": 2.9139159851545307, "grad_norm": 0.9105849862098694, "learning_rate": 2.8736639214999125e-06, "loss": 3.7174, "step": 11090 }, { "epoch": 2.9144414884881926, "grad_norm": 0.7978295683860779, "learning_rate": 2.8561415805151567e-06, "loss": 3.6687, "step": 11092 }, { "epoch": 2.9149669918218546, "grad_norm": 0.9691533446311951, "learning_rate": 2.8386192395304013e-06, "loss": 3.7278, "step": 11094 }, { "epoch": 2.915492495155516, "grad_norm": 0.9498254060745239, "learning_rate": 2.821096898545646e-06, "loss": 3.6866, "step": 11096 }, { "epoch": 2.9160179984891776, "grad_norm": 0.9723640084266663, "learning_rate": 2.8035745575608905e-06, "loss": 3.7638, "step": 11098 }, { "epoch": 2.9165435018228396, "grad_norm": 0.971335768699646, "learning_rate": 2.7860522165761346e-06, "loss": 3.6943, "step": 11100 }, { "epoch": 2.9170690051565016, "grad_norm": 0.9286007285118103, "learning_rate": 2.7685298755913792e-06, "loss": 3.7279, "step": 11102 }, { "epoch": 2.917594508490163, "grad_norm": 0.9971358776092529, "learning_rate": 2.7510075346066234e-06, "loss": 3.7591, "step": 11104 }, { "epoch": 2.918120011823825, "grad_norm": 0.9882368445396423, "learning_rate": 2.733485193621868e-06, "loss": 3.685, "step": 11106 }, { "epoch": 2.918645515157487, "grad_norm": 1.0125949382781982, "learning_rate": 2.715962852637112e-06, "loss": 3.733, "step": 11108 }, { "epoch": 2.9191710184911486, "grad_norm": 0.9613254070281982, "learning_rate": 2.6984405116523568e-06, "loss": 3.6941, "step": 11110 }, { "epoch": 2.91969652182481, "grad_norm": 0.9492978453636169, "learning_rate": 2.6809181706676014e-06, "loss": 3.728, "step": 11112 }, { "epoch": 2.920222025158472, "grad_norm": 1.0359041690826416, "learning_rate": 2.663395829682846e-06, "loss": 3.7001, "step": 11114 }, { "epoch": 2.920747528492134, "grad_norm": 1.0016144514083862, "learning_rate": 2.64587348869809e-06, "loss": 3.6919, "step": 11116 }, { "epoch": 
2.9212730318257956, "grad_norm": 1.0802834033966064, "learning_rate": 2.6283511477133348e-06, "loss": 3.7171, "step": 11118 }, { "epoch": 2.9217985351594575, "grad_norm": 1.0018072128295898, "learning_rate": 2.610828806728579e-06, "loss": 3.7062, "step": 11120 }, { "epoch": 2.922324038493119, "grad_norm": 0.9562501311302185, "learning_rate": 2.5933064657438235e-06, "loss": 3.7206, "step": 11122 }, { "epoch": 2.922849541826781, "grad_norm": 0.953520655632019, "learning_rate": 2.5757841247590677e-06, "loss": 3.7095, "step": 11124 }, { "epoch": 2.9233750451604426, "grad_norm": 0.9284921288490295, "learning_rate": 2.5582617837743123e-06, "loss": 3.7277, "step": 11126 }, { "epoch": 2.9239005484941045, "grad_norm": 0.8899246454238892, "learning_rate": 2.540739442789557e-06, "loss": 3.6462, "step": 11128 }, { "epoch": 2.9244260518277665, "grad_norm": 1.022702932357788, "learning_rate": 2.5232171018048015e-06, "loss": 3.7022, "step": 11130 }, { "epoch": 2.924951555161428, "grad_norm": 0.8516330718994141, "learning_rate": 2.5056947608200457e-06, "loss": 3.6784, "step": 11132 }, { "epoch": 2.92547705849509, "grad_norm": 0.8872223496437073, "learning_rate": 2.4881724198352903e-06, "loss": 3.6599, "step": 11134 }, { "epoch": 2.9260025618287515, "grad_norm": 0.8943605422973633, "learning_rate": 2.4706500788505345e-06, "loss": 3.7576, "step": 11136 }, { "epoch": 2.9265280651624135, "grad_norm": 0.9909507036209106, "learning_rate": 2.453127737865779e-06, "loss": 3.7419, "step": 11138 }, { "epoch": 2.927053568496075, "grad_norm": 0.9563024640083313, "learning_rate": 2.4356053968810232e-06, "loss": 3.654, "step": 11140 }, { "epoch": 2.927579071829737, "grad_norm": 0.9430238604545593, "learning_rate": 2.418083055896268e-06, "loss": 3.6421, "step": 11142 }, { "epoch": 2.928104575163399, "grad_norm": 0.9158750772476196, "learning_rate": 2.4005607149115124e-06, "loss": 3.7422, "step": 11144 }, { "epoch": 2.9286300784970605, "grad_norm": 0.910660445690155, "learning_rate": 
2.383038373926757e-06, "loss": 3.6797, "step": 11146 }, { "epoch": 2.929155581830722, "grad_norm": 1.015799641609192, "learning_rate": 2.365516032942001e-06, "loss": 3.7141, "step": 11148 }, { "epoch": 2.929681085164384, "grad_norm": 0.8973917365074158, "learning_rate": 2.347993691957246e-06, "loss": 3.6801, "step": 11150 }, { "epoch": 2.930206588498046, "grad_norm": 1.0460809469223022, "learning_rate": 2.33047135097249e-06, "loss": 3.6193, "step": 11152 }, { "epoch": 2.9307320918317075, "grad_norm": 0.9192841649055481, "learning_rate": 2.3129490099877346e-06, "loss": 3.7012, "step": 11154 }, { "epoch": 2.9312575951653694, "grad_norm": 0.9900246858596802, "learning_rate": 2.2954266690029787e-06, "loss": 3.6865, "step": 11156 }, { "epoch": 2.931783098499031, "grad_norm": 0.9606373906135559, "learning_rate": 2.2779043280182233e-06, "loss": 3.7016, "step": 11158 }, { "epoch": 2.932308601832693, "grad_norm": 1.1315178871154785, "learning_rate": 2.260381987033468e-06, "loss": 3.6697, "step": 11160 }, { "epoch": 2.9328341051663545, "grad_norm": 0.9941169619560242, "learning_rate": 2.2428596460487125e-06, "loss": 3.651, "step": 11162 }, { "epoch": 2.9333596085000164, "grad_norm": 1.0116742849349976, "learning_rate": 2.2253373050639567e-06, "loss": 3.6863, "step": 11164 }, { "epoch": 2.9338851118336784, "grad_norm": 0.9675166010856628, "learning_rate": 2.2078149640792013e-06, "loss": 3.6618, "step": 11166 }, { "epoch": 2.93441061516734, "grad_norm": 0.9668328762054443, "learning_rate": 2.1902926230944455e-06, "loss": 3.6676, "step": 11168 }, { "epoch": 2.934936118501002, "grad_norm": 1.0292080640792847, "learning_rate": 2.1727702821096897e-06, "loss": 3.6953, "step": 11170 }, { "epoch": 2.9354616218346634, "grad_norm": 1.0327017307281494, "learning_rate": 2.1552479411249343e-06, "loss": 3.6732, "step": 11172 }, { "epoch": 2.9359871251683254, "grad_norm": 0.9095394015312195, "learning_rate": 2.137725600140179e-06, "loss": 3.6128, "step": 11174 }, { "epoch": 
2.936512628501987, "grad_norm": 0.9454618096351624, "learning_rate": 2.1202032591554235e-06, "loss": 3.641, "step": 11176 }, { "epoch": 2.937038131835649, "grad_norm": 0.9181832075119019, "learning_rate": 2.1026809181706676e-06, "loss": 3.6764, "step": 11178 }, { "epoch": 2.937563635169311, "grad_norm": 1.0516782999038696, "learning_rate": 2.0851585771859122e-06, "loss": 3.7543, "step": 11180 }, { "epoch": 2.9380891385029724, "grad_norm": 0.9288402795791626, "learning_rate": 2.0676362362011564e-06, "loss": 3.6587, "step": 11182 }, { "epoch": 2.938614641836634, "grad_norm": 1.073240041732788, "learning_rate": 2.050113895216401e-06, "loss": 3.7128, "step": 11184 }, { "epoch": 2.939140145170296, "grad_norm": 0.9229915738105774, "learning_rate": 2.032591554231645e-06, "loss": 3.7044, "step": 11186 }, { "epoch": 2.939665648503958, "grad_norm": 0.9631633758544922, "learning_rate": 2.0150692132468898e-06, "loss": 3.6967, "step": 11188 }, { "epoch": 2.9401911518376194, "grad_norm": 0.9696780443191528, "learning_rate": 1.9975468722621344e-06, "loss": 3.6929, "step": 11190 }, { "epoch": 2.9407166551712813, "grad_norm": 0.9244800209999084, "learning_rate": 1.980024531277379e-06, "loss": 3.658, "step": 11192 }, { "epoch": 2.941242158504943, "grad_norm": 0.9176260232925415, "learning_rate": 1.962502190292623e-06, "loss": 3.6884, "step": 11194 }, { "epoch": 2.941767661838605, "grad_norm": 0.9566540122032166, "learning_rate": 1.9449798493078678e-06, "loss": 3.7078, "step": 11196 }, { "epoch": 2.9422931651722664, "grad_norm": 0.9136968851089478, "learning_rate": 1.927457508323112e-06, "loss": 3.6554, "step": 11198 }, { "epoch": 2.9428186685059283, "grad_norm": 0.9758254289627075, "learning_rate": 1.9099351673383565e-06, "loss": 3.7284, "step": 11200 }, { "epoch": 2.9428186685059283, "eval_loss": 3.7147085666656494, "eval_runtime": 464.7694, "eval_samples_per_second": 262.042, "eval_steps_per_second": 8.189, "step": 11200 }, { "epoch": 2.9433441718395903, "grad_norm": 
0.9459012746810913, "learning_rate": 1.892412826353601e-06, "loss": 3.7183, "step": 11202 }, { "epoch": 2.943869675173252, "grad_norm": 0.9936196208000183, "learning_rate": 1.8748904853688455e-06, "loss": 3.6918, "step": 11204 }, { "epoch": 2.944395178506914, "grad_norm": 1.055371880531311, "learning_rate": 1.8573681443840897e-06, "loss": 3.6823, "step": 11206 }, { "epoch": 2.9449206818405753, "grad_norm": 0.9803134202957153, "learning_rate": 1.8398458033993343e-06, "loss": 3.653, "step": 11208 }, { "epoch": 2.9454461851742373, "grad_norm": 1.0045852661132812, "learning_rate": 1.8223234624145787e-06, "loss": 3.6756, "step": 11210 }, { "epoch": 2.945971688507899, "grad_norm": 1.1067330837249756, "learning_rate": 1.8048011214298233e-06, "loss": 3.6959, "step": 11212 }, { "epoch": 2.946497191841561, "grad_norm": 1.0064362287521362, "learning_rate": 1.7872787804450674e-06, "loss": 3.7302, "step": 11214 }, { "epoch": 2.9470226951752228, "grad_norm": 0.9306373000144958, "learning_rate": 1.769756439460312e-06, "loss": 3.7082, "step": 11216 }, { "epoch": 2.9475481985088843, "grad_norm": 1.0234383344650269, "learning_rate": 1.7522340984755564e-06, "loss": 3.7244, "step": 11218 }, { "epoch": 2.948073701842546, "grad_norm": 0.9399264454841614, "learning_rate": 1.734711757490801e-06, "loss": 3.6752, "step": 11220 }, { "epoch": 2.9485992051762078, "grad_norm": 0.9801008701324463, "learning_rate": 1.7171894165060452e-06, "loss": 3.6726, "step": 11222 }, { "epoch": 2.9491247085098697, "grad_norm": 1.0065388679504395, "learning_rate": 1.6996670755212898e-06, "loss": 3.7416, "step": 11224 }, { "epoch": 2.9496502118435313, "grad_norm": 0.994312584400177, "learning_rate": 1.6821447345365342e-06, "loss": 3.6834, "step": 11226 }, { "epoch": 2.9501757151771932, "grad_norm": 0.9343522191047668, "learning_rate": 1.6646223935517788e-06, "loss": 3.6902, "step": 11228 }, { "epoch": 2.9507012185108548, "grad_norm": 0.9800108075141907, "learning_rate": 1.647100052567023e-06, "loss": 3.6601, 
"step": 11230 }, { "epoch": 2.9512267218445167, "grad_norm": 1.0017778873443604, "learning_rate": 1.6295777115822676e-06, "loss": 3.6918, "step": 11232 }, { "epoch": 2.9517522251781783, "grad_norm": 1.1442605257034302, "learning_rate": 1.612055370597512e-06, "loss": 3.7124, "step": 11234 }, { "epoch": 2.9522777285118402, "grad_norm": 0.9175592660903931, "learning_rate": 1.5945330296127566e-06, "loss": 3.6948, "step": 11236 }, { "epoch": 2.952803231845502, "grad_norm": 0.9634783864021301, "learning_rate": 1.5770106886280007e-06, "loss": 3.7241, "step": 11238 }, { "epoch": 2.9533287351791637, "grad_norm": 0.9862359762191772, "learning_rate": 1.5594883476432451e-06, "loss": 3.7067, "step": 11240 }, { "epoch": 2.9538542385128257, "grad_norm": 0.9130534529685974, "learning_rate": 1.5419660066584897e-06, "loss": 3.7, "step": 11242 }, { "epoch": 2.954379741846487, "grad_norm": 0.8364992737770081, "learning_rate": 1.524443665673734e-06, "loss": 3.6962, "step": 11244 }, { "epoch": 2.954905245180149, "grad_norm": 0.8927871584892273, "learning_rate": 1.5069213246889785e-06, "loss": 3.7573, "step": 11246 }, { "epoch": 2.9554307485138107, "grad_norm": 1.0674872398376465, "learning_rate": 1.4893989837042229e-06, "loss": 3.7156, "step": 11248 }, { "epoch": 2.9559562518474727, "grad_norm": 0.9547544717788696, "learning_rate": 1.4718766427194675e-06, "loss": 3.7032, "step": 11250 }, { "epoch": 2.9564817551811347, "grad_norm": 0.9578222036361694, "learning_rate": 1.4543543017347119e-06, "loss": 3.6565, "step": 11252 }, { "epoch": 2.957007258514796, "grad_norm": 1.1051939725875854, "learning_rate": 1.4368319607499562e-06, "loss": 3.6955, "step": 11254 }, { "epoch": 2.9575327618484577, "grad_norm": 0.9126796126365662, "learning_rate": 1.4193096197652006e-06, "loss": 3.7007, "step": 11256 }, { "epoch": 2.9580582651821197, "grad_norm": 1.044482946395874, "learning_rate": 1.4017872787804452e-06, "loss": 3.7551, "step": 11258 }, { "epoch": 2.9585837685157816, "grad_norm": 
0.9563342332839966, "learning_rate": 1.3842649377956896e-06, "loss": 3.7083, "step": 11260 }, { "epoch": 2.959109271849443, "grad_norm": 0.9194144606590271, "learning_rate": 1.366742596810934e-06, "loss": 3.7229, "step": 11262 }, { "epoch": 2.959634775183105, "grad_norm": 0.9449466466903687, "learning_rate": 1.3492202558261784e-06, "loss": 3.7105, "step": 11264 }, { "epoch": 2.960160278516767, "grad_norm": 0.9076125621795654, "learning_rate": 1.331697914841423e-06, "loss": 3.7279, "step": 11266 }, { "epoch": 2.9606857818504286, "grad_norm": 0.8447644114494324, "learning_rate": 1.3141755738566674e-06, "loss": 3.7202, "step": 11268 }, { "epoch": 2.96121128518409, "grad_norm": 1.0400211811065674, "learning_rate": 1.2966532328719118e-06, "loss": 3.6873, "step": 11270 }, { "epoch": 2.961736788517752, "grad_norm": 0.9404429793357849, "learning_rate": 1.2791308918871562e-06, "loss": 3.7205, "step": 11272 }, { "epoch": 2.962262291851414, "grad_norm": 0.98611980676651, "learning_rate": 1.2616085509024008e-06, "loss": 3.6707, "step": 11274 }, { "epoch": 2.9627877951850756, "grad_norm": 0.9569012522697449, "learning_rate": 1.2440862099176451e-06, "loss": 3.6776, "step": 11276 }, { "epoch": 2.9633132985187376, "grad_norm": 0.9503495693206787, "learning_rate": 1.2265638689328895e-06, "loss": 3.6923, "step": 11278 }, { "epoch": 2.963838801852399, "grad_norm": 0.9663078188896179, "learning_rate": 1.209041527948134e-06, "loss": 3.6671, "step": 11280 }, { "epoch": 2.964364305186061, "grad_norm": 0.9090290069580078, "learning_rate": 1.1915191869633785e-06, "loss": 3.6785, "step": 11282 }, { "epoch": 2.9648898085197226, "grad_norm": 0.9387456178665161, "learning_rate": 1.173996845978623e-06, "loss": 3.6796, "step": 11284 }, { "epoch": 2.9654153118533846, "grad_norm": 0.9760780334472656, "learning_rate": 1.1564745049938673e-06, "loss": 3.7309, "step": 11286 }, { "epoch": 2.9659408151870466, "grad_norm": 0.9806252717971802, "learning_rate": 1.1389521640091117e-06, "loss": 3.7108, 
"step": 11288 }, { "epoch": 2.966466318520708, "grad_norm": 0.9391472339630127, "learning_rate": 1.1214298230243563e-06, "loss": 3.6635, "step": 11290 }, { "epoch": 2.96699182185437, "grad_norm": 0.93305504322052, "learning_rate": 1.1039074820396007e-06, "loss": 3.6441, "step": 11292 }, { "epoch": 2.9675173251880316, "grad_norm": 0.9926801919937134, "learning_rate": 1.0863851410548448e-06, "loss": 3.6965, "step": 11294 }, { "epoch": 2.9680428285216935, "grad_norm": 0.847822904586792, "learning_rate": 1.0688628000700894e-06, "loss": 3.7427, "step": 11296 }, { "epoch": 2.968568331855355, "grad_norm": 1.0313341617584229, "learning_rate": 1.0513404590853338e-06, "loss": 3.7855, "step": 11298 }, { "epoch": 2.969093835189017, "grad_norm": 0.9803759455680847, "learning_rate": 1.0338181181005782e-06, "loss": 3.6963, "step": 11300 }, { "epoch": 2.969619338522679, "grad_norm": 0.9713161587715149, "learning_rate": 1.0162957771158226e-06, "loss": 3.7178, "step": 11302 }, { "epoch": 2.9701448418563405, "grad_norm": 0.8928664326667786, "learning_rate": 9.987734361310672e-07, "loss": 3.7628, "step": 11304 }, { "epoch": 2.970670345190002, "grad_norm": 0.9056347608566284, "learning_rate": 9.812510951463116e-07, "loss": 3.7203, "step": 11306 }, { "epoch": 2.971195848523664, "grad_norm": 0.9165859222412109, "learning_rate": 9.63728754161556e-07, "loss": 3.6937, "step": 11308 }, { "epoch": 2.971721351857326, "grad_norm": 0.8519420027732849, "learning_rate": 9.462064131768005e-07, "loss": 3.6744, "step": 11310 }, { "epoch": 2.9722468551909875, "grad_norm": 0.9186609387397766, "learning_rate": 9.286840721920448e-07, "loss": 3.7137, "step": 11312 }, { "epoch": 2.9727723585246495, "grad_norm": 0.9123014211654663, "learning_rate": 9.111617312072893e-07, "loss": 3.6953, "step": 11314 }, { "epoch": 2.973297861858311, "grad_norm": 0.8902423977851868, "learning_rate": 8.936393902225337e-07, "loss": 3.6923, "step": 11316 }, { "epoch": 2.973823365191973, "grad_norm": 0.9510392546653748, 
"learning_rate": 8.761170492377782e-07, "loss": 3.6608, "step": 11318 }, { "epoch": 2.9743488685256345, "grad_norm": 0.9857866764068604, "learning_rate": 8.585947082530226e-07, "loss": 3.707, "step": 11320 }, { "epoch": 2.9748743718592965, "grad_norm": 0.9394118785858154, "learning_rate": 8.410723672682671e-07, "loss": 3.7301, "step": 11322 }, { "epoch": 2.9753998751929585, "grad_norm": 0.9672693014144897, "learning_rate": 8.235500262835115e-07, "loss": 3.7154, "step": 11324 }, { "epoch": 2.97592537852662, "grad_norm": 0.9429049491882324, "learning_rate": 8.06027685298756e-07, "loss": 3.6893, "step": 11326 }, { "epoch": 2.976450881860282, "grad_norm": 0.9729042053222656, "learning_rate": 7.885053443140004e-07, "loss": 3.7151, "step": 11328 }, { "epoch": 2.9769763851939435, "grad_norm": 1.0448862314224243, "learning_rate": 7.709830033292449e-07, "loss": 3.6757, "step": 11330 }, { "epoch": 2.9775018885276054, "grad_norm": 1.0678012371063232, "learning_rate": 7.534606623444892e-07, "loss": 3.677, "step": 11332 }, { "epoch": 2.978027391861267, "grad_norm": 0.952526330947876, "learning_rate": 7.359383213597337e-07, "loss": 3.7275, "step": 11334 }, { "epoch": 2.978552895194929, "grad_norm": 0.8479202389717102, "learning_rate": 7.184159803749781e-07, "loss": 3.6922, "step": 11336 }, { "epoch": 2.979078398528591, "grad_norm": 0.9510994553565979, "learning_rate": 7.008936393902226e-07, "loss": 3.7493, "step": 11338 }, { "epoch": 2.9796039018622524, "grad_norm": 1.1186846494674683, "learning_rate": 6.83371298405467e-07, "loss": 3.7053, "step": 11340 }, { "epoch": 2.980129405195914, "grad_norm": 0.9296010136604309, "learning_rate": 6.658489574207115e-07, "loss": 3.7048, "step": 11342 }, { "epoch": 2.980654908529576, "grad_norm": 1.0048638582229614, "learning_rate": 6.483266164359559e-07, "loss": 3.6796, "step": 11344 }, { "epoch": 2.981180411863238, "grad_norm": 0.8815770149230957, "learning_rate": 6.308042754512004e-07, "loss": 3.68, "step": 11346 }, { "epoch": 
2.9817059151968994, "grad_norm": 1.0092130899429321, "learning_rate": 6.132819344664448e-07, "loss": 3.6783, "step": 11348 }, { "epoch": 2.9822314185305614, "grad_norm": 1.0745313167572021, "learning_rate": 5.957595934816893e-07, "loss": 3.6358, "step": 11350 }, { "epoch": 2.982756921864223, "grad_norm": 0.9381189942359924, "learning_rate": 5.782372524969336e-07, "loss": 3.6829, "step": 11352 }, { "epoch": 2.983282425197885, "grad_norm": 1.1586989164352417, "learning_rate": 5.607149115121781e-07, "loss": 3.6699, "step": 11354 }, { "epoch": 2.9838079285315464, "grad_norm": 0.9051302075386047, "learning_rate": 5.431925705274224e-07, "loss": 3.7638, "step": 11356 }, { "epoch": 2.9843334318652084, "grad_norm": 0.9267231225967407, "learning_rate": 5.256702295426669e-07, "loss": 3.6673, "step": 11358 }, { "epoch": 2.9848589351988704, "grad_norm": 0.9592224359512329, "learning_rate": 5.081478885579113e-07, "loss": 3.7007, "step": 11360 }, { "epoch": 2.985384438532532, "grad_norm": 0.953863263130188, "learning_rate": 4.906255475731558e-07, "loss": 3.6676, "step": 11362 }, { "epoch": 2.985909941866194, "grad_norm": 0.9297665953636169, "learning_rate": 4.7310320658840023e-07, "loss": 3.7048, "step": 11364 }, { "epoch": 2.9864354451998554, "grad_norm": 0.9699782133102417, "learning_rate": 4.5558086560364467e-07, "loss": 3.688, "step": 11366 }, { "epoch": 2.9869609485335173, "grad_norm": 1.09873366355896, "learning_rate": 4.380585246188891e-07, "loss": 3.6843, "step": 11368 }, { "epoch": 2.987486451867179, "grad_norm": 0.9258518218994141, "learning_rate": 4.2053618363413355e-07, "loss": 3.7283, "step": 11370 }, { "epoch": 2.988011955200841, "grad_norm": 0.8986594676971436, "learning_rate": 4.03013842649378e-07, "loss": 3.6799, "step": 11372 }, { "epoch": 2.988537458534503, "grad_norm": 0.9104061126708984, "learning_rate": 3.8549150166462243e-07, "loss": 3.6957, "step": 11374 }, { "epoch": 2.9890629618681643, "grad_norm": 0.9657348394393921, "learning_rate": 
3.6796916067986687e-07, "loss": 3.6939, "step": 11376 }, { "epoch": 2.989588465201826, "grad_norm": 0.9100803732872009, "learning_rate": 3.504468196951113e-07, "loss": 3.7441, "step": 11378 }, { "epoch": 2.990113968535488, "grad_norm": 0.9651926755905151, "learning_rate": 3.3292447871035575e-07, "loss": 3.6936, "step": 11380 }, { "epoch": 2.99063947186915, "grad_norm": 0.9665604829788208, "learning_rate": 3.154021377256002e-07, "loss": 3.6913, "step": 11382 }, { "epoch": 2.9911649752028113, "grad_norm": 0.9306269288063049, "learning_rate": 2.9787979674084463e-07, "loss": 3.699, "step": 11384 }, { "epoch": 2.9916904785364733, "grad_norm": 0.934059739112854, "learning_rate": 2.8035745575608907e-07, "loss": 3.6914, "step": 11386 }, { "epoch": 2.992215981870135, "grad_norm": 1.0007867813110352, "learning_rate": 2.6283511477133345e-07, "loss": 3.7237, "step": 11388 }, { "epoch": 2.992741485203797, "grad_norm": 0.9055240154266357, "learning_rate": 2.453127737865779e-07, "loss": 3.6827, "step": 11390 }, { "epoch": 2.9932669885374583, "grad_norm": 0.9196228981018066, "learning_rate": 2.2779043280182233e-07, "loss": 3.7581, "step": 11392 }, { "epoch": 2.9937924918711203, "grad_norm": 0.983193576335907, "learning_rate": 2.1026809181706677e-07, "loss": 3.6781, "step": 11394 }, { "epoch": 2.9943179952047823, "grad_norm": 1.051737904548645, "learning_rate": 1.9274575083231121e-07, "loss": 3.6825, "step": 11396 }, { "epoch": 2.994843498538444, "grad_norm": 0.915049135684967, "learning_rate": 1.7522340984755565e-07, "loss": 3.7056, "step": 11398 }, { "epoch": 2.9953690018721058, "grad_norm": 0.9368720650672913, "learning_rate": 1.577010688628001e-07, "loss": 3.6578, "step": 11400 }, { "epoch": 2.9958945052057673, "grad_norm": 0.8954070806503296, "learning_rate": 1.4017872787804453e-07, "loss": 3.6824, "step": 11402 }, { "epoch": 2.9964200085394292, "grad_norm": 0.8923438787460327, "learning_rate": 1.2265638689328895e-07, "loss": 3.6827, "step": 11404 }, { "epoch": 
2.9969455118730908, "grad_norm": 0.9434048533439636, "learning_rate": 1.0513404590853339e-07, "loss": 3.6808, "step": 11406 }, { "epoch": 2.9974710152067527, "grad_norm": 0.9715319871902466, "learning_rate": 8.761170492377783e-08, "loss": 3.6831, "step": 11408 }, { "epoch": 2.9979965185404147, "grad_norm": 0.8942244052886963, "learning_rate": 7.008936393902227e-08, "loss": 3.716, "step": 11410 }, { "epoch": 2.9985220218740762, "grad_norm": 0.9693686366081238, "learning_rate": 5.2567022954266694e-08, "loss": 3.6936, "step": 11412 }, { "epoch": 2.9990475252077378, "grad_norm": 0.9845726490020752, "learning_rate": 3.5044681969511133e-08, "loss": 3.6902, "step": 11414 }, { "epoch": 2.999310276874569, "step": 11415, "total_flos": 8.413467008941425e+17, "train_loss": 3.875629249185102, "train_runtime": 55054.2966, "train_samples_per_second": 53.092, "train_steps_per_second": 0.207 }, { "epoch": 2.999310276874569, "eval_loss": 3.713578701019287, "eval_runtime": 464.8285, "eval_samples_per_second": 262.008, "eval_steps_per_second": 8.188, "step": 11415 }, { "epoch": 2.999310276874569, "eval_loss": 3.7107303142547607, "eval_runtime": 463.0078, "eval_samples_per_second": 263.039, "eval_steps_per_second": 8.22, "step": 11415 } ], "logging_steps": 2, "max_steps": 11415, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.413467008941425e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }