{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 928, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008620689655172414, "grad_norm": 0.2490028440952301, "learning_rate": 8.510638297872341e-07, "loss": 3.7051703929901123, "step": 2 }, { "epoch": 0.017241379310344827, "grad_norm": 0.9354268908500671, "learning_rate": 2.553191489361702e-06, "loss": 2.5432679653167725, "step": 4 }, { "epoch": 0.02586206896551724, "grad_norm": 1.1884268522262573, "learning_rate": 4.255319148936171e-06, "loss": 2.127363681793213, "step": 6 }, { "epoch": 0.034482758620689655, "grad_norm": 0.21074028313159943, "learning_rate": 5.957446808510638e-06, "loss": 1.8431488275527954, "step": 8 }, { "epoch": 0.04310344827586207, "grad_norm": 1.0560111999511719, "learning_rate": 7.659574468085107e-06, "loss": 1.2361581325531006, "step": 10 }, { "epoch": 0.05172413793103448, "grad_norm": 0.1471565067768097, "learning_rate": 9.361702127659576e-06, "loss": 1.7195181846618652, "step": 12 }, { "epoch": 0.0603448275862069, "grad_norm": 0.28380081057548523, "learning_rate": 1.1063829787234044e-05, "loss": 1.7923121452331543, "step": 14 }, { "epoch": 0.06896551724137931, "grad_norm": 0.16648316383361816, "learning_rate": 1.2765957446808513e-05, "loss": 0.7930053472518921, "step": 16 }, { "epoch": 0.07758620689655173, "grad_norm": 0.662397027015686, "learning_rate": 1.4468085106382981e-05, "loss": 0.8174023032188416, "step": 18 }, { "epoch": 0.08620689655172414, "grad_norm": 0.13494855165481567, "learning_rate": 1.6170212765957446e-05, "loss": 1.5101033449172974, "step": 20 }, { "epoch": 0.09482758620689655, "grad_norm": 0.1281285285949707, "learning_rate": 1.7872340425531915e-05, "loss": 1.6496508121490479, "step": 22 }, { "epoch": 0.10344827586206896, "grad_norm": 0.19361676275730133, "learning_rate": 1.9574468085106384e-05, "loss": 1.5436862707138062, "step": 24 }, { "epoch": 0.11206896551724138, "grad_norm": 0.26801514625549316, "learning_rate": 2.1276595744680852e-05, "loss": 1.258086919784546, "step": 26 }, { "epoch": 0.1206896551724138, "grad_norm": 0.6263072490692139, "learning_rate": 2.2978723404255324e-05, "loss": 0.4653662145137787, "step": 28 }, { "epoch": 0.12931034482758622, "grad_norm": 0.09440683573484421, "learning_rate": 2.468085106382979e-05, "loss": 1.5492061376571655, "step": 30 }, { "epoch": 0.13793103448275862, "grad_norm": 0.18837958574295044, "learning_rate": 2.6382978723404255e-05, "loss": 1.1802101135253906, "step": 32 }, { "epoch": 0.14655172413793102, "grad_norm": 0.13221898674964905, "learning_rate": 2.8085106382978727e-05, "loss": 1.5317002534866333, "step": 34 }, { "epoch": 0.15517241379310345, "grad_norm": 0.09329993277788162, "learning_rate": 2.9787234042553192e-05, "loss": 1.4935221672058105, "step": 36 }, { "epoch": 0.16379310344827586, "grad_norm": 0.1460646688938141, "learning_rate": 3.1489361702127664e-05, "loss": 1.4348247051239014, "step": 38 }, { "epoch": 0.1724137931034483, "grad_norm": 0.11409169435501099, "learning_rate": 3.319148936170213e-05, "loss": 1.6168015003204346, "step": 40 }, { "epoch": 0.1810344827586207, "grad_norm": 0.212503582239151, "learning_rate": 3.48936170212766e-05, "loss": 1.7296580076217651, "step": 42 }, { "epoch": 0.1896551724137931, "grad_norm": 0.2593461275100708, "learning_rate": 3.6595744680851066e-05, "loss": 1.0080050230026245, "step": 44 }, { "epoch": 0.19827586206896552, "grad_norm": 0.15509171783924103, "learning_rate": 3.829787234042554e-05, "loss": 1.2960833311080933, "step": 46 }, { "epoch": 0.20689655172413793, "grad_norm": 0.11303484439849854, "learning_rate": 4e-05, "loss": 1.463219404220581, "step": 48 }, { "epoch": 0.21551724137931033, "grad_norm": 0.10671091824769974, "learning_rate": 3.999954222867108e-05, "loss": 1.5820165872573853, "step": 50 }, { "epoch": 0.22413793103448276, "grad_norm": 0.2280566394329071, "learning_rate": 3.999816893796815e-05, "loss": 0.7817078232765198, "step": 52 }, { "epoch": 0.23275862068965517, "grad_norm": 0.0907350480556488, "learning_rate": 3.9995880197741576e-05, "loss": 1.5489472150802612, "step": 54 }, { "epoch": 0.2413793103448276, "grad_norm": 0.11344298720359802, "learning_rate": 3.999267612440463e-05, "loss": 1.447698950767517, "step": 56 }, { "epoch": 0.25, "grad_norm": 0.14231091737747192, "learning_rate": 3.9988556880927647e-05, "loss": 1.4835537672042847, "step": 58 }, { "epoch": 0.25862068965517243, "grad_norm": 0.10744752734899521, "learning_rate": 3.998352267682969e-05, "loss": 1.4630577564239502, "step": 60 }, { "epoch": 0.2672413793103448, "grad_norm": 0.33425694704055786, "learning_rate": 3.99775737681679e-05, "loss": 1.2140964269638062, "step": 62 }, { "epoch": 0.27586206896551724, "grad_norm": 0.0645914226770401, "learning_rate": 3.9970710457524474e-05, "loss": 1.4031065702438354, "step": 64 }, { "epoch": 0.28448275862068967, "grad_norm": 0.07510636001825333, "learning_rate": 3.9962933093991296e-05, "loss": 1.3692433834075928, "step": 66 }, { "epoch": 0.29310344827586204, "grad_norm": 0.06527955085039139, "learning_rate": 3.995424207315214e-05, "loss": 1.3801196813583374, "step": 68 }, { "epoch": 0.3017241379310345, "grad_norm": 0.12786391377449036, "learning_rate": 3.994463783706259e-05, "loss": 1.151434063911438, "step": 70 }, { "epoch": 0.3103448275862069, "grad_norm": 0.19329607486724854, "learning_rate": 3.9934120874227505e-05, "loss": 1.0419366359710693, "step": 72 }, { "epoch": 0.31896551724137934, "grad_norm": 0.08128409832715988, "learning_rate": 3.992269171957624e-05, "loss": 1.2281261682510376, "step": 74 }, { "epoch": 0.3275862068965517, "grad_norm": 0.1377028524875641, "learning_rate": 3.991035095443538e-05, "loss": 1.3645248413085938, "step": 76 }, { "epoch": 0.33620689655172414, "grad_norm": 0.05881645902991295, "learning_rate": 3.9897099206499204e-05, "loss": 1.2278920412063599, "step": 78 }, { "epoch": 0.3448275862068966, "grad_norm": 0.15875652432441711, "learning_rate": 3.9882937149797735e-05, "loss": 1.0160530805587769, "step": 80 }, { "epoch": 0.35344827586206895, "grad_norm": 0.09680726379156113, "learning_rate": 3.986786550466246e-05, "loss": 1.4256402254104614, "step": 82 }, { "epoch": 0.3620689655172414, "grad_norm": 0.30841922760009766, "learning_rate": 3.985188503768975e-05, "loss": 1.318834900856018, "step": 84 }, { "epoch": 0.3706896551724138, "grad_norm": 0.14286209642887115, "learning_rate": 3.983499656170176e-05, "loss": 1.474804162979126, "step": 86 }, { "epoch": 0.3793103448275862, "grad_norm": 0.11407710611820221, "learning_rate": 3.981720093570517e-05, "loss": 1.3328633308410645, "step": 88 }, { "epoch": 0.3879310344827586, "grad_norm": 0.4702969193458557, "learning_rate": 3.9798499064847466e-05, "loss": 1.1703399419784546, "step": 90 }, { "epoch": 0.39655172413793105, "grad_norm": 0.13214579224586487, "learning_rate": 3.9778891900370905e-05, "loss": 1.0334569215774536, "step": 92 }, { "epoch": 0.4051724137931034, "grad_norm": 0.17201748490333557, "learning_rate": 3.9758380439564117e-05, "loss": 1.1891283988952637, "step": 94 }, { "epoch": 0.41379310344827586, "grad_norm": 0.2221606969833374, "learning_rate": 3.97369657257114e-05, "loss": 1.4113942384719849, "step": 96 }, { "epoch": 0.4224137931034483, "grad_norm": 0.39266011118888855, "learning_rate": 3.9714648848039655e-05, "loss": 1.2900447845458984, "step": 98 }, { "epoch": 0.43103448275862066, "grad_norm": 0.09964156895875931, "learning_rate": 3.969143094166295e-05, "loss": 1.270521640777588, "step": 100 }, { "epoch": 0.4396551724137931, "grad_norm": 0.5467281937599182, "learning_rate": 3.966731318752484e-05, "loss": 1.2587134838104248, "step": 102 }, { "epoch": 0.4482758620689655, "grad_norm": 0.10468795150518417, "learning_rate": 3.964229681233825e-05, "loss": 1.3480840921401978, "step": 104 }, { "epoch": 0.45689655172413796, "grad_norm": 0.14980582892894745, "learning_rate": 3.961638308852309e-05, "loss": 1.0994645357131958, "step": 106 }, { "epoch": 0.46551724137931033, "grad_norm": 0.22398428618907928, "learning_rate": 3.958957333414157e-05, "loss": 1.233306646347046, "step": 108 }, { "epoch": 0.47413793103448276, "grad_norm": 0.20820066332817078, "learning_rate": 3.9561868912831135e-05, "loss": 1.2420070171356201, "step": 110 }, { "epoch": 0.4827586206896552, "grad_norm": 0.07252290099859238, "learning_rate": 3.953327123373506e-05, "loss": 1.5031483173370361, "step": 112 }, { "epoch": 0.49137931034482757, "grad_norm": 0.30114564299583435, "learning_rate": 3.950378175143088e-05, "loss": 1.2730351686477661, "step": 114 }, { "epoch": 0.5, "grad_norm": 0.5594906806945801, "learning_rate": 3.947340196585631e-05, "loss": 1.0227445363998413, "step": 116 }, { "epoch": 0.5086206896551724, "grad_norm": 0.06979751586914062, "learning_rate": 3.944213342223299e-05, "loss": 1.3545396327972412, "step": 118 }, { "epoch": 0.5172413793103449, "grad_norm": 0.13030360639095306, "learning_rate": 3.9409977710987896e-05, "loss": 1.332112431526184, "step": 120 }, { "epoch": 0.5258620689655172, "grad_norm": 0.0672382041811943, "learning_rate": 3.937693646767245e-05, "loss": 1.3639230728149414, "step": 122 }, { "epoch": 0.5344827586206896, "grad_norm": 0.13222964107990265, "learning_rate": 3.9343011372879275e-05, "loss": 1.5418974161148071, "step": 124 }, { "epoch": 0.5431034482758621, "grad_norm": 0.09776262193918228, "learning_rate": 3.930820415215681e-05, "loss": 1.6416376829147339, "step": 126 }, { "epoch": 0.5517241379310345, "grad_norm": 0.058448825031518936, "learning_rate": 3.927251657592146e-05, "loss": 1.1251301765441895, "step": 128 }, { "epoch": 0.5603448275862069, "grad_norm": 0.052552610635757446, "learning_rate": 3.923595045936757e-05, "loss": 1.3253697156906128, "step": 130 }, { "epoch": 0.5689655172413793, "grad_norm": 0.19469662010669708, "learning_rate": 3.919850766237512e-05, "loss": 1.263968586921692, "step": 132 }, { "epoch": 0.5775862068965517, "grad_norm": 0.17169688642024994, "learning_rate": 3.9160190089415106e-05, "loss": 0.7878425717353821, "step": 134 }, { "epoch": 0.5862068965517241, "grad_norm": 0.07661473006010056, "learning_rate": 3.912099968945268e-05, "loss": 1.0626349449157715, "step": 136 }, { "epoch": 0.5948275862068966, "grad_norm": 0.04046183452010155, "learning_rate": 3.908093845584798e-05, "loss": 1.05846107006073, "step": 138 }, { "epoch": 0.603448275862069, "grad_norm": 0.09312810003757477, "learning_rate": 3.9040008426254824e-05, "loss": 1.0136967897415161, "step": 140 }, { "epoch": 0.6120689655172413, "grad_norm": 0.06341353058815002, "learning_rate": 3.8998211682516976e-05, "loss": 1.0979063510894775, "step": 142 }, { "epoch": 0.6206896551724138, "grad_norm": 0.07724281400442123, "learning_rate": 3.895555035056233e-05, "loss": 1.2308785915374756, "step": 144 }, { "epoch": 0.6293103448275862, "grad_norm": 0.06727743148803711, "learning_rate": 3.891202660029474e-05, "loss": 1.3009754419326782, "step": 146 }, { "epoch": 0.6379310344827587, "grad_norm": 0.06898898631334305, "learning_rate": 3.886764264548363e-05, "loss": 1.3296358585357666, "step": 148 }, { "epoch": 0.646551724137931, "grad_norm": 0.09349878877401352, "learning_rate": 3.882240074365145e-05, "loss": 1.2398273944854736, "step": 150 }, { "epoch": 0.6551724137931034, "grad_norm": 0.17715542018413544, "learning_rate": 3.8776303195958814e-05, "loss": 0.9495888948440552, "step": 152 }, { "epoch": 0.6637931034482759, "grad_norm": 0.7314029932022095, "learning_rate": 3.872935234708747e-05, "loss": 1.1250660419464111, "step": 154 }, { "epoch": 0.6724137931034483, "grad_norm": 0.24987009167671204, "learning_rate": 3.868155058512102e-05, "loss": 1.2095718383789062, "step": 156 }, { "epoch": 0.6810344827586207, "grad_norm": 0.14576859772205353, "learning_rate": 3.8632900341423464e-05, "loss": 1.295078992843628, "step": 158 }, { "epoch": 0.6896551724137931, "grad_norm": 0.11293840408325195, "learning_rate": 3.858340409051558e-05, "loss": 1.338647723197937, "step": 160 }, { "epoch": 0.6982758620689655, "grad_norm": 0.14143511652946472, "learning_rate": 3.853306434994895e-05, "loss": 1.6283466815948486, "step": 162 }, { "epoch": 0.7068965517241379, "grad_norm": 0.06157878786325455, "learning_rate": 3.848188368017803e-05, "loss": 1.055685043334961, "step": 164 }, { "epoch": 0.7155172413793104, "grad_norm": 0.10017464309930801, "learning_rate": 3.8429864684429846e-05, "loss": 1.3115266561508179, "step": 166 }, { "epoch": 0.7241379310344828, "grad_norm": 0.1262677162885666, "learning_rate": 3.837701000857159e-05, "loss": 1.3648704290390015, "step": 168 }, { "epoch": 0.7327586206896551, "grad_norm": 0.1431019902229309, "learning_rate": 3.832332234097606e-05, "loss": 1.2736293077468872, "step": 170 }, { "epoch": 0.7413793103448276, "grad_norm": 0.3130134046077728, "learning_rate": 3.8268804412384936e-05, "loss": 1.2950979471206665, "step": 172 }, { "epoch": 0.75, "grad_norm": 0.10647283494472504, "learning_rate": 3.821345899576982e-05, "loss": 0.9605190753936768, "step": 174 }, { "epoch": 0.7586206896551724, "grad_norm": 0.16715826094150543, "learning_rate": 3.815728890619127e-05, "loss": 0.7583910226821899, "step": 176 }, { "epoch": 0.7672413793103449, "grad_norm": 0.21642223000526428, "learning_rate": 3.8100297000655566e-05, "loss": 1.3208308219909668, "step": 178 }, { "epoch": 0.7758620689655172, "grad_norm": 0.09079600870609283, "learning_rate": 3.804248617796941e-05, "loss": 1.3078787326812744, "step": 180 }, { "epoch": 0.7844827586206896, "grad_norm": 0.08445187658071518, "learning_rate": 3.798385937859249e-05, "loss": 1.0150703191757202, "step": 182 }, { "epoch": 0.7931034482758621, "grad_norm": 0.07777632772922516, "learning_rate": 3.79244195844879e-05, "loss": 0.8019794225692749, "step": 184 }, { "epoch": 0.8017241379310345, "grad_norm": 0.0645889863371849, "learning_rate": 3.7864169818970465e-05, "loss": 1.423434853553772, "step": 186 }, { "epoch": 0.8103448275862069, "grad_norm": 0.14517554640769958, "learning_rate": 3.7803113146553e-05, "loss": 1.5573757886886597, "step": 188 }, { "epoch": 0.8189655172413793, "grad_norm": 0.08607929199934006, "learning_rate": 3.774125267279041e-05, "loss": 1.2926381826400757, "step": 190 }, { "epoch": 0.8275862068965517, "grad_norm": 0.07077501714229584, "learning_rate": 3.767859154412171e-05, "loss": 1.2611286640167236, "step": 192 }, { "epoch": 0.8362068965517241, "grad_norm": 0.08681857585906982, "learning_rate": 3.7615132947710036e-05, "loss": 1.3005847930908203, "step": 194 }, { "epoch": 0.8448275862068966, "grad_norm": 0.15100322663784027, "learning_rate": 3.755088011128049e-05, "loss": 1.3176685571670532, "step": 196 }, { "epoch": 0.853448275862069, "grad_norm": 0.1908247470855713, "learning_rate": 3.7485836302956016e-05, "loss": 1.2926079034805298, "step": 198 }, { "epoch": 0.8620689655172413, "grad_norm": 0.11072229593992233, "learning_rate": 3.7420004831091105e-05, "loss": 1.290125846862793, "step": 200 }, { "epoch": 0.8706896551724138, "grad_norm": 0.10266255587339401, "learning_rate": 3.735338904410358e-05, "loss": 1.333167552947998, "step": 202 }, { "epoch": 0.8793103448275862, "grad_norm": 0.12212225794792175, "learning_rate": 3.728599233030425e-05, "loss": 0.894460916519165, "step": 204 }, { "epoch": 0.8879310344827587, "grad_norm": 0.07256551086902618, "learning_rate": 3.72178181177246e-05, "loss": 1.2725472450256348, "step": 206 }, { "epoch": 0.896551724137931, "grad_norm": 0.12780705094337463, "learning_rate": 3.714886987394238e-05, "loss": 1.3160998821258545, "step": 208 }, { "epoch": 0.9051724137931034, "grad_norm": 0.9440551400184631, "learning_rate": 3.70791511059053e-05, "loss": 0.6522892117500305, "step": 210 }, { "epoch": 0.9137931034482759, "grad_norm": 0.09754825383424759, "learning_rate": 3.700866535975256e-05, "loss": 0.9885504245758057, "step": 212 }, { "epoch": 0.9224137931034483, "grad_norm": 0.4700559675693512, "learning_rate": 3.69374162206346e-05, "loss": 1.2210713624954224, "step": 214 }, { "epoch": 0.9310344827586207, "grad_norm": 0.15046778321266174, "learning_rate": 3.6865407312530635e-05, "loss": 1.2765154838562012, "step": 216 }, { "epoch": 0.9396551724137931, "grad_norm": 0.11940351128578186, "learning_rate": 3.67926422980644e-05, "loss": 1.0038096904754639, "step": 218 }, { "epoch": 0.9482758620689655, "grad_norm": 0.11115628480911255, "learning_rate": 3.671912487831783e-05, "loss": 0.9949377179145813, "step": 220 }, { "epoch": 0.9568965517241379, "grad_norm": 0.07952730357646942, "learning_rate": 3.664485879264279e-05, "loss": 1.3989291191101074, "step": 222 }, { "epoch": 0.9655172413793104, "grad_norm": 0.09136626124382019, "learning_rate": 3.656984781847094e-05, "loss": 0.9785476922988892, "step": 224 }, { "epoch": 0.9741379310344828, "grad_norm": 0.057783424854278564, "learning_rate": 3.649409577112152e-05, "loss": 0.9384239315986633, "step": 226 }, { "epoch": 0.9827586206896551, "grad_norm": 0.07152694463729858, "learning_rate": 3.641760650360736e-05, "loss": 1.2639554738998413, "step": 228 }, { "epoch": 0.9913793103448276, "grad_norm": 0.33899354934692383, "learning_rate": 3.634038390643886e-05, "loss": 0.9682251811027527, "step": 230 }, { "epoch": 1.0, "grad_norm": 0.07940968871116638, "learning_rate": 3.626243190742613e-05, "loss": 0.870396614074707, "step": 232 }, { "epoch": 1.0086206896551724, "grad_norm": 0.12955711781978607, "learning_rate": 3.618375447147918e-05, "loss": 0.9028921723365784, "step": 234 }, { "epoch": 1.0172413793103448, "grad_norm": 0.18779002130031586, "learning_rate": 3.6104355600406284e-05, "loss": 0.7830209136009216, "step": 236 }, { "epoch": 1.0258620689655173, "grad_norm": 0.9097674489021301, "learning_rate": 3.6024239332710415e-05, "loss": 0.8674835562705994, "step": 238 }, { "epoch": 1.0344827586206897, "grad_norm": 0.27647626399993896, "learning_rate": 3.5943409743383826e-05, "loss": 0.6074855327606201, "step": 240 }, { "epoch": 1.043103448275862, "grad_norm": 0.11918103694915771, "learning_rate": 3.586187094370079e-05, "loss": 1.2056649923324585, "step": 242 }, { "epoch": 1.0517241379310345, "grad_norm": 0.11876146495342255, "learning_rate": 3.577962708100851e-05, "loss": 0.39286842942237854, "step": 244 }, { "epoch": 1.0603448275862069, "grad_norm": 0.12174484878778458, "learning_rate": 3.569668233851613e-05, "loss": 0.9662060141563416, "step": 246 }, { "epoch": 1.0689655172413792, "grad_norm": 0.06474713236093521, "learning_rate": 3.561304093508198e-05, "loss": 0.9487460851669312, "step": 248 }, { "epoch": 1.0775862068965518, "grad_norm": 0.1427253633737564, "learning_rate": 3.552870712499898e-05, "loss": 0.8610017895698547, "step": 250 }, { "epoch": 1.0862068965517242, "grad_norm": 0.1900222897529602, "learning_rate": 3.54436851977783e-05, "loss": 0.448039710521698, "step": 252 }, { "epoch": 1.0948275862068966, "grad_norm": 0.22907602787017822, "learning_rate": 3.535797947793111e-05, "loss": 0.8761284351348877, "step": 254 }, { "epoch": 1.103448275862069, "grad_norm": 0.15932975709438324, "learning_rate": 3.527159432474865e-05, "loss": 0.8125666975975037, "step": 256 }, { "epoch": 1.1120689655172413, "grad_norm": 0.349263459444046, "learning_rate": 3.518453413208053e-05, "loss": 0.6569501757621765, "step": 258 }, { "epoch": 1.1206896551724137, "grad_norm": 0.14833390712738037, "learning_rate": 3.509680332811121e-05, "loss": 0.7028253674507141, "step": 260 }, { "epoch": 1.1293103448275863, "grad_norm": 0.05770875886082649, "learning_rate": 3.5008406375134756e-05, "loss": 1.1632713079452515, "step": 262 }, { "epoch": 1.1379310344827587, "grad_norm": 0.055273283272981644, "learning_rate": 3.491934776932791e-05, "loss": 0.8371788263320923, "step": 264 }, { "epoch": 1.146551724137931, "grad_norm": 0.05901844799518585, "learning_rate": 3.482963204052139e-05, "loss": 1.0003291368484497, "step": 266 }, { "epoch": 1.1551724137931034, "grad_norm": 0.06095067784190178, "learning_rate": 3.473926375196943e-05, "loss": 0.9226457476615906, "step": 268 }, { "epoch": 1.1637931034482758, "grad_norm": 0.07438337802886963, "learning_rate": 3.464824750011779e-05, "loss": 1.1149680614471436, "step": 270 }, { "epoch": 1.1724137931034484, "grad_norm": 0.06853963434696198, "learning_rate": 3.455658791436985e-05, "loss": 1.0302170515060425, "step": 272 }, { "epoch": 1.1810344827586208, "grad_norm": 0.0973813459277153, "learning_rate": 3.446428965685121e-05, "loss": 0.7569156289100647, "step": 274 }, { "epoch": 1.1896551724137931, "grad_norm": 0.056327857077121735, "learning_rate": 3.437135742217254e-05, "loss": 0.3794441223144531, "step": 276 }, { "epoch": 1.1982758620689655, "grad_norm": 0.07669582962989807, "learning_rate": 3.427779593719079e-05, "loss": 1.1280944347381592, "step": 278 }, { "epoch": 1.206896551724138, "grad_norm": 0.058203186839818954, "learning_rate": 3.4183609960768764e-05, "loss": 0.9517163634300232, "step": 280 }, { "epoch": 1.2155172413793103, "grad_norm": 0.06986816227436066, "learning_rate": 3.4088804283533094e-05, "loss": 0.6671708822250366, "step": 282 }, { "epoch": 1.2241379310344827, "grad_norm": 0.08936108648777008, "learning_rate": 3.399338372763055e-05, "loss": 0.7694864869117737, "step": 284 }, { "epoch": 1.2327586206896552, "grad_norm": 0.12140902131795883, "learning_rate": 3.389735314648274e-05, "loss": 0.8068587183952332, "step": 286 }, { "epoch": 1.2413793103448276, "grad_norm": 0.04297681525349617, "learning_rate": 3.380071742453931e-05, "loss": 0.40287792682647705, "step": 288 }, { "epoch": 1.25, "grad_norm": 0.11908482015132904, "learning_rate": 3.370348147702949e-05, "loss": 1.0401684045791626, "step": 290 }, { "epoch": 1.2586206896551724, "grad_norm": 0.057489216327667236, "learning_rate": 3.360565024971202e-05, "loss": 0.8889655470848083, "step": 292 }, { "epoch": 1.2672413793103448, "grad_norm": 0.15609467029571533, "learning_rate": 3.350722871862368e-05, "loss": 0.9757481813430786, "step": 294 }, { "epoch": 1.2758620689655173, "grad_norm": 0.1248452365398407, "learning_rate": 3.340822188982616e-05, "loss": 0.7736673355102539, "step": 296 }, { "epoch": 1.2844827586206897, "grad_norm": 0.09071607887744904, "learning_rate": 3.330863479915138e-05, "loss": 1.0164954662322998, "step": 298 }, { "epoch": 1.293103448275862, "grad_norm": 0.06744378060102463, "learning_rate": 3.320847251194546e-05, "loss": 0.9475960731506348, "step": 300 }, { "epoch": 1.3017241379310345, "grad_norm": 0.07189597189426422, "learning_rate": 3.310774012281099e-05, "loss": 0.6825069785118103, "step": 302 }, { "epoch": 1.3103448275862069, "grad_norm": 0.07518645375967026, "learning_rate": 3.300644275534793e-05, "loss": 0.5717735290527344, "step": 304 }, { "epoch": 1.3189655172413794, "grad_norm": 0.09223438799381256, "learning_rate": 3.290458556189299e-05, "loss": 1.3711295127868652, "step": 306 }, { "epoch": 1.3275862068965516, "grad_norm": 0.14958783984184265, "learning_rate": 3.2802173723257604e-05, "loss": 0.6421374082565308, "step": 308 }, { "epoch": 1.3362068965517242, "grad_norm": 0.1238432452082634, "learning_rate": 3.2699212448464385e-05, "loss": 0.9758880734443665, "step": 310 }, { "epoch": 1.3448275862068966, "grad_norm": 0.06866496056318283, "learning_rate": 3.259570697448217e-05, "loss": 0.9329778552055359, "step": 312 }, { "epoch": 1.353448275862069, "grad_norm": 0.10072822868824005, "learning_rate": 3.249166256595967e-05, "loss": 1.2179062366485596, "step": 314 }, { "epoch": 1.3620689655172413, "grad_norm": 0.06878109276294708, "learning_rate": 3.2387084514957675e-05, "loss": 1.3471888303756714, "step": 316 }, { "epoch": 1.3706896551724137, "grad_norm": 0.06524922698736191, "learning_rate": 3.2281978140679894e-05, "loss": 0.9441757202148438, "step": 318 }, { "epoch": 1.3793103448275863, "grad_norm": 0.11429349333047867, "learning_rate": 3.21763487892024e-05, "loss": 0.7498874664306641, "step": 320 }, { "epoch": 1.3879310344827587, "grad_norm": 0.07464733719825745, "learning_rate": 3.207020183320171e-05, "loss": 1.4824918508529663, "step": 322 }, { "epoch": 1.396551724137931, "grad_norm": 0.11774388700723648, "learning_rate": 3.196354267168149e-05, "loss": 0.49022743105888367, "step": 324 }, { "epoch": 1.4051724137931034, "grad_norm": 0.16186490654945374, "learning_rate": 3.185637672969799e-05, "loss": 0.6543675661087036, "step": 326 }, { "epoch": 1.4137931034482758, "grad_norm": 0.10584386438131332, "learning_rate": 3.1748709458084045e-05, "loss": 0.8541685342788696, "step": 328 }, { "epoch": 1.4224137931034484, "grad_norm": 0.07407426834106445, "learning_rate": 3.1640546333171894e-05, "loss": 0.7656717300415039, "step": 330 }, { "epoch": 1.4310344827586206, "grad_norm": 0.16052280366420746, "learning_rate": 3.153189285651458e-05, "loss": 0.6957482695579529, "step": 332 }, { "epoch": 1.4396551724137931, "grad_norm": 0.13904230296611786, "learning_rate": 3.142275455460614e-05, "loss": 0.6638420224189758, "step": 334 }, { "epoch": 1.4482758620689655, "grad_norm": 0.11371087282896042, "learning_rate": 3.131313697860053e-05, "loss": 0.7661845088005066, "step": 336 }, { "epoch": 1.456896551724138, "grad_norm": 0.11575423926115036, "learning_rate": 3.120304570402924e-05, "loss": 1.1160173416137695, "step": 338 }, { "epoch": 1.4655172413793103, "grad_norm": 0.22061830759048462, "learning_rate": 3.1092486330517714e-05, "loss": 1.384441614151001, "step": 340 }, { "epoch": 1.4741379310344827, "grad_norm": 0.12608060240745544, "learning_rate": 3.098146448150055e-05, "loss": 1.145660638809204, "step": 342 }, { "epoch": 1.4827586206896552, "grad_norm": 0.06820492446422577, "learning_rate": 3.086998580393547e-05, "loss": 0.9891381859779358, "step": 344 }, { "epoch": 1.4913793103448276, "grad_norm": 0.11383876949548721, "learning_rate": 3.075805596801605e-05, "loss": 0.6093174815177917, "step": 346 }, { "epoch": 1.5, "grad_norm": 0.2013673037290573, "learning_rate": 3.0645680666883374e-05, "loss": 0.9298641681671143, "step": 348 }, { "epoch": 1.5086206896551724, "grad_norm": 0.08500847220420837, "learning_rate": 3.053286561633644e-05, "loss": 0.9974504113197327, "step": 350 }, { "epoch": 1.5172413793103448, "grad_norm": 0.14812250435352325, "learning_rate": 3.041961655454143e-05, "loss": 0.9739059209823608, "step": 352 }, { "epoch": 1.5258620689655173, "grad_norm": 0.11965472251176834, "learning_rate": 3.030593924173984e-05, "loss": 1.133984088897705, "step": 354 }, { "epoch": 1.5344827586206895, "grad_norm": 0.36424365639686584, "learning_rate": 3.0191839459955514e-05, "loss": 0.8807175755500793, "step": 356 }, { "epoch": 1.543103448275862, "grad_norm": 0.05107448622584343, "learning_rate": 3.0077323012700534e-05, "loss": 0.8361281156539917, "step": 358 }, { "epoch": 1.5517241379310345, "grad_norm": 0.09036049991846085, "learning_rate": 2.996239572468003e-05, "loss": 1.2387166023254395, "step": 360 }, { "epoch": 1.5603448275862069, "grad_norm": 0.06331617385149002, "learning_rate": 2.984706344149595e-05, "loss": 1.0467900037765503, "step": 362 }, { "epoch": 1.5689655172413794, "grad_norm": 0.06433523446321487, "learning_rate": 2.9731332029349667e-05, "loss": 1.0626113414764404, "step": 364 }, { "epoch": 1.5775862068965516, "grad_norm": 0.09752818942070007, "learning_rate": 2.961520737474367e-05, "loss": 1.0128107070922852, "step": 366 }, { "epoch": 1.5862068965517242, "grad_norm": 0.05285457894206047, "learning_rate": 2.9498695384182123e-05, "loss": 0.9877223968505859, "step": 368 }, { "epoch": 1.5948275862068966, "grad_norm": 0.05934653803706169, "learning_rate": 2.9381801983870435e-05, "loss": 0.9603118300437927, "step": 370 }, { "epoch": 1.603448275862069, "grad_norm": 0.22097375988960266, "learning_rate": 2.9264533119413866e-05, "loss": 1.081476092338562, "step": 372 }, { "epoch": 1.6120689655172413, "grad_norm": 0.10628407448530197, "learning_rate": 2.914689475551506e-05, "loss": 0.7714329957962036, "step": 374 }, { "epoch": 1.6206896551724137, "grad_norm": 0.10955756157636642, "learning_rate": 2.902889287567072e-05, "loss": 0.9913143515586853, "step": 376 }, { "epoch": 1.6293103448275863, "grad_norm": 0.07451241463422775, "learning_rate": 2.8910533481867195e-05, "loss": 1.1765313148498535, "step": 378 }, { "epoch": 1.6379310344827587, "grad_norm": 0.07359088957309723, "learning_rate": 2.879182259427528e-05, "loss": 0.7655573487281799, "step": 380 }, { "epoch": 1.646551724137931, "grad_norm": 0.13642138242721558, "learning_rate": 2.8672766250943947e-05, "loss": 1.3452657461166382, "step": 382 }, { "epoch": 1.6551724137931034, "grad_norm": 0.08765345811843872, "learning_rate": 2.8553370507493246e-05, "loss": 0.9972445964813232, "step": 384 }, { "epoch": 1.6637931034482758, "grad_norm": 0.0989682674407959, "learning_rate": 2.8433641436806306e-05, "loss": 0.8845785856246948, "step": 386 }, { "epoch": 1.6724137931034484, "grad_norm": 0.06875207275152206, "learning_rate": 2.8313585128720444e-05, "loss": 1.3110713958740234, "step": 388 }, { "epoch": 1.6810344827586206, "grad_norm": 0.13957612216472626, "learning_rate": 2.8193207689717393e-05, "loss": 0.8128502368927002, "step": 390 }, { "epoch": 1.6896551724137931, "grad_norm": 0.6921377778053284, "learning_rate": 2.807251524261275e-05, "loss": 0.6244351863861084, "step": 392 }, { "epoch": 1.6982758620689655, "grad_norm": 0.30923035740852356, "learning_rate": 2.7951513926244484e-05, "loss": 1.127506136894226, "step": 394 }, { "epoch": 1.706896551724138, "grad_norm": 0.0620148703455925, "learning_rate": 2.7830209895160764e-05, "loss": 1.042289137840271, "step": 396 }, { "epoch": 1.7155172413793105, "grad_norm": 0.16145341098308563, "learning_rate": 2.770860931930687e-05, "loss": 1.0570330619812012, "step": 398 }, { "epoch": 1.7241379310344827, "grad_norm": 0.09267118573188782, "learning_rate": 2.7586718383711367e-05, "loss": 0.9959380626678467, "step": 400 }, { "epoch": 1.7327586206896552, "grad_norm": 0.07319535315036774, "learning_rate": 2.7464543288171558e-05, "loss": 1.0200254917144775, "step": 402 }, { "epoch": 1.7413793103448276, "grad_norm": 0.055158186703920364, "learning_rate": 2.7342090246938076e-05, "loss": 0.6205574870109558, "step": 404 }, { "epoch": 1.75, "grad_norm": 0.07343259453773499, "learning_rate": 2.721936548839887e-05, "loss": 0.8922735452651978, "step": 406 }, { "epoch": 1.7586206896551724, "grad_norm": 0.06107189506292343, "learning_rate": 2.709637525476236e-05, "loss": 0.6991145014762878, "step": 408 }, { "epoch": 1.7672413793103448, "grad_norm": 0.0519319549202919, "learning_rate": 2.697312580173995e-05, "loss": 0.8093492984771729, "step": 410 }, { "epoch": 1.7758620689655173, "grad_norm": 0.07292782515287399, "learning_rate": 2.684962339822785e-05, "loss": 0.7507970929145813, "step": 412 }, { "epoch": 1.7844827586206895, "grad_norm": 0.07456238567829132, "learning_rate": 2.672587432598823e-05, "loss": 0.5883830189704895, "step": 414 }, { "epoch": 1.793103448275862, "grad_norm": 0.11243204772472382, "learning_rate": 2.6601884879329653e-05, "loss": 0.7915773391723633, "step": 416 }, { "epoch": 1.8017241379310345, "grad_norm": 0.07653719186782837, "learning_rate": 2.6477661364786996e-05, "loss": 1.0269769430160522, "step": 418 }, { "epoch": 1.8103448275862069, "grad_norm": 0.14341171085834503, "learning_rate": 2.635321010080062e-05, "loss": 1.053789496421814, "step": 420 }, { "epoch": 1.8189655172413794, "grad_norm": 0.12033911049365997, "learning_rate": 2.6228537417395034e-05, "loss": 1.158492088317871, "step": 422 }, { "epoch": 1.8275862068965516, "grad_norm": 0.047955527901649475, "learning_rate": 2.61036496558569e-05, "loss": 0.9592758417129517, "step": 424 }, { "epoch": 1.8362068965517242, "grad_norm": 0.088678739964962, "learning_rate": 2.59785531684125e-05, "loss": 0.6086317300796509, "step": 426 }, { "epoch": 1.8448275862068966, "grad_norm": 0.07942725718021393, "learning_rate": 2.585325431790464e-05, "loss": 1.0528879165649414, "step": 428 }, { "epoch": 1.853448275862069, "grad_norm": 0.0694958046078682, "learning_rate": 2.572775947746903e-05, "loss": 1.0576783418655396, "step": 430 }, { "epoch": 1.8620689655172413, "grad_norm": 0.17858955264091492, "learning_rate": 2.5602075030210096e-05, "loss": 0.9204137325286865, "step": 432 }, { "epoch": 1.8706896551724137, "grad_norm": 0.296277791261673, "learning_rate": 2.5476207368876334e-05, "loss": 1.114011287689209, "step": 434 }, { "epoch": 1.8793103448275863, "grad_norm": 0.07735295593738556, "learning_rate": 2.535016289553514e-05, "loss": 0.7933326363563538, "step": 436 }, { "epoch": 1.8879310344827587, "grad_norm": 0.12477041035890579, "learning_rate": 2.5223948021247197e-05, "loss": 0.9807726144790649, "step": 438 }, { "epoch": 1.896551724137931, "grad_norm": 0.09196372330188751, "learning_rate": 2.509756916574035e-05, "loss": 1.0345503091812134, "step": 440 }, { "epoch": 1.9051724137931034, "grad_norm": 0.06840290129184723, "learning_rate": 2.4971032757083123e-05, "loss": 1.1201728582382202, "step": 442 }, { "epoch": 1.9137931034482758, "grad_norm": 0.11144451051950455, "learning_rate": 2.4844345231357734e-05, "loss": 0.28341731429100037, "step": 444 }, { "epoch": 1.9224137931034484, "grad_norm": 0.14570969343185425, "learning_rate": 2.4717513032332736e-05, "loss": 0.7789583206176758, "step": 446 }, { "epoch": 1.9310344827586206, "grad_norm": 0.05790058895945549, "learning_rate": 2.4590542611135274e-05, "loss": 1.012285590171814, "step": 448 }, { "epoch": 1.9396551724137931, "grad_norm": 0.05153496563434601, "learning_rate": 2.446344042592295e-05, "loss": 1.0196033716201782, "step": 450 }, { "epoch": 1.9482758620689655, "grad_norm": 0.057060956954956055, "learning_rate": 2.433621294155535e-05, "loss": 0.8052966594696045, "step": 452 }, { "epoch": 1.956896551724138, "grad_norm": 0.0602966733276844, "learning_rate": 2.420886662926521e-05, "loss": 0.9915321469306946, "step": 454 }, { "epoch": 1.9655172413793105, "grad_norm": 0.07094614952802658, "learning_rate": 2.4081407966329256e-05, "loss": 0.9689676761627197, "step": 456 }, { "epoch": 1.9741379310344827, "grad_norm": 0.08627466857433319, "learning_rate": 2.3953843435738775e-05, "loss": 0.41972166299819946, "step": 458 }, { "epoch": 1.9827586206896552, "grad_norm": 0.10626411437988281, "learning_rate": 2.3826179525869836e-05, "loss": 1.1633706092834473, "step": 460 }, { "epoch": 1.9913793103448276, "grad_norm": 0.15631678700447083, "learning_rate": 2.36984227301533e-05, "loss": 0.7487952709197998, "step": 462 }, { "epoch": 2.0, "grad_norm": 0.16628113389015198, "learning_rate": 2.3570579546744504e-05, "loss": 0.8847077488899231, "step": 464 }, { "epoch": 2.0086206896551726, "grad_norm": 0.06411660462617874, "learning_rate": 2.3442656478192794e-05, "loss": 0.484560489654541, "step": 466 }, { "epoch": 2.0172413793103448, "grad_norm": 0.3941573202610016, "learning_rate": 2.331466003111073e-05, "loss": 0.6984850764274597, "step": 468 }, { "epoch": 2.0258620689655173, "grad_norm": 0.044237978756427765, "learning_rate": 2.318659671584316e-05, "loss": 0.4863373935222626, "step": 470 }, { "epoch": 2.0344827586206895, "grad_norm": 0.0645633190870285, "learning_rate": 2.305847304613609e-05, "loss": 0.4588513970375061, "step": 472 }, { "epoch": 2.043103448275862, "grad_norm": 0.05587729066610336, "learning_rate": 2.293029553880536e-05, "loss": 0.4486234486103058, "step": 474 }, { "epoch": 2.0517241379310347, "grad_norm": 0.06679260730743408, "learning_rate": 2.280207071340517e-05, "loss": 0.5298870205879211, "step": 476 }, { "epoch": 2.060344827586207, "grad_norm": 0.08075322210788727, "learning_rate": 2.26738050918965e-05, "loss": 0.4382156729698181, "step": 478 }, { "epoch": 2.0689655172413794, "grad_norm": 0.06546280533075333, "learning_rate": 2.2545505198315346e-05, "loss": 0.5762298107147217, "step": 480 }, { "epoch": 2.0775862068965516, "grad_norm": 0.11915218830108643, "learning_rate": 2.2417177558440907e-05, "loss": 0.36859992146492004, "step": 482 }, { "epoch": 2.086206896551724, "grad_norm": 0.22198820114135742, "learning_rate": 2.2288828699463652e-05, "loss": 0.5293700098991394, "step": 484 }, { "epoch": 2.0948275862068964, "grad_norm": 0.0842965617775917, "learning_rate": 2.2160465149653337e-05, "loss": 0.49147215485572815, "step": 486 }, { "epoch": 2.103448275862069, "grad_norm": 0.11753598600625992, "learning_rate": 2.203209343802692e-05, "loss": 0.5180780291557312, "step": 488 }, { "epoch": 2.1120689655172415, "grad_norm": 0.37540075182914734, "learning_rate": 2.1903720094016537e-05, "loss": 0.581203818321228, "step": 490 }, { "epoch": 2.1206896551724137, "grad_norm": 0.062044426798820496, "learning_rate": 2.1775351647137323e-05, "loss": 0.4889185130596161, "step": 492 }, { "epoch": 2.1293103448275863, "grad_norm": 0.07434380799531937, "learning_rate": 2.1646994626655332e-05, "loss": 0.6391059756278992, "step": 494 }, { "epoch": 2.1379310344827585, "grad_norm": 0.10223301500082016, "learning_rate": 2.151865556125544e-05, "loss": 0.6237853169441223, "step": 496 }, { "epoch": 2.146551724137931, "grad_norm": 0.14267216622829437, "learning_rate": 2.1390340978709254e-05, "loss": 0.36577755212783813, "step": 498 }, { "epoch": 2.1551724137931036, "grad_norm": 0.13929963111877441, "learning_rate": 2.1262057405543115e-05, "loss": 0.49633127450942993, "step": 500 }, { "epoch": 2.163793103448276, "grad_norm": 0.05517968162894249, "learning_rate": 2.1133811366706097e-05, "loss": 0.38259175419807434, "step": 502 }, { "epoch": 2.1724137931034484, "grad_norm": 0.058835044503211975, "learning_rate": 2.100560938523817e-05, "loss": 0.4427034258842468, "step": 504 }, { "epoch": 2.1810344827586206, "grad_norm": 0.15045633912086487, "learning_rate": 2.0877457981938364e-05, "loss": 0.6942803263664246, "step": 506 }, { "epoch": 2.189655172413793, "grad_norm": 2.292686700820923, "learning_rate": 2.074936367503317e-05, "loss": 0.5671365261077881, "step": 508 }, { "epoch": 2.1982758620689653, "grad_norm": 0.046695832163095474, "learning_rate": 2.0621332979844904e-05, "loss": 0.6063480377197266, "step": 510 }, { "epoch": 2.206896551724138, "grad_norm": 0.16905461251735687, "learning_rate": 2.0493372408460425e-05, "loss": 0.6027957201004028, "step": 512 }, { "epoch": 2.2155172413793105, "grad_norm": 0.06160572171211243, "learning_rate": 2.0365488469399795e-05, "loss": 0.6078309416770935, "step": 514 }, { "epoch": 2.2241379310344827, "grad_norm": 0.07821284979581833, "learning_rate": 2.0237687667285345e-05, "loss": 0.3304949402809143, "step": 516 }, { "epoch": 2.2327586206896552, "grad_norm": 0.34748536348342896, "learning_rate": 2.010997650251072e-05, "loss": 0.12825970351696014, "step": 518 }, { "epoch": 2.2413793103448274, "grad_norm": 0.11893010139465332, "learning_rate": 1.9982361470910342e-05, "loss": 0.1828547865152359, "step": 520 }, { "epoch": 2.25, "grad_norm": 0.12491466104984283, "learning_rate": 1.9854849063428926e-05, "loss": 0.6522985696792603, "step": 522 }, { "epoch": 2.2586206896551726, "grad_norm": 0.15903355181217194, "learning_rate": 1.9727445765791405e-05, "loss": 0.47932472825050354, "step": 524 }, { "epoch": 2.2672413793103448, "grad_norm": 0.09779471158981323, "learning_rate": 1.9600158058172974e-05, "loss": 0.4181676208972931, "step": 526 }, { "epoch": 2.2758620689655173, "grad_norm": 0.07378951460123062, "learning_rate": 1.9472992414869534e-05, "loss": 0.46739447116851807, "step": 528 }, { "epoch": 2.2844827586206895, "grad_norm": 0.04063527286052704, "learning_rate": 1.9345955303968365e-05, "loss": 0.38251054286956787, "step": 530 }, { "epoch": 2.293103448275862, "grad_norm": 0.08258794993162155, "learning_rate": 1.9219053187019144e-05, "loss": 0.4366922080516815, "step": 532 }, { "epoch": 2.3017241379310347, "grad_norm": 0.09015543758869171, "learning_rate": 1.909229251870528e-05, "loss": 0.4965798556804657, "step": 534 }, { "epoch": 2.310344827586207, "grad_norm": 0.08743222802877426, "learning_rate": 1.8965679746515628e-05, "loss": 0.43146276473999023, "step": 536 }, { "epoch": 2.3189655172413794, "grad_norm": 0.084476038813591, "learning_rate": 1.88392213104165e-05, "loss": 0.2771337330341339, "step": 538 }, { "epoch": 2.3275862068965516, "grad_norm": 0.07576002180576324, "learning_rate": 1.8712923642524175e-05, "loss": 0.36878013610839844, "step": 540 }, { "epoch": 2.336206896551724, "grad_norm": 0.10497633367776871, "learning_rate": 1.858679316677767e-05, "loss": 0.6058629751205444, "step": 542 }, { "epoch": 2.344827586206897, "grad_norm": 0.13856923580169678, "learning_rate": 1.8460836298612056e-05, "loss": 0.6428977251052856, "step": 544 }, { "epoch": 2.353448275862069, "grad_norm": 0.1172226220369339, "learning_rate": 1.8335059444632078e-05, "loss": 0.2821408212184906, "step": 546 }, { "epoch": 2.3620689655172415, "grad_norm": 0.1798970252275467, "learning_rate": 1.820946900228639e-05, "loss": 0.8290093541145325, "step": 548 }, { "epoch": 2.3706896551724137, "grad_norm": 0.2738807499408722, "learning_rate": 1.808407135954204e-05, "loss": 0.5475698709487915, "step": 550 }, { "epoch": 2.3793103448275863, "grad_norm": 0.20505401492118835, "learning_rate": 1.7958872894559666e-05, "loss": 0.6245191693305969, "step": 552 }, { "epoch": 2.3879310344827585, "grad_norm": 0.05477019026875496, "learning_rate": 1.7833879975368994e-05, "loss": 0.5108689665794373, "step": 554 }, { "epoch": 2.396551724137931, "grad_norm": 0.09034960716962814, "learning_rate": 1.7709098959545015e-05, "loss": 0.5519805550575256, "step": 556 }, { "epoch": 2.405172413793103, "grad_norm": 0.1560261845588684, "learning_rate": 1.758453619388453e-05, "loss": 0.4397192597389221, "step": 558 }, { "epoch": 2.413793103448276, "grad_norm": 0.118907131254673, "learning_rate": 1.7460198014083424e-05, "loss": 0.38739266991615295, "step": 560 }, { "epoch": 2.4224137931034484, "grad_norm": 0.23784895241260529, "learning_rate": 1.733609074441433e-05, "loss": 0.5064358711242676, "step": 562 }, { "epoch": 2.4310344827586206, "grad_norm": 0.09993483871221542, "learning_rate": 1.7212220697405003e-05, "loss": 0.540324330329895, "step": 564 }, { "epoch": 2.439655172413793, "grad_norm": 0.7780280113220215, "learning_rate": 1.7088594173517225e-05, "loss": 0.5431786179542542, "step": 566 }, { "epoch": 2.4482758620689653, "grad_norm": 0.14646178483963013, "learning_rate": 1.6965217460826345e-05, "loss": 0.3365917205810547, "step": 568 }, { "epoch": 2.456896551724138, "grad_norm": 0.07466763257980347, "learning_rate": 1.6842096834701443e-05, "loss": 0.6636412739753723, "step": 570 }, { "epoch": 2.4655172413793105, "grad_norm": 0.3850714862346649, "learning_rate": 1.6719238557486143e-05, "loss": 0.3930183947086334, "step": 572 }, { "epoch": 2.4741379310344827, "grad_norm": 0.12653613090515137, "learning_rate": 1.6596648878180088e-05, "loss": 0.4772527813911438, "step": 574 }, { "epoch": 2.4827586206896552, "grad_norm": 0.10766978561878204, "learning_rate": 1.647433403212112e-05, "loss": 0.6689369082450867, "step": 576 }, { "epoch": 2.4913793103448274, "grad_norm": 0.1643172800540924, "learning_rate": 1.635230024066807e-05, "loss": 0.5050515532493591, "step": 578 }, { "epoch": 2.5, "grad_norm": 0.06176433712244034, "learning_rate": 1.6230553710884373e-05, "loss": 0.6936325430870056, "step": 580 }, { "epoch": 2.5086206896551726, "grad_norm": 0.17540457844734192, "learning_rate": 1.610910063522233e-05, "loss": 0.5566367506980896, "step": 582 }, { "epoch": 2.5172413793103448, "grad_norm": 0.09146937727928162, "learning_rate": 1.598794719120816e-05, "loss": 0.5264196991920471, "step": 584 }, { "epoch": 2.5258620689655173, "grad_norm": 0.08665334433317184, "learning_rate": 1.5867099541127737e-05, "loss": 0.4999127686023712, "step": 586 }, { "epoch": 2.5344827586206895, "grad_norm": 0.05140522122383118, "learning_rate": 1.5746563831713236e-05, "loss": 0.5660111308097839, "step": 588 }, { "epoch": 2.543103448275862, "grad_norm": 0.08618345856666565, "learning_rate": 1.56263461938304e-05, "loss": 0.7160353064537048, "step": 590 }, { "epoch": 2.5517241379310347, "grad_norm": 0.05319703742861748, "learning_rate": 1.5506452742166796e-05, "loss": 0.575738251209259, "step": 592 }, { "epoch": 2.560344827586207, "grad_norm": 0.29011279344558716, "learning_rate": 1.5386889574920692e-05, "loss": 0.35511380434036255, "step": 594 }, { "epoch": 2.5689655172413794, "grad_norm": 0.07296542078256607, "learning_rate": 1.5267662773491e-05, "loss": 0.40391749143600464, "step": 596 }, { "epoch": 2.5775862068965516, "grad_norm": 0.09713292866945267, "learning_rate": 1.514877840216785e-05, "loss": 0.5037810802459717, "step": 598 }, { "epoch": 2.586206896551724, "grad_norm": 0.1726667881011963, "learning_rate": 1.5030242507824215e-05, "loss": 0.6312216520309448, "step": 600 }, { "epoch": 2.594827586206897, "grad_norm": 0.0342765673995018, "learning_rate": 1.4912061119608292e-05, "loss": 0.39456382393836975, "step": 602 }, { "epoch": 2.603448275862069, "grad_norm": 0.45015275478363037, "learning_rate": 1.4794240248636885e-05, "loss": 0.5595788359642029, "step": 604 }, { "epoch": 2.612068965517241, "grad_norm": 0.10634768009185791, "learning_rate": 1.4676785887689614e-05, "loss": 0.41876575350761414, "step": 606 }, { "epoch": 2.6206896551724137, "grad_norm": 0.06522602587938309, "learning_rate": 1.4559704010904145e-05, "loss": 0.6346225142478943, "step": 608 }, { "epoch": 2.6293103448275863, "grad_norm": 0.24831700325012207, "learning_rate": 1.444300057347229e-05, "loss": 0.5777739882469177, "step": 610 }, { "epoch": 2.637931034482759, "grad_norm": 0.06677041202783585, "learning_rate": 1.432668151133712e-05, "loss": 0.5916672945022583, "step": 612 }, { "epoch": 2.646551724137931, "grad_norm": 0.09093949943780899, "learning_rate": 1.4210752740891032e-05, "loss": 0.5175487995147705, "step": 614 }, { "epoch": 2.655172413793103, "grad_norm": 0.1291448175907135, "learning_rate": 1.4095220158674851e-05, "loss": 0.37486380338668823, "step": 616 }, { "epoch": 2.663793103448276, "grad_norm": 0.10089799761772156, "learning_rate": 1.3980089641077864e-05, "loss": 0.5902385115623474, "step": 618 }, { "epoch": 2.6724137931034484, "grad_norm": 0.3151969611644745, "learning_rate": 1.3865367044038972e-05, "loss": 0.3626130223274231, "step": 620 }, { "epoch": 2.6810344827586206, "grad_norm": 0.10858116298913956, "learning_rate": 1.3751058202748815e-05, "loss": 0.6260622143745422, "step": 622 }, { "epoch": 2.689655172413793, "grad_norm": 0.09145694226026535, "learning_rate": 1.3637168931352952e-05, "loss": 0.3847617506980896, "step": 624 }, { "epoch": 2.6982758620689653, "grad_norm": 0.10181720554828644, "learning_rate": 1.3523705022656194e-05, "loss": 0.5213911533355713, "step": 626 }, { "epoch": 2.706896551724138, "grad_norm": 0.07265552878379822, "learning_rate": 1.3410672247827887e-05, "loss": 0.3843521475791931, "step": 628 }, { "epoch": 2.7155172413793105, "grad_norm": 0.06394084542989731, "learning_rate": 1.3298076356108431e-05, "loss": 0.7390468716621399, "step": 630 }, { "epoch": 2.7241379310344827, "grad_norm": 0.08277060091495514, "learning_rate": 1.318592307451683e-05, "loss": 0.3152429461479187, "step": 632 }, { "epoch": 2.7327586206896552, "grad_norm": 0.06954030692577362, "learning_rate": 1.307421810755938e-05, "loss": 0.5903550982475281, "step": 634 }, { "epoch": 2.7413793103448274, "grad_norm": 0.14430810511112213, "learning_rate": 1.296296713693956e-05, "loss": 0.4196533262729645, "step": 636 }, { "epoch": 2.75, "grad_norm": 0.049837417900562286, "learning_rate": 1.2852175821268977e-05, "loss": 0.5849826335906982, "step": 638 }, { "epoch": 2.7586206896551726, "grad_norm": 0.16439993679523468, "learning_rate": 1.274184979577963e-05, "loss": 0.40721848607063293, "step": 640 }, { "epoch": 2.7672413793103448, "grad_norm": 0.15708234906196594, "learning_rate": 1.2631994672037205e-05, "loss": 0.5138668417930603, "step": 642 }, { "epoch": 2.7758620689655173, "grad_norm": 0.0595339760184288, "learning_rate": 1.2522616037655713e-05, "loss": 0.6097421646118164, "step": 644 }, { "epoch": 2.7844827586206895, "grad_norm": 0.14719434082508087, "learning_rate": 1.2413719456013231e-05, "loss": 0.5522211194038391, "step": 646 }, { "epoch": 2.793103448275862, "grad_norm": 0.06864980608224869, "learning_rate": 1.2305310465968985e-05, "loss": 0.3453619182109833, "step": 648 }, { "epoch": 2.8017241379310347, "grad_norm": 0.05219966545701027, "learning_rate": 1.2197394581581561e-05, "loss": 0.7121859788894653, "step": 650 }, { "epoch": 2.810344827586207, "grad_norm": 0.24679023027420044, "learning_rate": 1.2089977291828512e-05, "loss": 0.7990239262580872, "step": 652 }, { "epoch": 2.8189655172413794, "grad_norm": 0.15024927258491516, "learning_rate": 1.1983064060327098e-05, "loss": 0.6081220507621765, "step": 654 }, { "epoch": 2.8275862068965516, "grad_norm": 0.05443995073437691, "learning_rate": 1.187666032505645e-05, "loss": 0.43975335359573364, "step": 656 }, { "epoch": 2.836206896551724, "grad_norm": 0.05697048828005791, "learning_rate": 1.1770771498080921e-05, "loss": 0.6137202978134155, "step": 658 }, { "epoch": 2.844827586206897, "grad_norm": 0.11451619118452072, "learning_rate": 1.1665402965274866e-05, "loss": 0.20562584698200226, "step": 660 }, { "epoch": 2.853448275862069, "grad_norm": 0.22301547229290009, "learning_rate": 1.1560560086048632e-05, "loss": 0.42035165429115295, "step": 662 }, { "epoch": 2.862068965517241, "grad_norm": 0.15491816401481628, "learning_rate": 1.1456248193076027e-05, "loss": 0.6786882877349854, "step": 664 }, { "epoch": 2.8706896551724137, "grad_norm": 0.06417909264564514, "learning_rate": 1.1352472592023026e-05, "loss": 0.34481775760650635, "step": 666 }, { "epoch": 2.8793103448275863, "grad_norm": 0.2559848129749298, "learning_rate": 1.1249238561277957e-05, "loss": 0.37077146768569946, "step": 668 }, { "epoch": 2.887931034482759, "grad_norm": 0.07367434352636337, "learning_rate": 1.1146551351682962e-05, "loss": 0.6234573125839233, "step": 670 }, { "epoch": 2.896551724137931, "grad_norm": 0.13318119943141937, "learning_rate": 1.1044416186266985e-05, "loss": 0.43646591901779175, "step": 672 }, { "epoch": 2.905172413793103, "grad_norm": 0.04189766198396683, "learning_rate": 1.0942838259980065e-05, "loss": 0.6099374890327454, "step": 674 }, { "epoch": 2.913793103448276, "grad_norm": 0.16093385219573975, "learning_rate": 1.0841822739429131e-05, "loss": 0.5961918830871582, "step": 676 }, { "epoch": 2.9224137931034484, "grad_norm": 0.05338941141963005, "learning_rate": 1.0741374762615181e-05, "loss": 0.5247670412063599, "step": 678 }, { "epoch": 2.9310344827586206, "grad_norm": 0.06662659347057343, "learning_rate": 1.0641499438671994e-05, "loss": 0.4245750606060028, "step": 680 }, { "epoch": 2.939655172413793, "grad_norm": 0.03824161738157272, "learning_rate": 1.054220184760619e-05, "loss": 0.21983936429023743, "step": 682 }, { "epoch": 2.9482758620689653, "grad_norm": 0.061386823654174805, "learning_rate": 1.0443487040038919e-05, "loss": 0.3854738771915436, "step": 684 }, { "epoch": 2.956896551724138, "grad_norm": 0.06032966449856758, "learning_rate": 1.0345360036948912e-05, "loss": 0.6782163381576538, "step": 686 }, { "epoch": 2.9655172413793105, "grad_norm": 0.06708291918039322, "learning_rate": 1.0247825829417132e-05, "loss": 0.5401458740234375, "step": 688 }, { "epoch": 2.9741379310344827, "grad_norm": 0.0782044380903244, "learning_rate": 1.0150889378372878e-05, "loss": 0.7114209532737732, "step": 690 }, { "epoch": 2.9827586206896552, "grad_norm": 0.06770720332860947, "learning_rate": 1.00545556143415e-05, "loss": 0.660466730594635, "step": 692 }, { "epoch": 2.9913793103448274, "grad_norm": 0.07091684639453888, "learning_rate": 9.958829437193558e-06, "loss": 0.4320341944694519, "step": 694 }, { "epoch": 3.0, "grad_norm": 0.06834368407726288, "learning_rate": 9.863715715895658e-06, "loss": 0.6856396198272705, "step": 696 }, { "epoch": 3.0086206896551726, "grad_norm": 0.03995652124285698, "learning_rate": 9.769219288262745e-06, "loss": 0.16509434580802917, "step": 698 }, { "epoch": 3.0172413793103448, "grad_norm": 0.043883178383111954, "learning_rate": 9.675344960712074e-06, "loss": 0.29928964376449585, "step": 700 }, { "epoch": 3.0258620689655173, "grad_norm": 0.0733269527554512, "learning_rate": 9.582097508018724e-06, "loss": 0.25162428617477417, "step": 702 }, { "epoch": 3.0344827586206895, "grad_norm": 0.12920475006103516, "learning_rate": 9.489481673072723e-06, "loss": 0.3514169454574585, "step": 704 }, { "epoch": 3.043103448275862, "grad_norm": 0.017986657097935677, "learning_rate": 9.397502166637837e-06, "loss": 0.07074951380491257, "step": 706 }, { "epoch": 3.0517241379310347, "grad_norm": 0.07337481528520584, "learning_rate": 9.30616366711195e-06, "loss": 0.20599356293678284, "step": 708 }, { "epoch": 3.060344827586207, "grad_norm": 0.03576648607850075, "learning_rate": 9.21547082028908e-06, "loss": 0.11480194330215454, "step": 710 }, { "epoch": 3.0689655172413794, "grad_norm": 0.38087305426597595, "learning_rate": 9.125428239123133e-06, "loss": 0.26979854702949524, "step": 712 }, { "epoch": 3.0775862068965516, "grad_norm": 0.0725908949971199, "learning_rate": 9.036040503493213e-06, "loss": 0.42210009694099426, "step": 714 }, { "epoch": 3.086206896551724, "grad_norm": 0.14822497963905334, "learning_rate": 8.947312159970725e-06, "loss": 0.1675470620393753, "step": 716 }, { "epoch": 3.0948275862068964, "grad_norm": 0.08073808997869492, "learning_rate": 8.859247721588064e-06, "loss": 0.20833522081375122, "step": 718 }, { "epoch": 3.103448275862069, "grad_norm": 0.057046178728342056, "learning_rate": 8.77185166760914e-06, "loss": 0.16950953006744385, "step": 720 }, { "epoch": 3.1120689655172415, "grad_norm": 0.10354648530483246, "learning_rate": 8.685128443301465e-06, "loss": 0.12641456723213196, "step": 722 }, { "epoch": 3.1206896551724137, "grad_norm": 0.05845208466053009, "learning_rate": 8.599082459710125e-06, "loss": 0.13568329811096191, "step": 724 }, { "epoch": 3.1293103448275863, "grad_norm": 0.04908813536167145, "learning_rate": 8.513718093433354e-06, "loss": 0.21239104866981506, "step": 726 }, { "epoch": 3.1379310344827585, "grad_norm": 0.13193517923355103, "learning_rate": 8.42903968639999e-06, "loss": 0.2763456702232361, "step": 728 }, { "epoch": 3.146551724137931, "grad_norm": 0.03571261465549469, "learning_rate": 8.345051545648565e-06, "loss": 0.12836386263370514, "step": 730 }, { "epoch": 3.1551724137931036, "grad_norm": 0.06112167611718178, "learning_rate": 8.261757943108296e-06, "loss": 0.16560682654380798, "step": 732 }, { "epoch": 3.163793103448276, "grad_norm": 0.0860171988606453, "learning_rate": 8.179163115381737e-06, "loss": 0.2081730216741562, "step": 734 }, { "epoch": 3.1724137931034484, "grad_norm": 0.03247256577014923, "learning_rate": 8.097271263529346e-06, "loss": 0.14392191171646118, "step": 736 }, { "epoch": 3.1810344827586206, "grad_norm": 0.0918356403708458, "learning_rate": 8.016086552855771e-06, "loss": 0.15577132999897003, "step": 738 }, { "epoch": 3.189655172413793, "grad_norm": 0.06287133693695068, "learning_rate": 7.935613112698003e-06, "loss": 0.0789552852511406, "step": 740 }, { "epoch": 3.1982758620689653, "grad_norm": 0.03986852988600731, "learning_rate": 7.855855036215328e-06, "loss": 0.10101716220378876, "step": 742 }, { "epoch": 3.206896551724138, "grad_norm": 0.13693907856941223, "learning_rate": 7.776816380181165e-06, "loss": 0.1658182144165039, "step": 744 }, { "epoch": 3.2155172413793105, "grad_norm": 0.14548790454864502, "learning_rate": 7.698501164776679e-06, "loss": 0.19248032569885254, "step": 746 }, { "epoch": 3.2241379310344827, "grad_norm": 0.05582420900464058, "learning_rate": 7.620913373386356e-06, "loss": 0.21470694243907928, "step": 748 }, { "epoch": 3.2327586206896552, "grad_norm": 0.04277574643492699, "learning_rate": 7.5440569523953315e-06, "loss": 0.15740104019641876, "step": 750 }, { "epoch": 3.2413793103448274, "grad_norm": 0.14733938872814178, "learning_rate": 7.467935810988729e-06, "loss": 0.18646365404129028, "step": 752 }, { "epoch": 3.25, "grad_norm": 0.06095249578356743, "learning_rate": 7.392553820952764e-06, "loss": 0.22709967195987701, "step": 754 }, { "epoch": 3.2586206896551726, "grad_norm": 0.04888584464788437, "learning_rate": 7.317914816477865e-06, "loss": 0.1782107949256897, "step": 756 }, { "epoch": 3.2672413793103448, "grad_norm": 0.2761983275413513, "learning_rate": 7.244022593963609e-06, "loss": 0.19192323088645935, "step": 758 }, { "epoch": 3.2758620689655173, "grad_norm": 0.041269440203905106, "learning_rate": 7.170880911825657e-06, "loss": 0.13779321312904358, "step": 760 }, { "epoch": 3.2844827586206895, "grad_norm": 0.2219523787498474, "learning_rate": 7.098493490304566e-06, "loss": 0.24427469074726105, "step": 762 }, { "epoch": 3.293103448275862, "grad_norm": 0.7461491227149963, "learning_rate": 7.026864011276575e-06, "loss": 0.32002437114715576, "step": 764 }, { "epoch": 3.3017241379310347, "grad_norm": 0.100465789437294, "learning_rate": 6.955996118066326e-06, "loss": 0.11214806139469147, "step": 766 }, { "epoch": 3.310344827586207, "grad_norm": 0.06019704416394234, "learning_rate": 6.8858934152615646e-06, "loss": 0.1987936794757843, "step": 768 }, { "epoch": 3.3189655172413794, "grad_norm": 0.12379293888807297, "learning_rate": 6.816559468529773e-06, "loss": 0.058321211487054825, "step": 770 }, { "epoch": 3.3275862068965516, "grad_norm": 0.3285755515098572, "learning_rate": 6.747997804436846e-06, "loss": 0.08903615176677704, "step": 772 }, { "epoch": 3.336206896551724, "grad_norm": 0.11563495546579361, "learning_rate": 6.680211910267665e-06, "loss": 0.35364535450935364, "step": 774 }, { "epoch": 3.344827586206897, "grad_norm": 0.07364711910486221, "learning_rate": 6.613205233848783e-06, "loss": 0.20209553837776184, "step": 776 }, { "epoch": 3.353448275862069, "grad_norm": 0.0495804026722908, "learning_rate": 6.546981183373009e-06, "loss": 0.19359779357910156, "step": 778 }, { "epoch": 3.3620689655172415, "grad_norm": 0.13539589941501617, "learning_rate": 6.481543127226073e-06, "loss": 0.28171947598457336, "step": 780 }, { "epoch": 3.3706896551724137, "grad_norm": 0.07525072246789932, "learning_rate": 6.4168943938153e-06, "loss": 0.1644493192434311, "step": 782 }, { "epoch": 3.3793103448275863, "grad_norm": 0.04455971717834473, "learning_rate": 6.353038271400319e-06, "loss": 0.17818251252174377, "step": 784 }, { "epoch": 3.3879310344827585, "grad_norm": 0.04888049513101578, "learning_rate": 6.289978007925791e-06, "loss": 0.08893375098705292, "step": 786 }, { "epoch": 3.396551724137931, "grad_norm": 0.0592099204659462, "learning_rate": 6.227716810856235e-06, "loss": 0.16159863770008087, "step": 788 }, { "epoch": 3.405172413793103, "grad_norm": 0.05931266024708748, "learning_rate": 6.1662578470128595e-06, "loss": 0.19914919137954712, "step": 790 }, { "epoch": 3.413793103448276, "grad_norm": 0.05199761316180229, "learning_rate": 6.105604242412507e-06, "loss": 0.1833517998456955, "step": 792 }, { "epoch": 3.4224137931034484, "grad_norm": 0.07053744047880173, "learning_rate": 6.0457590821086364e-06, "loss": 0.1568892002105713, "step": 794 }, { "epoch": 3.4310344827586206, "grad_norm": 0.11103974282741547, "learning_rate": 5.9867254100344305e-06, "loss": 0.5605343580245972, "step": 796 }, { "epoch": 3.439655172413793, "grad_norm": 0.1462671458721161, "learning_rate": 5.92850622884794e-06, "loss": 0.2542985677719116, "step": 798 }, { "epoch": 3.4482758620689653, "grad_norm": 0.07662937045097351, "learning_rate": 5.871104499779383e-06, "loss": 0.3042844533920288, "step": 800 }, { "epoch": 3.456896551724138, "grad_norm": 0.40208032727241516, "learning_rate": 5.814523142480514e-06, "loss": 0.23688863217830658, "step": 802 }, { "epoch": 3.4655172413793105, "grad_norm": 0.0428071990609169, "learning_rate": 5.758765034876124e-06, "loss": 0.1413598358631134, "step": 804 }, { "epoch": 3.4741379310344827, "grad_norm": 0.18820738792419434, "learning_rate": 5.703833013017659e-06, "loss": 0.26621344685554504, "step": 806 }, { "epoch": 3.4827586206896552, "grad_norm": 0.0474395789206028, "learning_rate": 5.649729870938974e-06, "loss": 0.1856929361820221, "step": 808 }, { "epoch": 3.4913793103448274, "grad_norm": 0.041845474392175674, "learning_rate": 5.596458360514197e-06, "loss": 0.11116787791252136, "step": 810 }, { "epoch": 3.5, "grad_norm": 0.06532273441553116, "learning_rate": 5.544021191317797e-06, "loss": 0.2585083842277527, "step": 812 }, { "epoch": 3.5086206896551726, "grad_norm": 0.17792125046253204, "learning_rate": 5.492421030486723e-06, "loss": 0.24390508234500885, "step": 814 }, { "epoch": 3.5172413793103448, "grad_norm": 0.06345438957214355, "learning_rate": 5.441660502584782e-06, "loss": 0.19690856337547302, "step": 816 }, { "epoch": 3.5258620689655173, "grad_norm": 0.06546366214752197, "learning_rate": 5.391742189469118e-06, "loss": 0.18222372233867645, "step": 818 }, { "epoch": 3.5344827586206895, "grad_norm": 0.06039542332291603, "learning_rate": 5.342668630158901e-06, "loss": 0.14991328120231628, "step": 820 }, { "epoch": 3.543103448275862, "grad_norm": 0.08110994100570679, "learning_rate": 5.294442320706179e-06, "loss": 0.12025367468595505, "step": 822 }, { "epoch": 3.5517241379310347, "grad_norm": 0.04069434478878975, "learning_rate": 5.247065714068933e-06, "loss": 0.0922561064362526, "step": 824 }, { "epoch": 3.560344827586207, "grad_norm": 0.012679479084908962, "learning_rate": 5.200541219986286e-06, "loss": 0.03818206116557121, "step": 826 }, { "epoch": 3.5689655172413794, "grad_norm": 0.22782403230667114, "learning_rate": 5.1548712048559655e-06, "loss": 0.2238304615020752, "step": 828 }, { "epoch": 3.5775862068965516, "grad_norm": 0.05799900367856026, "learning_rate": 5.110057991613912e-06, "loss": 0.1549633890390396, "step": 830 }, { "epoch": 3.586206896551724, "grad_norm": 0.05750008672475815, "learning_rate": 5.0661038596161515e-06, "loss": 0.14927032589912415, "step": 832 }, { "epoch": 3.594827586206897, "grad_norm": 0.24478773772716522, "learning_rate": 5.023011044522834e-06, "loss": 0.2999204397201538, "step": 834 }, { "epoch": 3.603448275862069, "grad_norm": 0.07759716361761093, "learning_rate": 4.980781738184549e-06, "loss": 0.20024727284908295, "step": 836 }, { "epoch": 3.612068965517241, "grad_norm": 0.0670485645532608, "learning_rate": 4.939418088530811e-06, "loss": 0.13863810896873474, "step": 838 }, { "epoch": 3.6206896551724137, "grad_norm": 0.12198883295059204, "learning_rate": 4.898922199460831e-06, "loss": 0.18534965813159943, "step": 840 }, { "epoch": 3.6293103448275863, "grad_norm": 0.1402168571949005, "learning_rate": 4.859296130736489e-06, "loss": 0.15518294274806976, "step": 842 }, { "epoch": 3.637931034482759, "grad_norm": 0.06257359683513641, "learning_rate": 4.820541897877585e-06, "loss": 0.23298737406730652, "step": 844 }, { "epoch": 3.646551724137931, "grad_norm": 0.11765491217374802, "learning_rate": 4.782661472059298e-06, "loss": 0.264419823884964, "step": 846 }, { "epoch": 3.655172413793103, "grad_norm": 0.03430064767599106, "learning_rate": 4.745656780011951e-06, "loss": 0.15973711013793945, "step": 848 }, { "epoch": 3.663793103448276, "grad_norm": 0.03395868092775345, "learning_rate": 4.709529703922993e-06, "loss": 0.17208503186702728, "step": 850 }, { "epoch": 3.6724137931034484, "grad_norm": 0.08469868451356888, "learning_rate": 4.674282081341271e-06, "loss": 0.19475609064102173, "step": 852 }, { "epoch": 3.6810344827586206, "grad_norm": 0.06020957604050636, "learning_rate": 4.639915705083572e-06, "loss": 0.1562570333480835, "step": 854 }, { "epoch": 3.689655172413793, "grad_norm": 0.07793577015399933, "learning_rate": 4.606432323143412e-06, "loss": 0.15900962054729462, "step": 856 }, { "epoch": 3.6982758620689653, "grad_norm": 0.13332881033420563, "learning_rate": 4.573833638602159e-06, "loss": 0.22381483018398285, "step": 858 }, { "epoch": 3.706896551724138, "grad_norm": 0.02578054927289486, "learning_rate": 4.542121309542383e-06, "loss": 0.09598782658576965, "step": 860 }, { "epoch": 3.7155172413793105, "grad_norm": 0.15609142184257507, "learning_rate": 4.511296948963527e-06, "loss": 0.19943147897720337, "step": 862 }, { "epoch": 3.7241379310344827, "grad_norm": 0.01152154989540577, "learning_rate": 4.4813621246998765e-06, "loss": 0.07272744178771973, "step": 864 }, { "epoch": 3.7327586206896552, "grad_norm": 0.06737919896841049, "learning_rate": 4.45231835934079e-06, "loss": 0.1303609311580658, "step": 866 }, { "epoch": 3.7413793103448274, "grad_norm": 0.2695164084434509, "learning_rate": 4.424167130153277e-06, "loss": 0.18073761463165283, "step": 868 }, { "epoch": 3.75, "grad_norm": 0.044187769293785095, "learning_rate": 4.396909869006847e-06, "loss": 0.12275875359773636, "step": 870 }, { "epoch": 3.7586206896551726, "grad_norm": 0.008070076815783978, "learning_rate": 4.3705479623006866e-06, "loss": 0.06019383668899536, "step": 872 }, { "epoch": 3.7672413793103448, "grad_norm": 0.06783478707075119, "learning_rate": 4.345082750893132e-06, "loss": 0.10059908032417297, "step": 874 }, { "epoch": 3.7758620689655173, "grad_norm": 0.10238350927829742, "learning_rate": 4.320515530033487e-06, "loss": 0.30081015825271606, "step": 876 }, { "epoch": 3.7844827586206895, "grad_norm": 0.14017876982688904, "learning_rate": 4.296847549296115e-06, "loss": 0.30149415135383606, "step": 878 }, { "epoch": 3.793103448275862, "grad_norm": 0.06336027383804321, "learning_rate": 4.274080012516909e-06, "loss": 0.13996456563472748, "step": 880 }, { "epoch": 3.8017241379310347, "grad_norm": 0.04175444692373276, "learning_rate": 4.2522140777320404e-06, "loss": 0.09511252492666245, "step": 882 }, { "epoch": 3.810344827586207, "grad_norm": 0.05849481746554375, "learning_rate": 4.23125085711907e-06, "loss": 0.23412549495697021, "step": 884 }, { "epoch": 3.8189655172413794, "grad_norm": 0.06667976826429367, "learning_rate": 4.21119141694037e-06, "loss": 0.160780131816864, "step": 886 }, { "epoch": 3.8275862068965516, "grad_norm": 0.056150369346141815, "learning_rate": 4.192036777488896e-06, "loss": 0.11835036426782608, "step": 888 }, { "epoch": 3.836206896551724, "grad_norm": 0.12212974578142166, "learning_rate": 4.173787913036284e-06, "loss": 0.11370360106229782, "step": 890 }, { "epoch": 3.844827586206897, "grad_norm": 0.08489777147769928, "learning_rate": 4.156445751783308e-06, "loss": 0.17437399923801422, "step": 892 }, { "epoch": 3.853448275862069, "grad_norm": 0.03076266683638096, "learning_rate": 4.140011175812656e-06, "loss": 0.15946733951568604, "step": 894 }, { "epoch": 3.862068965517241, "grad_norm": 0.044967204332351685, "learning_rate": 4.124485021044069e-06, "loss": 0.16160649061203003, "step": 896 }, { "epoch": 3.8706896551724137, "grad_norm": 0.06540035456418991, "learning_rate": 4.1098680771918245e-06, "loss": 0.13039463758468628, "step": 898 }, { "epoch": 3.8793103448275863, "grad_norm": 0.0530594103038311, "learning_rate": 4.096161087724573e-06, "loss": 0.16959071159362793, "step": 900 }, { "epoch": 3.887931034482759, "grad_norm": 0.15922929346561432, "learning_rate": 4.0833647498275085e-06, "loss": 0.20945216715335846, "step": 902 }, { "epoch": 3.896551724137931, "grad_norm": 0.06301749497652054, "learning_rate": 4.07147971436692e-06, "loss": 0.22212789952754974, "step": 904 }, { "epoch": 3.905172413793103, "grad_norm": 0.15187880396842957, "learning_rate": 4.060506585857085e-06, "loss": 0.21481694281101227, "step": 906 }, { "epoch": 3.913793103448276, "grad_norm": 0.32235345244407654, "learning_rate": 4.0504459224295174e-06, "loss": 0.16184020042419434, "step": 908 }, { "epoch": 3.9224137931034484, "grad_norm": 0.07125352323055267, "learning_rate": 4.041298235804577e-06, "loss": 0.1316578984260559, "step": 910 }, { "epoch": 3.9310344827586206, "grad_norm": 0.04981033504009247, "learning_rate": 4.0330639912654516e-06, "loss": 0.11852114647626877, "step": 912 }, { "epoch": 3.939655172413793, "grad_norm": 0.11605612933635712, "learning_rate": 4.02574360763448e-06, "loss": 0.16175302863121033, "step": 914 }, { "epoch": 3.9482758620689653, "grad_norm": 0.05728490650653839, "learning_rate": 4.019337457251857e-06, "loss": 0.16411718726158142, "step": 916 }, { "epoch": 3.956896551724138, "grad_norm": 0.08225003629922867, "learning_rate": 4.013845865956692e-06, "loss": 0.22733992338180542, "step": 918 }, { "epoch": 3.9655172413793105, "grad_norm": 0.2269326150417328, "learning_rate": 4.00926911307043e-06, "loss": 0.1820860654115677, "step": 920 }, { "epoch": 3.9741379310344827, "grad_norm": 0.06667070835828781, "learning_rate": 4.005607431382659e-06, "loss": 0.15438126027584076, "step": 922 }, { "epoch": 3.9827586206896552, "grad_norm": 0.05220466107130051, "learning_rate": 4.002861007139253e-06, "loss": 0.17508190870285034, "step": 924 }, { "epoch": 3.9913793103448274, "grad_norm": 0.07188162952661514, "learning_rate": 4.001029980032909e-06, "loss": 0.1996261328458786, "step": 926 }, { "epoch": 4.0, "grad_norm": 0.3331678509712219, "learning_rate": 4.000114443196044e-06, "loss": 0.2806675136089325, "step": 928 }, { "epoch": 4.0, "step": 928, "total_flos": 3.61934117404672e+18, "train_loss": 0.7229323635552207, "train_runtime": 32622.9814, "train_samples_per_second": 1.707, "train_steps_per_second": 0.028 } ], "logging_steps": 2, "max_steps": 928, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 99999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.61934117404672e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }