diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6357 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 1804, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004434589800443459, + "grad_norm": 6.676810264587402, + "learning_rate": 1.098901098901099e-07, + "loss": 1.8619344234466553, + "step": 2 + }, + { + "epoch": 0.008869179600886918, + "grad_norm": 8.697616577148438, + "learning_rate": 3.296703296703297e-07, + "loss": 2.1274397373199463, + "step": 4 + }, + { + "epoch": 0.013303769401330377, + "grad_norm": 3.895693063735962, + "learning_rate": 5.494505494505495e-07, + "loss": 1.9044010639190674, + "step": 6 + }, + { + "epoch": 0.017738359201773836, + "grad_norm": 1.11763334274292, + "learning_rate": 7.692307692307694e-07, + "loss": 1.8132928609848022, + "step": 8 + }, + { + "epoch": 0.022172949002217297, + "grad_norm": 2.8607394695281982, + "learning_rate": 9.890109890109891e-07, + "loss": 1.619278907775879, + "step": 10 + }, + { + "epoch": 0.026607538802660754, + "grad_norm": 1.7798055410385132, + "learning_rate": 1.2087912087912089e-06, + "loss": 2.0118155479431152, + "step": 12 + }, + { + "epoch": 0.031042128603104215, + "grad_norm": 4.10674524307251, + "learning_rate": 1.4285714285714286e-06, + "loss": 1.5696207284927368, + "step": 14 + }, + { + "epoch": 0.03547671840354767, + "grad_norm": 4.36632776260376, + "learning_rate": 1.6483516483516484e-06, + "loss": 1.1592055559158325, + "step": 16 + }, + { + "epoch": 0.03991130820399113, + "grad_norm": 2.2460248470306396, + "learning_rate": 1.8681318681318684e-06, + "loss": 1.1627295017242432, + "step": 18 + }, + { + "epoch": 0.04434589800443459, + "grad_norm": 1.9129483699798584, + "learning_rate": 2.0879120879120883e-06, + "loss": 1.2617377042770386, + "step": 20 + }, + { + "epoch": 0.04878048780487805, + "grad_norm": 2.714024066925049, + "learning_rate": 2.307692307692308e-06, + "loss": 1.3001048564910889, + "step": 22 + }, + { + "epoch": 0.05321507760532151, + "grad_norm": 1.8959801197052002, + "learning_rate": 2.5274725274725274e-06, + "loss": 1.5188698768615723, + "step": 24 + }, + { + "epoch": 0.057649667405764965, + "grad_norm": 2.2347946166992188, + "learning_rate": 2.7472527472527476e-06, + "loss": 1.2878303527832031, + "step": 26 + }, + { + "epoch": 0.06208425720620843, + "grad_norm": 0.7839597463607788, + "learning_rate": 2.9670329670329673e-06, + "loss": 1.0919771194458008, + "step": 28 + }, + { + "epoch": 0.06651884700665188, + "grad_norm": 1.253993272781372, + "learning_rate": 3.1868131868131867e-06, + "loss": 1.212958574295044, + "step": 30 + }, + { + "epoch": 0.07095343680709534, + "grad_norm": 1.9655462503433228, + "learning_rate": 3.406593406593407e-06, + "loss": 1.1135960817337036, + "step": 32 + }, + { + "epoch": 0.07538802660753881, + "grad_norm": 0.9812124967575073, + "learning_rate": 3.6263736263736266e-06, + "loss": 1.4904026985168457, + "step": 34 + }, + { + "epoch": 0.07982261640798226, + "grad_norm": 4.495583534240723, + "learning_rate": 3.846153846153847e-06, + "loss": 1.2115815877914429, + "step": 36 + }, + { + "epoch": 0.08425720620842572, + "grad_norm": 6.047755241394043, + "learning_rate": 4.065934065934066e-06, + "loss": 1.5368636846542358, + "step": 38 + }, + { + "epoch": 0.08869179600886919, + "grad_norm": 1.0357166528701782, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.7659682631492615, + "step": 40 + }, + { + "epoch": 0.09312638580931264, + "grad_norm": 1.6552459001541138, + "learning_rate": 4.505494505494506e-06, + "loss": 1.398829698562622, + "step": 42 + }, + { + "epoch": 0.0975609756097561, + "grad_norm": 0.8732961416244507, + "learning_rate": 4.725274725274726e-06, + "loss": 1.3438159227371216, + "step": 44 + }, + { + "epoch": 0.10199556541019955, + "grad_norm": 2.114135265350342, + "learning_rate": 4.945054945054946e-06, + "loss": 1.3703439235687256, + "step": 46 + }, + { + "epoch": 0.10643015521064302, + "grad_norm": 2.1336262226104736, + "learning_rate": 5.164835164835166e-06, + "loss": 1.6028835773468018, + "step": 48 + }, + { + "epoch": 0.11086474501108648, + "grad_norm": 1.4948954582214355, + "learning_rate": 5.384615384615385e-06, + "loss": 1.4623559713363647, + "step": 50 + }, + { + "epoch": 0.11529933481152993, + "grad_norm": 1.0568091869354248, + "learning_rate": 5.604395604395605e-06, + "loss": 1.3791553974151611, + "step": 52 + }, + { + "epoch": 0.1197339246119734, + "grad_norm": 1.6316254138946533, + "learning_rate": 5.824175824175825e-06, + "loss": 1.3742573261260986, + "step": 54 + }, + { + "epoch": 0.12416851441241686, + "grad_norm": 3.871581554412842, + "learning_rate": 6.043956043956044e-06, + "loss": 1.1069594621658325, + "step": 56 + }, + { + "epoch": 0.1286031042128603, + "grad_norm": 1.0459665060043335, + "learning_rate": 6.2637362637362645e-06, + "loss": 1.3376773595809937, + "step": 58 + }, + { + "epoch": 0.13303769401330376, + "grad_norm": 0.9134399890899658, + "learning_rate": 6.483516483516485e-06, + "loss": 1.3568081855773926, + "step": 60 + }, + { + "epoch": 0.13747228381374724, + "grad_norm": 1.6270042657852173, + "learning_rate": 6.703296703296703e-06, + "loss": 1.3422999382019043, + "step": 62 + }, + { + "epoch": 0.1419068736141907, + "grad_norm": 1.9580867290496826, + "learning_rate": 6.923076923076923e-06, + "loss": 1.397621512413025, + "step": 64 + }, + { + "epoch": 0.14634146341463414, + "grad_norm": 1.5168925523757935, + "learning_rate": 7.1428571428571436e-06, + "loss": 1.4576083421707153, + "step": 66 + }, + { + "epoch": 0.15077605321507762, + "grad_norm": 1.0614873170852661, + "learning_rate": 7.362637362637364e-06, + "loss": 1.329024314880371, + "step": 68 + }, + { + "epoch": 0.15521064301552107, + "grad_norm": 0.9294531345367432, + "learning_rate": 7.582417582417583e-06, + "loss": 1.2933138608932495, + "step": 70 + }, + { + "epoch": 0.15964523281596452, + "grad_norm": 0.9475209712982178, + "learning_rate": 7.802197802197802e-06, + "loss": 1.3392677307128906, + "step": 72 + }, + { + "epoch": 0.164079822616408, + "grad_norm": 1.461456060409546, + "learning_rate": 8.021978021978023e-06, + "loss": 1.2334855794906616, + "step": 74 + }, + { + "epoch": 0.16851441241685144, + "grad_norm": 1.7553434371948242, + "learning_rate": 8.241758241758243e-06, + "loss": 1.3260265588760376, + "step": 76 + }, + { + "epoch": 0.1729490022172949, + "grad_norm": 1.0386556386947632, + "learning_rate": 8.461538461538462e-06, + "loss": 1.3698723316192627, + "step": 78 + }, + { + "epoch": 0.17738359201773837, + "grad_norm": 1.352614164352417, + "learning_rate": 8.681318681318681e-06, + "loss": 1.328392744064331, + "step": 80 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 1.3509490489959717, + "learning_rate": 8.9010989010989e-06, + "loss": 0.8664931654930115, + "step": 82 + }, + { + "epoch": 0.18625277161862527, + "grad_norm": 0.7466371059417725, + "learning_rate": 9.120879120879122e-06, + "loss": 1.0877059698104858, + "step": 84 + }, + { + "epoch": 0.19068736141906872, + "grad_norm": 1.6466113328933716, + "learning_rate": 9.340659340659341e-06, + "loss": 1.044270396232605, + "step": 86 + }, + { + "epoch": 0.1951219512195122, + "grad_norm": 1.104884386062622, + "learning_rate": 9.560439560439562e-06, + "loss": 1.2637630701065063, + "step": 88 + }, + { + "epoch": 0.19955654101995565, + "grad_norm": 1.8905885219573975, + "learning_rate": 9.780219780219781e-06, + "loss": 1.3959146738052368, + "step": 90 + }, + { + "epoch": 0.2039911308203991, + "grad_norm": 1.2638733386993408, + "learning_rate": 1e-05, + "loss": 1.2453607320785522, + "step": 92 + }, + { + "epoch": 0.20842572062084258, + "grad_norm": 0.8784367442131042, + "learning_rate": 9.99996972898091e-06, + "loss": 1.2674062252044678, + "step": 94 + }, + { + "epoch": 0.21286031042128603, + "grad_norm": 2.076108932495117, + "learning_rate": 9.999878916330893e-06, + "loss": 1.806736946105957, + "step": 96 + }, + { + "epoch": 0.21729490022172948, + "grad_norm": 1.4329758882522583, + "learning_rate": 9.999727563271727e-06, + "loss": 0.9466310143470764, + "step": 98 + }, + { + "epoch": 0.22172949002217296, + "grad_norm": 0.8503994941711426, + "learning_rate": 9.999515671839682e-06, + "loss": 1.1875429153442383, + "step": 100 + }, + { + "epoch": 0.2261640798226164, + "grad_norm": 1.2419079542160034, + "learning_rate": 9.999243244885499e-06, + "loss": 1.3442714214324951, + "step": 102 + }, + { + "epoch": 0.23059866962305986, + "grad_norm": 1.420776605606079, + "learning_rate": 9.998910286074355e-06, + "loss": 1.0798242092132568, + "step": 104 + }, + { + "epoch": 0.23503325942350334, + "grad_norm": 0.7273538112640381, + "learning_rate": 9.998516799885806e-06, + "loss": 1.3031997680664062, + "step": 106 + }, + { + "epoch": 0.2394678492239468, + "grad_norm": 1.5330766439437866, + "learning_rate": 9.998062791613729e-06, + "loss": 1.5008981227874756, + "step": 108 + }, + { + "epoch": 0.24390243902439024, + "grad_norm": 0.8395664095878601, + "learning_rate": 9.997548267366255e-06, + "loss": 1.0781991481781006, + "step": 110 + }, + { + "epoch": 0.24833702882483372, + "grad_norm": 5.73301362991333, + "learning_rate": 9.996973234065685e-06, + "loss": 1.315911054611206, + "step": 112 + }, + { + "epoch": 0.25277161862527714, + "grad_norm": 3.2818472385406494, + "learning_rate": 9.996337699448392e-06, + "loss": 0.8265340924263, + "step": 114 + }, + { + "epoch": 0.2572062084257206, + "grad_norm": 1.1112722158432007, + "learning_rate": 9.995641672064726e-06, + "loss": 1.3526400327682495, + "step": 116 + }, + { + "epoch": 0.2616407982261641, + "grad_norm": 0.6785824298858643, + "learning_rate": 9.994885161278885e-06, + "loss": 1.0982011556625366, + "step": 118 + }, + { + "epoch": 0.2660753880266075, + "grad_norm": 6.2376508712768555, + "learning_rate": 9.994068177268807e-06, + "loss": 1.276107668876648, + "step": 120 + }, + { + "epoch": 0.270509977827051, + "grad_norm": 1.5868315696716309, + "learning_rate": 9.993190731026024e-06, + "loss": 1.213824987411499, + "step": 122 + }, + { + "epoch": 0.2749445676274945, + "grad_norm": 1.3105850219726562, + "learning_rate": 9.992252834355503e-06, + "loss": 1.576740026473999, + "step": 124 + }, + { + "epoch": 0.2793791574279379, + "grad_norm": 0.6459128260612488, + "learning_rate": 9.99125449987551e-06, + "loss": 0.9588035941123962, + "step": 126 + }, + { + "epoch": 0.2838137472283814, + "grad_norm": 0.8222401738166809, + "learning_rate": 9.990195741017422e-06, + "loss": 1.2112478017807007, + "step": 128 + }, + { + "epoch": 0.28824833702882485, + "grad_norm": 1.1047145128250122, + "learning_rate": 9.989076572025554e-06, + "loss": 0.9570561647415161, + "step": 130 + }, + { + "epoch": 0.2926829268292683, + "grad_norm": 4.633395671844482, + "learning_rate": 9.987897007956968e-06, + "loss": 1.005781888961792, + "step": 132 + }, + { + "epoch": 0.29711751662971175, + "grad_norm": 2.7191808223724365, + "learning_rate": 9.986657064681267e-06, + "loss": 0.9942412376403809, + "step": 134 + }, + { + "epoch": 0.30155210643015523, + "grad_norm": 5.270407676696777, + "learning_rate": 9.98535675888038e-06, + "loss": 1.249075174331665, + "step": 136 + }, + { + "epoch": 0.30598669623059865, + "grad_norm": 2.1436080932617188, + "learning_rate": 9.983996108048345e-06, + "loss": 0.8106088638305664, + "step": 138 + }, + { + "epoch": 0.31042128603104213, + "grad_norm": 1.0196813344955444, + "learning_rate": 9.982575130491068e-06, + "loss": 1.129402995109558, + "step": 140 + }, + { + "epoch": 0.3148558758314856, + "grad_norm": 0.7778133749961853, + "learning_rate": 9.981093845326079e-06, + "loss": 0.9775266647338867, + "step": 142 + }, + { + "epoch": 0.31929046563192903, + "grad_norm": 2.3564629554748535, + "learning_rate": 9.979552272482268e-06, + "loss": 0.9763571619987488, + "step": 144 + }, + { + "epoch": 0.3237250554323725, + "grad_norm": 1.7340655326843262, + "learning_rate": 9.977950432699629e-06, + "loss": 0.9292731881141663, + "step": 146 + }, + { + "epoch": 0.328159645232816, + "grad_norm": 2.971210241317749, + "learning_rate": 9.976288347528972e-06, + "loss": 1.4188424348831177, + "step": 148 + }, + { + "epoch": 0.3325942350332594, + "grad_norm": 1.8986440896987915, + "learning_rate": 9.974566039331634e-06, + "loss": 1.1764416694641113, + "step": 150 + }, + { + "epoch": 0.3370288248337029, + "grad_norm": 4.777287483215332, + "learning_rate": 9.972783531279184e-06, + "loss": 1.0751426219940186, + "step": 152 + }, + { + "epoch": 0.34146341463414637, + "grad_norm": 1.0308066606521606, + "learning_rate": 9.970940847353103e-06, + "loss": 1.2747527360916138, + "step": 154 + }, + { + "epoch": 0.3458980044345898, + "grad_norm": 1.398594617843628, + "learning_rate": 9.969038012344465e-06, + "loss": 1.239310622215271, + "step": 156 + }, + { + "epoch": 0.35033259423503327, + "grad_norm": 1.4237884283065796, + "learning_rate": 9.967075051853609e-06, + "loss": 1.2969309091567993, + "step": 158 + }, + { + "epoch": 0.35476718403547675, + "grad_norm": 0.5553252100944519, + "learning_rate": 9.965051992289782e-06, + "loss": 1.18067467212677, + "step": 160 + }, + { + "epoch": 0.35920177383592017, + "grad_norm": 1.2233624458312988, + "learning_rate": 9.962968860870798e-06, + "loss": 0.8922700881958008, + "step": 162 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 8.296998977661133, + "learning_rate": 9.96082568562266e-06, + "loss": 0.9263341426849365, + "step": 164 + }, + { + "epoch": 0.36807095343680707, + "grad_norm": 1.4043900966644287, + "learning_rate": 9.958622495379193e-06, + "loss": 1.2886468172073364, + "step": 166 + }, + { + "epoch": 0.37250554323725055, + "grad_norm": 2.4009010791778564, + "learning_rate": 9.956359319781642e-06, + "loss": 1.2114157676696777, + "step": 168 + }, + { + "epoch": 0.376940133037694, + "grad_norm": 0.9832214713096619, + "learning_rate": 9.954036189278292e-06, + "loss": 1.2078379392623901, + "step": 170 + }, + { + "epoch": 0.38137472283813745, + "grad_norm": 1.6480237245559692, + "learning_rate": 9.951653135124045e-06, + "loss": 0.8002850413322449, + "step": 172 + }, + { + "epoch": 0.3858093126385809, + "grad_norm": 4.175271987915039, + "learning_rate": 9.94921018938e-06, + "loss": 1.6485449075698853, + "step": 174 + }, + { + "epoch": 0.3902439024390244, + "grad_norm": 0.9011497497558594, + "learning_rate": 9.946707384913027e-06, + "loss": 1.3081176280975342, + "step": 176 + }, + { + "epoch": 0.3946784922394678, + "grad_norm": 1.0457979440689087, + "learning_rate": 9.944144755395321e-06, + "loss": 1.3417953252792358, + "step": 178 + }, + { + "epoch": 0.3991130820399113, + "grad_norm": 0.6271790862083435, + "learning_rate": 9.941522335303955e-06, + "loss": 1.094902753829956, + "step": 180 + }, + { + "epoch": 0.4035476718403548, + "grad_norm": 0.9381081461906433, + "learning_rate": 9.938840159920406e-06, + "loss": 1.2949843406677246, + "step": 182 + }, + { + "epoch": 0.4079822616407982, + "grad_norm": 0.9942634105682373, + "learning_rate": 9.93609826533009e-06, + "loss": 0.8491639494895935, + "step": 184 + }, + { + "epoch": 0.4124168514412417, + "grad_norm": 1.0290542840957642, + "learning_rate": 9.933296688421872e-06, + "loss": 0.9982600808143616, + "step": 186 + }, + { + "epoch": 0.41685144124168516, + "grad_norm": 1.6970468759536743, + "learning_rate": 9.930435466887564e-06, + "loss": 0.9716013669967651, + "step": 188 + }, + { + "epoch": 0.4212860310421286, + "grad_norm": 5.200766086578369, + "learning_rate": 9.927514639221433e-06, + "loss": 1.0068128108978271, + "step": 190 + }, + { + "epoch": 0.42572062084257206, + "grad_norm": 2.2745022773742676, + "learning_rate": 9.92453424471967e-06, + "loss": 0.921920895576477, + "step": 192 + }, + { + "epoch": 0.43015521064301554, + "grad_norm": 1.7652184963226318, + "learning_rate": 9.921494323479862e-06, + "loss": 1.3151664733886719, + "step": 194 + }, + { + "epoch": 0.43458980044345896, + "grad_norm": 1.6204708814620972, + "learning_rate": 9.918394916400465e-06, + "loss": 1.5716735124588013, + "step": 196 + }, + { + "epoch": 0.43902439024390244, + "grad_norm": 1.9356732368469238, + "learning_rate": 9.915236065180235e-06, + "loss": 1.3005520105361938, + "step": 198 + }, + { + "epoch": 0.4434589800443459, + "grad_norm": 0.5605640411376953, + "learning_rate": 9.912017812317684e-06, + "loss": 1.1581960916519165, + "step": 200 + }, + { + "epoch": 0.44789356984478934, + "grad_norm": 0.8719038963317871, + "learning_rate": 9.908740201110497e-06, + "loss": 1.2721892595291138, + "step": 202 + }, + { + "epoch": 0.4523281596452328, + "grad_norm": 0.739933431148529, + "learning_rate": 9.905403275654951e-06, + "loss": 1.2388838529586792, + "step": 204 + }, + { + "epoch": 0.4567627494456763, + "grad_norm": 2.4202499389648438, + "learning_rate": 9.902007080845336e-06, + "loss": 1.101776123046875, + "step": 206 + }, + { + "epoch": 0.4611973392461197, + "grad_norm": 0.8520665764808655, + "learning_rate": 9.898551662373325e-06, + "loss": 1.1120811700820923, + "step": 208 + }, + { + "epoch": 0.4656319290465632, + "grad_norm": 3.274420976638794, + "learning_rate": 9.895037066727382e-06, + "loss": 0.6910912990570068, + "step": 210 + }, + { + "epoch": 0.4700665188470067, + "grad_norm": 1.9249039888381958, + "learning_rate": 9.891463341192124e-06, + "loss": 0.874860405921936, + "step": 212 + }, + { + "epoch": 0.4745011086474501, + "grad_norm": 0.7327234148979187, + "learning_rate": 9.88783053384769e-06, + "loss": 1.1093487739562988, + "step": 214 + }, + { + "epoch": 0.4789356984478936, + "grad_norm": 0.6075326800346375, + "learning_rate": 9.884138693569095e-06, + "loss": 1.126065731048584, + "step": 216 + }, + { + "epoch": 0.48337028824833705, + "grad_norm": 0.8966704607009888, + "learning_rate": 9.88038787002557e-06, + "loss": 1.1469029188156128, + "step": 218 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 3.5954678058624268, + "learning_rate": 9.876578113679891e-06, + "loss": 1.2308658361434937, + "step": 220 + }, + { + "epoch": 0.49223946784922396, + "grad_norm": 0.9438884854316711, + "learning_rate": 9.872709475787708e-06, + "loss": 1.246482253074646, + "step": 222 + }, + { + "epoch": 0.49667405764966743, + "grad_norm": 0.6973426938056946, + "learning_rate": 9.868782008396848e-06, + "loss": 1.2197092771530151, + "step": 224 + }, + { + "epoch": 0.5011086474501109, + "grad_norm": 1.284665822982788, + "learning_rate": 9.864795764346615e-06, + "loss": 1.2148925065994263, + "step": 226 + }, + { + "epoch": 0.5055432372505543, + "grad_norm": 0.6621900200843811, + "learning_rate": 9.860750797267085e-06, + "loss": 1.2680370807647705, + "step": 228 + }, + { + "epoch": 0.5099778270509978, + "grad_norm": 1.091399073600769, + "learning_rate": 9.856647161578384e-06, + "loss": 1.854086995124817, + "step": 230 + }, + { + "epoch": 0.5144124168514412, + "grad_norm": 1.1289076805114746, + "learning_rate": 9.852484912489946e-06, + "loss": 0.9193805456161499, + "step": 232 + }, + { + "epoch": 0.5188470066518847, + "grad_norm": 0.5568757653236389, + "learning_rate": 9.848264105999783e-06, + "loss": 1.2920536994934082, + "step": 234 + }, + { + "epoch": 0.5232815964523282, + "grad_norm": 0.6778749227523804, + "learning_rate": 9.843984798893722e-06, + "loss": 0.9870902299880981, + "step": 236 + }, + { + "epoch": 0.5277161862527716, + "grad_norm": 2.20177960395813, + "learning_rate": 9.839647048744645e-06, + "loss": 1.0330405235290527, + "step": 238 + }, + { + "epoch": 0.532150776053215, + "grad_norm": 0.7545985579490662, + "learning_rate": 9.83525091391172e-06, + "loss": 1.282772421836853, + "step": 240 + }, + { + "epoch": 0.5365853658536586, + "grad_norm": 0.8147256970405579, + "learning_rate": 9.8307964535396e-06, + "loss": 1.3363362550735474, + "step": 242 + }, + { + "epoch": 0.541019955654102, + "grad_norm": 4.192741870880127, + "learning_rate": 9.826283727557644e-06, + "loss": 0.9940363764762878, + "step": 244 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 0.9781287312507629, + "learning_rate": 9.821712796679106e-06, + "loss": 1.2755184173583984, + "step": 246 + }, + { + "epoch": 0.549889135254989, + "grad_norm": 1.4529197216033936, + "learning_rate": 9.817083722400309e-06, + "loss": 1.5036003589630127, + "step": 248 + }, + { + "epoch": 0.5543237250554324, + "grad_norm": 1.9173634052276611, + "learning_rate": 9.812396566999832e-06, + "loss": 1.2569671869277954, + "step": 250 + }, + { + "epoch": 0.5587583148558758, + "grad_norm": 2.1625723838806152, + "learning_rate": 9.807651393537659e-06, + "loss": 0.7617548704147339, + "step": 252 + }, + { + "epoch": 0.5631929046563193, + "grad_norm": 3.9226653575897217, + "learning_rate": 9.802848265854343e-06, + "loss": 1.5005896091461182, + "step": 254 + }, + { + "epoch": 0.5676274944567627, + "grad_norm": 3.566586971282959, + "learning_rate": 9.797987248570137e-06, + "loss": 1.352405071258545, + "step": 256 + }, + { + "epoch": 0.5720620842572062, + "grad_norm": 3.0757203102111816, + "learning_rate": 9.793068407084125e-06, + "loss": 1.306916356086731, + "step": 258 + }, + { + "epoch": 0.5764966740576497, + "grad_norm": 2.998680830001831, + "learning_rate": 9.78809180757335e-06, + "loss": 1.3136558532714844, + "step": 260 + }, + { + "epoch": 0.5809312638580931, + "grad_norm": 5.087543487548828, + "learning_rate": 9.783057516991921e-06, + "loss": 0.7688280940055847, + "step": 262 + }, + { + "epoch": 0.5853658536585366, + "grad_norm": 9.870433807373047, + "learning_rate": 9.777965603070106e-06, + "loss": 1.354008674621582, + "step": 264 + }, + { + "epoch": 0.5898004434589801, + "grad_norm": 2.463515520095825, + "learning_rate": 9.772816134313424e-06, + "loss": 1.0871102809906006, + "step": 266 + }, + { + "epoch": 0.5942350332594235, + "grad_norm": 2.047605514526367, + "learning_rate": 9.76760918000173e-06, + "loss": 1.5661654472351074, + "step": 268 + }, + { + "epoch": 0.5986696230598669, + "grad_norm": 3.3246095180511475, + "learning_rate": 9.762344810188276e-06, + "loss": 1.1864956617355347, + "step": 270 + }, + { + "epoch": 0.6031042128603105, + "grad_norm": 1.8888014554977417, + "learning_rate": 9.757023095698766e-06, + "loss": 1.2435264587402344, + "step": 272 + }, + { + "epoch": 0.6075388026607539, + "grad_norm": 2.089836835861206, + "learning_rate": 9.751644108130405e-06, + "loss": 1.2407060861587524, + "step": 274 + }, + { + "epoch": 0.6119733924611973, + "grad_norm": 2.53568959236145, + "learning_rate": 9.746207919850951e-06, + "loss": 1.2337491512298584, + "step": 276 + }, + { + "epoch": 0.6164079822616408, + "grad_norm": 6.390379428863525, + "learning_rate": 9.740714603997712e-06, + "loss": 1.2231154441833496, + "step": 278 + }, + { + "epoch": 0.6208425720620843, + "grad_norm": 1.5388522148132324, + "learning_rate": 9.735164234476588e-06, + "loss": 1.30294668674469, + "step": 280 + }, + { + "epoch": 0.6252771618625277, + "grad_norm": 2.599823236465454, + "learning_rate": 9.729556885961064e-06, + "loss": 0.8776879906654358, + "step": 282 + }, + { + "epoch": 0.6297117516629712, + "grad_norm": 2.7698779106140137, + "learning_rate": 9.72389263389121e-06, + "loss": 1.264892339706421, + "step": 284 + }, + { + "epoch": 0.6341463414634146, + "grad_norm": 3.3235433101654053, + "learning_rate": 9.718171554472662e-06, + "loss": 1.3354568481445312, + "step": 286 + }, + { + "epoch": 0.6385809312638581, + "grad_norm": 2.9492456912994385, + "learning_rate": 9.712393724675597e-06, + "loss": 1.2577565908432007, + "step": 288 + }, + { + "epoch": 0.6430155210643016, + "grad_norm": 1.416076421737671, + "learning_rate": 9.706559222233704e-06, + "loss": 1.2443256378173828, + "step": 290 + }, + { + "epoch": 0.647450110864745, + "grad_norm": 2.3916337490081787, + "learning_rate": 9.700668125643132e-06, + "loss": 1.3704557418823242, + "step": 292 + }, + { + "epoch": 0.6518847006651884, + "grad_norm": 2.637317180633545, + "learning_rate": 9.694720514161437e-06, + "loss": 0.9171057343482971, + "step": 294 + }, + { + "epoch": 0.656319290465632, + "grad_norm": 2.0566046237945557, + "learning_rate": 9.688716467806508e-06, + "loss": 0.9948861002922058, + "step": 296 + }, + { + "epoch": 0.6607538802660754, + "grad_norm": 1.9021042585372925, + "learning_rate": 9.682656067355505e-06, + "loss": 1.2095568180084229, + "step": 298 + }, + { + "epoch": 0.6651884700665188, + "grad_norm": 4.523650646209717, + "learning_rate": 9.67653939434376e-06, + "loss": 1.2600574493408203, + "step": 300 + }, + { + "epoch": 0.6696230598669624, + "grad_norm": 1.1016621589660645, + "learning_rate": 9.670366531063686e-06, + "loss": 1.2038549184799194, + "step": 302 + }, + { + "epoch": 0.6740576496674058, + "grad_norm": 2.1605725288391113, + "learning_rate": 9.664137560563663e-06, + "loss": 1.2652329206466675, + "step": 304 + }, + { + "epoch": 0.6784922394678492, + "grad_norm": 1.6918096542358398, + "learning_rate": 9.657852566646929e-06, + "loss": 1.2377828359603882, + "step": 306 + }, + { + "epoch": 0.6829268292682927, + "grad_norm": 1.1809215545654297, + "learning_rate": 9.651511633870451e-06, + "loss": 0.8821107149124146, + "step": 308 + }, + { + "epoch": 0.6873614190687362, + "grad_norm": 0.7865143418312073, + "learning_rate": 9.645114847543781e-06, + "loss": 1.2431116104125977, + "step": 310 + }, + { + "epoch": 0.6917960088691796, + "grad_norm": 1.0868396759033203, + "learning_rate": 9.638662293727916e-06, + "loss": 1.219387412071228, + "step": 312 + }, + { + "epoch": 0.6962305986696231, + "grad_norm": 0.804448127746582, + "learning_rate": 9.632154059234137e-06, + "loss": 1.23236083984375, + "step": 314 + }, + { + "epoch": 0.7006651884700665, + "grad_norm": 0.7761425971984863, + "learning_rate": 9.625590231622837e-06, + "loss": 1.3466178178787231, + "step": 316 + }, + { + "epoch": 0.70509977827051, + "grad_norm": 1.0411393642425537, + "learning_rate": 9.618970899202354e-06, + "loss": 1.077616572380066, + "step": 318 + }, + { + "epoch": 0.7095343680709535, + "grad_norm": 1.4453805685043335, + "learning_rate": 9.612296151027765e-06, + "loss": 0.9873568415641785, + "step": 320 + }, + { + "epoch": 0.7139689578713969, + "grad_norm": 3.3780412673950195, + "learning_rate": 9.605566076899714e-06, + "loss": 0.9905721545219421, + "step": 322 + }, + { + "epoch": 0.7184035476718403, + "grad_norm": 1.9473044872283936, + "learning_rate": 9.598780767363174e-06, + "loss": 1.0931217670440674, + "step": 324 + }, + { + "epoch": 0.7228381374722838, + "grad_norm": 2.0977659225463867, + "learning_rate": 9.591940313706248e-06, + "loss": 1.1007053852081299, + "step": 326 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.8838526606559753, + "learning_rate": 9.585044807958942e-06, + "loss": 0.8777045011520386, + "step": 328 + }, + { + "epoch": 0.7317073170731707, + "grad_norm": 1.6722099781036377, + "learning_rate": 9.578094342891915e-06, + "loss": 0.8626028895378113, + "step": 330 + }, + { + "epoch": 0.7361419068736141, + "grad_norm": 2.46458101272583, + "learning_rate": 9.571089012015237e-06, + "loss": 1.4064295291900635, + "step": 332 + }, + { + "epoch": 0.7405764966740577, + "grad_norm": 2.49685001373291, + "learning_rate": 9.564028909577132e-06, + "loss": 1.2061187028884888, + "step": 334 + }, + { + "epoch": 0.7450110864745011, + "grad_norm": 0.7096370458602905, + "learning_rate": 9.55691413056271e-06, + "loss": 1.2485088109970093, + "step": 336 + }, + { + "epoch": 0.7494456762749445, + "grad_norm": 1.6750073432922363, + "learning_rate": 9.54974477069269e-06, + "loss": 0.7527822256088257, + "step": 338 + }, + { + "epoch": 0.753880266075388, + "grad_norm": 1.7847540378570557, + "learning_rate": 9.542520926422105e-06, + "loss": 0.377991259098053, + "step": 340 + }, + { + "epoch": 0.7583148558758315, + "grad_norm": 1.3835326433181763, + "learning_rate": 9.535242694939011e-06, + "loss": 0.8986580967903137, + "step": 342 + }, + { + "epoch": 0.7627494456762749, + "grad_norm": 0.9166545867919922, + "learning_rate": 9.527910174163179e-06, + "loss": 1.3222459554672241, + "step": 344 + }, + { + "epoch": 0.7671840354767184, + "grad_norm": 0.854032576084137, + "learning_rate": 9.520523462744776e-06, + "loss": 1.2536201477050781, + "step": 346 + }, + { + "epoch": 0.7716186252771619, + "grad_norm": 2.517990827560425, + "learning_rate": 9.51308266006304e-06, + "loss": 1.1171575784683228, + "step": 348 + }, + { + "epoch": 0.7760532150776053, + "grad_norm": 0.8596073389053345, + "learning_rate": 9.505587866224939e-06, + "loss": 1.2495148181915283, + "step": 350 + }, + { + "epoch": 0.7804878048780488, + "grad_norm": 1.1972160339355469, + "learning_rate": 9.498039182063828e-06, + "loss": 1.203031301498413, + "step": 352 + }, + { + "epoch": 0.7849223946784922, + "grad_norm": 5.1456618309021, + "learning_rate": 9.49043670913809e-06, + "loss": 1.050938367843628, + "step": 354 + }, + { + "epoch": 0.7893569844789357, + "grad_norm": 1.1678507328033447, + "learning_rate": 9.48278054972977e-06, + "loss": 1.2427358627319336, + "step": 356 + }, + { + "epoch": 0.7937915742793792, + "grad_norm": 1.5696804523468018, + "learning_rate": 9.475070806843202e-06, + "loss": 1.291693925857544, + "step": 358 + }, + { + "epoch": 0.7982261640798226, + "grad_norm": 0.6834315657615662, + "learning_rate": 9.467307584203619e-06, + "loss": 1.2494440078735352, + "step": 360 + }, + { + "epoch": 0.802660753880266, + "grad_norm": 1.043826699256897, + "learning_rate": 9.459490986255756e-06, + "loss": 0.6734438538551331, + "step": 362 + }, + { + "epoch": 0.8070953436807096, + "grad_norm": 5.6669206619262695, + "learning_rate": 9.451621118162453e-06, + "loss": 1.3504599332809448, + "step": 364 + }, + { + "epoch": 0.811529933481153, + "grad_norm": 0.6810413002967834, + "learning_rate": 9.443698085803235e-06, + "loss": 1.2325347661972046, + "step": 366 + }, + { + "epoch": 0.8159645232815964, + "grad_norm": 1.6936004161834717, + "learning_rate": 9.435721995772884e-06, + "loss": 1.0272517204284668, + "step": 368 + }, + { + "epoch": 0.8203991130820399, + "grad_norm": 1.543861746788025, + "learning_rate": 9.42769295538001e-06, + "loss": 0.9654221534729004, + "step": 370 + }, + { + "epoch": 0.8248337028824834, + "grad_norm": 0.6361260414123535, + "learning_rate": 9.419611072645608e-06, + "loss": 1.255522608757019, + "step": 372 + }, + { + "epoch": 0.8292682926829268, + "grad_norm": 0.7957190871238708, + "learning_rate": 9.4114764563016e-06, + "loss": 1.24748694896698, + "step": 374 + }, + { + "epoch": 0.8337028824833703, + "grad_norm": 0.6240236759185791, + "learning_rate": 9.403289215789373e-06, + "loss": 1.2206608057022095, + "step": 376 + }, + { + "epoch": 0.8381374722838137, + "grad_norm": 1.7681220769882202, + "learning_rate": 9.395049461258318e-06, + "loss": 1.245053768157959, + "step": 378 + }, + { + "epoch": 0.8425720620842572, + "grad_norm": 2.775740385055542, + "learning_rate": 9.386757303564323e-06, + "loss": 0.801120936870575, + "step": 380 + }, + { + "epoch": 0.8470066518847007, + "grad_norm": 0.6272016763687134, + "learning_rate": 9.37841285426831e-06, + "loss": 1.337688684463501, + "step": 382 + }, + { + "epoch": 0.8514412416851441, + "grad_norm": 0.7218549847602844, + "learning_rate": 9.370016225634719e-06, + "loss": 1.1829020977020264, + "step": 384 + }, + { + "epoch": 0.8558758314855875, + "grad_norm": 2.081940174102783, + "learning_rate": 9.361567530629988e-06, + "loss": 1.2651656866073608, + "step": 386 + }, + { + "epoch": 0.8603104212860311, + "grad_norm": 0.46879851818084717, + "learning_rate": 9.353066882921063e-06, + "loss": 1.1930689811706543, + "step": 388 + }, + { + "epoch": 0.8647450110864745, + "grad_norm": 0.878593921661377, + "learning_rate": 9.344514396873837e-06, + "loss": 1.2674225568771362, + "step": 390 + }, + { + "epoch": 0.8691796008869179, + "grad_norm": 0.6476463675498962, + "learning_rate": 9.335910187551628e-06, + "loss": 0.959172785282135, + "step": 392 + }, + { + "epoch": 0.8736141906873615, + "grad_norm": 2.1412248611450195, + "learning_rate": 9.327254370713636e-06, + "loss": 1.2319204807281494, + "step": 394 + }, + { + "epoch": 0.8780487804878049, + "grad_norm": 0.5421054363250732, + "learning_rate": 9.31854706281336e-06, + "loss": 1.222408652305603, + "step": 396 + }, + { + "epoch": 0.8824833702882483, + "grad_norm": 3.0554358959198, + "learning_rate": 9.309788380997069e-06, + "loss": 1.2220014333724976, + "step": 398 + }, + { + "epoch": 0.8869179600886918, + "grad_norm": 0.52972412109375, + "learning_rate": 9.30097844310219e-06, + "loss": 0.5164414644241333, + "step": 400 + }, + { + "epoch": 0.8913525498891353, + "grad_norm": 1.3212813138961792, + "learning_rate": 9.292117367655749e-06, + "loss": 1.0734343528747559, + "step": 402 + }, + { + "epoch": 0.8957871396895787, + "grad_norm": 0.2849816679954529, + "learning_rate": 9.283205273872757e-06, + "loss": 0.9632551074028015, + "step": 404 + }, + { + "epoch": 0.9002217294900222, + "grad_norm": 0.9623860716819763, + "learning_rate": 9.274242281654621e-06, + "loss": 1.2757704257965088, + "step": 406 + }, + { + "epoch": 0.9046563192904656, + "grad_norm": 5.387387752532959, + "learning_rate": 9.265228511587525e-06, + "loss": 1.1458454132080078, + "step": 408 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.2370455265045166, + "learning_rate": 9.2561640849408e-06, + "loss": 1.1194361448287964, + "step": 410 + }, + { + "epoch": 0.9135254988913526, + "grad_norm": 0.8080571293830872, + "learning_rate": 9.247049123665306e-06, + "loss": 1.2487624883651733, + "step": 412 + }, + { + "epoch": 0.917960088691796, + "grad_norm": 0.6047589778900146, + "learning_rate": 9.237883750391786e-06, + "loss": 1.2612035274505615, + "step": 414 + }, + { + "epoch": 0.9223946784922394, + "grad_norm": 3.471346616744995, + "learning_rate": 9.228668088429212e-06, + "loss": 1.0114153623580933, + "step": 416 + }, + { + "epoch": 0.926829268292683, + "grad_norm": 0.7095761299133301, + "learning_rate": 9.219402261763129e-06, + "loss": 1.2827666997909546, + "step": 418 + }, + { + "epoch": 0.9312638580931264, + "grad_norm": 1.4821585416793823, + "learning_rate": 9.210086395053992e-06, + "loss": 0.8083788156509399, + "step": 420 + }, + { + "epoch": 0.9356984478935698, + "grad_norm": 0.8971059322357178, + "learning_rate": 9.200720613635476e-06, + "loss": 1.487288236618042, + "step": 422 + }, + { + "epoch": 0.9401330376940134, + "grad_norm": 1.123197317123413, + "learning_rate": 9.191305043512806e-06, + "loss": 1.0854390859603882, + "step": 424 + }, + { + "epoch": 0.9445676274944568, + "grad_norm": 0.7748414278030396, + "learning_rate": 9.181839811361048e-06, + "loss": 1.219201683998108, + "step": 426 + }, + { + "epoch": 0.9490022172949002, + "grad_norm": 0.6483752131462097, + "learning_rate": 9.172325044523413e-06, + "loss": 1.115012526512146, + "step": 428 + }, + { + "epoch": 0.9534368070953437, + "grad_norm": 2.7231156826019287, + "learning_rate": 9.16276087100954e-06, + "loss": 0.7529615163803101, + "step": 430 + }, + { + "epoch": 0.9578713968957872, + "grad_norm": 0.8433653116226196, + "learning_rate": 9.153147419493774e-06, + "loss": 1.238058090209961, + "step": 432 + }, + { + "epoch": 0.9623059866962306, + "grad_norm": 1.2854188680648804, + "learning_rate": 9.143484819313441e-06, + "loss": 1.1210715770721436, + "step": 434 + }, + { + "epoch": 0.9667405764966741, + "grad_norm": 0.8472381830215454, + "learning_rate": 9.133773200467095e-06, + "loss": 1.2828236818313599, + "step": 436 + }, + { + "epoch": 0.9711751662971175, + "grad_norm": 0.5409514307975769, + "learning_rate": 9.12401269361278e-06, + "loss": 1.2514156103134155, + "step": 438 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 2.096435070037842, + "learning_rate": 9.114203430066273e-06, + "loss": 0.8250770568847656, + "step": 440 + }, + { + "epoch": 0.9800443458980045, + "grad_norm": 0.692021906375885, + "learning_rate": 9.104345541799304e-06, + "loss": 1.0744602680206299, + "step": 442 + }, + { + "epoch": 0.9844789356984479, + "grad_norm": 0.8786086440086365, + "learning_rate": 9.094439161437797e-06, + "loss": 1.3004294633865356, + "step": 444 + }, + { + "epoch": 0.9889135254988913, + "grad_norm": 1.082552433013916, + "learning_rate": 9.084484422260079e-06, + "loss": 1.217628836631775, + "step": 446 + }, + { + "epoch": 0.9933481152993349, + "grad_norm": 4.506486892700195, + "learning_rate": 9.074481458195077e-06, + "loss": 1.2548447847366333, + "step": 448 + }, + { + "epoch": 0.9977827050997783, + "grad_norm": 0.7525108456611633, + "learning_rate": 9.064430403820538e-06, + "loss": 1.056371808052063, + "step": 450 + }, + { + "epoch": 1.0022172949002217, + "grad_norm": 1.5104862451553345, + "learning_rate": 9.054331394361195e-06, + "loss": 1.058455228805542, + "step": 452 + }, + { + "epoch": 1.0066518847006651, + "grad_norm": 1.4822558164596558, + "learning_rate": 9.044184565686963e-06, + "loss": 1.272789478302002, + "step": 454 + }, + { + "epoch": 1.0110864745011086, + "grad_norm": 1.1703786849975586, + "learning_rate": 9.033990054311108e-06, + "loss": 1.1383321285247803, + "step": 456 + }, + { + "epoch": 1.0155210643015522, + "grad_norm": 1.1142799854278564, + "learning_rate": 9.023747997388409e-06, + "loss": 0.7421969175338745, + "step": 458 + }, + { + "epoch": 1.0199556541019956, + "grad_norm": 0.2857581079006195, + "learning_rate": 9.013458532713303e-06, + "loss": 0.7711299657821655, + "step": 460 + }, + { + "epoch": 1.024390243902439, + "grad_norm": 1.1388225555419922, + "learning_rate": 9.003121798718055e-06, + "loss": 0.6186503767967224, + "step": 462 + }, + { + "epoch": 1.0288248337028825, + "grad_norm": 1.9768539667129517, + "learning_rate": 8.992737934470875e-06, + "loss": 0.7862461805343628, + "step": 464 + }, + { + "epoch": 1.033259423503326, + "grad_norm": 1.0883772373199463, + "learning_rate": 8.982307079674051e-06, + "loss": 0.8056252002716064, + "step": 466 + }, + { + "epoch": 1.0376940133037693, + "grad_norm": 2.1854653358459473, + "learning_rate": 8.971829374662075e-06, + "loss": 1.0135068893432617, + "step": 468 + }, + { + "epoch": 1.042128603104213, + "grad_norm": 1.331917643547058, + "learning_rate": 8.961304960399746e-06, + "loss": 0.7019103765487671, + "step": 470 + }, + { + "epoch": 1.0465631929046564, + "grad_norm": 0.9092720746994019, + "learning_rate": 8.950733978480295e-06, + "loss": 0.9827821850776672, + "step": 472 + }, + { + "epoch": 1.0509977827050998, + "grad_norm": 1.0283437967300415, + "learning_rate": 8.940116571123442e-06, + "loss": 1.0825997591018677, + "step": 474 + }, + { + "epoch": 1.0554323725055432, + "grad_norm": 0.6597787141799927, + "learning_rate": 8.929452881173522e-06, + "loss": 1.0318479537963867, + "step": 476 + }, + { + "epoch": 1.0598669623059866, + "grad_norm": 1.391737937927246, + "learning_rate": 8.91874305209754e-06, + "loss": 1.1522279977798462, + "step": 478 + }, + { + "epoch": 1.06430155210643, + "grad_norm": 2.9688658714294434, + "learning_rate": 8.907987227983244e-06, + "loss": 0.659584105014801, + "step": 480 + }, + { + "epoch": 1.0687361419068737, + "grad_norm": 0.6712129712104797, + "learning_rate": 8.897185553537199e-06, + "loss": 1.168315052986145, + "step": 482 + }, + { + "epoch": 1.0731707317073171, + "grad_norm": 0.35381272435188293, + "learning_rate": 8.886338174082818e-06, + "loss": 0.9410740733146667, + "step": 484 + }, + { + "epoch": 1.0776053215077606, + "grad_norm": 0.4841127097606659, + "learning_rate": 8.875445235558429e-06, + "loss": 1.022413969039917, + "step": 486 + }, + { + "epoch": 1.082039911308204, + "grad_norm": 0.8757275938987732, + "learning_rate": 8.864506884515298e-06, + "loss": 0.8324956297874451, + "step": 488 + }, + { + "epoch": 1.0864745011086474, + "grad_norm": 0.5024043917655945, + "learning_rate": 8.853523268115662e-06, + "loss": 0.9657794237136841, + "step": 490 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 1.3689531087875366, + "learning_rate": 8.84249453413075e-06, + "loss": 1.1539636850357056, + "step": 492 + }, + { + "epoch": 1.0953436807095343, + "grad_norm": 2.9836928844451904, + "learning_rate": 8.831420830938787e-06, + "loss": 1.0504236221313477, + "step": 494 + }, + { + "epoch": 1.099778270509978, + "grad_norm": 1.9407044649124146, + "learning_rate": 8.820302307523012e-06, + "loss": 0.8722038865089417, + "step": 496 + }, + { + "epoch": 1.1042128603104213, + "grad_norm": 0.7889557480812073, + "learning_rate": 8.809139113469664e-06, + "loss": 1.0063974857330322, + "step": 498 + }, + { + "epoch": 1.1086474501108647, + "grad_norm": 1.6550654172897339, + "learning_rate": 8.797931398965968e-06, + "loss": 0.712496817111969, + "step": 500 + }, + { + "epoch": 1.1130820399113082, + "grad_norm": 0.7474198937416077, + "learning_rate": 8.78667931479812e-06, + "loss": 0.9868514537811279, + "step": 502 + }, + { + "epoch": 1.1175166297117516, + "grad_norm": 0.713065505027771, + "learning_rate": 8.775383012349255e-06, + "loss": 0.8493003249168396, + "step": 504 + }, + { + "epoch": 1.1219512195121952, + "grad_norm": 1.046800136566162, + "learning_rate": 8.764042643597413e-06, + "loss": 0.9514451026916504, + "step": 506 + }, + { + "epoch": 1.1263858093126387, + "grad_norm": 1.4471544027328491, + "learning_rate": 8.75265836111349e-06, + "loss": 0.7326334714889526, + "step": 508 + }, + { + "epoch": 1.130820399113082, + "grad_norm": 0.7963923215866089, + "learning_rate": 8.741230318059188e-06, + "loss": 1.1156243085861206, + "step": 510 + }, + { + "epoch": 1.1352549889135255, + "grad_norm": 9.904563903808594, + "learning_rate": 8.72975866818496e-06, + "loss": 0.5709845423698425, + "step": 512 + }, + { + "epoch": 1.139689578713969, + "grad_norm": 0.8027311563491821, + "learning_rate": 8.718243565827927e-06, + "loss": 0.4392659664154053, + "step": 514 + }, + { + "epoch": 1.1441241685144123, + "grad_norm": 1.0292977094650269, + "learning_rate": 8.706685165909817e-06, + "loss": 0.8699248433113098, + "step": 516 + }, + { + "epoch": 1.1485587583148558, + "grad_norm": 1.7027775049209595, + "learning_rate": 8.695083623934872e-06, + "loss": 1.1637617349624634, + "step": 518 + }, + { + "epoch": 1.1529933481152994, + "grad_norm": 1.0440765619277954, + "learning_rate": 8.683439095987758e-06, + "loss": 1.1635072231292725, + "step": 520 + }, + { + "epoch": 1.1574279379157428, + "grad_norm": 0.5651006102561951, + "learning_rate": 8.671751738731464e-06, + "loss": 0.6186004877090454, + "step": 522 + }, + { + "epoch": 1.1618625277161863, + "grad_norm": 0.8910722732543945, + "learning_rate": 8.660021709405197e-06, + "loss": 1.0334709882736206, + "step": 524 + }, + { + "epoch": 1.1662971175166297, + "grad_norm": 1.5807560682296753, + "learning_rate": 8.648249165822265e-06, + "loss": 1.1915044784545898, + "step": 526 + }, + { + "epoch": 1.170731707317073, + "grad_norm": 0.9486162662506104, + "learning_rate": 8.636434266367956e-06, + "loss": 0.6182337403297424, + "step": 528 + }, + { + "epoch": 1.1751662971175167, + "grad_norm": 0.9549699425697327, + "learning_rate": 8.624577169997394e-06, + "loss": 0.8050264120101929, + "step": 530 + }, + { + "epoch": 1.1796008869179602, + "grad_norm": 0.6812994480133057, + "learning_rate": 8.612678036233428e-06, + "loss": 0.9113385081291199, + "step": 532 + }, + { + "epoch": 1.1840354767184036, + "grad_norm": 0.7580647468566895, + "learning_rate": 8.600737025164454e-06, + "loss": 0.9746559858322144, + "step": 534 + }, + { + "epoch": 1.188470066518847, + "grad_norm": 0.36553850769996643, + "learning_rate": 8.588754297442288e-06, + "loss": 0.7387748956680298, + "step": 536 + }, + { + "epoch": 1.1929046563192904, + "grad_norm": 0.8339745998382568, + "learning_rate": 8.576730014279982e-06, + "loss": 1.0821408033370972, + "step": 538 + }, + { + "epoch": 1.1973392461197339, + "grad_norm": 0.8957899808883667, + "learning_rate": 8.564664337449677e-06, + "loss": 0.7081119418144226, + "step": 540 + }, + { + "epoch": 1.2017738359201773, + "grad_norm": 1.1581319570541382, + "learning_rate": 8.552557429280407e-06, + "loss": 0.6470943689346313, + "step": 542 + }, + { + "epoch": 1.206208425720621, + "grad_norm": 1.4035844802856445, + "learning_rate": 8.540409452655927e-06, + "loss": 1.0830638408660889, + "step": 544 + }, + { + "epoch": 1.2106430155210643, + "grad_norm": 0.36817386746406555, + "learning_rate": 8.528220571012518e-06, + "loss": 0.7960520386695862, + "step": 546 + }, + { + "epoch": 1.2150776053215078, + "grad_norm": 0.7596889734268188, + "learning_rate": 8.51599094833679e-06, + "loss": 1.2095422744750977, + "step": 548 + }, + { + "epoch": 1.2195121951219512, + "grad_norm": 4.009555816650391, + "learning_rate": 8.503720749163472e-06, + "loss": 0.6231736540794373, + "step": 550 + }, + { + "epoch": 1.2239467849223946, + "grad_norm": 1.435842752456665, + "learning_rate": 8.491410138573201e-06, + "loss": 1.1608552932739258, + "step": 552 + }, + { + "epoch": 1.2283813747228383, + "grad_norm": 8.738020896911621, + "learning_rate": 8.479059282190298e-06, + "loss": 0.7340165376663208, + "step": 554 + }, + { + "epoch": 1.2328159645232817, + "grad_norm": 0.31565892696380615, + "learning_rate": 8.466668346180548e-06, + "loss": 0.728275716304779, + "step": 556 + }, + { + "epoch": 1.237250554323725, + "grad_norm": 3.1685538291931152, + "learning_rate": 8.454237497248956e-06, + "loss": 0.5666266679763794, + "step": 558 + }, + { + "epoch": 1.2416851441241685, + "grad_norm": 1.385015606880188, + "learning_rate": 8.441766902637506e-06, + "loss": 1.1794722080230713, + "step": 560 + }, + { + "epoch": 1.246119733924612, + "grad_norm": 2.720320701599121, + "learning_rate": 8.429256730122909e-06, + "loss": 1.013020396232605, + "step": 562 + }, + { + "epoch": 1.2505543237250554, + "grad_norm": 0.49003681540489197, + "learning_rate": 8.416707148014358e-06, + "loss": 0.6282749176025391, + "step": 564 + }, + { + "epoch": 1.2549889135254988, + "grad_norm": 0.9545651078224182, + "learning_rate": 8.404118325151245e-06, + "loss": 0.9549443125724792, + "step": 566 + }, + { + "epoch": 1.2594235033259422, + "grad_norm": 0.6707999110221863, + "learning_rate": 8.391490430900902e-06, + "loss": 0.9106066823005676, + "step": 568 + }, + { + "epoch": 1.2638580931263859, + "grad_norm": 1.83707594871521, + "learning_rate": 8.378823635156319e-06, + "loss": 1.1188719272613525, + "step": 570 + }, + { + "epoch": 1.2682926829268293, + "grad_norm": 0.6922241449356079, + "learning_rate": 8.366118108333861e-06, + "loss": 0.7410668134689331, + "step": 572 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 1.7329115867614746, + "learning_rate": 8.353374021370967e-06, + "loss": 1.2692005634307861, + "step": 574 + }, + { + "epoch": 1.2771618625277161, + "grad_norm": 1.7935478687286377, + "learning_rate": 8.340591545723861e-06, + "loss": 1.0476911067962646, + "step": 576 + }, + { + "epoch": 1.2815964523281598, + "grad_norm": 1.0415445566177368, + "learning_rate": 8.327770853365238e-06, + "loss": 1.1656912565231323, + "step": 578 + }, + { + "epoch": 1.2860310421286032, + "grad_norm": 1.2668659687042236, + "learning_rate": 8.314912116781954e-06, + "loss": 1.0213876962661743, + "step": 580 + }, + { + "epoch": 1.2904656319290466, + "grad_norm": 2.997131586074829, + "learning_rate": 8.302015508972702e-06, + "loss": 1.3410066366195679, + "step": 582 + }, + { + "epoch": 1.29490022172949, + "grad_norm": 0.563655436038971, + "learning_rate": 8.289081203445686e-06, + "loss": 0.7119391560554504, + "step": 584 + }, + { + "epoch": 1.2993348115299335, + "grad_norm": 0.6684980988502502, + "learning_rate": 8.276109374216286e-06, + "loss": 0.7561792135238647, + "step": 586 + }, + { + "epoch": 1.3037694013303769, + "grad_norm": 0.9710568189620972, + "learning_rate": 8.263100195804722e-06, + "loss": 0.5911573767662048, + "step": 588 + }, + { + "epoch": 1.3082039911308203, + "grad_norm": 3.2726967334747314, + "learning_rate": 8.250053843233704e-06, + "loss": 0.9861688017845154, + "step": 590 + }, + { + "epoch": 1.3126385809312637, + "grad_norm": 0.7993785738945007, + "learning_rate": 8.236970492026063e-06, + "loss": 0.994674563407898, + "step": 592 + }, + { + "epoch": 1.3170731707317074, + "grad_norm": 0.8381823897361755, + "learning_rate": 8.223850318202415e-06, + "loss": 0.9940106868743896, + "step": 594 + }, + { + "epoch": 1.3215077605321508, + "grad_norm": 0.7063131928443909, + "learning_rate": 8.210693498278773e-06, + "loss": 1.1006373167037964, + "step": 596 + }, + { + "epoch": 1.3259423503325942, + "grad_norm": 6.862369060516357, + "learning_rate": 8.197500209264181e-06, + "loss": 1.1485295295715332, + "step": 598 + }, + { + "epoch": 1.3303769401330376, + "grad_norm": 0.639260470867157, + "learning_rate": 8.18427062865833e-06, + "loss": 0.46675366163253784, + "step": 600 + }, + { + "epoch": 1.3348115299334813, + "grad_norm": 1.624756097793579, + "learning_rate": 8.171004934449166e-06, + "loss": 0.9039447903633118, + "step": 602 + }, + { + "epoch": 1.3392461197339247, + "grad_norm": 1.2824625968933105, + "learning_rate": 8.157703305110508e-06, + "loss": 0.8456693887710571, + "step": 604 + }, + { + "epoch": 1.3436807095343681, + "grad_norm": 0.45296287536621094, + "learning_rate": 8.144365919599632e-06, + "loss": 0.7544465065002441, + "step": 606 + }, + { + "epoch": 1.3481152993348116, + "grad_norm": 0.7048214673995972, + "learning_rate": 8.130992957354872e-06, + "loss": 1.0760173797607422, + "step": 608 + }, + { + "epoch": 1.352549889135255, + "grad_norm": 18.7407283782959, + "learning_rate": 8.117584598293204e-06, + "loss": 1.1418424844741821, + "step": 610 + }, + { + "epoch": 1.3569844789356984, + "grad_norm": 1.2046879529953003, + "learning_rate": 8.104141022807824e-06, + "loss": 0.7227436304092407, + "step": 612 + }, + { + "epoch": 1.3614190687361418, + "grad_norm": 1.5291908979415894, + "learning_rate": 8.090662411765726e-06, + "loss": 1.1172561645507812, + "step": 614 + }, + { + "epoch": 1.3658536585365852, + "grad_norm": 0.47800955176353455, + "learning_rate": 8.077148946505258e-06, + "loss": 0.6969360709190369, + "step": 616 + }, + { + "epoch": 1.370288248337029, + "grad_norm": 1.2343717813491821, + "learning_rate": 8.063600808833698e-06, + "loss": 0.932880163192749, + "step": 618 + }, + { + "epoch": 1.3747228381374723, + "grad_norm": 0.9165163040161133, + "learning_rate": 8.050018181024788e-06, + "loss": 0.15981429815292358, + "step": 620 + }, + { + "epoch": 1.3791574279379157, + "grad_norm": 0.6961973905563354, + "learning_rate": 8.036401245816306e-06, + "loss": 0.9840935468673706, + "step": 622 + }, + { + "epoch": 1.3835920177383592, + "grad_norm": 0.7490546703338623, + "learning_rate": 8.022750186407586e-06, + "loss": 0.9857772588729858, + "step": 624 + }, + { + "epoch": 1.3880266075388026, + "grad_norm": 3.150604724884033, + "learning_rate": 8.009065186457061e-06, + "loss": 0.9109776616096497, + "step": 626 + }, + { + "epoch": 1.3924611973392462, + "grad_norm": 0.8762973546981812, + "learning_rate": 7.995346430079799e-06, + "loss": 1.051713228225708, + "step": 628 + }, + { + "epoch": 1.3968957871396896, + "grad_norm": 0.9203951358795166, + "learning_rate": 7.981594101845012e-06, + "loss": 1.0516294240951538, + "step": 630 + }, + { + "epoch": 1.401330376940133, + "grad_norm": 3.257267713546753, + "learning_rate": 7.967808386773591e-06, + "loss": 0.9409018754959106, + "step": 632 + }, + { + "epoch": 1.4057649667405765, + "grad_norm": 2.1354942321777344, + "learning_rate": 7.953989470335592e-06, + "loss": 1.0173999071121216, + "step": 634 + }, + { + "epoch": 1.41019955654102, + "grad_norm": 1.2958922386169434, + "learning_rate": 7.940137538447769e-06, + "loss": 1.0542725324630737, + "step": 636 + }, + { + "epoch": 1.4146341463414633, + "grad_norm": 2.297118663787842, + "learning_rate": 7.92625277747105e-06, + "loss": 0.7847860455513, + "step": 638 + }, + { + "epoch": 1.4190687361419068, + "grad_norm": 0.7170055508613586, + "learning_rate": 7.912335374208043e-06, + "loss": 0.6514250636100769, + "step": 640 + }, + { + "epoch": 1.4235033259423504, + "grad_norm": 2.0470762252807617, + "learning_rate": 7.898385515900517e-06, + "loss": 0.8069249391555786, + "step": 642 + }, + { + "epoch": 1.4279379157427938, + "grad_norm": 1.757605791091919, + "learning_rate": 7.884403390226883e-06, + "loss": 0.657196044921875, + "step": 644 + }, + { + "epoch": 1.4323725055432373, + "grad_norm": 1.3288108110427856, + "learning_rate": 7.870389185299672e-06, + "loss": 1.1910220384597778, + "step": 646 + }, + { + "epoch": 1.4368070953436807, + "grad_norm": 0.8881821036338806, + "learning_rate": 7.856343089663002e-06, + "loss": 1.0706536769866943, + "step": 648 + }, + { + "epoch": 1.441241685144124, + "grad_norm": 1.1441328525543213, + "learning_rate": 7.842265292290039e-06, + "loss": 1.190679669380188, + "step": 650 + }, + { + "epoch": 1.4456762749445677, + "grad_norm": 2.3278324604034424, + "learning_rate": 7.828155982580465e-06, + "loss": 0.928497314453125, + "step": 652 + }, + { + "epoch": 1.4501108647450112, + "grad_norm": 1.4663766622543335, + "learning_rate": 7.814015350357912e-06, + "loss": 0.9704734086990356, + "step": 654 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 1.560611605644226, + "learning_rate": 7.799843585867426e-06, + "loss": 0.8245765566825867, + "step": 656 + }, + { + "epoch": 1.458980044345898, + "grad_norm": 0.9043145775794983, + "learning_rate": 7.785640879772897e-06, + "loss": 0.7227462530136108, + "step": 658 + }, + { + "epoch": 1.4634146341463414, + "grad_norm": 2.190586805343628, + "learning_rate": 7.771407423154498e-06, + "loss": 0.9887614250183105, + "step": 660 + }, + { + "epoch": 1.4678492239467849, + "grad_norm": 0.7352641224861145, + "learning_rate": 7.757143407506111e-06, + "loss": 0.9774073362350464, + "step": 662 + }, + { + "epoch": 1.4722838137472283, + "grad_norm": 0.72539883852005, + "learning_rate": 7.742849024732754e-06, + "loss": 0.7537736296653748, + "step": 664 + }, + { + "epoch": 1.476718403547672, + "grad_norm": 0.5807918906211853, + "learning_rate": 7.728524467148e-06, + "loss": 0.4493647515773773, + "step": 666 + }, + { + "epoch": 1.4811529933481153, + "grad_norm": 5.449716567993164, + "learning_rate": 7.714169927471379e-06, + "loss": 0.5409796237945557, + "step": 668 + }, + { + "epoch": 1.4855875831485588, + "grad_norm": 0.9701158404350281, + "learning_rate": 7.699785598825805e-06, + "loss": 0.9008554220199585, + "step": 670 + }, + { + "epoch": 1.4900221729490022, + "grad_norm": 0.798380434513092, + "learning_rate": 7.68537167473496e-06, + "loss": 1.005531668663025, + "step": 672 + }, + { + "epoch": 1.4944567627494456, + "grad_norm": 1.1633052825927734, + "learning_rate": 7.670928349120699e-06, + "loss": 0.6692250967025757, + "step": 674 + }, + { + "epoch": 1.4988913525498893, + "grad_norm": 2.026198387145996, + "learning_rate": 7.656455816300434e-06, + "loss": 0.7974430322647095, + "step": 676 + }, + { + "epoch": 1.5033259423503327, + "grad_norm": 0.7488698363304138, + "learning_rate": 7.641954270984532e-06, + "loss": 1.02421236038208, + "step": 678 + }, + { + "epoch": 1.507760532150776, + "grad_norm": 0.6743457317352295, + "learning_rate": 7.627423908273683e-06, + "loss": 0.5709149837493896, + "step": 680 + }, + { + "epoch": 1.5121951219512195, + "grad_norm": 1.009034276008606, + "learning_rate": 7.61286492365628e-06, + "loss": 0.5336272716522217, + "step": 682 + }, + { + "epoch": 1.516629711751663, + "grad_norm": 2.056926965713501, + "learning_rate": 7.598277513005793e-06, + "loss": 0.9375542402267456, + "step": 684 + }, + { + "epoch": 1.5210643015521064, + "grad_norm": 0.9676103591918945, + "learning_rate": 7.583661872578124e-06, + "loss": 0.9711353778839111, + "step": 686 + }, + { + "epoch": 1.5254988913525498, + "grad_norm": 2.404435873031616, + "learning_rate": 7.569018199008976e-06, + "loss": 0.6847166419029236, + "step": 688 + }, + { + "epoch": 1.5299334811529932, + "grad_norm": 0.8333436250686646, + "learning_rate": 7.554346689311205e-06, + "loss": 0.2782055735588074, + "step": 690 + }, + { + "epoch": 1.5343680709534369, + "grad_norm": 10.112131118774414, + "learning_rate": 7.539647540872165e-06, + "loss": 0.7305975556373596, + "step": 692 + }, + { + "epoch": 1.5388026607538803, + "grad_norm": 1.3367574214935303, + "learning_rate": 7.5249209514510595e-06, + "loss": 0.9035903215408325, + "step": 694 + }, + { + "epoch": 1.5432372505543237, + "grad_norm": 2.384665012359619, + "learning_rate": 7.510167119176273e-06, + "loss": 0.6485692858695984, + "step": 696 + }, + { + "epoch": 1.5476718403547673, + "grad_norm": 0.8054559230804443, + "learning_rate": 7.49538624254271e-06, + "loss": 1.073509693145752, + "step": 698 + }, + { + "epoch": 1.5521064301552108, + "grad_norm": 0.9697364568710327, + "learning_rate": 7.48057852040913e-06, + "loss": 1.1030398607254028, + "step": 700 + }, + { + "epoch": 1.5565410199556542, + "grad_norm": 1.6702967882156372, + "learning_rate": 7.465744151995458e-06, + "loss": 1.0122231245040894, + "step": 702 + }, + { + "epoch": 1.5609756097560976, + "grad_norm": 1.9088103771209717, + "learning_rate": 7.450883336880116e-06, + "loss": 0.8053731918334961, + "step": 704 + }, + { + "epoch": 1.565410199556541, + "grad_norm": 1.449622392654419, + "learning_rate": 7.435996274997337e-06, + "loss": 0.4717741310596466, + "step": 706 + }, + { + "epoch": 1.5698447893569845, + "grad_norm": 0.5445607304573059, + "learning_rate": 7.421083166634466e-06, + "loss": 1.0504510402679443, + "step": 708 + }, + { + "epoch": 1.5742793791574279, + "grad_norm": 1.3481144905090332, + "learning_rate": 7.40614421242928e-06, + "loss": 0.8399388790130615, + "step": 710 + }, + { + "epoch": 1.5787139689578713, + "grad_norm": 1.32907235622406, + "learning_rate": 7.391179613367272e-06, + "loss": 0.6359143853187561, + "step": 712 + }, + { + "epoch": 1.5831485587583147, + "grad_norm": 1.3502752780914307, + "learning_rate": 7.37618957077896e-06, + "loss": 0.9908859729766846, + "step": 714 + }, + { + "epoch": 1.5875831485587582, + "grad_norm": 1.0625886917114258, + "learning_rate": 7.361174286337175e-06, + "loss": 1.0245836973190308, + "step": 716 + }, + { + "epoch": 1.5920177383592018, + "grad_norm": 1.4929050207138062, + "learning_rate": 7.346133962054341e-06, + "loss": 1.190230369567871, + "step": 718 + }, + { + "epoch": 1.5964523281596452, + "grad_norm": 1.8333940505981445, + "learning_rate": 7.33106880027977e-06, + "loss": 0.6086799502372742, + "step": 720 + }, + { + "epoch": 1.6008869179600886, + "grad_norm": 0.9325003623962402, + "learning_rate": 7.315979003696927e-06, + "loss": 0.4897584319114685, + "step": 722 + }, + { + "epoch": 1.6053215077605323, + "grad_norm": 3.4681403636932373, + "learning_rate": 7.300864775320708e-06, + "loss": 1.0312981605529785, + "step": 724 + }, + { + "epoch": 1.6097560975609757, + "grad_norm": 0.8704633116722107, + "learning_rate": 7.285726318494717e-06, + "loss": 1.0136796236038208, + "step": 726 + }, + { + "epoch": 1.6141906873614191, + "grad_norm": 1.0489435195922852, + "learning_rate": 7.2705638368885105e-06, + "loss": 0.739902913570404, + "step": 728 + }, + { + "epoch": 1.6186252771618626, + "grad_norm": 1.260585069656372, + "learning_rate": 7.255377534494875e-06, + "loss": 1.07662034034729, + "step": 730 + }, + { + "epoch": 1.623059866962306, + "grad_norm": 0.7534044981002808, + "learning_rate": 7.240167615627082e-06, + "loss": 0.9939879179000854, + "step": 732 + }, + { + "epoch": 1.6274944567627494, + "grad_norm": 1.093334674835205, + "learning_rate": 7.224934284916127e-06, + "loss": 1.0733040571212769, + "step": 734 + }, + { + "epoch": 1.6319290465631928, + "grad_norm": 0.3485592007637024, + "learning_rate": 7.209677747307982e-06, + "loss": 0.5521863698959351, + "step": 736 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 1.0937601327896118, + "learning_rate": 7.194398208060848e-06, + "loss": 1.2027066946029663, + "step": 738 + }, + { + "epoch": 1.6407982261640797, + "grad_norm": 0.7582866549491882, + "learning_rate": 7.179095872742378e-06, + "loss": 1.0804038047790527, + "step": 740 + }, + { + "epoch": 1.6452328159645233, + "grad_norm": 1.5659160614013672, + "learning_rate": 7.16377094722692e-06, + "loss": 1.1643147468566895, + "step": 742 + }, + { + "epoch": 1.6496674057649667, + "grad_norm": 1.0531456470489502, + "learning_rate": 7.148423637692748e-06, + "loss": 0.8503185510635376, + "step": 744 + }, + { + "epoch": 1.6541019955654102, + "grad_norm": 1.2781857252120972, + "learning_rate": 7.133054150619282e-06, + "loss": 0.8742875456809998, + "step": 746 + }, + { + "epoch": 1.6585365853658538, + "grad_norm": 0.585116446018219, + "learning_rate": 7.117662692784318e-06, + "loss": 0.8233492970466614, + "step": 748 + }, + { + "epoch": 1.6629711751662972, + "grad_norm": 0.9041104912757874, + "learning_rate": 7.102249471261241e-06, + "loss": 0.9978519678115845, + "step": 750 + }, + { + "epoch": 1.6674057649667406, + "grad_norm": 0.6088887453079224, + "learning_rate": 7.0868146934162365e-06, + "loss": 0.752709686756134, + "step": 752 + }, + { + "epoch": 1.671840354767184, + "grad_norm": 2.9724321365356445, + "learning_rate": 7.071358566905507e-06, + "loss": 0.6525120139122009, + "step": 754 + }, + { + "epoch": 1.6762749445676275, + "grad_norm": 0.9356153011322021, + "learning_rate": 7.055881299672476e-06, + "loss": 0.9493626952171326, + "step": 756 + }, + { + "epoch": 1.680709534368071, + "grad_norm": 0.806330144405365, + "learning_rate": 7.040383099944988e-06, + "loss": 1.0765941143035889, + "step": 758 + }, + { + "epoch": 1.6851441241685143, + "grad_norm": 1.2727959156036377, + "learning_rate": 7.02486417623251e-06, + "loss": 0.8836164474487305, + "step": 760 + }, + { + "epoch": 1.6895787139689578, + "grad_norm": 1.7035382986068726, + "learning_rate": 7.009324737323325e-06, + "loss": 1.0562770366668701, + "step": 762 + }, + { + "epoch": 1.6940133037694012, + "grad_norm": 3.1537766456604004, + "learning_rate": 6.993764992281722e-06, + "loss": 0.9952316880226135, + "step": 764 + }, + { + "epoch": 1.6984478935698448, + "grad_norm": 1.5445963144302368, + "learning_rate": 6.978185150445187e-06, + "loss": 1.0095694065093994, + "step": 766 + }, + { + "epoch": 1.7028824833702882, + "grad_norm": 1.5164811611175537, + "learning_rate": 6.96258542142158e-06, + "loss": 0.8055514693260193, + "step": 768 + }, + { + "epoch": 1.7073170731707317, + "grad_norm": 0.6044250130653381, + "learning_rate": 6.946966015086321e-06, + "loss": 1.0075352191925049, + "step": 770 + }, + { + "epoch": 1.7117516629711753, + "grad_norm": 4.677299976348877, + "learning_rate": 6.931327141579565e-06, + "loss": 1.030186414718628, + "step": 772 + }, + { + "epoch": 1.7161862527716187, + "grad_norm": 0.6739047765731812, + "learning_rate": 6.915669011303374e-06, + "loss": 0.85605788230896, + "step": 774 + }, + { + "epoch": 1.7206208425720622, + "grad_norm": 0.9369874596595764, + "learning_rate": 6.899991834918884e-06, + "loss": 1.0663691759109497, + "step": 776 + }, + { + "epoch": 1.7250554323725056, + "grad_norm": 2.9482765197753906, + "learning_rate": 6.884295823343479e-06, + "loss": 1.0655351877212524, + "step": 778 + }, + { + "epoch": 1.729490022172949, + "grad_norm": 1.6967729330062866, + "learning_rate": 6.868581187747941e-06, + "loss": 0.7047576904296875, + "step": 780 + }, + { + "epoch": 1.7339246119733924, + "grad_norm": 0.87360680103302, + "learning_rate": 6.852848139553619e-06, + "loss": 0.9772816896438599, + "step": 782 + }, + { + "epoch": 1.7383592017738358, + "grad_norm": 0.9227361083030701, + "learning_rate": 6.837096890429582e-06, + "loss": 0.6554882526397705, + "step": 784 + }, + { + "epoch": 1.7427937915742793, + "grad_norm": 0.7863869071006775, + "learning_rate": 6.821327652289768e-06, + "loss": 0.7389634251594543, + "step": 786 + }, + { + "epoch": 1.7472283813747227, + "grad_norm": 0.7384536266326904, + "learning_rate": 6.8055406372901344e-06, + "loss": 0.6390055418014526, + "step": 788 + }, + { + "epoch": 1.7516629711751663, + "grad_norm": 5.335806369781494, + "learning_rate": 6.789736057825812e-06, + "loss": 0.6638211607933044, + "step": 790 + }, + { + "epoch": 1.7560975609756098, + "grad_norm": 0.5783770084381104, + "learning_rate": 6.77391412652823e-06, + "loss": 0.7075257897377014, + "step": 792 + }, + { + "epoch": 1.7605321507760532, + "grad_norm": 0.5881092548370361, + "learning_rate": 6.758075056262271e-06, + "loss": 0.8987762928009033, + "step": 794 + }, + { + "epoch": 1.7649667405764968, + "grad_norm": 21.343101501464844, + "learning_rate": 6.742219060123403e-06, + "loss": 0.7939308881759644, + "step": 796 + }, + { + "epoch": 1.7694013303769403, + "grad_norm": 1.0837807655334473, + "learning_rate": 6.7263463514348095e-06, + "loss": 0.983272135257721, + "step": 798 + }, + { + "epoch": 1.7738359201773837, + "grad_norm": 0.8009697198867798, + "learning_rate": 6.710457143744519e-06, + "loss": 1.1391663551330566, + "step": 800 + }, + { + "epoch": 1.778270509977827, + "grad_norm": 0.8815924525260925, + "learning_rate": 6.6945516508225325e-06, + "loss": 1.1954277753829956, + "step": 802 + }, + { + "epoch": 1.7827050997782705, + "grad_norm": 1.515954613685608, + "learning_rate": 6.678630086657959e-06, + "loss": 1.089769959449768, + "step": 804 + }, + { + "epoch": 1.787139689578714, + "grad_norm": 0.716978907585144, + "learning_rate": 6.662692665456115e-06, + "loss": 0.739348292350769, + "step": 806 + }, + { + "epoch": 1.7915742793791574, + "grad_norm": 0.7229488492012024, + "learning_rate": 6.646739601635661e-06, + "loss": 1.0049620866775513, + "step": 808 + }, + { + "epoch": 1.7960088691796008, + "grad_norm": 1.194189190864563, + "learning_rate": 6.6307711098257074e-06, + "loss": 0.5872386693954468, + "step": 810 + }, + { + "epoch": 1.8004434589800442, + "grad_norm": 0.8289064764976501, + "learning_rate": 6.6147874048629294e-06, + "loss": 0.5987858772277832, + "step": 812 + }, + { + "epoch": 1.8048780487804879, + "grad_norm": 2.004324436187744, + "learning_rate": 6.598788701788677e-06, + "loss": 1.2394013404846191, + "step": 814 + }, + { + "epoch": 1.8093126385809313, + "grad_norm": 3.997391700744629, + "learning_rate": 6.582775215846082e-06, + "loss": 0.9413332939147949, + "step": 816 + }, + { + "epoch": 1.8137472283813747, + "grad_norm": 0.9381686449050903, + "learning_rate": 6.566747162477164e-06, + "loss": 0.4953380525112152, + "step": 818 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 2.6868515014648438, + "learning_rate": 6.5507047573199235e-06, + "loss": 1.0671703815460205, + "step": 820 + }, + { + "epoch": 1.8226164079822618, + "grad_norm": 1.8891699314117432, + "learning_rate": 6.5346482162054526e-06, + "loss": 1.0773987770080566, + "step": 822 + }, + { + "epoch": 1.8270509977827052, + "grad_norm": 0.7535659074783325, + "learning_rate": 6.518577755155024e-06, + "loss": 1.138119101524353, + "step": 824 + }, + { + "epoch": 1.8314855875831486, + "grad_norm": 1.3209606409072876, + "learning_rate": 6.502493590377184e-06, + "loss": 0.8813806772232056, + "step": 826 + }, + { + "epoch": 1.835920177383592, + "grad_norm": 1.151709794998169, + "learning_rate": 6.48639593826485e-06, + "loss": 0.7886914610862732, + "step": 828 + }, + { + "epoch": 1.8403547671840355, + "grad_norm": 0.8656094074249268, + "learning_rate": 6.4702850153923915e-06, + "loss": 0.8985581398010254, + "step": 830 + }, + { + "epoch": 1.8447893569844789, + "grad_norm": 3.7648069858551025, + "learning_rate": 6.45416103851272e-06, + "loss": 1.02861750125885, + "step": 832 + }, + { + "epoch": 1.8492239467849223, + "grad_norm": 0.9988770484924316, + "learning_rate": 6.438024224554378e-06, + "loss": 0.9510617852210999, + "step": 834 + }, + { + "epoch": 1.8536585365853657, + "grad_norm": 0.5282806754112244, + "learning_rate": 6.421874790618608e-06, + "loss": 0.9773460030555725, + "step": 836 + }, + { + "epoch": 1.8580931263858091, + "grad_norm": 2.268371343612671, + "learning_rate": 6.405712953976444e-06, + "loss": 0.9805348515510559, + "step": 838 + }, + { + "epoch": 1.8625277161862528, + "grad_norm": 1.1176053285598755, + "learning_rate": 6.389538932065783e-06, + "loss": 1.1074886322021484, + "step": 840 + }, + { + "epoch": 1.8669623059866962, + "grad_norm": 0.6541219353675842, + "learning_rate": 6.373352942488455e-06, + "loss": 0.6198570728302002, + "step": 842 + }, + { + "epoch": 1.8713968957871396, + "grad_norm": 0.7045478224754333, + "learning_rate": 6.357155203007307e-06, + "loss": 0.7260794639587402, + "step": 844 + }, + { + "epoch": 1.8758314855875833, + "grad_norm": 3.3756415843963623, + "learning_rate": 6.340945931543263e-06, + "loss": 0.8276649117469788, + "step": 846 + }, + { + "epoch": 1.8802660753880267, + "grad_norm": 0.923382043838501, + "learning_rate": 6.324725346172399e-06, + "loss": 0.8496088981628418, + "step": 848 + }, + { + "epoch": 1.8847006651884701, + "grad_norm": 1.331862211227417, + "learning_rate": 6.308493665123e-06, + "loss": 1.321401834487915, + "step": 850 + }, + { + "epoch": 1.8891352549889135, + "grad_norm": 0.8982786536216736, + "learning_rate": 6.2922511067726365e-06, + "loss": 0.7383803725242615, + "step": 852 + }, + { + "epoch": 1.893569844789357, + "grad_norm": 0.964008629322052, + "learning_rate": 6.2759978896452155e-06, + "loss": 1.0080775022506714, + "step": 854 + }, + { + "epoch": 1.8980044345898004, + "grad_norm": 0.9472535252571106, + "learning_rate": 6.259734232408047e-06, + "loss": 0.8919001221656799, + "step": 856 + }, + { + "epoch": 1.9024390243902438, + "grad_norm": 0.727502703666687, + "learning_rate": 6.2434603538688975e-06, + "loss": 0.7772597670555115, + "step": 858 + }, + { + "epoch": 1.9068736141906872, + "grad_norm": 1.1779029369354248, + "learning_rate": 6.2271764729730525e-06, + "loss": 0.7538490891456604, + "step": 860 + }, + { + "epoch": 1.9113082039911307, + "grad_norm": 1.4906489849090576, + "learning_rate": 6.210882808800366e-06, + "loss": 1.079421043395996, + "step": 862 + }, + { + "epoch": 1.9157427937915743, + "grad_norm": 1.1503466367721558, + "learning_rate": 6.19457958056231e-06, + "loss": 0.8554342985153198, + "step": 864 + }, + { + "epoch": 1.9201773835920177, + "grad_norm": 2.031961441040039, + "learning_rate": 6.178267007599034e-06, + "loss": 0.636574923992157, + "step": 866 + }, + { + "epoch": 1.9246119733924612, + "grad_norm": 2.903661012649536, + "learning_rate": 6.161945309376409e-06, + "loss": 1.124933123588562, + "step": 868 + }, + { + "epoch": 1.9290465631929048, + "grad_norm": 0.6506541967391968, + "learning_rate": 6.145614705483075e-06, + "loss": 0.9398843050003052, + "step": 870 + }, + { + "epoch": 1.9334811529933482, + "grad_norm": 0.6698076725006104, + "learning_rate": 6.129275415627485e-06, + "loss": 0.6359599828720093, + "step": 872 + }, + { + "epoch": 1.9379157427937916, + "grad_norm": 1.9438526630401611, + "learning_rate": 6.11292765963495e-06, + "loss": 0.6653733849525452, + "step": 874 + }, + { + "epoch": 1.942350332594235, + "grad_norm": 1.083453893661499, + "learning_rate": 6.09657165744469e-06, + "loss": 0.6323903203010559, + "step": 876 + }, + { + "epoch": 1.9467849223946785, + "grad_norm": 0.547383725643158, + "learning_rate": 6.080207629106859e-06, + "loss": 0.7757695913314819, + "step": 878 + }, + { + "epoch": 1.951219512195122, + "grad_norm": 0.8963095545768738, + "learning_rate": 6.063835794779598e-06, + "loss": 0.8518045544624329, + "step": 880 + }, + { + "epoch": 1.9556541019955653, + "grad_norm": 2.536919116973877, + "learning_rate": 6.047456374726067e-06, + "loss": 0.6314433813095093, + "step": 882 + }, + { + "epoch": 1.9600886917960088, + "grad_norm": 0.7438651919364929, + "learning_rate": 6.031069589311481e-06, + "loss": 0.7835251688957214, + "step": 884 + }, + { + "epoch": 1.9645232815964522, + "grad_norm": 1.984107255935669, + "learning_rate": 6.01467565900015e-06, + "loss": 0.9823970794677734, + "step": 886 + }, + { + "epoch": 1.9689578713968958, + "grad_norm": 0.6768189668655396, + "learning_rate": 5.99827480435251e-06, + "loss": 1.0069411993026733, + "step": 888 + }, + { + "epoch": 1.9733924611973392, + "grad_norm": 1.5261571407318115, + "learning_rate": 5.981867246022149e-06, + "loss": 1.0389683246612549, + "step": 890 + }, + { + "epoch": 1.9778270509977827, + "grad_norm": 1.2349331378936768, + "learning_rate": 5.965453204752855e-06, + "loss": 1.0331605672836304, + "step": 892 + }, + { + "epoch": 1.9822616407982263, + "grad_norm": 0.9871334433555603, + "learning_rate": 5.949032901375627e-06, + "loss": 1.0539100170135498, + "step": 894 + }, + { + "epoch": 1.9866962305986697, + "grad_norm": 3.3653595447540283, + "learning_rate": 5.932606556805719e-06, + "loss": 0.8658461570739746, + "step": 896 + }, + { + "epoch": 1.9911308203991132, + "grad_norm": 0.6986981630325317, + "learning_rate": 5.916174392039659e-06, + "loss": 1.0008927583694458, + "step": 898 + }, + { + "epoch": 1.9955654101995566, + "grad_norm": 0.7081918716430664, + "learning_rate": 5.899736628152284e-06, + "loss": 0.7502108812332153, + "step": 900 + }, + { + "epoch": 2.0, + "grad_norm": 0.6118723154067993, + "learning_rate": 5.88329348629375e-06, + "loss": 0.977774977684021, + "step": 902 + }, + { + "epoch": 2.0044345898004434, + "grad_norm": 1.0688210725784302, + "learning_rate": 5.8668451876865736e-06, + "loss": 0.7613685727119446, + "step": 904 + }, + { + "epoch": 2.008869179600887, + "grad_norm": 0.9312078952789307, + "learning_rate": 5.850391953622652e-06, + "loss": 0.7516049742698669, + "step": 906 + }, + { + "epoch": 2.0133037694013303, + "grad_norm": 0.6763688325881958, + "learning_rate": 5.8339340054602775e-06, + "loss": 0.9630460143089294, + "step": 908 + }, + { + "epoch": 2.0177383592017737, + "grad_norm": 1.2758315801620483, + "learning_rate": 5.817471564621169e-06, + "loss": 0.6900556683540344, + "step": 910 + }, + { + "epoch": 2.022172949002217, + "grad_norm": 1.3393687009811401, + "learning_rate": 5.801004852587485e-06, + "loss": 0.48663175106048584, + "step": 912 + }, + { + "epoch": 2.0266075388026605, + "grad_norm": 1.5841343402862549, + "learning_rate": 5.784534090898849e-06, + "loss": 0.5389075875282288, + "step": 914 + }, + { + "epoch": 2.0310421286031044, + "grad_norm": 1.7100311517715454, + "learning_rate": 5.768059501149369e-06, + "loss": 0.5357539653778076, + "step": 916 + }, + { + "epoch": 2.035476718403548, + "grad_norm": 0.8878626823425293, + "learning_rate": 5.751581304984657e-06, + "loss": 0.6872968077659607, + "step": 918 + }, + { + "epoch": 2.0399113082039912, + "grad_norm": 1.035952091217041, + "learning_rate": 5.735099724098838e-06, + "loss": 0.5238696336746216, + "step": 920 + }, + { + "epoch": 2.0443458980044347, + "grad_norm": 10.002982139587402, + "learning_rate": 5.718614980231582e-06, + "loss": 0.6934325695037842, + "step": 922 + }, + { + "epoch": 2.048780487804878, + "grad_norm": 0.6786181926727295, + "learning_rate": 5.702127295165107e-06, + "loss": 0.1512947678565979, + "step": 924 + }, + { + "epoch": 2.0532150776053215, + "grad_norm": 0.6792353987693787, + "learning_rate": 5.685636890721205e-06, + "loss": 0.7085365653038025, + "step": 926 + }, + { + "epoch": 2.057649667405765, + "grad_norm": 3.1427464485168457, + "learning_rate": 5.669143988758253e-06, + "loss": 0.31929752230644226, + "step": 928 + }, + { + "epoch": 2.0620842572062084, + "grad_norm": 0.4643585681915283, + "learning_rate": 5.652648811168228e-06, + "loss": 0.47944340109825134, + "step": 930 + }, + { + "epoch": 2.066518847006652, + "grad_norm": 1.509389877319336, + "learning_rate": 5.636151579873726e-06, + "loss": 0.617887020111084, + "step": 932 + }, + { + "epoch": 2.070953436807095, + "grad_norm": 0.7840235233306885, + "learning_rate": 5.619652516824967e-06, + "loss": 0.7485859394073486, + "step": 934 + }, + { + "epoch": 2.0753880266075386, + "grad_norm": 0.9361806511878967, + "learning_rate": 5.603151843996822e-06, + "loss": 0.8575759530067444, + "step": 936 + }, + { + "epoch": 2.079822616407982, + "grad_norm": 0.6722026467323303, + "learning_rate": 5.586649783385813e-06, + "loss": 0.485722154378891, + "step": 938 + }, + { + "epoch": 2.084257206208426, + "grad_norm": 4.570074081420898, + "learning_rate": 5.570146557007141e-06, + "loss": 0.5747966766357422, + "step": 940 + }, + { + "epoch": 2.0886917960088693, + "grad_norm": 1.012689232826233, + "learning_rate": 5.553642386891683e-06, + "loss": 0.7173465490341187, + "step": 942 + }, + { + "epoch": 2.0931263858093128, + "grad_norm": 0.9818177819252014, + "learning_rate": 5.537137495083018e-06, + "loss": 0.4014107286930084, + "step": 944 + }, + { + "epoch": 2.097560975609756, + "grad_norm": 1.064757227897644, + "learning_rate": 5.5206321036344304e-06, + "loss": 0.7004286050796509, + "step": 946 + }, + { + "epoch": 2.1019955654101996, + "grad_norm": 0.9812616109848022, + "learning_rate": 5.504126434605932e-06, + "loss": 0.7097384929656982, + "step": 948 + }, + { + "epoch": 2.106430155210643, + "grad_norm": 1.1268335580825806, + "learning_rate": 5.487620710061262e-06, + "loss": 0.5271036028862, + "step": 950 + }, + { + "epoch": 2.1108647450110865, + "grad_norm": 1.0241403579711914, + "learning_rate": 5.471115152064916e-06, + "loss": 0.5063275098800659, + "step": 952 + }, + { + "epoch": 2.11529933481153, + "grad_norm": 1.4809362888336182, + "learning_rate": 5.454609982679138e-06, + "loss": 0.5966112613677979, + "step": 954 + }, + { + "epoch": 2.1197339246119733, + "grad_norm": 0.6269991397857666, + "learning_rate": 5.4381054239609525e-06, + "loss": 0.7026289701461792, + "step": 956 + }, + { + "epoch": 2.1241685144124167, + "grad_norm": 1.0289415121078491, + "learning_rate": 5.421601697959164e-06, + "loss": 0.5304814577102661, + "step": 958 + }, + { + "epoch": 2.12860310421286, + "grad_norm": 2.167893648147583, + "learning_rate": 5.405099026711374e-06, + "loss": 0.4472266435623169, + "step": 960 + }, + { + "epoch": 2.1330376940133036, + "grad_norm": 2.798168659210205, + "learning_rate": 5.388597632240994e-06, + "loss": 0.43469172716140747, + "step": 962 + }, + { + "epoch": 2.1374722838137474, + "grad_norm": 2.809424638748169, + "learning_rate": 5.372097736554261e-06, + "loss": 0.6800037026405334, + "step": 964 + }, + { + "epoch": 2.141906873614191, + "grad_norm": 0.6289676427841187, + "learning_rate": 5.35559956163724e-06, + "loss": 0.5739249587059021, + "step": 966 + }, + { + "epoch": 2.1463414634146343, + "grad_norm": 0.6345292925834656, + "learning_rate": 5.339103329452856e-06, + "loss": 0.4366436302661896, + "step": 968 + }, + { + "epoch": 2.1507760532150777, + "grad_norm": 0.9140567779541016, + "learning_rate": 5.322609261937887e-06, + "loss": 0.6112205982208252, + "step": 970 + }, + { + "epoch": 2.155210643015521, + "grad_norm": 1.1668734550476074, + "learning_rate": 5.306117580999993e-06, + "loss": 0.620341420173645, + "step": 972 + }, + { + "epoch": 2.1596452328159645, + "grad_norm": 1.511851191520691, + "learning_rate": 5.289628508514725e-06, + "loss": 0.7794575095176697, + "step": 974 + }, + { + "epoch": 2.164079822616408, + "grad_norm": 0.9918836951255798, + "learning_rate": 5.2731422663225385e-06, + "loss": 0.6446636915206909, + "step": 976 + }, + { + "epoch": 2.1685144124168514, + "grad_norm": 7.138890743255615, + "learning_rate": 5.256659076225813e-06, + "loss": 0.47500547766685486, + "step": 978 + }, + { + "epoch": 2.172949002217295, + "grad_norm": 1.688381552696228, + "learning_rate": 5.240179159985866e-06, + "loss": 0.8513635993003845, + "step": 980 + }, + { + "epoch": 2.1773835920177382, + "grad_norm": 0.8748887777328491, + "learning_rate": 5.2237027393199645e-06, + "loss": 0.44088250398635864, + "step": 982 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 0.724166750907898, + "learning_rate": 5.207230035898356e-06, + "loss": 0.21423432230949402, + "step": 984 + }, + { + "epoch": 2.186252771618625, + "grad_norm": 6.790086269378662, + "learning_rate": 5.190761271341268e-06, + "loss": 0.6213544011116028, + "step": 986 + }, + { + "epoch": 2.1906873614190685, + "grad_norm": 3.65374755859375, + "learning_rate": 5.174296667215939e-06, + "loss": 0.35641536116600037, + "step": 988 + }, + { + "epoch": 2.1951219512195124, + "grad_norm": 1.8495643138885498, + "learning_rate": 5.157836445033636e-06, + "loss": 0.7459386587142944, + "step": 990 + }, + { + "epoch": 2.199556541019956, + "grad_norm": 2.664592981338501, + "learning_rate": 5.141380826246667e-06, + "loss": 0.7802680730819702, + "step": 992 + }, + { + "epoch": 2.203991130820399, + "grad_norm": 0.6977004408836365, + "learning_rate": 5.124930032245415e-06, + "loss": 0.6166579127311707, + "step": 994 + }, + { + "epoch": 2.2084257206208426, + "grad_norm": 2.4659013748168945, + "learning_rate": 5.108484284355339e-06, + "loss": 0.7725459933280945, + "step": 996 + }, + { + "epoch": 2.212860310421286, + "grad_norm": 1.1927064657211304, + "learning_rate": 5.0920438038340194e-06, + "loss": 0.7431020736694336, + "step": 998 + }, + { + "epoch": 2.2172949002217295, + "grad_norm": 1.1199584007263184, + "learning_rate": 5.075608811868169e-06, + "loss": 0.5989236831665039, + "step": 1000 + }, + { + "epoch": 2.221729490022173, + "grad_norm": 2.5724501609802246, + "learning_rate": 5.059179529570657e-06, + "loss": 0.35090845823287964, + "step": 1002 + }, + { + "epoch": 2.2261640798226163, + "grad_norm": 0.6757038235664368, + "learning_rate": 5.042756177977534e-06, + "loss": 0.7968004941940308, + "step": 1004 + }, + { + "epoch": 2.2305986696230597, + "grad_norm": 2.796313762664795, + "learning_rate": 5.026338978045062e-06, + "loss": 0.5947654247283936, + "step": 1006 + }, + { + "epoch": 2.235033259423503, + "grad_norm": 0.7423799633979797, + "learning_rate": 5.009928150646741e-06, + "loss": 0.7034870982170105, + "step": 1008 + }, + { + "epoch": 2.2394678492239466, + "grad_norm": 0.793771505355835, + "learning_rate": 4.993523916570334e-06, + "loss": 0.4220029413700104, + "step": 1010 + }, + { + "epoch": 2.2439024390243905, + "grad_norm": 0.9843591451644897, + "learning_rate": 4.977126496514902e-06, + "loss": 0.7488526105880737, + "step": 1012 + }, + { + "epoch": 2.248337028824834, + "grad_norm": 12.598713874816895, + "learning_rate": 4.960736111087827e-06, + "loss": 0.697821319103241, + "step": 1014 + }, + { + "epoch": 2.2527716186252773, + "grad_norm": 1.4622032642364502, + "learning_rate": 4.9443529808018545e-06, + "loss": 0.8177450299263, + "step": 1016 + }, + { + "epoch": 2.2572062084257207, + "grad_norm": 0.886799156665802, + "learning_rate": 4.927977326072115e-06, + "loss": 0.45011892914772034, + "step": 1018 + }, + { + "epoch": 2.261640798226164, + "grad_norm": 1.24601149559021, + "learning_rate": 4.911609367213168e-06, + "loss": 0.7044653296470642, + "step": 1020 + }, + { + "epoch": 2.2660753880266076, + "grad_norm": 0.47574278712272644, + "learning_rate": 4.895249324436035e-06, + "loss": 0.49686291813850403, + "step": 1022 + }, + { + "epoch": 2.270509977827051, + "grad_norm": 3.832730293273926, + "learning_rate": 4.8788974178452316e-06, + "loss": 0.8501523733139038, + "step": 1024 + }, + { + "epoch": 2.2749445676274944, + "grad_norm": 0.9218156933784485, + "learning_rate": 4.86255386743582e-06, + "loss": 0.5115727186203003, + "step": 1026 + }, + { + "epoch": 2.279379157427938, + "grad_norm": 0.600154459476471, + "learning_rate": 4.846218893090426e-06, + "loss": 0.7313248515129089, + "step": 1028 + }, + { + "epoch": 2.2838137472283813, + "grad_norm": 0.7732475399971008, + "learning_rate": 4.829892714576307e-06, + "loss": 0.8301377892494202, + "step": 1030 + }, + { + "epoch": 2.2882483370288247, + "grad_norm": 1.1284780502319336, + "learning_rate": 4.813575551542381e-06, + "loss": 0.5336828827857971, + "step": 1032 + }, + { + "epoch": 2.292682926829268, + "grad_norm": 1.2494477033615112, + "learning_rate": 4.7972676235162714e-06, + "loss": 0.5561960339546204, + "step": 1034 + }, + { + "epoch": 2.2971175166297115, + "grad_norm": 0.9986409544944763, + "learning_rate": 4.780969149901354e-06, + "loss": 0.7975015044212341, + "step": 1036 + }, + { + "epoch": 2.3015521064301554, + "grad_norm": 2.619966745376587, + "learning_rate": 4.764680349973812e-06, + "loss": 0.7354569435119629, + "step": 1038 + }, + { + "epoch": 2.305986696230599, + "grad_norm": 3.9357285499572754, + "learning_rate": 4.748401442879674e-06, + "loss": 0.7495402693748474, + "step": 1040 + }, + { + "epoch": 2.3104212860310422, + "grad_norm": 0.9967501163482666, + "learning_rate": 4.732132647631881e-06, + "loss": 0.793857991695404, + "step": 1042 + }, + { + "epoch": 2.3148558758314857, + "grad_norm": 1.5514739751815796, + "learning_rate": 4.715874183107324e-06, + "loss": 0.5644249320030212, + "step": 1044 + }, + { + "epoch": 2.319290465631929, + "grad_norm": 0.940541684627533, + "learning_rate": 4.699626268043911e-06, + "loss": 0.7716040015220642, + "step": 1046 + }, + { + "epoch": 2.3237250554323725, + "grad_norm": 0.7030808925628662, + "learning_rate": 4.683389121037618e-06, + "loss": 0.7232730388641357, + "step": 1048 + }, + { + "epoch": 2.328159645232816, + "grad_norm": 0.8067787885665894, + "learning_rate": 4.667162960539552e-06, + "loss": 0.7696553468704224, + "step": 1050 + }, + { + "epoch": 2.3325942350332594, + "grad_norm": 1.9469633102416992, + "learning_rate": 4.650948004853006e-06, + "loss": 0.598371684551239, + "step": 1052 + }, + { + "epoch": 2.337028824833703, + "grad_norm": 1.243727684020996, + "learning_rate": 4.634744472130529e-06, + "loss": 0.5612485408782959, + "step": 1054 + }, + { + "epoch": 2.341463414634146, + "grad_norm": 0.789696455001831, + "learning_rate": 4.618552580370988e-06, + "loss": 0.6212122440338135, + "step": 1056 + }, + { + "epoch": 2.3458980044345896, + "grad_norm": 0.28680506348609924, + "learning_rate": 4.6023725474166324e-06, + "loss": 0.4215806722640991, + "step": 1058 + }, + { + "epoch": 2.3503325942350335, + "grad_norm": 1.192357063293457, + "learning_rate": 4.586204590950169e-06, + "loss": 0.9317978024482727, + "step": 1060 + }, + { + "epoch": 2.354767184035477, + "grad_norm": 1.958794116973877, + "learning_rate": 4.570048928491824e-06, + "loss": 0.42631787061691284, + "step": 1062 + }, + { + "epoch": 2.3592017738359203, + "grad_norm": 0.31139975786209106, + "learning_rate": 4.5539057773964316e-06, + "loss": 0.4592714011669159, + "step": 1064 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 1.8716444969177246, + "learning_rate": 4.537775354850496e-06, + "loss": 0.68389493227005, + "step": 1066 + }, + { + "epoch": 2.368070953436807, + "grad_norm": 0.7594958543777466, + "learning_rate": 4.5216578778692725e-06, + "loss": 0.8488420844078064, + "step": 1068 + }, + { + "epoch": 2.3725055432372506, + "grad_norm": 0.747459888458252, + "learning_rate": 4.5055535632938526e-06, + "loss": 0.7568836212158203, + "step": 1070 + }, + { + "epoch": 2.376940133037694, + "grad_norm": 0.8709418773651123, + "learning_rate": 4.489462627788242e-06, + "loss": 0.6960753202438354, + "step": 1072 + }, + { + "epoch": 2.3813747228381374, + "grad_norm": 0.1814187914133072, + "learning_rate": 4.473385287836448e-06, + "loss": 0.03584207594394684, + "step": 1074 + }, + { + "epoch": 2.385809312638581, + "grad_norm": 0.8838277459144592, + "learning_rate": 4.457321759739567e-06, + "loss": 0.4423300325870514, + "step": 1076 + }, + { + "epoch": 2.3902439024390243, + "grad_norm": 0.6482821106910706, + "learning_rate": 4.4412722596128686e-06, + "loss": 0.2545888423919678, + "step": 1078 + }, + { + "epoch": 2.3946784922394677, + "grad_norm": 1.0765259265899658, + "learning_rate": 4.425237003382903e-06, + "loss": 0.6147640943527222, + "step": 1080 + }, + { + "epoch": 2.399113082039911, + "grad_norm": 0.2775437831878662, + "learning_rate": 4.409216206784577e-06, + "loss": 0.4364810287952423, + "step": 1082 + }, + { + "epoch": 2.4035476718403546, + "grad_norm": 2.510125160217285, + "learning_rate": 4.393210085358265e-06, + "loss": 0.7311280369758606, + "step": 1084 + }, + { + "epoch": 2.4079822616407984, + "grad_norm": 0.7599928379058838, + "learning_rate": 4.3772188544469016e-06, + "loss": 0.7626463770866394, + "step": 1086 + }, + { + "epoch": 2.412416851441242, + "grad_norm": 0.3223716616630554, + "learning_rate": 4.3612427291930915e-06, + "loss": 0.23752209544181824, + "step": 1088 + }, + { + "epoch": 2.4168514412416853, + "grad_norm": 1.2272253036499023, + "learning_rate": 4.345281924536208e-06, + "loss": 0.7394731044769287, + "step": 1090 + }, + { + "epoch": 2.4212860310421287, + "grad_norm": 1.4568101167678833, + "learning_rate": 4.329336655209505e-06, + "loss": 0.7773704528808594, + "step": 1092 + }, + { + "epoch": 2.425720620842572, + "grad_norm": 1.198792576789856, + "learning_rate": 4.31340713573723e-06, + "loss": 0.48740097880363464, + "step": 1094 + }, + { + "epoch": 2.4301552106430155, + "grad_norm": 2.5480594635009766, + "learning_rate": 4.297493580431732e-06, + "loss": 0.47421979904174805, + "step": 1096 + }, + { + "epoch": 2.434589800443459, + "grad_norm": 0.6080421805381775, + "learning_rate": 4.281596203390582e-06, + "loss": 0.40746811032295227, + "step": 1098 + }, + { + "epoch": 2.4390243902439024, + "grad_norm": 1.584132194519043, + "learning_rate": 4.265715218493695e-06, + "loss": 0.6294921636581421, + "step": 1100 + }, + { + "epoch": 2.443458980044346, + "grad_norm": 0.6097115278244019, + "learning_rate": 4.249850839400446e-06, + "loss": 0.7974519729614258, + "step": 1102 + }, + { + "epoch": 2.4478935698447892, + "grad_norm": 2.477348566055298, + "learning_rate": 4.2340032795468e-06, + "loss": 0.4605816602706909, + "step": 1104 + }, + { + "epoch": 2.4523281596452327, + "grad_norm": 2.5225706100463867, + "learning_rate": 4.218172752142442e-06, + "loss": 0.8402256369590759, + "step": 1106 + }, + { + "epoch": 2.4567627494456765, + "grad_norm": 2.1070775985717773, + "learning_rate": 4.202359470167903e-06, + "loss": 0.7417095303535461, + "step": 1108 + }, + { + "epoch": 2.4611973392461195, + "grad_norm": 0.8467901349067688, + "learning_rate": 4.186563646371696e-06, + "loss": 0.825175940990448, + "step": 1110 + }, + { + "epoch": 2.4656319290465634, + "grad_norm": 0.6079756617546082, + "learning_rate": 4.170785493267463e-06, + "loss": 0.5299561023712158, + "step": 1112 + }, + { + "epoch": 2.470066518847007, + "grad_norm": 0.8165512084960938, + "learning_rate": 4.155025223131102e-06, + "loss": 0.7136436700820923, + "step": 1114 + }, + { + "epoch": 2.47450110864745, + "grad_norm": 1.9828038215637207, + "learning_rate": 4.139283047997919e-06, + "loss": 0.35226771235466003, + "step": 1116 + }, + { + "epoch": 2.4789356984478936, + "grad_norm": 1.1542699337005615, + "learning_rate": 4.123559179659771e-06, + "loss": 0.707044243812561, + "step": 1118 + }, + { + "epoch": 2.483370288248337, + "grad_norm": 0.7548565864562988, + "learning_rate": 4.107853829662224e-06, + "loss": 0.6722221970558167, + "step": 1120 + }, + { + "epoch": 2.4878048780487805, + "grad_norm": 0.8779303431510925, + "learning_rate": 4.0921672093017e-06, + "loss": 0.6999770998954773, + "step": 1122 + }, + { + "epoch": 2.492239467849224, + "grad_norm": 2.153297185897827, + "learning_rate": 4.076499529622636e-06, + "loss": 0.8417968153953552, + "step": 1124 + }, + { + "epoch": 2.4966740576496673, + "grad_norm": 0.8999984264373779, + "learning_rate": 4.0608510014146455e-06, + "loss": 0.8947934508323669, + "step": 1126 + }, + { + "epoch": 2.5011086474501107, + "grad_norm": 0.6626126766204834, + "learning_rate": 4.045221835209684e-06, + "loss": 0.5889415740966797, + "step": 1128 + }, + { + "epoch": 2.505543237250554, + "grad_norm": 1.688296914100647, + "learning_rate": 4.02961224127921e-06, + "loss": 0.5885995030403137, + "step": 1130 + }, + { + "epoch": 2.5099778270509976, + "grad_norm": 0.8422841429710388, + "learning_rate": 4.014022429631368e-06, + "loss": 0.6949036717414856, + "step": 1132 + }, + { + "epoch": 2.5144124168514415, + "grad_norm": 0.7921580672264099, + "learning_rate": 3.998452610008147e-06, + "loss": 0.46577969193458557, + "step": 1134 + }, + { + "epoch": 2.5188470066518844, + "grad_norm": 0.9153636693954468, + "learning_rate": 3.982902991882578e-06, + "loss": 0.8383475542068481, + "step": 1136 + }, + { + "epoch": 2.5232815964523283, + "grad_norm": 0.7820035219192505, + "learning_rate": 3.967373784455896e-06, + "loss": 0.6323981285095215, + "step": 1138 + }, + { + "epoch": 2.5277161862527717, + "grad_norm": 0.7881942391395569, + "learning_rate": 3.951865196654738e-06, + "loss": 0.7108765244483948, + "step": 1140 + }, + { + "epoch": 2.532150776053215, + "grad_norm": 4.971331596374512, + "learning_rate": 3.936377437128329e-06, + "loss": 0.44235268235206604, + "step": 1142 + }, + { + "epoch": 2.5365853658536586, + "grad_norm": 0.2698603570461273, + "learning_rate": 3.920910714245679e-06, + "loss": 0.4289158880710602, + "step": 1144 + }, + { + "epoch": 2.541019955654102, + "grad_norm": 0.8469025492668152, + "learning_rate": 3.905465236092771e-06, + "loss": 0.7589935660362244, + "step": 1146 + }, + { + "epoch": 2.5454545454545454, + "grad_norm": 0.5552167892456055, + "learning_rate": 3.890041210469765e-06, + "loss": 0.7610056400299072, + "step": 1148 + }, + { + "epoch": 2.549889135254989, + "grad_norm": 0.765277624130249, + "learning_rate": 3.8746388448882055e-06, + "loss": 0.7218910455703735, + "step": 1150 + }, + { + "epoch": 2.5543237250554323, + "grad_norm": 2.669365644454956, + "learning_rate": 3.859258346568228e-06, + "loss": 0.6901261210441589, + "step": 1152 + }, + { + "epoch": 2.5587583148558757, + "grad_norm": 0.28330180048942566, + "learning_rate": 3.843899922435767e-06, + "loss": 0.3145535886287689, + "step": 1154 + }, + { + "epoch": 2.5631929046563195, + "grad_norm": 1.1048952341079712, + "learning_rate": 3.8285637791197815e-06, + "loss": 0.41754475235939026, + "step": 1156 + }, + { + "epoch": 2.5676274944567625, + "grad_norm": 1.7286397218704224, + "learning_rate": 3.8132501229494635e-06, + "loss": 0.4815715551376343, + "step": 1158 + }, + { + "epoch": 2.5720620842572064, + "grad_norm": 2.333894729614258, + "learning_rate": 3.7979591599514696e-06, + "loss": 0.611275315284729, + "step": 1160 + }, + { + "epoch": 2.57649667405765, + "grad_norm": 1.0634231567382812, + "learning_rate": 3.782691095847151e-06, + "loss": 0.810359001159668, + "step": 1162 + }, + { + "epoch": 2.5809312638580932, + "grad_norm": 3.6835250854492188, + "learning_rate": 3.767446136049775e-06, + "loss": 0.7754755616188049, + "step": 1164 + }, + { + "epoch": 2.5853658536585367, + "grad_norm": 0.7622874975204468, + "learning_rate": 3.752224485661775e-06, + "loss": 0.46459802985191345, + "step": 1166 + }, + { + "epoch": 2.58980044345898, + "grad_norm": 0.6675243973731995, + "learning_rate": 3.7370263494719805e-06, + "loss": 0.6850208044052124, + "step": 1168 + }, + { + "epoch": 2.5942350332594235, + "grad_norm": 1.2453693151474, + "learning_rate": 3.721851931952869e-06, + "loss": 0.657964825630188, + "step": 1170 + }, + { + "epoch": 2.598669623059867, + "grad_norm": 1.1155322790145874, + "learning_rate": 3.706701437257808e-06, + "loss": 0.31276965141296387, + "step": 1172 + }, + { + "epoch": 2.6031042128603104, + "grad_norm": 0.5230674743652344, + "learning_rate": 3.691575069218314e-06, + "loss": 0.5281472206115723, + "step": 1174 + }, + { + "epoch": 2.6075388026607538, + "grad_norm": 0.7821836471557617, + "learning_rate": 3.676473031341313e-06, + "loss": 0.5338150858879089, + "step": 1176 + }, + { + "epoch": 2.611973392461197, + "grad_norm": 0.8501307368278503, + "learning_rate": 3.661395526806395e-06, + "loss": 0.4878658056259155, + "step": 1178 + }, + { + "epoch": 2.6164079822616406, + "grad_norm": 1.2892802953720093, + "learning_rate": 3.6463427584630806e-06, + "loss": 0.6299321055412292, + "step": 1180 + }, + { + "epoch": 2.6208425720620845, + "grad_norm": 0.6108219623565674, + "learning_rate": 3.631314928828099e-06, + "loss": 0.7980747222900391, + "step": 1182 + }, + { + "epoch": 2.6252771618625275, + "grad_norm": 4.147970676422119, + "learning_rate": 3.616312240082659e-06, + "loss": 0.7594188451766968, + "step": 1184 + }, + { + "epoch": 2.6297117516629713, + "grad_norm": 0.6734055280685425, + "learning_rate": 3.601334894069728e-06, + "loss": 0.7057641744613647, + "step": 1186 + }, + { + "epoch": 2.6341463414634148, + "grad_norm": 3.3473269939422607, + "learning_rate": 3.5863830922913147e-06, + "loss": 0.8521836400032043, + "step": 1188 + }, + { + "epoch": 2.638580931263858, + "grad_norm": 0.7423378825187683, + "learning_rate": 3.5714570359057676e-06, + "loss": 0.4189079999923706, + "step": 1190 + }, + { + "epoch": 2.6430155210643016, + "grad_norm": 1.9489797353744507, + "learning_rate": 3.556556925725061e-06, + "loss": 0.6845167875289917, + "step": 1192 + }, + { + "epoch": 2.647450110864745, + "grad_norm": 5.555662631988525, + "learning_rate": 3.5416829622120875e-06, + "loss": 0.5077641010284424, + "step": 1194 + }, + { + "epoch": 2.6518847006651884, + "grad_norm": 1.2466117143630981, + "learning_rate": 3.526835345477978e-06, + "loss": 0.636448860168457, + "step": 1196 + }, + { + "epoch": 2.656319290465632, + "grad_norm": 0.4763660430908203, + "learning_rate": 3.5120142752793907e-06, + "loss": 0.2538270056247711, + "step": 1198 + }, + { + "epoch": 2.6607538802660753, + "grad_norm": 1.4780601263046265, + "learning_rate": 3.4972199510158393e-06, + "loss": 0.7502221465110779, + "step": 1200 + }, + { + "epoch": 2.6651884700665187, + "grad_norm": 0.6862211227416992, + "learning_rate": 3.4824525717269975e-06, + "loss": 0.8194411993026733, + "step": 1202 + }, + { + "epoch": 2.6696230598669626, + "grad_norm": 0.31569620966911316, + "learning_rate": 3.4677123360900342e-06, + "loss": 0.36762019991874695, + "step": 1204 + }, + { + "epoch": 2.6740576496674056, + "grad_norm": 0.7146638631820679, + "learning_rate": 3.4529994424169233e-06, + "loss": 0.5894730091094971, + "step": 1206 + }, + { + "epoch": 2.6784922394678494, + "grad_norm": 2.2065584659576416, + "learning_rate": 3.4383140886517953e-06, + "loss": 0.6572399139404297, + "step": 1208 + }, + { + "epoch": 2.682926829268293, + "grad_norm": 4.649989128112793, + "learning_rate": 3.423656472368262e-06, + "loss": 0.5220817923545837, + "step": 1210 + }, + { + "epoch": 2.6873614190687363, + "grad_norm": 3.845285177230835, + "learning_rate": 3.409026790766756e-06, + "loss": 0.27520784735679626, + "step": 1212 + }, + { + "epoch": 2.6917960088691797, + "grad_norm": 0.713187575340271, + "learning_rate": 3.394425240671891e-06, + "loss": 0.43082767724990845, + "step": 1214 + }, + { + "epoch": 2.696230598669623, + "grad_norm": 2.7740540504455566, + "learning_rate": 3.379852018529799e-06, + "loss": 0.5405531525611877, + "step": 1216 + }, + { + "epoch": 2.7006651884700665, + "grad_norm": 0.9415165781974792, + "learning_rate": 3.3653073204054942e-06, + "loss": 0.6151460409164429, + "step": 1218 + }, + { + "epoch": 2.70509977827051, + "grad_norm": 1.3651831150054932, + "learning_rate": 3.3507913419802403e-06, + "loss": 0.8346598744392395, + "step": 1220 + }, + { + "epoch": 2.7095343680709534, + "grad_norm": 1.3357763290405273, + "learning_rate": 3.336304278548903e-06, + "loss": 0.5949963331222534, + "step": 1222 + }, + { + "epoch": 2.713968957871397, + "grad_norm": 2.2765064239501953, + "learning_rate": 3.321846325017342e-06, + "loss": 0.7419912815093994, + "step": 1224 + }, + { + "epoch": 2.7184035476718402, + "grad_norm": 1.6282012462615967, + "learning_rate": 3.3074176758997744e-06, + "loss": 0.4085941016674042, + "step": 1226 + }, + { + "epoch": 2.7228381374722836, + "grad_norm": 0.9617356657981873, + "learning_rate": 3.2930185253161574e-06, + "loss": 0.8997126817703247, + "step": 1228 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.9657713174819946, + "learning_rate": 3.2786490669895883e-06, + "loss": 0.6804481744766235, + "step": 1230 + }, + { + "epoch": 2.7317073170731705, + "grad_norm": 0.6607502102851868, + "learning_rate": 3.2643094942436865e-06, + "loss": 0.6290354132652283, + "step": 1232 + }, + { + "epoch": 2.7361419068736144, + "grad_norm": 3.5074925422668457, + "learning_rate": 3.2500000000000015e-06, + "loss": 0.3904164433479309, + "step": 1234 + }, + { + "epoch": 2.740576496674058, + "grad_norm": 0.9359789490699768, + "learning_rate": 3.2357207767754063e-06, + "loss": 0.7830184698104858, + "step": 1236 + }, + { + "epoch": 2.745011086474501, + "grad_norm": 0.5713577270507812, + "learning_rate": 3.221472016679521e-06, + "loss": 0.4108898341655731, + "step": 1238 + }, + { + "epoch": 2.7494456762749446, + "grad_norm": 1.344169020652771, + "learning_rate": 3.2072539114121188e-06, + "loss": 0.4244284927845001, + "step": 1240 + }, + { + "epoch": 2.753880266075388, + "grad_norm": 0.8237523436546326, + "learning_rate": 3.193066652260547e-06, + "loss": 0.8027604818344116, + "step": 1242 + }, + { + "epoch": 2.7583148558758315, + "grad_norm": 0.7854613065719604, + "learning_rate": 3.1789104300971603e-06, + "loss": 0.7452245950698853, + "step": 1244 + }, + { + "epoch": 2.762749445676275, + "grad_norm": 0.665742039680481, + "learning_rate": 3.164785435376745e-06, + "loss": 0.43021270632743835, + "step": 1246 + }, + { + "epoch": 2.7671840354767183, + "grad_norm": 1.347853183746338, + "learning_rate": 3.1506918581339583e-06, + "loss": 0.422033429145813, + "step": 1248 + }, + { + "epoch": 2.7716186252771617, + "grad_norm": 1.2096498012542725, + "learning_rate": 3.136629887980781e-06, + "loss": 0.5619819164276123, + "step": 1250 + }, + { + "epoch": 2.776053215077605, + "grad_norm": 0.8340552449226379, + "learning_rate": 3.122599714103949e-06, + "loss": 0.9244868755340576, + "step": 1252 + }, + { + "epoch": 2.7804878048780486, + "grad_norm": 0.8274340033531189, + "learning_rate": 3.1086015252624257e-06, + "loss": 0.8055827021598816, + "step": 1254 + }, + { + "epoch": 2.7849223946784925, + "grad_norm": 0.7741687893867493, + "learning_rate": 3.0946355097848535e-06, + "loss": 0.7520518898963928, + "step": 1256 + }, + { + "epoch": 2.7893569844789354, + "grad_norm": 0.3145367503166199, + "learning_rate": 3.0807018555670153e-06, + "loss": 0.12884804606437683, + "step": 1258 + }, + { + "epoch": 2.7937915742793793, + "grad_norm": 2.4322192668914795, + "learning_rate": 3.0668007500693216e-06, + "loss": 0.7578915357589722, + "step": 1260 + }, + { + "epoch": 2.7982261640798227, + "grad_norm": 0.9636224508285522, + "learning_rate": 3.0529323803142697e-06, + "loss": 0.4767170548439026, + "step": 1262 + }, + { + "epoch": 2.802660753880266, + "grad_norm": 0.6876578330993652, + "learning_rate": 3.0390969328839464e-06, + "loss": 0.4632313549518585, + "step": 1264 + }, + { + "epoch": 2.8070953436807096, + "grad_norm": 0.7122061848640442, + "learning_rate": 3.0252945939175004e-06, + "loss": 0.9437022805213928, + "step": 1266 + }, + { + "epoch": 2.811529933481153, + "grad_norm": 1.677951693534851, + "learning_rate": 3.0115255491086537e-06, + "loss": 0.7852752804756165, + "step": 1268 + }, + { + "epoch": 2.8159645232815964, + "grad_norm": 0.9509062170982361, + "learning_rate": 2.9977899837031895e-06, + "loss": 0.8075975179672241, + "step": 1270 + }, + { + "epoch": 2.82039911308204, + "grad_norm": 4.19722318649292, + "learning_rate": 2.984088082496469e-06, + "loss": 0.31876012682914734, + "step": 1272 + }, + { + "epoch": 2.8248337028824833, + "grad_norm": 0.47213754057884216, + "learning_rate": 2.970420029830946e-06, + "loss": 0.44580259919166565, + "step": 1274 + }, + { + "epoch": 2.8292682926829267, + "grad_norm": 0.5697877407073975, + "learning_rate": 2.9567860095936775e-06, + "loss": 0.8971878290176392, + "step": 1276 + }, + { + "epoch": 2.8337028824833705, + "grad_norm": 0.7168667912483215, + "learning_rate": 2.9431862052138545e-06, + "loss": 0.7423674464225769, + "step": 1278 + }, + { + "epoch": 2.8381374722838135, + "grad_norm": 1.2420377731323242, + "learning_rate": 2.929620799660343e-06, + "loss": 0.34553366899490356, + "step": 1280 + }, + { + "epoch": 2.8425720620842574, + "grad_norm": 1.5156524181365967, + "learning_rate": 2.916089975439207e-06, + "loss": 0.5446531176567078, + "step": 1282 + }, + { + "epoch": 2.847006651884701, + "grad_norm": 0.7382546663284302, + "learning_rate": 2.9025939145912655e-06, + "loss": 0.526816725730896, + "step": 1284 + }, + { + "epoch": 2.8514412416851442, + "grad_norm": 0.8963580131530762, + "learning_rate": 2.8891327986896345e-06, + "loss": 0.7592861652374268, + "step": 1286 + }, + { + "epoch": 2.8558758314855877, + "grad_norm": 3.473883867263794, + "learning_rate": 2.875706808837292e-06, + "loss": 0.17455817759037018, + "step": 1288 + }, + { + "epoch": 2.860310421286031, + "grad_norm": 0.5690093636512756, + "learning_rate": 2.862316125664636e-06, + "loss": 0.7300111055374146, + "step": 1290 + }, + { + "epoch": 2.8647450110864745, + "grad_norm": 3.732383966445923, + "learning_rate": 2.848960929327053e-06, + "loss": 0.5489775538444519, + "step": 1292 + }, + { + "epoch": 2.869179600886918, + "grad_norm": 0.8943015336990356, + "learning_rate": 2.8356413995025044e-06, + "loss": 0.6973212957382202, + "step": 1294 + }, + { + "epoch": 2.8736141906873613, + "grad_norm": 0.9269885420799255, + "learning_rate": 2.8223577153890934e-06, + "loss": 0.7550508379936218, + "step": 1296 + }, + { + "epoch": 2.8780487804878048, + "grad_norm": 1.5174607038497925, + "learning_rate": 2.8091100557026702e-06, + "loss": 0.48215049505233765, + "step": 1298 + }, + { + "epoch": 2.882483370288248, + "grad_norm": 2.0186781883239746, + "learning_rate": 2.795898598674415e-06, + "loss": 0.7024590969085693, + "step": 1300 + }, + { + "epoch": 2.8869179600886916, + "grad_norm": 1.6526554822921753, + "learning_rate": 2.782723522048444e-06, + "loss": 0.25810810923576355, + "step": 1302 + }, + { + "epoch": 2.8913525498891355, + "grad_norm": 0.9696333408355713, + "learning_rate": 2.7695850030794293e-06, + "loss": 0.6788902878761292, + "step": 1304 + }, + { + "epoch": 2.8957871396895785, + "grad_norm": 0.8711814880371094, + "learning_rate": 2.7564832185301915e-06, + "loss": 0.7060118913650513, + "step": 1306 + }, + { + "epoch": 2.9002217294900223, + "grad_norm": 0.27649205923080444, + "learning_rate": 2.7434183446693397e-06, + "loss": 0.2528712749481201, + "step": 1308 + }, + { + "epoch": 2.9046563192904657, + "grad_norm": 0.891995906829834, + "learning_rate": 2.730390557268897e-06, + "loss": 0.3852493166923523, + "step": 1310 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.6957062482833862, + "learning_rate": 2.7174000316019277e-06, + "loss": 0.7152572870254517, + "step": 1312 + }, + { + "epoch": 2.9135254988913526, + "grad_norm": 0.7434815764427185, + "learning_rate": 2.704446942440191e-06, + "loss": 0.8421391844749451, + "step": 1314 + }, + { + "epoch": 2.917960088691796, + "grad_norm": 1.4164069890975952, + "learning_rate": 2.6915314640517755e-06, + "loss": 0.5117487907409668, + "step": 1316 + }, + { + "epoch": 2.9223946784922394, + "grad_norm": 1.4213894605636597, + "learning_rate": 2.6786537701987703e-06, + "loss": 0.7917452454566956, + "step": 1318 + }, + { + "epoch": 2.926829268292683, + "grad_norm": 0.5963442921638489, + "learning_rate": 2.665814034134916e-06, + "loss": 0.49720868468284607, + "step": 1320 + }, + { + "epoch": 2.9312638580931263, + "grad_norm": 0.8147727847099304, + "learning_rate": 2.6530124286032755e-06, + "loss": 0.750374972820282, + "step": 1322 + }, + { + "epoch": 2.9356984478935697, + "grad_norm": 0.6078750491142273, + "learning_rate": 2.640249125833915e-06, + "loss": 0.7463411092758179, + "step": 1324 + }, + { + "epoch": 2.9401330376940136, + "grad_norm": 1.327854037284851, + "learning_rate": 2.6275242975415804e-06, + "loss": 0.7810685634613037, + "step": 1326 + }, + { + "epoch": 2.9445676274944566, + "grad_norm": 0.6636197566986084, + "learning_rate": 2.614838114923394e-06, + "loss": 0.8080487251281738, + "step": 1328 + }, + { + "epoch": 2.9490022172949004, + "grad_norm": 0.5991776585578918, + "learning_rate": 2.6021907486565447e-06, + "loss": 0.9131799936294556, + "step": 1330 + }, + { + "epoch": 2.953436807095344, + "grad_norm": 0.8838467597961426, + "learning_rate": 2.589582368895992e-06, + "loss": 0.8187052607536316, + "step": 1332 + }, + { + "epoch": 2.9578713968957873, + "grad_norm": 0.7488482594490051, + "learning_rate": 2.577013145272185e-06, + "loss": 0.362490713596344, + "step": 1334 + }, + { + "epoch": 2.9623059866962307, + "grad_norm": 0.9917161464691162, + "learning_rate": 2.564483246888772e-06, + "loss": 0.7155488133430481, + "step": 1336 + }, + { + "epoch": 2.966740576496674, + "grad_norm": 0.6550630927085876, + "learning_rate": 2.5519928423203266e-06, + "loss": 0.7411721348762512, + "step": 1338 + }, + { + "epoch": 2.9711751662971175, + "grad_norm": 0.624010443687439, + "learning_rate": 2.539542099610084e-06, + "loss": 0.5994629263877869, + "step": 1340 + }, + { + "epoch": 2.975609756097561, + "grad_norm": 1.2733458280563354, + "learning_rate": 2.5271311862676727e-06, + "loss": 0.5431544780731201, + "step": 1342 + }, + { + "epoch": 2.9800443458980044, + "grad_norm": 0.21305102109909058, + "learning_rate": 2.514760269266871e-06, + "loss": 0.18989457190036774, + "step": 1344 + }, + { + "epoch": 2.984478935698448, + "grad_norm": 0.6358434557914734, + "learning_rate": 2.50242951504335e-06, + "loss": 0.4404347240924835, + "step": 1346 + }, + { + "epoch": 2.988913525498891, + "grad_norm": 0.2760062515735626, + "learning_rate": 2.490139089492443e-06, + "loss": 0.4140094518661499, + "step": 1348 + }, + { + "epoch": 2.9933481152993346, + "grad_norm": 1.270507574081421, + "learning_rate": 2.4778891579669067e-06, + "loss": 0.5242727994918823, + "step": 1350 + }, + { + "epoch": 2.9977827050997785, + "grad_norm": 0.6698117852210999, + "learning_rate": 2.4656798852747023e-06, + "loss": 0.593068540096283, + "step": 1352 + }, + { + "epoch": 3.002217294900222, + "grad_norm": 0.8496447801589966, + "learning_rate": 2.453511435676777e-06, + "loss": 0.6407611966133118, + "step": 1354 + }, + { + "epoch": 3.0066518847006654, + "grad_norm": 0.6371109485626221, + "learning_rate": 2.441383972884848e-06, + "loss": 0.25417429208755493, + "step": 1356 + }, + { + "epoch": 3.011086474501109, + "grad_norm": 1.1947884559631348, + "learning_rate": 2.4292976600592095e-06, + "loss": 0.4258137345314026, + "step": 1358 + }, + { + "epoch": 3.015521064301552, + "grad_norm": 0.6553521156311035, + "learning_rate": 2.4172526598065304e-06, + "loss": 0.5452851057052612, + "step": 1360 + }, + { + "epoch": 3.0199556541019956, + "grad_norm": 1.3817777633666992, + "learning_rate": 2.4052491341776686e-06, + "loss": 0.4110011160373688, + "step": 1362 + }, + { + "epoch": 3.024390243902439, + "grad_norm": 0.9155776500701904, + "learning_rate": 2.393287244665494e-06, + "loss": 0.5112780928611755, + "step": 1364 + }, + { + "epoch": 3.0288248337028825, + "grad_norm": 0.14318493008613586, + "learning_rate": 2.3813671522027094e-06, + "loss": 0.17776285111904144, + "step": 1366 + }, + { + "epoch": 3.033259423503326, + "grad_norm": 0.8653013706207275, + "learning_rate": 2.369489017159692e-06, + "loss": 0.28078657388687134, + "step": 1368 + }, + { + "epoch": 3.0376940133037693, + "grad_norm": 1.8689088821411133, + "learning_rate": 2.357652999342334e-06, + "loss": 0.3095449209213257, + "step": 1370 + }, + { + "epoch": 3.0421286031042127, + "grad_norm": 1.557530164718628, + "learning_rate": 2.345859257989886e-06, + "loss": 0.43448740243911743, + "step": 1372 + }, + { + "epoch": 3.046563192904656, + "grad_norm": 1.1236419677734375, + "learning_rate": 2.334107951772826e-06, + "loss": 0.22937437891960144, + "step": 1374 + }, + { + "epoch": 3.0509977827050996, + "grad_norm": 1.4432166814804077, + "learning_rate": 2.3223992387907137e-06, + "loss": 0.26416340470314026, + "step": 1376 + }, + { + "epoch": 3.0554323725055434, + "grad_norm": 0.8640435338020325, + "learning_rate": 2.3107332765700733e-06, + "loss": 0.21329531073570251, + "step": 1378 + }, + { + "epoch": 3.059866962305987, + "grad_norm": 0.08765086531639099, + "learning_rate": 2.2991102220622647e-06, + "loss": 0.2234346866607666, + "step": 1380 + }, + { + "epoch": 3.0643015521064303, + "grad_norm": 1.6020888090133667, + "learning_rate": 2.2875302316413807e-06, + "loss": 0.12742415070533752, + "step": 1382 + }, + { + "epoch": 3.0687361419068737, + "grad_norm": 3.4508602619171143, + "learning_rate": 2.275993461102138e-06, + "loss": 0.34950244426727295, + "step": 1384 + }, + { + "epoch": 3.073170731707317, + "grad_norm": 0.3769654929637909, + "learning_rate": 2.2645000656577793e-06, + "loss": 0.17151916027069092, + "step": 1386 + }, + { + "epoch": 3.0776053215077606, + "grad_norm": 0.18798436224460602, + "learning_rate": 2.2530501999379932e-06, + "loss": 0.15390659868717194, + "step": 1388 + }, + { + "epoch": 3.082039911308204, + "grad_norm": 0.8774737119674683, + "learning_rate": 2.2416440179868236e-06, + "loss": 0.3711448907852173, + "step": 1390 + }, + { + "epoch": 3.0864745011086474, + "grad_norm": 1.811327338218689, + "learning_rate": 2.230281673260605e-06, + "loss": 0.16538086533546448, + "step": 1392 + }, + { + "epoch": 3.090909090909091, + "grad_norm": 3.584960699081421, + "learning_rate": 2.218963318625895e-06, + "loss": 0.2704678773880005, + "step": 1394 + }, + { + "epoch": 3.0953436807095343, + "grad_norm": 1.5164047479629517, + "learning_rate": 2.2076891063574167e-06, + "loss": 0.46402987837791443, + "step": 1396 + }, + { + "epoch": 3.0997782705099777, + "grad_norm": 1.3325133323669434, + "learning_rate": 2.196459188136014e-06, + "loss": 0.4478250741958618, + "step": 1398 + }, + { + "epoch": 3.104212860310421, + "grad_norm": 0.3600890040397644, + "learning_rate": 2.1852737150466064e-06, + "loss": 0.38654354214668274, + "step": 1400 + }, + { + "epoch": 3.1086474501108645, + "grad_norm": 0.36502784490585327, + "learning_rate": 2.174132837576156e-06, + "loss": 0.14690890908241272, + "step": 1402 + }, + { + "epoch": 3.1130820399113084, + "grad_norm": 0.8481831550598145, + "learning_rate": 2.1630367056116496e-06, + "loss": 0.2545594871044159, + "step": 1404 + }, + { + "epoch": 3.117516629711752, + "grad_norm": 1.2579960823059082, + "learning_rate": 2.1519854684380724e-06, + "loss": 0.41790008544921875, + "step": 1406 + }, + { + "epoch": 3.1219512195121952, + "grad_norm": 0.7599564790725708, + "learning_rate": 2.1409792747364103e-06, + "loss": 0.4978216886520386, + "step": 1408 + }, + { + "epoch": 3.1263858093126387, + "grad_norm": 0.8300555348396301, + "learning_rate": 2.1300182725816378e-06, + "loss": 0.3864242732524872, + "step": 1410 + }, + { + "epoch": 3.130820399113082, + "grad_norm": 0.8185410499572754, + "learning_rate": 2.1191026094407386e-06, + "loss": 0.3967961072921753, + "step": 1412 + }, + { + "epoch": 3.1352549889135255, + "grad_norm": 0.9340487718582153, + "learning_rate": 2.1082324321707075e-06, + "loss": 0.2589724361896515, + "step": 1414 + }, + { + "epoch": 3.139689578713969, + "grad_norm": 0.12941651046276093, + "learning_rate": 2.0974078870165882e-06, + "loss": 0.018333781510591507, + "step": 1416 + }, + { + "epoch": 3.1441241685144123, + "grad_norm": 1.1716057062149048, + "learning_rate": 2.086629119609499e-06, + "loss": 0.36201152205467224, + "step": 1418 + }, + { + "epoch": 3.1485587583148558, + "grad_norm": 4.401972770690918, + "learning_rate": 2.0758962749646716e-06, + "loss": 0.3494262397289276, + "step": 1420 + }, + { + "epoch": 3.152993348115299, + "grad_norm": 1.9533579349517822, + "learning_rate": 2.065209497479502e-06, + "loss": 0.3848462998867035, + "step": 1422 + }, + { + "epoch": 3.1574279379157426, + "grad_norm": 0.20573538541793823, + "learning_rate": 2.0545689309316138e-06, + "loss": 0.05736740678548813, + "step": 1424 + }, + { + "epoch": 3.1618625277161865, + "grad_norm": 0.9018411636352539, + "learning_rate": 2.043974718476911e-06, + "loss": 0.348215788602829, + "step": 1426 + }, + { + "epoch": 3.16629711751663, + "grad_norm": 1.5437663793563843, + "learning_rate": 2.033427002647668e-06, + "loss": 0.33423206210136414, + "step": 1428 + }, + { + "epoch": 3.1707317073170733, + "grad_norm": 1.4751362800598145, + "learning_rate": 2.0229259253505946e-06, + "loss": 0.29139864444732666, + "step": 1430 + }, + { + "epoch": 3.1751662971175167, + "grad_norm": 0.32802924513816833, + "learning_rate": 2.012471627864943e-06, + "loss": 0.2629394829273224, + "step": 1432 + }, + { + "epoch": 3.17960088691796, + "grad_norm": 0.9640628099441528, + "learning_rate": 2.0020642508405984e-06, + "loss": 0.2902968227863312, + "step": 1434 + }, + { + "epoch": 3.1840354767184036, + "grad_norm": 1.5065374374389648, + "learning_rate": 1.9917039342961837e-06, + "loss": 0.15028798580169678, + "step": 1436 + }, + { + "epoch": 3.188470066518847, + "grad_norm": 0.6139736771583557, + "learning_rate": 1.9813908176171857e-06, + "loss": 0.2818301320075989, + "step": 1438 + }, + { + "epoch": 3.1929046563192904, + "grad_norm": 1.8703267574310303, + "learning_rate": 1.97112503955407e-06, + "loss": 0.4048473834991455, + "step": 1440 + }, + { + "epoch": 3.197339246119734, + "grad_norm": 0.6717519164085388, + "learning_rate": 1.9609067382204224e-06, + "loss": 0.4510973393917084, + "step": 1442 + }, + { + "epoch": 3.2017738359201773, + "grad_norm": 1.1299378871917725, + "learning_rate": 1.950736051091084e-06, + "loss": 0.41415518522262573, + "step": 1444 + }, + { + "epoch": 3.2062084257206207, + "grad_norm": 0.5919221639633179, + "learning_rate": 1.9406131150003036e-06, + "loss": 0.3593963384628296, + "step": 1446 + }, + { + "epoch": 3.210643015521064, + "grad_norm": 4.163533687591553, + "learning_rate": 1.930538066139904e-06, + "loss": 0.13564369082450867, + "step": 1448 + }, + { + "epoch": 3.2150776053215075, + "grad_norm": 1.6044939756393433, + "learning_rate": 1.9205110400574368e-06, + "loss": 0.47921210527420044, + "step": 1450 + }, + { + "epoch": 3.2195121951219514, + "grad_norm": 1.731406331062317, + "learning_rate": 1.910532171654367e-06, + "loss": 0.28882908821105957, + "step": 1452 + }, + { + "epoch": 3.223946784922395, + "grad_norm": 1.6707464456558228, + "learning_rate": 1.9006015951842587e-06, + "loss": 0.32913684844970703, + "step": 1454 + }, + { + "epoch": 3.2283813747228383, + "grad_norm": 4.336728572845459, + "learning_rate": 1.8907194442509642e-06, + "loss": 0.5413320064544678, + "step": 1456 + }, + { + "epoch": 3.2328159645232817, + "grad_norm": 2.011685371398926, + "learning_rate": 1.8808858518068312e-06, + "loss": 0.34133854508399963, + "step": 1458 + }, + { + "epoch": 3.237250554323725, + "grad_norm": 0.9558274745941162, + "learning_rate": 1.8711009501509087e-06, + "loss": 0.3718280792236328, + "step": 1460 + }, + { + "epoch": 3.2416851441241685, + "grad_norm": 0.8279819488525391, + "learning_rate": 1.8613648709271732e-06, + "loss": 0.3327626585960388, + "step": 1462 + }, + { + "epoch": 3.246119733924612, + "grad_norm": 1.6108118295669556, + "learning_rate": 1.8516777451227552e-06, + "loss": 0.5488408207893372, + "step": 1464 + }, + { + "epoch": 3.2505543237250554, + "grad_norm": 1.2620062828063965, + "learning_rate": 1.842039703066172e-06, + "loss": 0.4193928837776184, + "step": 1466 + }, + { + "epoch": 3.254988913525499, + "grad_norm": 0.27675992250442505, + "learning_rate": 1.8324508744255842e-06, + "loss": 0.05343915894627571, + "step": 1468 + }, + { + "epoch": 3.259423503325942, + "grad_norm": 2.2762207984924316, + "learning_rate": 1.8229113882070398e-06, + "loss": 0.31694963574409485, + "step": 1470 + }, + { + "epoch": 3.2638580931263856, + "grad_norm": 0.6302704215049744, + "learning_rate": 1.8134213727527504e-06, + "loss": 0.4034433364868164, + "step": 1472 + }, + { + "epoch": 3.2682926829268295, + "grad_norm": 0.8992738723754883, + "learning_rate": 1.803980955739354e-06, + "loss": 0.17774128913879395, + "step": 1474 + }, + { + "epoch": 3.2727272727272725, + "grad_norm": 0.9962568283081055, + "learning_rate": 1.7945902641762027e-06, + "loss": 0.37571877241134644, + "step": 1476 + }, + { + "epoch": 3.2771618625277164, + "grad_norm": 0.39244315028190613, + "learning_rate": 1.785249424403654e-06, + "loss": 0.23455193638801575, + "step": 1478 + }, + { + "epoch": 3.2815964523281598, + "grad_norm": 0.49077776074409485, + "learning_rate": 1.7759585620913723e-06, + "loss": 0.27589744329452515, + "step": 1480 + }, + { + "epoch": 3.286031042128603, + "grad_norm": 0.7578119039535522, + "learning_rate": 1.7667178022366294e-06, + "loss": 0.45988088846206665, + "step": 1482 + }, + { + "epoch": 3.2904656319290466, + "grad_norm": 0.6393482089042664, + "learning_rate": 1.757527269162636e-06, + "loss": 0.40383821725845337, + "step": 1484 + }, + { + "epoch": 3.29490022172949, + "grad_norm": 0.6597006916999817, + "learning_rate": 1.7483870865168585e-06, + "loss": 0.27575039863586426, + "step": 1486 + }, + { + "epoch": 3.2993348115299335, + "grad_norm": 8.2012939453125, + "learning_rate": 1.739297377269361e-06, + "loss": 0.33307555317878723, + "step": 1488 + }, + { + "epoch": 3.303769401330377, + "grad_norm": 3.670855760574341, + "learning_rate": 1.730258263711149e-06, + "loss": 0.15401190519332886, + "step": 1490 + }, + { + "epoch": 3.3082039911308203, + "grad_norm": 1.0713074207305908, + "learning_rate": 1.7212698674525246e-06, + "loss": 0.4863121807575226, + "step": 1492 + }, + { + "epoch": 3.3126385809312637, + "grad_norm": 0.9712854623794556, + "learning_rate": 1.7123323094214485e-06, + "loss": 0.21804000437259674, + "step": 1494 + }, + { + "epoch": 3.317073170731707, + "grad_norm": 1.2361743450164795, + "learning_rate": 1.7034457098619176e-06, + "loss": 0.28596603870391846, + "step": 1496 + }, + { + "epoch": 3.3215077605321506, + "grad_norm": 0.9230791330337524, + "learning_rate": 1.6946101883323435e-06, + "loss": 0.4915310740470886, + "step": 1498 + }, + { + "epoch": 3.3259423503325944, + "grad_norm": 0.9403104186058044, + "learning_rate": 1.6858258637039421e-06, + "loss": 0.23124194145202637, + "step": 1500 + }, + { + "epoch": 3.330376940133038, + "grad_norm": 1.3576343059539795, + "learning_rate": 1.677092854159142e-06, + "loss": 0.375247597694397, + "step": 1502 + }, + { + "epoch": 3.3348115299334813, + "grad_norm": 0.921538233757019, + "learning_rate": 1.6684112771899858e-06, + "loss": 0.5058495402336121, + "step": 1504 + }, + { + "epoch": 3.3392461197339247, + "grad_norm": 0.24488000571727753, + "learning_rate": 1.6597812495965537e-06, + "loss": 0.19275178015232086, + "step": 1506 + }, + { + "epoch": 3.343680709534368, + "grad_norm": 0.7813226580619812, + "learning_rate": 1.651202887485394e-06, + "loss": 0.19424784183502197, + "step": 1508 + }, + { + "epoch": 3.3481152993348116, + "grad_norm": 1.6763211488723755, + "learning_rate": 1.6426763062679553e-06, + "loss": 0.2872462272644043, + "step": 1510 + }, + { + "epoch": 3.352549889135255, + "grad_norm": 0.20118102431297302, + "learning_rate": 1.63420162065904e-06, + "loss": 0.21339718997478485, + "step": 1512 + }, + { + "epoch": 3.3569844789356984, + "grad_norm": 0.7732570171356201, + "learning_rate": 1.625778944675257e-06, + "loss": 0.5782560706138611, + "step": 1514 + }, + { + "epoch": 3.361419068736142, + "grad_norm": 4.678039073944092, + "learning_rate": 1.6174083916334877e-06, + "loss": 0.27637556195259094, + "step": 1516 + }, + { + "epoch": 3.3658536585365852, + "grad_norm": 0.8129257559776306, + "learning_rate": 1.609090074149366e-06, + "loss": 0.268915057182312, + "step": 1518 + }, + { + "epoch": 3.3702882483370287, + "grad_norm": 0.653376042842865, + "learning_rate": 1.6008241041357535e-06, + "loss": 0.37645894289016724, + "step": 1520 + }, + { + "epoch": 3.374722838137472, + "grad_norm": 0.17347639799118042, + "learning_rate": 1.5926105928012486e-06, + "loss": 0.27448832988739014, + "step": 1522 + }, + { + "epoch": 3.3791574279379155, + "grad_norm": 1.2720452547073364, + "learning_rate": 1.5844496506486734e-06, + "loss": 0.5296311974525452, + "step": 1524 + }, + { + "epoch": 3.3835920177383594, + "grad_norm": 2.3222310543060303, + "learning_rate": 1.576341387473601e-06, + "loss": 0.2787100076675415, + "step": 1526 + }, + { + "epoch": 3.388026607538803, + "grad_norm": 0.8329764604568481, + "learning_rate": 1.568285912362872e-06, + "loss": 0.26376861333847046, + "step": 1528 + }, + { + "epoch": 3.3924611973392462, + "grad_norm": 0.6690333485603333, + "learning_rate": 1.5602833336931242e-06, + "loss": 0.23260486125946045, + "step": 1530 + }, + { + "epoch": 3.3968957871396896, + "grad_norm": 4.120120525360107, + "learning_rate": 1.552333759129344e-06, + "loss": 0.10462646931409836, + "step": 1532 + }, + { + "epoch": 3.401330376940133, + "grad_norm": 0.9870664477348328, + "learning_rate": 1.5444372956234062e-06, + "loss": 0.33666056394577026, + "step": 1534 + }, + { + "epoch": 3.4057649667405765, + "grad_norm": 1.2383733987808228, + "learning_rate": 1.5365940494126424e-06, + "loss": 0.4268462061882019, + "step": 1536 + }, + { + "epoch": 3.41019955654102, + "grad_norm": 1.192371129989624, + "learning_rate": 1.5288041260184132e-06, + "loss": 0.315461128950119, + "step": 1538 + }, + { + "epoch": 3.4146341463414633, + "grad_norm": 0.773826539516449, + "learning_rate": 1.5210676302446801e-06, + "loss": 0.43709278106689453, + "step": 1540 + }, + { + "epoch": 3.4190687361419068, + "grad_norm": 1.3438527584075928, + "learning_rate": 1.5133846661766058e-06, + "loss": 0.3504549264907837, + "step": 1542 + }, + { + "epoch": 3.42350332594235, + "grad_norm": 0.9224695563316345, + "learning_rate": 1.5057553371791461e-06, + "loss": 0.42883241176605225, + "step": 1544 + }, + { + "epoch": 3.4279379157427936, + "grad_norm": 1.5111298561096191, + "learning_rate": 1.4981797458956624e-06, + "loss": 0.09244281053543091, + "step": 1546 + }, + { + "epoch": 3.4323725055432375, + "grad_norm": 0.24797746539115906, + "learning_rate": 1.490657994246542e-06, + "loss": 0.08839371800422668, + "step": 1548 + }, + { + "epoch": 3.436807095343681, + "grad_norm": 0.5342960953712463, + "learning_rate": 1.4831901834278212e-06, + "loss": 0.4299216568470001, + "step": 1550 + }, + { + "epoch": 3.4412416851441243, + "grad_norm": 1.4827715158462524, + "learning_rate": 1.4757764139098332e-06, + "loss": 0.3927062153816223, + "step": 1552 + }, + { + "epoch": 3.4456762749445677, + "grad_norm": 0.07252507656812668, + "learning_rate": 1.468416785435847e-06, + "loss": 0.25046950578689575, + "step": 1554 + }, + { + "epoch": 3.450110864745011, + "grad_norm": 0.833500325679779, + "learning_rate": 1.461111397020732e-06, + "loss": 0.48848727345466614, + "step": 1556 + }, + { + "epoch": 3.4545454545454546, + "grad_norm": 0.7718623280525208, + "learning_rate": 1.4538603469496215e-06, + "loss": 0.5105925798416138, + "step": 1558 + }, + { + "epoch": 3.458980044345898, + "grad_norm": 1.4482234716415405, + "learning_rate": 1.4466637327765937e-06, + "loss": 0.5265946984291077, + "step": 1560 + }, + { + "epoch": 3.4634146341463414, + "grad_norm": 7.474737644195557, + "learning_rate": 1.4395216513233584e-06, + "loss": 0.10489355027675629, + "step": 1562 + }, + { + "epoch": 3.467849223946785, + "grad_norm": 0.6549527645111084, + "learning_rate": 1.4324341986779527e-06, + "loss": 0.4963974058628082, + "step": 1564 + }, + { + "epoch": 3.4722838137472283, + "grad_norm": 14.025774955749512, + "learning_rate": 1.4254014701934481e-06, + "loss": 0.30700963735580444, + "step": 1566 + }, + { + "epoch": 3.4767184035476717, + "grad_norm": 0.6876983642578125, + "learning_rate": 1.4184235604866725e-06, + "loss": 0.3548239469528198, + "step": 1568 + }, + { + "epoch": 3.481152993348115, + "grad_norm": 1.9139790534973145, + "learning_rate": 1.4115005634369296e-06, + "loss": 0.30847689509391785, + "step": 1570 + }, + { + "epoch": 3.4855875831485585, + "grad_norm": 0.30100584030151367, + "learning_rate": 1.4046325721847443e-06, + "loss": 0.2665141522884369, + "step": 1572 + }, + { + "epoch": 3.4900221729490024, + "grad_norm": 0.7305917739868164, + "learning_rate": 1.397819679130601e-06, + "loss": 0.6849204301834106, + "step": 1574 + }, + { + "epoch": 3.494456762749446, + "grad_norm": 1.1686631441116333, + "learning_rate": 1.3910619759337074e-06, + "loss": 0.22372813522815704, + "step": 1576 + }, + { + "epoch": 3.4988913525498893, + "grad_norm": 0.7269113659858704, + "learning_rate": 1.3843595535107587e-06, + "loss": 0.24144387245178223, + "step": 1578 + }, + { + "epoch": 3.5033259423503327, + "grad_norm": 1.8978197574615479, + "learning_rate": 1.377712502034712e-06, + "loss": 0.46985891461372375, + "step": 1580 + }, + { + "epoch": 3.507760532150776, + "grad_norm": 1.9783498048782349, + "learning_rate": 1.3711209109335793e-06, + "loss": 0.5767348408699036, + "step": 1582 + }, + { + "epoch": 3.5121951219512195, + "grad_norm": 0.8796296119689941, + "learning_rate": 1.3645848688892162e-06, + "loss": 0.2534613311290741, + "step": 1584 + }, + { + "epoch": 3.516629711751663, + "grad_norm": 1.0521845817565918, + "learning_rate": 1.3581044638361373e-06, + "loss": 0.2456229329109192, + "step": 1586 + }, + { + "epoch": 3.5210643015521064, + "grad_norm": 0.8070515394210815, + "learning_rate": 1.3516797829603256e-06, + "loss": 0.40543049573898315, + "step": 1588 + }, + { + "epoch": 3.52549889135255, + "grad_norm": 1.262500286102295, + "learning_rate": 1.3453109126980643e-06, + "loss": 0.11292517930269241, + "step": 1590 + }, + { + "epoch": 3.529933481152993, + "grad_norm": 2.395573616027832, + "learning_rate": 1.3389979387347743e-06, + "loss": 0.28261229395866394, + "step": 1592 + }, + { + "epoch": 3.5343680709534366, + "grad_norm": 0.8870358467102051, + "learning_rate": 1.332740946003857e-06, + "loss": 0.47545742988586426, + "step": 1594 + }, + { + "epoch": 3.5388026607538805, + "grad_norm": 1.3032435178756714, + "learning_rate": 1.3265400186855548e-06, + "loss": 0.07019691914319992, + "step": 1596 + }, + { + "epoch": 3.5432372505543235, + "grad_norm": 1.8053206205368042, + "learning_rate": 1.320395240205819e-06, + "loss": 0.23545832931995392, + "step": 1598 + }, + { + "epoch": 3.5476718403547673, + "grad_norm": 0.7974863052368164, + "learning_rate": 1.3143066932351856e-06, + "loss": 0.2672840356826782, + "step": 1600 + }, + { + "epoch": 3.5521064301552108, + "grad_norm": 0.7222623229026794, + "learning_rate": 1.308274459687665e-06, + "loss": 0.4520362615585327, + "step": 1602 + }, + { + "epoch": 3.556541019955654, + "grad_norm": 1.16770601272583, + "learning_rate": 1.3022986207196367e-06, + "loss": 0.6893855929374695, + "step": 1604 + }, + { + "epoch": 3.5609756097560976, + "grad_norm": 1.1768697500228882, + "learning_rate": 1.2963792567287617e-06, + "loss": 0.6617428064346313, + "step": 1606 + }, + { + "epoch": 3.565410199556541, + "grad_norm": 0.14280186593532562, + "learning_rate": 1.290516447352899e-06, + "loss": 0.03455912321805954, + "step": 1608 + }, + { + "epoch": 3.5698447893569845, + "grad_norm": 1.9226244688034058, + "learning_rate": 1.2847102714690308e-06, + "loss": 0.2757825255393982, + "step": 1610 + }, + { + "epoch": 3.574279379157428, + "grad_norm": 1.6937259435653687, + "learning_rate": 1.2789608071922076e-06, + "loss": 0.1058734580874443, + "step": 1612 + }, + { + "epoch": 3.5787139689578713, + "grad_norm": 4.820397853851318, + "learning_rate": 1.2732681318744923e-06, + "loss": 0.37984806299209595, + "step": 1614 + }, + { + "epoch": 3.5831485587583147, + "grad_norm": 1.1603871583938599, + "learning_rate": 1.2676323221039236e-06, + "loss": 0.47421249747276306, + "step": 1616 + }, + { + "epoch": 3.587583148558758, + "grad_norm": 4.291319370269775, + "learning_rate": 1.2620534537034795e-06, + "loss": 0.2353006899356842, + "step": 1618 + }, + { + "epoch": 3.5920177383592016, + "grad_norm": 3.524076461791992, + "learning_rate": 1.2565316017300635e-06, + "loss": 0.2935880422592163, + "step": 1620 + }, + { + "epoch": 3.5964523281596454, + "grad_norm": 0.8411036729812622, + "learning_rate": 1.2510668404734924e-06, + "loss": 0.4112304151058197, + "step": 1622 + }, + { + "epoch": 3.6008869179600884, + "grad_norm": 0.9849219918251038, + "learning_rate": 1.2456592434554963e-06, + "loss": 0.3476102352142334, + "step": 1624 + }, + { + "epoch": 3.6053215077605323, + "grad_norm": 0.9225306510925293, + "learning_rate": 1.2403088834287282e-06, + "loss": 0.10709763318300247, + "step": 1626 + }, + { + "epoch": 3.6097560975609757, + "grad_norm": 0.8501572012901306, + "learning_rate": 1.2350158323757903e-06, + "loss": 0.4931592345237732, + "step": 1628 + }, + { + "epoch": 3.614190687361419, + "grad_norm": 2.1776418685913086, + "learning_rate": 1.229780161508259e-06, + "loss": 0.3885037302970886, + "step": 1630 + }, + { + "epoch": 3.6186252771618626, + "grad_norm": 0.9818041920661926, + "learning_rate": 1.2246019412657319e-06, + "loss": 0.3582894504070282, + "step": 1632 + }, + { + "epoch": 3.623059866962306, + "grad_norm": 1.3523777723312378, + "learning_rate": 1.2194812413148756e-06, + "loss": 0.5462644100189209, + "step": 1634 + }, + { + "epoch": 3.6274944567627494, + "grad_norm": 0.7082298398017883, + "learning_rate": 1.214418130548495e-06, + "loss": 0.21956975758075714, + "step": 1636 + }, + { + "epoch": 3.631929046563193, + "grad_norm": 1.0334546566009521, + "learning_rate": 1.2094126770845986e-06, + "loss": 0.27501288056373596, + "step": 1638 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 1.2771334648132324, + "learning_rate": 1.2044649482654876e-06, + "loss": 0.4401727318763733, + "step": 1640 + }, + { + "epoch": 3.6407982261640797, + "grad_norm": 0.47875624895095825, + "learning_rate": 1.1995750106568496e-06, + "loss": 0.2337026745080948, + "step": 1642 + }, + { + "epoch": 3.6452328159645235, + "grad_norm": 1.0887197256088257, + "learning_rate": 1.1947429300468575e-06, + "loss": 0.5300090312957764, + "step": 1644 + }, + { + "epoch": 3.6496674057649665, + "grad_norm": 0.05958491191267967, + "learning_rate": 1.1899687714452932e-06, + "loss": 0.19211959838867188, + "step": 1646 + }, + { + "epoch": 3.6541019955654104, + "grad_norm": 0.5240092277526855, + "learning_rate": 1.1852525990826658e-06, + "loss": 0.1384681761264801, + "step": 1648 + }, + { + "epoch": 3.658536585365854, + "grad_norm": 0.9349181652069092, + "learning_rate": 1.1805944764093484e-06, + "loss": 0.3181173503398895, + "step": 1650 + }, + { + "epoch": 3.662971175166297, + "grad_norm": 1.3443922996520996, + "learning_rate": 1.1759944660947301e-06, + "loss": 0.5139885544776917, + "step": 1652 + }, + { + "epoch": 3.6674057649667406, + "grad_norm": 0.8461628556251526, + "learning_rate": 1.171452630026365e-06, + "loss": 0.22307844460010529, + "step": 1654 + }, + { + "epoch": 3.671840354767184, + "grad_norm": 0.8421247005462646, + "learning_rate": 1.1669690293091452e-06, + "loss": 0.5296832919120789, + "step": 1656 + }, + { + "epoch": 3.6762749445676275, + "grad_norm": 1.7748911380767822, + "learning_rate": 1.1625437242644772e-06, + "loss": 0.24938040971755981, + "step": 1658 + }, + { + "epoch": 3.680709534368071, + "grad_norm": 0.7158502340316772, + "learning_rate": 1.1581767744294682e-06, + "loss": 0.26214003562927246, + "step": 1660 + }, + { + "epoch": 3.6851441241685143, + "grad_norm": 0.19944681227207184, + "learning_rate": 1.1538682385561286e-06, + "loss": 0.20961648225784302, + "step": 1662 + }, + { + "epoch": 3.6895787139689578, + "grad_norm": 0.48064079880714417, + "learning_rate": 1.1496181746105784e-06, + "loss": 0.14569465816020966, + "step": 1664 + }, + { + "epoch": 3.694013303769401, + "grad_norm": 0.9198098182678223, + "learning_rate": 1.1454266397722707e-06, + "loss": 0.42964571714401245, + "step": 1666 + }, + { + "epoch": 3.6984478935698446, + "grad_norm": 0.9416403770446777, + "learning_rate": 1.1412936904332181e-06, + "loss": 0.2964041233062744, + "step": 1668 + }, + { + "epoch": 3.7028824833702885, + "grad_norm": 0.7431701421737671, + "learning_rate": 1.1372193821972379e-06, + "loss": 0.31188875436782837, + "step": 1670 + }, + { + "epoch": 3.7073170731707314, + "grad_norm": 1.164131760597229, + "learning_rate": 1.1332037698792033e-06, + "loss": 0.33080539107322693, + "step": 1672 + }, + { + "epoch": 3.7117516629711753, + "grad_norm": 1.1789630651474, + "learning_rate": 1.1292469075043026e-06, + "loss": 0.585115909576416, + "step": 1674 + }, + { + "epoch": 3.7161862527716187, + "grad_norm": 1.0405234098434448, + "learning_rate": 1.1253488483073177e-06, + "loss": 0.46077483892440796, + "step": 1676 + }, + { + "epoch": 3.720620842572062, + "grad_norm": 0.7198275923728943, + "learning_rate": 1.1215096447319038e-06, + "loss": 0.3957710266113281, + "step": 1678 + }, + { + "epoch": 3.7250554323725056, + "grad_norm": 3.161853551864624, + "learning_rate": 1.117729348429884e-06, + "loss": 0.09382948279380798, + "step": 1680 + }, + { + "epoch": 3.729490022172949, + "grad_norm": 0.9764834046363831, + "learning_rate": 1.114008010260558e-06, + "loss": 0.5524929761886597, + "step": 1682 + }, + { + "epoch": 3.7339246119733924, + "grad_norm": 0.702358603477478, + "learning_rate": 1.1103456802900134e-06, + "loss": 0.2092130333185196, + "step": 1684 + }, + { + "epoch": 3.738359201773836, + "grad_norm": 5.8001580238342285, + "learning_rate": 1.1067424077904555e-06, + "loss": 0.44477126002311707, + "step": 1686 + }, + { + "epoch": 3.7427937915742793, + "grad_norm": 0.21515698730945587, + "learning_rate": 1.103198241239542e-06, + "loss": 0.07311274856328964, + "step": 1688 + }, + { + "epoch": 3.7472283813747227, + "grad_norm": 0.9483621716499329, + "learning_rate": 1.0997132283197324e-06, + "loss": 0.5884689092636108, + "step": 1690 + }, + { + "epoch": 3.7516629711751666, + "grad_norm": 0.7396944761276245, + "learning_rate": 1.0962874159176454e-06, + "loss": 0.4286979138851166, + "step": 1692 + }, + { + "epoch": 3.7560975609756095, + "grad_norm": 0.5899630784988403, + "learning_rate": 1.0929208501234286e-06, + "loss": 0.5086148977279663, + "step": 1694 + }, + { + "epoch": 3.7605321507760534, + "grad_norm": 0.7641669511795044, + "learning_rate": 1.0896135762301393e-06, + "loss": 0.4777107536792755, + "step": 1696 + }, + { + "epoch": 3.764966740576497, + "grad_norm": 0.6806691288948059, + "learning_rate": 1.0863656387331328e-06, + "loss": 0.22383804619312286, + "step": 1698 + }, + { + "epoch": 3.7694013303769403, + "grad_norm": 1.0728694200515747, + "learning_rate": 1.0831770813294668e-06, + "loss": 0.2801818549633026, + "step": 1700 + }, + { + "epoch": 3.7738359201773837, + "grad_norm": 0.5353509783744812, + "learning_rate": 1.0800479469173101e-06, + "loss": 0.8013766407966614, + "step": 1702 + }, + { + "epoch": 3.778270509977827, + "grad_norm": 0.26873287558555603, + "learning_rate": 1.076978277595369e-06, + "loss": 0.04525094851851463, + "step": 1704 + }, + { + "epoch": 3.7827050997782705, + "grad_norm": 0.272559255361557, + "learning_rate": 1.0739681146623185e-06, + "loss": 0.22566770017147064, + "step": 1706 + }, + { + "epoch": 3.787139689578714, + "grad_norm": 0.6693012714385986, + "learning_rate": 1.0710174986162471e-06, + "loss": 0.23584027588367462, + "step": 1708 + }, + { + "epoch": 3.7915742793791574, + "grad_norm": 0.6419113278388977, + "learning_rate": 1.0681264691541127e-06, + "loss": 0.6023858785629272, + "step": 1710 + }, + { + "epoch": 3.796008869179601, + "grad_norm": 0.9070783853530884, + "learning_rate": 1.0652950651712072e-06, + "loss": 0.37740230560302734, + "step": 1712 + }, + { + "epoch": 3.800443458980044, + "grad_norm": 2.0648586750030518, + "learning_rate": 1.0625233247606348e-06, + "loss": 0.31113240122795105, + "step": 1714 + }, + { + "epoch": 3.8048780487804876, + "grad_norm": 0.7624364495277405, + "learning_rate": 1.059811285212799e-06, + "loss": 0.40173619985580444, + "step": 1716 + }, + { + "epoch": 3.8093126385809315, + "grad_norm": 0.5731363892555237, + "learning_rate": 1.0571589830149e-06, + "loss": 0.36406075954437256, + "step": 1718 + }, + { + "epoch": 3.8137472283813745, + "grad_norm": 4.027352809906006, + "learning_rate": 1.054566453850444e-06, + "loss": 0.30006563663482666, + "step": 1720 + }, + { + "epoch": 3.8181818181818183, + "grad_norm": 0.8597157001495361, + "learning_rate": 1.0520337325987649e-06, + "loss": 0.5175278186798096, + "step": 1722 + }, + { + "epoch": 3.8226164079822618, + "grad_norm": 1.0701987743377686, + "learning_rate": 1.049560853334553e-06, + "loss": 0.48277679085731506, + "step": 1724 + }, + { + "epoch": 3.827050997782705, + "grad_norm": 1.4645849466323853, + "learning_rate": 1.0471478493273976e-06, + "loss": 0.3768943250179291, + "step": 1726 + }, + { + "epoch": 3.8314855875831486, + "grad_norm": 0.7238010168075562, + "learning_rate": 1.0447947530413389e-06, + "loss": 0.20410099625587463, + "step": 1728 + }, + { + "epoch": 3.835920177383592, + "grad_norm": 1.628004789352417, + "learning_rate": 1.042501596134431e-06, + "loss": 0.2359320968389511, + "step": 1730 + }, + { + "epoch": 3.8403547671840355, + "grad_norm": 0.8455327153205872, + "learning_rate": 1.0402684094583173e-06, + "loss": 0.5593616366386414, + "step": 1732 + }, + { + "epoch": 3.844789356984479, + "grad_norm": 1.36236572265625, + "learning_rate": 1.0380952230578125e-06, + "loss": 0.35933902859687805, + "step": 1734 + }, + { + "epoch": 3.8492239467849223, + "grad_norm": 1.1344467401504517, + "learning_rate": 1.0359820661705042e-06, + "loss": 0.39542049169540405, + "step": 1736 + }, + { + "epoch": 3.8536585365853657, + "grad_norm": 1.3236117362976074, + "learning_rate": 1.0339289672263519e-06, + "loss": 0.5226433873176575, + "step": 1738 + }, + { + "epoch": 3.858093126385809, + "grad_norm": 0.8914376497268677, + "learning_rate": 1.0319359538473107e-06, + "loss": 0.41254663467407227, + "step": 1740 + }, + { + "epoch": 3.8625277161862526, + "grad_norm": 1.1892915964126587, + "learning_rate": 1.0300030528469564e-06, + "loss": 0.24275799095630646, + "step": 1742 + }, + { + "epoch": 3.8669623059866964, + "grad_norm": 1.1853784322738647, + "learning_rate": 1.0281302902301254e-06, + "loss": 0.405990868806839, + "step": 1744 + }, + { + "epoch": 3.8713968957871394, + "grad_norm": 1.119568109512329, + "learning_rate": 1.026317691192567e-06, + "loss": 0.3362556993961334, + "step": 1746 + }, + { + "epoch": 3.8758314855875833, + "grad_norm": 0.7554269433021545, + "learning_rate": 1.0245652801205999e-06, + "loss": 0.21234464645385742, + "step": 1748 + }, + { + "epoch": 3.8802660753880267, + "grad_norm": 6.679217338562012, + "learning_rate": 1.0228730805907891e-06, + "loss": 0.2865106165409088, + "step": 1750 + }, + { + "epoch": 3.88470066518847, + "grad_norm": 1.4342660903930664, + "learning_rate": 1.0212411153696247e-06, + "loss": 0.3992903232574463, + "step": 1752 + }, + { + "epoch": 3.8891352549889135, + "grad_norm": 1.467846393585205, + "learning_rate": 1.019669406413218e-06, + "loss": 0.3722279965877533, + "step": 1754 + }, + { + "epoch": 3.893569844789357, + "grad_norm": 1.52994704246521, + "learning_rate": 1.0181579748670054e-06, + "loss": 0.39275604486465454, + "step": 1756 + }, + { + "epoch": 3.8980044345898004, + "grad_norm": 0.8423678278923035, + "learning_rate": 1.0167068410654643e-06, + "loss": 0.464008092880249, + "step": 1758 + }, + { + "epoch": 3.902439024390244, + "grad_norm": 0.012710033915936947, + "learning_rate": 1.0153160245318384e-06, + "loss": 0.0020313351415097713, + "step": 1760 + }, + { + "epoch": 3.9068736141906872, + "grad_norm": 1.350422978401184, + "learning_rate": 1.0139855439778766e-06, + "loss": 0.18985067307949066, + "step": 1762 + }, + { + "epoch": 3.9113082039911307, + "grad_norm": 0.1394459307193756, + "learning_rate": 1.0127154173035787e-06, + "loss": 0.23882459104061127, + "step": 1764 + }, + { + "epoch": 3.9157427937915745, + "grad_norm": 2.188660144805908, + "learning_rate": 1.0115056615969584e-06, + "loss": 0.38833871483802795, + "step": 1766 + }, + { + "epoch": 3.9201773835920175, + "grad_norm": 1.0415091514587402, + "learning_rate": 1.0103562931338105e-06, + "loss": 0.579535186290741, + "step": 1768 + }, + { + "epoch": 3.9246119733924614, + "grad_norm": 0.9112776517868042, + "learning_rate": 1.009267327377492e-06, + "loss": 0.498271644115448, + "step": 1770 + }, + { + "epoch": 3.929046563192905, + "grad_norm": 0.9135385155677795, + "learning_rate": 1.008238778978716e-06, + "loss": 0.16285663843154907, + "step": 1772 + }, + { + "epoch": 3.933481152993348, + "grad_norm": 0.6705300807952881, + "learning_rate": 1.0072706617753528e-06, + "loss": 0.3432881236076355, + "step": 1774 + }, + { + "epoch": 3.9379157427937916, + "grad_norm": 1.4115146398544312, + "learning_rate": 1.0063629887922441e-06, + "loss": 0.47384950518608093, + "step": 1776 + }, + { + "epoch": 3.942350332594235, + "grad_norm": 0.19208544492721558, + "learning_rate": 1.0055157722410279e-06, + "loss": 0.06654756516218185, + "step": 1778 + }, + { + "epoch": 3.9467849223946785, + "grad_norm": 7.167059421539307, + "learning_rate": 1.0047290235199753e-06, + "loss": 0.3578657805919647, + "step": 1780 + }, + { + "epoch": 3.951219512195122, + "grad_norm": 0.15908405184745789, + "learning_rate": 1.0040027532138351e-06, + "loss": 0.32968664169311523, + "step": 1782 + }, + { + "epoch": 3.9556541019955653, + "grad_norm": 0.6541703939437866, + "learning_rate": 1.0033369710936928e-06, + "loss": 0.3075524568557739, + "step": 1784 + }, + { + "epoch": 3.9600886917960088, + "grad_norm": 1.4662445783615112, + "learning_rate": 1.0027316861168388e-06, + "loss": 0.4341813623905182, + "step": 1786 + }, + { + "epoch": 3.964523281596452, + "grad_norm": 0.8622516393661499, + "learning_rate": 1.0021869064266472e-06, + "loss": 0.25563016533851624, + "step": 1788 + }, + { + "epoch": 3.9689578713968956, + "grad_norm": 1.2049661874771118, + "learning_rate": 1.0017026393524684e-06, + "loss": 0.3270076811313629, + "step": 1790 + }, + { + "epoch": 3.9733924611973395, + "grad_norm": 0.9360433220863342, + "learning_rate": 1.0012788914095275e-06, + "loss": 0.38704222440719604, + "step": 1792 + }, + { + "epoch": 3.9778270509977824, + "grad_norm": 0.4730996787548065, + "learning_rate": 1.0009156682988395e-06, + "loss": 0.22841157019138336, + "step": 1794 + }, + { + "epoch": 3.9822616407982263, + "grad_norm": 1.477320671081543, + "learning_rate": 1.0006129749071298e-06, + "loss": 0.2755080759525299, + "step": 1796 + }, + { + "epoch": 3.9866962305986697, + "grad_norm": 1.052030324935913, + "learning_rate": 1.00037081530677e-06, + "loss": 0.3632432520389557, + "step": 1798 + }, + { + "epoch": 3.991130820399113, + "grad_norm": 2.82387113571167, + "learning_rate": 1.0001891927557255e-06, + "loss": 0.3730500340461731, + "step": 1800 + }, + { + "epoch": 3.9955654101995566, + "grad_norm": 1.2022005319595337, + "learning_rate": 1.0000681096975056e-06, + "loss": 0.4655001163482666, + "step": 1802 + }, + { + "epoch": 4.0, + "grad_norm": 1.2275028228759766, + "learning_rate": 1.0000075677611364e-06, + "loss": 0.3692970275878906, + "step": 1804 + }, + { + "epoch": 4.0, + "step": 1804, + "total_flos": 3.4175049861232067e+18, + "train_loss": 0.759928425279923, + "train_runtime": 7985.8505, + "train_samples_per_second": 6.777, + "train_steps_per_second": 0.226 + } + ], + "logging_steps": 2, + "max_steps": 1804, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 99999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.4175049861232067e+18, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +}