diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18537 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 5284, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001514004542013626, + "grad_norm": 4.028285980224609, + "learning_rate": 1.1320754716981132e-08, + "loss": 1.7578186988830566, + "step": 2 + }, + { + "epoch": 0.003028009084027252, + "grad_norm": 8.948590278625488, + "learning_rate": 3.39622641509434e-08, + "loss": 2.091785192489624, + "step": 4 + }, + { + "epoch": 0.004542013626040878, + "grad_norm": 44.11714553833008, + "learning_rate": 5.660377358490566e-08, + "loss": 2.222198247909546, + "step": 6 + }, + { + "epoch": 0.006056018168054504, + "grad_norm": 5.267591953277588, + "learning_rate": 7.924528301886792e-08, + "loss": 1.6964834928512573, + "step": 8 + }, + { + "epoch": 0.00757002271006813, + "grad_norm": 19.414339065551758, + "learning_rate": 1.0188679245283018e-07, + "loss": 1.853297472000122, + "step": 10 + }, + { + "epoch": 0.009084027252081756, + "grad_norm": 5.677351474761963, + "learning_rate": 1.2452830188679246e-07, + "loss": 1.484681248664856, + "step": 12 + }, + { + "epoch": 0.010598031794095382, + "grad_norm": 34.2765998840332, + "learning_rate": 1.4716981132075472e-07, + "loss": 1.9242310523986816, + "step": 14 + }, + { + "epoch": 0.012112036336109008, + "grad_norm": 4.927520275115967, + "learning_rate": 1.6981132075471698e-07, + "loss": 1.7784415483474731, + "step": 16 + }, + { + "epoch": 0.013626040878122634, + "grad_norm": 7.1213274002075195, + "learning_rate": 1.9245283018867924e-07, + "loss": 1.57778000831604, + "step": 18 + }, + { + "epoch": 0.01514004542013626, + "grad_norm": 2.761359691619873, + "learning_rate": 2.150943396226415e-07, + "loss": 1.8546152114868164, + "step": 20 + }, + { + "epoch": 0.016654049962149888, + "grad_norm": 2.7739017009735107, + "learning_rate": 2.3773584905660376e-07, + "loss": 1.8686250448226929, + "step": 22 + }, + { + "epoch": 0.018168054504163512, + "grad_norm": 4.913271427154541, + "learning_rate": 2.60377358490566e-07, + "loss": 1.6648024320602417, + "step": 24 + }, + { + "epoch": 0.01968205904617714, + "grad_norm": 19.3339786529541, + "learning_rate": 2.8301886792452833e-07, + "loss": 1.6994813680648804, + "step": 26 + }, + { + "epoch": 0.021196063588190765, + "grad_norm": 5.690567493438721, + "learning_rate": 3.056603773584906e-07, + "loss": 1.6049681901931763, + "step": 28 + }, + { + "epoch": 0.022710068130204392, + "grad_norm": 10.998921394348145, + "learning_rate": 3.2830188679245285e-07, + "loss": 1.8233933448791504, + "step": 30 + }, + { + "epoch": 0.024224072672218017, + "grad_norm": 4.003087520599365, + "learning_rate": 3.509433962264151e-07, + "loss": 1.212135910987854, + "step": 32 + }, + { + "epoch": 0.025738077214231644, + "grad_norm": 2.037114143371582, + "learning_rate": 3.7358490566037737e-07, + "loss": 1.8270328044891357, + "step": 34 + }, + { + "epoch": 0.02725208175624527, + "grad_norm": 1.9307130575180054, + "learning_rate": 3.9622641509433963e-07, + "loss": 1.8230198621749878, + "step": 36 + }, + { + "epoch": 0.028766086298258896, + "grad_norm": 16.358869552612305, + "learning_rate": 4.188679245283019e-07, + "loss": 1.7805675268173218, + "step": 38 + }, + { + "epoch": 0.03028009084027252, + "grad_norm": 2.5071308612823486, + "learning_rate": 4.4150943396226415e-07, + "loss": 1.8819431066513062, + "step": 40 + }, + { + "epoch": 0.03179409538228615, + "grad_norm": 2.3226277828216553, + "learning_rate": 4.641509433962264e-07, + "loss": 1.448043942451477, + "step": 42 + }, + { + "epoch": 0.033308099924299776, + "grad_norm": 3.53702449798584, + "learning_rate": 4.867924528301886e-07, + "loss": 1.4034466743469238, + "step": 44 + }, + { + "epoch": 0.0348221044663134, + "grad_norm": 3.172464609146118, + "learning_rate": 5.094339622641509e-07, + "loss": 0.826553463935852, + "step": 46 + }, + { + "epoch": 0.036336109008327025, + "grad_norm": 6.114887237548828, + "learning_rate": 5.320754716981131e-07, + "loss": 1.4066932201385498, + "step": 48 + }, + { + "epoch": 0.03785011355034065, + "grad_norm": 2.313415288925171, + "learning_rate": 5.547169811320755e-07, + "loss": 1.76339852809906, + "step": 50 + }, + { + "epoch": 0.03936411809235428, + "grad_norm": 13.030013084411621, + "learning_rate": 5.773584905660378e-07, + "loss": 1.01431405544281, + "step": 52 + }, + { + "epoch": 0.0408781226343679, + "grad_norm": 1.8231401443481445, + "learning_rate": 6.000000000000001e-07, + "loss": 0.833857536315918, + "step": 54 + }, + { + "epoch": 0.04239212717638153, + "grad_norm": 6.299084186553955, + "learning_rate": 6.226415094339623e-07, + "loss": 0.7415087819099426, + "step": 56 + }, + { + "epoch": 0.04390613171839516, + "grad_norm": 11.754128456115723, + "learning_rate": 6.452830188679246e-07, + "loss": 1.2531957626342773, + "step": 58 + }, + { + "epoch": 0.045420136260408785, + "grad_norm": 2.98806095123291, + "learning_rate": 6.679245283018868e-07, + "loss": 1.9698634147644043, + "step": 60 + }, + { + "epoch": 0.046934140802422405, + "grad_norm": 3.6988701820373535, + "learning_rate": 6.905660377358491e-07, + "loss": 1.33149254322052, + "step": 62 + }, + { + "epoch": 0.04844814534443603, + "grad_norm": 1.5481542348861694, + "learning_rate": 7.132075471698113e-07, + "loss": 1.633347988128662, + "step": 64 + }, + { + "epoch": 0.04996214988644966, + "grad_norm": 3.048712730407715, + "learning_rate": 7.358490566037736e-07, + "loss": 1.7247980833053589, + "step": 66 + }, + { + "epoch": 0.05147615442846329, + "grad_norm": 5.191999435424805, + "learning_rate": 7.584905660377358e-07, + "loss": 0.8415547013282776, + "step": 68 + }, + { + "epoch": 0.05299015897047691, + "grad_norm": 3.5893232822418213, + "learning_rate": 7.811320754716982e-07, + "loss": 0.8306036591529846, + "step": 70 + }, + { + "epoch": 0.05450416351249054, + "grad_norm": 2.8714966773986816, + "learning_rate": 8.037735849056604e-07, + "loss": 0.8097044229507446, + "step": 72 + }, + { + "epoch": 0.056018168054504165, + "grad_norm": 3.8042104244232178, + "learning_rate": 8.264150943396227e-07, + "loss": 0.790952742099762, + "step": 74 + }, + { + "epoch": 0.05753217259651779, + "grad_norm": 2.469749689102173, + "learning_rate": 8.490566037735849e-07, + "loss": 0.675955593585968, + "step": 76 + }, + { + "epoch": 0.059046177138531414, + "grad_norm": 2.3359975814819336, + "learning_rate": 8.716981132075472e-07, + "loss": 0.7343652844429016, + "step": 78 + }, + { + "epoch": 0.06056018168054504, + "grad_norm": 2.1326544284820557, + "learning_rate": 8.943396226415094e-07, + "loss": 1.0913138389587402, + "step": 80 + }, + { + "epoch": 0.06207418622255867, + "grad_norm": 2.8311381340026855, + "learning_rate": 9.169811320754717e-07, + "loss": 0.8511497378349304, + "step": 82 + }, + { + "epoch": 0.0635881907645723, + "grad_norm": 2.2718989849090576, + "learning_rate": 9.396226415094339e-07, + "loss": 1.2040461301803589, + "step": 84 + }, + { + "epoch": 0.06510219530658592, + "grad_norm": 1.683356761932373, + "learning_rate": 9.622641509433961e-07, + "loss": 1.0493688583374023, + "step": 86 + }, + { + "epoch": 0.06661619984859955, + "grad_norm": 7.712815284729004, + "learning_rate": 9.849056603773586e-07, + "loss": 0.6960101127624512, + "step": 88 + }, + { + "epoch": 0.06813020439061317, + "grad_norm": 3.5836563110351562, + "learning_rate": 1.0075471698113208e-06, + "loss": 1.0709052085876465, + "step": 90 + }, + { + "epoch": 0.0696442089326268, + "grad_norm": 1.968751072883606, + "learning_rate": 1.030188679245283e-06, + "loss": 0.9806250333786011, + "step": 92 + }, + { + "epoch": 0.07115821347464042, + "grad_norm": 2.416091203689575, + "learning_rate": 1.0528301886792452e-06, + "loss": 1.0578032732009888, + "step": 94 + }, + { + "epoch": 0.07267221801665405, + "grad_norm": 1.988963007926941, + "learning_rate": 1.0754716981132076e-06, + "loss": 0.7393626570701599, + "step": 96 + }, + { + "epoch": 0.07418622255866768, + "grad_norm": 3.124030351638794, + "learning_rate": 1.0981132075471698e-06, + "loss": 0.6623208522796631, + "step": 98 + }, + { + "epoch": 0.0757002271006813, + "grad_norm": 3.8476712703704834, + "learning_rate": 1.120754716981132e-06, + "loss": 1.1105657815933228, + "step": 100 + }, + { + "epoch": 0.07721423164269493, + "grad_norm": 1.9554381370544434, + "learning_rate": 1.1433962264150944e-06, + "loss": 1.4922415018081665, + "step": 102 + }, + { + "epoch": 0.07872823618470856, + "grad_norm": 2.661355972290039, + "learning_rate": 1.1660377358490566e-06, + "loss": 0.5279350280761719, + "step": 104 + }, + { + "epoch": 0.08024224072672217, + "grad_norm": 3.056816816329956, + "learning_rate": 1.188679245283019e-06, + "loss": 0.5749517679214478, + "step": 106 + }, + { + "epoch": 0.0817562452687358, + "grad_norm": 2.767310380935669, + "learning_rate": 1.2113207547169813e-06, + "loss": 0.94456946849823, + "step": 108 + }, + { + "epoch": 0.08327024981074943, + "grad_norm": 4.44754695892334, + "learning_rate": 1.2339622641509435e-06, + "loss": 1.0132474899291992, + "step": 110 + }, + { + "epoch": 0.08478425435276306, + "grad_norm": 2.7119500637054443, + "learning_rate": 1.2566037735849057e-06, + "loss": 0.9866548180580139, + "step": 112 + }, + { + "epoch": 0.08629825889477669, + "grad_norm": 4.809918403625488, + "learning_rate": 1.279245283018868e-06, + "loss": 0.5868025422096252, + "step": 114 + }, + { + "epoch": 0.08781226343679031, + "grad_norm": 2.0895748138427734, + "learning_rate": 1.3018867924528303e-06, + "loss": 1.142189383506775, + "step": 116 + }, + { + "epoch": 0.08932626797880394, + "grad_norm": 8.27457046508789, + "learning_rate": 1.3245283018867925e-06, + "loss": 0.8774496912956238, + "step": 118 + }, + { + "epoch": 0.09084027252081757, + "grad_norm": 2.5352494716644287, + "learning_rate": 1.3471698113207547e-06, + "loss": 0.8927730321884155, + "step": 120 + }, + { + "epoch": 0.09235427706283118, + "grad_norm": 1.6000349521636963, + "learning_rate": 1.3698113207547171e-06, + "loss": 0.6039988994598389, + "step": 122 + }, + { + "epoch": 0.09386828160484481, + "grad_norm": 9.85887336730957, + "learning_rate": 1.3924528301886793e-06, + "loss": 0.6106588840484619, + "step": 124 + }, + { + "epoch": 0.09538228614685844, + "grad_norm": 1.7373552322387695, + "learning_rate": 1.4150943396226415e-06, + "loss": 1.4290205240249634, + "step": 126 + }, + { + "epoch": 0.09689629068887207, + "grad_norm": 1.9764314889907837, + "learning_rate": 1.4377358490566038e-06, + "loss": 1.4135056734085083, + "step": 128 + }, + { + "epoch": 0.0984102952308857, + "grad_norm": 1.3218179941177368, + "learning_rate": 1.4603773584905662e-06, + "loss": 1.102647304534912, + "step": 130 + }, + { + "epoch": 0.09992429977289932, + "grad_norm": 2.5453078746795654, + "learning_rate": 1.4830188679245284e-06, + "loss": 0.8086697459220886, + "step": 132 + }, + { + "epoch": 0.10143830431491295, + "grad_norm": 3.7117722034454346, + "learning_rate": 1.5056603773584906e-06, + "loss": 0.758213996887207, + "step": 134 + }, + { + "epoch": 0.10295230885692658, + "grad_norm": 5.010377407073975, + "learning_rate": 1.5283018867924528e-06, + "loss": 0.872275710105896, + "step": 136 + }, + { + "epoch": 0.10446631339894019, + "grad_norm": 3.5814974308013916, + "learning_rate": 1.5509433962264152e-06, + "loss": 1.0034674406051636, + "step": 138 + }, + { + "epoch": 0.10598031794095382, + "grad_norm": 1.4908024072647095, + "learning_rate": 1.5735849056603774e-06, + "loss": 0.4415053725242615, + "step": 140 + }, + { + "epoch": 0.10749432248296745, + "grad_norm": 2.945338487625122, + "learning_rate": 1.5962264150943396e-06, + "loss": 1.0510526895523071, + "step": 142 + }, + { + "epoch": 0.10900832702498107, + "grad_norm": 1.622044563293457, + "learning_rate": 1.6188679245283018e-06, + "loss": 1.4364440441131592, + "step": 144 + }, + { + "epoch": 0.1105223315669947, + "grad_norm": 1.9746934175491333, + "learning_rate": 1.6415094339622643e-06, + "loss": 0.7608726024627686, + "step": 146 + }, + { + "epoch": 0.11203633610900833, + "grad_norm": 4.025785446166992, + "learning_rate": 1.6641509433962265e-06, + "loss": 0.8212873935699463, + "step": 148 + }, + { + "epoch": 0.11355034065102196, + "grad_norm": 2.139375925064087, + "learning_rate": 1.6867924528301887e-06, + "loss": 1.0184704065322876, + "step": 150 + }, + { + "epoch": 0.11506434519303559, + "grad_norm": 2.9564414024353027, + "learning_rate": 1.7094339622641509e-06, + "loss": 0.7412694096565247, + "step": 152 + }, + { + "epoch": 0.1165783497350492, + "grad_norm": 2.5149290561676025, + "learning_rate": 1.7320754716981133e-06, + "loss": 0.827344536781311, + "step": 154 + }, + { + "epoch": 0.11809235427706283, + "grad_norm": 5.308546543121338, + "learning_rate": 1.7547169811320755e-06, + "loss": 1.2992665767669678, + "step": 156 + }, + { + "epoch": 0.11960635881907646, + "grad_norm": 1.8298921585083008, + "learning_rate": 1.7773584905660377e-06, + "loss": 1.3940523862838745, + "step": 158 + }, + { + "epoch": 0.12112036336109008, + "grad_norm": 2.9986073970794678, + "learning_rate": 1.8e-06, + "loss": 1.0409047603607178, + "step": 160 + }, + { + "epoch": 0.12263436790310371, + "grad_norm": 1.6920502185821533, + "learning_rate": 1.8226415094339623e-06, + "loss": 1.1713602542877197, + "step": 162 + }, + { + "epoch": 0.12414837244511734, + "grad_norm": 3.7834837436676025, + "learning_rate": 1.8452830188679245e-06, + "loss": 0.5241885781288147, + "step": 164 + }, + { + "epoch": 0.12566237698713095, + "grad_norm": 1.7689934968948364, + "learning_rate": 1.8679245283018868e-06, + "loss": 0.9450485110282898, + "step": 166 + }, + { + "epoch": 0.1271763815291446, + "grad_norm": 2.874657154083252, + "learning_rate": 1.890566037735849e-06, + "loss": 0.5266307592391968, + "step": 168 + }, + { + "epoch": 0.1286903860711582, + "grad_norm": 1.9352918863296509, + "learning_rate": 1.913207547169811e-06, + "loss": 1.362914800643921, + "step": 170 + }, + { + "epoch": 0.13020439061317185, + "grad_norm": 1.8365625143051147, + "learning_rate": 1.9358490566037734e-06, + "loss": 0.5974559187889099, + "step": 172 + }, + { + "epoch": 0.13171839515518546, + "grad_norm": 1.4278931617736816, + "learning_rate": 1.958490566037736e-06, + "loss": 1.2225620746612549, + "step": 174 + }, + { + "epoch": 0.1332323996971991, + "grad_norm": 1.5506442785263062, + "learning_rate": 1.981132075471698e-06, + "loss": 1.4445343017578125, + "step": 176 + }, + { + "epoch": 0.13474640423921272, + "grad_norm": 4.27222204208374, + "learning_rate": 2.0037735849056604e-06, + "loss": 0.7266371846199036, + "step": 178 + }, + { + "epoch": 0.13626040878122633, + "grad_norm": 3.9819698333740234, + "learning_rate": 2.0264150943396226e-06, + "loss": 0.5341688990592957, + "step": 180 + }, + { + "epoch": 0.13777441332323997, + "grad_norm": 2.0499777793884277, + "learning_rate": 2.049056603773585e-06, + "loss": 1.3794103860855103, + "step": 182 + }, + { + "epoch": 0.1392884178652536, + "grad_norm": 2.1297922134399414, + "learning_rate": 2.071698113207547e-06, + "loss": 0.6234973073005676, + "step": 184 + }, + { + "epoch": 0.14080242240726723, + "grad_norm": 3.5820069313049316, + "learning_rate": 2.0943396226415092e-06, + "loss": 0.8500635623931885, + "step": 186 + }, + { + "epoch": 0.14231642694928084, + "grad_norm": 1.7805827856063843, + "learning_rate": 2.1169811320754715e-06, + "loss": 1.1354131698608398, + "step": 188 + }, + { + "epoch": 0.14383043149129449, + "grad_norm": 3.0115771293640137, + "learning_rate": 2.139622641509434e-06, + "loss": 0.5908098220825195, + "step": 190 + }, + { + "epoch": 0.1453444360333081, + "grad_norm": 1.7477163076400757, + "learning_rate": 2.1622641509433963e-06, + "loss": 0.9433279633522034, + "step": 192 + }, + { + "epoch": 0.1468584405753217, + "grad_norm": 3.3435184955596924, + "learning_rate": 2.1849056603773585e-06, + "loss": 0.5525610446929932, + "step": 194 + }, + { + "epoch": 0.14837244511733536, + "grad_norm": 2.3403079509735107, + "learning_rate": 2.2075471698113207e-06, + "loss": 1.1096783876419067, + "step": 196 + }, + { + "epoch": 0.14988644965934897, + "grad_norm": 14.39986515045166, + "learning_rate": 2.230188679245283e-06, + "loss": 0.9414764642715454, + "step": 198 + }, + { + "epoch": 0.1514004542013626, + "grad_norm": 3.031162738800049, + "learning_rate": 2.2528301886792455e-06, + "loss": 0.8100090622901917, + "step": 200 + }, + { + "epoch": 0.15291445874337622, + "grad_norm": 1.9326648712158203, + "learning_rate": 2.2754716981132078e-06, + "loss": 0.9946622848510742, + "step": 202 + }, + { + "epoch": 0.15442846328538987, + "grad_norm": 3.8317861557006836, + "learning_rate": 2.29811320754717e-06, + "loss": 0.9192565083503723, + "step": 204 + }, + { + "epoch": 0.15594246782740348, + "grad_norm": 3.980340003967285, + "learning_rate": 2.320754716981132e-06, + "loss": 0.9914010763168335, + "step": 206 + }, + { + "epoch": 0.15745647236941712, + "grad_norm": 5.659903049468994, + "learning_rate": 2.3433962264150944e-06, + "loss": 0.638492226600647, + "step": 208 + }, + { + "epoch": 0.15897047691143074, + "grad_norm": 2.102769613265991, + "learning_rate": 2.366037735849057e-06, + "loss": 0.8544633388519287, + "step": 210 + }, + { + "epoch": 0.16048448145344435, + "grad_norm": 1.4139082431793213, + "learning_rate": 2.388679245283019e-06, + "loss": 0.7294167876243591, + "step": 212 + }, + { + "epoch": 0.161998485995458, + "grad_norm": 4.672745704650879, + "learning_rate": 2.4113207547169814e-06, + "loss": 1.1625778675079346, + "step": 214 + }, + { + "epoch": 0.1635124905374716, + "grad_norm": 1.5398153066635132, + "learning_rate": 2.4339622641509436e-06, + "loss": 0.5668327808380127, + "step": 216 + }, + { + "epoch": 0.16502649507948525, + "grad_norm": 3.935654640197754, + "learning_rate": 2.456603773584906e-06, + "loss": 0.6235625743865967, + "step": 218 + }, + { + "epoch": 0.16654049962149886, + "grad_norm": 6.329226016998291, + "learning_rate": 2.479245283018868e-06, + "loss": 1.387952208518982, + "step": 220 + }, + { + "epoch": 0.1680545041635125, + "grad_norm": 2.0209875106811523, + "learning_rate": 2.5018867924528302e-06, + "loss": 0.9451707601547241, + "step": 222 + }, + { + "epoch": 0.16956850870552612, + "grad_norm": 1.4571036100387573, + "learning_rate": 2.5245283018867925e-06, + "loss": 0.5464656352996826, + "step": 224 + }, + { + "epoch": 0.17108251324753973, + "grad_norm": 2.3737196922302246, + "learning_rate": 2.547169811320755e-06, + "loss": 1.0254173278808594, + "step": 226 + }, + { + "epoch": 0.17259651778955337, + "grad_norm": 1.1943808794021606, + "learning_rate": 2.5698113207547173e-06, + "loss": 0.9058595895767212, + "step": 228 + }, + { + "epoch": 0.17411052233156699, + "grad_norm": 3.953221559524536, + "learning_rate": 2.5924528301886795e-06, + "loss": 0.5741055607795715, + "step": 230 + }, + { + "epoch": 0.17562452687358063, + "grad_norm": 3.6732680797576904, + "learning_rate": 2.6150943396226417e-06, + "loss": 0.9347317218780518, + "step": 232 + }, + { + "epoch": 0.17713853141559424, + "grad_norm": 1.8431740999221802, + "learning_rate": 2.637735849056604e-06, + "loss": 0.5431374907493591, + "step": 234 + }, + { + "epoch": 0.17865253595760788, + "grad_norm": 2.2940425872802734, + "learning_rate": 2.660377358490566e-06, + "loss": 0.42906224727630615, + "step": 236 + }, + { + "epoch": 0.1801665404996215, + "grad_norm": 4.07905387878418, + "learning_rate": 2.6830188679245283e-06, + "loss": 0.6392327547073364, + "step": 238 + }, + { + "epoch": 0.18168054504163514, + "grad_norm": 1.1140233278274536, + "learning_rate": 2.7056603773584905e-06, + "loss": 0.47808319330215454, + "step": 240 + }, + { + "epoch": 0.18319454958364875, + "grad_norm": 2.902217388153076, + "learning_rate": 2.728301886792453e-06, + "loss": 0.8900514841079712, + "step": 242 + }, + { + "epoch": 0.18470855412566237, + "grad_norm": 3.2045326232910156, + "learning_rate": 2.7509433962264154e-06, + "loss": 0.530337393283844, + "step": 244 + }, + { + "epoch": 0.186222558667676, + "grad_norm": 1.6070469617843628, + "learning_rate": 2.7735849056603776e-06, + "loss": 1.1014389991760254, + "step": 246 + }, + { + "epoch": 0.18773656320968962, + "grad_norm": 2.2355546951293945, + "learning_rate": 2.7962264150943398e-06, + "loss": 0.7248896956443787, + "step": 248 + }, + { + "epoch": 0.18925056775170326, + "grad_norm": 2.490844964981079, + "learning_rate": 2.818867924528302e-06, + "loss": 1.0347926616668701, + "step": 250 + }, + { + "epoch": 0.19076457229371688, + "grad_norm": 2.5905075073242188, + "learning_rate": 2.841509433962264e-06, + "loss": 0.9506483674049377, + "step": 252 + }, + { + "epoch": 0.19227857683573052, + "grad_norm": 1.4147306680679321, + "learning_rate": 2.8641509433962264e-06, + "loss": 1.185529351234436, + "step": 254 + }, + { + "epoch": 0.19379258137774413, + "grad_norm": 1.7973873615264893, + "learning_rate": 2.8867924528301886e-06, + "loss": 1.3266160488128662, + "step": 256 + }, + { + "epoch": 0.19530658591975775, + "grad_norm": 1.9541969299316406, + "learning_rate": 2.9094339622641512e-06, + "loss": 1.3120050430297852, + "step": 258 + }, + { + "epoch": 0.1968205904617714, + "grad_norm": 1.5864051580429077, + "learning_rate": 2.9320754716981135e-06, + "loss": 0.8817747831344604, + "step": 260 + }, + { + "epoch": 0.198334595003785, + "grad_norm": 1.9231981039047241, + "learning_rate": 2.9547169811320757e-06, + "loss": 0.5132631659507751, + "step": 262 + }, + { + "epoch": 0.19984859954579864, + "grad_norm": 2.6671013832092285, + "learning_rate": 2.977358490566038e-06, + "loss": 1.3915808200836182, + "step": 264 + }, + { + "epoch": 0.20136260408781226, + "grad_norm": 3.2218801975250244, + "learning_rate": 3e-06, + "loss": 0.538420557975769, + "step": 266 + }, + { + "epoch": 0.2028766086298259, + "grad_norm": 2.265723705291748, + "learning_rate": 2.9999989421378913e-06, + "loss": 1.1453967094421387, + "step": 268 + }, + { + "epoch": 0.2043906131718395, + "grad_norm": 2.8631465435028076, + "learning_rate": 2.999995768553224e-06, + "loss": 0.8248827457427979, + "step": 270 + }, + { + "epoch": 0.20590461771385316, + "grad_norm": 1.6934646368026733, + "learning_rate": 2.9999904792509703e-06, + "loss": 1.3081867694854736, + "step": 272 + }, + { + "epoch": 0.20741862225586677, + "grad_norm": 1.8252122402191162, + "learning_rate": 2.999983074239421e-06, + "loss": 0.9383316040039062, + "step": 274 + }, + { + "epoch": 0.20893262679788038, + "grad_norm": 3.5256311893463135, + "learning_rate": 2.99997355353018e-06, + "loss": 0.984582781791687, + "step": 276 + }, + { + "epoch": 0.21044663133989402, + "grad_norm": 2.678860902786255, + "learning_rate": 2.9999619171381696e-06, + "loss": 0.9605339169502258, + "step": 278 + }, + { + "epoch": 0.21196063588190764, + "grad_norm": 1.9203742742538452, + "learning_rate": 2.9999481650816256e-06, + "loss": 0.9731888175010681, + "step": 280 + }, + { + "epoch": 0.21347464042392128, + "grad_norm": 2.9102401733398438, + "learning_rate": 2.9999322973821e-06, + "loss": 1.0470885038375854, + "step": 282 + }, + { + "epoch": 0.2149886449659349, + "grad_norm": 4.546679973602295, + "learning_rate": 2.9999143140644616e-06, + "loss": 0.6083078980445862, + "step": 284 + }, + { + "epoch": 0.21650264950794854, + "grad_norm": 8.816315650939941, + "learning_rate": 2.9998942151568927e-06, + "loss": 0.717522382736206, + "step": 286 + }, + { + "epoch": 0.21801665404996215, + "grad_norm": 2.417719841003418, + "learning_rate": 2.9998720006908934e-06, + "loss": 0.8954467177391052, + "step": 288 + }, + { + "epoch": 0.21953065859197576, + "grad_norm": 1.2781686782836914, + "learning_rate": 2.9998476707012776e-06, + "loss": 0.7305607199668884, + "step": 290 + }, + { + "epoch": 0.2210446631339894, + "grad_norm": 1.3508599996566772, + "learning_rate": 2.999821225226176e-06, + "loss": 0.9659827351570129, + "step": 292 + }, + { + "epoch": 0.22255866767600302, + "grad_norm": 5.050702095031738, + "learning_rate": 2.9997926643070335e-06, + "loss": 0.8743492960929871, + "step": 294 + }, + { + "epoch": 0.22407267221801666, + "grad_norm": 1.8615310192108154, + "learning_rate": 2.999761987988611e-06, + "loss": 0.5038611888885498, + "step": 296 + }, + { + "epoch": 0.22558667676003027, + "grad_norm": 3.819754123687744, + "learning_rate": 2.9997291963189844e-06, + "loss": 0.7133132219314575, + "step": 298 + }, + { + "epoch": 0.22710068130204392, + "grad_norm": 1.5813655853271484, + "learning_rate": 2.9996942893495453e-06, + "loss": 1.2508494853973389, + "step": 300 + }, + { + "epoch": 0.22861468584405753, + "grad_norm": 2.666712999343872, + "learning_rate": 2.999657267135e-06, + "loss": 0.9611169099807739, + "step": 302 + }, + { + "epoch": 0.23012869038607117, + "grad_norm": 1.817322850227356, + "learning_rate": 2.9996181297333692e-06, + "loss": 0.978617250919342, + "step": 304 + }, + { + "epoch": 0.23164269492808479, + "grad_norm": 1.4741506576538086, + "learning_rate": 2.99957687720599e-06, + "loss": 1.2954121828079224, + "step": 306 + }, + { + "epoch": 0.2331566994700984, + "grad_norm": 9.097981452941895, + "learning_rate": 2.999533509617513e-06, + "loss": 0.6156519055366516, + "step": 308 + }, + { + "epoch": 0.23467070401211204, + "grad_norm": 2.0447330474853516, + "learning_rate": 2.9994880270359037e-06, + "loss": 0.9546732306480408, + "step": 310 + }, + { + "epoch": 0.23618470855412566, + "grad_norm": 3.228562593460083, + "learning_rate": 2.999440429532443e-06, + "loss": 0.6118265986442566, + "step": 312 + }, + { + "epoch": 0.2376987130961393, + "grad_norm": 3.7214834690093994, + "learning_rate": 2.9993907171817255e-06, + "loss": 0.5688620805740356, + "step": 314 + }, + { + "epoch": 0.2392127176381529, + "grad_norm": 7.145568370819092, + "learning_rate": 2.9993388900616615e-06, + "loss": 0.8227903842926025, + "step": 316 + }, + { + "epoch": 0.24072672218016655, + "grad_norm": 1.5378928184509277, + "learning_rate": 2.9992849482534736e-06, + "loss": 0.8674030900001526, + "step": 318 + }, + { + "epoch": 0.24224072672218017, + "grad_norm": 9.813234329223633, + "learning_rate": 2.9992288918417003e-06, + "loss": 0.9824652075767517, + "step": 320 + }, + { + "epoch": 0.24375473126419378, + "grad_norm": 2.0780982971191406, + "learning_rate": 2.999170720914192e-06, + "loss": 1.2990097999572754, + "step": 322 + }, + { + "epoch": 0.24526873580620742, + "grad_norm": 12.514634132385254, + "learning_rate": 2.999110435562116e-06, + "loss": 0.4773423671722412, + "step": 324 + }, + { + "epoch": 0.24678274034822104, + "grad_norm": 1.4305341243743896, + "learning_rate": 2.999048035879951e-06, + "loss": 0.8895021677017212, + "step": 326 + }, + { + "epoch": 0.24829674489023468, + "grad_norm": 3.919238805770874, + "learning_rate": 2.9989835219654903e-06, + "loss": 1.0978517532348633, + "step": 328 + }, + { + "epoch": 0.2498107494322483, + "grad_norm": 1.8205312490463257, + "learning_rate": 2.99891689391984e-06, + "loss": 1.3268001079559326, + "step": 330 + }, + { + "epoch": 0.2513247539742619, + "grad_norm": 3.3887805938720703, + "learning_rate": 2.99884815184742e-06, + "loss": 0.5721341967582703, + "step": 332 + }, + { + "epoch": 0.2528387585162756, + "grad_norm": 14.343548774719238, + "learning_rate": 2.9987772958559627e-06, + "loss": 0.6671388149261475, + "step": 334 + }, + { + "epoch": 0.2543527630582892, + "grad_norm": 1.891146183013916, + "learning_rate": 2.998704326056514e-06, + "loss": 1.3301867246627808, + "step": 336 + }, + { + "epoch": 0.2558667676003028, + "grad_norm": 1.5941251516342163, + "learning_rate": 2.998629242563432e-06, + "loss": 0.3873002529144287, + "step": 338 + }, + { + "epoch": 0.2573807721423164, + "grad_norm": 1.8552782535552979, + "learning_rate": 2.998552045494389e-06, + "loss": 0.4803125858306885, + "step": 340 + }, + { + "epoch": 0.25889477668433003, + "grad_norm": 3.4722740650177, + "learning_rate": 2.998472734970367e-06, + "loss": 0.512718677520752, + "step": 342 + }, + { + "epoch": 0.2604087812263437, + "grad_norm": 5.149838447570801, + "learning_rate": 2.998391311115663e-06, + "loss": 0.7515890598297119, + "step": 344 + }, + { + "epoch": 0.2619227857683573, + "grad_norm": 1.195176124572754, + "learning_rate": 2.9983077740578835e-06, + "loss": 0.49589306116104126, + "step": 346 + }, + { + "epoch": 0.2634367903103709, + "grad_norm": 2.7547714710235596, + "learning_rate": 2.998222123927949e-06, + "loss": 0.5078981518745422, + "step": 348 + }, + { + "epoch": 0.26495079485238454, + "grad_norm": 2.2462663650512695, + "learning_rate": 2.9981343608600907e-06, + "loss": 0.5312693119049072, + "step": 350 + }, + { + "epoch": 0.2664647993943982, + "grad_norm": 2.0346577167510986, + "learning_rate": 2.99804448499185e-06, + "loss": 1.0556764602661133, + "step": 352 + }, + { + "epoch": 0.2679788039364118, + "grad_norm": 2.053784132003784, + "learning_rate": 2.9979524964640826e-06, + "loss": 0.6005126237869263, + "step": 354 + }, + { + "epoch": 0.26949280847842544, + "grad_norm": 8.271651268005371, + "learning_rate": 2.997858395420951e-06, + "loss": 0.7354615330696106, + "step": 356 + }, + { + "epoch": 0.27100681302043905, + "grad_norm": 1.7472703456878662, + "learning_rate": 2.9977621820099316e-06, + "loss": 1.216623306274414, + "step": 358 + }, + { + "epoch": 0.27252081756245267, + "grad_norm": 2.6638777256011963, + "learning_rate": 2.997663856381811e-06, + "loss": 0.4220973551273346, + "step": 360 + }, + { + "epoch": 0.27403482210446634, + "grad_norm": 1.9354782104492188, + "learning_rate": 2.997563418690685e-06, + "loss": 0.9713519215583801, + "step": 362 + }, + { + "epoch": 0.27554882664647995, + "grad_norm": 1.0535366535186768, + "learning_rate": 2.997460869093959e-06, + "loss": 0.29745492339134216, + "step": 364 + }, + { + "epoch": 0.27706283118849356, + "grad_norm": 2.887793779373169, + "learning_rate": 2.9973562077523503e-06, + "loss": 0.6364420056343079, + "step": 366 + }, + { + "epoch": 0.2785768357305072, + "grad_norm": 1.7000067234039307, + "learning_rate": 2.9972494348298837e-06, + "loss": 0.8678056597709656, + "step": 368 + }, + { + "epoch": 0.2800908402725208, + "grad_norm": 3.3849411010742188, + "learning_rate": 2.9971405504938943e-06, + "loss": 0.46991249918937683, + "step": 370 + }, + { + "epoch": 0.28160484481453446, + "grad_norm": 1.9020545482635498, + "learning_rate": 2.9970295549150265e-06, + "loss": 0.9194963574409485, + "step": 372 + }, + { + "epoch": 0.2831188493565481, + "grad_norm": 1.3911938667297363, + "learning_rate": 2.996916448267232e-06, + "loss": 0.9554241895675659, + "step": 374 + }, + { + "epoch": 0.2846328538985617, + "grad_norm": 1.1377285718917847, + "learning_rate": 2.9968012307277723e-06, + "loss": 0.48879367113113403, + "step": 376 + }, + { + "epoch": 0.2861468584405753, + "grad_norm": 1.421636939048767, + "learning_rate": 2.9966839024772165e-06, + "loss": 0.8501567840576172, + "step": 378 + }, + { + "epoch": 0.28766086298258897, + "grad_norm": 1.6783747673034668, + "learning_rate": 2.9965644636994427e-06, + "loss": 1.0313618183135986, + "step": 380 + }, + { + "epoch": 0.2891748675246026, + "grad_norm": 2.0127832889556885, + "learning_rate": 2.996442914581634e-06, + "loss": 0.5957201719284058, + "step": 382 + }, + { + "epoch": 0.2906888720666162, + "grad_norm": 2.471005439758301, + "learning_rate": 2.996319255314284e-06, + "loss": 0.5133829116821289, + "step": 384 + }, + { + "epoch": 0.2922028766086298, + "grad_norm": 1.2407896518707275, + "learning_rate": 2.996193486091192e-06, + "loss": 1.2660177946090698, + "step": 386 + }, + { + "epoch": 0.2937168811506434, + "grad_norm": 1.4875528812408447, + "learning_rate": 2.9960656071094628e-06, + "loss": 0.9659662842750549, + "step": 388 + }, + { + "epoch": 0.2952308856926571, + "grad_norm": 1.935443639755249, + "learning_rate": 2.9959356185695096e-06, + "loss": 1.1267831325531006, + "step": 390 + }, + { + "epoch": 0.2967448902346707, + "grad_norm": 0.8851851224899292, + "learning_rate": 2.9958035206750504e-06, + "loss": 1.1239861249923706, + "step": 392 + }, + { + "epoch": 0.2982588947766843, + "grad_norm": 2.0656418800354004, + "learning_rate": 2.9956693136331096e-06, + "loss": 0.9480706453323364, + "step": 394 + }, + { + "epoch": 0.29977289931869794, + "grad_norm": 2.263044595718384, + "learning_rate": 2.995532997654017e-06, + "loss": 0.9983291625976562, + "step": 396 + }, + { + "epoch": 0.3012869038607116, + "grad_norm": 1.9356629848480225, + "learning_rate": 2.9953945729514073e-06, + "loss": 0.5419582724571228, + "step": 398 + }, + { + "epoch": 0.3028009084027252, + "grad_norm": 3.4083566665649414, + "learning_rate": 2.99525403974222e-06, + "loss": 0.8274163007736206, + "step": 400 + }, + { + "epoch": 0.30431491294473884, + "grad_norm": 0.945384681224823, + "learning_rate": 2.9951113982466996e-06, + "loss": 0.5026131272315979, + "step": 402 + }, + { + "epoch": 0.30582891748675245, + "grad_norm": 3.428576946258545, + "learning_rate": 2.9949666486883937e-06, + "loss": 0.9896385073661804, + "step": 404 + }, + { + "epoch": 0.30734292202876606, + "grad_norm": 1.581592082977295, + "learning_rate": 2.9948197912941546e-06, + "loss": 0.2815803587436676, + "step": 406 + }, + { + "epoch": 0.30885692657077973, + "grad_norm": 3.294218063354492, + "learning_rate": 2.994670826294138e-06, + "loss": 1.5582009553909302, + "step": 408 + }, + { + "epoch": 0.31037093111279335, + "grad_norm": 7.936295509338379, + "learning_rate": 2.9945197539218017e-06, + "loss": 0.653652012348175, + "step": 410 + }, + { + "epoch": 0.31188493565480696, + "grad_norm": 1.7961171865463257, + "learning_rate": 2.9943665744139075e-06, + "loss": 1.3276002407073975, + "step": 412 + }, + { + "epoch": 0.3133989401968206, + "grad_norm": 1.9038255214691162, + "learning_rate": 2.9942112880105175e-06, + "loss": 0.9262287616729736, + "step": 414 + }, + { + "epoch": 0.31491294473883424, + "grad_norm": 2.037210464477539, + "learning_rate": 2.9940538949549984e-06, + "loss": 0.8984797596931458, + "step": 416 + }, + { + "epoch": 0.31642694928084786, + "grad_norm": 2.1120893955230713, + "learning_rate": 2.9938943954940167e-06, + "loss": 0.927636444568634, + "step": 418 + }, + { + "epoch": 0.31794095382286147, + "grad_norm": 1.608696460723877, + "learning_rate": 2.9937327898775406e-06, + "loss": 0.5792948603630066, + "step": 420 + }, + { + "epoch": 0.3194549583648751, + "grad_norm": 4.962751388549805, + "learning_rate": 2.9935690783588378e-06, + "loss": 0.4227072298526764, + "step": 422 + }, + { + "epoch": 0.3209689629068887, + "grad_norm": 2.3434197902679443, + "learning_rate": 2.993403261194479e-06, + "loss": 1.243710994720459, + "step": 424 + }, + { + "epoch": 0.32248296744890237, + "grad_norm": 1.5609174966812134, + "learning_rate": 2.9932353386443325e-06, + "loss": 0.7371015548706055, + "step": 426 + }, + { + "epoch": 0.323996971990916, + "grad_norm": 2.0090510845184326, + "learning_rate": 2.993065310971568e-06, + "loss": 1.1069867610931396, + "step": 428 + }, + { + "epoch": 0.3255109765329296, + "grad_norm": 1.6548137664794922, + "learning_rate": 2.992893178442652e-06, + "loss": 1.288056492805481, + "step": 430 + }, + { + "epoch": 0.3270249810749432, + "grad_norm": 3.1023433208465576, + "learning_rate": 2.9927189413273517e-06, + "loss": 0.5962305665016174, + "step": 432 + }, + { + "epoch": 0.3285389856169568, + "grad_norm": 2.1617929935455322, + "learning_rate": 2.9925425998987326e-06, + "loss": 0.5677591562271118, + "step": 434 + }, + { + "epoch": 0.3300529901589705, + "grad_norm": 2.704984426498413, + "learning_rate": 2.992364154433157e-06, + "loss": 0.916948676109314, + "step": 436 + }, + { + "epoch": 0.3315669947009841, + "grad_norm": 17.37657356262207, + "learning_rate": 2.9921836052102853e-06, + "loss": 1.235106110572815, + "step": 438 + }, + { + "epoch": 0.3330809992429977, + "grad_norm": 1.302228331565857, + "learning_rate": 2.992000952513075e-06, + "loss": 0.512943685054779, + "step": 440 + }, + { + "epoch": 0.33459500378501134, + "grad_norm": 1.9216958284378052, + "learning_rate": 2.991816196627779e-06, + "loss": 0.5267124772071838, + "step": 442 + }, + { + "epoch": 0.336109008327025, + "grad_norm": 2.1327264308929443, + "learning_rate": 2.991629337843949e-06, + "loss": 0.5403533577919006, + "step": 444 + }, + { + "epoch": 0.3376230128690386, + "grad_norm": 1.376194953918457, + "learning_rate": 2.9914403764544296e-06, + "loss": 1.2272928953170776, + "step": 446 + }, + { + "epoch": 0.33913701741105223, + "grad_norm": 1.4925450086593628, + "learning_rate": 2.991249312755362e-06, + "loss": 0.9830273389816284, + "step": 448 + }, + { + "epoch": 0.34065102195306585, + "grad_norm": 1.5139658451080322, + "learning_rate": 2.991056147046181e-06, + "loss": 0.5266055464744568, + "step": 450 + }, + { + "epoch": 0.34216502649507946, + "grad_norm": 4.077967643737793, + "learning_rate": 2.990860879629618e-06, + "loss": 1.2411589622497559, + "step": 452 + }, + { + "epoch": 0.34367903103709313, + "grad_norm": 1.9039182662963867, + "learning_rate": 2.990663510811695e-06, + "loss": 1.0279943943023682, + "step": 454 + }, + { + "epoch": 0.34519303557910674, + "grad_norm": 5.866885185241699, + "learning_rate": 2.9904640409017305e-06, + "loss": 0.8960055112838745, + "step": 456 + }, + { + "epoch": 0.34670704012112036, + "grad_norm": 1.1375224590301514, + "learning_rate": 2.9902624702123334e-06, + "loss": 0.5561424493789673, + "step": 458 + }, + { + "epoch": 0.34822104466313397, + "grad_norm": 2.7199909687042236, + "learning_rate": 2.9900587990594068e-06, + "loss": 0.7852529883384705, + "step": 460 + }, + { + "epoch": 0.34973504920514764, + "grad_norm": 2.472648859024048, + "learning_rate": 2.989853027762144e-06, + "loss": 0.9063277244567871, + "step": 462 + }, + { + "epoch": 0.35124905374716126, + "grad_norm": 12.205218315124512, + "learning_rate": 2.989645156643031e-06, + "loss": 1.0444289445877075, + "step": 464 + }, + { + "epoch": 0.35276305828917487, + "grad_norm": 1.9877824783325195, + "learning_rate": 2.9894351860278433e-06, + "loss": 0.3915345370769501, + "step": 466 + }, + { + "epoch": 0.3542770628311885, + "grad_norm": 2.004340648651123, + "learning_rate": 2.989223116245648e-06, + "loss": 0.5365378856658936, + "step": 468 + }, + { + "epoch": 0.3557910673732021, + "grad_norm": 2.2721107006073, + "learning_rate": 2.9890089476288017e-06, + "loss": 0.8007775545120239, + "step": 470 + }, + { + "epoch": 0.35730507191521577, + "grad_norm": 1.6089833974838257, + "learning_rate": 2.98879268051295e-06, + "loss": 0.8194999694824219, + "step": 472 + }, + { + "epoch": 0.3588190764572294, + "grad_norm": 9.460573196411133, + "learning_rate": 2.9885743152370267e-06, + "loss": 0.4735422134399414, + "step": 474 + }, + { + "epoch": 0.360333080999243, + "grad_norm": 2.6137776374816895, + "learning_rate": 2.9883538521432557e-06, + "loss": 0.9495224952697754, + "step": 476 + }, + { + "epoch": 0.3618470855412566, + "grad_norm": 10.498940467834473, + "learning_rate": 2.988131291577147e-06, + "loss": 0.866862416267395, + "step": 478 + }, + { + "epoch": 0.3633610900832703, + "grad_norm": 2.243198871612549, + "learning_rate": 2.9879066338874974e-06, + "loss": 0.9765901565551758, + "step": 480 + }, + { + "epoch": 0.3648750946252839, + "grad_norm": 3.3587148189544678, + "learning_rate": 2.9876798794263923e-06, + "loss": 0.4574289619922638, + "step": 482 + }, + { + "epoch": 0.3663890991672975, + "grad_norm": 1.6754565238952637, + "learning_rate": 2.9874510285492013e-06, + "loss": 1.2471226453781128, + "step": 484 + }, + { + "epoch": 0.3679031037093111, + "grad_norm": 1.4370969533920288, + "learning_rate": 2.9872200816145807e-06, + "loss": 1.3035950660705566, + "step": 486 + }, + { + "epoch": 0.36941710825132473, + "grad_norm": 3.8013172149658203, + "learning_rate": 2.986987038984471e-06, + "loss": 0.6177915334701538, + "step": 488 + }, + { + "epoch": 0.3709311127933384, + "grad_norm": 2.8266143798828125, + "learning_rate": 2.9867519010240975e-06, + "loss": 0.5951381921768188, + "step": 490 + }, + { + "epoch": 0.372445117335352, + "grad_norm": 3.7607829570770264, + "learning_rate": 2.9865146681019688e-06, + "loss": 1.2834447622299194, + "step": 492 + }, + { + "epoch": 0.37395912187736563, + "grad_norm": 2.4031784534454346, + "learning_rate": 2.986275340589877e-06, + "loss": 0.75788414478302, + "step": 494 + }, + { + "epoch": 0.37547312641937924, + "grad_norm": 2.1432557106018066, + "learning_rate": 2.9860339188628978e-06, + "loss": 1.14529550075531, + "step": 496 + }, + { + "epoch": 0.37698713096139286, + "grad_norm": 2.7245068550109863, + "learning_rate": 2.985790403299387e-06, + "loss": 0.7696921825408936, + "step": 498 + }, + { + "epoch": 0.3785011355034065, + "grad_norm": 2.435703754425049, + "learning_rate": 2.9855447942809834e-06, + "loss": 0.8636212348937988, + "step": 500 + }, + { + "epoch": 0.38001514004542014, + "grad_norm": 6.706935882568359, + "learning_rate": 2.985297092192606e-06, + "loss": 0.9131227731704712, + "step": 502 + }, + { + "epoch": 0.38152914458743376, + "grad_norm": 2.21355938911438, + "learning_rate": 2.9850472974224557e-06, + "loss": 1.3491765260696411, + "step": 504 + }, + { + "epoch": 0.38304314912944737, + "grad_norm": 0.9106066823005676, + "learning_rate": 2.9847954103620095e-06, + "loss": 0.8370890617370605, + "step": 506 + }, + { + "epoch": 0.38455715367146104, + "grad_norm": 9.963761329650879, + "learning_rate": 2.9845414314060265e-06, + "loss": 0.6827934384346008, + "step": 508 + }, + { + "epoch": 0.38607115821347465, + "grad_norm": 6.23288106918335, + "learning_rate": 2.984285360952544e-06, + "loss": 0.5571025609970093, + "step": 510 + }, + { + "epoch": 0.38758516275548827, + "grad_norm": 2.5447981357574463, + "learning_rate": 2.9840271994028754e-06, + "loss": 0.4963566064834595, + "step": 512 + }, + { + "epoch": 0.3890991672975019, + "grad_norm": 5.765628814697266, + "learning_rate": 2.9837669471616124e-06, + "loss": 0.6732578277587891, + "step": 514 + }, + { + "epoch": 0.3906131718395155, + "grad_norm": 2.02893328666687, + "learning_rate": 2.9835046046366237e-06, + "loss": 1.2975317239761353, + "step": 516 + }, + { + "epoch": 0.39212717638152916, + "grad_norm": 4.2726545333862305, + "learning_rate": 2.983240172239053e-06, + "loss": 0.8535081148147583, + "step": 518 + }, + { + "epoch": 0.3936411809235428, + "grad_norm": 2.2877910137176514, + "learning_rate": 2.98297365038332e-06, + "loss": 0.5142571926116943, + "step": 520 + }, + { + "epoch": 0.3951551854655564, + "grad_norm": 0.9271646738052368, + "learning_rate": 2.982705039487118e-06, + "loss": 0.4773847758769989, + "step": 522 + }, + { + "epoch": 0.39666919000757, + "grad_norm": 3.5444107055664062, + "learning_rate": 2.9824343399714144e-06, + "loss": 0.9543272256851196, + "step": 524 + }, + { + "epoch": 0.3981831945495837, + "grad_norm": 2.23673415184021, + "learning_rate": 2.9821615522604515e-06, + "loss": 1.3242028951644897, + "step": 526 + }, + { + "epoch": 0.3996971990915973, + "grad_norm": 1.3811380863189697, + "learning_rate": 2.9818866767817425e-06, + "loss": 0.8625052571296692, + "step": 528 + }, + { + "epoch": 0.4012112036336109, + "grad_norm": 2.9766619205474854, + "learning_rate": 2.981609713966073e-06, + "loss": 0.4589157700538635, + "step": 530 + }, + { + "epoch": 0.4027252081756245, + "grad_norm": 3.958052396774292, + "learning_rate": 2.9813306642475005e-06, + "loss": 1.2542142868041992, + "step": 532 + }, + { + "epoch": 0.40423921271763813, + "grad_norm": 7.971195697784424, + "learning_rate": 2.9810495280633517e-06, + "loss": 0.8737133145332336, + "step": 534 + }, + { + "epoch": 0.4057532172596518, + "grad_norm": 2.8728365898132324, + "learning_rate": 2.980766305854225e-06, + "loss": 0.8348803520202637, + "step": 536 + }, + { + "epoch": 0.4072672218016654, + "grad_norm": 1.8818715810775757, + "learning_rate": 2.9804809980639865e-06, + "loss": 0.8300620913505554, + "step": 538 + }, + { + "epoch": 0.408781226343679, + "grad_norm": 1.7600295543670654, + "learning_rate": 2.9801936051397717e-06, + "loss": 0.5276261568069458, + "step": 540 + }, + { + "epoch": 0.41029523088569264, + "grad_norm": 11.18760871887207, + "learning_rate": 2.979904127531984e-06, + "loss": 0.5877084732055664, + "step": 542 + }, + { + "epoch": 0.4118092354277063, + "grad_norm": 5.164269924163818, + "learning_rate": 2.9796125656942925e-06, + "loss": 0.6118907332420349, + "step": 544 + }, + { + "epoch": 0.4133232399697199, + "grad_norm": 1.9854013919830322, + "learning_rate": 2.9793189200836356e-06, + "loss": 0.6037644147872925, + "step": 546 + }, + { + "epoch": 0.41483724451173354, + "grad_norm": 2.224184036254883, + "learning_rate": 2.9790231911602143e-06, + "loss": 0.5628061294555664, + "step": 548 + }, + { + "epoch": 0.41635124905374715, + "grad_norm": 2.595224380493164, + "learning_rate": 2.9787253793874958e-06, + "loss": 0.9052645564079285, + "step": 550 + }, + { + "epoch": 0.41786525359576077, + "grad_norm": 2.4498090744018555, + "learning_rate": 2.9784254852322125e-06, + "loss": 0.6325636506080627, + "step": 552 + }, + { + "epoch": 0.41937925813777444, + "grad_norm": 2.5664796829223633, + "learning_rate": 2.9781235091643587e-06, + "loss": 0.8503198623657227, + "step": 554 + }, + { + "epoch": 0.42089326267978805, + "grad_norm": 2.6050705909729004, + "learning_rate": 2.977819451657193e-06, + "loss": 0.5251026153564453, + "step": 556 + }, + { + "epoch": 0.42240726722180166, + "grad_norm": 3.4257054328918457, + "learning_rate": 2.9775133131872347e-06, + "loss": 1.2795467376708984, + "step": 558 + }, + { + "epoch": 0.4239212717638153, + "grad_norm": 2.2738325595855713, + "learning_rate": 2.977205094234265e-06, + "loss": 0.9735236167907715, + "step": 560 + }, + { + "epoch": 0.4254352763058289, + "grad_norm": 1.9380100965499878, + "learning_rate": 2.976894795281326e-06, + "loss": 0.867192804813385, + "step": 562 + }, + { + "epoch": 0.42694928084784256, + "grad_norm": 3.4724042415618896, + "learning_rate": 2.9765824168147184e-06, + "loss": 0.7334169149398804, + "step": 564 + }, + { + "epoch": 0.4284632853898562, + "grad_norm": 2.5322248935699463, + "learning_rate": 2.9762679593240034e-06, + "loss": 0.5401719808578491, + "step": 566 + }, + { + "epoch": 0.4299772899318698, + "grad_norm": 15.750800132751465, + "learning_rate": 2.975951423302e-06, + "loss": 1.0404959917068481, + "step": 568 + }, + { + "epoch": 0.4314912944738834, + "grad_norm": 1.8556584119796753, + "learning_rate": 2.9756328092447835e-06, + "loss": 1.2649303674697876, + "step": 570 + }, + { + "epoch": 0.43300529901589707, + "grad_norm": 0.8886945247650146, + "learning_rate": 2.9753121176516876e-06, + "loss": 0.662155032157898, + "step": 572 + }, + { + "epoch": 0.4345193035579107, + "grad_norm": 2.469963312149048, + "learning_rate": 2.9749893490253e-06, + "loss": 1.0964909791946411, + "step": 574 + }, + { + "epoch": 0.4360333080999243, + "grad_norm": 4.345314025878906, + "learning_rate": 2.974664503871467e-06, + "loss": 1.0193687677383423, + "step": 576 + }, + { + "epoch": 0.4375473126419379, + "grad_norm": 2.640014171600342, + "learning_rate": 2.9743375826992848e-06, + "loss": 1.0608943700790405, + "step": 578 + }, + { + "epoch": 0.4390613171839515, + "grad_norm": 4.732244968414307, + "learning_rate": 2.9740085860211066e-06, + "loss": 1.284674882888794, + "step": 580 + }, + { + "epoch": 0.4405753217259652, + "grad_norm": 1.8341491222381592, + "learning_rate": 2.9736775143525363e-06, + "loss": 0.9327861666679382, + "step": 582 + }, + { + "epoch": 0.4420893262679788, + "grad_norm": 4.778522968292236, + "learning_rate": 2.9733443682124303e-06, + "loss": 0.6498082876205444, + "step": 584 + }, + { + "epoch": 0.4436033308099924, + "grad_norm": 7.313467502593994, + "learning_rate": 2.9730091481228967e-06, + "loss": 1.2512699365615845, + "step": 586 + }, + { + "epoch": 0.44511733535200604, + "grad_norm": 3.8918874263763428, + "learning_rate": 2.9726718546092934e-06, + "loss": 1.0238436460494995, + "step": 588 + }, + { + "epoch": 0.4466313398940197, + "grad_norm": 3.3440310955047607, + "learning_rate": 2.972332488200228e-06, + "loss": 0.904268741607666, + "step": 590 + }, + { + "epoch": 0.4481453444360333, + "grad_norm": 3.956916093826294, + "learning_rate": 2.971991049427556e-06, + "loss": 1.2141236066818237, + "step": 592 + }, + { + "epoch": 0.44965934897804694, + "grad_norm": 2.698284387588501, + "learning_rate": 2.9716475388263818e-06, + "loss": 1.3313930034637451, + "step": 594 + }, + { + "epoch": 0.45117335352006055, + "grad_norm": 1.3989747762680054, + "learning_rate": 2.9713019569350568e-06, + "loss": 0.6560528874397278, + "step": 596 + }, + { + "epoch": 0.45268735806207416, + "grad_norm": 3.8785853385925293, + "learning_rate": 2.9709543042951764e-06, + "loss": 0.5024439692497253, + "step": 598 + }, + { + "epoch": 0.45420136260408783, + "grad_norm": 1.062403917312622, + "learning_rate": 2.970604581451585e-06, + "loss": 0.4830036759376526, + "step": 600 + }, + { + "epoch": 0.45571536714610145, + "grad_norm": 6.831843852996826, + "learning_rate": 2.970252788952368e-06, + "loss": 1.2678862810134888, + "step": 602 + }, + { + "epoch": 0.45722937168811506, + "grad_norm": 2.5472588539123535, + "learning_rate": 2.9698989273488553e-06, + "loss": 0.572661280632019, + "step": 604 + }, + { + "epoch": 0.4587433762301287, + "grad_norm": 1.5087025165557861, + "learning_rate": 2.9695429971956215e-06, + "loss": 0.9104925394058228, + "step": 606 + }, + { + "epoch": 0.46025738077214234, + "grad_norm": 1.2133845090866089, + "learning_rate": 2.96918499905048e-06, + "loss": 0.5393221378326416, + "step": 608 + }, + { + "epoch": 0.46177138531415596, + "grad_norm": 10.903751373291016, + "learning_rate": 2.968824933474487e-06, + "loss": 1.2028796672821045, + "step": 610 + }, + { + "epoch": 0.46328538985616957, + "grad_norm": 2.8977067470550537, + "learning_rate": 2.968462801031939e-06, + "loss": 0.7516031265258789, + "step": 612 + }, + { + "epoch": 0.4647993943981832, + "grad_norm": 4.32919454574585, + "learning_rate": 2.9680986022903703e-06, + "loss": 0.8674114942550659, + "step": 614 + }, + { + "epoch": 0.4663133989401968, + "grad_norm": 1.608612060546875, + "learning_rate": 2.9677323378205546e-06, + "loss": 0.595661461353302, + "step": 616 + }, + { + "epoch": 0.46782740348221047, + "grad_norm": 2.312485456466675, + "learning_rate": 2.967364008196503e-06, + "loss": 0.8987520933151245, + "step": 618 + }, + { + "epoch": 0.4693414080242241, + "grad_norm": 2.1147568225860596, + "learning_rate": 2.966993613995462e-06, + "loss": 0.9148374795913696, + "step": 620 + }, + { + "epoch": 0.4708554125662377, + "grad_norm": 1.7400703430175781, + "learning_rate": 2.966621155797916e-06, + "loss": 0.7914771437644958, + "step": 622 + }, + { + "epoch": 0.4723694171082513, + "grad_norm": 2.3664000034332275, + "learning_rate": 2.9662466341875814e-06, + "loss": 1.1130759716033936, + "step": 624 + }, + { + "epoch": 0.4738834216502649, + "grad_norm": 2.528683662414551, + "learning_rate": 2.96587004975141e-06, + "loss": 0.5625622868537903, + "step": 626 + }, + { + "epoch": 0.4753974261922786, + "grad_norm": 1.8007179498672485, + "learning_rate": 2.965491403079586e-06, + "loss": 0.9524298310279846, + "step": 628 + }, + { + "epoch": 0.4769114307342922, + "grad_norm": 2.555586814880371, + "learning_rate": 2.965110694765526e-06, + "loss": 0.4652448892593384, + "step": 630 + }, + { + "epoch": 0.4784254352763058, + "grad_norm": 2.9922497272491455, + "learning_rate": 2.964727925405877e-06, + "loss": 0.5585123896598816, + "step": 632 + }, + { + "epoch": 0.47993943981831944, + "grad_norm": 2.435415267944336, + "learning_rate": 2.9643430956005166e-06, + "loss": 0.7860139608383179, + "step": 634 + }, + { + "epoch": 0.4814534443603331, + "grad_norm": 0.8732883930206299, + "learning_rate": 2.963956205952551e-06, + "loss": 0.5169360637664795, + "step": 636 + }, + { + "epoch": 0.4829674489023467, + "grad_norm": 4.100015640258789, + "learning_rate": 2.9635672570683145e-06, + "loss": 0.8051522970199585, + "step": 638 + }, + { + "epoch": 0.48448145344436033, + "grad_norm": 2.0434277057647705, + "learning_rate": 2.9631762495573697e-06, + "loss": 0.9732469320297241, + "step": 640 + }, + { + "epoch": 0.48599545798637395, + "grad_norm": 1.884527564048767, + "learning_rate": 2.9627831840325043e-06, + "loss": 1.0066033601760864, + "step": 642 + }, + { + "epoch": 0.48750946252838756, + "grad_norm": 1.9356602430343628, + "learning_rate": 2.9623880611097323e-06, + "loss": 0.925056517124176, + "step": 644 + }, + { + "epoch": 0.48902346707040123, + "grad_norm": 2.3697316646575928, + "learning_rate": 2.961990881408291e-06, + "loss": 1.2187111377716064, + "step": 646 + }, + { + "epoch": 0.49053747161241484, + "grad_norm": 2.063014030456543, + "learning_rate": 2.9615916455506424e-06, + "loss": 0.8997195959091187, + "step": 648 + }, + { + "epoch": 0.49205147615442846, + "grad_norm": 2.8104512691497803, + "learning_rate": 2.9611903541624695e-06, + "loss": 0.5274589657783508, + "step": 650 + }, + { + "epoch": 0.49356548069644207, + "grad_norm": 2.428311824798584, + "learning_rate": 2.960787007872678e-06, + "loss": 0.5732669830322266, + "step": 652 + }, + { + "epoch": 0.49507948523845574, + "grad_norm": 2.9070358276367188, + "learning_rate": 2.960381607313393e-06, + "loss": 0.7458028197288513, + "step": 654 + }, + { + "epoch": 0.49659348978046935, + "grad_norm": 1.7552977800369263, + "learning_rate": 2.959974153119959e-06, + "loss": 0.8947643041610718, + "step": 656 + }, + { + "epoch": 0.49810749432248297, + "grad_norm": 1.5749355554580688, + "learning_rate": 2.959564645930941e-06, + "loss": 1.2562661170959473, + "step": 658 + }, + { + "epoch": 0.4996214988644966, + "grad_norm": 11.023453712463379, + "learning_rate": 2.959153086388119e-06, + "loss": 0.9333547949790955, + "step": 660 + }, + { + "epoch": 0.5011355034065103, + "grad_norm": 1.7231638431549072, + "learning_rate": 2.9587394751364895e-06, + "loss": 0.6585370302200317, + "step": 662 + }, + { + "epoch": 0.5026495079485238, + "grad_norm": 2.0775532722473145, + "learning_rate": 2.9583238128242673e-06, + "loss": 0.7238444089889526, + "step": 664 + }, + { + "epoch": 0.5041635124905375, + "grad_norm": 0.9623629450798035, + "learning_rate": 2.9579061001028787e-06, + "loss": 0.922552227973938, + "step": 666 + }, + { + "epoch": 0.5056775170325511, + "grad_norm": 1.735793948173523, + "learning_rate": 2.9574863376269644e-06, + "loss": 0.5483730435371399, + "step": 668 + }, + { + "epoch": 0.5071915215745647, + "grad_norm": 2.608731269836426, + "learning_rate": 2.9570645260543773e-06, + "loss": 0.663311779499054, + "step": 670 + }, + { + "epoch": 0.5087055261165784, + "grad_norm": 6.55451774597168, + "learning_rate": 2.9566406660461816e-06, + "loss": 0.6987183690071106, + "step": 672 + }, + { + "epoch": 0.5102195306585919, + "grad_norm": 1.7770209312438965, + "learning_rate": 2.956214758266653e-06, + "loss": 1.04190993309021, + "step": 674 + }, + { + "epoch": 0.5117335352006056, + "grad_norm": 0.809238076210022, + "learning_rate": 2.955786803383275e-06, + "loss": 0.8445099592208862, + "step": 676 + }, + { + "epoch": 0.5132475397426193, + "grad_norm": 1.9151666164398193, + "learning_rate": 2.9553568020667393e-06, + "loss": 1.3137012720108032, + "step": 678 + }, + { + "epoch": 0.5147615442846328, + "grad_norm": 2.247816324234009, + "learning_rate": 2.9549247549909456e-06, + "loss": 0.7027629017829895, + "step": 680 + }, + { + "epoch": 0.5162755488266465, + "grad_norm": 1.8459402322769165, + "learning_rate": 2.954490662833e-06, + "loss": 1.1782515048980713, + "step": 682 + }, + { + "epoch": 0.5177895533686601, + "grad_norm": 1.2116007804870605, + "learning_rate": 2.954054526273213e-06, + "loss": 0.49978435039520264, + "step": 684 + }, + { + "epoch": 0.5193035579106737, + "grad_norm": 2.160564661026001, + "learning_rate": 2.9536163459950984e-06, + "loss": 0.3752554655075073, + "step": 686 + }, + { + "epoch": 0.5208175624526874, + "grad_norm": 2.0724048614501953, + "learning_rate": 2.953176122685374e-06, + "loss": 1.3308016061782837, + "step": 688 + }, + { + "epoch": 0.522331566994701, + "grad_norm": 2.1259520053863525, + "learning_rate": 2.952733857033959e-06, + "loss": 1.015698790550232, + "step": 690 + }, + { + "epoch": 0.5238455715367146, + "grad_norm": 2.9767887592315674, + "learning_rate": 2.9522895497339746e-06, + "loss": 1.0793743133544922, + "step": 692 + }, + { + "epoch": 0.5253595760787282, + "grad_norm": 2.5563642978668213, + "learning_rate": 2.9518432014817396e-06, + "loss": 0.4725661277770996, + "step": 694 + }, + { + "epoch": 0.5268735806207419, + "grad_norm": 2.641418218612671, + "learning_rate": 2.951394812976772e-06, + "loss": 0.6408408880233765, + "step": 696 + }, + { + "epoch": 0.5283875851627555, + "grad_norm": 1.3676077127456665, + "learning_rate": 2.9509443849217887e-06, + "loss": 1.2852436304092407, + "step": 698 + }, + { + "epoch": 0.5299015897047691, + "grad_norm": 2.7387311458587646, + "learning_rate": 2.9504919180227014e-06, + "loss": 0.5113032460212708, + "step": 700 + }, + { + "epoch": 0.5314155942467828, + "grad_norm": 3.1218018531799316, + "learning_rate": 2.9500374129886178e-06, + "loss": 1.0129162073135376, + "step": 702 + }, + { + "epoch": 0.5329295987887964, + "grad_norm": 4.050351619720459, + "learning_rate": 2.9495808705318392e-06, + "loss": 0.8722813129425049, + "step": 704 + }, + { + "epoch": 0.53444360333081, + "grad_norm": 27.964889526367188, + "learning_rate": 2.9491222913678613e-06, + "loss": 1.1573745012283325, + "step": 706 + }, + { + "epoch": 0.5359576078728236, + "grad_norm": 1.8653537034988403, + "learning_rate": 2.9486616762153697e-06, + "loss": 1.091062068939209, + "step": 708 + }, + { + "epoch": 0.5374716124148372, + "grad_norm": 2.453317403793335, + "learning_rate": 2.948199025796242e-06, + "loss": 0.8969689607620239, + "step": 710 + }, + { + "epoch": 0.5389856169568509, + "grad_norm": 1.2765965461730957, + "learning_rate": 2.9477343408355466e-06, + "loss": 0.8222131729125977, + "step": 712 + }, + { + "epoch": 0.5404996214988645, + "grad_norm": 2.10074782371521, + "learning_rate": 2.947267622061538e-06, + "loss": 0.46775394678115845, + "step": 714 + }, + { + "epoch": 0.5420136260408781, + "grad_norm": 1.2866603136062622, + "learning_rate": 2.9467988702056594e-06, + "loss": 0.896325945854187, + "step": 716 + }, + { + "epoch": 0.5435276305828918, + "grad_norm": 3.4520423412323, + "learning_rate": 2.946328086002541e-06, + "loss": 0.9878024458885193, + "step": 718 + }, + { + "epoch": 0.5450416351249053, + "grad_norm": 1.849073052406311, + "learning_rate": 2.945855270189997e-06, + "loss": 0.9559434056282043, + "step": 720 + }, + { + "epoch": 0.546555639666919, + "grad_norm": 1.4615992307662964, + "learning_rate": 2.9453804235090253e-06, + "loss": 0.8828739523887634, + "step": 722 + }, + { + "epoch": 0.5480696442089327, + "grad_norm": 5.629069805145264, + "learning_rate": 2.944903546703807e-06, + "loss": 0.6443637013435364, + "step": 724 + }, + { + "epoch": 0.5495836487509462, + "grad_norm": 6.054827690124512, + "learning_rate": 2.944424640521706e-06, + "loss": 1.0102320909500122, + "step": 726 + }, + { + "epoch": 0.5510976532929599, + "grad_norm": 2.1354260444641113, + "learning_rate": 2.9439437057132643e-06, + "loss": 0.909126877784729, + "step": 728 + }, + { + "epoch": 0.5526116578349735, + "grad_norm": 1.5889453887939453, + "learning_rate": 2.943460743032205e-06, + "loss": 0.7864683866500854, + "step": 730 + }, + { + "epoch": 0.5541256623769871, + "grad_norm": 4.267853736877441, + "learning_rate": 2.942975753235429e-06, + "loss": 0.8896602392196655, + "step": 732 + }, + { + "epoch": 0.5556396669190008, + "grad_norm": 2.129723072052002, + "learning_rate": 2.9424887370830134e-06, + "loss": 0.5310478210449219, + "step": 734 + }, + { + "epoch": 0.5571536714610144, + "grad_norm": 12.761570930480957, + "learning_rate": 2.9419996953382116e-06, + "loss": 0.4966410994529724, + "step": 736 + }, + { + "epoch": 0.558667676003028, + "grad_norm": 1.056359052658081, + "learning_rate": 2.9415086287674513e-06, + "loss": 0.8116670250892639, + "step": 738 + }, + { + "epoch": 0.5601816805450416, + "grad_norm": 11.245153427124023, + "learning_rate": 2.941015538140334e-06, + "loss": 0.47575944662094116, + "step": 740 + }, + { + "epoch": 0.5616956850870553, + "grad_norm": 3.585188627243042, + "learning_rate": 2.940520424229632e-06, + "loss": 0.8942469358444214, + "step": 742 + }, + { + "epoch": 0.5632096896290689, + "grad_norm": 4.114293575286865, + "learning_rate": 2.94002328781129e-06, + "loss": 1.1968226432800293, + "step": 744 + }, + { + "epoch": 0.5647236941710825, + "grad_norm": 3.1713132858276367, + "learning_rate": 2.9395241296644217e-06, + "loss": 1.006760835647583, + "step": 746 + }, + { + "epoch": 0.5662376987130961, + "grad_norm": 1.791144847869873, + "learning_rate": 2.939022950571309e-06, + "loss": 1.2270033359527588, + "step": 748 + }, + { + "epoch": 0.5677517032551098, + "grad_norm": 11.67522144317627, + "learning_rate": 2.938519751317402e-06, + "loss": 1.0643365383148193, + "step": 750 + }, + { + "epoch": 0.5692657077971234, + "grad_norm": 1.0967658758163452, + "learning_rate": 2.938014532691315e-06, + "loss": 0.9841832518577576, + "step": 752 + }, + { + "epoch": 0.570779712339137, + "grad_norm": 1.52469801902771, + "learning_rate": 2.937507295484829e-06, + "loss": 0.951506495475769, + "step": 754 + }, + { + "epoch": 0.5722937168811506, + "grad_norm": 2.3377974033355713, + "learning_rate": 2.936998040492888e-06, + "loss": 0.5932363271713257, + "step": 756 + }, + { + "epoch": 0.5738077214231643, + "grad_norm": 3.601687431335449, + "learning_rate": 2.936486768513597e-06, + "loss": 1.264681100845337, + "step": 758 + }, + { + "epoch": 0.5753217259651779, + "grad_norm": 1.2724038362503052, + "learning_rate": 2.9359734803482244e-06, + "loss": 0.7797785997390747, + "step": 760 + }, + { + "epoch": 0.5768357305071915, + "grad_norm": 2.051760196685791, + "learning_rate": 2.935458176801196e-06, + "loss": 0.5461609959602356, + "step": 762 + }, + { + "epoch": 0.5783497350492052, + "grad_norm": 1.8419607877731323, + "learning_rate": 2.9349408586800974e-06, + "loss": 0.8952110409736633, + "step": 764 + }, + { + "epoch": 0.5798637395912187, + "grad_norm": 12.950570106506348, + "learning_rate": 2.9344215267956716e-06, + "loss": 0.9577568173408508, + "step": 766 + }, + { + "epoch": 0.5813777441332324, + "grad_norm": 2.6900603771209717, + "learning_rate": 2.933900181961816e-06, + "loss": 0.6937704086303711, + "step": 768 + }, + { + "epoch": 0.5828917486752461, + "grad_norm": 1.4522511959075928, + "learning_rate": 2.933376824995585e-06, + "loss": 0.8444547653198242, + "step": 770 + }, + { + "epoch": 0.5844057532172596, + "grad_norm": 1.75600266456604, + "learning_rate": 2.932851456717185e-06, + "loss": 1.034805417060852, + "step": 772 + }, + { + "epoch": 0.5859197577592733, + "grad_norm": 2.222780704498291, + "learning_rate": 2.9323240779499744e-06, + "loss": 0.8588864803314209, + "step": 774 + }, + { + "epoch": 0.5874337623012869, + "grad_norm": 2.5241029262542725, + "learning_rate": 2.9317946895204634e-06, + "loss": 0.9585127234458923, + "step": 776 + }, + { + "epoch": 0.5889477668433005, + "grad_norm": 1.3337754011154175, + "learning_rate": 2.9312632922583108e-06, + "loss": 1.185091495513916, + "step": 778 + }, + { + "epoch": 0.5904617713853142, + "grad_norm": 3.3314359188079834, + "learning_rate": 2.930729886996324e-06, + "loss": 0.5059195756912231, + "step": 780 + }, + { + "epoch": 0.5919757759273278, + "grad_norm": 1.0932236909866333, + "learning_rate": 2.930194474570458e-06, + "loss": 0.9334269165992737, + "step": 782 + }, + { + "epoch": 0.5934897804693414, + "grad_norm": 1.2207541465759277, + "learning_rate": 2.929657055819812e-06, + "loss": 0.7467067837715149, + "step": 784 + }, + { + "epoch": 0.595003785011355, + "grad_norm": 3.1017303466796875, + "learning_rate": 2.9291176315866315e-06, + "loss": 0.4495176076889038, + "step": 786 + }, + { + "epoch": 0.5965177895533686, + "grad_norm": 1.5837126970291138, + "learning_rate": 2.928576202716302e-06, + "loss": 0.7723722457885742, + "step": 788 + }, + { + "epoch": 0.5980317940953823, + "grad_norm": 1.3979252576828003, + "learning_rate": 2.9280327700573545e-06, + "loss": 0.9797012209892273, + "step": 790 + }, + { + "epoch": 0.5995457986373959, + "grad_norm": 1.2105636596679688, + "learning_rate": 2.9274873344614567e-06, + "loss": 0.8930003643035889, + "step": 792 + }, + { + "epoch": 0.6010598031794095, + "grad_norm": 1.439759373664856, + "learning_rate": 2.926939896783418e-06, + "loss": 0.8449392318725586, + "step": 794 + }, + { + "epoch": 0.6025738077214232, + "grad_norm": 1.6218822002410889, + "learning_rate": 2.926390457881185e-06, + "loss": 0.8345645666122437, + "step": 796 + }, + { + "epoch": 0.6040878122634368, + "grad_norm": 1.7808403968811035, + "learning_rate": 2.9258390186158377e-06, + "loss": 0.8866439461708069, + "step": 798 + }, + { + "epoch": 0.6056018168054504, + "grad_norm": 1.4405946731567383, + "learning_rate": 2.925285579851596e-06, + "loss": 0.6789926886558533, + "step": 800 + }, + { + "epoch": 0.607115821347464, + "grad_norm": 2.2952048778533936, + "learning_rate": 2.924730142455809e-06, + "loss": 0.8826119899749756, + "step": 802 + }, + { + "epoch": 0.6086298258894777, + "grad_norm": 3.04705548286438, + "learning_rate": 2.924172707298962e-06, + "loss": 0.9613844156265259, + "step": 804 + }, + { + "epoch": 0.6101438304314913, + "grad_norm": 5.385547161102295, + "learning_rate": 2.9236132752546675e-06, + "loss": 0.7768161296844482, + "step": 806 + }, + { + "epoch": 0.6116578349735049, + "grad_norm": 1.2202529907226562, + "learning_rate": 2.9230518471996693e-06, + "loss": 0.8363648653030396, + "step": 808 + }, + { + "epoch": 0.6131718395155186, + "grad_norm": 1.4032362699508667, + "learning_rate": 2.922488424013839e-06, + "loss": 0.7807202339172363, + "step": 810 + }, + { + "epoch": 0.6146858440575321, + "grad_norm": 1.9782370328903198, + "learning_rate": 2.9219230065801762e-06, + "loss": 0.925172746181488, + "step": 812 + }, + { + "epoch": 0.6161998485995458, + "grad_norm": 4.227447986602783, + "learning_rate": 2.9213555957848045e-06, + "loss": 0.6153373122215271, + "step": 814 + }, + { + "epoch": 0.6177138531415595, + "grad_norm": 1.032610535621643, + "learning_rate": 2.920786192516971e-06, + "loss": 0.4650787115097046, + "step": 816 + }, + { + "epoch": 0.619227857683573, + "grad_norm": 1.4408961534500122, + "learning_rate": 2.920214797669047e-06, + "loss": 0.4763714671134949, + "step": 818 + }, + { + "epoch": 0.6207418622255867, + "grad_norm": 3.2754526138305664, + "learning_rate": 2.9196414121365246e-06, + "loss": 0.9566894173622131, + "step": 820 + }, + { + "epoch": 0.6222558667676003, + "grad_norm": 4.002590656280518, + "learning_rate": 2.919066036818015e-06, + "loss": 0.3634946048259735, + "step": 822 + }, + { + "epoch": 0.6237698713096139, + "grad_norm": 4.929750919342041, + "learning_rate": 2.918488672615248e-06, + "loss": 0.5025648474693298, + "step": 824 + }, + { + "epoch": 0.6252838758516276, + "grad_norm": 1.6044130325317383, + "learning_rate": 2.9179093204330706e-06, + "loss": 1.0299111604690552, + "step": 826 + }, + { + "epoch": 0.6267978803936411, + "grad_norm": 1.8787809610366821, + "learning_rate": 2.917327981179446e-06, + "loss": 0.8319679498672485, + "step": 828 + }, + { + "epoch": 0.6283118849356548, + "grad_norm": 4.7387614250183105, + "learning_rate": 2.91674465576545e-06, + "loss": 0.5684910416603088, + "step": 830 + }, + { + "epoch": 0.6298258894776685, + "grad_norm": 5.5031609535217285, + "learning_rate": 2.9161593451052726e-06, + "loss": 0.6575965881347656, + "step": 832 + }, + { + "epoch": 0.631339894019682, + "grad_norm": 1.5484586954116821, + "learning_rate": 2.9155720501162143e-06, + "loss": 1.17416250705719, + "step": 834 + }, + { + "epoch": 0.6328538985616957, + "grad_norm": 2.1155788898468018, + "learning_rate": 2.9149827717186858e-06, + "loss": 0.7271778583526611, + "step": 836 + }, + { + "epoch": 0.6343679031037093, + "grad_norm": 42.32135009765625, + "learning_rate": 2.914391510836206e-06, + "loss": 1.0217803716659546, + "step": 838 + }, + { + "epoch": 0.6358819076457229, + "grad_norm": 2.134539842605591, + "learning_rate": 2.9137982683954005e-06, + "loss": 0.7875545620918274, + "step": 840 + }, + { + "epoch": 0.6373959121877366, + "grad_norm": 4.138916969299316, + "learning_rate": 2.9132030453260012e-06, + "loss": 0.9088811278343201, + "step": 842 + }, + { + "epoch": 0.6389099167297502, + "grad_norm": 1.0700823068618774, + "learning_rate": 2.912605842560843e-06, + "loss": 1.1727579832077026, + "step": 844 + }, + { + "epoch": 0.6404239212717638, + "grad_norm": 2.2020580768585205, + "learning_rate": 2.9120066610358644e-06, + "loss": 1.2225433588027954, + "step": 846 + }, + { + "epoch": 0.6419379258137774, + "grad_norm": 2.0243654251098633, + "learning_rate": 2.911405501690104e-06, + "loss": 0.5479593873023987, + "step": 848 + }, + { + "epoch": 0.6434519303557911, + "grad_norm": 2.634003162384033, + "learning_rate": 2.910802365465702e-06, + "loss": 0.8216032385826111, + "step": 850 + }, + { + "epoch": 0.6449659348978047, + "grad_norm": 1.7789942026138306, + "learning_rate": 2.9101972533078937e-06, + "loss": 1.2453292608261108, + "step": 852 + }, + { + "epoch": 0.6464799394398183, + "grad_norm": 28.211029052734375, + "learning_rate": 2.909590166165013e-06, + "loss": 0.4019825756549835, + "step": 854 + }, + { + "epoch": 0.647993943981832, + "grad_norm": 2.574488639831543, + "learning_rate": 2.9089811049884896e-06, + "loss": 0.5381205081939697, + "step": 856 + }, + { + "epoch": 0.6495079485238455, + "grad_norm": 2.394144296646118, + "learning_rate": 2.908370070732846e-06, + "loss": 1.1024041175842285, + "step": 858 + }, + { + "epoch": 0.6510219530658592, + "grad_norm": 2.664809465408325, + "learning_rate": 2.9077570643556967e-06, + "loss": 0.9122069478034973, + "step": 860 + }, + { + "epoch": 0.6525359576078729, + "grad_norm": 1.8201414346694946, + "learning_rate": 2.9071420868177464e-06, + "loss": 0.905881404876709, + "step": 862 + }, + { + "epoch": 0.6540499621498864, + "grad_norm": 2.0792040824890137, + "learning_rate": 2.906525139082791e-06, + "loss": 0.8034082651138306, + "step": 864 + }, + { + "epoch": 0.6555639666919001, + "grad_norm": 2.1485233306884766, + "learning_rate": 2.9059062221177133e-06, + "loss": 0.8683022856712341, + "step": 866 + }, + { + "epoch": 0.6570779712339136, + "grad_norm": 1.865644931793213, + "learning_rate": 2.90528533689248e-06, + "loss": 0.8978813290596008, + "step": 868 + }, + { + "epoch": 0.6585919757759273, + "grad_norm": 4.107750415802002, + "learning_rate": 2.904662484380146e-06, + "loss": 0.519726037979126, + "step": 870 + }, + { + "epoch": 0.660105980317941, + "grad_norm": 2.102914333343506, + "learning_rate": 2.9040376655568473e-06, + "loss": 0.9007350206375122, + "step": 872 + }, + { + "epoch": 0.6616199848599545, + "grad_norm": 1.5866848230361938, + "learning_rate": 2.903410881401801e-06, + "loss": 1.266746163368225, + "step": 874 + }, + { + "epoch": 0.6631339894019682, + "grad_norm": 2.8819000720977783, + "learning_rate": 2.9027821328973073e-06, + "loss": 0.8907691836357117, + "step": 876 + }, + { + "epoch": 0.6646479939439819, + "grad_norm": 2.2902755737304688, + "learning_rate": 2.9021514210287405e-06, + "loss": 0.4960283935070038, + "step": 878 + }, + { + "epoch": 0.6661619984859954, + "grad_norm": 2.3201727867126465, + "learning_rate": 2.9015187467845555e-06, + "loss": 0.896564781665802, + "step": 880 + }, + { + "epoch": 0.6676760030280091, + "grad_norm": 1.8795450925827026, + "learning_rate": 2.900884111156281e-06, + "loss": 0.8979120254516602, + "step": 882 + }, + { + "epoch": 0.6691900075700227, + "grad_norm": 9.0372953414917, + "learning_rate": 2.90024751513852e-06, + "loss": 0.9341409206390381, + "step": 884 + }, + { + "epoch": 0.6707040121120363, + "grad_norm": 2.159026622772217, + "learning_rate": 2.8996089597289474e-06, + "loss": 1.2215708494186401, + "step": 886 + }, + { + "epoch": 0.67221801665405, + "grad_norm": 3.647932291030884, + "learning_rate": 2.8989684459283103e-06, + "loss": 0.8979953527450562, + "step": 888 + }, + { + "epoch": 0.6737320211960636, + "grad_norm": 1.569602370262146, + "learning_rate": 2.898325974740423e-06, + "loss": 0.8550502061843872, + "step": 890 + }, + { + "epoch": 0.6752460257380772, + "grad_norm": 3.763561725616455, + "learning_rate": 2.8976815471721686e-06, + "loss": 0.7107431888580322, + "step": 892 + }, + { + "epoch": 0.6767600302800908, + "grad_norm": 10.984675407409668, + "learning_rate": 2.8970351642334953e-06, + "loss": 0.6723986864089966, + "step": 894 + }, + { + "epoch": 0.6782740348221045, + "grad_norm": 2.890449047088623, + "learning_rate": 2.896386826937418e-06, + "loss": 1.2286604642868042, + "step": 896 + }, + { + "epoch": 0.6797880393641181, + "grad_norm": 12.061135292053223, + "learning_rate": 2.895736536300012e-06, + "loss": 0.49984970688819885, + "step": 898 + }, + { + "epoch": 0.6813020439061317, + "grad_norm": 6.587899208068848, + "learning_rate": 2.895084293340415e-06, + "loss": 1.0647372007369995, + "step": 900 + }, + { + "epoch": 0.6828160484481454, + "grad_norm": 2.388646125793457, + "learning_rate": 2.8944300990808232e-06, + "loss": 0.7765811681747437, + "step": 902 + }, + { + "epoch": 0.6843300529901589, + "grad_norm": 3.4749341011047363, + "learning_rate": 2.8937739545464937e-06, + "loss": 0.494584858417511, + "step": 904 + }, + { + "epoch": 0.6858440575321726, + "grad_norm": 3.2410855293273926, + "learning_rate": 2.8931158607657375e-06, + "loss": 0.6335573196411133, + "step": 906 + }, + { + "epoch": 0.6873580620741863, + "grad_norm": 1.5804246664047241, + "learning_rate": 2.89245581876992e-06, + "loss": 0.5073976516723633, + "step": 908 + }, + { + "epoch": 0.6888720666161998, + "grad_norm": 1.4231654405593872, + "learning_rate": 2.891793829593463e-06, + "loss": 0.9102616310119629, + "step": 910 + }, + { + "epoch": 0.6903860711582135, + "grad_norm": 2.26094913482666, + "learning_rate": 2.8911298942738372e-06, + "loss": 1.047849416732788, + "step": 912 + }, + { + "epoch": 0.691900075700227, + "grad_norm": 1.5220335721969604, + "learning_rate": 2.8904640138515636e-06, + "loss": 0.826735258102417, + "step": 914 + }, + { + "epoch": 0.6934140802422407, + "grad_norm": 0.7623918056488037, + "learning_rate": 2.889796189370213e-06, + "loss": 0.49449172616004944, + "step": 916 + }, + { + "epoch": 0.6949280847842544, + "grad_norm": 2.015327215194702, + "learning_rate": 2.8891264218764013e-06, + "loss": 0.8959734439849854, + "step": 918 + }, + { + "epoch": 0.6964420893262679, + "grad_norm": 1.917296290397644, + "learning_rate": 2.8884547124197904e-06, + "loss": 0.5376745462417603, + "step": 920 + }, + { + "epoch": 0.6979560938682816, + "grad_norm": 0.5840858817100525, + "learning_rate": 2.887781062053086e-06, + "loss": 1.095085620880127, + "step": 922 + }, + { + "epoch": 0.6994700984102953, + "grad_norm": 4.917339324951172, + "learning_rate": 2.887105471832034e-06, + "loss": 0.8940989971160889, + "step": 924 + }, + { + "epoch": 0.7009841029523088, + "grad_norm": 1.5196433067321777, + "learning_rate": 2.8864279428154225e-06, + "loss": 0.5308735370635986, + "step": 926 + }, + { + "epoch": 0.7024981074943225, + "grad_norm": 7.36590051651001, + "learning_rate": 2.8857484760650765e-06, + "loss": 0.6894451975822449, + "step": 928 + }, + { + "epoch": 0.7040121120363361, + "grad_norm": 1.9489446878433228, + "learning_rate": 2.885067072645859e-06, + "loss": 0.7315176725387573, + "step": 930 + }, + { + "epoch": 0.7055261165783497, + "grad_norm": 1.2081562280654907, + "learning_rate": 2.8843837336256664e-06, + "loss": 0.8387755751609802, + "step": 932 + }, + { + "epoch": 0.7070401211203634, + "grad_norm": 1.6697505712509155, + "learning_rate": 2.883698460075431e-06, + "loss": 0.9529602527618408, + "step": 934 + }, + { + "epoch": 0.708554125662377, + "grad_norm": 1.2275534868240356, + "learning_rate": 2.8830112530691147e-06, + "loss": 1.245858907699585, + "step": 936 + }, + { + "epoch": 0.7100681302043906, + "grad_norm": 10.275951385498047, + "learning_rate": 2.8823221136837114e-06, + "loss": 0.4762369394302368, + "step": 938 + }, + { + "epoch": 0.7115821347464042, + "grad_norm": 1.4351997375488281, + "learning_rate": 2.881631042999242e-06, + "loss": 0.8527016043663025, + "step": 940 + }, + { + "epoch": 0.7130961392884179, + "grad_norm": 2.3546645641326904, + "learning_rate": 2.8809380420987547e-06, + "loss": 0.8944950699806213, + "step": 942 + }, + { + "epoch": 0.7146101438304315, + "grad_norm": 2.901038885116577, + "learning_rate": 2.8802431120683226e-06, + "loss": 0.7213137149810791, + "step": 944 + }, + { + "epoch": 0.7161241483724451, + "grad_norm": 2.7750065326690674, + "learning_rate": 2.8795462539970417e-06, + "loss": 1.0235939025878906, + "step": 946 + }, + { + "epoch": 0.7176381529144588, + "grad_norm": 1.4316633939743042, + "learning_rate": 2.878847468977032e-06, + "loss": 0.8671280741691589, + "step": 948 + }, + { + "epoch": 0.7191521574564723, + "grad_norm": 1.9463779926300049, + "learning_rate": 2.8781467581034287e-06, + "loss": 1.0982471704483032, + "step": 950 + }, + { + "epoch": 0.720666161998486, + "grad_norm": 3.363389015197754, + "learning_rate": 2.877444122474391e-06, + "loss": 1.253629207611084, + "step": 952 + }, + { + "epoch": 0.7221801665404997, + "grad_norm": 1.4445604085922241, + "learning_rate": 2.8767395631910893e-06, + "loss": 1.2379037141799927, + "step": 954 + }, + { + "epoch": 0.7236941710825132, + "grad_norm": 1.4205392599105835, + "learning_rate": 2.8760330813577127e-06, + "loss": 0.8827942609786987, + "step": 956 + }, + { + "epoch": 0.7252081756245269, + "grad_norm": 2.034050941467285, + "learning_rate": 2.8753246780814606e-06, + "loss": 0.8953526020050049, + "step": 958 + }, + { + "epoch": 0.7267221801665406, + "grad_norm": 3.173511266708374, + "learning_rate": 2.8746143544725454e-06, + "loss": 0.5323604941368103, + "step": 960 + }, + { + "epoch": 0.7282361847085541, + "grad_norm": 4.285152435302734, + "learning_rate": 2.8739021116441887e-06, + "loss": 0.9036399722099304, + "step": 962 + }, + { + "epoch": 0.7297501892505678, + "grad_norm": 1.7868789434432983, + "learning_rate": 2.8731879507126194e-06, + "loss": 0.8735661506652832, + "step": 964 + }, + { + "epoch": 0.7312641937925813, + "grad_norm": 1.9185863733291626, + "learning_rate": 2.872471872797073e-06, + "loss": 0.773938775062561, + "step": 966 + }, + { + "epoch": 0.732778198334595, + "grad_norm": 0.955786406993866, + "learning_rate": 2.8717538790197887e-06, + "loss": 0.7239243984222412, + "step": 968 + }, + { + "epoch": 0.7342922028766087, + "grad_norm": 1.9443094730377197, + "learning_rate": 2.8710339705060085e-06, + "loss": 0.7597388625144958, + "step": 970 + }, + { + "epoch": 0.7358062074186222, + "grad_norm": 2.2941946983337402, + "learning_rate": 2.870312148383976e-06, + "loss": 0.8585048913955688, + "step": 972 + }, + { + "epoch": 0.7373202119606359, + "grad_norm": 1.9667870998382568, + "learning_rate": 2.8695884137849317e-06, + "loss": 0.45781540870666504, + "step": 974 + }, + { + "epoch": 0.7388342165026495, + "grad_norm": 2.0550363063812256, + "learning_rate": 2.868862767843116e-06, + "loss": 1.2025854587554932, + "step": 976 + }, + { + "epoch": 0.7403482210446631, + "grad_norm": 2.0096893310546875, + "learning_rate": 2.868135211695763e-06, + "loss": 1.2732635736465454, + "step": 978 + }, + { + "epoch": 0.7418622255866768, + "grad_norm": 1.3000602722167969, + "learning_rate": 2.8674057464831016e-06, + "loss": 0.8870718479156494, + "step": 980 + }, + { + "epoch": 0.7433762301286904, + "grad_norm": 1.5397157669067383, + "learning_rate": 2.866674373348351e-06, + "loss": 0.7638609409332275, + "step": 982 + }, + { + "epoch": 0.744890234670704, + "grad_norm": 3.6748268604278564, + "learning_rate": 2.865941093437721e-06, + "loss": 0.8565575480461121, + "step": 984 + }, + { + "epoch": 0.7464042392127176, + "grad_norm": 7.63801383972168, + "learning_rate": 2.865205907900412e-06, + "loss": 0.9564031958580017, + "step": 986 + }, + { + "epoch": 0.7479182437547313, + "grad_norm": 2.4677016735076904, + "learning_rate": 2.864468817888608e-06, + "loss": 1.2542637586593628, + "step": 988 + }, + { + "epoch": 0.7494322482967449, + "grad_norm": 1.5017268657684326, + "learning_rate": 2.863729824557479e-06, + "loss": 1.0353670120239258, + "step": 990 + }, + { + "epoch": 0.7509462528387585, + "grad_norm": 1.2101831436157227, + "learning_rate": 2.862988929065177e-06, + "loss": 1.191741943359375, + "step": 992 + }, + { + "epoch": 0.7524602573807722, + "grad_norm": 4.140659332275391, + "learning_rate": 2.862246132572837e-06, + "loss": 0.8285727500915527, + "step": 994 + }, + { + "epoch": 0.7539742619227857, + "grad_norm": 16.98643684387207, + "learning_rate": 2.8615014362445708e-06, + "loss": 0.8691332340240479, + "step": 996 + }, + { + "epoch": 0.7554882664647994, + "grad_norm": 2.471562623977661, + "learning_rate": 2.860754841247469e-06, + "loss": 1.109788417816162, + "step": 998 + }, + { + "epoch": 0.757002271006813, + "grad_norm": 2.814229965209961, + "learning_rate": 2.860006348751598e-06, + "loss": 0.5025942921638489, + "step": 1000 + }, + { + "epoch": 0.7585162755488266, + "grad_norm": 2.1887619495391846, + "learning_rate": 2.8592559599299976e-06, + "loss": 0.5041999816894531, + "step": 1002 + }, + { + "epoch": 0.7600302800908403, + "grad_norm": 1.2107658386230469, + "learning_rate": 2.8585036759586796e-06, + "loss": 0.6934733986854553, + "step": 1004 + }, + { + "epoch": 0.761544284632854, + "grad_norm": 0.8850097060203552, + "learning_rate": 2.857749498016625e-06, + "loss": 0.6282389163970947, + "step": 1006 + }, + { + "epoch": 0.7630582891748675, + "grad_norm": 2.179121732711792, + "learning_rate": 2.856993427285784e-06, + "loss": 0.5650244951248169, + "step": 1008 + }, + { + "epoch": 0.7645722937168812, + "grad_norm": 2.607090473175049, + "learning_rate": 2.856235464951074e-06, + "loss": 0.40634971857070923, + "step": 1010 + }, + { + "epoch": 0.7660862982588947, + "grad_norm": 4.742982864379883, + "learning_rate": 2.8554756122003755e-06, + "loss": 0.8095629215240479, + "step": 1012 + }, + { + "epoch": 0.7676003028009084, + "grad_norm": 2.4433038234710693, + "learning_rate": 2.8547138702245316e-06, + "loss": 1.2153220176696777, + "step": 1014 + }, + { + "epoch": 0.7691143073429221, + "grad_norm": 1.80650794506073, + "learning_rate": 2.853950240217347e-06, + "loss": 0.540834367275238, + "step": 1016 + }, + { + "epoch": 0.7706283118849356, + "grad_norm": 4.6063971519470215, + "learning_rate": 2.8531847233755852e-06, + "loss": 1.183655858039856, + "step": 1018 + }, + { + "epoch": 0.7721423164269493, + "grad_norm": 1.1085045337677002, + "learning_rate": 2.852417320898966e-06, + "loss": 1.1908199787139893, + "step": 1020 + }, + { + "epoch": 0.7736563209689629, + "grad_norm": 2.1212046146392822, + "learning_rate": 2.8516480339901663e-06, + "loss": 0.6177248358726501, + "step": 1022 + }, + { + "epoch": 0.7751703255109765, + "grad_norm": 1.71256422996521, + "learning_rate": 2.8508768638548135e-06, + "loss": 1.1025025844573975, + "step": 1024 + }, + { + "epoch": 0.7766843300529902, + "grad_norm": 4.706640243530273, + "learning_rate": 2.850103811701488e-06, + "loss": 0.5664300322532654, + "step": 1026 + }, + { + "epoch": 0.7781983345950038, + "grad_norm": 2.5584557056427, + "learning_rate": 2.849328878741721e-06, + "loss": 0.8416152596473694, + "step": 1028 + }, + { + "epoch": 0.7797123391370174, + "grad_norm": 3.9955708980560303, + "learning_rate": 2.8485520661899875e-06, + "loss": 0.9651318788528442, + "step": 1030 + }, + { + "epoch": 0.781226343679031, + "grad_norm": 1.443227767944336, + "learning_rate": 2.8477733752637124e-06, + "loss": 0.47583842277526855, + "step": 1032 + }, + { + "epoch": 0.7827403482210447, + "grad_norm": 2.5279667377471924, + "learning_rate": 2.8469928071832622e-06, + "loss": 0.5478159785270691, + "step": 1034 + }, + { + "epoch": 0.7842543527630583, + "grad_norm": 1.6589131355285645, + "learning_rate": 2.846210363171945e-06, + "loss": 0.9488869905471802, + "step": 1036 + }, + { + "epoch": 0.7857683573050719, + "grad_norm": 1.463948130607605, + "learning_rate": 2.8454260444560107e-06, + "loss": 0.7418044209480286, + "step": 1038 + }, + { + "epoch": 0.7872823618470856, + "grad_norm": 2.582495927810669, + "learning_rate": 2.8446398522646444e-06, + "loss": 0.37741395831108093, + "step": 1040 + }, + { + "epoch": 0.7887963663890991, + "grad_norm": 2.034688711166382, + "learning_rate": 2.8438517878299704e-06, + "loss": 0.8056525588035583, + "step": 1042 + }, + { + "epoch": 0.7903103709311128, + "grad_norm": 3.680302381515503, + "learning_rate": 2.8430618523870454e-06, + "loss": 1.058135986328125, + "step": 1044 + }, + { + "epoch": 0.7918243754731265, + "grad_norm": 1.6540333032608032, + "learning_rate": 2.842270047173858e-06, + "loss": 1.2269434928894043, + "step": 1046 + }, + { + "epoch": 0.79333838001514, + "grad_norm": 1.3814526796340942, + "learning_rate": 2.841476373431328e-06, + "loss": 1.046462893486023, + "step": 1048 + }, + { + "epoch": 0.7948523845571537, + "grad_norm": 1.3701473474502563, + "learning_rate": 2.8406808324033043e-06, + "loss": 1.245253324508667, + "step": 1050 + }, + { + "epoch": 0.7963663890991673, + "grad_norm": 1.2616037130355835, + "learning_rate": 2.83988342533656e-06, + "loss": 0.8982032537460327, + "step": 1052 + }, + { + "epoch": 0.7978803936411809, + "grad_norm": 1.299059271812439, + "learning_rate": 2.839084153480795e-06, + "loss": 1.2602475881576538, + "step": 1054 + }, + { + "epoch": 0.7993943981831946, + "grad_norm": 2.242746591567993, + "learning_rate": 2.83828301808863e-06, + "loss": 0.4404146075248718, + "step": 1056 + }, + { + "epoch": 0.8009084027252081, + "grad_norm": 2.1068410873413086, + "learning_rate": 2.837480020415607e-06, + "loss": 1.0584217309951782, + "step": 1058 + }, + { + "epoch": 0.8024224072672218, + "grad_norm": 1.2306238412857056, + "learning_rate": 2.836675161720187e-06, + "loss": 1.2572702169418335, + "step": 1060 + }, + { + "epoch": 0.8039364118092355, + "grad_norm": 7.1440749168396, + "learning_rate": 2.8358684432637464e-06, + "loss": 1.242936372756958, + "step": 1062 + }, + { + "epoch": 0.805450416351249, + "grad_norm": 1.6501164436340332, + "learning_rate": 2.8350598663105774e-06, + "loss": 0.5116027593612671, + "step": 1064 + }, + { + "epoch": 0.8069644208932627, + "grad_norm": 1.3650797605514526, + "learning_rate": 2.834249432127884e-06, + "loss": 0.9526708722114563, + "step": 1066 + }, + { + "epoch": 0.8084784254352763, + "grad_norm": 5.26108455657959, + "learning_rate": 2.833437141985781e-06, + "loss": 0.6141349077224731, + "step": 1068 + }, + { + "epoch": 0.8099924299772899, + "grad_norm": 3.335373640060425, + "learning_rate": 2.832622997157292e-06, + "loss": 0.8614574670791626, + "step": 1070 + }, + { + "epoch": 0.8115064345193036, + "grad_norm": 6.905012607574463, + "learning_rate": 2.831806998918348e-06, + "loss": 1.2377727031707764, + "step": 1072 + }, + { + "epoch": 0.8130204390613172, + "grad_norm": 2.2727155685424805, + "learning_rate": 2.8309891485477835e-06, + "loss": 0.6964701414108276, + "step": 1074 + }, + { + "epoch": 0.8145344436033308, + "grad_norm": 2.301574945449829, + "learning_rate": 2.830169447327336e-06, + "loss": 0.5446033477783203, + "step": 1076 + }, + { + "epoch": 0.8160484481453444, + "grad_norm": 2.7499730587005615, + "learning_rate": 2.8293478965416444e-06, + "loss": 0.8786720633506775, + "step": 1078 + }, + { + "epoch": 0.817562452687358, + "grad_norm": 3.3307502269744873, + "learning_rate": 2.8285244974782453e-06, + "loss": 0.8752889633178711, + "step": 1080 + }, + { + "epoch": 0.8190764572293717, + "grad_norm": 1.018991470336914, + "learning_rate": 2.827699251427572e-06, + "loss": 0.9326798319816589, + "step": 1082 + }, + { + "epoch": 0.8205904617713853, + "grad_norm": 2.248389959335327, + "learning_rate": 2.8268721596829532e-06, + "loss": 1.2054373025894165, + "step": 1084 + }, + { + "epoch": 0.822104466313399, + "grad_norm": 3.472452163696289, + "learning_rate": 2.8260432235406094e-06, + "loss": 0.728631317615509, + "step": 1086 + }, + { + "epoch": 0.8236184708554126, + "grad_norm": 3.7358202934265137, + "learning_rate": 2.825212444299652e-06, + "loss": 0.44140198826789856, + "step": 1088 + }, + { + "epoch": 0.8251324753974262, + "grad_norm": 1.1387553215026855, + "learning_rate": 2.8243798232620807e-06, + "loss": 0.9339531064033508, + "step": 1090 + }, + { + "epoch": 0.8266464799394398, + "grad_norm": 1.736079454421997, + "learning_rate": 2.823545361732782e-06, + "loss": 0.5052375793457031, + "step": 1092 + }, + { + "epoch": 0.8281604844814534, + "grad_norm": 5.955495834350586, + "learning_rate": 2.8227090610195265e-06, + "loss": 0.7765507698059082, + "step": 1094 + }, + { + "epoch": 0.8296744890234671, + "grad_norm": 3.198878049850464, + "learning_rate": 2.821870922432967e-06, + "loss": 0.8172941207885742, + "step": 1096 + }, + { + "epoch": 0.8311884935654807, + "grad_norm": 3.3747644424438477, + "learning_rate": 2.8210309472866375e-06, + "loss": 0.865540623664856, + "step": 1098 + }, + { + "epoch": 0.8327024981074943, + "grad_norm": 1.676251769065857, + "learning_rate": 2.820189136896949e-06, + "loss": 1.1563292741775513, + "step": 1100 + }, + { + "epoch": 0.834216502649508, + "grad_norm": 3.305302619934082, + "learning_rate": 2.8193454925831903e-06, + "loss": 0.5128759741783142, + "step": 1102 + }, + { + "epoch": 0.8357305071915215, + "grad_norm": 1.7015798091888428, + "learning_rate": 2.818500015667523e-06, + "loss": 0.6986027956008911, + "step": 1104 + }, + { + "epoch": 0.8372445117335352, + "grad_norm": 2.9527363777160645, + "learning_rate": 2.8176527074749803e-06, + "loss": 0.882337212562561, + "step": 1106 + }, + { + "epoch": 0.8387585162755489, + "grad_norm": 11.77180004119873, + "learning_rate": 2.816803569333467e-06, + "loss": 0.38584715127944946, + "step": 1108 + }, + { + "epoch": 0.8402725208175624, + "grad_norm": 1.3982672691345215, + "learning_rate": 2.815952602573755e-06, + "loss": 0.8777285814285278, + "step": 1110 + }, + { + "epoch": 0.8417865253595761, + "grad_norm": 1.108565092086792, + "learning_rate": 2.815099808529482e-06, + "loss": 1.4833998680114746, + "step": 1112 + }, + { + "epoch": 0.8433005299015897, + "grad_norm": 1.0646542310714722, + "learning_rate": 2.81424518853715e-06, + "loss": 0.8019735813140869, + "step": 1114 + }, + { + "epoch": 0.8448145344436033, + "grad_norm": 0.854870080947876, + "learning_rate": 2.8133887439361213e-06, + "loss": 0.8715571165084839, + "step": 1116 + }, + { + "epoch": 0.846328538985617, + "grad_norm": 11.543094635009766, + "learning_rate": 2.8125304760686196e-06, + "loss": 0.7676863670349121, + "step": 1118 + }, + { + "epoch": 0.8478425435276306, + "grad_norm": 1.7124183177947998, + "learning_rate": 2.811670386279724e-06, + "loss": 0.9376462697982788, + "step": 1120 + }, + { + "epoch": 0.8493565480696442, + "grad_norm": 1.7174268960952759, + "learning_rate": 2.8108084759173695e-06, + "loss": 1.0287573337554932, + "step": 1122 + }, + { + "epoch": 0.8508705526116578, + "grad_norm": 6.151965618133545, + "learning_rate": 2.809944746332346e-06, + "loss": 0.8226707577705383, + "step": 1124 + }, + { + "epoch": 0.8523845571536715, + "grad_norm": 4.121500492095947, + "learning_rate": 2.8090791988782928e-06, + "loss": 0.4312731921672821, + "step": 1126 + }, + { + "epoch": 0.8538985616956851, + "grad_norm": 1.6570124626159668, + "learning_rate": 2.808211834911698e-06, + "loss": 0.7892210483551025, + "step": 1128 + }, + { + "epoch": 0.8554125662376987, + "grad_norm": 1.5975044965744019, + "learning_rate": 2.8073426557918975e-06, + "loss": 0.8778940439224243, + "step": 1130 + }, + { + "epoch": 0.8569265707797123, + "grad_norm": 5.983201503753662, + "learning_rate": 2.806471662881072e-06, + "loss": 0.5457919239997864, + "step": 1132 + }, + { + "epoch": 0.858440575321726, + "grad_norm": 2.1791043281555176, + "learning_rate": 2.8055988575442435e-06, + "loss": 0.6023417711257935, + "step": 1134 + }, + { + "epoch": 0.8599545798637396, + "grad_norm": 3.206430435180664, + "learning_rate": 2.804724241149276e-06, + "loss": 0.6399402022361755, + "step": 1136 + }, + { + "epoch": 0.8614685844057532, + "grad_norm": 1.5075228214263916, + "learning_rate": 2.8038478150668704e-06, + "loss": 1.2906348705291748, + "step": 1138 + }, + { + "epoch": 0.8629825889477668, + "grad_norm": 3.7131893634796143, + "learning_rate": 2.8029695806705645e-06, + "loss": 0.80905681848526, + "step": 1140 + }, + { + "epoch": 0.8644965934897805, + "grad_norm": 2.9525985717773438, + "learning_rate": 2.8020895393367304e-06, + "loss": 0.4328504800796509, + "step": 1142 + }, + { + "epoch": 0.8660105980317941, + "grad_norm": 1.725476861000061, + "learning_rate": 2.8012076924445715e-06, + "loss": 1.2165340185165405, + "step": 1144 + }, + { + "epoch": 0.8675246025738077, + "grad_norm": 1.2453216314315796, + "learning_rate": 2.8003240413761203e-06, + "loss": 0.7229538559913635, + "step": 1146 + }, + { + "epoch": 0.8690386071158214, + "grad_norm": 3.2585346698760986, + "learning_rate": 2.7994385875162384e-06, + "loss": 0.7827056646347046, + "step": 1148 + }, + { + "epoch": 0.8705526116578349, + "grad_norm": 0.9169425964355469, + "learning_rate": 2.7985513322526113e-06, + "loss": 1.1099709272384644, + "step": 1150 + }, + { + "epoch": 0.8720666161998486, + "grad_norm": 2.0088422298431396, + "learning_rate": 2.797662276975749e-06, + "loss": 0.7103772163391113, + "step": 1152 + }, + { + "epoch": 0.8735806207418623, + "grad_norm": 2.144426107406616, + "learning_rate": 2.7967714230789814e-06, + "loss": 0.8265763521194458, + "step": 1154 + }, + { + "epoch": 0.8750946252838758, + "grad_norm": 0.7836669087409973, + "learning_rate": 2.7958787719584563e-06, + "loss": 0.8146635890007019, + "step": 1156 + }, + { + "epoch": 0.8766086298258895, + "grad_norm": 1.5560650825500488, + "learning_rate": 2.794984325013141e-06, + "loss": 0.4608868956565857, + "step": 1158 + }, + { + "epoch": 0.878122634367903, + "grad_norm": 1.347021222114563, + "learning_rate": 2.7940880836448146e-06, + "loss": 0.8238049745559692, + "step": 1160 + }, + { + "epoch": 0.8796366389099167, + "grad_norm": 2.6864876747131348, + "learning_rate": 2.7931900492580693e-06, + "loss": 1.1887385845184326, + "step": 1162 + }, + { + "epoch": 0.8811506434519304, + "grad_norm": 1.0439181327819824, + "learning_rate": 2.7922902232603086e-06, + "loss": 0.29744046926498413, + "step": 1164 + }, + { + "epoch": 0.882664647993944, + "grad_norm": 5.212288856506348, + "learning_rate": 2.7913886070617414e-06, + "loss": 0.813435435295105, + "step": 1166 + }, + { + "epoch": 0.8841786525359576, + "grad_norm": 2.2403671741485596, + "learning_rate": 2.7904852020753835e-06, + "loss": 0.8788347244262695, + "step": 1168 + }, + { + "epoch": 0.8856926570779712, + "grad_norm": 1.2678157091140747, + "learning_rate": 2.789580009717054e-06, + "loss": 0.8745746612548828, + "step": 1170 + }, + { + "epoch": 0.8872066616199848, + "grad_norm": 1.6722967624664307, + "learning_rate": 2.788673031405374e-06, + "loss": 0.5342118740081787, + "step": 1172 + }, + { + "epoch": 0.8887206661619985, + "grad_norm": 1.7086647748947144, + "learning_rate": 2.787764268561762e-06, + "loss": 0.730110228061676, + "step": 1174 + }, + { + "epoch": 0.8902346707040121, + "grad_norm": 1.1516423225402832, + "learning_rate": 2.7868537226104346e-06, + "loss": 0.794441819190979, + "step": 1176 + }, + { + "epoch": 0.8917486752460257, + "grad_norm": 11.606199264526367, + "learning_rate": 2.7859413949784013e-06, + "loss": 0.47964078187942505, + "step": 1178 + }, + { + "epoch": 0.8932626797880394, + "grad_norm": 1.1536232233047485, + "learning_rate": 2.7850272870954657e-06, + "loss": 1.2453587055206299, + "step": 1180 + }, + { + "epoch": 0.894776684330053, + "grad_norm": 1.559417963027954, + "learning_rate": 2.78411140039422e-06, + "loss": 1.136699914932251, + "step": 1182 + }, + { + "epoch": 0.8962906888720666, + "grad_norm": 1.796173095703125, + "learning_rate": 2.783193736310045e-06, + "loss": 0.7995592355728149, + "step": 1184 + }, + { + "epoch": 0.8978046934140802, + "grad_norm": 1.4185960292816162, + "learning_rate": 2.782274296281107e-06, + "loss": 1.2520816326141357, + "step": 1186 + }, + { + "epoch": 0.8993186979560939, + "grad_norm": 1.3419097661972046, + "learning_rate": 2.7813530817483537e-06, + "loss": 1.2061359882354736, + "step": 1188 + }, + { + "epoch": 0.9008327024981075, + "grad_norm": 4.111332416534424, + "learning_rate": 2.780430094155517e-06, + "loss": 0.9155566692352295, + "step": 1190 + }, + { + "epoch": 0.9023467070401211, + "grad_norm": 0.8896114826202393, + "learning_rate": 2.7795053349491062e-06, + "loss": 0.2986781895160675, + "step": 1192 + }, + { + "epoch": 0.9038607115821348, + "grad_norm": 2.29109263420105, + "learning_rate": 2.778578805578406e-06, + "loss": 0.8259222507476807, + "step": 1194 + }, + { + "epoch": 0.9053747161241483, + "grad_norm": 1.6004855632781982, + "learning_rate": 2.7776505074954756e-06, + "loss": 0.901458740234375, + "step": 1196 + }, + { + "epoch": 0.906888720666162, + "grad_norm": 1.1440821886062622, + "learning_rate": 2.776720442155148e-06, + "loss": 0.5986334681510925, + "step": 1198 + }, + { + "epoch": 0.9084027252081757, + "grad_norm": 6.9205780029296875, + "learning_rate": 2.7757886110150234e-06, + "loss": 1.014857292175293, + "step": 1200 + }, + { + "epoch": 0.9099167297501892, + "grad_norm": 1.279496192932129, + "learning_rate": 2.774855015535471e-06, + "loss": 1.0329720973968506, + "step": 1202 + }, + { + "epoch": 0.9114307342922029, + "grad_norm": 1.1504024267196655, + "learning_rate": 2.7739196571796242e-06, + "loss": 1.2614200115203857, + "step": 1204 + }, + { + "epoch": 0.9129447388342165, + "grad_norm": 1.451379418373108, + "learning_rate": 2.7729825374133805e-06, + "loss": 0.7176874876022339, + "step": 1206 + }, + { + "epoch": 0.9144587433762301, + "grad_norm": 1.406685709953308, + "learning_rate": 2.7720436577053957e-06, + "loss": 0.9029539823532104, + "step": 1208 + }, + { + "epoch": 0.9159727479182438, + "grad_norm": 5.806619167327881, + "learning_rate": 2.7711030195270854e-06, + "loss": 0.7357524633407593, + "step": 1210 + }, + { + "epoch": 0.9174867524602573, + "grad_norm": 1.5039846897125244, + "learning_rate": 2.770160624352621e-06, + "loss": 0.8037659525871277, + "step": 1212 + }, + { + "epoch": 0.919000757002271, + "grad_norm": 1.2760533094406128, + "learning_rate": 2.7692164736589273e-06, + "loss": 0.8593114614486694, + "step": 1214 + }, + { + "epoch": 0.9205147615442847, + "grad_norm": 1.9237477779388428, + "learning_rate": 2.7682705689256797e-06, + "loss": 1.420215368270874, + "step": 1216 + }, + { + "epoch": 0.9220287660862982, + "grad_norm": 2.0388343334198, + "learning_rate": 2.767322911635303e-06, + "loss": 0.4175781011581421, + "step": 1218 + }, + { + "epoch": 0.9235427706283119, + "grad_norm": 2.2387585639953613, + "learning_rate": 2.7663735032729687e-06, + "loss": 0.9758048057556152, + "step": 1220 + }, + { + "epoch": 0.9250567751703255, + "grad_norm": 4.407744407653809, + "learning_rate": 2.7654223453265933e-06, + "loss": 0.9266854524612427, + "step": 1222 + }, + { + "epoch": 0.9265707797123391, + "grad_norm": 1.4438966512680054, + "learning_rate": 2.7644694392868335e-06, + "loss": 0.6441771984100342, + "step": 1224 + }, + { + "epoch": 0.9280847842543528, + "grad_norm": 45.29044723510742, + "learning_rate": 2.7635147866470874e-06, + "loss": 0.19651535153388977, + "step": 1226 + }, + { + "epoch": 0.9295987887963664, + "grad_norm": 1.3128368854522705, + "learning_rate": 2.7625583889034892e-06, + "loss": 0.8569998145103455, + "step": 1228 + }, + { + "epoch": 0.93111279333838, + "grad_norm": 2.554664373397827, + "learning_rate": 2.7616002475549083e-06, + "loss": 0.4308600127696991, + "step": 1230 + }, + { + "epoch": 0.9326267978803936, + "grad_norm": 2.3758127689361572, + "learning_rate": 2.7606403641029477e-06, + "loss": 1.1944650411605835, + "step": 1232 + }, + { + "epoch": 0.9341408024224073, + "grad_norm": 2.3627445697784424, + "learning_rate": 2.7596787400519383e-06, + "loss": 1.236472487449646, + "step": 1234 + }, + { + "epoch": 0.9356548069644209, + "grad_norm": 3.527974843978882, + "learning_rate": 2.7587153769089416e-06, + "loss": 0.7329102754592896, + "step": 1236 + }, + { + "epoch": 0.9371688115064345, + "grad_norm": 3.3482489585876465, + "learning_rate": 2.757750276183743e-06, + "loss": 1.3514375686645508, + "step": 1238 + }, + { + "epoch": 0.9386828160484482, + "grad_norm": 2.0988523960113525, + "learning_rate": 2.756783439388851e-06, + "loss": 0.9067206978797913, + "step": 1240 + }, + { + "epoch": 0.9401968205904617, + "grad_norm": 1.1335437297821045, + "learning_rate": 2.7558148680394947e-06, + "loss": 0.4036295711994171, + "step": 1242 + }, + { + "epoch": 0.9417108251324754, + "grad_norm": 1.8524996042251587, + "learning_rate": 2.7548445636536234e-06, + "loss": 0.6532028317451477, + "step": 1244 + }, + { + "epoch": 0.9432248296744891, + "grad_norm": 2.2678205966949463, + "learning_rate": 2.753872527751901e-06, + "loss": 1.0668600797653198, + "step": 1246 + }, + { + "epoch": 0.9447388342165026, + "grad_norm": 1.7660356760025024, + "learning_rate": 2.7528987618577047e-06, + "loss": 1.1286046504974365, + "step": 1248 + }, + { + "epoch": 0.9462528387585163, + "grad_norm": 1.1056783199310303, + "learning_rate": 2.7519232674971233e-06, + "loss": 0.7087100148200989, + "step": 1250 + }, + { + "epoch": 0.9477668433005298, + "grad_norm": 2.968543767929077, + "learning_rate": 2.750946046198955e-06, + "loss": 0.8986905813217163, + "step": 1252 + }, + { + "epoch": 0.9492808478425435, + "grad_norm": 2.2909116744995117, + "learning_rate": 2.749967099494704e-06, + "loss": 0.8354098796844482, + "step": 1254 + }, + { + "epoch": 0.9507948523845572, + "grad_norm": 4.356247901916504, + "learning_rate": 2.7489864289185786e-06, + "loss": 0.8094468712806702, + "step": 1256 + }, + { + "epoch": 0.9523088569265707, + "grad_norm": 6.734137535095215, + "learning_rate": 2.7480040360074886e-06, + "loss": 0.9045544862747192, + "step": 1258 + }, + { + "epoch": 0.9538228614685844, + "grad_norm": 1.4713701009750366, + "learning_rate": 2.7470199223010446e-06, + "loss": 0.9489148259162903, + "step": 1260 + }, + { + "epoch": 0.9553368660105981, + "grad_norm": 2.812283754348755, + "learning_rate": 2.7460340893415503e-06, + "loss": 0.6814359426498413, + "step": 1262 + }, + { + "epoch": 0.9568508705526116, + "grad_norm": 1.9035266637802124, + "learning_rate": 2.7450465386740074e-06, + "loss": 0.8841050863265991, + "step": 1264 + }, + { + "epoch": 0.9583648750946253, + "grad_norm": 1.500565528869629, + "learning_rate": 2.744057271846109e-06, + "loss": 1.2327930927276611, + "step": 1266 + }, + { + "epoch": 0.9598788796366389, + "grad_norm": 1.3923603296279907, + "learning_rate": 2.743066290408236e-06, + "loss": 0.775505006313324, + "step": 1268 + }, + { + "epoch": 0.9613928841786525, + "grad_norm": 2.7446627616882324, + "learning_rate": 2.7420735959134585e-06, + "loss": 0.5238674879074097, + "step": 1270 + }, + { + "epoch": 0.9629068887206662, + "grad_norm": 1.355055570602417, + "learning_rate": 2.74107918991753e-06, + "loss": 1.2805811166763306, + "step": 1272 + }, + { + "epoch": 0.9644208932626798, + "grad_norm": 1.3523149490356445, + "learning_rate": 2.740083073978887e-06, + "loss": 0.5425784587860107, + "step": 1274 + }, + { + "epoch": 0.9659348978046934, + "grad_norm": 1.2970373630523682, + "learning_rate": 2.739085249658645e-06, + "loss": 0.7968797087669373, + "step": 1276 + }, + { + "epoch": 0.967448902346707, + "grad_norm": 1.0297974348068237, + "learning_rate": 2.738085718520598e-06, + "loss": 0.38737136125564575, + "step": 1278 + }, + { + "epoch": 0.9689629068887207, + "grad_norm": 1.3623170852661133, + "learning_rate": 2.7370844821312133e-06, + "loss": 0.713742733001709, + "step": 1280 + }, + { + "epoch": 0.9704769114307343, + "grad_norm": 1.12141752243042, + "learning_rate": 2.736081542059633e-06, + "loss": 1.2867194414138794, + "step": 1282 + }, + { + "epoch": 0.9719909159727479, + "grad_norm": 1.434508204460144, + "learning_rate": 2.7350768998776668e-06, + "loss": 1.1784629821777344, + "step": 1284 + }, + { + "epoch": 0.9735049205147616, + "grad_norm": 1.407881736755371, + "learning_rate": 2.7340705571597945e-06, + "loss": 0.9067625999450684, + "step": 1286 + }, + { + "epoch": 0.9750189250567751, + "grad_norm": 1.0897945165634155, + "learning_rate": 2.733062515483158e-06, + "loss": 0.6832272410392761, + "step": 1288 + }, + { + "epoch": 0.9765329295987888, + "grad_norm": 0.6153534650802612, + "learning_rate": 2.732052776427564e-06, + "loss": 0.8100548386573792, + "step": 1290 + }, + { + "epoch": 0.9780469341408025, + "grad_norm": 0.7920557260513306, + "learning_rate": 2.7310413415754785e-06, + "loss": 0.7890947461128235, + "step": 1292 + }, + { + "epoch": 0.979560938682816, + "grad_norm": 1.6804354190826416, + "learning_rate": 2.7300282125120255e-06, + "loss": 0.5669286251068115, + "step": 1294 + }, + { + "epoch": 0.9810749432248297, + "grad_norm": 2.5095572471618652, + "learning_rate": 2.7290133908249844e-06, + "loss": 0.9195560812950134, + "step": 1296 + }, + { + "epoch": 0.9825889477668432, + "grad_norm": 2.6254382133483887, + "learning_rate": 2.7279968781047867e-06, + "loss": 0.8261404037475586, + "step": 1298 + }, + { + "epoch": 0.9841029523088569, + "grad_norm": 1.6926733255386353, + "learning_rate": 2.726978675944514e-06, + "loss": 1.032439947128296, + "step": 1300 + }, + { + "epoch": 0.9856169568508706, + "grad_norm": 1.1733735799789429, + "learning_rate": 2.7259587859398954e-06, + "loss": 0.844183087348938, + "step": 1302 + }, + { + "epoch": 0.9871309613928841, + "grad_norm": 1.1385763883590698, + "learning_rate": 2.7249372096893075e-06, + "loss": 0.4663428068161011, + "step": 1304 + }, + { + "epoch": 0.9886449659348978, + "grad_norm": 1.6468255519866943, + "learning_rate": 2.723913948793766e-06, + "loss": 0.8818427920341492, + "step": 1306 + }, + { + "epoch": 0.9901589704769115, + "grad_norm": 9.980447769165039, + "learning_rate": 2.7228890048569294e-06, + "loss": 0.506644606590271, + "step": 1308 + }, + { + "epoch": 0.991672975018925, + "grad_norm": 3.1808173656463623, + "learning_rate": 2.721862379485094e-06, + "loss": 0.7562835216522217, + "step": 1310 + }, + { + "epoch": 0.9931869795609387, + "grad_norm": 0.9776075482368469, + "learning_rate": 2.720834074287188e-06, + "loss": 0.7175700664520264, + "step": 1312 + }, + { + "epoch": 0.9947009841029523, + "grad_norm": 1.1597323417663574, + "learning_rate": 2.719804090874776e-06, + "loss": 0.7824996709823608, + "step": 1314 + }, + { + "epoch": 0.9962149886449659, + "grad_norm": 0.8739152550697327, + "learning_rate": 2.7187724308620507e-06, + "loss": 0.8707235455513, + "step": 1316 + }, + { + "epoch": 0.9977289931869796, + "grad_norm": 1.5921571254730225, + "learning_rate": 2.7177390958658336e-06, + "loss": 0.9199660420417786, + "step": 1318 + }, + { + "epoch": 0.9992429977289932, + "grad_norm": 3.554126739501953, + "learning_rate": 2.71670408750557e-06, + "loss": 0.3907288610935211, + "step": 1320 + }, + { + "epoch": 1.0007570022710068, + "grad_norm": 4.065595626831055, + "learning_rate": 2.715667407403328e-06, + "loss": 0.6484695672988892, + "step": 1322 + }, + { + "epoch": 1.0022710068130205, + "grad_norm": 1.5117783546447754, + "learning_rate": 2.7146290571837965e-06, + "loss": 0.6858867406845093, + "step": 1324 + }, + { + "epoch": 1.0037850113550342, + "grad_norm": 4.27254581451416, + "learning_rate": 2.7135890384742804e-06, + "loss": 0.8471809029579163, + "step": 1326 + }, + { + "epoch": 1.0052990158970476, + "grad_norm": 2.3244190216064453, + "learning_rate": 2.712547352904701e-06, + "loss": 0.4096202552318573, + "step": 1328 + }, + { + "epoch": 1.0068130204390613, + "grad_norm": 1.1917668581008911, + "learning_rate": 2.7115040021075915e-06, + "loss": 1.1091688871383667, + "step": 1330 + }, + { + "epoch": 1.008327024981075, + "grad_norm": 1.4938757419586182, + "learning_rate": 2.710458987718094e-06, + "loss": 0.7808442711830139, + "step": 1332 + }, + { + "epoch": 1.0098410295230886, + "grad_norm": 2.786776542663574, + "learning_rate": 2.7094123113739576e-06, + "loss": 0.7021288275718689, + "step": 1334 + }, + { + "epoch": 1.0113550340651023, + "grad_norm": 2.1231276988983154, + "learning_rate": 2.7083639747155376e-06, + "loss": 0.40648818016052246, + "step": 1336 + }, + { + "epoch": 1.0128690386071157, + "grad_norm": 3.733562469482422, + "learning_rate": 2.7073139793857908e-06, + "loss": 0.5841293334960938, + "step": 1338 + }, + { + "epoch": 1.0143830431491294, + "grad_norm": 2.1196770668029785, + "learning_rate": 2.706262327030272e-06, + "loss": 0.6489790678024292, + "step": 1340 + }, + { + "epoch": 1.015897047691143, + "grad_norm": 1.146486759185791, + "learning_rate": 2.705209019297135e-06, + "loss": 0.4999706745147705, + "step": 1342 + }, + { + "epoch": 1.0174110522331568, + "grad_norm": 10.101701736450195, + "learning_rate": 2.7041540578371273e-06, + "loss": 0.8022160530090332, + "step": 1344 + }, + { + "epoch": 1.0189250567751704, + "grad_norm": 1.2306183576583862, + "learning_rate": 2.7030974443035864e-06, + "loss": 0.9291307926177979, + "step": 1346 + }, + { + "epoch": 1.0204390613171839, + "grad_norm": 1.7999662160873413, + "learning_rate": 2.7020391803524415e-06, + "loss": 0.6551251411437988, + "step": 1348 + }, + { + "epoch": 1.0219530658591975, + "grad_norm": 1.3833560943603516, + "learning_rate": 2.7009792676422067e-06, + "loss": 0.7547062039375305, + "step": 1350 + }, + { + "epoch": 1.0234670704012112, + "grad_norm": 0.5464763045310974, + "learning_rate": 2.6999177078339807e-06, + "loss": 0.608425498008728, + "step": 1352 + }, + { + "epoch": 1.0249810749432249, + "grad_norm": 6.251483917236328, + "learning_rate": 2.6988545025914437e-06, + "loss": 0.8611260652542114, + "step": 1354 + }, + { + "epoch": 1.0264950794852385, + "grad_norm": 2.0895566940307617, + "learning_rate": 2.697789653580853e-06, + "loss": 0.7628198862075806, + "step": 1356 + }, + { + "epoch": 1.028009084027252, + "grad_norm": 1.1401382684707642, + "learning_rate": 2.6967231624710447e-06, + "loss": 0.9044088125228882, + "step": 1358 + }, + { + "epoch": 1.0295230885692657, + "grad_norm": 2.221172571182251, + "learning_rate": 2.695655030933426e-06, + "loss": 1.0025997161865234, + "step": 1360 + }, + { + "epoch": 1.0310370931112793, + "grad_norm": 4.832877159118652, + "learning_rate": 2.694585260641977e-06, + "loss": 0.6454758644104004, + "step": 1362 + }, + { + "epoch": 1.032551097653293, + "grad_norm": 2.013058662414551, + "learning_rate": 2.6935138532732442e-06, + "loss": 0.7279521226882935, + "step": 1364 + }, + { + "epoch": 1.0340651021953067, + "grad_norm": 3.6711158752441406, + "learning_rate": 2.6924408105063405e-06, + "loss": 0.7960509657859802, + "step": 1366 + }, + { + "epoch": 1.0355791067373201, + "grad_norm": 0.5056184530258179, + "learning_rate": 2.6913661340229423e-06, + "loss": 0.4651731550693512, + "step": 1368 + }, + { + "epoch": 1.0370931112793338, + "grad_norm": 3.005361318588257, + "learning_rate": 2.6902898255072865e-06, + "loss": 0.6823665499687195, + "step": 1370 + }, + { + "epoch": 1.0386071158213475, + "grad_norm": 2.50605845451355, + "learning_rate": 2.6892118866461664e-06, + "loss": 0.3197304606437683, + "step": 1372 + }, + { + "epoch": 1.0401211203633611, + "grad_norm": 1.1979652643203735, + "learning_rate": 2.6881323191289305e-06, + "loss": 0.7594679594039917, + "step": 1374 + }, + { + "epoch": 1.0416351249053748, + "grad_norm": 9.729329109191895, + "learning_rate": 2.6870511246474815e-06, + "loss": 0.9013135433197021, + "step": 1376 + }, + { + "epoch": 1.0431491294473882, + "grad_norm": 1.1205658912658691, + "learning_rate": 2.685968304896271e-06, + "loss": 0.6933802366256714, + "step": 1378 + }, + { + "epoch": 1.044663133989402, + "grad_norm": 1.5449239015579224, + "learning_rate": 2.6848838615722962e-06, + "loss": 1.1561776399612427, + "step": 1380 + }, + { + "epoch": 1.0461771385314156, + "grad_norm": 1.6639689207077026, + "learning_rate": 2.683797796375101e-06, + "loss": 0.7652785181999207, + "step": 1382 + }, + { + "epoch": 1.0476911430734293, + "grad_norm": 1.3527640104293823, + "learning_rate": 2.68271011100677e-06, + "loss": 0.8150340914726257, + "step": 1384 + }, + { + "epoch": 1.049205147615443, + "grad_norm": 2.725649356842041, + "learning_rate": 2.6816208071719274e-06, + "loss": 1.0739437341690063, + "step": 1386 + }, + { + "epoch": 1.0507191521574564, + "grad_norm": 1.135989785194397, + "learning_rate": 2.680529886577733e-06, + "loss": 1.1149940490722656, + "step": 1388 + }, + { + "epoch": 1.05223315669947, + "grad_norm": 2.67694354057312, + "learning_rate": 2.6794373509338812e-06, + "loss": 0.7776299715042114, + "step": 1390 + }, + { + "epoch": 1.0537471612414837, + "grad_norm": 5.595714569091797, + "learning_rate": 2.6783432019525967e-06, + "loss": 0.49512526392936707, + "step": 1392 + }, + { + "epoch": 1.0552611657834974, + "grad_norm": 2.249765396118164, + "learning_rate": 2.6772474413486345e-06, + "loss": 0.767449140548706, + "step": 1394 + }, + { + "epoch": 1.056775170325511, + "grad_norm": 0.6284688115119934, + "learning_rate": 2.6761500708392727e-06, + "loss": 0.7256978750228882, + "step": 1396 + }, + { + "epoch": 1.0582891748675245, + "grad_norm": 1.9604918956756592, + "learning_rate": 2.675051092144315e-06, + "loss": 0.2871326506137848, + "step": 1398 + }, + { + "epoch": 1.0598031794095382, + "grad_norm": 1.2212276458740234, + "learning_rate": 2.6739505069860835e-06, + "loss": 0.8681665658950806, + "step": 1400 + }, + { + "epoch": 1.0613171839515518, + "grad_norm": 2.299312114715576, + "learning_rate": 2.672848317089419e-06, + "loss": 0.45025742053985596, + "step": 1402 + }, + { + "epoch": 1.0628311884935655, + "grad_norm": 2.737454652786255, + "learning_rate": 2.6717445241816766e-06, + "loss": 0.4308546483516693, + "step": 1404 + }, + { + "epoch": 1.0643451930355792, + "grad_norm": 1.5491366386413574, + "learning_rate": 2.670639129992724e-06, + "loss": 0.8330565094947815, + "step": 1406 + }, + { + "epoch": 1.0658591975775926, + "grad_norm": 2.574392795562744, + "learning_rate": 2.669532136254939e-06, + "loss": 0.7954061627388, + "step": 1408 + }, + { + "epoch": 1.0673732021196063, + "grad_norm": 5.209728240966797, + "learning_rate": 2.668423544703205e-06, + "loss": 1.1869783401489258, + "step": 1410 + }, + { + "epoch": 1.06888720666162, + "grad_norm": 4.869787693023682, + "learning_rate": 2.6673133570749107e-06, + "loss": 0.7794291973114014, + "step": 1412 + }, + { + "epoch": 1.0704012112036336, + "grad_norm": 2.1914610862731934, + "learning_rate": 2.666201575109945e-06, + "loss": 1.114492654800415, + "step": 1414 + }, + { + "epoch": 1.0719152157456473, + "grad_norm": 1.7754486799240112, + "learning_rate": 2.665088200550697e-06, + "loss": 0.8374594449996948, + "step": 1416 + }, + { + "epoch": 1.073429220287661, + "grad_norm": 12.656476974487305, + "learning_rate": 2.6639732351420494e-06, + "loss": 0.707738995552063, + "step": 1418 + }, + { + "epoch": 1.0749432248296744, + "grad_norm": 1.9860066175460815, + "learning_rate": 2.66285668063138e-06, + "loss": 0.7245526313781738, + "step": 1420 + }, + { + "epoch": 1.076457229371688, + "grad_norm": 2.6649181842803955, + "learning_rate": 2.6617385387685576e-06, + "loss": 0.4846969544887543, + "step": 1422 + }, + { + "epoch": 1.0779712339137018, + "grad_norm": 1.3532438278198242, + "learning_rate": 2.6606188113059366e-06, + "loss": 0.4601760506629944, + "step": 1424 + }, + { + "epoch": 1.0794852384557154, + "grad_norm": 2.258784055709839, + "learning_rate": 2.6594974999983575e-06, + "loss": 0.6497401595115662, + "step": 1426 + }, + { + "epoch": 1.080999242997729, + "grad_norm": 1.5624862909317017, + "learning_rate": 2.6583746066031428e-06, + "loss": 0.32594579458236694, + "step": 1428 + }, + { + "epoch": 1.0825132475397425, + "grad_norm": 1.3750901222229004, + "learning_rate": 2.6572501328800947e-06, + "loss": 0.743114173412323, + "step": 1430 + }, + { + "epoch": 1.0840272520817562, + "grad_norm": 1.798200249671936, + "learning_rate": 2.656124080591492e-06, + "loss": 0.44229811429977417, + "step": 1432 + }, + { + "epoch": 1.0855412566237699, + "grad_norm": 4.197088718414307, + "learning_rate": 2.654996451502087e-06, + "loss": 0.6145902872085571, + "step": 1434 + }, + { + "epoch": 1.0870552611657835, + "grad_norm": 1.6194286346435547, + "learning_rate": 2.653867247379104e-06, + "loss": 1.12995183467865, + "step": 1436 + }, + { + "epoch": 1.0885692657077972, + "grad_norm": 2.891066551208496, + "learning_rate": 2.6527364699922356e-06, + "loss": 0.7188288569450378, + "step": 1438 + }, + { + "epoch": 1.0900832702498107, + "grad_norm": 2.989654064178467, + "learning_rate": 2.651604121113639e-06, + "loss": 0.4282357096672058, + "step": 1440 + }, + { + "epoch": 1.0915972747918243, + "grad_norm": 1.4547679424285889, + "learning_rate": 2.6504702025179344e-06, + "loss": 0.7891103625297546, + "step": 1442 + }, + { + "epoch": 1.093111279333838, + "grad_norm": 1.7716169357299805, + "learning_rate": 2.6493347159822033e-06, + "loss": 0.33154594898223877, + "step": 1444 + }, + { + "epoch": 1.0946252838758517, + "grad_norm": 3.8357415199279785, + "learning_rate": 2.6481976632859837e-06, + "loss": 0.5994241833686829, + "step": 1446 + }, + { + "epoch": 1.0961392884178653, + "grad_norm": 1.1723169088363647, + "learning_rate": 2.647059046211268e-06, + "loss": 1.1812413930892944, + "step": 1448 + }, + { + "epoch": 1.0976532929598788, + "grad_norm": 3.084104061126709, + "learning_rate": 2.645918866542501e-06, + "loss": 0.8157893419265747, + "step": 1450 + }, + { + "epoch": 1.0991672975018925, + "grad_norm": 1.1051583290100098, + "learning_rate": 2.6447771260665746e-06, + "loss": 0.7416723966598511, + "step": 1452 + }, + { + "epoch": 1.1006813020439061, + "grad_norm": 0.9261996746063232, + "learning_rate": 2.643633826572829e-06, + "loss": 0.7483739256858826, + "step": 1454 + }, + { + "epoch": 1.1021953065859198, + "grad_norm": 4.7306413650512695, + "learning_rate": 2.6424889698530463e-06, + "loss": 0.7851324677467346, + "step": 1456 + }, + { + "epoch": 1.1037093111279335, + "grad_norm": 5.931928634643555, + "learning_rate": 2.64134255770145e-06, + "loss": 0.6997071504592896, + "step": 1458 + }, + { + "epoch": 1.105223315669947, + "grad_norm": 1.2409297227859497, + "learning_rate": 2.6401945919147e-06, + "loss": 0.7714464068412781, + "step": 1460 + }, + { + "epoch": 1.1067373202119606, + "grad_norm": 4.654412746429443, + "learning_rate": 2.6390450742918934e-06, + "loss": 0.8759744167327881, + "step": 1462 + }, + { + "epoch": 1.1082513247539743, + "grad_norm": 3.47519588470459, + "learning_rate": 2.6378940066345563e-06, + "loss": 0.7907311916351318, + "step": 1464 + }, + { + "epoch": 1.109765329295988, + "grad_norm": 2.070798873901367, + "learning_rate": 2.636741390746646e-06, + "loss": 0.8151794672012329, + "step": 1466 + }, + { + "epoch": 1.1112793338380016, + "grad_norm": 1.7536894083023071, + "learning_rate": 2.635587228434546e-06, + "loss": 0.4881594777107239, + "step": 1468 + }, + { + "epoch": 1.112793338380015, + "grad_norm": 1.508069634437561, + "learning_rate": 2.6344315215070623e-06, + "loss": 0.6714631915092468, + "step": 1470 + }, + { + "epoch": 1.1143073429220287, + "grad_norm": 1.018688440322876, + "learning_rate": 2.633274271775423e-06, + "loss": 0.6701086759567261, + "step": 1472 + }, + { + "epoch": 1.1158213474640424, + "grad_norm": 3.284794807434082, + "learning_rate": 2.6321154810532736e-06, + "loss": 1.1665421724319458, + "step": 1474 + }, + { + "epoch": 1.117335352006056, + "grad_norm": 2.2673730850219727, + "learning_rate": 2.6309551511566748e-06, + "loss": 0.7662602066993713, + "step": 1476 + }, + { + "epoch": 1.1188493565480697, + "grad_norm": 2.1317033767700195, + "learning_rate": 2.629793283904098e-06, + "loss": 0.7136352062225342, + "step": 1478 + }, + { + "epoch": 1.1203633610900834, + "grad_norm": 5.779655933380127, + "learning_rate": 2.628629881116427e-06, + "loss": 0.9151337146759033, + "step": 1480 + }, + { + "epoch": 1.1218773656320968, + "grad_norm": 0.9079190492630005, + "learning_rate": 2.6274649446169484e-06, + "loss": 0.2999037206172943, + "step": 1482 + }, + { + "epoch": 1.1233913701741105, + "grad_norm": 1.6747114658355713, + "learning_rate": 2.6262984762313558e-06, + "loss": 0.5830443501472473, + "step": 1484 + }, + { + "epoch": 1.1249053747161242, + "grad_norm": 1.9732108116149902, + "learning_rate": 2.625130477787741e-06, + "loss": 0.7363815307617188, + "step": 1486 + }, + { + "epoch": 1.1264193792581378, + "grad_norm": 2.125129461288452, + "learning_rate": 2.623960951116596e-06, + "loss": 0.4688108265399933, + "step": 1488 + }, + { + "epoch": 1.1279333838001513, + "grad_norm": 2.8928117752075195, + "learning_rate": 2.622789898050805e-06, + "loss": 0.49477702379226685, + "step": 1490 + }, + { + "epoch": 1.129447388342165, + "grad_norm": 2.086865186691284, + "learning_rate": 2.6216173204256478e-06, + "loss": 0.6846297979354858, + "step": 1492 + }, + { + "epoch": 1.1309613928841786, + "grad_norm": 2.331041097640991, + "learning_rate": 2.6204432200787906e-06, + "loss": 0.39177241921424866, + "step": 1494 + }, + { + "epoch": 1.1324753974261923, + "grad_norm": 1.9901928901672363, + "learning_rate": 2.6192675988502873e-06, + "loss": 0.5326924324035645, + "step": 1496 + }, + { + "epoch": 1.133989401968206, + "grad_norm": 3.5177462100982666, + "learning_rate": 2.6180904585825756e-06, + "loss": 0.5386614203453064, + "step": 1498 + }, + { + "epoch": 1.1355034065102196, + "grad_norm": 2.0425875186920166, + "learning_rate": 2.6169118011204733e-06, + "loss": 0.4488964080810547, + "step": 1500 + }, + { + "epoch": 1.137017411052233, + "grad_norm": 4.189729690551758, + "learning_rate": 2.615731628311175e-06, + "loss": 0.6858019232749939, + "step": 1502 + }, + { + "epoch": 1.1385314155942468, + "grad_norm": 3.267101287841797, + "learning_rate": 2.614549942004253e-06, + "loss": 0.7634918689727783, + "step": 1504 + }, + { + "epoch": 1.1400454201362604, + "grad_norm": 2.262455701828003, + "learning_rate": 2.613366744051648e-06, + "loss": 0.6932145357131958, + "step": 1506 + }, + { + "epoch": 1.141559424678274, + "grad_norm": 3.6853957176208496, + "learning_rate": 2.612182036307673e-06, + "loss": 0.7956565618515015, + "step": 1508 + }, + { + "epoch": 1.1430734292202875, + "grad_norm": 1.7257254123687744, + "learning_rate": 2.6109958206290047e-06, + "loss": 0.7833234667778015, + "step": 1510 + }, + { + "epoch": 1.1445874337623012, + "grad_norm": 1.2528127431869507, + "learning_rate": 2.6098080988746847e-06, + "loss": 0.8072184920310974, + "step": 1512 + }, + { + "epoch": 1.1461014383043149, + "grad_norm": 2.246197462081909, + "learning_rate": 2.6086188729061137e-06, + "loss": 0.3839539885520935, + "step": 1514 + }, + { + "epoch": 1.1476154428463285, + "grad_norm": 1.6792188882827759, + "learning_rate": 2.6074281445870504e-06, + "loss": 0.7050773501396179, + "step": 1516 + }, + { + "epoch": 1.1491294473883422, + "grad_norm": 1.5236520767211914, + "learning_rate": 2.6062359157836085e-06, + "loss": 1.004371166229248, + "step": 1518 + }, + { + "epoch": 1.1506434519303559, + "grad_norm": 1.5533944368362427, + "learning_rate": 2.6050421883642523e-06, + "loss": 0.9530134201049805, + "step": 1520 + }, + { + "epoch": 1.1521574564723693, + "grad_norm": 1.8872240781784058, + "learning_rate": 2.603846964199795e-06, + "loss": 0.569072425365448, + "step": 1522 + }, + { + "epoch": 1.153671461014383, + "grad_norm": 2.460739850997925, + "learning_rate": 2.602650245163396e-06, + "loss": 1.1149699687957764, + "step": 1524 + }, + { + "epoch": 1.1551854655563967, + "grad_norm": 3.2289810180664062, + "learning_rate": 2.601452033130556e-06, + "loss": 1.064550518989563, + "step": 1526 + }, + { + "epoch": 1.1566994700984103, + "grad_norm": 1.4160879850387573, + "learning_rate": 2.6002523299791185e-06, + "loss": 0.810400664806366, + "step": 1528 + }, + { + "epoch": 1.158213474640424, + "grad_norm": 1.8709641695022583, + "learning_rate": 2.599051137589261e-06, + "loss": 0.9655773043632507, + "step": 1530 + }, + { + "epoch": 1.1597274791824375, + "grad_norm": 1.6488513946533203, + "learning_rate": 2.5978484578434956e-06, + "loss": 0.40531933307647705, + "step": 1532 + }, + { + "epoch": 1.1612414837244511, + "grad_norm": 2.911818027496338, + "learning_rate": 2.5966442926266667e-06, + "loss": 0.411218523979187, + "step": 1534 + }, + { + "epoch": 1.1627554882664648, + "grad_norm": 3.381363868713379, + "learning_rate": 2.5954386438259446e-06, + "loss": 0.31722086668014526, + "step": 1536 + }, + { + "epoch": 1.1642694928084785, + "grad_norm": 2.443310499191284, + "learning_rate": 2.5942315133308264e-06, + "loss": 0.6441760659217834, + "step": 1538 + }, + { + "epoch": 1.1657834973504921, + "grad_norm": 2.732754707336426, + "learning_rate": 2.5930229030331323e-06, + "loss": 0.821896493434906, + "step": 1540 + }, + { + "epoch": 1.1672975018925056, + "grad_norm": 2.5103321075439453, + "learning_rate": 2.5918128148269977e-06, + "loss": 0.7254232168197632, + "step": 1542 + }, + { + "epoch": 1.1688115064345193, + "grad_norm": 1.630812168121338, + "learning_rate": 2.590601250608878e-06, + "loss": 0.7317939400672913, + "step": 1544 + }, + { + "epoch": 1.170325510976533, + "grad_norm": 2.458219289779663, + "learning_rate": 2.58938821227754e-06, + "loss": 0.9973943829536438, + "step": 1546 + }, + { + "epoch": 1.1718395155185466, + "grad_norm": 4.012199401855469, + "learning_rate": 2.588173701734061e-06, + "loss": 0.34447503089904785, + "step": 1548 + }, + { + "epoch": 1.1733535200605603, + "grad_norm": 4.598755836486816, + "learning_rate": 2.5869577208818264e-06, + "loss": 0.7477306127548218, + "step": 1550 + }, + { + "epoch": 1.1748675246025737, + "grad_norm": 1.816725730895996, + "learning_rate": 2.585740271626525e-06, + "loss": 0.7996265292167664, + "step": 1552 + }, + { + "epoch": 1.1763815291445874, + "grad_norm": 1.624489188194275, + "learning_rate": 2.5845213558761464e-06, + "loss": 0.5608413219451904, + "step": 1554 + }, + { + "epoch": 1.177895533686601, + "grad_norm": 0.9994196891784668, + "learning_rate": 2.5833009755409798e-06, + "loss": 0.7846572399139404, + "step": 1556 + }, + { + "epoch": 1.1794095382286147, + "grad_norm": 1.754160761833191, + "learning_rate": 2.582079132533609e-06, + "loss": 0.7681211233139038, + "step": 1558 + }, + { + "epoch": 1.1809235427706284, + "grad_norm": 1.2052630186080933, + "learning_rate": 2.58085582876891e-06, + "loss": 0.7920397520065308, + "step": 1560 + }, + { + "epoch": 1.182437547312642, + "grad_norm": 4.706214427947998, + "learning_rate": 2.579631066164048e-06, + "loss": 0.8161523938179016, + "step": 1562 + }, + { + "epoch": 1.1839515518546555, + "grad_norm": 2.612334966659546, + "learning_rate": 2.5784048466384754e-06, + "loss": 0.4959687292575836, + "step": 1564 + }, + { + "epoch": 1.1854655563966692, + "grad_norm": 5.091418743133545, + "learning_rate": 2.577177172113927e-06, + "loss": 0.36079561710357666, + "step": 1566 + }, + { + "epoch": 1.1869795609386828, + "grad_norm": 12.294486045837402, + "learning_rate": 2.5759480445144183e-06, + "loss": 0.4350496530532837, + "step": 1568 + }, + { + "epoch": 1.1884935654806965, + "grad_norm": 2.2385106086730957, + "learning_rate": 2.5747174657662415e-06, + "loss": 0.3525988459587097, + "step": 1570 + }, + { + "epoch": 1.19000757002271, + "grad_norm": 1.2902114391326904, + "learning_rate": 2.5734854377979643e-06, + "loss": 0.5721244215965271, + "step": 1572 + }, + { + "epoch": 1.1915215745647236, + "grad_norm": 1.8585532903671265, + "learning_rate": 2.572251962540424e-06, + "loss": 0.8713744878768921, + "step": 1574 + }, + { + "epoch": 1.1930355791067373, + "grad_norm": 2.197967529296875, + "learning_rate": 2.571017041926727e-06, + "loss": 0.3259797990322113, + "step": 1576 + }, + { + "epoch": 1.194549583648751, + "grad_norm": 2.8391945362091064, + "learning_rate": 2.5697806778922442e-06, + "loss": 0.4257240295410156, + "step": 1578 + }, + { + "epoch": 1.1960635881907646, + "grad_norm": 1.7602157592773438, + "learning_rate": 2.5685428723746106e-06, + "loss": 0.6905187964439392, + "step": 1580 + }, + { + "epoch": 1.1975775927327783, + "grad_norm": 3.1169686317443848, + "learning_rate": 2.567303627313718e-06, + "loss": 0.7006649374961853, + "step": 1582 + }, + { + "epoch": 1.1990915972747918, + "grad_norm": 2.1062004566192627, + "learning_rate": 2.566062944651715e-06, + "loss": 0.31509527564048767, + "step": 1584 + }, + { + "epoch": 1.2006056018168054, + "grad_norm": 3.239856481552124, + "learning_rate": 2.5648208263330033e-06, + "loss": 0.36794421076774597, + "step": 1586 + }, + { + "epoch": 1.202119606358819, + "grad_norm": 1.8824299573898315, + "learning_rate": 2.5635772743042354e-06, + "loss": 0.8412524461746216, + "step": 1588 + }, + { + "epoch": 1.2036336109008328, + "grad_norm": 3.809107780456543, + "learning_rate": 2.562332290514309e-06, + "loss": 0.5399434566497803, + "step": 1590 + }, + { + "epoch": 1.2051476154428462, + "grad_norm": 1.8606302738189697, + "learning_rate": 2.5610858769143673e-06, + "loss": 0.7604067325592041, + "step": 1592 + }, + { + "epoch": 1.2066616199848599, + "grad_norm": 1.2861109972000122, + "learning_rate": 2.5598380354577934e-06, + "loss": 1.0838377475738525, + "step": 1594 + }, + { + "epoch": 1.2081756245268735, + "grad_norm": 4.150733470916748, + "learning_rate": 2.5585887681002077e-06, + "loss": 0.9925789833068848, + "step": 1596 + }, + { + "epoch": 1.2096896290688872, + "grad_norm": 3.083434820175171, + "learning_rate": 2.5573380767994667e-06, + "loss": 0.6757802963256836, + "step": 1598 + }, + { + "epoch": 1.2112036336109009, + "grad_norm": 3.0484514236450195, + "learning_rate": 2.556085963515657e-06, + "loss": 0.8041543364524841, + "step": 1600 + }, + { + "epoch": 1.2127176381529146, + "grad_norm": 1.458834171295166, + "learning_rate": 2.554832430211095e-06, + "loss": 0.7150396108627319, + "step": 1602 + }, + { + "epoch": 1.214231642694928, + "grad_norm": 4.906940460205078, + "learning_rate": 2.5535774788503226e-06, + "loss": 0.6995714902877808, + "step": 1604 + }, + { + "epoch": 1.2157456472369417, + "grad_norm": 1.4084808826446533, + "learning_rate": 2.552321111400102e-06, + "loss": 1.1149778366088867, + "step": 1606 + }, + { + "epoch": 1.2172596517789553, + "grad_norm": 3.868961811065674, + "learning_rate": 2.551063329829417e-06, + "loss": 0.31372106075286865, + "step": 1608 + }, + { + "epoch": 1.218773656320969, + "grad_norm": 2.428809881210327, + "learning_rate": 2.5498041361094675e-06, + "loss": 0.5318719744682312, + "step": 1610 + }, + { + "epoch": 1.2202876608629827, + "grad_norm": 3.32373046875, + "learning_rate": 2.548543532213664e-06, + "loss": 0.6109141111373901, + "step": 1612 + }, + { + "epoch": 1.2218016654049961, + "grad_norm": 1.0111043453216553, + "learning_rate": 2.5472815201176305e-06, + "loss": 0.4362989068031311, + "step": 1614 + }, + { + "epoch": 1.2233156699470098, + "grad_norm": 2.8957338333129883, + "learning_rate": 2.5460181017991965e-06, + "loss": 0.40466615557670593, + "step": 1616 + }, + { + "epoch": 1.2248296744890235, + "grad_norm": 1.037187933921814, + "learning_rate": 2.5447532792383934e-06, + "loss": 1.1984484195709229, + "step": 1618 + }, + { + "epoch": 1.2263436790310371, + "grad_norm": 2.051070213317871, + "learning_rate": 2.5434870544174565e-06, + "loss": 0.9026281237602234, + "step": 1620 + }, + { + "epoch": 1.2278576835730508, + "grad_norm": 2.1154625415802, + "learning_rate": 2.542219429320816e-06, + "loss": 0.6213358640670776, + "step": 1622 + }, + { + "epoch": 1.2293716881150643, + "grad_norm": 1.5054359436035156, + "learning_rate": 2.5409504059350997e-06, + "loss": 0.6148259043693542, + "step": 1624 + }, + { + "epoch": 1.230885692657078, + "grad_norm": 2.0706820487976074, + "learning_rate": 2.5396799862491234e-06, + "loss": 0.8346166610717773, + "step": 1626 + }, + { + "epoch": 1.2323996971990916, + "grad_norm": 1.2801012992858887, + "learning_rate": 2.5384081722538944e-06, + "loss": 0.9002949595451355, + "step": 1628 + }, + { + "epoch": 1.2339137017411053, + "grad_norm": 2.350609064102173, + "learning_rate": 2.537134965942602e-06, + "loss": 0.7283930778503418, + "step": 1630 + }, + { + "epoch": 1.235427706283119, + "grad_norm": 2.341456413269043, + "learning_rate": 2.53586036931062e-06, + "loss": 0.7903137803077698, + "step": 1632 + }, + { + "epoch": 1.2369417108251324, + "grad_norm": 3.2067198753356934, + "learning_rate": 2.5345843843554997e-06, + "loss": 0.7844091653823853, + "step": 1634 + }, + { + "epoch": 1.238455715367146, + "grad_norm": 1.3830692768096924, + "learning_rate": 2.5333070130769693e-06, + "loss": 0.514484167098999, + "step": 1636 + }, + { + "epoch": 1.2399697199091597, + "grad_norm": 1.7025067806243896, + "learning_rate": 2.5320282574769286e-06, + "loss": 0.7913303375244141, + "step": 1638 + }, + { + "epoch": 1.2414837244511734, + "grad_norm": 0.5552706122398376, + "learning_rate": 2.530748119559447e-06, + "loss": 0.8302839994430542, + "step": 1640 + }, + { + "epoch": 1.242997728993187, + "grad_norm": 1.9868996143341064, + "learning_rate": 2.5294666013307625e-06, + "loss": 0.798913836479187, + "step": 1642 + }, + { + "epoch": 1.2445117335352007, + "grad_norm": 1.6438952684402466, + "learning_rate": 2.528183704799272e-06, + "loss": 0.42257753014564514, + "step": 1644 + }, + { + "epoch": 1.2460257380772142, + "grad_norm": 3.404130220413208, + "learning_rate": 2.5268994319755364e-06, + "loss": 0.49919620156288147, + "step": 1646 + }, + { + "epoch": 1.2475397426192278, + "grad_norm": 3.0103302001953125, + "learning_rate": 2.5256137848722716e-06, + "loss": 0.4483895003795624, + "step": 1648 + }, + { + "epoch": 1.2490537471612415, + "grad_norm": 1.1427053213119507, + "learning_rate": 2.5243267655043485e-06, + "loss": 0.7294577360153198, + "step": 1650 + }, + { + "epoch": 1.2505677517032552, + "grad_norm": 2.8634467124938965, + "learning_rate": 2.523038375888787e-06, + "loss": 0.7409688234329224, + "step": 1652 + }, + { + "epoch": 1.2520817562452686, + "grad_norm": 1.8294243812561035, + "learning_rate": 2.521748618044755e-06, + "loss": 0.7569249272346497, + "step": 1654 + }, + { + "epoch": 1.2535957607872823, + "grad_norm": 0.9663636684417725, + "learning_rate": 2.520457493993566e-06, + "loss": 0.7880640625953674, + "step": 1656 + }, + { + "epoch": 1.255109765329296, + "grad_norm": 2.7252726554870605, + "learning_rate": 2.519165005758674e-06, + "loss": 0.8395610451698303, + "step": 1658 + }, + { + "epoch": 1.2566237698713096, + "grad_norm": 5.612975597381592, + "learning_rate": 2.5178711553656694e-06, + "loss": 1.1597182750701904, + "step": 1660 + }, + { + "epoch": 1.2581377744133233, + "grad_norm": 0.812967836856842, + "learning_rate": 2.5165759448422783e-06, + "loss": 0.29927507042884827, + "step": 1662 + }, + { + "epoch": 1.259651778955337, + "grad_norm": 1.4541382789611816, + "learning_rate": 2.5152793762183605e-06, + "loss": 0.43437737226486206, + "step": 1664 + }, + { + "epoch": 1.2611657834973504, + "grad_norm": 6.267176628112793, + "learning_rate": 2.5139814515259005e-06, + "loss": 0.2858128547668457, + "step": 1666 + }, + { + "epoch": 1.262679788039364, + "grad_norm": 4.074245929718018, + "learning_rate": 2.5126821727990115e-06, + "loss": 1.1706475019454956, + "step": 1668 + }, + { + "epoch": 1.2641937925813778, + "grad_norm": 1.6763298511505127, + "learning_rate": 2.5113815420739265e-06, + "loss": 0.6928737759590149, + "step": 1670 + }, + { + "epoch": 1.2657077971233914, + "grad_norm": 1.9774268865585327, + "learning_rate": 2.5100795613889975e-06, + "loss": 1.1182012557983398, + "step": 1672 + }, + { + "epoch": 1.2672218016654049, + "grad_norm": 2.3122832775115967, + "learning_rate": 2.5087762327846932e-06, + "loss": 0.8243232369422913, + "step": 1674 + }, + { + "epoch": 1.2687358062074185, + "grad_norm": 3.401611328125, + "learning_rate": 2.5074715583035945e-06, + "loss": 0.5442602038383484, + "step": 1676 + }, + { + "epoch": 1.2702498107494322, + "grad_norm": 5.750372886657715, + "learning_rate": 2.506165539990391e-06, + "loss": 0.5971079468727112, + "step": 1678 + }, + { + "epoch": 1.2717638152914459, + "grad_norm": 1.3833726644515991, + "learning_rate": 2.5048581798918786e-06, + "loss": 0.7560960054397583, + "step": 1680 + }, + { + "epoch": 1.2732778198334596, + "grad_norm": 3.221604824066162, + "learning_rate": 2.5035494800569568e-06, + "loss": 0.35544896125793457, + "step": 1682 + }, + { + "epoch": 1.2747918243754732, + "grad_norm": 4.361257076263428, + "learning_rate": 2.5022394425366226e-06, + "loss": 0.32855644822120667, + "step": 1684 + }, + { + "epoch": 1.2763058289174867, + "grad_norm": 1.4156495332717896, + "learning_rate": 2.5009280693839733e-06, + "loss": 0.42612865567207336, + "step": 1686 + }, + { + "epoch": 1.2778198334595003, + "grad_norm": 2.2424139976501465, + "learning_rate": 2.4996153626541942e-06, + "loss": 0.4638764262199402, + "step": 1688 + }, + { + "epoch": 1.279333838001514, + "grad_norm": 1.8984419107437134, + "learning_rate": 2.498301324404565e-06, + "loss": 0.512053370475769, + "step": 1690 + }, + { + "epoch": 1.2808478425435277, + "grad_norm": 1.0238373279571533, + "learning_rate": 2.4969859566944504e-06, + "loss": 0.6156362891197205, + "step": 1692 + }, + { + "epoch": 1.2823618470855411, + "grad_norm": 2.871161937713623, + "learning_rate": 2.495669261585299e-06, + "loss": 0.5264954566955566, + "step": 1694 + }, + { + "epoch": 1.2838758516275548, + "grad_norm": 2.222243309020996, + "learning_rate": 2.494351241140639e-06, + "loss": 0.8876517415046692, + "step": 1696 + }, + { + "epoch": 1.2853898561695685, + "grad_norm": 2.5161256790161133, + "learning_rate": 2.493031897426076e-06, + "loss": 0.3502652049064636, + "step": 1698 + }, + { + "epoch": 1.2869038607115821, + "grad_norm": 3.8380353450775146, + "learning_rate": 2.4917112325092903e-06, + "loss": 0.3527962267398834, + "step": 1700 + }, + { + "epoch": 1.2884178652535958, + "grad_norm": 1.8813527822494507, + "learning_rate": 2.4903892484600315e-06, + "loss": 0.4331962764263153, + "step": 1702 + }, + { + "epoch": 1.2899318697956095, + "grad_norm": 4.55714225769043, + "learning_rate": 2.489065947350117e-06, + "loss": 0.3833101987838745, + "step": 1704 + }, + { + "epoch": 1.2914458743376231, + "grad_norm": 1.418331265449524, + "learning_rate": 2.4877413312534293e-06, + "loss": 0.3526386022567749, + "step": 1706 + }, + { + "epoch": 1.2929598788796366, + "grad_norm": 3.9424781799316406, + "learning_rate": 2.4864154022459104e-06, + "loss": 0.7249223589897156, + "step": 1708 + }, + { + "epoch": 1.2944738834216503, + "grad_norm": 1.072214126586914, + "learning_rate": 2.48508816240556e-06, + "loss": 0.8337989449501038, + "step": 1710 + }, + { + "epoch": 1.295987887963664, + "grad_norm": 2.0571506023406982, + "learning_rate": 2.4837596138124327e-06, + "loss": 0.6471192240715027, + "step": 1712 + }, + { + "epoch": 1.2975018925056774, + "grad_norm": 2.29038667678833, + "learning_rate": 2.4824297585486346e-06, + "loss": 0.6219044923782349, + "step": 1714 + }, + { + "epoch": 1.299015897047691, + "grad_norm": 6.321503162384033, + "learning_rate": 2.4810985986983177e-06, + "loss": 0.6852396726608276, + "step": 1716 + }, + { + "epoch": 1.3005299015897047, + "grad_norm": 2.3172402381896973, + "learning_rate": 2.479766136347682e-06, + "loss": 0.8772772550582886, + "step": 1718 + }, + { + "epoch": 1.3020439061317184, + "grad_norm": 9.401739120483398, + "learning_rate": 2.478432373584964e-06, + "loss": 0.26919516921043396, + "step": 1720 + }, + { + "epoch": 1.303557910673732, + "grad_norm": 8.985549926757812, + "learning_rate": 2.477097312500444e-06, + "loss": 0.7459433674812317, + "step": 1722 + }, + { + "epoch": 1.3050719152157457, + "grad_norm": 1.5415024757385254, + "learning_rate": 2.4757609551864307e-06, + "loss": 1.0596632957458496, + "step": 1724 + }, + { + "epoch": 1.3065859197577594, + "grad_norm": 2.774165391921997, + "learning_rate": 2.4744233037372697e-06, + "loss": 0.30680298805236816, + "step": 1726 + }, + { + "epoch": 1.3080999242997728, + "grad_norm": 11.78912353515625, + "learning_rate": 2.4730843602493315e-06, + "loss": 1.0973109006881714, + "step": 1728 + }, + { + "epoch": 1.3096139288417865, + "grad_norm": 6.063043594360352, + "learning_rate": 2.4717441268210127e-06, + "loss": 0.52330082654953, + "step": 1730 + }, + { + "epoch": 1.3111279333838002, + "grad_norm": 2.5771913528442383, + "learning_rate": 2.4704026055527315e-06, + "loss": 0.6835691332817078, + "step": 1732 + }, + { + "epoch": 1.3126419379258139, + "grad_norm": 1.0834791660308838, + "learning_rate": 2.469059798546924e-06, + "loss": 1.0945249795913696, + "step": 1734 + }, + { + "epoch": 1.3141559424678273, + "grad_norm": 1.7767627239227295, + "learning_rate": 2.4677157079080418e-06, + "loss": 1.3096091747283936, + "step": 1736 + }, + { + "epoch": 1.315669947009841, + "grad_norm": 1.4419454336166382, + "learning_rate": 2.4663703357425475e-06, + "loss": 1.1470904350280762, + "step": 1738 + }, + { + "epoch": 1.3171839515518546, + "grad_norm": 2.81369948387146, + "learning_rate": 2.4650236841589136e-06, + "loss": 0.8008354902267456, + "step": 1740 + }, + { + "epoch": 1.3186979560938683, + "grad_norm": 2.3629062175750732, + "learning_rate": 2.4636757552676146e-06, + "loss": 1.1127903461456299, + "step": 1742 + }, + { + "epoch": 1.320211960635882, + "grad_norm": 1.949653148651123, + "learning_rate": 2.4623265511811316e-06, + "loss": 0.6830950379371643, + "step": 1744 + }, + { + "epoch": 1.3217259651778956, + "grad_norm": 3.8533966541290283, + "learning_rate": 2.4609760740139393e-06, + "loss": 0.620603084564209, + "step": 1746 + }, + { + "epoch": 1.323239969719909, + "grad_norm": 2.704603910446167, + "learning_rate": 2.4596243258825107e-06, + "loss": 0.3389583230018616, + "step": 1748 + }, + { + "epoch": 1.3247539742619228, + "grad_norm": 1.073254942893982, + "learning_rate": 2.458271308905309e-06, + "loss": 0.816598653793335, + "step": 1750 + }, + { + "epoch": 1.3262679788039364, + "grad_norm": 3.498077630996704, + "learning_rate": 2.4569170252027877e-06, + "loss": 0.339092880487442, + "step": 1752 + }, + { + "epoch": 1.32778198334595, + "grad_norm": 1.5943024158477783, + "learning_rate": 2.4555614768973842e-06, + "loss": 0.3954799473285675, + "step": 1754 + }, + { + "epoch": 1.3292959878879635, + "grad_norm": 1.2842795848846436, + "learning_rate": 2.454204666113517e-06, + "loss": 0.2065241038799286, + "step": 1756 + }, + { + "epoch": 1.3308099924299772, + "grad_norm": 1.4345365762710571, + "learning_rate": 2.452846594977585e-06, + "loss": 0.5425804853439331, + "step": 1758 + }, + { + "epoch": 1.3323239969719909, + "grad_norm": 1.7627195119857788, + "learning_rate": 2.451487265617962e-06, + "loss": 0.9363067150115967, + "step": 1760 + }, + { + "epoch": 1.3338380015140046, + "grad_norm": 2.0413122177124023, + "learning_rate": 2.450126680164992e-06, + "loss": 0.535262405872345, + "step": 1762 + }, + { + "epoch": 1.3353520060560182, + "grad_norm": 1.342808723449707, + "learning_rate": 2.4487648407509897e-06, + "loss": 0.7674925327301025, + "step": 1764 + }, + { + "epoch": 1.336866010598032, + "grad_norm": 1.189123511314392, + "learning_rate": 2.447401749510234e-06, + "loss": 0.7066727876663208, + "step": 1766 + }, + { + "epoch": 1.3383800151400453, + "grad_norm": 1.8224396705627441, + "learning_rate": 2.446037408578965e-06, + "loss": 0.3402438461780548, + "step": 1768 + }, + { + "epoch": 1.339894019682059, + "grad_norm": 3.179891347885132, + "learning_rate": 2.444671820095383e-06, + "loss": 0.8113234639167786, + "step": 1770 + }, + { + "epoch": 1.3414080242240727, + "grad_norm": 2.4284238815307617, + "learning_rate": 2.443304986199642e-06, + "loss": 0.7537884712219238, + "step": 1772 + }, + { + "epoch": 1.3429220287660864, + "grad_norm": 1.0442854166030884, + "learning_rate": 2.4419369090338485e-06, + "loss": 0.5505790710449219, + "step": 1774 + }, + { + "epoch": 1.3444360333080998, + "grad_norm": 1.5223824977874756, + "learning_rate": 2.4405675907420575e-06, + "loss": 1.1271681785583496, + "step": 1776 + }, + { + "epoch": 1.3459500378501135, + "grad_norm": 1.4880656003952026, + "learning_rate": 2.439197033470269e-06, + "loss": 1.0472644567489624, + "step": 1778 + }, + { + "epoch": 1.3474640423921271, + "grad_norm": 1.9913803339004517, + "learning_rate": 2.437825239366424e-06, + "loss": 0.7882799506187439, + "step": 1780 + }, + { + "epoch": 1.3489780469341408, + "grad_norm": 3.984208345413208, + "learning_rate": 2.4364522105804026e-06, + "loss": 1.0910110473632812, + "step": 1782 + }, + { + "epoch": 1.3504920514761545, + "grad_norm": 3.6103146076202393, + "learning_rate": 2.4350779492640203e-06, + "loss": 0.34733736515045166, + "step": 1784 + }, + { + "epoch": 1.3520060560181681, + "grad_norm": 1.3475333452224731, + "learning_rate": 2.433702457571024e-06, + "loss": 0.5586898922920227, + "step": 1786 + }, + { + "epoch": 1.3535200605601818, + "grad_norm": 3.0186610221862793, + "learning_rate": 2.432325737657087e-06, + "loss": 0.44951939582824707, + "step": 1788 + }, + { + "epoch": 1.3550340651021953, + "grad_norm": 4.293313980102539, + "learning_rate": 2.430947791679811e-06, + "loss": 0.9010276794433594, + "step": 1790 + }, + { + "epoch": 1.356548069644209, + "grad_norm": 5.8307695388793945, + "learning_rate": 2.4295686217987165e-06, + "loss": 0.33603745698928833, + "step": 1792 + }, + { + "epoch": 1.3580620741862226, + "grad_norm": 0.6447650194168091, + "learning_rate": 2.4281882301752424e-06, + "loss": 0.5842790603637695, + "step": 1794 + }, + { + "epoch": 1.359576078728236, + "grad_norm": 4.703621864318848, + "learning_rate": 2.4268066189727427e-06, + "loss": 0.3971193730831146, + "step": 1796 + }, + { + "epoch": 1.3610900832702497, + "grad_norm": 2.2932517528533936, + "learning_rate": 2.4254237903564834e-06, + "loss": 1.0342694520950317, + "step": 1798 + }, + { + "epoch": 1.3626040878122634, + "grad_norm": 2.307915687561035, + "learning_rate": 2.424039746493638e-06, + "loss": 0.42861407995224, + "step": 1800 + }, + { + "epoch": 1.364118092354277, + "grad_norm": 2.2713558673858643, + "learning_rate": 2.4226544895532837e-06, + "loss": 0.7584144473075867, + "step": 1802 + }, + { + "epoch": 1.3656320968962907, + "grad_norm": 1.5200345516204834, + "learning_rate": 2.4212680217063996e-06, + "loss": 0.6773474216461182, + "step": 1804 + }, + { + "epoch": 1.3671461014383044, + "grad_norm": 1.5733002424240112, + "learning_rate": 2.4198803451258624e-06, + "loss": 0.26620084047317505, + "step": 1806 + }, + { + "epoch": 1.368660105980318, + "grad_norm": 3.7056772708892822, + "learning_rate": 2.418491461986444e-06, + "loss": 0.48931899666786194, + "step": 1808 + }, + { + "epoch": 1.3701741105223315, + "grad_norm": 1.6496108770370483, + "learning_rate": 2.4171013744648053e-06, + "loss": 0.7211140990257263, + "step": 1810 + }, + { + "epoch": 1.3716881150643452, + "grad_norm": 2.1490478515625, + "learning_rate": 2.4157100847394964e-06, + "loss": 0.8123199939727783, + "step": 1812 + }, + { + "epoch": 1.3732021196063589, + "grad_norm": 1.8352160453796387, + "learning_rate": 2.414317594990951e-06, + "loss": 0.31417787075042725, + "step": 1814 + }, + { + "epoch": 1.3747161241483725, + "grad_norm": 1.7687348127365112, + "learning_rate": 2.412923907401483e-06, + "loss": 1.1272692680358887, + "step": 1816 + }, + { + "epoch": 1.376230128690386, + "grad_norm": 1.0783947706222534, + "learning_rate": 2.411529024155284e-06, + "loss": 0.7188507318496704, + "step": 1818 + }, + { + "epoch": 1.3777441332323996, + "grad_norm": 1.2748942375183105, + "learning_rate": 2.41013294743842e-06, + "loss": 0.7735112309455872, + "step": 1820 + }, + { + "epoch": 1.3792581377744133, + "grad_norm": 3.49548077583313, + "learning_rate": 2.408735679438825e-06, + "loss": 0.1862514466047287, + "step": 1822 + }, + { + "epoch": 1.380772142316427, + "grad_norm": 1.2513797283172607, + "learning_rate": 2.4073372223463043e-06, + "loss": 0.8623074889183044, + "step": 1824 + }, + { + "epoch": 1.3822861468584406, + "grad_norm": 8.05462646484375, + "learning_rate": 2.405937578352523e-06, + "loss": 0.8720998764038086, + "step": 1826 + }, + { + "epoch": 1.3838001514004543, + "grad_norm": 1.3153167963027954, + "learning_rate": 2.404536749651007e-06, + "loss": 1.1153795719146729, + "step": 1828 + }, + { + "epoch": 1.3853141559424678, + "grad_norm": 1.2078781127929688, + "learning_rate": 2.40313473843714e-06, + "loss": 1.115010142326355, + "step": 1830 + }, + { + "epoch": 1.3868281604844814, + "grad_norm": 1.2034944295883179, + "learning_rate": 2.4017315469081583e-06, + "loss": 1.1347001791000366, + "step": 1832 + }, + { + "epoch": 1.388342165026495, + "grad_norm": 1.8428653478622437, + "learning_rate": 2.400327177263148e-06, + "loss": 0.6002182960510254, + "step": 1834 + }, + { + "epoch": 1.3898561695685088, + "grad_norm": 15.55579662322998, + "learning_rate": 2.3989216317030422e-06, + "loss": 0.7954713702201843, + "step": 1836 + }, + { + "epoch": 1.3913701741105222, + "grad_norm": 1.8133504390716553, + "learning_rate": 2.3975149124306153e-06, + "loss": 0.8916700482368469, + "step": 1838 + }, + { + "epoch": 1.3928841786525359, + "grad_norm": 1.8518298864364624, + "learning_rate": 2.396107021650482e-06, + "loss": 0.389128178358078, + "step": 1840 + }, + { + "epoch": 1.3943981831945496, + "grad_norm": 3.1076130867004395, + "learning_rate": 2.3946979615690946e-06, + "loss": 0.885290801525116, + "step": 1842 + }, + { + "epoch": 1.3959121877365632, + "grad_norm": 0.9769076108932495, + "learning_rate": 2.393287734394735e-06, + "loss": 0.788175642490387, + "step": 1844 + }, + { + "epoch": 1.397426192278577, + "grad_norm": 6.388150215148926, + "learning_rate": 2.3918763423375162e-06, + "loss": 0.7048123478889465, + "step": 1846 + }, + { + "epoch": 1.3989401968205906, + "grad_norm": 2.9132094383239746, + "learning_rate": 2.3904637876093765e-06, + "loss": 0.4340665936470032, + "step": 1848 + }, + { + "epoch": 1.400454201362604, + "grad_norm": 1.9239084720611572, + "learning_rate": 2.3890500724240754e-06, + "loss": 0.7962247133255005, + "step": 1850 + }, + { + "epoch": 1.4019682059046177, + "grad_norm": 66.01080322265625, + "learning_rate": 2.387635198997193e-06, + "loss": 0.37155330181121826, + "step": 1852 + }, + { + "epoch": 1.4034822104466314, + "grad_norm": 2.0512757301330566, + "learning_rate": 2.386219169546122e-06, + "loss": 1.1434550285339355, + "step": 1854 + }, + { + "epoch": 1.404996214988645, + "grad_norm": 2.588855266571045, + "learning_rate": 2.3848019862900684e-06, + "loss": 0.4890868067741394, + "step": 1856 + }, + { + "epoch": 1.4065102195306585, + "grad_norm": 1.6390639543533325, + "learning_rate": 2.383383651450047e-06, + "loss": 0.7672550082206726, + "step": 1858 + }, + { + "epoch": 1.4080242240726721, + "grad_norm": 3.1538286209106445, + "learning_rate": 2.3819641672488756e-06, + "loss": 0.6715327501296997, + "step": 1860 + }, + { + "epoch": 1.4095382286146858, + "grad_norm": 2.88683819770813, + "learning_rate": 2.3805435359111753e-06, + "loss": 0.5086562037467957, + "step": 1862 + }, + { + "epoch": 1.4110522331566995, + "grad_norm": 1.8502843379974365, + "learning_rate": 2.379121759663363e-06, + "loss": 0.8395087122917175, + "step": 1864 + }, + { + "epoch": 1.4125662376987131, + "grad_norm": 1.6283674240112305, + "learning_rate": 2.377698840733652e-06, + "loss": 0.7248956561088562, + "step": 1866 + }, + { + "epoch": 1.4140802422407268, + "grad_norm": 1.4117423295974731, + "learning_rate": 2.3762747813520437e-06, + "loss": 0.8159021139144897, + "step": 1868 + }, + { + "epoch": 1.4155942467827405, + "grad_norm": 0.9731475710868835, + "learning_rate": 2.3748495837503302e-06, + "loss": 1.085202693939209, + "step": 1870 + }, + { + "epoch": 1.417108251324754, + "grad_norm": 2.913374185562134, + "learning_rate": 2.3734232501620843e-06, + "loss": 1.0411560535430908, + "step": 1872 + }, + { + "epoch": 1.4186222558667676, + "grad_norm": 2.0996501445770264, + "learning_rate": 2.371995782822661e-06, + "loss": 0.32785797119140625, + "step": 1874 + }, + { + "epoch": 1.4201362604087813, + "grad_norm": 1.0450830459594727, + "learning_rate": 2.3705671839691915e-06, + "loss": 1.0494458675384521, + "step": 1876 + }, + { + "epoch": 1.4216502649507947, + "grad_norm": 4.396561145782471, + "learning_rate": 2.3691374558405806e-06, + "loss": 0.7100043296813965, + "step": 1878 + }, + { + "epoch": 1.4231642694928084, + "grad_norm": 1.1967666149139404, + "learning_rate": 2.3677066006775023e-06, + "loss": 0.7854270339012146, + "step": 1880 + }, + { + "epoch": 1.424678274034822, + "grad_norm": 2.0385992527008057, + "learning_rate": 2.3662746207223975e-06, + "loss": 1.0396957397460938, + "step": 1882 + }, + { + "epoch": 1.4261922785768357, + "grad_norm": 1.8718552589416504, + "learning_rate": 2.36484151821947e-06, + "loss": 0.3418787121772766, + "step": 1884 + }, + { + "epoch": 1.4277062831188494, + "grad_norm": 1.2310271263122559, + "learning_rate": 2.363407295414681e-06, + "loss": 0.3402215242385864, + "step": 1886 + }, + { + "epoch": 1.429220287660863, + "grad_norm": 4.057934761047363, + "learning_rate": 2.361971954555751e-06, + "loss": 0.6020709872245789, + "step": 1888 + }, + { + "epoch": 1.4307342922028767, + "grad_norm": 1.5722401142120361, + "learning_rate": 2.3605354978921497e-06, + "loss": 0.7824895977973938, + "step": 1890 + }, + { + "epoch": 1.4322482967448902, + "grad_norm": 1.1254881620407104, + "learning_rate": 2.359097927675097e-06, + "loss": 1.1387289762496948, + "step": 1892 + }, + { + "epoch": 1.4337623012869039, + "grad_norm": 1.4716461896896362, + "learning_rate": 2.3576592461575562e-06, + "loss": 0.718011736869812, + "step": 1894 + }, + { + "epoch": 1.4352763058289175, + "grad_norm": 1.1156413555145264, + "learning_rate": 2.356219455594234e-06, + "loss": 1.1016600131988525, + "step": 1896 + }, + { + "epoch": 1.4367903103709312, + "grad_norm": 1.3907214403152466, + "learning_rate": 2.3547785582415757e-06, + "loss": 0.6810643076896667, + "step": 1898 + }, + { + "epoch": 1.4383043149129446, + "grad_norm": 2.610874891281128, + "learning_rate": 2.353336556357759e-06, + "loss": 0.3342776596546173, + "step": 1900 + }, + { + "epoch": 1.4398183194549583, + "grad_norm": 1.8081077337265015, + "learning_rate": 2.351893452202694e-06, + "loss": 1.0931655168533325, + "step": 1902 + }, + { + "epoch": 1.441332323996972, + "grad_norm": 3.5058650970458984, + "learning_rate": 2.350449248038018e-06, + "loss": 0.7352507710456848, + "step": 1904 + }, + { + "epoch": 1.4428463285389856, + "grad_norm": 1.9327387809753418, + "learning_rate": 2.349003946127093e-06, + "loss": 0.6707648634910583, + "step": 1906 + }, + { + "epoch": 1.4443603330809993, + "grad_norm": 1.1505838632583618, + "learning_rate": 2.3475575487349996e-06, + "loss": 0.9618532061576843, + "step": 1908 + }, + { + "epoch": 1.445874337623013, + "grad_norm": 3.8157570362091064, + "learning_rate": 2.3461100581285374e-06, + "loss": 0.7543551921844482, + "step": 1910 + }, + { + "epoch": 1.4473883421650264, + "grad_norm": 1.3796827793121338, + "learning_rate": 2.344661476576217e-06, + "loss": 0.6786009669303894, + "step": 1912 + }, + { + "epoch": 1.44890234670704, + "grad_norm": 5.803293228149414, + "learning_rate": 2.343211806348261e-06, + "loss": 0.4491370916366577, + "step": 1914 + }, + { + "epoch": 1.4504163512490538, + "grad_norm": 2.445295810699463, + "learning_rate": 2.3417610497165965e-06, + "loss": 0.4134538173675537, + "step": 1916 + }, + { + "epoch": 1.4519303557910674, + "grad_norm": 19.466123580932617, + "learning_rate": 2.3403092089548533e-06, + "loss": 0.4438095688819885, + "step": 1918 + }, + { + "epoch": 1.4534443603330809, + "grad_norm": 2.4652929306030273, + "learning_rate": 2.3388562863383623e-06, + "loss": 0.6905573606491089, + "step": 1920 + }, + { + "epoch": 1.4549583648750946, + "grad_norm": 2.971496343612671, + "learning_rate": 2.3374022841441473e-06, + "loss": 0.7411724328994751, + "step": 1922 + }, + { + "epoch": 1.4564723694171082, + "grad_norm": 3.0368525981903076, + "learning_rate": 2.3359472046509254e-06, + "loss": 0.666789174079895, + "step": 1924 + }, + { + "epoch": 1.457986373959122, + "grad_norm": 0.707574725151062, + "learning_rate": 2.3344910501391012e-06, + "loss": 0.3020743429660797, + "step": 1926 + }, + { + "epoch": 1.4595003785011356, + "grad_norm": 1.5272055864334106, + "learning_rate": 2.3330338228907653e-06, + "loss": 0.7165997624397278, + "step": 1928 + }, + { + "epoch": 1.4610143830431492, + "grad_norm": 1.222330093383789, + "learning_rate": 2.3315755251896883e-06, + "loss": 0.7357701063156128, + "step": 1930 + }, + { + "epoch": 1.4625283875851627, + "grad_norm": 1.4718995094299316, + "learning_rate": 2.3301161593213196e-06, + "loss": 0.6662452816963196, + "step": 1932 + }, + { + "epoch": 1.4640423921271764, + "grad_norm": 2.3255410194396973, + "learning_rate": 2.328655727572781e-06, + "loss": 0.628060519695282, + "step": 1934 + }, + { + "epoch": 1.46555639666919, + "grad_norm": 2.189652681350708, + "learning_rate": 2.327194232232866e-06, + "loss": 0.3172926902770996, + "step": 1936 + }, + { + "epoch": 1.4670704012112037, + "grad_norm": 1.8337234258651733, + "learning_rate": 2.3257316755920356e-06, + "loss": 1.149836540222168, + "step": 1938 + }, + { + "epoch": 1.4685844057532171, + "grad_norm": 1.8608667850494385, + "learning_rate": 2.3242680599424116e-06, + "loss": 0.671994686126709, + "step": 1940 + }, + { + "epoch": 1.4700984102952308, + "grad_norm": 3.7468669414520264, + "learning_rate": 2.3228033875777787e-06, + "loss": 0.5343575477600098, + "step": 1942 + }, + { + "epoch": 1.4716124148372445, + "grad_norm": 1.2300831079483032, + "learning_rate": 2.321337660793574e-06, + "loss": 0.7672010660171509, + "step": 1944 + }, + { + "epoch": 1.4731264193792581, + "grad_norm": 4.202813625335693, + "learning_rate": 2.319870881886891e-06, + "loss": 0.7292002439498901, + "step": 1946 + }, + { + "epoch": 1.4746404239212718, + "grad_norm": 3.736546039581299, + "learning_rate": 2.318403053156469e-06, + "loss": 1.0978435277938843, + "step": 1948 + }, + { + "epoch": 1.4761544284632855, + "grad_norm": 1.076294183731079, + "learning_rate": 2.316934176902694e-06, + "loss": 0.6914529800415039, + "step": 1950 + }, + { + "epoch": 1.4776684330052992, + "grad_norm": 0.9137081503868103, + "learning_rate": 2.3154642554275942e-06, + "loss": 0.31867456436157227, + "step": 1952 + }, + { + "epoch": 1.4791824375473126, + "grad_norm": 1.4948675632476807, + "learning_rate": 2.313993291034834e-06, + "loss": 0.6049136519432068, + "step": 1954 + }, + { + "epoch": 1.4806964420893263, + "grad_norm": 1.252981185913086, + "learning_rate": 2.312521286029714e-06, + "loss": 0.6632291078567505, + "step": 1956 + }, + { + "epoch": 1.48221044663134, + "grad_norm": 2.0565993785858154, + "learning_rate": 2.3110482427191647e-06, + "loss": 0.30923762917518616, + "step": 1958 + }, + { + "epoch": 1.4837244511733534, + "grad_norm": 1.3827341794967651, + "learning_rate": 2.309574163411745e-06, + "loss": 0.6597527265548706, + "step": 1960 + }, + { + "epoch": 1.485238455715367, + "grad_norm": 1.3622335195541382, + "learning_rate": 2.308099050417636e-06, + "loss": 0.7235625982284546, + "step": 1962 + }, + { + "epoch": 1.4867524602573807, + "grad_norm": 1.3936517238616943, + "learning_rate": 2.3066229060486395e-06, + "loss": 1.118982195854187, + "step": 1964 + }, + { + "epoch": 1.4882664647993944, + "grad_norm": 1.5030118227005005, + "learning_rate": 2.3051457326181727e-06, + "loss": 0.6712927222251892, + "step": 1966 + }, + { + "epoch": 1.489780469341408, + "grad_norm": 2.5098156929016113, + "learning_rate": 2.303667532441268e-06, + "loss": 0.7121206521987915, + "step": 1968 + }, + { + "epoch": 1.4912944738834217, + "grad_norm": 1.4885815382003784, + "learning_rate": 2.3021883078345644e-06, + "loss": 1.1207520961761475, + "step": 1970 + }, + { + "epoch": 1.4928084784254354, + "grad_norm": 2.203951358795166, + "learning_rate": 2.3007080611163075e-06, + "loss": 0.6138882040977478, + "step": 1972 + }, + { + "epoch": 1.4943224829674489, + "grad_norm": 2.069188117980957, + "learning_rate": 2.2992267946063442e-06, + "loss": 0.667793869972229, + "step": 1974 + }, + { + "epoch": 1.4958364875094625, + "grad_norm": 1.594058871269226, + "learning_rate": 2.2977445106261203e-06, + "loss": 0.6595074534416199, + "step": 1976 + }, + { + "epoch": 1.4973504920514762, + "grad_norm": 1.4582384824752808, + "learning_rate": 2.2962612114986766e-06, + "loss": 1.0721561908721924, + "step": 1978 + }, + { + "epoch": 1.4988644965934899, + "grad_norm": 57.13948059082031, + "learning_rate": 2.2947768995486425e-06, + "loss": 0.3397125005722046, + "step": 1980 + }, + { + "epoch": 1.5003785011355033, + "grad_norm": 1.059604287147522, + "learning_rate": 2.293291577102238e-06, + "loss": 1.0371211767196655, + "step": 1982 + }, + { + "epoch": 1.501892505677517, + "grad_norm": 1.3177804946899414, + "learning_rate": 2.291805246487264e-06, + "loss": 0.9598127603530884, + "step": 1984 + }, + { + "epoch": 1.5034065102195306, + "grad_norm": 1.3356648683547974, + "learning_rate": 2.2903179100331036e-06, + "loss": 1.0830658674240112, + "step": 1986 + }, + { + "epoch": 1.5049205147615443, + "grad_norm": 1.2679076194763184, + "learning_rate": 2.2888295700707136e-06, + "loss": 0.2132422775030136, + "step": 1988 + }, + { + "epoch": 1.506434519303558, + "grad_norm": 1.068151593208313, + "learning_rate": 2.287340228932626e-06, + "loss": 0.8712539076805115, + "step": 1990 + }, + { + "epoch": 1.5079485238455717, + "grad_norm": 4.26716947555542, + "learning_rate": 2.2858498889529404e-06, + "loss": 0.6837059259414673, + "step": 1992 + }, + { + "epoch": 1.5094625283875853, + "grad_norm": 1.4669735431671143, + "learning_rate": 2.284358552467323e-06, + "loss": 1.0805472135543823, + "step": 1994 + }, + { + "epoch": 1.5109765329295988, + "grad_norm": 2.512695550918579, + "learning_rate": 2.282866221813001e-06, + "loss": 0.7257423400878906, + "step": 1996 + }, + { + "epoch": 1.5124905374716124, + "grad_norm": 1.424889326095581, + "learning_rate": 2.2813728993287584e-06, + "loss": 1.2146888971328735, + "step": 1998 + }, + { + "epoch": 1.5140045420136259, + "grad_norm": 5.44967794418335, + "learning_rate": 2.279878587354936e-06, + "loss": 0.8089492321014404, + "step": 2000 + }, + { + "epoch": 1.5155185465556396, + "grad_norm": 1.7360683679580688, + "learning_rate": 2.2783832882334237e-06, + "loss": 0.5648525357246399, + "step": 2002 + }, + { + "epoch": 1.5170325510976532, + "grad_norm": 1.1932352781295776, + "learning_rate": 2.2768870043076593e-06, + "loss": 0.5296794176101685, + "step": 2004 + }, + { + "epoch": 1.518546555639667, + "grad_norm": 0.8741418123245239, + "learning_rate": 2.2753897379226236e-06, + "loss": 1.0491361618041992, + "step": 2006 + }, + { + "epoch": 1.5200605601816806, + "grad_norm": 14.517936706542969, + "learning_rate": 2.2738914914248375e-06, + "loss": 0.35164278745651245, + "step": 2008 + }, + { + "epoch": 1.5215745647236942, + "grad_norm": 1.3566627502441406, + "learning_rate": 2.272392267162356e-06, + "loss": 0.9264833331108093, + "step": 2010 + }, + { + "epoch": 1.523088569265708, + "grad_norm": 2.2202181816101074, + "learning_rate": 2.27089206748477e-06, + "loss": 0.21586671471595764, + "step": 2012 + }, + { + "epoch": 1.5246025738077216, + "grad_norm": 1.2431819438934326, + "learning_rate": 2.269390894743196e-06, + "loss": 0.691186785697937, + "step": 2014 + }, + { + "epoch": 1.526116578349735, + "grad_norm": 2.915321111679077, + "learning_rate": 2.2678887512902772e-06, + "loss": 0.24071253836154938, + "step": 2016 + }, + { + "epoch": 1.5276305828917487, + "grad_norm": 2.3926448822021484, + "learning_rate": 2.266385639480177e-06, + "loss": 0.6835507750511169, + "step": 2018 + }, + { + "epoch": 1.5291445874337621, + "grad_norm": 3.000807046890259, + "learning_rate": 2.264881561668577e-06, + "loss": 0.6608117818832397, + "step": 2020 + }, + { + "epoch": 1.5306585919757758, + "grad_norm": 1.1589049100875854, + "learning_rate": 2.263376520212673e-06, + "loss": 0.7263166308403015, + "step": 2022 + }, + { + "epoch": 1.5321725965177895, + "grad_norm": 1.4489938020706177, + "learning_rate": 2.261870517471171e-06, + "loss": 1.0755257606506348, + "step": 2024 + }, + { + "epoch": 1.5336866010598031, + "grad_norm": 1.791056752204895, + "learning_rate": 2.260363555804282e-06, + "loss": 0.2279585748910904, + "step": 2026 + }, + { + "epoch": 1.5352006056018168, + "grad_norm": 1.019209384918213, + "learning_rate": 2.2588556375737217e-06, + "loss": 0.6466829180717468, + "step": 2028 + }, + { + "epoch": 1.5367146101438305, + "grad_norm": 3.3081586360931396, + "learning_rate": 2.2573467651427044e-06, + "loss": 0.2524896562099457, + "step": 2030 + }, + { + "epoch": 1.5382286146858442, + "grad_norm": 2.9480090141296387, + "learning_rate": 2.2558369408759395e-06, + "loss": 0.4058088958263397, + "step": 2032 + }, + { + "epoch": 1.5397426192278578, + "grad_norm": 6.846377849578857, + "learning_rate": 2.254326167139628e-06, + "loss": 0.9160577058792114, + "step": 2034 + }, + { + "epoch": 1.5412566237698713, + "grad_norm": 2.892534017562866, + "learning_rate": 2.2528144463014607e-06, + "loss": 0.7248266339302063, + "step": 2036 + }, + { + "epoch": 1.542770628311885, + "grad_norm": 10.962396621704102, + "learning_rate": 2.2513017807306087e-06, + "loss": 0.6049357652664185, + "step": 2038 + }, + { + "epoch": 1.5442846328538986, + "grad_norm": 3.0405397415161133, + "learning_rate": 2.2497881727977283e-06, + "loss": 1.067530870437622, + "step": 2040 + }, + { + "epoch": 1.545798637395912, + "grad_norm": 2.092886447906494, + "learning_rate": 2.24827362487495e-06, + "loss": 0.3755670189857483, + "step": 2042 + }, + { + "epoch": 1.5473126419379257, + "grad_norm": 1.4965420961380005, + "learning_rate": 2.246758139335878e-06, + "loss": 0.5254254937171936, + "step": 2044 + }, + { + "epoch": 1.5488266464799394, + "grad_norm": 5.255483150482178, + "learning_rate": 2.245241718555586e-06, + "loss": 0.34625244140625, + "step": 2046 + }, + { + "epoch": 1.550340651021953, + "grad_norm": 2.690295696258545, + "learning_rate": 2.2437243649106126e-06, + "loss": 0.7206704616546631, + "step": 2048 + }, + { + "epoch": 1.5518546555639667, + "grad_norm": 1.249977946281433, + "learning_rate": 2.2422060807789602e-06, + "loss": 0.5133241415023804, + "step": 2050 + }, + { + "epoch": 1.5533686601059804, + "grad_norm": 1.2926579713821411, + "learning_rate": 2.240686868540088e-06, + "loss": 0.3254395127296448, + "step": 2052 + }, + { + "epoch": 1.554882664647994, + "grad_norm": 2.2255635261535645, + "learning_rate": 2.23916673057491e-06, + "loss": 0.1421680748462677, + "step": 2054 + }, + { + "epoch": 1.5563966691900075, + "grad_norm": 9.698051452636719, + "learning_rate": 2.2376456692657917e-06, + "loss": 0.41860195994377136, + "step": 2056 + }, + { + "epoch": 1.5579106737320212, + "grad_norm": 4.903704643249512, + "learning_rate": 2.2361236869965447e-06, + "loss": 0.2064562439918518, + "step": 2058 + }, + { + "epoch": 1.5594246782740349, + "grad_norm": 1.4226408004760742, + "learning_rate": 2.234600786152425e-06, + "loss": 1.2417998313903809, + "step": 2060 + }, + { + "epoch": 1.5609386828160483, + "grad_norm": 5.216070175170898, + "learning_rate": 2.2330769691201267e-06, + "loss": 0.745749294757843, + "step": 2062 + }, + { + "epoch": 1.562452687358062, + "grad_norm": 2.597033739089966, + "learning_rate": 2.231552238287781e-06, + "loss": 0.3117476999759674, + "step": 2064 + }, + { + "epoch": 1.5639666919000756, + "grad_norm": 4.07472562789917, + "learning_rate": 2.230026596044951e-06, + "loss": 0.32264429330825806, + "step": 2066 + }, + { + "epoch": 1.5654806964420893, + "grad_norm": 2.9109983444213867, + "learning_rate": 2.2285000447826276e-06, + "loss": 0.7304523587226868, + "step": 2068 + }, + { + "epoch": 1.566994700984103, + "grad_norm": 7.669963836669922, + "learning_rate": 2.2269725868932266e-06, + "loss": 0.36783677339553833, + "step": 2070 + }, + { + "epoch": 1.5685087055261167, + "grad_norm": 1.8309413194656372, + "learning_rate": 2.2254442247705855e-06, + "loss": 0.6806103587150574, + "step": 2072 + }, + { + "epoch": 1.5700227100681303, + "grad_norm": 1.2219653129577637, + "learning_rate": 2.223914960809958e-06, + "loss": 1.1403934955596924, + "step": 2074 + }, + { + "epoch": 1.571536714610144, + "grad_norm": 1.4226986169815063, + "learning_rate": 2.222384797408011e-06, + "loss": 1.0667498111724854, + "step": 2076 + }, + { + "epoch": 1.5730507191521574, + "grad_norm": 2.742898941040039, + "learning_rate": 2.220853736962821e-06, + "loss": 0.5636183023452759, + "step": 2078 + }, + { + "epoch": 1.574564723694171, + "grad_norm": 1.659138560295105, + "learning_rate": 2.2193217818738714e-06, + "loss": 0.594592809677124, + "step": 2080 + }, + { + "epoch": 1.5760787282361846, + "grad_norm": 2.238586902618408, + "learning_rate": 2.217788934542047e-06, + "loss": 0.3791021406650543, + "step": 2082 + }, + { + "epoch": 1.5775927327781982, + "grad_norm": 2.290127754211426, + "learning_rate": 2.2162551973696306e-06, + "loss": 0.32889947295188904, + "step": 2084 + }, + { + "epoch": 1.579106737320212, + "grad_norm": 1.2820953130722046, + "learning_rate": 2.2147205727603e-06, + "loss": 0.6531152129173279, + "step": 2086 + }, + { + "epoch": 1.5806207418622256, + "grad_norm": 1.0417596101760864, + "learning_rate": 2.2131850631191237e-06, + "loss": 0.7622494697570801, + "step": 2088 + }, + { + "epoch": 1.5821347464042392, + "grad_norm": 1.1884350776672363, + "learning_rate": 2.2116486708525576e-06, + "loss": 0.7171990871429443, + "step": 2090 + }, + { + "epoch": 1.583648750946253, + "grad_norm": 7.166083812713623, + "learning_rate": 2.2101113983684397e-06, + "loss": 0.3641316890716553, + "step": 2092 + }, + { + "epoch": 1.5851627554882666, + "grad_norm": 1.2821331024169922, + "learning_rate": 2.208573248075989e-06, + "loss": 0.8434643149375916, + "step": 2094 + }, + { + "epoch": 1.5866767600302802, + "grad_norm": 3.2446162700653076, + "learning_rate": 2.2070342223857986e-06, + "loss": 0.7080972790718079, + "step": 2096 + }, + { + "epoch": 1.5881907645722937, + "grad_norm": 1.4993233680725098, + "learning_rate": 2.205494323709835e-06, + "loss": 0.3765650987625122, + "step": 2098 + }, + { + "epoch": 1.5897047691143074, + "grad_norm": 3.7634449005126953, + "learning_rate": 2.2039535544614325e-06, + "loss": 0.4482704699039459, + "step": 2100 + }, + { + "epoch": 1.5912187736563208, + "grad_norm": 2.2245254516601562, + "learning_rate": 2.2024119170552886e-06, + "loss": 0.14600801467895508, + "step": 2102 + }, + { + "epoch": 1.5927327781983345, + "grad_norm": 20.059284210205078, + "learning_rate": 2.2008694139074623e-06, + "loss": 0.3217051029205322, + "step": 2104 + }, + { + "epoch": 1.5942467827403481, + "grad_norm": 3.0502076148986816, + "learning_rate": 2.19932604743537e-06, + "loss": 0.6332624554634094, + "step": 2106 + }, + { + "epoch": 1.5957607872823618, + "grad_norm": 1.2478991746902466, + "learning_rate": 2.19778182005778e-06, + "loss": 1.0543266534805298, + "step": 2108 + }, + { + "epoch": 1.5972747918243755, + "grad_norm": 4.512229919433594, + "learning_rate": 2.1962367341948103e-06, + "loss": 1.1217621564865112, + "step": 2110 + }, + { + "epoch": 1.5987887963663892, + "grad_norm": 2.077763319015503, + "learning_rate": 2.194690792267925e-06, + "loss": 0.8677721619606018, + "step": 2112 + }, + { + "epoch": 1.6003028009084028, + "grad_norm": 1.8816295862197876, + "learning_rate": 2.1931439966999285e-06, + "loss": 0.7273671627044678, + "step": 2114 + }, + { + "epoch": 1.6018168054504165, + "grad_norm": 2.403968334197998, + "learning_rate": 2.191596349914964e-06, + "loss": 0.5366280674934387, + "step": 2116 + }, + { + "epoch": 1.60333080999243, + "grad_norm": 3.831437587738037, + "learning_rate": 2.1900478543385073e-06, + "loss": 0.6811137795448303, + "step": 2118 + }, + { + "epoch": 1.6048448145344436, + "grad_norm": 1.3589093685150146, + "learning_rate": 2.188498512397367e-06, + "loss": 0.8576026558876038, + "step": 2120 + }, + { + "epoch": 1.6063588190764573, + "grad_norm": 3.0552351474761963, + "learning_rate": 2.186948326519675e-06, + "loss": 0.3504766821861267, + "step": 2122 + }, + { + "epoch": 1.6078728236184707, + "grad_norm": 4.210336685180664, + "learning_rate": 2.1853972991348895e-06, + "loss": 0.15103736519813538, + "step": 2124 + }, + { + "epoch": 1.6093868281604844, + "grad_norm": 2.4473674297332764, + "learning_rate": 2.1838454326737836e-06, + "loss": 0.3876769244670868, + "step": 2126 + }, + { + "epoch": 1.610900832702498, + "grad_norm": 7.054347515106201, + "learning_rate": 2.182292729568448e-06, + "loss": 0.5042491555213928, + "step": 2128 + }, + { + "epoch": 1.6124148372445117, + "grad_norm": 9.706134796142578, + "learning_rate": 2.180739192252284e-06, + "loss": 0.8015500903129578, + "step": 2130 + }, + { + "epoch": 1.6139288417865254, + "grad_norm": 3.156993865966797, + "learning_rate": 2.1791848231600002e-06, + "loss": 0.6276577115058899, + "step": 2132 + }, + { + "epoch": 1.615442846328539, + "grad_norm": 0.6051416993141174, + "learning_rate": 2.1776296247276077e-06, + "loss": 0.33691293001174927, + "step": 2134 + }, + { + "epoch": 1.6169568508705527, + "grad_norm": 1.6786423921585083, + "learning_rate": 2.1760735993924196e-06, + "loss": 0.7172066569328308, + "step": 2136 + }, + { + "epoch": 1.6184708554125662, + "grad_norm": 4.236973762512207, + "learning_rate": 2.174516749593044e-06, + "loss": 0.7028883099555969, + "step": 2138 + }, + { + "epoch": 1.6199848599545799, + "grad_norm": 1.6318327188491821, + "learning_rate": 2.172959077769379e-06, + "loss": 0.8917480111122131, + "step": 2140 + }, + { + "epoch": 1.6214988644965935, + "grad_norm": 4.097431182861328, + "learning_rate": 2.1714005863626143e-06, + "loss": 0.5675381422042847, + "step": 2142 + }, + { + "epoch": 1.623012869038607, + "grad_norm": 2.701396942138672, + "learning_rate": 2.169841277815221e-06, + "loss": 0.7977233529090881, + "step": 2144 + }, + { + "epoch": 1.6245268735806206, + "grad_norm": 1.2214967012405396, + "learning_rate": 2.168281154570954e-06, + "loss": 0.9403069615364075, + "step": 2146 + }, + { + "epoch": 1.6260408781226343, + "grad_norm": 2.3771018981933594, + "learning_rate": 2.1667202190748423e-06, + "loss": 0.34528490900993347, + "step": 2148 + }, + { + "epoch": 1.627554882664648, + "grad_norm": 2.057330369949341, + "learning_rate": 2.165158473773189e-06, + "loss": 0.3051750659942627, + "step": 2150 + }, + { + "epoch": 1.6290688872066617, + "grad_norm": 2.357961416244507, + "learning_rate": 2.163595921113567e-06, + "loss": 0.4863918721675873, + "step": 2152 + }, + { + "epoch": 1.6305828917486753, + "grad_norm": 7.160678386688232, + "learning_rate": 2.1620325635448127e-06, + "loss": 0.7289515137672424, + "step": 2154 + }, + { + "epoch": 1.632096896290689, + "grad_norm": 0.9503294229507446, + "learning_rate": 2.1604684035170253e-06, + "loss": 0.3476891815662384, + "step": 2156 + }, + { + "epoch": 1.6336109008327027, + "grad_norm": 1.6186283826828003, + "learning_rate": 2.158903443481561e-06, + "loss": 0.7037507891654968, + "step": 2158 + }, + { + "epoch": 1.635124905374716, + "grad_norm": 2.762922525405884, + "learning_rate": 2.157337685891031e-06, + "loss": 0.801786482334137, + "step": 2160 + }, + { + "epoch": 1.6366389099167298, + "grad_norm": 1.9310704469680786, + "learning_rate": 2.155771133199294e-06, + "loss": 0.5244750380516052, + "step": 2162 + }, + { + "epoch": 1.6381529144587432, + "grad_norm": 1.9318583011627197, + "learning_rate": 2.154203787861458e-06, + "loss": 0.5340541005134583, + "step": 2164 + }, + { + "epoch": 1.639666919000757, + "grad_norm": 3.1494526863098145, + "learning_rate": 2.1526356523338704e-06, + "loss": 0.6972000002861023, + "step": 2166 + }, + { + "epoch": 1.6411809235427706, + "grad_norm": 1.7522597312927246, + "learning_rate": 2.1510667290741183e-06, + "loss": 0.4181288778781891, + "step": 2168 + }, + { + "epoch": 1.6426949280847842, + "grad_norm": 1.9032869338989258, + "learning_rate": 2.149497020541023e-06, + "loss": 0.6532211303710938, + "step": 2170 + }, + { + "epoch": 1.644208932626798, + "grad_norm": 1.9003527164459229, + "learning_rate": 2.1479265291946365e-06, + "loss": 0.8412811160087585, + "step": 2172 + }, + { + "epoch": 1.6457229371688116, + "grad_norm": 1.7914382219314575, + "learning_rate": 2.146355257496239e-06, + "loss": 0.7056155800819397, + "step": 2174 + }, + { + "epoch": 1.6472369417108252, + "grad_norm": 2.497628688812256, + "learning_rate": 2.1447832079083306e-06, + "loss": 0.36582863330841064, + "step": 2176 + }, + { + "epoch": 1.648750946252839, + "grad_norm": 2.5495173931121826, + "learning_rate": 2.1432103828946335e-06, + "loss": 1.061801552772522, + "step": 2178 + }, + { + "epoch": 1.6502649507948524, + "grad_norm": 3.407198429107666, + "learning_rate": 2.141636784920083e-06, + "loss": 0.7316385507583618, + "step": 2180 + }, + { + "epoch": 1.651778955336866, + "grad_norm": 1.808796763420105, + "learning_rate": 2.1400624164508283e-06, + "loss": 0.6635830402374268, + "step": 2182 + }, + { + "epoch": 1.6532929598788795, + "grad_norm": 3.266706705093384, + "learning_rate": 2.1384872799542233e-06, + "loss": 0.5899509191513062, + "step": 2184 + }, + { + "epoch": 1.6548069644208931, + "grad_norm": 2.588864803314209, + "learning_rate": 2.1369113778988272e-06, + "loss": 0.645592212677002, + "step": 2186 + }, + { + "epoch": 1.6563209689629068, + "grad_norm": 1.7220063209533691, + "learning_rate": 2.135334712754399e-06, + "loss": 0.2553539276123047, + "step": 2188 + }, + { + "epoch": 1.6578349735049205, + "grad_norm": 1.6802575588226318, + "learning_rate": 2.1337572869918926e-06, + "loss": 1.1775015592575073, + "step": 2190 + }, + { + "epoch": 1.6593489780469342, + "grad_norm": 1.2012619972229004, + "learning_rate": 2.132179103083455e-06, + "loss": 1.1570957899093628, + "step": 2192 + }, + { + "epoch": 1.6608629825889478, + "grad_norm": 0.9240208268165588, + "learning_rate": 2.1306001635024204e-06, + "loss": 1.0634489059448242, + "step": 2194 + }, + { + "epoch": 1.6623769871309615, + "grad_norm": 2.1204588413238525, + "learning_rate": 2.129020470723309e-06, + "loss": 0.5567243695259094, + "step": 2196 + }, + { + "epoch": 1.6638909916729752, + "grad_norm": 2.298380136489868, + "learning_rate": 2.127440027221819e-06, + "loss": 0.40538960695266724, + "step": 2198 + }, + { + "epoch": 1.6654049962149886, + "grad_norm": 1.3897780179977417, + "learning_rate": 2.1258588354748273e-06, + "loss": 0.32329684495925903, + "step": 2200 + }, + { + "epoch": 1.6669190007570023, + "grad_norm": 1.9075648784637451, + "learning_rate": 2.1242768979603817e-06, + "loss": 0.3049321472644806, + "step": 2202 + }, + { + "epoch": 1.668433005299016, + "grad_norm": 3.9162702560424805, + "learning_rate": 2.1226942171577e-06, + "loss": 0.6182920932769775, + "step": 2204 + }, + { + "epoch": 1.6699470098410294, + "grad_norm": 0.9531748294830322, + "learning_rate": 2.1211107955471653e-06, + "loss": 0.6769225597381592, + "step": 2206 + }, + { + "epoch": 1.671461014383043, + "grad_norm": 1.360073208808899, + "learning_rate": 2.1195266356103194e-06, + "loss": 0.27617713809013367, + "step": 2208 + }, + { + "epoch": 1.6729750189250567, + "grad_norm": 1.5711253881454468, + "learning_rate": 2.117941739829864e-06, + "loss": 0.3255770206451416, + "step": 2210 + }, + { + "epoch": 1.6744890234670704, + "grad_norm": 3.3778274059295654, + "learning_rate": 2.116356110689652e-06, + "loss": 0.5651096701622009, + "step": 2212 + }, + { + "epoch": 1.676003028009084, + "grad_norm": 1.074960708618164, + "learning_rate": 2.1147697506746865e-06, + "loss": 0.6221394538879395, + "step": 2214 + }, + { + "epoch": 1.6775170325510977, + "grad_norm": 2.2665066719055176, + "learning_rate": 2.1131826622711157e-06, + "loss": 0.3510792851448059, + "step": 2216 + }, + { + "epoch": 1.6790310370931114, + "grad_norm": 1.576399564743042, + "learning_rate": 2.1115948479662303e-06, + "loss": 0.8238088488578796, + "step": 2218 + }, + { + "epoch": 1.6805450416351249, + "grad_norm": 1.5586379766464233, + "learning_rate": 2.1100063102484567e-06, + "loss": 0.4264649450778961, + "step": 2220 + }, + { + "epoch": 1.6820590461771385, + "grad_norm": 1.966614007949829, + "learning_rate": 2.1084170516073566e-06, + "loss": 0.2770790755748749, + "step": 2222 + }, + { + "epoch": 1.6835730507191522, + "grad_norm": 1.851863145828247, + "learning_rate": 2.106827074533622e-06, + "loss": 0.3171614706516266, + "step": 2224 + }, + { + "epoch": 1.6850870552611656, + "grad_norm": 4.551060199737549, + "learning_rate": 2.1052363815190685e-06, + "loss": 0.15965092182159424, + "step": 2226 + }, + { + "epoch": 1.6866010598031793, + "grad_norm": 2.0899815559387207, + "learning_rate": 2.1036449750566363e-06, + "loss": 0.6077163815498352, + "step": 2228 + }, + { + "epoch": 1.688115064345193, + "grad_norm": 1.7703354358673096, + "learning_rate": 2.102052857640381e-06, + "loss": 0.3241073489189148, + "step": 2230 + }, + { + "epoch": 1.6896290688872067, + "grad_norm": 1.7518664598464966, + "learning_rate": 2.1004600317654764e-06, + "loss": 0.7068959474563599, + "step": 2232 + }, + { + "epoch": 1.6911430734292203, + "grad_norm": 3.3787121772766113, + "learning_rate": 2.0988664999282025e-06, + "loss": 0.1686900407075882, + "step": 2234 + }, + { + "epoch": 1.692657077971234, + "grad_norm": 2.553187131881714, + "learning_rate": 2.0972722646259476e-06, + "loss": 1.103459119796753, + "step": 2236 + }, + { + "epoch": 1.6941710825132477, + "grad_norm": 3.8670506477355957, + "learning_rate": 2.095677328357202e-06, + "loss": 0.2276834398508072, + "step": 2238 + }, + { + "epoch": 1.6956850870552613, + "grad_norm": 1.991747260093689, + "learning_rate": 2.0940816936215553e-06, + "loss": 0.6074253916740417, + "step": 2240 + }, + { + "epoch": 1.6971990915972748, + "grad_norm": 2.182133436203003, + "learning_rate": 2.0924853629196918e-06, + "loss": 0.3474281430244446, + "step": 2242 + }, + { + "epoch": 1.6987130961392884, + "grad_norm": 1.3940695524215698, + "learning_rate": 2.090888338753385e-06, + "loss": 0.2169879823923111, + "step": 2244 + }, + { + "epoch": 1.700227100681302, + "grad_norm": 3.1887905597686768, + "learning_rate": 2.0892906236254966e-06, + "loss": 0.6765353083610535, + "step": 2246 + }, + { + "epoch": 1.7017411052233156, + "grad_norm": 2.9300456047058105, + "learning_rate": 2.08769222003997e-06, + "loss": 1.045607328414917, + "step": 2248 + }, + { + "epoch": 1.7032551097653292, + "grad_norm": 1.0384591817855835, + "learning_rate": 2.08609313050183e-06, + "loss": 0.6681134700775146, + "step": 2250 + }, + { + "epoch": 1.704769114307343, + "grad_norm": 3.0921647548675537, + "learning_rate": 2.0844933575171725e-06, + "loss": 0.5692518353462219, + "step": 2252 + }, + { + "epoch": 1.7062831188493566, + "grad_norm": 1.2953038215637207, + "learning_rate": 2.0828929035931685e-06, + "loss": 0.796726644039154, + "step": 2254 + }, + { + "epoch": 1.7077971233913702, + "grad_norm": 15.550329208374023, + "learning_rate": 2.0812917712380533e-06, + "loss": 0.6785337924957275, + "step": 2256 + }, + { + "epoch": 1.709311127933384, + "grad_norm": 1.1053526401519775, + "learning_rate": 2.0796899629611274e-06, + "loss": 0.684733510017395, + "step": 2258 + }, + { + "epoch": 1.7108251324753976, + "grad_norm": 0.7494604587554932, + "learning_rate": 2.078087481272749e-06, + "loss": 0.540725827217102, + "step": 2260 + }, + { + "epoch": 1.712339137017411, + "grad_norm": 0.9445919990539551, + "learning_rate": 2.0764843286843326e-06, + "loss": 0.823205828666687, + "step": 2262 + }, + { + "epoch": 1.7138531415594247, + "grad_norm": 3.844160795211792, + "learning_rate": 2.0748805077083444e-06, + "loss": 0.4753996729850769, + "step": 2264 + }, + { + "epoch": 1.7153671461014381, + "grad_norm": 1.1701700687408447, + "learning_rate": 2.0732760208582967e-06, + "loss": 0.755233645439148, + "step": 2266 + }, + { + "epoch": 1.7168811506434518, + "grad_norm": 1.7789427042007446, + "learning_rate": 2.0716708706487476e-06, + "loss": 0.6500770449638367, + "step": 2268 + }, + { + "epoch": 1.7183951551854655, + "grad_norm": 1.0526847839355469, + "learning_rate": 2.0700650595952925e-06, + "loss": 0.25998467206954956, + "step": 2270 + }, + { + "epoch": 1.7199091597274792, + "grad_norm": 1.1626585721969604, + "learning_rate": 2.0684585902145637e-06, + "loss": 0.6699547171592712, + "step": 2272 + }, + { + "epoch": 1.7214231642694928, + "grad_norm": 3.187211275100708, + "learning_rate": 2.0668514650242252e-06, + "loss": 0.6712363958358765, + "step": 2274 + }, + { + "epoch": 1.7229371688115065, + "grad_norm": 2.7952966690063477, + "learning_rate": 2.0652436865429685e-06, + "loss": 0.1482997089624405, + "step": 2276 + }, + { + "epoch": 1.7244511733535202, + "grad_norm": 2.0699210166931152, + "learning_rate": 2.0636352572905093e-06, + "loss": 0.6270683407783508, + "step": 2278 + }, + { + "epoch": 1.7259651778955338, + "grad_norm": 2.2090189456939697, + "learning_rate": 2.0620261797875824e-06, + "loss": 0.35708731412887573, + "step": 2280 + }, + { + "epoch": 1.7274791824375473, + "grad_norm": 1.9764044284820557, + "learning_rate": 2.06041645655594e-06, + "loss": 0.6808338165283203, + "step": 2282 + }, + { + "epoch": 1.728993186979561, + "grad_norm": 1.0456839799880981, + "learning_rate": 2.0588060901183444e-06, + "loss": 0.7046934962272644, + "step": 2284 + }, + { + "epoch": 1.7305071915215746, + "grad_norm": 2.8990256786346436, + "learning_rate": 2.0571950829985673e-06, + "loss": 0.7881788611412048, + "step": 2286 + }, + { + "epoch": 1.732021196063588, + "grad_norm": 1.0929368734359741, + "learning_rate": 2.0555834377213843e-06, + "loss": 0.3059574067592621, + "step": 2288 + }, + { + "epoch": 1.7335352006056017, + "grad_norm": 1.664184808731079, + "learning_rate": 2.0539711568125707e-06, + "loss": 0.3661767542362213, + "step": 2290 + }, + { + "epoch": 1.7350492051476154, + "grad_norm": 5.792791843414307, + "learning_rate": 2.052358242798898e-06, + "loss": 0.23789961636066437, + "step": 2292 + }, + { + "epoch": 1.736563209689629, + "grad_norm": 2.356179714202881, + "learning_rate": 2.050744698208131e-06, + "loss": 0.42035624384880066, + "step": 2294 + }, + { + "epoch": 1.7380772142316427, + "grad_norm": 1.2474614381790161, + "learning_rate": 2.0491305255690207e-06, + "loss": 0.7155369520187378, + "step": 2296 + }, + { + "epoch": 1.7395912187736564, + "grad_norm": 7.224576950073242, + "learning_rate": 2.047515727411304e-06, + "loss": 0.6099893450737, + "step": 2298 + }, + { + "epoch": 1.74110522331567, + "grad_norm": 2.597243309020996, + "learning_rate": 2.0459003062656975e-06, + "loss": 0.244624525308609, + "step": 2300 + }, + { + "epoch": 1.7426192278576835, + "grad_norm": 0.954690158367157, + "learning_rate": 2.0442842646638944e-06, + "loss": 0.6599948406219482, + "step": 2302 + }, + { + "epoch": 1.7441332323996972, + "grad_norm": 2.959341049194336, + "learning_rate": 2.0426676051385603e-06, + "loss": 0.22240900993347168, + "step": 2304 + }, + { + "epoch": 1.7456472369417109, + "grad_norm": 3.261805295944214, + "learning_rate": 2.041050330223328e-06, + "loss": 0.7270727157592773, + "step": 2306 + }, + { + "epoch": 1.7471612414837243, + "grad_norm": 2.7572824954986572, + "learning_rate": 2.0394324424527976e-06, + "loss": 0.2548970580101013, + "step": 2308 + }, + { + "epoch": 1.748675246025738, + "grad_norm": 0.5910503268241882, + "learning_rate": 2.0378139443625263e-06, + "loss": 0.7279835939407349, + "step": 2310 + }, + { + "epoch": 1.7501892505677517, + "grad_norm": 11.673319816589355, + "learning_rate": 2.03619483848903e-06, + "loss": 0.5778782367706299, + "step": 2312 + }, + { + "epoch": 1.7517032551097653, + "grad_norm": 1.0020543336868286, + "learning_rate": 2.034575127369776e-06, + "loss": 0.9038550853729248, + "step": 2314 + }, + { + "epoch": 1.753217259651779, + "grad_norm": 1.8281447887420654, + "learning_rate": 2.0329548135431816e-06, + "loss": 1.2146755456924438, + "step": 2316 + }, + { + "epoch": 1.7547312641937927, + "grad_norm": 2.975893259048462, + "learning_rate": 2.0313338995486073e-06, + "loss": 0.2834610939025879, + "step": 2318 + }, + { + "epoch": 1.7562452687358063, + "grad_norm": 1.9861137866973877, + "learning_rate": 2.0297123879263546e-06, + "loss": 1.0655642747879028, + "step": 2320 + }, + { + "epoch": 1.7577592732778198, + "grad_norm": 1.660569429397583, + "learning_rate": 2.0280902812176607e-06, + "loss": 0.7628564238548279, + "step": 2322 + }, + { + "epoch": 1.7592732778198334, + "grad_norm": 2.630014419555664, + "learning_rate": 2.0264675819646977e-06, + "loss": 1.0666989088058472, + "step": 2324 + }, + { + "epoch": 1.7607872823618471, + "grad_norm": 1.3035540580749512, + "learning_rate": 2.0248442927105635e-06, + "loss": 1.0882095098495483, + "step": 2326 + }, + { + "epoch": 1.7623012869038606, + "grad_norm": 3.2478580474853516, + "learning_rate": 2.0232204159992833e-06, + "loss": 0.5712844133377075, + "step": 2328 + }, + { + "epoch": 1.7638152914458742, + "grad_norm": 1.2534379959106445, + "learning_rate": 2.021595954375801e-06, + "loss": 0.6776572465896606, + "step": 2330 + }, + { + "epoch": 1.765329295987888, + "grad_norm": 1.858695387840271, + "learning_rate": 2.0199709103859784e-06, + "loss": 0.3096883296966553, + "step": 2332 + }, + { + "epoch": 1.7668433005299016, + "grad_norm": 1.3701905012130737, + "learning_rate": 2.0183452865765893e-06, + "loss": 1.075918436050415, + "step": 2334 + }, + { + "epoch": 1.7683573050719152, + "grad_norm": 2.156057357788086, + "learning_rate": 2.0167190854953167e-06, + "loss": 0.18352749943733215, + "step": 2336 + }, + { + "epoch": 1.769871309613929, + "grad_norm": 1.3396556377410889, + "learning_rate": 2.0150923096907473e-06, + "loss": 0.668737530708313, + "step": 2338 + }, + { + "epoch": 1.7713853141559426, + "grad_norm": 2.025547504425049, + "learning_rate": 2.0134649617123697e-06, + "loss": 1.0675408840179443, + "step": 2340 + }, + { + "epoch": 1.7728993186979563, + "grad_norm": 2.4622342586517334, + "learning_rate": 2.011837044110569e-06, + "loss": 0.9288083910942078, + "step": 2342 + }, + { + "epoch": 1.7744133232399697, + "grad_norm": 1.1340492963790894, + "learning_rate": 2.0102085594366227e-06, + "loss": 1.103867530822754, + "step": 2344 + }, + { + "epoch": 1.7759273277819834, + "grad_norm": 4.099803447723389, + "learning_rate": 2.0085795102426962e-06, + "loss": 0.3933102488517761, + "step": 2346 + }, + { + "epoch": 1.7774413323239968, + "grad_norm": 1.8901548385620117, + "learning_rate": 2.0069498990818417e-06, + "loss": 0.5600078701972961, + "step": 2348 + }, + { + "epoch": 1.7789553368660105, + "grad_norm": 0.8852607011795044, + "learning_rate": 2.00531972850799e-06, + "loss": 0.5761555433273315, + "step": 2350 + }, + { + "epoch": 1.7804693414080242, + "grad_norm": 1.7660948038101196, + "learning_rate": 2.00368900107595e-06, + "loss": 0.254497766494751, + "step": 2352 + }, + { + "epoch": 1.7819833459500378, + "grad_norm": 1.6987003087997437, + "learning_rate": 2.0020577193414025e-06, + "loss": 1.0343949794769287, + "step": 2354 + }, + { + "epoch": 1.7834973504920515, + "grad_norm": 2.0711705684661865, + "learning_rate": 2.0004258858608973e-06, + "loss": 0.7034203410148621, + "step": 2356 + }, + { + "epoch": 1.7850113550340652, + "grad_norm": 0.9459677338600159, + "learning_rate": 1.9987935031918496e-06, + "loss": 0.7459084391593933, + "step": 2358 + }, + { + "epoch": 1.7865253595760788, + "grad_norm": 1.8944565057754517, + "learning_rate": 1.997160573892534e-06, + "loss": 0.6802893877029419, + "step": 2360 + }, + { + "epoch": 1.7880393641180925, + "grad_norm": 1.37995445728302, + "learning_rate": 1.9955271005220826e-06, + "loss": 1.0456674098968506, + "step": 2362 + }, + { + "epoch": 1.789553368660106, + "grad_norm": 2.364560842514038, + "learning_rate": 1.9938930856404796e-06, + "loss": 0.4406447410583496, + "step": 2364 + }, + { + "epoch": 1.7910673732021196, + "grad_norm": 5.360681056976318, + "learning_rate": 1.9922585318085586e-06, + "loss": 0.29750534892082214, + "step": 2366 + }, + { + "epoch": 1.7925813777441333, + "grad_norm": 0.8640894293785095, + "learning_rate": 1.990623441587998e-06, + "loss": 0.12734900414943695, + "step": 2368 + }, + { + "epoch": 1.7940953822861467, + "grad_norm": 1.3062753677368164, + "learning_rate": 1.988987817541315e-06, + "loss": 1.0919393301010132, + "step": 2370 + }, + { + "epoch": 1.7956093868281604, + "grad_norm": 1.2311357259750366, + "learning_rate": 1.9873516622318655e-06, + "loss": 0.5747874975204468, + "step": 2372 + }, + { + "epoch": 1.797123391370174, + "grad_norm": 1.9897187948226929, + "learning_rate": 1.9857149782238376e-06, + "loss": 0.46847808361053467, + "step": 2374 + }, + { + "epoch": 1.7986373959121877, + "grad_norm": 1.2085386514663696, + "learning_rate": 1.9840777680822465e-06, + "loss": 1.1324481964111328, + "step": 2376 + }, + { + "epoch": 1.8001514004542014, + "grad_norm": 1.2892783880233765, + "learning_rate": 1.982440034372934e-06, + "loss": 0.9932860732078552, + "step": 2378 + }, + { + "epoch": 1.801665404996215, + "grad_norm": 1.09601891040802, + "learning_rate": 1.9808017796625614e-06, + "loss": 0.2635461688041687, + "step": 2380 + }, + { + "epoch": 1.8031794095382288, + "grad_norm": 1.2857344150543213, + "learning_rate": 1.979163006518606e-06, + "loss": 0.7466263771057129, + "step": 2382 + }, + { + "epoch": 1.8046934140802422, + "grad_norm": 1.8865554332733154, + "learning_rate": 1.977523717509359e-06, + "loss": 0.7405322790145874, + "step": 2384 + }, + { + "epoch": 1.8062074186222559, + "grad_norm": 1.9653666019439697, + "learning_rate": 1.9758839152039183e-06, + "loss": 0.653364896774292, + "step": 2386 + }, + { + "epoch": 1.8077214231642695, + "grad_norm": 1.5651435852050781, + "learning_rate": 1.974243602172188e-06, + "loss": 0.7000844478607178, + "step": 2388 + }, + { + "epoch": 1.809235427706283, + "grad_norm": 1.5864546298980713, + "learning_rate": 1.972602780984871e-06, + "loss": 0.6359418034553528, + "step": 2390 + }, + { + "epoch": 1.8107494322482967, + "grad_norm": 3.3621535301208496, + "learning_rate": 1.9709614542134684e-06, + "loss": 0.7095925211906433, + "step": 2392 + }, + { + "epoch": 1.8122634367903103, + "grad_norm": 1.7060909271240234, + "learning_rate": 1.969319624430272e-06, + "loss": 0.2471795678138733, + "step": 2394 + }, + { + "epoch": 1.813777441332324, + "grad_norm": 2.3302419185638428, + "learning_rate": 1.9676772942083627e-06, + "loss": 0.29656991362571716, + "step": 2396 + }, + { + "epoch": 1.8152914458743377, + "grad_norm": 1.8736639022827148, + "learning_rate": 1.9660344661216058e-06, + "loss": 0.6928511261940002, + "step": 2398 + }, + { + "epoch": 1.8168054504163513, + "grad_norm": 4.308074951171875, + "learning_rate": 1.9643911427446458e-06, + "loss": 0.2448200136423111, + "step": 2400 + }, + { + "epoch": 1.818319454958365, + "grad_norm": 1.3856513500213623, + "learning_rate": 1.9627473266529055e-06, + "loss": 0.10885316133499146, + "step": 2402 + }, + { + "epoch": 1.8198334595003784, + "grad_norm": 4.017141819000244, + "learning_rate": 1.9611030204225776e-06, + "loss": 0.64460688829422, + "step": 2404 + }, + { + "epoch": 1.8213474640423921, + "grad_norm": 3.188676595687866, + "learning_rate": 1.9594582266306244e-06, + "loss": 0.8740291595458984, + "step": 2406 + }, + { + "epoch": 1.8228614685844058, + "grad_norm": 3.1326589584350586, + "learning_rate": 1.957812947854771e-06, + "loss": 0.3625584840774536, + "step": 2408 + }, + { + "epoch": 1.8243754731264192, + "grad_norm": 6.708612442016602, + "learning_rate": 1.9561671866735053e-06, + "loss": 0.44755834341049194, + "step": 2410 + }, + { + "epoch": 1.825889477668433, + "grad_norm": 1.5371860265731812, + "learning_rate": 1.954520945666068e-06, + "loss": 0.6847828030586243, + "step": 2412 + }, + { + "epoch": 1.8274034822104466, + "grad_norm": 1.0486583709716797, + "learning_rate": 1.9528742274124527e-06, + "loss": 0.7758064270019531, + "step": 2414 + }, + { + "epoch": 1.8289174867524602, + "grad_norm": 1.6456979513168335, + "learning_rate": 1.9512270344934027e-06, + "loss": 0.8207234740257263, + "step": 2416 + }, + { + "epoch": 1.830431491294474, + "grad_norm": 1.2004839181900024, + "learning_rate": 1.949579369490403e-06, + "loss": 0.5843564867973328, + "step": 2418 + }, + { + "epoch": 1.8319454958364876, + "grad_norm": 1.1727814674377441, + "learning_rate": 1.9479312349856796e-06, + "loss": 0.698358952999115, + "step": 2420 + }, + { + "epoch": 1.8334595003785013, + "grad_norm": 1.2794691324234009, + "learning_rate": 1.946282633562194e-06, + "loss": 0.9013885259628296, + "step": 2422 + }, + { + "epoch": 1.834973504920515, + "grad_norm": 0.47201627492904663, + "learning_rate": 1.94463356780364e-06, + "loss": 0.7946898937225342, + "step": 2424 + }, + { + "epoch": 1.8364875094625284, + "grad_norm": 7.024264812469482, + "learning_rate": 1.942984040294438e-06, + "loss": 0.7085845470428467, + "step": 2426 + }, + { + "epoch": 1.838001514004542, + "grad_norm": 0.6164209246635437, + "learning_rate": 1.9413340536197326e-06, + "loss": 0.623291015625, + "step": 2428 + }, + { + "epoch": 1.8395155185465555, + "grad_norm": 1.6277772188186646, + "learning_rate": 1.9396836103653883e-06, + "loss": 0.6516735553741455, + "step": 2430 + }, + { + "epoch": 1.8410295230885692, + "grad_norm": 1.297823190689087, + "learning_rate": 1.938032713117985e-06, + "loss": 0.7141111493110657, + "step": 2432 + }, + { + "epoch": 1.8425435276305828, + "grad_norm": 1.170405387878418, + "learning_rate": 1.936381364464814e-06, + "loss": 0.3000938892364502, + "step": 2434 + }, + { + "epoch": 1.8440575321725965, + "grad_norm": 2.2090070247650146, + "learning_rate": 1.934729566993874e-06, + "loss": 0.5821021795272827, + "step": 2436 + }, + { + "epoch": 1.8455715367146102, + "grad_norm": 4.766334533691406, + "learning_rate": 1.9330773232938673e-06, + "loss": 0.5470134615898132, + "step": 2438 + }, + { + "epoch": 1.8470855412566238, + "grad_norm": 1.2278108596801758, + "learning_rate": 1.931424635954195e-06, + "loss": 0.8106773495674133, + "step": 2440 + }, + { + "epoch": 1.8485995457986375, + "grad_norm": 1.0747406482696533, + "learning_rate": 1.9297715075649543e-06, + "loss": 1.1363415718078613, + "step": 2442 + }, + { + "epoch": 1.8501135503406512, + "grad_norm": 1.4492613077163696, + "learning_rate": 1.928117940716933e-06, + "loss": 0.32763731479644775, + "step": 2444 + }, + { + "epoch": 1.8516275548826646, + "grad_norm": 3.6296162605285645, + "learning_rate": 1.9264639380016063e-06, + "loss": 0.782953679561615, + "step": 2446 + }, + { + "epoch": 1.8531415594246783, + "grad_norm": 1.6317901611328125, + "learning_rate": 1.9248095020111323e-06, + "loss": 0.7343980073928833, + "step": 2448 + }, + { + "epoch": 1.8546555639666917, + "grad_norm": 1.8290022611618042, + "learning_rate": 1.923154635338348e-06, + "loss": 0.7735444903373718, + "step": 2450 + }, + { + "epoch": 1.8561695685087054, + "grad_norm": 3.731858968734741, + "learning_rate": 1.921499340576766e-06, + "loss": 0.5414127707481384, + "step": 2452 + }, + { + "epoch": 1.857683573050719, + "grad_norm": 1.2392865419387817, + "learning_rate": 1.9198436203205694e-06, + "loss": 0.6408743858337402, + "step": 2454 + }, + { + "epoch": 1.8591975775927327, + "grad_norm": 1.2399605512619019, + "learning_rate": 1.9181874771646085e-06, + "loss": 1.078896403312683, + "step": 2456 + }, + { + "epoch": 1.8607115821347464, + "grad_norm": 2.1670081615448, + "learning_rate": 1.916530913704395e-06, + "loss": 0.6702110767364502, + "step": 2458 + }, + { + "epoch": 1.86222558667676, + "grad_norm": 0.9086241722106934, + "learning_rate": 1.9148739325361015e-06, + "loss": 1.0246226787567139, + "step": 2460 + }, + { + "epoch": 1.8637395912187738, + "grad_norm": 9.267091751098633, + "learning_rate": 1.913216536256553e-06, + "loss": 0.6695601344108582, + "step": 2462 + }, + { + "epoch": 1.8652535957607874, + "grad_norm": 15.206988334655762, + "learning_rate": 1.9115587274632274e-06, + "loss": 0.33277830481529236, + "step": 2464 + }, + { + "epoch": 1.8667676003028009, + "grad_norm": 1.1482388973236084, + "learning_rate": 1.9099005087542467e-06, + "loss": 0.9272125363349915, + "step": 2466 + }, + { + "epoch": 1.8682816048448145, + "grad_norm": 1.264325499534607, + "learning_rate": 1.9082418827283766e-06, + "loss": 0.6004555225372314, + "step": 2468 + }, + { + "epoch": 1.8697956093868282, + "grad_norm": 1.219954252243042, + "learning_rate": 1.9065828519850212e-06, + "loss": 0.6846041679382324, + "step": 2470 + }, + { + "epoch": 1.8713096139288417, + "grad_norm": 1.4639081954956055, + "learning_rate": 1.9049234191242185e-06, + "loss": 0.5992491841316223, + "step": 2472 + }, + { + "epoch": 1.8728236184708553, + "grad_norm": 1.1715030670166016, + "learning_rate": 1.9032635867466376e-06, + "loss": 1.1000680923461914, + "step": 2474 + }, + { + "epoch": 1.874337623012869, + "grad_norm": 1.3767962455749512, + "learning_rate": 1.9016033574535719e-06, + "loss": 0.8394884467124939, + "step": 2476 + }, + { + "epoch": 1.8758516275548827, + "grad_norm": 1.2146269083023071, + "learning_rate": 1.8999427338469386e-06, + "loss": 0.560164213180542, + "step": 2478 + }, + { + "epoch": 1.8773656320968963, + "grad_norm": 4.951380729675293, + "learning_rate": 1.8982817185292722e-06, + "loss": 0.7559271454811096, + "step": 2480 + }, + { + "epoch": 1.87887963663891, + "grad_norm": 1.567231297492981, + "learning_rate": 1.896620314103721e-06, + "loss": 0.2578279376029968, + "step": 2482 + }, + { + "epoch": 1.8803936411809237, + "grad_norm": 1.6768829822540283, + "learning_rate": 1.894958523174043e-06, + "loss": 0.5915406346321106, + "step": 2484 + }, + { + "epoch": 1.8819076457229371, + "grad_norm": 2.4871420860290527, + "learning_rate": 1.8932963483446027e-06, + "loss": 0.2807641923427582, + "step": 2486 + }, + { + "epoch": 1.8834216502649508, + "grad_norm": 2.202843427658081, + "learning_rate": 1.8916337922203647e-06, + "loss": 0.6352521181106567, + "step": 2488 + }, + { + "epoch": 1.8849356548069645, + "grad_norm": 1.5407449007034302, + "learning_rate": 1.8899708574068928e-06, + "loss": 0.8104386329650879, + "step": 2490 + }, + { + "epoch": 1.886449659348978, + "grad_norm": 3.5783214569091797, + "learning_rate": 1.8883075465103431e-06, + "loss": 0.3671552836894989, + "step": 2492 + }, + { + "epoch": 1.8879636638909916, + "grad_norm": 5.131018161773682, + "learning_rate": 1.8866438621374628e-06, + "loss": 0.16824926435947418, + "step": 2494 + }, + { + "epoch": 1.8894776684330052, + "grad_norm": 2.5843448638916016, + "learning_rate": 1.8849798068955823e-06, + "loss": 0.25553545355796814, + "step": 2496 + }, + { + "epoch": 1.890991672975019, + "grad_norm": 6.959071636199951, + "learning_rate": 1.8833153833926146e-06, + "loss": 0.6022192239761353, + "step": 2498 + }, + { + "epoch": 1.8925056775170326, + "grad_norm": 2.539820432662964, + "learning_rate": 1.8816505942370496e-06, + "loss": 0.5048632621765137, + "step": 2500 + }, + { + "epoch": 1.8940196820590463, + "grad_norm": 3.718407392501831, + "learning_rate": 1.8799854420379487e-06, + "loss": 0.693874716758728, + "step": 2502 + }, + { + "epoch": 1.89553368660106, + "grad_norm": 2.7148478031158447, + "learning_rate": 1.8783199294049453e-06, + "loss": 0.23292014002799988, + "step": 2504 + }, + { + "epoch": 1.8970476911430736, + "grad_norm": 1.9649187326431274, + "learning_rate": 1.8766540589482351e-06, + "loss": 1.0881057977676392, + "step": 2506 + }, + { + "epoch": 1.898561695685087, + "grad_norm": 2.5872409343719482, + "learning_rate": 1.8749878332785754e-06, + "loss": 0.39136096835136414, + "step": 2508 + }, + { + "epoch": 1.9000757002271007, + "grad_norm": 5.54245138168335, + "learning_rate": 1.8733212550072807e-06, + "loss": 0.773819088935852, + "step": 2510 + }, + { + "epoch": 1.9015897047691142, + "grad_norm": 1.4344931840896606, + "learning_rate": 1.8716543267462177e-06, + "loss": 0.203386589884758, + "step": 2512 + }, + { + "epoch": 1.9031037093111278, + "grad_norm": 1.2182456254959106, + "learning_rate": 1.8699870511078017e-06, + "loss": 0.2231517881155014, + "step": 2514 + }, + { + "epoch": 1.9046177138531415, + "grad_norm": 4.056568145751953, + "learning_rate": 1.8683194307049918e-06, + "loss": 0.2915230691432953, + "step": 2516 + }, + { + "epoch": 1.9061317183951552, + "grad_norm": 5.621555328369141, + "learning_rate": 1.866651468151288e-06, + "loss": 0.9799841642379761, + "step": 2518 + }, + { + "epoch": 1.9076457229371688, + "grad_norm": 1.1394728422164917, + "learning_rate": 1.8649831660607262e-06, + "loss": 0.6989929676055908, + "step": 2520 + }, + { + "epoch": 1.9091597274791825, + "grad_norm": 2.0895166397094727, + "learning_rate": 1.8633145270478757e-06, + "loss": 0.15757252275943756, + "step": 2522 + }, + { + "epoch": 1.9106737320211962, + "grad_norm": 1.6885128021240234, + "learning_rate": 1.8616455537278318e-06, + "loss": 0.6867902278900146, + "step": 2524 + }, + { + "epoch": 1.9121877365632098, + "grad_norm": 4.005339622497559, + "learning_rate": 1.8599762487162146e-06, + "loss": 0.2128538340330124, + "step": 2526 + }, + { + "epoch": 1.9137017411052233, + "grad_norm": 1.4398667812347412, + "learning_rate": 1.858306614629165e-06, + "loss": 1.0285841226577759, + "step": 2528 + }, + { + "epoch": 1.915215745647237, + "grad_norm": 7.004055500030518, + "learning_rate": 1.856636654083338e-06, + "loss": 0.29121777415275574, + "step": 2530 + }, + { + "epoch": 1.9167297501892504, + "grad_norm": 11.084014892578125, + "learning_rate": 1.8549663696959016e-06, + "loss": 0.8370655179023743, + "step": 2532 + }, + { + "epoch": 1.918243754731264, + "grad_norm": 1.8854495286941528, + "learning_rate": 1.8532957640845296e-06, + "loss": 1.1610441207885742, + "step": 2534 + }, + { + "epoch": 1.9197577592732777, + "grad_norm": 3.447690725326538, + "learning_rate": 1.851624839867402e-06, + "loss": 0.19941510260105133, + "step": 2536 + }, + { + "epoch": 1.9212717638152914, + "grad_norm": 1.0303577184677124, + "learning_rate": 1.8499535996631946e-06, + "loss": 1.0520439147949219, + "step": 2538 + }, + { + "epoch": 1.922785768357305, + "grad_norm": 5.277173042297363, + "learning_rate": 1.8482820460910817e-06, + "loss": 0.32459500432014465, + "step": 2540 + }, + { + "epoch": 1.9242997728993188, + "grad_norm": 2.280444383621216, + "learning_rate": 1.8466101817707271e-06, + "loss": 0.7902064919471741, + "step": 2542 + }, + { + "epoch": 1.9258137774413324, + "grad_norm": 3.376765727996826, + "learning_rate": 1.844938009322281e-06, + "loss": 0.7843132019042969, + "step": 2544 + }, + { + "epoch": 1.927327781983346, + "grad_norm": 1.1487925052642822, + "learning_rate": 1.8432655313663771e-06, + "loss": 1.1185412406921387, + "step": 2546 + }, + { + "epoch": 1.9288417865253595, + "grad_norm": 1.0185201168060303, + "learning_rate": 1.8415927505241298e-06, + "loss": 1.1568421125411987, + "step": 2548 + }, + { + "epoch": 1.9303557910673732, + "grad_norm": 3.1142921447753906, + "learning_rate": 1.8399196694171252e-06, + "loss": 0.3019213080406189, + "step": 2550 + }, + { + "epoch": 1.9318697956093869, + "grad_norm": 0.9440644383430481, + "learning_rate": 1.838246290667421e-06, + "loss": 0.6780743598937988, + "step": 2552 + }, + { + "epoch": 1.9333838001514003, + "grad_norm": 3.2379441261291504, + "learning_rate": 1.8365726168975425e-06, + "loss": 0.2820740342140198, + "step": 2554 + }, + { + "epoch": 1.934897804693414, + "grad_norm": 2.119868755340576, + "learning_rate": 1.8348986507304757e-06, + "loss": 0.6184734106063843, + "step": 2556 + }, + { + "epoch": 1.9364118092354277, + "grad_norm": 0.9730226993560791, + "learning_rate": 1.833224394789666e-06, + "loss": 1.096203088760376, + "step": 2558 + }, + { + "epoch": 1.9379258137774413, + "grad_norm": 1.5899343490600586, + "learning_rate": 1.8315498516990123e-06, + "loss": 0.5974529981613159, + "step": 2560 + }, + { + "epoch": 1.939439818319455, + "grad_norm": 0.6129815578460693, + "learning_rate": 1.8298750240828638e-06, + "loss": 0.5191044807434082, + "step": 2562 + }, + { + "epoch": 1.9409538228614687, + "grad_norm": 2.552039623260498, + "learning_rate": 1.828199914566016e-06, + "loss": 0.577735185623169, + "step": 2564 + }, + { + "epoch": 1.9424678274034823, + "grad_norm": 2.82818603515625, + "learning_rate": 1.826524525773705e-06, + "loss": 1.1200027465820312, + "step": 2566 + }, + { + "epoch": 1.9439818319454958, + "grad_norm": 4.20834493637085, + "learning_rate": 1.8248488603316063e-06, + "loss": 0.8434380888938904, + "step": 2568 + }, + { + "epoch": 1.9454958364875095, + "grad_norm": 1.315384030342102, + "learning_rate": 1.8231729208658271e-06, + "loss": 0.6373313665390015, + "step": 2570 + }, + { + "epoch": 1.9470098410295231, + "grad_norm": 1.0309563875198364, + "learning_rate": 1.821496710002905e-06, + "loss": 0.6392558813095093, + "step": 2572 + }, + { + "epoch": 1.9485238455715366, + "grad_norm": 2.582773208618164, + "learning_rate": 1.8198202303698038e-06, + "loss": 0.7057768702507019, + "step": 2574 + }, + { + "epoch": 1.9500378501135502, + "grad_norm": 2.0692710876464844, + "learning_rate": 1.8181434845939077e-06, + "loss": 0.7295529246330261, + "step": 2576 + }, + { + "epoch": 1.951551854655564, + "grad_norm": 1.8829554319381714, + "learning_rate": 1.8164664753030164e-06, + "loss": 0.7367881536483765, + "step": 2578 + }, + { + "epoch": 1.9530658591975776, + "grad_norm": 4.824670314788818, + "learning_rate": 1.8147892051253455e-06, + "loss": 0.3595852255821228, + "step": 2580 + }, + { + "epoch": 1.9545798637395913, + "grad_norm": 3.236971855163574, + "learning_rate": 1.8131116766895169e-06, + "loss": 0.7151616811752319, + "step": 2582 + }, + { + "epoch": 1.956093868281605, + "grad_norm": 1.09900963306427, + "learning_rate": 1.8114338926245596e-06, + "loss": 0.6052298545837402, + "step": 2584 + }, + { + "epoch": 1.9576078728236186, + "grad_norm": 1.101852536201477, + "learning_rate": 1.8097558555599016e-06, + "loss": 1.1992706060409546, + "step": 2586 + }, + { + "epoch": 1.9591218773656323, + "grad_norm": 3.309662342071533, + "learning_rate": 1.8080775681253673e-06, + "loss": 0.7572805285453796, + "step": 2588 + }, + { + "epoch": 1.9606358819076457, + "grad_norm": 23.722898483276367, + "learning_rate": 1.8063990329511749e-06, + "loss": 0.435437947511673, + "step": 2590 + }, + { + "epoch": 1.9621498864496594, + "grad_norm": 0.9340125322341919, + "learning_rate": 1.8047202526679291e-06, + "loss": 0.6136574745178223, + "step": 2592 + }, + { + "epoch": 1.9636638909916728, + "grad_norm": 3.3110859394073486, + "learning_rate": 1.8030412299066201e-06, + "loss": 0.21591010689735413, + "step": 2594 + }, + { + "epoch": 1.9651778955336865, + "grad_norm": 2.339695930480957, + "learning_rate": 1.8013619672986173e-06, + "loss": 0.23273268342018127, + "step": 2596 + }, + { + "epoch": 1.9666919000757002, + "grad_norm": 1.499258041381836, + "learning_rate": 1.799682467475667e-06, + "loss": 0.2774851620197296, + "step": 2598 + }, + { + "epoch": 1.9682059046177138, + "grad_norm": 5.845078945159912, + "learning_rate": 1.798002733069886e-06, + "loss": 0.286465585231781, + "step": 2600 + }, + { + "epoch": 1.9697199091597275, + "grad_norm": 2.347050905227661, + "learning_rate": 1.796322766713759e-06, + "loss": 0.3097231388092041, + "step": 2602 + }, + { + "epoch": 1.9712339137017412, + "grad_norm": 2.2310895919799805, + "learning_rate": 1.7946425710401357e-06, + "loss": 0.6837670207023621, + "step": 2604 + }, + { + "epoch": 1.9727479182437548, + "grad_norm": 1.228422999382019, + "learning_rate": 1.7929621486822223e-06, + "loss": 0.6572983264923096, + "step": 2606 + }, + { + "epoch": 1.9742619227857685, + "grad_norm": 3.3019800186157227, + "learning_rate": 1.7912815022735837e-06, + "loss": 0.1820627599954605, + "step": 2608 + }, + { + "epoch": 1.975775927327782, + "grad_norm": 1.2635468244552612, + "learning_rate": 1.789600634448133e-06, + "loss": 1.140749216079712, + "step": 2610 + }, + { + "epoch": 1.9772899318697956, + "grad_norm": 2.928532361984253, + "learning_rate": 1.7879195478401319e-06, + "loss": 0.6802592873573303, + "step": 2612 + }, + { + "epoch": 1.978803936411809, + "grad_norm": 1.6667323112487793, + "learning_rate": 1.7862382450841844e-06, + "loss": 0.701367199420929, + "step": 2614 + }, + { + "epoch": 1.9803179409538227, + "grad_norm": 4.334624290466309, + "learning_rate": 1.784556728815234e-06, + "loss": 0.6233445405960083, + "step": 2616 + }, + { + "epoch": 1.9818319454958364, + "grad_norm": 5.343925476074219, + "learning_rate": 1.7828750016685576e-06, + "loss": 0.14193308353424072, + "step": 2618 + }, + { + "epoch": 1.98334595003785, + "grad_norm": 4.146429061889648, + "learning_rate": 1.7811930662797638e-06, + "loss": 0.7617315649986267, + "step": 2620 + }, + { + "epoch": 1.9848599545798638, + "grad_norm": 1.9355127811431885, + "learning_rate": 1.7795109252847867e-06, + "loss": 0.8325185179710388, + "step": 2622 + }, + { + "epoch": 1.9863739591218774, + "grad_norm": 1.6665109395980835, + "learning_rate": 1.7778285813198826e-06, + "loss": 0.27139389514923096, + "step": 2624 + }, + { + "epoch": 1.987887963663891, + "grad_norm": 4.608744144439697, + "learning_rate": 1.7761460370216267e-06, + "loss": 0.6247193813323975, + "step": 2626 + }, + { + "epoch": 1.9894019682059048, + "grad_norm": 1.610708236694336, + "learning_rate": 1.7744632950269075e-06, + "loss": 1.086316466331482, + "step": 2628 + }, + { + "epoch": 1.9909159727479182, + "grad_norm": 1.352529764175415, + "learning_rate": 1.772780357972924e-06, + "loss": 0.17539137601852417, + "step": 2630 + }, + { + "epoch": 1.9924299772899319, + "grad_norm": 0.9066135883331299, + "learning_rate": 1.7710972284971793e-06, + "loss": 0.9914766550064087, + "step": 2632 + }, + { + "epoch": 1.9939439818319455, + "grad_norm": 1.6388225555419922, + "learning_rate": 1.7694139092374802e-06, + "loss": 0.6597426533699036, + "step": 2634 + }, + { + "epoch": 1.995457986373959, + "grad_norm": 2.3083913326263428, + "learning_rate": 1.7677304028319295e-06, + "loss": 0.9186007976531982, + "step": 2636 + }, + { + "epoch": 1.9969719909159727, + "grad_norm": 2.0622379779815674, + "learning_rate": 1.7660467119189236e-06, + "loss": 0.1676500141620636, + "step": 2638 + }, + { + "epoch": 1.9984859954579863, + "grad_norm": 14.542463302612305, + "learning_rate": 1.7643628391371484e-06, + "loss": 0.34981614351272583, + "step": 2640 + }, + { + "epoch": 2.0, + "grad_norm": 1.9076281785964966, + "learning_rate": 1.762678787125574e-06, + "loss": 0.7317795157432556, + "step": 2642 + }, + { + "epoch": 2.0015140045420137, + "grad_norm": 0.3897881507873535, + "learning_rate": 1.7609945585234533e-06, + "loss": 0.1255609542131424, + "step": 2644 + }, + { + "epoch": 2.0030280090840273, + "grad_norm": 1.0713465213775635, + "learning_rate": 1.7593101559703132e-06, + "loss": 0.5909802317619324, + "step": 2646 + }, + { + "epoch": 2.004542013626041, + "grad_norm": 1.4597793817520142, + "learning_rate": 1.7576255821059549e-06, + "loss": 0.5179256200790405, + "step": 2648 + }, + { + "epoch": 2.0060560181680547, + "grad_norm": 2.0294458866119385, + "learning_rate": 1.7559408395704483e-06, + "loss": 0.13123907148838043, + "step": 2650 + }, + { + "epoch": 2.0075700227100683, + "grad_norm": 1.9035502672195435, + "learning_rate": 1.7542559310041272e-06, + "loss": 0.5252999067306519, + "step": 2652 + }, + { + "epoch": 2.0090840272520816, + "grad_norm": 14.54428768157959, + "learning_rate": 1.7525708590475855e-06, + "loss": 0.0995216816663742, + "step": 2654 + }, + { + "epoch": 2.0105980317940952, + "grad_norm": 8.40689754486084, + "learning_rate": 1.7508856263416728e-06, + "loss": 0.19288885593414307, + "step": 2656 + }, + { + "epoch": 2.012112036336109, + "grad_norm": 5.6519341468811035, + "learning_rate": 1.7492002355274917e-06, + "loss": 0.15885737538337708, + "step": 2658 + }, + { + "epoch": 2.0136260408781226, + "grad_norm": 1.5922472476959229, + "learning_rate": 1.7475146892463911e-06, + "loss": 0.6549367308616638, + "step": 2660 + }, + { + "epoch": 2.0151400454201362, + "grad_norm": 4.163782596588135, + "learning_rate": 1.7458289901399652e-06, + "loss": 0.5245522260665894, + "step": 2662 + }, + { + "epoch": 2.01665404996215, + "grad_norm": 3.053931474685669, + "learning_rate": 1.7441431408500469e-06, + "loss": 0.5646790266036987, + "step": 2664 + }, + { + "epoch": 2.0181680545041636, + "grad_norm": 2.4408509731292725, + "learning_rate": 1.7424571440187036e-06, + "loss": 0.9292944669723511, + "step": 2666 + }, + { + "epoch": 2.0196820590461773, + "grad_norm": 1.557814359664917, + "learning_rate": 1.7407710022882353e-06, + "loss": 0.6096762418746948, + "step": 2668 + }, + { + "epoch": 2.021196063588191, + "grad_norm": 0.5476527214050293, + "learning_rate": 1.7390847183011696e-06, + "loss": 0.1541244387626648, + "step": 2670 + }, + { + "epoch": 2.0227100681302046, + "grad_norm": 1.2953879833221436, + "learning_rate": 1.7373982947002545e-06, + "loss": 0.5491629242897034, + "step": 2672 + }, + { + "epoch": 2.024224072672218, + "grad_norm": 1.1560670137405396, + "learning_rate": 1.7357117341284586e-06, + "loss": 0.48358261585235596, + "step": 2674 + }, + { + "epoch": 2.0257380772142315, + "grad_norm": 1.6420649290084839, + "learning_rate": 1.7340250392289654e-06, + "loss": 0.5866899490356445, + "step": 2676 + }, + { + "epoch": 2.027252081756245, + "grad_norm": 1.2109801769256592, + "learning_rate": 1.7323382126451683e-06, + "loss": 0.9471041560173035, + "step": 2678 + }, + { + "epoch": 2.028766086298259, + "grad_norm": 1.4129337072372437, + "learning_rate": 1.7306512570206675e-06, + "loss": 0.9473216533660889, + "step": 2680 + }, + { + "epoch": 2.0302800908402725, + "grad_norm": 1.448809027671814, + "learning_rate": 1.7289641749992642e-06, + "loss": 0.54279625415802, + "step": 2682 + }, + { + "epoch": 2.031794095382286, + "grad_norm": 1.7061611413955688, + "learning_rate": 1.7272769692249596e-06, + "loss": 0.46282655000686646, + "step": 2684 + }, + { + "epoch": 2.0333080999243, + "grad_norm": 1.9974749088287354, + "learning_rate": 1.7255896423419474e-06, + "loss": 0.217169389128685, + "step": 2686 + }, + { + "epoch": 2.0348221044663135, + "grad_norm": 2.0513646602630615, + "learning_rate": 1.7239021969946115e-06, + "loss": 0.44249847531318665, + "step": 2688 + }, + { + "epoch": 2.036336109008327, + "grad_norm": 3.0641345977783203, + "learning_rate": 1.7222146358275214e-06, + "loss": 0.12440875172615051, + "step": 2690 + }, + { + "epoch": 2.037850113550341, + "grad_norm": 2.184577703475952, + "learning_rate": 1.720526961485429e-06, + "loss": 0.8467821478843689, + "step": 2692 + }, + { + "epoch": 2.039364118092354, + "grad_norm": 2.6016576290130615, + "learning_rate": 1.7188391766132618e-06, + "loss": 0.17708347737789154, + "step": 2694 + }, + { + "epoch": 2.0408781226343677, + "grad_norm": 1.0353049039840698, + "learning_rate": 1.7171512838561219e-06, + "loss": 0.8818671703338623, + "step": 2696 + }, + { + "epoch": 2.0423921271763814, + "grad_norm": 1.9168877601623535, + "learning_rate": 1.7154632858592804e-06, + "loss": 0.5648282766342163, + "step": 2698 + }, + { + "epoch": 2.043906131718395, + "grad_norm": 2.6725902557373047, + "learning_rate": 1.7137751852681728e-06, + "loss": 0.5454245805740356, + "step": 2700 + }, + { + "epoch": 2.0454201362604087, + "grad_norm": 2.3722050189971924, + "learning_rate": 1.7120869847283955e-06, + "loss": 0.13117262721061707, + "step": 2702 + }, + { + "epoch": 2.0469341408024224, + "grad_norm": 8.196845054626465, + "learning_rate": 1.7103986868857016e-06, + "loss": 0.6494061946868896, + "step": 2704 + }, + { + "epoch": 2.048448145344436, + "grad_norm": 1.1092252731323242, + "learning_rate": 1.7087102943859973e-06, + "loss": 0.5388786792755127, + "step": 2706 + }, + { + "epoch": 2.0499621498864498, + "grad_norm": 2.326972246170044, + "learning_rate": 1.7070218098753363e-06, + "loss": 0.48522213101387024, + "step": 2708 + }, + { + "epoch": 2.0514761544284634, + "grad_norm": 2.6585757732391357, + "learning_rate": 1.7053332359999166e-06, + "loss": 0.5669052600860596, + "step": 2710 + }, + { + "epoch": 2.052990158970477, + "grad_norm": 2.057875871658325, + "learning_rate": 1.7036445754060766e-06, + "loss": 1.1287788152694702, + "step": 2712 + }, + { + "epoch": 2.0545041635124903, + "grad_norm": 1.085253119468689, + "learning_rate": 1.7019558307402901e-06, + "loss": 0.9731413722038269, + "step": 2714 + }, + { + "epoch": 2.056018168054504, + "grad_norm": 1.3714567422866821, + "learning_rate": 1.7002670046491641e-06, + "loss": 0.4441766142845154, + "step": 2716 + }, + { + "epoch": 2.0575321725965177, + "grad_norm": 2.284273147583008, + "learning_rate": 1.6985780997794308e-06, + "loss": 0.30170539021492004, + "step": 2718 + }, + { + "epoch": 2.0590461771385313, + "grad_norm": 0.35695508122444153, + "learning_rate": 1.696889118777948e-06, + "loss": 0.4801369309425354, + "step": 2720 + }, + { + "epoch": 2.060560181680545, + "grad_norm": 2.643681049346924, + "learning_rate": 1.6952000642916918e-06, + "loss": 0.3815020024776459, + "step": 2722 + }, + { + "epoch": 2.0620741862225587, + "grad_norm": 1.5044199228286743, + "learning_rate": 1.6935109389677534e-06, + "loss": 0.4453948438167572, + "step": 2724 + }, + { + "epoch": 2.0635881907645723, + "grad_norm": 1.911596417427063, + "learning_rate": 1.6918217454533359e-06, + "loss": 0.20199593901634216, + "step": 2726 + }, + { + "epoch": 2.065102195306586, + "grad_norm": 4.796116352081299, + "learning_rate": 1.6901324863957482e-06, + "loss": 0.24311362206935883, + "step": 2728 + }, + { + "epoch": 2.0666161998485997, + "grad_norm": 0.43947383761405945, + "learning_rate": 1.6884431644424022e-06, + "loss": 0.12444750219583511, + "step": 2730 + }, + { + "epoch": 2.0681302043906133, + "grad_norm": 1.2617090940475464, + "learning_rate": 1.6867537822408093e-06, + "loss": 0.4943394064903259, + "step": 2732 + }, + { + "epoch": 2.069644208932627, + "grad_norm": 0.9398093223571777, + "learning_rate": 1.6850643424385733e-06, + "loss": 0.0803423523902893, + "step": 2734 + }, + { + "epoch": 2.0711582134746402, + "grad_norm": 1.8321349620819092, + "learning_rate": 1.6833748476833906e-06, + "loss": 0.4664800465106964, + "step": 2736 + }, + { + "epoch": 2.072672218016654, + "grad_norm": 1.0088332891464233, + "learning_rate": 1.6816853006230427e-06, + "loss": 0.9422508478164673, + "step": 2738 + }, + { + "epoch": 2.0741862225586676, + "grad_norm": 1.2515548467636108, + "learning_rate": 1.6799957039053924e-06, + "loss": 0.563462495803833, + "step": 2740 + }, + { + "epoch": 2.0757002271006812, + "grad_norm": 4.0011887550354, + "learning_rate": 1.6783060601783816e-06, + "loss": 0.6821116805076599, + "step": 2742 + }, + { + "epoch": 2.077214231642695, + "grad_norm": 1.3348299264907837, + "learning_rate": 1.6766163720900242e-06, + "loss": 0.4521723985671997, + "step": 2744 + }, + { + "epoch": 2.0787282361847086, + "grad_norm": 3.2145044803619385, + "learning_rate": 1.674926642288406e-06, + "loss": 0.14956560730934143, + "step": 2746 + }, + { + "epoch": 2.0802422407267223, + "grad_norm": 2.3695411682128906, + "learning_rate": 1.6732368734216756e-06, + "loss": 0.9937928915023804, + "step": 2748 + }, + { + "epoch": 2.081756245268736, + "grad_norm": 1.76715087890625, + "learning_rate": 1.6715470681380446e-06, + "loss": 0.5776373744010925, + "step": 2750 + }, + { + "epoch": 2.0832702498107496, + "grad_norm": 2.0524466037750244, + "learning_rate": 1.6698572290857814e-06, + "loss": 0.5806592106819153, + "step": 2752 + }, + { + "epoch": 2.0847842543527633, + "grad_norm": 2.8029470443725586, + "learning_rate": 1.6681673589132063e-06, + "loss": 0.4764387309551239, + "step": 2754 + }, + { + "epoch": 2.0862982588947765, + "grad_norm": 2.1075892448425293, + "learning_rate": 1.6664774602686903e-06, + "loss": 0.5034778714179993, + "step": 2756 + }, + { + "epoch": 2.08781226343679, + "grad_norm": 1.682845950126648, + "learning_rate": 1.6647875358006466e-06, + "loss": 0.5085087418556213, + "step": 2758 + }, + { + "epoch": 2.089326267978804, + "grad_norm": 5.653558731079102, + "learning_rate": 1.663097588157531e-06, + "loss": 0.07237551361322403, + "step": 2760 + }, + { + "epoch": 2.0908402725208175, + "grad_norm": 7.3040242195129395, + "learning_rate": 1.661407619987834e-06, + "loss": 0.5782265663146973, + "step": 2762 + }, + { + "epoch": 2.092354277062831, + "grad_norm": 2.716933250427246, + "learning_rate": 1.6597176339400792e-06, + "loss": 0.7445158958435059, + "step": 2764 + }, + { + "epoch": 2.093868281604845, + "grad_norm": 2.811447858810425, + "learning_rate": 1.6580276326628184e-06, + "loss": 0.22737711668014526, + "step": 2766 + }, + { + "epoch": 2.0953822861468585, + "grad_norm": 1.1847596168518066, + "learning_rate": 1.6563376188046265e-06, + "loss": 0.20628035068511963, + "step": 2768 + }, + { + "epoch": 2.096896290688872, + "grad_norm": 2.039379596710205, + "learning_rate": 1.6546475950140986e-06, + "loss": 0.2262708693742752, + "step": 2770 + }, + { + "epoch": 2.098410295230886, + "grad_norm": 1.84257972240448, + "learning_rate": 1.6529575639398453e-06, + "loss": 0.3347666561603546, + "step": 2772 + }, + { + "epoch": 2.0999242997728995, + "grad_norm": 4.7747602462768555, + "learning_rate": 1.6512675282304884e-06, + "loss": 0.35915127396583557, + "step": 2774 + }, + { + "epoch": 2.1014383043149127, + "grad_norm": 4.393153667449951, + "learning_rate": 1.6495774905346575e-06, + "loss": 0.09371113777160645, + "step": 2776 + }, + { + "epoch": 2.1029523088569264, + "grad_norm": 3.672687292098999, + "learning_rate": 1.6478874535009847e-06, + "loss": 1.0071041584014893, + "step": 2778 + }, + { + "epoch": 2.10446631339894, + "grad_norm": 4.046291351318359, + "learning_rate": 1.6461974197781015e-06, + "loss": 0.4997747242450714, + "step": 2780 + }, + { + "epoch": 2.1059803179409537, + "grad_norm": 2.4618570804595947, + "learning_rate": 1.6445073920146336e-06, + "loss": 0.6052404642105103, + "step": 2782 + }, + { + "epoch": 2.1074943224829674, + "grad_norm": 1.6199997663497925, + "learning_rate": 1.6428173728591981e-06, + "loss": 0.9817624688148499, + "step": 2784 + }, + { + "epoch": 2.109008327024981, + "grad_norm": 1.276282548904419, + "learning_rate": 1.6411273649603988e-06, + "loss": 1.0199428796768188, + "step": 2786 + }, + { + "epoch": 2.1105223315669948, + "grad_norm": 1.0883533954620361, + "learning_rate": 1.6394373709668207e-06, + "loss": 0.5439098477363586, + "step": 2788 + }, + { + "epoch": 2.1120363361090084, + "grad_norm": 1.2916643619537354, + "learning_rate": 1.6377473935270272e-06, + "loss": 0.489423930644989, + "step": 2790 + }, + { + "epoch": 2.113550340651022, + "grad_norm": 1.513370394706726, + "learning_rate": 1.6360574352895573e-06, + "loss": 0.9049098491668701, + "step": 2792 + }, + { + "epoch": 2.1150643451930358, + "grad_norm": 1.0032689571380615, + "learning_rate": 1.6343674989029185e-06, + "loss": 0.6421214938163757, + "step": 2794 + }, + { + "epoch": 2.116578349735049, + "grad_norm": 0.7619471549987793, + "learning_rate": 1.632677587015584e-06, + "loss": 0.9970691800117493, + "step": 2796 + }, + { + "epoch": 2.1180923542770627, + "grad_norm": 1.3667787313461304, + "learning_rate": 1.6309877022759894e-06, + "loss": 1.014106035232544, + "step": 2798 + }, + { + "epoch": 2.1196063588190763, + "grad_norm": 5.2187819480896, + "learning_rate": 1.6292978473325269e-06, + "loss": 0.2597154378890991, + "step": 2800 + }, + { + "epoch": 2.12112036336109, + "grad_norm": 2.2075483798980713, + "learning_rate": 1.627608024833543e-06, + "loss": 0.549049437046051, + "step": 2802 + }, + { + "epoch": 2.1226343679031037, + "grad_norm": 1.4524098634719849, + "learning_rate": 1.6259182374273325e-06, + "loss": 0.552122950553894, + "step": 2804 + }, + { + "epoch": 2.1241483724451173, + "grad_norm": 7.213001728057861, + "learning_rate": 1.6242284877621352e-06, + "loss": 0.18227112293243408, + "step": 2806 + }, + { + "epoch": 2.125662376987131, + "grad_norm": 0.9364220499992371, + "learning_rate": 1.6225387784861332e-06, + "loss": 0.9016151428222656, + "step": 2808 + }, + { + "epoch": 2.1271763815291447, + "grad_norm": 3.105168104171753, + "learning_rate": 1.6208491122474423e-06, + "loss": 0.16485339403152466, + "step": 2810 + }, + { + "epoch": 2.1286903860711583, + "grad_norm": 2.6125948429107666, + "learning_rate": 1.6191594916941145e-06, + "loss": 0.5917508006095886, + "step": 2812 + }, + { + "epoch": 2.130204390613172, + "grad_norm": 2.9224133491516113, + "learning_rate": 1.6174699194741276e-06, + "loss": 0.6452589631080627, + "step": 2814 + }, + { + "epoch": 2.1317183951551852, + "grad_norm": 1.6223468780517578, + "learning_rate": 1.6157803982353844e-06, + "loss": 0.5422303676605225, + "step": 2816 + }, + { + "epoch": 2.133232399697199, + "grad_norm": 1.1642112731933594, + "learning_rate": 1.6140909306257075e-06, + "loss": 0.49290937185287476, + "step": 2818 + }, + { + "epoch": 2.1347464042392126, + "grad_norm": 5.4384613037109375, + "learning_rate": 1.6124015192928368e-06, + "loss": 0.07090282440185547, + "step": 2820 + }, + { + "epoch": 2.1362604087812262, + "grad_norm": 5.15507698059082, + "learning_rate": 1.6107121668844229e-06, + "loss": 0.9372910857200623, + "step": 2822 + }, + { + "epoch": 2.13777441332324, + "grad_norm": 1.4201043844223022, + "learning_rate": 1.6090228760480233e-06, + "loss": 0.3477085530757904, + "step": 2824 + }, + { + "epoch": 2.1392884178652536, + "grad_norm": 1.6032931804656982, + "learning_rate": 1.6073336494311e-06, + "loss": 0.5849123597145081, + "step": 2826 + }, + { + "epoch": 2.1408024224072673, + "grad_norm": 1.297904133796692, + "learning_rate": 1.605644489681015e-06, + "loss": 0.5991969704627991, + "step": 2828 + }, + { + "epoch": 2.142316426949281, + "grad_norm": 1.5555821657180786, + "learning_rate": 1.6039553994450242e-06, + "loss": 0.36953338980674744, + "step": 2830 + }, + { + "epoch": 2.1438304314912946, + "grad_norm": 0.9411146640777588, + "learning_rate": 1.602266381370275e-06, + "loss": 0.5315758585929871, + "step": 2832 + }, + { + "epoch": 2.1453444360333083, + "grad_norm": 4.079467296600342, + "learning_rate": 1.6005774381038027e-06, + "loss": 0.5107845664024353, + "step": 2834 + }, + { + "epoch": 2.146858440575322, + "grad_norm": 6.101061820983887, + "learning_rate": 1.5988885722925236e-06, + "loss": 0.1170424371957779, + "step": 2836 + }, + { + "epoch": 2.148372445117335, + "grad_norm": 2.4054558277130127, + "learning_rate": 1.5971997865832336e-06, + "loss": 0.11238373816013336, + "step": 2838 + }, + { + "epoch": 2.149886449659349, + "grad_norm": 2.8460471630096436, + "learning_rate": 1.5955110836226026e-06, + "loss": 0.5437900424003601, + "step": 2840 + }, + { + "epoch": 2.1514004542013625, + "grad_norm": 0.8729496598243713, + "learning_rate": 1.593822466057172e-06, + "loss": 0.08469262719154358, + "step": 2842 + }, + { + "epoch": 2.152914458743376, + "grad_norm": 12.420215606689453, + "learning_rate": 1.592133936533348e-06, + "loss": 0.4006781578063965, + "step": 2844 + }, + { + "epoch": 2.15442846328539, + "grad_norm": 1.931260108947754, + "learning_rate": 1.5904454976973997e-06, + "loss": 0.07847610116004944, + "step": 2846 + }, + { + "epoch": 2.1559424678274035, + "grad_norm": 1.1206101179122925, + "learning_rate": 1.5887571521954526e-06, + "loss": 0.9250069856643677, + "step": 2848 + }, + { + "epoch": 2.157456472369417, + "grad_norm": 1.3575100898742676, + "learning_rate": 1.5870689026734887e-06, + "loss": 1.0127815008163452, + "step": 2850 + }, + { + "epoch": 2.158970476911431, + "grad_norm": 1.4162954092025757, + "learning_rate": 1.5853807517773366e-06, + "loss": 0.8721470236778259, + "step": 2852 + }, + { + "epoch": 2.1604844814534445, + "grad_norm": 0.927931010723114, + "learning_rate": 1.5836927021526724e-06, + "loss": 0.20841215550899506, + "step": 2854 + }, + { + "epoch": 2.161998485995458, + "grad_norm": 2.4305167198181152, + "learning_rate": 1.5820047564450122e-06, + "loss": 0.9724196195602417, + "step": 2856 + }, + { + "epoch": 2.1635124905374714, + "grad_norm": 1.445548415184021, + "learning_rate": 1.5803169172997105e-06, + "loss": 0.7447295188903809, + "step": 2858 + }, + { + "epoch": 2.165026495079485, + "grad_norm": 2.3129169940948486, + "learning_rate": 1.578629187361954e-06, + "loss": 0.17623990774154663, + "step": 2860 + }, + { + "epoch": 2.1665404996214987, + "grad_norm": 2.175723075866699, + "learning_rate": 1.576941569276757e-06, + "loss": 0.1559004932641983, + "step": 2862 + }, + { + "epoch": 2.1680545041635124, + "grad_norm": 2.820603370666504, + "learning_rate": 1.5752540656889617e-06, + "loss": 0.15261517465114594, + "step": 2864 + }, + { + "epoch": 2.169568508705526, + "grad_norm": 2.5395634174346924, + "learning_rate": 1.5735666792432283e-06, + "loss": 0.14439840614795685, + "step": 2866 + }, + { + "epoch": 2.1710825132475398, + "grad_norm": 1.755033016204834, + "learning_rate": 1.5718794125840328e-06, + "loss": 0.5299633145332336, + "step": 2868 + }, + { + "epoch": 2.1725965177895534, + "grad_norm": 2.8602867126464844, + "learning_rate": 1.570192268355667e-06, + "loss": 0.20961156487464905, + "step": 2870 + }, + { + "epoch": 2.174110522331567, + "grad_norm": 1.7750110626220703, + "learning_rate": 1.5685052492022274e-06, + "loss": 0.2456619143486023, + "step": 2872 + }, + { + "epoch": 2.1756245268735808, + "grad_norm": 0.880752682685852, + "learning_rate": 1.5668183577676157e-06, + "loss": 0.5891717076301575, + "step": 2874 + }, + { + "epoch": 2.1771385314155944, + "grad_norm": 1.4070804119110107, + "learning_rate": 1.5651315966955332e-06, + "loss": 0.5674391984939575, + "step": 2876 + }, + { + "epoch": 2.1786525359576077, + "grad_norm": 3.136618137359619, + "learning_rate": 1.5634449686294778e-06, + "loss": 0.27115634083747864, + "step": 2878 + }, + { + "epoch": 2.1801665404996213, + "grad_norm": 1.1265112161636353, + "learning_rate": 1.561758476212738e-06, + "loss": 0.6448878049850464, + "step": 2880 + }, + { + "epoch": 2.181680545041635, + "grad_norm": 1.073952555656433, + "learning_rate": 1.56007212208839e-06, + "loss": 0.0826905369758606, + "step": 2882 + }, + { + "epoch": 2.1831945495836487, + "grad_norm": 1.2602885961532593, + "learning_rate": 1.5583859088992927e-06, + "loss": 0.4014187455177307, + "step": 2884 + }, + { + "epoch": 2.1847085541256623, + "grad_norm": 0.9830121994018555, + "learning_rate": 1.5566998392880854e-06, + "loss": 0.7575472593307495, + "step": 2886 + }, + { + "epoch": 2.186222558667676, + "grad_norm": 1.6392723321914673, + "learning_rate": 1.5550139158971817e-06, + "loss": 0.24749702215194702, + "step": 2888 + }, + { + "epoch": 2.1877365632096897, + "grad_norm": 1.1800613403320312, + "learning_rate": 1.553328141368765e-06, + "loss": 0.5460869669914246, + "step": 2890 + }, + { + "epoch": 2.1892505677517033, + "grad_norm": 1.165030598640442, + "learning_rate": 1.551642518344788e-06, + "loss": 0.7511795163154602, + "step": 2892 + }, + { + "epoch": 2.190764572293717, + "grad_norm": 3.8945846557617188, + "learning_rate": 1.5499570494669635e-06, + "loss": 0.7876091599464417, + "step": 2894 + }, + { + "epoch": 2.1922785768357307, + "grad_norm": 2.5353126525878906, + "learning_rate": 1.548271737376763e-06, + "loss": 0.5594918131828308, + "step": 2896 + }, + { + "epoch": 2.1937925813777444, + "grad_norm": 1.0810461044311523, + "learning_rate": 1.5465865847154133e-06, + "loss": 0.49233466386795044, + "step": 2898 + }, + { + "epoch": 2.1953065859197576, + "grad_norm": 1.424275517463684, + "learning_rate": 1.5449015941238916e-06, + "loss": 0.10253500938415527, + "step": 2900 + }, + { + "epoch": 2.1968205904617712, + "grad_norm": 1.9398629665374756, + "learning_rate": 1.5432167682429199e-06, + "loss": 0.1983412206172943, + "step": 2902 + }, + { + "epoch": 2.198334595003785, + "grad_norm": 0.47313499450683594, + "learning_rate": 1.541532109712962e-06, + "loss": 0.6859210729598999, + "step": 2904 + }, + { + "epoch": 2.1998485995457986, + "grad_norm": 1.4736558198928833, + "learning_rate": 1.5398476211742212e-06, + "loss": 0.6153842210769653, + "step": 2906 + }, + { + "epoch": 2.2013626040878123, + "grad_norm": 1.6076915264129639, + "learning_rate": 1.5381633052666323e-06, + "loss": 0.06841941922903061, + "step": 2908 + }, + { + "epoch": 2.202876608629826, + "grad_norm": 0.9196807742118835, + "learning_rate": 1.5364791646298612e-06, + "loss": 0.8942004442214966, + "step": 2910 + }, + { + "epoch": 2.2043906131718396, + "grad_norm": 4.5777201652526855, + "learning_rate": 1.5347952019032969e-06, + "loss": 0.16147112846374512, + "step": 2912 + }, + { + "epoch": 2.2059046177138533, + "grad_norm": 1.9058843851089478, + "learning_rate": 1.533111419726053e-06, + "loss": 0.5567464232444763, + "step": 2914 + }, + { + "epoch": 2.207418622255867, + "grad_norm": 2.2641146183013916, + "learning_rate": 1.5314278207369572e-06, + "loss": 0.31114301085472107, + "step": 2916 + }, + { + "epoch": 2.20893262679788, + "grad_norm": 6.013691425323486, + "learning_rate": 1.5297444075745511e-06, + "loss": 0.6485071778297424, + "step": 2918 + }, + { + "epoch": 2.210446631339894, + "grad_norm": 2.468949317932129, + "learning_rate": 1.5280611828770842e-06, + "loss": 0.5490062832832336, + "step": 2920 + }, + { + "epoch": 2.2119606358819075, + "grad_norm": 2.49491548538208, + "learning_rate": 1.5263781492825134e-06, + "loss": 0.45480069518089294, + "step": 2922 + }, + { + "epoch": 2.213474640423921, + "grad_norm": 2.91528058052063, + "learning_rate": 1.524695309428493e-06, + "loss": 0.7654291391372681, + "step": 2924 + }, + { + "epoch": 2.214988644965935, + "grad_norm": 1.5504508018493652, + "learning_rate": 1.5230126659523748e-06, + "loss": 0.4938569664955139, + "step": 2926 + }, + { + "epoch": 2.2165026495079485, + "grad_norm": 1.441898226737976, + "learning_rate": 1.5213302214912033e-06, + "loss": 0.35826438665390015, + "step": 2928 + }, + { + "epoch": 2.218016654049962, + "grad_norm": 1.7175021171569824, + "learning_rate": 1.5196479786817105e-06, + "loss": 0.5516992807388306, + "step": 2930 + }, + { + "epoch": 2.219530658591976, + "grad_norm": 1.2882024049758911, + "learning_rate": 1.517965940160313e-06, + "loss": 0.49597468972206116, + "step": 2932 + }, + { + "epoch": 2.2210446631339895, + "grad_norm": 7.071352005004883, + "learning_rate": 1.5162841085631062e-06, + "loss": 0.08909988403320312, + "step": 2934 + }, + { + "epoch": 2.222558667676003, + "grad_norm": 1.9846807718276978, + "learning_rate": 1.5146024865258626e-06, + "loss": 1.073472261428833, + "step": 2936 + }, + { + "epoch": 2.224072672218017, + "grad_norm": 1.784736156463623, + "learning_rate": 1.512921076684025e-06, + "loss": 0.5595353245735168, + "step": 2938 + }, + { + "epoch": 2.22558667676003, + "grad_norm": 3.0223000049591064, + "learning_rate": 1.5112398816727044e-06, + "loss": 0.6427146196365356, + "step": 2940 + }, + { + "epoch": 2.2271006813020437, + "grad_norm": 1.1277985572814941, + "learning_rate": 1.5095589041266737e-06, + "loss": 0.5653051733970642, + "step": 2942 + }, + { + "epoch": 2.2286146858440574, + "grad_norm": 1.9536253213882446, + "learning_rate": 1.5078781466803683e-06, + "loss": 0.350731760263443, + "step": 2944 + }, + { + "epoch": 2.230128690386071, + "grad_norm": 0.8452926874160767, + "learning_rate": 1.5061976119678749e-06, + "loss": 0.6324775218963623, + "step": 2946 + }, + { + "epoch": 2.2316426949280848, + "grad_norm": 1.2329614162445068, + "learning_rate": 1.5045173026229326e-06, + "loss": 0.39321663975715637, + "step": 2948 + }, + { + "epoch": 2.2331566994700984, + "grad_norm": 1.8933361768722534, + "learning_rate": 1.502837221278929e-06, + "loss": 0.27046847343444824, + "step": 2950 + }, + { + "epoch": 2.234670704012112, + "grad_norm": 0.9778454303741455, + "learning_rate": 1.5011573705688922e-06, + "loss": 0.8379035592079163, + "step": 2952 + }, + { + "epoch": 2.2361847085541258, + "grad_norm": 1.423471450805664, + "learning_rate": 1.4994777531254882e-06, + "loss": 0.5086550116539001, + "step": 2954 + }, + { + "epoch": 2.2376987130961394, + "grad_norm": 0.8852792382240295, + "learning_rate": 1.49779837158102e-06, + "loss": 0.03382871299982071, + "step": 2956 + }, + { + "epoch": 2.239212717638153, + "grad_norm": 5.467677593231201, + "learning_rate": 1.4961192285674194e-06, + "loss": 0.5554404854774475, + "step": 2958 + }, + { + "epoch": 2.2407267221801668, + "grad_norm": 2.3810720443725586, + "learning_rate": 1.494440326716245e-06, + "loss": 0.5296036005020142, + "step": 2960 + }, + { + "epoch": 2.24224072672218, + "grad_norm": 1.4890040159225464, + "learning_rate": 1.4927616686586755e-06, + "loss": 0.46809694170951843, + "step": 2962 + }, + { + "epoch": 2.2437547312641937, + "grad_norm": 3.5411429405212402, + "learning_rate": 1.4910832570255105e-06, + "loss": 0.2190769761800766, + "step": 2964 + }, + { + "epoch": 2.2452687358062073, + "grad_norm": 2.4040260314941406, + "learning_rate": 1.489405094447162e-06, + "loss": 0.053907375782728195, + "step": 2966 + }, + { + "epoch": 2.246782740348221, + "grad_norm": 1.8134297132492065, + "learning_rate": 1.4877271835536508e-06, + "loss": 0.18153570592403412, + "step": 2968 + }, + { + "epoch": 2.2482967448902347, + "grad_norm": 1.2748960256576538, + "learning_rate": 1.486049526974604e-06, + "loss": 0.09789960086345673, + "step": 2970 + }, + { + "epoch": 2.2498107494322483, + "grad_norm": 2.113441228866577, + "learning_rate": 1.4843721273392512e-06, + "loss": 0.21388627588748932, + "step": 2972 + }, + { + "epoch": 2.251324753974262, + "grad_norm": 1.8878917694091797, + "learning_rate": 1.4826949872764181e-06, + "loss": 0.45056116580963135, + "step": 2974 + }, + { + "epoch": 2.2528387585162757, + "grad_norm": 3.475752592086792, + "learning_rate": 1.4810181094145231e-06, + "loss": 0.5707042217254639, + "step": 2976 + }, + { + "epoch": 2.2543527630582894, + "grad_norm": 2.3447210788726807, + "learning_rate": 1.4793414963815745e-06, + "loss": 0.2774571180343628, + "step": 2978 + }, + { + "epoch": 2.2558667676003026, + "grad_norm": 1.1211177110671997, + "learning_rate": 1.4776651508051667e-06, + "loss": 0.5376030802726746, + "step": 2980 + }, + { + "epoch": 2.2573807721423162, + "grad_norm": 0.713952362537384, + "learning_rate": 1.4759890753124724e-06, + "loss": 0.6337863802909851, + "step": 2982 + }, + { + "epoch": 2.25889477668433, + "grad_norm": 1.3238987922668457, + "learning_rate": 1.4743132725302427e-06, + "loss": 0.5528537631034851, + "step": 2984 + }, + { + "epoch": 2.2604087812263436, + "grad_norm": 0.9952222108840942, + "learning_rate": 1.472637745084801e-06, + "loss": 0.43501901626586914, + "step": 2986 + }, + { + "epoch": 2.2619227857683573, + "grad_norm": 4.0580220222473145, + "learning_rate": 1.47096249560204e-06, + "loss": 0.9046013355255127, + "step": 2988 + }, + { + "epoch": 2.263436790310371, + "grad_norm": 1.193737268447876, + "learning_rate": 1.469287526707415e-06, + "loss": 0.8010193109512329, + "step": 2990 + }, + { + "epoch": 2.2649507948523846, + "grad_norm": 9.777836799621582, + "learning_rate": 1.467612841025942e-06, + "loss": 0.3662986755371094, + "step": 2992 + }, + { + "epoch": 2.2664647993943983, + "grad_norm": 0.6894108057022095, + "learning_rate": 1.465938441182195e-06, + "loss": 0.47696706652641296, + "step": 2994 + }, + { + "epoch": 2.267978803936412, + "grad_norm": 2.3913896083831787, + "learning_rate": 1.4642643298002977e-06, + "loss": 0.6009340882301331, + "step": 2996 + }, + { + "epoch": 2.2694928084784256, + "grad_norm": 2.373789072036743, + "learning_rate": 1.4625905095039232e-06, + "loss": 0.987229585647583, + "step": 2998 + }, + { + "epoch": 2.2710068130204393, + "grad_norm": 3.011552572250366, + "learning_rate": 1.4609169829162866e-06, + "loss": 0.528290867805481, + "step": 3000 + }, + { + "epoch": 2.2725208175624525, + "grad_norm": 0.723567545413971, + "learning_rate": 1.4592437526601462e-06, + "loss": 0.02787677012383938, + "step": 3002 + }, + { + "epoch": 2.274034822104466, + "grad_norm": 2.0487186908721924, + "learning_rate": 1.4575708213577915e-06, + "loss": 0.5389975309371948, + "step": 3004 + }, + { + "epoch": 2.27554882664648, + "grad_norm": 1.3139487504959106, + "learning_rate": 1.4558981916310474e-06, + "loss": 0.07102423906326294, + "step": 3006 + }, + { + "epoch": 2.2770628311884935, + "grad_norm": 1.8657398223876953, + "learning_rate": 1.4542258661012636e-06, + "loss": 0.502851128578186, + "step": 3008 + }, + { + "epoch": 2.278576835730507, + "grad_norm": 1.8828381299972534, + "learning_rate": 1.4525538473893138e-06, + "loss": 0.052939970046281815, + "step": 3010 + }, + { + "epoch": 2.280090840272521, + "grad_norm": 3.5837385654449463, + "learning_rate": 1.4508821381155916e-06, + "loss": 0.0660877674818039, + "step": 3012 + }, + { + "epoch": 2.2816048448145345, + "grad_norm": 1.7557404041290283, + "learning_rate": 1.4492107409000037e-06, + "loss": 0.147925466299057, + "step": 3014 + }, + { + "epoch": 2.283118849356548, + "grad_norm": 1.5166267156600952, + "learning_rate": 1.4475396583619706e-06, + "loss": 0.3314111828804016, + "step": 3016 + }, + { + "epoch": 2.284632853898562, + "grad_norm": 2.2240238189697266, + "learning_rate": 1.445868893120417e-06, + "loss": 0.9709873199462891, + "step": 3018 + }, + { + "epoch": 2.286146858440575, + "grad_norm": 1.196077585220337, + "learning_rate": 1.444198447793772e-06, + "loss": 0.9430187940597534, + "step": 3020 + }, + { + "epoch": 2.287660862982589, + "grad_norm": 1.904353141784668, + "learning_rate": 1.4425283249999626e-06, + "loss": 0.7404723167419434, + "step": 3022 + }, + { + "epoch": 2.2891748675246024, + "grad_norm": 1.7155346870422363, + "learning_rate": 1.4408585273564101e-06, + "loss": 0.5739380717277527, + "step": 3024 + }, + { + "epoch": 2.290688872066616, + "grad_norm": 1.2293956279754639, + "learning_rate": 1.4391890574800273e-06, + "loss": 0.44090327620506287, + "step": 3026 + }, + { + "epoch": 2.2922028766086298, + "grad_norm": 1.4451079368591309, + "learning_rate": 1.4375199179872111e-06, + "loss": 0.9696823358535767, + "step": 3028 + }, + { + "epoch": 2.2937168811506434, + "grad_norm": 36.32086181640625, + "learning_rate": 1.435851111493844e-06, + "loss": 0.3338770270347595, + "step": 3030 + }, + { + "epoch": 2.295230885692657, + "grad_norm": 8.2905855178833, + "learning_rate": 1.434182640615284e-06, + "loss": 0.5509998202323914, + "step": 3032 + }, + { + "epoch": 2.2967448902346708, + "grad_norm": 2.4570791721343994, + "learning_rate": 1.4325145079663634e-06, + "loss": 0.13118332624435425, + "step": 3034 + }, + { + "epoch": 2.2982588947766844, + "grad_norm": 1.0852617025375366, + "learning_rate": 1.4308467161613854e-06, + "loss": 0.467588871717453, + "step": 3036 + }, + { + "epoch": 2.299772899318698, + "grad_norm": 1.4151700735092163, + "learning_rate": 1.4291792678141184e-06, + "loss": 0.8230699300765991, + "step": 3038 + }, + { + "epoch": 2.3012869038607118, + "grad_norm": 1.6991437673568726, + "learning_rate": 1.4275121655377932e-06, + "loss": 0.05372127518057823, + "step": 3040 + }, + { + "epoch": 2.302800908402725, + "grad_norm": 1.8634898662567139, + "learning_rate": 1.4258454119450961e-06, + "loss": 0.6053745150566101, + "step": 3042 + }, + { + "epoch": 2.3043149129447387, + "grad_norm": 1.8450437784194946, + "learning_rate": 1.4241790096481704e-06, + "loss": 0.30308300256729126, + "step": 3044 + }, + { + "epoch": 2.3058289174867523, + "grad_norm": 5.503345489501953, + "learning_rate": 1.4225129612586064e-06, + "loss": 0.684380292892456, + "step": 3046 + }, + { + "epoch": 2.307342922028766, + "grad_norm": 1.7327464818954468, + "learning_rate": 1.4208472693874397e-06, + "loss": 0.5232458710670471, + "step": 3048 + }, + { + "epoch": 2.3088569265707797, + "grad_norm": 1.1458314657211304, + "learning_rate": 1.4191819366451482e-06, + "loss": 0.983380138874054, + "step": 3050 + }, + { + "epoch": 2.3103709311127933, + "grad_norm": 1.4206262826919556, + "learning_rate": 1.4175169656416467e-06, + "loss": 0.551274299621582, + "step": 3052 + }, + { + "epoch": 2.311884935654807, + "grad_norm": 1.4261451959609985, + "learning_rate": 1.4158523589862829e-06, + "loss": 0.5678657293319702, + "step": 3054 + }, + { + "epoch": 2.3133989401968207, + "grad_norm": 3.07586932182312, + "learning_rate": 1.4141881192878332e-06, + "loss": 0.21762025356292725, + "step": 3056 + }, + { + "epoch": 2.3149129447388344, + "grad_norm": 0.4984033703804016, + "learning_rate": 1.4125242491545e-06, + "loss": 0.47255775332450867, + "step": 3058 + }, + { + "epoch": 2.316426949280848, + "grad_norm": 0.9979020953178406, + "learning_rate": 1.4108607511939053e-06, + "loss": 0.40149471163749695, + "step": 3060 + }, + { + "epoch": 2.3179409538228617, + "grad_norm": 0.9560830593109131, + "learning_rate": 1.4091976280130884e-06, + "loss": 0.07733272761106491, + "step": 3062 + }, + { + "epoch": 2.319454958364875, + "grad_norm": 2.5434560775756836, + "learning_rate": 1.4075348822185006e-06, + "loss": 0.17353244125843048, + "step": 3064 + }, + { + "epoch": 2.3209689629068886, + "grad_norm": 0.8594362139701843, + "learning_rate": 1.4058725164160035e-06, + "loss": 0.20494288206100464, + "step": 3066 + }, + { + "epoch": 2.3224829674489023, + "grad_norm": 2.008943796157837, + "learning_rate": 1.404210533210861e-06, + "loss": 0.8089302182197571, + "step": 3068 + }, + { + "epoch": 2.323996971990916, + "grad_norm": 1.4592000246047974, + "learning_rate": 1.4025489352077387e-06, + "loss": 0.7138203382492065, + "step": 3070 + }, + { + "epoch": 2.3255109765329296, + "grad_norm": 1.2846051454544067, + "learning_rate": 1.4008877250106977e-06, + "loss": 0.5384699106216431, + "step": 3072 + }, + { + "epoch": 2.3270249810749433, + "grad_norm": 1.8643642663955688, + "learning_rate": 1.399226905223193e-06, + "loss": 1.0451525449752808, + "step": 3074 + }, + { + "epoch": 2.328538985616957, + "grad_norm": 2.154496908187866, + "learning_rate": 1.3975664784480653e-06, + "loss": 0.9221094250679016, + "step": 3076 + }, + { + "epoch": 2.3300529901589706, + "grad_norm": 0.9635958075523376, + "learning_rate": 1.3959064472875406e-06, + "loss": 0.12245582044124603, + "step": 3078 + }, + { + "epoch": 2.3315669947009843, + "grad_norm": 2.344228744506836, + "learning_rate": 1.3942468143432263e-06, + "loss": 0.6670504808425903, + "step": 3080 + }, + { + "epoch": 2.3330809992429975, + "grad_norm": 6.57515287399292, + "learning_rate": 1.3925875822161034e-06, + "loss": 0.10075915604829788, + "step": 3082 + }, + { + "epoch": 2.334595003785011, + "grad_norm": 3.713717222213745, + "learning_rate": 1.3909287535065254e-06, + "loss": 0.42904263734817505, + "step": 3084 + }, + { + "epoch": 2.336109008327025, + "grad_norm": 2.4940359592437744, + "learning_rate": 1.3892703308142144e-06, + "loss": 0.48184409737586975, + "step": 3086 + }, + { + "epoch": 2.3376230128690385, + "grad_norm": 1.3162837028503418, + "learning_rate": 1.3876123167382551e-06, + "loss": 0.9502226710319519, + "step": 3088 + }, + { + "epoch": 2.339137017411052, + "grad_norm": 7.361681938171387, + "learning_rate": 1.3859547138770932e-06, + "loss": 0.599661648273468, + "step": 3090 + }, + { + "epoch": 2.340651021953066, + "grad_norm": 1.5515596866607666, + "learning_rate": 1.3842975248285284e-06, + "loss": 0.5803654193878174, + "step": 3092 + }, + { + "epoch": 2.3421650264950795, + "grad_norm": 2.7542948722839355, + "learning_rate": 1.382640752189712e-06, + "loss": 0.8318883180618286, + "step": 3094 + }, + { + "epoch": 2.343679031037093, + "grad_norm": 1.3153477907180786, + "learning_rate": 1.380984398557145e-06, + "loss": 1.0047526359558105, + "step": 3096 + }, + { + "epoch": 2.345193035579107, + "grad_norm": 1.5192776918411255, + "learning_rate": 1.3793284665266681e-06, + "loss": 0.2914261221885681, + "step": 3098 + }, + { + "epoch": 2.3467070401211205, + "grad_norm": 1.7307908535003662, + "learning_rate": 1.3776729586934643e-06, + "loss": 0.08925025165081024, + "step": 3100 + }, + { + "epoch": 2.348221044663134, + "grad_norm": 1.367954969406128, + "learning_rate": 1.3760178776520502e-06, + "loss": 0.10482754558324814, + "step": 3102 + }, + { + "epoch": 2.3497350492051474, + "grad_norm": 0.9124159812927246, + "learning_rate": 1.3743632259962745e-06, + "loss": 1.0023339986801147, + "step": 3104 + }, + { + "epoch": 2.351249053747161, + "grad_norm": 1.9778529405593872, + "learning_rate": 1.3727090063193114e-06, + "loss": 0.33621156215667725, + "step": 3106 + }, + { + "epoch": 2.3527630582891748, + "grad_norm": 1.0378131866455078, + "learning_rate": 1.3710552212136604e-06, + "loss": 0.09334226697683334, + "step": 3108 + }, + { + "epoch": 2.3542770628311884, + "grad_norm": 1.84617018699646, + "learning_rate": 1.3694018732711379e-06, + "loss": 0.5694966912269592, + "step": 3110 + }, + { + "epoch": 2.355791067373202, + "grad_norm": 2.9862098693847656, + "learning_rate": 1.367748965082876e-06, + "loss": 0.4267437160015106, + "step": 3112 + }, + { + "epoch": 2.3573050719152158, + "grad_norm": 1.6970902681350708, + "learning_rate": 1.3660964992393176e-06, + "loss": 0.9923496246337891, + "step": 3114 + }, + { + "epoch": 2.3588190764572294, + "grad_norm": 5.03739070892334, + "learning_rate": 1.3644444783302122e-06, + "loss": 0.5381168723106384, + "step": 3116 + }, + { + "epoch": 2.360333080999243, + "grad_norm": 1.1617183685302734, + "learning_rate": 1.3627929049446132e-06, + "loss": 0.8619621992111206, + "step": 3118 + }, + { + "epoch": 2.3618470855412568, + "grad_norm": 1.5070496797561646, + "learning_rate": 1.3611417816708704e-06, + "loss": 0.10040760785341263, + "step": 3120 + }, + { + "epoch": 2.3633610900832704, + "grad_norm": 7.007774829864502, + "learning_rate": 1.3594911110966294e-06, + "loss": 0.637200653553009, + "step": 3122 + }, + { + "epoch": 2.364875094625284, + "grad_norm": 1.424198865890503, + "learning_rate": 1.357840895808827e-06, + "loss": 0.5357862710952759, + "step": 3124 + }, + { + "epoch": 2.3663890991672973, + "grad_norm": 1.7460556030273438, + "learning_rate": 1.3561911383936855e-06, + "loss": 0.4575690031051636, + "step": 3126 + }, + { + "epoch": 2.367903103709311, + "grad_norm": 1.4075205326080322, + "learning_rate": 1.3545418414367094e-06, + "loss": 0.5583956837654114, + "step": 3128 + }, + { + "epoch": 2.3694171082513247, + "grad_norm": 1.6578543186187744, + "learning_rate": 1.3528930075226817e-06, + "loss": 0.7681494951248169, + "step": 3130 + }, + { + "epoch": 2.3709311127933383, + "grad_norm": 1.2633215188980103, + "learning_rate": 1.3512446392356616e-06, + "loss": 0.10331685096025467, + "step": 3132 + }, + { + "epoch": 2.372445117335352, + "grad_norm": 1.1421338319778442, + "learning_rate": 1.3495967391589757e-06, + "loss": 0.6246542930603027, + "step": 3134 + }, + { + "epoch": 2.3739591218773657, + "grad_norm": 1.9815109968185425, + "learning_rate": 1.347949309875219e-06, + "loss": 0.114561066031456, + "step": 3136 + }, + { + "epoch": 2.3754731264193794, + "grad_norm": 2.0402708053588867, + "learning_rate": 1.3463023539662466e-06, + "loss": 0.5803054571151733, + "step": 3138 + }, + { + "epoch": 2.376987130961393, + "grad_norm": 2.7386202812194824, + "learning_rate": 1.3446558740131748e-06, + "loss": 0.15507030487060547, + "step": 3140 + }, + { + "epoch": 2.3785011355034067, + "grad_norm": 3.707937479019165, + "learning_rate": 1.3430098725963704e-06, + "loss": 0.7307823300361633, + "step": 3142 + }, + { + "epoch": 2.38001514004542, + "grad_norm": 1.6937426328659058, + "learning_rate": 1.3413643522954523e-06, + "loss": 0.15916629135608673, + "step": 3144 + }, + { + "epoch": 2.3815291445874336, + "grad_norm": 7.806919097900391, + "learning_rate": 1.3397193156892861e-06, + "loss": 0.26189538836479187, + "step": 3146 + }, + { + "epoch": 2.3830431491294473, + "grad_norm": 1.6772302389144897, + "learning_rate": 1.3380747653559774e-06, + "loss": 0.28399085998535156, + "step": 3148 + }, + { + "epoch": 2.384557153671461, + "grad_norm": 5.274273872375488, + "learning_rate": 1.3364307038728712e-06, + "loss": 0.4567217528820038, + "step": 3150 + }, + { + "epoch": 2.3860711582134746, + "grad_norm": 3.8594324588775635, + "learning_rate": 1.3347871338165446e-06, + "loss": 0.24846740067005157, + "step": 3152 + }, + { + "epoch": 2.3875851627554883, + "grad_norm": 1.2358424663543701, + "learning_rate": 1.3331440577628072e-06, + "loss": 0.4945383071899414, + "step": 3154 + }, + { + "epoch": 2.389099167297502, + "grad_norm": 1.220064401626587, + "learning_rate": 1.3315014782866924e-06, + "loss": 0.9436984062194824, + "step": 3156 + }, + { + "epoch": 2.3906131718395156, + "grad_norm": 3.0590875148773193, + "learning_rate": 1.3298593979624551e-06, + "loss": 0.05348268523812294, + "step": 3158 + }, + { + "epoch": 2.3921271763815293, + "grad_norm": 3.1153619289398193, + "learning_rate": 1.3282178193635696e-06, + "loss": 0.2244076132774353, + "step": 3160 + }, + { + "epoch": 2.393641180923543, + "grad_norm": 0.6548517346382141, + "learning_rate": 1.3265767450627227e-06, + "loss": 0.17198483645915985, + "step": 3162 + }, + { + "epoch": 2.3951551854655566, + "grad_norm": 6.255575180053711, + "learning_rate": 1.3249361776318117e-06, + "loss": 0.2843590974807739, + "step": 3164 + }, + { + "epoch": 2.39666919000757, + "grad_norm": 2.9246954917907715, + "learning_rate": 1.3232961196419376e-06, + "loss": 0.623583197593689, + "step": 3166 + }, + { + "epoch": 2.3981831945495835, + "grad_norm": 0.50059574842453, + "learning_rate": 1.321656573663406e-06, + "loss": 0.09894771873950958, + "step": 3168 + }, + { + "epoch": 2.399697199091597, + "grad_norm": 1.3362988233566284, + "learning_rate": 1.3200175422657182e-06, + "loss": 0.5679229497909546, + "step": 3170 + }, + { + "epoch": 2.401211203633611, + "grad_norm": 2.1153697967529297, + "learning_rate": 1.318379028017568e-06, + "loss": 0.10766123980283737, + "step": 3172 + }, + { + "epoch": 2.4027252081756245, + "grad_norm": 1.3370146751403809, + "learning_rate": 1.3167410334868418e-06, + "loss": 0.5436999797821045, + "step": 3174 + }, + { + "epoch": 2.404239212717638, + "grad_norm": 0.9573742747306824, + "learning_rate": 1.3151035612406088e-06, + "loss": 0.6489812731742859, + "step": 3176 + }, + { + "epoch": 2.405753217259652, + "grad_norm": 2.13118577003479, + "learning_rate": 1.3134666138451209e-06, + "loss": 0.05830124020576477, + "step": 3178 + }, + { + "epoch": 2.4072672218016655, + "grad_norm": 3.4382195472717285, + "learning_rate": 1.3118301938658064e-06, + "loss": 0.3100920617580414, + "step": 3180 + }, + { + "epoch": 2.408781226343679, + "grad_norm": 1.7725976705551147, + "learning_rate": 1.3101943038672687e-06, + "loss": 0.15139536559581757, + "step": 3182 + }, + { + "epoch": 2.4102952308856924, + "grad_norm": 16.863624572753906, + "learning_rate": 1.30855894641328e-06, + "loss": 0.09493289142847061, + "step": 3184 + }, + { + "epoch": 2.4118092354277065, + "grad_norm": 1.4564752578735352, + "learning_rate": 1.3069241240667765e-06, + "loss": 0.8515955805778503, + "step": 3186 + }, + { + "epoch": 2.4133232399697198, + "grad_norm": 1.554567575454712, + "learning_rate": 1.3052898393898576e-06, + "loss": 1.0115079879760742, + "step": 3188 + }, + { + "epoch": 2.4148372445117334, + "grad_norm": 1.5233845710754395, + "learning_rate": 1.303656094943779e-06, + "loss": 0.5995775461196899, + "step": 3190 + }, + { + "epoch": 2.416351249053747, + "grad_norm": 2.3655357360839844, + "learning_rate": 1.3020228932889508e-06, + "loss": 0.20200370252132416, + "step": 3192 + }, + { + "epoch": 2.4178652535957608, + "grad_norm": 2.0905120372772217, + "learning_rate": 1.3003902369849306e-06, + "loss": 0.4998927116394043, + "step": 3194 + }, + { + "epoch": 2.4193792581377744, + "grad_norm": 4.869546413421631, + "learning_rate": 1.2987581285904236e-06, + "loss": 0.5835165977478027, + "step": 3196 + }, + { + "epoch": 2.420893262679788, + "grad_norm": 2.21663236618042, + "learning_rate": 1.2971265706632747e-06, + "loss": 0.16086173057556152, + "step": 3198 + }, + { + "epoch": 2.4224072672218018, + "grad_norm": 1.6782759428024292, + "learning_rate": 1.2954955657604666e-06, + "loss": 0.12522250413894653, + "step": 3200 + }, + { + "epoch": 2.4239212717638154, + "grad_norm": 1.6303638219833374, + "learning_rate": 1.293865116438115e-06, + "loss": 0.17928239703178406, + "step": 3202 + }, + { + "epoch": 2.425435276305829, + "grad_norm": 3.6860408782958984, + "learning_rate": 1.2922352252514653e-06, + "loss": 0.19742460548877716, + "step": 3204 + }, + { + "epoch": 2.4269492808478423, + "grad_norm": 1.208554983139038, + "learning_rate": 1.2906058947548886e-06, + "loss": 0.4462728202342987, + "step": 3206 + }, + { + "epoch": 2.428463285389856, + "grad_norm": 1.8932050466537476, + "learning_rate": 1.2889771275018757e-06, + "loss": 1.0223256349563599, + "step": 3208 + }, + { + "epoch": 2.4299772899318697, + "grad_norm": 1.7575585842132568, + "learning_rate": 1.287348926045037e-06, + "loss": 0.41016995906829834, + "step": 3210 + }, + { + "epoch": 2.4314912944738833, + "grad_norm": 4.98886775970459, + "learning_rate": 1.285721292936094e-06, + "loss": 0.5454086065292358, + "step": 3212 + }, + { + "epoch": 2.433005299015897, + "grad_norm": 1.27085542678833, + "learning_rate": 1.2840942307258784e-06, + "loss": 0.5145463943481445, + "step": 3214 + }, + { + "epoch": 2.4345193035579107, + "grad_norm": 1.5302115678787231, + "learning_rate": 1.2824677419643277e-06, + "loss": 0.5293673276901245, + "step": 3216 + }, + { + "epoch": 2.4360333080999244, + "grad_norm": 1.2060223817825317, + "learning_rate": 1.2808418292004795e-06, + "loss": 0.4708445370197296, + "step": 3218 + }, + { + "epoch": 2.437547312641938, + "grad_norm": 1.0390387773513794, + "learning_rate": 1.2792164949824702e-06, + "loss": 0.6700956225395203, + "step": 3220 + }, + { + "epoch": 2.4390613171839517, + "grad_norm": 1.1483389139175415, + "learning_rate": 1.2775917418575284e-06, + "loss": 0.5343015193939209, + "step": 3222 + }, + { + "epoch": 2.4405753217259654, + "grad_norm": 2.0548574924468994, + "learning_rate": 1.275967572371971e-06, + "loss": 0.5125769972801208, + "step": 3224 + }, + { + "epoch": 2.442089326267979, + "grad_norm": 1.7981852293014526, + "learning_rate": 1.2743439890712035e-06, + "loss": 0.12532752752304077, + "step": 3226 + }, + { + "epoch": 2.4436033308099923, + "grad_norm": 1.0528297424316406, + "learning_rate": 1.2727209944997099e-06, + "loss": 0.8808407187461853, + "step": 3228 + }, + { + "epoch": 2.445117335352006, + "grad_norm": 2.0501883029937744, + "learning_rate": 1.2710985912010514e-06, + "loss": 0.5831062197685242, + "step": 3230 + }, + { + "epoch": 2.4466313398940196, + "grad_norm": 1.6000312566757202, + "learning_rate": 1.2694767817178651e-06, + "loss": 0.5078587532043457, + "step": 3232 + }, + { + "epoch": 2.4481453444360333, + "grad_norm": 1.526246428489685, + "learning_rate": 1.2678555685918549e-06, + "loss": 0.8551164865493774, + "step": 3234 + }, + { + "epoch": 2.449659348978047, + "grad_norm": 1.3986458778381348, + "learning_rate": 1.2662349543637915e-06, + "loss": 0.5159830451011658, + "step": 3236 + }, + { + "epoch": 2.4511733535200606, + "grad_norm": 2.8799469470977783, + "learning_rate": 1.2646149415735061e-06, + "loss": 0.1353384107351303, + "step": 3238 + }, + { + "epoch": 2.4526873580620743, + "grad_norm": 1.6056617498397827, + "learning_rate": 1.2629955327598884e-06, + "loss": 0.4236334264278412, + "step": 3240 + }, + { + "epoch": 2.454201362604088, + "grad_norm": 1.8303232192993164, + "learning_rate": 1.2613767304608808e-06, + "loss": 0.9977210760116577, + "step": 3242 + }, + { + "epoch": 2.4557153671461016, + "grad_norm": 1.178375482559204, + "learning_rate": 1.2597585372134754e-06, + "loss": 0.4265325963497162, + "step": 3244 + }, + { + "epoch": 2.457229371688115, + "grad_norm": 1.1785249710083008, + "learning_rate": 1.2581409555537087e-06, + "loss": 0.10759641230106354, + "step": 3246 + }, + { + "epoch": 2.4587433762301285, + "grad_norm": 2.1448614597320557, + "learning_rate": 1.2565239880166613e-06, + "loss": 0.0391768217086792, + "step": 3248 + }, + { + "epoch": 2.460257380772142, + "grad_norm": 1.1517486572265625, + "learning_rate": 1.2549076371364487e-06, + "loss": 0.09283555299043655, + "step": 3250 + }, + { + "epoch": 2.461771385314156, + "grad_norm": 3.247016668319702, + "learning_rate": 1.2532919054462209e-06, + "loss": 0.1504117250442505, + "step": 3252 + }, + { + "epoch": 2.4632853898561695, + "grad_norm": 1.9994336366653442, + "learning_rate": 1.2516767954781588e-06, + "loss": 0.09418138116598129, + "step": 3254 + }, + { + "epoch": 2.464799394398183, + "grad_norm": 1.2370266914367676, + "learning_rate": 1.250062309763467e-06, + "loss": 0.49576517939567566, + "step": 3256 + }, + { + "epoch": 2.466313398940197, + "grad_norm": 2.999490737915039, + "learning_rate": 1.248448450832373e-06, + "loss": 0.08977462351322174, + "step": 3258 + }, + { + "epoch": 2.4678274034822105, + "grad_norm": 0.6803156733512878, + "learning_rate": 1.2468352212141202e-06, + "loss": 0.0976734608411789, + "step": 3260 + }, + { + "epoch": 2.469341408024224, + "grad_norm": 2.0646297931671143, + "learning_rate": 1.245222623436969e-06, + "loss": 0.704110324382782, + "step": 3262 + }, + { + "epoch": 2.470855412566238, + "grad_norm": 2.659524440765381, + "learning_rate": 1.243610660028186e-06, + "loss": 0.26327335834503174, + "step": 3264 + }, + { + "epoch": 2.4723694171082515, + "grad_norm": 13.316092491149902, + "learning_rate": 1.2419993335140467e-06, + "loss": 0.5661664009094238, + "step": 3266 + }, + { + "epoch": 2.4738834216502648, + "grad_norm": 1.3815058469772339, + "learning_rate": 1.2403886464198259e-06, + "loss": 0.4332255721092224, + "step": 3268 + }, + { + "epoch": 2.4753974261922784, + "grad_norm": 1.0289678573608398, + "learning_rate": 1.2387786012697987e-06, + "loss": 0.5129609107971191, + "step": 3270 + }, + { + "epoch": 2.476911430734292, + "grad_norm": 2.0394294261932373, + "learning_rate": 1.237169200587232e-06, + "loss": 0.02880547195672989, + "step": 3272 + }, + { + "epoch": 2.4784254352763058, + "grad_norm": 1.1159138679504395, + "learning_rate": 1.235560446894383e-06, + "loss": 0.1847442090511322, + "step": 3274 + }, + { + "epoch": 2.4799394398183194, + "grad_norm": 1.8514620065689087, + "learning_rate": 1.233952342712497e-06, + "loss": 0.1308874487876892, + "step": 3276 + }, + { + "epoch": 2.481453444360333, + "grad_norm": 3.212641716003418, + "learning_rate": 1.232344890561799e-06, + "loss": 0.506878674030304, + "step": 3278 + }, + { + "epoch": 2.4829674489023468, + "grad_norm": 1.7196052074432373, + "learning_rate": 1.2307380929614932e-06, + "loss": 0.5555287003517151, + "step": 3280 + }, + { + "epoch": 2.4844814534443604, + "grad_norm": 1.4464601278305054, + "learning_rate": 1.2291319524297573e-06, + "loss": 0.5358646512031555, + "step": 3282 + }, + { + "epoch": 2.485995457986374, + "grad_norm": 1.9261735677719116, + "learning_rate": 1.2275264714837408e-06, + "loss": 0.12778237462043762, + "step": 3284 + }, + { + "epoch": 2.4875094625283873, + "grad_norm": 1.3511003255844116, + "learning_rate": 1.225921652639558e-06, + "loss": 0.3071487843990326, + "step": 3286 + }, + { + "epoch": 2.4890234670704015, + "grad_norm": 1.225162148475647, + "learning_rate": 1.2243174984122853e-06, + "loss": 0.49318164587020874, + "step": 3288 + }, + { + "epoch": 2.4905374716124147, + "grad_norm": 4.127883434295654, + "learning_rate": 1.2227140113159594e-06, + "loss": 0.9633057117462158, + "step": 3290 + }, + { + "epoch": 2.4920514761544283, + "grad_norm": 1.5876481533050537, + "learning_rate": 1.2211111938635695e-06, + "loss": 0.9431458115577698, + "step": 3292 + }, + { + "epoch": 2.493565480696442, + "grad_norm": 2.0850558280944824, + "learning_rate": 1.2195090485670563e-06, + "loss": 0.8991146087646484, + "step": 3294 + }, + { + "epoch": 2.4950794852384557, + "grad_norm": 0.811373770236969, + "learning_rate": 1.2179075779373064e-06, + "loss": 0.5426760315895081, + "step": 3296 + }, + { + "epoch": 2.4965934897804694, + "grad_norm": 2.9668161869049072, + "learning_rate": 1.216306784484151e-06, + "loss": 0.5902142524719238, + "step": 3298 + }, + { + "epoch": 2.498107494322483, + "grad_norm": 1.5236780643463135, + "learning_rate": 1.2147066707163578e-06, + "loss": 0.23543943464756012, + "step": 3300 + }, + { + "epoch": 2.4996214988644967, + "grad_norm": 2.149080753326416, + "learning_rate": 1.2131072391416298e-06, + "loss": 0.5079725980758667, + "step": 3302 + }, + { + "epoch": 2.5011355034065104, + "grad_norm": 1.0887055397033691, + "learning_rate": 1.2115084922666007e-06, + "loss": 0.926274299621582, + "step": 3304 + }, + { + "epoch": 2.502649507948524, + "grad_norm": 4.145946979522705, + "learning_rate": 1.2099104325968327e-06, + "loss": 0.5575621128082275, + "step": 3306 + }, + { + "epoch": 2.5041635124905373, + "grad_norm": 1.8884152173995972, + "learning_rate": 1.20831306263681e-06, + "loss": 0.26198723912239075, + "step": 3308 + }, + { + "epoch": 2.5056775170325514, + "grad_norm": 1.486365556716919, + "learning_rate": 1.2067163848899345e-06, + "loss": 0.3277100920677185, + "step": 3310 + }, + { + "epoch": 2.5071915215745646, + "grad_norm": 1.6638970375061035, + "learning_rate": 1.2051204018585258e-06, + "loss": 0.46174928545951843, + "step": 3312 + }, + { + "epoch": 2.5087055261165783, + "grad_norm": 1.7497713565826416, + "learning_rate": 1.203525116043813e-06, + "loss": 0.5363208651542664, + "step": 3314 + }, + { + "epoch": 2.510219530658592, + "grad_norm": 1.1188297271728516, + "learning_rate": 1.201930529945933e-06, + "loss": 0.4995245933532715, + "step": 3316 + }, + { + "epoch": 2.5117335352006056, + "grad_norm": 9.45166301727295, + "learning_rate": 1.2003366460639257e-06, + "loss": 0.47632646560668945, + "step": 3318 + }, + { + "epoch": 2.5132475397426193, + "grad_norm": 1.8116624355316162, + "learning_rate": 1.1987434668957316e-06, + "loss": 0.22631500661373138, + "step": 3320 + }, + { + "epoch": 2.514761544284633, + "grad_norm": 1.246431827545166, + "learning_rate": 1.1971509949381862e-06, + "loss": 0.9838817119598389, + "step": 3322 + }, + { + "epoch": 2.5162755488266466, + "grad_norm": 1.5256738662719727, + "learning_rate": 1.1955592326870153e-06, + "loss": 0.8555259704589844, + "step": 3324 + }, + { + "epoch": 2.51778955336866, + "grad_norm": 2.2011032104492188, + "learning_rate": 1.1939681826368353e-06, + "loss": 0.5096079707145691, + "step": 3326 + }, + { + "epoch": 2.519303557910674, + "grad_norm": 1.5316205024719238, + "learning_rate": 1.192377847281144e-06, + "loss": 0.481001079082489, + "step": 3328 + }, + { + "epoch": 2.520817562452687, + "grad_norm": 1.4563915729522705, + "learning_rate": 1.1907882291123196e-06, + "loss": 0.1763199418783188, + "step": 3330 + }, + { + "epoch": 2.522331566994701, + "grad_norm": 1.2318475246429443, + "learning_rate": 1.1891993306216168e-06, + "loss": 0.5268822908401489, + "step": 3332 + }, + { + "epoch": 2.5238455715367145, + "grad_norm": 5.150367259979248, + "learning_rate": 1.187611154299163e-06, + "loss": 0.6080965399742126, + "step": 3334 + }, + { + "epoch": 2.525359576078728, + "grad_norm": 1.6307241916656494, + "learning_rate": 1.1860237026339524e-06, + "loss": 0.4704788625240326, + "step": 3336 + }, + { + "epoch": 2.526873580620742, + "grad_norm": 0.9883468151092529, + "learning_rate": 1.1844369781138445e-06, + "loss": 0.504618227481842, + "step": 3338 + }, + { + "epoch": 2.5283875851627555, + "grad_norm": 1.9262429475784302, + "learning_rate": 1.1828509832255586e-06, + "loss": 0.7013383507728577, + "step": 3340 + }, + { + "epoch": 2.529901589704769, + "grad_norm": 1.9441081285476685, + "learning_rate": 1.181265720454671e-06, + "loss": 0.4719160199165344, + "step": 3342 + }, + { + "epoch": 2.531415594246783, + "grad_norm": 1.0725170373916626, + "learning_rate": 1.1796811922856107e-06, + "loss": 0.9367634057998657, + "step": 3344 + }, + { + "epoch": 2.5329295987887965, + "grad_norm": 1.2831497192382812, + "learning_rate": 1.1780974012016552e-06, + "loss": 0.07334476709365845, + "step": 3346 + }, + { + "epoch": 2.5344436033308098, + "grad_norm": 1.7957955598831177, + "learning_rate": 1.1765143496849262e-06, + "loss": 0.16385282576084137, + "step": 3348 + }, + { + "epoch": 2.535957607872824, + "grad_norm": 1.5566226243972778, + "learning_rate": 1.1749320402163878e-06, + "loss": 0.0595238171517849, + "step": 3350 + }, + { + "epoch": 2.537471612414837, + "grad_norm": 1.362262487411499, + "learning_rate": 1.1733504752758404e-06, + "loss": 0.6075219511985779, + "step": 3352 + }, + { + "epoch": 2.5389856169568508, + "grad_norm": 3.1830029487609863, + "learning_rate": 1.1717696573419162e-06, + "loss": 0.25107669830322266, + "step": 3354 + }, + { + "epoch": 2.5404996214988644, + "grad_norm": 1.8373825550079346, + "learning_rate": 1.1701895888920792e-06, + "loss": 0.39207756519317627, + "step": 3356 + }, + { + "epoch": 2.542013626040878, + "grad_norm": 3.7029011249542236, + "learning_rate": 1.1686102724026177e-06, + "loss": 0.621699869632721, + "step": 3358 + }, + { + "epoch": 2.5435276305828918, + "grad_norm": 1.0903701782226562, + "learning_rate": 1.1670317103486403e-06, + "loss": 0.4250500202178955, + "step": 3360 + }, + { + "epoch": 2.5450416351249054, + "grad_norm": 3.5209951400756836, + "learning_rate": 1.165453905204076e-06, + "loss": 0.3211110532283783, + "step": 3362 + }, + { + "epoch": 2.546555639666919, + "grad_norm": 0.7389101386070251, + "learning_rate": 1.1638768594416648e-06, + "loss": 0.5573105812072754, + "step": 3364 + }, + { + "epoch": 2.548069644208933, + "grad_norm": 2.8440351486206055, + "learning_rate": 1.162300575532958e-06, + "loss": 0.050939396023750305, + "step": 3366 + }, + { + "epoch": 2.5495836487509465, + "grad_norm": 1.738464117050171, + "learning_rate": 1.1607250559483121e-06, + "loss": 1.0303083658218384, + "step": 3368 + }, + { + "epoch": 2.5510976532929597, + "grad_norm": 2.664393186569214, + "learning_rate": 1.1591503031568875e-06, + "loss": 0.502128005027771, + "step": 3370 + }, + { + "epoch": 2.5526116578349733, + "grad_norm": 4.097185134887695, + "learning_rate": 1.1575763196266412e-06, + "loss": 0.6981651782989502, + "step": 3372 + }, + { + "epoch": 2.554125662376987, + "grad_norm": 5.088730812072754, + "learning_rate": 1.1560031078243248e-06, + "loss": 0.49354976415634155, + "step": 3374 + }, + { + "epoch": 2.5556396669190007, + "grad_norm": 0.8263038992881775, + "learning_rate": 1.1544306702154807e-06, + "loss": 0.6446477770805359, + "step": 3376 + }, + { + "epoch": 2.5571536714610144, + "grad_norm": 1.9018847942352295, + "learning_rate": 1.1528590092644387e-06, + "loss": 0.3001963794231415, + "step": 3378 + }, + { + "epoch": 2.558667676003028, + "grad_norm": 1.765356421470642, + "learning_rate": 1.1512881274343105e-06, + "loss": 0.9689096212387085, + "step": 3380 + }, + { + "epoch": 2.5601816805450417, + "grad_norm": 1.7506810426712036, + "learning_rate": 1.1497180271869862e-06, + "loss": 0.6143833994865417, + "step": 3382 + }, + { + "epoch": 2.5616956850870554, + "grad_norm": 1.060329556465149, + "learning_rate": 1.1481487109831329e-06, + "loss": 0.8665130138397217, + "step": 3384 + }, + { + "epoch": 2.563209689629069, + "grad_norm": 0.9969649314880371, + "learning_rate": 1.1465801812821875e-06, + "loss": 0.08530528098344803, + "step": 3386 + }, + { + "epoch": 2.5647236941710823, + "grad_norm": 1.7984051704406738, + "learning_rate": 1.1450124405423544e-06, + "loss": 0.8119886517524719, + "step": 3388 + }, + { + "epoch": 2.5662376987130964, + "grad_norm": 1.815270185470581, + "learning_rate": 1.1434454912206018e-06, + "loss": 0.17262788116931915, + "step": 3390 + }, + { + "epoch": 2.5677517032551096, + "grad_norm": 2.2209391593933105, + "learning_rate": 1.1418793357726579e-06, + "loss": 0.06922617554664612, + "step": 3392 + }, + { + "epoch": 2.5692657077971233, + "grad_norm": 1.1411380767822266, + "learning_rate": 1.1403139766530063e-06, + "loss": 0.49777814745903015, + "step": 3394 + }, + { + "epoch": 2.570779712339137, + "grad_norm": 2.31252121925354, + "learning_rate": 1.1387494163148827e-06, + "loss": 0.07047592848539352, + "step": 3396 + }, + { + "epoch": 2.5722937168811506, + "grad_norm": 1.3099836111068726, + "learning_rate": 1.1371856572102705e-06, + "loss": 0.28612416982650757, + "step": 3398 + }, + { + "epoch": 2.5738077214231643, + "grad_norm": 2.352262020111084, + "learning_rate": 1.1356227017898985e-06, + "loss": 0.4649505615234375, + "step": 3400 + }, + { + "epoch": 2.575321725965178, + "grad_norm": 1.1120270490646362, + "learning_rate": 1.1340605525032353e-06, + "loss": 0.49471914768218994, + "step": 3402 + }, + { + "epoch": 2.5768357305071916, + "grad_norm": 1.9915766716003418, + "learning_rate": 1.1324992117984852e-06, + "loss": 0.9112046957015991, + "step": 3404 + }, + { + "epoch": 2.5783497350492053, + "grad_norm": 5.889375686645508, + "learning_rate": 1.1309386821225879e-06, + "loss": 0.43909478187561035, + "step": 3406 + }, + { + "epoch": 2.579863739591219, + "grad_norm": 2.1949453353881836, + "learning_rate": 1.1293789659212089e-06, + "loss": 0.486044317483902, + "step": 3408 + }, + { + "epoch": 2.581377744133232, + "grad_norm": 1.4413392543792725, + "learning_rate": 1.127820065638741e-06, + "loss": 0.9781872630119324, + "step": 3410 + }, + { + "epoch": 2.5828917486752463, + "grad_norm": 1.0431206226348877, + "learning_rate": 1.1262619837182968e-06, + "loss": 0.16925407946109772, + "step": 3412 + }, + { + "epoch": 2.5844057532172595, + "grad_norm": 1.9224302768707275, + "learning_rate": 1.1247047226017085e-06, + "loss": 0.6010867357254028, + "step": 3414 + }, + { + "epoch": 2.585919757759273, + "grad_norm": 1.5290940999984741, + "learning_rate": 1.1231482847295195e-06, + "loss": 0.9337325096130371, + "step": 3416 + }, + { + "epoch": 2.587433762301287, + "grad_norm": 1.5830674171447754, + "learning_rate": 1.1215926725409841e-06, + "loss": 0.21593010425567627, + "step": 3418 + }, + { + "epoch": 2.5889477668433005, + "grad_norm": 7.3055949211120605, + "learning_rate": 1.1200378884740637e-06, + "loss": 0.5212623476982117, + "step": 3420 + }, + { + "epoch": 2.590461771385314, + "grad_norm": 2.4063234329223633, + "learning_rate": 1.1184839349654195e-06, + "loss": 0.1991664469242096, + "step": 3422 + }, + { + "epoch": 2.591975775927328, + "grad_norm": 1.4251947402954102, + "learning_rate": 1.116930814450413e-06, + "loss": 0.9825526475906372, + "step": 3424 + }, + { + "epoch": 2.5934897804693415, + "grad_norm": 1.1361982822418213, + "learning_rate": 1.1153785293630988e-06, + "loss": 0.47728219628334045, + "step": 3426 + }, + { + "epoch": 2.5950037850113548, + "grad_norm": 0.7538416385650635, + "learning_rate": 1.1138270821362239e-06, + "loss": 0.9502293467521667, + "step": 3428 + }, + { + "epoch": 2.596517789553369, + "grad_norm": 1.8407939672470093, + "learning_rate": 1.1122764752012208e-06, + "loss": 0.09078755229711533, + "step": 3430 + }, + { + "epoch": 2.598031794095382, + "grad_norm": 2.0287883281707764, + "learning_rate": 1.1107267109882053e-06, + "loss": 0.5384723544120789, + "step": 3432 + }, + { + "epoch": 2.5995457986373958, + "grad_norm": 1.5482550859451294, + "learning_rate": 1.1091777919259723e-06, + "loss": 0.5190199017524719, + "step": 3434 + }, + { + "epoch": 2.6010598031794094, + "grad_norm": 4.078047752380371, + "learning_rate": 1.107629720441994e-06, + "loss": 0.41486960649490356, + "step": 3436 + }, + { + "epoch": 2.602573807721423, + "grad_norm": 1.4642376899719238, + "learning_rate": 1.1060824989624123e-06, + "loss": 0.608837902545929, + "step": 3438 + }, + { + "epoch": 2.6040878122634368, + "grad_norm": 1.1845002174377441, + "learning_rate": 1.1045361299120364e-06, + "loss": 0.06246378272771835, + "step": 3440 + }, + { + "epoch": 2.6056018168054504, + "grad_norm": 1.218395471572876, + "learning_rate": 1.1029906157143425e-06, + "loss": 0.7018975615501404, + "step": 3442 + }, + { + "epoch": 2.607115821347464, + "grad_norm": 1.7283616065979004, + "learning_rate": 1.1014459587914638e-06, + "loss": 0.5213047862052917, + "step": 3444 + }, + { + "epoch": 2.608629825889478, + "grad_norm": 1.154510498046875, + "learning_rate": 1.0999021615641927e-06, + "loss": 0.2990804612636566, + "step": 3446 + }, + { + "epoch": 2.6101438304314915, + "grad_norm": 1.0660480260849, + "learning_rate": 1.0983592264519717e-06, + "loss": 0.5228883624076843, + "step": 3448 + }, + { + "epoch": 2.6116578349735047, + "grad_norm": 1.193127989768982, + "learning_rate": 1.0968171558728945e-06, + "loss": 0.4204186499118805, + "step": 3450 + }, + { + "epoch": 2.613171839515519, + "grad_norm": 3.8585636615753174, + "learning_rate": 1.0952759522436987e-06, + "loss": 0.8483620285987854, + "step": 3452 + }, + { + "epoch": 2.614685844057532, + "grad_norm": 1.208863377571106, + "learning_rate": 1.0937356179797627e-06, + "loss": 0.9998847842216492, + "step": 3454 + }, + { + "epoch": 2.6161998485995457, + "grad_norm": 1.6825779676437378, + "learning_rate": 1.092196155495105e-06, + "loss": 0.14849629998207092, + "step": 3456 + }, + { + "epoch": 2.6177138531415594, + "grad_norm": 1.4217236042022705, + "learning_rate": 1.0906575672023743e-06, + "loss": 0.8051562905311584, + "step": 3458 + }, + { + "epoch": 2.619227857683573, + "grad_norm": 1.0774335861206055, + "learning_rate": 1.0891198555128516e-06, + "loss": 0.9358351826667786, + "step": 3460 + }, + { + "epoch": 2.6207418622255867, + "grad_norm": 2.2848949432373047, + "learning_rate": 1.0875830228364431e-06, + "loss": 0.5606520175933838, + "step": 3462 + }, + { + "epoch": 2.6222558667676004, + "grad_norm": 11.711904525756836, + "learning_rate": 1.0860470715816785e-06, + "loss": 0.13679350912570953, + "step": 3464 + }, + { + "epoch": 2.623769871309614, + "grad_norm": 1.3417608737945557, + "learning_rate": 1.0845120041557049e-06, + "loss": 0.5893025994300842, + "step": 3466 + }, + { + "epoch": 2.6252838758516277, + "grad_norm": 5.780949592590332, + "learning_rate": 1.0829778229642848e-06, + "loss": 0.06503201276063919, + "step": 3468 + }, + { + "epoch": 2.6267978803936414, + "grad_norm": 1.6110750436782837, + "learning_rate": 1.0814445304117917e-06, + "loss": 0.04634002223610878, + "step": 3470 + }, + { + "epoch": 2.6283118849356546, + "grad_norm": 2.3381261825561523, + "learning_rate": 1.0799121289012068e-06, + "loss": 0.5691334009170532, + "step": 3472 + }, + { + "epoch": 2.6298258894776687, + "grad_norm": 1.5458658933639526, + "learning_rate": 1.0783806208341141e-06, + "loss": 0.1492072194814682, + "step": 3474 + }, + { + "epoch": 2.631339894019682, + "grad_norm": 0.5779021978378296, + "learning_rate": 1.0768500086106978e-06, + "loss": 0.08680196106433868, + "step": 3476 + }, + { + "epoch": 2.6328538985616956, + "grad_norm": 0.20146629214286804, + "learning_rate": 1.075320294629739e-06, + "loss": 0.4616508185863495, + "step": 3478 + }, + { + "epoch": 2.6343679031037093, + "grad_norm": 1.5835942029953003, + "learning_rate": 1.0737914812886094e-06, + "loss": 0.4882984757423401, + "step": 3480 + }, + { + "epoch": 2.635881907645723, + "grad_norm": 1.129563331604004, + "learning_rate": 1.07226357098327e-06, + "loss": 0.11486337333917618, + "step": 3482 + }, + { + "epoch": 2.6373959121877366, + "grad_norm": 1.1769084930419922, + "learning_rate": 1.0707365661082674e-06, + "loss": 0.152438685297966, + "step": 3484 + }, + { + "epoch": 2.6389099167297503, + "grad_norm": 0.10828366875648499, + "learning_rate": 1.069210469056727e-06, + "loss": 0.05638560280203819, + "step": 3486 + }, + { + "epoch": 2.640423921271764, + "grad_norm": 5.986564636230469, + "learning_rate": 1.0676852822203547e-06, + "loss": 0.26844215393066406, + "step": 3488 + }, + { + "epoch": 2.641937925813777, + "grad_norm": 1.1511362791061401, + "learning_rate": 1.0661610079894268e-06, + "loss": 1.050552248954773, + "step": 3490 + }, + { + "epoch": 2.6434519303557913, + "grad_norm": 1.1341627836227417, + "learning_rate": 1.0646376487527907e-06, + "loss": 0.5503925681114197, + "step": 3492 + }, + { + "epoch": 2.6449659348978045, + "grad_norm": 7.367921829223633, + "learning_rate": 1.0631152068978604e-06, + "loss": 0.19944076240062714, + "step": 3494 + }, + { + "epoch": 2.646479939439818, + "grad_norm": 1.2620512247085571, + "learning_rate": 1.0615936848106113e-06, + "loss": 0.9938873648643494, + "step": 3496 + }, + { + "epoch": 2.647993943981832, + "grad_norm": 2.375256299972534, + "learning_rate": 1.0600730848755767e-06, + "loss": 0.5115200877189636, + "step": 3498 + }, + { + "epoch": 2.6495079485238455, + "grad_norm": 1.3954941034317017, + "learning_rate": 1.058553409475847e-06, + "loss": 0.5089138150215149, + "step": 3500 + }, + { + "epoch": 2.651021953065859, + "grad_norm": 0.8934672474861145, + "learning_rate": 1.0570346609930612e-06, + "loss": 0.40142881870269775, + "step": 3502 + }, + { + "epoch": 2.652535957607873, + "grad_norm": 0.7154641151428223, + "learning_rate": 1.0555168418074074e-06, + "loss": 0.2074945569038391, + "step": 3504 + }, + { + "epoch": 2.6540499621498865, + "grad_norm": 0.8344953656196594, + "learning_rate": 1.0539999542976152e-06, + "loss": 0.9636800289154053, + "step": 3506 + }, + { + "epoch": 2.6555639666919, + "grad_norm": 1.5777517557144165, + "learning_rate": 1.0524840008409575e-06, + "loss": 0.07826955616474152, + "step": 3508 + }, + { + "epoch": 2.657077971233914, + "grad_norm": 1.0673317909240723, + "learning_rate": 1.0509689838132395e-06, + "loss": 0.019867384806275368, + "step": 3510 + }, + { + "epoch": 2.658591975775927, + "grad_norm": 1.7061318159103394, + "learning_rate": 1.0494549055888013e-06, + "loss": 0.559928297996521, + "step": 3512 + }, + { + "epoch": 2.660105980317941, + "grad_norm": 1.2090964317321777, + "learning_rate": 1.0479417685405115e-06, + "loss": 0.9240732192993164, + "step": 3514 + }, + { + "epoch": 2.6616199848599544, + "grad_norm": 3.8992819786071777, + "learning_rate": 1.0464295750397626e-06, + "loss": 0.06411410868167877, + "step": 3516 + }, + { + "epoch": 2.663133989401968, + "grad_norm": 1.1974992752075195, + "learning_rate": 1.044918327456469e-06, + "loss": 0.6988753080368042, + "step": 3518 + }, + { + "epoch": 2.6646479939439818, + "grad_norm": 0.8864660859107971, + "learning_rate": 1.0434080281590626e-06, + "loss": 0.5867937207221985, + "step": 3520 + }, + { + "epoch": 2.6661619984859954, + "grad_norm": 2.2140252590179443, + "learning_rate": 1.0418986795144896e-06, + "loss": 0.2569411098957062, + "step": 3522 + }, + { + "epoch": 2.667676003028009, + "grad_norm": 2.632136106491089, + "learning_rate": 1.0403902838882056e-06, + "loss": 0.626483142375946, + "step": 3524 + }, + { + "epoch": 2.669190007570023, + "grad_norm": 1.3987592458724976, + "learning_rate": 1.0388828436441733e-06, + "loss": 0.9838293790817261, + "step": 3526 + }, + { + "epoch": 2.6707040121120365, + "grad_norm": 0.7196727991104126, + "learning_rate": 1.0373763611448567e-06, + "loss": 0.49087047576904297, + "step": 3528 + }, + { + "epoch": 2.67221801665405, + "grad_norm": 1.2425364255905151, + "learning_rate": 1.035870838751221e-06, + "loss": 0.5570994019508362, + "step": 3530 + }, + { + "epoch": 2.673732021196064, + "grad_norm": 11.238651275634766, + "learning_rate": 1.0343662788227249e-06, + "loss": 0.07974950969219208, + "step": 3532 + }, + { + "epoch": 2.675246025738077, + "grad_norm": 4.597038745880127, + "learning_rate": 1.0328626837173202e-06, + "loss": 0.42632856965065, + "step": 3534 + }, + { + "epoch": 2.6767600302800907, + "grad_norm": 1.0757285356521606, + "learning_rate": 1.0313600557914452e-06, + "loss": 0.2603739798069, + "step": 3536 + }, + { + "epoch": 2.6782740348221044, + "grad_norm": 1.0890934467315674, + "learning_rate": 1.029858397400023e-06, + "loss": 0.11274994164705276, + "step": 3538 + }, + { + "epoch": 2.679788039364118, + "grad_norm": 1.827724575996399, + "learning_rate": 1.028357710896458e-06, + "loss": 0.4546879231929779, + "step": 3540 + }, + { + "epoch": 2.6813020439061317, + "grad_norm": 1.3522696495056152, + "learning_rate": 1.0268579986326298e-06, + "loss": 0.504364550113678, + "step": 3542 + }, + { + "epoch": 2.6828160484481454, + "grad_norm": 1.1733466386795044, + "learning_rate": 1.0253592629588934e-06, + "loss": 0.8879395127296448, + "step": 3544 + }, + { + "epoch": 2.684330052990159, + "grad_norm": 2.6019883155822754, + "learning_rate": 1.0238615062240713e-06, + "loss": 0.5116626024246216, + "step": 3546 + }, + { + "epoch": 2.6858440575321727, + "grad_norm": 8.017656326293945, + "learning_rate": 1.0223647307754524e-06, + "loss": 0.42767584323883057, + "step": 3548 + }, + { + "epoch": 2.6873580620741864, + "grad_norm": 2.214622735977173, + "learning_rate": 1.0208689389587875e-06, + "loss": 0.1172211766242981, + "step": 3550 + }, + { + "epoch": 2.6888720666161996, + "grad_norm": 2.183483362197876, + "learning_rate": 1.0193741331182873e-06, + "loss": 0.2731698155403137, + "step": 3552 + }, + { + "epoch": 2.6903860711582137, + "grad_norm": 2.043632984161377, + "learning_rate": 1.0178803155966158e-06, + "loss": 0.347670316696167, + "step": 3554 + }, + { + "epoch": 2.691900075700227, + "grad_norm": 0.25137412548065186, + "learning_rate": 1.0163874887348873e-06, + "loss": 0.1319126933813095, + "step": 3556 + }, + { + "epoch": 2.6934140802422406, + "grad_norm": 3.07865309715271, + "learning_rate": 1.0148956548726668e-06, + "loss": 0.49457916617393494, + "step": 3558 + }, + { + "epoch": 2.6949280847842543, + "grad_norm": 1.2678008079528809, + "learning_rate": 1.0134048163479599e-06, + "loss": 1.0596566200256348, + "step": 3560 + }, + { + "epoch": 2.696442089326268, + "grad_norm": 2.128063201904297, + "learning_rate": 1.0119149754972132e-06, + "loss": 0.5196185111999512, + "step": 3562 + }, + { + "epoch": 2.6979560938682816, + "grad_norm": 0.9698646664619446, + "learning_rate": 1.0104261346553096e-06, + "loss": 0.49274688959121704, + "step": 3564 + }, + { + "epoch": 2.6994700984102953, + "grad_norm": 1.621178150177002, + "learning_rate": 1.0089382961555663e-06, + "loss": 0.5723605751991272, + "step": 3566 + }, + { + "epoch": 2.700984102952309, + "grad_norm": 1.4272880554199219, + "learning_rate": 1.0074514623297277e-06, + "loss": 0.49057722091674805, + "step": 3568 + }, + { + "epoch": 2.7024981074943226, + "grad_norm": 1.0688875913619995, + "learning_rate": 1.005965635507964e-06, + "loss": 0.3502315282821655, + "step": 3570 + }, + { + "epoch": 2.7040121120363363, + "grad_norm": 3.4009177684783936, + "learning_rate": 1.0044808180188685e-06, + "loss": 0.23893460631370544, + "step": 3572 + }, + { + "epoch": 2.7055261165783495, + "grad_norm": 0.5148216485977173, + "learning_rate": 1.0029970121894516e-06, + "loss": 0.13902714848518372, + "step": 3574 + }, + { + "epoch": 2.7070401211203636, + "grad_norm": 0.5016177296638489, + "learning_rate": 1.0015142203451384e-06, + "loss": 0.25262537598609924, + "step": 3576 + }, + { + "epoch": 2.708554125662377, + "grad_norm": 1.079883098602295, + "learning_rate": 1.000032444809764e-06, + "loss": 0.491366982460022, + "step": 3578 + }, + { + "epoch": 2.7100681302043905, + "grad_norm": 1.2552731037139893, + "learning_rate": 9.985516879055733e-07, + "loss": 0.8901302218437195, + "step": 3580 + }, + { + "epoch": 2.711582134746404, + "grad_norm": 2.92391300201416, + "learning_rate": 9.970719519532123e-07, + "loss": 0.5162824988365173, + "step": 3582 + }, + { + "epoch": 2.713096139288418, + "grad_norm": 1.6279443502426147, + "learning_rate": 9.955932392717273e-07, + "loss": 0.8412918448448181, + "step": 3584 + }, + { + "epoch": 2.7146101438304315, + "grad_norm": 1.443770408630371, + "learning_rate": 9.941155521785622e-07, + "loss": 0.5280138254165649, + "step": 3586 + }, + { + "epoch": 2.716124148372445, + "grad_norm": 10.262187957763672, + "learning_rate": 9.926388929895523e-07, + "loss": 0.22188910841941833, + "step": 3588 + }, + { + "epoch": 2.717638152914459, + "grad_norm": 4.570699691772461, + "learning_rate": 9.91163264018923e-07, + "loss": 0.1173437088727951, + "step": 3590 + }, + { + "epoch": 2.719152157456472, + "grad_norm": 1.6797869205474854, + "learning_rate": 9.89688667579284e-07, + "loss": 0.5324147343635559, + "step": 3592 + }, + { + "epoch": 2.720666161998486, + "grad_norm": 1.699195146560669, + "learning_rate": 9.882151059816286e-07, + "loss": 0.5256529450416565, + "step": 3594 + }, + { + "epoch": 2.7221801665404994, + "grad_norm": 2.9241087436676025, + "learning_rate": 9.867425815353263e-07, + "loss": 0.525752067565918, + "step": 3596 + }, + { + "epoch": 2.723694171082513, + "grad_norm": 5.643086910247803, + "learning_rate": 9.852710965481219e-07, + "loss": 0.07988660782575607, + "step": 3598 + }, + { + "epoch": 2.7252081756245268, + "grad_norm": 0.9148181080818176, + "learning_rate": 9.83800653326131e-07, + "loss": 0.8206759691238403, + "step": 3600 + }, + { + "epoch": 2.7267221801665404, + "grad_norm": 1.2723417282104492, + "learning_rate": 9.823312541738378e-07, + "loss": 0.883294403553009, + "step": 3602 + }, + { + "epoch": 2.728236184708554, + "grad_norm": 1.789671778678894, + "learning_rate": 9.808629013940889e-07, + "loss": 0.5650665760040283, + "step": 3604 + }, + { + "epoch": 2.729750189250568, + "grad_norm": 1.0919922590255737, + "learning_rate": 9.793955972880904e-07, + "loss": 0.5554988384246826, + "step": 3606 + }, + { + "epoch": 2.7312641937925815, + "grad_norm": 2.393855571746826, + "learning_rate": 9.779293441554072e-07, + "loss": 0.07354193925857544, + "step": 3608 + }, + { + "epoch": 2.732778198334595, + "grad_norm": 1.1642277240753174, + "learning_rate": 9.764641442939552e-07, + "loss": 0.5331453084945679, + "step": 3610 + }, + { + "epoch": 2.734292202876609, + "grad_norm": 7.8056464195251465, + "learning_rate": 9.750000000000004e-07, + "loss": 0.6716046333312988, + "step": 3612 + }, + { + "epoch": 2.735806207418622, + "grad_norm": 0.970569908618927, + "learning_rate": 9.735369135681535e-07, + "loss": 0.466024786233902, + "step": 3614 + }, + { + "epoch": 2.737320211960636, + "grad_norm": 0.8802584409713745, + "learning_rate": 9.720748872913692e-07, + "loss": 0.028787286952137947, + "step": 3616 + }, + { + "epoch": 2.7388342165026494, + "grad_norm": 17.782005310058594, + "learning_rate": 9.706139234609395e-07, + "loss": 0.5882828235626221, + "step": 3618 + }, + { + "epoch": 2.740348221044663, + "grad_norm": 1.973081350326538, + "learning_rate": 9.691540243664913e-07, + "loss": 0.307500422000885, + "step": 3620 + }, + { + "epoch": 2.7418622255866767, + "grad_norm": 3.1623072624206543, + "learning_rate": 9.67695192295982e-07, + "loss": 0.9052594900131226, + "step": 3622 + }, + { + "epoch": 2.7433762301286904, + "grad_norm": 1.3754016160964966, + "learning_rate": 9.662374295356995e-07, + "loss": 0.4912913739681244, + "step": 3624 + }, + { + "epoch": 2.744890234670704, + "grad_norm": 3.9563827514648438, + "learning_rate": 9.647807383702534e-07, + "loss": 0.10663387924432755, + "step": 3626 + }, + { + "epoch": 2.7464042392127177, + "grad_norm": 1.1170125007629395, + "learning_rate": 9.63325121082574e-07, + "loss": 1.0227893590927124, + "step": 3628 + }, + { + "epoch": 2.7479182437547314, + "grad_norm": 1.5155786275863647, + "learning_rate": 9.618705799539105e-07, + "loss": 0.10028337687253952, + "step": 3630 + }, + { + "epoch": 2.749432248296745, + "grad_norm": 2.075418710708618, + "learning_rate": 9.604171172638233e-07, + "loss": 0.5387253761291504, + "step": 3632 + }, + { + "epoch": 2.7509462528387587, + "grad_norm": 2.6641299724578857, + "learning_rate": 9.589647352901837e-07, + "loss": 0.13054971396923065, + "step": 3634 + }, + { + "epoch": 2.752460257380772, + "grad_norm": 1.4470781087875366, + "learning_rate": 9.575134363091702e-07, + "loss": 0.7052936553955078, + "step": 3636 + }, + { + "epoch": 2.7539742619227856, + "grad_norm": 3.241410493850708, + "learning_rate": 9.560632225952626e-07, + "loss": 0.15796498954296112, + "step": 3638 + }, + { + "epoch": 2.7554882664647993, + "grad_norm": 1.5761303901672363, + "learning_rate": 9.546140964212397e-07, + "loss": 0.9275044798851013, + "step": 3640 + }, + { + "epoch": 2.757002271006813, + "grad_norm": 2.0394246578216553, + "learning_rate": 9.531660600581774e-07, + "loss": 0.19777211546897888, + "step": 3642 + }, + { + "epoch": 2.7585162755488266, + "grad_norm": 2.120290994644165, + "learning_rate": 9.517191157754421e-07, + "loss": 0.05213087424635887, + "step": 3644 + }, + { + "epoch": 2.7600302800908403, + "grad_norm": 2.3269333839416504, + "learning_rate": 9.502732658406903e-07, + "loss": 0.5643972158432007, + "step": 3646 + }, + { + "epoch": 2.761544284632854, + "grad_norm": 2.3903770446777344, + "learning_rate": 9.488285125198622e-07, + "loss": 0.19163475930690765, + "step": 3648 + }, + { + "epoch": 2.7630582891748676, + "grad_norm": 1.672661304473877, + "learning_rate": 9.473848580771794e-07, + "loss": 0.9451408386230469, + "step": 3650 + }, + { + "epoch": 2.7645722937168813, + "grad_norm": 1.8734040260314941, + "learning_rate": 9.459423047751417e-07, + "loss": 0.9429214000701904, + "step": 3652 + }, + { + "epoch": 2.7660862982588945, + "grad_norm": 2.350248098373413, + "learning_rate": 9.445008548745238e-07, + "loss": 0.2139756977558136, + "step": 3654 + }, + { + "epoch": 2.7676003028009086, + "grad_norm": 1.426629900932312, + "learning_rate": 9.430605106343696e-07, + "loss": 0.09413161873817444, + "step": 3656 + }, + { + "epoch": 2.769114307342922, + "grad_norm": 1.0268224477767944, + "learning_rate": 9.416212743119911e-07, + "loss": 0.48298001289367676, + "step": 3658 + }, + { + "epoch": 2.7706283118849355, + "grad_norm": 1.2974029779434204, + "learning_rate": 9.401831481629649e-07, + "loss": 0.5410240292549133, + "step": 3660 + }, + { + "epoch": 2.772142316426949, + "grad_norm": 1.1393475532531738, + "learning_rate": 9.387461344411263e-07, + "loss": 0.9460940361022949, + "step": 3662 + }, + { + "epoch": 2.773656320968963, + "grad_norm": 0.3923311233520508, + "learning_rate": 9.373102353985668e-07, + "loss": 0.4947371780872345, + "step": 3664 + }, + { + "epoch": 2.7751703255109765, + "grad_norm": 0.9876829385757446, + "learning_rate": 9.358754532856334e-07, + "loss": 0.44703051447868347, + "step": 3666 + }, + { + "epoch": 2.77668433005299, + "grad_norm": 1.155160665512085, + "learning_rate": 9.344417903509201e-07, + "loss": 0.6172246932983398, + "step": 3668 + }, + { + "epoch": 2.778198334595004, + "grad_norm": 3.6872024536132812, + "learning_rate": 9.330092488412687e-07, + "loss": 0.51617032289505, + "step": 3670 + }, + { + "epoch": 2.7797123391370175, + "grad_norm": 0.6839119791984558, + "learning_rate": 9.315778310017616e-07, + "loss": 0.4908495545387268, + "step": 3672 + }, + { + "epoch": 2.781226343679031, + "grad_norm": 1.9566898345947266, + "learning_rate": 9.301475390757222e-07, + "loss": 0.4251103401184082, + "step": 3674 + }, + { + "epoch": 2.7827403482210444, + "grad_norm": 3.2777817249298096, + "learning_rate": 9.287183753047082e-07, + "loss": 0.2105153352022171, + "step": 3676 + }, + { + "epoch": 2.7842543527630585, + "grad_norm": 3.134899377822876, + "learning_rate": 9.272903419285096e-07, + "loss": 0.11907383799552917, + "step": 3678 + }, + { + "epoch": 2.7857683573050718, + "grad_norm": 1.1065566539764404, + "learning_rate": 9.258634411851445e-07, + "loss": 0.8950471878051758, + "step": 3680 + }, + { + "epoch": 2.7872823618470854, + "grad_norm": 1.293508529663086, + "learning_rate": 9.244376753108567e-07, + "loss": 0.1413707137107849, + "step": 3682 + }, + { + "epoch": 2.788796366389099, + "grad_norm": 1.6691392660140991, + "learning_rate": 9.230130465401107e-07, + "loss": 0.06067422404885292, + "step": 3684 + }, + { + "epoch": 2.790310370931113, + "grad_norm": 2.895246982574463, + "learning_rate": 9.215895571055886e-07, + "loss": 0.2084033042192459, + "step": 3686 + }, + { + "epoch": 2.7918243754731265, + "grad_norm": 1.0409832000732422, + "learning_rate": 9.201672092381885e-07, + "loss": 0.5444228649139404, + "step": 3688 + }, + { + "epoch": 2.79333838001514, + "grad_norm": 1.63600492477417, + "learning_rate": 9.187460051670173e-07, + "loss": 0.4831315577030182, + "step": 3690 + }, + { + "epoch": 2.794852384557154, + "grad_norm": 1.0883995294570923, + "learning_rate": 9.173259471193918e-07, + "loss": 0.4594913721084595, + "step": 3692 + }, + { + "epoch": 2.7963663890991675, + "grad_norm": 5.779807090759277, + "learning_rate": 9.159070373208301e-07, + "loss": 0.15681150555610657, + "step": 3694 + }, + { + "epoch": 2.797880393641181, + "grad_norm": 9.085508346557617, + "learning_rate": 9.144892779950532e-07, + "loss": 0.6308251023292542, + "step": 3696 + }, + { + "epoch": 2.7993943981831944, + "grad_norm": 2.256226062774658, + "learning_rate": 9.130726713639774e-07, + "loss": 0.09662806242704391, + "step": 3698 + }, + { + "epoch": 2.800908402725208, + "grad_norm": 1.3712506294250488, + "learning_rate": 9.116572196477129e-07, + "loss": 0.496097594499588, + "step": 3700 + }, + { + "epoch": 2.8024224072672217, + "grad_norm": 1.227725863456726, + "learning_rate": 9.102429250645598e-07, + "loss": 0.9627173542976379, + "step": 3702 + }, + { + "epoch": 2.8039364118092354, + "grad_norm": 1.0436882972717285, + "learning_rate": 9.088297898310059e-07, + "loss": 0.7140069007873535, + "step": 3704 + }, + { + "epoch": 2.805450416351249, + "grad_norm": 3.405240297317505, + "learning_rate": 9.074178161617206e-07, + "loss": 0.38255664706230164, + "step": 3706 + }, + { + "epoch": 2.8069644208932627, + "grad_norm": 1.1840120553970337, + "learning_rate": 9.06007006269553e-07, + "loss": 0.50471431016922, + "step": 3708 + }, + { + "epoch": 2.8084784254352764, + "grad_norm": 2.5293736457824707, + "learning_rate": 9.045973623655298e-07, + "loss": 0.5546978712081909, + "step": 3710 + }, + { + "epoch": 2.80999242997729, + "grad_norm": 2.4201784133911133, + "learning_rate": 9.031888866588486e-07, + "loss": 0.5356602072715759, + "step": 3712 + }, + { + "epoch": 2.8115064345193037, + "grad_norm": 1.1867656707763672, + "learning_rate": 9.017815813568773e-07, + "loss": 0.5516183972358704, + "step": 3714 + }, + { + "epoch": 2.813020439061317, + "grad_norm": 1.6858242750167847, + "learning_rate": 9.003754486651483e-07, + "loss": 0.17489120364189148, + "step": 3716 + }, + { + "epoch": 2.814534443603331, + "grad_norm": 1.3293143510818481, + "learning_rate": 8.989704907873585e-07, + "loss": 0.515713632106781, + "step": 3718 + }, + { + "epoch": 2.8160484481453443, + "grad_norm": 0.9379466772079468, + "learning_rate": 8.975667099253615e-07, + "loss": 0.09776206314563751, + "step": 3720 + }, + { + "epoch": 2.817562452687358, + "grad_norm": 1.1216363906860352, + "learning_rate": 8.961641082791665e-07, + "loss": 0.014124134555459023, + "step": 3722 + }, + { + "epoch": 2.8190764572293716, + "grad_norm": 1.1533939838409424, + "learning_rate": 8.947626880469365e-07, + "loss": 0.5552777051925659, + "step": 3724 + }, + { + "epoch": 2.8205904617713853, + "grad_norm": 0.6609148383140564, + "learning_rate": 8.933624514249809e-07, + "loss": 0.654082179069519, + "step": 3726 + }, + { + "epoch": 2.822104466313399, + "grad_norm": 0.8997098207473755, + "learning_rate": 8.919634006077551e-07, + "loss": 0.4427497982978821, + "step": 3728 + }, + { + "epoch": 2.8236184708554126, + "grad_norm": 1.8845670223236084, + "learning_rate": 8.905655377878552e-07, + "loss": 0.7544617652893066, + "step": 3730 + }, + { + "epoch": 2.8251324753974263, + "grad_norm": 0.6770662665367126, + "learning_rate": 8.891688651560177e-07, + "loss": 0.047169242054224014, + "step": 3732 + }, + { + "epoch": 2.82664647993944, + "grad_norm": 1.2962541580200195, + "learning_rate": 8.87773384901111e-07, + "loss": 0.9296197295188904, + "step": 3734 + }, + { + "epoch": 2.8281604844814536, + "grad_norm": 1.2323596477508545, + "learning_rate": 8.863790992101359e-07, + "loss": 0.5393604636192322, + "step": 3736 + }, + { + "epoch": 2.829674489023467, + "grad_norm": 2.7934513092041016, + "learning_rate": 8.849860102682226e-07, + "loss": 0.643133282661438, + "step": 3738 + }, + { + "epoch": 2.831188493565481, + "grad_norm": 0.6922993063926697, + "learning_rate": 8.835941202586237e-07, + "loss": 0.02751312404870987, + "step": 3740 + }, + { + "epoch": 2.832702498107494, + "grad_norm": 1.5087149143218994, + "learning_rate": 8.822034313627131e-07, + "loss": 0.9720838665962219, + "step": 3742 + }, + { + "epoch": 2.834216502649508, + "grad_norm": 1.9601054191589355, + "learning_rate": 8.808139457599839e-07, + "loss": 0.5711642503738403, + "step": 3744 + }, + { + "epoch": 2.8357305071915215, + "grad_norm": 0.6599406003952026, + "learning_rate": 8.794256656280411e-07, + "loss": 0.5517204999923706, + "step": 3746 + }, + { + "epoch": 2.837244511733535, + "grad_norm": 1.2271597385406494, + "learning_rate": 8.780385931426028e-07, + "loss": 0.08901163935661316, + "step": 3748 + }, + { + "epoch": 2.838758516275549, + "grad_norm": 1.0120880603790283, + "learning_rate": 8.766527304774929e-07, + "loss": 0.24437592923641205, + "step": 3750 + }, + { + "epoch": 2.8402725208175625, + "grad_norm": 1.4647727012634277, + "learning_rate": 8.752680798046388e-07, + "loss": 0.13146477937698364, + "step": 3752 + }, + { + "epoch": 2.841786525359576, + "grad_norm": 1.6725610494613647, + "learning_rate": 8.738846432940708e-07, + "loss": 0.5949679613113403, + "step": 3754 + }, + { + "epoch": 2.8433005299015894, + "grad_norm": 1.2268952131271362, + "learning_rate": 8.725024231139141e-07, + "loss": 0.5065262913703918, + "step": 3756 + }, + { + "epoch": 2.8448145344436035, + "grad_norm": 0.3921285569667816, + "learning_rate": 8.711214214303882e-07, + "loss": 0.4775087833404541, + "step": 3758 + }, + { + "epoch": 2.8463285389856168, + "grad_norm": 1.0800436735153198, + "learning_rate": 8.69741640407804e-07, + "loss": 0.15427321195602417, + "step": 3760 + }, + { + "epoch": 2.8478425435276304, + "grad_norm": 8.044784545898438, + "learning_rate": 8.683630822085586e-07, + "loss": 0.1547515094280243, + "step": 3762 + }, + { + "epoch": 2.849356548069644, + "grad_norm": 3.9401752948760986, + "learning_rate": 8.669857489931323e-07, + "loss": 0.44284185767173767, + "step": 3764 + }, + { + "epoch": 2.850870552611658, + "grad_norm": 3.7742526531219482, + "learning_rate": 8.656096429200857e-07, + "loss": 0.09116940200328827, + "step": 3766 + }, + { + "epoch": 2.8523845571536715, + "grad_norm": 2.8929426670074463, + "learning_rate": 8.642347661460574e-07, + "loss": 0.08468050509691238, + "step": 3768 + }, + { + "epoch": 2.853898561695685, + "grad_norm": 1.0520923137664795, + "learning_rate": 8.628611208257582e-07, + "loss": 0.4239636957645416, + "step": 3770 + }, + { + "epoch": 2.855412566237699, + "grad_norm": 2.8016903400421143, + "learning_rate": 8.614887091119692e-07, + "loss": 0.49609190225601196, + "step": 3772 + }, + { + "epoch": 2.8569265707797125, + "grad_norm": 1.6472960710525513, + "learning_rate": 8.60117533155538e-07, + "loss": 0.07066318392753601, + "step": 3774 + }, + { + "epoch": 2.858440575321726, + "grad_norm": 2.728069305419922, + "learning_rate": 8.587475951053769e-07, + "loss": 0.4633954167366028, + "step": 3776 + }, + { + "epoch": 2.8599545798637394, + "grad_norm": 1.7422723770141602, + "learning_rate": 8.573788971084563e-07, + "loss": 0.22740291059017181, + "step": 3778 + }, + { + "epoch": 2.8614685844057535, + "grad_norm": 1.3749862909317017, + "learning_rate": 8.560114413098036e-07, + "loss": 0.6364903450012207, + "step": 3780 + }, + { + "epoch": 2.8629825889477667, + "grad_norm": 1.1808003187179565, + "learning_rate": 8.54645229852501e-07, + "loss": 0.12124611437320709, + "step": 3782 + }, + { + "epoch": 2.8644965934897804, + "grad_norm": 2.028859853744507, + "learning_rate": 8.532802648776786e-07, + "loss": 0.5597202181816101, + "step": 3784 + }, + { + "epoch": 2.866010598031794, + "grad_norm": 3.725170135498047, + "learning_rate": 8.519165485245139e-07, + "loss": 0.07686395198106766, + "step": 3786 + }, + { + "epoch": 2.8675246025738077, + "grad_norm": 2.2122673988342285, + "learning_rate": 8.505540829302267e-07, + "loss": 0.5101752877235413, + "step": 3788 + }, + { + "epoch": 2.8690386071158214, + "grad_norm": 0.5155825018882751, + "learning_rate": 8.491928702300788e-07, + "loss": 0.41851285099983215, + "step": 3790 + }, + { + "epoch": 2.870552611657835, + "grad_norm": 3.692025899887085, + "learning_rate": 8.478329125573654e-07, + "loss": 0.08599580079317093, + "step": 3792 + }, + { + "epoch": 2.8720666161998487, + "grad_norm": 1.0661647319793701, + "learning_rate": 8.464742120434181e-07, + "loss": 0.06629923731088638, + "step": 3794 + }, + { + "epoch": 2.8735806207418624, + "grad_norm": 1.6379257440567017, + "learning_rate": 8.451167708175949e-07, + "loss": 0.26821276545524597, + "step": 3796 + }, + { + "epoch": 2.875094625283876, + "grad_norm": 1.8655433654785156, + "learning_rate": 8.437605910072835e-07, + "loss": 0.5183701515197754, + "step": 3798 + }, + { + "epoch": 2.8766086298258893, + "grad_norm": 1.6501489877700806, + "learning_rate": 8.424056747378924e-07, + "loss": 0.5763782262802124, + "step": 3800 + }, + { + "epoch": 2.878122634367903, + "grad_norm": 4.954941272735596, + "learning_rate": 8.410520241328499e-07, + "loss": 0.4887532889842987, + "step": 3802 + }, + { + "epoch": 2.8796366389099166, + "grad_norm": 1.991117238998413, + "learning_rate": 8.396996413136029e-07, + "loss": 0.06312289088964462, + "step": 3804 + }, + { + "epoch": 2.8811506434519303, + "grad_norm": 1.9310789108276367, + "learning_rate": 8.383485283996091e-07, + "loss": 0.5399723649024963, + "step": 3806 + }, + { + "epoch": 2.882664647993944, + "grad_norm": 0.9640549421310425, + "learning_rate": 8.369986875083369e-07, + "loss": 0.49352431297302246, + "step": 3808 + }, + { + "epoch": 2.8841786525359576, + "grad_norm": 1.2278974056243896, + "learning_rate": 8.356501207552611e-07, + "loss": 0.5587291121482849, + "step": 3810 + }, + { + "epoch": 2.8856926570779713, + "grad_norm": 1.5222783088684082, + "learning_rate": 8.3430283025386e-07, + "loss": 0.5829649567604065, + "step": 3812 + }, + { + "epoch": 2.887206661619985, + "grad_norm": 1.2479456663131714, + "learning_rate": 8.329568181156116e-07, + "loss": 0.9048027396202087, + "step": 3814 + }, + { + "epoch": 2.8887206661619986, + "grad_norm": 1.6707185506820679, + "learning_rate": 8.316120864499896e-07, + "loss": 0.255351722240448, + "step": 3816 + }, + { + "epoch": 2.890234670704012, + "grad_norm": 0.8826047778129578, + "learning_rate": 8.302686373644626e-07, + "loss": 0.46603119373321533, + "step": 3818 + }, + { + "epoch": 2.891748675246026, + "grad_norm": 0.37061068415641785, + "learning_rate": 8.289264729644878e-07, + "loss": 0.4425652027130127, + "step": 3820 + }, + { + "epoch": 2.893262679788039, + "grad_norm": 0.9885351061820984, + "learning_rate": 8.275855953535097e-07, + "loss": 0.48779723048210144, + "step": 3822 + }, + { + "epoch": 2.894776684330053, + "grad_norm": 0.540049135684967, + "learning_rate": 8.262460066329549e-07, + "loss": 0.45148253440856934, + "step": 3824 + }, + { + "epoch": 2.8962906888720665, + "grad_norm": 0.054746102541685104, + "learning_rate": 8.249077089022323e-07, + "loss": 0.5173460245132446, + "step": 3826 + }, + { + "epoch": 2.89780469341408, + "grad_norm": 1.6062202453613281, + "learning_rate": 8.235707042587258e-07, + "loss": 0.9697235226631165, + "step": 3828 + }, + { + "epoch": 2.899318697956094, + "grad_norm": 5.622974395751953, + "learning_rate": 8.222349947977929e-07, + "loss": 0.1524309515953064, + "step": 3830 + }, + { + "epoch": 2.9008327024981075, + "grad_norm": 1.5508605241775513, + "learning_rate": 8.209005826127616e-07, + "loss": 0.516213059425354, + "step": 3832 + }, + { + "epoch": 2.902346707040121, + "grad_norm": 1.0680675506591797, + "learning_rate": 8.195674697949277e-07, + "loss": 0.6413286924362183, + "step": 3834 + }, + { + "epoch": 2.903860711582135, + "grad_norm": 2.605649471282959, + "learning_rate": 8.182356584335491e-07, + "loss": 0.10368496924638748, + "step": 3836 + }, + { + "epoch": 2.9053747161241485, + "grad_norm": 1.0486398935317993, + "learning_rate": 8.169051506158443e-07, + "loss": 1.0008890628814697, + "step": 3838 + }, + { + "epoch": 2.9068887206661618, + "grad_norm": 2.419147253036499, + "learning_rate": 8.155759484269905e-07, + "loss": 0.7527313232421875, + "step": 3840 + }, + { + "epoch": 2.908402725208176, + "grad_norm": 4.255338668823242, + "learning_rate": 8.142480539501167e-07, + "loss": 0.5065209269523621, + "step": 3842 + }, + { + "epoch": 2.909916729750189, + "grad_norm": 1.3409066200256348, + "learning_rate": 8.129214692663032e-07, + "loss": 0.5724047422409058, + "step": 3844 + }, + { + "epoch": 2.911430734292203, + "grad_norm": 1.3575468063354492, + "learning_rate": 8.115961964545783e-07, + "loss": 0.09961085021495819, + "step": 3846 + }, + { + "epoch": 2.9129447388342165, + "grad_norm": 1.3521260023117065, + "learning_rate": 8.10272237591913e-07, + "loss": 0.021654373034834862, + "step": 3848 + }, + { + "epoch": 2.91445874337623, + "grad_norm": 1.365338683128357, + "learning_rate": 8.089495947532204e-07, + "loss": 0.6579216718673706, + "step": 3850 + }, + { + "epoch": 2.915972747918244, + "grad_norm": 0.5357253551483154, + "learning_rate": 8.0762827001135e-07, + "loss": 0.17811284959316254, + "step": 3852 + }, + { + "epoch": 2.9174867524602575, + "grad_norm": 1.5824048519134521, + "learning_rate": 8.063082654370859e-07, + "loss": 0.0712343230843544, + "step": 3854 + }, + { + "epoch": 2.919000757002271, + "grad_norm": 1.762145757675171, + "learning_rate": 8.049895830991442e-07, + "loss": 0.0337158739566803, + "step": 3856 + }, + { + "epoch": 2.920514761544285, + "grad_norm": 2.789889335632324, + "learning_rate": 8.036722250641675e-07, + "loss": 0.5362149477005005, + "step": 3858 + }, + { + "epoch": 2.9220287660862985, + "grad_norm": 1.0759830474853516, + "learning_rate": 8.023561933967231e-07, + "loss": 0.030107945203781128, + "step": 3860 + }, + { + "epoch": 2.9235427706283117, + "grad_norm": 1.1900837421417236, + "learning_rate": 8.010414901593006e-07, + "loss": 0.026596467941999435, + "step": 3862 + }, + { + "epoch": 2.9250567751703254, + "grad_norm": 1.9306000471115112, + "learning_rate": 7.997281174123065e-07, + "loss": 0.5237943530082703, + "step": 3864 + }, + { + "epoch": 2.926570779712339, + "grad_norm": 2.8298351764678955, + "learning_rate": 7.98416077214063e-07, + "loss": 0.04856697842478752, + "step": 3866 + }, + { + "epoch": 2.9280847842543527, + "grad_norm": 0.30419406294822693, + "learning_rate": 7.971053716208031e-07, + "loss": 0.028724314644932747, + "step": 3868 + }, + { + "epoch": 2.9295987887963664, + "grad_norm": 2.131240129470825, + "learning_rate": 7.957960026866695e-07, + "loss": 0.5273154973983765, + "step": 3870 + }, + { + "epoch": 2.93111279333838, + "grad_norm": 1.4633896350860596, + "learning_rate": 7.944879724637089e-07, + "loss": 0.9569279551506042, + "step": 3872 + }, + { + "epoch": 2.9326267978803937, + "grad_norm": 1.4682790040969849, + "learning_rate": 7.931812830018696e-07, + "loss": 0.48464494943618774, + "step": 3874 + }, + { + "epoch": 2.9341408024224074, + "grad_norm": 1.173094391822815, + "learning_rate": 7.918759363490007e-07, + "loss": 0.4379115700721741, + "step": 3876 + }, + { + "epoch": 2.935654806964421, + "grad_norm": 3.3173611164093018, + "learning_rate": 7.905719345508448e-07, + "loss": 0.5269387364387512, + "step": 3878 + }, + { + "epoch": 2.9371688115064343, + "grad_norm": 2.3758342266082764, + "learning_rate": 7.89269279651038e-07, + "loss": 0.7734465003013611, + "step": 3880 + }, + { + "epoch": 2.9386828160484484, + "grad_norm": 1.2772712707519531, + "learning_rate": 7.879679736911043e-07, + "loss": 0.38165703415870667, + "step": 3882 + }, + { + "epoch": 2.9401968205904616, + "grad_norm": 1.7364046573638916, + "learning_rate": 7.866680187104554e-07, + "loss": 0.6038261651992798, + "step": 3884 + }, + { + "epoch": 2.9417108251324753, + "grad_norm": 1.9593379497528076, + "learning_rate": 7.853694167463845e-07, + "loss": 0.5354672074317932, + "step": 3886 + }, + { + "epoch": 2.943224829674489, + "grad_norm": 2.5874977111816406, + "learning_rate": 7.840721698340645e-07, + "loss": 0.4401940107345581, + "step": 3888 + }, + { + "epoch": 2.9447388342165026, + "grad_norm": 0.5861150622367859, + "learning_rate": 7.827762800065447e-07, + "loss": 0.03342803567647934, + "step": 3890 + }, + { + "epoch": 2.9462528387585163, + "grad_norm": 0.07254718244075775, + "learning_rate": 7.814817492947481e-07, + "loss": 0.09036199003458023, + "step": 3892 + }, + { + "epoch": 2.94776684330053, + "grad_norm": 1.2350249290466309, + "learning_rate": 7.801885797274668e-07, + "loss": 0.5221207141876221, + "step": 3894 + }, + { + "epoch": 2.9492808478425436, + "grad_norm": 10.142313003540039, + "learning_rate": 7.788967733313607e-07, + "loss": 0.6032928824424744, + "step": 3896 + }, + { + "epoch": 2.9507948523845573, + "grad_norm": 0.11258590221405029, + "learning_rate": 7.776063321309522e-07, + "loss": 0.009810122661292553, + "step": 3898 + }, + { + "epoch": 2.952308856926571, + "grad_norm": 1.5286842584609985, + "learning_rate": 7.763172581486256e-07, + "loss": 0.024380333721637726, + "step": 3900 + }, + { + "epoch": 2.953822861468584, + "grad_norm": 1.2205860614776611, + "learning_rate": 7.750295534046214e-07, + "loss": 1.0053309202194214, + "step": 3902 + }, + { + "epoch": 2.9553368660105983, + "grad_norm": 1.4816066026687622, + "learning_rate": 7.737432199170336e-07, + "loss": 0.09616224467754364, + "step": 3904 + }, + { + "epoch": 2.9568508705526115, + "grad_norm": 1.3818708658218384, + "learning_rate": 7.724582597018097e-07, + "loss": 0.489177942276001, + "step": 3906 + }, + { + "epoch": 2.958364875094625, + "grad_norm": 1.9293603897094727, + "learning_rate": 7.711746747727421e-07, + "loss": 0.5863303542137146, + "step": 3908 + }, + { + "epoch": 2.959878879636639, + "grad_norm": 1.427994966506958, + "learning_rate": 7.698924671414689e-07, + "loss": 0.8889341354370117, + "step": 3910 + }, + { + "epoch": 2.9613928841786525, + "grad_norm": 4.070063591003418, + "learning_rate": 7.686116388174711e-07, + "loss": 0.29550179839134216, + "step": 3912 + }, + { + "epoch": 2.962906888720666, + "grad_norm": 0.7458814978599548, + "learning_rate": 7.67332191808066e-07, + "loss": 0.46560460329055786, + "step": 3914 + }, + { + "epoch": 2.96442089326268, + "grad_norm": 0.845970094203949, + "learning_rate": 7.660541281184074e-07, + "loss": 0.7280870079994202, + "step": 3916 + }, + { + "epoch": 2.9659348978046935, + "grad_norm": 0.814699113368988, + "learning_rate": 7.6477744975148e-07, + "loss": 0.42352423071861267, + "step": 3918 + }, + { + "epoch": 2.9674489023467068, + "grad_norm": 1.0757745504379272, + "learning_rate": 7.63502158708099e-07, + "loss": 0.49008259177207947, + "step": 3920 + }, + { + "epoch": 2.968962906888721, + "grad_norm": 1.148490071296692, + "learning_rate": 7.62228256986904e-07, + "loss": 0.8969548344612122, + "step": 3922 + }, + { + "epoch": 2.970476911430734, + "grad_norm": 0.07679852098226547, + "learning_rate": 7.609557465843581e-07, + "loss": 0.47735390067100525, + "step": 3924 + }, + { + "epoch": 2.971990915972748, + "grad_norm": 0.947769045829773, + "learning_rate": 7.596846294947427e-07, + "loss": 0.22102048993110657, + "step": 3926 + }, + { + "epoch": 2.9735049205147615, + "grad_norm": 1.290168046951294, + "learning_rate": 7.584149077101576e-07, + "loss": 0.5241299867630005, + "step": 3928 + }, + { + "epoch": 2.975018925056775, + "grad_norm": 0.7361161112785339, + "learning_rate": 7.571465832205142e-07, + "loss": 0.11590026319026947, + "step": 3930 + }, + { + "epoch": 2.976532929598789, + "grad_norm": 1.601361632347107, + "learning_rate": 7.558796580135345e-07, + "loss": 0.97297203540802, + "step": 3932 + }, + { + "epoch": 2.9780469341408025, + "grad_norm": 2.5603744983673096, + "learning_rate": 7.546141340747478e-07, + "loss": 0.3908730745315552, + "step": 3934 + }, + { + "epoch": 2.979560938682816, + "grad_norm": 5.403840065002441, + "learning_rate": 7.533500133874874e-07, + "loss": 0.3258212208747864, + "step": 3936 + }, + { + "epoch": 2.98107494322483, + "grad_norm": 0.8196927905082703, + "learning_rate": 7.52087297932887e-07, + "loss": 0.6264731884002686, + "step": 3938 + }, + { + "epoch": 2.9825889477668435, + "grad_norm": 0.9482421278953552, + "learning_rate": 7.508259896898774e-07, + "loss": 0.9203871488571167, + "step": 3940 + }, + { + "epoch": 2.9841029523088567, + "grad_norm": 1.5888142585754395, + "learning_rate": 7.495660906351864e-07, + "loss": 0.5230554342269897, + "step": 3942 + }, + { + "epoch": 2.985616956850871, + "grad_norm": 0.728779673576355, + "learning_rate": 7.483076027433309e-07, + "loss": 0.5012220144271851, + "step": 3944 + }, + { + "epoch": 2.987130961392884, + "grad_norm": 3.4751663208007812, + "learning_rate": 7.470505279866165e-07, + "loss": 0.4676293730735779, + "step": 3946 + }, + { + "epoch": 2.9886449659348977, + "grad_norm": 3.838146448135376, + "learning_rate": 7.457948683351357e-07, + "loss": 0.5397993326187134, + "step": 3948 + }, + { + "epoch": 2.9901589704769114, + "grad_norm": 2.288231611251831, + "learning_rate": 7.445406257567613e-07, + "loss": 0.5099469423294067, + "step": 3950 + }, + { + "epoch": 2.991672975018925, + "grad_norm": 0.98577880859375, + "learning_rate": 7.432878022171473e-07, + "loss": 0.52032071352005, + "step": 3952 + }, + { + "epoch": 2.9931869795609387, + "grad_norm": 1.9891493320465088, + "learning_rate": 7.420363996797214e-07, + "loss": 0.09683854877948761, + "step": 3954 + }, + { + "epoch": 2.9947009841029524, + "grad_norm": 1.8507225513458252, + "learning_rate": 7.407864201056869e-07, + "loss": 0.49484390020370483, + "step": 3956 + }, + { + "epoch": 2.996214988644966, + "grad_norm": 1.7818666696548462, + "learning_rate": 7.395378654540147e-07, + "loss": 0.6634348034858704, + "step": 3958 + }, + { + "epoch": 2.9977289931869797, + "grad_norm": 1.0862374305725098, + "learning_rate": 7.382907376814441e-07, + "loss": 0.5016591548919678, + "step": 3960 + }, + { + "epoch": 2.9992429977289934, + "grad_norm": 1.0315511226654053, + "learning_rate": 7.370450387424767e-07, + "loss": 0.9380726218223572, + "step": 3962 + }, + { + "epoch": 3.0007570022710066, + "grad_norm": 2.3969194889068604, + "learning_rate": 7.358007705893771e-07, + "loss": 0.6164969205856323, + "step": 3964 + }, + { + "epoch": 3.0022710068130203, + "grad_norm": 0.2718479037284851, + "learning_rate": 7.345579351721656e-07, + "loss": 0.42529329657554626, + "step": 3966 + }, + { + "epoch": 3.003785011355034, + "grad_norm": 0.9466289281845093, + "learning_rate": 7.333165344386171e-07, + "loss": 0.17686264216899872, + "step": 3968 + }, + { + "epoch": 3.0052990158970476, + "grad_norm": 4.593746185302734, + "learning_rate": 7.3207657033426e-07, + "loss": 0.2262856364250183, + "step": 3970 + }, + { + "epoch": 3.0068130204390613, + "grad_norm": 0.4080749750137329, + "learning_rate": 7.308380448023691e-07, + "loss": 0.44627752900123596, + "step": 3972 + }, + { + "epoch": 3.008327024981075, + "grad_norm": 1.3409069776535034, + "learning_rate": 7.296009597839658e-07, + "loss": 0.3770182430744171, + "step": 3974 + }, + { + "epoch": 3.0098410295230886, + "grad_norm": 1.118722677230835, + "learning_rate": 7.283653172178129e-07, + "loss": 0.4042755663394928, + "step": 3976 + }, + { + "epoch": 3.0113550340651023, + "grad_norm": 1.0860754251480103, + "learning_rate": 7.271311190404144e-07, + "loss": 0.49542757868766785, + "step": 3978 + }, + { + "epoch": 3.012869038607116, + "grad_norm": 1.6815706491470337, + "learning_rate": 7.25898367186009e-07, + "loss": 0.8869043588638306, + "step": 3980 + }, + { + "epoch": 3.0143830431491296, + "grad_norm": 1.4714423418045044, + "learning_rate": 7.246670635865692e-07, + "loss": 0.0653425082564354, + "step": 3982 + }, + { + "epoch": 3.015897047691143, + "grad_norm": 1.1048444509506226, + "learning_rate": 7.234372101717974e-07, + "loss": 0.808556079864502, + "step": 3984 + }, + { + "epoch": 3.0174110522331565, + "grad_norm": 1.7007668018341064, + "learning_rate": 7.222088088691246e-07, + "loss": 0.4317365288734436, + "step": 3986 + }, + { + "epoch": 3.01892505677517, + "grad_norm": 0.37070685625076294, + "learning_rate": 7.209818616037046e-07, + "loss": 0.009090625680983067, + "step": 3988 + }, + { + "epoch": 3.020439061317184, + "grad_norm": 0.25970911979675293, + "learning_rate": 7.197563702984131e-07, + "loss": 0.4831356108188629, + "step": 3990 + }, + { + "epoch": 3.0219530658591975, + "grad_norm": 2.4647092819213867, + "learning_rate": 7.185323368738442e-07, + "loss": 0.42847299575805664, + "step": 3992 + }, + { + "epoch": 3.023467070401211, + "grad_norm": 1.4203499555587769, + "learning_rate": 7.173097632483067e-07, + "loss": 0.5414004921913147, + "step": 3994 + }, + { + "epoch": 3.024981074943225, + "grad_norm": 0.4966915249824524, + "learning_rate": 7.160886513378211e-07, + "loss": 0.5168024897575378, + "step": 3996 + }, + { + "epoch": 3.0264950794852385, + "grad_norm": 0.33384162187576294, + "learning_rate": 7.148690030561192e-07, + "loss": 0.4039287269115448, + "step": 3998 + }, + { + "epoch": 3.028009084027252, + "grad_norm": 4.462475299835205, + "learning_rate": 7.136508203146364e-07, + "loss": 0.4377596080303192, + "step": 4000 + }, + { + "epoch": 3.029523088569266, + "grad_norm": 0.33090612292289734, + "learning_rate": 7.124341050225133e-07, + "loss": 0.00277205603197217, + "step": 4002 + }, + { + "epoch": 3.031037093111279, + "grad_norm": 2.197788715362549, + "learning_rate": 7.112188590865894e-07, + "loss": 0.6341095566749573, + "step": 4004 + }, + { + "epoch": 3.032551097653293, + "grad_norm": 3.956519603729248, + "learning_rate": 7.10005084411402e-07, + "loss": 0.029052812606096268, + "step": 4006 + }, + { + "epoch": 3.0340651021953065, + "grad_norm": 0.7870645523071289, + "learning_rate": 7.087927828991828e-07, + "loss": 0.2910933494567871, + "step": 4008 + }, + { + "epoch": 3.03557910673732, + "grad_norm": 1.1830310821533203, + "learning_rate": 7.075819564498545e-07, + "loss": 0.3489729166030884, + "step": 4010 + }, + { + "epoch": 3.037093111279334, + "grad_norm": 2.2264347076416016, + "learning_rate": 7.063726069610276e-07, + "loss": 0.4663190543651581, + "step": 4012 + }, + { + "epoch": 3.0386071158213475, + "grad_norm": 9.285035133361816, + "learning_rate": 7.05164736327999e-07, + "loss": 0.11393094062805176, + "step": 4014 + }, + { + "epoch": 3.040121120363361, + "grad_norm": 1.6699323654174805, + "learning_rate": 7.039583464437473e-07, + "loss": 0.5791230201721191, + "step": 4016 + }, + { + "epoch": 3.041635124905375, + "grad_norm": 1.1014317274093628, + "learning_rate": 7.027534391989301e-07, + "loss": 0.3018898069858551, + "step": 4018 + }, + { + "epoch": 3.0431491294473885, + "grad_norm": 1.926364541053772, + "learning_rate": 7.015500164818816e-07, + "loss": 0.0015229706186801195, + "step": 4020 + }, + { + "epoch": 3.044663133989402, + "grad_norm": 0.9938611388206482, + "learning_rate": 7.003480801786104e-07, + "loss": 0.677412211894989, + "step": 4022 + }, + { + "epoch": 3.046177138531416, + "grad_norm": 1.8977714776992798, + "learning_rate": 6.991476321727945e-07, + "loss": 0.020947730168700218, + "step": 4024 + }, + { + "epoch": 3.047691143073429, + "grad_norm": 1.0303181409835815, + "learning_rate": 6.979486743457794e-07, + "loss": 0.12567976117134094, + "step": 4026 + }, + { + "epoch": 3.0492051476154427, + "grad_norm": 2.8299500942230225, + "learning_rate": 6.967512085765763e-07, + "loss": 0.4234556257724762, + "step": 4028 + }, + { + "epoch": 3.0507191521574564, + "grad_norm": 1.4620380401611328, + "learning_rate": 6.955552367418566e-07, + "loss": 0.2195405811071396, + "step": 4030 + }, + { + "epoch": 3.05223315669947, + "grad_norm": 0.009019579738378525, + "learning_rate": 6.943607607159516e-07, + "loss": 0.4009798765182495, + "step": 4032 + }, + { + "epoch": 3.0537471612414837, + "grad_norm": 1.2741161584854126, + "learning_rate": 6.93167782370847e-07, + "loss": 0.4938404858112335, + "step": 4034 + }, + { + "epoch": 3.0552611657834974, + "grad_norm": 1.9303240776062012, + "learning_rate": 6.919763035761835e-07, + "loss": 0.43660661578178406, + "step": 4036 + }, + { + "epoch": 3.056775170325511, + "grad_norm": 1.19795560836792, + "learning_rate": 6.907863261992494e-07, + "loss": 0.5328879952430725, + "step": 4038 + }, + { + "epoch": 3.0582891748675247, + "grad_norm": 6.347302436828613, + "learning_rate": 6.895978521049816e-07, + "loss": 0.09802637249231339, + "step": 4040 + }, + { + "epoch": 3.0598031794095384, + "grad_norm": 1.2104865312576294, + "learning_rate": 6.884108831559594e-07, + "loss": 0.43484431505203247, + "step": 4042 + }, + { + "epoch": 3.061317183951552, + "grad_norm": 0.5178834199905396, + "learning_rate": 6.872254212124053e-07, + "loss": 0.43175745010375977, + "step": 4044 + }, + { + "epoch": 3.0628311884935653, + "grad_norm": 0.3627015948295593, + "learning_rate": 6.860414681321787e-07, + "loss": 0.09141885489225388, + "step": 4046 + }, + { + "epoch": 3.064345193035579, + "grad_norm": 2.9537100791931152, + "learning_rate": 6.848590257707741e-07, + "loss": 0.4429933726787567, + "step": 4048 + }, + { + "epoch": 3.0658591975775926, + "grad_norm": 1.0850509405136108, + "learning_rate": 6.836780959813194e-07, + "loss": 0.4049634337425232, + "step": 4050 + }, + { + "epoch": 3.0673732021196063, + "grad_norm": 1.1354448795318604, + "learning_rate": 6.82498680614571e-07, + "loss": 0.3931359052658081, + "step": 4052 + }, + { + "epoch": 3.06888720666162, + "grad_norm": 1.5777790546417236, + "learning_rate": 6.81320781518913e-07, + "loss": 0.5058997273445129, + "step": 4054 + }, + { + "epoch": 3.0704012112036336, + "grad_norm": 0.276546448469162, + "learning_rate": 6.801444005403517e-07, + "loss": 0.002168760634958744, + "step": 4056 + }, + { + "epoch": 3.0719152157456473, + "grad_norm": 1.09465754032135, + "learning_rate": 6.789695395225158e-07, + "loss": 0.37967199087142944, + "step": 4058 + }, + { + "epoch": 3.073429220287661, + "grad_norm": 1.6758774518966675, + "learning_rate": 6.777962003066511e-07, + "loss": 0.492207795381546, + "step": 4060 + }, + { + "epoch": 3.0749432248296746, + "grad_norm": 1.3080397844314575, + "learning_rate": 6.766243847316176e-07, + "loss": 0.4363994002342224, + "step": 4062 + }, + { + "epoch": 3.0764572293716883, + "grad_norm": 1.4788641929626465, + "learning_rate": 6.754540946338894e-07, + "loss": 0.26849645376205444, + "step": 4064 + }, + { + "epoch": 3.0779712339137015, + "grad_norm": 1.5961706638336182, + "learning_rate": 6.742853318475486e-07, + "loss": 0.6835228800773621, + "step": 4066 + }, + { + "epoch": 3.079485238455715, + "grad_norm": 1.7173904180526733, + "learning_rate": 6.731180982042835e-07, + "loss": 0.46800684928894043, + "step": 4068 + }, + { + "epoch": 3.080999242997729, + "grad_norm": 1.2513922452926636, + "learning_rate": 6.719523955333861e-07, + "loss": 0.02254556305706501, + "step": 4070 + }, + { + "epoch": 3.0825132475397425, + "grad_norm": 1.5062390565872192, + "learning_rate": 6.707882256617498e-07, + "loss": 0.34975719451904297, + "step": 4072 + }, + { + "epoch": 3.084027252081756, + "grad_norm": 1.8490229845046997, + "learning_rate": 6.696255904138654e-07, + "loss": 0.556843101978302, + "step": 4074 + }, + { + "epoch": 3.08554125662377, + "grad_norm": 0.7304385900497437, + "learning_rate": 6.684644916118179e-07, + "loss": 0.4064416289329529, + "step": 4076 + }, + { + "epoch": 3.0870552611657835, + "grad_norm": 1.6273658275604248, + "learning_rate": 6.673049310752851e-07, + "loss": 0.43567830324172974, + "step": 4078 + }, + { + "epoch": 3.088569265707797, + "grad_norm": 3.8026535511016846, + "learning_rate": 6.661469106215343e-07, + "loss": 0.14637161791324615, + "step": 4080 + }, + { + "epoch": 3.090083270249811, + "grad_norm": 0.9847654104232788, + "learning_rate": 6.649904320654185e-07, + "loss": 0.4798167943954468, + "step": 4082 + }, + { + "epoch": 3.0915972747918246, + "grad_norm": 0.1427965611219406, + "learning_rate": 6.638354972193742e-07, + "loss": 0.002032442716881633, + "step": 4084 + }, + { + "epoch": 3.0931112793338382, + "grad_norm": 1.081045150756836, + "learning_rate": 6.626821078934197e-07, + "loss": 0.5119474530220032, + "step": 4086 + }, + { + "epoch": 3.0946252838758515, + "grad_norm": 1.0046815872192383, + "learning_rate": 6.615302658951497e-07, + "loss": 0.7853630781173706, + "step": 4088 + }, + { + "epoch": 3.096139288417865, + "grad_norm": 0.12222945690155029, + "learning_rate": 6.603799730297347e-07, + "loss": 0.0054857018403708935, + "step": 4090 + }, + { + "epoch": 3.097653292959879, + "grad_norm": 0.9781043529510498, + "learning_rate": 6.592312310999173e-07, + "loss": 0.35689669847488403, + "step": 4092 + }, + { + "epoch": 3.0991672975018925, + "grad_norm": 2.571282148361206, + "learning_rate": 6.580840419060095e-07, + "loss": 0.01572362892329693, + "step": 4094 + }, + { + "epoch": 3.100681302043906, + "grad_norm": 1.3768620491027832, + "learning_rate": 6.5693840724589e-07, + "loss": 0.756076455116272, + "step": 4096 + }, + { + "epoch": 3.10219530658592, + "grad_norm": 0.694009006023407, + "learning_rate": 6.557943289150002e-07, + "loss": 0.11142466217279434, + "step": 4098 + }, + { + "epoch": 3.1037093111279335, + "grad_norm": 0.17246878147125244, + "learning_rate": 6.546518087063444e-07, + "loss": 0.0007862098282203078, + "step": 4100 + }, + { + "epoch": 3.105223315669947, + "grad_norm": 0.8544809818267822, + "learning_rate": 6.535108484104827e-07, + "loss": 0.07932410389184952, + "step": 4102 + }, + { + "epoch": 3.106737320211961, + "grad_norm": 0.796825647354126, + "learning_rate": 6.523714498155326e-07, + "loss": 0.025672433897852898, + "step": 4104 + }, + { + "epoch": 3.1082513247539745, + "grad_norm": 1.4668720960617065, + "learning_rate": 6.512336147071624e-07, + "loss": 0.5809639692306519, + "step": 4106 + }, + { + "epoch": 3.1097653292959877, + "grad_norm": 2.3516106605529785, + "learning_rate": 6.500973448685914e-07, + "loss": 0.22618822753429413, + "step": 4108 + }, + { + "epoch": 3.1112793338380014, + "grad_norm": 0.6618440747261047, + "learning_rate": 6.489626420805851e-07, + "loss": 0.3518376350402832, + "step": 4110 + }, + { + "epoch": 3.112793338380015, + "grad_norm": 0.692665696144104, + "learning_rate": 6.47829508121453e-07, + "loss": 0.0329466387629509, + "step": 4112 + }, + { + "epoch": 3.1143073429220287, + "grad_norm": 1.5528597831726074, + "learning_rate": 6.466979447670463e-07, + "loss": 0.0887966901063919, + "step": 4114 + }, + { + "epoch": 3.1158213474640424, + "grad_norm": 0.40037980675697327, + "learning_rate": 6.455679537907548e-07, + "loss": 0.013906463049352169, + "step": 4116 + }, + { + "epoch": 3.117335352006056, + "grad_norm": 1.0792657136917114, + "learning_rate": 6.44439536963504e-07, + "loss": 0.7016704082489014, + "step": 4118 + }, + { + "epoch": 3.1188493565480697, + "grad_norm": 1.677802562713623, + "learning_rate": 6.433126960537513e-07, + "loss": 0.7152868509292603, + "step": 4120 + }, + { + "epoch": 3.1203633610900834, + "grad_norm": 1.3043384552001953, + "learning_rate": 6.421874328274865e-07, + "loss": 0.8352264165878296, + "step": 4122 + }, + { + "epoch": 3.121877365632097, + "grad_norm": 1.8514552116394043, + "learning_rate": 6.410637490482252e-07, + "loss": 0.8518596291542053, + "step": 4124 + }, + { + "epoch": 3.1233913701741107, + "grad_norm": 1.6068618297576904, + "learning_rate": 6.399416464770082e-07, + "loss": 0.1147489994764328, + "step": 4126 + }, + { + "epoch": 3.124905374716124, + "grad_norm": 1.5815989971160889, + "learning_rate": 6.388211268723975e-07, + "loss": 0.6219792366027832, + "step": 4128 + }, + { + "epoch": 3.1264193792581376, + "grad_norm": 3.630340337753296, + "learning_rate": 6.377021919904758e-07, + "loss": 0.047322895377874374, + "step": 4130 + }, + { + "epoch": 3.1279333838001513, + "grad_norm": 0.7049509882926941, + "learning_rate": 6.365848435848412e-07, + "loss": 0.3931571841239929, + "step": 4132 + }, + { + "epoch": 3.129447388342165, + "grad_norm": 1.6559326648712158, + "learning_rate": 6.354690834066054e-07, + "loss": 0.8177527785301208, + "step": 4134 + }, + { + "epoch": 3.1309613928841786, + "grad_norm": 2.969405174255371, + "learning_rate": 6.343549132043909e-07, + "loss": 0.12549491226673126, + "step": 4136 + }, + { + "epoch": 3.1324753974261923, + "grad_norm": 1.4503098726272583, + "learning_rate": 6.332423347243294e-07, + "loss": 0.8987895250320435, + "step": 4138 + }, + { + "epoch": 3.133989401968206, + "grad_norm": 1.4494681358337402, + "learning_rate": 6.321313497100571e-07, + "loss": 0.3016546368598938, + "step": 4140 + }, + { + "epoch": 3.1355034065102196, + "grad_norm": 2.3474137783050537, + "learning_rate": 6.310219599027128e-07, + "loss": 0.557498037815094, + "step": 4142 + }, + { + "epoch": 3.1370174110522333, + "grad_norm": 2.055933713912964, + "learning_rate": 6.299141670409361e-07, + "loss": 0.4614870548248291, + "step": 4144 + }, + { + "epoch": 3.138531415594247, + "grad_norm": 2.6461069583892822, + "learning_rate": 6.288079728608635e-07, + "loss": 0.8577829003334045, + "step": 4146 + }, + { + "epoch": 3.14004542013626, + "grad_norm": 1.2636239528656006, + "learning_rate": 6.277033790961259e-07, + "loss": 0.765476405620575, + "step": 4148 + }, + { + "epoch": 3.141559424678274, + "grad_norm": 0.20606276392936707, + "learning_rate": 6.266003874778456e-07, + "loss": 0.4212927520275116, + "step": 4150 + }, + { + "epoch": 3.1430734292202875, + "grad_norm": 1.0059120655059814, + "learning_rate": 6.254989997346353e-07, + "loss": 0.030231747776269913, + "step": 4152 + }, + { + "epoch": 3.144587433762301, + "grad_norm": 1.2911632061004639, + "learning_rate": 6.243992175925925e-07, + "loss": 0.2676365375518799, + "step": 4154 + }, + { + "epoch": 3.146101438304315, + "grad_norm": 1.1610504388809204, + "learning_rate": 6.233010427753001e-07, + "loss": 0.4617763161659241, + "step": 4156 + }, + { + "epoch": 3.1476154428463285, + "grad_norm": 0.2687191367149353, + "learning_rate": 6.222044770038203e-07, + "loss": 0.3713476359844208, + "step": 4158 + }, + { + "epoch": 3.149129447388342, + "grad_norm": 1.655655860900879, + "learning_rate": 6.21109521996695e-07, + "loss": 0.7506855726242065, + "step": 4160 + }, + { + "epoch": 3.150643451930356, + "grad_norm": 3.0118119716644287, + "learning_rate": 6.20016179469941e-07, + "loss": 0.7359092831611633, + "step": 4162 + }, + { + "epoch": 3.1521574564723696, + "grad_norm": 1.8462358713150024, + "learning_rate": 6.189244511370476e-07, + "loss": 0.482887327671051, + "step": 4164 + }, + { + "epoch": 3.1536714610143832, + "grad_norm": 1.8491785526275635, + "learning_rate": 6.178343387089756e-07, + "loss": 0.7653396129608154, + "step": 4166 + }, + { + "epoch": 3.1551854655563965, + "grad_norm": 1.4998102188110352, + "learning_rate": 6.16745843894152e-07, + "loss": 0.46621713042259216, + "step": 4168 + }, + { + "epoch": 3.15669947009841, + "grad_norm": 2.092540740966797, + "learning_rate": 6.156589683984697e-07, + "loss": 0.02423417754471302, + "step": 4170 + }, + { + "epoch": 3.158213474640424, + "grad_norm": 3.369731903076172, + "learning_rate": 6.145737139252829e-07, + "loss": 0.039859574288129807, + "step": 4172 + }, + { + "epoch": 3.1597274791824375, + "grad_norm": 2.0534422397613525, + "learning_rate": 6.134900821754063e-07, + "loss": 0.07779321819543839, + "step": 4174 + }, + { + "epoch": 3.161241483724451, + "grad_norm": 2.3966152667999268, + "learning_rate": 6.124080748471109e-07, + "loss": 0.08836057037115097, + "step": 4176 + }, + { + "epoch": 3.162755488266465, + "grad_norm": 1.1762347221374512, + "learning_rate": 6.113276936361215e-07, + "loss": 0.34370484948158264, + "step": 4178 + }, + { + "epoch": 3.1642694928084785, + "grad_norm": 1.4453004598617554, + "learning_rate": 6.102489402356153e-07, + "loss": 0.872462272644043, + "step": 4180 + }, + { + "epoch": 3.165783497350492, + "grad_norm": 0.5855422616004944, + "learning_rate": 6.091718163362182e-07, + "loss": 0.41354164481163025, + "step": 4182 + }, + { + "epoch": 3.167297501892506, + "grad_norm": 1.7227216958999634, + "learning_rate": 6.080963236260016e-07, + "loss": 0.7299938201904297, + "step": 4184 + }, + { + "epoch": 3.1688115064345195, + "grad_norm": 0.8702467679977417, + "learning_rate": 6.070224637904811e-07, + "loss": 0.4417323172092438, + "step": 4186 + }, + { + "epoch": 3.170325510976533, + "grad_norm": 3.277219295501709, + "learning_rate": 6.059502385126138e-07, + "loss": 0.8119851350784302, + "step": 4188 + }, + { + "epoch": 3.1718395155185464, + "grad_norm": 1.4700698852539062, + "learning_rate": 6.04879649472794e-07, + "loss": 0.4506599009037018, + "step": 4190 + }, + { + "epoch": 3.17335352006056, + "grad_norm": 0.4654270112514496, + "learning_rate": 6.038106983488523e-07, + "loss": 0.05376291275024414, + "step": 4192 + }, + { + "epoch": 3.1748675246025737, + "grad_norm": 0.6102208495140076, + "learning_rate": 6.027433868160518e-07, + "loss": 0.0416281633079052, + "step": 4194 + }, + { + "epoch": 3.1763815291445874, + "grad_norm": 0.038999054580926895, + "learning_rate": 6.016777165470872e-07, + "loss": 0.0013046637177467346, + "step": 4196 + }, + { + "epoch": 3.177895533686601, + "grad_norm": 0.9198364615440369, + "learning_rate": 6.0061368921208e-07, + "loss": 0.024049073457717896, + "step": 4198 + }, + { + "epoch": 3.1794095382286147, + "grad_norm": 1.7429077625274658, + "learning_rate": 5.99551306478577e-07, + "loss": 0.1302247941493988, + "step": 4200 + }, + { + "epoch": 3.1809235427706284, + "grad_norm": 1.1480247974395752, + "learning_rate": 5.98490570011548e-07, + "loss": 0.42881670594215393, + "step": 4202 + }, + { + "epoch": 3.182437547312642, + "grad_norm": 1.5936020612716675, + "learning_rate": 5.97431481473382e-07, + "loss": 0.40928295254707336, + "step": 4204 + }, + { + "epoch": 3.1839515518546557, + "grad_norm": 2.0750679969787598, + "learning_rate": 5.963740425238867e-07, + "loss": 0.7856857776641846, + "step": 4206 + }, + { + "epoch": 3.1854655563966694, + "grad_norm": 1.3993339538574219, + "learning_rate": 5.953182548202828e-07, + "loss": 0.142067551612854, + "step": 4208 + }, + { + "epoch": 3.1869795609386826, + "grad_norm": 1.1035765409469604, + "learning_rate": 5.94264120017205e-07, + "loss": 0.025681478902697563, + "step": 4210 + }, + { + "epoch": 3.1884935654806963, + "grad_norm": 1.44044828414917, + "learning_rate": 5.932116397666961e-07, + "loss": 0.19360049068927765, + "step": 4212 + }, + { + "epoch": 3.19000757002271, + "grad_norm": 3.2448890209198, + "learning_rate": 5.921608157182062e-07, + "loss": 0.7855092883110046, + "step": 4214 + }, + { + "epoch": 3.1915215745647236, + "grad_norm": 1.3087652921676636, + "learning_rate": 5.911116495185907e-07, + "loss": 0.5407652258872986, + "step": 4216 + }, + { + "epoch": 3.1930355791067373, + "grad_norm": 1.2526170015335083, + "learning_rate": 5.900641428121059e-07, + "loss": 0.5142260789871216, + "step": 4218 + }, + { + "epoch": 3.194549583648751, + "grad_norm": 2.792015790939331, + "learning_rate": 5.890182972404074e-07, + "loss": 0.038477823138237, + "step": 4220 + }, + { + "epoch": 3.1960635881907646, + "grad_norm": 0.9680241942405701, + "learning_rate": 5.879741144425475e-07, + "loss": 0.47152355313301086, + "step": 4222 + }, + { + "epoch": 3.1975775927327783, + "grad_norm": 4.391843795776367, + "learning_rate": 5.869315960549734e-07, + "loss": 0.4090409576892853, + "step": 4224 + }, + { + "epoch": 3.199091597274792, + "grad_norm": 0.3873880207538605, + "learning_rate": 5.858907437115225e-07, + "loss": 0.004784378223121166, + "step": 4226 + }, + { + "epoch": 3.2006056018168056, + "grad_norm": 2.0868403911590576, + "learning_rate": 5.848515590434222e-07, + "loss": 0.8621227741241455, + "step": 4228 + }, + { + "epoch": 3.202119606358819, + "grad_norm": 1.5555455684661865, + "learning_rate": 5.838140436792856e-07, + "loss": 0.7451547384262085, + "step": 4230 + }, + { + "epoch": 3.2036336109008325, + "grad_norm": 0.5912542939186096, + "learning_rate": 5.827781992451105e-07, + "loss": 0.11990676075220108, + "step": 4232 + }, + { + "epoch": 3.205147615442846, + "grad_norm": 0.2775070071220398, + "learning_rate": 5.817440273642755e-07, + "loss": 0.018691135570406914, + "step": 4234 + }, + { + "epoch": 3.20666161998486, + "grad_norm": 1.3487552404403687, + "learning_rate": 5.807115296575374e-07, + "loss": 0.7999797463417053, + "step": 4236 + }, + { + "epoch": 3.2081756245268735, + "grad_norm": 1.9631119966506958, + "learning_rate": 5.796807077430305e-07, + "loss": 0.45478659868240356, + "step": 4238 + }, + { + "epoch": 3.209689629068887, + "grad_norm": 0.05802164971828461, + "learning_rate": 5.78651563236262e-07, + "loss": 0.051538702100515366, + "step": 4240 + }, + { + "epoch": 3.211203633610901, + "grad_norm": 1.9430551528930664, + "learning_rate": 5.776240977501102e-07, + "loss": 0.5016130805015564, + "step": 4242 + }, + { + "epoch": 3.2127176381529146, + "grad_norm": 1.081489086151123, + "learning_rate": 5.765983128948217e-07, + "loss": 0.029119107872247696, + "step": 4244 + }, + { + "epoch": 3.2142316426949282, + "grad_norm": 2.924699306488037, + "learning_rate": 5.75574210278011e-07, + "loss": 0.01439094077795744, + "step": 4246 + }, + { + "epoch": 3.215745647236942, + "grad_norm": 0.40770334005355835, + "learning_rate": 5.745517915046542e-07, + "loss": 0.0037577252369374037, + "step": 4248 + }, + { + "epoch": 3.2172596517789556, + "grad_norm": 0.6793141961097717, + "learning_rate": 5.735310581770891e-07, + "loss": 0.14361703395843506, + "step": 4250 + }, + { + "epoch": 3.218773656320969, + "grad_norm": 1.6379170417785645, + "learning_rate": 5.725120118950119e-07, + "loss": 0.8105175495147705, + "step": 4252 + }, + { + "epoch": 3.2202876608629825, + "grad_norm": 1.2172212600708008, + "learning_rate": 5.71494654255476e-07, + "loss": 0.8411913514137268, + "step": 4254 + }, + { + "epoch": 3.221801665404996, + "grad_norm": 0.5244411826133728, + "learning_rate": 5.704789868528865e-07, + "loss": 0.46244922280311584, + "step": 4256 + }, + { + "epoch": 3.22331566994701, + "grad_norm": 0.7769549489021301, + "learning_rate": 5.694650112790013e-07, + "loss": 0.008184045553207397, + "step": 4258 + }, + { + "epoch": 3.2248296744890235, + "grad_norm": 1.0518815517425537, + "learning_rate": 5.684527291229256e-07, + "loss": 0.01648992858827114, + "step": 4260 + }, + { + "epoch": 3.226343679031037, + "grad_norm": 0.38810011744499207, + "learning_rate": 5.674421419711116e-07, + "loss": 0.061996109783649445, + "step": 4262 + }, + { + "epoch": 3.227857683573051, + "grad_norm": 3.983027458190918, + "learning_rate": 5.664332514073544e-07, + "loss": 0.35552293062210083, + "step": 4264 + }, + { + "epoch": 3.2293716881150645, + "grad_norm": 1.513490915298462, + "learning_rate": 5.6542605901279e-07, + "loss": 0.5510594248771667, + "step": 4266 + }, + { + "epoch": 3.230885692657078, + "grad_norm": 0.5401914715766907, + "learning_rate": 5.644205663658943e-07, + "loss": 0.03198845311999321, + "step": 4268 + }, + { + "epoch": 3.2323996971990914, + "grad_norm": 2.198777914047241, + "learning_rate": 5.634167750424781e-07, + "loss": 0.46873044967651367, + "step": 4270 + }, + { + "epoch": 3.233913701741105, + "grad_norm": 2.9599900245666504, + "learning_rate": 5.624146866156859e-07, + "loss": 0.10928899794816971, + "step": 4272 + }, + { + "epoch": 3.2354277062831187, + "grad_norm": 4.502335071563721, + "learning_rate": 5.614143026559947e-07, + "loss": 0.2749214768409729, + "step": 4274 + }, + { + "epoch": 3.2369417108251324, + "grad_norm": 1.588077425956726, + "learning_rate": 5.604156247312088e-07, + "loss": 0.4192395806312561, + "step": 4276 + }, + { + "epoch": 3.238455715367146, + "grad_norm": 1.5815722942352295, + "learning_rate": 5.594186544064595e-07, + "loss": 0.1117347851395607, + "step": 4278 + }, + { + "epoch": 3.2399697199091597, + "grad_norm": 1.866127371788025, + "learning_rate": 5.584233932442013e-07, + "loss": 0.13487142324447632, + "step": 4280 + }, + { + "epoch": 3.2414837244511734, + "grad_norm": 0.5325085520744324, + "learning_rate": 5.574298428042114e-07, + "loss": 0.10841602087020874, + "step": 4282 + }, + { + "epoch": 3.242997728993187, + "grad_norm": 0.1962842047214508, + "learning_rate": 5.564380046435849e-07, + "loss": 0.4596641957759857, + "step": 4284 + }, + { + "epoch": 3.2445117335352007, + "grad_norm": 0.5517534017562866, + "learning_rate": 5.554478803167336e-07, + "loss": 0.06260286271572113, + "step": 4286 + }, + { + "epoch": 3.2460257380772144, + "grad_norm": 1.419721245765686, + "learning_rate": 5.544594713753831e-07, + "loss": 0.08346284925937653, + "step": 4288 + }, + { + "epoch": 3.247539742619228, + "grad_norm": 8.173982620239258, + "learning_rate": 5.534727793685715e-07, + "loss": 0.11065147072076797, + "step": 4290 + }, + { + "epoch": 3.2490537471612413, + "grad_norm": 1.3091762065887451, + "learning_rate": 5.524878058426454e-07, + "loss": 0.010228373110294342, + "step": 4292 + }, + { + "epoch": 3.250567751703255, + "grad_norm": 0.7658984065055847, + "learning_rate": 5.515045523412582e-07, + "loss": 0.4764876961708069, + "step": 4294 + }, + { + "epoch": 3.2520817562452686, + "grad_norm": 1.7351553440093994, + "learning_rate": 5.505230204053681e-07, + "loss": 0.8360491394996643, + "step": 4296 + }, + { + "epoch": 3.2535957607872823, + "grad_norm": 0.6220263242721558, + "learning_rate": 5.495432115732351e-07, + "loss": 0.4632798135280609, + "step": 4298 + }, + { + "epoch": 3.255109765329296, + "grad_norm": 1.2251489162445068, + "learning_rate": 5.485651273804185e-07, + "loss": 0.47250670194625854, + "step": 4300 + }, + { + "epoch": 3.2566237698713096, + "grad_norm": 0.8385485410690308, + "learning_rate": 5.475887693597747e-07, + "loss": 0.4410651624202728, + "step": 4302 + }, + { + "epoch": 3.2581377744133233, + "grad_norm": 1.6110773086547852, + "learning_rate": 5.466141390414554e-07, + "loss": 0.9065512418746948, + "step": 4304 + }, + { + "epoch": 3.259651778955337, + "grad_norm": 1.928664207458496, + "learning_rate": 5.456412379529038e-07, + "loss": 0.49542078375816345, + "step": 4306 + }, + { + "epoch": 3.2611657834973506, + "grad_norm": 0.8389179110527039, + "learning_rate": 5.446700676188544e-07, + "loss": 0.41085904836654663, + "step": 4308 + }, + { + "epoch": 3.262679788039364, + "grad_norm": 1.4285856485366821, + "learning_rate": 5.437006295613273e-07, + "loss": 0.42602914571762085, + "step": 4310 + }, + { + "epoch": 3.264193792581378, + "grad_norm": 1.5370839834213257, + "learning_rate": 5.427329252996299e-07, + "loss": 0.4867514371871948, + "step": 4312 + }, + { + "epoch": 3.265707797123391, + "grad_norm": 0.8010324835777283, + "learning_rate": 5.417669563503508e-07, + "loss": 0.47227951884269714, + "step": 4314 + }, + { + "epoch": 3.267221801665405, + "grad_norm": 1.351555585861206, + "learning_rate": 5.408027242273592e-07, + "loss": 0.5203832387924194, + "step": 4316 + }, + { + "epoch": 3.2687358062074185, + "grad_norm": 1.0237430334091187, + "learning_rate": 5.398402304418033e-07, + "loss": 0.5021886229515076, + "step": 4318 + }, + { + "epoch": 3.270249810749432, + "grad_norm": 5.211171627044678, + "learning_rate": 5.388794765021063e-07, + "loss": 0.41523194313049316, + "step": 4320 + }, + { + "epoch": 3.271763815291446, + "grad_norm": 1.2868260145187378, + "learning_rate": 5.379204639139646e-07, + "loss": 0.6151956915855408, + "step": 4322 + }, + { + "epoch": 3.2732778198334596, + "grad_norm": 1.3792760372161865, + "learning_rate": 5.369631941803455e-07, + "loss": 0.8121111989021301, + "step": 4324 + }, + { + "epoch": 3.2747918243754732, + "grad_norm": 0.7304401993751526, + "learning_rate": 5.360076688014856e-07, + "loss": 0.03146474063396454, + "step": 4326 + }, + { + "epoch": 3.276305828917487, + "grad_norm": 0.7728518843650818, + "learning_rate": 5.350538892748871e-07, + "loss": 0.18248653411865234, + "step": 4328 + }, + { + "epoch": 3.2778198334595006, + "grad_norm": 2.537102699279785, + "learning_rate": 5.341018570953158e-07, + "loss": 0.5372970104217529, + "step": 4330 + }, + { + "epoch": 3.279333838001514, + "grad_norm": 0.11801496148109436, + "learning_rate": 5.331515737548002e-07, + "loss": 0.39382505416870117, + "step": 4332 + }, + { + "epoch": 3.2808478425435275, + "grad_norm": 1.0290801525115967, + "learning_rate": 5.32203040742627e-07, + "loss": 0.29253843426704407, + "step": 4334 + }, + { + "epoch": 3.282361847085541, + "grad_norm": 0.21218901872634888, + "learning_rate": 5.3125625954534e-07, + "loss": 0.018512556329369545, + "step": 4336 + }, + { + "epoch": 3.283875851627555, + "grad_norm": 3.1282711029052734, + "learning_rate": 5.303112316467375e-07, + "loss": 0.5378385186195374, + "step": 4338 + }, + { + "epoch": 3.2853898561695685, + "grad_norm": 1.450197458267212, + "learning_rate": 5.293679585278709e-07, + "loss": 0.46585577726364136, + "step": 4340 + }, + { + "epoch": 3.286903860711582, + "grad_norm": 1.130822777748108, + "learning_rate": 5.284264416670402e-07, + "loss": 0.34854355454444885, + "step": 4342 + }, + { + "epoch": 3.288417865253596, + "grad_norm": 0.3349727690219879, + "learning_rate": 5.274866825397937e-07, + "loss": 0.01301285158842802, + "step": 4344 + }, + { + "epoch": 3.2899318697956095, + "grad_norm": 1.1839556694030762, + "learning_rate": 5.265486826189246e-07, + "loss": 0.029658641666173935, + "step": 4346 + }, + { + "epoch": 3.291445874337623, + "grad_norm": 0.8426259756088257, + "learning_rate": 5.256124433744697e-07, + "loss": 0.4117862284183502, + "step": 4348 + }, + { + "epoch": 3.292959878879637, + "grad_norm": 1.4926836490631104, + "learning_rate": 5.246779662737058e-07, + "loss": 0.3558712303638458, + "step": 4350 + }, + { + "epoch": 3.2944738834216505, + "grad_norm": 1.6061350107192993, + "learning_rate": 5.237452527811482e-07, + "loss": 0.3738000988960266, + "step": 4352 + }, + { + "epoch": 3.2959878879636637, + "grad_norm": 0.5416836142539978, + "learning_rate": 5.228143043585488e-07, + "loss": 0.07174525409936905, + "step": 4354 + }, + { + "epoch": 3.2975018925056774, + "grad_norm": 1.7639620304107666, + "learning_rate": 5.21885122464893e-07, + "loss": 0.006380655337125063, + "step": 4356 + }, + { + "epoch": 3.299015897047691, + "grad_norm": 0.26390185952186584, + "learning_rate": 5.209577085563968e-07, + "loss": 0.4813000559806824, + "step": 4358 + }, + { + "epoch": 3.3005299015897047, + "grad_norm": 1.716133713722229, + "learning_rate": 5.200320640865069e-07, + "loss": 0.4223863482475281, + "step": 4360 + }, + { + "epoch": 3.3020439061317184, + "grad_norm": 0.010280990973114967, + "learning_rate": 5.191081905058956e-07, + "loss": 0.43456804752349854, + "step": 4362 + }, + { + "epoch": 3.303557910673732, + "grad_norm": 1.255383849143982, + "learning_rate": 5.181860892624612e-07, + "loss": 0.4126952886581421, + "step": 4364 + }, + { + "epoch": 3.3050719152157457, + "grad_norm": 4.131265163421631, + "learning_rate": 5.17265761801323e-07, + "loss": 0.2613856792449951, + "step": 4366 + }, + { + "epoch": 3.3065859197577594, + "grad_norm": 4.043173789978027, + "learning_rate": 5.163472095648216e-07, + "loss": 0.41220447421073914, + "step": 4368 + }, + { + "epoch": 3.308099924299773, + "grad_norm": 1.2004311084747314, + "learning_rate": 5.154304339925146e-07, + "loss": 0.8680997490882874, + "step": 4370 + }, + { + "epoch": 3.3096139288417863, + "grad_norm": 0.8893518447875977, + "learning_rate": 5.145154365211757e-07, + "loss": 0.0636923760175705, + "step": 4372 + }, + { + "epoch": 3.3111279333838, + "grad_norm": 1.4642027616500854, + "learning_rate": 5.136022185847912e-07, + "loss": 0.025864215567708015, + "step": 4374 + }, + { + "epoch": 3.3126419379258136, + "grad_norm": 1.8180480003356934, + "learning_rate": 5.126907816145599e-07, + "loss": 0.48631978034973145, + "step": 4376 + }, + { + "epoch": 3.3141559424678273, + "grad_norm": 0.9317322969436646, + "learning_rate": 5.117811270388885e-07, + "loss": 0.01056060753762722, + "step": 4378 + }, + { + "epoch": 3.315669947009841, + "grad_norm": 1.074005365371704, + "learning_rate": 5.108732562833903e-07, + "loss": 0.45045316219329834, + "step": 4380 + }, + { + "epoch": 3.3171839515518546, + "grad_norm": 1.3077199459075928, + "learning_rate": 5.09967170770883e-07, + "loss": 0.08453009277582169, + "step": 4382 + }, + { + "epoch": 3.3186979560938683, + "grad_norm": 0.8466632962226868, + "learning_rate": 5.090628719213871e-07, + "loss": 0.6531447172164917, + "step": 4384 + }, + { + "epoch": 3.320211960635882, + "grad_norm": 2.3048253059387207, + "learning_rate": 5.081603611521223e-07, + "loss": 0.03339333459734917, + "step": 4386 + }, + { + "epoch": 3.3217259651778956, + "grad_norm": 0.03835700824856758, + "learning_rate": 5.072596398775062e-07, + "loss": 0.0003226663975510746, + "step": 4388 + }, + { + "epoch": 3.3232399697199093, + "grad_norm": 1.4669675827026367, + "learning_rate": 5.063607095091524e-07, + "loss": 0.02945004031062126, + "step": 4390 + }, + { + "epoch": 3.324753974261923, + "grad_norm": 0.9835191965103149, + "learning_rate": 5.054635714558673e-07, + "loss": 0.9128386974334717, + "step": 4392 + }, + { + "epoch": 3.326267978803936, + "grad_norm": 0.8198468089103699, + "learning_rate": 5.045682271236481e-07, + "loss": 0.4045681357383728, + "step": 4394 + }, + { + "epoch": 3.32778198334595, + "grad_norm": 2.310675621032715, + "learning_rate": 5.036746779156812e-07, + "loss": 0.06954420357942581, + "step": 4396 + }, + { + "epoch": 3.3292959878879635, + "grad_norm": 1.2264914512634277, + "learning_rate": 5.027829252323402e-07, + "loss": 0.4230402708053589, + "step": 4398 + }, + { + "epoch": 3.330809992429977, + "grad_norm": 1.6409683227539062, + "learning_rate": 5.018929704711824e-07, + "loss": 0.6752891540527344, + "step": 4400 + }, + { + "epoch": 3.332323996971991, + "grad_norm": 3.365297555923462, + "learning_rate": 5.010048150269478e-07, + "loss": 0.03340877592563629, + "step": 4402 + }, + { + "epoch": 3.3338380015140046, + "grad_norm": 0.16656474769115448, + "learning_rate": 5.001184602915561e-07, + "loss": 0.0028345210012048483, + "step": 4404 + }, + { + "epoch": 3.3353520060560182, + "grad_norm": 1.7526333332061768, + "learning_rate": 4.992339076541056e-07, + "loss": 0.36279603838920593, + "step": 4406 + }, + { + "epoch": 3.336866010598032, + "grad_norm": 4.4570698738098145, + "learning_rate": 4.983511585008695e-07, + "loss": 0.7866038680076599, + "step": 4408 + }, + { + "epoch": 3.3383800151400456, + "grad_norm": 0.5094184875488281, + "learning_rate": 4.974702142152955e-07, + "loss": 0.13831552863121033, + "step": 4410 + }, + { + "epoch": 3.3398940196820592, + "grad_norm": 1.0854383707046509, + "learning_rate": 4.965910761780018e-07, + "loss": 0.46864956617355347, + "step": 4412 + }, + { + "epoch": 3.341408024224073, + "grad_norm": 0.7424245476722717, + "learning_rate": 4.957137457667769e-07, + "loss": 0.008912669494748116, + "step": 4414 + }, + { + "epoch": 3.342922028766086, + "grad_norm": 0.4839117228984833, + "learning_rate": 4.948382243565753e-07, + "loss": 0.0016555218026041985, + "step": 4416 + }, + { + "epoch": 3.3444360333081, + "grad_norm": 1.4709498882293701, + "learning_rate": 4.939645133195168e-07, + "loss": 0.44227319955825806, + "step": 4418 + }, + { + "epoch": 3.3459500378501135, + "grad_norm": 1.975531816482544, + "learning_rate": 4.930926140248843e-07, + "loss": 0.7941685914993286, + "step": 4420 + }, + { + "epoch": 3.347464042392127, + "grad_norm": 12.68311882019043, + "learning_rate": 4.922225278391212e-07, + "loss": 0.005699877627193928, + "step": 4422 + }, + { + "epoch": 3.348978046934141, + "grad_norm": 1.1432486772537231, + "learning_rate": 4.913542561258286e-07, + "loss": 0.6154124140739441, + "step": 4424 + }, + { + "epoch": 3.3504920514761545, + "grad_norm": 0.31212812662124634, + "learning_rate": 4.904878002457658e-07, + "loss": 0.522244930267334, + "step": 4426 + }, + { + "epoch": 3.352006056018168, + "grad_norm": 0.3725129961967468, + "learning_rate": 4.896231615568442e-07, + "loss": 0.03355798125267029, + "step": 4428 + }, + { + "epoch": 3.353520060560182, + "grad_norm": 1.4501001834869385, + "learning_rate": 4.887603414141288e-07, + "loss": 0.8817067742347717, + "step": 4430 + }, + { + "epoch": 3.3550340651021955, + "grad_norm": 3.1281211376190186, + "learning_rate": 4.878993411698335e-07, + "loss": 0.30019909143447876, + "step": 4432 + }, + { + "epoch": 3.3565480696442087, + "grad_norm": 0.8836855292320251, + "learning_rate": 4.87040162173321e-07, + "loss": 0.02827022410929203, + "step": 4434 + }, + { + "epoch": 3.3580620741862224, + "grad_norm": 1.6291098594665527, + "learning_rate": 4.861828057710993e-07, + "loss": 0.7493427395820618, + "step": 4436 + }, + { + "epoch": 3.359576078728236, + "grad_norm": 12.279897689819336, + "learning_rate": 4.853272733068198e-07, + "loss": 0.8481546640396118, + "step": 4438 + }, + { + "epoch": 3.3610900832702497, + "grad_norm": 0.9661763906478882, + "learning_rate": 4.844735661212755e-07, + "loss": 0.49169448018074036, + "step": 4440 + }, + { + "epoch": 3.3626040878122634, + "grad_norm": 1.233472466468811, + "learning_rate": 4.836216855523995e-07, + "loss": 0.8403797149658203, + "step": 4442 + }, + { + "epoch": 3.364118092354277, + "grad_norm": 2.297745943069458, + "learning_rate": 4.827716329352615e-07, + "loss": 0.7104783058166504, + "step": 4444 + }, + { + "epoch": 3.3656320968962907, + "grad_norm": 1.0891869068145752, + "learning_rate": 4.819234096020662e-07, + "loss": 0.13051089644432068, + "step": 4446 + }, + { + "epoch": 3.3671461014383044, + "grad_norm": 2.444359064102173, + "learning_rate": 4.810770168821524e-07, + "loss": 0.1246347576379776, + "step": 4448 + }, + { + "epoch": 3.368660105980318, + "grad_norm": 2.1024117469787598, + "learning_rate": 4.802324561019895e-07, + "loss": 0.1468682438135147, + "step": 4450 + }, + { + "epoch": 3.3701741105223317, + "grad_norm": 1.3625942468643188, + "learning_rate": 4.793897285851753e-07, + "loss": 0.3592517077922821, + "step": 4452 + }, + { + "epoch": 3.3716881150643454, + "grad_norm": 1.7665133476257324, + "learning_rate": 4.785488356524347e-07, + "loss": 0.7478952407836914, + "step": 4454 + }, + { + "epoch": 3.3732021196063586, + "grad_norm": 2.2539124488830566, + "learning_rate": 4.777097786216188e-07, + "loss": 0.882686972618103, + "step": 4456 + }, + { + "epoch": 3.3747161241483723, + "grad_norm": 0.9639107584953308, + "learning_rate": 4.768725588076997e-07, + "loss": 0.43562182784080505, + "step": 4458 + }, + { + "epoch": 3.376230128690386, + "grad_norm": 0.34762042760849, + "learning_rate": 4.7603717752277094e-07, + "loss": 0.2970896363258362, + "step": 4460 + }, + { + "epoch": 3.3777441332323996, + "grad_norm": 1.6363152265548706, + "learning_rate": 4.752036360760449e-07, + "loss": 0.569332480430603, + "step": 4462 + }, + { + "epoch": 3.3792581377744133, + "grad_norm": 1.1818920373916626, + "learning_rate": 4.743719357738499e-07, + "loss": 0.4673527479171753, + "step": 4464 + }, + { + "epoch": 3.380772142316427, + "grad_norm": 1.155680537223816, + "learning_rate": 4.735420779196299e-07, + "loss": 0.5272207856178284, + "step": 4466 + }, + { + "epoch": 3.3822861468584406, + "grad_norm": 1.1912206411361694, + "learning_rate": 4.727140638139402e-07, + "loss": 0.07881423830986023, + "step": 4468 + }, + { + "epoch": 3.3838001514004543, + "grad_norm": 1.1271605491638184, + "learning_rate": 4.718878947544473e-07, + "loss": 0.42218998074531555, + "step": 4470 + }, + { + "epoch": 3.385314155942468, + "grad_norm": 1.3475745916366577, + "learning_rate": 4.7106357203592613e-07, + "loss": 0.1039365753531456, + "step": 4472 + }, + { + "epoch": 3.386828160484481, + "grad_norm": 0.22108665108680725, + "learning_rate": 4.702410969502575e-07, + "loss": 0.35169756412506104, + "step": 4474 + }, + { + "epoch": 3.3883421650264953, + "grad_norm": 1.329148530960083, + "learning_rate": 4.694204707864267e-07, + "loss": 0.5133784413337708, + "step": 4476 + }, + { + "epoch": 3.3898561695685085, + "grad_norm": 0.8461185097694397, + "learning_rate": 4.6860169483052253e-07, + "loss": 0.4518618583679199, + "step": 4478 + }, + { + "epoch": 3.391370174110522, + "grad_norm": 4.864930629730225, + "learning_rate": 4.677847703657323e-07, + "loss": 0.8412244915962219, + "step": 4480 + }, + { + "epoch": 3.392884178652536, + "grad_norm": 0.5239626169204712, + "learning_rate": 4.669696986723429e-07, + "loss": 0.4736073613166809, + "step": 4482 + }, + { + "epoch": 3.3943981831945496, + "grad_norm": 6.462657928466797, + "learning_rate": 4.6615648102773745e-07, + "loss": 0.452423095703125, + "step": 4484 + }, + { + "epoch": 3.3959121877365632, + "grad_norm": 1.4285900592803955, + "learning_rate": 4.653451187063932e-07, + "loss": 0.4780837297439575, + "step": 4486 + }, + { + "epoch": 3.397426192278577, + "grad_norm": 1.6400142908096313, + "learning_rate": 4.6453561297987923e-07, + "loss": 0.43030688166618347, + "step": 4488 + }, + { + "epoch": 3.3989401968205906, + "grad_norm": 1.5580307245254517, + "learning_rate": 4.637279651168556e-07, + "loss": 0.12556520104408264, + "step": 4490 + }, + { + "epoch": 3.4004542013626042, + "grad_norm": 2.3359642028808594, + "learning_rate": 4.6292217638307104e-07, + "loss": 0.7898648977279663, + "step": 4492 + }, + { + "epoch": 3.401968205904618, + "grad_norm": 1.286458969116211, + "learning_rate": 4.6211824804135973e-07, + "loss": 0.6899897456169128, + "step": 4494 + }, + { + "epoch": 3.403482210446631, + "grad_norm": 0.481658011674881, + "learning_rate": 4.6131618135164077e-07, + "loss": 0.007716108113527298, + "step": 4496 + }, + { + "epoch": 3.404996214988645, + "grad_norm": 1.132920742034912, + "learning_rate": 4.605159775709153e-07, + "loss": 0.8279612064361572, + "step": 4498 + }, + { + "epoch": 3.4065102195306585, + "grad_norm": 4.667835235595703, + "learning_rate": 4.5971763795326586e-07, + "loss": 0.5123430490493774, + "step": 4500 + }, + { + "epoch": 3.408024224072672, + "grad_norm": 0.9381482601165771, + "learning_rate": 4.589211637498522e-07, + "loss": 0.04801633954048157, + "step": 4502 + }, + { + "epoch": 3.409538228614686, + "grad_norm": 0.9878865480422974, + "learning_rate": 4.5812655620891124e-07, + "loss": 0.45255210995674133, + "step": 4504 + }, + { + "epoch": 3.4110522331566995, + "grad_norm": 1.0772947072982788, + "learning_rate": 4.5733381657575493e-07, + "loss": 0.49673977494239807, + "step": 4506 + }, + { + "epoch": 3.412566237698713, + "grad_norm": 1.4868048429489136, + "learning_rate": 4.5654294609276676e-07, + "loss": 0.9014337658882141, + "step": 4508 + }, + { + "epoch": 3.414080242240727, + "grad_norm": 1.9662097692489624, + "learning_rate": 4.5575394599940126e-07, + "loss": 0.4788070023059845, + "step": 4510 + }, + { + "epoch": 3.4155942467827405, + "grad_norm": 0.6768618822097778, + "learning_rate": 4.5496681753218254e-07, + "loss": 0.016892148181796074, + "step": 4512 + }, + { + "epoch": 3.417108251324754, + "grad_norm": 1.4944841861724854, + "learning_rate": 4.541815619247002e-07, + "loss": 0.3708324432373047, + "step": 4514 + }, + { + "epoch": 3.418622255866768, + "grad_norm": 1.0815892219543457, + "learning_rate": 4.533981804076097e-07, + "loss": 0.3778248727321625, + "step": 4516 + }, + { + "epoch": 3.420136260408781, + "grad_norm": 1.3007804155349731, + "learning_rate": 4.526166742086288e-07, + "loss": 0.646586000919342, + "step": 4518 + }, + { + "epoch": 3.4216502649507947, + "grad_norm": 1.3324419260025024, + "learning_rate": 4.5183704455253603e-07, + "loss": 0.0615563839673996, + "step": 4520 + }, + { + "epoch": 3.4231642694928084, + "grad_norm": 1.1914488077163696, + "learning_rate": 4.5105929266117035e-07, + "loss": 0.8116185665130615, + "step": 4522 + }, + { + "epoch": 3.424678274034822, + "grad_norm": 1.1742284297943115, + "learning_rate": 4.5028341975342617e-07, + "loss": 0.4476127624511719, + "step": 4524 + }, + { + "epoch": 3.4261922785768357, + "grad_norm": 1.468469500541687, + "learning_rate": 4.495094270452542e-07, + "loss": 0.23907433450222015, + "step": 4526 + }, + { + "epoch": 3.4277062831188494, + "grad_norm": 0.4972544312477112, + "learning_rate": 4.487373157496584e-07, + "loss": 0.04643627628684044, + "step": 4528 + }, + { + "epoch": 3.429220287660863, + "grad_norm": 1.3653373718261719, + "learning_rate": 4.479670870766938e-07, + "loss": 0.02480243518948555, + "step": 4530 + }, + { + "epoch": 3.4307342922028767, + "grad_norm": 1.0246663093566895, + "learning_rate": 4.4719874223346523e-07, + "loss": 0.8878403902053833, + "step": 4532 + }, + { + "epoch": 3.4322482967448904, + "grad_norm": 1.6896597146987915, + "learning_rate": 4.4643228242412494e-07, + "loss": 0.36560678482055664, + "step": 4534 + }, + { + "epoch": 3.4337623012869036, + "grad_norm": 1.5463826656341553, + "learning_rate": 4.456677088498715e-07, + "loss": 0.4867248237133026, + "step": 4536 + }, + { + "epoch": 3.4352763058289173, + "grad_norm": 1.7708585262298584, + "learning_rate": 4.449050227089469e-07, + "loss": 0.6812244653701782, + "step": 4538 + }, + { + "epoch": 3.436790310370931, + "grad_norm": 2.635740041732788, + "learning_rate": 4.4414422519663487e-07, + "loss": 0.4240436553955078, + "step": 4540 + }, + { + "epoch": 3.4383043149129446, + "grad_norm": 1.5549581050872803, + "learning_rate": 4.4338531750526014e-07, + "loss": 0.7371824979782104, + "step": 4542 + }, + { + "epoch": 3.4398183194549583, + "grad_norm": 1.545595407485962, + "learning_rate": 4.426283008241852e-07, + "loss": 0.4680671691894531, + "step": 4544 + }, + { + "epoch": 3.441332323996972, + "grad_norm": 0.9727444052696228, + "learning_rate": 4.418731763398087e-07, + "loss": 0.482888787984848, + "step": 4546 + }, + { + "epoch": 3.4428463285389856, + "grad_norm": 0.931499183177948, + "learning_rate": 4.4111994523556387e-07, + "loss": 0.3971640467643738, + "step": 4548 + }, + { + "epoch": 3.4443603330809993, + "grad_norm": 0.7772575616836548, + "learning_rate": 4.4036860869191734e-07, + "loss": 0.4910150170326233, + "step": 4550 + }, + { + "epoch": 3.445874337623013, + "grad_norm": 0.6824910640716553, + "learning_rate": 4.3961916788636594e-07, + "loss": 0.11980995535850525, + "step": 4552 + }, + { + "epoch": 3.4473883421650267, + "grad_norm": 1.5512605905532837, + "learning_rate": 4.388716239934357e-07, + "loss": 0.4402855336666107, + "step": 4554 + }, + { + "epoch": 3.4489023467070403, + "grad_norm": 1.6966400146484375, + "learning_rate": 4.381259781846793e-07, + "loss": 0.10582391172647476, + "step": 4556 + }, + { + "epoch": 3.4504163512490535, + "grad_norm": 1.7826142311096191, + "learning_rate": 4.3738223162867586e-07, + "loss": 0.008787122555077076, + "step": 4558 + }, + { + "epoch": 3.451930355791067, + "grad_norm": 2.331568717956543, + "learning_rate": 4.3664038549102674e-07, + "loss": 0.5988529920578003, + "step": 4560 + }, + { + "epoch": 3.453444360333081, + "grad_norm": 1.8919638395309448, + "learning_rate": 4.3590044093435584e-07, + "loss": 0.3934869170188904, + "step": 4562 + }, + { + "epoch": 3.4549583648750946, + "grad_norm": 1.8817998170852661, + "learning_rate": 4.351623991183065e-07, + "loss": 0.04238894581794739, + "step": 4564 + }, + { + "epoch": 3.4564723694171082, + "grad_norm": 1.2943494319915771, + "learning_rate": 4.3442626119953994e-07, + "loss": 0.4833775460720062, + "step": 4566 + }, + { + "epoch": 3.457986373959122, + "grad_norm": 1.130948781967163, + "learning_rate": 4.336920283317344e-07, + "loss": 0.3738098740577698, + "step": 4568 + }, + { + "epoch": 3.4595003785011356, + "grad_norm": 0.6324016451835632, + "learning_rate": 4.329597016655811e-07, + "loss": 0.05688200145959854, + "step": 4570 + }, + { + "epoch": 3.4610143830431492, + "grad_norm": 1.1047627925872803, + "learning_rate": 4.3222928234878545e-07, + "loss": 0.3809468746185303, + "step": 4572 + }, + { + "epoch": 3.462528387585163, + "grad_norm": 2.3786842823028564, + "learning_rate": 4.315007715260625e-07, + "loss": 0.8480952978134155, + "step": 4574 + }, + { + "epoch": 3.4640423921271766, + "grad_norm": 1.5654650926589966, + "learning_rate": 4.307741703391366e-07, + "loss": 0.051764726638793945, + "step": 4576 + }, + { + "epoch": 3.4655563966691902, + "grad_norm": 1.0105109214782715, + "learning_rate": 4.3004947992673973e-07, + "loss": 0.3938671350479126, + "step": 4578 + }, + { + "epoch": 3.4670704012112035, + "grad_norm": 0.009669061750173569, + "learning_rate": 4.2932670142460887e-07, + "loss": 0.008482669480144978, + "step": 4580 + }, + { + "epoch": 3.468584405753217, + "grad_norm": 0.9874246716499329, + "learning_rate": 4.286058359654846e-07, + "loss": 0.4697713851928711, + "step": 4582 + }, + { + "epoch": 3.470098410295231, + "grad_norm": 1.8340425491333008, + "learning_rate": 4.278868846791094e-07, + "loss": 0.2904302179813385, + "step": 4584 + }, + { + "epoch": 3.4716124148372445, + "grad_norm": 2.5403199195861816, + "learning_rate": 4.2716984869222625e-07, + "loss": 0.408109575510025, + "step": 4586 + }, + { + "epoch": 3.473126419379258, + "grad_norm": 1.7513412237167358, + "learning_rate": 4.2645472912857617e-07, + "loss": 0.32123586535453796, + "step": 4588 + }, + { + "epoch": 3.474640423921272, + "grad_norm": 0.5798364281654358, + "learning_rate": 4.257415271088966e-07, + "loss": 0.040756113827228546, + "step": 4590 + }, + { + "epoch": 3.4761544284632855, + "grad_norm": 1.8140665292739868, + "learning_rate": 4.2503024375092e-07, + "loss": 0.4627116024494171, + "step": 4592 + }, + { + "epoch": 3.477668433005299, + "grad_norm": 0.7017685174942017, + "learning_rate": 4.24320880169372e-07, + "loss": 0.47174185514450073, + "step": 4594 + }, + { + "epoch": 3.479182437547313, + "grad_norm": 1.0116022825241089, + "learning_rate": 4.236134374759694e-07, + "loss": 0.8078396320343018, + "step": 4596 + }, + { + "epoch": 3.480696442089326, + "grad_norm": 0.4890778362751007, + "learning_rate": 4.229079167794184e-07, + "loss": 0.10163736343383789, + "step": 4598 + }, + { + "epoch": 3.4822104466313397, + "grad_norm": 1.362009882926941, + "learning_rate": 4.2220431918541345e-07, + "loss": 0.8850791454315186, + "step": 4600 + }, + { + "epoch": 3.4837244511733534, + "grad_norm": 0.34711387753486633, + "learning_rate": 4.2150264579663514e-07, + "loss": 0.4698050618171692, + "step": 4602 + }, + { + "epoch": 3.485238455715367, + "grad_norm": 0.49857738614082336, + "learning_rate": 4.208028977127476e-07, + "loss": 0.03801488131284714, + "step": 4604 + }, + { + "epoch": 3.4867524602573807, + "grad_norm": 2.171999216079712, + "learning_rate": 4.2010507603039853e-07, + "loss": 0.6666553616523743, + "step": 4606 + }, + { + "epoch": 3.4882664647993944, + "grad_norm": 0.7819331288337708, + "learning_rate": 4.194091818432163e-07, + "loss": 0.39224740862846375, + "step": 4608 + }, + { + "epoch": 3.489780469341408, + "grad_norm": 0.6680170893669128, + "learning_rate": 4.187152162418084e-07, + "loss": 0.3971290588378906, + "step": 4610 + }, + { + "epoch": 3.4912944738834217, + "grad_norm": 1.471892237663269, + "learning_rate": 4.1802318031375967e-07, + "loss": 0.7852472066879272, + "step": 4612 + }, + { + "epoch": 3.4928084784254354, + "grad_norm": 1.010901689529419, + "learning_rate": 4.1733307514363146e-07, + "loss": 0.34848764538764954, + "step": 4614 + }, + { + "epoch": 3.494322482967449, + "grad_norm": 1.4986950159072876, + "learning_rate": 4.1664490181295813e-07, + "loss": 0.03112611174583435, + "step": 4616 + }, + { + "epoch": 3.4958364875094627, + "grad_norm": 1.8016529083251953, + "learning_rate": 4.1595866140024767e-07, + "loss": 0.4087305963039398, + "step": 4618 + }, + { + "epoch": 3.497350492051476, + "grad_norm": 1.2827506065368652, + "learning_rate": 4.1527435498097803e-07, + "loss": 0.4458525776863098, + "step": 4620 + }, + { + "epoch": 3.4988644965934896, + "grad_norm": 0.4394834041595459, + "learning_rate": 4.145919836275961e-07, + "loss": 0.5897008776664734, + "step": 4622 + }, + { + "epoch": 3.5003785011355033, + "grad_norm": 0.33566799759864807, + "learning_rate": 4.1391154840951664e-07, + "loss": 0.1127045750617981, + "step": 4624 + }, + { + "epoch": 3.501892505677517, + "grad_norm": 1.7849324941635132, + "learning_rate": 4.1323305039311985e-07, + "loss": 0.4438413977622986, + "step": 4626 + }, + { + "epoch": 3.5034065102195306, + "grad_norm": 0.21457764506340027, + "learning_rate": 4.125564906417497e-07, + "loss": 0.4769860506057739, + "step": 4628 + }, + { + "epoch": 3.5049205147615443, + "grad_norm": 4.685155391693115, + "learning_rate": 4.118818702157131e-07, + "loss": 0.0909000113606453, + "step": 4630 + }, + { + "epoch": 3.506434519303558, + "grad_norm": 7.335435390472412, + "learning_rate": 4.1120919017227715e-07, + "loss": 0.46416670083999634, + "step": 4632 + }, + { + "epoch": 3.5079485238455717, + "grad_norm": 1.4116568565368652, + "learning_rate": 4.105384515656678e-07, + "loss": 0.05284082517027855, + "step": 4634 + }, + { + "epoch": 3.5094625283875853, + "grad_norm": 1.7040904760360718, + "learning_rate": 4.098696554470691e-07, + "loss": 0.11267206072807312, + "step": 4636 + }, + { + "epoch": 3.5109765329295985, + "grad_norm": 1.1666487455368042, + "learning_rate": 4.0920280286462046e-07, + "loss": 0.8619710206985474, + "step": 4638 + }, + { + "epoch": 3.5124905374716127, + "grad_norm": 0.5301636457443237, + "learning_rate": 4.0853789486341506e-07, + "loss": 0.0020394865423440933, + "step": 4640 + }, + { + "epoch": 3.514004542013626, + "grad_norm": 0.11858732998371124, + "learning_rate": 4.078749324854988e-07, + "loss": 0.012813457287847996, + "step": 4642 + }, + { + "epoch": 3.5155185465556396, + "grad_norm": 1.2074733972549438, + "learning_rate": 4.0721391676986864e-07, + "loss": 0.7910439372062683, + "step": 4644 + }, + { + "epoch": 3.5170325510976532, + "grad_norm": 1.4610702991485596, + "learning_rate": 4.0655484875247025e-07, + "loss": 0.8097201585769653, + "step": 4646 + }, + { + "epoch": 3.518546555639667, + "grad_norm": 1.497173547744751, + "learning_rate": 4.058977294661972e-07, + "loss": 0.4035246670246124, + "step": 4648 + }, + { + "epoch": 3.5200605601816806, + "grad_norm": 0.41538205742836, + "learning_rate": 4.0524255994088855e-07, + "loss": 0.10689140111207962, + "step": 4650 + }, + { + "epoch": 3.5215745647236942, + "grad_norm": 2.098738670349121, + "learning_rate": 4.0458934120332856e-07, + "loss": 0.31118324398994446, + "step": 4652 + }, + { + "epoch": 3.523088569265708, + "grad_norm": 1.3471760749816895, + "learning_rate": 4.039380742772435e-07, + "loss": 0.4800387918949127, + "step": 4654 + }, + { + "epoch": 3.5246025738077216, + "grad_norm": 1.138135552406311, + "learning_rate": 4.032887601833006e-07, + "loss": 0.549173891544342, + "step": 4656 + }, + { + "epoch": 3.5261165783497352, + "grad_norm": 0.052607882767915726, + "learning_rate": 4.026413999391074e-07, + "loss": 0.002175856614485383, + "step": 4658 + }, + { + "epoch": 3.5276305828917485, + "grad_norm": 1.0348937511444092, + "learning_rate": 4.0199599455920866e-07, + "loss": 0.1844429224729538, + "step": 4660 + }, + { + "epoch": 3.529144587433762, + "grad_norm": 1.2181382179260254, + "learning_rate": 4.013525450550858e-07, + "loss": 0.39277511835098267, + "step": 4662 + }, + { + "epoch": 3.530658591975776, + "grad_norm": 1.421201229095459, + "learning_rate": 4.007110524351546e-07, + "loss": 0.4726411700248718, + "step": 4664 + }, + { + "epoch": 3.5321725965177895, + "grad_norm": 0.6177908182144165, + "learning_rate": 4.000715177047648e-07, + "loss": 0.3939692974090576, + "step": 4666 + }, + { + "epoch": 3.533686601059803, + "grad_norm": 0.04804205521941185, + "learning_rate": 3.994339418661967e-07, + "loss": 0.37155166268348694, + "step": 4668 + }, + { + "epoch": 3.535200605601817, + "grad_norm": 0.465692400932312, + "learning_rate": 3.9879832591866166e-07, + "loss": 0.007390935905277729, + "step": 4670 + }, + { + "epoch": 3.5367146101438305, + "grad_norm": 0.5763087868690491, + "learning_rate": 3.981646708582985e-07, + "loss": 0.3744857609272003, + "step": 4672 + }, + { + "epoch": 3.538228614685844, + "grad_norm": 0.04744290933012962, + "learning_rate": 3.9753297767817374e-07, + "loss": 0.018537191674113274, + "step": 4674 + }, + { + "epoch": 3.539742619227858, + "grad_norm": 3.1067702770233154, + "learning_rate": 3.969032473682789e-07, + "loss": 0.013560655526816845, + "step": 4676 + }, + { + "epoch": 3.541256623769871, + "grad_norm": 4.437751770019531, + "learning_rate": 3.962754809155289e-07, + "loss": 0.8951266407966614, + "step": 4678 + }, + { + "epoch": 3.542770628311885, + "grad_norm": 0.16521283984184265, + "learning_rate": 3.956496793037618e-07, + "loss": 0.43674352765083313, + "step": 4680 + }, + { + "epoch": 3.5442846328538984, + "grad_norm": 2.568776845932007, + "learning_rate": 3.950258435137358e-07, + "loss": 0.5231333374977112, + "step": 4682 + }, + { + "epoch": 3.545798637395912, + "grad_norm": 0.4502345025539398, + "learning_rate": 3.944039745231279e-07, + "loss": 0.38986167311668396, + "step": 4684 + }, + { + "epoch": 3.5473126419379257, + "grad_norm": 0.2962474226951599, + "learning_rate": 3.937840733065333e-07, + "loss": 0.13552610576152802, + "step": 4686 + }, + { + "epoch": 3.5488266464799394, + "grad_norm": 0.27366212010383606, + "learning_rate": 3.9316614083546326e-07, + "loss": 0.3905160427093506, + "step": 4688 + }, + { + "epoch": 3.550340651021953, + "grad_norm": 0.7198954224586487, + "learning_rate": 3.925501780783433e-07, + "loss": 0.0175030454993248, + "step": 4690 + }, + { + "epoch": 3.5518546555639667, + "grad_norm": 3.4487533569335938, + "learning_rate": 3.9193618600051217e-07, + "loss": 0.4703693091869354, + "step": 4692 + }, + { + "epoch": 3.5533686601059804, + "grad_norm": 1.2452820539474487, + "learning_rate": 3.913241655642205e-07, + "loss": 0.4393383860588074, + "step": 4694 + }, + { + "epoch": 3.554882664647994, + "grad_norm": 1.7162336111068726, + "learning_rate": 3.907141177286283e-07, + "loss": 0.5500848889350891, + "step": 4696 + }, + { + "epoch": 3.5563966691900077, + "grad_norm": 1.1964905261993408, + "learning_rate": 3.901060434498047e-07, + "loss": 0.4737063944339752, + "step": 4698 + }, + { + "epoch": 3.557910673732021, + "grad_norm": 0.9332811236381531, + "learning_rate": 3.89499943680725e-07, + "loss": 0.5127653479576111, + "step": 4700 + }, + { + "epoch": 3.559424678274035, + "grad_norm": 1.586294174194336, + "learning_rate": 3.8889581937127145e-07, + "loss": 0.6559292674064636, + "step": 4702 + }, + { + "epoch": 3.5609386828160483, + "grad_norm": 0.2954985797405243, + "learning_rate": 3.882936714682292e-07, + "loss": 0.03178177773952484, + "step": 4704 + }, + { + "epoch": 3.562452687358062, + "grad_norm": 0.17098116874694824, + "learning_rate": 3.876935009152862e-07, + "loss": 0.01636277325451374, + "step": 4706 + }, + { + "epoch": 3.5639666919000756, + "grad_norm": 1.02326500415802, + "learning_rate": 3.870953086530317e-07, + "loss": 0.1606455147266388, + "step": 4708 + }, + { + "epoch": 3.5654806964420893, + "grad_norm": 2.4530370235443115, + "learning_rate": 3.8649909561895453e-07, + "loss": 0.806109607219696, + "step": 4710 + }, + { + "epoch": 3.566994700984103, + "grad_norm": 1.8953279256820679, + "learning_rate": 3.8590486274744177e-07, + "loss": 0.15628452599048615, + "step": 4712 + }, + { + "epoch": 3.5685087055261167, + "grad_norm": 4.308723449707031, + "learning_rate": 3.853126109697766e-07, + "loss": 0.07021796703338623, + "step": 4714 + }, + { + "epoch": 3.5700227100681303, + "grad_norm": 0.10224975645542145, + "learning_rate": 3.8472234121413844e-07, + "loss": 0.013389738276600838, + "step": 4716 + }, + { + "epoch": 3.571536714610144, + "grad_norm": 0.8156806826591492, + "learning_rate": 3.841340544055992e-07, + "loss": 0.04783736169338226, + "step": 4718 + }, + { + "epoch": 3.5730507191521577, + "grad_norm": 1.5311906337738037, + "learning_rate": 3.8354775146612457e-07, + "loss": 0.41418713331222534, + "step": 4720 + }, + { + "epoch": 3.574564723694171, + "grad_norm": 4.635899543762207, + "learning_rate": 3.8296343331456966e-07, + "loss": 0.44917136430740356, + "step": 4722 + }, + { + "epoch": 3.5760787282361846, + "grad_norm": 1.110069751739502, + "learning_rate": 3.823811008666803e-07, + "loss": 0.769571840763092, + "step": 4724 + }, + { + "epoch": 3.5775927327781982, + "grad_norm": 0.8576781153678894, + "learning_rate": 3.818007550350892e-07, + "loss": 0.009633282199501991, + "step": 4726 + }, + { + "epoch": 3.579106737320212, + "grad_norm": 1.8381321430206299, + "learning_rate": 3.8122239672931647e-07, + "loss": 0.16560888290405273, + "step": 4728 + }, + { + "epoch": 3.5806207418622256, + "grad_norm": 2.974501371383667, + "learning_rate": 3.8064602685576696e-07, + "loss": 0.7759672403335571, + "step": 4730 + }, + { + "epoch": 3.5821347464042392, + "grad_norm": 0.8361567854881287, + "learning_rate": 3.800716463177295e-07, + "loss": 0.03352303057909012, + "step": 4732 + }, + { + "epoch": 3.583648750946253, + "grad_norm": 1.254820466041565, + "learning_rate": 3.7949925601537495e-07, + "loss": 0.22655794024467468, + "step": 4734 + }, + { + "epoch": 3.5851627554882666, + "grad_norm": 1.78774893283844, + "learning_rate": 3.789288568457548e-07, + "loss": 0.4909651577472687, + "step": 4736 + }, + { + "epoch": 3.5866767600302802, + "grad_norm": 0.13378266990184784, + "learning_rate": 3.78360449702801e-07, + "loss": 0.4644806683063507, + "step": 4738 + }, + { + "epoch": 3.5881907645722935, + "grad_norm": 0.9797360897064209, + "learning_rate": 3.777940354773227e-07, + "loss": 0.009982442483305931, + "step": 4740 + }, + { + "epoch": 3.5897047691143076, + "grad_norm": 1.1383854150772095, + "learning_rate": 3.7722961505700625e-07, + "loss": 0.382017582654953, + "step": 4742 + }, + { + "epoch": 3.591218773656321, + "grad_norm": 1.046236515045166, + "learning_rate": 3.766671893264126e-07, + "loss": 0.5301817059516907, + "step": 4744 + }, + { + "epoch": 3.5927327781983345, + "grad_norm": 0.2451898753643036, + "learning_rate": 3.7610675916697766e-07, + "loss": 0.02839820086956024, + "step": 4746 + }, + { + "epoch": 3.594246782740348, + "grad_norm": 1.4236079454421997, + "learning_rate": 3.7554832545700917e-07, + "loss": 0.40499091148376465, + "step": 4748 + }, + { + "epoch": 3.595760787282362, + "grad_norm": 1.9924352169036865, + "learning_rate": 3.7499188907168595e-07, + "loss": 0.42945483326911926, + "step": 4750 + }, + { + "epoch": 3.5972747918243755, + "grad_norm": 0.9860897660255432, + "learning_rate": 3.7443745088305723e-07, + "loss": 0.05443042889237404, + "step": 4752 + }, + { + "epoch": 3.598788796366389, + "grad_norm": 0.9116902947425842, + "learning_rate": 3.7388501176004005e-07, + "loss": 0.025338076055049896, + "step": 4754 + }, + { + "epoch": 3.600302800908403, + "grad_norm": 0.4423716962337494, + "learning_rate": 3.733345725684187e-07, + "loss": 0.09729881584644318, + "step": 4756 + }, + { + "epoch": 3.6018168054504165, + "grad_norm": 1.359941005706787, + "learning_rate": 3.727861341708432e-07, + "loss": 0.023140886798501015, + "step": 4758 + }, + { + "epoch": 3.60333080999243, + "grad_norm": 1.1945161819458008, + "learning_rate": 3.7223969742682807e-07, + "loss": 0.4655897319316864, + "step": 4760 + }, + { + "epoch": 3.6048448145344434, + "grad_norm": 0.6171092391014099, + "learning_rate": 3.716952631927505e-07, + "loss": 0.06750277429819107, + "step": 4762 + }, + { + "epoch": 3.6063588190764575, + "grad_norm": 0.2752968966960907, + "learning_rate": 3.711528323218495e-07, + "loss": 0.010420771315693855, + "step": 4764 + }, + { + "epoch": 3.6078728236184707, + "grad_norm": 0.013336937874555588, + "learning_rate": 3.7061240566422476e-07, + "loss": 0.5345507264137268, + "step": 4766 + }, + { + "epoch": 3.6093868281604844, + "grad_norm": 2.168750047683716, + "learning_rate": 3.700739840668343e-07, + "loss": 0.46138548851013184, + "step": 4768 + }, + { + "epoch": 3.610900832702498, + "grad_norm": 2.0301623344421387, + "learning_rate": 3.695375683734941e-07, + "loss": 0.4544313848018646, + "step": 4770 + }, + { + "epoch": 3.6124148372445117, + "grad_norm": 0.05788467079401016, + "learning_rate": 3.6900315942487687e-07, + "loss": 0.4140000641345978, + "step": 4772 + }, + { + "epoch": 3.6139288417865254, + "grad_norm": 1.1849918365478516, + "learning_rate": 3.6847075805850934e-07, + "loss": 0.008691606111824512, + "step": 4774 + }, + { + "epoch": 3.615442846328539, + "grad_norm": 1.3880338668823242, + "learning_rate": 3.6794036510877343e-07, + "loss": 0.5095647573471069, + "step": 4776 + }, + { + "epoch": 3.6169568508705527, + "grad_norm": 12.818706512451172, + "learning_rate": 3.6741198140690205e-07, + "loss": 0.022655198350548744, + "step": 4778 + }, + { + "epoch": 3.618470855412566, + "grad_norm": 0.3411676585674286, + "learning_rate": 3.668856077809798e-07, + "loss": 0.46022307872772217, + "step": 4780 + }, + { + "epoch": 3.61998485995458, + "grad_norm": 0.4783356785774231, + "learning_rate": 3.663612450559414e-07, + "loss": 0.04059567674994469, + "step": 4782 + }, + { + "epoch": 3.6214988644965933, + "grad_norm": 1.2483948469161987, + "learning_rate": 3.658388940535696e-07, + "loss": 0.5810346007347107, + "step": 4784 + }, + { + "epoch": 3.623012869038607, + "grad_norm": 0.7853220701217651, + "learning_rate": 3.653185555924943e-07, + "loss": 0.42636728286743164, + "step": 4786 + }, + { + "epoch": 3.6245268735806206, + "grad_norm": 2.4304869174957275, + "learning_rate": 3.6480023048819196e-07, + "loss": 0.8784370422363281, + "step": 4788 + }, + { + "epoch": 3.6260408781226343, + "grad_norm": 1.00038480758667, + "learning_rate": 3.64283919552983e-07, + "loss": 0.05079210549592972, + "step": 4790 + }, + { + "epoch": 3.627554882664648, + "grad_norm": 0.6209474205970764, + "learning_rate": 3.637696235960316e-07, + "loss": 0.21178439259529114, + "step": 4792 + }, + { + "epoch": 3.6290688872066617, + "grad_norm": 1.2060880661010742, + "learning_rate": 3.632573434233438e-07, + "loss": 0.47263360023498535, + "step": 4794 + }, + { + "epoch": 3.6305828917486753, + "grad_norm": 1.5786904096603394, + "learning_rate": 3.6274707983776723e-07, + "loss": 0.8546769618988037, + "step": 4796 + }, + { + "epoch": 3.632096896290689, + "grad_norm": 0.7086766958236694, + "learning_rate": 3.622388336389881e-07, + "loss": 0.4730693995952606, + "step": 4798 + }, + { + "epoch": 3.6336109008327027, + "grad_norm": 2.0723345279693604, + "learning_rate": 3.6173260562353163e-07, + "loss": 0.19019421935081482, + "step": 4800 + }, + { + "epoch": 3.635124905374716, + "grad_norm": 2.0880532264709473, + "learning_rate": 3.6122839658475964e-07, + "loss": 0.7235720157623291, + "step": 4802 + }, + { + "epoch": 3.63663890991673, + "grad_norm": 1.0905261039733887, + "learning_rate": 3.6072620731287066e-07, + "loss": 0.3838179409503937, + "step": 4804 + }, + { + "epoch": 3.6381529144587432, + "grad_norm": 0.4263961911201477, + "learning_rate": 3.6022603859489704e-07, + "loss": 0.09932809323072433, + "step": 4806 + }, + { + "epoch": 3.639666919000757, + "grad_norm": 0.6843163371086121, + "learning_rate": 3.597278912147044e-07, + "loss": 0.03752122446894646, + "step": 4808 + }, + { + "epoch": 3.6411809235427706, + "grad_norm": 0.8009296655654907, + "learning_rate": 3.592317659529913e-07, + "loss": 0.5528997182846069, + "step": 4810 + }, + { + "epoch": 3.6426949280847842, + "grad_norm": 0.005887988954782486, + "learning_rate": 3.5873766358728684e-07, + "loss": 0.021872470155358315, + "step": 4812 + }, + { + "epoch": 3.644208932626798, + "grad_norm": 1.266884684562683, + "learning_rate": 3.582455848919496e-07, + "loss": 0.41149476170539856, + "step": 4814 + }, + { + "epoch": 3.6457229371688116, + "grad_norm": 1.6484144926071167, + "learning_rate": 3.577555306381666e-07, + "loss": 0.6315127015113831, + "step": 4816 + }, + { + "epoch": 3.6472369417108252, + "grad_norm": 1.2175683975219727, + "learning_rate": 3.5726750159395296e-07, + "loss": 0.7806538939476013, + "step": 4818 + }, + { + "epoch": 3.648750946252839, + "grad_norm": 1.5449774265289307, + "learning_rate": 3.5678149852414884e-07, + "loss": 0.36743658781051636, + "step": 4820 + }, + { + "epoch": 3.6502649507948526, + "grad_norm": 2.437978744506836, + "learning_rate": 3.5629752219042015e-07, + "loss": 0.3218212127685547, + "step": 4822 + }, + { + "epoch": 3.651778955336866, + "grad_norm": 2.3048338890075684, + "learning_rate": 3.558155733512557e-07, + "loss": 0.4483577311038971, + "step": 4824 + }, + { + "epoch": 3.6532929598788795, + "grad_norm": 1.678977608680725, + "learning_rate": 3.5533565276196766e-07, + "loss": 0.11367465555667877, + "step": 4826 + }, + { + "epoch": 3.654806964420893, + "grad_norm": 0.47975626587867737, + "learning_rate": 3.5485776117468887e-07, + "loss": 0.04281741380691528, + "step": 4828 + }, + { + "epoch": 3.656320968962907, + "grad_norm": 1.2723637819290161, + "learning_rate": 3.543818993383724e-07, + "loss": 0.8528371453285217, + "step": 4830 + }, + { + "epoch": 3.6578349735049205, + "grad_norm": 0.05294456332921982, + "learning_rate": 3.5390806799879065e-07, + "loss": 0.42165085673332214, + "step": 4832 + }, + { + "epoch": 3.659348978046934, + "grad_norm": 1.1577458381652832, + "learning_rate": 3.534362678985334e-07, + "loss": 0.37871477007865906, + "step": 4834 + }, + { + "epoch": 3.660862982588948, + "grad_norm": 0.32841092348098755, + "learning_rate": 3.529664997770074e-07, + "loss": 0.11999566107988358, + "step": 4836 + }, + { + "epoch": 3.6623769871309615, + "grad_norm": 1.034641981124878, + "learning_rate": 3.524987643704343e-07, + "loss": 0.7581790685653687, + "step": 4838 + }, + { + "epoch": 3.663890991672975, + "grad_norm": 1.251404881477356, + "learning_rate": 3.5203306241185103e-07, + "loss": 0.33820396661758423, + "step": 4840 + }, + { + "epoch": 3.6654049962149884, + "grad_norm": 1.7323718070983887, + "learning_rate": 3.515693946311069e-07, + "loss": 0.0171514842659235, + "step": 4842 + }, + { + "epoch": 3.6669190007570025, + "grad_norm": 0.4067513644695282, + "learning_rate": 3.511077617548632e-07, + "loss": 0.00200492306612432, + "step": 4844 + }, + { + "epoch": 3.6684330052990157, + "grad_norm": 0.6219225525856018, + "learning_rate": 3.506481645065932e-07, + "loss": 0.4262062609195709, + "step": 4846 + }, + { + "epoch": 3.6699470098410294, + "grad_norm": 1.2826002836227417, + "learning_rate": 3.501906036065784e-07, + "loss": 0.38665691018104553, + "step": 4848 + }, + { + "epoch": 3.671461014383043, + "grad_norm": 1.7009577751159668, + "learning_rate": 3.497350797719101e-07, + "loss": 0.1622554063796997, + "step": 4850 + }, + { + "epoch": 3.6729750189250567, + "grad_norm": 1.2407199144363403, + "learning_rate": 3.4928159371648635e-07, + "loss": 0.3439951539039612, + "step": 4852 + }, + { + "epoch": 3.6744890234670704, + "grad_norm": 0.6743606925010681, + "learning_rate": 3.488301461510123e-07, + "loss": 0.46227577328681946, + "step": 4854 + }, + { + "epoch": 3.676003028009084, + "grad_norm": 1.5045247077941895, + "learning_rate": 3.4838073778299776e-07, + "loss": 0.6379145383834839, + "step": 4856 + }, + { + "epoch": 3.6775170325510977, + "grad_norm": 1.7786139249801636, + "learning_rate": 3.4793336931675693e-07, + "loss": 0.260003000497818, + "step": 4858 + }, + { + "epoch": 3.6790310370931114, + "grad_norm": 1.5225580930709839, + "learning_rate": 3.4748804145340693e-07, + "loss": 0.6078323125839233, + "step": 4860 + }, + { + "epoch": 3.680545041635125, + "grad_norm": 1.459561824798584, + "learning_rate": 3.470447548908672e-07, + "loss": 0.49153566360473633, + "step": 4862 + }, + { + "epoch": 3.6820590461771383, + "grad_norm": 0.3002374768257141, + "learning_rate": 3.466035103238579e-07, + "loss": 0.11484368145465851, + "step": 4864 + }, + { + "epoch": 3.6835730507191524, + "grad_norm": 3.1435353755950928, + "learning_rate": 3.461643084438984e-07, + "loss": 0.4672025740146637, + "step": 4866 + }, + { + "epoch": 3.6850870552611656, + "grad_norm": 3.5278983116149902, + "learning_rate": 3.4572714993930797e-07, + "loss": 0.8416491150856018, + "step": 4868 + }, + { + "epoch": 3.6866010598031793, + "grad_norm": 2.033142328262329, + "learning_rate": 3.4529203549520226e-07, + "loss": 0.13594593107700348, + "step": 4870 + }, + { + "epoch": 3.688115064345193, + "grad_norm": 0.14393934607505798, + "learning_rate": 3.4485896579349415e-07, + "loss": 0.008193666115403175, + "step": 4872 + }, + { + "epoch": 3.6896290688872067, + "grad_norm": 1.3200061321258545, + "learning_rate": 3.444279415128919e-07, + "loss": 0.08467790484428406, + "step": 4874 + }, + { + "epoch": 3.6911430734292203, + "grad_norm": 1.435538411140442, + "learning_rate": 3.43998963328898e-07, + "loss": 0.7755026817321777, + "step": 4876 + }, + { + "epoch": 3.692657077971234, + "grad_norm": 1.7662804126739502, + "learning_rate": 3.435720319138084e-07, + "loss": 0.8459671139717102, + "step": 4878 + }, + { + "epoch": 3.6941710825132477, + "grad_norm": 0.06649971008300781, + "learning_rate": 3.431471479367113e-07, + "loss": 0.38383612036705017, + "step": 4880 + }, + { + "epoch": 3.6956850870552613, + "grad_norm": 0.027875706553459167, + "learning_rate": 3.4272431206348635e-07, + "loss": 0.35727202892303467, + "step": 4882 + }, + { + "epoch": 3.697199091597275, + "grad_norm": 1.2968356609344482, + "learning_rate": 3.4230352495680335e-07, + "loss": 0.9672133922576904, + "step": 4884 + }, + { + "epoch": 3.6987130961392882, + "grad_norm": 1.6936166286468506, + "learning_rate": 3.4188478727612087e-07, + "loss": 0.8826351165771484, + "step": 4886 + }, + { + "epoch": 3.700227100681302, + "grad_norm": 1.757140874862671, + "learning_rate": 3.4146809967768595e-07, + "loss": 0.864463210105896, + "step": 4888 + }, + { + "epoch": 3.7017411052233156, + "grad_norm": 2.8798224925994873, + "learning_rate": 3.41053462814533e-07, + "loss": 0.008782775141298771, + "step": 4890 + }, + { + "epoch": 3.7032551097653292, + "grad_norm": 1.3960527181625366, + "learning_rate": 3.4064087733648184e-07, + "loss": 0.8980627059936523, + "step": 4892 + }, + { + "epoch": 3.704769114307343, + "grad_norm": 1.2567702531814575, + "learning_rate": 3.40230343890138e-07, + "loss": 0.758456289768219, + "step": 4894 + }, + { + "epoch": 3.7062831188493566, + "grad_norm": 0.1362496018409729, + "learning_rate": 3.3982186311889046e-07, + "loss": 0.4230273962020874, + "step": 4896 + }, + { + "epoch": 3.7077971233913702, + "grad_norm": 0.48844024538993835, + "learning_rate": 3.3941543566291193e-07, + "loss": 0.026354506611824036, + "step": 4898 + }, + { + "epoch": 3.709311127933384, + "grad_norm": 1.114485263824463, + "learning_rate": 3.390110621591566e-07, + "loss": 0.33992433547973633, + "step": 4900 + }, + { + "epoch": 3.7108251324753976, + "grad_norm": 1.13433837890625, + "learning_rate": 3.3860874324135974e-07, + "loss": 0.45256906747817993, + "step": 4902 + }, + { + "epoch": 3.712339137017411, + "grad_norm": 1.7313382625579834, + "learning_rate": 3.3820847954003713e-07, + "loss": 0.012239966541528702, + "step": 4904 + }, + { + "epoch": 3.713853141559425, + "grad_norm": 1.3510212898254395, + "learning_rate": 3.378102716824832e-07, + "loss": 0.06442499160766602, + "step": 4906 + }, + { + "epoch": 3.715367146101438, + "grad_norm": 2.4342682361602783, + "learning_rate": 3.3741412029277034e-07, + "loss": 0.4134178161621094, + "step": 4908 + }, + { + "epoch": 3.716881150643452, + "grad_norm": 0.33617573976516724, + "learning_rate": 3.370200259917483e-07, + "loss": 0.3299219012260437, + "step": 4910 + }, + { + "epoch": 3.7183951551854655, + "grad_norm": 1.1140594482421875, + "learning_rate": 3.3662798939704323e-07, + "loss": 0.020723015069961548, + "step": 4912 + }, + { + "epoch": 3.719909159727479, + "grad_norm": 1.1104437112808228, + "learning_rate": 3.3623801112305577e-07, + "loss": 0.09151849895715714, + "step": 4914 + }, + { + "epoch": 3.721423164269493, + "grad_norm": 0.526814341545105, + "learning_rate": 3.3585009178096114e-07, + "loss": 0.01095789484679699, + "step": 4916 + }, + { + "epoch": 3.7229371688115065, + "grad_norm": 0.06857289373874664, + "learning_rate": 3.3546423197870793e-07, + "loss": 0.4291396737098694, + "step": 4918 + }, + { + "epoch": 3.72445117335352, + "grad_norm": 2.0456442832946777, + "learning_rate": 3.350804323210165e-07, + "loss": 0.31084686517715454, + "step": 4920 + }, + { + "epoch": 3.725965177895534, + "grad_norm": 0.12176215648651123, + "learning_rate": 3.3469869340937915e-07, + "loss": 0.057937733829021454, + "step": 4922 + }, + { + "epoch": 3.7274791824375475, + "grad_norm": 0.48398348689079285, + "learning_rate": 3.3431901584205834e-07, + "loss": 0.45625340938568115, + "step": 4924 + }, + { + "epoch": 3.7289931869795607, + "grad_norm": 1.5193073749542236, + "learning_rate": 3.3394140021408555e-07, + "loss": 0.3990243375301361, + "step": 4926 + }, + { + "epoch": 3.730507191521575, + "grad_norm": 3.3346807956695557, + "learning_rate": 3.335658471172615e-07, + "loss": 0.4398367702960968, + "step": 4928 + }, + { + "epoch": 3.732021196063588, + "grad_norm": 1.239586591720581, + "learning_rate": 3.3319235714015426e-07, + "loss": 0.07557009160518646, + "step": 4930 + }, + { + "epoch": 3.7335352006056017, + "grad_norm": 0.1686035841703415, + "learning_rate": 3.328209308680981e-07, + "loss": 0.4428555965423584, + "step": 4932 + }, + { + "epoch": 3.7350492051476154, + "grad_norm": 0.33270350098609924, + "learning_rate": 3.324515688831939e-07, + "loss": 0.01723654568195343, + "step": 4934 + }, + { + "epoch": 3.736563209689629, + "grad_norm": 1.8708503246307373, + "learning_rate": 3.320842717643066e-07, + "loss": 0.46671631932258606, + "step": 4936 + }, + { + "epoch": 3.7380772142316427, + "grad_norm": 2.122084856033325, + "learning_rate": 3.317190400870653e-07, + "loss": 0.8873423933982849, + "step": 4938 + }, + { + "epoch": 3.7395912187736564, + "grad_norm": 2.7678170204162598, + "learning_rate": 3.3135587442386263e-07, + "loss": 0.4686410427093506, + "step": 4940 + }, + { + "epoch": 3.74110522331567, + "grad_norm": 1.3442670106887817, + "learning_rate": 3.309947753438528e-07, + "loss": 0.4087761342525482, + "step": 4942 + }, + { + "epoch": 3.7426192278576833, + "grad_norm": 0.1441280096769333, + "learning_rate": 3.3063574341295124e-07, + "loss": 0.14754869043827057, + "step": 4944 + }, + { + "epoch": 3.7441332323996974, + "grad_norm": 1.3364726305007935, + "learning_rate": 3.302787791938339e-07, + "loss": 0.47264066338539124, + "step": 4946 + }, + { + "epoch": 3.7456472369417106, + "grad_norm": 0.6875699162483215, + "learning_rate": 3.299238832459364e-07, + "loss": 0.010902936570346355, + "step": 4948 + }, + { + "epoch": 3.7471612414837243, + "grad_norm": 0.6394464373588562, + "learning_rate": 3.2957105612545275e-07, + "loss": 0.2788526117801666, + "step": 4950 + }, + { + "epoch": 3.748675246025738, + "grad_norm": 0.2793129086494446, + "learning_rate": 3.292202983853347e-07, + "loss": 0.45828747749328613, + "step": 4952 + }, + { + "epoch": 3.7501892505677517, + "grad_norm": 1.1479758024215698, + "learning_rate": 3.2887161057529067e-07, + "loss": 0.3941059708595276, + "step": 4954 + }, + { + "epoch": 3.7517032551097653, + "grad_norm": 1.5251473188400269, + "learning_rate": 3.2852499324178554e-07, + "loss": 0.7015395164489746, + "step": 4956 + }, + { + "epoch": 3.753217259651779, + "grad_norm": 1.719663381576538, + "learning_rate": 3.281804469280392e-07, + "loss": 0.8031244874000549, + "step": 4958 + }, + { + "epoch": 3.7547312641937927, + "grad_norm": 0.2364034503698349, + "learning_rate": 3.2783797217402525e-07, + "loss": 0.3604779839515686, + "step": 4960 + }, + { + "epoch": 3.7562452687358063, + "grad_norm": 0.6598563194274902, + "learning_rate": 3.274975695164716e-07, + "loss": 0.3032711446285248, + "step": 4962 + }, + { + "epoch": 3.75775927327782, + "grad_norm": 0.27643054723739624, + "learning_rate": 3.271592394888583e-07, + "loss": 0.03242190554738045, + "step": 4964 + }, + { + "epoch": 3.7592732778198332, + "grad_norm": 1.4321353435516357, + "learning_rate": 3.2682298262141696e-07, + "loss": 0.9903982877731323, + "step": 4966 + }, + { + "epoch": 3.7607872823618473, + "grad_norm": 0.7925037145614624, + "learning_rate": 3.264887994411306e-07, + "loss": 0.031347546726465225, + "step": 4968 + }, + { + "epoch": 3.7623012869038606, + "grad_norm": 0.11465443670749664, + "learning_rate": 3.2615669047173225e-07, + "loss": 0.43080469965934753, + "step": 4970 + }, + { + "epoch": 3.7638152914458742, + "grad_norm": 0.3840098977088928, + "learning_rate": 3.2582665623370385e-07, + "loss": 0.41830727458000183, + "step": 4972 + }, + { + "epoch": 3.765329295987888, + "grad_norm": 0.5051982998847961, + "learning_rate": 3.2549869724427634e-07, + "loss": 0.034080203622579575, + "step": 4974 + }, + { + "epoch": 3.7668433005299016, + "grad_norm": 2.4309990406036377, + "learning_rate": 3.251728140174279e-07, + "loss": 0.8621931672096252, + "step": 4976 + }, + { + "epoch": 3.7683573050719152, + "grad_norm": 1.0540837049484253, + "learning_rate": 3.248490070638837e-07, + "loss": 0.6200702786445618, + "step": 4978 + }, + { + "epoch": 3.769871309613929, + "grad_norm": 1.4778248071670532, + "learning_rate": 3.2452727689111543e-07, + "loss": 0.4134140610694885, + "step": 4980 + }, + { + "epoch": 3.7713853141559426, + "grad_norm": 0.19006557762622833, + "learning_rate": 3.2420762400333903e-07, + "loss": 0.451352059841156, + "step": 4982 + }, + { + "epoch": 3.7728993186979563, + "grad_norm": 2.122025489807129, + "learning_rate": 3.238900489015158e-07, + "loss": 0.4227078855037689, + "step": 4984 + }, + { + "epoch": 3.77441332323997, + "grad_norm": 1.944567322731018, + "learning_rate": 3.235745520833506e-07, + "loss": 0.4337266683578491, + "step": 4986 + }, + { + "epoch": 3.775927327781983, + "grad_norm": 0.41088730096817017, + "learning_rate": 3.232611340432908e-07, + "loss": 0.4283662438392639, + "step": 4988 + }, + { + "epoch": 3.777441332323997, + "grad_norm": 2.9886157512664795, + "learning_rate": 3.22949795272526e-07, + "loss": 0.11887612193822861, + "step": 4990 + }, + { + "epoch": 3.7789553368660105, + "grad_norm": 4.230180263519287, + "learning_rate": 3.226405362589877e-07, + "loss": 0.41138648986816406, + "step": 4992 + }, + { + "epoch": 3.780469341408024, + "grad_norm": 0.07120750844478607, + "learning_rate": 3.2233335748734724e-07, + "loss": 0.44773682951927185, + "step": 4994 + }, + { + "epoch": 3.781983345950038, + "grad_norm": 0.03910240903496742, + "learning_rate": 3.220282594390161e-07, + "loss": 0.0012030932120978832, + "step": 4996 + }, + { + "epoch": 3.7834973504920515, + "grad_norm": 4.673849582672119, + "learning_rate": 3.2172524259214534e-07, + "loss": 0.3496311604976654, + "step": 4998 + }, + { + "epoch": 3.785011355034065, + "grad_norm": 0.4953646659851074, + "learning_rate": 3.2142430742162355e-07, + "loss": 0.0004449788830243051, + "step": 5000 + }, + { + "epoch": 3.786525359576079, + "grad_norm": 1.1354345083236694, + "learning_rate": 3.211254543990774e-07, + "loss": 0.0014750072732567787, + "step": 5002 + }, + { + "epoch": 3.7880393641180925, + "grad_norm": 1.3143340349197388, + "learning_rate": 3.208286839928701e-07, + "loss": 0.8150073885917664, + "step": 5004 + }, + { + "epoch": 3.7895533686601057, + "grad_norm": 0.048394881188869476, + "learning_rate": 3.2053399666810133e-07, + "loss": 0.4464598596096039, + "step": 5006 + }, + { + "epoch": 3.79106737320212, + "grad_norm": 0.7394788265228271, + "learning_rate": 3.2024139288660594e-07, + "loss": 0.09451478719711304, + "step": 5008 + }, + { + "epoch": 3.792581377744133, + "grad_norm": 0.05495171993970871, + "learning_rate": 3.1995087310695357e-07, + "loss": 0.003531100694090128, + "step": 5010 + }, + { + "epoch": 3.7940953822861467, + "grad_norm": 1.2948366403579712, + "learning_rate": 3.196624377844476e-07, + "loss": 0.008393335156142712, + "step": 5012 + }, + { + "epoch": 3.7956093868281604, + "grad_norm": 1.4162871837615967, + "learning_rate": 3.1937608737112493e-07, + "loss": 0.48247841000556946, + "step": 5014 + }, + { + "epoch": 3.797123391370174, + "grad_norm": 0.5562068223953247, + "learning_rate": 3.190918223157546e-07, + "loss": 0.4237671196460724, + "step": 5016 + }, + { + "epoch": 3.7986373959121877, + "grad_norm": 0.5564634799957275, + "learning_rate": 3.188096430638377e-07, + "loss": 0.0319337323307991, + "step": 5018 + }, + { + "epoch": 3.8001514004542014, + "grad_norm": 1.1125496625900269, + "learning_rate": 3.1852955005760665e-07, + "loss": 0.4008614420890808, + "step": 5020 + }, + { + "epoch": 3.801665404996215, + "grad_norm": 0.48999980092048645, + "learning_rate": 3.1825154373602416e-07, + "loss": 0.4390028119087219, + "step": 5022 + }, + { + "epoch": 3.8031794095382288, + "grad_norm": 1.588819980621338, + "learning_rate": 3.179756245347822e-07, + "loss": 0.4818562865257263, + "step": 5024 + }, + { + "epoch": 3.8046934140802424, + "grad_norm": 0.07910662144422531, + "learning_rate": 3.1770179288630244e-07, + "loss": 0.172007754445076, + "step": 5026 + }, + { + "epoch": 3.8062074186222556, + "grad_norm": 0.39034202694892883, + "learning_rate": 3.174300492197346e-07, + "loss": 0.0110551118850708, + "step": 5028 + }, + { + "epoch": 3.8077214231642698, + "grad_norm": 1.754652976989746, + "learning_rate": 3.1716039396095646e-07, + "loss": 0.38824954628944397, + "step": 5030 + }, + { + "epoch": 3.809235427706283, + "grad_norm": 0.4586925506591797, + "learning_rate": 3.168928275325724e-07, + "loss": 0.377737432718277, + "step": 5032 + }, + { + "epoch": 3.8107494322482967, + "grad_norm": 1.6731964349746704, + "learning_rate": 3.1662735035391325e-07, + "loss": 0.4779408872127533, + "step": 5034 + }, + { + "epoch": 3.8122634367903103, + "grad_norm": 1.218449592590332, + "learning_rate": 3.1636396284103594e-07, + "loss": 0.7690878510475159, + "step": 5036 + }, + { + "epoch": 3.813777441332324, + "grad_norm": 2.97236967086792, + "learning_rate": 3.16102665406722e-07, + "loss": 0.8440889120101929, + "step": 5038 + }, + { + "epoch": 3.8152914458743377, + "grad_norm": 20.792285919189453, + "learning_rate": 3.1584345846047765e-07, + "loss": 0.509138822555542, + "step": 5040 + }, + { + "epoch": 3.8168054504163513, + "grad_norm": 1.915735125541687, + "learning_rate": 3.155863424085327e-07, + "loss": 0.4671858847141266, + "step": 5042 + }, + { + "epoch": 3.818319454958365, + "grad_norm": 1.17746102809906, + "learning_rate": 3.1533131765384063e-07, + "loss": 0.4532623887062073, + "step": 5044 + }, + { + "epoch": 3.8198334595003782, + "grad_norm": 1.118971586227417, + "learning_rate": 3.150783845960765e-07, + "loss": 0.45628684759140015, + "step": 5046 + }, + { + "epoch": 3.8213474640423923, + "grad_norm": 0.5616589188575745, + "learning_rate": 3.148275436316381e-07, + "loss": 0.30977633595466614, + "step": 5048 + }, + { + "epoch": 3.8228614685844056, + "grad_norm": 1.131917119026184, + "learning_rate": 3.145787951536441e-07, + "loss": 0.29055628180503845, + "step": 5050 + }, + { + "epoch": 3.8243754731264192, + "grad_norm": 0.7852630615234375, + "learning_rate": 3.1433213955193404e-07, + "loss": 0.43340763449668884, + "step": 5052 + }, + { + "epoch": 3.825889477668433, + "grad_norm": 1.758888602256775, + "learning_rate": 3.1408757721306693e-07, + "loss": 0.8919140696525574, + "step": 5054 + }, + { + "epoch": 3.8274034822104466, + "grad_norm": 0.9975840449333191, + "learning_rate": 3.1384510852032213e-07, + "loss": 0.010779071599245071, + "step": 5056 + }, + { + "epoch": 3.8289174867524602, + "grad_norm": 0.16059216856956482, + "learning_rate": 3.1360473385369705e-07, + "loss": 0.014391905628144741, + "step": 5058 + }, + { + "epoch": 3.830431491294474, + "grad_norm": 12.744454383850098, + "learning_rate": 3.133664535899078e-07, + "loss": 0.7757210731506348, + "step": 5060 + }, + { + "epoch": 3.8319454958364876, + "grad_norm": 1.00156831741333, + "learning_rate": 3.131302681023877e-07, + "loss": 0.7180578112602234, + "step": 5062 + }, + { + "epoch": 3.8334595003785013, + "grad_norm": 1.8191956281661987, + "learning_rate": 3.128961777612876e-07, + "loss": 0.09756748378276825, + "step": 5064 + }, + { + "epoch": 3.834973504920515, + "grad_norm": 7.763867378234863, + "learning_rate": 3.1266418293347464e-07, + "loss": 0.5316522717475891, + "step": 5066 + }, + { + "epoch": 3.836487509462528, + "grad_norm": 0.5803042054176331, + "learning_rate": 3.124342839825314e-07, + "loss": 0.012462900951504707, + "step": 5068 + }, + { + "epoch": 3.8380015140045423, + "grad_norm": 2.1896307468414307, + "learning_rate": 3.122064812687568e-07, + "loss": 0.6004921197891235, + "step": 5070 + }, + { + "epoch": 3.8395155185465555, + "grad_norm": 0.1068381518125534, + "learning_rate": 3.1198077514916374e-07, + "loss": 0.24291282892227173, + "step": 5072 + }, + { + "epoch": 3.841029523088569, + "grad_norm": 2.504046678543091, + "learning_rate": 3.1175716597747953e-07, + "loss": 0.3830157518386841, + "step": 5074 + }, + { + "epoch": 3.842543527630583, + "grad_norm": 7.473776340484619, + "learning_rate": 3.1153565410414514e-07, + "loss": 0.4516468644142151, + "step": 5076 + }, + { + "epoch": 3.8440575321725965, + "grad_norm": 0.055708762258291245, + "learning_rate": 3.113162398763148e-07, + "loss": 0.3676786422729492, + "step": 5078 + }, + { + "epoch": 3.84557153671461, + "grad_norm": 2.0055947303771973, + "learning_rate": 3.110989236378549e-07, + "loss": 0.6928231120109558, + "step": 5080 + }, + { + "epoch": 3.847085541256624, + "grad_norm": 1.0609912872314453, + "learning_rate": 3.108837057293445e-07, + "loss": 0.004521830938756466, + "step": 5082 + }, + { + "epoch": 3.8485995457986375, + "grad_norm": 0.22726251184940338, + "learning_rate": 3.106705864880735e-07, + "loss": 0.44820210337638855, + "step": 5084 + }, + { + "epoch": 3.850113550340651, + "grad_norm": 0.17275147140026093, + "learning_rate": 3.1045956624804317e-07, + "loss": 0.44499069452285767, + "step": 5086 + }, + { + "epoch": 3.851627554882665, + "grad_norm": 1.3751364946365356, + "learning_rate": 3.1025064533996517e-07, + "loss": 0.41673925518989563, + "step": 5088 + }, + { + "epoch": 3.853141559424678, + "grad_norm": 1.9896053075790405, + "learning_rate": 3.1004382409126064e-07, + "loss": 0.47557222843170166, + "step": 5090 + }, + { + "epoch": 3.8546555639666917, + "grad_norm": 1.3645247220993042, + "learning_rate": 3.0983910282606083e-07, + "loss": 0.46344706416130066, + "step": 5092 + }, + { + "epoch": 3.8561695685087054, + "grad_norm": 3.7738399505615234, + "learning_rate": 3.096364818652055e-07, + "loss": 0.4740281403064728, + "step": 5094 + }, + { + "epoch": 3.857683573050719, + "grad_norm": 0.10791615396738052, + "learning_rate": 3.0943596152624284e-07, + "loss": 0.0011476953513920307, + "step": 5096 + }, + { + "epoch": 3.8591975775927327, + "grad_norm": 1.1051290035247803, + "learning_rate": 3.0923754212342886e-07, + "loss": 0.6011137366294861, + "step": 5098 + }, + { + "epoch": 3.8607115821347464, + "grad_norm": 1.1801726818084717, + "learning_rate": 3.0904122396772705e-07, + "loss": 0.004883904475718737, + "step": 5100 + }, + { + "epoch": 3.86222558667676, + "grad_norm": 2.525057077407837, + "learning_rate": 3.08847007366808e-07, + "loss": 0.44737234711647034, + "step": 5102 + }, + { + "epoch": 3.8637395912187738, + "grad_norm": 1.5845142602920532, + "learning_rate": 3.0865489262504844e-07, + "loss": 0.45857617259025574, + "step": 5104 + }, + { + "epoch": 3.8652535957607874, + "grad_norm": 0.3961471617221832, + "learning_rate": 3.0846488004353116e-07, + "loss": 0.015096918679773808, + "step": 5106 + }, + { + "epoch": 3.8667676003028006, + "grad_norm": 1.1940659284591675, + "learning_rate": 3.0827696992004456e-07, + "loss": 0.8227152824401855, + "step": 5108 + }, + { + "epoch": 3.8682816048448148, + "grad_norm": 0.9350260496139526, + "learning_rate": 3.0809116254908205e-07, + "loss": 0.8203873038291931, + "step": 5110 + }, + { + "epoch": 3.869795609386828, + "grad_norm": 1.5724537372589111, + "learning_rate": 3.079074582218412e-07, + "loss": 0.7634638547897339, + "step": 5112 + }, + { + "epoch": 3.8713096139288417, + "grad_norm": 3.372626304626465, + "learning_rate": 3.077258572262245e-07, + "loss": 0.5848556756973267, + "step": 5114 + }, + { + "epoch": 3.8728236184708553, + "grad_norm": 1.3702733516693115, + "learning_rate": 3.0754635984683733e-07, + "loss": 0.40469735860824585, + "step": 5116 + }, + { + "epoch": 3.874337623012869, + "grad_norm": 0.3992665708065033, + "learning_rate": 3.0736896636498844e-07, + "loss": 0.2156638205051422, + "step": 5118 + }, + { + "epoch": 3.8758516275548827, + "grad_norm": 0.9194819927215576, + "learning_rate": 3.0719367705868947e-07, + "loss": 0.08066434413194656, + "step": 5120 + }, + { + "epoch": 3.8773656320968963, + "grad_norm": 0.6540529131889343, + "learning_rate": 3.0702049220265463e-07, + "loss": 0.41603922843933105, + "step": 5122 + }, + { + "epoch": 3.87887963663891, + "grad_norm": 0.34057825803756714, + "learning_rate": 3.0684941206829945e-07, + "loss": 0.11957624554634094, + "step": 5124 + }, + { + "epoch": 3.8803936411809237, + "grad_norm": 0.31294089555740356, + "learning_rate": 3.0668043692374135e-07, + "loss": 0.0026283212937414646, + "step": 5126 + }, + { + "epoch": 3.8819076457229373, + "grad_norm": 0.4604136645793915, + "learning_rate": 3.06513567033799e-07, + "loss": 0.43892979621887207, + "step": 5128 + }, + { + "epoch": 3.8834216502649506, + "grad_norm": 1.3849968910217285, + "learning_rate": 3.063488026599908e-07, + "loss": 0.4195421040058136, + "step": 5130 + }, + { + "epoch": 3.8849356548069647, + "grad_norm": 1.155035376548767, + "learning_rate": 3.061861440605366e-07, + "loss": 0.3778454661369324, + "step": 5132 + }, + { + "epoch": 3.886449659348978, + "grad_norm": 0.16972775757312775, + "learning_rate": 3.060255914903552e-07, + "loss": 0.45963388681411743, + "step": 5134 + }, + { + "epoch": 3.8879636638909916, + "grad_norm": 1.0293406248092651, + "learning_rate": 3.05867145201065e-07, + "loss": 0.17590171098709106, + "step": 5136 + }, + { + "epoch": 3.8894776684330052, + "grad_norm": 0.7280426621437073, + "learning_rate": 3.057108054409837e-07, + "loss": 0.37898415327072144, + "step": 5138 + }, + { + "epoch": 3.890991672975019, + "grad_norm": 1.5102609395980835, + "learning_rate": 3.0555657245512737e-07, + "loss": 0.048597052693367004, + "step": 5140 + }, + { + "epoch": 3.8925056775170326, + "grad_norm": 2.5251283645629883, + "learning_rate": 3.0540444648521044e-07, + "loss": 0.42764487862586975, + "step": 5142 + }, + { + "epoch": 3.8940196820590463, + "grad_norm": 0.17893707752227783, + "learning_rate": 3.052544277696452e-07, + "loss": 0.03166920691728592, + "step": 5144 + }, + { + "epoch": 3.89553368660106, + "grad_norm": 0.9614020586013794, + "learning_rate": 3.0510651654354144e-07, + "loss": 0.07043096423149109, + "step": 5146 + }, + { + "epoch": 3.8970476911430736, + "grad_norm": 1.7453250885009766, + "learning_rate": 3.0496071303870617e-07, + "loss": 0.024295624345541, + "step": 5148 + }, + { + "epoch": 3.8985616956850873, + "grad_norm": 1.2456791400909424, + "learning_rate": 3.0481701748364293e-07, + "loss": 0.770180344581604, + "step": 5150 + }, + { + "epoch": 3.9000757002271005, + "grad_norm": 1.7926623821258545, + "learning_rate": 3.04675430103552e-07, + "loss": 0.7948777675628662, + "step": 5152 + }, + { + "epoch": 3.901589704769114, + "grad_norm": 1.1297966241836548, + "learning_rate": 3.045359511203297e-07, + "loss": 0.36591821908950806, + "step": 5154 + }, + { + "epoch": 3.903103709311128, + "grad_norm": 0.4602949619293213, + "learning_rate": 3.043985807525675e-07, + "loss": 0.4371163249015808, + "step": 5156 + }, + { + "epoch": 3.9046177138531415, + "grad_norm": 1.1267192363739014, + "learning_rate": 3.04263319215553e-07, + "loss": 0.07285796850919724, + "step": 5158 + }, + { + "epoch": 3.906131718395155, + "grad_norm": 3.856311082839966, + "learning_rate": 3.041301667212684e-07, + "loss": 0.49703073501586914, + "step": 5160 + }, + { + "epoch": 3.907645722937169, + "grad_norm": 0.7211084961891174, + "learning_rate": 3.039991234783906e-07, + "loss": 0.01246828492730856, + "step": 5162 + }, + { + "epoch": 3.9091597274791825, + "grad_norm": 1.4662927389144897, + "learning_rate": 3.0387018969229133e-07, + "loss": 0.02732657454907894, + "step": 5164 + }, + { + "epoch": 3.910673732021196, + "grad_norm": 1.3582857847213745, + "learning_rate": 3.0374336556503574e-07, + "loss": 0.849976122379303, + "step": 5166 + }, + { + "epoch": 3.91218773656321, + "grad_norm": 1.5621212720870972, + "learning_rate": 3.0361865129538293e-07, + "loss": 0.6532884240150452, + "step": 5168 + }, + { + "epoch": 3.913701741105223, + "grad_norm": 1.8237923383712769, + "learning_rate": 3.0349604707878565e-07, + "loss": 0.16516126692295074, + "step": 5170 + }, + { + "epoch": 3.915215745647237, + "grad_norm": 2.2848901748657227, + "learning_rate": 3.033755531073897e-07, + "loss": 0.2639697790145874, + "step": 5172 + }, + { + "epoch": 3.9167297501892504, + "grad_norm": 2.287353277206421, + "learning_rate": 3.0325716957003346e-07, + "loss": 0.37060511112213135, + "step": 5174 + }, + { + "epoch": 3.918243754731264, + "grad_norm": 1.4160830974578857, + "learning_rate": 3.03140896652248e-07, + "loss": 0.889275312423706, + "step": 5176 + }, + { + "epoch": 3.9197577592732777, + "grad_norm": 1.3192116022109985, + "learning_rate": 3.0302673453625645e-07, + "loss": 0.46674323081970215, + "step": 5178 + }, + { + "epoch": 3.9212717638152914, + "grad_norm": 0.06028641387820244, + "learning_rate": 3.0291468340097433e-07, + "loss": 0.0030056757386773825, + "step": 5180 + }, + { + "epoch": 3.922785768357305, + "grad_norm": 0.5621559619903564, + "learning_rate": 3.0280474342200826e-07, + "loss": 0.15215720236301422, + "step": 5182 + }, + { + "epoch": 3.9242997728993188, + "grad_norm": 1.154634952545166, + "learning_rate": 3.0269691477165676e-07, + "loss": 0.06476107239723206, + "step": 5184 + }, + { + "epoch": 3.9258137774413324, + "grad_norm": 1.1321563720703125, + "learning_rate": 3.025911976189091e-07, + "loss": 0.10336454212665558, + "step": 5186 + }, + { + "epoch": 3.927327781983346, + "grad_norm": 3.2537643909454346, + "learning_rate": 3.024875921294456e-07, + "loss": 0.327021062374115, + "step": 5188 + }, + { + "epoch": 3.9288417865253598, + "grad_norm": 1.62344491481781, + "learning_rate": 3.0238609846563696e-07, + "loss": 0.4893304109573364, + "step": 5190 + }, + { + "epoch": 3.930355791067373, + "grad_norm": 1.2770984172821045, + "learning_rate": 3.0228671678654446e-07, + "loss": 0.33251699805259705, + "step": 5192 + }, + { + "epoch": 3.931869795609387, + "grad_norm": 1.6710693836212158, + "learning_rate": 3.0218944724791933e-07, + "loss": 0.5169814229011536, + "step": 5194 + }, + { + "epoch": 3.9333838001514003, + "grad_norm": 0.5025178790092468, + "learning_rate": 3.020942900022029e-07, + "loss": 0.0014017641078680754, + "step": 5196 + }, + { + "epoch": 3.934897804693414, + "grad_norm": 0.7163488864898682, + "learning_rate": 3.020012451985256e-07, + "loss": 0.03129759058356285, + "step": 5198 + }, + { + "epoch": 3.9364118092354277, + "grad_norm": 4.61578369140625, + "learning_rate": 3.019103129827078e-07, + "loss": 0.053232885897159576, + "step": 5200 + }, + { + "epoch": 3.9379258137774413, + "grad_norm": 1.6417152881622314, + "learning_rate": 3.018214934972586e-07, + "loss": 0.4831180274486542, + "step": 5202 + }, + { + "epoch": 3.939439818319455, + "grad_norm": 0.5890576243400574, + "learning_rate": 3.0173478688137626e-07, + "loss": 0.05670785903930664, + "step": 5204 + }, + { + "epoch": 3.9409538228614687, + "grad_norm": 0.5211370587348938, + "learning_rate": 3.0165019327094754e-07, + "loss": 0.44620001316070557, + "step": 5206 + }, + { + "epoch": 3.9424678274034823, + "grad_norm": 0.22458039224147797, + "learning_rate": 3.0156771279854786e-07, + "loss": 0.38849005103111267, + "step": 5208 + }, + { + "epoch": 3.9439818319454956, + "grad_norm": 0.11804883927106857, + "learning_rate": 3.0148734559344096e-07, + "loss": 0.003378600114956498, + "step": 5210 + }, + { + "epoch": 3.9454958364875097, + "grad_norm": 1.418165683746338, + "learning_rate": 3.0140909178157846e-07, + "loss": 0.4826388955116272, + "step": 5212 + }, + { + "epoch": 3.947009841029523, + "grad_norm": 0.005847345106303692, + "learning_rate": 3.0133295148559994e-07, + "loss": 0.24013184010982513, + "step": 5214 + }, + { + "epoch": 3.9485238455715366, + "grad_norm": 1.4362314939498901, + "learning_rate": 3.0125892482483296e-07, + "loss": 0.5238326191902161, + "step": 5216 + }, + { + "epoch": 3.9500378501135502, + "grad_norm": 0.4979008436203003, + "learning_rate": 3.0118701191529207e-07, + "loss": 0.019607849419116974, + "step": 5218 + }, + { + "epoch": 3.951551854655564, + "grad_norm": 0.9970922470092773, + "learning_rate": 3.011172128696795e-07, + "loss": 0.6862800717353821, + "step": 5220 + }, + { + "epoch": 3.9530658591975776, + "grad_norm": 1.9834884405136108, + "learning_rate": 3.0104952779738463e-07, + "loss": 0.12774519622325897, + "step": 5222 + }, + { + "epoch": 3.9545798637395913, + "grad_norm": 3.775111675262451, + "learning_rate": 3.0098395680448356e-07, + "loss": 0.9710557460784912, + "step": 5224 + }, + { + "epoch": 3.956093868281605, + "grad_norm": 1.23189115524292, + "learning_rate": 3.0092049999373956e-07, + "loss": 0.4356076419353485, + "step": 5226 + }, + { + "epoch": 3.9576078728236186, + "grad_norm": 0.1431012898683548, + "learning_rate": 3.00859157464602e-07, + "loss": 0.01511440146714449, + "step": 5228 + }, + { + "epoch": 3.9591218773656323, + "grad_norm": 1.6917046308517456, + "learning_rate": 3.0079992931320743e-07, + "loss": 0.18109403550624847, + "step": 5230 + }, + { + "epoch": 3.9606358819076455, + "grad_norm": 0.7762451171875, + "learning_rate": 3.0074281563237815e-07, + "loss": 0.02766716107726097, + "step": 5232 + }, + { + "epoch": 3.9621498864496596, + "grad_norm": 0.6131830215454102, + "learning_rate": 3.0068781651162303e-07, + "loss": 0.3837210536003113, + "step": 5234 + }, + { + "epoch": 3.963663890991673, + "grad_norm": 1.9030412435531616, + "learning_rate": 3.006349320371369e-07, + "loss": 0.4320211708545685, + "step": 5236 + }, + { + "epoch": 3.9651778955336865, + "grad_norm": 1.4596877098083496, + "learning_rate": 3.005841622918003e-07, + "loss": 0.6626247763633728, + "step": 5238 + }, + { + "epoch": 3.9666919000757, + "grad_norm": 1.9642925262451172, + "learning_rate": 3.0053550735517994e-07, + "loss": 0.9146741032600403, + "step": 5240 + }, + { + "epoch": 3.968205904617714, + "grad_norm": 1.2844886779785156, + "learning_rate": 3.004889673035278e-07, + "loss": 0.42892059683799744, + "step": 5242 + }, + { + "epoch": 3.9697199091597275, + "grad_norm": 2.3491554260253906, + "learning_rate": 3.0044454220978176e-07, + "loss": 0.13402020931243896, + "step": 5244 + }, + { + "epoch": 3.971233913701741, + "grad_norm": 1.8004578351974487, + "learning_rate": 3.004022321435649e-07, + "loss": 0.3509193956851959, + "step": 5246 + }, + { + "epoch": 3.972747918243755, + "grad_norm": 1.4899842739105225, + "learning_rate": 3.003620371711856e-07, + "loss": 0.4320584535598755, + "step": 5248 + }, + { + "epoch": 3.9742619227857685, + "grad_norm": 2.3373172283172607, + "learning_rate": 3.0032395735563773e-07, + "loss": 0.3609815537929535, + "step": 5250 + }, + { + "epoch": 3.975775927327782, + "grad_norm": 1.387648105621338, + "learning_rate": 3.0028799275659976e-07, + "loss": 0.4423655569553375, + "step": 5252 + }, + { + "epoch": 3.9772899318697954, + "grad_norm": 0.3810553252696991, + "learning_rate": 3.002541434304361e-07, + "loss": 0.37218791246414185, + "step": 5254 + }, + { + "epoch": 3.978803936411809, + "grad_norm": 1.0008951425552368, + "learning_rate": 3.0022240943019483e-07, + "loss": 0.04895136132836342, + "step": 5256 + }, + { + "epoch": 3.9803179409538227, + "grad_norm": 1.7369999885559082, + "learning_rate": 3.001927908056102e-07, + "loss": 0.43774598836898804, + "step": 5258 + }, + { + "epoch": 3.9818319454958364, + "grad_norm": 0.6022624969482422, + "learning_rate": 3.0016528760310013e-07, + "loss": 0.12158283591270447, + "step": 5260 + }, + { + "epoch": 3.98334595003785, + "grad_norm": 1.699537754058838, + "learning_rate": 3.00139899865768e-07, + "loss": 0.581796407699585, + "step": 5262 + }, + { + "epoch": 3.9848599545798638, + "grad_norm": 1.786253571510315, + "learning_rate": 3.001166276334015e-07, + "loss": 0.04827537387609482, + "step": 5264 + }, + { + "epoch": 3.9863739591218774, + "grad_norm": 0.15848782658576965, + "learning_rate": 3.0009547094247273e-07, + "loss": 0.00740088801831007, + "step": 5266 + }, + { + "epoch": 3.987887963663891, + "grad_norm": 1.4823129177093506, + "learning_rate": 3.000764298261389e-07, + "loss": 0.499202162027359, + "step": 5268 + }, + { + "epoch": 3.9894019682059048, + "grad_norm": 0.7671037316322327, + "learning_rate": 3.000595043142412e-07, + "loss": 0.06256823986768723, + "step": 5270 + }, + { + "epoch": 3.990915972747918, + "grad_norm": 0.398441880941391, + "learning_rate": 3.0004469443330506e-07, + "loss": 0.058787524700164795, + "step": 5272 + }, + { + "epoch": 3.992429977289932, + "grad_norm": 1.2512152194976807, + "learning_rate": 3.0003200020654085e-07, + "loss": 0.7201032042503357, + "step": 5274 + }, + { + "epoch": 3.9939439818319453, + "grad_norm": 1.2734607458114624, + "learning_rate": 3.0002142165384287e-07, + "loss": 0.44729602336883545, + "step": 5276 + }, + { + "epoch": 3.995457986373959, + "grad_norm": 0.06556880474090576, + "learning_rate": 3.0001295879179e-07, + "loss": 0.021310647949576378, + "step": 5278 + }, + { + "epoch": 3.9969719909159727, + "grad_norm": 2.316617727279663, + "learning_rate": 3.0000661163364527e-07, + "loss": 0.5290903449058533, + "step": 5280 + }, + { + "epoch": 3.9984859954579863, + "grad_norm": 1.1645081043243408, + "learning_rate": 3.000023801893557e-07, + "loss": 0.10492955148220062, + "step": 5282 + }, + { + "epoch": 4.0, + "grad_norm": 1.5504534244537354, + "learning_rate": 3.0000026446555307e-07, + "loss": 0.762908399105072, + "step": 5284 + }, + { + "epoch": 4.0, + "step": 5284, + "total_flos": 6.453258552541708e+18, + "train_loss": 0.5934332229983358, + "train_runtime": 17017.4219, + "train_samples_per_second": 4.968, + "train_steps_per_second": 0.311 + } + ], + "logging_steps": 2, + "max_steps": 5284, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 99999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.453258552541708e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}