diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3886 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1098, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00546448087431694, + "grad_norm": 1.0751630067825317, + "learning_rate": 5.454545454545455e-07, + "loss": 2.654850959777832, + "step": 2 + }, + { + "epoch": 0.01092896174863388, + "grad_norm": 0.6832827925682068, + "learning_rate": 1.6363636363636363e-06, + "loss": 1.9340370893478394, + "step": 4 + }, + { + "epoch": 0.01639344262295082, + "grad_norm": 0.44048377871513367, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.8661857843399048, + "step": 6 + }, + { + "epoch": 0.02185792349726776, + "grad_norm": 0.6479131579399109, + "learning_rate": 3.818181818181818e-06, + "loss": 1.7092058658599854, + "step": 8 + }, + { + "epoch": 0.0273224043715847, + "grad_norm": 0.4293830096721649, + "learning_rate": 4.90909090909091e-06, + "loss": 1.6882967948913574, + "step": 10 + }, + { + "epoch": 0.03278688524590164, + "grad_norm": 0.36752375960350037, + "learning_rate": 6e-06, + "loss": 1.5482172966003418, + "step": 12 + }, + { + "epoch": 0.03825136612021858, + "grad_norm": 0.45134979486465454, + "learning_rate": 7.090909090909091e-06, + "loss": 1.4477657079696655, + "step": 14 + }, + { + "epoch": 0.04371584699453552, + "grad_norm": 1.1161218881607056, + "learning_rate": 8.181818181818181e-06, + "loss": 1.2040513753890991, + "step": 16 + }, + { + "epoch": 0.04918032786885246, + "grad_norm": 0.4559285044670105, + "learning_rate": 9.272727272727273e-06, + "loss": 1.1814911365509033, + "step": 18 + }, + { + "epoch": 0.0546448087431694, + "grad_norm": 0.3014647364616394, + "learning_rate": 1.0363636363636364e-05, + "loss": 1.4038846492767334, + "step": 20 + }, + { + "epoch": 0.060109289617486336, + "grad_norm": 0.2782735228538513, + "learning_rate": 1.1454545454545455e-05, + "loss": 1.331674575805664, + "step": 22 + }, + { + "epoch": 0.06557377049180328, + "grad_norm": 0.2593174874782562, + "learning_rate": 1.2545454545454545e-05, + "loss": 1.39420485496521, + "step": 24 + }, + { + "epoch": 0.07103825136612021, + "grad_norm": 0.7720677256584167, + "learning_rate": 1.3636363636363637e-05, + "loss": 1.5995798110961914, + "step": 26 + }, + { + "epoch": 0.07650273224043716, + "grad_norm": 0.2457510083913803, + "learning_rate": 1.4727272727272728e-05, + "loss": 1.3586376905441284, + "step": 28 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 0.333055704832077, + "learning_rate": 1.5818181818181818e-05, + "loss": 1.2947840690612793, + "step": 30 + }, + { + "epoch": 0.08743169398907104, + "grad_norm": 0.3230854570865631, + "learning_rate": 1.6909090909090907e-05, + "loss": 1.2872313261032104, + "step": 32 + }, + { + "epoch": 0.09289617486338798, + "grad_norm": 0.3659067153930664, + "learning_rate": 1.8e-05, + "loss": 1.013819694519043, + "step": 34 + }, + { + "epoch": 0.09836065573770492, + "grad_norm": 0.19767461717128754, + "learning_rate": 1.909090909090909e-05, + "loss": 1.1190152168273926, + "step": 36 + }, + { + "epoch": 0.10382513661202186, + "grad_norm": 0.3944986164569855, + "learning_rate": 2.0181818181818183e-05, + "loss": 1.3628164529800415, + "step": 38 + }, + { + "epoch": 0.1092896174863388, + "grad_norm": 0.5476788878440857, + "learning_rate": 2.1272727272727273e-05, + "loss": 1.3132535219192505, + "step": 40 + }, + { + "epoch": 0.11475409836065574, + "grad_norm": 0.6100736260414124, + "learning_rate": 2.2363636363636366e-05, + "loss": 1.0027183294296265, + "step": 42 + }, + { + "epoch": 0.12021857923497267, + "grad_norm": 0.32569247484207153, + "learning_rate": 2.3454545454545456e-05, + "loss": 1.01920747756958, + "step": 44 + }, + { + "epoch": 0.12568306010928962, + "grad_norm": 0.2778260111808777, + "learning_rate": 2.454545454545455e-05, + "loss": 1.3417850732803345, + "step": 46 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 0.2733558416366577, + "learning_rate": 2.5636363636363635e-05, + "loss": 1.32207453250885, + "step": 48 + }, + { + "epoch": 0.1366120218579235, + "grad_norm": 0.28408652544021606, + "learning_rate": 2.6727272727272728e-05, + "loss": 1.2943894863128662, + "step": 50 + }, + { + "epoch": 0.14207650273224043, + "grad_norm": 0.2375349998474121, + "learning_rate": 2.7818181818181818e-05, + "loss": 1.2506234645843506, + "step": 52 + }, + { + "epoch": 0.14754098360655737, + "grad_norm": 1.0531994104385376, + "learning_rate": 2.890909090909091e-05, + "loss": 0.8835932612419128, + "step": 54 + }, + { + "epoch": 0.15300546448087432, + "grad_norm": 0.3646933436393738, + "learning_rate": 3e-05, + "loss": 1.4430650472640991, + "step": 56 + }, + { + "epoch": 0.15846994535519127, + "grad_norm": 0.44717085361480713, + "learning_rate": 2.9997491688899256e-05, + "loss": 1.4402170181274414, + "step": 58 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 0.25877314805984497, + "learning_rate": 2.998996768768956e-05, + "loss": 1.3341705799102783, + "step": 60 + }, + { + "epoch": 0.16939890710382513, + "grad_norm": 0.3694762587547302, + "learning_rate": 2.9977430792302124e-05, + "loss": 1.5157015323638916, + "step": 62 + }, + { + "epoch": 0.17486338797814208, + "grad_norm": 0.7582504749298096, + "learning_rate": 2.9959885661467903e-05, + "loss": 1.2413986921310425, + "step": 64 + }, + { + "epoch": 0.18032786885245902, + "grad_norm": 0.7258033752441406, + "learning_rate": 2.993733881498636e-05, + "loss": 1.328322410583496, + "step": 66 + }, + { + "epoch": 0.18579234972677597, + "grad_norm": 0.4196822941303253, + "learning_rate": 2.9909798631302736e-05, + "loss": 1.3006751537322998, + "step": 68 + }, + { + "epoch": 0.1912568306010929, + "grad_norm": 0.8249898552894592, + "learning_rate": 2.987727534439457e-05, + "loss": 1.3277437686920166, + "step": 70 + }, + { + "epoch": 0.19672131147540983, + "grad_norm": 0.4432520568370819, + "learning_rate": 2.983978103996877e-05, + "loss": 1.1078237295150757, + "step": 72 + }, + { + "epoch": 0.20218579234972678, + "grad_norm": 0.557328999042511, + "learning_rate": 2.9797329650970525e-05, + "loss": 1.1530853509902954, + "step": 74 + }, + { + "epoch": 0.20765027322404372, + "grad_norm": 0.9870507121086121, + "learning_rate": 2.974993695240579e-05, + "loss": 1.3453693389892578, + "step": 76 + }, + { + "epoch": 0.21311475409836064, + "grad_norm": 1.0352654457092285, + "learning_rate": 2.9697620555479297e-05, + "loss": 0.6454078555107117, + "step": 78 + }, + { + "epoch": 0.2185792349726776, + "grad_norm": 0.39752116799354553, + "learning_rate": 2.9640399901050182e-05, + "loss": 0.6325056552886963, + "step": 80 + }, + { + "epoch": 0.22404371584699453, + "grad_norm": 0.3930615782737732, + "learning_rate": 2.9578296252407734e-05, + "loss": 1.3635072708129883, + "step": 82 + }, + { + "epoch": 0.22950819672131148, + "grad_norm": 0.3616217374801636, + "learning_rate": 2.9511332687369917e-05, + "loss": 0.8247113227844238, + "step": 84 + }, + { + "epoch": 0.23497267759562843, + "grad_norm": 0.6772981286048889, + "learning_rate": 2.9439534089707624e-05, + "loss": 1.3125399351119995, + "step": 86 + }, + { + "epoch": 0.24043715846994534, + "grad_norm": 0.3286456763744354, + "learning_rate": 2.9362927139897832e-05, + "loss": 1.272586464881897, + "step": 88 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 0.8590133190155029, + "learning_rate": 2.9281540305209068e-05, + "loss": 1.0977256298065186, + "step": 90 + }, + { + "epoch": 0.25136612021857924, + "grad_norm": 0.780782163143158, + "learning_rate": 2.919540382912294e-05, + "loss": 1.323669195175171, + "step": 92 + }, + { + "epoch": 0.2568306010928962, + "grad_norm": 0.2833038866519928, + "learning_rate": 2.9104549720095634e-05, + "loss": 1.1723302602767944, + "step": 94 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 0.4925417900085449, + "learning_rate": 2.9009011739663467e-05, + "loss": 1.5599980354309082, + "step": 96 + }, + { + "epoch": 0.2677595628415301, + "grad_norm": 0.5641136169433594, + "learning_rate": 2.8908825389897094e-05, + "loss": 1.2302381992340088, + "step": 98 + }, + { + "epoch": 0.273224043715847, + "grad_norm": 0.3122362792491913, + "learning_rate": 2.8804027900208843e-05, + "loss": 1.1082335710525513, + "step": 100 + }, + { + "epoch": 0.2786885245901639, + "grad_norm": 0.3353098928928375, + "learning_rate": 2.8694658213518226e-05, + "loss": 1.2942110300064087, + "step": 102 + }, + { + "epoch": 0.28415300546448086, + "grad_norm": 0.933445155620575, + "learning_rate": 2.8580756971780686e-05, + "loss": 0.9880049824714661, + "step": 104 + }, + { + "epoch": 0.2896174863387978, + "grad_norm": 0.27208322286605835, + "learning_rate": 2.846236650088497e-05, + "loss": 1.2753208875656128, + "step": 106 + }, + { + "epoch": 0.29508196721311475, + "grad_norm": 0.2629159390926361, + "learning_rate": 2.833953079492476e-05, + "loss": 1.410078525543213, + "step": 108 + }, + { + "epoch": 0.3005464480874317, + "grad_norm": 0.756142795085907, + "learning_rate": 2.82122954998504e-05, + "loss": 1.249933123588562, + "step": 110 + }, + { + "epoch": 0.30601092896174864, + "grad_norm": 0.3108830451965332, + "learning_rate": 2.808070789650679e-05, + "loss": 1.2787212133407593, + "step": 112 + }, + { + "epoch": 0.3114754098360656, + "grad_norm": 0.8976105451583862, + "learning_rate": 2.7944816883063727e-05, + "loss": 1.3811240196228027, + "step": 114 + }, + { + "epoch": 0.31693989071038253, + "grad_norm": 0.48146891593933105, + "learning_rate": 2.7804672956845295e-05, + "loss": 1.2635105848312378, + "step": 116 + }, + { + "epoch": 0.3224043715846995, + "grad_norm": 0.3662451505661011, + "learning_rate": 2.766032819556495e-05, + "loss": 1.0110238790512085, + "step": 118 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 1.0501397848129272, + "learning_rate": 2.7511836237973366e-05, + "loss": 1.2500156164169312, + "step": 120 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.6555485725402832, + "learning_rate": 2.735925226392618e-05, + "loss": 1.3619412183761597, + "step": 122 + }, + { + "epoch": 0.33879781420765026, + "grad_norm": 0.3158622682094574, + "learning_rate": 2.7202632973879086e-05, + "loss": 1.0549049377441406, + "step": 124 + }, + { + "epoch": 0.3442622950819672, + "grad_norm": 1.4328356981277466, + "learning_rate": 2.7042036567817838e-05, + "loss": 1.0704551935195923, + "step": 126 + }, + { + "epoch": 0.34972677595628415, + "grad_norm": 0.3114672005176544, + "learning_rate": 2.6877522723631036e-05, + "loss": 1.324272632598877, + "step": 128 + }, + { + "epoch": 0.3551912568306011, + "grad_norm": 0.41898471117019653, + "learning_rate": 2.6709152574933727e-05, + "loss": 1.2493348121643066, + "step": 130 + }, + { + "epoch": 0.36065573770491804, + "grad_norm": 0.28673115372657776, + "learning_rate": 2.6536988688350067e-05, + "loss": 1.1967706680297852, + "step": 132 + }, + { + "epoch": 0.366120218579235, + "grad_norm": 1.0675979852676392, + "learning_rate": 2.6361095040263437e-05, + "loss": 1.0794986486434937, + "step": 134 + }, + { + "epoch": 0.37158469945355194, + "grad_norm": 0.4790109097957611, + "learning_rate": 2.618153699304274e-05, + "loss": 1.0123281478881836, + "step": 136 + }, + { + "epoch": 0.3770491803278688, + "grad_norm": 0.3467015326023102, + "learning_rate": 2.599838127075361e-05, + "loss": 0.9970463514328003, + "step": 138 + }, + { + "epoch": 0.3825136612021858, + "grad_norm": 0.2335558384656906, + "learning_rate": 2.5811695934363666e-05, + "loss": 0.640708863735199, + "step": 140 + }, + { + "epoch": 0.3879781420765027, + "grad_norm": 0.4442676603794098, + "learning_rate": 2.5621550356450914e-05, + "loss": 1.2411315441131592, + "step": 142 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 0.2910548150539398, + "learning_rate": 2.5428015195424825e-05, + "loss": 1.2713288068771362, + "step": 144 + }, + { + "epoch": 0.3989071038251366, + "grad_norm": 0.38326823711395264, + "learning_rate": 2.5231162369269498e-05, + "loss": 1.225158929824829, + "step": 146 + }, + { + "epoch": 0.40437158469945356, + "grad_norm": 0.9609261751174927, + "learning_rate": 2.503106502881889e-05, + "loss": 1.216736078262329, + "step": 148 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 0.4641711115837097, + "learning_rate": 2.4827797530573762e-05, + "loss": 1.1474337577819824, + "step": 150 + }, + { + "epoch": 0.41530054644808745, + "grad_norm": 0.2951766550540924, + "learning_rate": 2.4621435409070757e-05, + "loss": 1.2647650241851807, + "step": 152 + }, + { + "epoch": 0.4207650273224044, + "grad_norm": 0.32328906655311584, + "learning_rate": 2.4412055348813602e-05, + "loss": 1.2341688871383667, + "step": 154 + }, + { + "epoch": 0.4262295081967213, + "grad_norm": 0.4336468279361725, + "learning_rate": 2.4199735155777017e-05, + "loss": 1.2835301160812378, + "step": 156 + }, + { + "epoch": 0.43169398907103823, + "grad_norm": 0.4230714738368988, + "learning_rate": 2.3984553728493914e-05, + "loss": 1.0034432411193848, + "step": 158 + }, + { + "epoch": 0.4371584699453552, + "grad_norm": 0.2932489812374115, + "learning_rate": 2.3766591028736547e-05, + "loss": 1.213523268699646, + "step": 160 + }, + { + "epoch": 0.4426229508196721, + "grad_norm": 3.7618532180786133, + "learning_rate": 2.3545928051802588e-05, + "loss": 0.6747933030128479, + "step": 162 + }, + { + "epoch": 0.44808743169398907, + "grad_norm": 0.7841958403587341, + "learning_rate": 2.332264679641717e-05, + "loss": 1.5993993282318115, + "step": 164 + }, + { + "epoch": 0.453551912568306, + "grad_norm": 0.647459864616394, + "learning_rate": 2.3096830234261996e-05, + "loss": 0.753265917301178, + "step": 166 + }, + { + "epoch": 0.45901639344262296, + "grad_norm": 0.5001090168952942, + "learning_rate": 2.2868562279142912e-05, + "loss": 1.1555891036987305, + "step": 168 + }, + { + "epoch": 0.4644808743169399, + "grad_norm": 0.2678913176059723, + "learning_rate": 2.2637927755807458e-05, + "loss": 1.246796727180481, + "step": 170 + }, + { + "epoch": 0.46994535519125685, + "grad_norm": 0.417643666267395, + "learning_rate": 2.2405012368423786e-05, + "loss": 1.3203487396240234, + "step": 172 + }, + { + "epoch": 0.47540983606557374, + "grad_norm": 0.5594274401664734, + "learning_rate": 2.2169902668732893e-05, + "loss": 1.417712926864624, + "step": 174 + }, + { + "epoch": 0.4808743169398907, + "grad_norm": 0.31678417325019836, + "learning_rate": 2.193268602388583e-05, + "loss": 1.2238776683807373, + "step": 176 + }, + { + "epoch": 0.48633879781420764, + "grad_norm": 0.6878252625465393, + "learning_rate": 2.1693450583977953e-05, + "loss": 1.1790485382080078, + "step": 178 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 0.28679513931274414, + "learning_rate": 2.1452285249292147e-05, + "loss": 1.1895920038223267, + "step": 180 + }, + { + "epoch": 0.4972677595628415, + "grad_norm": 0.322842538356781, + "learning_rate": 2.12092796372634e-05, + "loss": 1.174275517463684, + "step": 182 + }, + { + "epoch": 0.5027322404371585, + "grad_norm": 0.6668521761894226, + "learning_rate": 2.096452404917679e-05, + "loss": 1.3329179286956787, + "step": 184 + }, + { + "epoch": 0.5081967213114754, + "grad_norm": 0.7745230197906494, + "learning_rate": 2.0718109436611348e-05, + "loss": 1.1880346536636353, + "step": 186 + }, + { + "epoch": 0.5136612021857924, + "grad_norm": 0.5202271938323975, + "learning_rate": 2.0470127367642345e-05, + "loss": 1.1781209707260132, + "step": 188 + }, + { + "epoch": 0.5191256830601093, + "grad_norm": 0.5798245668411255, + "learning_rate": 2.022066999281444e-05, + "loss": 1.233837366104126, + "step": 190 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 0.5479984879493713, + "learning_rate": 1.9969830010898358e-05, + "loss": 1.2089662551879883, + "step": 192 + }, + { + "epoch": 0.5300546448087432, + "grad_norm": 0.23682758212089539, + "learning_rate": 1.9717700634443903e-05, + "loss": 1.218327522277832, + "step": 194 + }, + { + "epoch": 0.5355191256830601, + "grad_norm": 0.3199704587459564, + "learning_rate": 1.9464375555142e-05, + "loss": 1.2755553722381592, + "step": 196 + }, + { + "epoch": 0.5409836065573771, + "grad_norm": 0.2734091877937317, + "learning_rate": 1.9209948909008734e-05, + "loss": 1.1226749420166016, + "step": 198 + }, + { + "epoch": 0.546448087431694, + "grad_norm": 0.25196999311447144, + "learning_rate": 1.8954515241404218e-05, + "loss": 1.2352243661880493, + "step": 200 + }, + { + "epoch": 0.5519125683060109, + "grad_norm": 0.28729572892189026, + "learning_rate": 1.8698169471899414e-05, + "loss": 1.0766348838806152, + "step": 202 + }, + { + "epoch": 0.5573770491803278, + "grad_norm": 0.5765514373779297, + "learning_rate": 1.8441006859003842e-05, + "loss": 0.8550919890403748, + "step": 204 + }, + { + "epoch": 0.5628415300546448, + "grad_norm": 0.5975605845451355, + "learning_rate": 1.818312296476737e-05, + "loss": 1.239629864692688, + "step": 206 + }, + { + "epoch": 0.5683060109289617, + "grad_norm": 0.23489779233932495, + "learning_rate": 1.792461361926921e-05, + "loss": 1.2550042867660522, + "step": 208 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 0.4763418138027191, + "learning_rate": 1.766557488500727e-05, + "loss": 1.2072700262069702, + "step": 210 + }, + { + "epoch": 0.5792349726775956, + "grad_norm": 0.23921915888786316, + "learning_rate": 1.7406103021201212e-05, + "loss": 1.5028775930404663, + "step": 212 + }, + { + "epoch": 0.5846994535519126, + "grad_norm": 0.42051005363464355, + "learning_rate": 1.7146294448022335e-05, + "loss": 1.2044477462768555, + "step": 214 + }, + { + "epoch": 0.5901639344262295, + "grad_norm": 1.0494626760482788, + "learning_rate": 1.688624571076371e-05, + "loss": 1.560670018196106, + "step": 216 + }, + { + "epoch": 0.5956284153005464, + "grad_norm": 0.4011470377445221, + "learning_rate": 1.6626053443963762e-05, + "loss": 1.1750210523605347, + "step": 218 + }, + { + "epoch": 0.6010928961748634, + "grad_norm": 0.4405505657196045, + "learning_rate": 1.636581433549674e-05, + "loss": 1.2384079694747925, + "step": 220 + }, + { + "epoch": 0.6065573770491803, + "grad_norm": 0.86158287525177, + "learning_rate": 1.610562509064332e-05, + "loss": 1.091644048690796, + "step": 222 + }, + { + "epoch": 0.6120218579234973, + "grad_norm": 0.40252819657325745, + "learning_rate": 1.5845582396154786e-05, + "loss": 1.0258910655975342, + "step": 224 + }, + { + "epoch": 0.6174863387978142, + "grad_norm": 0.2338150292634964, + "learning_rate": 1.5585782884324064e-05, + "loss": 1.2002568244934082, + "step": 226 + }, + { + "epoch": 0.6229508196721312, + "grad_norm": 0.5918247699737549, + "learning_rate": 1.5326323097077015e-05, + "loss": 1.2207123041152954, + "step": 228 + }, + { + "epoch": 0.6284153005464481, + "grad_norm": 0.31474488973617554, + "learning_rate": 1.5067299450097261e-05, + "loss": 0.70670485496521, + "step": 230 + }, + { + "epoch": 0.6338797814207651, + "grad_norm": 0.32660529017448425, + "learning_rate": 1.4808808196998006e-05, + "loss": 1.2120585441589355, + "step": 232 + }, + { + "epoch": 0.639344262295082, + "grad_norm": 1.468921422958374, + "learning_rate": 1.4550945393554004e-05, + "loss": 0.9596046209335327, + "step": 234 + }, + { + "epoch": 0.644808743169399, + "grad_norm": 2.728471279144287, + "learning_rate": 1.4293806862007085e-05, + "loss": 0.903359055519104, + "step": 236 + }, + { + "epoch": 0.6502732240437158, + "grad_norm": 0.6711524724960327, + "learning_rate": 1.4037488155458448e-05, + "loss": 1.2866835594177246, + "step": 238 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 0.40566977858543396, + "learning_rate": 1.3782084522360981e-05, + "loss": 1.20192289352417, + "step": 240 + }, + { + "epoch": 0.6612021857923497, + "grad_norm": 0.3911570608615875, + "learning_rate": 1.3527690871124762e-05, + "loss": 1.1558232307434082, + "step": 242 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.5365505218505859, + "learning_rate": 1.3274401734848958e-05, + "loss": 1.029914379119873, + "step": 244 + }, + { + "epoch": 0.6721311475409836, + "grad_norm": 0.2497815638780594, + "learning_rate": 1.3022311236193156e-05, + "loss": 1.1662428379058838, + "step": 246 + }, + { + "epoch": 0.6775956284153005, + "grad_norm": 0.4294073283672333, + "learning_rate": 1.2771513052401236e-05, + "loss": 1.2328400611877441, + "step": 248 + }, + { + "epoch": 0.6830601092896175, + "grad_norm": 0.352763831615448, + "learning_rate": 1.2522100380490744e-05, + "loss": 1.1996644735336304, + "step": 250 + }, + { + "epoch": 0.6885245901639344, + "grad_norm": 0.46486788988113403, + "learning_rate": 1.2274165902620732e-05, + "loss": 1.1101069450378418, + "step": 252 + }, + { + "epoch": 0.6939890710382514, + "grad_norm": 0.24696502089500427, + "learning_rate": 1.2027801751650918e-05, + "loss": 1.1543952226638794, + "step": 254 + }, + { + "epoch": 0.6994535519125683, + "grad_norm": 0.25500428676605225, + "learning_rate": 1.1783099476904972e-05, + "loss": 1.148316740989685, + "step": 256 + }, + { + "epoch": 0.7049180327868853, + "grad_norm": 0.37820199131965637, + "learning_rate": 1.1540150010150599e-05, + "loss": 1.079501748085022, + "step": 258 + }, + { + "epoch": 0.7103825136612022, + "grad_norm": 0.48190441727638245, + "learning_rate": 1.1299043631809205e-05, + "loss": 1.1459242105484009, + "step": 260 + }, + { + "epoch": 0.7158469945355191, + "grad_norm": 0.5813128352165222, + "learning_rate": 1.1059869937407486e-05, + "loss": 1.240357756614685, + "step": 262 + }, + { + "epoch": 0.7213114754098361, + "grad_norm": 0.2543342113494873, + "learning_rate": 1.082271780428362e-05, + "loss": 1.1864430904388428, + "step": 264 + }, + { + "epoch": 0.726775956284153, + "grad_norm": 0.26917213201522827, + "learning_rate": 1.0587675358560278e-05, + "loss": 1.0468827486038208, + "step": 266 + }, + { + "epoch": 0.73224043715847, + "grad_norm": 0.4517075717449188, + "learning_rate": 1.0354829942396837e-05, + "loss": 1.1757278442382812, + "step": 268 + }, + { + "epoch": 0.7377049180327869, + "grad_norm": 0.23133859038352966, + "learning_rate": 1.012426808153287e-05, + "loss": 1.178215742111206, + "step": 270 + }, + { + "epoch": 0.7431693989071039, + "grad_norm": 0.3878965377807617, + "learning_rate": 9.896075453135039e-06, + "loss": 1.1388592720031738, + "step": 272 + }, + { + "epoch": 0.7486338797814208, + "grad_norm": 0.3238541781902313, + "learning_rate": 9.67033685395934e-06, + "loss": 1.161685585975647, + "step": 274 + }, + { + "epoch": 0.7540983606557377, + "grad_norm": 0.27236634492874146, + "learning_rate": 9.447136168840466e-06, + "loss": 1.2053990364074707, + "step": 276 + }, + { + "epoch": 0.7595628415300546, + "grad_norm": 0.3311232924461365, + "learning_rate": 9.226556339520069e-06, + "loss": 1.1811869144439697, + "step": 278 + }, + { + "epoch": 0.7650273224043715, + "grad_norm": 0.824040412902832, + "learning_rate": 9.008679333825478e-06, + "loss": 0.9021319150924683, + "step": 280 + }, + { + "epoch": 0.7704918032786885, + "grad_norm": 0.3380263149738312, + "learning_rate": 8.793586115210326e-06, + "loss": 1.1787052154541016, + "step": 282 + }, + { + "epoch": 0.7759562841530054, + "grad_norm": 0.408220112323761, + "learning_rate": 8.581356612668382e-06, + "loss": 1.1672601699829102, + "step": 284 + }, + { + "epoch": 0.7814207650273224, + "grad_norm": 0.35263848304748535, + "learning_rate": 8.372069691031804e-06, + "loss": 1.1768161058425903, + "step": 286 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 0.37962621450424194, + "learning_rate": 8.165803121664869e-06, + "loss": 1.146876335144043, + "step": 288 + }, + { + "epoch": 0.7923497267759563, + "grad_norm": 0.27549535036087036, + "learning_rate": 7.962633553563965e-06, + "loss": 1.1030545234680176, + "step": 290 + }, + { + "epoch": 0.7978142076502732, + "grad_norm": 0.591926097869873, + "learning_rate": 7.762636484874723e-06, + "loss": 1.1360045671463013, + "step": 292 + }, + { + "epoch": 0.8032786885245902, + "grad_norm": 0.26219072937965393, + "learning_rate": 7.565886234836767e-06, + "loss": 1.1480509042739868, + "step": 294 + }, + { + "epoch": 0.8087431693989071, + "grad_norm": 0.4182840585708618, + "learning_rate": 7.3724559161665876e-06, + "loss": 1.494627594947815, + "step": 296 + }, + { + "epoch": 0.8142076502732241, + "grad_norm": 0.7810959219932556, + "learning_rate": 7.182417407888703e-06, + "loss": 0.8853966593742371, + "step": 298 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 0.3140762448310852, + "learning_rate": 6.995841328625321e-06, + "loss": 1.5603413581848145, + "step": 300 + }, + { + "epoch": 0.825136612021858, + "grad_norm": 0.3591131269931793, + "learning_rate": 6.812797010354325e-06, + "loss": 1.1755470037460327, + "step": 302 + }, + { + "epoch": 0.8306010928961749, + "grad_norm": 2.338329315185547, + "learning_rate": 6.63335247264542e-06, + "loss": 0.7928791046142578, + "step": 304 + }, + { + "epoch": 0.8360655737704918, + "grad_norm": 0.6885122060775757, + "learning_rate": 6.457574397383919e-06, + "loss": 1.514641284942627, + "step": 306 + }, + { + "epoch": 0.8415300546448088, + "grad_norm": 0.3260650336742401, + "learning_rate": 6.285528103991665e-06, + "loss": 0.8320769667625427, + "step": 308 + }, + { + "epoch": 0.8469945355191257, + "grad_norm": 0.39165613055229187, + "learning_rate": 6.117277525154225e-06, + "loss": 1.1644271612167358, + "step": 310 + }, + { + "epoch": 0.8524590163934426, + "grad_norm": 0.34331265091896057, + "learning_rate": 5.952885183063397e-06, + "loss": 1.17978835105896, + "step": 312 + }, + { + "epoch": 0.8579234972677595, + "grad_norm": 0.3723102807998657, + "learning_rate": 5.792412166183841e-06, + "loss": 1.1927130222320557, + "step": 314 + }, + { + "epoch": 0.8633879781420765, + "grad_norm": 0.5044635534286499, + "learning_rate": 5.635918106552546e-06, + "loss": 1.1958658695220947, + "step": 316 + }, + { + "epoch": 0.8688524590163934, + "grad_norm": 0.5813291668891907, + "learning_rate": 5.483461157619428e-06, + "loss": 1.0818508863449097, + "step": 318 + }, + { + "epoch": 0.8743169398907104, + "grad_norm": 0.23069122433662415, + "learning_rate": 5.335097972637441e-06, + "loss": 1.2942509651184082, + "step": 320 + }, + { + "epoch": 0.8797814207650273, + "grad_norm": 0.31037190556526184, + "learning_rate": 5.1908836836101135e-06, + "loss": 1.1336814165115356, + "step": 322 + }, + { + "epoch": 0.8852459016393442, + "grad_norm": 0.43608394265174866, + "learning_rate": 5.050871880804414e-06, + "loss": 0.5545326471328735, + "step": 324 + }, + { + "epoch": 0.8907103825136612, + "grad_norm": 0.40804681181907654, + "learning_rate": 4.915114592836521e-06, + "loss": 1.1569054126739502, + "step": 326 + }, + { + "epoch": 0.8961748633879781, + "grad_norm": 0.6417044997215271, + "learning_rate": 4.783662267337909e-06, + "loss": 1.0306603908538818, + "step": 328 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 0.3709650933742523, + "learning_rate": 4.656563752208907e-06, + "loss": 1.184291958808899, + "step": 330 + }, + { + "epoch": 0.907103825136612, + "grad_norm": 0.5156346559524536, + "learning_rate": 4.533866277466767e-06, + "loss": 1.239979863166809, + "step": 332 + }, + { + "epoch": 0.912568306010929, + "grad_norm": 0.32417598366737366, + "learning_rate": 4.415615437694876e-06, + "loss": 1.0216443538665771, + "step": 334 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 0.31912070512771606, + "learning_rate": 4.3018551750997694e-06, + "loss": 1.132981777191162, + "step": 336 + }, + { + "epoch": 0.9234972677595629, + "grad_norm": 0.6049280762672424, + "learning_rate": 4.192627763182111e-06, + "loss": 1.1525002717971802, + "step": 338 + }, + { + "epoch": 0.9289617486338798, + "grad_norm": 0.398091584444046, + "learning_rate": 4.087973791027797e-06, + "loss": 1.151615858078003, + "step": 340 + }, + { + "epoch": 0.9344262295081968, + "grad_norm": 0.7425937652587891, + "learning_rate": 3.987932148224993e-06, + "loss": 1.1880524158477783, + "step": 342 + }, + { + "epoch": 0.9398907103825137, + "grad_norm": 0.9477114677429199, + "learning_rate": 3.8925400104126834e-06, + "loss": 1.1307789087295532, + "step": 344 + }, + { + "epoch": 0.9453551912568307, + "grad_norm": 0.3224574327468872, + "learning_rate": 3.8018328254661618e-06, + "loss": 1.508508324623108, + "step": 346 + }, + { + "epoch": 0.9508196721311475, + "grad_norm": 0.42406103014945984, + "learning_rate": 3.715844300324527e-06, + "loss": 0.8069607615470886, + "step": 348 + }, + { + "epoch": 0.9562841530054644, + "grad_norm": 0.2576044201850891, + "learning_rate": 3.6346063884651327e-06, + "loss": 0.806373655796051, + "step": 350 + }, + { + "epoch": 0.9617486338797814, + "grad_norm": 0.5339590907096863, + "learning_rate": 3.558149278029624e-06, + "loss": 1.1425732374191284, + "step": 352 + }, + { + "epoch": 0.9672131147540983, + "grad_norm": 0.2579115331172943, + "learning_rate": 3.4865013806059817e-06, + "loss": 0.8368316292762756, + "step": 354 + }, + { + "epoch": 0.9726775956284153, + "grad_norm": 0.4226749539375305, + "learning_rate": 3.419689320670712e-06, + "loss": 1.0304514169692993, + "step": 356 + }, + { + "epoch": 0.9781420765027322, + "grad_norm": 0.23506596684455872, + "learning_rate": 3.35773792569517e-06, + "loss": 0.720623254776001, + "step": 358 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 0.3969164490699768, + "learning_rate": 3.300670216919602e-06, + "loss": 1.1500842571258545, + "step": 360 + }, + { + "epoch": 0.9890710382513661, + "grad_norm": 0.6647183895111084, + "learning_rate": 3.2485074007984468e-06, + "loss": 0.9328727126121521, + "step": 362 + }, + { + "epoch": 0.994535519125683, + "grad_norm": 0.2883590757846832, + "learning_rate": 3.2012688611199566e-06, + "loss": 1.1320202350616455, + "step": 364 + }, + { + "epoch": 1.0, + "grad_norm": 0.3654276728630066, + "learning_rate": 3.158972151803165e-06, + "loss": 1.1198816299438477, + "step": 366 + }, + { + "epoch": 1.005464480874317, + "grad_norm": 0.3304823040962219, + "learning_rate": 3.1216329903748095e-06, + "loss": 0.9562312364578247, + "step": 368 + }, + { + "epoch": 1.010928961748634, + "grad_norm": 0.43168309330940247, + "learning_rate": 3.089265252128686e-06, + "loss": 1.0244945287704468, + "step": 370 + }, + { + "epoch": 1.0163934426229508, + "grad_norm": 0.25342443585395813, + "learning_rate": 3.061880964969555e-06, + "loss": 0.9778792858123779, + "step": 372 + }, + { + "epoch": 1.0218579234972678, + "grad_norm": 0.35910850763320923, + "learning_rate": 3.039490304943562e-06, + "loss": 0.9393002390861511, + "step": 374 + }, + { + "epoch": 1.0273224043715847, + "grad_norm": 0.4086434245109558, + "learning_rate": 3.022101592456795e-06, + "loss": 0.8444104790687561, + "step": 376 + }, + { + "epoch": 1.0327868852459017, + "grad_norm": 0.4661693871021271, + "learning_rate": 3.0097212891834095e-06, + "loss": 0.9321051836013794, + "step": 378 + }, + { + "epoch": 1.0382513661202186, + "grad_norm": 0.37025684118270874, + "learning_rate": 3.0023539956644634e-06, + "loss": 0.6185029745101929, + "step": 380 + }, + { + "epoch": 1.0437158469945356, + "grad_norm": 0.4136655628681183, + "learning_rate": 3.0000024495983428e-06, + "loss": 0.8996444940567017, + "step": 382 + }, + { + "epoch": 1.0491803278688525, + "grad_norm": 0.2609977126121521, + "learning_rate": 3.002667524823434e-06, + "loss": 0.951973021030426, + "step": 384 + }, + { + "epoch": 1.0546448087431695, + "grad_norm": 1.5468955039978027, + "learning_rate": 3.010348230993402e-06, + "loss": 0.6188804507255554, + "step": 386 + }, + { + "epoch": 1.0601092896174864, + "grad_norm": 0.5185752511024475, + "learning_rate": 3.0230417139451987e-06, + "loss": 0.5328562259674072, + "step": 388 + }, + { + "epoch": 1.0655737704918034, + "grad_norm": 0.5812699794769287, + "learning_rate": 3.0407432567596883e-06, + "loss": 1.1187443733215332, + "step": 390 + }, + { + "epoch": 1.0710382513661203, + "grad_norm": 0.32687872648239136, + "learning_rate": 3.0634462815144474e-06, + "loss": 0.8662930727005005, + "step": 392 + }, + { + "epoch": 1.0765027322404372, + "grad_norm": 0.4097544550895691, + "learning_rate": 3.0911423517281404e-06, + "loss": 0.5184633135795593, + "step": 394 + }, + { + "epoch": 1.0819672131147542, + "grad_norm": 0.33863401412963867, + "learning_rate": 3.1238211754955294e-06, + "loss": 0.9240728616714478, + "step": 396 + }, + { + "epoch": 1.0874316939890711, + "grad_norm": 0.28052061796188354, + "learning_rate": 3.161470609311961e-06, + "loss": 0.9106302857398987, + "step": 398 + }, + { + "epoch": 1.092896174863388, + "grad_norm": 0.4579223394393921, + "learning_rate": 3.2040766625859115e-06, + "loss": 1.0799837112426758, + "step": 400 + }, + { + "epoch": 1.098360655737705, + "grad_norm": 0.930515706539154, + "learning_rate": 3.2516235028379157e-06, + "loss": 0.9288961291313171, + "step": 402 + }, + { + "epoch": 1.1038251366120218, + "grad_norm": 0.37287023663520813, + "learning_rate": 3.304093461583944e-06, + "loss": 0.9670571684837341, + "step": 404 + }, + { + "epoch": 1.1092896174863387, + "grad_norm": 1.240659475326538, + "learning_rate": 3.3614670409010353e-06, + "loss": 1.0186374187469482, + "step": 406 + }, + { + "epoch": 1.1147540983606556, + "grad_norm": 0.8452418446540833, + "learning_rate": 3.4237229206727602e-06, + "loss": 0.6442115306854248, + "step": 408 + }, + { + "epoch": 1.1202185792349726, + "grad_norm": 0.36524245142936707, + "learning_rate": 3.490837966511817e-06, + "loss": 1.2035061120986938, + "step": 410 + }, + { + "epoch": 1.1256830601092895, + "grad_norm": 0.6859813928604126, + "learning_rate": 3.5627872383567937e-06, + "loss": 0.6509969830513, + "step": 412 + }, + { + "epoch": 1.1311475409836065, + "grad_norm": 0.28654637932777405, + "learning_rate": 3.6395439997399494e-06, + "loss": 0.6548624634742737, + "step": 414 + }, + { + "epoch": 1.1366120218579234, + "grad_norm": 0.6482637524604797, + "learning_rate": 3.721079727722522e-06, + "loss": 0.6231327056884766, + "step": 416 + }, + { + "epoch": 1.1420765027322404, + "grad_norm": 0.40834060311317444, + "learning_rate": 3.8073641234939055e-06, + "loss": 0.9411674737930298, + "step": 418 + }, + { + "epoch": 1.1475409836065573, + "grad_norm": 0.42818281054496765, + "learning_rate": 3.898365123630732e-06, + "loss": 0.7944842576980591, + "step": 420 + }, + { + "epoch": 1.1530054644808743, + "grad_norm": 0.29905006289482117, + "learning_rate": 3.994048912011692e-06, + "loss": 0.9097725749015808, + "step": 422 + }, + { + "epoch": 1.1584699453551912, + "grad_norm": 0.789088249206543, + "learning_rate": 4.094379932383666e-06, + "loss": 0.7131223678588867, + "step": 424 + }, + { + "epoch": 1.1639344262295082, + "grad_norm": 0.3939225375652313, + "learning_rate": 4.199320901574489e-06, + "loss": 0.6140289306640625, + "step": 426 + }, + { + "epoch": 1.169398907103825, + "grad_norm": 0.4326651096343994, + "learning_rate": 4.3088328233474185e-06, + "loss": 1.078157901763916, + "step": 428 + }, + { + "epoch": 1.174863387978142, + "grad_norm": 0.30610015988349915, + "learning_rate": 4.422875002892234e-06, + "loss": 0.9334272742271423, + "step": 430 + }, + { + "epoch": 1.180327868852459, + "grad_norm": 0.34937551617622375, + "learning_rate": 4.54140506194747e-06, + "loss": 0.9314782023429871, + "step": 432 + }, + { + "epoch": 1.185792349726776, + "grad_norm": 0.8679988384246826, + "learning_rate": 4.664378954548241e-06, + "loss": 0.7978003025054932, + "step": 434 + }, + { + "epoch": 1.1912568306010929, + "grad_norm": 0.3360673785209656, + "learning_rate": 4.791750983393832e-06, + "loss": 0.9832219481468201, + "step": 436 + }, + { + "epoch": 1.1967213114754098, + "grad_norm": 0.264480859041214, + "learning_rate": 4.9234738168288466e-06, + "loss": 0.8762679696083069, + "step": 438 + }, + { + "epoch": 1.2021857923497268, + "grad_norm": 0.39207419753074646, + "learning_rate": 5.059498506431758e-06, + "loss": 0.7775551080703735, + "step": 440 + }, + { + "epoch": 1.2076502732240437, + "grad_norm": 0.2239266186952591, + "learning_rate": 5.199774505204206e-06, + "loss": 0.93987637758255, + "step": 442 + }, + { + "epoch": 1.2131147540983607, + "grad_norm": 0.33733996748924255, + "learning_rate": 5.344249686354357e-06, + "loss": 0.9468815922737122, + "step": 444 + }, + { + "epoch": 1.2185792349726776, + "grad_norm": 0.47637975215911865, + "learning_rate": 5.492870362667299e-06, + "loss": 0.5318366289138794, + "step": 446 + }, + { + "epoch": 1.2240437158469946, + "grad_norm": 0.26395881175994873, + "learning_rate": 5.645581306455302e-06, + "loss": 0.8348568081855774, + "step": 448 + }, + { + "epoch": 1.2295081967213115, + "grad_norm": 0.5106167197227478, + "learning_rate": 5.802325770080506e-06, + "loss": 0.9297827482223511, + "step": 450 + }, + { + "epoch": 1.2349726775956285, + "grad_norm": 0.47476786375045776, + "learning_rate": 5.96304550704246e-06, + "loss": 0.988614022731781, + "step": 452 + }, + { + "epoch": 1.2404371584699454, + "grad_norm": 0.5340549349784851, + "learning_rate": 6.127680793622588e-06, + "loss": 0.42125749588012695, + "step": 454 + }, + { + "epoch": 1.2459016393442623, + "grad_norm": 0.9180600643157959, + "learning_rate": 6.296170451077657e-06, + "loss": 0.7597475647926331, + "step": 456 + }, + { + "epoch": 1.2513661202185793, + "grad_norm": 0.46278026700019836, + "learning_rate": 6.468451868373856e-06, + "loss": 0.6985284686088562, + "step": 458 + }, + { + "epoch": 1.2568306010928962, + "grad_norm": 0.27935847640037537, + "learning_rate": 6.6444610254532e-06, + "loss": 0.9660181999206543, + "step": 460 + }, + { + "epoch": 1.2622950819672132, + "grad_norm": 0.8495526909828186, + "learning_rate": 6.824132517023449e-06, + "loss": 0.7250743508338928, + "step": 462 + }, + { + "epoch": 1.2677595628415301, + "grad_norm": 7.655890941619873, + "learning_rate": 7.007399576862872e-06, + "loss": 0.9311206936836243, + "step": 464 + }, + { + "epoch": 1.273224043715847, + "grad_norm": 0.7682108283042908, + "learning_rate": 7.1941941026306275e-06, + "loss": 0.976463258266449, + "step": 466 + }, + { + "epoch": 1.278688524590164, + "grad_norm": 0.31356796622276306, + "learning_rate": 7.3844466811737555e-06, + "loss": 0.9272637963294983, + "step": 468 + }, + { + "epoch": 1.2841530054644807, + "grad_norm": 0.304342657327652, + "learning_rate": 7.578086614321175e-06, + "loss": 0.9191130995750427, + "step": 470 + }, + { + "epoch": 1.289617486338798, + "grad_norm": 0.4537976086139679, + "learning_rate": 7.775041945155295e-06, + "loss": 1.185452938079834, + "step": 472 + }, + { + "epoch": 1.2950819672131146, + "grad_norm": 0.7000725269317627, + "learning_rate": 7.975239484751258e-06, + "loss": 0.859736979007721, + "step": 474 + }, + { + "epoch": 1.3005464480874318, + "grad_norm": 0.49727311730384827, + "learning_rate": 8.178604839374125e-06, + "loss": 0.5100428462028503, + "step": 476 + }, + { + "epoch": 1.3060109289617485, + "grad_norm": 0.3634011149406433, + "learning_rate": 8.385062438123673e-06, + "loss": 0.9873999953269958, + "step": 478 + }, + { + "epoch": 1.3114754098360657, + "grad_norm": 0.3196913003921509, + "learning_rate": 8.594535561016661e-06, + "loss": 1.0027223825454712, + "step": 480 + }, + { + "epoch": 1.3169398907103824, + "grad_norm": 0.275206595659256, + "learning_rate": 8.806946367496155e-06, + "loss": 0.8830161094665527, + "step": 482 + }, + { + "epoch": 1.3224043715846996, + "grad_norm": 0.7456058263778687, + "learning_rate": 9.02221592535712e-06, + "loss": 0.6142900586128235, + "step": 484 + }, + { + "epoch": 1.3278688524590163, + "grad_norm": 0.27207374572753906, + "learning_rate": 9.240264240077859e-06, + "loss": 0.8691650629043579, + "step": 486 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.31759360432624817, + "learning_rate": 9.461010284546016e-06, + "loss": 1.0310981273651123, + "step": 488 + }, + { + "epoch": 1.3387978142076502, + "grad_norm": 0.6895498037338257, + "learning_rate": 9.684372029168438e-06, + "loss": 0.660420298576355, + "step": 490 + }, + { + "epoch": 1.3442622950819672, + "grad_norm": 0.4173939824104309, + "learning_rate": 9.91026647235348e-06, + "loss": 1.0019088983535767, + "step": 492 + }, + { + "epoch": 1.349726775956284, + "grad_norm": 0.21376250684261322, + "learning_rate": 1.0138609671354586e-05, + "loss": 0.7606089115142822, + "step": 494 + }, + { + "epoch": 1.355191256830601, + "grad_norm": 0.6442067623138428, + "learning_rate": 1.0369316773463458e-05, + "loss": 0.9546438455581665, + "step": 496 + }, + { + "epoch": 1.360655737704918, + "grad_norm": 3.213313579559326, + "learning_rate": 1.0602302047541566e-05, + "loss": 0.9225825071334839, + "step": 498 + }, + { + "epoch": 1.366120218579235, + "grad_norm": 0.356067031621933, + "learning_rate": 1.083747891587788e-05, + "loss": 1.08867609500885, + "step": 500 + }, + { + "epoch": 1.3715846994535519, + "grad_norm": 0.46825549006462097, + "learning_rate": 1.1074759986361392e-05, + "loss": 0.9150928258895874, + "step": 502 + }, + { + "epoch": 1.3770491803278688, + "grad_norm": 0.5194010138511658, + "learning_rate": 1.1314057084956073e-05, + "loss": 0.5116444826126099, + "step": 504 + }, + { + "epoch": 1.3825136612021858, + "grad_norm": 0.7615500092506409, + "learning_rate": 1.1555281288466553e-05, + "loss": 0.7318437695503235, + "step": 506 + }, + { + "epoch": 1.3879781420765027, + "grad_norm": 0.32076653838157654, + "learning_rate": 1.1798342957582084e-05, + "loss": 0.9810932278633118, + "step": 508 + }, + { + "epoch": 1.3934426229508197, + "grad_norm": 0.26743048429489136, + "learning_rate": 1.2043151770186725e-05, + "loss": 1.0388559103012085, + "step": 510 + }, + { + "epoch": 1.3989071038251366, + "grad_norm": 0.34534981846809387, + "learning_rate": 1.2289616754923078e-05, + "loss": 0.8841665387153625, + "step": 512 + }, + { + "epoch": 1.4043715846994536, + "grad_norm": 0.16258741915225983, + "learning_rate": 1.253764632499752e-05, + "loss": 0.66373211145401, + "step": 514 + }, + { + "epoch": 1.4098360655737705, + "grad_norm": 0.3468623161315918, + "learning_rate": 1.2787148312213901e-05, + "loss": 0.7078993320465088, + "step": 516 + }, + { + "epoch": 1.4153005464480874, + "grad_norm": 0.2647278904914856, + "learning_rate": 1.3038030001223439e-05, + "loss": 0.9880443215370178, + "step": 518 + }, + { + "epoch": 1.4207650273224044, + "grad_norm": 0.8398441672325134, + "learning_rate": 1.3290198163977933e-05, + "loss": 0.6314539313316345, + "step": 520 + }, + { + "epoch": 1.4262295081967213, + "grad_norm": 0.3027450442314148, + "learning_rate": 1.3543559094373372e-05, + "loss": 1.0502456426620483, + "step": 522 + }, + { + "epoch": 1.4316939890710383, + "grad_norm": 0.7764866352081299, + "learning_rate": 1.3798018643071386e-05, + "loss": 0.9405328631401062, + "step": 524 + }, + { + "epoch": 1.4371584699453552, + "grad_norm": 0.37671011686325073, + "learning_rate": 1.4053482252485178e-05, + "loss": 0.9506468772888184, + "step": 526 + }, + { + "epoch": 1.4426229508196722, + "grad_norm": 0.20187242329120636, + "learning_rate": 1.4309854991917388e-05, + "loss": 0.6395024061203003, + "step": 528 + }, + { + "epoch": 1.4480874316939891, + "grad_norm": 0.4782966673374176, + "learning_rate": 1.4567041592836413e-05, + "loss": 0.748798131942749, + "step": 530 + }, + { + "epoch": 1.453551912568306, + "grad_norm": 3.2700560092926025, + "learning_rate": 1.48249464842784e-05, + "loss": 0.755515456199646, + "step": 532 + }, + { + "epoch": 1.459016393442623, + "grad_norm": 0.676388144493103, + "learning_rate": 1.508347382836153e-05, + "loss": 1.0103578567504883, + "step": 534 + }, + { + "epoch": 1.46448087431694, + "grad_norm": 0.30977484583854675, + "learning_rate": 1.534252755589961e-05, + "loss": 0.982090413570404, + "step": 536 + }, + { + "epoch": 1.469945355191257, + "grad_norm": 0.39242854714393616, + "learning_rate": 1.5602011402101432e-05, + "loss": 1.0859917402267456, + "step": 538 + }, + { + "epoch": 1.4754098360655736, + "grad_norm": 0.3935968279838562, + "learning_rate": 1.5861828942343037e-05, + "loss": 1.0562843084335327, + "step": 540 + }, + { + "epoch": 1.4808743169398908, + "grad_norm": 0.33550116419792175, + "learning_rate": 1.612188362799917e-05, + "loss": 0.9903753995895386, + "step": 542 + }, + { + "epoch": 1.4863387978142075, + "grad_norm": 0.4036963880062103, + "learning_rate": 1.6382078822320964e-05, + "loss": 0.8334465622901917, + "step": 544 + }, + { + "epoch": 1.4918032786885247, + "grad_norm": 0.6883772611618042, + "learning_rate": 1.6642317836346324e-05, + "loss": 1.1092900037765503, + "step": 546 + }, + { + "epoch": 1.4972677595628414, + "grad_norm": 3.7706735134124756, + "learning_rate": 1.6902503964829644e-05, + "loss": 0.4429691433906555, + "step": 548 + }, + { + "epoch": 1.5027322404371586, + "grad_norm": 1.0984396934509277, + "learning_rate": 1.7162540522177685e-05, + "loss": 1.0124402046203613, + "step": 550 + }, + { + "epoch": 1.5081967213114753, + "grad_norm": 0.457646906375885, + "learning_rate": 1.7422330878378113e-05, + "loss": 1.025429368019104, + "step": 552 + }, + { + "epoch": 1.5136612021857925, + "grad_norm": 0.2908318340778351, + "learning_rate": 1.7681778494907298e-05, + "loss": 0.9907000660896301, + "step": 554 + }, + { + "epoch": 1.5191256830601092, + "grad_norm": 0.4297858774662018, + "learning_rate": 1.794078696060429e-05, + "loss": 1.1015424728393555, + "step": 556 + }, + { + "epoch": 1.5245901639344264, + "grad_norm": 0.35660913586616516, + "learning_rate": 1.819926002749727e-05, + "loss": 0.8739935755729675, + "step": 558 + }, + { + "epoch": 1.530054644808743, + "grad_norm": 0.345431923866272, + "learning_rate": 1.84571016465695e-05, + "loss": 0.5142934322357178, + "step": 560 + }, + { + "epoch": 1.5355191256830603, + "grad_norm": 2.3219211101531982, + "learning_rate": 1.8714216003451295e-05, + "loss": 0.9209707975387573, + "step": 562 + }, + { + "epoch": 1.540983606557377, + "grad_norm": 1.1686280965805054, + "learning_rate": 1.8970507554024827e-05, + "loss": 0.8513541221618652, + "step": 564 + }, + { + "epoch": 1.5464480874316942, + "grad_norm": 0.2546832859516144, + "learning_rate": 1.922588105992838e-05, + "loss": 0.9810701012611389, + "step": 566 + }, + { + "epoch": 1.5519125683060109, + "grad_norm": 0.5964091420173645, + "learning_rate": 1.9480241623947206e-05, + "loss": 0.8227204084396362, + "step": 568 + }, + { + "epoch": 1.5573770491803278, + "grad_norm": 0.34018632769584656, + "learning_rate": 1.9733494725277413e-05, + "loss": 1.1703555583953857, + "step": 570 + }, + { + "epoch": 1.5628415300546448, + "grad_norm": 0.36660236120224, + "learning_rate": 1.998554625465005e-05, + "loss": 0.9431501626968384, + "step": 572 + }, + { + "epoch": 1.5683060109289617, + "grad_norm": 0.26774540543556213, + "learning_rate": 2.0236302549302293e-05, + "loss": 0.996977686882019, + "step": 574 + }, + { + "epoch": 1.5737704918032787, + "grad_norm": 0.28664132952690125, + "learning_rate": 2.0485670427782644e-05, + "loss": 0.7795593738555908, + "step": 576 + }, + { + "epoch": 1.5792349726775956, + "grad_norm": 0.3652798533439636, + "learning_rate": 2.073355722457739e-05, + "loss": 1.0711528062820435, + "step": 578 + }, + { + "epoch": 1.5846994535519126, + "grad_norm": 0.1629379242658615, + "learning_rate": 2.0979870824545165e-05, + "loss": 0.5316720008850098, + "step": 580 + }, + { + "epoch": 1.5901639344262295, + "grad_norm": 0.2500196099281311, + "learning_rate": 2.1224519697147145e-05, + "loss": 0.730338454246521, + "step": 582 + }, + { + "epoch": 1.5956284153005464, + "grad_norm": 0.3823454976081848, + "learning_rate": 2.1467412930459936e-05, + "loss": 0.6454403400421143, + "step": 584 + }, + { + "epoch": 1.6010928961748634, + "grad_norm": 0.796212375164032, + "learning_rate": 2.1708460264958595e-05, + "loss": 0.9703986048698425, + "step": 586 + }, + { + "epoch": 1.6065573770491803, + "grad_norm": 2.617865800857544, + "learning_rate": 2.194757212705718e-05, + "loss": 0.992373526096344, + "step": 588 + }, + { + "epoch": 1.6120218579234973, + "grad_norm": 0.4272755980491638, + "learning_rate": 2.2184659662394522e-05, + "loss": 0.9483923316001892, + "step": 590 + }, + { + "epoch": 1.6174863387978142, + "grad_norm": 0.9268650412559509, + "learning_rate": 2.24196347688526e-05, + "loss": 1.0497581958770752, + "step": 592 + }, + { + "epoch": 1.6229508196721312, + "grad_norm": 1.5188337564468384, + "learning_rate": 2.265241012929541e-05, + "loss": 0.8677475452423096, + "step": 594 + }, + { + "epoch": 1.6284153005464481, + "grad_norm": 0.49448922276496887, + "learning_rate": 2.28828992440162e-05, + "loss": 0.4782347083091736, + "step": 596 + }, + { + "epoch": 1.633879781420765, + "grad_norm": 0.259139746427536, + "learning_rate": 2.3111016462880873e-05, + "loss": 1.2155531644821167, + "step": 598 + }, + { + "epoch": 1.639344262295082, + "grad_norm": 0.24349214136600494, + "learning_rate": 2.333667701715578e-05, + "loss": 1.1115421056747437, + "step": 600 + }, + { + "epoch": 1.644808743169399, + "grad_norm": 0.36063164472579956, + "learning_rate": 2.3559797051007815e-05, + "loss": 1.0202444791793823, + "step": 602 + }, + { + "epoch": 1.650273224043716, + "grad_norm": 0.23508520424365997, + "learning_rate": 2.3780293652665477e-05, + "loss": 1.013314127922058, + "step": 604 + }, + { + "epoch": 1.6557377049180326, + "grad_norm": 0.3779641389846802, + "learning_rate": 2.399808488522895e-05, + "loss": 1.1837308406829834, + "step": 606 + }, + { + "epoch": 1.6612021857923498, + "grad_norm": 0.2397950142621994, + "learning_rate": 2.4213089817118078e-05, + "loss": 0.9537526965141296, + "step": 608 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.7148740887641907, + "learning_rate": 2.4425228552146573e-05, + "loss": 1.135209083557129, + "step": 610 + }, + { + "epoch": 1.6721311475409837, + "grad_norm": 2.617220401763916, + "learning_rate": 2.4634422259211614e-05, + "loss": 1.0198928117752075, + "step": 612 + }, + { + "epoch": 1.6775956284153004, + "grad_norm": 0.2389843910932541, + "learning_rate": 2.4840593201587626e-05, + "loss": 1.0087116956710815, + "step": 614 + }, + { + "epoch": 1.6830601092896176, + "grad_norm": 0.24383345246315002, + "learning_rate": 2.5043664765813377e-05, + "loss": 1.0644434690475464, + "step": 616 + }, + { + "epoch": 1.6885245901639343, + "grad_norm": 0.17243260145187378, + "learning_rate": 2.524356149016163e-05, + "loss": 0.5880939960479736, + "step": 618 + }, + { + "epoch": 1.6939890710382515, + "grad_norm": 0.32494592666625977, + "learning_rate": 2.544020909268085e-05, + "loss": 1.1145374774932861, + "step": 620 + }, + { + "epoch": 1.6994535519125682, + "grad_norm": 0.3530786633491516, + "learning_rate": 2.5633534498798598e-05, + "loss": 1.1676689386367798, + "step": 622 + }, + { + "epoch": 1.7049180327868854, + "grad_norm": 0.21238695085048676, + "learning_rate": 2.5823465868475985e-05, + "loss": 0.8080005049705505, + "step": 624 + }, + { + "epoch": 1.710382513661202, + "grad_norm": 0.22660064697265625, + "learning_rate": 2.60099326229037e-05, + "loss": 0.7797802686691284, + "step": 626 + }, + { + "epoch": 1.7158469945355193, + "grad_norm": 0.1813984215259552, + "learning_rate": 2.619286547072914e-05, + "loss": 1.1150728464126587, + "step": 628 + }, + { + "epoch": 1.721311475409836, + "grad_norm": 0.6166960597038269, + "learning_rate": 2.6372196433805214e-05, + "loss": 0.788729727268219, + "step": 630 + }, + { + "epoch": 1.7267759562841531, + "grad_norm": 0.21812567114830017, + "learning_rate": 2.654785887245112e-05, + "loss": 1.0968449115753174, + "step": 632 + }, + { + "epoch": 1.7322404371584699, + "grad_norm": 0.25673896074295044, + "learning_rate": 2.671978751021577e-05, + "loss": 1.1825098991394043, + "step": 634 + }, + { + "epoch": 1.737704918032787, + "grad_norm": 0.22896477580070496, + "learning_rate": 2.6887918458134622e-05, + "loss": 0.9839988350868225, + "step": 636 + }, + { + "epoch": 1.7431693989071038, + "grad_norm": 0.22252342104911804, + "learning_rate": 2.705218923847093e-05, + "loss": 1.037245750427246, + "step": 638 + }, + { + "epoch": 1.748633879781421, + "grad_norm": 0.724755048751831, + "learning_rate": 2.7212538807932576e-05, + "loss": 0.9484571814537048, + "step": 640 + }, + { + "epoch": 1.7540983606557377, + "grad_norm": 0.1993311047554016, + "learning_rate": 2.7368907580355843e-05, + "loss": 1.1303632259368896, + "step": 642 + }, + { + "epoch": 1.7595628415300546, + "grad_norm": 0.3169814944267273, + "learning_rate": 2.7521237448847734e-05, + "loss": 1.0198063850402832, + "step": 644 + }, + { + "epoch": 1.7650273224043715, + "grad_norm": 0.9802204370498657, + "learning_rate": 2.766947180737861e-05, + "loss": 0.7934456467628479, + "step": 646 + }, + { + "epoch": 1.7704918032786885, + "grad_norm": 0.22061200439929962, + "learning_rate": 2.781355557181706e-05, + "loss": 1.0378159284591675, + "step": 648 + }, + { + "epoch": 1.7759562841530054, + "grad_norm": 0.21627365052700043, + "learning_rate": 2.7953435200399262e-05, + "loss": 1.0567606687545776, + "step": 650 + }, + { + "epoch": 1.7814207650273224, + "grad_norm": 0.1991584300994873, + "learning_rate": 2.8089058713625194e-05, + "loss": 1.1661193370819092, + "step": 652 + }, + { + "epoch": 1.7868852459016393, + "grad_norm": 0.36394891142845154, + "learning_rate": 2.8220375713574307e-05, + "loss": 0.9987577795982361, + "step": 654 + }, + { + "epoch": 1.7923497267759563, + "grad_norm": 1.2923098802566528, + "learning_rate": 2.8347337402633456e-05, + "loss": 1.0449413061141968, + "step": 656 + }, + { + "epoch": 1.7978142076502732, + "grad_norm": 1.273789644241333, + "learning_rate": 2.846989660163019e-05, + "loss": 0.9262195825576782, + "step": 658 + }, + { + "epoch": 1.8032786885245902, + "grad_norm": 1.2179731130599976, + "learning_rate": 2.858800776736461e-05, + "loss": 1.1149400472640991, + "step": 660 + }, + { + "epoch": 1.8087431693989071, + "grad_norm": 0.2583407163619995, + "learning_rate": 2.87016270095333e-05, + "loss": 1.1010037660598755, + "step": 662 + }, + { + "epoch": 1.814207650273224, + "grad_norm": 0.29756709933280945, + "learning_rate": 2.8810712107039e-05, + "loss": 1.0471609830856323, + "step": 664 + }, + { + "epoch": 1.819672131147541, + "grad_norm": 0.24317026138305664, + "learning_rate": 2.8915222523680082e-05, + "loss": 1.5057708024978638, + "step": 666 + }, + { + "epoch": 1.825136612021858, + "grad_norm": 0.3088081479072571, + "learning_rate": 2.9015119423213857e-05, + "loss": 1.0954713821411133, + "step": 668 + }, + { + "epoch": 1.830601092896175, + "grad_norm": 0.613685667514801, + "learning_rate": 2.9110365683788173e-05, + "loss": 0.7805195450782776, + "step": 670 + }, + { + "epoch": 1.8360655737704918, + "grad_norm": 0.6538958549499512, + "learning_rate": 2.9200925911735956e-05, + "loss": 1.2720482349395752, + "step": 672 + }, + { + "epoch": 1.8415300546448088, + "grad_norm": 0.2265540212392807, + "learning_rate": 2.9286766454727563e-05, + "loss": 1.1147041320800781, + "step": 674 + }, + { + "epoch": 1.8469945355191257, + "grad_norm": 0.1871262490749359, + "learning_rate": 2.9367855414276073e-05, + "loss": 1.138785719871521, + "step": 676 + }, + { + "epoch": 1.8524590163934427, + "grad_norm": 0.47468239068984985, + "learning_rate": 2.9444162657590747e-05, + "loss": 0.32045072317123413, + "step": 678 + }, + { + "epoch": 1.8579234972677594, + "grad_norm": 0.19715717434883118, + "learning_rate": 2.951565982877447e-05, + "loss": 1.0369230508804321, + "step": 680 + }, + { + "epoch": 1.8633879781420766, + "grad_norm": 0.2893429398536682, + "learning_rate": 2.9582320359360864e-05, + "loss": 1.0589567422866821, + "step": 682 + }, + { + "epoch": 1.8688524590163933, + "grad_norm": 0.28792470693588257, + "learning_rate": 2.9644119478187126e-05, + "loss": 1.1283237934112549, + "step": 684 + }, + { + "epoch": 1.8743169398907105, + "grad_norm": 1.121538519859314, + "learning_rate": 2.9701034220599074e-05, + "loss": 0.8325943946838379, + "step": 686 + }, + { + "epoch": 1.8797814207650272, + "grad_norm": 0.2843276560306549, + "learning_rate": 2.975304343698483e-05, + "loss": 1.1573818922042847, + "step": 688 + }, + { + "epoch": 1.8852459016393444, + "grad_norm": 0.2129017412662506, + "learning_rate": 2.980012780063404e-05, + "loss": 1.1440218687057495, + "step": 690 + }, + { + "epoch": 1.890710382513661, + "grad_norm": 0.20859314501285553, + "learning_rate": 2.9842269814919755e-05, + "loss": 0.918772280216217, + "step": 692 + }, + { + "epoch": 1.8961748633879782, + "grad_norm": 2.0496702194213867, + "learning_rate": 2.9879453819800156e-05, + "loss": 1.1310737133026123, + "step": 694 + }, + { + "epoch": 1.901639344262295, + "grad_norm": 0.19168691337108612, + "learning_rate": 2.991166599763788e-05, + "loss": 1.150341510772705, + "step": 696 + }, + { + "epoch": 1.9071038251366121, + "grad_norm": 0.4647641181945801, + "learning_rate": 2.993889437833466e-05, + "loss": 1.0134280920028687, + "step": 698 + }, + { + "epoch": 1.9125683060109289, + "grad_norm": 0.22240066528320312, + "learning_rate": 2.9961128843779457e-05, + "loss": 1.093998670578003, + "step": 700 + }, + { + "epoch": 1.918032786885246, + "grad_norm": 0.21184808015823364, + "learning_rate": 2.9978361131608348e-05, + "loss": 1.0832879543304443, + "step": 702 + }, + { + "epoch": 1.9234972677595628, + "grad_norm": 0.2890876829624176, + "learning_rate": 2.999058483827483e-05, + "loss": 1.1648348569869995, + "step": 704 + }, + { + "epoch": 1.92896174863388, + "grad_norm": 0.23976802825927734, + "learning_rate": 2.9997795421429404e-05, + "loss": 1.1615331172943115, + "step": 706 + }, + { + "epoch": 1.9344262295081966, + "grad_norm": 0.21567755937576294, + "learning_rate": 2.9999990201607516e-05, + "loss": 1.2002637386322021, + "step": 708 + }, + { + "epoch": 1.9398907103825138, + "grad_norm": 0.30602723360061646, + "learning_rate": 2.999716836322524e-05, + "loss": 0.9376878142356873, + "step": 710 + }, + { + "epoch": 1.9453551912568305, + "grad_norm": 0.18824896216392517, + "learning_rate": 2.9989330954882366e-05, + "loss": 0.7084695100784302, + "step": 712 + }, + { + "epoch": 1.9508196721311475, + "grad_norm": 0.2943885028362274, + "learning_rate": 2.9976480888972708e-05, + "loss": 1.133286714553833, + "step": 714 + }, + { + "epoch": 1.9562841530054644, + "grad_norm": 0.2867864668369293, + "learning_rate": 2.9958622940601907e-05, + "loss": 0.9091066122055054, + "step": 716 + }, + { + "epoch": 1.9617486338797814, + "grad_norm": 0.22444675862789154, + "learning_rate": 2.9935763745812935e-05, + "loss": 1.1386990547180176, + "step": 718 + }, + { + "epoch": 1.9672131147540983, + "grad_norm": 0.27985692024230957, + "learning_rate": 2.990791179912017e-05, + "loss": 1.0977388620376587, + "step": 720 + }, + { + "epoch": 1.9726775956284153, + "grad_norm": 0.4131743907928467, + "learning_rate": 2.9875077450352817e-05, + "loss": 1.1935728788375854, + "step": 722 + }, + { + "epoch": 1.9781420765027322, + "grad_norm": 0.2299981564283371, + "learning_rate": 2.9837272900808863e-05, + "loss": 1.0896987915039062, + "step": 724 + }, + { + "epoch": 1.9836065573770492, + "grad_norm": 0.6982898712158203, + "learning_rate": 2.9794512198721092e-05, + "loss": 1.1530712842941284, + "step": 726 + }, + { + "epoch": 1.989071038251366, + "grad_norm": 0.4606316387653351, + "learning_rate": 2.9746811234036736e-05, + "loss": 1.090704083442688, + "step": 728 + }, + { + "epoch": 1.994535519125683, + "grad_norm": 0.3608991503715515, + "learning_rate": 2.9694187732512702e-05, + "loss": 1.044235110282898, + "step": 730 + }, + { + "epoch": 2.0, + "grad_norm": 0.45757871866226196, + "learning_rate": 2.96366612491287e-05, + "loss": 1.1520291566848755, + "step": 732 + }, + { + "epoch": 2.0054644808743167, + "grad_norm": 0.30250194668769836, + "learning_rate": 2.9574253160820573e-05, + "loss": 0.7071864008903503, + "step": 734 + }, + { + "epoch": 2.010928961748634, + "grad_norm": 0.549813449382782, + "learning_rate": 2.9506986658536562e-05, + "loss": 0.7574299573898315, + "step": 736 + }, + { + "epoch": 2.0163934426229506, + "grad_norm": 0.4930074214935303, + "learning_rate": 2.9434886738619537e-05, + "loss": 0.7851999402046204, + "step": 738 + }, + { + "epoch": 2.021857923497268, + "grad_norm": 0.5250383615493774, + "learning_rate": 2.9357980193518312e-05, + "loss": 0.8171732425689697, + "step": 740 + }, + { + "epoch": 2.0273224043715845, + "grad_norm": 1.0003880262374878, + "learning_rate": 2.927629560183153e-05, + "loss": 0.6959396004676819, + "step": 742 + }, + { + "epoch": 2.0327868852459017, + "grad_norm": 1.62253737449646, + "learning_rate": 2.91898633176878e-05, + "loss": 0.9657958745956421, + "step": 744 + }, + { + "epoch": 2.0382513661202184, + "grad_norm": 0.3060198426246643, + "learning_rate": 2.909871545946603e-05, + "loss": 0.965573251247406, + "step": 746 + }, + { + "epoch": 2.0437158469945356, + "grad_norm": 0.36276838183403015, + "learning_rate": 2.9002885897860252e-05, + "loss": 0.971846878528595, + "step": 748 + }, + { + "epoch": 2.0491803278688523, + "grad_norm": 0.6612604260444641, + "learning_rate": 2.8902410243293152e-05, + "loss": 0.6588478088378906, + "step": 750 + }, + { + "epoch": 2.0546448087431695, + "grad_norm": 0.6954399943351746, + "learning_rate": 2.8797325832683208e-05, + "loss": 0.5652149319648743, + "step": 752 + }, + { + "epoch": 2.060109289617486, + "grad_norm": 0.6179106831550598, + "learning_rate": 2.868767171557021e-05, + "loss": 0.7592162489891052, + "step": 754 + }, + { + "epoch": 2.0655737704918034, + "grad_norm": 0.5607179999351501, + "learning_rate": 2.8573488639604418e-05, + "loss": 1.0012261867523193, + "step": 756 + }, + { + "epoch": 2.07103825136612, + "grad_norm": 0.3789028823375702, + "learning_rate": 2.845481903540464e-05, + "loss": 0.9981750249862671, + "step": 758 + }, + { + "epoch": 2.0765027322404372, + "grad_norm": 0.286051869392395, + "learning_rate": 2.8331707000790954e-05, + "loss": 0.8374607563018799, + "step": 760 + }, + { + "epoch": 2.081967213114754, + "grad_norm": 0.28156572580337524, + "learning_rate": 2.820419828439788e-05, + "loss": 0.9713491797447205, + "step": 762 + }, + { + "epoch": 2.087431693989071, + "grad_norm": 0.21401195228099823, + "learning_rate": 2.8072340268674133e-05, + "loss": 1.0316338539123535, + "step": 764 + }, + { + "epoch": 2.092896174863388, + "grad_norm": 0.3717721998691559, + "learning_rate": 2.793618195227521e-05, + "loss": 1.0349146127700806, + "step": 766 + }, + { + "epoch": 2.098360655737705, + "grad_norm": 1.2073692083358765, + "learning_rate": 2.779577393185539e-05, + "loss": 0.9263788461685181, + "step": 768 + }, + { + "epoch": 2.1038251366120218, + "grad_norm": 0.2607380747795105, + "learning_rate": 2.765116838326597e-05, + "loss": 0.9181485176086426, + "step": 770 + }, + { + "epoch": 2.109289617486339, + "grad_norm": 0.5935834646224976, + "learning_rate": 2.750241904216663e-05, + "loss": 0.8803791403770447, + "step": 772 + }, + { + "epoch": 2.1147540983606556, + "grad_norm": 0.22448551654815674, + "learning_rate": 2.7349581184057144e-05, + "loss": 0.7459222674369812, + "step": 774 + }, + { + "epoch": 2.120218579234973, + "grad_norm": 0.31990522146224976, + "learning_rate": 2.719271160373693e-05, + "loss": 0.9014583230018616, + "step": 776 + }, + { + "epoch": 2.1256830601092895, + "grad_norm": 1.4967182874679565, + "learning_rate": 2.703186859420002e-05, + "loss": 0.8457669615745544, + "step": 778 + }, + { + "epoch": 2.1311475409836067, + "grad_norm": 0.23060189187526703, + "learning_rate": 2.6867111924973283e-05, + "loss": 0.7118249535560608, + "step": 780 + }, + { + "epoch": 2.1366120218579234, + "grad_norm": 0.22133475542068481, + "learning_rate": 2.6698502819905935e-05, + "loss": 0.8637291193008423, + "step": 782 + }, + { + "epoch": 2.1420765027322406, + "grad_norm": 0.21762791275978088, + "learning_rate": 2.652610393441872e-05, + "loss": 0.9261735677719116, + "step": 784 + }, + { + "epoch": 2.1475409836065573, + "grad_norm": 0.22464466094970703, + "learning_rate": 2.6349979332220992e-05, + "loss": 0.6325854659080505, + "step": 786 + }, + { + "epoch": 2.1530054644808745, + "grad_norm": 1.0833162069320679, + "learning_rate": 2.6170194461504586e-05, + "loss": 0.7055314779281616, + "step": 788 + }, + { + "epoch": 2.158469945355191, + "grad_norm": 0.38726380467414856, + "learning_rate": 2.5986816130623133e-05, + "loss": 1.0715676546096802, + "step": 790 + }, + { + "epoch": 2.1639344262295084, + "grad_norm": 0.24775686860084534, + "learning_rate": 2.579991248326594e-05, + "loss": 0.6376872658729553, + "step": 792 + }, + { + "epoch": 2.169398907103825, + "grad_norm": 0.6708298921585083, + "learning_rate": 2.560955297313575e-05, + "loss": 0.7486907243728638, + "step": 794 + }, + { + "epoch": 2.1748633879781423, + "grad_norm": 0.2694443464279175, + "learning_rate": 2.5415808338139595e-05, + "loss": 0.8056567311286926, + "step": 796 + }, + { + "epoch": 2.180327868852459, + "grad_norm": 0.26330071687698364, + "learning_rate": 2.5218750574102465e-05, + "loss": 0.8525882363319397, + "step": 798 + }, + { + "epoch": 2.185792349726776, + "grad_norm": 1.952973484992981, + "learning_rate": 2.5018452908013522e-05, + "loss": 0.8876383304595947, + "step": 800 + }, + { + "epoch": 2.191256830601093, + "grad_norm": 0.3089567720890045, + "learning_rate": 2.48149897708149e-05, + "loss": 0.848046600818634, + "step": 802 + }, + { + "epoch": 2.19672131147541, + "grad_norm": 0.1766747534275055, + "learning_rate": 2.4608436769743e-05, + "loss": 0.8710022568702698, + "step": 804 + }, + { + "epoch": 2.202185792349727, + "grad_norm": 0.21580883860588074, + "learning_rate": 2.4398870660232684e-05, + "loss": 0.8753240704536438, + "step": 806 + }, + { + "epoch": 2.2076502732240435, + "grad_norm": 0.9658389091491699, + "learning_rate": 2.418636931739491e-05, + "loss": 0.5726881623268127, + "step": 808 + }, + { + "epoch": 2.2131147540983607, + "grad_norm": 0.22094398736953735, + "learning_rate": 2.3971011707078125e-05, + "loss": 0.6364350914955139, + "step": 810 + }, + { + "epoch": 2.2185792349726774, + "grad_norm": 0.19857634603977203, + "learning_rate": 2.3752877856524532e-05, + "loss": 0.8574658632278442, + "step": 812 + }, + { + "epoch": 2.2240437158469946, + "grad_norm": 0.20962177217006683, + "learning_rate": 2.353204882463168e-05, + "loss": 0.776292085647583, + "step": 814 + }, + { + "epoch": 2.2295081967213113, + "grad_norm": 0.23855309188365936, + "learning_rate": 2.330860667183101e-05, + "loss": 0.642566442489624, + "step": 816 + }, + { + "epoch": 2.2349726775956285, + "grad_norm": 0.24430891871452332, + "learning_rate": 2.308263442959396e-05, + "loss": 0.8425507545471191, + "step": 818 + }, + { + "epoch": 2.240437158469945, + "grad_norm": 0.6459073424339294, + "learning_rate": 2.2854216069577376e-05, + "loss": 0.9843753576278687, + "step": 820 + }, + { + "epoch": 2.2459016393442623, + "grad_norm": 0.46932026743888855, + "learning_rate": 2.2623436472419476e-05, + "loss": 0.9202935099601746, + "step": 822 + }, + { + "epoch": 2.251366120218579, + "grad_norm": 0.5952751636505127, + "learning_rate": 2.2390381396198102e-05, + "loss": 0.6778359413146973, + "step": 824 + }, + { + "epoch": 2.2568306010928962, + "grad_norm": 3.2729830741882324, + "learning_rate": 2.2155137444562842e-05, + "loss": 0.8709823489189148, + "step": 826 + }, + { + "epoch": 2.262295081967213, + "grad_norm": 0.560955822467804, + "learning_rate": 2.191779203455302e-05, + "loss": 0.6671867966651917, + "step": 828 + }, + { + "epoch": 2.26775956284153, + "grad_norm": 0.21046458184719086, + "learning_rate": 2.1678433364113297e-05, + "loss": 0.7721139788627625, + "step": 830 + }, + { + "epoch": 2.273224043715847, + "grad_norm": 2.744577646255493, + "learning_rate": 2.1437150379319245e-05, + "loss": 0.543931245803833, + "step": 832 + }, + { + "epoch": 2.278688524590164, + "grad_norm": 0.26252231001853943, + "learning_rate": 2.1194032741324823e-05, + "loss": 0.7887152433395386, + "step": 834 + }, + { + "epoch": 2.2841530054644807, + "grad_norm": 0.7648658156394958, + "learning_rate": 2.0949170793044142e-05, + "loss": 0.9888522624969482, + "step": 836 + }, + { + "epoch": 2.289617486338798, + "grad_norm": 0.8900395035743713, + "learning_rate": 2.070265552557985e-05, + "loss": 0.8767361640930176, + "step": 838 + }, + { + "epoch": 2.2950819672131146, + "grad_norm": 0.1943552941083908, + "learning_rate": 2.0454578544410758e-05, + "loss": 0.6835895776748657, + "step": 840 + }, + { + "epoch": 2.300546448087432, + "grad_norm": 0.16506916284561157, + "learning_rate": 2.0205032035351043e-05, + "loss": 0.918617844581604, + "step": 842 + }, + { + "epoch": 2.3060109289617485, + "grad_norm": 0.2222490906715393, + "learning_rate": 1.9954108730293875e-05, + "loss": 0.9415408968925476, + "step": 844 + }, + { + "epoch": 2.3114754098360657, + "grad_norm": 0.173303484916687, + "learning_rate": 1.9701901872752047e-05, + "loss": 0.4551407992839813, + "step": 846 + }, + { + "epoch": 2.3169398907103824, + "grad_norm": 0.23867173492908478, + "learning_rate": 1.9448505183208607e-05, + "loss": 0.9028974175453186, + "step": 848 + }, + { + "epoch": 2.3224043715846996, + "grad_norm": 0.23550698161125183, + "learning_rate": 1.919401282429013e-05, + "loss": 0.8313841223716736, + "step": 850 + }, + { + "epoch": 2.3278688524590163, + "grad_norm": 1.4692848920822144, + "learning_rate": 1.893851936577567e-05, + "loss": 0.7707965970039368, + "step": 852 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.29081717133522034, + "learning_rate": 1.868211974945461e-05, + "loss": 0.8993242979049683, + "step": 854 + }, + { + "epoch": 2.33879781420765, + "grad_norm": 0.21171832084655762, + "learning_rate": 1.842490925384604e-05, + "loss": 0.8694086074829102, + "step": 856 + }, + { + "epoch": 2.3442622950819674, + "grad_norm": 0.2612394392490387, + "learning_rate": 1.816698345879313e-05, + "loss": 0.5785654783248901, + "step": 858 + }, + { + "epoch": 2.349726775956284, + "grad_norm": 0.1895606517791748, + "learning_rate": 1.790843820994548e-05, + "loss": 0.6962811350822449, + "step": 860 + }, + { + "epoch": 2.3551912568306013, + "grad_norm": 0.19078537821769714, + "learning_rate": 1.7649369583142763e-05, + "loss": 0.8344950675964355, + "step": 862 + }, + { + "epoch": 2.360655737704918, + "grad_norm": 0.20540055632591248, + "learning_rate": 1.738987384871274e-05, + "loss": 0.5912719368934631, + "step": 864 + }, + { + "epoch": 2.366120218579235, + "grad_norm": 0.5205765962600708, + "learning_rate": 1.7130047435697118e-05, + "loss": 1.2037533521652222, + "step": 866 + }, + { + "epoch": 2.371584699453552, + "grad_norm": 0.16610278189182281, + "learning_rate": 1.6869986896018226e-05, + "loss": 0.8778015375137329, + "step": 868 + }, + { + "epoch": 2.3770491803278686, + "grad_norm": 0.21313391625881195, + "learning_rate": 1.66097888686003e-05, + "loss": 0.6815759539604187, + "step": 870 + }, + { + "epoch": 2.3825136612021858, + "grad_norm": 0.19833733141422272, + "learning_rate": 1.6349550043458252e-05, + "loss": 0.7003133893013, + "step": 872 + }, + { + "epoch": 2.387978142076503, + "grad_norm": 0.24588751792907715, + "learning_rate": 1.608936712576749e-05, + "loss": 0.8853883147239685, + "step": 874 + }, + { + "epoch": 2.3934426229508197, + "grad_norm": 1.4030362367630005, + "learning_rate": 1.582933679992809e-05, + "loss": 0.8193628191947937, + "step": 876 + }, + { + "epoch": 2.3989071038251364, + "grad_norm": 0.25144752860069275, + "learning_rate": 1.556955569363678e-05, + "loss": 0.7706636190414429, + "step": 878 + }, + { + "epoch": 2.4043715846994536, + "grad_norm": 0.20421741902828217, + "learning_rate": 1.531012034197988e-05, + "loss": 0.9738537073135376, + "step": 880 + }, + { + "epoch": 2.4098360655737707, + "grad_norm": 0.32910487055778503, + "learning_rate": 1.5051127151560745e-05, + "loss": 0.4443007707595825, + "step": 882 + }, + { + "epoch": 2.4153005464480874, + "grad_norm": 0.2254200130701065, + "learning_rate": 1.4792672364674816e-05, + "loss": 0.7611709237098694, + "step": 884 + }, + { + "epoch": 2.420765027322404, + "grad_norm": 0.2349507212638855, + "learning_rate": 1.4534852023545968e-05, + "loss": 0.7841254472732544, + "step": 886 + }, + { + "epoch": 2.4262295081967213, + "grad_norm": 0.5601933002471924, + "learning_rate": 1.4277761934636963e-05, + "loss": 0.631168007850647, + "step": 888 + }, + { + "epoch": 2.431693989071038, + "grad_norm": 0.3481084406375885, + "learning_rate": 1.4021497633047664e-05, + "loss": 0.5096313953399658, + "step": 890 + }, + { + "epoch": 2.4371584699453552, + "grad_norm": 0.5794140696525574, + "learning_rate": 1.3766154347013933e-05, + "loss": 0.8481501936912537, + "step": 892 + }, + { + "epoch": 2.442622950819672, + "grad_norm": 0.248020201921463, + "learning_rate": 1.3511826962520809e-05, + "loss": 0.4762151539325714, + "step": 894 + }, + { + "epoch": 2.448087431693989, + "grad_norm": 0.27063384652137756, + "learning_rate": 1.3258609988042627e-05, + "loss": 0.6739501357078552, + "step": 896 + }, + { + "epoch": 2.453551912568306, + "grad_norm": 0.2560213506221771, + "learning_rate": 1.300659751942353e-05, + "loss": 0.7505039572715759, + "step": 898 + }, + { + "epoch": 2.459016393442623, + "grad_norm": 0.27310049533843994, + "learning_rate": 1.2755883204911305e-05, + "loss": 0.8457780480384827, + "step": 900 + }, + { + "epoch": 2.4644808743169397, + "grad_norm": 0.22151170670986176, + "learning_rate": 1.2506560210357541e-05, + "loss": 0.7800691723823547, + "step": 902 + }, + { + "epoch": 2.469945355191257, + "grad_norm": 0.22379451990127563, + "learning_rate": 1.225872118459706e-05, + "loss": 0.74601811170578, + "step": 904 + }, + { + "epoch": 2.4754098360655736, + "grad_norm": 0.47037237882614136, + "learning_rate": 1.2012458225019375e-05, + "loss": 0.9208009839057922, + "step": 906 + }, + { + "epoch": 2.480874316939891, + "grad_norm": 0.14209216833114624, + "learning_rate": 1.176786284334528e-05, + "loss": 0.3645989000797272, + "step": 908 + }, + { + "epoch": 2.4863387978142075, + "grad_norm": 0.13509748876094818, + "learning_rate": 1.1525025931620855e-05, + "loss": 0.6787592172622681, + "step": 910 + }, + { + "epoch": 2.4918032786885247, + "grad_norm": 0.18052458763122559, + "learning_rate": 1.1284037728441877e-05, + "loss": 0.7986164093017578, + "step": 912 + }, + { + "epoch": 2.4972677595628414, + "grad_norm": 0.22115804255008698, + "learning_rate": 1.1044987785420924e-05, + "loss": 0.8646562099456787, + "step": 914 + }, + { + "epoch": 2.5027322404371586, + "grad_norm": 0.2403498888015747, + "learning_rate": 1.0807964933909975e-05, + "loss": 0.737191379070282, + "step": 916 + }, + { + "epoch": 2.5081967213114753, + "grad_norm": 0.29979637265205383, + "learning_rate": 1.0573057251990443e-05, + "loss": 0.8144265413284302, + "step": 918 + }, + { + "epoch": 2.5136612021857925, + "grad_norm": 0.18022961914539337, + "learning_rate": 1.0340352031743256e-05, + "loss": 0.8598709106445312, + "step": 920 + }, + { + "epoch": 2.519125683060109, + "grad_norm": 0.22041092813014984, + "learning_rate": 1.010993574681095e-05, + "loss": 0.7569805383682251, + "step": 922 + }, + { + "epoch": 2.5245901639344264, + "grad_norm": 0.2608581781387329, + "learning_rate": 9.881894020263938e-06, + "loss": 0.5048292875289917, + "step": 924 + }, + { + "epoch": 2.530054644808743, + "grad_norm": 0.3491852283477783, + "learning_rate": 9.656311592782831e-06, + "loss": 0.6451077461242676, + "step": 926 + }, + { + "epoch": 2.5355191256830603, + "grad_norm": 0.3480936288833618, + "learning_rate": 9.433272291168689e-06, + "loss": 0.6237382292747498, + "step": 928 + }, + { + "epoch": 2.540983606557377, + "grad_norm": 0.29056069254875183, + "learning_rate": 9.212858997192744e-06, + "loss": 0.93817138671875, + "step": 930 + }, + { + "epoch": 2.546448087431694, + "grad_norm": 0.3035399317741394, + "learning_rate": 8.995153616797544e-06, + "loss": 0.6400088667869568, + "step": 932 + }, + { + "epoch": 2.551912568306011, + "grad_norm": 0.3459092974662781, + "learning_rate": 8.78023704966047e-06, + "loss": 0.8244706988334656, + "step": 934 + }, + { + "epoch": 2.557377049180328, + "grad_norm": 0.38768166303634644, + "learning_rate": 8.568189159131336e-06, + "loss": 0.724708616733551, + "step": 936 + }, + { + "epoch": 2.5628415300546448, + "grad_norm": 1.209683895111084, + "learning_rate": 8.359088742554941e-06, + "loss": 0.5176321268081665, + "step": 938 + }, + { + "epoch": 2.5683060109289615, + "grad_norm": 0.33049488067626953, + "learning_rate": 8.15301350198999e-06, + "loss": 0.8478241562843323, + "step": 940 + }, + { + "epoch": 2.5737704918032787, + "grad_norm": 0.7910704612731934, + "learning_rate": 7.950040015334789e-06, + "loss": 0.5429540276527405, + "step": 942 + }, + { + "epoch": 2.579234972677596, + "grad_norm": 0.21583837270736694, + "learning_rate": 7.750243707870748e-06, + "loss": 0.8436353802680969, + "step": 944 + }, + { + "epoch": 2.5846994535519126, + "grad_norm": 0.3202984631061554, + "learning_rate": 7.553698824234314e-06, + "loss": 0.25999099016189575, + "step": 946 + }, + { + "epoch": 2.5901639344262293, + "grad_norm": 0.2738170623779297, + "learning_rate": 7.360478400827475e-06, + "loss": 0.6493714451789856, + "step": 948 + }, + { + "epoch": 2.5956284153005464, + "grad_norm": 0.19985666871070862, + "learning_rate": 7.170654238677331e-06, + "loss": 0.7574492692947388, + "step": 950 + }, + { + "epoch": 2.6010928961748636, + "grad_norm": 0.24671149253845215, + "learning_rate": 6.984296876754711e-06, + "loss": 0.8211013674736023, + "step": 952 + }, + { + "epoch": 2.6065573770491803, + "grad_norm": 0.1873648315668106, + "learning_rate": 6.801475565761783e-06, + "loss": 0.43293893337249756, + "step": 954 + }, + { + "epoch": 2.612021857923497, + "grad_norm": 0.17501504719257355, + "learning_rate": 6.622258242398371e-06, + "loss": 0.27505725622177124, + "step": 956 + }, + { + "epoch": 2.6174863387978142, + "grad_norm": 0.22481496632099152, + "learning_rate": 6.4467115041165855e-06, + "loss": 0.4883240759372711, + "step": 958 + }, + { + "epoch": 2.6229508196721314, + "grad_norm": 0.21473781764507294, + "learning_rate": 6.2749005843730336e-06, + "loss": 0.654322624206543, + "step": 960 + }, + { + "epoch": 2.628415300546448, + "grad_norm": 0.5412459969520569, + "learning_rate": 6.106889328388064e-06, + "loss": 0.6429625153541565, + "step": 962 + }, + { + "epoch": 2.633879781420765, + "grad_norm": 0.4350409209728241, + "learning_rate": 5.942740169420701e-06, + "loss": 0.9000084400177002, + "step": 964 + }, + { + "epoch": 2.639344262295082, + "grad_norm": 0.23633772134780884, + "learning_rate": 5.7825141055683895e-06, + "loss": 0.7249687314033508, + "step": 966 + }, + { + "epoch": 2.644808743169399, + "grad_norm": 0.31932181119918823, + "learning_rate": 5.62627067709992e-06, + "loss": 0.692674458026886, + "step": 968 + }, + { + "epoch": 2.650273224043716, + "grad_norm": 0.4318002462387085, + "learning_rate": 5.474067944330285e-06, + "loss": 0.7996351718902588, + "step": 970 + }, + { + "epoch": 2.6557377049180326, + "grad_norm": 0.23726297914981842, + "learning_rate": 5.325962466045282e-06, + "loss": 0.9792457818984985, + "step": 972 + }, + { + "epoch": 2.66120218579235, + "grad_norm": 0.19819673895835876, + "learning_rate": 5.18200927848421e-06, + "loss": 0.6123301982879639, + "step": 974 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.3812166452407837, + "learning_rate": 5.042261874888308e-06, + "loss": 0.7370781302452087, + "step": 976 + }, + { + "epoch": 2.6721311475409837, + "grad_norm": 0.17140597105026245, + "learning_rate": 4.906772185622572e-06, + "loss": 0.7503936886787415, + "step": 978 + }, + { + "epoch": 2.6775956284153004, + "grad_norm": 0.18407227098941803, + "learning_rate": 4.775590558878368e-06, + "loss": 0.7234741449356079, + "step": 980 + }, + { + "epoch": 2.6830601092896176, + "grad_norm": 0.21530817449092865, + "learning_rate": 4.648765741963903e-06, + "loss": 0.7761690616607666, + "step": 982 + }, + { + "epoch": 2.6885245901639343, + "grad_norm": 0.21703749895095825, + "learning_rate": 4.526344863189724e-06, + "loss": 0.6027669906616211, + "step": 984 + }, + { + "epoch": 2.6939890710382515, + "grad_norm": 0.26828664541244507, + "learning_rate": 4.408373414355714e-06, + "loss": 0.9321015477180481, + "step": 986 + }, + { + "epoch": 2.699453551912568, + "grad_norm": 0.20866970717906952, + "learning_rate": 4.29489523384628e-06, + "loss": 0.8720956444740295, + "step": 988 + }, + { + "epoch": 2.7049180327868854, + "grad_norm": 0.2858516573905945, + "learning_rate": 4.185952490339899e-06, + "loss": 0.7041909098625183, + "step": 990 + }, + { + "epoch": 2.710382513661202, + "grad_norm": 0.2261834740638733, + "learning_rate": 4.081585667139231e-06, + "loss": 0.45303869247436523, + "step": 992 + }, + { + "epoch": 2.7158469945355193, + "grad_norm": 0.1847374141216278, + "learning_rate": 3.981833547127413e-06, + "loss": 0.8106403946876526, + "step": 994 + }, + { + "epoch": 2.721311475409836, + "grad_norm": 0.20215702056884766, + "learning_rate": 3.886733198356298e-06, + "loss": 0.8668434619903564, + "step": 996 + }, + { + "epoch": 2.726775956284153, + "grad_norm": 0.15933895111083984, + "learning_rate": 3.7963199602718717e-06, + "loss": 0.6896766424179077, + "step": 998 + }, + { + "epoch": 2.73224043715847, + "grad_norm": 0.7066519260406494, + "learning_rate": 3.7106274305821034e-06, + "loss": 0.99676513671875, + "step": 1000 + }, + { + "epoch": 2.737704918032787, + "grad_norm": 0.24640890955924988, + "learning_rate": 3.6296874527719515e-06, + "loss": 0.7258309125900269, + "step": 1002 + }, + { + "epoch": 2.7431693989071038, + "grad_norm": 0.48880383372306824, + "learning_rate": 3.553530104270281e-06, + "loss": 0.6444882750511169, + "step": 1004 + }, + { + "epoch": 2.748633879781421, + "grad_norm": 0.7180402278900146, + "learning_rate": 3.4821836852730384e-06, + "loss": 0.30940499901771545, + "step": 1006 + }, + { + "epoch": 2.7540983606557377, + "grad_norm": 0.25939175486564636, + "learning_rate": 3.41567470822686e-06, + "loss": 0.9526011943817139, + "step": 1008 + }, + { + "epoch": 2.7595628415300544, + "grad_norm": 0.3068941533565521, + "learning_rate": 3.354027887976989e-06, + "loss": 0.9272940158843994, + "step": 1010 + }, + { + "epoch": 2.7650273224043715, + "grad_norm": 0.298452764749527, + "learning_rate": 3.297266132583221e-06, + "loss": 0.6733279228210449, + "step": 1012 + }, + { + "epoch": 2.7704918032786887, + "grad_norm": 0.253140389919281, + "learning_rate": 3.245410534807195e-06, + "loss": 0.9813092947006226, + "step": 1014 + }, + { + "epoch": 2.7759562841530054, + "grad_norm": 0.38031837344169617, + "learning_rate": 3.1984803642743314e-06, + "loss": 0.6480425596237183, + "step": 1016 + }, + { + "epoch": 2.781420765027322, + "grad_norm": 0.9425625205039978, + "learning_rate": 3.1564930603131777e-06, + "loss": 0.9104164838790894, + "step": 1018 + }, + { + "epoch": 2.7868852459016393, + "grad_norm": 0.33443892002105713, + "learning_rate": 3.1194642254749395e-06, + "loss": 0.5776557326316833, + "step": 1020 + }, + { + "epoch": 2.7923497267759565, + "grad_norm": 0.2240370362997055, + "learning_rate": 3.0874076197355317e-06, + "loss": 0.8249503970146179, + "step": 1022 + }, + { + "epoch": 2.797814207650273, + "grad_norm": 0.1996547132730484, + "learning_rate": 3.0603351553823717e-06, + "loss": 0.4351382553577423, + "step": 1024 + }, + { + "epoch": 2.80327868852459, + "grad_norm": 0.23715117573738098, + "learning_rate": 3.038256892587734e-06, + "loss": 0.9221212267875671, + "step": 1026 + }, + { + "epoch": 2.808743169398907, + "grad_norm": 0.6460992693901062, + "learning_rate": 3.0211810356703803e-06, + "loss": 0.6869730353355408, + "step": 1028 + }, + { + "epoch": 2.8142076502732243, + "grad_norm": 0.29128873348236084, + "learning_rate": 3.0091139300468266e-06, + "loss": 0.5977322459220886, + "step": 1030 + }, + { + "epoch": 2.819672131147541, + "grad_norm": 0.2111499011516571, + "learning_rate": 3.0020600598733656e-06, + "loss": 0.6241099834442139, + "step": 1032 + }, + { + "epoch": 2.8251366120218577, + "grad_norm": 0.3315550684928894, + "learning_rate": 3.000022046379753e-06, + "loss": 0.7141521573066711, + "step": 1034 + }, + { + "epoch": 2.830601092896175, + "grad_norm": 0.30943799018859863, + "learning_rate": 3.0030006468951557e-06, + "loss": 0.9645689725875854, + "step": 1036 + }, + { + "epoch": 2.836065573770492, + "grad_norm": 0.17847982048988342, + "learning_rate": 3.0109947545667246e-06, + "loss": 0.6744089126586914, + "step": 1038 + }, + { + "epoch": 2.841530054644809, + "grad_norm": 0.24691838026046753, + "learning_rate": 3.024001398770901e-06, + "loss": 0.801510751247406, + "step": 1040 + }, + { + "epoch": 2.8469945355191255, + "grad_norm": 0.21006713807582855, + "learning_rate": 3.042015746217308e-06, + "loss": 0.8129590749740601, + "step": 1042 + }, + { + "epoch": 2.8524590163934427, + "grad_norm": 0.26454004645347595, + "learning_rate": 3.0650311027448116e-06, + "loss": 0.6099104881286621, + "step": 1044 + }, + { + "epoch": 2.8579234972677594, + "grad_norm": 0.48515036702156067, + "learning_rate": 3.0930389158090754e-06, + "loss": 0.9171488881111145, + "step": 1046 + }, + { + "epoch": 2.8633879781420766, + "grad_norm": 0.14590629935264587, + "learning_rate": 3.1260287776607025e-06, + "loss": 0.3860487639904022, + "step": 1048 + }, + { + "epoch": 2.8688524590163933, + "grad_norm": 0.2847556471824646, + "learning_rate": 3.163988429212773e-06, + "loss": 0.7123141288757324, + "step": 1050 + }, + { + "epoch": 2.8743169398907105, + "grad_norm": 0.21968185901641846, + "learning_rate": 3.206903764596349e-06, + "loss": 0.7720679640769958, + "step": 1052 + }, + { + "epoch": 2.879781420765027, + "grad_norm": 0.24895037710666656, + "learning_rate": 3.254758836402225e-06, + "loss": 0.9379729628562927, + "step": 1054 + }, + { + "epoch": 2.8852459016393444, + "grad_norm": 1.1485183238983154, + "learning_rate": 3.3075358616070144e-06, + "loss": 0.6869798898696899, + "step": 1056 + }, + { + "epoch": 2.890710382513661, + "grad_norm": 2.465815305709839, + "learning_rate": 3.365215228181358e-06, + "loss": 0.594801664352417, + "step": 1058 + }, + { + "epoch": 2.8961748633879782, + "grad_norm": 0.19229429960250854, + "learning_rate": 3.4277755023777795e-06, + "loss": 0.7733983397483826, + "step": 1060 + }, + { + "epoch": 2.901639344262295, + "grad_norm": 0.20262494683265686, + "learning_rate": 3.495193436695504e-06, + "loss": 0.7548971176147461, + "step": 1062 + }, + { + "epoch": 2.907103825136612, + "grad_norm": 0.3187836706638336, + "learning_rate": 3.567443978519267e-06, + "loss": 0.7963505983352661, + "step": 1064 + }, + { + "epoch": 2.912568306010929, + "grad_norm": 0.22692307829856873, + "learning_rate": 3.6445002794288992e-06, + "loss": 0.4752350449562073, + "step": 1066 + }, + { + "epoch": 2.918032786885246, + "grad_norm": 0.2349584698677063, + "learning_rate": 3.7263337051762718e-06, + "loss": 0.649222731590271, + "step": 1068 + }, + { + "epoch": 2.9234972677595628, + "grad_norm": 0.5317696928977966, + "learning_rate": 3.8129138463257943e-06, + "loss": 0.7322983145713806, + "step": 1070 + }, + { + "epoch": 2.92896174863388, + "grad_norm": 0.24103935062885284, + "learning_rate": 3.904208529554625e-06, + "loss": 0.9223740100860596, + "step": 1072 + }, + { + "epoch": 2.9344262295081966, + "grad_norm": 0.2071618139743805, + "learning_rate": 4.000183829608332e-06, + "loss": 0.47986942529678345, + "step": 1074 + }, + { + "epoch": 2.939890710382514, + "grad_norm": 0.3855530917644501, + "learning_rate": 4.100804081907595e-06, + "loss": 0.5404314398765564, + "step": 1076 + }, + { + "epoch": 2.9453551912568305, + "grad_norm": 0.9493356347084045, + "learning_rate": 4.206031895801176e-06, + "loss": 0.547483503818512, + "step": 1078 + }, + { + "epoch": 2.9508196721311473, + "grad_norm": 0.3453940153121948, + "learning_rate": 4.315828168460367e-06, + "loss": 0.932254433631897, + "step": 1080 + }, + { + "epoch": 2.9562841530054644, + "grad_norm": 0.19494763016700745, + "learning_rate": 4.430152099409704e-06, + "loss": 0.5538932085037231, + "step": 1082 + }, + { + "epoch": 2.9617486338797816, + "grad_norm": 0.2678419053554535, + "learning_rate": 4.548961205688424e-06, + "loss": 0.7946881651878357, + "step": 1084 + }, + { + "epoch": 2.9672131147540983, + "grad_norm": 0.45159897208213806, + "learning_rate": 4.672211337637246e-06, + "loss": 0.5122095942497253, + "step": 1086 + }, + { + "epoch": 2.972677595628415, + "grad_norm": 0.3175777196884155, + "learning_rate": 4.7998566953044445e-06, + "loss": 1.1099841594696045, + "step": 1088 + }, + { + "epoch": 2.978142076502732, + "grad_norm": 0.21449358761310577, + "learning_rate": 4.931849845465193e-06, + "loss": 0.6492303013801575, + "step": 1090 + }, + { + "epoch": 2.9836065573770494, + "grad_norm": 0.25702965259552, + "learning_rate": 5.06814173924782e-06, + "loss": 0.7961921095848083, + "step": 1092 + }, + { + "epoch": 2.989071038251366, + "grad_norm": 0.22738364338874817, + "learning_rate": 5.208681730360458e-06, + "loss": 0.7180767059326172, + "step": 1094 + }, + { + "epoch": 2.994535519125683, + "grad_norm": 1.2310293912887573, + "learning_rate": 5.3534175939112694e-06, + "loss": 0.590616762638092, + "step": 1096 + }, + { + "epoch": 3.0, + "grad_norm": 0.243174210190773, + "learning_rate": 5.50229554581536e-06, + "loss": 0.7194076180458069, + "step": 1098 + }, + { + "epoch": 3.0, + "step": 1098, + "total_flos": 4.931331991116186e+18, + "train_loss": 0.9604242436981375, + "train_runtime": 12050.7893, + "train_samples_per_second": 5.467, + "train_steps_per_second": 0.091 + } + ], + "logging_steps": 2, + "max_steps": 1098, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 99999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.931331991116186e+18, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +}