diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3886 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1098, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00546448087431694, + "grad_norm": 0.23241105675697327, + "learning_rate": 3.6363636363636366e-07, + "loss": 2.7663679122924805, + "step": 2 + }, + { + "epoch": 0.01092896174863388, + "grad_norm": 0.20422983169555664, + "learning_rate": 1.090909090909091e-06, + "loss": 1.9384474754333496, + "step": 4 + }, + { + "epoch": 0.01639344262295082, + "grad_norm": 0.199067622423172, + "learning_rate": 1.8181818181818183e-06, + "loss": 1.9044784307479858, + "step": 6 + }, + { + "epoch": 0.02185792349726776, + "grad_norm": 0.23435044288635254, + "learning_rate": 2.5454545454545456e-06, + "loss": 1.7989522218704224, + "step": 8 + }, + { + "epoch": 0.0273224043715847, + "grad_norm": 0.22486603260040283, + "learning_rate": 3.272727272727273e-06, + "loss": 1.8125925064086914, + "step": 10 + }, + { + "epoch": 0.03278688524590164, + "grad_norm": 0.16707547008991241, + "learning_rate": 4.000000000000001e-06, + "loss": 1.6252872943878174, + "step": 12 + }, + { + "epoch": 0.03825136612021858, + "grad_norm": 0.11193601042032242, + "learning_rate": 4.727272727272728e-06, + "loss": 1.5324492454528809, + "step": 14 + }, + { + "epoch": 0.04371584699453552, + "grad_norm": 0.10569358617067337, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.2910876274108887, + "step": 16 + }, + { + "epoch": 0.04918032786885246, + "grad_norm": 0.2930830419063568, + "learning_rate": 6.181818181818182e-06, + "loss": 1.1632479429244995, + "step": 18 + }, + { + "epoch": 0.0546448087431694, + "grad_norm": 0.16945035755634308, + "learning_rate": 6.90909090909091e-06, + "loss": 1.4446367025375366, + "step": 20 + }, + { + "epoch": 0.060109289617486336, + "grad_norm": 0.16067177057266235, + "learning_rate": 7.636363636363638e-06, + "loss": 1.2816951274871826, + "step": 22 + }, + { + "epoch": 0.06557377049180328, + "grad_norm": 0.11728192120790482, + "learning_rate": 8.363636363636365e-06, + "loss": 1.4356242418289185, + "step": 24 + }, + { + "epoch": 0.07103825136612021, + "grad_norm": 0.33978065848350525, + "learning_rate": 9.090909090909091e-06, + "loss": 1.6120140552520752, + "step": 26 + }, + { + "epoch": 0.07650273224043716, + "grad_norm": 0.13070262968540192, + "learning_rate": 9.81818181818182e-06, + "loss": 1.393876314163208, + "step": 28 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 0.10509130358695984, + "learning_rate": 1.0545454545454546e-05, + "loss": 1.2700122594833374, + "step": 30 + }, + { + "epoch": 0.08743169398907104, + "grad_norm": 0.1495838314294815, + "learning_rate": 1.1272727272727272e-05, + "loss": 1.3333828449249268, + "step": 32 + }, + { + "epoch": 0.09289617486338798, + "grad_norm": 0.5792630314826965, + "learning_rate": 1.2e-05, + "loss": 1.0171189308166504, + "step": 34 + }, + { + "epoch": 0.09836065573770492, + "grad_norm": 0.09606198966503143, + "learning_rate": 1.2727272727272728e-05, + "loss": 1.119741439819336, + "step": 36 + }, + { + "epoch": 0.10382513661202186, + "grad_norm": 0.12487119436264038, + "learning_rate": 1.3454545454545455e-05, + "loss": 1.3798904418945312, + "step": 38 + }, + { + "epoch": 0.1092896174863388, + "grad_norm": 0.07102937251329422, + "learning_rate": 1.4181818181818183e-05, + "loss": 1.3242915868759155, + "step": 40 + }, + { + "epoch": 0.11475409836065574, + "grad_norm": 0.19717426598072052, + "learning_rate": 1.4909090909090911e-05, + "loss": 0.9329365491867065, + "step": 42 + }, + { + "epoch": 0.12021857923497267, + "grad_norm": 0.08408994972705841, + "learning_rate": 1.563636363636364e-05, + "loss": 0.8571876883506775, + "step": 44 + }, + { + "epoch": 0.12568306010928962, + "grad_norm": 0.09981869906187057, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.3420907258987427, + "step": 46 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 0.11734086275100708, + "learning_rate": 1.7090909090909092e-05, + "loss": 1.304795503616333, + "step": 48 + }, + { + "epoch": 0.1366120218579235, + "grad_norm": 0.14026959240436554, + "learning_rate": 1.781818181818182e-05, + "loss": 1.269639492034912, + "step": 50 + }, + { + "epoch": 0.14207650273224043, + "grad_norm": 0.12195869535207748, + "learning_rate": 1.8545454545454545e-05, + "loss": 1.2368900775909424, + "step": 52 + }, + { + "epoch": 0.14754098360655737, + "grad_norm": 0.32812365889549255, + "learning_rate": 1.9272727272727275e-05, + "loss": 0.7080382108688354, + "step": 54 + }, + { + "epoch": 0.15300546448087432, + "grad_norm": 0.19641871750354767, + "learning_rate": 2e-05, + "loss": 1.4196542501449585, + "step": 56 + }, + { + "epoch": 0.15846994535519127, + "grad_norm": 0.13444043695926666, + "learning_rate": 1.9998327792599505e-05, + "loss": 1.4084272384643555, + "step": 58 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 0.16206051409244537, + "learning_rate": 1.999331179179304e-05, + "loss": 1.2947711944580078, + "step": 60 + }, + { + "epoch": 0.16939890710382513, + "grad_norm": 0.13851064443588257, + "learning_rate": 1.9984953861534752e-05, + "loss": 1.5364221334457397, + "step": 62 + }, + { + "epoch": 0.17486338797814208, + "grad_norm": 0.24502862989902496, + "learning_rate": 1.997325710764527e-05, + "loss": 1.1034942865371704, + "step": 64 + }, + { + "epoch": 0.18032786885245902, + "grad_norm": 0.21653936803340912, + "learning_rate": 1.9958225876657575e-05, + "loss": 1.3003474473953247, + "step": 66 + }, + { + "epoch": 0.18579234972677597, + "grad_norm": 0.12236165255308151, + "learning_rate": 1.9939865754201825e-05, + "loss": 1.2659997940063477, + "step": 68 + }, + { + "epoch": 0.1912568306010929, + "grad_norm": 0.2843843996524811, + "learning_rate": 1.9918183562929717e-05, + "loss": 1.320336103439331, + "step": 70 + }, + { + "epoch": 0.19672131147540983, + "grad_norm": 0.1539740264415741, + "learning_rate": 1.9893187359979183e-05, + "loss": 1.1016457080841064, + "step": 72 + }, + { + "epoch": 0.20218579234972678, + "grad_norm": 0.1354515552520752, + "learning_rate": 1.986488643398035e-05, + "loss": 1.4368224143981934, + "step": 74 + }, + { + "epoch": 0.20765027322404372, + "grad_norm": 0.14250975847244263, + "learning_rate": 1.9833291301603863e-05, + "loss": 1.3135349750518799, + "step": 76 + }, + { + "epoch": 0.21311475409836064, + "grad_norm": 2.250478744506836, + "learning_rate": 1.9798413703652867e-05, + "loss": 1.1084109544754028, + "step": 78 + }, + { + "epoch": 0.2185792349726776, + "grad_norm": 0.07655666023492813, + "learning_rate": 1.976026660070012e-05, + "loss": 0.7299535870552063, + "step": 80 + }, + { + "epoch": 0.22404371584699453, + "grad_norm": 0.11502671241760254, + "learning_rate": 1.9718864168271823e-05, + "loss": 1.3118858337402344, + "step": 82 + }, + { + "epoch": 0.22950819672131148, + "grad_norm": 0.11101143807172775, + "learning_rate": 1.9674221791579946e-05, + "loss": 0.7135340571403503, + "step": 84 + }, + { + "epoch": 0.23497267759562843, + "grad_norm": 0.06640731543302536, + "learning_rate": 1.9626356059805085e-05, + "loss": 1.2577087879180908, + "step": 86 + }, + { + "epoch": 0.24043715846994534, + "grad_norm": 0.10447928309440613, + "learning_rate": 1.957528475993189e-05, + "loss": 1.236086130142212, + "step": 88 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 0.7586353421211243, + "learning_rate": 1.952102687013938e-05, + "loss": 0.9215405583381653, + "step": 90 + }, + { + "epoch": 0.25136612021857924, + "grad_norm": 0.17681649327278137, + "learning_rate": 1.946360255274863e-05, + "loss": 1.3206024169921875, + "step": 92 + }, + { + "epoch": 0.2568306010928962, + "grad_norm": 0.09881044924259186, + "learning_rate": 1.9403033146730424e-05, + "loss": 1.1016768217086792, + "step": 94 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 0.18392790853977203, + "learning_rate": 1.9339341159775647e-05, + "loss": 1.5615978240966797, + "step": 96 + }, + { + "epoch": 0.2677595628415301, + "grad_norm": 0.10873087495565414, + "learning_rate": 1.9272550259931398e-05, + "loss": 0.9698639512062073, + "step": 98 + }, + { + "epoch": 0.273224043715847, + "grad_norm": 0.09143725037574768, + "learning_rate": 1.9202685266805896e-05, + "loss": 1.022253155708313, + "step": 100 + }, + { + "epoch": 0.2786885245901639, + "grad_norm": 0.21886828541755676, + "learning_rate": 1.9129772142345484e-05, + "loss": 1.229724645614624, + "step": 102 + }, + { + "epoch": 0.28415300546448086, + "grad_norm": 0.21221189200878143, + "learning_rate": 1.9053837981187125e-05, + "loss": 0.9091554284095764, + "step": 104 + }, + { + "epoch": 0.2896174863387978, + "grad_norm": 0.16236251592636108, + "learning_rate": 1.897491100058998e-05, + "loss": 1.2335375547409058, + "step": 106 + }, + { + "epoch": 0.29508196721311475, + "grad_norm": 0.1724518984556198, + "learning_rate": 1.8893020529949838e-05, + "loss": 1.3930922746658325, + "step": 108 + }, + { + "epoch": 0.3005464480874317, + "grad_norm": 0.33721551299095154, + "learning_rate": 1.880819699990027e-05, + "loss": 1.2754117250442505, + "step": 110 + }, + { + "epoch": 0.30601092896174864, + "grad_norm": 0.07887385040521622, + "learning_rate": 1.8720471931004526e-05, + "loss": 1.2538396120071411, + "step": 112 + }, + { + "epoch": 0.3114754098360656, + "grad_norm": 0.2448997050523758, + "learning_rate": 1.8629877922042485e-05, + "loss": 1.3684722185134888, + "step": 114 + }, + { + "epoch": 0.31693989071038253, + "grad_norm": 0.10887415707111359, + "learning_rate": 1.8536448637896866e-05, + "loss": 1.2443941831588745, + "step": 116 + }, + { + "epoch": 0.3224043715846995, + "grad_norm": 0.09768451005220413, + "learning_rate": 1.84402187970433e-05, + "loss": 1.0020421743392944, + "step": 118 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 0.20493730902671814, + "learning_rate": 1.834122415864891e-05, + "loss": 1.2622389793395996, + "step": 120 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.22100353240966797, + "learning_rate": 1.8239501509284123e-05, + "loss": 1.334355354309082, + "step": 122 + }, + { + "epoch": 0.33879781420765026, + "grad_norm": 0.10817477852106094, + "learning_rate": 1.8135088649252725e-05, + "loss": 0.9958959221839905, + "step": 124 + }, + { + "epoch": 0.3442622950819672, + "grad_norm": 0.1272984892129898, + "learning_rate": 1.8028024378545224e-05, + "loss": 0.9933403134346008, + "step": 126 + }, + { + "epoch": 0.34972677595628415, + "grad_norm": 0.15175071358680725, + "learning_rate": 1.7918348482420692e-05, + "loss": 1.309685230255127, + "step": 128 + }, + { + "epoch": 0.3551912568306011, + "grad_norm": 0.08403439074754715, + "learning_rate": 1.7806101716622486e-05, + "loss": 1.237076759338379, + "step": 130 + }, + { + "epoch": 0.36065573770491804, + "grad_norm": 0.0816800594329834, + "learning_rate": 1.7691325792233378e-05, + "loss": 1.1985251903533936, + "step": 132 + }, + { + "epoch": 0.366120218579235, + "grad_norm": 0.1994868814945221, + "learning_rate": 1.7574063360175625e-05, + "loss": 1.0995125770568848, + "step": 134 + }, + { + "epoch": 0.37158469945355194, + "grad_norm": 0.21855318546295166, + "learning_rate": 1.745435799536183e-05, + "loss": 0.9361122250556946, + "step": 136 + }, + { + "epoch": 0.3770491803278688, + "grad_norm": 0.09870662540197372, + "learning_rate": 1.7332254180502407e-05, + "loss": 0.9252831935882568, + "step": 138 + }, + { + "epoch": 0.3825136612021858, + "grad_norm": 0.16326409578323364, + "learning_rate": 1.7207797289575777e-05, + "loss": 0.5687403678894043, + "step": 140 + }, + { + "epoch": 0.3879781420765027, + "grad_norm": 0.10890620201826096, + "learning_rate": 1.708103357096728e-05, + "loss": 1.2131319046020508, + "step": 142 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 0.09764160215854645, + "learning_rate": 1.695201013028322e-05, + "loss": 1.2462400197982788, + "step": 144 + }, + { + "epoch": 0.3989071038251366, + "grad_norm": 0.16440223157405853, + "learning_rate": 1.6820774912846335e-05, + "loss": 1.2056647539138794, + "step": 146 + }, + { + "epoch": 0.40437158469945356, + "grad_norm": 0.3588426113128662, + "learning_rate": 1.668737668587926e-05, + "loss": 1.2008482217788696, + "step": 148 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 0.11627013236284256, + "learning_rate": 1.655186502038251e-05, + "loss": 1.1384965181350708, + "step": 150 + }, + { + "epoch": 0.41530054644808745, + "grad_norm": 0.08151569962501526, + "learning_rate": 1.641429027271384e-05, + "loss": 1.2373915910720825, + "step": 152 + }, + { + "epoch": 0.4207650273224044, + "grad_norm": 0.11254062503576279, + "learning_rate": 1.6274703565875736e-05, + "loss": 1.2118054628372192, + "step": 154 + }, + { + "epoch": 0.4262295081967213, + "grad_norm": 0.18413741886615753, + "learning_rate": 1.613315677051801e-05, + "loss": 1.2632805109024048, + "step": 156 + }, + { + "epoch": 0.43169398907103823, + "grad_norm": 0.13130782544612885, + "learning_rate": 1.598970248566261e-05, + "loss": 0.971778154373169, + "step": 158 + }, + { + "epoch": 0.4371584699453552, + "grad_norm": 0.11005275696516037, + "learning_rate": 1.5844394019157697e-05, + "loss": 1.1907391548156738, + "step": 160 + }, + { + "epoch": 0.4426229508196721, + "grad_norm": 0.27727824449539185, + "learning_rate": 1.5697285367868393e-05, + "loss": 0.6190683841705322, + "step": 162 + }, + { + "epoch": 0.44808743169398907, + "grad_norm": 0.14197178184986115, + "learning_rate": 1.5548431197611448e-05, + "loss": 1.5833417177200317, + "step": 164 + }, + { + "epoch": 0.453551912568306, + "grad_norm": 0.4161648452281952, + "learning_rate": 1.539788682284133e-05, + "loss": 0.7078179717063904, + "step": 166 + }, + { + "epoch": 0.45901639344262296, + "grad_norm": 0.10775762796401978, + "learning_rate": 1.5245708186095275e-05, + "loss": 1.1315946578979492, + "step": 168 + }, + { + "epoch": 0.4644808743169399, + "grad_norm": 0.12663501501083374, + "learning_rate": 1.5091951837204973e-05, + "loss": 1.218740463256836, + "step": 170 + }, + { + "epoch": 0.46994535519125685, + "grad_norm": 0.23885132372379303, + "learning_rate": 1.4936674912282525e-05, + "loss": 1.2896463871002197, + "step": 172 + }, + { + "epoch": 0.47540983606557374, + "grad_norm": 0.8457244634628296, + "learning_rate": 1.4779935112488597e-05, + "loss": 1.4194824695587158, + "step": 174 + }, + { + "epoch": 0.4808743169398907, + "grad_norm": 0.08335787802934647, + "learning_rate": 1.4621790682590556e-05, + "loss": 1.2021723985671997, + "step": 176 + }, + { + "epoch": 0.48633879781420764, + "grad_norm": 0.10485303401947021, + "learning_rate": 1.4462300389318635e-05, + "loss": 1.1588122844696045, + "step": 178 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 0.11188313364982605, + "learning_rate": 1.4301523499528099e-05, + "loss": 1.1702033281326294, + "step": 180 + }, + { + "epoch": 0.4972677595628415, + "grad_norm": 0.12386136502027512, + "learning_rate": 1.4139519758175602e-05, + "loss": 1.1308914422988892, + "step": 182 + }, + { + "epoch": 0.5027322404371585, + "grad_norm": 0.2648775577545166, + "learning_rate": 1.3976349366117861e-05, + "loss": 1.3322463035583496, + "step": 184 + }, + { + "epoch": 0.5081967213114754, + "grad_norm": 0.1508917361497879, + "learning_rate": 1.3812072957740898e-05, + "loss": 1.1352683305740356, + "step": 186 + }, + { + "epoch": 0.5136612021857924, + "grad_norm": 0.26597723364830017, + "learning_rate": 1.3646751578428231e-05, + "loss": 1.1803580522537231, + "step": 188 + }, + { + "epoch": 0.5191256830601093, + "grad_norm": 0.09911347925662994, + "learning_rate": 1.3480446661876295e-05, + "loss": 1.22324538230896, + "step": 190 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 0.2967340648174286, + "learning_rate": 1.3313220007265572e-05, + "loss": 1.1710972785949707, + "step": 192 + }, + { + "epoch": 0.5300546448087432, + "grad_norm": 0.09805179387331009, + "learning_rate": 1.3145133756295936e-05, + "loss": 1.194231390953064, + "step": 194 + }, + { + "epoch": 0.5355191256830601, + "grad_norm": 0.1383579820394516, + "learning_rate": 1.2976250370094668e-05, + "loss": 1.253728985786438, + "step": 196 + }, + { + "epoch": 0.5409836065573771, + "grad_norm": 0.0955134779214859, + "learning_rate": 1.2806632606005822e-05, + "loss": 1.0406149625778198, + "step": 198 + }, + { + "epoch": 0.546448087431694, + "grad_norm": 0.1062760204076767, + "learning_rate": 1.2636343494269479e-05, + "loss": 1.2303603887557983, + "step": 200 + }, + { + "epoch": 0.5519125683060109, + "grad_norm": 0.0993717610836029, + "learning_rate": 1.2465446314599609e-05, + "loss": 1.0265294313430786, + "step": 202 + }, + { + "epoch": 0.5573770491803278, + "grad_norm": 0.26846495270729065, + "learning_rate": 1.2294004572669228e-05, + "loss": 0.8886614441871643, + "step": 204 + }, + { + "epoch": 0.5628415300546448, + "grad_norm": 0.1153939738869667, + "learning_rate": 1.2122081976511581e-05, + "loss": 1.2078245878219604, + "step": 206 + }, + { + "epoch": 0.5683060109289617, + "grad_norm": 0.16638191044330597, + "learning_rate": 1.1949742412846142e-05, + "loss": 1.1955982446670532, + "step": 208 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 0.12783703207969666, + "learning_rate": 1.177704992333818e-05, + "loss": 1.194825291633606, + "step": 210 + }, + { + "epoch": 0.5792349726775956, + "grad_norm": 0.10933943092823029, + "learning_rate": 1.1604068680800809e-05, + "loss": 1.4728552103042603, + "step": 212 + }, + { + "epoch": 0.5846994535519126, + "grad_norm": 0.09695154428482056, + "learning_rate": 1.1430862965348224e-05, + "loss": 1.187117099761963, + "step": 214 + }, + { + "epoch": 0.5901639344262295, + "grad_norm": 0.1849643737077713, + "learning_rate": 1.1257497140509141e-05, + "loss": 1.547567367553711, + "step": 216 + }, + { + "epoch": 0.5956284153005464, + "grad_norm": 0.11740607768297195, + "learning_rate": 1.1084035629309176e-05, + "loss": 1.1611906290054321, + "step": 218 + }, + { + "epoch": 0.6010928961748634, + "grad_norm": 0.20028969645500183, + "learning_rate": 1.0910542890331162e-05, + "loss": 1.2552645206451416, + "step": 220 + }, + { + "epoch": 0.6065573770491803, + "grad_norm": 0.21303501725196838, + "learning_rate": 1.0737083393762213e-05, + "loss": 1.0489044189453125, + "step": 222 + }, + { + "epoch": 0.6120218579234973, + "grad_norm": 0.47717615962028503, + "learning_rate": 1.0563721597436525e-05, + "loss": 1.0029484033584595, + "step": 224 + }, + { + "epoch": 0.6174863387978142, + "grad_norm": 0.09277418255805969, + "learning_rate": 1.039052192288271e-05, + "loss": 1.2073832750320435, + "step": 226 + }, + { + "epoch": 0.6229508196721312, + "grad_norm": 1.184064269065857, + "learning_rate": 1.0217548731384677e-05, + "loss": 1.2144676446914673, + "step": 228 + }, + { + "epoch": 0.6284153005464481, + "grad_norm": 0.08467663079500198, + "learning_rate": 1.0044866300064842e-05, + "loss": 0.6675528287887573, + "step": 230 + }, + { + "epoch": 0.6338797814207651, + "grad_norm": 0.12579751014709473, + "learning_rate": 9.872538797998672e-06, + "loss": 1.21137273311615, + "step": 232 + }, + { + "epoch": 0.639344262295082, + "grad_norm": 0.20576755702495575, + "learning_rate": 9.700630262369337e-06, + "loss": 0.9463342428207397, + "step": 234 + }, + { + "epoch": 0.644808743169399, + "grad_norm": 0.11935203522443771, + "learning_rate": 9.529204574671391e-06, + "loss": 0.8104769587516785, + "step": 236 + }, + { + "epoch": 0.6502732240437158, + "grad_norm": 0.1750505119562149, + "learning_rate": 9.3583254369723e-06, + "loss": 1.2798433303833008, + "step": 238 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 0.2776470184326172, + "learning_rate": 9.188056348240655e-06, + "loss": 1.2103146314620972, + "step": 240 + }, + { + "epoch": 0.6612021857923497, + "grad_norm": 0.3298132121562958, + "learning_rate": 9.018460580749842e-06, + "loss": 1.1388304233551025, + "step": 242 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.11938371509313583, + "learning_rate": 8.849601156565972e-06, + "loss": 0.9363460540771484, + "step": 244 + }, + { + "epoch": 0.6721311475409836, + "grad_norm": 0.09460209310054779, + "learning_rate": 8.68154082412877e-06, + "loss": 1.1563094854354858, + "step": 246 + }, + { + "epoch": 0.6775956284153005, + "grad_norm": 0.15788520872592926, + "learning_rate": 8.514342034934159e-06, + "loss": 1.2280900478363037, + "step": 248 + }, + { + "epoch": 0.6830601092896175, + "grad_norm": 0.948505163192749, + "learning_rate": 8.348066920327163e-06, + "loss": 1.1906379461288452, + "step": 250 + }, + { + "epoch": 0.6885245901639344, + "grad_norm": 0.12994378805160522, + "learning_rate": 8.182777268413822e-06, + "loss": 1.0413599014282227, + "step": 252 + }, + { + "epoch": 0.6939890710382514, + "grad_norm": 0.1265498697757721, + "learning_rate": 8.018534501100611e-06, + "loss": 1.1458451747894287, + "step": 254 + }, + { + "epoch": 0.6994535519125683, + "grad_norm": 0.11216244846582413, + "learning_rate": 7.855399651269982e-06, + "loss": 1.1375198364257812, + "step": 256 + }, + { + "epoch": 0.7049180327868853, + "grad_norm": 0.10773710161447525, + "learning_rate": 7.6934333401004e-06, + "loss": 1.0727012157440186, + "step": 258 + }, + { + "epoch": 0.7103825136612022, + "grad_norm": 0.12872548401355743, + "learning_rate": 7.53269575453947e-06, + "loss": 1.1428802013397217, + "step": 260 + }, + { + "epoch": 0.7158469945355191, + "grad_norm": 0.20889542996883392, + "learning_rate": 7.373246624938324e-06, + "loss": 1.2273454666137695, + "step": 262 + }, + { + "epoch": 0.7213114754098361, + "grad_norm": 0.21219481527805328, + "learning_rate": 7.215145202855746e-06, + "loss": 1.1816091537475586, + "step": 264 + }, + { + "epoch": 0.726775956284153, + "grad_norm": 0.10101748257875443, + "learning_rate": 7.0584502390401865e-06, + "loss": 0.9967770576477051, + "step": 266 + }, + { + "epoch": 0.73224043715847, + "grad_norm": 0.22779417037963867, + "learning_rate": 6.903219961597891e-06, + "loss": 1.1276934146881104, + "step": 268 + }, + { + "epoch": 0.7377049180327869, + "grad_norm": 0.0829336941242218, + "learning_rate": 6.7495120543552475e-06, + "loss": 1.172729730606079, + "step": 270 + }, + { + "epoch": 0.7431693989071039, + "grad_norm": 0.1042322888970375, + "learning_rate": 6.59738363542336e-06, + "loss": 1.133504867553711, + "step": 272 + }, + { + "epoch": 0.7486338797814208, + "grad_norm": 0.12385440617799759, + "learning_rate": 6.446891235972894e-06, + "loss": 1.1612567901611328, + "step": 274 + }, + { + "epoch": 0.7540983606557377, + "grad_norm": 0.2772475779056549, + "learning_rate": 6.298090779226977e-06, + "loss": 1.207811951637268, + "step": 276 + }, + { + "epoch": 0.7595628415300546, + "grad_norm": 0.11887267976999283, + "learning_rate": 6.151037559680047e-06, + "loss": 1.1781216859817505, + "step": 278 + }, + { + "epoch": 0.7650273224043715, + "grad_norm": 0.24976657330989838, + "learning_rate": 6.005786222550319e-06, + "loss": 0.8156208992004395, + "step": 280 + }, + { + "epoch": 0.7704918032786885, + "grad_norm": 0.09995827078819275, + "learning_rate": 5.8623907434735515e-06, + "loss": 1.2025524377822876, + "step": 282 + }, + { + "epoch": 0.7759562841530054, + "grad_norm": 0.17814989387989044, + "learning_rate": 5.720904408445589e-06, + "loss": 1.1644903421401978, + "step": 284 + }, + { + "epoch": 0.7814207650273224, + "grad_norm": 0.08649664372205734, + "learning_rate": 5.581379794021202e-06, + "loss": 1.1790955066680908, + "step": 286 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 0.13469792902469635, + "learning_rate": 5.443868747776579e-06, + "loss": 1.1462353467941284, + "step": 288 + }, + { + "epoch": 0.7923497267759563, + "grad_norm": 0.08754783868789673, + "learning_rate": 5.308422369042644e-06, + "loss": 1.0591084957122803, + "step": 290 + }, + { + "epoch": 0.7978142076502732, + "grad_norm": 0.5429996252059937, + "learning_rate": 5.175090989916483e-06, + "loss": 1.1352485418319702, + "step": 292 + }, + { + "epoch": 0.8032786885245902, + "grad_norm": 0.0844668596982956, + "learning_rate": 5.043924156557844e-06, + "loss": 1.1482046842575073, + "step": 294 + }, + { + "epoch": 0.8087431693989071, + "grad_norm": 0.7184237837791443, + "learning_rate": 4.914970610777725e-06, + "loss": 1.4919184446334839, + "step": 296 + }, + { + "epoch": 0.8142076502732241, + "grad_norm": 0.13735006749629974, + "learning_rate": 4.788278271925802e-06, + "loss": 0.8253867626190186, + "step": 298 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 0.1467600017786026, + "learning_rate": 4.663894219083548e-06, + "loss": 1.5542583465576172, + "step": 300 + }, + { + "epoch": 0.825136612021858, + "grad_norm": 0.19855302572250366, + "learning_rate": 4.541864673569551e-06, + "loss": 1.1880155801773071, + "step": 302 + }, + { + "epoch": 0.8306010928961749, + "grad_norm": 0.06581619381904602, + "learning_rate": 4.422234981763613e-06, + "loss": 0.719045877456665, + "step": 304 + }, + { + "epoch": 0.8360655737704918, + "grad_norm": 0.24002951383590698, + "learning_rate": 4.305049598255946e-06, + "loss": 1.526334285736084, + "step": 306 + }, + { + "epoch": 0.8415300546448088, + "grad_norm": 0.12296368926763535, + "learning_rate": 4.190352069327777e-06, + "loss": 0.8090255856513977, + "step": 308 + }, + { + "epoch": 0.8469945355191257, + "grad_norm": 0.11732060462236404, + "learning_rate": 4.078185016769484e-06, + "loss": 1.188048005104065, + "step": 310 + }, + { + "epoch": 0.8524590163934426, + "grad_norm": 0.11243297159671783, + "learning_rate": 3.968590122042265e-06, + "loss": 1.1868927478790283, + "step": 312 + }, + { + "epoch": 0.8579234972677595, + "grad_norm": 0.19882725179195404, + "learning_rate": 3.861608110789228e-06, + "loss": 1.194838285446167, + "step": 314 + }, + { + "epoch": 0.8633879781420765, + "grad_norm": 0.1381818950176239, + "learning_rate": 3.757278737701697e-06, + "loss": 1.203479528427124, + "step": 316 + }, + { + "epoch": 0.8688524590163934, + "grad_norm": 0.12520438432693481, + "learning_rate": 3.6556407717462856e-06, + "loss": 1.0257017612457275, + "step": 318 + }, + { + "epoch": 0.8743169398907104, + "grad_norm": 0.09709858149290085, + "learning_rate": 3.5567319817582944e-06, + "loss": 1.3691281080245972, + "step": 320 + }, + { + "epoch": 0.8797814207650273, + "grad_norm": 0.08708924800157547, + "learning_rate": 3.4605891224067423e-06, + "loss": 1.0822453498840332, + "step": 322 + }, + { + "epoch": 0.8852459016393442, + "grad_norm": 0.10322949290275574, + "learning_rate": 3.3672479205362764e-06, + "loss": 0.46839311718940735, + "step": 324 + }, + { + "epoch": 0.8907103825136612, + "grad_norm": 0.15026327967643738, + "learning_rate": 3.276743061891014e-06, + "loss": 1.159508228302002, + "step": 326 + }, + { + "epoch": 0.8961748633879781, + "grad_norm": 0.295035183429718, + "learning_rate": 3.1891081782252726e-06, + "loss": 1.0394455194473267, + "step": 328 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 0.11241143941879272, + "learning_rate": 3.1043758348059384e-06, + "loss": 1.2095626592636108, + "step": 330 + }, + { + "epoch": 0.907103825136612, + "grad_norm": 0.20652705430984497, + "learning_rate": 3.0225775183111784e-06, + "loss": 1.2480474710464478, + "step": 332 + }, + { + "epoch": 0.912568306010929, + "grad_norm": 0.12176910787820816, + "learning_rate": 2.943743625129917e-06, + "loss": 0.9860284924507141, + "step": 334 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 0.7734562754631042, + "learning_rate": 2.867903450066513e-06, + "loss": 1.1385536193847656, + "step": 336 + }, + { + "epoch": 0.9234972677595629, + "grad_norm": 0.1453840732574463, + "learning_rate": 2.795085175454741e-06, + "loss": 1.1383990049362183, + "step": 338 + }, + { + "epoch": 0.9289617486338798, + "grad_norm": 0.11631204187870026, + "learning_rate": 2.7253158606851983e-06, + "loss": 1.151549220085144, + "step": 340 + }, + { + "epoch": 0.9344262295081968, + "grad_norm": 0.4026157855987549, + "learning_rate": 2.6586214321499952e-06, + "loss": 1.23691987991333, + "step": 342 + }, + { + "epoch": 0.9398907103825137, + "grad_norm": 0.3281600773334503, + "learning_rate": 2.5950266736084558e-06, + "loss": 1.1340752840042114, + "step": 344 + }, + { + "epoch": 0.9453551912568307, + "grad_norm": 0.1483088731765747, + "learning_rate": 2.5345552169774413e-06, + "loss": 1.5082932710647583, + "step": 346 + }, + { + "epoch": 0.9508196721311475, + "grad_norm": 0.1787729412317276, + "learning_rate": 2.477229533549685e-06, + "loss": 0.7625555396080017, + "step": 348 + }, + { + "epoch": 0.9562841530054644, + "grad_norm": 0.09882060438394547, + "learning_rate": 2.423070925643422e-06, + "loss": 0.7790077924728394, + "step": 350 + }, + { + "epoch": 0.9617486338797814, + "grad_norm": 0.09993410110473633, + "learning_rate": 2.372099518686416e-06, + "loss": 1.1514798402786255, + "step": 352 + }, + { + "epoch": 0.9672131147540983, + "grad_norm": 0.0816822201013565, + "learning_rate": 2.324334253737321e-06, + "loss": 0.8181242942810059, + "step": 354 + }, + { + "epoch": 0.9726775956284153, + "grad_norm": 0.13193772733211517, + "learning_rate": 2.2797928804471413e-06, + "loss": 1.0354417562484741, + "step": 356 + }, + { + "epoch": 0.9781420765027322, + "grad_norm": 0.09249108284711838, + "learning_rate": 2.2384919504634465e-06, + "loss": 0.6940630674362183, + "step": 358 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 0.10684558004140854, + "learning_rate": 2.2004468112797345e-06, + "loss": 1.154773473739624, + "step": 360 + }, + { + "epoch": 0.9890710382513661, + "grad_norm": 0.19959105551242828, + "learning_rate": 2.165671600532298e-06, + "loss": 0.9004580974578857, + "step": 362 + }, + { + "epoch": 0.994535519125683, + "grad_norm": 0.08442464470863342, + "learning_rate": 2.134179240746638e-06, + "loss": 1.1354343891143799, + "step": 364 + }, + { + "epoch": 1.0, + "grad_norm": 0.13439127802848816, + "learning_rate": 2.1059814345354434e-06, + "loss": 1.111499309539795, + "step": 366 + }, + { + "epoch": 1.005464480874317, + "grad_norm": 0.3562772274017334, + "learning_rate": 2.0810886602498733e-06, + "loss": 1.0153237581253052, + "step": 368 + }, + { + "epoch": 1.010928961748634, + "grad_norm": 0.1327170431613922, + "learning_rate": 2.059510168085791e-06, + "loss": 1.090681791305542, + "step": 370 + }, + { + "epoch": 1.0163934426229508, + "grad_norm": 0.08891897648572922, + "learning_rate": 2.0412539766463697e-06, + "loss": 1.055408000946045, + "step": 372 + }, + { + "epoch": 1.0218579234972678, + "grad_norm": 0.11671415716409683, + "learning_rate": 2.0263268699623746e-06, + "loss": 1.0153884887695312, + "step": 374 + }, + { + "epoch": 1.0273224043715847, + "grad_norm": 0.1955808699131012, + "learning_rate": 2.0147343949711965e-06, + "loss": 0.8129119277000427, + "step": 376 + }, + { + "epoch": 1.0327868852459017, + "grad_norm": 0.11424767225980759, + "learning_rate": 2.0064808594556066e-06, + "loss": 1.0032305717468262, + "step": 378 + }, + { + "epoch": 1.0382513661202186, + "grad_norm": 0.09951414912939072, + "learning_rate": 2.0015693304429757e-06, + "loss": 0.6713349223136902, + "step": 380 + }, + { + "epoch": 1.0437158469945356, + "grad_norm": 0.157700777053833, + "learning_rate": 2.000001633065562e-06, + "loss": 0.9018054008483887, + "step": 382 + }, + { + "epoch": 1.0491803278688525, + "grad_norm": 0.11729155480861664, + "learning_rate": 2.0017783498822896e-06, + "loss": 1.0324310064315796, + "step": 384 + }, + { + "epoch": 1.0546448087431695, + "grad_norm": 0.12463416904211044, + "learning_rate": 2.006898820662268e-06, + "loss": 0.6445884704589844, + "step": 386 + }, + { + "epoch": 1.0601092896174864, + "grad_norm": 0.47830504179000854, + "learning_rate": 2.0153611426301325e-06, + "loss": 0.5118352174758911, + "step": 388 + }, + { + "epoch": 1.0655737704918034, + "grad_norm": 0.20129992067813873, + "learning_rate": 2.027162171173126e-06, + "loss": 1.1220747232437134, + "step": 390 + }, + { + "epoch": 1.0710382513661203, + "grad_norm": 0.10175406187772751, + "learning_rate": 2.0422975210096317e-06, + "loss": 0.9721101522445679, + "step": 392 + }, + { + "epoch": 1.0765027322404372, + "grad_norm": 0.15038686990737915, + "learning_rate": 2.0607615678187605e-06, + "loss": 0.5728106498718262, + "step": 394 + }, + { + "epoch": 1.0819672131147542, + "grad_norm": 0.12541523575782776, + "learning_rate": 2.082547450330353e-06, + "loss": 1.0776091814041138, + "step": 396 + }, + { + "epoch": 1.0874316939890711, + "grad_norm": 0.1422613561153412, + "learning_rate": 2.1076470728746407e-06, + "loss": 0.9820076823234558, + "step": 398 + }, + { + "epoch": 1.092896174863388, + "grad_norm": 0.4176431894302368, + "learning_rate": 2.136051108390608e-06, + "loss": 1.1005867719650269, + "step": 400 + }, + { + "epoch": 1.098360655737705, + "grad_norm": 0.17192092537879944, + "learning_rate": 2.167749001891944e-06, + "loss": 0.9484550356864929, + "step": 402 + }, + { + "epoch": 1.1038251366120218, + "grad_norm": 0.19980178773403168, + "learning_rate": 2.202728974389296e-06, + "loss": 1.039968729019165, + "step": 404 + }, + { + "epoch": 1.1092896174863387, + "grad_norm": 0.3821330666542053, + "learning_rate": 2.240978027267357e-06, + "loss": 1.1210821866989136, + "step": 406 + }, + { + "epoch": 1.1147540983606556, + "grad_norm": 0.17232535779476166, + "learning_rate": 2.2824819471151736e-06, + "loss": 0.6310506463050842, + "step": 408 + }, + { + "epoch": 1.1202185792349726, + "grad_norm": 0.1672593653202057, + "learning_rate": 2.327225311007878e-06, + "loss": 1.1975295543670654, + "step": 410 + }, + { + "epoch": 1.1256830601092895, + "grad_norm": 0.1546320766210556, + "learning_rate": 2.3751914922378623e-06, + "loss": 0.6439824104309082, + "step": 412 + }, + { + "epoch": 1.1311475409836065, + "grad_norm": 0.12194650620222092, + "learning_rate": 2.4263626664932998e-06, + "loss": 0.7317380309104919, + "step": 414 + }, + { + "epoch": 1.1366120218579234, + "grad_norm": 0.4037162959575653, + "learning_rate": 2.4807198184816817e-06, + "loss": 0.6151731610298157, + "step": 416 + }, + { + "epoch": 1.1420765027322404, + "grad_norm": 0.14404965937137604, + "learning_rate": 2.5382427489959373e-06, + "loss": 1.0358096361160278, + "step": 418 + }, + { + "epoch": 1.1475409836065573, + "grad_norm": 0.14871248602867126, + "learning_rate": 2.5989100824204876e-06, + "loss": 0.9391909837722778, + "step": 420 + }, + { + "epoch": 1.1530054644808743, + "grad_norm": 0.10675734281539917, + "learning_rate": 2.662699274674462e-06, + "loss": 1.0057426691055298, + "step": 422 + }, + { + "epoch": 1.1584699453551912, + "grad_norm": 0.14187511801719666, + "learning_rate": 2.7295866215891107e-06, + "loss": 0.7259042859077454, + "step": 424 + }, + { + "epoch": 1.1639344262295082, + "grad_norm": 0.23252590000629425, + "learning_rate": 2.799547267716326e-06, + "loss": 0.6342077851295471, + "step": 426 + }, + { + "epoch": 1.169398907103825, + "grad_norm": 0.11477074027061462, + "learning_rate": 2.872555215564946e-06, + "loss": 1.1057250499725342, + "step": 428 + }, + { + "epoch": 1.174863387978142, + "grad_norm": 0.12571924924850464, + "learning_rate": 2.9485833352614895e-06, + "loss": 1.0438354015350342, + "step": 430 + }, + { + "epoch": 1.180327868852459, + "grad_norm": 0.11532790958881378, + "learning_rate": 3.027603374631647e-06, + "loss": 1.0102614164352417, + "step": 432 + }, + { + "epoch": 1.185792349726776, + "grad_norm": 0.12962767481803894, + "learning_rate": 3.1095859696988273e-06, + "loss": 0.7555112242698669, + "step": 434 + }, + { + "epoch": 1.1912568306010929, + "grad_norm": 0.16368405520915985, + "learning_rate": 3.1945006555958885e-06, + "loss": 1.0447275638580322, + "step": 436 + }, + { + "epoch": 1.1967213114754098, + "grad_norm": 0.10939529538154602, + "learning_rate": 3.2823158778858976e-06, + "loss": 1.0501242876052856, + "step": 438 + }, + { + "epoch": 1.2021857923497268, + "grad_norm": 0.28014352917671204, + "learning_rate": 3.372999004287839e-06, + "loss": 0.8902407288551331, + "step": 440 + }, + { + "epoch": 1.2076502732240437, + "grad_norm": 0.22510351240634918, + "learning_rate": 3.4665163368028044e-06, + "loss": 1.0529145002365112, + "step": 442 + }, + { + "epoch": 1.2131147540983607, + "grad_norm": 0.11745640635490417, + "learning_rate": 3.562833124236238e-06, + "loss": 1.0241260528564453, + "step": 444 + }, + { + "epoch": 1.2185792349726776, + "grad_norm": 0.10882380604743958, + "learning_rate": 3.6619135751115325e-06, + "loss": 0.5796948075294495, + "step": 446 + }, + { + "epoch": 1.2240437158469946, + "grad_norm": 0.11475440114736557, + "learning_rate": 3.763720870970201e-06, + "loss": 0.9688222408294678, + "step": 448 + }, + { + "epoch": 1.2295081967213115, + "grad_norm": 0.11639797687530518, + "learning_rate": 3.86821718005367e-06, + "loss": 1.0046788454055786, + "step": 450 + }, + { + "epoch": 1.2349726775956285, + "grad_norm": 0.16791078448295593, + "learning_rate": 3.975363671361641e-06, + "loss": 0.9738149642944336, + "step": 452 + }, + { + "epoch": 1.2404371584699454, + "grad_norm": 0.27022409439086914, + "learning_rate": 4.0851205290817254e-06, + "loss": 0.25824788212776184, + "step": 454 + }, + { + "epoch": 1.2459016393442623, + "grad_norm": 0.3601982295513153, + "learning_rate": 4.197446967385105e-06, + "loss": 0.7551949620246887, + "step": 456 + }, + { + "epoch": 1.2513661202185793, + "grad_norm": 0.3413498103618622, + "learning_rate": 4.312301245582571e-06, + "loss": 0.6455928683280945, + "step": 458 + }, + { + "epoch": 1.2568306010928962, + "grad_norm": 0.15276941657066345, + "learning_rate": 4.429640683635466e-06, + "loss": 1.0521637201309204, + "step": 460 + }, + { + "epoch": 1.2622950819672132, + "grad_norm": 0.28445929288864136, + "learning_rate": 4.549421678015633e-06, + "loss": 0.6878266334533691, + "step": 462 + }, + { + "epoch": 1.2677595628415301, + "grad_norm": 0.34402215480804443, + "learning_rate": 4.671599717908582e-06, + "loss": 0.9702850580215454, + "step": 464 + }, + { + "epoch": 1.273224043715847, + "grad_norm": 0.12396188825368881, + "learning_rate": 4.796129401753752e-06, + "loss": 1.006730556488037, + "step": 466 + }, + { + "epoch": 1.278688524590164, + "grad_norm": 0.1135745644569397, + "learning_rate": 4.922964454115837e-06, + "loss": 0.9473859667778015, + "step": 468 + }, + { + "epoch": 1.2841530054644807, + "grad_norm": 0.1671561449766159, + "learning_rate": 5.0520577428807835e-06, + "loss": 0.9789319634437561, + "step": 470 + }, + { + "epoch": 1.289617486338798, + "grad_norm": 0.09278880804777145, + "learning_rate": 5.183361296770197e-06, + "loss": 1.2953110933303833, + "step": 472 + }, + { + "epoch": 1.2950819672131146, + "grad_norm": 0.14171801507472992, + "learning_rate": 5.316826323167505e-06, + "loss": 0.9902119636535645, + "step": 474 + }, + { + "epoch": 1.3005464480874318, + "grad_norm": 0.233398899435997, + "learning_rate": 5.4524032262494175e-06, + "loss": 0.49475741386413574, + "step": 476 + }, + { + "epoch": 1.3060109289617485, + "grad_norm": 0.15780876576900482, + "learning_rate": 5.590041625415783e-06, + "loss": 1.0600972175598145, + "step": 478 + }, + { + "epoch": 1.3114754098360657, + "grad_norm": 0.10637230426073074, + "learning_rate": 5.7296903740111076e-06, + "loss": 1.0665287971496582, + "step": 480 + }, + { + "epoch": 1.3169398907103824, + "grad_norm": 0.09512165188789368, + "learning_rate": 5.87129757833077e-06, + "loss": 0.8561606407165527, + "step": 482 + }, + { + "epoch": 1.3224043715846996, + "grad_norm": 0.19713814556598663, + "learning_rate": 6.014810616904747e-06, + "loss": 0.6718865633010864, + "step": 484 + }, + { + "epoch": 1.3278688524590163, + "grad_norm": 0.11173432320356369, + "learning_rate": 6.160176160051906e-06, + "loss": 0.9568278789520264, + "step": 486 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.1460486352443695, + "learning_rate": 6.307340189697344e-06, + "loss": 1.088982105255127, + "step": 488 + }, + { + "epoch": 1.3387978142076502, + "grad_norm": 0.17626726627349854, + "learning_rate": 6.456248019445626e-06, + "loss": 0.632489800453186, + "step": 490 + }, + { + "epoch": 1.3442622950819672, + "grad_norm": 0.0857512503862381, + "learning_rate": 6.606844314902321e-06, + "loss": 1.0565383434295654, + "step": 492 + }, + { + "epoch": 1.349726775956284, + "grad_norm": 0.04572976753115654, + "learning_rate": 6.7590731142363915e-06, + "loss": 0.8708642721176147, + "step": 494 + }, + { + "epoch": 1.355191256830601, + "grad_norm": 0.25216469168663025, + "learning_rate": 6.912877848975638e-06, + "loss": 1.0493139028549194, + "step": 496 + }, + { + "epoch": 1.360655737704918, + "grad_norm": 0.2923753559589386, + "learning_rate": 7.068201365027712e-06, + "loss": 0.8828723430633545, + "step": 498 + }, + { + "epoch": 1.366120218579235, + "grad_norm": 0.21058304607868195, + "learning_rate": 7.2249859439185875e-06, + "loss": 1.1987463235855103, + "step": 500 + }, + { + "epoch": 1.3715846994535519, + "grad_norm": 0.12390316277742386, + "learning_rate": 7.3831733242409285e-06, + "loss": 0.9241117835044861, + "step": 502 + }, + { + "epoch": 1.3770491803278688, + "grad_norm": 0.09202788770198822, + "learning_rate": 7.5427047233040485e-06, + "loss": 0.5438337922096252, + "step": 504 + }, + { + "epoch": 1.3825136612021858, + "grad_norm": 0.09361044317483902, + "learning_rate": 7.703520858977702e-06, + "loss": 0.8399773240089417, + "step": 506 + }, + { + "epoch": 1.3879781420765027, + "grad_norm": 0.11776993423700333, + "learning_rate": 7.865561971721389e-06, + "loss": 1.012689471244812, + "step": 508 + }, + { + "epoch": 1.3934426229508197, + "grad_norm": 0.11572909355163574, + "learning_rate": 8.02876784679115e-06, + "loss": 1.0279278755187988, + "step": 510 + }, + { + "epoch": 1.3989071038251366, + "grad_norm": 0.15213985741138458, + "learning_rate": 8.193077836615386e-06, + "loss": 1.1416020393371582, + "step": 512 + }, + { + "epoch": 1.4043715846994536, + "grad_norm": 0.04757779836654663, + "learning_rate": 8.35843088333168e-06, + "loss": 0.7897922396659851, + "step": 514 + }, + { + "epoch": 1.4098360655737705, + "grad_norm": 0.1259109377861023, + "learning_rate": 8.524765541475935e-06, + "loss": 0.7295854687690735, + "step": 516 + }, + { + "epoch": 1.4153005464480874, + "grad_norm": 0.7025004625320435, + "learning_rate": 8.692020000815627e-06, + "loss": 1.0215600728988647, + "step": 518 + }, + { + "epoch": 1.4207650273224044, + "grad_norm": 0.3152309060096741, + "learning_rate": 8.860132109318622e-06, + "loss": 0.5573855638504028, + "step": 520 + }, + { + "epoch": 1.4262295081967213, + "grad_norm": 0.14202211797237396, + "learning_rate": 9.029039396248916e-06, + "loss": 1.0810049772262573, + "step": 522 + }, + { + "epoch": 1.4316939890710383, + "grad_norm": 0.15890736877918243, + "learning_rate": 9.198679095380924e-06, + "loss": 0.958807647228241, + "step": 524 + }, + { + "epoch": 1.4371584699453552, + "grad_norm": 0.440880686044693, + "learning_rate": 9.368988168323451e-06, + "loss": 0.9619909524917603, + "step": 526 + }, + { + "epoch": 1.4426229508196722, + "grad_norm": 0.09399222582578659, + "learning_rate": 9.539903327944926e-06, + "loss": 0.6587854623794556, + "step": 528 + }, + { + "epoch": 1.4480874316939891, + "grad_norm": 0.43031588196754456, + "learning_rate": 9.711361061890942e-06, + "loss": 0.7227563261985779, + "step": 530 + }, + { + "epoch": 1.453551912568306, + "grad_norm": 0.2343519926071167, + "learning_rate": 9.8832976561856e-06, + "loss": 0.6935963034629822, + "step": 532 + }, + { + "epoch": 1.459016393442623, + "grad_norm": 0.8154575824737549, + "learning_rate": 1.0055649218907688e-05, + "loss": 1.0704255104064941, + "step": 534 + }, + { + "epoch": 1.46448087431694, + "grad_norm": 0.1126905083656311, + "learning_rate": 1.0228351703933075e-05, + "loss": 1.0404987335205078, + "step": 536 + }, + { + "epoch": 1.469945355191257, + "grad_norm": 0.11301713436841965, + "learning_rate": 1.0401340934734287e-05, + "loss": 1.0789377689361572, + "step": 538 + }, + { + "epoch": 1.4754098360655736, + "grad_norm": 0.07608981430530548, + "learning_rate": 1.0574552628228691e-05, + "loss": 1.0320706367492676, + "step": 540 + }, + { + "epoch": 1.4808743169398908, + "grad_norm": 0.3283179700374603, + "learning_rate": 1.0747922418666115e-05, + "loss": 1.0364580154418945, + "step": 542 + }, + { + "epoch": 1.4863387978142075, + "grad_norm": 0.16681601107120514, + "learning_rate": 1.0921385881547311e-05, + "loss": 0.8939827680587769, + "step": 544 + }, + { + "epoch": 1.4918032786885247, + "grad_norm": 0.21713408827781677, + "learning_rate": 1.1094878557564217e-05, + "loss": 1.1266794204711914, + "step": 546 + }, + { + "epoch": 1.4972677595628414, + "grad_norm": 0.14162234961986542, + "learning_rate": 1.1268335976553098e-05, + "loss": 0.37908974289894104, + "step": 548 + }, + { + "epoch": 1.5027322404371586, + "grad_norm": 0.08663295209407806, + "learning_rate": 1.144169368145179e-05, + "loss": 1.0336085557937622, + "step": 550 + }, + { + "epoch": 1.5081967213114753, + "grad_norm": 0.1834978312253952, + "learning_rate": 1.1614887252252076e-05, + "loss": 1.0525389909744263, + "step": 552 + }, + { + "epoch": 1.5136612021857925, + "grad_norm": 0.13017012178897858, + "learning_rate": 1.1787852329938198e-05, + "loss": 1.0565928220748901, + "step": 554 + }, + { + "epoch": 1.5191256830601092, + "grad_norm": 0.10501047968864441, + "learning_rate": 1.1960524640402862e-05, + "loss": 1.1221007108688354, + "step": 556 + }, + { + "epoch": 1.5245901639344264, + "grad_norm": 0.0911460742354393, + "learning_rate": 1.2132840018331514e-05, + "loss": 0.7799986004829407, + "step": 558 + }, + { + "epoch": 1.530054644808743, + "grad_norm": 0.12136353552341461, + "learning_rate": 1.2304734431046335e-05, + "loss": 0.6321024298667908, + "step": 560 + }, + { + "epoch": 1.5355191256830603, + "grad_norm": 0.6391102075576782, + "learning_rate": 1.2476144002300864e-05, + "loss": 0.8896136283874512, + "step": 562 + }, + { + "epoch": 1.540983606557377, + "grad_norm": 0.21880479156970978, + "learning_rate": 1.264700503601655e-05, + "loss": 0.7882946133613586, + "step": 564 + }, + { + "epoch": 1.5464480874316942, + "grad_norm": 0.10457222163677216, + "learning_rate": 1.2817254039952253e-05, + "loss": 1.0469473600387573, + "step": 566 + }, + { + "epoch": 1.5519125683060109, + "grad_norm": 0.21689452230930328, + "learning_rate": 1.2986827749298138e-05, + "loss": 0.7880579233169556, + "step": 568 + }, + { + "epoch": 1.5573770491803278, + "grad_norm": 0.10684338957071304, + "learning_rate": 1.3155663150184942e-05, + "loss": 1.1512001752853394, + "step": 570 + }, + { + "epoch": 1.5628415300546448, + "grad_norm": 0.08281126618385315, + "learning_rate": 1.3323697503100035e-05, + "loss": 1.0108344554901123, + "step": 572 + }, + { + "epoch": 1.5683060109289617, + "grad_norm": 0.08205148577690125, + "learning_rate": 1.3490868366201527e-05, + "loss": 1.0739396810531616, + "step": 574 + }, + { + "epoch": 1.5737704918032787, + "grad_norm": 0.14288611710071564, + "learning_rate": 1.3657113618521763e-05, + "loss": 0.7893604040145874, + "step": 576 + }, + { + "epoch": 1.5792349726775956, + "grad_norm": 0.1151369959115982, + "learning_rate": 1.3822371483051593e-05, + "loss": 1.0953106880187988, + "step": 578 + }, + { + "epoch": 1.5846994535519126, + "grad_norm": 0.12557563185691833, + "learning_rate": 1.3986580549696777e-05, + "loss": 0.5041826367378235, + "step": 580 + }, + { + "epoch": 1.5901639344262295, + "grad_norm": 0.09901826828718185, + "learning_rate": 1.4149679798098097e-05, + "loss": 0.6635845899581909, + "step": 582 + }, + { + "epoch": 1.5956284153005464, + "grad_norm": 0.1395505666732788, + "learning_rate": 1.4311608620306626e-05, + "loss": 0.6669992804527283, + "step": 584 + }, + { + "epoch": 1.6010928961748634, + "grad_norm": 0.1357533186674118, + "learning_rate": 1.447230684330573e-05, + "loss": 1.0561128854751587, + "step": 586 + }, + { + "epoch": 1.6065573770491803, + "grad_norm": 0.07983239740133286, + "learning_rate": 1.4631714751371456e-05, + "loss": 1.016318440437317, + "step": 588 + }, + { + "epoch": 1.6120218579234973, + "grad_norm": 0.11813154071569443, + "learning_rate": 1.4789773108263016e-05, + "loss": 0.7854833006858826, + "step": 590 + }, + { + "epoch": 1.6174863387978142, + "grad_norm": 0.14835046231746674, + "learning_rate": 1.4946423179235068e-05, + "loss": 1.0739095211029053, + "step": 592 + }, + { + "epoch": 1.6229508196721312, + "grad_norm": 0.17924216389656067, + "learning_rate": 1.5101606752863606e-05, + "loss": 0.840874433517456, + "step": 594 + }, + { + "epoch": 1.6284153005464481, + "grad_norm": 0.10418872535228729, + "learning_rate": 1.5255266162677466e-05, + "loss": 0.336093544960022, + "step": 596 + }, + { + "epoch": 1.633879781420765, + "grad_norm": 0.13057978451251984, + "learning_rate": 1.540734430858725e-05, + "loss": 1.1894792318344116, + "step": 598 + }, + { + "epoch": 1.639344262295082, + "grad_norm": 0.12744714319705963, + "learning_rate": 1.5557784678103852e-05, + "loss": 1.1345218420028687, + "step": 600 + }, + { + "epoch": 1.644808743169399, + "grad_norm": 0.11343944817781448, + "learning_rate": 1.5706531367338546e-05, + "loss": 1.0770057439804077, + "step": 602 + }, + { + "epoch": 1.650273224043716, + "grad_norm": 0.10060634464025497, + "learning_rate": 1.5853529101776985e-05, + "loss": 1.0950847864151, + "step": 604 + }, + { + "epoch": 1.6557377049180326, + "grad_norm": 0.3996407985687256, + "learning_rate": 1.5998723256819298e-05, + "loss": 1.1565852165222168, + "step": 606 + }, + { + "epoch": 1.6612021857923498, + "grad_norm": 0.1252470165491104, + "learning_rate": 1.614205987807872e-05, + "loss": 0.9427963495254517, + "step": 608 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.08708234131336212, + "learning_rate": 1.628348570143105e-05, + "loss": 1.1373974084854126, + "step": 610 + }, + { + "epoch": 1.6721311475409837, + "grad_norm": 0.6713199019432068, + "learning_rate": 1.6422948172807745e-05, + "loss": 0.7651968002319336, + "step": 612 + }, + { + "epoch": 1.6775956284153004, + "grad_norm": 0.18165543675422668, + "learning_rate": 1.6560395467725086e-05, + "loss": 1.0242265462875366, + "step": 614 + }, + { + "epoch": 1.6830601092896176, + "grad_norm": 0.21526610851287842, + "learning_rate": 1.6695776510542253e-05, + "loss": 1.1028822660446167, + "step": 616 + }, + { + "epoch": 1.6885245901639343, + "grad_norm": 0.09745687246322632, + "learning_rate": 1.6829040993441085e-05, + "loss": 0.6104081869125366, + "step": 618 + }, + { + "epoch": 1.6939890710382515, + "grad_norm": 0.1572287529706955, + "learning_rate": 1.696013939512057e-05, + "loss": 1.0959326028823853, + "step": 620 + }, + { + "epoch": 1.6994535519125682, + "grad_norm": 0.17827695608139038, + "learning_rate": 1.7089022999199064e-05, + "loss": 1.233556866645813, + "step": 622 + }, + { + "epoch": 1.7049180327868854, + "grad_norm": 0.1473863422870636, + "learning_rate": 1.7215643912317323e-05, + "loss": 0.732525110244751, + "step": 624 + }, + { + "epoch": 1.710382513661202, + "grad_norm": 0.12968477606773376, + "learning_rate": 1.73399550819358e-05, + "loss": 0.6740303635597229, + "step": 626 + }, + { + "epoch": 1.7158469945355193, + "grad_norm": 0.06875219196081161, + "learning_rate": 1.746191031381943e-05, + "loss": 1.095910906791687, + "step": 628 + }, + { + "epoch": 1.721311475409836, + "grad_norm": 0.13739925622940063, + "learning_rate": 1.7581464289203475e-05, + "loss": 0.8012250661849976, + "step": 630 + }, + { + "epoch": 1.7267759562841531, + "grad_norm": 0.16408398747444153, + "learning_rate": 1.7698572581634083e-05, + "loss": 1.0673834085464478, + "step": 632 + }, + { + "epoch": 1.7322404371584699, + "grad_norm": 0.09738543629646301, + "learning_rate": 1.781319167347718e-05, + "loss": 1.1305607557296753, + "step": 634 + }, + { + "epoch": 1.737704918032787, + "grad_norm": 0.09281567484140396, + "learning_rate": 1.7925278972089748e-05, + "loss": 0.9764953851699829, + "step": 636 + }, + { + "epoch": 1.7431693989071038, + "grad_norm": 0.09890922904014587, + "learning_rate": 1.8034792825647287e-05, + "loss": 1.0016361474990845, + "step": 638 + }, + { + "epoch": 1.748633879781421, + "grad_norm": 0.34490931034088135, + "learning_rate": 1.8141692538621716e-05, + "loss": 0.8936794996261597, + "step": 640 + }, + { + "epoch": 1.7540983606557377, + "grad_norm": 0.10054787993431091, + "learning_rate": 1.8245938386903896e-05, + "loss": 1.1207398176193237, + "step": 642 + }, + { + "epoch": 1.7595628415300546, + "grad_norm": 0.2044786661863327, + "learning_rate": 1.8347491632565156e-05, + "loss": 1.0461924076080322, + "step": 644 + }, + { + "epoch": 1.7650273224043715, + "grad_norm": 0.06816514581441879, + "learning_rate": 1.8446314538252407e-05, + "loss": 0.6392666101455688, + "step": 646 + }, + { + "epoch": 1.7704918032786885, + "grad_norm": 0.0854775607585907, + "learning_rate": 1.8542370381211374e-05, + "loss": 1.0239224433898926, + "step": 648 + }, + { + "epoch": 1.7759562841530054, + "grad_norm": 0.09861481189727783, + "learning_rate": 1.8635623466932843e-05, + "loss": 1.0791953802108765, + "step": 650 + }, + { + "epoch": 1.7814207650273224, + "grad_norm": 0.09283293783664703, + "learning_rate": 1.8726039142416796e-05, + "loss": 1.1317287683486938, + "step": 652 + }, + { + "epoch": 1.7868852459016393, + "grad_norm": 0.1488487869501114, + "learning_rate": 1.881358380904954e-05, + "loss": 0.9950046539306641, + "step": 654 + }, + { + "epoch": 1.7923497267759563, + "grad_norm": 0.4881657361984253, + "learning_rate": 1.889822493508897e-05, + "loss": 1.0590057373046875, + "step": 656 + }, + { + "epoch": 1.7978142076502732, + "grad_norm": 0.1867472380399704, + "learning_rate": 1.897993106775346e-05, + "loss": 0.9436238408088684, + "step": 658 + }, + { + "epoch": 1.8032786885245902, + "grad_norm": 0.19811779260635376, + "learning_rate": 1.9058671844909742e-05, + "loss": 1.0728529691696167, + "step": 660 + }, + { + "epoch": 1.8087431693989071, + "grad_norm": 0.0855625569820404, + "learning_rate": 1.9134418006355532e-05, + "loss": 1.1045749187469482, + "step": 662 + }, + { + "epoch": 1.814207650273224, + "grad_norm": 0.1337461769580841, + "learning_rate": 1.9207141404692667e-05, + "loss": 1.0828365087509155, + "step": 664 + }, + { + "epoch": 1.819672131147541, + "grad_norm": 0.08743517845869064, + "learning_rate": 1.927681501578672e-05, + "loss": 1.3723547458648682, + "step": 666 + }, + { + "epoch": 1.825136612021858, + "grad_norm": 0.0916970744729042, + "learning_rate": 1.934341294880924e-05, + "loss": 1.1413710117340088, + "step": 668 + }, + { + "epoch": 1.830601092896175, + "grad_norm": 0.13558737933635712, + "learning_rate": 1.9406910455858783e-05, + "loss": 0.7378701567649841, + "step": 670 + }, + { + "epoch": 1.8360655737704918, + "grad_norm": 0.23225541412830353, + "learning_rate": 1.9467283941157304e-05, + "loss": 1.2266825437545776, + "step": 672 + }, + { + "epoch": 1.8415300546448088, + "grad_norm": 0.12275075912475586, + "learning_rate": 1.952451096981838e-05, + "loss": 1.105077862739563, + "step": 674 + }, + { + "epoch": 1.8469945355191257, + "grad_norm": 0.08732561767101288, + "learning_rate": 1.957857027618405e-05, + "loss": 1.1363401412963867, + "step": 676 + }, + { + "epoch": 1.8524590163934427, + "grad_norm": 0.05239817127585411, + "learning_rate": 1.9629441771727166e-05, + "loss": 0.27085718512535095, + "step": 678 + }, + { + "epoch": 1.8579234972677594, + "grad_norm": 0.0899297297000885, + "learning_rate": 1.9677106552516317e-05, + "loss": 1.068323016166687, + "step": 680 + }, + { + "epoch": 1.8633879781420766, + "grad_norm": 0.07155214250087738, + "learning_rate": 1.9721546906240577e-05, + "loss": 1.0824123620986938, + "step": 682 + }, + { + "epoch": 1.8688524590163933, + "grad_norm": 0.09562800824642181, + "learning_rate": 1.976274631879142e-05, + "loss": 1.0398223400115967, + "step": 684 + }, + { + "epoch": 1.8743169398907105, + "grad_norm": 0.09326054900884628, + "learning_rate": 1.9800689480399383e-05, + "loss": 0.726788341999054, + "step": 686 + }, + { + "epoch": 1.8797814207650272, + "grad_norm": 0.16305072605609894, + "learning_rate": 1.9835362291323222e-05, + "loss": 1.152515172958374, + "step": 688 + }, + { + "epoch": 1.8852459016393444, + "grad_norm": 0.08776555955410004, + "learning_rate": 1.9866751867089363e-05, + "loss": 1.1057515144348145, + "step": 690 + }, + { + "epoch": 1.890710382513661, + "grad_norm": 0.34914299845695496, + "learning_rate": 1.9894846543279838e-05, + "loss": 0.7560766339302063, + "step": 692 + }, + { + "epoch": 1.8961748633879782, + "grad_norm": 0.13801546394824982, + "learning_rate": 1.991963587986677e-05, + "loss": 1.1189141273498535, + "step": 694 + }, + { + "epoch": 1.901639344262295, + "grad_norm": 0.33698874711990356, + "learning_rate": 1.9941110665091922e-05, + "loss": 1.122489333152771, + "step": 696 + }, + { + "epoch": 1.9071038251366121, + "grad_norm": 0.15018677711486816, + "learning_rate": 1.9959262918889774e-05, + "loss": 0.9824838638305664, + "step": 698 + }, + { + "epoch": 1.9125683060109289, + "grad_norm": 0.07216078788042068, + "learning_rate": 1.9974085895852973e-05, + "loss": 1.1157559156417847, + "step": 700 + }, + { + "epoch": 1.918032786885246, + "grad_norm": 0.07695994526147842, + "learning_rate": 1.99855740877389e-05, + "loss": 1.0270991325378418, + "step": 702 + }, + { + "epoch": 1.9234972677595628, + "grad_norm": 0.10790321230888367, + "learning_rate": 1.9993723225516553e-05, + "loss": 1.146857738494873, + "step": 704 + }, + { + "epoch": 1.92896174863388, + "grad_norm": 0.09250786900520325, + "learning_rate": 1.9998530280952938e-05, + "loss": 1.1385565996170044, + "step": 706 + }, + { + "epoch": 1.9344262295081966, + "grad_norm": 0.07818273454904556, + "learning_rate": 1.9999993467738345e-05, + "loss": 1.1723568439483643, + "step": 708 + }, + { + "epoch": 1.9398907103825138, + "grad_norm": 0.09822240471839905, + "learning_rate": 1.9998112242150162e-05, + "loss": 0.9018577337265015, + "step": 710 + }, + { + "epoch": 1.9453551912568305, + "grad_norm": 0.07508859038352966, + "learning_rate": 1.999288730325491e-05, + "loss": 0.6304239630699158, + "step": 712 + }, + { + "epoch": 1.9508196721311475, + "grad_norm": 0.18517723679542542, + "learning_rate": 1.9984320592648474e-05, + "loss": 1.108601689338684, + "step": 714 + }, + { + "epoch": 1.9562841530054644, + "grad_norm": 0.2125580906867981, + "learning_rate": 1.9972415293734607e-05, + "loss": 0.8507819175720215, + "step": 716 + }, + { + "epoch": 1.9617486338797814, + "grad_norm": 0.09725981950759888, + "learning_rate": 1.995717583054196e-05, + "loss": 1.1092121601104736, + "step": 718 + }, + { + "epoch": 1.9672131147540983, + "grad_norm": 0.16010290384292603, + "learning_rate": 1.9938607866080114e-05, + "loss": 1.0979053974151611, + "step": 720 + }, + { + "epoch": 1.9726775956284153, + "grad_norm": 0.10000172257423401, + "learning_rate": 1.991671830023521e-05, + "loss": 1.1372826099395752, + "step": 722 + }, + { + "epoch": 1.9781420765027322, + "grad_norm": 0.12482127547264099, + "learning_rate": 1.989151526720591e-05, + "loss": 1.0039864778518677, + "step": 724 + }, + { + "epoch": 1.9836065573770492, + "grad_norm": 0.19064390659332275, + "learning_rate": 1.986300813248073e-05, + "loss": 1.076819896697998, + "step": 726 + }, + { + "epoch": 1.989071038251366, + "grad_norm": 0.0709768533706665, + "learning_rate": 1.9831207489357825e-05, + "loss": 1.0897186994552612, + "step": 728 + }, + { + "epoch": 1.994535519125683, + "grad_norm": 0.1523556411266327, + "learning_rate": 1.979612515500847e-05, + "loss": 1.0046844482421875, + "step": 730 + }, + { + "epoch": 2.0, + "grad_norm": 0.07687041908502579, + "learning_rate": 1.97577741660858e-05, + "loss": 1.1337778568267822, + "step": 732 + }, + { + "epoch": 2.0054644808743167, + "grad_norm": 0.06963507831096649, + "learning_rate": 1.9716168773880382e-05, + "loss": 0.6142993569374084, + "step": 734 + }, + { + "epoch": 2.010928961748634, + "grad_norm": 0.08646684885025024, + "learning_rate": 1.9671324439024374e-05, + "loss": 0.8584022521972656, + "step": 736 + }, + { + "epoch": 2.0163934426229506, + "grad_norm": 0.09564807265996933, + "learning_rate": 1.9623257825746357e-05, + "loss": 0.8110524415969849, + "step": 738 + }, + { + "epoch": 2.021857923497268, + "grad_norm": 0.11263297498226166, + "learning_rate": 1.9571986795678878e-05, + "loss": 0.8699367642402649, + "step": 740 + }, + { + "epoch": 2.0273224043715845, + "grad_norm": 0.33164191246032715, + "learning_rate": 1.951753040122102e-05, + "loss": 0.8090985417366028, + "step": 742 + }, + { + "epoch": 2.0327868852459017, + "grad_norm": 0.13913966715335846, + "learning_rate": 1.9459908878458532e-05, + "loss": 0.9339433908462524, + "step": 744 + }, + { + "epoch": 2.0382513661202184, + "grad_norm": 0.1326131969690323, + "learning_rate": 1.939914363964402e-05, + "loss": 1.050421953201294, + "step": 746 + }, + { + "epoch": 2.0437158469945356, + "grad_norm": 0.11959031224250793, + "learning_rate": 1.9335257265240168e-05, + "loss": 1.0097161531448364, + "step": 748 + }, + { + "epoch": 2.0491803278688523, + "grad_norm": 0.20392794907093048, + "learning_rate": 1.9268273495528768e-05, + "loss": 0.7200921773910522, + "step": 750 + }, + { + "epoch": 2.0546448087431695, + "grad_norm": 0.3636847138404846, + "learning_rate": 1.9198217221788806e-05, + "loss": 0.605461597442627, + "step": 752 + }, + { + "epoch": 2.060109289617486, + "grad_norm": 1.16413414478302, + "learning_rate": 1.9125114477046807e-05, + "loss": 0.6320473551750183, + "step": 754 + }, + { + "epoch": 2.0655737704918034, + "grad_norm": 0.21701183915138245, + "learning_rate": 1.9048992426402947e-05, + "loss": 1.0904141664505005, + "step": 756 + }, + { + "epoch": 2.07103825136612, + "grad_norm": 0.14577004313468933, + "learning_rate": 1.896987935693643e-05, + "loss": 1.002879023551941, + "step": 758 + }, + { + "epoch": 2.0765027322404372, + "grad_norm": 0.2534794509410858, + "learning_rate": 1.888780466719397e-05, + "loss": 0.9008995890617371, + "step": 760 + }, + { + "epoch": 2.081967213114754, + "grad_norm": 0.13845816254615784, + "learning_rate": 1.8802798856265254e-05, + "loss": 1.0071299076080322, + "step": 762 + }, + { + "epoch": 2.087431693989071, + "grad_norm": 0.1185113936662674, + "learning_rate": 1.8714893512449424e-05, + "loss": 0.9370182752609253, + "step": 764 + }, + { + "epoch": 2.092896174863388, + "grad_norm": 0.1452389359474182, + "learning_rate": 1.8624121301516808e-05, + "loss": 1.0715157985687256, + "step": 766 + }, + { + "epoch": 2.098360655737705, + "grad_norm": 0.10411622375249863, + "learning_rate": 1.853051595457026e-05, + "loss": 0.8296530246734619, + "step": 768 + }, + { + "epoch": 2.1038251366120218, + "grad_norm": 0.26561427116394043, + "learning_rate": 1.843411225551065e-05, + "loss": 0.9580831527709961, + "step": 770 + }, + { + "epoch": 2.109289617486339, + "grad_norm": 0.29404425621032715, + "learning_rate": 1.8334946028111088e-05, + "loss": 0.8596486449241638, + "step": 772 + }, + { + "epoch": 2.1147540983606556, + "grad_norm": 0.08718276768922806, + "learning_rate": 1.8233054122704765e-05, + "loss": 0.8322362899780273, + "step": 774 + }, + { + "epoch": 2.120218579234973, + "grad_norm": 0.08565963804721832, + "learning_rate": 1.8128474402491286e-05, + "loss": 0.9187090396881104, + "step": 776 + }, + { + "epoch": 2.1256830601092895, + "grad_norm": 0.413748174905777, + "learning_rate": 1.802124572946668e-05, + "loss": 0.8059758543968201, + "step": 778 + }, + { + "epoch": 2.1311475409836067, + "grad_norm": 0.08740316331386566, + "learning_rate": 1.791140794998219e-05, + "loss": 0.8861984610557556, + "step": 780 + }, + { + "epoch": 2.1366120218579234, + "grad_norm": 0.06927383691072464, + "learning_rate": 1.7799001879937294e-05, + "loss": 0.9507087469100952, + "step": 782 + }, + { + "epoch": 2.1420765027322406, + "grad_norm": 0.08840641379356384, + "learning_rate": 1.768406928961248e-05, + "loss": 1.0136234760284424, + "step": 784 + }, + { + "epoch": 2.1475409836065573, + "grad_norm": 0.1088033989071846, + "learning_rate": 1.7566652888147328e-05, + "loss": 0.7027506828308105, + "step": 786 + }, + { + "epoch": 2.1530054644808745, + "grad_norm": 0.2902282178401947, + "learning_rate": 1.7446796307669725e-05, + "loss": 0.8324766755104065, + "step": 788 + }, + { + "epoch": 2.158469945355191, + "grad_norm": 0.12461357563734055, + "learning_rate": 1.732454408708209e-05, + "loss": 1.0693421363830566, + "step": 790 + }, + { + "epoch": 2.1639344262295084, + "grad_norm": 0.11356167495250702, + "learning_rate": 1.719994165551063e-05, + "loss": 0.6516219973564148, + "step": 792 + }, + { + "epoch": 2.169398907103825, + "grad_norm": 0.3933297097682953, + "learning_rate": 1.7073035315423838e-05, + "loss": 0.7549520134925842, + "step": 794 + }, + { + "epoch": 2.1748633879781423, + "grad_norm": 0.0880681499838829, + "learning_rate": 1.6943872225426396e-05, + "loss": 0.890471875667572, + "step": 796 + }, + { + "epoch": 2.180327868852459, + "grad_norm": 0.13298313319683075, + "learning_rate": 1.6812500382734977e-05, + "loss": 0.9139057397842407, + "step": 798 + }, + { + "epoch": 2.185792349726776, + "grad_norm": 0.22635987401008606, + "learning_rate": 1.6678968605342348e-05, + "loss": 0.8639968037605286, + "step": 800 + }, + { + "epoch": 2.191256830601093, + "grad_norm": 0.15127940475940704, + "learning_rate": 1.6543326513876602e-05, + "loss": 0.949240505695343, + "step": 802 + }, + { + "epoch": 2.19672131147541, + "grad_norm": 0.1014222800731659, + "learning_rate": 1.6405624513162002e-05, + "loss": 0.8963988423347473, + "step": 804 + }, + { + "epoch": 2.202185792349727, + "grad_norm": 0.08098163455724716, + "learning_rate": 1.6265913773488456e-05, + "loss": 0.9640499353408813, + "step": 806 + }, + { + "epoch": 2.2076502732240435, + "grad_norm": 0.5575600266456604, + "learning_rate": 1.6124246211596606e-05, + "loss": 0.8397250771522522, + "step": 808 + }, + { + "epoch": 2.2131147540983607, + "grad_norm": 0.09522633999586105, + "learning_rate": 1.598067447138542e-05, + "loss": 0.6495193839073181, + "step": 810 + }, + { + "epoch": 2.2185792349726774, + "grad_norm": 0.0872296467423439, + "learning_rate": 1.5835251904349688e-05, + "loss": 0.9478666186332703, + "step": 812 + }, + { + "epoch": 2.2240437158469946, + "grad_norm": 0.14140820503234863, + "learning_rate": 1.5688032549754453e-05, + "loss": 0.8577138185501099, + "step": 814 + }, + { + "epoch": 2.2295081967213113, + "grad_norm": 0.11184604465961456, + "learning_rate": 1.553907111455401e-05, + "loss": 0.8024045825004578, + "step": 816 + }, + { + "epoch": 2.2349726775956285, + "grad_norm": 0.1329488754272461, + "learning_rate": 1.538842295306264e-05, + "loss": 0.9610792994499207, + "step": 818 + }, + { + "epoch": 2.240437158469945, + "grad_norm": 0.1702757477760315, + "learning_rate": 1.5236144046384917e-05, + "loss": 0.9774719476699829, + "step": 820 + }, + { + "epoch": 2.2459016393442623, + "grad_norm": 0.26356950402259827, + "learning_rate": 1.5082290981612987e-05, + "loss": 1.0529048442840576, + "step": 822 + }, + { + "epoch": 2.251366120218579, + "grad_norm": 0.08985842764377594, + "learning_rate": 1.4926920930798736e-05, + "loss": 0.7446960806846619, + "step": 824 + }, + { + "epoch": 2.2568306010928962, + "grad_norm": 0.278573602437973, + "learning_rate": 1.4770091629708562e-05, + "loss": 0.5461352467536926, + "step": 826 + }, + { + "epoch": 2.262295081967213, + "grad_norm": 0.6094837784767151, + "learning_rate": 1.461186135636868e-05, + "loss": 0.7116088271141052, + "step": 828 + }, + { + "epoch": 2.26775956284153, + "grad_norm": 0.09571769088506699, + "learning_rate": 1.4452288909408864e-05, + "loss": 0.8776818513870239, + "step": 830 + }, + { + "epoch": 2.273224043715847, + "grad_norm": 0.15052512288093567, + "learning_rate": 1.4291433586212831e-05, + "loss": 0.5412632822990417, + "step": 832 + }, + { + "epoch": 2.278688524590164, + "grad_norm": 0.2911688983440399, + "learning_rate": 1.4129355160883216e-05, + "loss": 0.6831727623939514, + "step": 834 + }, + { + "epoch": 2.2841530054644807, + "grad_norm": 0.2999238073825836, + "learning_rate": 1.3966113862029429e-05, + "loss": 1.0725103616714478, + "step": 836 + }, + { + "epoch": 2.289617486338798, + "grad_norm": 0.33608266711235046, + "learning_rate": 1.3801770350386568e-05, + "loss": 0.9445420503616333, + "step": 838 + }, + { + "epoch": 2.2950819672131146, + "grad_norm": 0.11400384455919266, + "learning_rate": 1.363638569627384e-05, + "loss": 0.7809190154075623, + "step": 840 + }, + { + "epoch": 2.300546448087432, + "grad_norm": 0.07464587688446045, + "learning_rate": 1.3470021356900696e-05, + "loss": 0.9480050802230835, + "step": 842 + }, + { + "epoch": 2.3060109289617485, + "grad_norm": 0.0809248611330986, + "learning_rate": 1.3302739153529252e-05, + "loss": 1.0223753452301025, + "step": 844 + }, + { + "epoch": 2.3114754098360657, + "grad_norm": 0.0836072638630867, + "learning_rate": 1.3134601248501366e-05, + "loss": 0.5357997417449951, + "step": 846 + }, + { + "epoch": 2.3169398907103824, + "grad_norm": 0.10634348541498184, + "learning_rate": 1.2965670122139071e-05, + "loss": 0.9999401569366455, + "step": 848 + }, + { + "epoch": 2.3224043715846996, + "grad_norm": 0.09218322485685349, + "learning_rate": 1.2796008549526752e-05, + "loss": 0.9290571808815002, + "step": 850 + }, + { + "epoch": 2.3278688524590163, + "grad_norm": 0.13691577315330505, + "learning_rate": 1.262567957718378e-05, + "loss": 0.80766761302948, + "step": 852 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.5151943564414978, + "learning_rate": 1.2454746499636408e-05, + "loss": 0.9886870384216309, + "step": 854 + }, + { + "epoch": 2.33879781420765, + "grad_norm": 0.08581722527742386, + "learning_rate": 1.2283272835897359e-05, + "loss": 0.9574921727180481, + "step": 856 + }, + { + "epoch": 2.3442622950819674, + "grad_norm": 0.13452446460723877, + "learning_rate": 1.2111322305862088e-05, + "loss": 0.6717698574066162, + "step": 858 + }, + { + "epoch": 2.349726775956284, + "grad_norm": 0.10753488540649414, + "learning_rate": 1.1938958806630322e-05, + "loss": 0.8548917770385742, + "step": 860 + }, + { + "epoch": 2.3551912568306013, + "grad_norm": 0.0970119908452034, + "learning_rate": 1.1766246388761841e-05, + "loss": 0.9632062911987305, + "step": 862 + }, + { + "epoch": 2.360655737704918, + "grad_norm": 0.13308054208755493, + "learning_rate": 1.1593249232475162e-05, + "loss": 0.6643819808959961, + "step": 864 + }, + { + "epoch": 2.366120218579235, + "grad_norm": 0.24448184669017792, + "learning_rate": 1.142003162379808e-05, + "loss": 1.241449236869812, + "step": 866 + }, + { + "epoch": 2.371584699453552, + "grad_norm": 0.14363636076450348, + "learning_rate": 1.1246657930678817e-05, + "loss": 0.9594497680664062, + "step": 868 + }, + { + "epoch": 2.3770491803278686, + "grad_norm": 0.08895833790302277, + "learning_rate": 1.1073192579066867e-05, + "loss": 0.8083928227424622, + "step": 870 + }, + { + "epoch": 2.3825136612021858, + "grad_norm": 0.1053352952003479, + "learning_rate": 1.0899700028972169e-05, + "loss": 0.859566330909729, + "step": 872 + }, + { + "epoch": 2.387978142076503, + "grad_norm": 0.16995586454868317, + "learning_rate": 1.072624475051166e-05, + "loss": 0.9505580067634583, + "step": 874 + }, + { + "epoch": 2.3934426229508197, + "grad_norm": 0.42757925391197205, + "learning_rate": 1.055289119995206e-05, + "loss": 0.6944902539253235, + "step": 876 + }, + { + "epoch": 2.3989071038251364, + "grad_norm": 0.122571662068367, + "learning_rate": 1.0379703795757853e-05, + "loss": 0.8935880064964294, + "step": 878 + }, + { + "epoch": 2.4043715846994536, + "grad_norm": 0.09966963529586792, + "learning_rate": 1.0206746894653252e-05, + "loss": 0.9855829477310181, + "step": 880 + }, + { + "epoch": 2.4098360655737707, + "grad_norm": 0.09130659699440002, + "learning_rate": 1.0034084767707164e-05, + "loss": 0.5046952962875366, + "step": 882 + }, + { + "epoch": 2.4153005464480874, + "grad_norm": 0.08370860666036606, + "learning_rate": 9.861781576449879e-06, + "loss": 0.8278293609619141, + "step": 884 + }, + { + "epoch": 2.420765027322404, + "grad_norm": 0.09103255718946457, + "learning_rate": 9.689901349030646e-06, + "loss": 0.9132435917854309, + "step": 886 + }, + { + "epoch": 2.4262295081967213, + "grad_norm": 0.17922845482826233, + "learning_rate": 9.518507956424643e-06, + "loss": 0.6458944082260132, + "step": 888 + }, + { + "epoch": 2.431693989071038, + "grad_norm": 0.07886765152215958, + "learning_rate": 9.347665088698444e-06, + "loss": 0.5420209765434265, + "step": 890 + }, + { + "epoch": 2.4371584699453552, + "grad_norm": 0.12056568264961243, + "learning_rate": 9.177436231342623e-06, + "loss": 0.8368343710899353, + "step": 892 + }, + { + "epoch": 2.442622950819672, + "grad_norm": 0.1841779351234436, + "learning_rate": 9.00788464168054e-06, + "loss": 0.5313267111778259, + "step": 894 + }, + { + "epoch": 2.448087431693989, + "grad_norm": 0.09516401588916779, + "learning_rate": 8.839073325361751e-06, + "loss": 0.7824960947036743, + "step": 896 + }, + { + "epoch": 2.453551912568306, + "grad_norm": 0.11680221557617188, + "learning_rate": 8.67106501294902e-06, + "loss": 0.8697406053543091, + "step": 898 + }, + { + "epoch": 2.459016393442623, + "grad_norm": 0.29455330967903137, + "learning_rate": 8.503922136607536e-06, + "loss": 0.9425011277198792, + "step": 900 + }, + { + "epoch": 2.4644808743169397, + "grad_norm": 0.0904950276017189, + "learning_rate": 8.337706806905029e-06, + "loss": 0.8952743411064148, + "step": 902 + }, + { + "epoch": 2.469945355191257, + "grad_norm": 0.09989660233259201, + "learning_rate": 8.172480789731374e-06, + "loss": 0.7953454256057739, + "step": 904 + }, + { + "epoch": 2.4754098360655736, + "grad_norm": 0.20192794501781464, + "learning_rate": 8.00830548334625e-06, + "loss": 0.8293022513389587, + "step": 906 + }, + { + "epoch": 2.480874316939891, + "grad_norm": 0.04934593662619591, + "learning_rate": 7.84524189556352e-06, + "loss": 0.40420135855674744, + "step": 908 + }, + { + "epoch": 2.4863387978142075, + "grad_norm": 0.0565824918448925, + "learning_rate": 7.68335062108057e-06, + "loss": 0.8528041243553162, + "step": 910 + }, + { + "epoch": 2.4918032786885247, + "grad_norm": 0.13608448207378387, + "learning_rate": 7.522691818961252e-06, + "loss": 0.7207663059234619, + "step": 912 + }, + { + "epoch": 2.4972677595628414, + "grad_norm": 0.13163967430591583, + "learning_rate": 7.3633251902806165e-06, + "loss": 0.942747950553894, + "step": 914 + }, + { + "epoch": 2.5027322404371586, + "grad_norm": 0.09808902442455292, + "learning_rate": 7.205309955939983e-06, + "loss": 0.7634099125862122, + "step": 916 + }, + { + "epoch": 2.5081967213114753, + "grad_norm": 0.09887581318616867, + "learning_rate": 7.048704834660296e-06, + "loss": 0.9049314260482788, + "step": 918 + }, + { + "epoch": 2.5136612021857925, + "grad_norm": 0.0817629024386406, + "learning_rate": 6.8935680211621715e-06, + "loss": 0.8354104161262512, + "step": 920 + }, + { + "epoch": 2.519125683060109, + "grad_norm": 0.09578151255846024, + "learning_rate": 6.739957164540634e-06, + "loss": 0.8901323080062866, + "step": 922 + }, + { + "epoch": 2.5245901639344264, + "grad_norm": 0.2716790735721588, + "learning_rate": 6.587929346842625e-06, + "loss": 0.542881429195404, + "step": 924 + }, + { + "epoch": 2.530054644808743, + "grad_norm": 0.07322418689727783, + "learning_rate": 6.437541061855222e-06, + "loss": 0.48108160495758057, + "step": 926 + }, + { + "epoch": 2.5355191256830603, + "grad_norm": 0.18133673071861267, + "learning_rate": 6.288848194112459e-06, + "loss": 0.605663537979126, + "step": 928 + }, + { + "epoch": 2.540983606557377, + "grad_norm": 0.23896083235740662, + "learning_rate": 6.141905998128495e-06, + "loss": 0.9464154243469238, + "step": 930 + }, + { + "epoch": 2.546448087431694, + "grad_norm": 0.16439692676067352, + "learning_rate": 5.996769077865029e-06, + "loss": 0.616335391998291, + "step": 932 + }, + { + "epoch": 2.551912568306011, + "grad_norm": 0.1357869654893875, + "learning_rate": 5.853491366440313e-06, + "loss": 0.899514377117157, + "step": 934 + }, + { + "epoch": 2.557377049180328, + "grad_norm": 0.6998143196105957, + "learning_rate": 5.712126106087557e-06, + "loss": 0.7158923149108887, + "step": 936 + }, + { + "epoch": 2.5628415300546448, + "grad_norm": 0.22336842119693756, + "learning_rate": 5.572725828369961e-06, + "loss": 0.5384737849235535, + "step": 938 + }, + { + "epoch": 2.5683060109289615, + "grad_norm": 0.13469916582107544, + "learning_rate": 5.4353423346599944e-06, + "loss": 0.9431443214416504, + "step": 940 + }, + { + "epoch": 2.5737704918032787, + "grad_norm": 0.13996966183185577, + "learning_rate": 5.30002667688986e-06, + "loss": 0.5406258702278137, + "step": 942 + }, + { + "epoch": 2.579234972677596, + "grad_norm": 0.17135582864284515, + "learning_rate": 5.1668291385804995e-06, + "loss": 0.9542285203933716, + "step": 944 + }, + { + "epoch": 2.5846994535519126, + "grad_norm": 0.07237273454666138, + "learning_rate": 5.03579921615621e-06, + "loss": 0.2618236243724823, + "step": 946 + }, + { + "epoch": 2.5901639344262293, + "grad_norm": 0.09927342087030411, + "learning_rate": 4.906985600551651e-06, + "loss": 0.8278122544288635, + "step": 948 + }, + { + "epoch": 2.5956284153005464, + "grad_norm": 0.12567152082920074, + "learning_rate": 4.780436159118221e-06, + "loss": 0.6845787167549133, + "step": 950 + }, + { + "epoch": 2.6010928961748636, + "grad_norm": 0.1773616075515747, + "learning_rate": 4.656197917836474e-06, + "loss": 0.856772243976593, + "step": 952 + }, + { + "epoch": 2.6065573770491803, + "grad_norm": 0.1038050726056099, + "learning_rate": 4.5343170438411885e-06, + "loss": 0.5110206604003906, + "step": 954 + }, + { + "epoch": 2.612021857923497, + "grad_norm": 0.10283994674682617, + "learning_rate": 4.414838828265581e-06, + "loss": 0.33818623423576355, + "step": 956 + }, + { + "epoch": 2.6174863387978142, + "grad_norm": 0.09708721190690994, + "learning_rate": 4.297807669411057e-06, + "loss": 0.5298760533332825, + "step": 958 + }, + { + "epoch": 2.6229508196721314, + "grad_norm": 0.08416345715522766, + "learning_rate": 4.183267056248689e-06, + "loss": 0.7988206744194031, + "step": 960 + }, + { + "epoch": 2.628415300546448, + "grad_norm": 0.4326683580875397, + "learning_rate": 4.071259552258709e-06, + "loss": 0.6375377178192139, + "step": 962 + }, + { + "epoch": 2.633879781420765, + "grad_norm": 0.0943351686000824, + "learning_rate": 3.961826779613801e-06, + "loss": 0.9425459504127502, + "step": 964 + }, + { + "epoch": 2.639344262295082, + "grad_norm": 0.08805088698863983, + "learning_rate": 3.85500940371226e-06, + "loss": 0.8150487542152405, + "step": 966 + }, + { + "epoch": 2.644808743169399, + "grad_norm": 0.07647102326154709, + "learning_rate": 3.750847118066614e-06, + "loss": 0.7014102935791016, + "step": 968 + }, + { + "epoch": 2.650273224043716, + "grad_norm": 0.17416299879550934, + "learning_rate": 3.6493786295535234e-06, + "loss": 0.7047572135925293, + "step": 970 + }, + { + "epoch": 2.6557377049180326, + "grad_norm": 0.1062597781419754, + "learning_rate": 3.5506416440301885e-06, + "loss": 1.0425673723220825, + "step": 972 + }, + { + "epoch": 2.66120218579235, + "grad_norm": 0.1533363163471222, + "learning_rate": 3.4546728523228067e-06, + "loss": 0.5896174311637878, + "step": 974 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.14529746770858765, + "learning_rate": 3.361507916592206e-06, + "loss": 0.8626460433006287, + "step": 976 + }, + { + "epoch": 2.6721311475409837, + "grad_norm": 0.07237977534532547, + "learning_rate": 3.271181457081715e-06, + "loss": 0.9003605246543884, + "step": 978 + }, + { + "epoch": 2.6775956284153004, + "grad_norm": 0.08640500158071518, + "learning_rate": 3.1837270392522456e-06, + "loss": 0.8033831715583801, + "step": 980 + }, + { + "epoch": 2.6830601092896176, + "grad_norm": 0.1235438659787178, + "learning_rate": 3.0991771613092686e-06, + "loss": 1.0247108936309814, + "step": 982 + }, + { + "epoch": 2.6885245901639343, + "grad_norm": 0.09095709025859833, + "learning_rate": 3.017563242126483e-06, + "loss": 0.5988020896911621, + "step": 984 + }, + { + "epoch": 2.6939890710382515, + "grad_norm": 0.1297115683555603, + "learning_rate": 2.9389156095704764e-06, + "loss": 1.0478209257125854, + "step": 986 + }, + { + "epoch": 2.699453551912568, + "grad_norm": 0.12489040940999985, + "learning_rate": 2.8632634892308535e-06, + "loss": 0.8566729426383972, + "step": 988 + }, + { + "epoch": 2.7049180327868854, + "grad_norm": 0.08687499165534973, + "learning_rate": 2.7906349935599326e-06, + "loss": 0.7906680703163147, + "step": 990 + }, + { + "epoch": 2.710382513661202, + "grad_norm": 0.26650258898735046, + "learning_rate": 2.721057111426154e-06, + "loss": 0.49348220229148865, + "step": 992 + }, + { + "epoch": 2.7158469945355193, + "grad_norm": 0.1376582682132721, + "learning_rate": 2.6545556980849417e-06, + "loss": 0.9247549176216125, + "step": 994 + }, + { + "epoch": 2.721311475409836, + "grad_norm": 0.08857110887765884, + "learning_rate": 2.591155465570866e-06, + "loss": 0.8927129507064819, + "step": 996 + }, + { + "epoch": 2.726775956284153, + "grad_norm": 0.07012514770030975, + "learning_rate": 2.5308799735145813e-06, + "loss": 0.8281713724136353, + "step": 998 + }, + { + "epoch": 2.73224043715847, + "grad_norm": 0.36359021067619324, + "learning_rate": 2.473751620388069e-06, + "loss": 1.0165328979492188, + "step": 1000 + }, + { + "epoch": 2.737704918032787, + "grad_norm": 0.1266205608844757, + "learning_rate": 2.419791635181301e-06, + "loss": 0.8732631206512451, + "step": 1002 + }, + { + "epoch": 2.7431693989071038, + "grad_norm": 0.0775340348482132, + "learning_rate": 2.369020069513521e-06, + "loss": 0.8283519744873047, + "step": 1004 + }, + { + "epoch": 2.748633879781421, + "grad_norm": 0.26210060715675354, + "learning_rate": 2.3214557901820258e-06, + "loss": 0.3722708523273468, + "step": 1006 + }, + { + "epoch": 2.7540983606557377, + "grad_norm": 0.1599731147289276, + "learning_rate": 2.27711647215124e-06, + "loss": 1.0166616439819336, + "step": 1008 + }, + { + "epoch": 2.7595628415300544, + "grad_norm": 0.18406230211257935, + "learning_rate": 2.2360185919846593e-06, + "loss": 1.0183711051940918, + "step": 1010 + }, + { + "epoch": 2.7650273224043715, + "grad_norm": 0.24661096930503845, + "learning_rate": 2.1981774217221474e-06, + "loss": 0.6995247602462769, + "step": 1012 + }, + { + "epoch": 2.7704918032786887, + "grad_norm": 0.10282541066408157, + "learning_rate": 2.1636070232047966e-06, + "loss": 0.9614273309707642, + "step": 1014 + }, + { + "epoch": 2.7759562841530054, + "grad_norm": 0.15983141958713531, + "learning_rate": 2.1323202428495544e-06, + "loss": 0.7532384991645813, + "step": 1016 + }, + { + "epoch": 2.781420765027322, + "grad_norm": 0.09075122326612473, + "learning_rate": 2.104328706875452e-06, + "loss": 0.9738923907279968, + "step": 1018 + }, + { + "epoch": 2.7868852459016393, + "grad_norm": 0.21000061929225922, + "learning_rate": 2.079642816983293e-06, + "loss": 0.6343734860420227, + "step": 1020 + }, + { + "epoch": 2.7923497267759565, + "grad_norm": 0.12519368529319763, + "learning_rate": 2.0582717464903546e-06, + "loss": 0.9371435046195984, + "step": 1022 + }, + { + "epoch": 2.797814207650273, + "grad_norm": 0.08356457203626633, + "learning_rate": 2.040223436921581e-06, + "loss": 0.455655038356781, + "step": 1024 + }, + { + "epoch": 2.80327868852459, + "grad_norm": 0.1446380466222763, + "learning_rate": 2.025504595058489e-06, + "loss": 0.9525520205497742, + "step": 1026 + }, + { + "epoch": 2.808743169398907, + "grad_norm": 0.30293965339660645, + "learning_rate": 2.0141206904469206e-06, + "loss": 0.6948819160461426, + "step": 1028 + }, + { + "epoch": 2.8142076502732243, + "grad_norm": 0.07843527942895889, + "learning_rate": 2.006075953364551e-06, + "loss": 0.7919158935546875, + "step": 1030 + }, + { + "epoch": 2.819672131147541, + "grad_norm": 0.10344526171684265, + "learning_rate": 2.0013733732489103e-06, + "loss": 0.789183497428894, + "step": 1032 + }, + { + "epoch": 2.8251366120218577, + "grad_norm": 0.1789560317993164, + "learning_rate": 2.000014697586502e-06, + "loss": 0.7047494053840637, + "step": 1034 + }, + { + "epoch": 2.830601092896175, + "grad_norm": 0.11639666557312012, + "learning_rate": 2.0020004312634374e-06, + "loss": 1.0080572366714478, + "step": 1036 + }, + { + "epoch": 2.836065573770492, + "grad_norm": 0.08672972768545151, + "learning_rate": 2.0073298363778166e-06, + "loss": 0.8355114459991455, + "step": 1038 + }, + { + "epoch": 2.841530054644809, + "grad_norm": 0.11700020730495453, + "learning_rate": 2.016000932513934e-06, + "loss": 0.9271637797355652, + "step": 1040 + }, + { + "epoch": 2.8469945355191255, + "grad_norm": 0.07531187683343887, + "learning_rate": 2.0280104974782058e-06, + "loss": 0.9033800363540649, + "step": 1042 + }, + { + "epoch": 2.8524590163934427, + "grad_norm": 0.1744087040424347, + "learning_rate": 2.043354068496541e-06, + "loss": 0.7458989024162292, + "step": 1044 + }, + { + "epoch": 2.8579234972677594, + "grad_norm": 0.09162133932113647, + "learning_rate": 2.0620259438727168e-06, + "loss": 0.9592632055282593, + "step": 1046 + }, + { + "epoch": 2.8633879781420766, + "grad_norm": 0.030708037316799164, + "learning_rate": 2.084019185107135e-06, + "loss": 0.46672046184539795, + "step": 1048 + }, + { + "epoch": 2.8688524590163933, + "grad_norm": 0.12400135397911072, + "learning_rate": 2.1093256194751822e-06, + "loss": 0.8463804721832275, + "step": 1050 + }, + { + "epoch": 2.8743169398907105, + "grad_norm": 0.18339043855667114, + "learning_rate": 2.137935843064233e-06, + "loss": 0.8194495439529419, + "step": 1052 + }, + { + "epoch": 2.879781420765027, + "grad_norm": 0.11495642364025116, + "learning_rate": 2.1698392242681502e-06, + "loss": 0.9895132184028625, + "step": 1054 + }, + { + "epoch": 2.8852459016393444, + "grad_norm": 0.8155518770217896, + "learning_rate": 2.2050239077380097e-06, + "loss": 0.7439733743667603, + "step": 1056 + }, + { + "epoch": 2.890710382513661, + "grad_norm": 0.3202117383480072, + "learning_rate": 2.2434768187875723e-06, + "loss": 0.6009439826011658, + "step": 1058 + }, + { + "epoch": 2.8961748633879782, + "grad_norm": 0.10869040340185165, + "learning_rate": 2.285183668251853e-06, + "loss": 0.8892866969108582, + "step": 1060 + }, + { + "epoch": 2.901639344262295, + "grad_norm": 0.12712760269641876, + "learning_rate": 2.3301289577970028e-06, + "loss": 0.8923851847648621, + "step": 1062 + }, + { + "epoch": 2.907103825136612, + "grad_norm": 0.08466093987226486, + "learning_rate": 2.3782959856795113e-06, + "loss": 0.9176651239395142, + "step": 1064 + }, + { + "epoch": 2.912568306010929, + "grad_norm": 0.08564656227827072, + "learning_rate": 2.4296668529525998e-06, + "loss": 0.5260547995567322, + "step": 1066 + }, + { + "epoch": 2.918032786885246, + "grad_norm": 0.5003576278686523, + "learning_rate": 2.4842224701175147e-06, + "loss": 0.6537207961082458, + "step": 1068 + }, + { + "epoch": 2.9234972677595628, + "grad_norm": 0.1762370467185974, + "learning_rate": 2.541942564217196e-06, + "loss": 0.7581856846809387, + "step": 1070 + }, + { + "epoch": 2.92896174863388, + "grad_norm": 0.09134093672037125, + "learning_rate": 2.6028056863697506e-06, + "loss": 0.9717255234718323, + "step": 1072 + }, + { + "epoch": 2.9344262295081966, + "grad_norm": 0.10428964346647263, + "learning_rate": 2.6667892197388884e-06, + "loss": 0.5648120045661926, + "step": 1074 + }, + { + "epoch": 2.939890710382514, + "grad_norm": 0.1324595808982849, + "learning_rate": 2.7338693879383967e-06, + "loss": 0.6206448674201965, + "step": 1076 + }, + { + "epoch": 2.9453551912568305, + "grad_norm": 0.17810961604118347, + "learning_rate": 2.8040212638674506e-06, + "loss": 0.5412209033966064, + "step": 1078 + }, + { + "epoch": 2.9508196721311473, + "grad_norm": 0.15168237686157227, + "learning_rate": 2.877218778973578e-06, + "loss": 0.979107677936554, + "step": 1080 + }, + { + "epoch": 2.9562841530054644, + "grad_norm": 0.3206607401371002, + "learning_rate": 2.9534347329398027e-06, + "loss": 0.6219339370727539, + "step": 1082 + }, + { + "epoch": 2.9617486338797816, + "grad_norm": 0.10671282559633255, + "learning_rate": 3.0326408037922827e-06, + "loss": 0.9135304093360901, + "step": 1084 + }, + { + "epoch": 2.9672131147540983, + "grad_norm": 0.0563555508852005, + "learning_rate": 3.1148075584248306e-06, + "loss": 0.5136356353759766, + "step": 1086 + }, + { + "epoch": 2.972677595628415, + "grad_norm": 0.10856077820062637, + "learning_rate": 3.199904463536296e-06, + "loss": 1.0512590408325195, + "step": 1088 + }, + { + "epoch": 2.978142076502732, + "grad_norm": 0.08381670713424683, + "learning_rate": 3.2878998969767954e-06, + "loss": 0.7392516732215881, + "step": 1090 + }, + { + "epoch": 2.9836065573770494, + "grad_norm": 0.08783172816038132, + "learning_rate": 3.378761159498547e-06, + "loss": 0.744635820388794, + "step": 1092 + }, + { + "epoch": 2.989071038251366, + "grad_norm": 1.6826421022415161, + "learning_rate": 3.472454486906972e-06, + "loss": 0.8086671233177185, + "step": 1094 + }, + { + "epoch": 2.994535519125683, + "grad_norm": 0.23645569384098053, + "learning_rate": 3.5689450626075132e-06, + "loss": 0.5458236336708069, + "step": 1096 + }, + { + "epoch": 3.0, + "grad_norm": 0.11009235680103302, + "learning_rate": 3.668197030543573e-06, + "loss": 0.8465088605880737, + "step": 1098 + }, + { + "epoch": 3.0, + "step": 1098, + "total_flos": 4.842473815785079e+18, + "train_loss": 0.9778182457162166, + "train_runtime": 38444.4554, + "train_samples_per_second": 1.714, + "train_steps_per_second": 0.029 + } + ], + "logging_steps": 2, + "max_steps": 1098, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 99999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.842473815785079e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}