diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3886 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1098, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00546448087431694, + "grad_norm": 0.7586923837661743, + "learning_rate": 3.6363636363636366e-07, + "loss": 2.660745620727539, + "step": 2 + }, + { + "epoch": 0.01092896174863388, + "grad_norm": 1.0751519203186035, + "learning_rate": 1.090909090909091e-06, + "loss": 1.938146948814392, + "step": 4 + }, + { + "epoch": 0.01639344262295082, + "grad_norm": 0.8923681974411011, + "learning_rate": 1.8181818181818183e-06, + "loss": 1.9057133197784424, + "step": 6 + }, + { + "epoch": 0.02185792349726776, + "grad_norm": 0.7297345399856567, + "learning_rate": 2.5454545454545456e-06, + "loss": 1.7945566177368164, + "step": 8 + }, + { + "epoch": 0.0273224043715847, + "grad_norm": 0.818263828754425, + "learning_rate": 3.272727272727273e-06, + "loss": 1.770993709564209, + "step": 10 + }, + { + "epoch": 0.03278688524590164, + "grad_norm": 0.4792977273464203, + "learning_rate": 4.000000000000001e-06, + "loss": 1.62543523311615, + "step": 12 + }, + { + "epoch": 0.03825136612021858, + "grad_norm": 0.4317499101161957, + "learning_rate": 4.727272727272728e-06, + "loss": 1.519499659538269, + "step": 14 + }, + { + "epoch": 0.04371584699453552, + "grad_norm": 0.3740278482437134, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.2777570486068726, + "step": 16 + }, + { + "epoch": 0.04918032786885246, + "grad_norm": 0.652406632900238, + "learning_rate": 6.181818181818182e-06, + "loss": 1.1676603555679321, + "step": 18 + }, + { + "epoch": 0.0546448087431694, + "grad_norm": 0.3864721953868866, + "learning_rate": 6.90909090909091e-06, + "loss": 1.4224069118499756, + "step": 20 + }, + { + "epoch": 0.060109289617486336, + "grad_norm": 0.541534960269928, + "learning_rate": 7.636363636363638e-06, + "loss": 1.3205312490463257, + "step": 22 + }, + { + "epoch": 0.06557377049180328, + "grad_norm": 0.44199395179748535, + "learning_rate": 8.363636363636365e-06, + "loss": 1.4163978099822998, + "step": 24 + }, + { + "epoch": 0.07103825136612021, + "grad_norm": 0.8410191535949707, + "learning_rate": 9.090909090909091e-06, + "loss": 1.598981261253357, + "step": 26 + }, + { + "epoch": 0.07650273224043716, + "grad_norm": 1.757845163345337, + "learning_rate": 9.81818181818182e-06, + "loss": 1.375367283821106, + "step": 28 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 0.46648648381233215, + "learning_rate": 1.0545454545454546e-05, + "loss": 1.2981315851211548, + "step": 30 + }, + { + "epoch": 0.08743169398907104, + "grad_norm": 0.24247297644615173, + "learning_rate": 1.1272727272727272e-05, + "loss": 1.2934328317642212, + "step": 32 + }, + { + "epoch": 0.09289617486338798, + "grad_norm": 0.7804842591285706, + "learning_rate": 1.2e-05, + "loss": 1.0114089250564575, + "step": 34 + }, + { + "epoch": 0.09836065573770492, + "grad_norm": 0.24668505787849426, + "learning_rate": 1.2727272727272728e-05, + "loss": 1.1152716875076294, + "step": 36 + }, + { + "epoch": 0.10382513661202186, + "grad_norm": 0.3483196198940277, + "learning_rate": 1.3454545454545455e-05, + "loss": 1.3653188943862915, + "step": 38 + }, + { + "epoch": 0.1092896174863388, + "grad_norm": 0.5274494886398315, + "learning_rate": 1.4181818181818183e-05, + "loss": 1.316364049911499, + "step": 40 + }, + { + "epoch": 0.11475409836065574, + "grad_norm": 1.7519822120666504, + "learning_rate": 1.4909090909090911e-05, + "loss": 1.0375714302062988, + "step": 42 + }, + { + "epoch": 0.12021857923497267, + "grad_norm": 0.2554769814014435, + "learning_rate": 1.563636363636364e-05, + "loss": 0.9050353169441223, + "step": 44 + }, + { + "epoch": 0.12568306010928962, + "grad_norm": 0.2430635243654251, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.3280794620513916, + "step": 46 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 0.3220200836658478, + "learning_rate": 1.7090909090909092e-05, + "loss": 1.3129568099975586, + "step": 48 + }, + { + "epoch": 0.1366120218579235, + "grad_norm": 0.3611713647842407, + "learning_rate": 1.781818181818182e-05, + "loss": 1.2813783884048462, + "step": 50 + }, + { + "epoch": 0.14207650273224043, + "grad_norm": 0.3371615707874298, + "learning_rate": 1.8545454545454545e-05, + "loss": 1.2453889846801758, + "step": 52 + }, + { + "epoch": 0.14754098360655737, + "grad_norm": 3.5102477073669434, + "learning_rate": 1.9272727272727275e-05, + "loss": 0.9431768655776978, + "step": 54 + }, + { + "epoch": 0.15300546448087432, + "grad_norm": 0.3596407175064087, + "learning_rate": 2e-05, + "loss": 1.4184821844100952, + "step": 56 + }, + { + "epoch": 0.15846994535519127, + "grad_norm": 0.6473796367645264, + "learning_rate": 1.9998327792599505e-05, + "loss": 1.4325121641159058, + "step": 58 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 0.4482963979244232, + "learning_rate": 1.999331179179304e-05, + "loss": 1.3262360095977783, + "step": 60 + }, + { + "epoch": 0.16939890710382513, + "grad_norm": 0.45529159903526306, + "learning_rate": 1.9984953861534752e-05, + "loss": 1.505706787109375, + "step": 62 + }, + { + "epoch": 0.17486338797814208, + "grad_norm": 1.139926791191101, + "learning_rate": 1.997325710764527e-05, + "loss": 1.1403143405914307, + "step": 64 + }, + { + "epoch": 0.18032786885245902, + "grad_norm": 0.5250591039657593, + "learning_rate": 1.9958225876657575e-05, + "loss": 1.30501127243042, + "step": 66 + }, + { + "epoch": 0.18579234972677597, + "grad_norm": 1.2981220483779907, + "learning_rate": 1.9939865754201825e-05, + "loss": 1.2804065942764282, + "step": 68 + }, + { + "epoch": 0.1912568306010929, + "grad_norm": 0.49463051557540894, + "learning_rate": 1.9918183562929717e-05, + "loss": 1.309598445892334, + "step": 70 + }, + { + "epoch": 0.19672131147540983, + "grad_norm": 0.5240225195884705, + "learning_rate": 1.9893187359979183e-05, + "loss": 1.0792406797409058, + "step": 72 + }, + { + "epoch": 0.20218579234972678, + "grad_norm": 0.3060704469680786, + "learning_rate": 1.986488643398035e-05, + "loss": 1.3067054748535156, + "step": 74 + }, + { + "epoch": 0.20765027322404372, + "grad_norm": 0.6043591499328613, + "learning_rate": 1.9833291301603863e-05, + "loss": 1.307369589805603, + "step": 76 + }, + { + "epoch": 0.21311475409836064, + "grad_norm": 1.3378225564956665, + "learning_rate": 1.9798413703652867e-05, + "loss": 0.5469598174095154, + "step": 78 + }, + { + "epoch": 0.2185792349726776, + "grad_norm": 0.36606472730636597, + "learning_rate": 1.976026660070012e-05, + "loss": 0.5283974409103394, + "step": 80 + }, + { + "epoch": 0.22404371584699453, + "grad_norm": 0.4779462218284607, + "learning_rate": 1.9718864168271823e-05, + "loss": 1.3443689346313477, + "step": 82 + }, + { + "epoch": 0.22950819672131148, + "grad_norm": 0.3752780854701996, + "learning_rate": 1.9674221791579946e-05, + "loss": 0.729979395866394, + "step": 84 + }, + { + "epoch": 0.23497267759562843, + "grad_norm": 0.23232732713222504, + "learning_rate": 1.9626356059805085e-05, + "loss": 1.2514656782150269, + "step": 86 + }, + { + "epoch": 0.24043715846994534, + "grad_norm": 0.37319672107696533, + "learning_rate": 1.957528475993189e-05, + "loss": 1.2355018854141235, + "step": 88 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 1.1993027925491333, + "learning_rate": 1.952102687013938e-05, + "loss": 0.86878901720047, + "step": 90 + }, + { + "epoch": 0.25136612021857924, + "grad_norm": 0.5708634257316589, + "learning_rate": 1.946360255274863e-05, + "loss": 1.2946072816848755, + "step": 92 + }, + { + "epoch": 0.2568306010928962, + "grad_norm": 0.2725592851638794, + "learning_rate": 1.9403033146730424e-05, + "loss": 1.1185024976730347, + "step": 94 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 0.6162624955177307, + "learning_rate": 1.9339341159775647e-05, + "loss": 1.5296945571899414, + "step": 96 + }, + { + "epoch": 0.2677595628415301, + "grad_norm": 0.30999237298965454, + "learning_rate": 1.9272550259931398e-05, + "loss": 0.9915840029716492, + "step": 98 + }, + { + "epoch": 0.273224043715847, + "grad_norm": 0.27694171667099, + "learning_rate": 1.9202685266805896e-05, + "loss": 1.0551968812942505, + "step": 100 + }, + { + "epoch": 0.2786885245901639, + "grad_norm": 0.34108948707580566, + "learning_rate": 1.9129772142345484e-05, + "loss": 1.253743290901184, + "step": 102 + }, + { + "epoch": 0.28415300546448086, + "grad_norm": 0.3947954773902893, + "learning_rate": 1.9053837981187125e-05, + "loss": 0.9518170952796936, + "step": 104 + }, + { + "epoch": 0.2896174863387978, + "grad_norm": 0.3549712896347046, + "learning_rate": 1.897491100058998e-05, + "loss": 1.2458157539367676, + "step": 106 + }, + { + "epoch": 0.29508196721311475, + "grad_norm": 0.30054518580436707, + "learning_rate": 1.8893020529949838e-05, + "loss": 1.386643648147583, + "step": 108 + }, + { + "epoch": 0.3005464480874317, + "grad_norm": 1.3760162591934204, + "learning_rate": 1.880819699990027e-05, + "loss": 1.2258188724517822, + "step": 110 + }, + { + "epoch": 0.30601092896174864, + "grad_norm": 0.2765187621116638, + "learning_rate": 1.8720471931004526e-05, + "loss": 1.2596882581710815, + "step": 112 + }, + { + "epoch": 0.3114754098360656, + "grad_norm": 0.6589577794075012, + "learning_rate": 1.8629877922042485e-05, + "loss": 1.3348774909973145, + "step": 114 + }, + { + "epoch": 0.31693989071038253, + "grad_norm": 0.3075348436832428, + "learning_rate": 1.8536448637896866e-05, + "loss": 1.2452787160873413, + "step": 116 + }, + { + "epoch": 0.3224043715846995, + "grad_norm": 0.3071703016757965, + "learning_rate": 1.84402187970433e-05, + "loss": 1.0434746742248535, + "step": 118 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 2.645376682281494, + "learning_rate": 1.834122415864891e-05, + "loss": 1.2307820320129395, + "step": 120 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.5483799576759338, + "learning_rate": 1.8239501509284123e-05, + "loss": 1.3374857902526855, + "step": 122 + }, + { + "epoch": 0.33879781420765026, + "grad_norm": 0.3543969988822937, + "learning_rate": 1.8135088649252725e-05, + "loss": 1.030819296836853, + "step": 124 + }, + { + "epoch": 0.3442622950819672, + "grad_norm": 0.5614028573036194, + "learning_rate": 1.8028024378545224e-05, + "loss": 1.0396068096160889, + "step": 126 + }, + { + "epoch": 0.34972677595628415, + "grad_norm": 0.4731024503707886, + "learning_rate": 1.7918348482420692e-05, + "loss": 1.3047938346862793, + "step": 128 + }, + { + "epoch": 0.3551912568306011, + "grad_norm": 0.42884743213653564, + "learning_rate": 1.7806101716622486e-05, + "loss": 1.2398240566253662, + "step": 130 + }, + { + "epoch": 0.36065573770491804, + "grad_norm": 0.32270482182502747, + "learning_rate": 1.7691325792233378e-05, + "loss": 1.1840364933013916, + "step": 132 + }, + { + "epoch": 0.366120218579235, + "grad_norm": 0.8914838433265686, + "learning_rate": 1.7574063360175625e-05, + "loss": 1.0900051593780518, + "step": 134 + }, + { + "epoch": 0.37158469945355194, + "grad_norm": 0.4666799306869507, + "learning_rate": 1.745435799536183e-05, + "loss": 0.976508378982544, + "step": 136 + }, + { + "epoch": 0.3770491803278688, + "grad_norm": 0.3514363467693329, + "learning_rate": 1.7332254180502407e-05, + "loss": 0.9537405967712402, + "step": 138 + }, + { + "epoch": 0.3825136612021858, + "grad_norm": 0.22689107060432434, + "learning_rate": 1.7207797289575777e-05, + "loss": 0.6181514859199524, + "step": 140 + }, + { + "epoch": 0.3879781420765027, + "grad_norm": 0.337587833404541, + "learning_rate": 1.708103357096728e-05, + "loss": 1.2152214050292969, + "step": 142 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 0.34758704900741577, + "learning_rate": 1.695201013028322e-05, + "loss": 1.2426707744598389, + "step": 144 + }, + { + "epoch": 0.3989071038251366, + "grad_norm": 0.2902460992336273, + "learning_rate": 1.6820774912846335e-05, + "loss": 1.2040138244628906, + "step": 146 + }, + { + "epoch": 0.40437158469945356, + "grad_norm": 0.3400241434574127, + "learning_rate": 1.668737668587926e-05, + "loss": 1.1996750831604004, + "step": 148 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 0.35784855484962463, + "learning_rate": 1.655186502038251e-05, + "loss": 1.1286766529083252, + "step": 150 + }, + { + "epoch": 0.41530054644808745, + "grad_norm": 0.2693675458431244, + "learning_rate": 1.641429027271384e-05, + "loss": 1.2409323453903198, + "step": 152 + }, + { + "epoch": 0.4207650273224044, + "grad_norm": 0.483333557844162, + "learning_rate": 1.6274703565875736e-05, + "loss": 1.2143176794052124, + "step": 154 + }, + { + "epoch": 0.4262295081967213, + "grad_norm": 0.4158734679222107, + "learning_rate": 1.613315677051801e-05, + "loss": 1.2641037702560425, + "step": 156 + }, + { + "epoch": 0.43169398907103823, + "grad_norm": 0.6228234171867371, + "learning_rate": 1.598970248566261e-05, + "loss": 0.9883160591125488, + "step": 158 + }, + { + "epoch": 0.4371584699453552, + "grad_norm": 0.3108808398246765, + "learning_rate": 1.5844394019157697e-05, + "loss": 1.1966590881347656, + "step": 160 + }, + { + "epoch": 0.4426229508196721, + "grad_norm": 0.4892992377281189, + "learning_rate": 1.5697285367868393e-05, + "loss": 0.6886841058731079, + "step": 162 + }, + { + "epoch": 0.44808743169398907, + "grad_norm": 0.3893924653530121, + "learning_rate": 1.5548431197611448e-05, + "loss": 1.5789108276367188, + "step": 164 + }, + { + "epoch": 0.453551912568306, + "grad_norm": 0.6205107569694519, + "learning_rate": 1.539788682284133e-05, + "loss": 0.724977970123291, + "step": 166 + }, + { + "epoch": 0.45901639344262296, + "grad_norm": 0.2539767324924469, + "learning_rate": 1.5245708186095275e-05, + "loss": 1.1216144561767578, + "step": 168 + }, + { + "epoch": 0.4644808743169399, + "grad_norm": 0.31149035692214966, + "learning_rate": 1.5091951837204973e-05, + "loss": 1.2175238132476807, + "step": 170 + }, + { + "epoch": 0.46994535519125685, + "grad_norm": 0.5983729958534241, + "learning_rate": 1.4936674912282525e-05, + "loss": 1.292568325996399, + "step": 172 + }, + { + "epoch": 0.47540983606557374, + "grad_norm": 0.41234031319618225, + "learning_rate": 1.4779935112488597e-05, + "loss": 1.404640793800354, + "step": 174 + }, + { + "epoch": 0.4808743169398907, + "grad_norm": 0.25759682059288025, + "learning_rate": 1.4621790682590556e-05, + "loss": 1.200560450553894, + "step": 176 + }, + { + "epoch": 0.48633879781420764, + "grad_norm": 0.8127933144569397, + "learning_rate": 1.4462300389318635e-05, + "loss": 1.161008358001709, + "step": 178 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 0.2617407739162445, + "learning_rate": 1.4301523499528099e-05, + "loss": 1.167149543762207, + "step": 180 + }, + { + "epoch": 0.4972677595628415, + "grad_norm": 0.38304898142814636, + "learning_rate": 1.4139519758175602e-05, + "loss": 1.154693841934204, + "step": 182 + }, + { + "epoch": 0.5027322404371585, + "grad_norm": 1.8618555068969727, + "learning_rate": 1.3976349366117861e-05, + "loss": 1.2942745685577393, + "step": 184 + }, + { + "epoch": 0.5081967213114754, + "grad_norm": 0.7506605386734009, + "learning_rate": 1.3812072957740898e-05, + "loss": 1.165328860282898, + "step": 186 + }, + { + "epoch": 0.5136612021857924, + "grad_norm": 0.716810405254364, + "learning_rate": 1.3646751578428231e-05, + "loss": 1.1603736877441406, + "step": 188 + }, + { + "epoch": 0.5191256830601093, + "grad_norm": 0.2964042127132416, + "learning_rate": 1.3480446661876295e-05, + "loss": 1.2188032865524292, + "step": 190 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 0.5146150588989258, + "learning_rate": 1.3313220007265572e-05, + "loss": 1.1854382753372192, + "step": 192 + }, + { + "epoch": 0.5300546448087432, + "grad_norm": 0.3248935341835022, + "learning_rate": 1.3145133756295936e-05, + "loss": 1.1913241147994995, + "step": 194 + }, + { + "epoch": 0.5355191256830601, + "grad_norm": 0.5341064929962158, + "learning_rate": 1.2976250370094668e-05, + "loss": 1.2624198198318481, + "step": 196 + }, + { + "epoch": 0.5409836065573771, + "grad_norm": 0.31245383620262146, + "learning_rate": 1.2806632606005822e-05, + "loss": 1.0798206329345703, + "step": 198 + }, + { + "epoch": 0.546448087431694, + "grad_norm": 0.46515926718711853, + "learning_rate": 1.2636343494269479e-05, + "loss": 1.2240967750549316, + "step": 200 + }, + { + "epoch": 0.5519125683060109, + "grad_norm": 0.4435195326805115, + "learning_rate": 1.2465446314599609e-05, + "loss": 1.0411522388458252, + "step": 202 + }, + { + "epoch": 0.5573770491803278, + "grad_norm": 0.45708101987838745, + "learning_rate": 1.2294004572669228e-05, + "loss": 0.869902491569519, + "step": 204 + }, + { + "epoch": 0.5628415300546448, + "grad_norm": 0.31885483860969543, + "learning_rate": 1.2122081976511581e-05, + "loss": 1.2103023529052734, + "step": 206 + }, + { + "epoch": 0.5683060109289617, + "grad_norm": 0.6482557058334351, + "learning_rate": 1.1949742412846142e-05, + "loss": 1.1392232179641724, + "step": 208 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 0.3855375349521637, + "learning_rate": 1.177704992333818e-05, + "loss": 1.1707613468170166, + "step": 210 + }, + { + "epoch": 0.5792349726775956, + "grad_norm": 0.326550155878067, + "learning_rate": 1.1604068680800809e-05, + "loss": 1.4717496633529663, + "step": 212 + }, + { + "epoch": 0.5846994535519126, + "grad_norm": 0.5021712183952332, + "learning_rate": 1.1430862965348224e-05, + "loss": 1.184004306793213, + "step": 214 + }, + { + "epoch": 0.5901639344262295, + "grad_norm": 0.9780371189117432, + "learning_rate": 1.1257497140509141e-05, + "loss": 1.53951895236969, + "step": 216 + }, + { + "epoch": 0.5956284153005464, + "grad_norm": 0.3760947585105896, + "learning_rate": 1.1084035629309176e-05, + "loss": 1.152180790901184, + "step": 218 + }, + { + "epoch": 0.6010928961748634, + "grad_norm": 0.3090932071208954, + "learning_rate": 1.0910542890331162e-05, + "loss": 1.22081458568573, + "step": 220 + }, + { + "epoch": 0.6065573770491803, + "grad_norm": 0.7166253328323364, + "learning_rate": 1.0737083393762213e-05, + "loss": 1.041406273841858, + "step": 222 + }, + { + "epoch": 0.6120218579234973, + "grad_norm": 1.7460894584655762, + "learning_rate": 1.0563721597436525e-05, + "loss": 0.9747711420059204, + "step": 224 + }, + { + "epoch": 0.6174863387978142, + "grad_norm": 0.7671729326248169, + "learning_rate": 1.039052192288271e-05, + "loss": 1.1867763996124268, + "step": 226 + }, + { + "epoch": 0.6229508196721312, + "grad_norm": 1.2053930759429932, + "learning_rate": 1.0217548731384677e-05, + "loss": 1.1788854598999023, + "step": 228 + }, + { + "epoch": 0.6284153005464481, + "grad_norm": 0.2746277153491974, + "learning_rate": 1.0044866300064842e-05, + "loss": 0.6711171865463257, + "step": 230 + }, + { + "epoch": 0.6338797814207651, + "grad_norm": 0.643056333065033, + "learning_rate": 9.872538797998672e-06, + "loss": 1.1947264671325684, + "step": 232 + }, + { + "epoch": 0.639344262295082, + "grad_norm": 10.281498908996582, + "learning_rate": 9.700630262369337e-06, + "loss": 0.8791699409484863, + "step": 234 + }, + { + "epoch": 0.644808743169399, + "grad_norm": 0.8242344260215759, + "learning_rate": 9.529204574671391e-06, + "loss": 0.8508569598197937, + "step": 236 + }, + { + "epoch": 0.6502732240437158, + "grad_norm": 0.825342059135437, + "learning_rate": 9.3583254369723e-06, + "loss": 1.25392746925354, + "step": 238 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 0.4026271104812622, + "learning_rate": 9.188056348240655e-06, + "loss": 1.1808273792266846, + "step": 240 + }, + { + "epoch": 0.6612021857923497, + "grad_norm": 0.5428042411804199, + "learning_rate": 9.018460580749842e-06, + "loss": 1.1250752210617065, + "step": 242 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.5019088387489319, + "learning_rate": 8.849601156565972e-06, + "loss": 0.9948179125785828, + "step": 244 + }, + { + "epoch": 0.6721311475409836, + "grad_norm": 0.32495370507240295, + "learning_rate": 8.68154082412877e-06, + "loss": 1.1509572267532349, + "step": 246 + }, + { + "epoch": 0.6775956284153005, + "grad_norm": 0.47498419880867004, + "learning_rate": 8.514342034934159e-06, + "loss": 1.2174043655395508, + "step": 248 + }, + { + "epoch": 0.6830601092896175, + "grad_norm": 0.3702382445335388, + "learning_rate": 8.348066920327163e-06, + "loss": 1.1846040487289429, + "step": 250 + }, + { + "epoch": 0.6885245901639344, + "grad_norm": 0.4739413857460022, + "learning_rate": 8.182777268413822e-06, + "loss": 1.0815660953521729, + "step": 252 + }, + { + "epoch": 0.6939890710382514, + "grad_norm": 0.3407984673976898, + "learning_rate": 8.018534501100611e-06, + "loss": 1.136920690536499, + "step": 254 + }, + { + "epoch": 0.6994535519125683, + "grad_norm": 0.3524422347545624, + "learning_rate": 7.855399651269982e-06, + "loss": 1.1331547498703003, + "step": 256 + }, + { + "epoch": 0.7049180327868853, + "grad_norm": 0.28807151317596436, + "learning_rate": 7.6934333401004e-06, + "loss": 1.0285248756408691, + "step": 258 + }, + { + "epoch": 0.7103825136612022, + "grad_norm": 0.3343144655227661, + "learning_rate": 7.53269575453947e-06, + "loss": 1.1325820684432983, + "step": 260 + }, + { + "epoch": 0.7158469945355191, + "grad_norm": 0.38119539618492126, + "learning_rate": 7.373246624938324e-06, + "loss": 1.2263553142547607, + "step": 262 + }, + { + "epoch": 0.7213114754098361, + "grad_norm": 0.312750905752182, + "learning_rate": 7.215145202855746e-06, + "loss": 1.1782656908035278, + "step": 264 + }, + { + "epoch": 0.726775956284153, + "grad_norm": 0.37758493423461914, + "learning_rate": 7.0584502390401865e-06, + "loss": 1.0094555616378784, + "step": 266 + }, + { + "epoch": 0.73224043715847, + "grad_norm": 0.5180346369743347, + "learning_rate": 6.903219961597891e-06, + "loss": 1.1325385570526123, + "step": 268 + }, + { + "epoch": 0.7377049180327869, + "grad_norm": 0.38575479388237, + "learning_rate": 6.7495120543552475e-06, + "loss": 1.1679610013961792, + "step": 270 + }, + { + "epoch": 0.7431693989071039, + "grad_norm": 0.3461146056652069, + "learning_rate": 6.59738363542336e-06, + "loss": 1.1273691654205322, + "step": 272 + }, + { + "epoch": 0.7486338797814208, + "grad_norm": 0.4262787401676178, + "learning_rate": 6.446891235972894e-06, + "loss": 1.1511529684066772, + "step": 274 + }, + { + "epoch": 0.7540983606557377, + "grad_norm": 0.2948058843612671, + "learning_rate": 6.298090779226977e-06, + "loss": 1.2013486623764038, + "step": 276 + }, + { + "epoch": 0.7595628415300546, + "grad_norm": 0.3147113621234894, + "learning_rate": 6.151037559680047e-06, + "loss": 1.1714566946029663, + "step": 278 + }, + { + "epoch": 0.7650273224043715, + "grad_norm": 0.4043079912662506, + "learning_rate": 6.005786222550319e-06, + "loss": 0.8109129071235657, + "step": 280 + }, + { + "epoch": 0.7704918032786885, + "grad_norm": 0.31658855080604553, + "learning_rate": 5.8623907434735515e-06, + "loss": 1.1680246591567993, + "step": 282 + }, + { + "epoch": 0.7759562841530054, + "grad_norm": 0.4184737801551819, + "learning_rate": 5.720904408445589e-06, + "loss": 1.159695029258728, + "step": 284 + }, + { + "epoch": 0.7814207650273224, + "grad_norm": 0.69264817237854, + "learning_rate": 5.581379794021202e-06, + "loss": 1.173396348953247, + "step": 286 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 0.4731365144252777, + "learning_rate": 5.443868747776579e-06, + "loss": 1.141501545906067, + "step": 288 + }, + { + "epoch": 0.7923497267759563, + "grad_norm": 0.39572659134864807, + "learning_rate": 5.308422369042644e-06, + "loss": 1.077953577041626, + "step": 290 + }, + { + "epoch": 0.7978142076502732, + "grad_norm": 0.4793470501899719, + "learning_rate": 5.175090989916483e-06, + "loss": 1.1118550300598145, + "step": 292 + }, + { + "epoch": 0.8032786885245902, + "grad_norm": 0.2662019729614258, + "learning_rate": 5.043924156557844e-06, + "loss": 1.1436463594436646, + "step": 294 + }, + { + "epoch": 0.8087431693989071, + "grad_norm": 0.4349708557128906, + "learning_rate": 4.914970610777725e-06, + "loss": 1.4781270027160645, + "step": 296 + }, + { + "epoch": 0.8142076502732241, + "grad_norm": 0.49073347449302673, + "learning_rate": 4.788278271925802e-06, + "loss": 0.8274983763694763, + "step": 298 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 0.3969985842704773, + "learning_rate": 4.663894219083548e-06, + "loss": 1.5442594289779663, + "step": 300 + }, + { + "epoch": 0.825136612021858, + "grad_norm": 0.4534980356693268, + "learning_rate": 4.541864673569551e-06, + "loss": 1.1690313816070557, + "step": 302 + }, + { + "epoch": 0.8306010928961749, + "grad_norm": 0.554429292678833, + "learning_rate": 4.422234981763613e-06, + "loss": 0.7355527281761169, + "step": 304 + }, + { + "epoch": 0.8360655737704918, + "grad_norm": 1.9707756042480469, + "learning_rate": 4.305049598255946e-06, + "loss": 1.5113667249679565, + "step": 306 + }, + { + "epoch": 0.8415300546448088, + "grad_norm": 0.36261430382728577, + "learning_rate": 4.190352069327777e-06, + "loss": 0.8062604069709778, + "step": 308 + }, + { + "epoch": 0.8469945355191257, + "grad_norm": 0.3810434639453888, + "learning_rate": 4.078185016769484e-06, + "loss": 1.1614996194839478, + "step": 310 + }, + { + "epoch": 0.8524590163934426, + "grad_norm": 0.39326491951942444, + "learning_rate": 3.968590122042265e-06, + "loss": 1.177278757095337, + "step": 312 + }, + { + "epoch": 0.8579234972677595, + "grad_norm": 0.40779945254325867, + "learning_rate": 3.861608110789228e-06, + "loss": 1.1831945180892944, + "step": 314 + }, + { + "epoch": 0.8633879781420765, + "grad_norm": 0.9780954718589783, + "learning_rate": 3.757278737701697e-06, + "loss": 1.1853556632995605, + "step": 316 + }, + { + "epoch": 0.8688524590163934, + "grad_norm": 1.053917407989502, + "learning_rate": 3.6556407717462856e-06, + "loss": 1.0666594505310059, + "step": 318 + }, + { + "epoch": 0.8743169398907104, + "grad_norm": 0.37668082118034363, + "learning_rate": 3.5567319817582944e-06, + "loss": 1.272523045539856, + "step": 320 + }, + { + "epoch": 0.8797814207650273, + "grad_norm": 0.5615814328193665, + "learning_rate": 3.4605891224067423e-06, + "loss": 1.1502110958099365, + "step": 322 + }, + { + "epoch": 0.8852459016393442, + "grad_norm": 0.3317868709564209, + "learning_rate": 3.3672479205362764e-06, + "loss": 0.5082761645317078, + "step": 324 + }, + { + "epoch": 0.8907103825136612, + "grad_norm": 0.3119502365589142, + "learning_rate": 3.276743061891014e-06, + "loss": 1.153531551361084, + "step": 326 + }, + { + "epoch": 0.8961748633879781, + "grad_norm": 0.6352930068969727, + "learning_rate": 3.1891081782252726e-06, + "loss": 1.0034343004226685, + "step": 328 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 0.3119983673095703, + "learning_rate": 3.1043758348059384e-06, + "loss": 1.181221842765808, + "step": 330 + }, + { + "epoch": 0.907103825136612, + "grad_norm": 0.9960837960243225, + "learning_rate": 3.0225775183111784e-06, + "loss": 1.2305197715759277, + "step": 332 + }, + { + "epoch": 0.912568306010929, + "grad_norm": 0.45826148986816406, + "learning_rate": 2.943743625129917e-06, + "loss": 1.013006567955017, + "step": 334 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 0.4111308455467224, + "learning_rate": 2.867903450066513e-06, + "loss": 1.1280704736709595, + "step": 336 + }, + { + "epoch": 0.9234972677595629, + "grad_norm": 0.4843418300151825, + "learning_rate": 2.795085175454741e-06, + "loss": 1.1432486772537231, + "step": 338 + }, + { + "epoch": 0.9289617486338798, + "grad_norm": 0.2703630030155182, + "learning_rate": 2.7253158606851983e-06, + "loss": 1.1488100290298462, + "step": 340 + }, + { + "epoch": 0.9344262295081968, + "grad_norm": 0.32970309257507324, + "learning_rate": 2.6586214321499952e-06, + "loss": 1.187166690826416, + "step": 342 + }, + { + "epoch": 0.9398907103825137, + "grad_norm": 0.714718222618103, + "learning_rate": 2.5950266736084558e-06, + "loss": 1.1312419176101685, + "step": 344 + }, + { + "epoch": 0.9453551912568307, + "grad_norm": 0.5256864428520203, + "learning_rate": 2.5345552169774413e-06, + "loss": 1.5058574676513672, + "step": 346 + }, + { + "epoch": 0.9508196721311475, + "grad_norm": 0.6660139560699463, + "learning_rate": 2.477229533549685e-06, + "loss": 0.7294368743896484, + "step": 348 + }, + { + "epoch": 0.9562841530054644, + "grad_norm": 0.3410135805606842, + "learning_rate": 2.423070925643422e-06, + "loss": 0.7677943110466003, + "step": 350 + }, + { + "epoch": 0.9617486338797814, + "grad_norm": 0.3177313506603241, + "learning_rate": 2.372099518686416e-06, + "loss": 1.1427574157714844, + "step": 352 + }, + { + "epoch": 0.9672131147540983, + "grad_norm": 0.3382633626461029, + "learning_rate": 2.324334253737321e-06, + "loss": 0.8091766238212585, + "step": 354 + }, + { + "epoch": 0.9726775956284153, + "grad_norm": 0.31415683031082153, + "learning_rate": 2.2797928804471413e-06, + "loss": 1.0229932069778442, + "step": 356 + }, + { + "epoch": 0.9781420765027322, + "grad_norm": 0.33465299010276794, + "learning_rate": 2.2384919504634465e-06, + "loss": 0.7048438787460327, + "step": 358 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 0.7479493618011475, + "learning_rate": 2.2004468112797345e-06, + "loss": 1.145750880241394, + "step": 360 + }, + { + "epoch": 0.9890710382513661, + "grad_norm": 0.6303642988204956, + "learning_rate": 2.165671600532298e-06, + "loss": 0.9127320051193237, + "step": 362 + }, + { + "epoch": 0.994535519125683, + "grad_norm": 0.678534746170044, + "learning_rate": 2.134179240746638e-06, + "loss": 1.1293567419052124, + "step": 364 + }, + { + "epoch": 1.0, + "grad_norm": 0.48102641105651855, + "learning_rate": 2.1059814345354434e-06, + "loss": 1.1180713176727295, + "step": 366 + }, + { + "epoch": 1.005464480874317, + "grad_norm": 0.26332399249076843, + "learning_rate": 2.0810886602498733e-06, + "loss": 0.9773157238960266, + "step": 368 + }, + { + "epoch": 1.010928961748634, + "grad_norm": 0.48079046607017517, + "learning_rate": 2.059510168085791e-06, + "loss": 1.077552318572998, + "step": 370 + }, + { + "epoch": 1.0163934426229508, + "grad_norm": 0.3228934407234192, + "learning_rate": 2.0412539766463697e-06, + "loss": 1.0045071840286255, + "step": 372 + }, + { + "epoch": 1.0218579234972678, + "grad_norm": 0.3166205883026123, + "learning_rate": 2.0263268699623746e-06, + "loss": 1.0588802099227905, + "step": 374 + }, + { + "epoch": 1.0273224043715847, + "grad_norm": 1.7906802892684937, + "learning_rate": 2.0147343949711965e-06, + "loss": 0.8532091975212097, + "step": 376 + }, + { + "epoch": 1.0327868852459017, + "grad_norm": 0.40486088395118713, + "learning_rate": 2.0064808594556066e-06, + "loss": 0.959248960018158, + "step": 378 + }, + { + "epoch": 1.0382513661202186, + "grad_norm": 0.5614176392555237, + "learning_rate": 2.0015693304429757e-06, + "loss": 0.6482217311859131, + "step": 380 + }, + { + "epoch": 1.0437158469945356, + "grad_norm": 0.9394697546958923, + "learning_rate": 2.000001633065562e-06, + "loss": 0.9249519109725952, + "step": 382 + }, + { + "epoch": 1.0491803278688525, + "grad_norm": 0.38094520568847656, + "learning_rate": 2.0017783498822896e-06, + "loss": 0.9705590605735779, + "step": 384 + }, + { + "epoch": 1.0546448087431695, + "grad_norm": 1.61484956741333, + "learning_rate": 2.006898820662268e-06, + "loss": 0.623333215713501, + "step": 386 + }, + { + "epoch": 1.0601092896174864, + "grad_norm": 0.5918154120445251, + "learning_rate": 2.0153611426301325e-06, + "loss": 0.5663283467292786, + "step": 388 + }, + { + "epoch": 1.0655737704918034, + "grad_norm": 0.5102803707122803, + "learning_rate": 2.027162171173126e-06, + "loss": 1.11468505859375, + "step": 390 + }, + { + "epoch": 1.0710382513661203, + "grad_norm": 0.4305904507637024, + "learning_rate": 2.0422975210096317e-06, + "loss": 0.9271813631057739, + "step": 392 + }, + { + "epoch": 1.0765027322404372, + "grad_norm": 0.36596590280532837, + "learning_rate": 2.0607615678187605e-06, + "loss": 0.6126598119735718, + "step": 394 + }, + { + "epoch": 1.0819672131147542, + "grad_norm": 0.41866230964660645, + "learning_rate": 2.082547450330353e-06, + "loss": 0.971298336982727, + "step": 396 + }, + { + "epoch": 1.0874316939890711, + "grad_norm": 0.4388103187084198, + "learning_rate": 2.1076470728746407e-06, + "loss": 0.9021328091621399, + "step": 398 + }, + { + "epoch": 1.092896174863388, + "grad_norm": 3.2539632320404053, + "learning_rate": 2.136051108390608e-06, + "loss": 1.0814286470413208, + "step": 400 + }, + { + "epoch": 1.098360655737705, + "grad_norm": 0.8472021222114563, + "learning_rate": 2.167749001891944e-06, + "loss": 0.9241260290145874, + "step": 402 + }, + { + "epoch": 1.1038251366120218, + "grad_norm": 0.5540516376495361, + "learning_rate": 2.202728974389296e-06, + "loss": 1.0775995254516602, + "step": 404 + }, + { + "epoch": 1.1092896174863387, + "grad_norm": 1.049932599067688, + "learning_rate": 2.240978027267357e-06, + "loss": 1.1052746772766113, + "step": 406 + }, + { + "epoch": 1.1147540983606556, + "grad_norm": 0.5960119962692261, + "learning_rate": 2.2824819471151736e-06, + "loss": 0.610589861869812, + "step": 408 + }, + { + "epoch": 1.1202185792349726, + "grad_norm": 0.3115493953227997, + "learning_rate": 2.327225311007878e-06, + "loss": 1.1588163375854492, + "step": 410 + }, + { + "epoch": 1.1256830601092895, + "grad_norm": 1.5775190591812134, + "learning_rate": 2.3751914922378623e-06, + "loss": 0.662068784236908, + "step": 412 + }, + { + "epoch": 1.1311475409836065, + "grad_norm": 0.49976834654808044, + "learning_rate": 2.4263626664932998e-06, + "loss": 0.7565730214118958, + "step": 414 + }, + { + "epoch": 1.1366120218579234, + "grad_norm": 1.2116132974624634, + "learning_rate": 2.4807198184816817e-06, + "loss": 0.6296537518501282, + "step": 416 + }, + { + "epoch": 1.1420765027322404, + "grad_norm": 1.1844316720962524, + "learning_rate": 2.5382427489959373e-06, + "loss": 0.9881916046142578, + "step": 418 + }, + { + "epoch": 1.1475409836065573, + "grad_norm": 0.5167036652565002, + "learning_rate": 2.5989100824204876e-06, + "loss": 0.9343990683555603, + "step": 420 + }, + { + "epoch": 1.1530054644808743, + "grad_norm": 0.558880627155304, + "learning_rate": 2.662699274674462e-06, + "loss": 1.0110293626785278, + "step": 422 + }, + { + "epoch": 1.1584699453551912, + "grad_norm": 1.1780766248703003, + "learning_rate": 2.7295866215891107e-06, + "loss": 0.7126035690307617, + "step": 424 + }, + { + "epoch": 1.1639344262295082, + "grad_norm": 0.3370395004749298, + "learning_rate": 2.799547267716326e-06, + "loss": 0.6112310886383057, + "step": 426 + }, + { + "epoch": 1.169398907103825, + "grad_norm": 0.8769522905349731, + "learning_rate": 2.872555215564946e-06, + "loss": 1.1242718696594238, + "step": 428 + }, + { + "epoch": 1.174863387978142, + "grad_norm": 0.6202550530433655, + "learning_rate": 2.9485833352614895e-06, + "loss": 0.9957835078239441, + "step": 430 + }, + { + "epoch": 1.180327868852459, + "grad_norm": 0.48545965552330017, + "learning_rate": 3.027603374631647e-06, + "loss": 1.0449634790420532, + "step": 432 + }, + { + "epoch": 1.185792349726776, + "grad_norm": 0.49180424213409424, + "learning_rate": 3.1095859696988273e-06, + "loss": 0.7476254105567932, + "step": 434 + }, + { + "epoch": 1.1912568306010929, + "grad_norm": 0.4692288041114807, + "learning_rate": 3.1945006555958885e-06, + "loss": 1.060101866722107, + "step": 436 + }, + { + "epoch": 1.1967213114754098, + "grad_norm": 0.26823532581329346, + "learning_rate": 3.2823158778858976e-06, + "loss": 0.9638835191726685, + "step": 438 + }, + { + "epoch": 1.2021857923497268, + "grad_norm": 0.3634532392024994, + "learning_rate": 3.372999004287839e-06, + "loss": 0.8774067163467407, + "step": 440 + }, + { + "epoch": 1.2076502732240437, + "grad_norm": 0.2828170359134674, + "learning_rate": 3.4665163368028044e-06, + "loss": 1.0114994049072266, + "step": 442 + }, + { + "epoch": 1.2131147540983607, + "grad_norm": 0.4509800970554352, + "learning_rate": 3.562833124236238e-06, + "loss": 0.9758923053741455, + "step": 444 + }, + { + "epoch": 1.2185792349726776, + "grad_norm": 0.9190130829811096, + "learning_rate": 3.6619135751115325e-06, + "loss": 0.5354660153388977, + "step": 446 + }, + { + "epoch": 1.2240437158469946, + "grad_norm": 0.39201292395591736, + "learning_rate": 3.763720870970201e-06, + "loss": 0.9081273674964905, + "step": 448 + }, + { + "epoch": 1.2295081967213115, + "grad_norm": 0.6562516689300537, + "learning_rate": 3.86821718005367e-06, + "loss": 0.9579398036003113, + "step": 450 + }, + { + "epoch": 1.2349726775956285, + "grad_norm": 0.5844786763191223, + "learning_rate": 3.975363671361641e-06, + "loss": 0.9740945100784302, + "step": 452 + }, + { + "epoch": 1.2404371584699454, + "grad_norm": 0.6142482757568359, + "learning_rate": 4.0851205290817254e-06, + "loss": 0.34633180499076843, + "step": 454 + }, + { + "epoch": 1.2459016393442623, + "grad_norm": 0.7541310787200928, + "learning_rate": 4.197446967385105e-06, + "loss": 0.7628012895584106, + "step": 456 + }, + { + "epoch": 1.2513661202185793, + "grad_norm": 0.44816914200782776, + "learning_rate": 4.312301245582571e-06, + "loss": 0.7231332063674927, + "step": 458 + }, + { + "epoch": 1.2568306010928962, + "grad_norm": 0.9218317866325378, + "learning_rate": 4.429640683635466e-06, + "loss": 1.0563876628875732, + "step": 460 + }, + { + "epoch": 1.2622950819672132, + "grad_norm": 2.9262020587921143, + "learning_rate": 4.549421678015633e-06, + "loss": 0.6581028699874878, + "step": 462 + }, + { + "epoch": 1.2677595628415301, + "grad_norm": 0.4514838457107544, + "learning_rate": 4.671599717908582e-06, + "loss": 0.985448956489563, + "step": 464 + }, + { + "epoch": 1.273224043715847, + "grad_norm": 0.3441561758518219, + "learning_rate": 4.796129401753752e-06, + "loss": 1.0194132328033447, + "step": 466 + }, + { + "epoch": 1.278688524590164, + "grad_norm": 0.3873656690120697, + "learning_rate": 4.922964454115837e-06, + "loss": 0.9317984580993652, + "step": 468 + }, + { + "epoch": 1.2841530054644807, + "grad_norm": 0.5289583802223206, + "learning_rate": 5.0520577428807835e-06, + "loss": 0.9312957525253296, + "step": 470 + }, + { + "epoch": 1.289617486338798, + "grad_norm": 0.3908224403858185, + "learning_rate": 5.183361296770197e-06, + "loss": 1.2282073497772217, + "step": 472 + }, + { + "epoch": 1.2950819672131146, + "grad_norm": 0.4844124913215637, + "learning_rate": 5.316826323167505e-06, + "loss": 0.9655548334121704, + "step": 474 + }, + { + "epoch": 1.3005464480874318, + "grad_norm": 0.6156196594238281, + "learning_rate": 5.4524032262494175e-06, + "loss": 0.5244005918502808, + "step": 476 + }, + { + "epoch": 1.3060109289617485, + "grad_norm": 0.43006354570388794, + "learning_rate": 5.590041625415783e-06, + "loss": 1.0403926372528076, + "step": 478 + }, + { + "epoch": 1.3114754098360657, + "grad_norm": 0.2751167118549347, + "learning_rate": 5.7296903740111076e-06, + "loss": 1.0301437377929688, + "step": 480 + }, + { + "epoch": 1.3169398907103824, + "grad_norm": 0.8657461404800415, + "learning_rate": 5.87129757833077e-06, + "loss": 0.923163652420044, + "step": 482 + }, + { + "epoch": 1.3224043715846996, + "grad_norm": 0.6536735892295837, + "learning_rate": 6.014810616904747e-06, + "loss": 0.6813654899597168, + "step": 484 + }, + { + "epoch": 1.3278688524590163, + "grad_norm": 0.3719988167285919, + "learning_rate": 6.160176160051906e-06, + "loss": 0.958286702632904, + "step": 486 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.5692084431648254, + "learning_rate": 6.307340189697344e-06, + "loss": 1.0355274677276611, + "step": 488 + }, + { + "epoch": 1.3387978142076502, + "grad_norm": 0.5361304879188538, + "learning_rate": 6.456248019445626e-06, + "loss": 0.5718910694122314, + "step": 490 + }, + { + "epoch": 1.3442622950819672, + "grad_norm": 0.46025416254997253, + "learning_rate": 6.606844314902321e-06, + "loss": 1.0728013515472412, + "step": 492 + }, + { + "epoch": 1.349726775956284, + "grad_norm": 0.14679169654846191, + "learning_rate": 6.7590731142363915e-06, + "loss": 0.7085850238800049, + "step": 494 + }, + { + "epoch": 1.355191256830601, + "grad_norm": 0.4374118149280548, + "learning_rate": 6.912877848975638e-06, + "loss": 1.0255423784255981, + "step": 496 + }, + { + "epoch": 1.360655737704918, + "grad_norm": 1.178649663925171, + "learning_rate": 7.068201365027712e-06, + "loss": 0.8840912580490112, + "step": 498 + }, + { + "epoch": 1.366120218579235, + "grad_norm": 0.41059738397598267, + "learning_rate": 7.2249859439185875e-06, + "loss": 1.1463178396224976, + "step": 500 + }, + { + "epoch": 1.3715846994535519, + "grad_norm": 0.38757556676864624, + "learning_rate": 7.3831733242409285e-06, + "loss": 0.8971600532531738, + "step": 502 + }, + { + "epoch": 1.3770491803278688, + "grad_norm": 0.3446296155452728, + "learning_rate": 7.5427047233040485e-06, + "loss": 0.48697370290756226, + "step": 504 + }, + { + "epoch": 1.3825136612021858, + "grad_norm": 0.24354782700538635, + "learning_rate": 7.703520858977702e-06, + "loss": 0.7695150375366211, + "step": 506 + }, + { + "epoch": 1.3879781420765027, + "grad_norm": 0.39158889651298523, + "learning_rate": 7.865561971721389e-06, + "loss": 0.9563601016998291, + "step": 508 + }, + { + "epoch": 1.3934426229508197, + "grad_norm": 0.47387513518333435, + "learning_rate": 8.02876784679115e-06, + "loss": 0.97979336977005, + "step": 510 + }, + { + "epoch": 1.3989071038251366, + "grad_norm": 0.5868260860443115, + "learning_rate": 8.193077836615386e-06, + "loss": 0.9143297672271729, + "step": 512 + }, + { + "epoch": 1.4043715846994536, + "grad_norm": 0.3844831883907318, + "learning_rate": 8.35843088333168e-06, + "loss": 0.6998130679130554, + "step": 514 + }, + { + "epoch": 1.4098360655737705, + "grad_norm": 0.3410232663154602, + "learning_rate": 8.524765541475935e-06, + "loss": 0.7496625185012817, + "step": 516 + }, + { + "epoch": 1.4153005464480874, + "grad_norm": 0.3015901744365692, + "learning_rate": 8.692020000815627e-06, + "loss": 0.9690025448799133, + "step": 518 + }, + { + "epoch": 1.4207650273224044, + "grad_norm": 1.459832787513733, + "learning_rate": 8.860132109318622e-06, + "loss": 0.7310088872909546, + "step": 520 + }, + { + "epoch": 1.4262295081967213, + "grad_norm": 0.32720673084259033, + "learning_rate": 9.029039396248916e-06, + "loss": 1.0313293933868408, + "step": 522 + }, + { + "epoch": 1.4316939890710383, + "grad_norm": 0.7867904901504517, + "learning_rate": 9.198679095380924e-06, + "loss": 1.0015819072723389, + "step": 524 + }, + { + "epoch": 1.4371584699453552, + "grad_norm": 0.7222017049789429, + "learning_rate": 9.368988168323451e-06, + "loss": 1.0047054290771484, + "step": 526 + }, + { + "epoch": 1.4426229508196722, + "grad_norm": 0.5384648442268372, + "learning_rate": 9.539903327944926e-06, + "loss": 0.6519490480422974, + "step": 528 + }, + { + "epoch": 1.4480874316939891, + "grad_norm": 0.9112948179244995, + "learning_rate": 9.711361061890942e-06, + "loss": 0.7796599864959717, + "step": 530 + }, + { + "epoch": 1.453551912568306, + "grad_norm": 1.056015133857727, + "learning_rate": 9.8832976561856e-06, + "loss": 0.7928510308265686, + "step": 532 + }, + { + "epoch": 1.459016393442623, + "grad_norm": 0.4741556644439697, + "learning_rate": 1.0055649218907688e-05, + "loss": 1.0018107891082764, + "step": 534 + }, + { + "epoch": 1.46448087431694, + "grad_norm": 0.5680137872695923, + "learning_rate": 1.0228351703933075e-05, + "loss": 0.9849597811698914, + "step": 536 + }, + { + "epoch": 1.469945355191257, + "grad_norm": 0.31153297424316406, + "learning_rate": 1.0401340934734287e-05, + "loss": 1.0444822311401367, + "step": 538 + }, + { + "epoch": 1.4754098360655736, + "grad_norm": 0.26671940088272095, + "learning_rate": 1.0574552628228691e-05, + "loss": 1.0459693670272827, + "step": 540 + }, + { + "epoch": 1.4808743169398908, + "grad_norm": 0.48387154936790466, + "learning_rate": 1.0747922418666115e-05, + "loss": 1.0217854976654053, + "step": 542 + }, + { + "epoch": 1.4863387978142075, + "grad_norm": 0.41534528136253357, + "learning_rate": 1.0921385881547311e-05, + "loss": 0.8353682160377502, + "step": 544 + }, + { + "epoch": 1.4918032786885247, + "grad_norm": 1.1913517713546753, + "learning_rate": 1.1094878557564217e-05, + "loss": 1.092334270477295, + "step": 546 + }, + { + "epoch": 1.4972677595628414, + "grad_norm": 0.35618963837623596, + "learning_rate": 1.1268335976553098e-05, + "loss": 0.4362333118915558, + "step": 548 + }, + { + "epoch": 1.5027322404371586, + "grad_norm": 0.45524540543556213, + "learning_rate": 1.144169368145179e-05, + "loss": 1.027480125427246, + "step": 550 + }, + { + "epoch": 1.5081967213114753, + "grad_norm": 0.34574341773986816, + "learning_rate": 1.1614887252252076e-05, + "loss": 0.9967020153999329, + "step": 552 + }, + { + "epoch": 1.5136612021857925, + "grad_norm": 0.33023980259895325, + "learning_rate": 1.1787852329938198e-05, + "loss": 1.0062674283981323, + "step": 554 + }, + { + "epoch": 1.5191256830601092, + "grad_norm": 0.41989317536354065, + "learning_rate": 1.1960524640402862e-05, + "loss": 1.0924804210662842, + "step": 556 + }, + { + "epoch": 1.5245901639344264, + "grad_norm": 0.4085758924484253, + "learning_rate": 1.2132840018331514e-05, + "loss": 0.802219033241272, + "step": 558 + }, + { + "epoch": 1.530054644808743, + "grad_norm": 0.5502913594245911, + "learning_rate": 1.2304734431046335e-05, + "loss": 0.5939708948135376, + "step": 560 + }, + { + "epoch": 1.5355191256830603, + "grad_norm": 0.8035076856613159, + "learning_rate": 1.2476144002300864e-05, + "loss": 0.915844738483429, + "step": 562 + }, + { + "epoch": 1.540983606557377, + "grad_norm": 0.35893988609313965, + "learning_rate": 1.264700503601655e-05, + "loss": 0.8058934807777405, + "step": 564 + }, + { + "epoch": 1.5464480874316942, + "grad_norm": 0.40697717666625977, + "learning_rate": 1.2817254039952253e-05, + "loss": 1.0083532333374023, + "step": 566 + }, + { + "epoch": 1.5519125683060109, + "grad_norm": 0.9251991510391235, + "learning_rate": 1.2986827749298138e-05, + "loss": 0.8175945281982422, + "step": 568 + }, + { + "epoch": 1.5573770491803278, + "grad_norm": 0.4043232500553131, + "learning_rate": 1.3155663150184942e-05, + "loss": 1.135609745979309, + "step": 570 + }, + { + "epoch": 1.5628415300546448, + "grad_norm": 0.41902995109558105, + "learning_rate": 1.3323697503100035e-05, + "loss": 0.9482666850090027, + "step": 572 + }, + { + "epoch": 1.5683060109289617, + "grad_norm": 0.30340319871902466, + "learning_rate": 1.3490868366201527e-05, + "loss": 1.0169366598129272, + "step": 574 + }, + { + "epoch": 1.5737704918032787, + "grad_norm": 0.36948615312576294, + "learning_rate": 1.3657113618521763e-05, + "loss": 0.7928745746612549, + "step": 576 + }, + { + "epoch": 1.5792349726775956, + "grad_norm": 0.4047572910785675, + "learning_rate": 1.3822371483051593e-05, + "loss": 1.0838329792022705, + "step": 578 + }, + { + "epoch": 1.5846994535519126, + "grad_norm": 0.16013656556606293, + "learning_rate": 1.3986580549696777e-05, + "loss": 0.5277568697929382, + "step": 580 + }, + { + "epoch": 1.5901639344262295, + "grad_norm": 0.30199313163757324, + "learning_rate": 1.4149679798098097e-05, + "loss": 0.7084992527961731, + "step": 582 + }, + { + "epoch": 1.5956284153005464, + "grad_norm": 0.751322865486145, + "learning_rate": 1.4311608620306626e-05, + "loss": 0.6402939558029175, + "step": 584 + }, + { + "epoch": 1.6010928961748634, + "grad_norm": 0.4171648621559143, + "learning_rate": 1.447230684330573e-05, + "loss": 0.9727059602737427, + "step": 586 + }, + { + "epoch": 1.6065573770491803, + "grad_norm": 0.4916969835758209, + "learning_rate": 1.4631714751371456e-05, + "loss": 0.9658234119415283, + "step": 588 + }, + { + "epoch": 1.6120218579234973, + "grad_norm": 0.7164318561553955, + "learning_rate": 1.4789773108263016e-05, + "loss": 0.9527289867401123, + "step": 590 + }, + { + "epoch": 1.6174863387978142, + "grad_norm": 0.4969192445278168, + "learning_rate": 1.4946423179235068e-05, + "loss": 1.0558418035507202, + "step": 592 + }, + { + "epoch": 1.6229508196721312, + "grad_norm": 3.1464731693267822, + "learning_rate": 1.5101606752863606e-05, + "loss": 0.8807100057601929, + "step": 594 + }, + { + "epoch": 1.6284153005464481, + "grad_norm": 0.38706547021865845, + "learning_rate": 1.5255266162677466e-05, + "loss": 0.46191394329071045, + "step": 596 + }, + { + "epoch": 1.633879781420765, + "grad_norm": 0.5958669185638428, + "learning_rate": 1.540734430858725e-05, + "loss": 1.1970001459121704, + "step": 598 + }, + { + "epoch": 1.639344262295082, + "grad_norm": 0.4041631519794464, + "learning_rate": 1.5557784678103852e-05, + "loss": 1.1198264360427856, + "step": 600 + }, + { + "epoch": 1.644808743169399, + "grad_norm": 0.27836379408836365, + "learning_rate": 1.5706531367338546e-05, + "loss": 0.9951107501983643, + "step": 602 + }, + { + "epoch": 1.650273224043716, + "grad_norm": 0.3826007544994354, + "learning_rate": 1.5853529101776985e-05, + "loss": 1.015580654144287, + "step": 604 + }, + { + "epoch": 1.6557377049180326, + "grad_norm": 0.3726518452167511, + "learning_rate": 1.5998723256819298e-05, + "loss": 1.165217638015747, + "step": 606 + }, + { + "epoch": 1.6612021857923498, + "grad_norm": 0.3667909502983093, + "learning_rate": 1.614205987807872e-05, + "loss": 0.8437522649765015, + "step": 608 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.27238333225250244, + "learning_rate": 1.628348570143105e-05, + "loss": 1.1401313543319702, + "step": 610 + }, + { + "epoch": 1.6721311475409837, + "grad_norm": 0.6837865710258484, + "learning_rate": 1.6422948172807745e-05, + "loss": 0.6602663993835449, + "step": 612 + }, + { + "epoch": 1.6775956284153004, + "grad_norm": 0.23932527005672455, + "learning_rate": 1.6560395467725086e-05, + "loss": 0.9976965188980103, + "step": 614 + }, + { + "epoch": 1.6830601092896176, + "grad_norm": 0.3285486400127411, + "learning_rate": 1.6695776510542253e-05, + "loss": 1.039743185043335, + "step": 616 + }, + { + "epoch": 1.6885245901639343, + "grad_norm": 0.25682809948921204, + "learning_rate": 1.6829040993441085e-05, + "loss": 0.5925134420394897, + "step": 618 + }, + { + "epoch": 1.6939890710382515, + "grad_norm": 0.8280110955238342, + "learning_rate": 1.696013939512057e-05, + "loss": 1.1452248096466064, + "step": 620 + }, + { + "epoch": 1.6994535519125682, + "grad_norm": 1.396665334701538, + "learning_rate": 1.7089022999199064e-05, + "loss": 1.2453051805496216, + "step": 622 + }, + { + "epoch": 1.7049180327868854, + "grad_norm": 0.29501214623451233, + "learning_rate": 1.7215643912317323e-05, + "loss": 0.7819148302078247, + "step": 624 + }, + { + "epoch": 1.710382513661202, + "grad_norm": 0.2633409798145294, + "learning_rate": 1.73399550819358e-05, + "loss": 0.681013286113739, + "step": 626 + }, + { + "epoch": 1.7158469945355193, + "grad_norm": 1.002079963684082, + "learning_rate": 1.746191031381943e-05, + "loss": 1.099534511566162, + "step": 628 + }, + { + "epoch": 1.721311475409836, + "grad_norm": 0.31711679697036743, + "learning_rate": 1.7581464289203475e-05, + "loss": 0.8212696313858032, + "step": 630 + }, + { + "epoch": 1.7267759562841531, + "grad_norm": 0.2709159851074219, + "learning_rate": 1.7698572581634083e-05, + "loss": 1.104303002357483, + "step": 632 + }, + { + "epoch": 1.7322404371584699, + "grad_norm": 0.3629956841468811, + "learning_rate": 1.781319167347718e-05, + "loss": 1.1676666736602783, + "step": 634 + }, + { + "epoch": 1.737704918032787, + "grad_norm": 0.4824794828891754, + "learning_rate": 1.7925278972089748e-05, + "loss": 1.005893349647522, + "step": 636 + }, + { + "epoch": 1.7431693989071038, + "grad_norm": 0.28539636731147766, + "learning_rate": 1.8034792825647287e-05, + "loss": 1.1372097730636597, + "step": 638 + }, + { + "epoch": 1.748633879781421, + "grad_norm": 0.5941718816757202, + "learning_rate": 1.8141692538621716e-05, + "loss": 0.9606054425239563, + "step": 640 + }, + { + "epoch": 1.7540983606557377, + "grad_norm": 0.23658542335033417, + "learning_rate": 1.8245938386903896e-05, + "loss": 1.1501632928848267, + "step": 642 + }, + { + "epoch": 1.7595628415300546, + "grad_norm": 0.31275659799575806, + "learning_rate": 1.8347491632565156e-05, + "loss": 1.0679880380630493, + "step": 644 + }, + { + "epoch": 1.7650273224043715, + "grad_norm": 0.31820371747016907, + "learning_rate": 1.8446314538252407e-05, + "loss": 0.765678882598877, + "step": 646 + }, + { + "epoch": 1.7704918032786885, + "grad_norm": 0.2747284173965454, + "learning_rate": 1.8542370381211374e-05, + "loss": 1.0073585510253906, + "step": 648 + }, + { + "epoch": 1.7759562841530054, + "grad_norm": 0.33899250626564026, + "learning_rate": 1.8635623466932843e-05, + "loss": 1.0795483589172363, + "step": 650 + }, + { + "epoch": 1.7814207650273224, + "grad_norm": 0.3018670976161957, + "learning_rate": 1.8726039142416796e-05, + "loss": 1.139539122581482, + "step": 652 + }, + { + "epoch": 1.7868852459016393, + "grad_norm": 0.40128761529922485, + "learning_rate": 1.881358380904954e-05, + "loss": 0.9823530316352844, + "step": 654 + }, + { + "epoch": 1.7923497267759563, + "grad_norm": 1.3912447690963745, + "learning_rate": 1.889822493508897e-05, + "loss": 0.9858295917510986, + "step": 656 + }, + { + "epoch": 1.7978142076502732, + "grad_norm": 0.6013797521591187, + "learning_rate": 1.897993106775346e-05, + "loss": 1.03643000125885, + "step": 658 + }, + { + "epoch": 1.8032786885245902, + "grad_norm": 0.6496340036392212, + "learning_rate": 1.9058671844909742e-05, + "loss": 1.0535352230072021, + "step": 660 + }, + { + "epoch": 1.8087431693989071, + "grad_norm": 0.25427573919296265, + "learning_rate": 1.9134418006355532e-05, + "loss": 1.0733699798583984, + "step": 662 + }, + { + "epoch": 1.814207650273224, + "grad_norm": 0.5028781890869141, + "learning_rate": 1.9207141404692667e-05, + "loss": 1.0931841135025024, + "step": 664 + }, + { + "epoch": 1.819672131147541, + "grad_norm": 0.33322790265083313, + "learning_rate": 1.927681501578672e-05, + "loss": 1.368470549583435, + "step": 666 + }, + { + "epoch": 1.825136612021858, + "grad_norm": 0.48202478885650635, + "learning_rate": 1.934341294880924e-05, + "loss": 1.1083478927612305, + "step": 668 + }, + { + "epoch": 1.830601092896175, + "grad_norm": 0.265699565410614, + "learning_rate": 1.9406910455858783e-05, + "loss": 0.7845261693000793, + "step": 670 + }, + { + "epoch": 1.8360655737704918, + "grad_norm": 0.9111495018005371, + "learning_rate": 1.9467283941157304e-05, + "loss": 1.3333193063735962, + "step": 672 + }, + { + "epoch": 1.8415300546448088, + "grad_norm": 0.41249316930770874, + "learning_rate": 1.952451096981838e-05, + "loss": 1.0828064680099487, + "step": 674 + }, + { + "epoch": 1.8469945355191257, + "grad_norm": 0.34011855721473694, + "learning_rate": 1.957857027618405e-05, + "loss": 1.128922700881958, + "step": 676 + }, + { + "epoch": 1.8524590163934427, + "grad_norm": 0.11010833084583282, + "learning_rate": 1.9629441771727166e-05, + "loss": 0.27124765515327454, + "step": 678 + }, + { + "epoch": 1.8579234972677594, + "grad_norm": 0.34342408180236816, + "learning_rate": 1.9677106552516317e-05, + "loss": 0.9879894256591797, + "step": 680 + }, + { + "epoch": 1.8633879781420766, + "grad_norm": 0.39624279737472534, + "learning_rate": 1.9721546906240577e-05, + "loss": 1.00787353515625, + "step": 682 + }, + { + "epoch": 1.8688524590163933, + "grad_norm": 0.3148435056209564, + "learning_rate": 1.976274631879142e-05, + "loss": 1.0767008066177368, + "step": 684 + }, + { + "epoch": 1.8743169398907105, + "grad_norm": 0.601940393447876, + "learning_rate": 1.9800689480399383e-05, + "loss": 1.0266121625900269, + "step": 686 + }, + { + "epoch": 1.8797814207650272, + "grad_norm": 0.5932305455207825, + "learning_rate": 1.9835362291323222e-05, + "loss": 1.14261794090271, + "step": 688 + }, + { + "epoch": 1.8852459016393444, + "grad_norm": 0.7920188903808594, + "learning_rate": 1.9866751867089363e-05, + "loss": 1.1080957651138306, + "step": 690 + }, + { + "epoch": 1.890710382513661, + "grad_norm": 0.9028313755989075, + "learning_rate": 1.9894846543279838e-05, + "loss": 0.8868638277053833, + "step": 692 + }, + { + "epoch": 1.8961748633879782, + "grad_norm": 0.44754672050476074, + "learning_rate": 1.991963587986677e-05, + "loss": 1.16123366355896, + "step": 694 + }, + { + "epoch": 1.901639344262295, + "grad_norm": 0.3467576205730438, + "learning_rate": 1.9941110665091922e-05, + "loss": 1.111083745956421, + "step": 696 + }, + { + "epoch": 1.9071038251366121, + "grad_norm": 0.8681450486183167, + "learning_rate": 1.9959262918889774e-05, + "loss": 1.0020962953567505, + "step": 698 + }, + { + "epoch": 1.9125683060109289, + "grad_norm": 0.37495821714401245, + "learning_rate": 1.9974085895852973e-05, + "loss": 1.062408208847046, + "step": 700 + }, + { + "epoch": 1.918032786885246, + "grad_norm": 0.728013277053833, + "learning_rate": 1.99855740877389e-05, + "loss": 1.0249536037445068, + "step": 702 + }, + { + "epoch": 1.9234972677595628, + "grad_norm": 0.37011271715164185, + "learning_rate": 1.9993723225516553e-05, + "loss": 1.1557523012161255, + "step": 704 + }, + { + "epoch": 1.92896174863388, + "grad_norm": 0.5814104676246643, + "learning_rate": 1.9998530280952938e-05, + "loss": 1.1579149961471558, + "step": 706 + }, + { + "epoch": 1.9344262295081966, + "grad_norm": 0.2298351526260376, + "learning_rate": 1.9999993467738345e-05, + "loss": 1.1778736114501953, + "step": 708 + }, + { + "epoch": 1.9398907103825138, + "grad_norm": 0.38486695289611816, + "learning_rate": 1.9998112242150162e-05, + "loss": 0.9057983160018921, + "step": 710 + }, + { + "epoch": 1.9453551912568305, + "grad_norm": 0.8062682151794434, + "learning_rate": 1.999288730325491e-05, + "loss": 0.6416868567466736, + "step": 712 + }, + { + "epoch": 1.9508196721311475, + "grad_norm": 0.29092615842819214, + "learning_rate": 1.9984320592648474e-05, + "loss": 1.1394217014312744, + "step": 714 + }, + { + "epoch": 1.9562841530054644, + "grad_norm": 1.3473927974700928, + "learning_rate": 1.9972415293734607e-05, + "loss": 0.8599758148193359, + "step": 716 + }, + { + "epoch": 1.9617486338797814, + "grad_norm": 0.2769167423248291, + "learning_rate": 1.995717583054196e-05, + "loss": 1.1438926458358765, + "step": 718 + }, + { + "epoch": 1.9672131147540983, + "grad_norm": 0.41115716099739075, + "learning_rate": 1.9938607866080114e-05, + "loss": 1.095086693763733, + "step": 720 + }, + { + "epoch": 1.9726775956284153, + "grad_norm": 1.553356409072876, + "learning_rate": 1.991671830023521e-05, + "loss": 1.1327463388442993, + "step": 722 + }, + { + "epoch": 1.9781420765027322, + "grad_norm": 0.21956849098205566, + "learning_rate": 1.989151526720591e-05, + "loss": 1.078493595123291, + "step": 724 + }, + { + "epoch": 1.9836065573770492, + "grad_norm": 0.5972254276275635, + "learning_rate": 1.986300813248073e-05, + "loss": 1.0689141750335693, + "step": 726 + }, + { + "epoch": 1.989071038251366, + "grad_norm": 0.3301437497138977, + "learning_rate": 1.9831207489357825e-05, + "loss": 1.1041065454483032, + "step": 728 + }, + { + "epoch": 1.994535519125683, + "grad_norm": 0.9666980504989624, + "learning_rate": 1.979612515500847e-05, + "loss": 0.987746000289917, + "step": 730 + }, + { + "epoch": 2.0, + "grad_norm": 0.3877274692058563, + "learning_rate": 1.97577741660858e-05, + "loss": 1.1460107564926147, + "step": 732 + }, + { + "epoch": 2.0054644808743167, + "grad_norm": 0.25538286566734314, + "learning_rate": 1.9716168773880382e-05, + "loss": 0.6630098223686218, + "step": 734 + }, + { + "epoch": 2.010928961748634, + "grad_norm": 0.4147356450557709, + "learning_rate": 1.9671324439024374e-05, + "loss": 0.8690657615661621, + "step": 736 + }, + { + "epoch": 2.0163934426229506, + "grad_norm": 0.4897252023220062, + "learning_rate": 1.9623257825746357e-05, + "loss": 0.9130824208259583, + "step": 738 + }, + { + "epoch": 2.021857923497268, + "grad_norm": 0.4052647352218628, + "learning_rate": 1.9571986795678878e-05, + "loss": 0.8773000240325928, + "step": 740 + }, + { + "epoch": 2.0273224043715845, + "grad_norm": 1.3314751386642456, + "learning_rate": 1.951753040122102e-05, + "loss": 0.8227751851081848, + "step": 742 + }, + { + "epoch": 2.0327868852459017, + "grad_norm": 3.8880512714385986, + "learning_rate": 1.9459908878458532e-05, + "loss": 1.0034403800964355, + "step": 744 + }, + { + "epoch": 2.0382513661202184, + "grad_norm": 0.4356456995010376, + "learning_rate": 1.939914363964402e-05, + "loss": 0.8533210754394531, + "step": 746 + }, + { + "epoch": 2.0437158469945356, + "grad_norm": 0.41386353969573975, + "learning_rate": 1.9335257265240168e-05, + "loss": 0.9781957864761353, + "step": 748 + }, + { + "epoch": 2.0491803278688523, + "grad_norm": 2.5565185546875, + "learning_rate": 1.9268273495528768e-05, + "loss": 0.6280704736709595, + "step": 750 + }, + { + "epoch": 2.0546448087431695, + "grad_norm": 0.46574458479881287, + "learning_rate": 1.9198217221788806e-05, + "loss": 0.6308586597442627, + "step": 752 + }, + { + "epoch": 2.060109289617486, + "grad_norm": 2.530082941055298, + "learning_rate": 1.9125114477046807e-05, + "loss": 0.6913048028945923, + "step": 754 + }, + { + "epoch": 2.0655737704918034, + "grad_norm": 0.8439071178436279, + "learning_rate": 1.9048992426402947e-05, + "loss": 1.3085075616836548, + "step": 756 + }, + { + "epoch": 2.07103825136612, + "grad_norm": 1.0051873922348022, + "learning_rate": 1.896987935693643e-05, + "loss": 1.0452558994293213, + "step": 758 + }, + { + "epoch": 2.0765027322404372, + "grad_norm": 0.3603721261024475, + "learning_rate": 1.888780466719397e-05, + "loss": 0.9518926739692688, + "step": 760 + }, + { + "epoch": 2.081967213114754, + "grad_norm": 0.6717904210090637, + "learning_rate": 1.8802798856265254e-05, + "loss": 1.1386586427688599, + "step": 762 + }, + { + "epoch": 2.087431693989071, + "grad_norm": 1.6152324676513672, + "learning_rate": 1.8714893512449424e-05, + "loss": 0.8252012729644775, + "step": 764 + }, + { + "epoch": 2.092896174863388, + "grad_norm": 0.29463207721710205, + "learning_rate": 1.8624121301516808e-05, + "loss": 1.0371973514556885, + "step": 766 + }, + { + "epoch": 2.098360655737705, + "grad_norm": 0.48132818937301636, + "learning_rate": 1.853051595457026e-05, + "loss": 0.7042374610900879, + "step": 768 + }, + { + "epoch": 2.1038251366120218, + "grad_norm": 0.4673232436180115, + "learning_rate": 1.843411225551065e-05, + "loss": 0.9674187302589417, + "step": 770 + }, + { + "epoch": 2.109289617486339, + "grad_norm": 3.6729300022125244, + "learning_rate": 1.8334946028111088e-05, + "loss": 0.9104331135749817, + "step": 772 + }, + { + "epoch": 2.1147540983606556, + "grad_norm": 0.38922831416130066, + "learning_rate": 1.8233054122704765e-05, + "loss": 0.8225538730621338, + "step": 774 + }, + { + "epoch": 2.120218579234973, + "grad_norm": 0.29239991307258606, + "learning_rate": 1.8128474402491286e-05, + "loss": 0.8900973200798035, + "step": 776 + }, + { + "epoch": 2.1256830601092895, + "grad_norm": 1.132876992225647, + "learning_rate": 1.802124572946668e-05, + "loss": 0.6526266932487488, + "step": 778 + }, + { + "epoch": 2.1311475409836067, + "grad_norm": 0.3578123450279236, + "learning_rate": 1.791140794998219e-05, + "loss": 0.673201858997345, + "step": 780 + }, + { + "epoch": 2.1366120218579234, + "grad_norm": 0.305058091878891, + "learning_rate": 1.7799001879937294e-05, + "loss": 0.8488634824752808, + "step": 782 + }, + { + "epoch": 2.1420765027322406, + "grad_norm": 0.27180346846580505, + "learning_rate": 1.768406928961248e-05, + "loss": 1.0290989875793457, + "step": 784 + }, + { + "epoch": 2.1475409836065573, + "grad_norm": 0.129277765750885, + "learning_rate": 1.7566652888147328e-05, + "loss": 0.646569550037384, + "step": 786 + }, + { + "epoch": 2.1530054644808745, + "grad_norm": 7.248612403869629, + "learning_rate": 1.7446796307669725e-05, + "loss": 0.6828120350837708, + "step": 788 + }, + { + "epoch": 2.158469945355191, + "grad_norm": 0.5306554436683655, + "learning_rate": 1.732454408708209e-05, + "loss": 0.9795069098472595, + "step": 790 + }, + { + "epoch": 2.1639344262295084, + "grad_norm": 0.3850613236427307, + "learning_rate": 1.719994165551063e-05, + "loss": 0.611956000328064, + "step": 792 + }, + { + "epoch": 2.169398907103825, + "grad_norm": 2.014388084411621, + "learning_rate": 1.7073035315423838e-05, + "loss": 0.8363724946975708, + "step": 794 + }, + { + "epoch": 2.1748633879781423, + "grad_norm": 0.5221168398857117, + "learning_rate": 1.6943872225426396e-05, + "loss": 0.8072232007980347, + "step": 796 + }, + { + "epoch": 2.180327868852459, + "grad_norm": 0.2606959044933319, + "learning_rate": 1.6812500382734977e-05, + "loss": 0.8440264463424683, + "step": 798 + }, + { + "epoch": 2.185792349726776, + "grad_norm": 1.0706250667572021, + "learning_rate": 1.6678968605342348e-05, + "loss": 1.0057344436645508, + "step": 800 + }, + { + "epoch": 2.191256830601093, + "grad_norm": 0.4442984163761139, + "learning_rate": 1.6543326513876602e-05, + "loss": 0.948396623134613, + "step": 802 + }, + { + "epoch": 2.19672131147541, + "grad_norm": 0.4648842215538025, + "learning_rate": 1.6405624513162002e-05, + "loss": 1.0737464427947998, + "step": 804 + }, + { + "epoch": 2.202185792349727, + "grad_norm": 0.2643176019191742, + "learning_rate": 1.6265913773488456e-05, + "loss": 0.9078125357627869, + "step": 806 + }, + { + "epoch": 2.2076502732240435, + "grad_norm": 0.3123255670070648, + "learning_rate": 1.6124246211596606e-05, + "loss": 0.5129435062408447, + "step": 808 + }, + { + "epoch": 2.2131147540983607, + "grad_norm": 0.33814558386802673, + "learning_rate": 1.598067447138542e-05, + "loss": 0.6270028352737427, + "step": 810 + }, + { + "epoch": 2.2185792349726774, + "grad_norm": 0.2907693684101105, + "learning_rate": 1.5835251904349688e-05, + "loss": 0.8713023662567139, + "step": 812 + }, + { + "epoch": 2.2240437158469946, + "grad_norm": 0.32165563106536865, + "learning_rate": 1.5688032549754453e-05, + "loss": 0.7885038256645203, + "step": 814 + }, + { + "epoch": 2.2295081967213113, + "grad_norm": 0.33382898569107056, + "learning_rate": 1.553907111455401e-05, + "loss": 0.847852885723114, + "step": 816 + }, + { + "epoch": 2.2349726775956285, + "grad_norm": 0.31947633624076843, + "learning_rate": 1.538842295306264e-05, + "loss": 0.9458706378936768, + "step": 818 + }, + { + "epoch": 2.240437158469945, + "grad_norm": 0.444782018661499, + "learning_rate": 1.5236144046384917e-05, + "loss": 0.9850488305091858, + "step": 820 + }, + { + "epoch": 2.2459016393442623, + "grad_norm": 0.9171160459518433, + "learning_rate": 1.5082290981612987e-05, + "loss": 1.1842435598373413, + "step": 822 + }, + { + "epoch": 2.251366120218579, + "grad_norm": 0.29313865303993225, + "learning_rate": 1.4926920930798736e-05, + "loss": 0.8148974776268005, + "step": 824 + }, + { + "epoch": 2.2568306010928962, + "grad_norm": 1.1459228992462158, + "learning_rate": 1.4770091629708562e-05, + "loss": 0.6716075539588928, + "step": 826 + }, + { + "epoch": 2.262295081967213, + "grad_norm": 0.2907215654850006, + "learning_rate": 1.461186135636868e-05, + "loss": 0.5930277109146118, + "step": 828 + }, + { + "epoch": 2.26775956284153, + "grad_norm": 0.36279723048210144, + "learning_rate": 1.4452288909408864e-05, + "loss": 0.803901195526123, + "step": 830 + }, + { + "epoch": 2.273224043715847, + "grad_norm": 0.987859308719635, + "learning_rate": 1.4291433586212831e-05, + "loss": 0.586320698261261, + "step": 832 + }, + { + "epoch": 2.278688524590164, + "grad_norm": 0.4347521662712097, + "learning_rate": 1.4129355160883216e-05, + "loss": 0.7422566413879395, + "step": 834 + }, + { + "epoch": 2.2841530054644807, + "grad_norm": 0.6859447360038757, + "learning_rate": 1.3966113862029429e-05, + "loss": 0.9895762205123901, + "step": 836 + }, + { + "epoch": 2.289617486338798, + "grad_norm": 1.1535533666610718, + "learning_rate": 1.3801770350386568e-05, + "loss": 0.9528064131736755, + "step": 838 + }, + { + "epoch": 2.2950819672131146, + "grad_norm": 0.333217978477478, + "learning_rate": 1.363638569627384e-05, + "loss": 0.6650771498680115, + "step": 840 + }, + { + "epoch": 2.300546448087432, + "grad_norm": 0.3299920856952667, + "learning_rate": 1.3470021356900696e-05, + "loss": 0.9850642681121826, + "step": 842 + }, + { + "epoch": 2.3060109289617485, + "grad_norm": 0.2731953263282776, + "learning_rate": 1.3302739153529252e-05, + "loss": 1.0595612525939941, + "step": 844 + }, + { + "epoch": 2.3114754098360657, + "grad_norm": 0.20923754572868347, + "learning_rate": 1.3134601248501366e-05, + "loss": 0.5519586801528931, + "step": 846 + }, + { + "epoch": 2.3169398907103824, + "grad_norm": 0.3694831132888794, + "learning_rate": 1.2965670122139071e-05, + "loss": 0.9783721566200256, + "step": 848 + }, + { + "epoch": 2.3224043715846996, + "grad_norm": 0.2181791067123413, + "learning_rate": 1.2796008549526752e-05, + "loss": 0.8303213119506836, + "step": 850 + }, + { + "epoch": 2.3278688524590163, + "grad_norm": 0.5311651229858398, + "learning_rate": 1.262567957718378e-05, + "loss": 0.7890332341194153, + "step": 852 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.6432619690895081, + "learning_rate": 1.2454746499636408e-05, + "loss": 0.9261720776557922, + "step": 854 + }, + { + "epoch": 2.33879781420765, + "grad_norm": 0.23040394484996796, + "learning_rate": 1.2283272835897359e-05, + "loss": 0.8952475190162659, + "step": 856 + }, + { + "epoch": 2.3442622950819674, + "grad_norm": 0.4025140404701233, + "learning_rate": 1.2111322305862088e-05, + "loss": 0.641585648059845, + "step": 858 + }, + { + "epoch": 2.349726775956284, + "grad_norm": 0.2443774938583374, + "learning_rate": 1.1938958806630322e-05, + "loss": 0.7296821475028992, + "step": 860 + }, + { + "epoch": 2.3551912568306013, + "grad_norm": 0.3356713652610779, + "learning_rate": 1.1766246388761841e-05, + "loss": 0.8933297395706177, + "step": 862 + }, + { + "epoch": 2.360655737704918, + "grad_norm": 0.40507715940475464, + "learning_rate": 1.1593249232475162e-05, + "loss": 0.5992748737335205, + "step": 864 + }, + { + "epoch": 2.366120218579235, + "grad_norm": 0.8139327168464661, + "learning_rate": 1.142003162379808e-05, + "loss": 1.2816228866577148, + "step": 866 + }, + { + "epoch": 2.371584699453552, + "grad_norm": 0.22665131092071533, + "learning_rate": 1.1246657930678817e-05, + "loss": 0.9113897085189819, + "step": 868 + }, + { + "epoch": 2.3770491803278686, + "grad_norm": 0.2706243097782135, + "learning_rate": 1.1073192579066867e-05, + "loss": 0.7394488453865051, + "step": 870 + }, + { + "epoch": 2.3825136612021858, + "grad_norm": 0.2607859671115875, + "learning_rate": 1.0899700028972169e-05, + "loss": 0.8141605257987976, + "step": 872 + }, + { + "epoch": 2.387978142076503, + "grad_norm": 0.3351480960845947, + "learning_rate": 1.072624475051166e-05, + "loss": 0.9203171730041504, + "step": 874 + }, + { + "epoch": 2.3934426229508197, + "grad_norm": 2.871534824371338, + "learning_rate": 1.055289119995206e-05, + "loss": 0.7342777848243713, + "step": 876 + }, + { + "epoch": 2.3989071038251364, + "grad_norm": 0.31686869263648987, + "learning_rate": 1.0379703795757853e-05, + "loss": 0.822813093662262, + "step": 878 + }, + { + "epoch": 2.4043715846994536, + "grad_norm": 0.3635275363922119, + "learning_rate": 1.0206746894653252e-05, + "loss": 1.01185941696167, + "step": 880 + }, + { + "epoch": 2.4098360655737707, + "grad_norm": 0.2960561215877533, + "learning_rate": 1.0034084767707164e-05, + "loss": 0.491107702255249, + "step": 882 + }, + { + "epoch": 2.4153005464480874, + "grad_norm": 0.32399389147758484, + "learning_rate": 9.861781576449879e-06, + "loss": 0.7038915157318115, + "step": 884 + }, + { + "epoch": 2.420765027322404, + "grad_norm": 0.3570184111595154, + "learning_rate": 9.689901349030646e-06, + "loss": 0.8117411136627197, + "step": 886 + }, + { + "epoch": 2.4262295081967213, + "grad_norm": 0.3321293890476227, + "learning_rate": 9.518507956424643e-06, + "loss": 0.6311563849449158, + "step": 888 + }, + { + "epoch": 2.431693989071038, + "grad_norm": 0.3352676331996918, + "learning_rate": 9.347665088698444e-06, + "loss": 0.5507946610450745, + "step": 890 + }, + { + "epoch": 2.4371584699453552, + "grad_norm": 0.7419760227203369, + "learning_rate": 9.177436231342623e-06, + "loss": 0.8605806827545166, + "step": 892 + }, + { + "epoch": 2.442622950819672, + "grad_norm": 0.3643116056919098, + "learning_rate": 9.00788464168054e-06, + "loss": 0.49560198187828064, + "step": 894 + }, + { + "epoch": 2.448087431693989, + "grad_norm": 0.32895293831825256, + "learning_rate": 8.839073325361751e-06, + "loss": 0.8086169958114624, + "step": 896 + }, + { + "epoch": 2.453551912568306, + "grad_norm": 0.9999696016311646, + "learning_rate": 8.67106501294902e-06, + "loss": 0.7644316554069519, + "step": 898 + }, + { + "epoch": 2.459016393442623, + "grad_norm": 0.32013651728630066, + "learning_rate": 8.503922136607536e-06, + "loss": 0.8786826133728027, + "step": 900 + }, + { + "epoch": 2.4644808743169397, + "grad_norm": 0.34828099608421326, + "learning_rate": 8.337706806905029e-06, + "loss": 0.9184771776199341, + "step": 902 + }, + { + "epoch": 2.469945355191257, + "grad_norm": 0.21218842267990112, + "learning_rate": 8.172480789731374e-06, + "loss": 0.7311652302742004, + "step": 904 + }, + { + "epoch": 2.4754098360655736, + "grad_norm": 0.5770183205604553, + "learning_rate": 8.00830548334625e-06, + "loss": 0.972994327545166, + "step": 906 + }, + { + "epoch": 2.480874316939891, + "grad_norm": 0.15764155983924866, + "learning_rate": 7.84524189556352e-06, + "loss": 0.4051420986652374, + "step": 908 + }, + { + "epoch": 2.4863387978142075, + "grad_norm": 0.23087617754936218, + "learning_rate": 7.68335062108057e-06, + "loss": 0.6997837424278259, + "step": 910 + }, + { + "epoch": 2.4918032786885247, + "grad_norm": 0.3418113887310028, + "learning_rate": 7.522691818961252e-06, + "loss": 0.42771270871162415, + "step": 912 + }, + { + "epoch": 2.4972677595628414, + "grad_norm": 0.38769078254699707, + "learning_rate": 7.3633251902806165e-06, + "loss": 0.9568927884101868, + "step": 914 + }, + { + "epoch": 2.5027322404371586, + "grad_norm": 0.30056893825531006, + "learning_rate": 7.205309955939983e-06, + "loss": 0.7710816264152527, + "step": 916 + }, + { + "epoch": 2.5081967213114753, + "grad_norm": 0.3079778850078583, + "learning_rate": 7.048704834660296e-06, + "loss": 0.7985804677009583, + "step": 918 + }, + { + "epoch": 2.5136612021857925, + "grad_norm": 0.29017210006713867, + "learning_rate": 6.8935680211621715e-06, + "loss": 0.695990264415741, + "step": 920 + }, + { + "epoch": 2.519125683060109, + "grad_norm": 0.24671095609664917, + "learning_rate": 6.739957164540634e-06, + "loss": 0.7687212228775024, + "step": 922 + }, + { + "epoch": 2.5245901639344264, + "grad_norm": 0.2934419512748718, + "learning_rate": 6.587929346842625e-06, + "loss": 0.5792368650436401, + "step": 924 + }, + { + "epoch": 2.530054644808743, + "grad_norm": 0.43741247057914734, + "learning_rate": 6.437541061855222e-06, + "loss": 0.550622820854187, + "step": 926 + }, + { + "epoch": 2.5355191256830603, + "grad_norm": 0.28486597537994385, + "learning_rate": 6.288848194112459e-06, + "loss": 0.6305705904960632, + "step": 928 + }, + { + "epoch": 2.540983606557377, + "grad_norm": 0.3551803529262543, + "learning_rate": 6.141905998128495e-06, + "loss": 0.9627759456634521, + "step": 930 + }, + { + "epoch": 2.546448087431694, + "grad_norm": 0.34825268387794495, + "learning_rate": 5.996769077865029e-06, + "loss": 0.6931189894676208, + "step": 932 + }, + { + "epoch": 2.551912568306011, + "grad_norm": 0.39621520042419434, + "learning_rate": 5.853491366440313e-06, + "loss": 0.8206515908241272, + "step": 934 + }, + { + "epoch": 2.557377049180328, + "grad_norm": 6.864598274230957, + "learning_rate": 5.712126106087557e-06, + "loss": 0.6882966160774231, + "step": 936 + }, + { + "epoch": 2.5628415300546448, + "grad_norm": 0.7489693760871887, + "learning_rate": 5.572725828369961e-06, + "loss": 0.5850929021835327, + "step": 938 + }, + { + "epoch": 2.5683060109289615, + "grad_norm": 0.5904379487037659, + "learning_rate": 5.4353423346599944e-06, + "loss": 1.0272624492645264, + "step": 940 + }, + { + "epoch": 2.5737704918032787, + "grad_norm": 0.33465638756752014, + "learning_rate": 5.30002667688986e-06, + "loss": 0.5956709980964661, + "step": 942 + }, + { + "epoch": 2.579234972677596, + "grad_norm": 0.49614542722702026, + "learning_rate": 5.1668291385804995e-06, + "loss": 0.9119790196418762, + "step": 944 + }, + { + "epoch": 2.5846994535519126, + "grad_norm": 0.18383480608463287, + "learning_rate": 5.03579921615621e-06, + "loss": 0.339104026556015, + "step": 946 + }, + { + "epoch": 2.5901639344262293, + "grad_norm": 0.39025598764419556, + "learning_rate": 4.906985600551651e-06, + "loss": 0.7046570777893066, + "step": 948 + }, + { + "epoch": 2.5956284153005464, + "grad_norm": 0.32544654607772827, + "learning_rate": 4.780436159118221e-06, + "loss": 0.7014875411987305, + "step": 950 + }, + { + "epoch": 2.6010928961748636, + "grad_norm": 0.4727187752723694, + "learning_rate": 4.656197917836474e-06, + "loss": 0.9602597951889038, + "step": 952 + }, + { + "epoch": 2.6065573770491803, + "grad_norm": 0.5159199237823486, + "learning_rate": 4.5343170438411885e-06, + "loss": 0.45191124081611633, + "step": 954 + }, + { + "epoch": 2.612021857923497, + "grad_norm": 0.42049863934516907, + "learning_rate": 4.414838828265581e-06, + "loss": 0.2999444603919983, + "step": 956 + }, + { + "epoch": 2.6174863387978142, + "grad_norm": 0.2952582538127899, + "learning_rate": 4.297807669411057e-06, + "loss": 0.518054187297821, + "step": 958 + }, + { + "epoch": 2.6229508196721314, + "grad_norm": 0.30971741676330566, + "learning_rate": 4.183267056248689e-06, + "loss": 0.763922929763794, + "step": 960 + }, + { + "epoch": 2.628415300546448, + "grad_norm": 0.3294529616832733, + "learning_rate": 4.071259552258709e-06, + "loss": 0.6879506707191467, + "step": 962 + }, + { + "epoch": 2.633879781420765, + "grad_norm": 0.4937688708305359, + "learning_rate": 3.961826779613801e-06, + "loss": 0.9857227802276611, + "step": 964 + }, + { + "epoch": 2.639344262295082, + "grad_norm": 0.3687383532524109, + "learning_rate": 3.85500940371226e-06, + "loss": 0.6435363292694092, + "step": 966 + }, + { + "epoch": 2.644808743169399, + "grad_norm": 0.3145747184753418, + "learning_rate": 3.750847118066614e-06, + "loss": 0.6846204400062561, + "step": 968 + }, + { + "epoch": 2.650273224043716, + "grad_norm": 0.8533056974411011, + "learning_rate": 3.6493786295535234e-06, + "loss": 0.8020285367965698, + "step": 970 + }, + { + "epoch": 2.6557377049180326, + "grad_norm": 0.38222646713256836, + "learning_rate": 3.5506416440301885e-06, + "loss": 1.064414143562317, + "step": 972 + }, + { + "epoch": 2.66120218579235, + "grad_norm": 0.7065198421478271, + "learning_rate": 3.4546728523228067e-06, + "loss": 0.6177721619606018, + "step": 974 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.28936606645584106, + "learning_rate": 3.361507916592206e-06, + "loss": 0.8083776831626892, + "step": 976 + }, + { + "epoch": 2.6721311475409837, + "grad_norm": 0.21318885684013367, + "learning_rate": 3.271181457081715e-06, + "loss": 0.7788843512535095, + "step": 978 + }, + { + "epoch": 2.6775956284153004, + "grad_norm": 0.35676872730255127, + "learning_rate": 3.1837270392522456e-06, + "loss": 0.8598240613937378, + "step": 980 + }, + { + "epoch": 2.6830601092896176, + "grad_norm": 0.2766752243041992, + "learning_rate": 3.0991771613092686e-06, + "loss": 0.9411115050315857, + "step": 982 + }, + { + "epoch": 2.6885245901639343, + "grad_norm": 0.2711624801158905, + "learning_rate": 3.017563242126483e-06, + "loss": 0.6280143857002258, + "step": 984 + }, + { + "epoch": 2.6939890710382515, + "grad_norm": 0.2689661681652069, + "learning_rate": 2.9389156095704764e-06, + "loss": 0.9351664185523987, + "step": 986 + }, + { + "epoch": 2.699453551912568, + "grad_norm": 0.962471067905426, + "learning_rate": 2.8632634892308535e-06, + "loss": 0.8302200436592102, + "step": 988 + }, + { + "epoch": 2.7049180327868854, + "grad_norm": 0.39670348167419434, + "learning_rate": 2.7906349935599326e-06, + "loss": 0.8161414265632629, + "step": 990 + }, + { + "epoch": 2.710382513661202, + "grad_norm": 0.35174086689949036, + "learning_rate": 2.721057111426154e-06, + "loss": 0.5259367227554321, + "step": 992 + }, + { + "epoch": 2.7158469945355193, + "grad_norm": 0.2511337399482727, + "learning_rate": 2.6545556980849417e-06, + "loss": 0.9182239770889282, + "step": 994 + }, + { + "epoch": 2.721311475409836, + "grad_norm": 0.21024306118488312, + "learning_rate": 2.591155465570866e-06, + "loss": 0.8048747181892395, + "step": 996 + }, + { + "epoch": 2.726775956284153, + "grad_norm": 0.2689720690250397, + "learning_rate": 2.5308799735145813e-06, + "loss": 0.7127807140350342, + "step": 998 + }, + { + "epoch": 2.73224043715847, + "grad_norm": 0.7397425770759583, + "learning_rate": 2.473751620388069e-06, + "loss": 1.054977536201477, + "step": 1000 + }, + { + "epoch": 2.737704918032787, + "grad_norm": 0.28134724497795105, + "learning_rate": 2.419791635181301e-06, + "loss": 0.8681496381759644, + "step": 1002 + }, + { + "epoch": 2.7431693989071038, + "grad_norm": 0.22395454347133636, + "learning_rate": 2.369020069513521e-06, + "loss": 0.6971544027328491, + "step": 1004 + }, + { + "epoch": 2.748633879781421, + "grad_norm": 0.273333877325058, + "learning_rate": 2.3214557901820258e-06, + "loss": 0.35114356875419617, + "step": 1006 + }, + { + "epoch": 2.7540983606557377, + "grad_norm": 0.33767545223236084, + "learning_rate": 2.27711647215124e-06, + "loss": 0.9685407280921936, + "step": 1008 + }, + { + "epoch": 2.7595628415300544, + "grad_norm": 0.3212963938713074, + "learning_rate": 2.2360185919846593e-06, + "loss": 1.0383319854736328, + "step": 1010 + }, + { + "epoch": 2.7650273224043715, + "grad_norm": 0.4074728190898895, + "learning_rate": 2.1981774217221474e-06, + "loss": 0.7557249069213867, + "step": 1012 + }, + { + "epoch": 2.7704918032786887, + "grad_norm": 0.3645928204059601, + "learning_rate": 2.1636070232047966e-06, + "loss": 0.9979292154312134, + "step": 1014 + }, + { + "epoch": 2.7759562841530054, + "grad_norm": 0.5972633361816406, + "learning_rate": 2.1323202428495544e-06, + "loss": 0.7092854380607605, + "step": 1016 + }, + { + "epoch": 2.781420765027322, + "grad_norm": 0.33370643854141235, + "learning_rate": 2.104328706875452e-06, + "loss": 0.9313384890556335, + "step": 1018 + }, + { + "epoch": 2.7868852459016393, + "grad_norm": 0.4620024263858795, + "learning_rate": 2.079642816983293e-06, + "loss": 0.6462956666946411, + "step": 1020 + }, + { + "epoch": 2.7923497267759565, + "grad_norm": 0.3582730293273926, + "learning_rate": 2.0582717464903546e-06, + "loss": 0.8746689558029175, + "step": 1022 + }, + { + "epoch": 2.797814207650273, + "grad_norm": 0.618171751499176, + "learning_rate": 2.040223436921581e-06, + "loss": 0.4330008327960968, + "step": 1024 + }, + { + "epoch": 2.80327868852459, + "grad_norm": 0.33677467703819275, + "learning_rate": 2.025504595058489e-06, + "loss": 0.9515740275382996, + "step": 1026 + }, + { + "epoch": 2.808743169398907, + "grad_norm": 1.293221354484558, + "learning_rate": 2.0141206904469206e-06, + "loss": 0.6902921795845032, + "step": 1028 + }, + { + "epoch": 2.8142076502732243, + "grad_norm": 0.2411491721868515, + "learning_rate": 2.006075953364551e-06, + "loss": 0.7488702535629272, + "step": 1030 + }, + { + "epoch": 2.819672131147541, + "grad_norm": 0.27980777621269226, + "learning_rate": 2.0013733732489103e-06, + "loss": 0.7026211619377136, + "step": 1032 + }, + { + "epoch": 2.8251366120218577, + "grad_norm": 0.5924912095069885, + "learning_rate": 2.000014697586502e-06, + "loss": 0.7316262125968933, + "step": 1034 + }, + { + "epoch": 2.830601092896175, + "grad_norm": 0.7772489190101624, + "learning_rate": 2.0020004312634374e-06, + "loss": 1.0607091188430786, + "step": 1036 + }, + { + "epoch": 2.836065573770492, + "grad_norm": 0.23920530080795288, + "learning_rate": 2.0073298363778166e-06, + "loss": 0.7372963428497314, + "step": 1038 + }, + { + "epoch": 2.841530054644809, + "grad_norm": 0.32360178232192993, + "learning_rate": 2.016000932513934e-06, + "loss": 0.8663780093193054, + "step": 1040 + }, + { + "epoch": 2.8469945355191255, + "grad_norm": 0.2977835536003113, + "learning_rate": 2.0280104974782058e-06, + "loss": 0.8212502002716064, + "step": 1042 + }, + { + "epoch": 2.8524590163934427, + "grad_norm": 0.29036644101142883, + "learning_rate": 2.043354068496541e-06, + "loss": 0.7420258522033691, + "step": 1044 + }, + { + "epoch": 2.8579234972677594, + "grad_norm": 0.3778425455093384, + "learning_rate": 2.0620259438727168e-06, + "loss": 0.9037473797798157, + "step": 1046 + }, + { + "epoch": 2.8633879781420766, + "grad_norm": 0.11288527399301529, + "learning_rate": 2.084019185107135e-06, + "loss": 0.38631874322891235, + "step": 1048 + }, + { + "epoch": 2.8688524590163933, + "grad_norm": 0.42890432476997375, + "learning_rate": 2.1093256194751822e-06, + "loss": 0.787931501865387, + "step": 1050 + }, + { + "epoch": 2.8743169398907105, + "grad_norm": 0.3188338279724121, + "learning_rate": 2.137935843064233e-06, + "loss": 0.7466610074043274, + "step": 1052 + }, + { + "epoch": 2.879781420765027, + "grad_norm": 0.3111341595649719, + "learning_rate": 2.1698392242681502e-06, + "loss": 0.966293454170227, + "step": 1054 + }, + { + "epoch": 2.8852459016393444, + "grad_norm": 0.7533649802207947, + "learning_rate": 2.2050239077380097e-06, + "loss": 0.6824079155921936, + "step": 1056 + }, + { + "epoch": 2.890710382513661, + "grad_norm": 2.3769004344940186, + "learning_rate": 2.2434768187875723e-06, + "loss": 0.5979699492454529, + "step": 1058 + }, + { + "epoch": 2.8961748633879782, + "grad_norm": 0.33416691422462463, + "learning_rate": 2.285183668251853e-06, + "loss": 0.8557246327400208, + "step": 1060 + }, + { + "epoch": 2.901639344262295, + "grad_norm": 0.44861775636672974, + "learning_rate": 2.3301289577970028e-06, + "loss": 0.8509962558746338, + "step": 1062 + }, + { + "epoch": 2.907103825136612, + "grad_norm": 0.28698277473449707, + "learning_rate": 2.3782959856795113e-06, + "loss": 0.8196679353713989, + "step": 1064 + }, + { + "epoch": 2.912568306010929, + "grad_norm": 0.17325085401535034, + "learning_rate": 2.4296668529525998e-06, + "loss": 0.46036872267723083, + "step": 1066 + }, + { + "epoch": 2.918032786885246, + "grad_norm": 0.31135252118110657, + "learning_rate": 2.4842224701175147e-06, + "loss": 0.6244635581970215, + "step": 1068 + }, + { + "epoch": 2.9234972677595628, + "grad_norm": 0.5156043171882629, + "learning_rate": 2.541942564217196e-06, + "loss": 0.7163093090057373, + "step": 1070 + }, + { + "epoch": 2.92896174863388, + "grad_norm": 0.2627166509628296, + "learning_rate": 2.6028056863697506e-06, + "loss": 0.9640206694602966, + "step": 1072 + }, + { + "epoch": 2.9344262295081966, + "grad_norm": 0.42738401889801025, + "learning_rate": 2.6667892197388884e-06, + "loss": 0.5566335916519165, + "step": 1074 + }, + { + "epoch": 2.939890710382514, + "grad_norm": 0.6010952591896057, + "learning_rate": 2.7338693879383967e-06, + "loss": 0.48107990622520447, + "step": 1076 + }, + { + "epoch": 2.9453551912568305, + "grad_norm": 0.4749331474304199, + "learning_rate": 2.8040212638674506e-06, + "loss": 0.5274478793144226, + "step": 1078 + }, + { + "epoch": 2.9508196721311473, + "grad_norm": 0.38017284870147705, + "learning_rate": 2.877218778973578e-06, + "loss": 0.9354230165481567, + "step": 1080 + }, + { + "epoch": 2.9562841530054644, + "grad_norm": 0.40139836072921753, + "learning_rate": 2.9534347329398027e-06, + "loss": 0.5934174656867981, + "step": 1082 + }, + { + "epoch": 2.9617486338797816, + "grad_norm": 0.30364465713500977, + "learning_rate": 3.0326408037922827e-06, + "loss": 0.8147178292274475, + "step": 1084 + }, + { + "epoch": 2.9672131147540983, + "grad_norm": 0.1606474071741104, + "learning_rate": 3.1148075584248306e-06, + "loss": 0.5487402677536011, + "step": 1086 + }, + { + "epoch": 2.972677595628415, + "grad_norm": 0.5508518815040588, + "learning_rate": 3.199904463536296e-06, + "loss": 1.1423696279525757, + "step": 1088 + }, + { + "epoch": 2.978142076502732, + "grad_norm": 0.6574386954307556, + "learning_rate": 3.2878998969767954e-06, + "loss": 0.6705480813980103, + "step": 1090 + }, + { + "epoch": 2.9836065573770494, + "grad_norm": 0.4296954572200775, + "learning_rate": 3.378761159498547e-06, + "loss": 0.8128272891044617, + "step": 1092 + }, + { + "epoch": 2.989071038251366, + "grad_norm": 0.28330495953559875, + "learning_rate": 3.472454486906972e-06, + "loss": 0.9465227723121643, + "step": 1094 + }, + { + "epoch": 2.994535519125683, + "grad_norm": 0.679470956325531, + "learning_rate": 3.5689450626075132e-06, + "loss": 0.5051727890968323, + "step": 1096 + }, + { + "epoch": 3.0, + "grad_norm": 0.3425956070423126, + "learning_rate": 3.668197030543573e-06, + "loss": 0.7532654404640198, + "step": 1098 + }, + { + "epoch": 3.0, + "step": 1098, + "total_flos": 4.931331991116186e+18, + "train_loss": 0.9669766378315854, + "train_runtime": 12176.252, + "train_samples_per_second": 5.411, + "train_steps_per_second": 0.09 + } + ], + "logging_steps": 2, + "max_steps": 1098, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 99999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.931331991116186e+18, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +}