diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3886 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1098, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00546448087431694, + "grad_norm": 0.9973115921020508, + "learning_rate": 5.454545454545455e-07, + "loss": 2.5618793964385986, + "step": 2 + }, + { + "epoch": 0.01092896174863388, + "grad_norm": 1.3840559720993042, + "learning_rate": 1.6363636363636363e-06, + "loss": 1.9354113340377808, + "step": 4 + }, + { + "epoch": 0.01639344262295082, + "grad_norm": 2.0087080001831055, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.8657492399215698, + "step": 6 + }, + { + "epoch": 0.02185792349726776, + "grad_norm": 0.9125573635101318, + "learning_rate": 3.818181818181818e-06, + "loss": 1.7251954078674316, + "step": 8 + }, + { + "epoch": 0.0273224043715847, + "grad_norm": 0.5344090461730957, + "learning_rate": 4.90909090909091e-06, + "loss": 1.6777604818344116, + "step": 10 + }, + { + "epoch": 0.03278688524590164, + "grad_norm": 0.604454755783081, + "learning_rate": 6e-06, + "loss": 1.5453071594238281, + "step": 12 + }, + { + "epoch": 0.03825136612021858, + "grad_norm": 0.7670559883117676, + "learning_rate": 7.090909090909091e-06, + "loss": 1.4508954286575317, + "step": 14 + }, + { + "epoch": 0.04371584699453552, + "grad_norm": 0.4149356782436371, + "learning_rate": 8.181818181818181e-06, + "loss": 1.1633912324905396, + "step": 16 + }, + { + "epoch": 0.04918032786885246, + "grad_norm": 0.7089836001396179, + "learning_rate": 9.272727272727273e-06, + "loss": 1.164050579071045, + "step": 18 + }, + { + "epoch": 0.0546448087431694, + "grad_norm": 0.39713212847709656, + "learning_rate": 1.0363636363636364e-05, + "loss": 1.3879873752593994, + "step": 20 + }, + { + "epoch": 0.060109289617486336, + "grad_norm": 0.5368804335594177, + "learning_rate": 1.1454545454545455e-05, + "loss": 1.3274301290512085, + "step": 22 + }, + { + "epoch": 0.06557377049180328, + "grad_norm": 0.30250489711761475, + "learning_rate": 1.2545454545454545e-05, + "loss": 1.385084867477417, + "step": 24 + }, + { + "epoch": 0.07103825136612021, + "grad_norm": 0.9990394115447998, + "learning_rate": 1.3636363636363637e-05, + "loss": 1.5793037414550781, + "step": 26 + }, + { + "epoch": 0.07650273224043716, + "grad_norm": 0.3947621285915375, + "learning_rate": 1.4727272727272728e-05, + "loss": 1.3561811447143555, + "step": 28 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 0.49388208985328674, + "learning_rate": 1.5818181818181818e-05, + "loss": 1.273718237876892, + "step": 30 + }, + { + "epoch": 0.08743169398907104, + "grad_norm": 0.32329627871513367, + "learning_rate": 1.6909090909090907e-05, + "loss": 1.3053655624389648, + "step": 32 + }, + { + "epoch": 0.09289617486338798, + "grad_norm": 0.718172013759613, + "learning_rate": 1.8e-05, + "loss": 1.0134309530258179, + "step": 34 + }, + { + "epoch": 0.09836065573770492, + "grad_norm": 0.27196696400642395, + "learning_rate": 1.909090909090909e-05, + "loss": 1.1276304721832275, + "step": 36 + }, + { + "epoch": 0.10382513661202186, + "grad_norm": 0.4729017913341522, + "learning_rate": 2.0181818181818183e-05, + "loss": 1.3713783025741577, + "step": 38 + }, + { + "epoch": 0.1092896174863388, + "grad_norm": 0.337189644575119, + "learning_rate": 2.1272727272727273e-05, + "loss": 1.3217278718948364, + "step": 40 + }, + { + "epoch": 0.11475409836065574, + "grad_norm": 0.8202743530273438, + "learning_rate": 2.2363636363636366e-05, + "loss": 1.0108166933059692, + "step": 42 + }, + { + "epoch": 0.12021857923497267, + "grad_norm": 0.3670739233493805, + "learning_rate": 2.3454545454545456e-05, + "loss": 0.988700807094574, + "step": 44 + }, + { + "epoch": 0.12568306010928962, + "grad_norm": 0.3262913227081299, + "learning_rate": 2.454545454545455e-05, + "loss": 1.3549995422363281, + "step": 46 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 0.3616645336151123, + "learning_rate": 2.5636363636363635e-05, + "loss": 1.3315670490264893, + "step": 48 + }, + { + "epoch": 0.1366120218579235, + "grad_norm": 0.3672925531864166, + "learning_rate": 2.6727272727272728e-05, + "loss": 1.3015525341033936, + "step": 50 + }, + { + "epoch": 0.14207650273224043, + "grad_norm": 0.42094483971595764, + "learning_rate": 2.7818181818181818e-05, + "loss": 1.2684640884399414, + "step": 52 + }, + { + "epoch": 0.14754098360655737, + "grad_norm": 2.618896722793579, + "learning_rate": 2.890909090909091e-05, + "loss": 0.8703136444091797, + "step": 54 + }, + { + "epoch": 0.15300546448087432, + "grad_norm": 0.3619275391101837, + "learning_rate": 3e-05, + "loss": 1.4717642068862915, + "step": 56 + }, + { + "epoch": 0.15846994535519127, + "grad_norm": 0.3370303213596344, + "learning_rate": 2.9997491688899256e-05, + "loss": 1.451904535293579, + "step": 58 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 0.35009288787841797, + "learning_rate": 2.998996768768956e-05, + "loss": 1.3383069038391113, + "step": 60 + }, + { + "epoch": 0.16939890710382513, + "grad_norm": 0.4444884955883026, + "learning_rate": 2.9977430792302124e-05, + "loss": 1.5286966562271118, + "step": 62 + }, + { + "epoch": 0.17486338797814208, + "grad_norm": 0.7974561452865601, + "learning_rate": 2.9959885661467903e-05, + "loss": 1.2279592752456665, + "step": 64 + }, + { + "epoch": 0.18032786885245902, + "grad_norm": 0.3576641082763672, + "learning_rate": 2.993733881498636e-05, + "loss": 1.3408796787261963, + "step": 66 + }, + { + "epoch": 0.18579234972677597, + "grad_norm": 0.2858317494392395, + "learning_rate": 2.9909798631302736e-05, + "loss": 1.3054747581481934, + "step": 68 + }, + { + "epoch": 0.1912568306010929, + "grad_norm": 0.6741045117378235, + "learning_rate": 2.987727534439457e-05, + "loss": 1.3402438163757324, + "step": 70 + }, + { + "epoch": 0.19672131147540983, + "grad_norm": 4.621718406677246, + "learning_rate": 2.983978103996877e-05, + "loss": 1.189091444015503, + "step": 72 + }, + { + "epoch": 0.20218579234972678, + "grad_norm": 0.3123322129249573, + "learning_rate": 2.9797329650970525e-05, + "loss": 1.1878712177276611, + "step": 74 + }, + { + "epoch": 0.20765027322404372, + "grad_norm": 0.34585925936698914, + "learning_rate": 2.974993695240579e-05, + "loss": 1.38385808467865, + "step": 76 + }, + { + "epoch": 0.21311475409836064, + "grad_norm": 2.415900468826294, + "learning_rate": 2.9697620555479297e-05, + "loss": 0.7725762128829956, + "step": 78 + }, + { + "epoch": 0.2185792349726776, + "grad_norm": 0.48625677824020386, + "learning_rate": 2.9640399901050182e-05, + "loss": 0.8228914737701416, + "step": 80 + }, + { + "epoch": 0.22404371584699453, + "grad_norm": 0.5314300060272217, + "learning_rate": 2.9578296252407734e-05, + "loss": 1.5916510820388794, + "step": 82 + }, + { + "epoch": 0.22950819672131148, + "grad_norm": 0.7009768486022949, + "learning_rate": 2.9511332687369917e-05, + "loss": 0.8212544918060303, + "step": 84 + }, + { + "epoch": 0.23497267759562843, + "grad_norm": 0.2534927427768707, + "learning_rate": 2.9439534089707624e-05, + "loss": 1.3987239599227905, + "step": 86 + }, + { + "epoch": 0.24043715846994534, + "grad_norm": 0.3312060534954071, + "learning_rate": 2.9362927139897832e-05, + "loss": 1.3285937309265137, + "step": 88 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 0.9822537899017334, + "learning_rate": 2.9281540305209068e-05, + "loss": 0.9706141352653503, + "step": 90 + }, + { + "epoch": 0.25136612021857924, + "grad_norm": 0.6968846321105957, + "learning_rate": 2.919540382912294e-05, + "loss": 1.4107171297073364, + "step": 92 + }, + { + "epoch": 0.2568306010928962, + "grad_norm": 0.4691411256790161, + "learning_rate": 2.9104549720095634e-05, + "loss": 1.2292362451553345, + "step": 94 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 0.8305396437644958, + "learning_rate": 2.9009011739663467e-05, + "loss": 1.6303619146347046, + "step": 96 + }, + { + "epoch": 0.2677595628415301, + "grad_norm": 0.4047655761241913, + "learning_rate": 2.8908825389897094e-05, + "loss": 1.355845332145691, + "step": 98 + }, + { + "epoch": 0.273224043715847, + "grad_norm": 0.2581457793712616, + "learning_rate": 2.8804027900208843e-05, + "loss": 1.1548889875411987, + "step": 100 + }, + { + "epoch": 0.2786885245901639, + "grad_norm": 0.41052544116973877, + "learning_rate": 2.8694658213518226e-05, + "loss": 1.3467285633087158, + "step": 102 + }, + { + "epoch": 0.28415300546448086, + "grad_norm": 0.6879177689552307, + "learning_rate": 2.8580756971780686e-05, + "loss": 1.0806087255477905, + "step": 104 + }, + { + "epoch": 0.2896174863387978, + "grad_norm": 0.27630922198295593, + "learning_rate": 2.846236650088497e-05, + "loss": 1.3213335275650024, + "step": 106 + }, + { + "epoch": 0.29508196721311475, + "grad_norm": 0.2548673152923584, + "learning_rate": 2.833953079492476e-05, + "loss": 1.4522829055786133, + "step": 108 + }, + { + "epoch": 0.3005464480874317, + "grad_norm": 0.6334999203681946, + "learning_rate": 2.82122954998504e-05, + "loss": 1.3047692775726318, + "step": 110 + }, + { + "epoch": 0.30601092896174864, + "grad_norm": 0.2597161531448364, + "learning_rate": 2.808070789650679e-05, + "loss": 1.3145617246627808, + "step": 112 + }, + { + "epoch": 0.3114754098360656, + "grad_norm": 0.5719535946846008, + "learning_rate": 2.7944816883063727e-05, + "loss": 1.4522080421447754, + "step": 114 + }, + { + "epoch": 0.31693989071038253, + "grad_norm": 0.3497866690158844, + "learning_rate": 2.7804672956845295e-05, + "loss": 1.2907500267028809, + "step": 116 + }, + { + "epoch": 0.3224043715846995, + "grad_norm": 0.5994348526000977, + "learning_rate": 2.766032819556495e-05, + "loss": 1.1124215126037598, + "step": 118 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 3.034389019012451, + "learning_rate": 2.7511836237973366e-05, + "loss": 1.3056573867797852, + "step": 120 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.41870880126953125, + "learning_rate": 2.735925226392618e-05, + "loss": 1.3432774543762207, + "step": 122 + }, + { + "epoch": 0.33879781420765026, + "grad_norm": 0.3683644235134125, + "learning_rate": 2.7202632973879086e-05, + "loss": 1.0805763006210327, + "step": 124 + }, + { + "epoch": 0.3442622950819672, + "grad_norm": 0.34786108136177063, + "learning_rate": 2.7042036567817838e-05, + "loss": 1.1195154190063477, + "step": 126 + }, + { + "epoch": 0.34972677595628415, + "grad_norm": 1.6386656761169434, + "learning_rate": 2.6877522723631036e-05, + "loss": 1.3379027843475342, + "step": 128 + }, + { + "epoch": 0.3551912568306011, + "grad_norm": 0.6847893595695496, + "learning_rate": 2.6709152574933727e-05, + "loss": 1.282078742980957, + "step": 130 + }, + { + "epoch": 0.36065573770491804, + "grad_norm": 0.2398746758699417, + "learning_rate": 2.6536988688350067e-05, + "loss": 1.2325066328048706, + "step": 132 + }, + { + "epoch": 0.366120218579235, + "grad_norm": 1.0792237520217896, + "learning_rate": 2.6361095040263437e-05, + "loss": 1.1317086219787598, + "step": 134 + }, + { + "epoch": 0.37158469945355194, + "grad_norm": 1.1165977716445923, + "learning_rate": 2.618153699304274e-05, + "loss": 1.0417739152908325, + "step": 136 + }, + { + "epoch": 0.3770491803278688, + "grad_norm": 0.18367436528205872, + "learning_rate": 2.599838127075361e-05, + "loss": 1.01582932472229, + "step": 138 + }, + { + "epoch": 0.3825136612021858, + "grad_norm": 0.4686935842037201, + "learning_rate": 2.5811695934363666e-05, + "loss": 0.6732921004295349, + "step": 140 + }, + { + "epoch": 0.3879781420765027, + "grad_norm": 0.2868647277355194, + "learning_rate": 2.5621550356450914e-05, + "loss": 1.253921389579773, + "step": 142 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 0.22526639699935913, + "learning_rate": 2.5428015195424825e-05, + "loss": 1.2781373262405396, + "step": 144 + }, + { + "epoch": 0.3989071038251366, + "grad_norm": 0.2401130348443985, + "learning_rate": 2.5231162369269498e-05, + "loss": 1.240103006362915, + "step": 146 + }, + { + "epoch": 0.40437158469945356, + "grad_norm": 0.34567099809646606, + "learning_rate": 2.503106502881889e-05, + "loss": 1.2351547479629517, + "step": 148 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 0.38193589448928833, + "learning_rate": 2.4827797530573762e-05, + "loss": 1.1751936674118042, + "step": 150 + }, + { + "epoch": 0.41530054644808745, + "grad_norm": 0.2586861848831177, + "learning_rate": 2.4621435409070757e-05, + "loss": 1.2725920677185059, + "step": 152 + }, + { + "epoch": 0.4207650273224044, + "grad_norm": 0.3629944920539856, + "learning_rate": 2.4412055348813602e-05, + "loss": 1.2570769786834717, + "step": 154 + }, + { + "epoch": 0.4262295081967213, + "grad_norm": 0.4387210011482239, + "learning_rate": 2.4199735155777017e-05, + "loss": 1.3026829957962036, + "step": 156 + }, + { + "epoch": 0.43169398907103823, + "grad_norm": 0.7413071393966675, + "learning_rate": 2.3984553728493914e-05, + "loss": 1.008730173110962, + "step": 158 + }, + { + "epoch": 0.4371584699453552, + "grad_norm": 0.2501511573791504, + "learning_rate": 2.3766591028736547e-05, + "loss": 1.2349580526351929, + "step": 160 + }, + { + "epoch": 0.4426229508196721, + "grad_norm": 1.2075088024139404, + "learning_rate": 2.3545928051802588e-05, + "loss": 0.7488419413566589, + "step": 162 + }, + { + "epoch": 0.44808743169398907, + "grad_norm": 0.5644823312759399, + "learning_rate": 2.332264679641717e-05, + "loss": 1.6321135759353638, + "step": 164 + }, + { + "epoch": 0.453551912568306, + "grad_norm": 1.0617799758911133, + "learning_rate": 2.3096830234261996e-05, + "loss": 0.7753019332885742, + "step": 166 + }, + { + "epoch": 0.45901639344262296, + "grad_norm": 0.28146958351135254, + "learning_rate": 2.2868562279142912e-05, + "loss": 1.1729671955108643, + "step": 168 + }, + { + "epoch": 0.4644808743169399, + "grad_norm": 0.29739731550216675, + "learning_rate": 2.2637927755807458e-05, + "loss": 1.268522024154663, + "step": 170 + }, + { + "epoch": 0.46994535519125685, + "grad_norm": 1.04466712474823, + "learning_rate": 2.2405012368423786e-05, + "loss": 1.330442190170288, + "step": 172 + }, + { + "epoch": 0.47540983606557374, + "grad_norm": 0.46707776188850403, + "learning_rate": 2.2169902668732893e-05, + "loss": 1.4469293355941772, + "step": 174 + }, + { + "epoch": 0.4808743169398907, + "grad_norm": 0.4159034192562103, + "learning_rate": 2.193268602388583e-05, + "loss": 1.2424243688583374, + "step": 176 + }, + { + "epoch": 0.48633879781420764, + "grad_norm": 0.5740254521369934, + "learning_rate": 2.1693450583977953e-05, + "loss": 1.2044943571090698, + "step": 178 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 0.35157784819602966, + "learning_rate": 2.1452285249292147e-05, + "loss": 1.2033029794692993, + "step": 180 + }, + { + "epoch": 0.4972677595628415, + "grad_norm": 0.5813264846801758, + "learning_rate": 2.12092796372634e-05, + "loss": 1.2133545875549316, + "step": 182 + }, + { + "epoch": 0.5027322404371585, + "grad_norm": 1.1736332178115845, + "learning_rate": 2.096452404917679e-05, + "loss": 1.3634682893753052, + "step": 184 + }, + { + "epoch": 0.5081967213114754, + "grad_norm": 0.48712271451950073, + "learning_rate": 2.0718109436611348e-05, + "loss": 1.239789605140686, + "step": 186 + }, + { + "epoch": 0.5136612021857924, + "grad_norm": 0.7002618908882141, + "learning_rate": 2.0470127367642345e-05, + "loss": 1.2088130712509155, + "step": 188 + }, + { + "epoch": 0.5191256830601093, + "grad_norm": 0.3209463059902191, + "learning_rate": 2.022066999281444e-05, + "loss": 1.2543022632598877, + "step": 190 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 0.5514739155769348, + "learning_rate": 1.9969830010898358e-05, + "loss": 1.2285394668579102, + "step": 192 + }, + { + "epoch": 0.5300546448087432, + "grad_norm": 0.41652482748031616, + "learning_rate": 1.9717700634443903e-05, + "loss": 1.2374595403671265, + "step": 194 + }, + { + "epoch": 0.5355191256830601, + "grad_norm": 0.3611076772212982, + "learning_rate": 1.9464375555142e-05, + "loss": 1.288169503211975, + "step": 196 + }, + { + "epoch": 0.5409836065573771, + "grad_norm": 0.3940355181694031, + "learning_rate": 1.9209948909008734e-05, + "loss": 1.1237924098968506, + "step": 198 + }, + { + "epoch": 0.546448087431694, + "grad_norm": 0.3027671277523041, + "learning_rate": 1.8954515241404218e-05, + "loss": 1.2617192268371582, + "step": 200 + }, + { + "epoch": 0.5519125683060109, + "grad_norm": 0.3746492266654968, + "learning_rate": 1.8698169471899414e-05, + "loss": 1.1039533615112305, + "step": 202 + }, + { + "epoch": 0.5573770491803278, + "grad_norm": 0.8580636382102966, + "learning_rate": 1.8441006859003842e-05, + "loss": 0.9195830821990967, + "step": 204 + }, + { + "epoch": 0.5628415300546448, + "grad_norm": 0.33917945623397827, + "learning_rate": 1.818312296476737e-05, + "loss": 1.270058274269104, + "step": 206 + }, + { + "epoch": 0.5683060109289617, + "grad_norm": 0.3017415404319763, + "learning_rate": 1.792461361926921e-05, + "loss": 1.0368424654006958, + "step": 208 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 0.3447563052177429, + "learning_rate": 1.766557488500727e-05, + "loss": 1.216840386390686, + "step": 210 + }, + { + "epoch": 0.5792349726775956, + "grad_norm": 0.3713569641113281, + "learning_rate": 1.7406103021201212e-05, + "loss": 1.5115100145339966, + "step": 212 + }, + { + "epoch": 0.5846994535519126, + "grad_norm": 0.19028295576572418, + "learning_rate": 1.7146294448022335e-05, + "loss": 1.2231677770614624, + "step": 214 + }, + { + "epoch": 0.5901639344262295, + "grad_norm": 0.9744280576705933, + "learning_rate": 1.688624571076371e-05, + "loss": 1.5730527639389038, + "step": 216 + }, + { + "epoch": 0.5956284153005464, + "grad_norm": 0.2510036528110504, + "learning_rate": 1.6626053443963762e-05, + "loss": 1.2018402814865112, + "step": 218 + }, + { + "epoch": 0.6010928961748634, + "grad_norm": 0.26169928908348083, + "learning_rate": 1.636581433549674e-05, + "loss": 1.2398658990859985, + "step": 220 + }, + { + "epoch": 0.6065573770491803, + "grad_norm": 0.34778648614883423, + "learning_rate": 1.610562509064332e-05, + "loss": 1.107602596282959, + "step": 222 + }, + { + "epoch": 0.6120218579234973, + "grad_norm": 0.5183984637260437, + "learning_rate": 1.5845582396154786e-05, + "loss": 1.040785551071167, + "step": 224 + }, + { + "epoch": 0.6174863387978142, + "grad_norm": 0.21732346713542938, + "learning_rate": 1.5585782884324064e-05, + "loss": 1.2164437770843506, + "step": 226 + }, + { + "epoch": 0.6229508196721312, + "grad_norm": 0.8948683142662048, + "learning_rate": 1.5326323097077015e-05, + "loss": 1.229734182357788, + "step": 228 + }, + { + "epoch": 0.6284153005464481, + "grad_norm": 0.2247624546289444, + "learning_rate": 1.5067299450097261e-05, + "loss": 0.7190737128257751, + "step": 230 + }, + { + "epoch": 0.6338797814207651, + "grad_norm": 0.305253267288208, + "learning_rate": 1.4808808196998006e-05, + "loss": 1.22504460811615, + "step": 232 + }, + { + "epoch": 0.639344262295082, + "grad_norm": 1.4114190340042114, + "learning_rate": 1.4550945393554004e-05, + "loss": 0.8749437928199768, + "step": 234 + }, + { + "epoch": 0.644808743169399, + "grad_norm": 1.5399497747421265, + "learning_rate": 1.4293806862007085e-05, + "loss": 0.9087380766868591, + "step": 236 + }, + { + "epoch": 0.6502732240437158, + "grad_norm": 0.6150895357131958, + "learning_rate": 1.4037488155458448e-05, + "loss": 1.303091287612915, + "step": 238 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 0.5875135064125061, + "learning_rate": 1.3782084522360981e-05, + "loss": 1.1951395273208618, + "step": 240 + }, + { + "epoch": 0.6612021857923497, + "grad_norm": 0.7042756676673889, + "learning_rate": 1.3527690871124762e-05, + "loss": 1.1392256021499634, + "step": 242 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.628507673740387, + "learning_rate": 1.3274401734848958e-05, + "loss": 1.0318241119384766, + "step": 244 + }, + { + "epoch": 0.6721311475409836, + "grad_norm": 0.24752730131149292, + "learning_rate": 1.3022311236193156e-05, + "loss": 1.1875934600830078, + "step": 246 + }, + { + "epoch": 0.6775956284153005, + "grad_norm": 0.5493347644805908, + "learning_rate": 1.2771513052401236e-05, + "loss": 1.2356622219085693, + "step": 248 + }, + { + "epoch": 0.6830601092896175, + "grad_norm": 0.3261685371398926, + "learning_rate": 1.2522100380490744e-05, + "loss": 1.2128081321716309, + "step": 250 + }, + { + "epoch": 0.6885245901639344, + "grad_norm": 0.4376692771911621, + "learning_rate": 1.2274165902620732e-05, + "loss": 1.1218929290771484, + "step": 252 + }, + { + "epoch": 0.6939890710382514, + "grad_norm": 0.335615336894989, + "learning_rate": 1.2027801751650918e-05, + "loss": 1.1654932498931885, + "step": 254 + }, + { + "epoch": 0.6994535519125683, + "grad_norm": 0.4503217041492462, + "learning_rate": 1.1783099476904972e-05, + "loss": 1.1567964553833008, + "step": 256 + }, + { + "epoch": 0.7049180327868853, + "grad_norm": 0.3116782605648041, + "learning_rate": 1.1540150010150599e-05, + "loss": 1.0923233032226562, + "step": 258 + }, + { + "epoch": 0.7103825136612022, + "grad_norm": 0.2738410234451294, + "learning_rate": 1.1299043631809205e-05, + "loss": 1.159534215927124, + "step": 260 + }, + { + "epoch": 0.7158469945355191, + "grad_norm": 0.8347621560096741, + "learning_rate": 1.1059869937407486e-05, + "loss": 1.2569533586502075, + "step": 262 + }, + { + "epoch": 0.7213114754098361, + "grad_norm": 0.2674843370914459, + "learning_rate": 1.082271780428362e-05, + "loss": 1.1951040029525757, + "step": 264 + }, + { + "epoch": 0.726775956284153, + "grad_norm": 0.3185964524745941, + "learning_rate": 1.0587675358560278e-05, + "loss": 1.0627436637878418, + "step": 266 + }, + { + "epoch": 0.73224043715847, + "grad_norm": 0.5771196484565735, + "learning_rate": 1.0354829942396837e-05, + "loss": 1.1797475814819336, + "step": 268 + }, + { + "epoch": 0.7377049180327869, + "grad_norm": 0.2694435119628906, + "learning_rate": 1.012426808153287e-05, + "loss": 1.1862552165985107, + "step": 270 + }, + { + "epoch": 0.7431693989071039, + "grad_norm": 0.3423052132129669, + "learning_rate": 9.896075453135039e-06, + "loss": 1.1492632627487183, + "step": 272 + }, + { + "epoch": 0.7486338797814208, + "grad_norm": 0.38316285610198975, + "learning_rate": 9.67033685395934e-06, + "loss": 1.167414903640747, + "step": 274 + }, + { + "epoch": 0.7540983606557377, + "grad_norm": 0.3846551477909088, + "learning_rate": 9.447136168840466e-06, + "loss": 1.2184109687805176, + "step": 276 + }, + { + "epoch": 0.7595628415300546, + "grad_norm": 0.44415712356567383, + "learning_rate": 9.226556339520069e-06, + "loss": 1.1867570877075195, + "step": 278 + }, + { + "epoch": 0.7650273224043715, + "grad_norm": 0.8969565033912659, + "learning_rate": 9.008679333825478e-06, + "loss": 0.8917623162269592, + "step": 280 + }, + { + "epoch": 0.7704918032786885, + "grad_norm": 0.28591468930244446, + "learning_rate": 8.793586115210326e-06, + "loss": 1.188754677772522, + "step": 282 + }, + { + "epoch": 0.7759562841530054, + "grad_norm": 0.3005872666835785, + "learning_rate": 8.581356612668382e-06, + "loss": 1.173933744430542, + "step": 284 + }, + { + "epoch": 0.7814207650273224, + "grad_norm": 0.2967031002044678, + "learning_rate": 8.372069691031804e-06, + "loss": 1.1837879419326782, + "step": 286 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 0.2758060395717621, + "learning_rate": 8.165803121664869e-06, + "loss": 1.1517661809921265, + "step": 288 + }, + { + "epoch": 0.7923497267759563, + "grad_norm": 0.29872316122055054, + "learning_rate": 7.962633553563965e-06, + "loss": 1.1168982982635498, + "step": 290 + }, + { + "epoch": 0.7978142076502732, + "grad_norm": 0.3943071961402893, + "learning_rate": 7.762636484874723e-06, + "loss": 1.1435798406600952, + "step": 292 + }, + { + "epoch": 0.8032786885245902, + "grad_norm": 0.4497228264808655, + "learning_rate": 7.565886234836767e-06, + "loss": 1.1566179990768433, + "step": 294 + }, + { + "epoch": 0.8087431693989071, + "grad_norm": 0.354144424200058, + "learning_rate": 7.3724559161665876e-06, + "loss": 1.5012860298156738, + "step": 296 + }, + { + "epoch": 0.8142076502732241, + "grad_norm": 1.309401273727417, + "learning_rate": 7.182417407888703e-06, + "loss": 0.9079286456108093, + "step": 298 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 0.3585984706878662, + "learning_rate": 6.995841328625321e-06, + "loss": 1.5736669301986694, + "step": 300 + }, + { + "epoch": 0.825136612021858, + "grad_norm": 0.5723401308059692, + "learning_rate": 6.812797010354325e-06, + "loss": 1.192858099937439, + "step": 302 + }, + { + "epoch": 0.8306010928961749, + "grad_norm": 2.9767942428588867, + "learning_rate": 6.63335247264542e-06, + "loss": 0.7880658507347107, + "step": 304 + }, + { + "epoch": 0.8360655737704918, + "grad_norm": 1.3081823587417603, + "learning_rate": 6.457574397383919e-06, + "loss": 1.5271462202072144, + "step": 306 + }, + { + "epoch": 0.8415300546448088, + "grad_norm": 0.34140118956565857, + "learning_rate": 6.285528103991665e-06, + "loss": 0.8412893414497375, + "step": 308 + }, + { + "epoch": 0.8469945355191257, + "grad_norm": 0.3129923939704895, + "learning_rate": 6.117277525154225e-06, + "loss": 1.20363187789917, + "step": 310 + }, + { + "epoch": 0.8524590163934426, + "grad_norm": 0.383305162191391, + "learning_rate": 5.952885183063397e-06, + "loss": 1.1824288368225098, + "step": 312 + }, + { + "epoch": 0.8579234972677595, + "grad_norm": 0.3238297402858734, + "learning_rate": 5.792412166183841e-06, + "loss": 1.1946678161621094, + "step": 314 + }, + { + "epoch": 0.8633879781420765, + "grad_norm": 2.138742208480835, + "learning_rate": 5.635918106552546e-06, + "loss": 1.201963186264038, + "step": 316 + }, + { + "epoch": 0.8688524590163934, + "grad_norm": 0.3680674433708191, + "learning_rate": 5.483461157619428e-06, + "loss": 1.141302227973938, + "step": 318 + }, + { + "epoch": 0.8743169398907104, + "grad_norm": 0.2648313641548157, + "learning_rate": 5.335097972637441e-06, + "loss": 1.3266459703445435, + "step": 320 + }, + { + "epoch": 0.8797814207650273, + "grad_norm": 0.2844366431236267, + "learning_rate": 5.1908836836101135e-06, + "loss": 1.123349666595459, + "step": 322 + }, + { + "epoch": 0.8852459016393442, + "grad_norm": 0.5806925296783447, + "learning_rate": 5.050871880804414e-06, + "loss": 0.5679168701171875, + "step": 324 + }, + { + "epoch": 0.8907103825136612, + "grad_norm": 0.2976682484149933, + "learning_rate": 4.915114592836521e-06, + "loss": 1.1642491817474365, + "step": 326 + }, + { + "epoch": 0.8961748633879781, + "grad_norm": 1.0322918891906738, + "learning_rate": 4.783662267337909e-06, + "loss": 1.0334808826446533, + "step": 328 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 0.48543456196784973, + "learning_rate": 4.656563752208907e-06, + "loss": 1.186019778251648, + "step": 330 + }, + { + "epoch": 0.907103825136612, + "grad_norm": 0.8842418789863586, + "learning_rate": 4.533866277466767e-06, + "loss": 1.249828815460205, + "step": 332 + }, + { + "epoch": 0.912568306010929, + "grad_norm": 0.45925524830818176, + "learning_rate": 4.415615437694876e-06, + "loss": 1.0486047267913818, + "step": 334 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 0.49386680126190186, + "learning_rate": 4.3018551750997694e-06, + "loss": 1.14175283908844, + "step": 336 + }, + { + "epoch": 0.9234972677595629, + "grad_norm": 0.4923037886619568, + "learning_rate": 4.192627763182111e-06, + "loss": 1.1671501398086548, + "step": 338 + }, + { + "epoch": 0.9289617486338798, + "grad_norm": 0.31538236141204834, + "learning_rate": 4.087973791027797e-06, + "loss": 1.1591607332229614, + "step": 340 + }, + { + "epoch": 0.9344262295081968, + "grad_norm": 0.31360623240470886, + "learning_rate": 3.987932148224993e-06, + "loss": 1.1940336227416992, + "step": 342 + }, + { + "epoch": 0.9398907103825137, + "grad_norm": 1.0204191207885742, + "learning_rate": 3.8925400104126834e-06, + "loss": 1.1359769105911255, + "step": 344 + }, + { + "epoch": 0.9453551912568307, + "grad_norm": 0.6455877423286438, + "learning_rate": 3.8018328254661618e-06, + "loss": 1.5209176540374756, + "step": 346 + }, + { + "epoch": 0.9508196721311475, + "grad_norm": 0.6131592392921448, + "learning_rate": 3.715844300324527e-06, + "loss": 0.8120126724243164, + "step": 348 + }, + { + "epoch": 0.9562841530054644, + "grad_norm": 0.3298521339893341, + "learning_rate": 3.6346063884651327e-06, + "loss": 0.8144136071205139, + "step": 350 + }, + { + "epoch": 0.9617486338797814, + "grad_norm": 0.35196903347969055, + "learning_rate": 3.558149278029624e-06, + "loss": 1.1486433744430542, + "step": 352 + }, + { + "epoch": 0.9672131147540983, + "grad_norm": 1.0103577375411987, + "learning_rate": 3.4865013806059817e-06, + "loss": 0.8401246666908264, + "step": 354 + }, + { + "epoch": 0.9726775956284153, + "grad_norm": 0.42854589223861694, + "learning_rate": 3.419689320670712e-06, + "loss": 1.042449951171875, + "step": 356 + }, + { + "epoch": 0.9781420765027322, + "grad_norm": 0.37466299533843994, + "learning_rate": 3.35773792569517e-06, + "loss": 0.7528899312019348, + "step": 358 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 0.2885702848434448, + "learning_rate": 3.300670216919602e-06, + "loss": 1.155380129814148, + "step": 360 + }, + { + "epoch": 0.9890710382513661, + "grad_norm": 0.33391743898391724, + "learning_rate": 3.2485074007984468e-06, + "loss": 0.9418554306030273, + "step": 362 + }, + { + "epoch": 0.994535519125683, + "grad_norm": 0.2690609097480774, + "learning_rate": 3.2012688611199566e-06, + "loss": 1.1393744945526123, + "step": 364 + }, + { + "epoch": 1.0, + "grad_norm": 0.4000090956687927, + "learning_rate": 3.158972151803165e-06, + "loss": 1.1187925338745117, + "step": 366 + }, + { + "epoch": 1.005464480874317, + "grad_norm": 0.40801742672920227, + "learning_rate": 3.1216329903748095e-06, + "loss": 0.981923520565033, + "step": 368 + }, + { + "epoch": 1.010928961748634, + "grad_norm": 0.39118003845214844, + "learning_rate": 3.089265252128686e-06, + "loss": 1.0384995937347412, + "step": 370 + }, + { + "epoch": 1.0163934426229508, + "grad_norm": 0.3082128167152405, + "learning_rate": 3.061880964969555e-06, + "loss": 0.9732365608215332, + "step": 372 + }, + { + "epoch": 1.0218579234972678, + "grad_norm": 0.41411662101745605, + "learning_rate": 3.039490304943562e-06, + "loss": 1.0013678073883057, + "step": 374 + }, + { + "epoch": 1.0273224043715847, + "grad_norm": 0.6949254870414734, + "learning_rate": 3.022101592456795e-06, + "loss": 0.848197877407074, + "step": 376 + }, + { + "epoch": 1.0327868852459017, + "grad_norm": 0.4030425250530243, + "learning_rate": 3.0097212891834095e-06, + "loss": 0.886638879776001, + "step": 378 + }, + { + "epoch": 1.0382513661202186, + "grad_norm": 0.37187013030052185, + "learning_rate": 3.0023539956644634e-06, + "loss": 0.60066819190979, + "step": 380 + }, + { + "epoch": 1.0437158469945356, + "grad_norm": 0.7314932942390442, + "learning_rate": 3.0000024495983428e-06, + "loss": 0.9156309962272644, + "step": 382 + }, + { + "epoch": 1.0491803278688525, + "grad_norm": 0.4109429121017456, + "learning_rate": 3.002667524823434e-06, + "loss": 0.9761273860931396, + "step": 384 + }, + { + "epoch": 1.0546448087431695, + "grad_norm": 0.7147516012191772, + "learning_rate": 3.010348230993402e-06, + "loss": 0.6097055077552795, + "step": 386 + }, + { + "epoch": 1.0601092896174864, + "grad_norm": 0.5150865316390991, + "learning_rate": 3.0230417139451987e-06, + "loss": 0.5354875326156616, + "step": 388 + }, + { + "epoch": 1.0655737704918034, + "grad_norm": 0.5118747353553772, + "learning_rate": 3.0407432567596883e-06, + "loss": 1.1215546131134033, + "step": 390 + }, + { + "epoch": 1.0710382513661203, + "grad_norm": 0.3243095874786377, + "learning_rate": 3.0634462815144474e-06, + "loss": 0.8853414058685303, + "step": 392 + }, + { + "epoch": 1.0765027322404372, + "grad_norm": 1.0932362079620361, + "learning_rate": 3.0911423517281404e-06, + "loss": 0.5655322670936584, + "step": 394 + }, + { + "epoch": 1.0819672131147542, + "grad_norm": 0.5749414563179016, + "learning_rate": 3.1238211754955294e-06, + "loss": 0.8961263298988342, + "step": 396 + }, + { + "epoch": 1.0874316939890711, + "grad_norm": 0.4210751950740814, + "learning_rate": 3.161470609311961e-06, + "loss": 0.9246914386749268, + "step": 398 + }, + { + "epoch": 1.092896174863388, + "grad_norm": 0.4931108057498932, + "learning_rate": 3.2040766625859115e-06, + "loss": 1.0521095991134644, + "step": 400 + }, + { + "epoch": 1.098360655737705, + "grad_norm": 3.348501205444336, + "learning_rate": 3.2516235028379157e-06, + "loss": 0.9077386260032654, + "step": 402 + }, + { + "epoch": 1.1038251366120218, + "grad_norm": 0.3238441050052643, + "learning_rate": 3.304093461583944e-06, + "loss": 0.970809817314148, + "step": 404 + }, + { + "epoch": 1.1092896174863387, + "grad_norm": 1.0212552547454834, + "learning_rate": 3.3614670409010353e-06, + "loss": 1.094781756401062, + "step": 406 + }, + { + "epoch": 1.1147540983606556, + "grad_norm": 0.5059043765068054, + "learning_rate": 3.4237229206727602e-06, + "loss": 0.643450140953064, + "step": 408 + }, + { + "epoch": 1.1202185792349726, + "grad_norm": 0.34191587567329407, + "learning_rate": 3.490837966511817e-06, + "loss": 1.1581542491912842, + "step": 410 + }, + { + "epoch": 1.1256830601092895, + "grad_norm": 1.0587300062179565, + "learning_rate": 3.5627872383567937e-06, + "loss": 0.7080036997795105, + "step": 412 + }, + { + "epoch": 1.1311475409836065, + "grad_norm": 0.2536488175392151, + "learning_rate": 3.6395439997399494e-06, + "loss": 0.6488229632377625, + "step": 414 + }, + { + "epoch": 1.1366120218579234, + "grad_norm": 0.8057353496551514, + "learning_rate": 3.721079727722522e-06, + "loss": 0.6402009129524231, + "step": 416 + }, + { + "epoch": 1.1420765027322404, + "grad_norm": 0.29355666041374207, + "learning_rate": 3.8073641234939055e-06, + "loss": 0.9499251842498779, + "step": 418 + }, + { + "epoch": 1.1475409836065573, + "grad_norm": 0.3825172185897827, + "learning_rate": 3.898365123630732e-06, + "loss": 0.8552459478378296, + "step": 420 + }, + { + "epoch": 1.1530054644808743, + "grad_norm": 0.3960343897342682, + "learning_rate": 3.994048912011692e-06, + "loss": 0.9106036424636841, + "step": 422 + }, + { + "epoch": 1.1584699453551912, + "grad_norm": 1.2985180616378784, + "learning_rate": 4.094379932383666e-06, + "loss": 0.7228477597236633, + "step": 424 + }, + { + "epoch": 1.1639344262295082, + "grad_norm": 0.29699158668518066, + "learning_rate": 4.199320901574489e-06, + "loss": 0.6496458053588867, + "step": 426 + }, + { + "epoch": 1.169398907103825, + "grad_norm": 0.3441046476364136, + "learning_rate": 4.3088328233474185e-06, + "loss": 1.1220229864120483, + "step": 428 + }, + { + "epoch": 1.174863387978142, + "grad_norm": 0.38102617859840393, + "learning_rate": 4.422875002892234e-06, + "loss": 0.9543809294700623, + "step": 430 + }, + { + "epoch": 1.180327868852459, + "grad_norm": 1.1369144916534424, + "learning_rate": 4.54140506194747e-06, + "loss": 0.8527680039405823, + "step": 432 + }, + { + "epoch": 1.185792349726776, + "grad_norm": 3.073438882827759, + "learning_rate": 4.664378954548241e-06, + "loss": 0.7841230630874634, + "step": 434 + }, + { + "epoch": 1.1912568306010929, + "grad_norm": 0.28754952549934387, + "learning_rate": 4.791750983393832e-06, + "loss": 0.9805347919464111, + "step": 436 + }, + { + "epoch": 1.1967213114754098, + "grad_norm": 0.23602163791656494, + "learning_rate": 4.9234738168288466e-06, + "loss": 0.8651551008224487, + "step": 438 + }, + { + "epoch": 1.2021857923497268, + "grad_norm": 0.3787078857421875, + "learning_rate": 5.059498506431758e-06, + "loss": 0.9147478342056274, + "step": 440 + }, + { + "epoch": 1.2076502732240437, + "grad_norm": 0.513213038444519, + "learning_rate": 5.199774505204206e-06, + "loss": 0.9759296774864197, + "step": 442 + }, + { + "epoch": 1.2131147540983607, + "grad_norm": 0.38409918546676636, + "learning_rate": 5.344249686354357e-06, + "loss": 0.9177553653717041, + "step": 444 + }, + { + "epoch": 1.2185792349726776, + "grad_norm": 0.296101450920105, + "learning_rate": 5.492870362667299e-06, + "loss": 0.5355038046836853, + "step": 446 + }, + { + "epoch": 1.2240437158469946, + "grad_norm": 0.6465612053871155, + "learning_rate": 5.645581306455302e-06, + "loss": 0.8347874879837036, + "step": 448 + }, + { + "epoch": 1.2295081967213115, + "grad_norm": 0.378653347492218, + "learning_rate": 5.802325770080506e-06, + "loss": 0.9033167362213135, + "step": 450 + }, + { + "epoch": 1.2349726775956285, + "grad_norm": 0.48362499475479126, + "learning_rate": 5.96304550704246e-06, + "loss": 0.971287727355957, + "step": 452 + }, + { + "epoch": 1.2404371584699454, + "grad_norm": 0.3260844647884369, + "learning_rate": 6.127680793622588e-06, + "loss": 0.27598556876182556, + "step": 454 + }, + { + "epoch": 1.2459016393442623, + "grad_norm": 4.79307746887207, + "learning_rate": 6.296170451077657e-06, + "loss": 0.7720631957054138, + "step": 456 + }, + { + "epoch": 1.2513661202185793, + "grad_norm": 0.5071384906768799, + "learning_rate": 6.468451868373856e-06, + "loss": 0.6973821520805359, + "step": 458 + }, + { + "epoch": 1.2568306010928962, + "grad_norm": 0.4039640426635742, + "learning_rate": 6.6444610254532e-06, + "loss": 0.9609543085098267, + "step": 460 + }, + { + "epoch": 1.2622950819672132, + "grad_norm": 0.5991283059120178, + "learning_rate": 6.824132517023449e-06, + "loss": 0.6427564024925232, + "step": 462 + }, + { + "epoch": 1.2677595628415301, + "grad_norm": 1.2359015941619873, + "learning_rate": 7.007399576862872e-06, + "loss": 1.003535509109497, + "step": 464 + }, + { + "epoch": 1.273224043715847, + "grad_norm": 0.6143922209739685, + "learning_rate": 7.1941941026306275e-06, + "loss": 0.9678149819374084, + "step": 466 + }, + { + "epoch": 1.278688524590164, + "grad_norm": 0.19856023788452148, + "learning_rate": 7.3844466811737555e-06, + "loss": 0.8354513645172119, + "step": 468 + }, + { + "epoch": 1.2841530054644807, + "grad_norm": 0.4483174979686737, + "learning_rate": 7.578086614321175e-06, + "loss": 1.0250217914581299, + "step": 470 + }, + { + "epoch": 1.289617486338798, + "grad_norm": 0.2900082468986511, + "learning_rate": 7.775041945155295e-06, + "loss": 1.206943154335022, + "step": 472 + }, + { + "epoch": 1.2950819672131146, + "grad_norm": 0.25206395983695984, + "learning_rate": 7.975239484751258e-06, + "loss": 0.8465134501457214, + "step": 474 + }, + { + "epoch": 1.3005464480874318, + "grad_norm": 0.44423073530197144, + "learning_rate": 8.178604839374125e-06, + "loss": 0.4682804346084595, + "step": 476 + }, + { + "epoch": 1.3060109289617485, + "grad_norm": 0.40745142102241516, + "learning_rate": 8.385062438123673e-06, + "loss": 1.0709359645843506, + "step": 478 + }, + { + "epoch": 1.3114754098360657, + "grad_norm": 0.5735993385314941, + "learning_rate": 8.594535561016661e-06, + "loss": 1.0265499353408813, + "step": 480 + }, + { + "epoch": 1.3169398907103824, + "grad_norm": 0.25811538100242615, + "learning_rate": 8.806946367496155e-06, + "loss": 0.9305548667907715, + "step": 482 + }, + { + "epoch": 1.3224043715846996, + "grad_norm": 0.5877478122711182, + "learning_rate": 9.02221592535712e-06, + "loss": 0.6250959634780884, + "step": 484 + }, + { + "epoch": 1.3278688524590163, + "grad_norm": 0.31082847714424133, + "learning_rate": 9.240264240077859e-06, + "loss": 0.8635732531547546, + "step": 486 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.4074012041091919, + "learning_rate": 9.461010284546016e-06, + "loss": 1.0225374698638916, + "step": 488 + }, + { + "epoch": 1.3387978142076502, + "grad_norm": 0.7163453102111816, + "learning_rate": 9.684372029168438e-06, + "loss": 0.6194128394126892, + "step": 490 + }, + { + "epoch": 1.3442622950819672, + "grad_norm": 0.3905765414237976, + "learning_rate": 9.91026647235348e-06, + "loss": 1.0774719715118408, + "step": 492 + }, + { + "epoch": 1.349726775956284, + "grad_norm": 0.6559311151504517, + "learning_rate": 1.0138609671354586e-05, + "loss": 0.877537727355957, + "step": 494 + }, + { + "epoch": 1.355191256830601, + "grad_norm": 0.4489327371120453, + "learning_rate": 1.0369316773463458e-05, + "loss": 0.9511969685554504, + "step": 496 + }, + { + "epoch": 1.360655737704918, + "grad_norm": 0.5031542778015137, + "learning_rate": 1.0602302047541566e-05, + "loss": 0.83577960729599, + "step": 498 + }, + { + "epoch": 1.366120218579235, + "grad_norm": 0.40769895911216736, + "learning_rate": 1.083747891587788e-05, + "loss": 1.0597424507141113, + "step": 500 + }, + { + "epoch": 1.3715846994535519, + "grad_norm": 0.4040791690349579, + "learning_rate": 1.1074759986361392e-05, + "loss": 0.9861813187599182, + "step": 502 + }, + { + "epoch": 1.3770491803278688, + "grad_norm": 0.4598991870880127, + "learning_rate": 1.1314057084956073e-05, + "loss": 0.5206344723701477, + "step": 504 + }, + { + "epoch": 1.3825136612021858, + "grad_norm": 0.2866649627685547, + "learning_rate": 1.1555281288466553e-05, + "loss": 0.7465002536773682, + "step": 506 + }, + { + "epoch": 1.3879781420765027, + "grad_norm": 0.230438232421875, + "learning_rate": 1.1798342957582084e-05, + "loss": 0.9311132431030273, + "step": 508 + }, + { + "epoch": 1.3934426229508197, + "grad_norm": 0.28892356157302856, + "learning_rate": 1.2043151770186725e-05, + "loss": 0.964007556438446, + "step": 510 + }, + { + "epoch": 1.3989071038251366, + "grad_norm": 0.2942189872264862, + "learning_rate": 1.2289616754923078e-05, + "loss": 1.0333213806152344, + "step": 512 + }, + { + "epoch": 1.4043715846994536, + "grad_norm": 0.29071956872940063, + "learning_rate": 1.253764632499752e-05, + "loss": 0.6722708344459534, + "step": 514 + }, + { + "epoch": 1.4098360655737705, + "grad_norm": 0.2511497437953949, + "learning_rate": 1.2787148312213901e-05, + "loss": 0.7008714079856873, + "step": 516 + }, + { + "epoch": 1.4153005464480874, + "grad_norm": 0.30298787355422974, + "learning_rate": 1.3038030001223439e-05, + "loss": 0.9937577843666077, + "step": 518 + }, + { + "epoch": 1.4207650273224044, + "grad_norm": 0.8531264066696167, + "learning_rate": 1.3290198163977933e-05, + "loss": 0.8048759698867798, + "step": 520 + }, + { + "epoch": 1.4262295081967213, + "grad_norm": 0.24621592462062836, + "learning_rate": 1.3543559094373372e-05, + "loss": 1.039415955543518, + "step": 522 + }, + { + "epoch": 1.4316939890710383, + "grad_norm": 0.6622775793075562, + "learning_rate": 1.3798018643071386e-05, + "loss": 0.996492862701416, + "step": 524 + }, + { + "epoch": 1.4371584699453552, + "grad_norm": 0.8804835677146912, + "learning_rate": 1.4053482252485178e-05, + "loss": 0.9746305346488953, + "step": 526 + }, + { + "epoch": 1.4426229508196722, + "grad_norm": 0.5442449450492859, + "learning_rate": 1.4309854991917388e-05, + "loss": 0.7142194509506226, + "step": 528 + }, + { + "epoch": 1.4480874316939891, + "grad_norm": 0.4819408059120178, + "learning_rate": 1.4567041592836413e-05, + "loss": 0.7606571316719055, + "step": 530 + }, + { + "epoch": 1.453551912568306, + "grad_norm": 0.6655029654502869, + "learning_rate": 1.48249464842784e-05, + "loss": 0.7583669424057007, + "step": 532 + }, + { + "epoch": 1.459016393442623, + "grad_norm": 0.30279040336608887, + "learning_rate": 1.508347382836153e-05, + "loss": 1.036142349243164, + "step": 534 + }, + { + "epoch": 1.46448087431694, + "grad_norm": 0.6906257271766663, + "learning_rate": 1.534252755589961e-05, + "loss": 0.963323712348938, + "step": 536 + }, + { + "epoch": 1.469945355191257, + "grad_norm": 0.3248192369937897, + "learning_rate": 1.5602011402101432e-05, + "loss": 1.0202207565307617, + "step": 538 + }, + { + "epoch": 1.4754098360655736, + "grad_norm": 0.5597788691520691, + "learning_rate": 1.5861828942343037e-05, + "loss": 1.0704936981201172, + "step": 540 + }, + { + "epoch": 1.4808743169398908, + "grad_norm": 0.3918614089488983, + "learning_rate": 1.612188362799917e-05, + "loss": 0.9736133217811584, + "step": 542 + }, + { + "epoch": 1.4863387978142075, + "grad_norm": 0.2836878299713135, + "learning_rate": 1.6382078822320964e-05, + "loss": 0.8202962875366211, + "step": 544 + }, + { + "epoch": 1.4918032786885247, + "grad_norm": 1.0198228359222412, + "learning_rate": 1.6642317836346324e-05, + "loss": 1.0485470294952393, + "step": 546 + }, + { + "epoch": 1.4972677595628414, + "grad_norm": 0.48356863856315613, + "learning_rate": 1.6902503964829644e-05, + "loss": 0.4609794318675995, + "step": 548 + }, + { + "epoch": 1.5027322404371586, + "grad_norm": 0.30593419075012207, + "learning_rate": 1.7162540522177685e-05, + "loss": 0.9458433985710144, + "step": 550 + }, + { + "epoch": 1.5081967213114753, + "grad_norm": 0.22826838493347168, + "learning_rate": 1.7422330878378113e-05, + "loss": 0.9808536171913147, + "step": 552 + }, + { + "epoch": 1.5136612021857925, + "grad_norm": 0.24512672424316406, + "learning_rate": 1.7681778494907298e-05, + "loss": 1.0151616334915161, + "step": 554 + }, + { + "epoch": 1.5191256830601092, + "grad_norm": 0.26424238085746765, + "learning_rate": 1.794078696060429e-05, + "loss": 1.1077044010162354, + "step": 556 + }, + { + "epoch": 1.5245901639344264, + "grad_norm": 0.2831886410713196, + "learning_rate": 1.819926002749727e-05, + "loss": 0.777061939239502, + "step": 558 + }, + { + "epoch": 1.530054644808743, + "grad_norm": 0.46841201186180115, + "learning_rate": 1.84571016465695e-05, + "loss": 0.5536693334579468, + "step": 560 + }, + { + "epoch": 1.5355191256830603, + "grad_norm": 0.40938982367515564, + "learning_rate": 1.8714216003451295e-05, + "loss": 0.966119110584259, + "step": 562 + }, + { + "epoch": 1.540983606557377, + "grad_norm": 0.24322885274887085, + "learning_rate": 1.8970507554024827e-05, + "loss": 0.8420946002006531, + "step": 564 + }, + { + "epoch": 1.5464480874316942, + "grad_norm": 0.31624189019203186, + "learning_rate": 1.922588105992838e-05, + "loss": 0.9916543364524841, + "step": 566 + }, + { + "epoch": 1.5519125683060109, + "grad_norm": 0.8164126873016357, + "learning_rate": 1.9480241623947206e-05, + "loss": 0.8523349761962891, + "step": 568 + }, + { + "epoch": 1.5573770491803278, + "grad_norm": 0.3358438014984131, + "learning_rate": 1.9733494725277413e-05, + "loss": 1.1619439125061035, + "step": 570 + }, + { + "epoch": 1.5628415300546448, + "grad_norm": 0.2056051790714264, + "learning_rate": 1.998554625465005e-05, + "loss": 0.9285228848457336, + "step": 572 + }, + { + "epoch": 1.5683060109289617, + "grad_norm": 0.28299570083618164, + "learning_rate": 2.0236302549302293e-05, + "loss": 1.0506218671798706, + "step": 574 + }, + { + "epoch": 1.5737704918032787, + "grad_norm": 0.7570331692695618, + "learning_rate": 2.0485670427782644e-05, + "loss": 0.8511103391647339, + "step": 576 + }, + { + "epoch": 1.5792349726775956, + "grad_norm": 0.34417569637298584, + "learning_rate": 2.073355722457739e-05, + "loss": 1.080989956855774, + "step": 578 + }, + { + "epoch": 1.5846994535519126, + "grad_norm": 0.7556970715522766, + "learning_rate": 2.0979870824545165e-05, + "loss": 0.5906158089637756, + "step": 580 + }, + { + "epoch": 1.5901639344262295, + "grad_norm": 0.2745168209075928, + "learning_rate": 2.1224519697147145e-05, + "loss": 0.7238953113555908, + "step": 582 + }, + { + "epoch": 1.5956284153005464, + "grad_norm": 0.4340955913066864, + "learning_rate": 2.1467412930459936e-05, + "loss": 0.6840563416481018, + "step": 584 + }, + { + "epoch": 1.6010928961748634, + "grad_norm": 0.2478933185338974, + "learning_rate": 2.1708460264958595e-05, + "loss": 1.0408371686935425, + "step": 586 + }, + { + "epoch": 1.6065573770491803, + "grad_norm": 0.2583276033401489, + "learning_rate": 2.194757212705718e-05, + "loss": 1.0583165884017944, + "step": 588 + }, + { + "epoch": 1.6120218579234973, + "grad_norm": 0.31770971417427063, + "learning_rate": 2.2184659662394522e-05, + "loss": 0.8676682710647583, + "step": 590 + }, + { + "epoch": 1.6174863387978142, + "grad_norm": 0.1893642693758011, + "learning_rate": 2.24196347688526e-05, + "loss": 1.0388176441192627, + "step": 592 + }, + { + "epoch": 1.6229508196721312, + "grad_norm": 1.554314374923706, + "learning_rate": 2.265241012929541e-05, + "loss": 0.8767275214195251, + "step": 594 + }, + { + "epoch": 1.6284153005464481, + "grad_norm": 0.3525921404361725, + "learning_rate": 2.28828992440162e-05, + "loss": 0.40555280447006226, + "step": 596 + }, + { + "epoch": 1.633879781420765, + "grad_norm": 0.24459926784038544, + "learning_rate": 2.3111016462880873e-05, + "loss": 1.2219781875610352, + "step": 598 + }, + { + "epoch": 1.639344262295082, + "grad_norm": 0.24944064021110535, + "learning_rate": 2.333667701715578e-05, + "loss": 1.16285240650177, + "step": 600 + }, + { + "epoch": 1.644808743169399, + "grad_norm": 0.21063530445098877, + "learning_rate": 2.3559797051007815e-05, + "loss": 1.0140953063964844, + "step": 602 + }, + { + "epoch": 1.650273224043716, + "grad_norm": 0.23877571523189545, + "learning_rate": 2.3780293652665477e-05, + "loss": 1.088266134262085, + "step": 604 + }, + { + "epoch": 1.6557377049180326, + "grad_norm": 0.33986350893974304, + "learning_rate": 2.399808488522895e-05, + "loss": 1.1801600456237793, + "step": 606 + }, + { + "epoch": 1.6612021857923498, + "grad_norm": 0.26012417674064636, + "learning_rate": 2.4213089817118078e-05, + "loss": 0.8795619606971741, + "step": 608 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.2159816473722458, + "learning_rate": 2.4425228552146573e-05, + "loss": 1.19132661819458, + "step": 610 + }, + { + "epoch": 1.6721311475409837, + "grad_norm": 1.038149118423462, + "learning_rate": 2.4634422259211614e-05, + "loss": 0.7393221259117126, + "step": 612 + }, + { + "epoch": 1.6775956284153004, + "grad_norm": 0.21248742938041687, + "learning_rate": 2.4840593201587626e-05, + "loss": 1.010488748550415, + "step": 614 + }, + { + "epoch": 1.6830601092896176, + "grad_norm": 0.21196357905864716, + "learning_rate": 2.5043664765813377e-05, + "loss": 1.0944840908050537, + "step": 616 + }, + { + "epoch": 1.6885245901639343, + "grad_norm": 0.35981178283691406, + "learning_rate": 2.524356149016163e-05, + "loss": 0.5883587598800659, + "step": 618 + }, + { + "epoch": 1.6939890710382515, + "grad_norm": 0.3057361841201782, + "learning_rate": 2.544020909268085e-05, + "loss": 1.1578078269958496, + "step": 620 + }, + { + "epoch": 1.6994535519125682, + "grad_norm": 0.5280173420906067, + "learning_rate": 2.5633534498798598e-05, + "loss": 1.2335985898971558, + "step": 622 + }, + { + "epoch": 1.7049180327868854, + "grad_norm": 0.1886816918849945, + "learning_rate": 2.5823465868475985e-05, + "loss": 0.8703290820121765, + "step": 624 + }, + { + "epoch": 1.710382513661202, + "grad_norm": 0.16325658559799194, + "learning_rate": 2.60099326229037e-05, + "loss": 0.7182884216308594, + "step": 626 + }, + { + "epoch": 1.7158469945355193, + "grad_norm": 0.17951571941375732, + "learning_rate": 2.619286547072914e-05, + "loss": 1.076783299446106, + "step": 628 + }, + { + "epoch": 1.721311475409836, + "grad_norm": 0.49342671036720276, + "learning_rate": 2.6372196433805214e-05, + "loss": 0.7743050456047058, + "step": 630 + }, + { + "epoch": 1.7267759562841531, + "grad_norm": 0.3581479787826538, + "learning_rate": 2.654785887245112e-05, + "loss": 1.046736478805542, + "step": 632 + }, + { + "epoch": 1.7322404371584699, + "grad_norm": 0.19438432157039642, + "learning_rate": 2.671978751021577e-05, + "loss": 1.142443060874939, + "step": 634 + }, + { + "epoch": 1.737704918032787, + "grad_norm": 0.20565520226955414, + "learning_rate": 2.6887918458134622e-05, + "loss": 1.021065354347229, + "step": 636 + }, + { + "epoch": 1.7431693989071038, + "grad_norm": 0.16628266870975494, + "learning_rate": 2.705218923847093e-05, + "loss": 1.1118640899658203, + "step": 638 + }, + { + "epoch": 1.748633879781421, + "grad_norm": 0.35274869203567505, + "learning_rate": 2.7212538807932576e-05, + "loss": 1.0129691362380981, + "step": 640 + }, + { + "epoch": 1.7540983606557377, + "grad_norm": 0.19920802116394043, + "learning_rate": 2.7368907580355843e-05, + "loss": 1.0399836301803589, + "step": 642 + }, + { + "epoch": 1.7595628415300546, + "grad_norm": 0.2887803018093109, + "learning_rate": 2.7521237448847734e-05, + "loss": 1.0195159912109375, + "step": 644 + }, + { + "epoch": 1.7650273224043715, + "grad_norm": 0.2489539533853531, + "learning_rate": 2.766947180737861e-05, + "loss": 0.6208384037017822, + "step": 646 + }, + { + "epoch": 1.7704918032786885, + "grad_norm": 0.2961595058441162, + "learning_rate": 2.781355557181706e-05, + "loss": 1.0299938917160034, + "step": 648 + }, + { + "epoch": 1.7759562841530054, + "grad_norm": 0.2027096301317215, + "learning_rate": 2.7953435200399262e-05, + "loss": 1.0568764209747314, + "step": 650 + }, + { + "epoch": 1.7814207650273224, + "grad_norm": 0.19219909608364105, + "learning_rate": 2.8089058713625194e-05, + "loss": 1.142011284828186, + "step": 652 + }, + { + "epoch": 1.7868852459016393, + "grad_norm": 0.17235226929187775, + "learning_rate": 2.8220375713574307e-05, + "loss": 1.0625348091125488, + "step": 654 + }, + { + "epoch": 1.7923497267759563, + "grad_norm": 1.209734559059143, + "learning_rate": 2.8347337402633456e-05, + "loss": 1.0341904163360596, + "step": 656 + }, + { + "epoch": 1.7978142076502732, + "grad_norm": 0.4596608579158783, + "learning_rate": 2.846989660163019e-05, + "loss": 1.0296437740325928, + "step": 658 + }, + { + "epoch": 1.8032786885245902, + "grad_norm": 0.36770763993263245, + "learning_rate": 2.858800776736461e-05, + "loss": 1.1404849290847778, + "step": 660 + }, + { + "epoch": 1.8087431693989071, + "grad_norm": 0.2144036591053009, + "learning_rate": 2.87016270095333e-05, + "loss": 1.0944205522537231, + "step": 662 + }, + { + "epoch": 1.814207650273224, + "grad_norm": 0.2577725350856781, + "learning_rate": 2.8810712107039e-05, + "loss": 1.1880568265914917, + "step": 664 + }, + { + "epoch": 1.819672131147541, + "grad_norm": 1.9875679016113281, + "learning_rate": 2.8915222523680082e-05, + "loss": 1.3935034275054932, + "step": 666 + }, + { + "epoch": 1.825136612021858, + "grad_norm": 0.17387168109416962, + "learning_rate": 2.9015119423213857e-05, + "loss": 1.1480551958084106, + "step": 668 + }, + { + "epoch": 1.830601092896175, + "grad_norm": 0.2092132717370987, + "learning_rate": 2.9110365683788173e-05, + "loss": 0.7491433620452881, + "step": 670 + }, + { + "epoch": 1.8360655737704918, + "grad_norm": 0.49205076694488525, + "learning_rate": 2.9200925911735956e-05, + "loss": 1.3666801452636719, + "step": 672 + }, + { + "epoch": 1.8415300546448088, + "grad_norm": 0.21502411365509033, + "learning_rate": 2.9286766454727563e-05, + "loss": 1.0823509693145752, + "step": 674 + }, + { + "epoch": 1.8469945355191257, + "grad_norm": 0.19282038509845734, + "learning_rate": 2.9367855414276073e-05, + "loss": 1.1538946628570557, + "step": 676 + }, + { + "epoch": 1.8524590163934427, + "grad_norm": 0.1708725392818451, + "learning_rate": 2.9444162657590747e-05, + "loss": 0.36654239892959595, + "step": 678 + }, + { + "epoch": 1.8579234972677594, + "grad_norm": 0.18482287228107452, + "learning_rate": 2.951565982877447e-05, + "loss": 1.1071395874023438, + "step": 680 + }, + { + "epoch": 1.8633879781420766, + "grad_norm": 0.1782202124595642, + "learning_rate": 2.9582320359360864e-05, + "loss": 1.1017282009124756, + "step": 682 + }, + { + "epoch": 1.8688524590163933, + "grad_norm": 0.251668244600296, + "learning_rate": 2.9644119478187126e-05, + "loss": 1.1279935836791992, + "step": 684 + }, + { + "epoch": 1.8743169398907105, + "grad_norm": 9.041706085205078, + "learning_rate": 2.9701034220599074e-05, + "loss": 1.0460294485092163, + "step": 686 + }, + { + "epoch": 1.8797814207650272, + "grad_norm": 0.18143460154533386, + "learning_rate": 2.975304343698483e-05, + "loss": 1.1729539632797241, + "step": 688 + }, + { + "epoch": 1.8852459016393444, + "grad_norm": 0.19067686796188354, + "learning_rate": 2.980012780063404e-05, + "loss": 1.1014466285705566, + "step": 690 + }, + { + "epoch": 1.890710382513661, + "grad_norm": 0.23826494812965393, + "learning_rate": 2.9842269814919755e-05, + "loss": 0.8912120461463928, + "step": 692 + }, + { + "epoch": 1.8961748633879782, + "grad_norm": 0.23750869929790497, + "learning_rate": 2.9879453819800156e-05, + "loss": 1.1922415494918823, + "step": 694 + }, + { + "epoch": 1.901639344262295, + "grad_norm": 0.18918921053409576, + "learning_rate": 2.991166599763788e-05, + "loss": 1.1193846464157104, + "step": 696 + }, + { + "epoch": 1.9071038251366121, + "grad_norm": 0.3542817533016205, + "learning_rate": 2.993889437833466e-05, + "loss": 1.0432852506637573, + "step": 698 + }, + { + "epoch": 1.9125683060109289, + "grad_norm": 0.17179521918296814, + "learning_rate": 2.9961128843779457e-05, + "loss": 1.1211903095245361, + "step": 700 + }, + { + "epoch": 1.918032786885246, + "grad_norm": 0.2006503790616989, + "learning_rate": 2.9978361131608348e-05, + "loss": 1.0682426691055298, + "step": 702 + }, + { + "epoch": 1.9234972677595628, + "grad_norm": 0.204038605093956, + "learning_rate": 2.999058483827483e-05, + "loss": 1.1585360765457153, + "step": 704 + }, + { + "epoch": 1.92896174863388, + "grad_norm": 0.28496795892715454, + "learning_rate": 2.9997795421429404e-05, + "loss": 1.1962625980377197, + "step": 706 + }, + { + "epoch": 1.9344262295081966, + "grad_norm": 0.18176399171352386, + "learning_rate": 2.9999990201607516e-05, + "loss": 1.1941583156585693, + "step": 708 + }, + { + "epoch": 1.9398907103825138, + "grad_norm": 0.2823626399040222, + "learning_rate": 2.999716836322524e-05, + "loss": 0.9332343339920044, + "step": 710 + }, + { + "epoch": 1.9453551912568305, + "grad_norm": 0.19465626776218414, + "learning_rate": 2.9989330954882366e-05, + "loss": 0.6781972050666809, + "step": 712 + }, + { + "epoch": 1.9508196721311475, + "grad_norm": 0.45259955525398254, + "learning_rate": 2.9976480888972708e-05, + "loss": 1.159096598625183, + "step": 714 + }, + { + "epoch": 1.9562841530054644, + "grad_norm": 0.23192590475082397, + "learning_rate": 2.9958622940601907e-05, + "loss": 0.8674803376197815, + "step": 716 + }, + { + "epoch": 1.9617486338797814, + "grad_norm": 0.8216494917869568, + "learning_rate": 2.9935763745812935e-05, + "loss": 1.116053581237793, + "step": 718 + }, + { + "epoch": 1.9672131147540983, + "grad_norm": 0.38614794611930847, + "learning_rate": 2.990791179912017e-05, + "loss": 1.1189167499542236, + "step": 720 + }, + { + "epoch": 1.9726775956284153, + "grad_norm": 0.20481540262699127, + "learning_rate": 2.9875077450352817e-05, + "loss": 1.139620065689087, + "step": 722 + }, + { + "epoch": 1.9781420765027322, + "grad_norm": 0.17928683757781982, + "learning_rate": 2.9837272900808863e-05, + "loss": 1.0577901601791382, + "step": 724 + }, + { + "epoch": 1.9836065573770492, + "grad_norm": 0.38278234004974365, + "learning_rate": 2.9794512198721092e-05, + "loss": 1.0994060039520264, + "step": 726 + }, + { + "epoch": 1.989071038251366, + "grad_norm": 0.39787301421165466, + "learning_rate": 2.9746811234036736e-05, + "loss": 1.0890076160430908, + "step": 728 + }, + { + "epoch": 1.994535519125683, + "grad_norm": 0.3581365942955017, + "learning_rate": 2.9694187732512702e-05, + "loss": 1.0212537050247192, + "step": 730 + }, + { + "epoch": 2.0, + "grad_norm": 0.21458348631858826, + "learning_rate": 2.96366612491287e-05, + "loss": 1.1633673906326294, + "step": 732 + }, + { + "epoch": 2.0054644808743167, + "grad_norm": 0.1837082952260971, + "learning_rate": 2.9574253160820573e-05, + "loss": 0.7306780219078064, + "step": 734 + }, + { + "epoch": 2.010928961748634, + "grad_norm": 0.2729966342449188, + "learning_rate": 2.9506986658536562e-05, + "loss": 0.7522571682929993, + "step": 736 + }, + { + "epoch": 2.0163934426229506, + "grad_norm": 0.34518009424209595, + "learning_rate": 2.9434886738619537e-05, + "loss": 0.8162837624549866, + "step": 738 + }, + { + "epoch": 2.021857923497268, + "grad_norm": 0.31829017400741577, + "learning_rate": 2.9357980193518312e-05, + "loss": 0.8290322422981262, + "step": 740 + }, + { + "epoch": 2.0273224043715845, + "grad_norm": 0.9189549684524536, + "learning_rate": 2.927629560183153e-05, + "loss": 0.7381850481033325, + "step": 742 + }, + { + "epoch": 2.0327868852459017, + "grad_norm": 0.4974863529205322, + "learning_rate": 2.91898633176878e-05, + "loss": 0.9779095649719238, + "step": 744 + }, + { + "epoch": 2.0382513661202184, + "grad_norm": 0.22083698213100433, + "learning_rate": 2.909871545946603e-05, + "loss": 0.7365862131118774, + "step": 746 + }, + { + "epoch": 2.0437158469945356, + "grad_norm": 0.2540753483772278, + "learning_rate": 2.9002885897860252e-05, + "loss": 0.9322983026504517, + "step": 748 + }, + { + "epoch": 2.0491803278688523, + "grad_norm": 0.41510656476020813, + "learning_rate": 2.8902410243293152e-05, + "loss": 0.60332190990448, + "step": 750 + }, + { + "epoch": 2.0546448087431695, + "grad_norm": 0.34592926502227783, + "learning_rate": 2.8797325832683208e-05, + "loss": 0.523179829120636, + "step": 752 + }, + { + "epoch": 2.060109289617486, + "grad_norm": 0.8202958703041077, + "learning_rate": 2.868767171557021e-05, + "loss": 0.6999484896659851, + "step": 754 + }, + { + "epoch": 2.0655737704918034, + "grad_norm": 0.5787069201469421, + "learning_rate": 2.8573488639604418e-05, + "loss": 1.0504649877548218, + "step": 756 + }, + { + "epoch": 2.07103825136612, + "grad_norm": 0.36719974875450134, + "learning_rate": 2.845481903540464e-05, + "loss": 1.0751094818115234, + "step": 758 + }, + { + "epoch": 2.0765027322404372, + "grad_norm": 0.242353156208992, + "learning_rate": 2.8331707000790954e-05, + "loss": 0.8690615296363831, + "step": 760 + }, + { + "epoch": 2.081967213114754, + "grad_norm": 0.3211243152618408, + "learning_rate": 2.820419828439788e-05, + "loss": 0.9909636974334717, + "step": 762 + }, + { + "epoch": 2.087431693989071, + "grad_norm": 0.19196948409080505, + "learning_rate": 2.8072340268674133e-05, + "loss": 0.8517364263534546, + "step": 764 + }, + { + "epoch": 2.092896174863388, + "grad_norm": 0.1766076534986496, + "learning_rate": 2.793618195227521e-05, + "loss": 1.0775508880615234, + "step": 766 + }, + { + "epoch": 2.098360655737705, + "grad_norm": 0.5555011630058289, + "learning_rate": 2.779577393185539e-05, + "loss": 0.7964032888412476, + "step": 768 + }, + { + "epoch": 2.1038251366120218, + "grad_norm": 0.24965371191501617, + "learning_rate": 2.765116838326597e-05, + "loss": 0.90714031457901, + "step": 770 + }, + { + "epoch": 2.109289617486339, + "grad_norm": 0.38016757369041443, + "learning_rate": 2.750241904216663e-05, + "loss": 0.9129889607429504, + "step": 772 + }, + { + "epoch": 2.1147540983606556, + "grad_norm": 0.21674519777297974, + "learning_rate": 2.7349581184057144e-05, + "loss": 0.7319467663764954, + "step": 774 + }, + { + "epoch": 2.120218579234973, + "grad_norm": 0.28484922647476196, + "learning_rate": 2.719271160373693e-05, + "loss": 0.9362223744392395, + "step": 776 + }, + { + "epoch": 2.1256830601092895, + "grad_norm": 0.3728918731212616, + "learning_rate": 2.703186859420002e-05, + "loss": 0.7095181941986084, + "step": 778 + }, + { + "epoch": 2.1311475409836067, + "grad_norm": 0.2245863527059555, + "learning_rate": 2.6867111924973283e-05, + "loss": 0.7740026116371155, + "step": 780 + }, + { + "epoch": 2.1366120218579234, + "grad_norm": 0.1715814769268036, + "learning_rate": 2.6698502819905935e-05, + "loss": 0.8628339171409607, + "step": 782 + }, + { + "epoch": 2.1420765027322406, + "grad_norm": 0.1785881519317627, + "learning_rate": 2.652610393441872e-05, + "loss": 1.0129051208496094, + "step": 784 + }, + { + "epoch": 2.1475409836065573, + "grad_norm": 0.16147422790527344, + "learning_rate": 2.6349979332220992e-05, + "loss": 0.6580853462219238, + "step": 786 + }, + { + "epoch": 2.1530054644808745, + "grad_norm": 0.6977578401565552, + "learning_rate": 2.6170194461504586e-05, + "loss": 0.8348934054374695, + "step": 788 + }, + { + "epoch": 2.158469945355191, + "grad_norm": 0.2910391390323639, + "learning_rate": 2.5986816130623133e-05, + "loss": 1.0212098360061646, + "step": 790 + }, + { + "epoch": 2.1639344262295084, + "grad_norm": 0.23879674077033997, + "learning_rate": 2.579991248326594e-05, + "loss": 0.6800191402435303, + "step": 792 + }, + { + "epoch": 2.169398907103825, + "grad_norm": 0.7448290586471558, + "learning_rate": 2.560955297313575e-05, + "loss": 0.8883236646652222, + "step": 794 + }, + { + "epoch": 2.1748633879781423, + "grad_norm": 0.2625773251056671, + "learning_rate": 2.5415808338139595e-05, + "loss": 0.7563661336898804, + "step": 796 + }, + { + "epoch": 2.180327868852459, + "grad_norm": 0.21875064074993134, + "learning_rate": 2.5218750574102465e-05, + "loss": 0.8323014974594116, + "step": 798 + }, + { + "epoch": 2.185792349726776, + "grad_norm": 0.535250723361969, + "learning_rate": 2.5018452908013522e-05, + "loss": 0.9481061697006226, + "step": 800 + }, + { + "epoch": 2.191256830601093, + "grad_norm": 0.3556676506996155, + "learning_rate": 2.48149897708149e-05, + "loss": 0.8436076641082764, + "step": 802 + }, + { + "epoch": 2.19672131147541, + "grad_norm": 0.23821324110031128, + "learning_rate": 2.4608436769743e-05, + "loss": 0.8622503876686096, + "step": 804 + }, + { + "epoch": 2.202185792349727, + "grad_norm": 0.2570849657058716, + "learning_rate": 2.4398870660232684e-05, + "loss": 0.8588972091674805, + "step": 806 + }, + { + "epoch": 2.2076502732240435, + "grad_norm": 2.3321070671081543, + "learning_rate": 2.418636931739491e-05, + "loss": 0.48147663474082947, + "step": 808 + }, + { + "epoch": 2.2131147540983607, + "grad_norm": 0.26416775584220886, + "learning_rate": 2.3971011707078125e-05, + "loss": 0.6585652232170105, + "step": 810 + }, + { + "epoch": 2.2185792349726774, + "grad_norm": 0.21810820698738098, + "learning_rate": 2.3752877856524532e-05, + "loss": 0.8585912585258484, + "step": 812 + }, + { + "epoch": 2.2240437158469946, + "grad_norm": 0.2282879799604416, + "learning_rate": 2.353204882463168e-05, + "loss": 0.6812423467636108, + "step": 814 + }, + { + "epoch": 2.2295081967213113, + "grad_norm": 0.4175258278846741, + "learning_rate": 2.330860667183101e-05, + "loss": 0.7331743240356445, + "step": 816 + }, + { + "epoch": 2.2349726775956285, + "grad_norm": 0.27887681126594543, + "learning_rate": 2.308263442959396e-05, + "loss": 0.9462811350822449, + "step": 818 + }, + { + "epoch": 2.240437158469945, + "grad_norm": 0.24890534579753876, + "learning_rate": 2.2854216069577376e-05, + "loss": 0.9054920673370361, + "step": 820 + }, + { + "epoch": 2.2459016393442623, + "grad_norm": 0.6545159816741943, + "learning_rate": 2.2623436472419476e-05, + "loss": 1.0144051313400269, + "step": 822 + }, + { + "epoch": 2.251366120218579, + "grad_norm": 0.48057249188423157, + "learning_rate": 2.2390381396198102e-05, + "loss": 0.7360069751739502, + "step": 824 + }, + { + "epoch": 2.2568306010928962, + "grad_norm": 2.75382399559021, + "learning_rate": 2.2155137444562842e-05, + "loss": 0.6045563220977783, + "step": 826 + }, + { + "epoch": 2.262295081967213, + "grad_norm": 0.6408010125160217, + "learning_rate": 2.191779203455302e-05, + "loss": 0.5113135576248169, + "step": 828 + }, + { + "epoch": 2.26775956284153, + "grad_norm": 0.23403555154800415, + "learning_rate": 2.1678433364113297e-05, + "loss": 0.8575177192687988, + "step": 830 + }, + { + "epoch": 2.273224043715847, + "grad_norm": 0.49736300110816956, + "learning_rate": 2.1437150379319245e-05, + "loss": 0.5503892302513123, + "step": 832 + }, + { + "epoch": 2.278688524590164, + "grad_norm": 0.48317691683769226, + "learning_rate": 2.1194032741324823e-05, + "loss": 0.6569101810455322, + "step": 834 + }, + { + "epoch": 2.2841530054644807, + "grad_norm": 0.5925542712211609, + "learning_rate": 2.0949170793044142e-05, + "loss": 0.9202378988265991, + "step": 836 + }, + { + "epoch": 2.289617486338798, + "grad_norm": 0.41072195768356323, + "learning_rate": 2.070265552557985e-05, + "loss": 0.9517123699188232, + "step": 838 + }, + { + "epoch": 2.2950819672131146, + "grad_norm": 0.22094306349754333, + "learning_rate": 2.0454578544410758e-05, + "loss": 0.7035161852836609, + "step": 840 + }, + { + "epoch": 2.300546448087432, + "grad_norm": 0.29024970531463623, + "learning_rate": 2.0205032035351043e-05, + "loss": 0.8569685220718384, + "step": 842 + }, + { + "epoch": 2.3060109289617485, + "grad_norm": 0.19582630693912506, + "learning_rate": 1.9954108730293875e-05, + "loss": 0.975737452507019, + "step": 844 + }, + { + "epoch": 2.3114754098360657, + "grad_norm": 0.37361687421798706, + "learning_rate": 1.9701901872752047e-05, + "loss": 0.5019787549972534, + "step": 846 + }, + { + "epoch": 2.3169398907103824, + "grad_norm": 0.1886816918849945, + "learning_rate": 1.9448505183208607e-05, + "loss": 0.9394426345825195, + "step": 848 + }, + { + "epoch": 2.3224043715846996, + "grad_norm": 0.6417620182037354, + "learning_rate": 1.919401282429013e-05, + "loss": 0.8482524156570435, + "step": 850 + }, + { + "epoch": 2.3278688524590163, + "grad_norm": 0.4325665831565857, + "learning_rate": 1.893851936577567e-05, + "loss": 0.8246769905090332, + "step": 852 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.23205913603305817, + "learning_rate": 1.868211974945461e-05, + "loss": 0.9343006610870361, + "step": 854 + }, + { + "epoch": 2.33879781420765, + "grad_norm": 0.2003387063741684, + "learning_rate": 1.842490925384604e-05, + "loss": 0.9182968735694885, + "step": 856 + }, + { + "epoch": 2.3442622950819674, + "grad_norm": 0.2010021060705185, + "learning_rate": 1.816698345879313e-05, + "loss": 0.6269927620887756, + "step": 858 + }, + { + "epoch": 2.349726775956284, + "grad_norm": 0.21301890909671783, + "learning_rate": 1.790843820994548e-05, + "loss": 0.7751089334487915, + "step": 860 + }, + { + "epoch": 2.3551912568306013, + "grad_norm": 0.20366328954696655, + "learning_rate": 1.7649369583142763e-05, + "loss": 0.897426962852478, + "step": 862 + }, + { + "epoch": 2.360655737704918, + "grad_norm": 0.21869035065174103, + "learning_rate": 1.738987384871274e-05, + "loss": 0.6120696067810059, + "step": 864 + }, + { + "epoch": 2.366120218579235, + "grad_norm": 0.5016121864318848, + "learning_rate": 1.7130047435697118e-05, + "loss": 1.1808212995529175, + "step": 866 + }, + { + "epoch": 2.371584699453552, + "grad_norm": 0.20122063159942627, + "learning_rate": 1.6869986896018226e-05, + "loss": 0.8688129782676697, + "step": 868 + }, + { + "epoch": 2.3770491803278686, + "grad_norm": 0.18649625778198242, + "learning_rate": 1.66097888686003e-05, + "loss": 0.7019689083099365, + "step": 870 + }, + { + "epoch": 2.3825136612021858, + "grad_norm": 0.2085297703742981, + "learning_rate": 1.6349550043458252e-05, + "loss": 0.7158989310264587, + "step": 872 + }, + { + "epoch": 2.387978142076503, + "grad_norm": 0.26028934121131897, + "learning_rate": 1.608936712576749e-05, + "loss": 0.8539477586746216, + "step": 874 + }, + { + "epoch": 2.3934426229508197, + "grad_norm": 0.5020867586135864, + "learning_rate": 1.582933679992809e-05, + "loss": 0.7443297505378723, + "step": 876 + }, + { + "epoch": 2.3989071038251364, + "grad_norm": 0.2335505485534668, + "learning_rate": 1.556955569363678e-05, + "loss": 0.760352611541748, + "step": 878 + }, + { + "epoch": 2.4043715846994536, + "grad_norm": 0.2085113674402237, + "learning_rate": 1.531012034197988e-05, + "loss": 0.9256618022918701, + "step": 880 + }, + { + "epoch": 2.4098360655737707, + "grad_norm": 0.23211164772510529, + "learning_rate": 1.5051127151560745e-05, + "loss": 0.4930667281150818, + "step": 882 + }, + { + "epoch": 2.4153005464480874, + "grad_norm": 0.2237941026687622, + "learning_rate": 1.4792672364674816e-05, + "loss": 0.7172381281852722, + "step": 884 + }, + { + "epoch": 2.420765027322404, + "grad_norm": 0.21497218310832977, + "learning_rate": 1.4534852023545968e-05, + "loss": 0.823451578617096, + "step": 886 + }, + { + "epoch": 2.4262295081967213, + "grad_norm": 0.23108679056167603, + "learning_rate": 1.4277761934636963e-05, + "loss": 0.6176611185073853, + "step": 888 + }, + { + "epoch": 2.431693989071038, + "grad_norm": 0.2470959722995758, + "learning_rate": 1.4021497633047664e-05, + "loss": 0.509010374546051, + "step": 890 + }, + { + "epoch": 2.4371584699453552, + "grad_norm": 0.21755096316337585, + "learning_rate": 1.3766154347013933e-05, + "loss": 0.7690192461013794, + "step": 892 + }, + { + "epoch": 2.442622950819672, + "grad_norm": 0.19507208466529846, + "learning_rate": 1.3511826962520809e-05, + "loss": 0.4858570992946625, + "step": 894 + }, + { + "epoch": 2.448087431693989, + "grad_norm": 0.19376493990421295, + "learning_rate": 1.3258609988042627e-05, + "loss": 0.7765929698944092, + "step": 896 + }, + { + "epoch": 2.453551912568306, + "grad_norm": 0.15309995412826538, + "learning_rate": 1.300659751942353e-05, + "loss": 0.7383701205253601, + "step": 898 + }, + { + "epoch": 2.459016393442623, + "grad_norm": 0.20970426499843597, + "learning_rate": 1.2755883204911305e-05, + "loss": 0.8502997159957886, + "step": 900 + }, + { + "epoch": 2.4644808743169397, + "grad_norm": 0.20087599754333496, + "learning_rate": 1.2506560210357541e-05, + "loss": 0.8558724522590637, + "step": 902 + }, + { + "epoch": 2.469945355191257, + "grad_norm": 0.15894219279289246, + "learning_rate": 1.225872118459706e-05, + "loss": 0.6488800048828125, + "step": 904 + }, + { + "epoch": 2.4754098360655736, + "grad_norm": 0.3884161412715912, + "learning_rate": 1.2012458225019375e-05, + "loss": 0.9488789439201355, + "step": 906 + }, + { + "epoch": 2.480874316939891, + "grad_norm": 0.1707250028848648, + "learning_rate": 1.176786284334528e-05, + "loss": 0.5347706079483032, + "step": 908 + }, + { + "epoch": 2.4863387978142075, + "grad_norm": 0.11856268346309662, + "learning_rate": 1.1525025931620855e-05, + "loss": 0.6109844446182251, + "step": 910 + }, + { + "epoch": 2.4918032786885247, + "grad_norm": 0.16962800920009613, + "learning_rate": 1.1284037728441877e-05, + "loss": 0.44916611909866333, + "step": 912 + }, + { + "epoch": 2.4972677595628414, + "grad_norm": 0.1900777965784073, + "learning_rate": 1.1044987785420924e-05, + "loss": 0.8547566533088684, + "step": 914 + }, + { + "epoch": 2.5027322404371586, + "grad_norm": 0.20876799523830414, + "learning_rate": 1.0807964933909975e-05, + "loss": 0.7574434876441956, + "step": 916 + }, + { + "epoch": 2.5081967213114753, + "grad_norm": 0.21594154834747314, + "learning_rate": 1.0573057251990443e-05, + "loss": 0.7612369060516357, + "step": 918 + }, + { + "epoch": 2.5136612021857925, + "grad_norm": 0.17668958008289337, + "learning_rate": 1.0340352031743256e-05, + "loss": 0.7049843668937683, + "step": 920 + }, + { + "epoch": 2.519125683060109, + "grad_norm": 0.22894662618637085, + "learning_rate": 1.010993574681095e-05, + "loss": 0.754566490650177, + "step": 922 + }, + { + "epoch": 2.5245901639344264, + "grad_norm": 0.3702336847782135, + "learning_rate": 9.881894020263938e-06, + "loss": 0.48762306571006775, + "step": 924 + }, + { + "epoch": 2.530054644808743, + "grad_norm": 0.30566778779029846, + "learning_rate": 9.656311592782831e-06, + "loss": 0.5707772374153137, + "step": 926 + }, + { + "epoch": 2.5355191256830603, + "grad_norm": 0.2742585837841034, + "learning_rate": 9.433272291168689e-06, + "loss": 0.6204325556755066, + "step": 928 + }, + { + "epoch": 2.540983606557377, + "grad_norm": 0.29101449251174927, + "learning_rate": 9.212858997192744e-06, + "loss": 1.0284860134124756, + "step": 930 + }, + { + "epoch": 2.546448087431694, + "grad_norm": 0.2771863341331482, + "learning_rate": 8.995153616797544e-06, + "loss": 0.6103931069374084, + "step": 932 + }, + { + "epoch": 2.551912568306011, + "grad_norm": 0.20665957033634186, + "learning_rate": 8.78023704966047e-06, + "loss": 0.7804996371269226, + "step": 934 + }, + { + "epoch": 2.557377049180328, + "grad_norm": 0.5313201546669006, + "learning_rate": 8.568189159131336e-06, + "loss": 0.6838847398757935, + "step": 936 + }, + { + "epoch": 2.5628415300546448, + "grad_norm": 0.8041807413101196, + "learning_rate": 8.359088742554941e-06, + "loss": 0.5385434031486511, + "step": 938 + }, + { + "epoch": 2.5683060109289615, + "grad_norm": 0.26967042684555054, + "learning_rate": 8.15301350198999e-06, + "loss": 0.7957769632339478, + "step": 940 + }, + { + "epoch": 2.5737704918032787, + "grad_norm": 1.993246078491211, + "learning_rate": 7.950040015334789e-06, + "loss": 0.5354985594749451, + "step": 942 + }, + { + "epoch": 2.579234972677596, + "grad_norm": 0.19206559658050537, + "learning_rate": 7.750243707870748e-06, + "loss": 0.8785912394523621, + "step": 944 + }, + { + "epoch": 2.5846994535519126, + "grad_norm": 0.2833126187324524, + "learning_rate": 7.553698824234314e-06, + "loss": 0.25215014815330505, + "step": 946 + }, + { + "epoch": 2.5901639344262293, + "grad_norm": 0.1929856687784195, + "learning_rate": 7.360478400827475e-06, + "loss": 0.6651497483253479, + "step": 948 + }, + { + "epoch": 2.5956284153005464, + "grad_norm": 0.20867817103862762, + "learning_rate": 7.170654238677331e-06, + "loss": 0.6793198585510254, + "step": 950 + }, + { + "epoch": 2.6010928961748636, + "grad_norm": 0.32787761092185974, + "learning_rate": 6.984296876754711e-06, + "loss": 0.737807035446167, + "step": 952 + }, + { + "epoch": 2.6065573770491803, + "grad_norm": 1.5362244844436646, + "learning_rate": 6.801475565761783e-06, + "loss": 0.4789329171180725, + "step": 954 + }, + { + "epoch": 2.612021857923497, + "grad_norm": 0.14147597551345825, + "learning_rate": 6.622258242398371e-06, + "loss": 0.24011307954788208, + "step": 956 + }, + { + "epoch": 2.6174863387978142, + "grad_norm": 0.17159943282604218, + "learning_rate": 6.4467115041165855e-06, + "loss": 0.5085139274597168, + "step": 958 + }, + { + "epoch": 2.6229508196721314, + "grad_norm": 0.17652657628059387, + "learning_rate": 6.2749005843730336e-06, + "loss": 0.6372896432876587, + "step": 960 + }, + { + "epoch": 2.628415300546448, + "grad_norm": 0.8388151526451111, + "learning_rate": 6.106889328388064e-06, + "loss": 0.6449273824691772, + "step": 962 + }, + { + "epoch": 2.633879781420765, + "grad_norm": 0.25947555899620056, + "learning_rate": 5.942740169420701e-06, + "loss": 1.000860333442688, + "step": 964 + }, + { + "epoch": 2.639344262295082, + "grad_norm": 0.15862314403057098, + "learning_rate": 5.7825141055683895e-06, + "loss": 0.6797659993171692, + "step": 966 + }, + { + "epoch": 2.644808743169399, + "grad_norm": 0.2170058637857437, + "learning_rate": 5.62627067709992e-06, + "loss": 0.7449020743370056, + "step": 968 + }, + { + "epoch": 2.650273224043716, + "grad_norm": 0.5447753071784973, + "learning_rate": 5.474067944330285e-06, + "loss": 0.7707789540290833, + "step": 970 + }, + { + "epoch": 2.6557377049180326, + "grad_norm": 0.2564184069633484, + "learning_rate": 5.325962466045282e-06, + "loss": 1.0045585632324219, + "step": 972 + }, + { + "epoch": 2.66120218579235, + "grad_norm": 0.15954433381557465, + "learning_rate": 5.18200927848421e-06, + "loss": 0.5719258785247803, + "step": 974 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.15210166573524475, + "learning_rate": 5.042261874888308e-06, + "loss": 0.7093988060951233, + "step": 976 + }, + { + "epoch": 2.6721311475409837, + "grad_norm": 0.13644114136695862, + "learning_rate": 4.906772185622572e-06, + "loss": 0.7509814500808716, + "step": 978 + }, + { + "epoch": 2.6775956284153004, + "grad_norm": 0.15326020121574402, + "learning_rate": 4.775590558878368e-06, + "loss": 0.6545107960700989, + "step": 980 + }, + { + "epoch": 2.6830601092896176, + "grad_norm": 0.17753270268440247, + "learning_rate": 4.648765741963903e-06, + "loss": 0.8449227213859558, + "step": 982 + }, + { + "epoch": 2.6885245901639343, + "grad_norm": 0.3523927927017212, + "learning_rate": 4.526344863189724e-06, + "loss": 0.6510394811630249, + "step": 984 + }, + { + "epoch": 2.6939890710382515, + "grad_norm": 0.2865069508552551, + "learning_rate": 4.408373414355714e-06, + "loss": 0.9356023669242859, + "step": 986 + }, + { + "epoch": 2.699453551912568, + "grad_norm": 0.1558021456003189, + "learning_rate": 4.29489523384628e-06, + "loss": 0.8081143498420715, + "step": 988 + }, + { + "epoch": 2.7049180327868854, + "grad_norm": 0.2865149974822998, + "learning_rate": 4.185952490339899e-06, + "loss": 0.6692199110984802, + "step": 990 + }, + { + "epoch": 2.710382513661202, + "grad_norm": 0.19687670469284058, + "learning_rate": 4.081585667139231e-06, + "loss": 0.5164180397987366, + "step": 992 + }, + { + "epoch": 2.7158469945355193, + "grad_norm": 0.18878105282783508, + "learning_rate": 3.981833547127413e-06, + "loss": 0.811371922492981, + "step": 994 + }, + { + "epoch": 2.721311475409836, + "grad_norm": 0.16217704117298126, + "learning_rate": 3.886733198356298e-06, + "loss": 0.8148671388626099, + "step": 996 + }, + { + "epoch": 2.726775956284153, + "grad_norm": 0.15565072000026703, + "learning_rate": 3.7963199602718717e-06, + "loss": 0.6758864521980286, + "step": 998 + }, + { + "epoch": 2.73224043715847, + "grad_norm": 0.6546015739440918, + "learning_rate": 3.7106274305821034e-06, + "loss": 0.9574772119522095, + "step": 1000 + }, + { + "epoch": 2.737704918032787, + "grad_norm": 0.18078891932964325, + "learning_rate": 3.6296874527719515e-06, + "loss": 0.8926464915275574, + "step": 1002 + }, + { + "epoch": 2.7431693989071038, + "grad_norm": 0.22077669203281403, + "learning_rate": 3.553530104270281e-06, + "loss": 0.6632700562477112, + "step": 1004 + }, + { + "epoch": 2.748633879781421, + "grad_norm": 0.381185919046402, + "learning_rate": 3.4821836852730384e-06, + "loss": 0.35226285457611084, + "step": 1006 + }, + { + "epoch": 2.7540983606557377, + "grad_norm": 0.18125414848327637, + "learning_rate": 3.41567470822686e-06, + "loss": 0.9504106044769287, + "step": 1008 + }, + { + "epoch": 2.7595628415300544, + "grad_norm": 0.25593459606170654, + "learning_rate": 3.354027887976989e-06, + "loss": 0.9168705344200134, + "step": 1010 + }, + { + "epoch": 2.7650273224043715, + "grad_norm": 0.6389570832252502, + "learning_rate": 3.297266132583221e-06, + "loss": 0.6682818531990051, + "step": 1012 + }, + { + "epoch": 2.7704918032786887, + "grad_norm": 0.32402268052101135, + "learning_rate": 3.245410534807195e-06, + "loss": 0.9942286610603333, + "step": 1014 + }, + { + "epoch": 2.7759562841530054, + "grad_norm": 0.20764689147472382, + "learning_rate": 3.1984803642743314e-06, + "loss": 0.6048266291618347, + "step": 1016 + }, + { + "epoch": 2.781420765027322, + "grad_norm": 0.1485147923231125, + "learning_rate": 3.1564930603131777e-06, + "loss": 0.8052763342857361, + "step": 1018 + }, + { + "epoch": 2.7868852459016393, + "grad_norm": 0.38492631912231445, + "learning_rate": 3.1194642254749395e-06, + "loss": 0.6152138113975525, + "step": 1020 + }, + { + "epoch": 2.7923497267759565, + "grad_norm": 0.19493553042411804, + "learning_rate": 3.0874076197355317e-06, + "loss": 0.8494120836257935, + "step": 1022 + }, + { + "epoch": 2.797814207650273, + "grad_norm": 0.4454888701438904, + "learning_rate": 3.0603351553823717e-06, + "loss": 0.4298951029777527, + "step": 1024 + }, + { + "epoch": 2.80327868852459, + "grad_norm": 0.19092072546482086, + "learning_rate": 3.038256892587734e-06, + "loss": 0.8518891930580139, + "step": 1026 + }, + { + "epoch": 2.808743169398907, + "grad_norm": 0.6577255725860596, + "learning_rate": 3.0211810356703803e-06, + "loss": 0.7430834174156189, + "step": 1028 + }, + { + "epoch": 2.8142076502732243, + "grad_norm": 0.1505049765110016, + "learning_rate": 3.0091139300468266e-06, + "loss": 0.6131287813186646, + "step": 1030 + }, + { + "epoch": 2.819672131147541, + "grad_norm": 0.1536058932542801, + "learning_rate": 3.0020600598733656e-06, + "loss": 0.6610476970672607, + "step": 1032 + }, + { + "epoch": 2.8251366120218577, + "grad_norm": 1.4271342754364014, + "learning_rate": 3.000022046379753e-06, + "loss": 0.7583919763565063, + "step": 1034 + }, + { + "epoch": 2.830601092896175, + "grad_norm": 0.24797333776950836, + "learning_rate": 3.0030006468951557e-06, + "loss": 0.9907567501068115, + "step": 1036 + }, + { + "epoch": 2.836065573770492, + "grad_norm": 0.18040958046913147, + "learning_rate": 3.0109947545667246e-06, + "loss": 0.7043365240097046, + "step": 1038 + }, + { + "epoch": 2.841530054644809, + "grad_norm": 0.38979649543762207, + "learning_rate": 3.024001398770901e-06, + "loss": 0.7790292501449585, + "step": 1040 + }, + { + "epoch": 2.8469945355191255, + "grad_norm": 0.1695755124092102, + "learning_rate": 3.042015746217308e-06, + "loss": 0.7965599298477173, + "step": 1042 + }, + { + "epoch": 2.8524590163934427, + "grad_norm": 0.16300812363624573, + "learning_rate": 3.0650311027448116e-06, + "loss": 0.7110670804977417, + "step": 1044 + }, + { + "epoch": 2.8579234972677594, + "grad_norm": 0.18763573467731476, + "learning_rate": 3.0930389158090754e-06, + "loss": 0.9114327430725098, + "step": 1046 + }, + { + "epoch": 2.8633879781420766, + "grad_norm": 0.7911413311958313, + "learning_rate": 3.1260287776607025e-06, + "loss": 0.3735189735889435, + "step": 1048 + }, + { + "epoch": 2.8688524590163933, + "grad_norm": 0.20250028371810913, + "learning_rate": 3.163988429212773e-06, + "loss": 0.7022408246994019, + "step": 1050 + }, + { + "epoch": 2.8743169398907105, + "grad_norm": 0.21102067828178406, + "learning_rate": 3.206903764596349e-06, + "loss": 0.6459388732910156, + "step": 1052 + }, + { + "epoch": 2.879781420765027, + "grad_norm": 0.21323570609092712, + "learning_rate": 3.254758836402225e-06, + "loss": 0.9086355566978455, + "step": 1054 + }, + { + "epoch": 2.8852459016393444, + "grad_norm": 1.73580002784729, + "learning_rate": 3.3075358616070144e-06, + "loss": 0.7384663820266724, + "step": 1056 + }, + { + "epoch": 2.890710382513661, + "grad_norm": 1.4918919801712036, + "learning_rate": 3.365215228181358e-06, + "loss": 0.6115441918373108, + "step": 1058 + }, + { + "epoch": 2.8961748633879782, + "grad_norm": 0.22330603003501892, + "learning_rate": 3.4277755023777795e-06, + "loss": 0.7794143557548523, + "step": 1060 + }, + { + "epoch": 2.901639344262295, + "grad_norm": 0.19227337837219238, + "learning_rate": 3.495193436695504e-06, + "loss": 0.7703116536140442, + "step": 1062 + }, + { + "epoch": 2.907103825136612, + "grad_norm": 0.3918929398059845, + "learning_rate": 3.567443978519267e-06, + "loss": 0.876200795173645, + "step": 1064 + }, + { + "epoch": 2.912568306010929, + "grad_norm": 0.18335311114788055, + "learning_rate": 3.6445002794288992e-06, + "loss": 0.431761234998703, + "step": 1066 + }, + { + "epoch": 2.918032786885246, + "grad_norm": 0.227519690990448, + "learning_rate": 3.7263337051762718e-06, + "loss": 0.5974105000495911, + "step": 1068 + }, + { + "epoch": 2.9234972677595628, + "grad_norm": 0.43164753913879395, + "learning_rate": 3.8129138463257943e-06, + "loss": 0.6901212930679321, + "step": 1070 + }, + { + "epoch": 2.92896174863388, + "grad_norm": 0.22106696665287018, + "learning_rate": 3.904208529554625e-06, + "loss": 0.9134948253631592, + "step": 1072 + }, + { + "epoch": 2.9344262295081966, + "grad_norm": 0.17417727410793304, + "learning_rate": 4.000183829608332e-06, + "loss": 0.49246451258659363, + "step": 1074 + }, + { + "epoch": 2.939890710382514, + "grad_norm": 1.0001611709594727, + "learning_rate": 4.100804081907595e-06, + "loss": 0.5228325128555298, + "step": 1076 + }, + { + "epoch": 2.9453551912568305, + "grad_norm": 0.38090279698371887, + "learning_rate": 4.206031895801176e-06, + "loss": 0.5014840364456177, + "step": 1078 + }, + { + "epoch": 2.9508196721311473, + "grad_norm": 0.2635762691497803, + "learning_rate": 4.315828168460367e-06, + "loss": 0.9488551020622253, + "step": 1080 + }, + { + "epoch": 2.9562841530054644, + "grad_norm": 0.23455342650413513, + "learning_rate": 4.430152099409704e-06, + "loss": 0.5974031686782837, + "step": 1082 + }, + { + "epoch": 2.9617486338797816, + "grad_norm": 0.306372731924057, + "learning_rate": 4.548961205688424e-06, + "loss": 0.748331606388092, + "step": 1084 + }, + { + "epoch": 2.9672131147540983, + "grad_norm": 0.1417221575975418, + "learning_rate": 4.672211337637246e-06, + "loss": 0.52958744764328, + "step": 1086 + }, + { + "epoch": 2.972677595628415, + "grad_norm": 0.18579013645648956, + "learning_rate": 4.7998566953044445e-06, + "loss": 0.995449960231781, + "step": 1088 + }, + { + "epoch": 2.978142076502732, + "grad_norm": 0.31965434551239014, + "learning_rate": 4.931849845465193e-06, + "loss": 0.767081081867218, + "step": 1090 + }, + { + "epoch": 2.9836065573770494, + "grad_norm": 0.15356819331645966, + "learning_rate": 5.06814173924782e-06, + "loss": 0.6917383670806885, + "step": 1092 + }, + { + "epoch": 2.989071038251366, + "grad_norm": 0.17739875614643097, + "learning_rate": 5.208681730360458e-06, + "loss": 0.7700910568237305, + "step": 1094 + }, + { + "epoch": 2.994535519125683, + "grad_norm": 0.5607284903526306, + "learning_rate": 5.3534175939112694e-06, + "loss": 0.5115733742713928, + "step": 1096 + }, + { + "epoch": 3.0, + "grad_norm": 0.22396568953990936, + "learning_rate": 5.50229554581536e-06, + "loss": 0.8061965107917786, + "step": 1098 + }, + { + "epoch": 3.0, + "step": 1098, + "total_flos": 4.957143256761631e+18, + "train_loss": 0.9666990600322765, + "train_runtime": 11531.2166, + "train_samples_per_second": 5.713, + "train_steps_per_second": 0.095 + } + ], + "logging_steps": 2, + "max_steps": 1098, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 99999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.957143256761631e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}