diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,57730 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 16482, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00036406662419222717, + "grad_norm": 2.203125, + "learning_rate": 6.250000000000001e-08, + "loss": 1.4439427852630615, + "step": 2 + }, + { + "epoch": 0.0007281332483844543, + "grad_norm": 31.375, + "learning_rate": 1.875e-07, + "loss": 1.8952510356903076, + "step": 4 + }, + { + "epoch": 0.0010921998725766816, + "grad_norm": 17.0, + "learning_rate": 3.125e-07, + "loss": 1.9810845851898193, + "step": 6 + }, + { + "epoch": 0.0014562664967689087, + "grad_norm": 4.46875, + "learning_rate": 4.375e-07, + "loss": 1.1081929206848145, + "step": 8 + }, + { + "epoch": 0.0018203331209611358, + "grad_norm": 58.25, + "learning_rate": 5.625e-07, + "loss": 1.6922560930252075, + "step": 10 + }, + { + "epoch": 0.002184399745153363, + "grad_norm": 67.5, + "learning_rate": 6.875000000000001e-07, + "loss": 2.8736701011657715, + "step": 12 + }, + { + "epoch": 0.0025484663693455902, + "grad_norm": 13.6875, + "learning_rate": 8.125000000000001e-07, + "loss": 1.8159987926483154, + "step": 14 + }, + { + "epoch": 0.0029125329935378174, + "grad_norm": 5.0, + "learning_rate": 9.375000000000001e-07, + "loss": 1.4211392402648926, + "step": 16 + }, + { + "epoch": 0.0032765996177300445, + "grad_norm": 19.25, + "learning_rate": 1.0625e-06, + "loss": 1.7765966653823853, + "step": 18 + }, + { + "epoch": 0.0036406662419222716, + "grad_norm": 36.5, + "learning_rate": 1.1875e-06, + "loss": 2.275758981704712, + "step": 20 + }, + { + "epoch": 0.004004732866114499, + "grad_norm": 179.0, + "learning_rate": 1.3125000000000001e-06, + "loss": 2.7087841033935547, + "step": 22 + }, + { + "epoch": 0.004368799490306726, + "grad_norm": 30.25, + "learning_rate": 1.4375e-06, + "loss": 1.7012183666229248, + "step": 24 + }, + { + "epoch": 0.004732866114498953, + "grad_norm": 16.625, + "learning_rate": 1.5625e-06, + "loss": 1.5296523571014404, + "step": 26 + }, + { + "epoch": 0.0050969327386911805, + "grad_norm": 24.75, + "learning_rate": 1.6875000000000001e-06, + "loss": 1.8775393962860107, + "step": 28 + }, + { + "epoch": 0.005460999362883408, + "grad_norm": 10.0, + "learning_rate": 1.8125e-06, + "loss": 1.9424123764038086, + "step": 30 + }, + { + "epoch": 0.005825065987075635, + "grad_norm": 17.25, + "learning_rate": 1.9375e-06, + "loss": 1.7390620708465576, + "step": 32 + }, + { + "epoch": 0.006189132611267862, + "grad_norm": 28.125, + "learning_rate": 2.0625e-06, + "loss": 2.4498658180236816, + "step": 34 + }, + { + "epoch": 0.006553199235460089, + "grad_norm": 10.375, + "learning_rate": 2.1875000000000002e-06, + "loss": 1.2069549560546875, + "step": 36 + }, + { + "epoch": 0.006917265859652316, + "grad_norm": 61.5, + "learning_rate": 2.3125000000000003e-06, + "loss": 1.9799649715423584, + "step": 38 + }, + { + "epoch": 0.007281332483844543, + "grad_norm": 9.5625, + "learning_rate": 2.4375e-06, + "loss": 1.8582046031951904, + "step": 40 + }, + { + "epoch": 0.007645399108036771, + "grad_norm": 15.0, + "learning_rate": 2.5625e-06, + "loss": 1.9181056022644043, + "step": 42 + }, + { + "epoch": 0.008009465732228998, + "grad_norm": 10.5, + "learning_rate": 2.6875e-06, + "loss": 1.8542757034301758, + "step": 44 + }, + { + "epoch": 0.008373532356421225, + "grad_norm": 68.0, + "learning_rate": 2.8125e-06, + "loss": 2.026920795440674, + "step": 46 + }, + { + "epoch": 0.008737598980613452, + "grad_norm": 12.125, + "learning_rate": 2.9375000000000003e-06, + "loss": 1.7167916297912598, + "step": 48 + }, + { + "epoch": 0.00910166560480568, + "grad_norm": 27.875, + "learning_rate": 3.0625000000000003e-06, + "loss": 2.0626585483551025, + "step": 50 + }, + { + "epoch": 0.009465732228997907, + "grad_norm": 9.75, + "learning_rate": 3.1875e-06, + "loss": 1.354461908340454, + "step": 52 + }, + { + "epoch": 0.009829798853190134, + "grad_norm": 2.09375, + "learning_rate": 3.3125e-06, + "loss": 1.2345013618469238, + "step": 54 + }, + { + "epoch": 0.010193865477382361, + "grad_norm": 5.65625, + "learning_rate": 3.4375e-06, + "loss": 1.0628052949905396, + "step": 56 + }, + { + "epoch": 0.010557932101574588, + "grad_norm": 13.9375, + "learning_rate": 3.5625e-06, + "loss": 1.9976742267608643, + "step": 58 + }, + { + "epoch": 0.010921998725766815, + "grad_norm": 10.1875, + "learning_rate": 3.6875000000000007e-06, + "loss": 1.4608995914459229, + "step": 60 + }, + { + "epoch": 0.011286065349959042, + "grad_norm": 17.75, + "learning_rate": 3.8125e-06, + "loss": 1.997931957244873, + "step": 62 + }, + { + "epoch": 0.01165013197415127, + "grad_norm": 5.875, + "learning_rate": 3.9375e-06, + "loss": 1.4563177824020386, + "step": 64 + }, + { + "epoch": 0.012014198598343497, + "grad_norm": 11.5625, + "learning_rate": 4.0625000000000005e-06, + "loss": 1.8709797859191895, + "step": 66 + }, + { + "epoch": 0.012378265222535724, + "grad_norm": 42.5, + "learning_rate": 4.1875e-06, + "loss": 1.2400493621826172, + "step": 68 + }, + { + "epoch": 0.01274233184672795, + "grad_norm": 7.28125, + "learning_rate": 4.312500000000001e-06, + "loss": 0.9505149126052856, + "step": 70 + }, + { + "epoch": 0.013106398470920178, + "grad_norm": 24.875, + "learning_rate": 4.4375e-06, + "loss": 1.850921392440796, + "step": 72 + }, + { + "epoch": 0.013470465095112405, + "grad_norm": 15.3125, + "learning_rate": 4.5625e-06, + "loss": 1.7793394327163696, + "step": 74 + }, + { + "epoch": 0.013834531719304632, + "grad_norm": 18.375, + "learning_rate": 4.6875000000000004e-06, + "loss": 1.660430908203125, + "step": 76 + }, + { + "epoch": 0.01419859834349686, + "grad_norm": 18.125, + "learning_rate": 4.8125e-06, + "loss": 1.788752794265747, + "step": 78 + }, + { + "epoch": 0.014562664967689086, + "grad_norm": 26.0, + "learning_rate": 4.937500000000001e-06, + "loss": 1.2316336631774902, + "step": 80 + }, + { + "epoch": 0.014926731591881313, + "grad_norm": 17.5, + "learning_rate": 4.9999999633135135e-06, + "loss": 1.8627386093139648, + "step": 82 + }, + { + "epoch": 0.015290798216073542, + "grad_norm": 3.71875, + "learning_rate": 4.9999996698216244e-06, + "loss": 1.1692750453948975, + "step": 84 + }, + { + "epoch": 0.015654864840265768, + "grad_norm": 9.4375, + "learning_rate": 4.999999082837889e-06, + "loss": 1.704923152923584, + "step": 86 + }, + { + "epoch": 0.016018931464457997, + "grad_norm": 11.875, + "learning_rate": 4.9999982023623925e-06, + "loss": 1.838752269744873, + "step": 88 + }, + { + "epoch": 0.016382998088650222, + "grad_norm": 9.25, + "learning_rate": 4.999997028395266e-06, + "loss": 1.5959210395812988, + "step": 90 + }, + { + "epoch": 0.01674706471284245, + "grad_norm": 16.75, + "learning_rate": 4.999995560936682e-06, + "loss": 1.7869584560394287, + "step": 92 + }, + { + "epoch": 0.017111131337034676, + "grad_norm": 17.0, + "learning_rate": 4.999993799986852e-06, + "loss": 1.9351712465286255, + "step": 94 + }, + { + "epoch": 0.017475197961226905, + "grad_norm": 3.9375, + "learning_rate": 4.9999917455460385e-06, + "loss": 1.0348005294799805, + "step": 96 + }, + { + "epoch": 0.01783926458541913, + "grad_norm": 54.25, + "learning_rate": 4.999989397614542e-06, + "loss": 2.414271593093872, + "step": 98 + }, + { + "epoch": 0.01820333120961136, + "grad_norm": 12.6875, + "learning_rate": 4.999986756192706e-06, + "loss": 1.5969310998916626, + "step": 100 + }, + { + "epoch": 0.018567397833803585, + "grad_norm": 13.25, + "learning_rate": 4.999983821280919e-06, + "loss": 1.6561098098754883, + "step": 102 + }, + { + "epoch": 0.018931464457995813, + "grad_norm": 15.6875, + "learning_rate": 4.999980592879612e-06, + "loss": 1.6747602224349976, + "step": 104 + }, + { + "epoch": 0.01929553108218804, + "grad_norm": 38.0, + "learning_rate": 4.999977070989258e-06, + "loss": 1.5859401226043701, + "step": 106 + }, + { + "epoch": 0.019659597706380268, + "grad_norm": 16.5, + "learning_rate": 4.999973255610374e-06, + "loss": 1.6103729009628296, + "step": 108 + }, + { + "epoch": 0.020023664330572493, + "grad_norm": 9.6875, + "learning_rate": 4.99996914674352e-06, + "loss": 1.655151605606079, + "step": 110 + }, + { + "epoch": 0.020387730954764722, + "grad_norm": 9.5, + "learning_rate": 4.999964744389298e-06, + "loss": 1.4251558780670166, + "step": 112 + }, + { + "epoch": 0.02075179757895695, + "grad_norm": 21.625, + "learning_rate": 4.999960048548356e-06, + "loss": 1.7941898107528687, + "step": 114 + }, + { + "epoch": 0.021115864203149176, + "grad_norm": 28.375, + "learning_rate": 4.999955059221381e-06, + "loss": 1.775604009628296, + "step": 116 + }, + { + "epoch": 0.021479930827341405, + "grad_norm": 37.5, + "learning_rate": 4.999949776409106e-06, + "loss": 1.9844095706939697, + "step": 118 + }, + { + "epoch": 0.02184399745153363, + "grad_norm": 12.375, + "learning_rate": 4.999944200112308e-06, + "loss": 1.089508295059204, + "step": 120 + }, + { + "epoch": 0.02220806407572586, + "grad_norm": 4.875, + "learning_rate": 4.999938330331802e-06, + "loss": 1.1590930223464966, + "step": 122 + }, + { + "epoch": 0.022572130699918085, + "grad_norm": 11.9375, + "learning_rate": 4.999932167068452e-06, + "loss": 1.5268014669418335, + "step": 124 + }, + { + "epoch": 0.022936197324110313, + "grad_norm": 8.5, + "learning_rate": 4.999925710323161e-06, + "loss": 1.1743862628936768, + "step": 126 + }, + { + "epoch": 0.02330026394830254, + "grad_norm": 13.125, + "learning_rate": 4.999918960096878e-06, + "loss": 1.8321239948272705, + "step": 128 + }, + { + "epoch": 0.023664330572494768, + "grad_norm": 15.4375, + "learning_rate": 4.999911916390592e-06, + "loss": 1.7687184810638428, + "step": 130 + }, + { + "epoch": 0.024028397196686993, + "grad_norm": 14.3125, + "learning_rate": 4.999904579205337e-06, + "loss": 1.763999342918396, + "step": 132 + }, + { + "epoch": 0.024392463820879222, + "grad_norm": 16.25, + "learning_rate": 4.99989694854219e-06, + "loss": 1.6820811033248901, + "step": 134 + }, + { + "epoch": 0.024756530445071447, + "grad_norm": 8.5, + "learning_rate": 4.9998890244022705e-06, + "loss": 1.6157740354537964, + "step": 136 + }, + { + "epoch": 0.025120597069263676, + "grad_norm": 40.0, + "learning_rate": 4.999880806786742e-06, + "loss": 1.7108687162399292, + "step": 138 + }, + { + "epoch": 0.0254846636934559, + "grad_norm": 12.8125, + "learning_rate": 4.999872295696809e-06, + "loss": 1.628999948501587, + "step": 140 + }, + { + "epoch": 0.02584873031764813, + "grad_norm": 9.1875, + "learning_rate": 4.999863491133722e-06, + "loss": 0.9917442202568054, + "step": 142 + }, + { + "epoch": 0.026212796941840356, + "grad_norm": 10.1875, + "learning_rate": 4.999854393098773e-06, + "loss": 1.8497142791748047, + "step": 144 + }, + { + "epoch": 0.026576863566032585, + "grad_norm": 31.5, + "learning_rate": 4.999845001593295e-06, + "loss": 1.157009482383728, + "step": 146 + }, + { + "epoch": 0.02694093019022481, + "grad_norm": 5.15625, + "learning_rate": 4.999835316618668e-06, + "loss": 0.9564247131347656, + "step": 148 + }, + { + "epoch": 0.02730499681441704, + "grad_norm": 3.453125, + "learning_rate": 4.999825338176315e-06, + "loss": 0.9530770778656006, + "step": 150 + }, + { + "epoch": 0.027669063438609264, + "grad_norm": 24.875, + "learning_rate": 4.999815066267696e-06, + "loss": 2.0179812908172607, + "step": 152 + }, + { + "epoch": 0.028033130062801493, + "grad_norm": 9.625, + "learning_rate": 4.999804500894322e-06, + "loss": 1.5720603466033936, + "step": 154 + }, + { + "epoch": 0.02839719668699372, + "grad_norm": 11.75, + "learning_rate": 4.999793642057741e-06, + "loss": 1.8015835285186768, + "step": 156 + }, + { + "epoch": 0.028761263311185947, + "grad_norm": 12.5, + "learning_rate": 4.999782489759548e-06, + "loss": 1.7068777084350586, + "step": 158 + }, + { + "epoch": 0.029125329935378173, + "grad_norm": 8.125, + "learning_rate": 4.999771044001378e-06, + "loss": 1.7578529119491577, + "step": 160 + }, + { + "epoch": 0.0294893965595704, + "grad_norm": 10.3125, + "learning_rate": 4.999759304784912e-06, + "loss": 1.7013229131698608, + "step": 162 + }, + { + "epoch": 0.029853463183762627, + "grad_norm": 5.75, + "learning_rate": 4.999747272111874e-06, + "loss": 1.1762781143188477, + "step": 164 + }, + { + "epoch": 0.030217529807954856, + "grad_norm": 13.1875, + "learning_rate": 4.999734945984026e-06, + "loss": 1.6700516939163208, + "step": 166 + }, + { + "epoch": 0.030581596432147085, + "grad_norm": 16.25, + "learning_rate": 4.9997223264031805e-06, + "loss": 1.02428138256073, + "step": 168 + }, + { + "epoch": 0.03094566305633931, + "grad_norm": 9.6875, + "learning_rate": 4.999709413371187e-06, + "loss": 1.6712379455566406, + "step": 170 + }, + { + "epoch": 0.031309729680531535, + "grad_norm": 34.5, + "learning_rate": 4.999696206889942e-06, + "loss": 1.5965200662612915, + "step": 172 + }, + { + "epoch": 0.031673796304723764, + "grad_norm": 63.25, + "learning_rate": 4.999682706961381e-06, + "loss": 1.1542786359786987, + "step": 174 + }, + { + "epoch": 0.03203786292891599, + "grad_norm": 14.0625, + "learning_rate": 4.999668913587488e-06, + "loss": 1.7522170543670654, + "step": 176 + }, + { + "epoch": 0.03240192955310822, + "grad_norm": 22.25, + "learning_rate": 4.999654826770285e-06, + "loss": 1.4951016902923584, + "step": 178 + }, + { + "epoch": 0.032765996177300444, + "grad_norm": 5.875, + "learning_rate": 4.999640446511841e-06, + "loss": 1.3878135681152344, + "step": 180 + }, + { + "epoch": 0.03313006280149267, + "grad_norm": 42.0, + "learning_rate": 4.999625772814265e-06, + "loss": 2.111569404602051, + "step": 182 + }, + { + "epoch": 0.0334941294256849, + "grad_norm": 4.9375, + "learning_rate": 4.99961080567971e-06, + "loss": 1.096387267112732, + "step": 184 + }, + { + "epoch": 0.03385819604987713, + "grad_norm": 8.75, + "learning_rate": 4.999595545110374e-06, + "loss": 1.646989107131958, + "step": 186 + }, + { + "epoch": 0.03422226267406935, + "grad_norm": 2.53125, + "learning_rate": 4.999579991108495e-06, + "loss": 1.101284384727478, + "step": 188 + }, + { + "epoch": 0.03458632929826158, + "grad_norm": 13.3125, + "learning_rate": 4.999564143676355e-06, + "loss": 1.894997000694275, + "step": 190 + }, + { + "epoch": 0.03495039592245381, + "grad_norm": 9.6875, + "learning_rate": 4.999548002816283e-06, + "loss": 1.7325177192687988, + "step": 192 + }, + { + "epoch": 0.03531446254664604, + "grad_norm": 39.5, + "learning_rate": 4.999531568530642e-06, + "loss": 1.519869089126587, + "step": 194 + }, + { + "epoch": 0.03567852917083826, + "grad_norm": 14.0, + "learning_rate": 4.999514840821847e-06, + "loss": 1.671815276145935, + "step": 196 + }, + { + "epoch": 0.03604259579503049, + "grad_norm": 6.34375, + "learning_rate": 4.9994978196923545e-06, + "loss": 1.409099817276001, + "step": 198 + }, + { + "epoch": 0.03640666241922272, + "grad_norm": 23.25, + "learning_rate": 4.99948050514466e-06, + "loss": 1.731999158859253, + "step": 200 + }, + { + "epoch": 0.03677072904341495, + "grad_norm": 5.6875, + "learning_rate": 4.999462897181303e-06, + "loss": 0.8340705633163452, + "step": 202 + }, + { + "epoch": 0.03713479566760717, + "grad_norm": 38.75, + "learning_rate": 4.99944499580487e-06, + "loss": 0.9876308441162109, + "step": 204 + }, + { + "epoch": 0.0374988622917994, + "grad_norm": 8.875, + "learning_rate": 4.999426801017987e-06, + "loss": 1.5689787864685059, + "step": 206 + }, + { + "epoch": 0.03786292891599163, + "grad_norm": 4.3125, + "learning_rate": 4.999408312823323e-06, + "loss": 1.3400174379348755, + "step": 208 + }, + { + "epoch": 0.038226995540183856, + "grad_norm": 3.703125, + "learning_rate": 4.9993895312235915e-06, + "loss": 1.0843534469604492, + "step": 210 + }, + { + "epoch": 0.03859106216437608, + "grad_norm": 22.25, + "learning_rate": 4.99937045622155e-06, + "loss": 1.7182073593139648, + "step": 212 + }, + { + "epoch": 0.038955128788568306, + "grad_norm": 8.75, + "learning_rate": 4.999351087819998e-06, + "loss": 0.9828606247901917, + "step": 214 + }, + { + "epoch": 0.039319195412760535, + "grad_norm": 12.0, + "learning_rate": 4.9993314260217755e-06, + "loss": 1.2823659181594849, + "step": 216 + }, + { + "epoch": 0.039683262036952764, + "grad_norm": 89.0, + "learning_rate": 4.999311470829769e-06, + "loss": 1.8203669786453247, + "step": 218 + }, + { + "epoch": 0.040047328661144986, + "grad_norm": 13.0625, + "learning_rate": 4.999291222246906e-06, + "loss": 1.7533475160598755, + "step": 220 + }, + { + "epoch": 0.040411395285337215, + "grad_norm": 3.4375, + "learning_rate": 4.999270680276159e-06, + "loss": 1.1587835550308228, + "step": 222 + }, + { + "epoch": 0.040775461909529444, + "grad_norm": 14.125, + "learning_rate": 4.999249844920542e-06, + "loss": 1.6403859853744507, + "step": 224 + }, + { + "epoch": 0.04113952853372167, + "grad_norm": 19.375, + "learning_rate": 4.999228716183112e-06, + "loss": 1.605838418006897, + "step": 226 + }, + { + "epoch": 0.0415035951579139, + "grad_norm": 22.0, + "learning_rate": 4.999207294066971e-06, + "loss": 1.7368394136428833, + "step": 228 + }, + { + "epoch": 0.04186766178210612, + "grad_norm": 8.3125, + "learning_rate": 4.999185578575261e-06, + "loss": 1.2459367513656616, + "step": 230 + }, + { + "epoch": 0.04223172840629835, + "grad_norm": 7.65625, + "learning_rate": 4.9991635697111695e-06, + "loss": 1.6125494241714478, + "step": 232 + }, + { + "epoch": 0.04259579503049058, + "grad_norm": 11.375, + "learning_rate": 4.999141267477926e-06, + "loss": 1.3983747959136963, + "step": 234 + }, + { + "epoch": 0.04295986165468281, + "grad_norm": 32.0, + "learning_rate": 4.999118671878803e-06, + "loss": 2.105119466781616, + "step": 236 + }, + { + "epoch": 0.04332392827887503, + "grad_norm": 4.65625, + "learning_rate": 4.999095782917118e-06, + "loss": 0.8702592849731445, + "step": 238 + }, + { + "epoch": 0.04368799490306726, + "grad_norm": 12.125, + "learning_rate": 4.999072600596226e-06, + "loss": 1.6933553218841553, + "step": 240 + }, + { + "epoch": 0.04405206152725949, + "grad_norm": 4.53125, + "learning_rate": 4.999049124919534e-06, + "loss": 0.9142586588859558, + "step": 242 + }, + { + "epoch": 0.04441612815145172, + "grad_norm": 17.875, + "learning_rate": 4.999025355890482e-06, + "loss": 1.405271291732788, + "step": 244 + }, + { + "epoch": 0.04478019477564394, + "grad_norm": 25.75, + "learning_rate": 4.999001293512562e-06, + "loss": 1.8398109674453735, + "step": 246 + }, + { + "epoch": 0.04514426139983617, + "grad_norm": 14.0, + "learning_rate": 4.998976937789304e-06, + "loss": 1.8075313568115234, + "step": 248 + }, + { + "epoch": 0.0455083280240284, + "grad_norm": 5.8125, + "learning_rate": 4.9989522887242806e-06, + "loss": 1.103116750717163, + "step": 250 + }, + { + "epoch": 0.04587239464822063, + "grad_norm": 6.96875, + "learning_rate": 4.99892734632111e-06, + "loss": 1.1777992248535156, + "step": 252 + }, + { + "epoch": 0.04623646127241285, + "grad_norm": 13.5625, + "learning_rate": 4.9989021105834515e-06, + "loss": 2.266021251678467, + "step": 254 + }, + { + "epoch": 0.04660052789660508, + "grad_norm": 16.625, + "learning_rate": 4.99887658151501e-06, + "loss": 1.7739362716674805, + "step": 256 + }, + { + "epoch": 0.046964594520797306, + "grad_norm": 25.75, + "learning_rate": 4.99885075911953e-06, + "loss": 0.9575203061103821, + "step": 258 + }, + { + "epoch": 0.047328661144989535, + "grad_norm": 10.5, + "learning_rate": 4.9988246434008025e-06, + "loss": 1.6021323204040527, + "step": 260 + }, + { + "epoch": 0.04769272776918176, + "grad_norm": 20.125, + "learning_rate": 4.998798234362659e-06, + "loss": 2.106436252593994, + "step": 262 + }, + { + "epoch": 0.048056794393373986, + "grad_norm": 8.5625, + "learning_rate": 4.998771532008974e-06, + "loss": 1.1828550100326538, + "step": 264 + }, + { + "epoch": 0.048420861017566215, + "grad_norm": 52.25, + "learning_rate": 4.998744536343669e-06, + "loss": 1.099178671836853, + "step": 266 + }, + { + "epoch": 0.048784927641758444, + "grad_norm": 9.75, + "learning_rate": 4.998717247370703e-06, + "loss": 1.6490399837493896, + "step": 268 + }, + { + "epoch": 0.049148994265950666, + "grad_norm": 18.75, + "learning_rate": 4.998689665094079e-06, + "loss": 0.9570120573043823, + "step": 270 + }, + { + "epoch": 0.049513060890142895, + "grad_norm": 5.375, + "learning_rate": 4.998661789517849e-06, + "loss": 1.5607397556304932, + "step": 272 + }, + { + "epoch": 0.04987712751433512, + "grad_norm": 20.375, + "learning_rate": 4.998633620646101e-06, + "loss": 0.7462946772575378, + "step": 274 + }, + { + "epoch": 0.05024119413852735, + "grad_norm": 2.75, + "learning_rate": 4.998605158482967e-06, + "loss": 0.7914036512374878, + "step": 276 + }, + { + "epoch": 0.05060526076271958, + "grad_norm": 7.9375, + "learning_rate": 4.998576403032628e-06, + "loss": 1.245314121246338, + "step": 278 + }, + { + "epoch": 0.0509693273869118, + "grad_norm": 36.0, + "learning_rate": 4.9985473542993e-06, + "loss": 0.9020047187805176, + "step": 280 + }, + { + "epoch": 0.05133339401110403, + "grad_norm": 34.0, + "learning_rate": 4.998518012287248e-06, + "loss": 1.66724693775177, + "step": 282 + }, + { + "epoch": 0.05169746063529626, + "grad_norm": 17.875, + "learning_rate": 4.998488377000776e-06, + "loss": 1.7929542064666748, + "step": 284 + }, + { + "epoch": 0.05206152725948849, + "grad_norm": 19.125, + "learning_rate": 4.998458448444235e-06, + "loss": 1.9806444644927979, + "step": 286 + }, + { + "epoch": 0.05242559388368071, + "grad_norm": 4.09375, + "learning_rate": 4.998428226622014e-06, + "loss": 1.0485785007476807, + "step": 288 + }, + { + "epoch": 0.05278966050787294, + "grad_norm": 26.0, + "learning_rate": 4.9983977115385505e-06, + "loss": 2.3810715675354004, + "step": 290 + }, + { + "epoch": 0.05315372713206517, + "grad_norm": 60.0, + "learning_rate": 4.998366903198323e-06, + "loss": 2.1565098762512207, + "step": 292 + }, + { + "epoch": 0.0535177937562574, + "grad_norm": 32.75, + "learning_rate": 4.9983358016058494e-06, + "loss": 1.9635474681854248, + "step": 294 + }, + { + "epoch": 0.05388186038044962, + "grad_norm": 6.5, + "learning_rate": 4.998304406765696e-06, + "loss": 1.1579265594482422, + "step": 296 + }, + { + "epoch": 0.05424592700464185, + "grad_norm": 10.5625, + "learning_rate": 4.99827271868247e-06, + "loss": 1.6695032119750977, + "step": 298 + }, + { + "epoch": 0.05460999362883408, + "grad_norm": 12.5625, + "learning_rate": 4.998240737360819e-06, + "loss": 1.888932228088379, + "step": 300 + }, + { + "epoch": 0.054974060253026306, + "grad_norm": 3.40625, + "learning_rate": 4.99820846280544e-06, + "loss": 1.097129225730896, + "step": 302 + }, + { + "epoch": 0.05533812687721853, + "grad_norm": 9.0, + "learning_rate": 4.998175895021066e-06, + "loss": 1.0700342655181885, + "step": 304 + }, + { + "epoch": 0.05570219350141076, + "grad_norm": 9.8125, + "learning_rate": 4.998143034012478e-06, + "loss": 1.1494324207305908, + "step": 306 + }, + { + "epoch": 0.056066260125602986, + "grad_norm": 12.5625, + "learning_rate": 4.998109879784496e-06, + "loss": 1.8598127365112305, + "step": 308 + }, + { + "epoch": 0.056430326749795215, + "grad_norm": 19.625, + "learning_rate": 4.998076432341988e-06, + "loss": 1.6065984964370728, + "step": 310 + }, + { + "epoch": 0.05679439337398744, + "grad_norm": 45.5, + "learning_rate": 4.998042691689862e-06, + "loss": 0.9418438673019409, + "step": 312 + }, + { + "epoch": 0.057158459998179666, + "grad_norm": 10.9375, + "learning_rate": 4.998008657833067e-06, + "loss": 1.579975962638855, + "step": 314 + }, + { + "epoch": 0.057522526622371895, + "grad_norm": 11.9375, + "learning_rate": 4.997974330776598e-06, + "loss": 1.1987617015838623, + "step": 316 + }, + { + "epoch": 0.05788659324656412, + "grad_norm": 18.0, + "learning_rate": 4.9979397105254945e-06, + "loss": 2.357426404953003, + "step": 318 + }, + { + "epoch": 0.058250659870756345, + "grad_norm": 13.9375, + "learning_rate": 4.997904797084835e-06, + "loss": 1.6827460527420044, + "step": 320 + }, + { + "epoch": 0.058614726494948574, + "grad_norm": 44.5, + "learning_rate": 4.997869590459743e-06, + "loss": 0.7791367173194885, + "step": 322 + }, + { + "epoch": 0.0589787931191408, + "grad_norm": 25.875, + "learning_rate": 4.997834090655385e-06, + "loss": 1.9080381393432617, + "step": 324 + }, + { + "epoch": 0.05934285974333303, + "grad_norm": 10.75, + "learning_rate": 4.9977982976769715e-06, + "loss": 1.536203145980835, + "step": 326 + }, + { + "epoch": 0.059706926367525254, + "grad_norm": 34.0, + "learning_rate": 4.997762211529754e-06, + "loss": 1.9274436235427856, + "step": 328 + }, + { + "epoch": 0.06007099299171748, + "grad_norm": 23.375, + "learning_rate": 4.9977258322190285e-06, + "loss": 2.073014736175537, + "step": 330 + }, + { + "epoch": 0.06043505961590971, + "grad_norm": 11.0625, + "learning_rate": 4.997689159750132e-06, + "loss": 1.7975918054580688, + "step": 332 + }, + { + "epoch": 0.06079912624010194, + "grad_norm": 10.0625, + "learning_rate": 4.997652194128449e-06, + "loss": 1.551814317703247, + "step": 334 + }, + { + "epoch": 0.06116319286429417, + "grad_norm": 15.375, + "learning_rate": 4.9976149353594e-06, + "loss": 1.4602867364883423, + "step": 336 + }, + { + "epoch": 0.06152725948848639, + "grad_norm": 20.125, + "learning_rate": 4.9975773834484565e-06, + "loss": 1.6254475116729736, + "step": 338 + }, + { + "epoch": 0.06189132611267862, + "grad_norm": 9.6875, + "learning_rate": 4.997539538401127e-06, + "loss": 1.5822265148162842, + "step": 340 + }, + { + "epoch": 0.06225539273687085, + "grad_norm": 7.3125, + "learning_rate": 4.997501400222966e-06, + "loss": 1.3375024795532227, + "step": 342 + }, + { + "epoch": 0.06261945936106307, + "grad_norm": 12.75, + "learning_rate": 4.99746296891957e-06, + "loss": 1.5867472887039185, + "step": 344 + }, + { + "epoch": 0.0629835259852553, + "grad_norm": 10.3125, + "learning_rate": 4.997424244496577e-06, + "loss": 1.6741087436676025, + "step": 346 + }, + { + "epoch": 0.06334759260944753, + "grad_norm": 27.875, + "learning_rate": 4.997385226959672e-06, + "loss": 2.0546278953552246, + "step": 348 + }, + { + "epoch": 0.06371165923363975, + "grad_norm": 13.5625, + "learning_rate": 4.997345916314578e-06, + "loss": 1.687022089958191, + "step": 350 + }, + { + "epoch": 0.06407572585783199, + "grad_norm": 12.875, + "learning_rate": 4.997306312567067e-06, + "loss": 1.7766467332839966, + "step": 352 + }, + { + "epoch": 0.06443979248202421, + "grad_norm": 4.34375, + "learning_rate": 4.997266415722949e-06, + "loss": 1.1123629808425903, + "step": 354 + }, + { + "epoch": 0.06480385910621644, + "grad_norm": 11.125, + "learning_rate": 4.997226225788078e-06, + "loss": 1.645906925201416, + "step": 356 + }, + { + "epoch": 0.06516792573040867, + "grad_norm": 14.875, + "learning_rate": 4.997185742768352e-06, + "loss": 1.8206772804260254, + "step": 358 + }, + { + "epoch": 0.06553199235460089, + "grad_norm": 11.0625, + "learning_rate": 4.997144966669713e-06, + "loss": 1.4990421533584595, + "step": 360 + }, + { + "epoch": 0.06589605897879312, + "grad_norm": 9.625, + "learning_rate": 4.997103897498144e-06, + "loss": 1.7342629432678223, + "step": 362 + }, + { + "epoch": 0.06626012560298535, + "grad_norm": 4.46875, + "learning_rate": 4.997062535259672e-06, + "loss": 1.2138992547988892, + "step": 364 + }, + { + "epoch": 0.06662419222717757, + "grad_norm": 3.765625, + "learning_rate": 4.997020879960365e-06, + "loss": 1.220954418182373, + "step": 366 + }, + { + "epoch": 0.0669882588513698, + "grad_norm": 4.125, + "learning_rate": 4.996978931606338e-06, + "loss": 1.4167884588241577, + "step": 368 + }, + { + "epoch": 0.06735232547556202, + "grad_norm": 4.9375, + "learning_rate": 4.996936690203746e-06, + "loss": 0.8898102641105652, + "step": 370 + }, + { + "epoch": 0.06771639209975426, + "grad_norm": 10.1875, + "learning_rate": 4.996894155758787e-06, + "loss": 1.6967730522155762, + "step": 372 + }, + { + "epoch": 0.06808045872394648, + "grad_norm": 13.0625, + "learning_rate": 4.996851328277703e-06, + "loss": 1.5613888502120972, + "step": 374 + }, + { + "epoch": 0.0684445253481387, + "grad_norm": 20.75, + "learning_rate": 4.99680820776678e-06, + "loss": 1.6336811780929565, + "step": 376 + }, + { + "epoch": 0.06880859197233094, + "grad_norm": 25.375, + "learning_rate": 4.996764794232344e-06, + "loss": 1.7506146430969238, + "step": 378 + }, + { + "epoch": 0.06917265859652316, + "grad_norm": 11.3125, + "learning_rate": 4.996721087680767e-06, + "loss": 1.9016811847686768, + "step": 380 + }, + { + "epoch": 0.06953672522071538, + "grad_norm": 18.625, + "learning_rate": 4.9966770881184625e-06, + "loss": 1.6010924577713013, + "step": 382 + }, + { + "epoch": 0.06990079184490762, + "grad_norm": 35.0, + "learning_rate": 4.996632795551887e-06, + "loss": 1.7810370922088623, + "step": 384 + }, + { + "epoch": 0.07026485846909984, + "grad_norm": 7.375, + "learning_rate": 4.996588209987541e-06, + "loss": 1.1643171310424805, + "step": 386 + }, + { + "epoch": 0.07062892509329208, + "grad_norm": 26.375, + "learning_rate": 4.996543331431966e-06, + "loss": 0.9697562456130981, + "step": 388 + }, + { + "epoch": 0.0709929917174843, + "grad_norm": 15.5, + "learning_rate": 4.996498159891748e-06, + "loss": 1.4735223054885864, + "step": 390 + }, + { + "epoch": 0.07135705834167652, + "grad_norm": 17.875, + "learning_rate": 4.996452695373517e-06, + "loss": 1.5924171209335327, + "step": 392 + }, + { + "epoch": 0.07172112496586876, + "grad_norm": 21.5, + "learning_rate": 4.996406937883944e-06, + "loss": 1.2968168258666992, + "step": 394 + }, + { + "epoch": 0.07208519159006098, + "grad_norm": 11.0, + "learning_rate": 4.996360887429743e-06, + "loss": 1.7380396127700806, + "step": 396 + }, + { + "epoch": 0.07244925821425321, + "grad_norm": 10.6875, + "learning_rate": 4.996314544017672e-06, + "loss": 1.675560474395752, + "step": 398 + }, + { + "epoch": 0.07281332483844544, + "grad_norm": 4.5625, + "learning_rate": 4.9962679076545325e-06, + "loss": 0.8685811161994934, + "step": 400 + }, + { + "epoch": 0.07317739146263766, + "grad_norm": 86.0, + "learning_rate": 4.9962209783471685e-06, + "loss": 2.0414555072784424, + "step": 402 + }, + { + "epoch": 0.0735414580868299, + "grad_norm": 21.75, + "learning_rate": 4.9961737561024645e-06, + "loss": 1.722549557685852, + "step": 404 + }, + { + "epoch": 0.07390552471102212, + "grad_norm": 7.0, + "learning_rate": 4.996126240927353e-06, + "loss": 1.5973738431930542, + "step": 406 + }, + { + "epoch": 0.07426959133521434, + "grad_norm": 8.0, + "learning_rate": 4.996078432828804e-06, + "loss": 1.7757899761199951, + "step": 408 + }, + { + "epoch": 0.07463365795940657, + "grad_norm": 15.375, + "learning_rate": 4.9960303318138345e-06, + "loss": 1.5636049509048462, + "step": 410 + }, + { + "epoch": 0.0749977245835988, + "grad_norm": 21.375, + "learning_rate": 4.995981937889503e-06, + "loss": 1.7934061288833618, + "step": 412 + }, + { + "epoch": 0.07536179120779103, + "grad_norm": 7.15625, + "learning_rate": 4.995933251062911e-06, + "loss": 1.1408581733703613, + "step": 414 + }, + { + "epoch": 0.07572585783198325, + "grad_norm": 60.5, + "learning_rate": 4.9958842713412045e-06, + "loss": 1.3937311172485352, + "step": 416 + }, + { + "epoch": 0.07608992445617548, + "grad_norm": 24.25, + "learning_rate": 4.9958349987315694e-06, + "loss": 2.245112895965576, + "step": 418 + }, + { + "epoch": 0.07645399108036771, + "grad_norm": 14.4375, + "learning_rate": 4.9957854332412355e-06, + "loss": 1.5475777387619019, + "step": 420 + }, + { + "epoch": 0.07681805770455993, + "grad_norm": 26.125, + "learning_rate": 4.995735574877479e-06, + "loss": 1.450985074043274, + "step": 422 + }, + { + "epoch": 0.07718212432875216, + "grad_norm": 10.5, + "learning_rate": 4.995685423647614e-06, + "loss": 1.5756415128707886, + "step": 424 + }, + { + "epoch": 0.07754619095294439, + "grad_norm": 3.6875, + "learning_rate": 4.995634979559001e-06, + "loss": 1.1469438076019287, + "step": 426 + }, + { + "epoch": 0.07791025757713661, + "grad_norm": 13.75, + "learning_rate": 4.995584242619042e-06, + "loss": 1.561311960220337, + "step": 428 + }, + { + "epoch": 0.07827432420132885, + "grad_norm": 24.625, + "learning_rate": 4.995533212835183e-06, + "loss": 1.953911542892456, + "step": 430 + }, + { + "epoch": 0.07863839082552107, + "grad_norm": 17.75, + "learning_rate": 4.995481890214912e-06, + "loss": 1.5231664180755615, + "step": 432 + }, + { + "epoch": 0.07900245744971329, + "grad_norm": 10.0625, + "learning_rate": 4.995430274765761e-06, + "loss": 1.6239662170410156, + "step": 434 + }, + { + "epoch": 0.07936652407390553, + "grad_norm": 9.3125, + "learning_rate": 4.9953783664953035e-06, + "loss": 1.6825274229049683, + "step": 436 + }, + { + "epoch": 0.07973059069809775, + "grad_norm": 11.875, + "learning_rate": 4.995326165411158e-06, + "loss": 1.6108781099319458, + "step": 438 + }, + { + "epoch": 0.08009465732228997, + "grad_norm": 16.375, + "learning_rate": 4.995273671520984e-06, + "loss": 1.752515196800232, + "step": 440 + }, + { + "epoch": 0.08045872394648221, + "grad_norm": 2.5625, + "learning_rate": 4.995220884832484e-06, + "loss": 1.2974646091461182, + "step": 442 + }, + { + "epoch": 0.08082279057067443, + "grad_norm": 23.375, + "learning_rate": 4.995167805353406e-06, + "loss": 2.2482070922851562, + "step": 444 + }, + { + "epoch": 0.08118685719486667, + "grad_norm": 8.0625, + "learning_rate": 4.995114433091538e-06, + "loss": 1.6542468070983887, + "step": 446 + }, + { + "epoch": 0.08155092381905889, + "grad_norm": 6.65625, + "learning_rate": 4.995060768054711e-06, + "loss": 0.9779666066169739, + "step": 448 + }, + { + "epoch": 0.08191499044325111, + "grad_norm": 14.625, + "learning_rate": 4.995006810250804e-06, + "loss": 1.1051747798919678, + "step": 450 + }, + { + "epoch": 0.08227905706744335, + "grad_norm": 8.5, + "learning_rate": 4.9949525596877315e-06, + "loss": 1.5198476314544678, + "step": 452 + }, + { + "epoch": 0.08264312369163557, + "grad_norm": 32.75, + "learning_rate": 4.994898016373455e-06, + "loss": 2.0264604091644287, + "step": 454 + }, + { + "epoch": 0.0830071903158278, + "grad_norm": 12.875, + "learning_rate": 4.99484318031598e-06, + "loss": 1.7488818168640137, + "step": 456 + }, + { + "epoch": 0.08337125694002002, + "grad_norm": 19.75, + "learning_rate": 4.994788051523353e-06, + "loss": 0.8572578430175781, + "step": 458 + }, + { + "epoch": 0.08373532356421225, + "grad_norm": 10.5, + "learning_rate": 4.994732630003663e-06, + "loss": 1.4364445209503174, + "step": 460 + }, + { + "epoch": 0.08409939018840448, + "grad_norm": 73.5, + "learning_rate": 4.994676915765044e-06, + "loss": 2.2151713371276855, + "step": 462 + }, + { + "epoch": 0.0844634568125967, + "grad_norm": 11.0625, + "learning_rate": 4.994620908815672e-06, + "loss": 1.579387903213501, + "step": 464 + }, + { + "epoch": 0.08482752343678893, + "grad_norm": 22.0, + "learning_rate": 4.994564609163763e-06, + "loss": 2.0482544898986816, + "step": 466 + }, + { + "epoch": 0.08519159006098116, + "grad_norm": 20.25, + "learning_rate": 4.994508016817582e-06, + "loss": 1.6465792655944824, + "step": 468 + }, + { + "epoch": 0.08555565668517338, + "grad_norm": 28.75, + "learning_rate": 4.9944511317854325e-06, + "loss": 1.4567583799362183, + "step": 470 + }, + { + "epoch": 0.08591972330936562, + "grad_norm": 20.375, + "learning_rate": 4.994393954075663e-06, + "loss": 1.0225732326507568, + "step": 472 + }, + { + "epoch": 0.08628378993355784, + "grad_norm": 12.5625, + "learning_rate": 4.994336483696663e-06, + "loss": 1.2597174644470215, + "step": 474 + }, + { + "epoch": 0.08664785655775006, + "grad_norm": 10.625, + "learning_rate": 4.994278720656865e-06, + "loss": 0.9398125410079956, + "step": 476 + }, + { + "epoch": 0.0870119231819423, + "grad_norm": 4.3125, + "learning_rate": 4.9942206649647485e-06, + "loss": 1.1096361875534058, + "step": 478 + }, + { + "epoch": 0.08737598980613452, + "grad_norm": 11.3125, + "learning_rate": 4.99416231662883e-06, + "loss": 1.516744613647461, + "step": 480 + }, + { + "epoch": 0.08774005643032674, + "grad_norm": 8.375, + "learning_rate": 4.9941036756576746e-06, + "loss": 1.6033967733383179, + "step": 482 + }, + { + "epoch": 0.08810412305451898, + "grad_norm": 48.5, + "learning_rate": 4.994044742059885e-06, + "loss": 1.4961059093475342, + "step": 484 + }, + { + "epoch": 0.0884681896787112, + "grad_norm": 30.625, + "learning_rate": 4.993985515844111e-06, + "loss": 1.3461837768554688, + "step": 486 + }, + { + "epoch": 0.08883225630290344, + "grad_norm": 25.625, + "learning_rate": 4.993925997019044e-06, + "loss": 1.5667678117752075, + "step": 488 + }, + { + "epoch": 0.08919632292709566, + "grad_norm": 8.875, + "learning_rate": 4.993866185593417e-06, + "loss": 1.6020535230636597, + "step": 490 + }, + { + "epoch": 0.08956038955128788, + "grad_norm": 21.75, + "learning_rate": 4.993806081576007e-06, + "loss": 2.1408867835998535, + "step": 492 + }, + { + "epoch": 0.08992445617548012, + "grad_norm": 20.125, + "learning_rate": 4.993745684975636e-06, + "loss": 1.9854199886322021, + "step": 494 + }, + { + "epoch": 0.09028852279967234, + "grad_norm": 24.875, + "learning_rate": 4.9936849958011645e-06, + "loss": 2.068175792694092, + "step": 496 + }, + { + "epoch": 0.09065258942386457, + "grad_norm": 14.5, + "learning_rate": 4.993624014061501e-06, + "loss": 0.9099866151809692, + "step": 498 + }, + { + "epoch": 0.0910166560480568, + "grad_norm": 7.4375, + "learning_rate": 4.993562739765593e-06, + "loss": 1.1208518743515015, + "step": 500 + }, + { + "epoch": 0.09138072267224902, + "grad_norm": 20.75, + "learning_rate": 4.99350117292243e-06, + "loss": 1.3898968696594238, + "step": 502 + }, + { + "epoch": 0.09174478929644125, + "grad_norm": 6.0, + "learning_rate": 4.993439313541051e-06, + "loss": 0.9804521799087524, + "step": 504 + }, + { + "epoch": 0.09210885592063348, + "grad_norm": 7.65625, + "learning_rate": 4.9933771616305304e-06, + "loss": 1.688679575920105, + "step": 506 + }, + { + "epoch": 0.0924729225448257, + "grad_norm": 20.25, + "learning_rate": 4.99331471719999e-06, + "loss": 0.8258498311042786, + "step": 508 + }, + { + "epoch": 0.09283698916901793, + "grad_norm": 18.125, + "learning_rate": 4.993251980258592e-06, + "loss": 1.9254039525985718, + "step": 510 + }, + { + "epoch": 0.09320105579321016, + "grad_norm": 8.0, + "learning_rate": 4.993188950815545e-06, + "loss": 1.6451748609542847, + "step": 512 + }, + { + "epoch": 0.09356512241740239, + "grad_norm": 12.375, + "learning_rate": 4.993125628880098e-06, + "loss": 1.6198679208755493, + "step": 514 + }, + { + "epoch": 0.09392918904159461, + "grad_norm": 15.6875, + "learning_rate": 4.993062014461542e-06, + "loss": 1.0850889682769775, + "step": 516 + }, + { + "epoch": 0.09429325566578683, + "grad_norm": 8.9375, + "learning_rate": 4.9929981075692115e-06, + "loss": 1.705244779586792, + "step": 518 + }, + { + "epoch": 0.09465732228997907, + "grad_norm": 9.9375, + "learning_rate": 4.992933908212485e-06, + "loss": 1.6756541728973389, + "step": 520 + }, + { + "epoch": 0.09502138891417129, + "grad_norm": 15.5, + "learning_rate": 4.992869416400785e-06, + "loss": 1.7981197834014893, + "step": 522 + }, + { + "epoch": 0.09538545553836351, + "grad_norm": 37.0, + "learning_rate": 4.992804632143575e-06, + "loss": 2.236433982849121, + "step": 524 + }, + { + "epoch": 0.09574952216255575, + "grad_norm": 14.8125, + "learning_rate": 4.992739555450361e-06, + "loss": 1.0820854902267456, + "step": 526 + }, + { + "epoch": 0.09611358878674797, + "grad_norm": 8.3125, + "learning_rate": 4.992674186330694e-06, + "loss": 1.7297701835632324, + "step": 528 + }, + { + "epoch": 0.09647765541094021, + "grad_norm": 14.3125, + "learning_rate": 4.992608524794165e-06, + "loss": 1.5572032928466797, + "step": 530 + }, + { + "epoch": 0.09684172203513243, + "grad_norm": 5.84375, + "learning_rate": 4.99254257085041e-06, + "loss": 1.0914921760559082, + "step": 532 + }, + { + "epoch": 0.09720578865932465, + "grad_norm": 5.90625, + "learning_rate": 4.992476324509108e-06, + "loss": 1.2033464908599854, + "step": 534 + }, + { + "epoch": 0.09756985528351689, + "grad_norm": 115.5, + "learning_rate": 4.99240978577998e-06, + "loss": 1.4244115352630615, + "step": 536 + }, + { + "epoch": 0.09793392190770911, + "grad_norm": 9.125, + "learning_rate": 4.992342954672791e-06, + "loss": 1.5869569778442383, + "step": 538 + }, + { + "epoch": 0.09829798853190133, + "grad_norm": 7.21875, + "learning_rate": 4.992275831197347e-06, + "loss": 1.1813812255859375, + "step": 540 + }, + { + "epoch": 0.09866205515609357, + "grad_norm": 9.1875, + "learning_rate": 4.9922084153635e-06, + "loss": 1.3222036361694336, + "step": 542 + }, + { + "epoch": 0.09902612178028579, + "grad_norm": 7.8125, + "learning_rate": 4.99214070718114e-06, + "loss": 1.5701090097427368, + "step": 544 + }, + { + "epoch": 0.09939018840447802, + "grad_norm": 11.4375, + "learning_rate": 4.992072706660206e-06, + "loss": 1.4453173875808716, + "step": 546 + }, + { + "epoch": 0.09975425502867025, + "grad_norm": 4.78125, + "learning_rate": 4.9920044138106745e-06, + "loss": 1.21589195728302, + "step": 548 + }, + { + "epoch": 0.10011832165286247, + "grad_norm": 17.25, + "learning_rate": 4.991935828642569e-06, + "loss": 1.4480749368667603, + "step": 550 + }, + { + "epoch": 0.1004823882770547, + "grad_norm": 59.75, + "learning_rate": 4.991866951165954e-06, + "loss": 0.8703964948654175, + "step": 552 + }, + { + "epoch": 0.10084645490124693, + "grad_norm": 17.375, + "learning_rate": 4.991797781390935e-06, + "loss": 1.4682190418243408, + "step": 554 + }, + { + "epoch": 0.10121052152543916, + "grad_norm": 10.75, + "learning_rate": 4.991728319327664e-06, + "loss": 1.9051607847213745, + "step": 556 + }, + { + "epoch": 0.10157458814963138, + "grad_norm": 15.25, + "learning_rate": 4.9916585649863335e-06, + "loss": 1.9129139184951782, + "step": 558 + }, + { + "epoch": 0.1019386547738236, + "grad_norm": 12.625, + "learning_rate": 4.99158851837718e-06, + "loss": 1.4913318157196045, + "step": 560 + }, + { + "epoch": 0.10230272139801584, + "grad_norm": 28.75, + "learning_rate": 4.991518179510483e-06, + "loss": 1.8028734922409058, + "step": 562 + }, + { + "epoch": 0.10266678802220806, + "grad_norm": 19.0, + "learning_rate": 4.991447548396564e-06, + "loss": 1.1229168176651, + "step": 564 + }, + { + "epoch": 0.10303085464640029, + "grad_norm": 22.875, + "learning_rate": 4.9913766250457885e-06, + "loss": 1.4349894523620605, + "step": 566 + }, + { + "epoch": 0.10339492127059252, + "grad_norm": 9.5, + "learning_rate": 4.991305409468562e-06, + "loss": 1.4226398468017578, + "step": 568 + }, + { + "epoch": 0.10375898789478474, + "grad_norm": 7.1875, + "learning_rate": 4.9912339016753375e-06, + "loss": 0.8516451120376587, + "step": 570 + }, + { + "epoch": 0.10412305451897698, + "grad_norm": 51.0, + "learning_rate": 4.9911621016766085e-06, + "loss": 1.5673589706420898, + "step": 572 + }, + { + "epoch": 0.1044871211431692, + "grad_norm": 9.75, + "learning_rate": 4.99109000948291e-06, + "loss": 1.6054155826568604, + "step": 574 + }, + { + "epoch": 0.10485118776736142, + "grad_norm": 18.375, + "learning_rate": 4.991017625104821e-06, + "loss": 1.466247797012329, + "step": 576 + }, + { + "epoch": 0.10521525439155366, + "grad_norm": 7.625, + "learning_rate": 4.990944948552966e-06, + "loss": 1.7692545652389526, + "step": 578 + }, + { + "epoch": 0.10557932101574588, + "grad_norm": 28.875, + "learning_rate": 4.990871979838008e-06, + "loss": 2.2233469486236572, + "step": 580 + }, + { + "epoch": 0.1059433876399381, + "grad_norm": 11.0, + "learning_rate": 4.990798718970654e-06, + "loss": 1.4929414987564087, + "step": 582 + }, + { + "epoch": 0.10630745426413034, + "grad_norm": 10.0, + "learning_rate": 4.990725165961658e-06, + "loss": 1.7463886737823486, + "step": 584 + }, + { + "epoch": 0.10667152088832256, + "grad_norm": 15.5, + "learning_rate": 4.99065132082181e-06, + "loss": 1.547139286994934, + "step": 586 + }, + { + "epoch": 0.1070355875125148, + "grad_norm": 14.75, + "learning_rate": 4.990577183561949e-06, + "loss": 1.3375365734100342, + "step": 588 + }, + { + "epoch": 0.10739965413670702, + "grad_norm": 6.59375, + "learning_rate": 4.990502754192952e-06, + "loss": 1.1098231077194214, + "step": 590 + }, + { + "epoch": 0.10776372076089924, + "grad_norm": 25.375, + "learning_rate": 4.9904280327257435e-06, + "loss": 1.6693047285079956, + "step": 592 + }, + { + "epoch": 0.10812778738509148, + "grad_norm": 2.796875, + "learning_rate": 4.9903530191712875e-06, + "loss": 1.3150269985198975, + "step": 594 + }, + { + "epoch": 0.1084918540092837, + "grad_norm": 97.0, + "learning_rate": 4.990277713540594e-06, + "loss": 1.7677597999572754, + "step": 596 + }, + { + "epoch": 0.10885592063347592, + "grad_norm": 21.75, + "learning_rate": 4.990202115844709e-06, + "loss": 1.8721543550491333, + "step": 598 + }, + { + "epoch": 0.10921998725766816, + "grad_norm": 7.59375, + "learning_rate": 4.99012622609473e-06, + "loss": 1.2447410821914673, + "step": 600 + }, + { + "epoch": 0.10958405388186038, + "grad_norm": 13.25, + "learning_rate": 4.990050044301794e-06, + "loss": 1.3232799768447876, + "step": 602 + }, + { + "epoch": 0.10994812050605261, + "grad_norm": 9.125, + "learning_rate": 4.989973570477078e-06, + "loss": 1.4806007146835327, + "step": 604 + }, + { + "epoch": 0.11031218713024483, + "grad_norm": 9.125, + "learning_rate": 4.9898968046318045e-06, + "loss": 1.5398962497711182, + "step": 606 + }, + { + "epoch": 0.11067625375443706, + "grad_norm": 15.0625, + "learning_rate": 4.98981974677724e-06, + "loss": 1.6886669397354126, + "step": 608 + }, + { + "epoch": 0.11104032037862929, + "grad_norm": 9.25, + "learning_rate": 4.989742396924691e-06, + "loss": 1.5922999382019043, + "step": 610 + }, + { + "epoch": 0.11140438700282151, + "grad_norm": 11.5625, + "learning_rate": 4.9896647550855105e-06, + "loss": 1.2320183515548706, + "step": 612 + }, + { + "epoch": 0.11176845362701375, + "grad_norm": 6.96875, + "learning_rate": 4.9895868212710895e-06, + "loss": 1.351425051689148, + "step": 614 + }, + { + "epoch": 0.11213252025120597, + "grad_norm": 10.6875, + "learning_rate": 4.989508595492866e-06, + "loss": 1.5891644954681396, + "step": 616 + }, + { + "epoch": 0.1124965868753982, + "grad_norm": 3.8125, + "learning_rate": 4.989430077762318e-06, + "loss": 1.2235267162322998, + "step": 618 + }, + { + "epoch": 0.11286065349959043, + "grad_norm": 10.875, + "learning_rate": 4.9893512680909695e-06, + "loss": 1.5127015113830566, + "step": 620 + }, + { + "epoch": 0.11322472012378265, + "grad_norm": 12.4375, + "learning_rate": 4.9892721664903845e-06, + "loss": 1.6188157796859741, + "step": 622 + }, + { + "epoch": 0.11358878674797487, + "grad_norm": 13.0, + "learning_rate": 4.98919277297217e-06, + "loss": 1.4947715997695923, + "step": 624 + }, + { + "epoch": 0.11395285337216711, + "grad_norm": 24.375, + "learning_rate": 4.989113087547979e-06, + "loss": 1.9253239631652832, + "step": 626 + }, + { + "epoch": 0.11431691999635933, + "grad_norm": 24.125, + "learning_rate": 4.989033110229502e-06, + "loss": 1.6219408512115479, + "step": 628 + }, + { + "epoch": 0.11468098662055157, + "grad_norm": 6.6875, + "learning_rate": 4.9889528410284785e-06, + "loss": 1.6991894245147705, + "step": 630 + }, + { + "epoch": 0.11504505324474379, + "grad_norm": 21.625, + "learning_rate": 4.9888722799566845e-06, + "loss": 1.488581895828247, + "step": 632 + }, + { + "epoch": 0.11540911986893601, + "grad_norm": 32.75, + "learning_rate": 4.988791427025944e-06, + "loss": 1.9470293521881104, + "step": 634 + }, + { + "epoch": 0.11577318649312825, + "grad_norm": 8.1875, + "learning_rate": 4.988710282248122e-06, + "loss": 1.5553927421569824, + "step": 636 + }, + { + "epoch": 0.11613725311732047, + "grad_norm": 9.8125, + "learning_rate": 4.988628845635125e-06, + "loss": 1.617289423942566, + "step": 638 + }, + { + "epoch": 0.11650131974151269, + "grad_norm": 13.0625, + "learning_rate": 4.988547117198906e-06, + "loss": 1.654571294784546, + "step": 640 + }, + { + "epoch": 0.11686538636570493, + "grad_norm": 22.75, + "learning_rate": 4.9884650969514545e-06, + "loss": 1.599797010421753, + "step": 642 + }, + { + "epoch": 0.11722945298989715, + "grad_norm": 9.875, + "learning_rate": 4.98838278490481e-06, + "loss": 1.4296228885650635, + "step": 644 + }, + { + "epoch": 0.11759351961408938, + "grad_norm": 10.5625, + "learning_rate": 4.988300181071047e-06, + "loss": 1.5124624967575073, + "step": 646 + }, + { + "epoch": 0.1179575862382816, + "grad_norm": 12.25, + "learning_rate": 4.9882172854622935e-06, + "loss": 1.621030569076538, + "step": 648 + }, + { + "epoch": 0.11832165286247383, + "grad_norm": 26.25, + "learning_rate": 4.988134098090709e-06, + "loss": 0.9525178670883179, + "step": 650 + }, + { + "epoch": 0.11868571948666606, + "grad_norm": 24.875, + "learning_rate": 4.988050618968504e-06, + "loss": 1.5937832593917847, + "step": 652 + }, + { + "epoch": 0.11904978611085829, + "grad_norm": 22.5, + "learning_rate": 4.987966848107927e-06, + "loss": 2.128664016723633, + "step": 654 + }, + { + "epoch": 0.11941385273505051, + "grad_norm": 7.1875, + "learning_rate": 4.9878827855212715e-06, + "loss": 1.0092219114303589, + "step": 656 + }, + { + "epoch": 0.11977791935924274, + "grad_norm": 28.375, + "learning_rate": 4.987798431220874e-06, + "loss": 1.005147099494934, + "step": 658 + }, + { + "epoch": 0.12014198598343497, + "grad_norm": 9.75, + "learning_rate": 4.987713785219111e-06, + "loss": 0.8914632797241211, + "step": 660 + }, + { + "epoch": 0.1205060526076272, + "grad_norm": 3.8125, + "learning_rate": 4.9876288475284076e-06, + "loss": 0.9911003112792969, + "step": 662 + }, + { + "epoch": 0.12087011923181942, + "grad_norm": 11.1875, + "learning_rate": 4.987543618161225e-06, + "loss": 1.546281337738037, + "step": 664 + }, + { + "epoch": 0.12123418585601164, + "grad_norm": 10.0, + "learning_rate": 4.987458097130071e-06, + "loss": 1.531376838684082, + "step": 666 + }, + { + "epoch": 0.12159825248020388, + "grad_norm": 18.875, + "learning_rate": 4.987372284447496e-06, + "loss": 1.6790237426757812, + "step": 668 + }, + { + "epoch": 0.1219623191043961, + "grad_norm": 9.0625, + "learning_rate": 4.987286180126093e-06, + "loss": 1.620091199874878, + "step": 670 + }, + { + "epoch": 0.12232638572858834, + "grad_norm": 19.125, + "learning_rate": 4.987199784178496e-06, + "loss": 2.202967405319214, + "step": 672 + }, + { + "epoch": 0.12269045235278056, + "grad_norm": 6.9375, + "learning_rate": 4.987113096617384e-06, + "loss": 1.178342580795288, + "step": 674 + }, + { + "epoch": 0.12305451897697278, + "grad_norm": 16.375, + "learning_rate": 4.987026117455479e-06, + "loss": 1.2810454368591309, + "step": 676 + }, + { + "epoch": 0.12341858560116502, + "grad_norm": 16.25, + "learning_rate": 4.986938846705544e-06, + "loss": 1.3532731533050537, + "step": 678 + }, + { + "epoch": 0.12378265222535724, + "grad_norm": 9.75, + "learning_rate": 4.986851284380384e-06, + "loss": 1.5397684574127197, + "step": 680 + }, + { + "epoch": 0.12414671884954946, + "grad_norm": 8.8125, + "learning_rate": 4.986763430492851e-06, + "loss": 1.404807209968567, + "step": 682 + }, + { + "epoch": 0.1245107854737417, + "grad_norm": 13.625, + "learning_rate": 4.9866752850558365e-06, + "loss": 0.9748321771621704, + "step": 684 + }, + { + "epoch": 0.12487485209793392, + "grad_norm": 8.5, + "learning_rate": 4.986586848082274e-06, + "loss": 1.6022968292236328, + "step": 686 + }, + { + "epoch": 0.12523891872212614, + "grad_norm": 10.375, + "learning_rate": 4.986498119585145e-06, + "loss": 1.5184335708618164, + "step": 688 + }, + { + "epoch": 0.12560298534631836, + "grad_norm": 8.1875, + "learning_rate": 4.986409099577465e-06, + "loss": 1.596593976020813, + "step": 690 + }, + { + "epoch": 0.1259670519705106, + "grad_norm": 8.4375, + "learning_rate": 4.986319788072301e-06, + "loss": 1.2138594388961792, + "step": 692 + }, + { + "epoch": 0.12633111859470283, + "grad_norm": 4.9375, + "learning_rate": 4.986230185082758e-06, + "loss": 1.2089152336120605, + "step": 694 + }, + { + "epoch": 0.12669518521889506, + "grad_norm": 19.625, + "learning_rate": 4.986140290621985e-06, + "loss": 1.5043275356292725, + "step": 696 + }, + { + "epoch": 0.12705925184308728, + "grad_norm": 7.78125, + "learning_rate": 4.986050104703173e-06, + "loss": 1.502524971961975, + "step": 698 + }, + { + "epoch": 0.1274233184672795, + "grad_norm": 14.875, + "learning_rate": 4.985959627339556e-06, + "loss": 2.0593042373657227, + "step": 700 + }, + { + "epoch": 0.12778738509147175, + "grad_norm": 15.875, + "learning_rate": 4.985868858544413e-06, + "loss": 1.4950681924819946, + "step": 702 + }, + { + "epoch": 0.12815145171566397, + "grad_norm": 7.8125, + "learning_rate": 4.985777798331063e-06, + "loss": 1.127295732498169, + "step": 704 + }, + { + "epoch": 0.1285155183398562, + "grad_norm": 8.4375, + "learning_rate": 4.9856864467128694e-06, + "loss": 1.2744174003601074, + "step": 706 + }, + { + "epoch": 0.12887958496404842, + "grad_norm": 29.875, + "learning_rate": 4.9855948037032365e-06, + "loss": 1.8374284505844116, + "step": 708 + }, + { + "epoch": 0.12924365158824064, + "grad_norm": 7.46875, + "learning_rate": 4.985502869315613e-06, + "loss": 0.994347095489502, + "step": 710 + }, + { + "epoch": 0.1296077182124329, + "grad_norm": 6.75, + "learning_rate": 4.98541064356349e-06, + "loss": 1.5544590950012207, + "step": 712 + }, + { + "epoch": 0.1299717848366251, + "grad_norm": 18.875, + "learning_rate": 4.985318126460401e-06, + "loss": 0.7793824672698975, + "step": 714 + }, + { + "epoch": 0.13033585146081733, + "grad_norm": 7.625, + "learning_rate": 4.985225318019923e-06, + "loss": 1.461126446723938, + "step": 716 + }, + { + "epoch": 0.13069991808500955, + "grad_norm": 23.25, + "learning_rate": 4.985132218255675e-06, + "loss": 1.0607283115386963, + "step": 718 + }, + { + "epoch": 0.13106398470920178, + "grad_norm": 15.0, + "learning_rate": 4.9850388271813185e-06, + "loss": 1.913686990737915, + "step": 720 + }, + { + "epoch": 0.13142805133339402, + "grad_norm": 11.625, + "learning_rate": 4.984945144810559e-06, + "loss": 1.60567307472229, + "step": 722 + }, + { + "epoch": 0.13179211795758625, + "grad_norm": 9.125, + "learning_rate": 4.9848511711571444e-06, + "loss": 1.5132880210876465, + "step": 724 + }, + { + "epoch": 0.13215618458177847, + "grad_norm": 10.8125, + "learning_rate": 4.984756906234863e-06, + "loss": 1.8271973133087158, + "step": 726 + }, + { + "epoch": 0.1325202512059707, + "grad_norm": 15.625, + "learning_rate": 4.984662350057551e-06, + "loss": 1.8523712158203125, + "step": 728 + }, + { + "epoch": 0.1328843178301629, + "grad_norm": 12.875, + "learning_rate": 4.984567502639082e-06, + "loss": 0.896437406539917, + "step": 730 + }, + { + "epoch": 0.13324838445435513, + "grad_norm": 4.28125, + "learning_rate": 4.984472363993373e-06, + "loss": 1.0899426937103271, + "step": 732 + }, + { + "epoch": 0.13361245107854738, + "grad_norm": 11.625, + "learning_rate": 4.984376934134388e-06, + "loss": 1.4315885305404663, + "step": 734 + }, + { + "epoch": 0.1339765177027396, + "grad_norm": 6.5625, + "learning_rate": 4.98428121307613e-06, + "loss": 1.5829685926437378, + "step": 736 + }, + { + "epoch": 0.13434058432693183, + "grad_norm": 12.375, + "learning_rate": 4.984185200832645e-06, + "loss": 1.468256950378418, + "step": 738 + }, + { + "epoch": 0.13470465095112405, + "grad_norm": 12.75, + "learning_rate": 4.984088897418024e-06, + "loss": 2.0691981315612793, + "step": 740 + }, + { + "epoch": 0.13506871757531627, + "grad_norm": 8.125, + "learning_rate": 4.983992302846398e-06, + "loss": 1.5110063552856445, + "step": 742 + }, + { + "epoch": 0.13543278419950852, + "grad_norm": 9.1875, + "learning_rate": 4.983895417131941e-06, + "loss": 1.5119638442993164, + "step": 744 + }, + { + "epoch": 0.13579685082370074, + "grad_norm": 60.25, + "learning_rate": 4.983798240288872e-06, + "loss": 0.860778272151947, + "step": 746 + }, + { + "epoch": 0.13616091744789297, + "grad_norm": 34.75, + "learning_rate": 4.983700772331451e-06, + "loss": 1.487378478050232, + "step": 748 + }, + { + "epoch": 0.1365249840720852, + "grad_norm": 8.0625, + "learning_rate": 4.983603013273981e-06, + "loss": 1.2067204713821411, + "step": 750 + }, + { + "epoch": 0.1368890506962774, + "grad_norm": 18.625, + "learning_rate": 4.9835049631308074e-06, + "loss": 1.641838788986206, + "step": 752 + }, + { + "epoch": 0.13725311732046966, + "grad_norm": 14.5625, + "learning_rate": 4.983406621916319e-06, + "loss": 1.5795860290527344, + "step": 754 + }, + { + "epoch": 0.13761718394466188, + "grad_norm": 9.3125, + "learning_rate": 4.983307989644946e-06, + "loss": 1.4934535026550293, + "step": 756 + }, + { + "epoch": 0.1379812505688541, + "grad_norm": 7.25, + "learning_rate": 4.983209066331165e-06, + "loss": 1.2594249248504639, + "step": 758 + }, + { + "epoch": 0.13834531719304632, + "grad_norm": 6.75, + "learning_rate": 4.9831098519894895e-06, + "loss": 1.5626554489135742, + "step": 760 + }, + { + "epoch": 0.13870938381723855, + "grad_norm": 5.0625, + "learning_rate": 4.983010346634481e-06, + "loss": 1.1138370037078857, + "step": 762 + }, + { + "epoch": 0.13907345044143077, + "grad_norm": 6.96875, + "learning_rate": 4.9829105502807395e-06, + "loss": 1.5185916423797607, + "step": 764 + }, + { + "epoch": 0.13943751706562302, + "grad_norm": 7.65625, + "learning_rate": 4.982810462942911e-06, + "loss": 1.6079630851745605, + "step": 766 + }, + { + "epoch": 0.13980158368981524, + "grad_norm": 19.375, + "learning_rate": 4.982710084635683e-06, + "loss": 1.459214210510254, + "step": 768 + }, + { + "epoch": 0.14016565031400746, + "grad_norm": 7.875, + "learning_rate": 4.982609415373785e-06, + "loss": 1.5401808023452759, + "step": 770 + }, + { + "epoch": 0.14052971693819968, + "grad_norm": 10.0, + "learning_rate": 4.98250845517199e-06, + "loss": 1.5686360597610474, + "step": 772 + }, + { + "epoch": 0.1408937835623919, + "grad_norm": 12.75, + "learning_rate": 4.982407204045114e-06, + "loss": 1.478764295578003, + "step": 774 + }, + { + "epoch": 0.14125785018658416, + "grad_norm": 11.0, + "learning_rate": 4.982305662008015e-06, + "loss": 1.5797213315963745, + "step": 776 + }, + { + "epoch": 0.14162191681077638, + "grad_norm": 18.125, + "learning_rate": 4.982203829075594e-06, + "loss": 1.4691228866577148, + "step": 778 + }, + { + "epoch": 0.1419859834349686, + "grad_norm": 15.6875, + "learning_rate": 4.982101705262793e-06, + "loss": 1.740663766860962, + "step": 780 + }, + { + "epoch": 0.14235005005916082, + "grad_norm": 14.1875, + "learning_rate": 4.981999290584601e-06, + "loss": 2.018289566040039, + "step": 782 + }, + { + "epoch": 0.14271411668335304, + "grad_norm": 40.25, + "learning_rate": 4.981896585056044e-06, + "loss": 1.7228575944900513, + "step": 784 + }, + { + "epoch": 0.1430781833075453, + "grad_norm": 11.375, + "learning_rate": 4.981793588692196e-06, + "loss": 1.2557510137557983, + "step": 786 + }, + { + "epoch": 0.14344224993173751, + "grad_norm": 5.625, + "learning_rate": 4.981690301508169e-06, + "loss": 1.0434479713439941, + "step": 788 + }, + { + "epoch": 0.14380631655592974, + "grad_norm": 9.0, + "learning_rate": 4.981586723519123e-06, + "loss": 1.5941314697265625, + "step": 790 + }, + { + "epoch": 0.14417038318012196, + "grad_norm": 24.25, + "learning_rate": 4.981482854740255e-06, + "loss": 1.5563713312149048, + "step": 792 + }, + { + "epoch": 0.14453444980431418, + "grad_norm": 47.5, + "learning_rate": 4.981378695186808e-06, + "loss": 1.399549961090088, + "step": 794 + }, + { + "epoch": 0.14489851642850643, + "grad_norm": 5.25, + "learning_rate": 4.981274244874069e-06, + "loss": 1.1914955377578735, + "step": 796 + }, + { + "epoch": 0.14526258305269865, + "grad_norm": 30.25, + "learning_rate": 4.981169503817362e-06, + "loss": 2.3184032440185547, + "step": 798 + }, + { + "epoch": 0.14562664967689087, + "grad_norm": 6.9375, + "learning_rate": 4.981064472032061e-06, + "loss": 1.563476324081421, + "step": 800 + }, + { + "epoch": 0.1459907163010831, + "grad_norm": 7.21875, + "learning_rate": 4.980959149533576e-06, + "loss": 1.2146525382995605, + "step": 802 + }, + { + "epoch": 0.14635478292527532, + "grad_norm": 2.734375, + "learning_rate": 4.980853536337366e-06, + "loss": 1.3861656188964844, + "step": 804 + }, + { + "epoch": 0.14671884954946754, + "grad_norm": 59.25, + "learning_rate": 4.9807476324589246e-06, + "loss": 1.2269115447998047, + "step": 806 + }, + { + "epoch": 0.1470829161736598, + "grad_norm": 37.0, + "learning_rate": 4.980641437913797e-06, + "loss": 2.0585579872131348, + "step": 808 + }, + { + "epoch": 0.147446982797852, + "grad_norm": 11.0625, + "learning_rate": 4.980534952717564e-06, + "loss": 2.0534462928771973, + "step": 810 + }, + { + "epoch": 0.14781104942204423, + "grad_norm": 50.5, + "learning_rate": 4.9804281768858545e-06, + "loss": 0.9488010406494141, + "step": 812 + }, + { + "epoch": 0.14817511604623645, + "grad_norm": 13.875, + "learning_rate": 4.980321110434335e-06, + "loss": 1.693977952003479, + "step": 814 + }, + { + "epoch": 0.14853918267042868, + "grad_norm": 16.0, + "learning_rate": 4.980213753378719e-06, + "loss": 1.4362962245941162, + "step": 816 + }, + { + "epoch": 0.14890324929462093, + "grad_norm": 27.5, + "learning_rate": 4.980106105734759e-06, + "loss": 1.519448161125183, + "step": 818 + }, + { + "epoch": 0.14926731591881315, + "grad_norm": 9.75, + "learning_rate": 4.979998167518253e-06, + "loss": 1.4360629320144653, + "step": 820 + }, + { + "epoch": 0.14963138254300537, + "grad_norm": 7.6875, + "learning_rate": 4.979889938745039e-06, + "loss": 1.6215627193450928, + "step": 822 + }, + { + "epoch": 0.1499954491671976, + "grad_norm": 16.125, + "learning_rate": 4.9797814194310015e-06, + "loss": 1.6443802118301392, + "step": 824 + }, + { + "epoch": 0.15035951579138981, + "grad_norm": 16.0, + "learning_rate": 4.979672609592064e-06, + "loss": 1.2052421569824219, + "step": 826 + }, + { + "epoch": 0.15072358241558206, + "grad_norm": 19.75, + "learning_rate": 4.979563509244194e-06, + "loss": 0.8855452537536621, + "step": 828 + }, + { + "epoch": 0.15108764903977429, + "grad_norm": 13.9375, + "learning_rate": 4.9794541184034004e-06, + "loss": 1.6762653589248657, + "step": 830 + }, + { + "epoch": 0.1514517156639665, + "grad_norm": 5.21875, + "learning_rate": 4.979344437085738e-06, + "loss": 1.1307954788208008, + "step": 832 + }, + { + "epoch": 0.15181578228815873, + "grad_norm": 12.25, + "learning_rate": 4.979234465307301e-06, + "loss": 1.461775302886963, + "step": 834 + }, + { + "epoch": 0.15217984891235095, + "grad_norm": 9.4375, + "learning_rate": 4.979124203084228e-06, + "loss": 0.9856496453285217, + "step": 836 + }, + { + "epoch": 0.1525439155365432, + "grad_norm": 22.75, + "learning_rate": 4.979013650432698e-06, + "loss": 2.053518056869507, + "step": 838 + }, + { + "epoch": 0.15290798216073542, + "grad_norm": 15.0, + "learning_rate": 4.978902807368935e-06, + "loss": 1.5742968320846558, + "step": 840 + }, + { + "epoch": 0.15327204878492764, + "grad_norm": 12.875, + "learning_rate": 4.978791673909205e-06, + "loss": 1.4561097621917725, + "step": 842 + }, + { + "epoch": 0.15363611540911987, + "grad_norm": 11.375, + "learning_rate": 4.978680250069816e-06, + "loss": 1.4235137701034546, + "step": 844 + }, + { + "epoch": 0.1540001820333121, + "grad_norm": 8.375, + "learning_rate": 4.9785685358671195e-06, + "loss": 1.6382036209106445, + "step": 846 + }, + { + "epoch": 0.1543642486575043, + "grad_norm": 9.3125, + "learning_rate": 4.9784565313175084e-06, + "loss": 1.7686500549316406, + "step": 848 + }, + { + "epoch": 0.15472831528169656, + "grad_norm": 25.375, + "learning_rate": 4.978344236437419e-06, + "loss": 2.117997169494629, + "step": 850 + }, + { + "epoch": 0.15509238190588878, + "grad_norm": 9.6875, + "learning_rate": 4.978231651243331e-06, + "loss": 1.5159403085708618, + "step": 852 + }, + { + "epoch": 0.155456448530081, + "grad_norm": 8.625, + "learning_rate": 4.978118775751765e-06, + "loss": 1.4558329582214355, + "step": 854 + }, + { + "epoch": 0.15582051515427323, + "grad_norm": 15.125, + "learning_rate": 4.978005609979286e-06, + "loss": 1.431607961654663, + "step": 856 + }, + { + "epoch": 0.15618458177846545, + "grad_norm": 15.5, + "learning_rate": 4.9778921539424995e-06, + "loss": 1.4661400318145752, + "step": 858 + }, + { + "epoch": 0.1565486484026577, + "grad_norm": 31.75, + "learning_rate": 4.977778407658055e-06, + "loss": 1.2503175735473633, + "step": 860 + }, + { + "epoch": 0.15691271502684992, + "grad_norm": 14.75, + "learning_rate": 4.977664371142644e-06, + "loss": 1.3584790229797363, + "step": 862 + }, + { + "epoch": 0.15727678165104214, + "grad_norm": 91.5, + "learning_rate": 4.977550044413002e-06, + "loss": 0.8588085174560547, + "step": 864 + }, + { + "epoch": 0.15764084827523436, + "grad_norm": 8.3125, + "learning_rate": 4.9774354274859045e-06, + "loss": 1.5325477123260498, + "step": 866 + }, + { + "epoch": 0.15800491489942659, + "grad_norm": 5.46875, + "learning_rate": 4.977320520378173e-06, + "loss": 1.419021725654602, + "step": 868 + }, + { + "epoch": 0.15836898152361883, + "grad_norm": 10.125, + "learning_rate": 4.977205323106667e-06, + "loss": 1.4541630744934082, + "step": 870 + }, + { + "epoch": 0.15873304814781106, + "grad_norm": 7.03125, + "learning_rate": 4.9770898356882946e-06, + "loss": 1.2505348920822144, + "step": 872 + }, + { + "epoch": 0.15909711477200328, + "grad_norm": 7.25, + "learning_rate": 4.976974058140001e-06, + "loss": 1.640791893005371, + "step": 874 + }, + { + "epoch": 0.1594611813961955, + "grad_norm": 9.625, + "learning_rate": 4.976857990478775e-06, + "loss": 1.4406989812850952, + "step": 876 + }, + { + "epoch": 0.15982524802038772, + "grad_norm": 7.84375, + "learning_rate": 4.976741632721651e-06, + "loss": 1.5606796741485596, + "step": 878 + }, + { + "epoch": 0.16018931464457994, + "grad_norm": 14.4375, + "learning_rate": 4.976624984885704e-06, + "loss": 1.668114185333252, + "step": 880 + }, + { + "epoch": 0.1605533812687722, + "grad_norm": 13.5625, + "learning_rate": 4.97650804698805e-06, + "loss": 1.8590716123580933, + "step": 882 + }, + { + "epoch": 0.16091744789296442, + "grad_norm": 15.0625, + "learning_rate": 4.976390819045851e-06, + "loss": 1.6209524869918823, + "step": 884 + }, + { + "epoch": 0.16128151451715664, + "grad_norm": 16.875, + "learning_rate": 4.976273301076309e-06, + "loss": 2.1605849266052246, + "step": 886 + }, + { + "epoch": 0.16164558114134886, + "grad_norm": 15.625, + "learning_rate": 4.976155493096669e-06, + "loss": 1.426637887954712, + "step": 888 + }, + { + "epoch": 0.16200964776554108, + "grad_norm": 6.34375, + "learning_rate": 4.976037395124218e-06, + "loss": 1.105234980583191, + "step": 890 + }, + { + "epoch": 0.16237371438973333, + "grad_norm": 6.96875, + "learning_rate": 4.975919007176289e-06, + "loss": 1.2164201736450195, + "step": 892 + }, + { + "epoch": 0.16273778101392555, + "grad_norm": 3.703125, + "learning_rate": 4.9758003292702515e-06, + "loss": 0.9556933641433716, + "step": 894 + }, + { + "epoch": 0.16310184763811778, + "grad_norm": 6.75, + "learning_rate": 4.975681361423524e-06, + "loss": 1.0577716827392578, + "step": 896 + }, + { + "epoch": 0.16346591426231, + "grad_norm": 14.4375, + "learning_rate": 4.9755621036535635e-06, + "loss": 1.507400393486023, + "step": 898 + }, + { + "epoch": 0.16382998088650222, + "grad_norm": 11.0, + "learning_rate": 4.975442555977871e-06, + "loss": 1.6189115047454834, + "step": 900 + }, + { + "epoch": 0.16419404751069447, + "grad_norm": 20.5, + "learning_rate": 4.975322718413988e-06, + "loss": 1.636732816696167, + "step": 902 + }, + { + "epoch": 0.1645581141348867, + "grad_norm": 15.8125, + "learning_rate": 4.9752025909795035e-06, + "loss": 1.5428428649902344, + "step": 904 + }, + { + "epoch": 0.1649221807590789, + "grad_norm": 13.0625, + "learning_rate": 4.975082173692042e-06, + "loss": 1.5462164878845215, + "step": 906 + }, + { + "epoch": 0.16528624738327113, + "grad_norm": 4.09375, + "learning_rate": 4.974961466569276e-06, + "loss": 1.1371560096740723, + "step": 908 + }, + { + "epoch": 0.16565031400746336, + "grad_norm": 6.125, + "learning_rate": 4.974840469628919e-06, + "loss": 1.084640622138977, + "step": 910 + }, + { + "epoch": 0.1660143806316556, + "grad_norm": 13.3125, + "learning_rate": 4.974719182888725e-06, + "loss": 1.086508870124817, + "step": 912 + }, + { + "epoch": 0.16637844725584783, + "grad_norm": 9.75, + "learning_rate": 4.974597606366495e-06, + "loss": 1.5391892194747925, + "step": 914 + }, + { + "epoch": 0.16674251388004005, + "grad_norm": 17.875, + "learning_rate": 4.974475740080069e-06, + "loss": 1.5544822216033936, + "step": 916 + }, + { + "epoch": 0.16710658050423227, + "grad_norm": 16.0, + "learning_rate": 4.974353584047329e-06, + "loss": 1.3960521221160889, + "step": 918 + }, + { + "epoch": 0.1674706471284245, + "grad_norm": 8.25, + "learning_rate": 4.974231138286202e-06, + "loss": 1.6951947212219238, + "step": 920 + }, + { + "epoch": 0.16783471375261672, + "grad_norm": 70.0, + "learning_rate": 4.974108402814657e-06, + "loss": 1.5855159759521484, + "step": 922 + }, + { + "epoch": 0.16819878037680897, + "grad_norm": 31.375, + "learning_rate": 4.973985377650704e-06, + "loss": 1.2763593196868896, + "step": 924 + }, + { + "epoch": 0.1685628470010012, + "grad_norm": 14.0, + "learning_rate": 4.973862062812397e-06, + "loss": 1.5414307117462158, + "step": 926 + }, + { + "epoch": 0.1689269136251934, + "grad_norm": 10.875, + "learning_rate": 4.97373845831783e-06, + "loss": 1.6462106704711914, + "step": 928 + }, + { + "epoch": 0.16929098024938563, + "grad_norm": 18.125, + "learning_rate": 4.9736145641851445e-06, + "loss": 1.759765863418579, + "step": 930 + }, + { + "epoch": 0.16965504687357785, + "grad_norm": 17.25, + "learning_rate": 4.97349038043252e-06, + "loss": 1.9544644355773926, + "step": 932 + }, + { + "epoch": 0.1700191134977701, + "grad_norm": 17.25, + "learning_rate": 4.973365907078179e-06, + "loss": 1.5560743808746338, + "step": 934 + }, + { + "epoch": 0.17038318012196232, + "grad_norm": 4.6875, + "learning_rate": 4.973241144140391e-06, + "loss": 1.2322574853897095, + "step": 936 + }, + { + "epoch": 0.17074724674615455, + "grad_norm": 12.8125, + "learning_rate": 4.973116091637459e-06, + "loss": 1.453364610671997, + "step": 938 + }, + { + "epoch": 0.17111131337034677, + "grad_norm": 26.625, + "learning_rate": 4.972990749587738e-06, + "loss": 1.4589955806732178, + "step": 940 + }, + { + "epoch": 0.171475379994539, + "grad_norm": 17.0, + "learning_rate": 4.972865118009621e-06, + "loss": 1.6683200597763062, + "step": 942 + }, + { + "epoch": 0.17183944661873124, + "grad_norm": 18.875, + "learning_rate": 4.972739196921543e-06, + "loss": 1.6839137077331543, + "step": 944 + }, + { + "epoch": 0.17220351324292346, + "grad_norm": 30.0, + "learning_rate": 4.972612986341983e-06, + "loss": 1.5545344352722168, + "step": 946 + }, + { + "epoch": 0.17256757986711568, + "grad_norm": 5.75, + "learning_rate": 4.9724864862894605e-06, + "loss": 1.1030503511428833, + "step": 948 + }, + { + "epoch": 0.1729316464913079, + "grad_norm": 6.84375, + "learning_rate": 4.97235969678254e-06, + "loss": 1.5186653137207031, + "step": 950 + }, + { + "epoch": 0.17329571311550013, + "grad_norm": 10.75, + "learning_rate": 4.9722326178398286e-06, + "loss": 1.6165885925292969, + "step": 952 + }, + { + "epoch": 0.17365977973969238, + "grad_norm": 20.875, + "learning_rate": 4.972105249479971e-06, + "loss": 1.4254658222198486, + "step": 954 + }, + { + "epoch": 0.1740238463638846, + "grad_norm": 18.875, + "learning_rate": 4.9719775917216625e-06, + "loss": 1.646559476852417, + "step": 956 + }, + { + "epoch": 0.17438791298807682, + "grad_norm": 28.0, + "learning_rate": 4.9718496445836325e-06, + "loss": 1.7532553672790527, + "step": 958 + }, + { + "epoch": 0.17475197961226904, + "grad_norm": 33.0, + "learning_rate": 4.97172140808466e-06, + "loss": 1.961106538772583, + "step": 960 + }, + { + "epoch": 0.17511604623646126, + "grad_norm": 14.5625, + "learning_rate": 4.971592882243561e-06, + "loss": 1.4358066320419312, + "step": 962 + }, + { + "epoch": 0.1754801128606535, + "grad_norm": 11.1875, + "learning_rate": 4.971464067079196e-06, + "loss": 1.5383143424987793, + "step": 964 + }, + { + "epoch": 0.17584417948484574, + "grad_norm": 30.5, + "learning_rate": 4.971334962610469e-06, + "loss": 1.6120471954345703, + "step": 966 + }, + { + "epoch": 0.17620824610903796, + "grad_norm": 24.625, + "learning_rate": 4.9712055688563256e-06, + "loss": 2.341151237487793, + "step": 968 + }, + { + "epoch": 0.17657231273323018, + "grad_norm": 9.75, + "learning_rate": 4.971075885835753e-06, + "loss": 1.38916015625, + "step": 970 + }, + { + "epoch": 0.1769363793574224, + "grad_norm": 4.09375, + "learning_rate": 4.970945913567784e-06, + "loss": 1.2353990077972412, + "step": 972 + }, + { + "epoch": 0.17730044598161462, + "grad_norm": 20.125, + "learning_rate": 4.970815652071488e-06, + "loss": 1.47007417678833, + "step": 974 + }, + { + "epoch": 0.17766451260580687, + "grad_norm": 10.875, + "learning_rate": 4.970685101365983e-06, + "loss": 1.7017881870269775, + "step": 976 + }, + { + "epoch": 0.1780285792299991, + "grad_norm": 16.875, + "learning_rate": 4.970554261470425e-06, + "loss": 1.7518398761749268, + "step": 978 + }, + { + "epoch": 0.17839264585419132, + "grad_norm": 9.5625, + "learning_rate": 4.970423132404016e-06, + "loss": 2.097485065460205, + "step": 980 + }, + { + "epoch": 0.17875671247838354, + "grad_norm": 8.0, + "learning_rate": 4.9702917141859965e-06, + "loss": 1.3711477518081665, + "step": 982 + }, + { + "epoch": 0.17912077910257576, + "grad_norm": 10.5, + "learning_rate": 4.970160006835655e-06, + "loss": 1.3914536237716675, + "step": 984 + }, + { + "epoch": 0.179484845726768, + "grad_norm": 8.375, + "learning_rate": 4.970028010372314e-06, + "loss": 1.2097982168197632, + "step": 986 + }, + { + "epoch": 0.17984891235096023, + "grad_norm": 37.25, + "learning_rate": 4.969895724815348e-06, + "loss": 1.254248857498169, + "step": 988 + }, + { + "epoch": 0.18021297897515245, + "grad_norm": 11.8125, + "learning_rate": 4.9697631501841685e-06, + "loss": 1.3299647569656372, + "step": 990 + }, + { + "epoch": 0.18057704559934468, + "grad_norm": 9.25, + "learning_rate": 4.969630286498228e-06, + "loss": 1.5818966627120972, + "step": 992 + }, + { + "epoch": 0.1809411122235369, + "grad_norm": 7.96875, + "learning_rate": 4.969497133777025e-06, + "loss": 1.5132246017456055, + "step": 994 + }, + { + "epoch": 0.18130517884772915, + "grad_norm": 14.5, + "learning_rate": 4.9693636920401005e-06, + "loss": 1.7159264087677002, + "step": 996 + }, + { + "epoch": 0.18166924547192137, + "grad_norm": 15.9375, + "learning_rate": 4.9692299613070346e-06, + "loss": 2.1041908264160156, + "step": 998 + }, + { + "epoch": 0.1820333120961136, + "grad_norm": 14.75, + "learning_rate": 4.969095941597453e-06, + "loss": 1.2940460443496704, + "step": 1000 + }, + { + "epoch": 0.18239737872030581, + "grad_norm": 23.25, + "learning_rate": 4.9689616329310204e-06, + "loss": 1.2223471403121948, + "step": 1002 + }, + { + "epoch": 0.18276144534449804, + "grad_norm": 11.75, + "learning_rate": 4.968827035327449e-06, + "loss": 0.988057017326355, + "step": 1004 + }, + { + "epoch": 0.18312551196869026, + "grad_norm": 8.25, + "learning_rate": 4.96869214880649e-06, + "loss": 1.1488367319107056, + "step": 1006 + }, + { + "epoch": 0.1834895785928825, + "grad_norm": 56.0, + "learning_rate": 4.968556973387935e-06, + "loss": 0.41850680112838745, + "step": 1008 + }, + { + "epoch": 0.18385364521707473, + "grad_norm": 10.75, + "learning_rate": 4.9684215090916224e-06, + "loss": 0.6935019493103027, + "step": 1010 + }, + { + "epoch": 0.18421771184126695, + "grad_norm": 12.0625, + "learning_rate": 4.968285755937431e-06, + "loss": 1.6634321212768555, + "step": 1012 + }, + { + "epoch": 0.18458177846545917, + "grad_norm": 39.5, + "learning_rate": 4.968149713945281e-06, + "loss": 1.827513337135315, + "step": 1014 + }, + { + "epoch": 0.1849458450896514, + "grad_norm": 26.75, + "learning_rate": 4.968013383135137e-06, + "loss": 1.7480419874191284, + "step": 1016 + }, + { + "epoch": 0.18530991171384364, + "grad_norm": 21.375, + "learning_rate": 4.967876763527005e-06, + "loss": 2.1570892333984375, + "step": 1018 + }, + { + "epoch": 0.18567397833803587, + "grad_norm": 9.625, + "learning_rate": 4.967739855140934e-06, + "loss": 1.7794275283813477, + "step": 1020 + }, + { + "epoch": 0.1860380449622281, + "grad_norm": 15.6875, + "learning_rate": 4.967602657997012e-06, + "loss": 1.172874093055725, + "step": 1022 + }, + { + "epoch": 0.1864021115864203, + "grad_norm": 16.625, + "learning_rate": 4.967465172115374e-06, + "loss": 1.9025425910949707, + "step": 1024 + }, + { + "epoch": 0.18676617821061253, + "grad_norm": 5.75, + "learning_rate": 4.967327397516197e-06, + "loss": 1.5241506099700928, + "step": 1026 + }, + { + "epoch": 0.18713024483480478, + "grad_norm": 19.625, + "learning_rate": 4.967189334219697e-06, + "loss": 1.2078227996826172, + "step": 1028 + }, + { + "epoch": 0.187494311458997, + "grad_norm": 14.25, + "learning_rate": 4.967050982246133e-06, + "loss": 1.719788908958435, + "step": 1030 + }, + { + "epoch": 0.18785837808318923, + "grad_norm": 25.25, + "learning_rate": 4.96691234161581e-06, + "loss": 1.5995606184005737, + "step": 1032 + }, + { + "epoch": 0.18822244470738145, + "grad_norm": 11.1875, + "learning_rate": 4.966773412349073e-06, + "loss": 1.6250306367874146, + "step": 1034 + }, + { + "epoch": 0.18858651133157367, + "grad_norm": 14.1875, + "learning_rate": 4.966634194466306e-06, + "loss": 1.491241693496704, + "step": 1036 + }, + { + "epoch": 0.1889505779557659, + "grad_norm": 10.0, + "learning_rate": 4.966494687987944e-06, + "loss": 1.6580965518951416, + "step": 1038 + }, + { + "epoch": 0.18931464457995814, + "grad_norm": 11.25, + "learning_rate": 4.966354892934454e-06, + "loss": 1.560678482055664, + "step": 1040 + }, + { + "epoch": 0.18967871120415036, + "grad_norm": 7.6875, + "learning_rate": 4.966214809326353e-06, + "loss": 1.491178035736084, + "step": 1042 + }, + { + "epoch": 0.19004277782834259, + "grad_norm": 7.0625, + "learning_rate": 4.966074437184198e-06, + "loss": 1.290732502937317, + "step": 1044 + }, + { + "epoch": 0.1904068444525348, + "grad_norm": 6.71875, + "learning_rate": 4.965933776528586e-06, + "loss": 1.249894142150879, + "step": 1046 + }, + { + "epoch": 0.19077091107672703, + "grad_norm": 18.625, + "learning_rate": 4.965792827380159e-06, + "loss": 1.5564920902252197, + "step": 1048 + }, + { + "epoch": 0.19113497770091928, + "grad_norm": 22.5, + "learning_rate": 4.965651589759602e-06, + "loss": 1.5470733642578125, + "step": 1050 + }, + { + "epoch": 0.1914990443251115, + "grad_norm": 9.0, + "learning_rate": 4.965510063687641e-06, + "loss": 1.4763240814208984, + "step": 1052 + }, + { + "epoch": 0.19186311094930372, + "grad_norm": 11.125, + "learning_rate": 4.965368249185043e-06, + "loss": 1.5355173349380493, + "step": 1054 + }, + { + "epoch": 0.19222717757349594, + "grad_norm": 8.375, + "learning_rate": 4.965226146272619e-06, + "loss": 1.5831536054611206, + "step": 1056 + }, + { + "epoch": 0.19259124419768817, + "grad_norm": 17.625, + "learning_rate": 4.965083754971223e-06, + "loss": 1.7306116819381714, + "step": 1058 + }, + { + "epoch": 0.19295531082188042, + "grad_norm": 11.625, + "learning_rate": 4.964941075301749e-06, + "loss": 1.905921220779419, + "step": 1060 + }, + { + "epoch": 0.19331937744607264, + "grad_norm": 9.0, + "learning_rate": 4.964798107285136e-06, + "loss": 1.330867052078247, + "step": 1062 + }, + { + "epoch": 0.19368344407026486, + "grad_norm": 3.046875, + "learning_rate": 4.964654850942363e-06, + "loss": 0.8626788854598999, + "step": 1064 + }, + { + "epoch": 0.19404751069445708, + "grad_norm": 7.03125, + "learning_rate": 4.964511306294454e-06, + "loss": 1.229498028755188, + "step": 1066 + }, + { + "epoch": 0.1944115773186493, + "grad_norm": 9.6875, + "learning_rate": 4.96436747336247e-06, + "loss": 1.5555689334869385, + "step": 1068 + }, + { + "epoch": 0.19477564394284155, + "grad_norm": 20.25, + "learning_rate": 4.964223352167522e-06, + "loss": 1.5085597038269043, + "step": 1070 + }, + { + "epoch": 0.19513971056703378, + "grad_norm": 6.5, + "learning_rate": 4.964078942730757e-06, + "loss": 1.1167781352996826, + "step": 1072 + }, + { + "epoch": 0.195503777191226, + "grad_norm": 9.3125, + "learning_rate": 4.963934245073366e-06, + "loss": 1.2785255908966064, + "step": 1074 + }, + { + "epoch": 0.19586784381541822, + "grad_norm": 5.9375, + "learning_rate": 4.963789259216584e-06, + "loss": 1.1449613571166992, + "step": 1076 + }, + { + "epoch": 0.19623191043961044, + "grad_norm": 9.125, + "learning_rate": 4.963643985181688e-06, + "loss": 1.6972298622131348, + "step": 1078 + }, + { + "epoch": 0.19659597706380266, + "grad_norm": 12.4375, + "learning_rate": 4.963498422989993e-06, + "loss": 1.5406492948532104, + "step": 1080 + }, + { + "epoch": 0.1969600436879949, + "grad_norm": 8.1875, + "learning_rate": 4.963352572662864e-06, + "loss": 1.4850382804870605, + "step": 1082 + }, + { + "epoch": 0.19732411031218713, + "grad_norm": 6.53125, + "learning_rate": 4.9632064342217e-06, + "loss": 1.420413851737976, + "step": 1084 + }, + { + "epoch": 0.19768817693637936, + "grad_norm": 11.0, + "learning_rate": 4.9630600076879486e-06, + "loss": 1.1185743808746338, + "step": 1086 + }, + { + "epoch": 0.19805224356057158, + "grad_norm": 4.96875, + "learning_rate": 4.962913293083097e-06, + "loss": 1.5190268754959106, + "step": 1088 + }, + { + "epoch": 0.1984163101847638, + "grad_norm": 4.59375, + "learning_rate": 4.9627662904286745e-06, + "loss": 0.8890689611434937, + "step": 1090 + }, + { + "epoch": 0.19878037680895605, + "grad_norm": 7.84375, + "learning_rate": 4.962618999746253e-06, + "loss": 1.3202004432678223, + "step": 1092 + }, + { + "epoch": 0.19914444343314827, + "grad_norm": 17.625, + "learning_rate": 4.962471421057447e-06, + "loss": 1.6558799743652344, + "step": 1094 + }, + { + "epoch": 0.1995085100573405, + "grad_norm": 14.75, + "learning_rate": 4.962323554383913e-06, + "loss": 1.6826921701431274, + "step": 1096 + }, + { + "epoch": 0.19987257668153272, + "grad_norm": 20.5, + "learning_rate": 4.962175399747351e-06, + "loss": 1.0449330806732178, + "step": 1098 + }, + { + "epoch": 0.20023664330572494, + "grad_norm": 9.9375, + "learning_rate": 4.9620269571695e-06, + "loss": 1.3753266334533691, + "step": 1100 + }, + { + "epoch": 0.2006007099299172, + "grad_norm": 25.0, + "learning_rate": 4.9618782266721455e-06, + "loss": 1.4963014125823975, + "step": 1102 + }, + { + "epoch": 0.2009647765541094, + "grad_norm": 10.1875, + "learning_rate": 4.9617292082771106e-06, + "loss": 1.8828684091567993, + "step": 1104 + }, + { + "epoch": 0.20132884317830163, + "grad_norm": 7.59375, + "learning_rate": 4.961579902006266e-06, + "loss": 1.737760066986084, + "step": 1106 + }, + { + "epoch": 0.20169290980249385, + "grad_norm": 8.6875, + "learning_rate": 4.9614303078815195e-06, + "loss": 1.6666333675384521, + "step": 1108 + }, + { + "epoch": 0.20205697642668607, + "grad_norm": 16.875, + "learning_rate": 4.961280425924825e-06, + "loss": 1.4696074724197388, + "step": 1110 + }, + { + "epoch": 0.20242104305087832, + "grad_norm": 6.8125, + "learning_rate": 4.961130256158176e-06, + "loss": 1.4471994638442993, + "step": 1112 + }, + { + "epoch": 0.20278510967507055, + "grad_norm": 5.8125, + "learning_rate": 4.96097979860361e-06, + "loss": 1.267797827720642, + "step": 1114 + }, + { + "epoch": 0.20314917629926277, + "grad_norm": 11.25, + "learning_rate": 4.960829053283205e-06, + "loss": 1.2800853252410889, + "step": 1116 + }, + { + "epoch": 0.203513242923455, + "grad_norm": 17.375, + "learning_rate": 4.960678020219083e-06, + "loss": 1.7602601051330566, + "step": 1118 + }, + { + "epoch": 0.2038773095476472, + "grad_norm": 22.625, + "learning_rate": 4.960526699433408e-06, + "loss": 1.5081733465194702, + "step": 1120 + }, + { + "epoch": 0.20424137617183943, + "grad_norm": 18.0, + "learning_rate": 4.960375090948385e-06, + "loss": 1.9400660991668701, + "step": 1122 + }, + { + "epoch": 0.20460544279603168, + "grad_norm": 28.125, + "learning_rate": 4.960223194786261e-06, + "loss": 1.49504816532135, + "step": 1124 + }, + { + "epoch": 0.2049695094202239, + "grad_norm": 9.625, + "learning_rate": 4.96007101096933e-06, + "loss": 1.4075953960418701, + "step": 1126 + }, + { + "epoch": 0.20533357604441613, + "grad_norm": 11.1875, + "learning_rate": 4.959918539519919e-06, + "loss": 0.9709517955780029, + "step": 1128 + }, + { + "epoch": 0.20569764266860835, + "grad_norm": 3.90625, + "learning_rate": 4.959765780460406e-06, + "loss": 1.1713272333145142, + "step": 1130 + }, + { + "epoch": 0.20606170929280057, + "grad_norm": 5.6875, + "learning_rate": 4.959612733813207e-06, + "loss": 0.9374972581863403, + "step": 1132 + }, + { + "epoch": 0.20642577591699282, + "grad_norm": 12.3125, + "learning_rate": 4.959459399600781e-06, + "loss": 1.1903777122497559, + "step": 1134 + }, + { + "epoch": 0.20678984254118504, + "grad_norm": 33.25, + "learning_rate": 4.959305777845629e-06, + "loss": 1.498299479484558, + "step": 1136 + }, + { + "epoch": 0.20715390916537726, + "grad_norm": 12.9375, + "learning_rate": 4.959151868570295e-06, + "loss": 1.6460087299346924, + "step": 1138 + }, + { + "epoch": 0.2075179757895695, + "grad_norm": 14.25, + "learning_rate": 4.958997671797363e-06, + "loss": 1.524951696395874, + "step": 1140 + }, + { + "epoch": 0.2078820424137617, + "grad_norm": 13.1875, + "learning_rate": 4.9588431875494626e-06, + "loss": 1.4647579193115234, + "step": 1142 + }, + { + "epoch": 0.20824610903795396, + "grad_norm": 10.0625, + "learning_rate": 4.958688415849263e-06, + "loss": 1.4470139741897583, + "step": 1144 + }, + { + "epoch": 0.20861017566214618, + "grad_norm": 9.8125, + "learning_rate": 4.958533356719476e-06, + "loss": 1.4590744972229004, + "step": 1146 + }, + { + "epoch": 0.2089742422863384, + "grad_norm": 5.625, + "learning_rate": 4.958378010182856e-06, + "loss": 1.2866430282592773, + "step": 1148 + }, + { + "epoch": 0.20933830891053062, + "grad_norm": 7.15625, + "learning_rate": 4.958222376262199e-06, + "loss": 0.8470004796981812, + "step": 1150 + }, + { + "epoch": 0.20970237553472285, + "grad_norm": 24.375, + "learning_rate": 4.958066454980345e-06, + "loss": 1.0619269609451294, + "step": 1152 + }, + { + "epoch": 0.21006644215891507, + "grad_norm": 10.0, + "learning_rate": 4.957910246360175e-06, + "loss": 0.6137018203735352, + "step": 1154 + }, + { + "epoch": 0.21043050878310732, + "grad_norm": 10.1875, + "learning_rate": 4.9577537504246095e-06, + "loss": 1.5130021572113037, + "step": 1156 + }, + { + "epoch": 0.21079457540729954, + "grad_norm": 4.21875, + "learning_rate": 4.957596967196616e-06, + "loss": 0.9950746297836304, + "step": 1158 + }, + { + "epoch": 0.21115864203149176, + "grad_norm": 8.8125, + "learning_rate": 4.957439896699201e-06, + "loss": 1.4392368793487549, + "step": 1160 + }, + { + "epoch": 0.21152270865568398, + "grad_norm": 11.25, + "learning_rate": 4.957282538955413e-06, + "loss": 1.4692758321762085, + "step": 1162 + }, + { + "epoch": 0.2118867752798762, + "grad_norm": 136.0, + "learning_rate": 4.957124893988347e-06, + "loss": 1.6144437789916992, + "step": 1164 + }, + { + "epoch": 0.21225084190406845, + "grad_norm": 48.5, + "learning_rate": 4.9569669618211316e-06, + "loss": 0.7389823198318481, + "step": 1166 + }, + { + "epoch": 0.21261490852826068, + "grad_norm": 5.75, + "learning_rate": 4.956808742476948e-06, + "loss": 1.1057299375534058, + "step": 1168 + }, + { + "epoch": 0.2129789751524529, + "grad_norm": 30.0, + "learning_rate": 4.9566502359790095e-06, + "loss": 1.6763724088668823, + "step": 1170 + }, + { + "epoch": 0.21334304177664512, + "grad_norm": 9.8125, + "learning_rate": 4.9564914423505784e-06, + "loss": 1.589447259902954, + "step": 1172 + }, + { + "epoch": 0.21370710840083734, + "grad_norm": 8.125, + "learning_rate": 4.956332361614958e-06, + "loss": 1.524857521057129, + "step": 1174 + }, + { + "epoch": 0.2140711750250296, + "grad_norm": 4.875, + "learning_rate": 4.9561729937954925e-06, + "loss": 1.4731521606445312, + "step": 1176 + }, + { + "epoch": 0.2144352416492218, + "grad_norm": 17.875, + "learning_rate": 4.956013338915568e-06, + "loss": 1.0429184436798096, + "step": 1178 + }, + { + "epoch": 0.21479930827341404, + "grad_norm": 7.03125, + "learning_rate": 4.955853396998611e-06, + "loss": 1.3547178506851196, + "step": 1180 + }, + { + "epoch": 0.21516337489760626, + "grad_norm": 35.5, + "learning_rate": 4.955693168068095e-06, + "loss": 1.6174616813659668, + "step": 1182 + }, + { + "epoch": 0.21552744152179848, + "grad_norm": 40.0, + "learning_rate": 4.955532652147533e-06, + "loss": 2.0516693592071533, + "step": 1184 + }, + { + "epoch": 0.21589150814599073, + "grad_norm": 13.0, + "learning_rate": 4.9553718492604794e-06, + "loss": 1.570616602897644, + "step": 1186 + }, + { + "epoch": 0.21625557477018295, + "grad_norm": 11.0625, + "learning_rate": 4.955210759430531e-06, + "loss": 1.4672772884368896, + "step": 1188 + }, + { + "epoch": 0.21661964139437517, + "grad_norm": 14.1875, + "learning_rate": 4.9550493826813285e-06, + "loss": 1.432410478591919, + "step": 1190 + }, + { + "epoch": 0.2169837080185674, + "grad_norm": 11.125, + "learning_rate": 4.954887719036551e-06, + "loss": 1.6189162731170654, + "step": 1192 + }, + { + "epoch": 0.21734777464275962, + "grad_norm": 16.75, + "learning_rate": 4.954725768519924e-06, + "loss": 1.359072208404541, + "step": 1194 + }, + { + "epoch": 0.21771184126695184, + "grad_norm": 8.8125, + "learning_rate": 4.954563531155211e-06, + "loss": 1.412648320198059, + "step": 1196 + }, + { + "epoch": 0.2180759078911441, + "grad_norm": 13.5, + "learning_rate": 4.9544010069662215e-06, + "loss": 1.4863377809524536, + "step": 1198 + }, + { + "epoch": 0.2184399745153363, + "grad_norm": 8.625, + "learning_rate": 4.954238195976805e-06, + "loss": 1.2107902765274048, + "step": 1200 + }, + { + "epoch": 0.21880404113952853, + "grad_norm": 13.1875, + "learning_rate": 4.954075098210853e-06, + "loss": 0.8278253078460693, + "step": 1202 + }, + { + "epoch": 0.21916810776372075, + "grad_norm": 11.375, + "learning_rate": 4.9539117136923e-06, + "loss": 0.6940520405769348, + "step": 1204 + }, + { + "epoch": 0.21953217438791298, + "grad_norm": 8.6875, + "learning_rate": 4.953748042445121e-06, + "loss": 1.216526985168457, + "step": 1206 + }, + { + "epoch": 0.21989624101210523, + "grad_norm": 12.5, + "learning_rate": 4.953584084493335e-06, + "loss": 1.0924643278121948, + "step": 1208 + }, + { + "epoch": 0.22026030763629745, + "grad_norm": 17.75, + "learning_rate": 4.953419839861001e-06, + "loss": 1.6490846872329712, + "step": 1210 + }, + { + "epoch": 0.22062437426048967, + "grad_norm": 37.25, + "learning_rate": 4.953255308572224e-06, + "loss": 1.464174747467041, + "step": 1212 + }, + { + "epoch": 0.2209884408846819, + "grad_norm": 12.8125, + "learning_rate": 4.953090490651143e-06, + "loss": 1.4101375341415405, + "step": 1214 + }, + { + "epoch": 0.2213525075088741, + "grad_norm": 15.875, + "learning_rate": 4.952925386121951e-06, + "loss": 1.4389835596084595, + "step": 1216 + }, + { + "epoch": 0.22171657413306636, + "grad_norm": 21.25, + "learning_rate": 4.952759995008871e-06, + "loss": 1.1927437782287598, + "step": 1218 + }, + { + "epoch": 0.22208064075725858, + "grad_norm": 4.75, + "learning_rate": 4.952594317336176e-06, + "loss": 1.148517370223999, + "step": 1220 + }, + { + "epoch": 0.2224447073814508, + "grad_norm": 9.0625, + "learning_rate": 4.952428353128178e-06, + "loss": 1.1994445323944092, + "step": 1222 + }, + { + "epoch": 0.22280877400564303, + "grad_norm": 147.0, + "learning_rate": 4.952262102409232e-06, + "loss": 0.9708088040351868, + "step": 1224 + }, + { + "epoch": 0.22317284062983525, + "grad_norm": 13.75, + "learning_rate": 4.952095565203735e-06, + "loss": 1.394553780555725, + "step": 1226 + }, + { + "epoch": 0.2235369072540275, + "grad_norm": 9.0, + "learning_rate": 4.9519287415361235e-06, + "loss": 1.3822214603424072, + "step": 1228 + }, + { + "epoch": 0.22390097387821972, + "grad_norm": 10.3125, + "learning_rate": 4.9517616314308814e-06, + "loss": 1.3066985607147217, + "step": 1230 + }, + { + "epoch": 0.22426504050241194, + "grad_norm": 17.0, + "learning_rate": 4.951594234912528e-06, + "loss": 1.6092782020568848, + "step": 1232 + }, + { + "epoch": 0.22462910712660417, + "grad_norm": 10.0, + "learning_rate": 4.9514265520056306e-06, + "loss": 1.3286519050598145, + "step": 1234 + }, + { + "epoch": 0.2249931737507964, + "grad_norm": 8.4375, + "learning_rate": 4.9512585827347945e-06, + "loss": 1.2676196098327637, + "step": 1236 + }, + { + "epoch": 0.2253572403749886, + "grad_norm": 16.625, + "learning_rate": 4.95109032712467e-06, + "loss": 0.9929633140563965, + "step": 1238 + }, + { + "epoch": 0.22572130699918086, + "grad_norm": 17.125, + "learning_rate": 4.950921785199947e-06, + "loss": 1.0534217357635498, + "step": 1240 + }, + { + "epoch": 0.22608537362337308, + "grad_norm": 12.6875, + "learning_rate": 4.950752956985358e-06, + "loss": 1.6984401941299438, + "step": 1242 + }, + { + "epoch": 0.2264494402475653, + "grad_norm": 15.125, + "learning_rate": 4.950583842505679e-06, + "loss": 1.2302775382995605, + "step": 1244 + }, + { + "epoch": 0.22681350687175753, + "grad_norm": 12.3125, + "learning_rate": 4.950414441785725e-06, + "loss": 1.1167516708374023, + "step": 1246 + }, + { + "epoch": 0.22717757349594975, + "grad_norm": 8.4375, + "learning_rate": 4.950244754850357e-06, + "loss": 1.4290108680725098, + "step": 1248 + }, + { + "epoch": 0.227541640120142, + "grad_norm": 6.40625, + "learning_rate": 4.950074781724473e-06, + "loss": 1.1943378448486328, + "step": 1250 + }, + { + "epoch": 0.22790570674433422, + "grad_norm": 20.0, + "learning_rate": 4.94990452243302e-06, + "loss": 0.8593637943267822, + "step": 1252 + }, + { + "epoch": 0.22826977336852644, + "grad_norm": 10.25, + "learning_rate": 4.94973397700098e-06, + "loss": 0.977672815322876, + "step": 1254 + }, + { + "epoch": 0.22863383999271866, + "grad_norm": 12.875, + "learning_rate": 4.94956314545338e-06, + "loss": 1.5777567625045776, + "step": 1256 + }, + { + "epoch": 0.22899790661691088, + "grad_norm": 13.375, + "learning_rate": 4.949392027815288e-06, + "loss": 1.8342951536178589, + "step": 1258 + }, + { + "epoch": 0.22936197324110313, + "grad_norm": 13.0625, + "learning_rate": 4.949220624111819e-06, + "loss": 1.4170873165130615, + "step": 1260 + }, + { + "epoch": 0.22972603986529536, + "grad_norm": 8.3125, + "learning_rate": 4.949048934368122e-06, + "loss": 1.3099360466003418, + "step": 1262 + }, + { + "epoch": 0.23009010648948758, + "grad_norm": 11.3125, + "learning_rate": 4.948876958609391e-06, + "loss": 1.5055506229400635, + "step": 1264 + }, + { + "epoch": 0.2304541731136798, + "grad_norm": 10.0625, + "learning_rate": 4.948704696860866e-06, + "loss": 1.63662588596344, + "step": 1266 + }, + { + "epoch": 0.23081823973787202, + "grad_norm": 7.59375, + "learning_rate": 4.948532149147823e-06, + "loss": 1.3518517017364502, + "step": 1268 + }, + { + "epoch": 0.23118230636206427, + "grad_norm": 46.25, + "learning_rate": 4.948359315495585e-06, + "loss": 1.1541385650634766, + "step": 1270 + }, + { + "epoch": 0.2315463729862565, + "grad_norm": 12.1875, + "learning_rate": 4.948186195929513e-06, + "loss": 1.5124117136001587, + "step": 1272 + }, + { + "epoch": 0.23191043961044872, + "grad_norm": 7.75, + "learning_rate": 4.9480127904750134e-06, + "loss": 0.8574434518814087, + "step": 1274 + }, + { + "epoch": 0.23227450623464094, + "grad_norm": 18.75, + "learning_rate": 4.947839099157529e-06, + "loss": 1.3625457286834717, + "step": 1276 + }, + { + "epoch": 0.23263857285883316, + "grad_norm": 12.875, + "learning_rate": 4.9476651220025525e-06, + "loss": 1.0178558826446533, + "step": 1278 + }, + { + "epoch": 0.23300263948302538, + "grad_norm": 9.0625, + "learning_rate": 4.947490859035612e-06, + "loss": 1.6901695728302002, + "step": 1280 + }, + { + "epoch": 0.23336670610721763, + "grad_norm": 7.25, + "learning_rate": 4.94731631028228e-06, + "loss": 1.5088622570037842, + "step": 1282 + }, + { + "epoch": 0.23373077273140985, + "grad_norm": 10.75, + "learning_rate": 4.947141475768171e-06, + "loss": 1.4696831703186035, + "step": 1284 + }, + { + "epoch": 0.23409483935560207, + "grad_norm": 17.75, + "learning_rate": 4.946966355518943e-06, + "loss": 1.5505940914154053, + "step": 1286 + }, + { + "epoch": 0.2344589059797943, + "grad_norm": 33.0, + "learning_rate": 4.946790949560291e-06, + "loss": 1.0533626079559326, + "step": 1288 + }, + { + "epoch": 0.23482297260398652, + "grad_norm": 6.40625, + "learning_rate": 4.9466152579179575e-06, + "loss": 0.5825610756874084, + "step": 1290 + }, + { + "epoch": 0.23518703922817877, + "grad_norm": 6.625, + "learning_rate": 4.946439280617724e-06, + "loss": 1.3869107961654663, + "step": 1292 + }, + { + "epoch": 0.235551105852371, + "grad_norm": 6.71875, + "learning_rate": 4.946263017685414e-06, + "loss": 1.0499608516693115, + "step": 1294 + }, + { + "epoch": 0.2359151724765632, + "grad_norm": 44.0, + "learning_rate": 4.946086469146895e-06, + "loss": 1.1648304462432861, + "step": 1296 + }, + { + "epoch": 0.23627923910075543, + "grad_norm": 19.875, + "learning_rate": 4.945909635028071e-06, + "loss": 0.7895115613937378, + "step": 1298 + }, + { + "epoch": 0.23664330572494766, + "grad_norm": 13.875, + "learning_rate": 4.945732515354896e-06, + "loss": 1.5313125848770142, + "step": 1300 + }, + { + "epoch": 0.2370073723491399, + "grad_norm": 7.71875, + "learning_rate": 4.945555110153358e-06, + "loss": 1.4455209970474243, + "step": 1302 + }, + { + "epoch": 0.23737143897333213, + "grad_norm": 7.65625, + "learning_rate": 4.945377419449494e-06, + "loss": 1.4758248329162598, + "step": 1304 + }, + { + "epoch": 0.23773550559752435, + "grad_norm": 24.125, + "learning_rate": 4.945199443269377e-06, + "loss": 1.713889718055725, + "step": 1306 + }, + { + "epoch": 0.23809957222171657, + "grad_norm": 13.1875, + "learning_rate": 4.945021181639126e-06, + "loss": 1.8196473121643066, + "step": 1308 + }, + { + "epoch": 0.2384636388459088, + "grad_norm": 2.703125, + "learning_rate": 4.944842634584897e-06, + "loss": 0.8688795566558838, + "step": 1310 + }, + { + "epoch": 0.23882770547010101, + "grad_norm": 15.3125, + "learning_rate": 4.9446638021328944e-06, + "loss": 1.2760381698608398, + "step": 1312 + }, + { + "epoch": 0.23919177209429326, + "grad_norm": 13.5625, + "learning_rate": 4.94448468430936e-06, + "loss": 1.4693682193756104, + "step": 1314 + }, + { + "epoch": 0.2395558387184855, + "grad_norm": 16.375, + "learning_rate": 4.944305281140578e-06, + "loss": 1.5682768821716309, + "step": 1316 + }, + { + "epoch": 0.2399199053426777, + "grad_norm": 4.90625, + "learning_rate": 4.9441255926528755e-06, + "loss": 1.3700590133666992, + "step": 1318 + }, + { + "epoch": 0.24028397196686993, + "grad_norm": 11.25, + "learning_rate": 4.943945618872621e-06, + "loss": 0.9841552376747131, + "step": 1320 + }, + { + "epoch": 0.24064803859106215, + "grad_norm": 92.0, + "learning_rate": 4.943765359826226e-06, + "loss": 1.9470609426498413, + "step": 1322 + }, + { + "epoch": 0.2410121052152544, + "grad_norm": 27.375, + "learning_rate": 4.943584815540141e-06, + "loss": 1.7613348960876465, + "step": 1324 + }, + { + "epoch": 0.24137617183944662, + "grad_norm": 12.1875, + "learning_rate": 4.9434039860408615e-06, + "loss": 1.5088882446289062, + "step": 1326 + }, + { + "epoch": 0.24174023846363885, + "grad_norm": 8.125, + "learning_rate": 4.943222871354922e-06, + "loss": 1.4682915210723877, + "step": 1328 + }, + { + "epoch": 0.24210430508783107, + "grad_norm": 9.0625, + "learning_rate": 4.943041471508902e-06, + "loss": 1.6899994611740112, + "step": 1330 + }, + { + "epoch": 0.2424683717120233, + "grad_norm": 13.75, + "learning_rate": 4.94285978652942e-06, + "loss": 1.736012578010559, + "step": 1332 + }, + { + "epoch": 0.24283243833621554, + "grad_norm": 28.375, + "learning_rate": 4.942677816443139e-06, + "loss": 1.5456054210662842, + "step": 1334 + }, + { + "epoch": 0.24319650496040776, + "grad_norm": 10.125, + "learning_rate": 4.942495561276761e-06, + "loss": 1.5503977537155151, + "step": 1336 + }, + { + "epoch": 0.24356057158459998, + "grad_norm": 12.875, + "learning_rate": 4.942313021057031e-06, + "loss": 1.6288011074066162, + "step": 1338 + }, + { + "epoch": 0.2439246382087922, + "grad_norm": 15.6875, + "learning_rate": 4.9421301958107385e-06, + "loss": 1.4899564981460571, + "step": 1340 + }, + { + "epoch": 0.24428870483298443, + "grad_norm": 19.5, + "learning_rate": 4.941947085564709e-06, + "loss": 1.503650188446045, + "step": 1342 + }, + { + "epoch": 0.24465277145717668, + "grad_norm": 4.375, + "learning_rate": 4.941763690345814e-06, + "loss": 1.3934472799301147, + "step": 1344 + }, + { + "epoch": 0.2450168380813689, + "grad_norm": 11.3125, + "learning_rate": 4.941580010180969e-06, + "loss": 1.5408638715744019, + "step": 1346 + }, + { + "epoch": 0.24538090470556112, + "grad_norm": 15.125, + "learning_rate": 4.941396045097124e-06, + "loss": 1.8255137205123901, + "step": 1348 + }, + { + "epoch": 0.24574497132975334, + "grad_norm": 13.0, + "learning_rate": 4.941211795121278e-06, + "loss": 1.5376935005187988, + "step": 1350 + }, + { + "epoch": 0.24610903795394556, + "grad_norm": 16.125, + "learning_rate": 4.941027260280468e-06, + "loss": 1.6967862844467163, + "step": 1352 + }, + { + "epoch": 0.24647310457813779, + "grad_norm": 8.9375, + "learning_rate": 4.940842440601774e-06, + "loss": 1.4979592561721802, + "step": 1354 + }, + { + "epoch": 0.24683717120233004, + "grad_norm": 5.3125, + "learning_rate": 4.940657336112317e-06, + "loss": 1.3248778581619263, + "step": 1356 + }, + { + "epoch": 0.24720123782652226, + "grad_norm": 14.4375, + "learning_rate": 4.9404719468392615e-06, + "loss": 1.4855530261993408, + "step": 1358 + }, + { + "epoch": 0.24756530445071448, + "grad_norm": 24.625, + "learning_rate": 4.940286272809811e-06, + "loss": 1.5398468971252441, + "step": 1360 + }, + { + "epoch": 0.2479293710749067, + "grad_norm": 5.5, + "learning_rate": 4.940100314051214e-06, + "loss": 0.8858805298805237, + "step": 1362 + }, + { + "epoch": 0.24829343769909892, + "grad_norm": 10.5, + "learning_rate": 4.9399140705907575e-06, + "loss": 1.1802723407745361, + "step": 1364 + }, + { + "epoch": 0.24865750432329117, + "grad_norm": 10.5, + "learning_rate": 4.939727542455774e-06, + "loss": 1.6216623783111572, + "step": 1366 + }, + { + "epoch": 0.2490215709474834, + "grad_norm": 8.0, + "learning_rate": 4.939540729673634e-06, + "loss": 1.5625545978546143, + "step": 1368 + }, + { + "epoch": 0.24938563757167562, + "grad_norm": 14.25, + "learning_rate": 4.939353632271752e-06, + "loss": 1.4306856393814087, + "step": 1370 + }, + { + "epoch": 0.24974970419586784, + "grad_norm": 15.8125, + "learning_rate": 4.939166250277584e-06, + "loss": 1.5217489004135132, + "step": 1372 + }, + { + "epoch": 0.2501137708200601, + "grad_norm": 9.0625, + "learning_rate": 4.938978583718629e-06, + "loss": 1.3186440467834473, + "step": 1374 + }, + { + "epoch": 0.2504778374442523, + "grad_norm": 7.5625, + "learning_rate": 4.9387906326224235e-06, + "loss": 1.5230623483657837, + "step": 1376 + }, + { + "epoch": 0.25084190406844453, + "grad_norm": 22.25, + "learning_rate": 4.93860239701655e-06, + "loss": 1.6378216743469238, + "step": 1378 + }, + { + "epoch": 0.2512059706926367, + "grad_norm": 10.3125, + "learning_rate": 4.93841387692863e-06, + "loss": 1.406336784362793, + "step": 1380 + }, + { + "epoch": 0.251570037316829, + "grad_norm": 6.65625, + "learning_rate": 4.938225072386332e-06, + "loss": 1.6698787212371826, + "step": 1382 + }, + { + "epoch": 0.2519341039410212, + "grad_norm": 8.125, + "learning_rate": 4.9380359834173575e-06, + "loss": 1.4710549116134644, + "step": 1384 + }, + { + "epoch": 0.2522981705652134, + "grad_norm": 18.25, + "learning_rate": 4.937846610049457e-06, + "loss": 1.6586592197418213, + "step": 1386 + }, + { + "epoch": 0.25266223718940567, + "grad_norm": 8.0, + "learning_rate": 4.93765695231042e-06, + "loss": 1.4598597288131714, + "step": 1388 + }, + { + "epoch": 0.25302630381359786, + "grad_norm": 12.3125, + "learning_rate": 4.937467010228079e-06, + "loss": 1.583903431892395, + "step": 1390 + }, + { + "epoch": 0.2533903704377901, + "grad_norm": 12.75, + "learning_rate": 4.9372767838303035e-06, + "loss": 1.6516391038894653, + "step": 1392 + }, + { + "epoch": 0.25375443706198236, + "grad_norm": 13.0625, + "learning_rate": 4.937086273145014e-06, + "loss": 2.0364017486572266, + "step": 1394 + }, + { + "epoch": 0.25411850368617456, + "grad_norm": 14.125, + "learning_rate": 4.936895478200162e-06, + "loss": 1.8258267641067505, + "step": 1396 + }, + { + "epoch": 0.2544825703103668, + "grad_norm": 12.5625, + "learning_rate": 4.936704399023749e-06, + "loss": 1.6076844930648804, + "step": 1398 + }, + { + "epoch": 0.254846636934559, + "grad_norm": 8.3125, + "learning_rate": 4.936513035643814e-06, + "loss": 1.4629310369491577, + "step": 1400 + }, + { + "epoch": 0.25521070355875125, + "grad_norm": 36.0, + "learning_rate": 4.93632138808844e-06, + "loss": 1.5793068408966064, + "step": 1402 + }, + { + "epoch": 0.2555747701829435, + "grad_norm": 48.5, + "learning_rate": 4.936129456385748e-06, + "loss": 1.6921769380569458, + "step": 1404 + }, + { + "epoch": 0.2559388368071357, + "grad_norm": 7.3125, + "learning_rate": 4.935937240563906e-06, + "loss": 1.2959438562393188, + "step": 1406 + }, + { + "epoch": 0.25630290343132794, + "grad_norm": 11.5625, + "learning_rate": 4.935744740651119e-06, + "loss": 1.4843640327453613, + "step": 1408 + }, + { + "epoch": 0.25666697005552014, + "grad_norm": 7.90625, + "learning_rate": 4.935551956675636e-06, + "loss": 1.3071551322937012, + "step": 1410 + }, + { + "epoch": 0.2570310366797124, + "grad_norm": 8.0625, + "learning_rate": 4.9353588886657486e-06, + "loss": 1.5213901996612549, + "step": 1412 + }, + { + "epoch": 0.25739510330390464, + "grad_norm": 10.4375, + "learning_rate": 4.935165536649788e-06, + "loss": 1.3940701484680176, + "step": 1414 + }, + { + "epoch": 0.25775916992809683, + "grad_norm": 7.8125, + "learning_rate": 4.934971900656125e-06, + "loss": 1.3361718654632568, + "step": 1416 + }, + { + "epoch": 0.2581232365522891, + "grad_norm": 21.25, + "learning_rate": 4.934777980713178e-06, + "loss": 1.164435625076294, + "step": 1418 + }, + { + "epoch": 0.2584873031764813, + "grad_norm": 8.3125, + "learning_rate": 4.934583776849404e-06, + "loss": 1.395206332206726, + "step": 1420 + }, + { + "epoch": 0.2588513698006735, + "grad_norm": 10.6875, + "learning_rate": 4.934389289093301e-06, + "loss": 1.7019124031066895, + "step": 1422 + }, + { + "epoch": 0.2592154364248658, + "grad_norm": 18.625, + "learning_rate": 4.93419451747341e-06, + "loss": 2.0475809574127197, + "step": 1424 + }, + { + "epoch": 0.25957950304905797, + "grad_norm": 12.5625, + "learning_rate": 4.933999462018311e-06, + "loss": 1.5055116415023804, + "step": 1426 + }, + { + "epoch": 0.2599435696732502, + "grad_norm": 16.125, + "learning_rate": 4.933804122756628e-06, + "loss": 1.5948069095611572, + "step": 1428 + }, + { + "epoch": 0.2603076362974424, + "grad_norm": 5.25, + "learning_rate": 4.933608499717029e-06, + "loss": 1.4785643815994263, + "step": 1430 + }, + { + "epoch": 0.26067170292163466, + "grad_norm": 23.75, + "learning_rate": 4.933412592928218e-06, + "loss": 0.9416211247444153, + "step": 1432 + }, + { + "epoch": 0.2610357695458269, + "grad_norm": 19.375, + "learning_rate": 4.933216402418943e-06, + "loss": 0.8640943765640259, + "step": 1434 + }, + { + "epoch": 0.2613998361700191, + "grad_norm": 8.5, + "learning_rate": 4.933019928217997e-06, + "loss": 1.7342121601104736, + "step": 1436 + }, + { + "epoch": 0.26176390279421136, + "grad_norm": 6.5, + "learning_rate": 4.932823170354211e-06, + "loss": 1.2391496896743774, + "step": 1438 + }, + { + "epoch": 0.26212796941840355, + "grad_norm": 11.0, + "learning_rate": 4.932626128856457e-06, + "loss": 1.4514696598052979, + "step": 1440 + }, + { + "epoch": 0.2624920360425958, + "grad_norm": 4.09375, + "learning_rate": 4.932428803753651e-06, + "loss": 1.6537638902664185, + "step": 1442 + }, + { + "epoch": 0.26285610266678805, + "grad_norm": 5.65625, + "learning_rate": 4.9322311950747495e-06, + "loss": 1.1202993392944336, + "step": 1444 + }, + { + "epoch": 0.26322016929098024, + "grad_norm": 12.125, + "learning_rate": 4.93203330284875e-06, + "loss": 1.2976114749908447, + "step": 1446 + }, + { + "epoch": 0.2635842359151725, + "grad_norm": 10.75, + "learning_rate": 4.931835127104694e-06, + "loss": 1.507428526878357, + "step": 1448 + }, + { + "epoch": 0.2639483025393647, + "grad_norm": 9.3125, + "learning_rate": 4.931636667871662e-06, + "loss": 1.4791808128356934, + "step": 1450 + }, + { + "epoch": 0.26431236916355694, + "grad_norm": 13.8125, + "learning_rate": 4.931437925178777e-06, + "loss": 1.5374300479888916, + "step": 1452 + }, + { + "epoch": 0.26467643578774913, + "grad_norm": 4.28125, + "learning_rate": 4.931238899055204e-06, + "loss": 1.3075952529907227, + "step": 1454 + }, + { + "epoch": 0.2650405024119414, + "grad_norm": 15.1875, + "learning_rate": 4.931039589530149e-06, + "loss": 1.0934590101242065, + "step": 1456 + }, + { + "epoch": 0.26540456903613363, + "grad_norm": 24.75, + "learning_rate": 4.93083999663286e-06, + "loss": 1.858839750289917, + "step": 1458 + }, + { + "epoch": 0.2657686356603258, + "grad_norm": 64.5, + "learning_rate": 4.930640120392628e-06, + "loss": 1.4588422775268555, + "step": 1460 + }, + { + "epoch": 0.2661327022845181, + "grad_norm": 19.625, + "learning_rate": 4.930439960838781e-06, + "loss": 1.5768492221832275, + "step": 1462 + }, + { + "epoch": 0.26649676890871027, + "grad_norm": 5.96875, + "learning_rate": 4.930239518000693e-06, + "loss": 1.5265140533447266, + "step": 1464 + }, + { + "epoch": 0.2668608355329025, + "grad_norm": 9.4375, + "learning_rate": 4.93003879190778e-06, + "loss": 1.426647424697876, + "step": 1466 + }, + { + "epoch": 0.26722490215709477, + "grad_norm": 24.0, + "learning_rate": 4.929837782589494e-06, + "loss": 1.9441301822662354, + "step": 1468 + }, + { + "epoch": 0.26758896878128696, + "grad_norm": 9.3125, + "learning_rate": 4.9296364900753345e-06, + "loss": 1.1998927593231201, + "step": 1470 + }, + { + "epoch": 0.2679530354054792, + "grad_norm": 12.0625, + "learning_rate": 4.929434914394842e-06, + "loss": 1.5073827505111694, + "step": 1472 + }, + { + "epoch": 0.2683171020296714, + "grad_norm": 9.5625, + "learning_rate": 4.929233055577594e-06, + "loss": 1.5349888801574707, + "step": 1474 + }, + { + "epoch": 0.26868116865386366, + "grad_norm": 11.75, + "learning_rate": 4.9290309136532136e-06, + "loss": 1.5648607015609741, + "step": 1476 + }, + { + "epoch": 0.2690452352780559, + "grad_norm": 10.8125, + "learning_rate": 4.928828488651363e-06, + "loss": 1.2843973636627197, + "step": 1478 + }, + { + "epoch": 0.2694093019022481, + "grad_norm": 8.0, + "learning_rate": 4.928625780601751e-06, + "loss": 1.7909092903137207, + "step": 1480 + }, + { + "epoch": 0.26977336852644035, + "grad_norm": 27.0, + "learning_rate": 4.928422789534121e-06, + "loss": 1.2462284564971924, + "step": 1482 + }, + { + "epoch": 0.27013743515063254, + "grad_norm": 15.125, + "learning_rate": 4.9282195154782605e-06, + "loss": 1.5644384622573853, + "step": 1484 + }, + { + "epoch": 0.2705015017748248, + "grad_norm": 18.25, + "learning_rate": 4.928015958464002e-06, + "loss": 1.4136170148849487, + "step": 1486 + }, + { + "epoch": 0.27086556839901704, + "grad_norm": 10.375, + "learning_rate": 4.927812118521215e-06, + "loss": 1.8244026899337769, + "step": 1488 + }, + { + "epoch": 0.27122963502320924, + "grad_norm": 14.625, + "learning_rate": 4.927607995679812e-06, + "loss": 1.8218878507614136, + "step": 1490 + }, + { + "epoch": 0.2715937016474015, + "grad_norm": 15.75, + "learning_rate": 4.927403589969747e-06, + "loss": 1.861344575881958, + "step": 1492 + }, + { + "epoch": 0.2719577682715937, + "grad_norm": 10.125, + "learning_rate": 4.927198901421018e-06, + "loss": 1.6647491455078125, + "step": 1494 + }, + { + "epoch": 0.27232183489578593, + "grad_norm": 25.75, + "learning_rate": 4.926993930063658e-06, + "loss": 1.1363540887832642, + "step": 1496 + }, + { + "epoch": 0.2726859015199782, + "grad_norm": 39.0, + "learning_rate": 4.92678867592775e-06, + "loss": 1.2200970649719238, + "step": 1498 + }, + { + "epoch": 0.2730499681441704, + "grad_norm": 7.625, + "learning_rate": 4.926583139043412e-06, + "loss": 1.4953429698944092, + "step": 1500 + }, + { + "epoch": 0.2734140347683626, + "grad_norm": 3.609375, + "learning_rate": 4.926377319440806e-06, + "loss": 0.9606925845146179, + "step": 1502 + }, + { + "epoch": 0.2737781013925548, + "grad_norm": 16.125, + "learning_rate": 4.926171217150135e-06, + "loss": 1.4604928493499756, + "step": 1504 + }, + { + "epoch": 0.27414216801674707, + "grad_norm": 34.25, + "learning_rate": 4.925964832201644e-06, + "loss": 1.8967225551605225, + "step": 1506 + }, + { + "epoch": 0.2745062346409393, + "grad_norm": 15.25, + "learning_rate": 4.925758164625619e-06, + "loss": 1.5837507247924805, + "step": 1508 + }, + { + "epoch": 0.2748703012651315, + "grad_norm": 9.875, + "learning_rate": 4.925551214452389e-06, + "loss": 1.5635753870010376, + "step": 1510 + }, + { + "epoch": 0.27523436788932376, + "grad_norm": 9.3125, + "learning_rate": 4.92534398171232e-06, + "loss": 1.6564198732376099, + "step": 1512 + }, + { + "epoch": 0.27559843451351596, + "grad_norm": 27.375, + "learning_rate": 4.925136466435826e-06, + "loss": 2.018310546875, + "step": 1514 + }, + { + "epoch": 0.2759625011377082, + "grad_norm": 112.5, + "learning_rate": 4.9249286686533575e-06, + "loss": 1.6500873565673828, + "step": 1516 + }, + { + "epoch": 0.27632656776190045, + "grad_norm": 12.0, + "learning_rate": 4.924720588395406e-06, + "loss": 1.3833949565887451, + "step": 1518 + }, + { + "epoch": 0.27669063438609265, + "grad_norm": 5.28125, + "learning_rate": 4.924512225692509e-06, + "loss": 1.544885277748108, + "step": 1520 + }, + { + "epoch": 0.2770547010102849, + "grad_norm": 15.0, + "learning_rate": 4.924303580575244e-06, + "loss": 1.1007860898971558, + "step": 1522 + }, + { + "epoch": 0.2774187676344771, + "grad_norm": 33.5, + "learning_rate": 4.924094653074226e-06, + "loss": 1.2831106185913086, + "step": 1524 + }, + { + "epoch": 0.27778283425866934, + "grad_norm": 11.625, + "learning_rate": 4.9238854432201144e-06, + "loss": 1.512636423110962, + "step": 1526 + }, + { + "epoch": 0.27814690088286154, + "grad_norm": 9.9375, + "learning_rate": 4.9236759510436125e-06, + "loss": 1.6301493644714355, + "step": 1528 + }, + { + "epoch": 0.2785109675070538, + "grad_norm": 64.5, + "learning_rate": 4.92346617657546e-06, + "loss": 1.8329832553863525, + "step": 1530 + }, + { + "epoch": 0.27887503413124604, + "grad_norm": 33.5, + "learning_rate": 4.923256119846441e-06, + "loss": 1.811387300491333, + "step": 1532 + }, + { + "epoch": 0.27923910075543823, + "grad_norm": 13.75, + "learning_rate": 4.923045780887381e-06, + "loss": 1.1624234914779663, + "step": 1534 + }, + { + "epoch": 0.2796031673796305, + "grad_norm": 8.5625, + "learning_rate": 4.922835159729145e-06, + "loss": 1.4665863513946533, + "step": 1536 + }, + { + "epoch": 0.2799672340038227, + "grad_norm": 12.5625, + "learning_rate": 4.922624256402644e-06, + "loss": 1.372377634048462, + "step": 1538 + }, + { + "epoch": 0.2803313006280149, + "grad_norm": 8.25, + "learning_rate": 4.922413070938823e-06, + "loss": 0.9352389574050903, + "step": 1540 + }, + { + "epoch": 0.2806953672522072, + "grad_norm": 12.5, + "learning_rate": 4.922201603368676e-06, + "loss": 1.4835054874420166, + "step": 1542 + }, + { + "epoch": 0.28105943387639937, + "grad_norm": 9.9375, + "learning_rate": 4.921989853723234e-06, + "loss": 1.3877606391906738, + "step": 1544 + }, + { + "epoch": 0.2814235005005916, + "grad_norm": 11.6875, + "learning_rate": 4.921777822033569e-06, + "loss": 0.9911926984786987, + "step": 1546 + }, + { + "epoch": 0.2817875671247838, + "grad_norm": 11.0625, + "learning_rate": 4.921565508330797e-06, + "loss": 1.6913743019104004, + "step": 1548 + }, + { + "epoch": 0.28215163374897606, + "grad_norm": 22.125, + "learning_rate": 4.921352912646075e-06, + "loss": 1.3811343908309937, + "step": 1550 + }, + { + "epoch": 0.2825157003731683, + "grad_norm": 36.75, + "learning_rate": 4.921140035010599e-06, + "loss": 1.4110978841781616, + "step": 1552 + }, + { + "epoch": 0.2828797669973605, + "grad_norm": 13.1875, + "learning_rate": 4.920926875455608e-06, + "loss": 1.5414402484893799, + "step": 1554 + }, + { + "epoch": 0.28324383362155275, + "grad_norm": 23.25, + "learning_rate": 4.920713434012384e-06, + "loss": 1.4706974029541016, + "step": 1556 + }, + { + "epoch": 0.28360790024574495, + "grad_norm": 9.9375, + "learning_rate": 4.920499710712247e-06, + "loss": 1.5283700227737427, + "step": 1558 + }, + { + "epoch": 0.2839719668699372, + "grad_norm": 31.0, + "learning_rate": 4.92028570558656e-06, + "loss": 1.2847185134887695, + "step": 1560 + }, + { + "epoch": 0.28433603349412945, + "grad_norm": 4.5625, + "learning_rate": 4.9200714186667284e-06, + "loss": 1.0840245485305786, + "step": 1562 + }, + { + "epoch": 0.28470010011832164, + "grad_norm": 7.53125, + "learning_rate": 4.919856849984198e-06, + "loss": 1.274387240409851, + "step": 1564 + }, + { + "epoch": 0.2850641667425139, + "grad_norm": 10.6875, + "learning_rate": 4.9196419995704545e-06, + "loss": 1.4173680543899536, + "step": 1566 + }, + { + "epoch": 0.2854282333667061, + "grad_norm": 13.4375, + "learning_rate": 4.919426867457028e-06, + "loss": 1.5290443897247314, + "step": 1568 + }, + { + "epoch": 0.28579229999089834, + "grad_norm": 7.71875, + "learning_rate": 4.919211453675486e-06, + "loss": 1.6210980415344238, + "step": 1570 + }, + { + "epoch": 0.2861563666150906, + "grad_norm": 33.5, + "learning_rate": 4.918995758257443e-06, + "loss": 1.1563513278961182, + "step": 1572 + }, + { + "epoch": 0.2865204332392828, + "grad_norm": 16.625, + "learning_rate": 4.918779781234548e-06, + "loss": 1.3691879510879517, + "step": 1574 + }, + { + "epoch": 0.28688449986347503, + "grad_norm": 9.75, + "learning_rate": 4.918563522638498e-06, + "loss": 1.3036433458328247, + "step": 1576 + }, + { + "epoch": 0.2872485664876672, + "grad_norm": 9.25, + "learning_rate": 4.918346982501025e-06, + "loss": 1.6119672060012817, + "step": 1578 + }, + { + "epoch": 0.2876126331118595, + "grad_norm": 42.25, + "learning_rate": 4.918130160853906e-06, + "loss": 1.3039236068725586, + "step": 1580 + }, + { + "epoch": 0.2879766997360517, + "grad_norm": 28.75, + "learning_rate": 4.917913057728961e-06, + "loss": 1.6524569988250732, + "step": 1582 + }, + { + "epoch": 0.2883407663602439, + "grad_norm": 8.375, + "learning_rate": 4.917695673158046e-06, + "loss": 1.3654242753982544, + "step": 1584 + }, + { + "epoch": 0.28870483298443617, + "grad_norm": 4.40625, + "learning_rate": 4.9174780071730635e-06, + "loss": 1.0268429517745972, + "step": 1586 + }, + { + "epoch": 0.28906889960862836, + "grad_norm": 78.0, + "learning_rate": 4.917260059805954e-06, + "loss": 1.1582659482955933, + "step": 1588 + }, + { + "epoch": 0.2894329662328206, + "grad_norm": 4.78125, + "learning_rate": 4.917041831088702e-06, + "loss": 1.3455580472946167, + "step": 1590 + }, + { + "epoch": 0.28979703285701286, + "grad_norm": 29.375, + "learning_rate": 4.916823321053329e-06, + "loss": 1.1064903736114502, + "step": 1592 + }, + { + "epoch": 0.29016109948120505, + "grad_norm": 12.3125, + "learning_rate": 4.916604529731902e-06, + "loss": 1.4881951808929443, + "step": 1594 + }, + { + "epoch": 0.2905251661053973, + "grad_norm": 8.6875, + "learning_rate": 4.916385457156528e-06, + "loss": 1.4490782022476196, + "step": 1596 + }, + { + "epoch": 0.2908892327295895, + "grad_norm": 7.71875, + "learning_rate": 4.916166103359353e-06, + "loss": 1.5107438564300537, + "step": 1598 + }, + { + "epoch": 0.29125329935378175, + "grad_norm": 12.3125, + "learning_rate": 4.91594646837257e-06, + "loss": 1.3751193284988403, + "step": 1600 + }, + { + "epoch": 0.291617365977974, + "grad_norm": 18.625, + "learning_rate": 4.915726552228406e-06, + "loss": 1.193555474281311, + "step": 1602 + }, + { + "epoch": 0.2919814326021662, + "grad_norm": 9.375, + "learning_rate": 4.915506354959135e-06, + "loss": 0.6499952077865601, + "step": 1604 + }, + { + "epoch": 0.29234549922635844, + "grad_norm": 5.125, + "learning_rate": 4.915285876597069e-06, + "loss": 1.266210675239563, + "step": 1606 + }, + { + "epoch": 0.29270956585055063, + "grad_norm": 16.625, + "learning_rate": 4.9150651171745635e-06, + "loss": 1.6331934928894043, + "step": 1608 + }, + { + "epoch": 0.2930736324747429, + "grad_norm": 18.375, + "learning_rate": 4.914844076724012e-06, + "loss": 1.424376130104065, + "step": 1610 + }, + { + "epoch": 0.2934376990989351, + "grad_norm": 8.4375, + "learning_rate": 4.914622755277852e-06, + "loss": 1.4336001873016357, + "step": 1612 + }, + { + "epoch": 0.29380176572312733, + "grad_norm": 8.3125, + "learning_rate": 4.9144011528685635e-06, + "loss": 1.5310535430908203, + "step": 1614 + }, + { + "epoch": 0.2941658323473196, + "grad_norm": 6.875, + "learning_rate": 4.9141792695286625e-06, + "loss": 1.3885955810546875, + "step": 1616 + }, + { + "epoch": 0.29452989897151177, + "grad_norm": 6.375, + "learning_rate": 4.913957105290712e-06, + "loss": 1.2195767164230347, + "step": 1618 + }, + { + "epoch": 0.294893965595704, + "grad_norm": 4.9375, + "learning_rate": 4.913734660187314e-06, + "loss": 1.2497265338897705, + "step": 1620 + }, + { + "epoch": 0.2952580322198962, + "grad_norm": 7.53125, + "learning_rate": 4.913511934251109e-06, + "loss": 1.3131662607192993, + "step": 1622 + }, + { + "epoch": 0.29562209884408847, + "grad_norm": 8.75, + "learning_rate": 4.913288927514782e-06, + "loss": 1.388770580291748, + "step": 1624 + }, + { + "epoch": 0.2959861654682807, + "grad_norm": 7.46875, + "learning_rate": 4.91306564001106e-06, + "loss": 1.2544684410095215, + "step": 1626 + }, + { + "epoch": 0.2963502320924729, + "grad_norm": 12.25, + "learning_rate": 4.912842071772708e-06, + "loss": 1.6028367280960083, + "step": 1628 + }, + { + "epoch": 0.29671429871666516, + "grad_norm": 16.75, + "learning_rate": 4.912618222832534e-06, + "loss": 1.8615212440490723, + "step": 1630 + }, + { + "epoch": 0.29707836534085735, + "grad_norm": 9.25, + "learning_rate": 4.912394093223386e-06, + "loss": 1.0779528617858887, + "step": 1632 + }, + { + "epoch": 0.2974424319650496, + "grad_norm": 27.375, + "learning_rate": 4.912169682978156e-06, + "loss": 1.3994189500808716, + "step": 1634 + }, + { + "epoch": 0.29780649858924185, + "grad_norm": 7.9375, + "learning_rate": 4.911944992129773e-06, + "loss": 0.26486337184906006, + "step": 1636 + }, + { + "epoch": 0.29817056521343405, + "grad_norm": 45.5, + "learning_rate": 4.911720020711212e-06, + "loss": 0.5341527462005615, + "step": 1638 + }, + { + "epoch": 0.2985346318376263, + "grad_norm": 79.5, + "learning_rate": 4.911494768755487e-06, + "loss": 1.3533393144607544, + "step": 1640 + }, + { + "epoch": 0.2988986984618185, + "grad_norm": 20.125, + "learning_rate": 4.91126923629565e-06, + "loss": 1.0230789184570312, + "step": 1642 + }, + { + "epoch": 0.29926276508601074, + "grad_norm": 7.34375, + "learning_rate": 4.911043423364797e-06, + "loss": 1.595827341079712, + "step": 1644 + }, + { + "epoch": 0.299626831710203, + "grad_norm": 7.5, + "learning_rate": 4.9108173299960685e-06, + "loss": 1.4891338348388672, + "step": 1646 + }, + { + "epoch": 0.2999908983343952, + "grad_norm": 5.53125, + "learning_rate": 4.910590956222639e-06, + "loss": 0.9356772899627686, + "step": 1648 + }, + { + "epoch": 0.30035496495858743, + "grad_norm": 7.65625, + "learning_rate": 4.9103643020777304e-06, + "loss": 1.5626338720321655, + "step": 1650 + }, + { + "epoch": 0.30071903158277963, + "grad_norm": 8.3125, + "learning_rate": 4.910137367594601e-06, + "loss": 1.3338781595230103, + "step": 1652 + }, + { + "epoch": 0.3010830982069719, + "grad_norm": 12.3125, + "learning_rate": 4.909910152806556e-06, + "loss": 1.1236662864685059, + "step": 1654 + }, + { + "epoch": 0.3014471648311641, + "grad_norm": 21.625, + "learning_rate": 4.9096826577469355e-06, + "loss": 1.4933905601501465, + "step": 1656 + }, + { + "epoch": 0.3018112314553563, + "grad_norm": 15.5, + "learning_rate": 4.9094548824491254e-06, + "loss": 1.4233638048171997, + "step": 1658 + }, + { + "epoch": 0.30217529807954857, + "grad_norm": 26.625, + "learning_rate": 4.909226826946548e-06, + "loss": 0.7125585079193115, + "step": 1660 + }, + { + "epoch": 0.30253936470374077, + "grad_norm": 33.25, + "learning_rate": 4.908998491272673e-06, + "loss": 1.870632290840149, + "step": 1662 + }, + { + "epoch": 0.302903431327933, + "grad_norm": 9.4375, + "learning_rate": 4.908769875461005e-06, + "loss": 1.4414547681808472, + "step": 1664 + }, + { + "epoch": 0.30326749795212526, + "grad_norm": 23.5, + "learning_rate": 4.908540979545092e-06, + "loss": 1.5465025901794434, + "step": 1666 + }, + { + "epoch": 0.30363156457631746, + "grad_norm": 11.5, + "learning_rate": 4.9083118035585266e-06, + "loss": 1.4594249725341797, + "step": 1668 + }, + { + "epoch": 0.3039956312005097, + "grad_norm": 21.25, + "learning_rate": 4.908082347534937e-06, + "loss": 1.4396207332611084, + "step": 1670 + }, + { + "epoch": 0.3043596978247019, + "grad_norm": 11.75, + "learning_rate": 4.907852611507995e-06, + "loss": 0.8985010981559753, + "step": 1672 + }, + { + "epoch": 0.30472376444889415, + "grad_norm": 14.125, + "learning_rate": 4.907622595511416e-06, + "loss": 1.1935596466064453, + "step": 1674 + }, + { + "epoch": 0.3050878310730864, + "grad_norm": 5.5, + "learning_rate": 4.90739229957895e-06, + "loss": 0.6197656393051147, + "step": 1676 + }, + { + "epoch": 0.3054518976972786, + "grad_norm": 7.1875, + "learning_rate": 4.907161723744395e-06, + "loss": 1.3275530338287354, + "step": 1678 + }, + { + "epoch": 0.30581596432147085, + "grad_norm": 17.75, + "learning_rate": 4.906930868041586e-06, + "loss": 1.3283809423446655, + "step": 1680 + }, + { + "epoch": 0.30618003094566304, + "grad_norm": 7.3125, + "learning_rate": 4.906699732504401e-06, + "loss": 1.6180342435836792, + "step": 1682 + }, + { + "epoch": 0.3065440975698553, + "grad_norm": 5.8125, + "learning_rate": 4.906468317166756e-06, + "loss": 0.9833647608757019, + "step": 1684 + }, + { + "epoch": 0.3069081641940475, + "grad_norm": 8.875, + "learning_rate": 4.9062366220626125e-06, + "loss": 1.3363556861877441, + "step": 1686 + }, + { + "epoch": 0.30727223081823973, + "grad_norm": 8.5, + "learning_rate": 4.9060046472259695e-06, + "loss": 1.4134596586227417, + "step": 1688 + }, + { + "epoch": 0.307636297442432, + "grad_norm": 5.84375, + "learning_rate": 4.905772392690869e-06, + "loss": 1.3034048080444336, + "step": 1690 + }, + { + "epoch": 0.3080003640666242, + "grad_norm": 14.3125, + "learning_rate": 4.905539858491394e-06, + "loss": 1.1930062770843506, + "step": 1692 + }, + { + "epoch": 0.3083644306908164, + "grad_norm": 14.8125, + "learning_rate": 4.9053070446616666e-06, + "loss": 1.4095170497894287, + "step": 1694 + }, + { + "epoch": 0.3087284973150086, + "grad_norm": 36.0, + "learning_rate": 4.905073951235853e-06, + "loss": 1.3340262174606323, + "step": 1696 + }, + { + "epoch": 0.30909256393920087, + "grad_norm": 20.125, + "learning_rate": 4.9048405782481566e-06, + "loss": 0.7319252490997314, + "step": 1698 + }, + { + "epoch": 0.3094566305633931, + "grad_norm": 12.6875, + "learning_rate": 4.904606925732826e-06, + "loss": 1.0980385541915894, + "step": 1700 + }, + { + "epoch": 0.3098206971875853, + "grad_norm": 16.0, + "learning_rate": 4.904372993724146e-06, + "loss": 1.401090383529663, + "step": 1702 + }, + { + "epoch": 0.31018476381177756, + "grad_norm": 11.75, + "learning_rate": 4.904138782256448e-06, + "loss": 1.603826880455017, + "step": 1704 + }, + { + "epoch": 0.31054883043596976, + "grad_norm": 13.125, + "learning_rate": 4.9039042913641e-06, + "loss": 1.4457886219024658, + "step": 1706 + }, + { + "epoch": 0.310912897060162, + "grad_norm": 6.75, + "learning_rate": 4.903669521081513e-06, + "loss": 1.2897664308547974, + "step": 1708 + }, + { + "epoch": 0.31127696368435426, + "grad_norm": 27.125, + "learning_rate": 4.90343447144314e-06, + "loss": 1.230724811553955, + "step": 1710 + }, + { + "epoch": 0.31164103030854645, + "grad_norm": 10.3125, + "learning_rate": 4.9031991424834716e-06, + "loss": 1.1595288515090942, + "step": 1712 + }, + { + "epoch": 0.3120050969327387, + "grad_norm": 6.5, + "learning_rate": 4.902963534237042e-06, + "loss": 1.276283621788025, + "step": 1714 + }, + { + "epoch": 0.3123691635569309, + "grad_norm": 16.75, + "learning_rate": 4.902727646738424e-06, + "loss": 1.395148515701294, + "step": 1716 + }, + { + "epoch": 0.31273323018112315, + "grad_norm": 7.59375, + "learning_rate": 4.902491480022238e-06, + "loss": 1.2297887802124023, + "step": 1718 + }, + { + "epoch": 0.3130972968053154, + "grad_norm": 7.28125, + "learning_rate": 4.9022550341231355e-06, + "loss": 1.1767199039459229, + "step": 1720 + }, + { + "epoch": 0.3134613634295076, + "grad_norm": 13.875, + "learning_rate": 4.902018309075816e-06, + "loss": 1.6155898571014404, + "step": 1722 + }, + { + "epoch": 0.31382543005369984, + "grad_norm": 13.375, + "learning_rate": 4.9017813049150185e-06, + "loss": 1.811079502105713, + "step": 1724 + }, + { + "epoch": 0.31418949667789203, + "grad_norm": 12.125, + "learning_rate": 4.901544021675521e-06, + "loss": 1.4806177616119385, + "step": 1726 + }, + { + "epoch": 0.3145535633020843, + "grad_norm": 9.1875, + "learning_rate": 4.9013064593921456e-06, + "loss": 1.4905201196670532, + "step": 1728 + }, + { + "epoch": 0.31491762992627653, + "grad_norm": 6.46875, + "learning_rate": 4.901068618099752e-06, + "loss": 1.1806972026824951, + "step": 1730 + }, + { + "epoch": 0.3152816965504687, + "grad_norm": 24.5, + "learning_rate": 4.900830497833243e-06, + "loss": 1.4952486753463745, + "step": 1732 + }, + { + "epoch": 0.315645763174661, + "grad_norm": 7.8125, + "learning_rate": 4.9005920986275625e-06, + "loss": 1.4671142101287842, + "step": 1734 + }, + { + "epoch": 0.31600982979885317, + "grad_norm": 22.875, + "learning_rate": 4.900353420517693e-06, + "loss": 1.0870821475982666, + "step": 1736 + }, + { + "epoch": 0.3163738964230454, + "grad_norm": 12.125, + "learning_rate": 4.900114463538661e-06, + "loss": 1.3239197731018066, + "step": 1738 + }, + { + "epoch": 0.31673796304723767, + "grad_norm": 27.375, + "learning_rate": 4.899875227725532e-06, + "loss": 1.5688568353652954, + "step": 1740 + }, + { + "epoch": 0.31710202967142986, + "grad_norm": 8.875, + "learning_rate": 4.899635713113412e-06, + "loss": 1.7790281772613525, + "step": 1742 + }, + { + "epoch": 0.3174660962956221, + "grad_norm": 28.5, + "learning_rate": 4.899395919737451e-06, + "loss": 1.7641477584838867, + "step": 1744 + }, + { + "epoch": 0.3178301629198143, + "grad_norm": 14.125, + "learning_rate": 4.899155847632836e-06, + "loss": 1.9671759605407715, + "step": 1746 + }, + { + "epoch": 0.31819422954400656, + "grad_norm": 6.9375, + "learning_rate": 4.898915496834796e-06, + "loss": 1.2605907917022705, + "step": 1748 + }, + { + "epoch": 0.3185582961681988, + "grad_norm": 6.71875, + "learning_rate": 4.898674867378603e-06, + "loss": 1.093799114227295, + "step": 1750 + }, + { + "epoch": 0.318922362792391, + "grad_norm": 10.875, + "learning_rate": 4.898433959299569e-06, + "loss": 1.577582836151123, + "step": 1752 + }, + { + "epoch": 0.31928642941658325, + "grad_norm": 16.5, + "learning_rate": 4.898192772633043e-06, + "loss": 1.3264565467834473, + "step": 1754 + }, + { + "epoch": 0.31965049604077544, + "grad_norm": 14.0, + "learning_rate": 4.897951307414423e-06, + "loss": 1.5290167331695557, + "step": 1756 + }, + { + "epoch": 0.3200145626649677, + "grad_norm": 30.125, + "learning_rate": 4.897709563679138e-06, + "loss": 1.2932648658752441, + "step": 1758 + }, + { + "epoch": 0.3203786292891599, + "grad_norm": 8.3125, + "learning_rate": 4.897467541462666e-06, + "loss": 1.2476685047149658, + "step": 1760 + }, + { + "epoch": 0.32074269591335214, + "grad_norm": 6.25, + "learning_rate": 4.897225240800523e-06, + "loss": 1.104599118232727, + "step": 1762 + }, + { + "epoch": 0.3211067625375444, + "grad_norm": 7.28125, + "learning_rate": 4.896982661728263e-06, + "loss": 1.6615452766418457, + "step": 1764 + }, + { + "epoch": 0.3214708291617366, + "grad_norm": 10.5, + "learning_rate": 4.896739804281486e-06, + "loss": 1.4220013618469238, + "step": 1766 + }, + { + "epoch": 0.32183489578592883, + "grad_norm": 5.09375, + "learning_rate": 4.89649666849583e-06, + "loss": 1.2345436811447144, + "step": 1768 + }, + { + "epoch": 0.322198962410121, + "grad_norm": 10.4375, + "learning_rate": 4.896253254406973e-06, + "loss": 1.4038641452789307, + "step": 1770 + }, + { + "epoch": 0.3225630290343133, + "grad_norm": 12.125, + "learning_rate": 4.8960095620506364e-06, + "loss": 1.2185895442962646, + "step": 1772 + }, + { + "epoch": 0.3229270956585055, + "grad_norm": 12.375, + "learning_rate": 4.89576559146258e-06, + "loss": 0.5059472322463989, + "step": 1774 + }, + { + "epoch": 0.3232911622826977, + "grad_norm": 9.25, + "learning_rate": 4.895521342678606e-06, + "loss": 1.336120367050171, + "step": 1776 + }, + { + "epoch": 0.32365522890688997, + "grad_norm": 8.5, + "learning_rate": 4.895276815734558e-06, + "loss": 1.6235952377319336, + "step": 1778 + }, + { + "epoch": 0.32401929553108216, + "grad_norm": 18.125, + "learning_rate": 4.895032010666316e-06, + "loss": 1.5352451801300049, + "step": 1780 + }, + { + "epoch": 0.3243833621552744, + "grad_norm": 19.625, + "learning_rate": 4.894786927509808e-06, + "loss": 1.733658790588379, + "step": 1782 + }, + { + "epoch": 0.32474742877946666, + "grad_norm": 7.71875, + "learning_rate": 4.894541566300996e-06, + "loss": 1.5690175294876099, + "step": 1784 + }, + { + "epoch": 0.32511149540365886, + "grad_norm": 10.625, + "learning_rate": 4.894295927075888e-06, + "loss": 1.2554556131362915, + "step": 1786 + }, + { + "epoch": 0.3254755620278511, + "grad_norm": 21.25, + "learning_rate": 4.894050009870529e-06, + "loss": 1.7130169868469238, + "step": 1788 + }, + { + "epoch": 0.3258396286520433, + "grad_norm": 22.125, + "learning_rate": 4.893803814721007e-06, + "loss": 1.9033994674682617, + "step": 1790 + }, + { + "epoch": 0.32620369527623555, + "grad_norm": 14.3125, + "learning_rate": 4.8935573416634515e-06, + "loss": 1.6412822008132935, + "step": 1792 + }, + { + "epoch": 0.3265677619004278, + "grad_norm": 13.5625, + "learning_rate": 4.8933105907340285e-06, + "loss": 1.238384485244751, + "step": 1794 + }, + { + "epoch": 0.32693182852462, + "grad_norm": 17.875, + "learning_rate": 4.893063561968951e-06, + "loss": 0.8862287402153015, + "step": 1796 + }, + { + "epoch": 0.32729589514881224, + "grad_norm": 15.625, + "learning_rate": 4.892816255404465e-06, + "loss": 0.7366995811462402, + "step": 1798 + }, + { + "epoch": 0.32765996177300444, + "grad_norm": 14.875, + "learning_rate": 4.8925686710768665e-06, + "loss": 1.824641227722168, + "step": 1800 + }, + { + "epoch": 0.3280240283971967, + "grad_norm": 10.75, + "learning_rate": 4.892320809022484e-06, + "loss": 1.4951683282852173, + "step": 1802 + }, + { + "epoch": 0.32838809502138894, + "grad_norm": 12.1875, + "learning_rate": 4.892072669277692e-06, + "loss": 1.5310840606689453, + "step": 1804 + }, + { + "epoch": 0.32875216164558113, + "grad_norm": 5.25, + "learning_rate": 4.8918242518789046e-06, + "loss": 1.3562496900558472, + "step": 1806 + }, + { + "epoch": 0.3291162282697734, + "grad_norm": 5.21875, + "learning_rate": 4.891575556862574e-06, + "loss": 1.089478611946106, + "step": 1808 + }, + { + "epoch": 0.3294802948939656, + "grad_norm": 7.09375, + "learning_rate": 4.891326584265198e-06, + "loss": 0.8965445756912231, + "step": 1810 + }, + { + "epoch": 0.3298443615181578, + "grad_norm": 32.0, + "learning_rate": 4.89107733412331e-06, + "loss": 1.2015230655670166, + "step": 1812 + }, + { + "epoch": 0.3302084281423501, + "grad_norm": 34.25, + "learning_rate": 4.890827806473486e-06, + "loss": 1.742699146270752, + "step": 1814 + }, + { + "epoch": 0.33057249476654227, + "grad_norm": 13.3125, + "learning_rate": 4.890578001352345e-06, + "loss": 1.6372833251953125, + "step": 1816 + }, + { + "epoch": 0.3309365613907345, + "grad_norm": 10.1875, + "learning_rate": 4.890327918796543e-06, + "loss": 1.4791233539581299, + "step": 1818 + }, + { + "epoch": 0.3313006280149267, + "grad_norm": 12.0625, + "learning_rate": 4.890077558842782e-06, + "loss": 1.93343186378479, + "step": 1820 + }, + { + "epoch": 0.33166469463911896, + "grad_norm": 12.25, + "learning_rate": 4.889826921527797e-06, + "loss": 1.443381905555725, + "step": 1822 + }, + { + "epoch": 0.3320287612633112, + "grad_norm": 12.0625, + "learning_rate": 4.889576006888372e-06, + "loss": 1.3324389457702637, + "step": 1824 + }, + { + "epoch": 0.3323928278875034, + "grad_norm": 223.0, + "learning_rate": 4.8893248149613235e-06, + "loss": 0.8466244339942932, + "step": 1826 + }, + { + "epoch": 0.33275689451169566, + "grad_norm": 9.0625, + "learning_rate": 4.889073345783517e-06, + "loss": 0.9138334393501282, + "step": 1828 + }, + { + "epoch": 0.33312096113588785, + "grad_norm": 8.8125, + "learning_rate": 4.888821599391852e-06, + "loss": 1.0970096588134766, + "step": 1830 + }, + { + "epoch": 0.3334850277600801, + "grad_norm": 8.5625, + "learning_rate": 4.888569575823272e-06, + "loss": 1.583977222442627, + "step": 1832 + }, + { + "epoch": 0.33384909438427235, + "grad_norm": 163.0, + "learning_rate": 4.8883172751147615e-06, + "loss": 1.3237519264221191, + "step": 1834 + }, + { + "epoch": 0.33421316100846454, + "grad_norm": 11.5625, + "learning_rate": 4.888064697303342e-06, + "loss": 1.1429455280303955, + "step": 1836 + }, + { + "epoch": 0.3345772276326568, + "grad_norm": 37.0, + "learning_rate": 4.88781184242608e-06, + "loss": 1.8034274578094482, + "step": 1838 + }, + { + "epoch": 0.334941294256849, + "grad_norm": 2.421875, + "learning_rate": 4.8875587105200816e-06, + "loss": 0.9290924668312073, + "step": 1840 + }, + { + "epoch": 0.33530536088104124, + "grad_norm": 18.25, + "learning_rate": 4.8873053016224916e-06, + "loss": 1.313586711883545, + "step": 1842 + }, + { + "epoch": 0.33566942750523343, + "grad_norm": 9.9375, + "learning_rate": 4.887051615770497e-06, + "loss": 1.783752202987671, + "step": 1844 + }, + { + "epoch": 0.3360334941294257, + "grad_norm": 36.0, + "learning_rate": 4.886797653001326e-06, + "loss": 1.207167625427246, + "step": 1846 + }, + { + "epoch": 0.33639756075361793, + "grad_norm": 9.1875, + "learning_rate": 4.886543413352245e-06, + "loss": 1.6194645166397095, + "step": 1848 + }, + { + "epoch": 0.3367616273778101, + "grad_norm": 13.25, + "learning_rate": 4.886288896860565e-06, + "loss": 1.6892048120498657, + "step": 1850 + }, + { + "epoch": 0.3371256940020024, + "grad_norm": 12.25, + "learning_rate": 4.886034103563633e-06, + "loss": 1.4255638122558594, + "step": 1852 + }, + { + "epoch": 0.33748976062619457, + "grad_norm": 22.0, + "learning_rate": 4.885779033498838e-06, + "loss": 1.5247515439987183, + "step": 1854 + }, + { + "epoch": 0.3378538272503868, + "grad_norm": 12.0, + "learning_rate": 4.885523686703615e-06, + "loss": 1.8602932691574097, + "step": 1856 + }, + { + "epoch": 0.33821789387457907, + "grad_norm": 8.9375, + "learning_rate": 4.8852680632154305e-06, + "loss": 1.2452037334442139, + "step": 1858 + }, + { + "epoch": 0.33858196049877126, + "grad_norm": 18.375, + "learning_rate": 4.8850121630718e-06, + "loss": 0.8291829228401184, + "step": 1860 + }, + { + "epoch": 0.3389460271229635, + "grad_norm": 13.6875, + "learning_rate": 4.884755986310271e-06, + "loss": 1.5668028593063354, + "step": 1862 + }, + { + "epoch": 0.3393100937471557, + "grad_norm": 18.25, + "learning_rate": 4.8844995329684416e-06, + "loss": 1.7037612199783325, + "step": 1864 + }, + { + "epoch": 0.33967416037134796, + "grad_norm": 11.6875, + "learning_rate": 4.884242803083943e-06, + "loss": 1.2408268451690674, + "step": 1866 + }, + { + "epoch": 0.3400382269955402, + "grad_norm": 21.875, + "learning_rate": 4.883985796694448e-06, + "loss": 1.200797200202942, + "step": 1868 + }, + { + "epoch": 0.3404022936197324, + "grad_norm": 10.875, + "learning_rate": 4.883728513837672e-06, + "loss": 1.2831562757492065, + "step": 1870 + }, + { + "epoch": 0.34076636024392465, + "grad_norm": 14.4375, + "learning_rate": 4.883470954551373e-06, + "loss": 1.3389482498168945, + "step": 1872 + }, + { + "epoch": 0.34113042686811684, + "grad_norm": 22.875, + "learning_rate": 4.883213118873342e-06, + "loss": 1.7483201026916504, + "step": 1874 + }, + { + "epoch": 0.3414944934923091, + "grad_norm": 55.75, + "learning_rate": 4.882955006841419e-06, + "loss": 2.0661890506744385, + "step": 1876 + }, + { + "epoch": 0.34185856011650134, + "grad_norm": 7.5, + "learning_rate": 4.88269661849348e-06, + "loss": 1.3901770114898682, + "step": 1878 + }, + { + "epoch": 0.34222262674069354, + "grad_norm": 19.625, + "learning_rate": 4.882437953867441e-06, + "loss": 1.3201402425765991, + "step": 1880 + }, + { + "epoch": 0.3425866933648858, + "grad_norm": 20.75, + "learning_rate": 4.882179013001262e-06, + "loss": 2.316580295562744, + "step": 1882 + }, + { + "epoch": 0.342950759989078, + "grad_norm": 32.0, + "learning_rate": 4.8819197959329404e-06, + "loss": 1.2262027263641357, + "step": 1884 + }, + { + "epoch": 0.34331482661327023, + "grad_norm": 11.4375, + "learning_rate": 4.881660302700516e-06, + "loss": 0.9581204652786255, + "step": 1886 + }, + { + "epoch": 0.3436788932374625, + "grad_norm": 20.375, + "learning_rate": 4.881400533342068e-06, + "loss": 1.5597262382507324, + "step": 1888 + }, + { + "epoch": 0.3440429598616547, + "grad_norm": 22.0, + "learning_rate": 4.881140487895715e-06, + "loss": 1.6273586750030518, + "step": 1890 + }, + { + "epoch": 0.3444070264858469, + "grad_norm": 16.875, + "learning_rate": 4.8808801663996195e-06, + "loss": 1.4734547138214111, + "step": 1892 + }, + { + "epoch": 0.3447710931100391, + "grad_norm": 14.375, + "learning_rate": 4.880619568891982e-06, + "loss": 1.6394697427749634, + "step": 1894 + }, + { + "epoch": 0.34513515973423137, + "grad_norm": 21.875, + "learning_rate": 4.880358695411045e-06, + "loss": 1.5059329271316528, + "step": 1896 + }, + { + "epoch": 0.3454992263584236, + "grad_norm": 6.28125, + "learning_rate": 4.880097545995089e-06, + "loss": 0.9710061550140381, + "step": 1898 + }, + { + "epoch": 0.3458632929826158, + "grad_norm": 26.0, + "learning_rate": 4.879836120682438e-06, + "loss": 0.9650970697402954, + "step": 1900 + }, + { + "epoch": 0.34622735960680806, + "grad_norm": 16.5, + "learning_rate": 4.879574419511456e-06, + "loss": 0.8568592071533203, + "step": 1902 + }, + { + "epoch": 0.34659142623100025, + "grad_norm": 8.5, + "learning_rate": 4.879312442520543e-06, + "loss": 1.436322808265686, + "step": 1904 + }, + { + "epoch": 0.3469554928551925, + "grad_norm": 16.125, + "learning_rate": 4.879050189748147e-06, + "loss": 1.510069727897644, + "step": 1906 + }, + { + "epoch": 0.34731955947938475, + "grad_norm": 15.4375, + "learning_rate": 4.878787661232749e-06, + "loss": 1.3854396343231201, + "step": 1908 + }, + { + "epoch": 0.34768362610357695, + "grad_norm": 12.0625, + "learning_rate": 4.878524857012877e-06, + "loss": 1.4114618301391602, + "step": 1910 + }, + { + "epoch": 0.3480476927277692, + "grad_norm": 21.625, + "learning_rate": 4.878261777127095e-06, + "loss": 1.2796521186828613, + "step": 1912 + }, + { + "epoch": 0.3484117593519614, + "grad_norm": 2.859375, + "learning_rate": 4.877998421614009e-06, + "loss": 0.7274551391601562, + "step": 1914 + }, + { + "epoch": 0.34877582597615364, + "grad_norm": 4.9375, + "learning_rate": 4.877734790512265e-06, + "loss": 1.4244027137756348, + "step": 1916 + }, + { + "epoch": 0.34913989260034584, + "grad_norm": 5.3125, + "learning_rate": 4.877470883860551e-06, + "loss": 1.1214369535446167, + "step": 1918 + }, + { + "epoch": 0.3495039592245381, + "grad_norm": 11.3125, + "learning_rate": 4.877206701697594e-06, + "loss": 1.326357364654541, + "step": 1920 + }, + { + "epoch": 0.34986802584873034, + "grad_norm": 7.28125, + "learning_rate": 4.8769422440621606e-06, + "loss": 1.282468557357788, + "step": 1922 + }, + { + "epoch": 0.35023209247292253, + "grad_norm": 7.0625, + "learning_rate": 4.876677510993058e-06, + "loss": 0.9851529598236084, + "step": 1924 + }, + { + "epoch": 0.3505961590971148, + "grad_norm": 16.5, + "learning_rate": 4.876412502529138e-06, + "loss": 1.5737724304199219, + "step": 1926 + }, + { + "epoch": 0.350960225721307, + "grad_norm": 9.625, + "learning_rate": 4.876147218709287e-06, + "loss": 1.3451647758483887, + "step": 1928 + }, + { + "epoch": 0.3513242923454992, + "grad_norm": 11.125, + "learning_rate": 4.875881659572436e-06, + "loss": 1.3311004638671875, + "step": 1930 + }, + { + "epoch": 0.3516883589696915, + "grad_norm": 7.34375, + "learning_rate": 4.875615825157553e-06, + "loss": 1.3036038875579834, + "step": 1932 + }, + { + "epoch": 0.35205242559388367, + "grad_norm": 17.375, + "learning_rate": 4.875349715503648e-06, + "loss": 1.5920460224151611, + "step": 1934 + }, + { + "epoch": 0.3524164922180759, + "grad_norm": 15.875, + "learning_rate": 4.875083330649774e-06, + "loss": 1.7447134256362915, + "step": 1936 + }, + { + "epoch": 0.3527805588422681, + "grad_norm": 23.375, + "learning_rate": 4.874816670635019e-06, + "loss": 1.7228376865386963, + "step": 1938 + }, + { + "epoch": 0.35314462546646036, + "grad_norm": 25.375, + "learning_rate": 4.874549735498516e-06, + "loss": 1.0123932361602783, + "step": 1940 + }, + { + "epoch": 0.3535086920906526, + "grad_norm": 19.0, + "learning_rate": 4.8742825252794354e-06, + "loss": 1.7268915176391602, + "step": 1942 + }, + { + "epoch": 0.3538727587148448, + "grad_norm": 22.75, + "learning_rate": 4.874015040016991e-06, + "loss": 2.2342190742492676, + "step": 1944 + }, + { + "epoch": 0.35423682533903705, + "grad_norm": 24.0, + "learning_rate": 4.8737472797504345e-06, + "loss": 1.5835553407669067, + "step": 1946 + }, + { + "epoch": 0.35460089196322925, + "grad_norm": 9.25, + "learning_rate": 4.873479244519058e-06, + "loss": 1.56566321849823, + "step": 1948 + }, + { + "epoch": 0.3549649585874215, + "grad_norm": 21.625, + "learning_rate": 4.873210934362195e-06, + "loss": 1.720061182975769, + "step": 1950 + }, + { + "epoch": 0.35532902521161375, + "grad_norm": 16.875, + "learning_rate": 4.8729423493192185e-06, + "loss": 1.7763686180114746, + "step": 1952 + }, + { + "epoch": 0.35569309183580594, + "grad_norm": 11.5625, + "learning_rate": 4.872673489429542e-06, + "loss": 1.5924266576766968, + "step": 1954 + }, + { + "epoch": 0.3560571584599982, + "grad_norm": 11.875, + "learning_rate": 4.872404354732621e-06, + "loss": 1.2784820795059204, + "step": 1956 + }, + { + "epoch": 0.3564212250841904, + "grad_norm": 15.3125, + "learning_rate": 4.87213494526795e-06, + "loss": 1.552825927734375, + "step": 1958 + }, + { + "epoch": 0.35678529170838263, + "grad_norm": 15.5, + "learning_rate": 4.8718652610750615e-06, + "loss": 1.4479613304138184, + "step": 1960 + }, + { + "epoch": 0.3571493583325749, + "grad_norm": 34.75, + "learning_rate": 4.871595302193533e-06, + "loss": 1.6408932209014893, + "step": 1962 + }, + { + "epoch": 0.3575134249567671, + "grad_norm": 16.375, + "learning_rate": 4.871325068662978e-06, + "loss": 1.9525874853134155, + "step": 1964 + }, + { + "epoch": 0.35787749158095933, + "grad_norm": 7.625, + "learning_rate": 4.871054560523053e-06, + "loss": 1.3724558353424072, + "step": 1966 + }, + { + "epoch": 0.3582415582051515, + "grad_norm": 20.0, + "learning_rate": 4.870783777813454e-06, + "loss": 1.6303695440292358, + "step": 1968 + }, + { + "epoch": 0.35860562482934377, + "grad_norm": 8.8125, + "learning_rate": 4.870512720573918e-06, + "loss": 1.210604190826416, + "step": 1970 + }, + { + "epoch": 0.358969691453536, + "grad_norm": 12.25, + "learning_rate": 4.87024138884422e-06, + "loss": 0.6176000833511353, + "step": 1972 + }, + { + "epoch": 0.3593337580777282, + "grad_norm": 14.1875, + "learning_rate": 4.869969782664178e-06, + "loss": 1.2780102491378784, + "step": 1974 + }, + { + "epoch": 0.35969782470192047, + "grad_norm": 13.0, + "learning_rate": 4.869697902073648e-06, + "loss": 1.7059063911437988, + "step": 1976 + }, + { + "epoch": 0.36006189132611266, + "grad_norm": 14.375, + "learning_rate": 4.869425747112528e-06, + "loss": 1.1936110258102417, + "step": 1978 + }, + { + "epoch": 0.3604259579503049, + "grad_norm": 21.125, + "learning_rate": 4.869153317820757e-06, + "loss": 1.492185354232788, + "step": 1980 + }, + { + "epoch": 0.36079002457449716, + "grad_norm": 8.0, + "learning_rate": 4.8688806142383105e-06, + "loss": 1.5416659116744995, + "step": 1982 + }, + { + "epoch": 0.36115409119868935, + "grad_norm": 11.375, + "learning_rate": 4.868607636405208e-06, + "loss": 1.7989057302474976, + "step": 1984 + }, + { + "epoch": 0.3615181578228816, + "grad_norm": 25.375, + "learning_rate": 4.868334384361508e-06, + "loss": 1.651280164718628, + "step": 1986 + }, + { + "epoch": 0.3618822244470738, + "grad_norm": 13.625, + "learning_rate": 4.8680608581473085e-06, + "loss": 1.0221983194351196, + "step": 1988 + }, + { + "epoch": 0.36224629107126605, + "grad_norm": 22.875, + "learning_rate": 4.867787057802749e-06, + "loss": 1.549391269683838, + "step": 1990 + }, + { + "epoch": 0.3626103576954583, + "grad_norm": 11.6875, + "learning_rate": 4.867512983368009e-06, + "loss": 0.7112131714820862, + "step": 1992 + }, + { + "epoch": 0.3629744243196505, + "grad_norm": 12.0, + "learning_rate": 4.867238634883305e-06, + "loss": 1.44002366065979, + "step": 1994 + }, + { + "epoch": 0.36333849094384274, + "grad_norm": 11.8125, + "learning_rate": 4.8669640123889e-06, + "loss": 1.8782907724380493, + "step": 1996 + }, + { + "epoch": 0.36370255756803493, + "grad_norm": 11.1875, + "learning_rate": 4.866689115925093e-06, + "loss": 1.585323452949524, + "step": 1998 + }, + { + "epoch": 0.3640666241922272, + "grad_norm": 10.1875, + "learning_rate": 4.866413945532221e-06, + "loss": 1.1778922080993652, + "step": 2000 + }, + { + "epoch": 0.3644306908164194, + "grad_norm": 17.875, + "learning_rate": 4.866138501250669e-06, + "loss": 0.9781596660614014, + "step": 2002 + }, + { + "epoch": 0.36479475744061163, + "grad_norm": 7.90625, + "learning_rate": 4.865862783120853e-06, + "loss": 1.3375802040100098, + "step": 2004 + }, + { + "epoch": 0.3651588240648039, + "grad_norm": 2.953125, + "learning_rate": 4.865586791183236e-06, + "loss": 1.044349193572998, + "step": 2006 + }, + { + "epoch": 0.36552289068899607, + "grad_norm": 8.8125, + "learning_rate": 4.865310525478318e-06, + "loss": 1.2266474962234497, + "step": 2008 + }, + { + "epoch": 0.3658869573131883, + "grad_norm": 10.5625, + "learning_rate": 4.865033986046639e-06, + "loss": 1.4242433309555054, + "step": 2010 + }, + { + "epoch": 0.3662510239373805, + "grad_norm": 9.0625, + "learning_rate": 4.8647571729287814e-06, + "loss": 1.3591541051864624, + "step": 2012 + }, + { + "epoch": 0.36661509056157277, + "grad_norm": 8.8125, + "learning_rate": 4.864480086165365e-06, + "loss": 1.4692587852478027, + "step": 2014 + }, + { + "epoch": 0.366979157185765, + "grad_norm": 6.03125, + "learning_rate": 4.864202725797053e-06, + "loss": 1.2231907844543457, + "step": 2016 + }, + { + "epoch": 0.3673432238099572, + "grad_norm": 19.25, + "learning_rate": 4.863925091864547e-06, + "loss": 1.5058244466781616, + "step": 2018 + }, + { + "epoch": 0.36770729043414946, + "grad_norm": 11.0, + "learning_rate": 4.863647184408585e-06, + "loss": 1.4587091207504272, + "step": 2020 + }, + { + "epoch": 0.36807135705834165, + "grad_norm": 19.125, + "learning_rate": 4.8633690034699536e-06, + "loss": 1.0562090873718262, + "step": 2022 + }, + { + "epoch": 0.3684354236825339, + "grad_norm": 5.78125, + "learning_rate": 4.863090549089472e-06, + "loss": 1.3666481971740723, + "step": 2024 + }, + { + "epoch": 0.36879949030672615, + "grad_norm": 11.375, + "learning_rate": 4.862811821308002e-06, + "loss": 1.3786983489990234, + "step": 2026 + }, + { + "epoch": 0.36916355693091835, + "grad_norm": 11.25, + "learning_rate": 4.862532820166447e-06, + "loss": 1.3864390850067139, + "step": 2028 + }, + { + "epoch": 0.3695276235551106, + "grad_norm": 14.1875, + "learning_rate": 4.86225354570575e-06, + "loss": 1.4415440559387207, + "step": 2030 + }, + { + "epoch": 0.3698916901793028, + "grad_norm": 27.875, + "learning_rate": 4.8619739979668904e-06, + "loss": 1.4659253358840942, + "step": 2032 + }, + { + "epoch": 0.37025575680349504, + "grad_norm": 6.625, + "learning_rate": 4.8616941769908935e-06, + "loss": 1.4282618761062622, + "step": 2034 + }, + { + "epoch": 0.3706198234276873, + "grad_norm": 7.875, + "learning_rate": 4.86141408281882e-06, + "loss": 1.1804873943328857, + "step": 2036 + }, + { + "epoch": 0.3709838900518795, + "grad_norm": 5.90625, + "learning_rate": 4.861133715491773e-06, + "loss": 1.314592957496643, + "step": 2038 + }, + { + "epoch": 0.37134795667607173, + "grad_norm": 7.125, + "learning_rate": 4.860853075050899e-06, + "loss": 1.4239892959594727, + "step": 2040 + }, + { + "epoch": 0.3717120233002639, + "grad_norm": 13.375, + "learning_rate": 4.8605721615373744e-06, + "loss": 1.5178523063659668, + "step": 2042 + }, + { + "epoch": 0.3720760899244562, + "grad_norm": 30.5, + "learning_rate": 4.8602909749924265e-06, + "loss": 1.6570950746536255, + "step": 2044 + }, + { + "epoch": 0.3724401565486484, + "grad_norm": 12.625, + "learning_rate": 4.8600095154573164e-06, + "loss": 1.35731840133667, + "step": 2046 + }, + { + "epoch": 0.3728042231728406, + "grad_norm": 23.25, + "learning_rate": 4.859727782973349e-06, + "loss": 1.615955114364624, + "step": 2048 + }, + { + "epoch": 0.37316828979703287, + "grad_norm": 14.625, + "learning_rate": 4.859445777581866e-06, + "loss": 1.263051152229309, + "step": 2050 + }, + { + "epoch": 0.37353235642122506, + "grad_norm": 26.75, + "learning_rate": 4.859163499324251e-06, + "loss": 1.0694100856781006, + "step": 2052 + }, + { + "epoch": 0.3738964230454173, + "grad_norm": 9.3125, + "learning_rate": 4.858880948241926e-06, + "loss": 1.2078285217285156, + "step": 2054 + }, + { + "epoch": 0.37426048966960956, + "grad_norm": 13.4375, + "learning_rate": 4.858598124376356e-06, + "loss": 1.5817310810089111, + "step": 2056 + }, + { + "epoch": 0.37462455629380176, + "grad_norm": 7.0, + "learning_rate": 4.858315027769044e-06, + "loss": 1.4974950551986694, + "step": 2058 + }, + { + "epoch": 0.374988622917994, + "grad_norm": 17.75, + "learning_rate": 4.8580316584615315e-06, + "loss": 1.8317619562149048, + "step": 2060 + }, + { + "epoch": 0.3753526895421862, + "grad_norm": 17.625, + "learning_rate": 4.857748016495404e-06, + "loss": 1.7796279191970825, + "step": 2062 + }, + { + "epoch": 0.37571675616637845, + "grad_norm": 15.0, + "learning_rate": 4.857464101912282e-06, + "loss": 1.926640510559082, + "step": 2064 + }, + { + "epoch": 0.3760808227905707, + "grad_norm": 8.3125, + "learning_rate": 4.8571799147538335e-06, + "loss": 1.4101649522781372, + "step": 2066 + }, + { + "epoch": 0.3764448894147629, + "grad_norm": 17.0, + "learning_rate": 4.856895455061757e-06, + "loss": 1.7048012018203735, + "step": 2068 + }, + { + "epoch": 0.37680895603895515, + "grad_norm": 103.5, + "learning_rate": 4.856610722877799e-06, + "loss": 1.936248779296875, + "step": 2070 + }, + { + "epoch": 0.37717302266314734, + "grad_norm": 13.0, + "learning_rate": 4.856325718243742e-06, + "loss": 1.5187864303588867, + "step": 2072 + }, + { + "epoch": 0.3775370892873396, + "grad_norm": 8.4375, + "learning_rate": 4.856040441201407e-06, + "loss": 1.6555273532867432, + "step": 2074 + }, + { + "epoch": 0.3779011559115318, + "grad_norm": 12.4375, + "learning_rate": 4.85575489179266e-06, + "loss": 0.9584940075874329, + "step": 2076 + }, + { + "epoch": 0.37826522253572403, + "grad_norm": 10.0625, + "learning_rate": 4.8554690700594034e-06, + "loss": 0.6771257519721985, + "step": 2078 + }, + { + "epoch": 0.3786292891599163, + "grad_norm": 2.71875, + "learning_rate": 4.855182976043581e-06, + "loss": 1.0531089305877686, + "step": 2080 + }, + { + "epoch": 0.3789933557841085, + "grad_norm": 11.25, + "learning_rate": 4.854896609787174e-06, + "loss": 1.087500810623169, + "step": 2082 + }, + { + "epoch": 0.3793574224083007, + "grad_norm": 20.125, + "learning_rate": 4.854609971332208e-06, + "loss": 1.4262564182281494, + "step": 2084 + }, + { + "epoch": 0.3797214890324929, + "grad_norm": 10.25, + "learning_rate": 4.854323060720743e-06, + "loss": 1.5136618614196777, + "step": 2086 + }, + { + "epoch": 0.38008555565668517, + "grad_norm": 33.5, + "learning_rate": 4.854035877994886e-06, + "loss": 1.4735723733901978, + "step": 2088 + }, + { + "epoch": 0.3804496222808774, + "grad_norm": 6.75, + "learning_rate": 4.8537484231967766e-06, + "loss": 0.898356556892395, + "step": 2090 + }, + { + "epoch": 0.3808136889050696, + "grad_norm": 13.0625, + "learning_rate": 4.853460696368599e-06, + "loss": 1.290900468826294, + "step": 2092 + }, + { + "epoch": 0.38117775552926186, + "grad_norm": 10.625, + "learning_rate": 4.853172697552575e-06, + "loss": 1.8041828870773315, + "step": 2094 + }, + { + "epoch": 0.38154182215345406, + "grad_norm": 9.8125, + "learning_rate": 4.8528844267909685e-06, + "loss": 1.5967042446136475, + "step": 2096 + }, + { + "epoch": 0.3819058887776463, + "grad_norm": 8.5625, + "learning_rate": 4.8525958841260815e-06, + "loss": 1.62123703956604, + "step": 2098 + }, + { + "epoch": 0.38226995540183856, + "grad_norm": 9.25, + "learning_rate": 4.852307069600256e-06, + "loss": 1.348247766494751, + "step": 2100 + }, + { + "epoch": 0.38263402202603075, + "grad_norm": 14.875, + "learning_rate": 4.852017983255874e-06, + "loss": 1.467861294746399, + "step": 2102 + }, + { + "epoch": 0.382998088650223, + "grad_norm": 18.625, + "learning_rate": 4.8517286251353606e-06, + "loss": 1.3362497091293335, + "step": 2104 + }, + { + "epoch": 0.3833621552744152, + "grad_norm": 13.0, + "learning_rate": 4.851438995281173e-06, + "loss": 1.0168746709823608, + "step": 2106 + }, + { + "epoch": 0.38372622189860744, + "grad_norm": 11.3125, + "learning_rate": 4.8511490937358185e-06, + "loss": 1.527422547340393, + "step": 2108 + }, + { + "epoch": 0.3840902885227997, + "grad_norm": 8.8125, + "learning_rate": 4.850858920541836e-06, + "loss": 1.5718482732772827, + "step": 2110 + }, + { + "epoch": 0.3844543551469919, + "grad_norm": 10.1875, + "learning_rate": 4.850568475741807e-06, + "loss": 1.1038970947265625, + "step": 2112 + }, + { + "epoch": 0.38481842177118414, + "grad_norm": 23.875, + "learning_rate": 4.850277759378355e-06, + "loss": 1.102405309677124, + "step": 2114 + }, + { + "epoch": 0.38518248839537633, + "grad_norm": 17.0, + "learning_rate": 4.849986771494139e-06, + "loss": 1.7098726034164429, + "step": 2116 + }, + { + "epoch": 0.3855465550195686, + "grad_norm": 8.125, + "learning_rate": 4.849695512131864e-06, + "loss": 1.6488326787948608, + "step": 2118 + }, + { + "epoch": 0.38591062164376083, + "grad_norm": 78.0, + "learning_rate": 4.849403981334267e-06, + "loss": 1.7363027334213257, + "step": 2120 + }, + { + "epoch": 0.386274688267953, + "grad_norm": 19.0, + "learning_rate": 4.8491121791441295e-06, + "loss": 1.879165530204773, + "step": 2122 + }, + { + "epoch": 0.3866387548921453, + "grad_norm": 13.625, + "learning_rate": 4.848820105604275e-06, + "loss": 1.5834972858428955, + "step": 2124 + }, + { + "epoch": 0.38700282151633747, + "grad_norm": 10.0, + "learning_rate": 4.8485277607575636e-06, + "loss": 1.5458858013153076, + "step": 2126 + }, + { + "epoch": 0.3873668881405297, + "grad_norm": 8.6875, + "learning_rate": 4.848235144646893e-06, + "loss": 1.4922245740890503, + "step": 2128 + }, + { + "epoch": 0.38773095476472197, + "grad_norm": 5.3125, + "learning_rate": 4.847942257315206e-06, + "loss": 1.1346415281295776, + "step": 2130 + }, + { + "epoch": 0.38809502138891416, + "grad_norm": 10.75, + "learning_rate": 4.847649098805482e-06, + "loss": 1.6507272720336914, + "step": 2132 + }, + { + "epoch": 0.3884590880131064, + "grad_norm": 15.625, + "learning_rate": 4.847355669160739e-06, + "loss": 1.5805333852767944, + "step": 2134 + }, + { + "epoch": 0.3888231546372986, + "grad_norm": 14.6875, + "learning_rate": 4.84706196842404e-06, + "loss": 1.5614655017852783, + "step": 2136 + }, + { + "epoch": 0.38918722126149086, + "grad_norm": 14.1875, + "learning_rate": 4.846767996638482e-06, + "loss": 1.829200029373169, + "step": 2138 + }, + { + "epoch": 0.3895512878856831, + "grad_norm": 12.25, + "learning_rate": 4.846473753847204e-06, + "loss": 1.190377950668335, + "step": 2140 + }, + { + "epoch": 0.3899153545098753, + "grad_norm": 13.9375, + "learning_rate": 4.8461792400933874e-06, + "loss": 1.2341125011444092, + "step": 2142 + }, + { + "epoch": 0.39027942113406755, + "grad_norm": 14.1875, + "learning_rate": 4.8458844554202474e-06, + "loss": 1.4373302459716797, + "step": 2144 + }, + { + "epoch": 0.39064348775825974, + "grad_norm": 15.5, + "learning_rate": 4.8455893998710455e-06, + "loss": 1.9258980751037598, + "step": 2146 + }, + { + "epoch": 0.391007554382452, + "grad_norm": 12.0, + "learning_rate": 4.8452940734890785e-06, + "loss": 1.5115244388580322, + "step": 2148 + }, + { + "epoch": 0.39137162100664424, + "grad_norm": 9.5, + "learning_rate": 4.844998476317685e-06, + "loss": 1.146047592163086, + "step": 2150 + }, + { + "epoch": 0.39173568763083644, + "grad_norm": 7.78125, + "learning_rate": 4.844702608400241e-06, + "loss": 1.5512419939041138, + "step": 2152 + }, + { + "epoch": 0.3920997542550287, + "grad_norm": 7.46875, + "learning_rate": 4.844406469780166e-06, + "loss": 1.4744033813476562, + "step": 2154 + }, + { + "epoch": 0.3924638208792209, + "grad_norm": 8.25, + "learning_rate": 4.844110060500916e-06, + "loss": 1.34213125705719, + "step": 2156 + }, + { + "epoch": 0.39282788750341313, + "grad_norm": 13.875, + "learning_rate": 4.843813380605989e-06, + "loss": 1.1744656562805176, + "step": 2158 + }, + { + "epoch": 0.3931919541276053, + "grad_norm": 17.625, + "learning_rate": 4.84351643013892e-06, + "loss": 2.003048896789551, + "step": 2160 + }, + { + "epoch": 0.3935560207517976, + "grad_norm": 15.375, + "learning_rate": 4.843219209143286e-06, + "loss": 1.7268306016921997, + "step": 2162 + }, + { + "epoch": 0.3939200873759898, + "grad_norm": 54.75, + "learning_rate": 4.842921717662703e-06, + "loss": 1.2337154150009155, + "step": 2164 + }, + { + "epoch": 0.394284154000182, + "grad_norm": 17.25, + "learning_rate": 4.842623955740826e-06, + "loss": 1.8911058902740479, + "step": 2166 + }, + { + "epoch": 0.39464822062437427, + "grad_norm": 10.3125, + "learning_rate": 4.842325923421353e-06, + "loss": 1.064198613166809, + "step": 2168 + }, + { + "epoch": 0.39501228724856646, + "grad_norm": 8.125, + "learning_rate": 4.842027620748014e-06, + "loss": 1.4203481674194336, + "step": 2170 + }, + { + "epoch": 0.3953763538727587, + "grad_norm": 10.375, + "learning_rate": 4.841729047764589e-06, + "loss": 1.555790901184082, + "step": 2172 + }, + { + "epoch": 0.39574042049695096, + "grad_norm": 25.25, + "learning_rate": 4.84143020451489e-06, + "loss": 0.8090052604675293, + "step": 2174 + }, + { + "epoch": 0.39610448712114316, + "grad_norm": 9.4375, + "learning_rate": 4.8411310910427704e-06, + "loss": 0.7757471203804016, + "step": 2176 + }, + { + "epoch": 0.3964685537453354, + "grad_norm": 9.75, + "learning_rate": 4.840831707392125e-06, + "loss": 1.5251039266586304, + "step": 2178 + }, + { + "epoch": 0.3968326203695276, + "grad_norm": 19.375, + "learning_rate": 4.840532053606887e-06, + "loss": 1.5991133451461792, + "step": 2180 + }, + { + "epoch": 0.39719668699371985, + "grad_norm": 12.5625, + "learning_rate": 4.840232129731028e-06, + "loss": 1.8887784481048584, + "step": 2182 + }, + { + "epoch": 0.3975607536179121, + "grad_norm": 11.8125, + "learning_rate": 4.839931935808563e-06, + "loss": 1.771515130996704, + "step": 2184 + }, + { + "epoch": 0.3979248202421043, + "grad_norm": 47.0, + "learning_rate": 4.839631471883542e-06, + "loss": 0.9939151406288147, + "step": 2186 + }, + { + "epoch": 0.39828888686629654, + "grad_norm": 8.125, + "learning_rate": 4.83933073800006e-06, + "loss": 1.5242818593978882, + "step": 2188 + }, + { + "epoch": 0.39865295349048874, + "grad_norm": 11.8125, + "learning_rate": 4.839029734202244e-06, + "loss": 1.2641445398330688, + "step": 2190 + }, + { + "epoch": 0.399017020114681, + "grad_norm": 4.28125, + "learning_rate": 4.838728460534269e-06, + "loss": 1.1251391172409058, + "step": 2192 + }, + { + "epoch": 0.39938108673887324, + "grad_norm": 16.25, + "learning_rate": 4.838426917040343e-06, + "loss": 1.315170168876648, + "step": 2194 + }, + { + "epoch": 0.39974515336306543, + "grad_norm": 100.5, + "learning_rate": 4.838125103764717e-06, + "loss": 1.6331381797790527, + "step": 2196 + }, + { + "epoch": 0.4001092199872577, + "grad_norm": 10.25, + "learning_rate": 4.837823020751683e-06, + "loss": 0.7378606200218201, + "step": 2198 + }, + { + "epoch": 0.4004732866114499, + "grad_norm": 14.5625, + "learning_rate": 4.837520668045568e-06, + "loss": 1.553528904914856, + "step": 2200 + }, + { + "epoch": 0.4008373532356421, + "grad_norm": 9.0, + "learning_rate": 4.837218045690741e-06, + "loss": 1.4627171754837036, + "step": 2202 + }, + { + "epoch": 0.4012014198598344, + "grad_norm": 8.375, + "learning_rate": 4.8369151537316125e-06, + "loss": 1.2701606750488281, + "step": 2204 + }, + { + "epoch": 0.40156548648402657, + "grad_norm": 5.03125, + "learning_rate": 4.836611992212629e-06, + "loss": 1.3050543069839478, + "step": 2206 + }, + { + "epoch": 0.4019295531082188, + "grad_norm": 7.3125, + "learning_rate": 4.836308561178279e-06, + "loss": 1.1303834915161133, + "step": 2208 + }, + { + "epoch": 0.402293619732411, + "grad_norm": 14.3125, + "learning_rate": 4.836004860673089e-06, + "loss": 1.367389440536499, + "step": 2210 + }, + { + "epoch": 0.40265768635660326, + "grad_norm": 36.0, + "learning_rate": 4.835700890741627e-06, + "loss": 1.5948712825775146, + "step": 2212 + }, + { + "epoch": 0.4030217529807955, + "grad_norm": 20.25, + "learning_rate": 4.835396651428499e-06, + "loss": 1.7656869888305664, + "step": 2214 + }, + { + "epoch": 0.4033858196049877, + "grad_norm": 16.75, + "learning_rate": 4.835092142778349e-06, + "loss": 1.8960367441177368, + "step": 2216 + }, + { + "epoch": 0.40374988622917996, + "grad_norm": 22.0, + "learning_rate": 4.834787364835866e-06, + "loss": 2.0225956439971924, + "step": 2218 + }, + { + "epoch": 0.40411395285337215, + "grad_norm": 9.25, + "learning_rate": 4.834482317645772e-06, + "loss": 1.5366857051849365, + "step": 2220 + }, + { + "epoch": 0.4044780194775644, + "grad_norm": 8.375, + "learning_rate": 4.834177001252832e-06, + "loss": 1.5475716590881348, + "step": 2222 + }, + { + "epoch": 0.40484208610175665, + "grad_norm": 8.625, + "learning_rate": 4.833871415701852e-06, + "loss": 1.2871226072311401, + "step": 2224 + }, + { + "epoch": 0.40520615272594884, + "grad_norm": 9.0, + "learning_rate": 4.833565561037672e-06, + "loss": 1.3163812160491943, + "step": 2226 + }, + { + "epoch": 0.4055702193501411, + "grad_norm": 11.5625, + "learning_rate": 4.833259437305178e-06, + "loss": 1.3300800323486328, + "step": 2228 + }, + { + "epoch": 0.4059342859743333, + "grad_norm": 9.4375, + "learning_rate": 4.83295304454929e-06, + "loss": 1.154212236404419, + "step": 2230 + }, + { + "epoch": 0.40629835259852554, + "grad_norm": 10.625, + "learning_rate": 4.832646382814972e-06, + "loss": 1.492110252380371, + "step": 2232 + }, + { + "epoch": 0.40666241922271773, + "grad_norm": 13.3125, + "learning_rate": 4.8323394521472235e-06, + "loss": 1.158155083656311, + "step": 2234 + }, + { + "epoch": 0.40702648584691, + "grad_norm": 18.25, + "learning_rate": 4.832032252591087e-06, + "loss": 1.679713249206543, + "step": 2236 + }, + { + "epoch": 0.40739055247110223, + "grad_norm": 7.625, + "learning_rate": 4.831724784191641e-06, + "loss": 1.5359445810317993, + "step": 2238 + }, + { + "epoch": 0.4077546190952944, + "grad_norm": 7.65625, + "learning_rate": 4.831417046994007e-06, + "loss": 1.4562760591506958, + "step": 2240 + }, + { + "epoch": 0.4081186857194867, + "grad_norm": 9.875, + "learning_rate": 4.831109041043344e-06, + "loss": 1.4791457653045654, + "step": 2242 + }, + { + "epoch": 0.40848275234367887, + "grad_norm": 9.5, + "learning_rate": 4.830800766384849e-06, + "loss": 1.4202136993408203, + "step": 2244 + }, + { + "epoch": 0.4088468189678711, + "grad_norm": 22.375, + "learning_rate": 4.83049222306376e-06, + "loss": 1.6754508018493652, + "step": 2246 + }, + { + "epoch": 0.40921088559206337, + "grad_norm": 9.8125, + "learning_rate": 4.830183411125358e-06, + "loss": 1.6363577842712402, + "step": 2248 + }, + { + "epoch": 0.40957495221625556, + "grad_norm": 3.265625, + "learning_rate": 4.829874330614956e-06, + "loss": 1.4338135719299316, + "step": 2250 + }, + { + "epoch": 0.4099390188404478, + "grad_norm": 7.21875, + "learning_rate": 4.829564981577913e-06, + "loss": 1.124242901802063, + "step": 2252 + }, + { + "epoch": 0.41030308546464, + "grad_norm": 15.75, + "learning_rate": 4.829255364059623e-06, + "loss": 1.6867132186889648, + "step": 2254 + }, + { + "epoch": 0.41066715208883225, + "grad_norm": 18.125, + "learning_rate": 4.828945478105521e-06, + "loss": 1.5656864643096924, + "step": 2256 + }, + { + "epoch": 0.4110312187130245, + "grad_norm": 11.8125, + "learning_rate": 4.828635323761083e-06, + "loss": 1.4033061265945435, + "step": 2258 + }, + { + "epoch": 0.4113952853372167, + "grad_norm": 87.0, + "learning_rate": 4.828324901071823e-06, + "loss": 1.8032457828521729, + "step": 2260 + }, + { + "epoch": 0.41175935196140895, + "grad_norm": 8.75, + "learning_rate": 4.828014210083292e-06, + "loss": 1.4799344539642334, + "step": 2262 + }, + { + "epoch": 0.41212341858560114, + "grad_norm": 12.125, + "learning_rate": 4.8277032508410835e-06, + "loss": 1.2214703559875488, + "step": 2264 + }, + { + "epoch": 0.4124874852097934, + "grad_norm": 19.0, + "learning_rate": 4.8273920233908304e-06, + "loss": 1.7928133010864258, + "step": 2266 + }, + { + "epoch": 0.41285155183398564, + "grad_norm": 10.3125, + "learning_rate": 4.827080527778204e-06, + "loss": 1.4186478853225708, + "step": 2268 + }, + { + "epoch": 0.41321561845817784, + "grad_norm": 13.875, + "learning_rate": 4.8267687640489135e-06, + "loss": 1.2705228328704834, + "step": 2270 + }, + { + "epoch": 0.4135796850823701, + "grad_norm": 13.4375, + "learning_rate": 4.826456732248711e-06, + "loss": 1.5323959589004517, + "step": 2272 + }, + { + "epoch": 0.4139437517065623, + "grad_norm": 16.25, + "learning_rate": 4.8261444324233865e-06, + "loss": 1.7160745859146118, + "step": 2274 + }, + { + "epoch": 0.41430781833075453, + "grad_norm": 8.25, + "learning_rate": 4.825831864618765e-06, + "loss": 1.673983097076416, + "step": 2276 + }, + { + "epoch": 0.4146718849549468, + "grad_norm": 9.6875, + "learning_rate": 4.8255190288807175e-06, + "loss": 1.2232874631881714, + "step": 2278 + }, + { + "epoch": 0.415035951579139, + "grad_norm": 12.625, + "learning_rate": 4.825205925255151e-06, + "loss": 1.3800568580627441, + "step": 2280 + }, + { + "epoch": 0.4154000182033312, + "grad_norm": 13.4375, + "learning_rate": 4.824892553788012e-06, + "loss": 1.5487409830093384, + "step": 2282 + }, + { + "epoch": 0.4157640848275234, + "grad_norm": 46.25, + "learning_rate": 4.8245789145252865e-06, + "loss": 1.6988344192504883, + "step": 2284 + }, + { + "epoch": 0.41612815145171567, + "grad_norm": 5.8125, + "learning_rate": 4.824265007512999e-06, + "loss": 1.2878892421722412, + "step": 2286 + }, + { + "epoch": 0.4164922180759079, + "grad_norm": 13.4375, + "learning_rate": 4.823950832797215e-06, + "loss": 1.5087294578552246, + "step": 2288 + }, + { + "epoch": 0.4168562847001001, + "grad_norm": 5.9375, + "learning_rate": 4.823636390424038e-06, + "loss": 1.465741515159607, + "step": 2290 + }, + { + "epoch": 0.41722035132429236, + "grad_norm": 6.09375, + "learning_rate": 4.823321680439611e-06, + "loss": 1.1088500022888184, + "step": 2292 + }, + { + "epoch": 0.41758441794848455, + "grad_norm": 8.375, + "learning_rate": 4.8230067028901175e-06, + "loss": 1.4532020092010498, + "step": 2294 + }, + { + "epoch": 0.4179484845726768, + "grad_norm": 5.28125, + "learning_rate": 4.822691457821777e-06, + "loss": 1.3000138998031616, + "step": 2296 + }, + { + "epoch": 0.41831255119686905, + "grad_norm": 5.78125, + "learning_rate": 4.822375945280854e-06, + "loss": 1.268698811531067, + "step": 2298 + }, + { + "epoch": 0.41867661782106125, + "grad_norm": 7.0625, + "learning_rate": 4.822060165313645e-06, + "loss": 1.36043381690979, + "step": 2300 + }, + { + "epoch": 0.4190406844452535, + "grad_norm": 13.5, + "learning_rate": 4.821744117966491e-06, + "loss": 1.572618842124939, + "step": 2302 + }, + { + "epoch": 0.4194047510694457, + "grad_norm": 11.25, + "learning_rate": 4.8214278032857706e-06, + "loss": 1.7182421684265137, + "step": 2304 + }, + { + "epoch": 0.41976881769363794, + "grad_norm": 9.8125, + "learning_rate": 4.821111221317901e-06, + "loss": 1.4769285917282104, + "step": 2306 + }, + { + "epoch": 0.42013288431783014, + "grad_norm": 28.0, + "learning_rate": 4.820794372109342e-06, + "loss": 1.5118043422698975, + "step": 2308 + }, + { + "epoch": 0.4204969509420224, + "grad_norm": 17.625, + "learning_rate": 4.820477255706586e-06, + "loss": 1.294557809829712, + "step": 2310 + }, + { + "epoch": 0.42086101756621463, + "grad_norm": 5.09375, + "learning_rate": 4.820159872156172e-06, + "loss": 0.9314954280853271, + "step": 2312 + }, + { + "epoch": 0.42122508419040683, + "grad_norm": 3.0625, + "learning_rate": 4.819842221504671e-06, + "loss": 1.0919597148895264, + "step": 2314 + }, + { + "epoch": 0.4215891508145991, + "grad_norm": 26.25, + "learning_rate": 4.8195243037987e-06, + "loss": 1.3483877182006836, + "step": 2316 + }, + { + "epoch": 0.4219532174387913, + "grad_norm": 16.875, + "learning_rate": 4.819206119084913e-06, + "loss": 0.7155640125274658, + "step": 2318 + }, + { + "epoch": 0.4223172840629835, + "grad_norm": 63.25, + "learning_rate": 4.8188876674099995e-06, + "loss": 1.4566911458969116, + "step": 2320 + }, + { + "epoch": 0.42268135068717577, + "grad_norm": 5.28125, + "learning_rate": 4.818568948820692e-06, + "loss": 1.2753733396530151, + "step": 2322 + }, + { + "epoch": 0.42304541731136797, + "grad_norm": 12.9375, + "learning_rate": 4.81824996336376e-06, + "loss": 0.9926548004150391, + "step": 2324 + }, + { + "epoch": 0.4234094839355602, + "grad_norm": 21.125, + "learning_rate": 4.817930711086017e-06, + "loss": 1.6045634746551514, + "step": 2326 + }, + { + "epoch": 0.4237735505597524, + "grad_norm": 9.75, + "learning_rate": 4.817611192034308e-06, + "loss": 1.9188514947891235, + "step": 2328 + }, + { + "epoch": 0.42413761718394466, + "grad_norm": 3.0625, + "learning_rate": 4.817291406255524e-06, + "loss": 1.0702840089797974, + "step": 2330 + }, + { + "epoch": 0.4245016838081369, + "grad_norm": 11.0625, + "learning_rate": 4.81697135379659e-06, + "loss": 1.0890417098999023, + "step": 2332 + }, + { + "epoch": 0.4248657504323291, + "grad_norm": 15.4375, + "learning_rate": 4.816651034704474e-06, + "loss": 1.7874891757965088, + "step": 2334 + }, + { + "epoch": 0.42522981705652135, + "grad_norm": 11.25, + "learning_rate": 4.81633044902618e-06, + "loss": 1.5855598449707031, + "step": 2336 + }, + { + "epoch": 0.42559388368071355, + "grad_norm": 11.0, + "learning_rate": 4.816009596808754e-06, + "loss": 1.596388578414917, + "step": 2338 + }, + { + "epoch": 0.4259579503049058, + "grad_norm": 53.5, + "learning_rate": 4.815688478099279e-06, + "loss": 0.751289963722229, + "step": 2340 + }, + { + "epoch": 0.42632201692909805, + "grad_norm": 147.0, + "learning_rate": 4.815367092944878e-06, + "loss": 0.5301092863082886, + "step": 2342 + }, + { + "epoch": 0.42668608355329024, + "grad_norm": 25.375, + "learning_rate": 4.8150454413927154e-06, + "loss": 0.9436086416244507, + "step": 2344 + }, + { + "epoch": 0.4270501501774825, + "grad_norm": 11.75, + "learning_rate": 4.814723523489987e-06, + "loss": 1.4477030038833618, + "step": 2346 + }, + { + "epoch": 0.4274142168016747, + "grad_norm": 9.3125, + "learning_rate": 4.814401339283937e-06, + "loss": 1.7095439434051514, + "step": 2348 + }, + { + "epoch": 0.42777828342586693, + "grad_norm": 9.0, + "learning_rate": 4.814078888821844e-06, + "loss": 1.930063247680664, + "step": 2350 + }, + { + "epoch": 0.4281423500500592, + "grad_norm": 16.5, + "learning_rate": 4.813756172151026e-06, + "loss": 1.5254054069519043, + "step": 2352 + }, + { + "epoch": 0.4285064166742514, + "grad_norm": 8.5625, + "learning_rate": 4.8134331893188405e-06, + "loss": 1.5094444751739502, + "step": 2354 + }, + { + "epoch": 0.4288704832984436, + "grad_norm": 3.453125, + "learning_rate": 4.813109940372682e-06, + "loss": 1.1158978939056396, + "step": 2356 + }, + { + "epoch": 0.4292345499226358, + "grad_norm": 5.125, + "learning_rate": 4.812786425359989e-06, + "loss": 0.9549667239189148, + "step": 2358 + }, + { + "epoch": 0.42959861654682807, + "grad_norm": 6.65625, + "learning_rate": 4.812462644328234e-06, + "loss": 1.4353636503219604, + "step": 2360 + }, + { + "epoch": 0.4299626831710203, + "grad_norm": 9.3125, + "learning_rate": 4.812138597324932e-06, + "loss": 1.4717034101486206, + "step": 2362 + }, + { + "epoch": 0.4303267497952125, + "grad_norm": 5.8125, + "learning_rate": 4.8118142843976345e-06, + "loss": 1.329596996307373, + "step": 2364 + }, + { + "epoch": 0.43069081641940477, + "grad_norm": 6.8125, + "learning_rate": 4.811489705593933e-06, + "loss": 1.4275367259979248, + "step": 2366 + }, + { + "epoch": 0.43105488304359696, + "grad_norm": 7.40625, + "learning_rate": 4.811164860961459e-06, + "loss": 1.4369772672653198, + "step": 2368 + }, + { + "epoch": 0.4314189496677892, + "grad_norm": 7.0, + "learning_rate": 4.810839750547882e-06, + "loss": 1.3342591524124146, + "step": 2370 + }, + { + "epoch": 0.43178301629198146, + "grad_norm": 21.5, + "learning_rate": 4.81051437440091e-06, + "loss": 1.489044189453125, + "step": 2372 + }, + { + "epoch": 0.43214708291617365, + "grad_norm": 7.78125, + "learning_rate": 4.810188732568291e-06, + "loss": 1.158496379852295, + "step": 2374 + }, + { + "epoch": 0.4325111495403659, + "grad_norm": 10.375, + "learning_rate": 4.809862825097811e-06, + "loss": 0.47940826416015625, + "step": 2376 + }, + { + "epoch": 0.4328752161645581, + "grad_norm": 8.125, + "learning_rate": 4.8095366520372955e-06, + "loss": 1.3868757486343384, + "step": 2378 + }, + { + "epoch": 0.43323928278875035, + "grad_norm": 8.3125, + "learning_rate": 4.809210213434611e-06, + "loss": 1.3867754936218262, + "step": 2380 + }, + { + "epoch": 0.4336033494129426, + "grad_norm": 21.5, + "learning_rate": 4.8088835093376595e-06, + "loss": 1.4974238872528076, + "step": 2382 + }, + { + "epoch": 0.4339674160371348, + "grad_norm": 10.8125, + "learning_rate": 4.808556539794383e-06, + "loss": 1.639586091041565, + "step": 2384 + }, + { + "epoch": 0.43433148266132704, + "grad_norm": 7.46875, + "learning_rate": 4.808229304852765e-06, + "loss": 1.469184398651123, + "step": 2386 + }, + { + "epoch": 0.43469554928551923, + "grad_norm": 41.75, + "learning_rate": 4.807901804560824e-06, + "loss": 1.4129558801651, + "step": 2388 + }, + { + "epoch": 0.4350596159097115, + "grad_norm": 16.125, + "learning_rate": 4.80757403896662e-06, + "loss": 1.7425603866577148, + "step": 2390 + }, + { + "epoch": 0.4354236825339037, + "grad_norm": 12.125, + "learning_rate": 4.807246008118251e-06, + "loss": 2.0718836784362793, + "step": 2392 + }, + { + "epoch": 0.4357877491580959, + "grad_norm": 9.6875, + "learning_rate": 4.806917712063856e-06, + "loss": 1.6627055406570435, + "step": 2394 + }, + { + "epoch": 0.4361518157822882, + "grad_norm": 13.1875, + "learning_rate": 4.8065891508516074e-06, + "loss": 1.447837471961975, + "step": 2396 + }, + { + "epoch": 0.43651588240648037, + "grad_norm": 8.6875, + "learning_rate": 4.806260324529722e-06, + "loss": 1.491906762123108, + "step": 2398 + }, + { + "epoch": 0.4368799490306726, + "grad_norm": 14.75, + "learning_rate": 4.805931233146457e-06, + "loss": 1.6257590055465698, + "step": 2400 + }, + { + "epoch": 0.4372440156548648, + "grad_norm": 11.8125, + "learning_rate": 4.8056018767501e-06, + "loss": 1.4428527355194092, + "step": 2402 + }, + { + "epoch": 0.43760808227905706, + "grad_norm": 16.5, + "learning_rate": 4.805272255388985e-06, + "loss": 1.5536551475524902, + "step": 2404 + }, + { + "epoch": 0.4379721489032493, + "grad_norm": 27.25, + "learning_rate": 4.804942369111484e-06, + "loss": 1.7104015350341797, + "step": 2406 + }, + { + "epoch": 0.4383362155274415, + "grad_norm": 13.1875, + "learning_rate": 4.804612217966005e-06, + "loss": 1.110038161277771, + "step": 2408 + }, + { + "epoch": 0.43870028215163376, + "grad_norm": 11.9375, + "learning_rate": 4.804281802000995e-06, + "loss": 1.2847239971160889, + "step": 2410 + }, + { + "epoch": 0.43906434877582595, + "grad_norm": 21.5, + "learning_rate": 4.8039511212649436e-06, + "loss": 1.7150689363479614, + "step": 2412 + }, + { + "epoch": 0.4394284154000182, + "grad_norm": 10.875, + "learning_rate": 4.803620175806377e-06, + "loss": 1.3355991840362549, + "step": 2414 + }, + { + "epoch": 0.43979248202421045, + "grad_norm": 10.625, + "learning_rate": 4.803288965673857e-06, + "loss": 2.004887580871582, + "step": 2416 + }, + { + "epoch": 0.44015654864840265, + "grad_norm": 9.3125, + "learning_rate": 4.80295749091599e-06, + "loss": 1.3342925310134888, + "step": 2418 + }, + { + "epoch": 0.4405206152725949, + "grad_norm": 5.1875, + "learning_rate": 4.802625751581419e-06, + "loss": 1.015660285949707, + "step": 2420 + }, + { + "epoch": 0.4408846818967871, + "grad_norm": 22.875, + "learning_rate": 4.8022937477188235e-06, + "loss": 2.085942268371582, + "step": 2422 + }, + { + "epoch": 0.44124874852097934, + "grad_norm": 11.25, + "learning_rate": 4.801961479376925e-06, + "loss": 1.2945802211761475, + "step": 2424 + }, + { + "epoch": 0.4416128151451716, + "grad_norm": 12.0, + "learning_rate": 4.801628946604482e-06, + "loss": 1.389954686164856, + "step": 2426 + }, + { + "epoch": 0.4419768817693638, + "grad_norm": 6.0625, + "learning_rate": 4.801296149450293e-06, + "loss": 1.0629265308380127, + "step": 2428 + }, + { + "epoch": 0.44234094839355603, + "grad_norm": 31.0, + "learning_rate": 4.800963087963193e-06, + "loss": 1.8067104816436768, + "step": 2430 + }, + { + "epoch": 0.4427050150177482, + "grad_norm": 13.875, + "learning_rate": 4.800629762192058e-06, + "loss": 1.1785645484924316, + "step": 2432 + }, + { + "epoch": 0.4430690816419405, + "grad_norm": 7.59375, + "learning_rate": 4.800296172185804e-06, + "loss": 1.1710501909255981, + "step": 2434 + }, + { + "epoch": 0.4434331482661327, + "grad_norm": 11.875, + "learning_rate": 4.799962317993382e-06, + "loss": 1.5772453546524048, + "step": 2436 + }, + { + "epoch": 0.4437972148903249, + "grad_norm": 75.5, + "learning_rate": 4.799628199663785e-06, + "loss": 1.7493717670440674, + "step": 2438 + }, + { + "epoch": 0.44416128151451717, + "grad_norm": 25.375, + "learning_rate": 4.7992938172460434e-06, + "loss": 1.71174955368042, + "step": 2440 + }, + { + "epoch": 0.44452534813870936, + "grad_norm": 6.59375, + "learning_rate": 4.798959170789225e-06, + "loss": 1.2972800731658936, + "step": 2442 + }, + { + "epoch": 0.4448894147629016, + "grad_norm": 10.875, + "learning_rate": 4.798624260342439e-06, + "loss": 1.3126376867294312, + "step": 2444 + }, + { + "epoch": 0.44525348138709386, + "grad_norm": 22.5, + "learning_rate": 4.798289085954833e-06, + "loss": 1.248924732208252, + "step": 2446 + }, + { + "epoch": 0.44561754801128606, + "grad_norm": 11.0, + "learning_rate": 4.79795364767559e-06, + "loss": 1.35087251663208, + "step": 2448 + }, + { + "epoch": 0.4459816146354783, + "grad_norm": 18.5, + "learning_rate": 4.7976179455539365e-06, + "loss": 1.3028209209442139, + "step": 2450 + }, + { + "epoch": 0.4463456812596705, + "grad_norm": 14.9375, + "learning_rate": 4.7972819796391335e-06, + "loss": 2.157160758972168, + "step": 2452 + }, + { + "epoch": 0.44670974788386275, + "grad_norm": 12.1875, + "learning_rate": 4.796945749980485e-06, + "loss": 1.3547319173812866, + "step": 2454 + }, + { + "epoch": 0.447073814508055, + "grad_norm": 14.625, + "learning_rate": 4.79660925662733e-06, + "loss": 1.5041558742523193, + "step": 2456 + }, + { + "epoch": 0.4474378811322472, + "grad_norm": 8.75, + "learning_rate": 4.796272499629048e-06, + "loss": 1.4152911901474, + "step": 2458 + }, + { + "epoch": 0.44780194775643944, + "grad_norm": 6.78125, + "learning_rate": 4.795935479035055e-06, + "loss": 1.4617106914520264, + "step": 2460 + }, + { + "epoch": 0.44816601438063164, + "grad_norm": 30.25, + "learning_rate": 4.795598194894809e-06, + "loss": 1.2192132472991943, + "step": 2462 + }, + { + "epoch": 0.4485300810048239, + "grad_norm": 32.0, + "learning_rate": 4.795260647257805e-06, + "loss": 1.2421975135803223, + "step": 2464 + }, + { + "epoch": 0.4488941476290161, + "grad_norm": 12.9375, + "learning_rate": 4.794922836173576e-06, + "loss": 1.8442548513412476, + "step": 2466 + }, + { + "epoch": 0.44925821425320833, + "grad_norm": 5.75, + "learning_rate": 4.794584761691696e-06, + "loss": 1.2030386924743652, + "step": 2468 + }, + { + "epoch": 0.4496222808774006, + "grad_norm": 5.78125, + "learning_rate": 4.794246423861776e-06, + "loss": 1.364159107208252, + "step": 2470 + }, + { + "epoch": 0.4499863475015928, + "grad_norm": 5.90625, + "learning_rate": 4.793907822733463e-06, + "loss": 1.3991097211837769, + "step": 2472 + }, + { + "epoch": 0.450350414125785, + "grad_norm": 11.5625, + "learning_rate": 4.793568958356448e-06, + "loss": 1.4109032154083252, + "step": 2474 + }, + { + "epoch": 0.4507144807499772, + "grad_norm": 11.5, + "learning_rate": 4.793229830780456e-06, + "loss": 1.5175824165344238, + "step": 2476 + }, + { + "epoch": 0.45107854737416947, + "grad_norm": 11.0625, + "learning_rate": 4.792890440055255e-06, + "loss": 1.4294145107269287, + "step": 2478 + }, + { + "epoch": 0.4514426139983617, + "grad_norm": 9.625, + "learning_rate": 4.792550786230647e-06, + "loss": 1.3272117376327515, + "step": 2480 + }, + { + "epoch": 0.4518066806225539, + "grad_norm": 22.875, + "learning_rate": 4.792210869356476e-06, + "loss": 1.4949346780776978, + "step": 2482 + }, + { + "epoch": 0.45217074724674616, + "grad_norm": 8.0, + "learning_rate": 4.791870689482623e-06, + "loss": 1.4751938581466675, + "step": 2484 + }, + { + "epoch": 0.45253481387093836, + "grad_norm": 6.65625, + "learning_rate": 4.791530246659007e-06, + "loss": 1.1491219997406006, + "step": 2486 + }, + { + "epoch": 0.4528988804951306, + "grad_norm": 20.25, + "learning_rate": 4.791189540935589e-06, + "loss": 1.4928953647613525, + "step": 2488 + }, + { + "epoch": 0.45326294711932286, + "grad_norm": 4.5, + "learning_rate": 4.790848572362365e-06, + "loss": 1.0747045278549194, + "step": 2490 + }, + { + "epoch": 0.45362701374351505, + "grad_norm": 7.25, + "learning_rate": 4.790507340989371e-06, + "loss": 1.4724012613296509, + "step": 2492 + }, + { + "epoch": 0.4539910803677073, + "grad_norm": 17.875, + "learning_rate": 4.79016584686668e-06, + "loss": 1.091583490371704, + "step": 2494 + }, + { + "epoch": 0.4543551469918995, + "grad_norm": 57.0, + "learning_rate": 4.789824090044407e-06, + "loss": 0.5528209209442139, + "step": 2496 + }, + { + "epoch": 0.45471921361609174, + "grad_norm": 12.375, + "learning_rate": 4.789482070572702e-06, + "loss": 1.4132084846496582, + "step": 2498 + }, + { + "epoch": 0.455083280240284, + "grad_norm": 15.25, + "learning_rate": 4.789139788501755e-06, + "loss": 1.5109962224960327, + "step": 2500 + }, + { + "epoch": 0.4554473468644762, + "grad_norm": 10.875, + "learning_rate": 4.788797243881794e-06, + "loss": 1.7004756927490234, + "step": 2502 + }, + { + "epoch": 0.45581141348866844, + "grad_norm": 6.125, + "learning_rate": 4.788454436763088e-06, + "loss": 1.1646795272827148, + "step": 2504 + }, + { + "epoch": 0.45617548011286063, + "grad_norm": 11.0625, + "learning_rate": 4.788111367195941e-06, + "loss": 1.6641459465026855, + "step": 2506 + }, + { + "epoch": 0.4565395467370529, + "grad_norm": 11.1875, + "learning_rate": 4.7877680352306965e-06, + "loss": 2.0821268558502197, + "step": 2508 + }, + { + "epoch": 0.45690361336124513, + "grad_norm": 22.25, + "learning_rate": 4.78742444091774e-06, + "loss": 1.363267421722412, + "step": 2510 + }, + { + "epoch": 0.4572676799854373, + "grad_norm": 18.375, + "learning_rate": 4.78708058430749e-06, + "loss": 1.6625890731811523, + "step": 2512 + }, + { + "epoch": 0.4576317466096296, + "grad_norm": 15.6875, + "learning_rate": 4.786736465450405e-06, + "loss": 1.4096591472625732, + "step": 2514 + }, + { + "epoch": 0.45799581323382177, + "grad_norm": 6.90625, + "learning_rate": 4.786392084396986e-06, + "loss": 1.2796412706375122, + "step": 2516 + }, + { + "epoch": 0.458359879858014, + "grad_norm": 8.125, + "learning_rate": 4.786047441197769e-06, + "loss": 1.179935336112976, + "step": 2518 + }, + { + "epoch": 0.45872394648220627, + "grad_norm": 5.96875, + "learning_rate": 4.785702535903326e-06, + "loss": 1.589585542678833, + "step": 2520 + }, + { + "epoch": 0.45908801310639846, + "grad_norm": 16.0, + "learning_rate": 4.785357368564275e-06, + "loss": 1.413062572479248, + "step": 2522 + }, + { + "epoch": 0.4594520797305907, + "grad_norm": 5.84375, + "learning_rate": 4.785011939231265e-06, + "loss": 1.2646909952163696, + "step": 2524 + }, + { + "epoch": 0.4598161463547829, + "grad_norm": 14.3125, + "learning_rate": 4.784666247954986e-06, + "loss": 1.3635499477386475, + "step": 2526 + }, + { + "epoch": 0.46018021297897516, + "grad_norm": 34.5, + "learning_rate": 4.784320294786168e-06, + "loss": 1.175112009048462, + "step": 2528 + }, + { + "epoch": 0.4605442796031674, + "grad_norm": 10.875, + "learning_rate": 4.783974079775579e-06, + "loss": 1.0801090002059937, + "step": 2530 + }, + { + "epoch": 0.4609083462273596, + "grad_norm": 18.0, + "learning_rate": 4.783627602974023e-06, + "loss": 1.6281940937042236, + "step": 2532 + }, + { + "epoch": 0.46127241285155185, + "grad_norm": 31.0, + "learning_rate": 4.783280864432344e-06, + "loss": 1.5610016584396362, + "step": 2534 + }, + { + "epoch": 0.46163647947574404, + "grad_norm": 14.1875, + "learning_rate": 4.782933864201426e-06, + "loss": 1.4834723472595215, + "step": 2536 + }, + { + "epoch": 0.4620005460999363, + "grad_norm": 5.75, + "learning_rate": 4.782586602332188e-06, + "loss": 1.5038542747497559, + "step": 2538 + }, + { + "epoch": 0.46236461272412854, + "grad_norm": 7.21875, + "learning_rate": 4.782239078875591e-06, + "loss": 1.2938894033432007, + "step": 2540 + }, + { + "epoch": 0.46272867934832074, + "grad_norm": 36.25, + "learning_rate": 4.781891293882632e-06, + "loss": 1.5071697235107422, + "step": 2542 + }, + { + "epoch": 0.463092745972513, + "grad_norm": 12.8125, + "learning_rate": 4.781543247404347e-06, + "loss": 1.4677107334136963, + "step": 2544 + }, + { + "epoch": 0.4634568125967052, + "grad_norm": 13.5625, + "learning_rate": 4.78119493949181e-06, + "loss": 1.4703729152679443, + "step": 2546 + }, + { + "epoch": 0.46382087922089743, + "grad_norm": 11.0, + "learning_rate": 4.780846370196134e-06, + "loss": 1.458452820777893, + "step": 2548 + }, + { + "epoch": 0.4641849458450896, + "grad_norm": 10.0625, + "learning_rate": 4.780497539568471e-06, + "loss": 1.4700268507003784, + "step": 2550 + }, + { + "epoch": 0.4645490124692819, + "grad_norm": 7.8125, + "learning_rate": 4.78014844766001e-06, + "loss": 1.3840463161468506, + "step": 2552 + }, + { + "epoch": 0.4649130790934741, + "grad_norm": 6.03125, + "learning_rate": 4.779799094521979e-06, + "loss": 1.3164231777191162, + "step": 2554 + }, + { + "epoch": 0.4652771457176663, + "grad_norm": 13.8125, + "learning_rate": 4.779449480205642e-06, + "loss": 1.2951946258544922, + "step": 2556 + }, + { + "epoch": 0.46564121234185857, + "grad_norm": 34.0, + "learning_rate": 4.779099604762306e-06, + "loss": 2.0955660343170166, + "step": 2558 + }, + { + "epoch": 0.46600527896605076, + "grad_norm": 14.6875, + "learning_rate": 4.7787494682433136e-06, + "loss": 1.104082703590393, + "step": 2560 + }, + { + "epoch": 0.466369345590243, + "grad_norm": 15.375, + "learning_rate": 4.778399070700045e-06, + "loss": 1.5023795366287231, + "step": 2562 + }, + { + "epoch": 0.46673341221443526, + "grad_norm": 6.9375, + "learning_rate": 4.77804841218392e-06, + "loss": 1.3493629693984985, + "step": 2564 + }, + { + "epoch": 0.46709747883862746, + "grad_norm": 13.1875, + "learning_rate": 4.777697492746397e-06, + "loss": 1.1247923374176025, + "step": 2566 + }, + { + "epoch": 0.4674615454628197, + "grad_norm": 15.0, + "learning_rate": 4.77734631243897e-06, + "loss": 1.4516193866729736, + "step": 2568 + }, + { + "epoch": 0.4678256120870119, + "grad_norm": 7.5625, + "learning_rate": 4.776994871313175e-06, + "loss": 1.3607850074768066, + "step": 2570 + }, + { + "epoch": 0.46818967871120415, + "grad_norm": 6.46875, + "learning_rate": 4.776643169420585e-06, + "loss": 1.03755521774292, + "step": 2572 + }, + { + "epoch": 0.4685537453353964, + "grad_norm": 10.25, + "learning_rate": 4.776291206812809e-06, + "loss": 1.906078815460205, + "step": 2574 + }, + { + "epoch": 0.4689178119595886, + "grad_norm": 22.5, + "learning_rate": 4.775938983541498e-06, + "loss": 1.6876178979873657, + "step": 2576 + }, + { + "epoch": 0.46928187858378084, + "grad_norm": 6.53125, + "learning_rate": 4.775586499658338e-06, + "loss": 1.1927977800369263, + "step": 2578 + }, + { + "epoch": 0.46964594520797304, + "grad_norm": 7.625, + "learning_rate": 4.775233755215055e-06, + "loss": 1.206666350364685, + "step": 2580 + }, + { + "epoch": 0.4700100118321653, + "grad_norm": 18.25, + "learning_rate": 4.774880750263413e-06, + "loss": 1.288048267364502, + "step": 2582 + }, + { + "epoch": 0.47037407845635754, + "grad_norm": 19.875, + "learning_rate": 4.7745274848552135e-06, + "loss": 1.356485366821289, + "step": 2584 + }, + { + "epoch": 0.47073814508054973, + "grad_norm": 51.0, + "learning_rate": 4.7741739590422975e-06, + "loss": 1.0911235809326172, + "step": 2586 + }, + { + "epoch": 0.471102211704742, + "grad_norm": 10.0, + "learning_rate": 4.773820172876543e-06, + "loss": 1.6049859523773193, + "step": 2588 + }, + { + "epoch": 0.4714662783289342, + "grad_norm": 17.375, + "learning_rate": 4.773466126409866e-06, + "loss": 1.4031660556793213, + "step": 2590 + }, + { + "epoch": 0.4718303449531264, + "grad_norm": 3.8125, + "learning_rate": 4.773111819694224e-06, + "loss": 1.2222685813903809, + "step": 2592 + }, + { + "epoch": 0.4721944115773187, + "grad_norm": 3.125, + "learning_rate": 4.772757252781607e-06, + "loss": 1.1153395175933838, + "step": 2594 + }, + { + "epoch": 0.47255847820151087, + "grad_norm": 7.625, + "learning_rate": 4.772402425724047e-06, + "loss": 1.441320538520813, + "step": 2596 + }, + { + "epoch": 0.4729225448257031, + "grad_norm": 17.75, + "learning_rate": 4.772047338573615e-06, + "loss": 1.4108866453170776, + "step": 2598 + }, + { + "epoch": 0.4732866114498953, + "grad_norm": 13.5, + "learning_rate": 4.771691991382417e-06, + "loss": 1.227704405784607, + "step": 2600 + }, + { + "epoch": 0.47365067807408756, + "grad_norm": 3.28125, + "learning_rate": 4.7713363842025995e-06, + "loss": 1.2100099325180054, + "step": 2602 + }, + { + "epoch": 0.4740147446982798, + "grad_norm": 3.734375, + "learning_rate": 4.770980517086346e-06, + "loss": 0.8995495438575745, + "step": 2604 + }, + { + "epoch": 0.474378811322472, + "grad_norm": 11.1875, + "learning_rate": 4.770624390085878e-06, + "loss": 0.4387377202510834, + "step": 2606 + }, + { + "epoch": 0.47474287794666425, + "grad_norm": 101.5, + "learning_rate": 4.7702680032534585e-06, + "loss": 0.9370388984680176, + "step": 2608 + }, + { + "epoch": 0.47510694457085645, + "grad_norm": 8.6875, + "learning_rate": 4.769911356641383e-06, + "loss": 1.2931002378463745, + "step": 2610 + }, + { + "epoch": 0.4754710111950487, + "grad_norm": 9.6875, + "learning_rate": 4.769554450301987e-06, + "loss": 1.5461426973342896, + "step": 2612 + }, + { + "epoch": 0.47583507781924095, + "grad_norm": 48.25, + "learning_rate": 4.769197284287649e-06, + "loss": 1.647145390510559, + "step": 2614 + }, + { + "epoch": 0.47619914444343314, + "grad_norm": 4.75, + "learning_rate": 4.768839858650779e-06, + "loss": 1.3851596117019653, + "step": 2616 + }, + { + "epoch": 0.4765632110676254, + "grad_norm": 2.671875, + "learning_rate": 4.768482173443828e-06, + "loss": 0.8840892314910889, + "step": 2618 + }, + { + "epoch": 0.4769272776918176, + "grad_norm": 10.625, + "learning_rate": 4.768124228719284e-06, + "loss": 1.418744683265686, + "step": 2620 + }, + { + "epoch": 0.47729134431600984, + "grad_norm": 10.9375, + "learning_rate": 4.767766024529677e-06, + "loss": 1.4615778923034668, + "step": 2622 + }, + { + "epoch": 0.47765541094020203, + "grad_norm": 12.9375, + "learning_rate": 4.767407560927569e-06, + "loss": 1.4521212577819824, + "step": 2624 + }, + { + "epoch": 0.4780194775643943, + "grad_norm": 18.0, + "learning_rate": 4.767048837965564e-06, + "loss": 1.855870246887207, + "step": 2626 + }, + { + "epoch": 0.47838354418858653, + "grad_norm": 16.75, + "learning_rate": 4.766689855696302e-06, + "loss": 1.3619643449783325, + "step": 2628 + }, + { + "epoch": 0.4787476108127787, + "grad_norm": 8.4375, + "learning_rate": 4.766330614172465e-06, + "loss": 1.144242525100708, + "step": 2630 + }, + { + "epoch": 0.479111677436971, + "grad_norm": 8.0, + "learning_rate": 4.765971113446769e-06, + "loss": 1.4348689317703247, + "step": 2632 + }, + { + "epoch": 0.47947574406116317, + "grad_norm": 29.375, + "learning_rate": 4.765611353571968e-06, + "loss": 1.5049371719360352, + "step": 2634 + }, + { + "epoch": 0.4798398106853554, + "grad_norm": 4.0625, + "learning_rate": 4.765251334600858e-06, + "loss": 1.0106384754180908, + "step": 2636 + }, + { + "epoch": 0.48020387730954767, + "grad_norm": 5.8125, + "learning_rate": 4.764891056586266e-06, + "loss": 0.9415692090988159, + "step": 2638 + }, + { + "epoch": 0.48056794393373986, + "grad_norm": 18.0, + "learning_rate": 4.764530519581066e-06, + "loss": 1.248544454574585, + "step": 2640 + }, + { + "epoch": 0.4809320105579321, + "grad_norm": 10.0, + "learning_rate": 4.764169723638163e-06, + "loss": 1.505332589149475, + "step": 2642 + }, + { + "epoch": 0.4812960771821243, + "grad_norm": 4.25, + "learning_rate": 4.763808668810501e-06, + "loss": 1.4893392324447632, + "step": 2644 + }, + { + "epoch": 0.48166014380631655, + "grad_norm": 12.1875, + "learning_rate": 4.7634473551510666e-06, + "loss": 1.3953019380569458, + "step": 2646 + }, + { + "epoch": 0.4820242104305088, + "grad_norm": 13.25, + "learning_rate": 4.763085782712879e-06, + "loss": 1.5962615013122559, + "step": 2648 + }, + { + "epoch": 0.482388277054701, + "grad_norm": 9.9375, + "learning_rate": 4.762723951548998e-06, + "loss": 1.1721880435943604, + "step": 2650 + }, + { + "epoch": 0.48275234367889325, + "grad_norm": 7.4375, + "learning_rate": 4.76236186171252e-06, + "loss": 1.3240480422973633, + "step": 2652 + }, + { + "epoch": 0.48311641030308544, + "grad_norm": 12.9375, + "learning_rate": 4.761999513256582e-06, + "loss": 1.4799084663391113, + "step": 2654 + }, + { + "epoch": 0.4834804769272777, + "grad_norm": 16.375, + "learning_rate": 4.761636906234356e-06, + "loss": 1.4244959354400635, + "step": 2656 + }, + { + "epoch": 0.48384454355146994, + "grad_norm": 20.0, + "learning_rate": 4.761274040699053e-06, + "loss": 1.3574916124343872, + "step": 2658 + }, + { + "epoch": 0.48420861017566214, + "grad_norm": 9.0625, + "learning_rate": 4.760910916703922e-06, + "loss": 1.52568781375885, + "step": 2660 + }, + { + "epoch": 0.4845726767998544, + "grad_norm": 7.59375, + "learning_rate": 4.760547534302252e-06, + "loss": 1.3339539766311646, + "step": 2662 + }, + { + "epoch": 0.4849367434240466, + "grad_norm": 8.625, + "learning_rate": 4.760183893547364e-06, + "loss": 1.5742881298065186, + "step": 2664 + }, + { + "epoch": 0.48530081004823883, + "grad_norm": 9.375, + "learning_rate": 4.759819994492625e-06, + "loss": 1.5729178190231323, + "step": 2666 + }, + { + "epoch": 0.4856648766724311, + "grad_norm": 17.125, + "learning_rate": 4.7594558371914325e-06, + "loss": 1.6616425514221191, + "step": 2668 + }, + { + "epoch": 0.4860289432966233, + "grad_norm": 25.375, + "learning_rate": 4.7590914216972275e-06, + "loss": 1.5070042610168457, + "step": 2670 + }, + { + "epoch": 0.4863930099208155, + "grad_norm": 17.375, + "learning_rate": 4.758726748063483e-06, + "loss": 1.5015840530395508, + "step": 2672 + }, + { + "epoch": 0.4867570765450077, + "grad_norm": 15.5, + "learning_rate": 4.758361816343717e-06, + "loss": 1.4561421871185303, + "step": 2674 + }, + { + "epoch": 0.48712114316919997, + "grad_norm": 7.9375, + "learning_rate": 4.757996626591481e-06, + "loss": 1.4493619203567505, + "step": 2676 + }, + { + "epoch": 0.4874852097933922, + "grad_norm": 9.0, + "learning_rate": 4.757631178860365e-06, + "loss": 1.2875139713287354, + "step": 2678 + }, + { + "epoch": 0.4878492764175844, + "grad_norm": 43.5, + "learning_rate": 4.757265473203996e-06, + "loss": 1.3142790794372559, + "step": 2680 + }, + { + "epoch": 0.48821334304177666, + "grad_norm": 16.625, + "learning_rate": 4.75689950967604e-06, + "loss": 0.9488089680671692, + "step": 2682 + }, + { + "epoch": 0.48857740966596885, + "grad_norm": 46.75, + "learning_rate": 4.756533288330202e-06, + "loss": 1.394675374031067, + "step": 2684 + }, + { + "epoch": 0.4889414762901611, + "grad_norm": 18.375, + "learning_rate": 4.756166809220221e-06, + "loss": 1.4688191413879395, + "step": 2686 + }, + { + "epoch": 0.48930554291435335, + "grad_norm": 14.0, + "learning_rate": 4.755800072399879e-06, + "loss": 1.770665168762207, + "step": 2688 + }, + { + "epoch": 0.48966960953854555, + "grad_norm": 11.5, + "learning_rate": 4.755433077922992e-06, + "loss": 1.5805026292800903, + "step": 2690 + }, + { + "epoch": 0.4900336761627378, + "grad_norm": 17.25, + "learning_rate": 4.755065825843413e-06, + "loss": 1.336399793624878, + "step": 2692 + }, + { + "epoch": 0.49039774278693, + "grad_norm": 20.0, + "learning_rate": 4.7546983162150394e-06, + "loss": 0.9021884202957153, + "step": 2694 + }, + { + "epoch": 0.49076180941112224, + "grad_norm": 5.75, + "learning_rate": 4.754330549091798e-06, + "loss": 1.328724980354309, + "step": 2696 + }, + { + "epoch": 0.49112587603531443, + "grad_norm": 7.03125, + "learning_rate": 4.753962524527658e-06, + "loss": 1.1296789646148682, + "step": 2698 + }, + { + "epoch": 0.4914899426595067, + "grad_norm": 9.3125, + "learning_rate": 4.753594242576626e-06, + "loss": 1.4913991689682007, + "step": 2700 + }, + { + "epoch": 0.49185400928369893, + "grad_norm": 41.75, + "learning_rate": 4.753225703292745e-06, + "loss": 1.3883957862854004, + "step": 2702 + }, + { + "epoch": 0.49221807590789113, + "grad_norm": 7.3125, + "learning_rate": 4.7528569067300975e-06, + "loss": 1.2255128622055054, + "step": 2704 + }, + { + "epoch": 0.4925821425320834, + "grad_norm": 13.5, + "learning_rate": 4.752487852942803e-06, + "loss": 1.5752066373825073, + "step": 2706 + }, + { + "epoch": 0.49294620915627557, + "grad_norm": 11.625, + "learning_rate": 4.752118541985019e-06, + "loss": 1.4964804649353027, + "step": 2708 + }, + { + "epoch": 0.4933102757804678, + "grad_norm": 6.03125, + "learning_rate": 4.751748973910939e-06, + "loss": 1.1876988410949707, + "step": 2710 + }, + { + "epoch": 0.49367434240466007, + "grad_norm": 14.0, + "learning_rate": 4.751379148774796e-06, + "loss": 1.5623489618301392, + "step": 2712 + }, + { + "epoch": 0.49403840902885227, + "grad_norm": 21.625, + "learning_rate": 4.75100906663086e-06, + "loss": 1.268662452697754, + "step": 2714 + }, + { + "epoch": 0.4944024756530445, + "grad_norm": 8.0625, + "learning_rate": 4.750638727533442e-06, + "loss": 0.7708040475845337, + "step": 2716 + }, + { + "epoch": 0.4947665422772367, + "grad_norm": 19.25, + "learning_rate": 4.750268131536884e-06, + "loss": 1.5136632919311523, + "step": 2718 + }, + { + "epoch": 0.49513060890142896, + "grad_norm": 11.25, + "learning_rate": 4.74989727869557e-06, + "loss": 1.5816248655319214, + "step": 2720 + }, + { + "epoch": 0.4954946755256212, + "grad_norm": 7.25, + "learning_rate": 4.749526169063923e-06, + "loss": 1.1813924312591553, + "step": 2722 + }, + { + "epoch": 0.4958587421498134, + "grad_norm": 18.0, + "learning_rate": 4.7491548026964e-06, + "loss": 1.3222821950912476, + "step": 2724 + }, + { + "epoch": 0.49622280877400565, + "grad_norm": 14.125, + "learning_rate": 4.7487831796475e-06, + "loss": 1.2531263828277588, + "step": 2726 + }, + { + "epoch": 0.49658687539819785, + "grad_norm": 13.1875, + "learning_rate": 4.748411299971755e-06, + "loss": 1.7887446880340576, + "step": 2728 + }, + { + "epoch": 0.4969509420223901, + "grad_norm": 24.875, + "learning_rate": 4.748039163723738e-06, + "loss": 1.43979012966156, + "step": 2730 + }, + { + "epoch": 0.49731500864658235, + "grad_norm": 8.9375, + "learning_rate": 4.747666770958057e-06, + "loss": 1.304109811782837, + "step": 2732 + }, + { + "epoch": 0.49767907527077454, + "grad_norm": 5.0625, + "learning_rate": 4.74729412172936e-06, + "loss": 1.0966064929962158, + "step": 2734 + }, + { + "epoch": 0.4980431418949668, + "grad_norm": 8.4375, + "learning_rate": 4.746921216092332e-06, + "loss": 1.529699683189392, + "step": 2736 + }, + { + "epoch": 0.498407208519159, + "grad_norm": 9.5625, + "learning_rate": 4.746548054101695e-06, + "loss": 1.3052148818969727, + "step": 2738 + }, + { + "epoch": 0.49877127514335123, + "grad_norm": 11.3125, + "learning_rate": 4.74617463581221e-06, + "loss": 1.584709882736206, + "step": 2740 + }, + { + "epoch": 0.4991353417675435, + "grad_norm": 8.0625, + "learning_rate": 4.745800961278673e-06, + "loss": 1.7206652164459229, + "step": 2742 + }, + { + "epoch": 0.4994994083917357, + "grad_norm": 13.4375, + "learning_rate": 4.745427030555919e-06, + "loss": 1.2229584455490112, + "step": 2744 + }, + { + "epoch": 0.4998634750159279, + "grad_norm": 10.875, + "learning_rate": 4.745052843698824e-06, + "loss": 1.817686915397644, + "step": 2746 + }, + { + "epoch": 0.5002275416401202, + "grad_norm": 15.0625, + "learning_rate": 4.744678400762296e-06, + "loss": 1.8809374570846558, + "step": 2748 + }, + { + "epoch": 0.5005916082643124, + "grad_norm": 9.25, + "learning_rate": 4.7443037018012815e-06, + "loss": 1.4616502523422241, + "step": 2750 + }, + { + "epoch": 0.5009556748885046, + "grad_norm": 11.625, + "learning_rate": 4.74392874687077e-06, + "loss": 1.1442909240722656, + "step": 2752 + }, + { + "epoch": 0.5013197415126969, + "grad_norm": 25.125, + "learning_rate": 4.743553536025781e-06, + "loss": 1.5182840824127197, + "step": 2754 + }, + { + "epoch": 0.5016838081368891, + "grad_norm": 19.5, + "learning_rate": 4.743178069321377e-06, + "loss": 1.602879524230957, + "step": 2756 + }, + { + "epoch": 0.5020478747610813, + "grad_norm": 14.375, + "learning_rate": 4.742802346812656e-06, + "loss": 1.6968035697937012, + "step": 2758 + }, + { + "epoch": 0.5024119413852735, + "grad_norm": 7.375, + "learning_rate": 4.742426368554752e-06, + "loss": 1.487510085105896, + "step": 2760 + }, + { + "epoch": 0.5027760080094658, + "grad_norm": 7.96875, + "learning_rate": 4.742050134602842e-06, + "loss": 1.0827512741088867, + "step": 2762 + }, + { + "epoch": 0.503140074633658, + "grad_norm": 8.4375, + "learning_rate": 4.741673645012134e-06, + "loss": 1.2456951141357422, + "step": 2764 + }, + { + "epoch": 0.5035041412578501, + "grad_norm": 7.375, + "learning_rate": 4.741296899837877e-06, + "loss": 1.3233991861343384, + "step": 2766 + }, + { + "epoch": 0.5038682078820425, + "grad_norm": 19.625, + "learning_rate": 4.740919899135357e-06, + "loss": 1.3172898292541504, + "step": 2768 + }, + { + "epoch": 0.5042322745062346, + "grad_norm": 4.375, + "learning_rate": 4.740542642959897e-06, + "loss": 1.117545485496521, + "step": 2770 + }, + { + "epoch": 0.5045963411304268, + "grad_norm": 12.1875, + "learning_rate": 4.740165131366857e-06, + "loss": 1.4898500442504883, + "step": 2772 + }, + { + "epoch": 0.5049604077546191, + "grad_norm": 11.125, + "learning_rate": 4.739787364411638e-06, + "loss": 1.793648362159729, + "step": 2774 + }, + { + "epoch": 0.5053244743788113, + "grad_norm": 8.4375, + "learning_rate": 4.7394093421496725e-06, + "loss": 1.372380256652832, + "step": 2776 + }, + { + "epoch": 0.5056885410030035, + "grad_norm": 6.9375, + "learning_rate": 4.739031064636436e-06, + "loss": 1.0014395713806152, + "step": 2778 + }, + { + "epoch": 0.5060526076271957, + "grad_norm": 12.875, + "learning_rate": 4.738652531927438e-06, + "loss": 1.356541395187378, + "step": 2780 + }, + { + "epoch": 0.506416674251388, + "grad_norm": 20.25, + "learning_rate": 4.7382737440782265e-06, + "loss": 1.4575536251068115, + "step": 2782 + }, + { + "epoch": 0.5067807408755802, + "grad_norm": 10.9375, + "learning_rate": 4.737894701144389e-06, + "loss": 1.4169225692749023, + "step": 2784 + }, + { + "epoch": 0.5071448074997724, + "grad_norm": 10.5625, + "learning_rate": 4.737515403181546e-06, + "loss": 1.1146185398101807, + "step": 2786 + }, + { + "epoch": 0.5075088741239647, + "grad_norm": 26.375, + "learning_rate": 4.73713585024536e-06, + "loss": 1.268887996673584, + "step": 2788 + }, + { + "epoch": 0.5078729407481569, + "grad_norm": 9.4375, + "learning_rate": 4.736756042391527e-06, + "loss": 1.5824708938598633, + "step": 2790 + }, + { + "epoch": 0.5082370073723491, + "grad_norm": 11.0625, + "learning_rate": 4.736375979675784e-06, + "loss": 1.5918515920639038, + "step": 2792 + }, + { + "epoch": 0.5086010739965414, + "grad_norm": 14.1875, + "learning_rate": 4.735995662153904e-06, + "loss": 1.3004698753356934, + "step": 2794 + }, + { + "epoch": 0.5089651406207336, + "grad_norm": 13.125, + "learning_rate": 4.735615089881694e-06, + "loss": 1.5352630615234375, + "step": 2796 + }, + { + "epoch": 0.5093292072449258, + "grad_norm": 9.375, + "learning_rate": 4.735234262915004e-06, + "loss": 1.4200998544692993, + "step": 2798 + }, + { + "epoch": 0.509693273869118, + "grad_norm": 7.0, + "learning_rate": 4.734853181309719e-06, + "loss": 1.2577595710754395, + "step": 2800 + }, + { + "epoch": 0.5100573404933103, + "grad_norm": 18.0, + "learning_rate": 4.734471845121759e-06, + "loss": 0.9026685953140259, + "step": 2802 + }, + { + "epoch": 0.5104214071175025, + "grad_norm": 13.5625, + "learning_rate": 4.734090254407086e-06, + "loss": 1.338547706604004, + "step": 2804 + }, + { + "epoch": 0.5107854737416947, + "grad_norm": 17.0, + "learning_rate": 4.733708409221695e-06, + "loss": 1.5599079132080078, + "step": 2806 + }, + { + "epoch": 0.511149540365887, + "grad_norm": 23.5, + "learning_rate": 4.733326309621622e-06, + "loss": 1.8223967552185059, + "step": 2808 + }, + { + "epoch": 0.5115136069900792, + "grad_norm": 11.5, + "learning_rate": 4.7329439556629375e-06, + "loss": 1.0494744777679443, + "step": 2810 + }, + { + "epoch": 0.5118776736142714, + "grad_norm": 5.53125, + "learning_rate": 4.732561347401751e-06, + "loss": 1.398725152015686, + "step": 2812 + }, + { + "epoch": 0.5122417402384637, + "grad_norm": 16.375, + "learning_rate": 4.732178484894206e-06, + "loss": 0.9273154139518738, + "step": 2814 + }, + { + "epoch": 0.5126058068626559, + "grad_norm": 14.1875, + "learning_rate": 4.73179536819649e-06, + "loss": 1.0062042474746704, + "step": 2816 + }, + { + "epoch": 0.5129698734868481, + "grad_norm": 9.125, + "learning_rate": 4.731411997364822e-06, + "loss": 1.8211826086044312, + "step": 2818 + }, + { + "epoch": 0.5133339401110403, + "grad_norm": 24.375, + "learning_rate": 4.73102837245546e-06, + "loss": 1.7762036323547363, + "step": 2820 + }, + { + "epoch": 0.5136980067352326, + "grad_norm": 21.0, + "learning_rate": 4.7306444935247e-06, + "loss": 2.1570186614990234, + "step": 2822 + }, + { + "epoch": 0.5140620733594248, + "grad_norm": 14.875, + "learning_rate": 4.730260360628873e-06, + "loss": 2.1902740001678467, + "step": 2824 + }, + { + "epoch": 0.514426139983617, + "grad_norm": 19.375, + "learning_rate": 4.7298759738243505e-06, + "loss": 1.8290989398956299, + "step": 2826 + }, + { + "epoch": 0.5147902066078093, + "grad_norm": 22.0, + "learning_rate": 4.7294913331675406e-06, + "loss": 1.6025868654251099, + "step": 2828 + }, + { + "epoch": 0.5151542732320015, + "grad_norm": 19.375, + "learning_rate": 4.729106438714886e-06, + "loss": 0.5601062178611755, + "step": 2830 + }, + { + "epoch": 0.5155183398561937, + "grad_norm": 13.0625, + "learning_rate": 4.7287212905228684e-06, + "loss": 1.4542829990386963, + "step": 2832 + }, + { + "epoch": 0.5158824064803859, + "grad_norm": 8.875, + "learning_rate": 4.728335888648008e-06, + "loss": 1.4797817468643188, + "step": 2834 + }, + { + "epoch": 0.5162464731045782, + "grad_norm": 27.875, + "learning_rate": 4.72795023314686e-06, + "loss": 1.549726963043213, + "step": 2836 + }, + { + "epoch": 0.5166105397287704, + "grad_norm": 21.0, + "learning_rate": 4.727564324076019e-06, + "loss": 1.5557734966278076, + "step": 2838 + }, + { + "epoch": 0.5169746063529626, + "grad_norm": 18.125, + "learning_rate": 4.727178161492113e-06, + "loss": 0.9821083545684814, + "step": 2840 + }, + { + "epoch": 0.5173386729771549, + "grad_norm": 10.9375, + "learning_rate": 4.726791745451812e-06, + "loss": 1.6308296918869019, + "step": 2842 + }, + { + "epoch": 0.517702739601347, + "grad_norm": 5.90625, + "learning_rate": 4.726405076011821e-06, + "loss": 1.3690816164016724, + "step": 2844 + }, + { + "epoch": 0.5180668062255392, + "grad_norm": 6.40625, + "learning_rate": 4.72601815322888e-06, + "loss": 1.477687120437622, + "step": 2846 + }, + { + "epoch": 0.5184308728497315, + "grad_norm": 11.0625, + "learning_rate": 4.725630977159772e-06, + "loss": 1.0053203105926514, + "step": 2848 + }, + { + "epoch": 0.5187949394739237, + "grad_norm": 12.75, + "learning_rate": 4.72524354786131e-06, + "loss": 1.4586474895477295, + "step": 2850 + }, + { + "epoch": 0.5191590060981159, + "grad_norm": 9.75, + "learning_rate": 4.72485586539035e-06, + "loss": 1.4755802154541016, + "step": 2852 + }, + { + "epoch": 0.5195230727223081, + "grad_norm": 20.875, + "learning_rate": 4.724467929803782e-06, + "loss": 1.0338621139526367, + "step": 2854 + }, + { + "epoch": 0.5198871393465004, + "grad_norm": 9.5, + "learning_rate": 4.7240797411585335e-06, + "loss": 1.3863259553909302, + "step": 2856 + }, + { + "epoch": 0.5202512059706926, + "grad_norm": 18.5, + "learning_rate": 4.72369129951157e-06, + "loss": 1.1654415130615234, + "step": 2858 + }, + { + "epoch": 0.5206152725948848, + "grad_norm": 14.6875, + "learning_rate": 4.723302604919895e-06, + "loss": 1.6258399486541748, + "step": 2860 + }, + { + "epoch": 0.5209793392190771, + "grad_norm": 11.125, + "learning_rate": 4.722913657440545e-06, + "loss": 1.5496981143951416, + "step": 2862 + }, + { + "epoch": 0.5213434058432693, + "grad_norm": 20.625, + "learning_rate": 4.722524457130599e-06, + "loss": 1.591113567352295, + "step": 2864 + }, + { + "epoch": 0.5217074724674615, + "grad_norm": 10.25, + "learning_rate": 4.72213500404717e-06, + "loss": 1.7281275987625122, + "step": 2866 + }, + { + "epoch": 0.5220715390916538, + "grad_norm": 12.5, + "learning_rate": 4.721745298247408e-06, + "loss": 1.8635038137435913, + "step": 2868 + }, + { + "epoch": 0.522435605715846, + "grad_norm": 16.625, + "learning_rate": 4.721355339788501e-06, + "loss": 1.5160801410675049, + "step": 2870 + }, + { + "epoch": 0.5227996723400382, + "grad_norm": 16.25, + "learning_rate": 4.720965128727674e-06, + "loss": 1.5367975234985352, + "step": 2872 + }, + { + "epoch": 0.5231637389642304, + "grad_norm": 10.75, + "learning_rate": 4.72057466512219e-06, + "loss": 1.5876338481903076, + "step": 2874 + }, + { + "epoch": 0.5235278055884227, + "grad_norm": 13.375, + "learning_rate": 4.720183949029344e-06, + "loss": 1.4861215353012085, + "step": 2876 + }, + { + "epoch": 0.5238918722126149, + "grad_norm": 12.6875, + "learning_rate": 4.719792980506477e-06, + "loss": 1.0783206224441528, + "step": 2878 + }, + { + "epoch": 0.5242559388368071, + "grad_norm": 23.125, + "learning_rate": 4.719401759610958e-06, + "loss": 1.9256478548049927, + "step": 2880 + }, + { + "epoch": 0.5246200054609994, + "grad_norm": 18.75, + "learning_rate": 4.7190102864002005e-06, + "loss": 1.8310813903808594, + "step": 2882 + }, + { + "epoch": 0.5249840720851916, + "grad_norm": 16.25, + "learning_rate": 4.71861856093165e-06, + "loss": 1.1610815525054932, + "step": 2884 + }, + { + "epoch": 0.5253481387093838, + "grad_norm": 15.1875, + "learning_rate": 4.718226583262791e-06, + "loss": 1.5273479223251343, + "step": 2886 + }, + { + "epoch": 0.5257122053335761, + "grad_norm": 40.75, + "learning_rate": 4.717834353451143e-06, + "loss": 1.5807608366012573, + "step": 2888 + }, + { + "epoch": 0.5260762719577683, + "grad_norm": 6.46875, + "learning_rate": 4.717441871554266e-06, + "loss": 1.322156548500061, + "step": 2890 + }, + { + "epoch": 0.5264403385819605, + "grad_norm": 40.0, + "learning_rate": 4.7170491376297535e-06, + "loss": 1.035938024520874, + "step": 2892 + }, + { + "epoch": 0.5268044052061527, + "grad_norm": 201.0, + "learning_rate": 4.716656151735241e-06, + "loss": 1.1095421314239502, + "step": 2894 + }, + { + "epoch": 0.527168471830345, + "grad_norm": 7.09375, + "learning_rate": 4.7162629139283935e-06, + "loss": 1.397359848022461, + "step": 2896 + }, + { + "epoch": 0.5275325384545372, + "grad_norm": 6.25, + "learning_rate": 4.715869424266919e-06, + "loss": 1.2935075759887695, + "step": 2898 + }, + { + "epoch": 0.5278966050787294, + "grad_norm": 14.375, + "learning_rate": 4.71547568280856e-06, + "loss": 1.5288337469100952, + "step": 2900 + }, + { + "epoch": 0.5282606717029217, + "grad_norm": 8.5625, + "learning_rate": 4.715081689611097e-06, + "loss": 1.5273665189743042, + "step": 2902 + }, + { + "epoch": 0.5286247383271139, + "grad_norm": 18.0, + "learning_rate": 4.714687444732348e-06, + "loss": 1.7785556316375732, + "step": 2904 + }, + { + "epoch": 0.5289888049513061, + "grad_norm": 44.0, + "learning_rate": 4.714292948230164e-06, + "loss": 1.650891661643982, + "step": 2906 + }, + { + "epoch": 0.5293528715754983, + "grad_norm": 13.25, + "learning_rate": 4.7138982001624365e-06, + "loss": 1.902068018913269, + "step": 2908 + }, + { + "epoch": 0.5297169381996906, + "grad_norm": 19.125, + "learning_rate": 4.713503200587095e-06, + "loss": 1.5662474632263184, + "step": 2910 + }, + { + "epoch": 0.5300810048238828, + "grad_norm": 9.8125, + "learning_rate": 4.7131079495621035e-06, + "loss": 1.4625476598739624, + "step": 2912 + }, + { + "epoch": 0.530445071448075, + "grad_norm": 11.3125, + "learning_rate": 4.712712447145462e-06, + "loss": 1.4308027029037476, + "step": 2914 + }, + { + "epoch": 0.5308091380722673, + "grad_norm": 9.875, + "learning_rate": 4.712316693395211e-06, + "loss": 1.4272058010101318, + "step": 2916 + }, + { + "epoch": 0.5311732046964595, + "grad_norm": 12.375, + "learning_rate": 4.711920688369424e-06, + "loss": 1.459861397743225, + "step": 2918 + }, + { + "epoch": 0.5315372713206516, + "grad_norm": 8.0, + "learning_rate": 4.711524432126214e-06, + "loss": 1.523653268814087, + "step": 2920 + }, + { + "epoch": 0.531901337944844, + "grad_norm": 17.625, + "learning_rate": 4.711127924723729e-06, + "loss": 1.5347280502319336, + "step": 2922 + }, + { + "epoch": 0.5322654045690361, + "grad_norm": 17.125, + "learning_rate": 4.710731166220157e-06, + "loss": 1.5497467517852783, + "step": 2924 + }, + { + "epoch": 0.5326294711932283, + "grad_norm": 6.21875, + "learning_rate": 4.710334156673719e-06, + "loss": 0.8407115340232849, + "step": 2926 + }, + { + "epoch": 0.5329935378174205, + "grad_norm": 23.625, + "learning_rate": 4.709936896142675e-06, + "loss": 0.366423636674881, + "step": 2928 + }, + { + "epoch": 0.5333576044416128, + "grad_norm": 9.8125, + "learning_rate": 4.709539384685321e-06, + "loss": 0.6109656691551208, + "step": 2930 + }, + { + "epoch": 0.533721671065805, + "grad_norm": 18.625, + "learning_rate": 4.70914162235999e-06, + "loss": 0.7239855527877808, + "step": 2932 + }, + { + "epoch": 0.5340857376899972, + "grad_norm": 10.6875, + "learning_rate": 4.708743609225053e-06, + "loss": 1.124316930770874, + "step": 2934 + }, + { + "epoch": 0.5344498043141895, + "grad_norm": 14.625, + "learning_rate": 4.7083453453389165e-06, + "loss": 1.4050620794296265, + "step": 2936 + }, + { + "epoch": 0.5348138709383817, + "grad_norm": 31.0, + "learning_rate": 4.707946830760023e-06, + "loss": 1.4761255979537964, + "step": 2938 + }, + { + "epoch": 0.5351779375625739, + "grad_norm": 9.75, + "learning_rate": 4.707548065546854e-06, + "loss": 1.5079489946365356, + "step": 2940 + }, + { + "epoch": 0.5355420041867662, + "grad_norm": 15.625, + "learning_rate": 4.7071490497579265e-06, + "loss": 1.6067997217178345, + "step": 2942 + }, + { + "epoch": 0.5359060708109584, + "grad_norm": 7.78125, + "learning_rate": 4.706749783451795e-06, + "loss": 1.4546140432357788, + "step": 2944 + }, + { + "epoch": 0.5362701374351506, + "grad_norm": 9.625, + "learning_rate": 4.706350266687049e-06, + "loss": 1.6595127582550049, + "step": 2946 + }, + { + "epoch": 0.5366342040593428, + "grad_norm": 8.625, + "learning_rate": 4.705950499522317e-06, + "loss": 1.277746319770813, + "step": 2948 + }, + { + "epoch": 0.5369982706835351, + "grad_norm": 12.9375, + "learning_rate": 4.705550482016262e-06, + "loss": 1.6249651908874512, + "step": 2950 + }, + { + "epoch": 0.5373623373077273, + "grad_norm": 9.375, + "learning_rate": 4.705150214227587e-06, + "loss": 1.439868688583374, + "step": 2952 + }, + { + "epoch": 0.5377264039319195, + "grad_norm": 23.75, + "learning_rate": 4.7047496962150285e-06, + "loss": 1.6325018405914307, + "step": 2954 + }, + { + "epoch": 0.5380904705561118, + "grad_norm": 135.0, + "learning_rate": 4.70434892803736e-06, + "loss": 1.6851948499679565, + "step": 2956 + }, + { + "epoch": 0.538454537180304, + "grad_norm": 19.125, + "learning_rate": 4.7039479097533935e-06, + "loss": 1.5004594326019287, + "step": 2958 + }, + { + "epoch": 0.5388186038044962, + "grad_norm": 6.84375, + "learning_rate": 4.703546641421977e-06, + "loss": 1.3415923118591309, + "step": 2960 + }, + { + "epoch": 0.5391826704286885, + "grad_norm": 14.125, + "learning_rate": 4.703145123101994e-06, + "loss": 1.4607899188995361, + "step": 2962 + }, + { + "epoch": 0.5395467370528807, + "grad_norm": 14.3125, + "learning_rate": 4.702743354852367e-06, + "loss": 1.4355204105377197, + "step": 2964 + }, + { + "epoch": 0.5399108036770729, + "grad_norm": 5.90625, + "learning_rate": 4.702341336732054e-06, + "loss": 1.134709119796753, + "step": 2966 + }, + { + "epoch": 0.5402748703012651, + "grad_norm": 10.875, + "learning_rate": 4.701939068800048e-06, + "loss": 1.3385690450668335, + "step": 2968 + }, + { + "epoch": 0.5406389369254574, + "grad_norm": 20.625, + "learning_rate": 4.701536551115381e-06, + "loss": 1.7676000595092773, + "step": 2970 + }, + { + "epoch": 0.5410030035496496, + "grad_norm": 34.5, + "learning_rate": 4.701133783737121e-06, + "loss": 1.6422622203826904, + "step": 2972 + }, + { + "epoch": 0.5413670701738418, + "grad_norm": 59.5, + "learning_rate": 4.700730766724372e-06, + "loss": 1.8241934776306152, + "step": 2974 + }, + { + "epoch": 0.5417311367980341, + "grad_norm": 8.6875, + "learning_rate": 4.700327500136276e-06, + "loss": 1.5284518003463745, + "step": 2976 + }, + { + "epoch": 0.5420952034222263, + "grad_norm": 12.125, + "learning_rate": 4.69992398403201e-06, + "loss": 1.5116437673568726, + "step": 2978 + }, + { + "epoch": 0.5424592700464185, + "grad_norm": 12.4375, + "learning_rate": 4.6995202184707885e-06, + "loss": 1.5724998712539673, + "step": 2980 + }, + { + "epoch": 0.5428233366706107, + "grad_norm": 26.25, + "learning_rate": 4.699116203511862e-06, + "loss": 1.6610677242279053, + "step": 2982 + }, + { + "epoch": 0.543187403294803, + "grad_norm": 25.5, + "learning_rate": 4.6987119392145185e-06, + "loss": 2.1597137451171875, + "step": 2984 + }, + { + "epoch": 0.5435514699189952, + "grad_norm": 23.25, + "learning_rate": 4.698307425638083e-06, + "loss": 1.7757792472839355, + "step": 2986 + }, + { + "epoch": 0.5439155365431874, + "grad_norm": 7.15625, + "learning_rate": 4.697902662841915e-06, + "loss": 1.1460011005401611, + "step": 2988 + }, + { + "epoch": 0.5442796031673797, + "grad_norm": 9.125, + "learning_rate": 4.697497650885413e-06, + "loss": 1.556647539138794, + "step": 2990 + }, + { + "epoch": 0.5446436697915719, + "grad_norm": 16.25, + "learning_rate": 4.69709238982801e-06, + "loss": 1.4367669820785522, + "step": 2992 + }, + { + "epoch": 0.545007736415764, + "grad_norm": 27.5, + "learning_rate": 4.696686879729176e-06, + "loss": 1.2195615768432617, + "step": 2994 + }, + { + "epoch": 0.5453718030399564, + "grad_norm": 8.125, + "learning_rate": 4.6962811206484185e-06, + "loss": 0.39954930543899536, + "step": 2996 + }, + { + "epoch": 0.5457358696641486, + "grad_norm": 8.875, + "learning_rate": 4.6958751126452804e-06, + "loss": 1.2129322290420532, + "step": 2998 + }, + { + "epoch": 0.5460999362883407, + "grad_norm": 15.0, + "learning_rate": 4.695468855779343e-06, + "loss": 1.3926241397857666, + "step": 3000 + }, + { + "epoch": 0.5464640029125329, + "grad_norm": 10.0, + "learning_rate": 4.695062350110221e-06, + "loss": 1.4251070022583008, + "step": 3002 + }, + { + "epoch": 0.5468280695367252, + "grad_norm": 9.0, + "learning_rate": 4.694655595697571e-06, + "loss": 1.6396093368530273, + "step": 3004 + }, + { + "epoch": 0.5471921361609174, + "grad_norm": 12.3125, + "learning_rate": 4.694248592601077e-06, + "loss": 1.3367325067520142, + "step": 3006 + }, + { + "epoch": 0.5475562027851096, + "grad_norm": 8.5, + "learning_rate": 4.693841340880471e-06, + "loss": 0.914564847946167, + "step": 3008 + }, + { + "epoch": 0.5479202694093019, + "grad_norm": 6.53125, + "learning_rate": 4.693433840595511e-06, + "loss": 1.34604811668396, + "step": 3010 + }, + { + "epoch": 0.5482843360334941, + "grad_norm": 8.9375, + "learning_rate": 4.693026091805999e-06, + "loss": 1.4301847219467163, + "step": 3012 + }, + { + "epoch": 0.5486484026576863, + "grad_norm": 7.6875, + "learning_rate": 4.692618094571769e-06, + "loss": 1.2869274616241455, + "step": 3014 + }, + { + "epoch": 0.5490124692818786, + "grad_norm": 11.875, + "learning_rate": 4.6922098489526926e-06, + "loss": 1.4045214653015137, + "step": 3016 + }, + { + "epoch": 0.5493765359060708, + "grad_norm": 14.9375, + "learning_rate": 4.691801355008679e-06, + "loss": 1.9691683053970337, + "step": 3018 + }, + { + "epoch": 0.549740602530263, + "grad_norm": 8.4375, + "learning_rate": 4.691392612799673e-06, + "loss": 1.390626072883606, + "step": 3020 + }, + { + "epoch": 0.5501046691544552, + "grad_norm": 13.1875, + "learning_rate": 4.6909836223856555e-06, + "loss": 1.4834929704666138, + "step": 3022 + }, + { + "epoch": 0.5504687357786475, + "grad_norm": 8.375, + "learning_rate": 4.690574383826645e-06, + "loss": 1.3541334867477417, + "step": 3024 + }, + { + "epoch": 0.5508328024028397, + "grad_norm": 10.9375, + "learning_rate": 4.6901648971826945e-06, + "loss": 1.6976135969161987, + "step": 3026 + }, + { + "epoch": 0.5511968690270319, + "grad_norm": 5.34375, + "learning_rate": 4.689755162513895e-06, + "loss": 1.4011552333831787, + "step": 3028 + }, + { + "epoch": 0.5515609356512242, + "grad_norm": 4.15625, + "learning_rate": 4.689345179880374e-06, + "loss": 1.071542739868164, + "step": 3030 + }, + { + "epoch": 0.5519250022754164, + "grad_norm": 8.9375, + "learning_rate": 4.688934949342293e-06, + "loss": 1.0931841135025024, + "step": 3032 + }, + { + "epoch": 0.5522890688996086, + "grad_norm": 16.375, + "learning_rate": 4.688524470959853e-06, + "loss": 1.6992199420928955, + "step": 3034 + }, + { + "epoch": 0.5526531355238009, + "grad_norm": 16.875, + "learning_rate": 4.68811374479329e-06, + "loss": 1.6201632022857666, + "step": 3036 + }, + { + "epoch": 0.5530172021479931, + "grad_norm": 46.25, + "learning_rate": 4.687702770902877e-06, + "loss": 1.4474477767944336, + "step": 3038 + }, + { + "epoch": 0.5533812687721853, + "grad_norm": 10.5, + "learning_rate": 4.687291549348921e-06, + "loss": 1.4587504863739014, + "step": 3040 + }, + { + "epoch": 0.5537453353963775, + "grad_norm": 13.3125, + "learning_rate": 4.686880080191769e-06, + "loss": 1.2866533994674683, + "step": 3042 + }, + { + "epoch": 0.5541094020205698, + "grad_norm": 10.375, + "learning_rate": 4.686468363491802e-06, + "loss": 0.9405273199081421, + "step": 3044 + }, + { + "epoch": 0.554473468644762, + "grad_norm": 8.25, + "learning_rate": 4.686056399309436e-06, + "loss": 1.0470237731933594, + "step": 3046 + }, + { + "epoch": 0.5548375352689542, + "grad_norm": 16.0, + "learning_rate": 4.685644187705127e-06, + "loss": 1.54508376121521, + "step": 3048 + }, + { + "epoch": 0.5552016018931465, + "grad_norm": 12.9375, + "learning_rate": 4.685231728739364e-06, + "loss": 1.7101861238479614, + "step": 3050 + }, + { + "epoch": 0.5555656685173387, + "grad_norm": 10.5, + "learning_rate": 4.684819022472677e-06, + "loss": 1.4591268301010132, + "step": 3052 + }, + { + "epoch": 0.5559297351415309, + "grad_norm": 12.625, + "learning_rate": 4.6844060689656235e-06, + "loss": 1.0944174528121948, + "step": 3054 + }, + { + "epoch": 0.5562938017657231, + "grad_norm": 12.1875, + "learning_rate": 4.683992868278807e-06, + "loss": 0.9923224449157715, + "step": 3056 + }, + { + "epoch": 0.5566578683899154, + "grad_norm": 124.5, + "learning_rate": 4.683579420472861e-06, + "loss": 0.8241497278213501, + "step": 3058 + }, + { + "epoch": 0.5570219350141076, + "grad_norm": 8.0625, + "learning_rate": 4.68316572560846e-06, + "loss": 1.002344012260437, + "step": 3060 + }, + { + "epoch": 0.5573860016382998, + "grad_norm": 29.375, + "learning_rate": 4.682751783746308e-06, + "loss": 1.116881251335144, + "step": 3062 + }, + { + "epoch": 0.5577500682624921, + "grad_norm": 16.25, + "learning_rate": 4.682337594947152e-06, + "loss": 1.583069086074829, + "step": 3064 + }, + { + "epoch": 0.5581141348866843, + "grad_norm": 59.0, + "learning_rate": 4.68192315927177e-06, + "loss": 1.565364122390747, + "step": 3066 + }, + { + "epoch": 0.5584782015108765, + "grad_norm": 10.0625, + "learning_rate": 4.681508476780984e-06, + "loss": 1.595078706741333, + "step": 3068 + }, + { + "epoch": 0.5588422681350688, + "grad_norm": 6.125, + "learning_rate": 4.681093547535641e-06, + "loss": 1.391558289527893, + "step": 3070 + }, + { + "epoch": 0.559206334759261, + "grad_norm": 15.375, + "learning_rate": 4.680678371596634e-06, + "loss": 1.5937374830245972, + "step": 3072 + }, + { + "epoch": 0.5595704013834532, + "grad_norm": 10.8125, + "learning_rate": 4.680262949024886e-06, + "loss": 1.448059320449829, + "step": 3074 + }, + { + "epoch": 0.5599344680076453, + "grad_norm": 16.75, + "learning_rate": 4.679847279881361e-06, + "loss": 2.119917869567871, + "step": 3076 + }, + { + "epoch": 0.5602985346318377, + "grad_norm": 12.6875, + "learning_rate": 4.679431364227055e-06, + "loss": 1.5274730920791626, + "step": 3078 + }, + { + "epoch": 0.5606626012560298, + "grad_norm": 11.5, + "learning_rate": 4.679015202123003e-06, + "loss": 1.4733775854110718, + "step": 3080 + }, + { + "epoch": 0.561026667880222, + "grad_norm": 10.0, + "learning_rate": 4.678598793630274e-06, + "loss": 1.4676270484924316, + "step": 3082 + }, + { + "epoch": 0.5613907345044143, + "grad_norm": 8.0625, + "learning_rate": 4.678182138809976e-06, + "loss": 1.4255867004394531, + "step": 3084 + }, + { + "epoch": 0.5617548011286065, + "grad_norm": 16.5, + "learning_rate": 4.677765237723247e-06, + "loss": 0.8566718697547913, + "step": 3086 + }, + { + "epoch": 0.5621188677527987, + "grad_norm": 23.875, + "learning_rate": 4.677348090431272e-06, + "loss": 0.4970046579837799, + "step": 3088 + }, + { + "epoch": 0.562482934376991, + "grad_norm": 15.1875, + "learning_rate": 4.676930696995261e-06, + "loss": 1.617612361907959, + "step": 3090 + }, + { + "epoch": 0.5628470010011832, + "grad_norm": 55.5, + "learning_rate": 4.676513057476467e-06, + "loss": 1.3165394067764282, + "step": 3092 + }, + { + "epoch": 0.5632110676253754, + "grad_norm": 5.21875, + "learning_rate": 4.676095171936176e-06, + "loss": 1.4364819526672363, + "step": 3094 + }, + { + "epoch": 0.5635751342495676, + "grad_norm": 9.9375, + "learning_rate": 4.67567704043571e-06, + "loss": 1.0662716627120972, + "step": 3096 + }, + { + "epoch": 0.5639392008737599, + "grad_norm": 7.15625, + "learning_rate": 4.6752586630364305e-06, + "loss": 1.6108722686767578, + "step": 3098 + }, + { + "epoch": 0.5643032674979521, + "grad_norm": 19.5, + "learning_rate": 4.674840039799731e-06, + "loss": 1.4411096572875977, + "step": 3100 + }, + { + "epoch": 0.5646673341221443, + "grad_norm": 5.9375, + "learning_rate": 4.674421170787043e-06, + "loss": 1.2056114673614502, + "step": 3102 + }, + { + "epoch": 0.5650314007463366, + "grad_norm": 23.625, + "learning_rate": 4.674002056059833e-06, + "loss": 1.3782093524932861, + "step": 3104 + }, + { + "epoch": 0.5653954673705288, + "grad_norm": 16.25, + "learning_rate": 4.673582695679607e-06, + "loss": 1.5344831943511963, + "step": 3106 + }, + { + "epoch": 0.565759533994721, + "grad_norm": 9.875, + "learning_rate": 4.6731630897079025e-06, + "loss": 1.3298767805099487, + "step": 3108 + }, + { + "epoch": 0.5661236006189133, + "grad_norm": 22.125, + "learning_rate": 4.672743238206295e-06, + "loss": 1.644599437713623, + "step": 3110 + }, + { + "epoch": 0.5664876672431055, + "grad_norm": 29.0, + "learning_rate": 4.672323141236397e-06, + "loss": 1.771336555480957, + "step": 3112 + }, + { + "epoch": 0.5668517338672977, + "grad_norm": 67.5, + "learning_rate": 4.671902798859856e-06, + "loss": 1.1102098226547241, + "step": 3114 + }, + { + "epoch": 0.5672158004914899, + "grad_norm": 6.6875, + "learning_rate": 4.671482211138353e-06, + "loss": 1.4423727989196777, + "step": 3116 + }, + { + "epoch": 0.5675798671156822, + "grad_norm": 7.84375, + "learning_rate": 4.67106137813361e-06, + "loss": 1.4640207290649414, + "step": 3118 + }, + { + "epoch": 0.5679439337398744, + "grad_norm": 6.3125, + "learning_rate": 4.670640299907382e-06, + "loss": 1.0204105377197266, + "step": 3120 + }, + { + "epoch": 0.5683080003640666, + "grad_norm": 5.3125, + "learning_rate": 4.6702189765214614e-06, + "loss": 1.2244899272918701, + "step": 3122 + }, + { + "epoch": 0.5686720669882589, + "grad_norm": 11.9375, + "learning_rate": 4.669797408037674e-06, + "loss": 1.266489028930664, + "step": 3124 + }, + { + "epoch": 0.5690361336124511, + "grad_norm": 7.5, + "learning_rate": 4.669375594517884e-06, + "loss": 1.2684581279754639, + "step": 3126 + }, + { + "epoch": 0.5694002002366433, + "grad_norm": 15.5, + "learning_rate": 4.668953536023991e-06, + "loss": 1.0628567934036255, + "step": 3128 + }, + { + "epoch": 0.5697642668608355, + "grad_norm": 11.5625, + "learning_rate": 4.668531232617931e-06, + "loss": 1.6200110912322998, + "step": 3130 + }, + { + "epoch": 0.5701283334850278, + "grad_norm": 11.5, + "learning_rate": 4.668108684361673e-06, + "loss": 0.9993836283683777, + "step": 3132 + }, + { + "epoch": 0.57049240010922, + "grad_norm": 14.4375, + "learning_rate": 4.667685891317227e-06, + "loss": 1.2345932722091675, + "step": 3134 + }, + { + "epoch": 0.5708564667334122, + "grad_norm": 9.0625, + "learning_rate": 4.667262853546634e-06, + "loss": 1.195366382598877, + "step": 3136 + }, + { + "epoch": 0.5712205333576045, + "grad_norm": 18.75, + "learning_rate": 4.666839571111975e-06, + "loss": 2.00323224067688, + "step": 3138 + }, + { + "epoch": 0.5715845999817967, + "grad_norm": 13.75, + "learning_rate": 4.666416044075365e-06, + "loss": 1.9695429801940918, + "step": 3140 + }, + { + "epoch": 0.5719486666059889, + "grad_norm": 3.515625, + "learning_rate": 4.665992272498952e-06, + "loss": 1.1043243408203125, + "step": 3142 + }, + { + "epoch": 0.5723127332301812, + "grad_norm": 20.625, + "learning_rate": 4.665568256444926e-06, + "loss": 1.1558351516723633, + "step": 3144 + }, + { + "epoch": 0.5726767998543734, + "grad_norm": 15.5, + "learning_rate": 4.6651439959755076e-06, + "loss": 1.2644670009613037, + "step": 3146 + }, + { + "epoch": 0.5730408664785656, + "grad_norm": 16.75, + "learning_rate": 4.664719491152956e-06, + "loss": 1.6701910495758057, + "step": 3148 + }, + { + "epoch": 0.5734049331027578, + "grad_norm": 10.5, + "learning_rate": 4.664294742039567e-06, + "loss": 1.4925076961517334, + "step": 3150 + }, + { + "epoch": 0.5737689997269501, + "grad_norm": 3.546875, + "learning_rate": 4.663869748697667e-06, + "loss": 1.1980005502700806, + "step": 3152 + }, + { + "epoch": 0.5741330663511423, + "grad_norm": 14.1875, + "learning_rate": 4.6634445111896264e-06, + "loss": 0.8367608785629272, + "step": 3154 + }, + { + "epoch": 0.5744971329753344, + "grad_norm": 72.5, + "learning_rate": 4.6630190295778446e-06, + "loss": 1.0910260677337646, + "step": 3156 + }, + { + "epoch": 0.5748611995995268, + "grad_norm": 20.125, + "learning_rate": 4.662593303924759e-06, + "loss": 1.3143784999847412, + "step": 3158 + }, + { + "epoch": 0.575225266223719, + "grad_norm": 11.5625, + "learning_rate": 4.662167334292845e-06, + "loss": 1.5937294960021973, + "step": 3160 + }, + { + "epoch": 0.5755893328479111, + "grad_norm": 24.125, + "learning_rate": 4.6617411207446104e-06, + "loss": 1.5265504121780396, + "step": 3162 + }, + { + "epoch": 0.5759533994721034, + "grad_norm": 16.375, + "learning_rate": 4.661314663342601e-06, + "loss": 0.6745487451553345, + "step": 3164 + }, + { + "epoch": 0.5763174660962956, + "grad_norm": 14.25, + "learning_rate": 4.660887962149397e-06, + "loss": 1.5956515073776245, + "step": 3166 + }, + { + "epoch": 0.5766815327204878, + "grad_norm": 7.03125, + "learning_rate": 4.6604610172276164e-06, + "loss": 1.4386987686157227, + "step": 3168 + }, + { + "epoch": 0.57704559934468, + "grad_norm": 4.125, + "learning_rate": 4.660033828639909e-06, + "loss": 1.2788320779800415, + "step": 3170 + }, + { + "epoch": 0.5774096659688723, + "grad_norm": 3.71875, + "learning_rate": 4.659606396448967e-06, + "loss": 1.398795485496521, + "step": 3172 + }, + { + "epoch": 0.5777737325930645, + "grad_norm": 28.125, + "learning_rate": 4.659178720717511e-06, + "loss": 1.3583898544311523, + "step": 3174 + }, + { + "epoch": 0.5781377992172567, + "grad_norm": 15.25, + "learning_rate": 4.658750801508302e-06, + "loss": 1.8299055099487305, + "step": 3176 + }, + { + "epoch": 0.578501865841449, + "grad_norm": 14.3125, + "learning_rate": 4.658322638884135e-06, + "loss": 1.1780740022659302, + "step": 3178 + }, + { + "epoch": 0.5788659324656412, + "grad_norm": 10.75, + "learning_rate": 4.657894232907842e-06, + "loss": 1.575628638267517, + "step": 3180 + }, + { + "epoch": 0.5792299990898334, + "grad_norm": 10.8125, + "learning_rate": 4.657465583642287e-06, + "loss": 1.254805326461792, + "step": 3182 + }, + { + "epoch": 0.5795940657140257, + "grad_norm": 5.3125, + "learning_rate": 4.657036691150377e-06, + "loss": 1.2919539213180542, + "step": 3184 + }, + { + "epoch": 0.5799581323382179, + "grad_norm": 9.0, + "learning_rate": 4.656607555495048e-06, + "loss": 1.1025445461273193, + "step": 3186 + }, + { + "epoch": 0.5803221989624101, + "grad_norm": 18.75, + "learning_rate": 4.6561781767392734e-06, + "loss": 1.4582033157348633, + "step": 3188 + }, + { + "epoch": 0.5806862655866023, + "grad_norm": 15.5, + "learning_rate": 4.655748554946064e-06, + "loss": 1.068711519241333, + "step": 3190 + }, + { + "epoch": 0.5810503322107946, + "grad_norm": 27.125, + "learning_rate": 4.655318690178462e-06, + "loss": 1.0355747938156128, + "step": 3192 + }, + { + "epoch": 0.5814143988349868, + "grad_norm": 11.6875, + "learning_rate": 4.654888582499552e-06, + "loss": 0.6396878957748413, + "step": 3194 + }, + { + "epoch": 0.581778465459179, + "grad_norm": 10.875, + "learning_rate": 4.65445823197245e-06, + "loss": 1.6420068740844727, + "step": 3196 + }, + { + "epoch": 0.5821425320833713, + "grad_norm": 7.6875, + "learning_rate": 4.654027638660306e-06, + "loss": 1.3766051530838013, + "step": 3198 + }, + { + "epoch": 0.5825065987075635, + "grad_norm": 8.3125, + "learning_rate": 4.65359680262631e-06, + "loss": 1.1669166088104248, + "step": 3200 + }, + { + "epoch": 0.5828706653317557, + "grad_norm": 9.8125, + "learning_rate": 4.6531657239336845e-06, + "loss": 1.6351568698883057, + "step": 3202 + }, + { + "epoch": 0.583234731955948, + "grad_norm": 9.8125, + "learning_rate": 4.6527344026456886e-06, + "loss": 1.8216816186904907, + "step": 3204 + }, + { + "epoch": 0.5835987985801402, + "grad_norm": 3.84375, + "learning_rate": 4.652302838825617e-06, + "loss": 1.052591323852539, + "step": 3206 + }, + { + "epoch": 0.5839628652043324, + "grad_norm": 19.875, + "learning_rate": 4.6518710325368e-06, + "loss": 1.7370530366897583, + "step": 3208 + }, + { + "epoch": 0.5843269318285246, + "grad_norm": 25.75, + "learning_rate": 4.651438983842604e-06, + "loss": 1.4809801578521729, + "step": 3210 + }, + { + "epoch": 0.5846909984527169, + "grad_norm": 13.8125, + "learning_rate": 4.651006692806429e-06, + "loss": 1.9143999814987183, + "step": 3212 + }, + { + "epoch": 0.5850550650769091, + "grad_norm": 7.53125, + "learning_rate": 4.650574159491713e-06, + "loss": 1.4653382301330566, + "step": 3214 + }, + { + "epoch": 0.5854191317011013, + "grad_norm": 5.4375, + "learning_rate": 4.650141383961929e-06, + "loss": 1.3473073244094849, + "step": 3216 + }, + { + "epoch": 0.5857831983252936, + "grad_norm": 21.25, + "learning_rate": 4.6497083662805845e-06, + "loss": 1.5710654258728027, + "step": 3218 + }, + { + "epoch": 0.5861472649494858, + "grad_norm": 19.375, + "learning_rate": 4.6492751065112215e-06, + "loss": 1.3432490825653076, + "step": 3220 + }, + { + "epoch": 0.586511331573678, + "grad_norm": 11.375, + "learning_rate": 4.648841604717421e-06, + "loss": 1.418708324432373, + "step": 3222 + }, + { + "epoch": 0.5868753981978702, + "grad_norm": 10.5, + "learning_rate": 4.648407860962798e-06, + "loss": 1.5353648662567139, + "step": 3224 + }, + { + "epoch": 0.5872394648220625, + "grad_norm": 7.96875, + "learning_rate": 4.647973875311001e-06, + "loss": 1.4630835056304932, + "step": 3226 + }, + { + "epoch": 0.5876035314462547, + "grad_norm": 24.25, + "learning_rate": 4.647539647825716e-06, + "loss": 1.6073410511016846, + "step": 3228 + }, + { + "epoch": 0.5879675980704469, + "grad_norm": 29.0, + "learning_rate": 4.647105178570666e-06, + "loss": 1.8549131155014038, + "step": 3230 + }, + { + "epoch": 0.5883316646946392, + "grad_norm": 11.0625, + "learning_rate": 4.646670467609605e-06, + "loss": 1.6139217615127563, + "step": 3232 + }, + { + "epoch": 0.5886957313188314, + "grad_norm": 7.90625, + "learning_rate": 4.646235515006328e-06, + "loss": 1.3840563297271729, + "step": 3234 + }, + { + "epoch": 0.5890597979430235, + "grad_norm": 16.375, + "learning_rate": 4.645800320824659e-06, + "loss": 1.1495158672332764, + "step": 3236 + }, + { + "epoch": 0.5894238645672158, + "grad_norm": 11.0, + "learning_rate": 4.645364885128463e-06, + "loss": 1.3970487117767334, + "step": 3238 + }, + { + "epoch": 0.589787931191408, + "grad_norm": 7.5, + "learning_rate": 4.644929207981639e-06, + "loss": 1.2395824193954468, + "step": 3240 + }, + { + "epoch": 0.5901519978156002, + "grad_norm": 13.5, + "learning_rate": 4.644493289448118e-06, + "loss": 1.2570314407348633, + "step": 3242 + }, + { + "epoch": 0.5905160644397924, + "grad_norm": 17.0, + "learning_rate": 4.6440571295918735e-06, + "loss": 0.7949339747428894, + "step": 3244 + }, + { + "epoch": 0.5908801310639847, + "grad_norm": 5.78125, + "learning_rate": 4.643620728476907e-06, + "loss": 1.052629828453064, + "step": 3246 + }, + { + "epoch": 0.5912441976881769, + "grad_norm": 11.0625, + "learning_rate": 4.64318408616726e-06, + "loss": 1.5894979238510132, + "step": 3248 + }, + { + "epoch": 0.5916082643123691, + "grad_norm": 10.6875, + "learning_rate": 4.642747202727008e-06, + "loss": 1.552390217781067, + "step": 3250 + }, + { + "epoch": 0.5919723309365614, + "grad_norm": 18.375, + "learning_rate": 4.6423100782202615e-06, + "loss": 1.6982547044754028, + "step": 3252 + }, + { + "epoch": 0.5923363975607536, + "grad_norm": 15.375, + "learning_rate": 4.641872712711166e-06, + "loss": 1.6790783405303955, + "step": 3254 + }, + { + "epoch": 0.5927004641849458, + "grad_norm": 13.6875, + "learning_rate": 4.641435106263904e-06, + "loss": 1.7826764583587646, + "step": 3256 + }, + { + "epoch": 0.5930645308091381, + "grad_norm": 8.9375, + "learning_rate": 4.640997258942692e-06, + "loss": 1.7601191997528076, + "step": 3258 + }, + { + "epoch": 0.5934285974333303, + "grad_norm": 10.4375, + "learning_rate": 4.640559170811784e-06, + "loss": 1.4909051656723022, + "step": 3260 + }, + { + "epoch": 0.5937926640575225, + "grad_norm": 6.125, + "learning_rate": 4.640120841935465e-06, + "loss": 1.4465163946151733, + "step": 3262 + }, + { + "epoch": 0.5941567306817147, + "grad_norm": 16.125, + "learning_rate": 4.63968227237806e-06, + "loss": 1.105288028717041, + "step": 3264 + }, + { + "epoch": 0.594520797305907, + "grad_norm": 20.875, + "learning_rate": 4.6392434622039265e-06, + "loss": 0.7226976752281189, + "step": 3266 + }, + { + "epoch": 0.5948848639300992, + "grad_norm": 8.375, + "learning_rate": 4.638804411477457e-06, + "loss": 1.6137361526489258, + "step": 3268 + }, + { + "epoch": 0.5952489305542914, + "grad_norm": 8.625, + "learning_rate": 4.638365120263083e-06, + "loss": 1.1538947820663452, + "step": 3270 + }, + { + "epoch": 0.5956129971784837, + "grad_norm": 80.5, + "learning_rate": 4.637925588625266e-06, + "loss": 1.5226430892944336, + "step": 3272 + }, + { + "epoch": 0.5959770638026759, + "grad_norm": 11.0, + "learning_rate": 4.637485816628507e-06, + "loss": 1.8728686571121216, + "step": 3274 + }, + { + "epoch": 0.5963411304268681, + "grad_norm": 4.125, + "learning_rate": 4.637045804337342e-06, + "loss": 0.9120247960090637, + "step": 3276 + }, + { + "epoch": 0.5967051970510604, + "grad_norm": 21.25, + "learning_rate": 4.636605551816339e-06, + "loss": 1.2076787948608398, + "step": 3278 + }, + { + "epoch": 0.5970692636752526, + "grad_norm": 13.6875, + "learning_rate": 4.636165059130104e-06, + "loss": 1.621565818786621, + "step": 3280 + }, + { + "epoch": 0.5974333302994448, + "grad_norm": 14.75, + "learning_rate": 4.635724326343276e-06, + "loss": 1.559403896331787, + "step": 3282 + }, + { + "epoch": 0.597797396923637, + "grad_norm": 20.75, + "learning_rate": 4.6352833535205335e-06, + "loss": 0.9151042699813843, + "step": 3284 + }, + { + "epoch": 0.5981614635478293, + "grad_norm": 14.5, + "learning_rate": 4.634842140726586e-06, + "loss": 1.8382973670959473, + "step": 3286 + }, + { + "epoch": 0.5985255301720215, + "grad_norm": 33.25, + "learning_rate": 4.634400688026179e-06, + "loss": 1.4484329223632812, + "step": 3288 + }, + { + "epoch": 0.5988895967962137, + "grad_norm": 23.375, + "learning_rate": 4.633958995484095e-06, + "loss": 1.3155410289764404, + "step": 3290 + }, + { + "epoch": 0.599253663420406, + "grad_norm": 49.5, + "learning_rate": 4.633517063165151e-06, + "loss": 1.3144867420196533, + "step": 3292 + }, + { + "epoch": 0.5996177300445982, + "grad_norm": 8.5, + "learning_rate": 4.633074891134197e-06, + "loss": 1.6168324947357178, + "step": 3294 + }, + { + "epoch": 0.5999817966687904, + "grad_norm": 11.5, + "learning_rate": 4.632632479456121e-06, + "loss": 1.3474645614624023, + "step": 3296 + }, + { + "epoch": 0.6003458632929826, + "grad_norm": 13.5, + "learning_rate": 4.632189828195847e-06, + "loss": 1.751720666885376, + "step": 3298 + }, + { + "epoch": 0.6007099299171749, + "grad_norm": 8.6875, + "learning_rate": 4.631746937418328e-06, + "loss": 1.4904208183288574, + "step": 3300 + }, + { + "epoch": 0.6010739965413671, + "grad_norm": 4.65625, + "learning_rate": 4.63130380718856e-06, + "loss": 1.1204825639724731, + "step": 3302 + }, + { + "epoch": 0.6014380631655593, + "grad_norm": 7.09375, + "learning_rate": 4.63086043757157e-06, + "loss": 1.4762994050979614, + "step": 3304 + }, + { + "epoch": 0.6018021297897516, + "grad_norm": 6.8125, + "learning_rate": 4.630416828632418e-06, + "loss": 1.1321595907211304, + "step": 3306 + }, + { + "epoch": 0.6021661964139438, + "grad_norm": 15.0, + "learning_rate": 4.6299729804362046e-06, + "loss": 1.5416860580444336, + "step": 3308 + }, + { + "epoch": 0.602530263038136, + "grad_norm": 13.6875, + "learning_rate": 4.629528893048063e-06, + "loss": 1.5017390251159668, + "step": 3310 + }, + { + "epoch": 0.6028943296623283, + "grad_norm": 24.875, + "learning_rate": 4.629084566533161e-06, + "loss": 1.6793529987335205, + "step": 3312 + }, + { + "epoch": 0.6032583962865204, + "grad_norm": 10.0, + "learning_rate": 4.628640000956699e-06, + "loss": 1.7469428777694702, + "step": 3314 + }, + { + "epoch": 0.6036224629107126, + "grad_norm": 3.546875, + "learning_rate": 4.628195196383918e-06, + "loss": 0.9798278212547302, + "step": 3316 + }, + { + "epoch": 0.6039865295349048, + "grad_norm": 5.25, + "learning_rate": 4.6277501528800905e-06, + "loss": 1.0058797597885132, + "step": 3318 + }, + { + "epoch": 0.6043505961590971, + "grad_norm": 8.125, + "learning_rate": 4.6273048705105235e-06, + "loss": 1.1809449195861816, + "step": 3320 + }, + { + "epoch": 0.6047146627832893, + "grad_norm": 33.0, + "learning_rate": 4.6268593493405635e-06, + "loss": 1.5396674871444702, + "step": 3322 + }, + { + "epoch": 0.6050787294074815, + "grad_norm": 18.875, + "learning_rate": 4.626413589435586e-06, + "loss": 1.5441834926605225, + "step": 3324 + }, + { + "epoch": 0.6054427960316738, + "grad_norm": 16.5, + "learning_rate": 4.625967590861007e-06, + "loss": 1.3417446613311768, + "step": 3326 + }, + { + "epoch": 0.605806862655866, + "grad_norm": 24.75, + "learning_rate": 4.625521353682272e-06, + "loss": 1.8228718042373657, + "step": 3328 + }, + { + "epoch": 0.6061709292800582, + "grad_norm": 13.5625, + "learning_rate": 4.6250748779648665e-06, + "loss": 1.5724678039550781, + "step": 3330 + }, + { + "epoch": 0.6065349959042505, + "grad_norm": 24.125, + "learning_rate": 4.624628163774309e-06, + "loss": 2.088804244995117, + "step": 3332 + }, + { + "epoch": 0.6068990625284427, + "grad_norm": 11.375, + "learning_rate": 4.624181211176151e-06, + "loss": 1.8216822147369385, + "step": 3334 + }, + { + "epoch": 0.6072631291526349, + "grad_norm": 12.4375, + "learning_rate": 4.623734020235984e-06, + "loss": 1.444862961769104, + "step": 3336 + }, + { + "epoch": 0.6076271957768271, + "grad_norm": 27.625, + "learning_rate": 4.623286591019429e-06, + "loss": 1.5083041191101074, + "step": 3338 + }, + { + "epoch": 0.6079912624010194, + "grad_norm": 21.25, + "learning_rate": 4.622838923592145e-06, + "loss": 1.8390361070632935, + "step": 3340 + }, + { + "epoch": 0.6083553290252116, + "grad_norm": 44.5, + "learning_rate": 4.622391018019828e-06, + "loss": 1.6136435270309448, + "step": 3342 + }, + { + "epoch": 0.6087193956494038, + "grad_norm": 6.34375, + "learning_rate": 4.621942874368202e-06, + "loss": 0.991256594657898, + "step": 3344 + }, + { + "epoch": 0.6090834622735961, + "grad_norm": 5.1875, + "learning_rate": 4.621494492703033e-06, + "loss": 0.9313170313835144, + "step": 3346 + }, + { + "epoch": 0.6094475288977883, + "grad_norm": 10.9375, + "learning_rate": 4.6210458730901185e-06, + "loss": 0.7289725542068481, + "step": 3348 + }, + { + "epoch": 0.6098115955219805, + "grad_norm": 17.875, + "learning_rate": 4.620597015595291e-06, + "loss": 1.5423052310943604, + "step": 3350 + }, + { + "epoch": 0.6101756621461728, + "grad_norm": 9.75, + "learning_rate": 4.62014792028442e-06, + "loss": 1.6939454078674316, + "step": 3352 + }, + { + "epoch": 0.610539728770365, + "grad_norm": 16.125, + "learning_rate": 4.6196985872234075e-06, + "loss": 1.4564534425735474, + "step": 3354 + }, + { + "epoch": 0.6109037953945572, + "grad_norm": 9.375, + "learning_rate": 4.619249016478191e-06, + "loss": 1.2883763313293457, + "step": 3356 + }, + { + "epoch": 0.6112678620187494, + "grad_norm": 10.0625, + "learning_rate": 4.6187992081147436e-06, + "loss": 1.3018970489501953, + "step": 3358 + }, + { + "epoch": 0.6116319286429417, + "grad_norm": 11.6875, + "learning_rate": 4.618349162199074e-06, + "loss": 1.4381163120269775, + "step": 3360 + }, + { + "epoch": 0.6119959952671339, + "grad_norm": 10.875, + "learning_rate": 4.617898878797221e-06, + "loss": 1.4397021532058716, + "step": 3362 + }, + { + "epoch": 0.6123600618913261, + "grad_norm": 13.3125, + "learning_rate": 4.617448357975267e-06, + "loss": 1.5341143608093262, + "step": 3364 + }, + { + "epoch": 0.6127241285155184, + "grad_norm": 9.6875, + "learning_rate": 4.61699759979932e-06, + "loss": 1.4235416650772095, + "step": 3366 + }, + { + "epoch": 0.6130881951397106, + "grad_norm": 7.21875, + "learning_rate": 4.616546604335529e-06, + "loss": 1.3741611242294312, + "step": 3368 + }, + { + "epoch": 0.6134522617639028, + "grad_norm": 10.625, + "learning_rate": 4.616095371650075e-06, + "loss": 1.6094715595245361, + "step": 3370 + }, + { + "epoch": 0.613816328388095, + "grad_norm": 9.1875, + "learning_rate": 4.615643901809173e-06, + "loss": 1.5990512371063232, + "step": 3372 + }, + { + "epoch": 0.6141803950122873, + "grad_norm": 25.875, + "learning_rate": 4.615192194879078e-06, + "loss": 1.5131504535675049, + "step": 3374 + }, + { + "epoch": 0.6145444616364795, + "grad_norm": 66.0, + "learning_rate": 4.614740250926074e-06, + "loss": 1.5291842222213745, + "step": 3376 + }, + { + "epoch": 0.6149085282606717, + "grad_norm": 9.0625, + "learning_rate": 4.614288070016482e-06, + "loss": 0.9591556787490845, + "step": 3378 + }, + { + "epoch": 0.615272594884864, + "grad_norm": 4.9375, + "learning_rate": 4.613835652216657e-06, + "loss": 1.0939677953720093, + "step": 3380 + }, + { + "epoch": 0.6156366615090562, + "grad_norm": 14.25, + "learning_rate": 4.6133829975929915e-06, + "loss": 1.055437445640564, + "step": 3382 + }, + { + "epoch": 0.6160007281332484, + "grad_norm": 17.0, + "learning_rate": 4.612930106211908e-06, + "loss": 1.5648449659347534, + "step": 3384 + }, + { + "epoch": 0.6163647947574407, + "grad_norm": 10.6875, + "learning_rate": 4.612476978139869e-06, + "loss": 1.7257728576660156, + "step": 3386 + }, + { + "epoch": 0.6167288613816329, + "grad_norm": 9.625, + "learning_rate": 4.6120236134433684e-06, + "loss": 1.3101181983947754, + "step": 3388 + }, + { + "epoch": 0.617092928005825, + "grad_norm": 13.3125, + "learning_rate": 4.611570012188936e-06, + "loss": 1.6583659648895264, + "step": 3390 + }, + { + "epoch": 0.6174569946300172, + "grad_norm": 11.0, + "learning_rate": 4.611116174443134e-06, + "loss": 1.964901089668274, + "step": 3392 + }, + { + "epoch": 0.6178210612542095, + "grad_norm": 14.625, + "learning_rate": 4.610662100272564e-06, + "loss": 1.2678747177124023, + "step": 3394 + }, + { + "epoch": 0.6181851278784017, + "grad_norm": 8.1875, + "learning_rate": 4.610207789743858e-06, + "loss": 0.691560685634613, + "step": 3396 + }, + { + "epoch": 0.6185491945025939, + "grad_norm": 15.3125, + "learning_rate": 4.609753242923683e-06, + "loss": 1.3154070377349854, + "step": 3398 + }, + { + "epoch": 0.6189132611267862, + "grad_norm": 10.375, + "learning_rate": 4.609298459878745e-06, + "loss": 1.6963614225387573, + "step": 3400 + }, + { + "epoch": 0.6192773277509784, + "grad_norm": 7.125, + "learning_rate": 4.60884344067578e-06, + "loss": 0.8437920808792114, + "step": 3402 + }, + { + "epoch": 0.6196413943751706, + "grad_norm": 6.09375, + "learning_rate": 4.608388185381559e-06, + "loss": 1.4655256271362305, + "step": 3404 + }, + { + "epoch": 0.6200054609993629, + "grad_norm": 3.75, + "learning_rate": 4.607932694062891e-06, + "loss": 1.0056616067886353, + "step": 3406 + }, + { + "epoch": 0.6203695276235551, + "grad_norm": 9.25, + "learning_rate": 4.607476966786616e-06, + "loss": 1.0568050146102905, + "step": 3408 + }, + { + "epoch": 0.6207335942477473, + "grad_norm": 7.5625, + "learning_rate": 4.6070210036196115e-06, + "loss": 1.2031705379486084, + "step": 3410 + }, + { + "epoch": 0.6210976608719395, + "grad_norm": 5.59375, + "learning_rate": 4.606564804628787e-06, + "loss": 1.2960994243621826, + "step": 3412 + }, + { + "epoch": 0.6214617274961318, + "grad_norm": 7.0625, + "learning_rate": 4.606108369881087e-06, + "loss": 1.2023874521255493, + "step": 3414 + }, + { + "epoch": 0.621825794120324, + "grad_norm": 12.4375, + "learning_rate": 4.605651699443494e-06, + "loss": 1.5052754878997803, + "step": 3416 + }, + { + "epoch": 0.6221898607445162, + "grad_norm": 4.28125, + "learning_rate": 4.605194793383021e-06, + "loss": 1.1749440431594849, + "step": 3418 + }, + { + "epoch": 0.6225539273687085, + "grad_norm": 2.734375, + "learning_rate": 4.604737651766718e-06, + "loss": 0.8927658200263977, + "step": 3420 + }, + { + "epoch": 0.6229179939929007, + "grad_norm": 8.1875, + "learning_rate": 4.604280274661667e-06, + "loss": 1.2033967971801758, + "step": 3422 + }, + { + "epoch": 0.6232820606170929, + "grad_norm": 12.375, + "learning_rate": 4.603822662134988e-06, + "loss": 1.5042997598648071, + "step": 3424 + }, + { + "epoch": 0.6236461272412852, + "grad_norm": 7.15625, + "learning_rate": 4.603364814253832e-06, + "loss": 1.5573424100875854, + "step": 3426 + }, + { + "epoch": 0.6240101938654774, + "grad_norm": 9.3125, + "learning_rate": 4.602906731085388e-06, + "loss": 1.4264090061187744, + "step": 3428 + }, + { + "epoch": 0.6243742604896696, + "grad_norm": 12.375, + "learning_rate": 4.602448412696877e-06, + "loss": 1.1589868068695068, + "step": 3430 + }, + { + "epoch": 0.6247383271138618, + "grad_norm": 16.25, + "learning_rate": 4.601989859155557e-06, + "loss": 0.8242495656013489, + "step": 3432 + }, + { + "epoch": 0.6251023937380541, + "grad_norm": 22.0, + "learning_rate": 4.601531070528716e-06, + "loss": 1.584548830986023, + "step": 3434 + }, + { + "epoch": 0.6254664603622463, + "grad_norm": 21.125, + "learning_rate": 4.601072046883681e-06, + "loss": 1.445434808731079, + "step": 3436 + }, + { + "epoch": 0.6258305269864385, + "grad_norm": 10.0625, + "learning_rate": 4.6006127882878115e-06, + "loss": 1.94016432762146, + "step": 3438 + }, + { + "epoch": 0.6261945936106308, + "grad_norm": 42.75, + "learning_rate": 4.600153294808503e-06, + "loss": 1.3921657800674438, + "step": 3440 + }, + { + "epoch": 0.626558660234823, + "grad_norm": 30.5, + "learning_rate": 4.5996935665131825e-06, + "loss": 1.9987621307373047, + "step": 3442 + }, + { + "epoch": 0.6269227268590152, + "grad_norm": 7.0625, + "learning_rate": 4.599233603469314e-06, + "loss": 1.3436954021453857, + "step": 3444 + }, + { + "epoch": 0.6272867934832074, + "grad_norm": 10.0625, + "learning_rate": 4.598773405744397e-06, + "loss": 1.5192806720733643, + "step": 3446 + }, + { + "epoch": 0.6276508601073997, + "grad_norm": 42.0, + "learning_rate": 4.598312973405959e-06, + "loss": 1.377097249031067, + "step": 3448 + }, + { + "epoch": 0.6280149267315919, + "grad_norm": 6.90625, + "learning_rate": 4.597852306521572e-06, + "loss": 0.8952192068099976, + "step": 3450 + }, + { + "epoch": 0.6283789933557841, + "grad_norm": 31.375, + "learning_rate": 4.597391405158833e-06, + "loss": 1.1780035495758057, + "step": 3452 + }, + { + "epoch": 0.6287430599799764, + "grad_norm": 110.5, + "learning_rate": 4.59693026938538e-06, + "loss": 1.4515801668167114, + "step": 3454 + }, + { + "epoch": 0.6291071266041686, + "grad_norm": 11.25, + "learning_rate": 4.596468899268882e-06, + "loss": 1.2682180404663086, + "step": 3456 + }, + { + "epoch": 0.6294711932283608, + "grad_norm": 18.25, + "learning_rate": 4.596007294877042e-06, + "loss": 1.6006085872650146, + "step": 3458 + }, + { + "epoch": 0.6298352598525531, + "grad_norm": 8.1875, + "learning_rate": 4.595545456277601e-06, + "loss": 1.5804389715194702, + "step": 3460 + }, + { + "epoch": 0.6301993264767453, + "grad_norm": 9.1875, + "learning_rate": 4.595083383538329e-06, + "loss": 1.1393368244171143, + "step": 3462 + }, + { + "epoch": 0.6305633931009375, + "grad_norm": 17.875, + "learning_rate": 4.594621076727035e-06, + "loss": 1.5236291885375977, + "step": 3464 + }, + { + "epoch": 0.6309274597251296, + "grad_norm": 8.5, + "learning_rate": 4.59415853591156e-06, + "loss": 1.318071722984314, + "step": 3466 + }, + { + "epoch": 0.631291526349322, + "grad_norm": 4.75, + "learning_rate": 4.593695761159781e-06, + "loss": 1.0345380306243896, + "step": 3468 + }, + { + "epoch": 0.6316555929735141, + "grad_norm": 50.5, + "learning_rate": 4.593232752539608e-06, + "loss": 1.464630365371704, + "step": 3470 + }, + { + "epoch": 0.6320196595977063, + "grad_norm": 14.1875, + "learning_rate": 4.592769510118985e-06, + "loss": 1.8135061264038086, + "step": 3472 + }, + { + "epoch": 0.6323837262218986, + "grad_norm": 13.4375, + "learning_rate": 4.592306033965892e-06, + "loss": 1.650680661201477, + "step": 3474 + }, + { + "epoch": 0.6327477928460908, + "grad_norm": 9.3125, + "learning_rate": 4.591842324148341e-06, + "loss": 1.8015437126159668, + "step": 3476 + }, + { + "epoch": 0.633111859470283, + "grad_norm": 10.5, + "learning_rate": 4.591378380734381e-06, + "loss": 1.535023808479309, + "step": 3478 + }, + { + "epoch": 0.6334759260944753, + "grad_norm": 34.5, + "learning_rate": 4.5909142037920905e-06, + "loss": 1.5320165157318115, + "step": 3480 + }, + { + "epoch": 0.6338399927186675, + "grad_norm": 8.8125, + "learning_rate": 4.59044979338959e-06, + "loss": 1.3897587060928345, + "step": 3482 + }, + { + "epoch": 0.6342040593428597, + "grad_norm": 6.09375, + "learning_rate": 4.5899851495950274e-06, + "loss": 1.4943766593933105, + "step": 3484 + }, + { + "epoch": 0.6345681259670519, + "grad_norm": 6.5, + "learning_rate": 4.589520272476588e-06, + "loss": 1.0019128322601318, + "step": 3486 + }, + { + "epoch": 0.6349321925912442, + "grad_norm": 37.5, + "learning_rate": 4.5890551621024906e-06, + "loss": 1.3247580528259277, + "step": 3488 + }, + { + "epoch": 0.6352962592154364, + "grad_norm": 10.25, + "learning_rate": 4.588589818540987e-06, + "loss": 0.9012529850006104, + "step": 3490 + }, + { + "epoch": 0.6356603258396286, + "grad_norm": 10.375, + "learning_rate": 4.5881242418603656e-06, + "loss": 1.4762120246887207, + "step": 3492 + }, + { + "epoch": 0.6360243924638209, + "grad_norm": 4.78125, + "learning_rate": 4.587658432128948e-06, + "loss": 1.0838160514831543, + "step": 3494 + }, + { + "epoch": 0.6363884590880131, + "grad_norm": 19.25, + "learning_rate": 4.58719238941509e-06, + "loss": 1.242342472076416, + "step": 3496 + }, + { + "epoch": 0.6367525257122053, + "grad_norm": 9.5, + "learning_rate": 4.586726113787182e-06, + "loss": 1.5322515964508057, + "step": 3498 + }, + { + "epoch": 0.6371165923363976, + "grad_norm": 9.9375, + "learning_rate": 4.5862596053136465e-06, + "loss": 1.404787302017212, + "step": 3500 + }, + { + "epoch": 0.6374806589605898, + "grad_norm": 10.6875, + "learning_rate": 4.585792864062943e-06, + "loss": 1.1525084972381592, + "step": 3502 + }, + { + "epoch": 0.637844725584782, + "grad_norm": 4.78125, + "learning_rate": 4.585325890103561e-06, + "loss": 1.3715919256210327, + "step": 3504 + }, + { + "epoch": 0.6382087922089742, + "grad_norm": 11.5625, + "learning_rate": 4.584858683504032e-06, + "loss": 1.4361039400100708, + "step": 3506 + }, + { + "epoch": 0.6385728588331665, + "grad_norm": 10.6875, + "learning_rate": 4.584391244332913e-06, + "loss": 1.5253689289093018, + "step": 3508 + }, + { + "epoch": 0.6389369254573587, + "grad_norm": 6.6875, + "learning_rate": 4.583923572658801e-06, + "loss": 1.2777526378631592, + "step": 3510 + }, + { + "epoch": 0.6393009920815509, + "grad_norm": 12.875, + "learning_rate": 4.5834556685503215e-06, + "loss": 1.391641616821289, + "step": 3512 + }, + { + "epoch": 0.6396650587057432, + "grad_norm": 9.4375, + "learning_rate": 4.5829875320761416e-06, + "loss": 1.3365249633789062, + "step": 3514 + }, + { + "epoch": 0.6400291253299354, + "grad_norm": 10.625, + "learning_rate": 4.582519163304956e-06, + "loss": 1.4570097923278809, + "step": 3516 + }, + { + "epoch": 0.6403931919541276, + "grad_norm": 17.75, + "learning_rate": 4.5820505623054966e-06, + "loss": 1.6383426189422607, + "step": 3518 + }, + { + "epoch": 0.6407572585783198, + "grad_norm": 39.0, + "learning_rate": 4.5815817291465295e-06, + "loss": 1.2982375621795654, + "step": 3520 + }, + { + "epoch": 0.6411213252025121, + "grad_norm": 19.875, + "learning_rate": 4.581112663896853e-06, + "loss": 1.855355143547058, + "step": 3522 + }, + { + "epoch": 0.6414853918267043, + "grad_norm": 8.875, + "learning_rate": 4.580643366625301e-06, + "loss": 1.6114633083343506, + "step": 3524 + }, + { + "epoch": 0.6418494584508965, + "grad_norm": 20.625, + "learning_rate": 4.5801738374007386e-06, + "loss": 1.223724126815796, + "step": 3526 + }, + { + "epoch": 0.6422135250750888, + "grad_norm": 6.34375, + "learning_rate": 4.579704076292072e-06, + "loss": 1.3601583242416382, + "step": 3528 + }, + { + "epoch": 0.642577591699281, + "grad_norm": 15.875, + "learning_rate": 4.5792340833682325e-06, + "loss": 1.4852311611175537, + "step": 3530 + }, + { + "epoch": 0.6429416583234732, + "grad_norm": 5.3125, + "learning_rate": 4.5787638586981915e-06, + "loss": 1.290025234222412, + "step": 3532 + }, + { + "epoch": 0.6433057249476655, + "grad_norm": 9.8125, + "learning_rate": 4.578293402350954e-06, + "loss": 1.1747198104858398, + "step": 3534 + }, + { + "epoch": 0.6436697915718577, + "grad_norm": 21.125, + "learning_rate": 4.577822714395555e-06, + "loss": 1.4804720878601074, + "step": 3536 + }, + { + "epoch": 0.6440338581960499, + "grad_norm": 23.125, + "learning_rate": 4.577351794901066e-06, + "loss": 1.5694031715393066, + "step": 3538 + }, + { + "epoch": 0.644397924820242, + "grad_norm": 5.625, + "learning_rate": 4.576880643936594e-06, + "loss": 1.4072412252426147, + "step": 3540 + }, + { + "epoch": 0.6447619914444344, + "grad_norm": 7.34375, + "learning_rate": 4.576409261571278e-06, + "loss": 1.5995231866836548, + "step": 3542 + }, + { + "epoch": 0.6451260580686266, + "grad_norm": 47.5, + "learning_rate": 4.575937647874293e-06, + "loss": 1.3782302141189575, + "step": 3544 + }, + { + "epoch": 0.6454901246928187, + "grad_norm": 15.0625, + "learning_rate": 4.575465802914844e-06, + "loss": 1.2868973016738892, + "step": 3546 + }, + { + "epoch": 0.645854191317011, + "grad_norm": 12.6875, + "learning_rate": 4.574993726762173e-06, + "loss": 1.4982761144638062, + "step": 3548 + }, + { + "epoch": 0.6462182579412032, + "grad_norm": 18.375, + "learning_rate": 4.574521419485556e-06, + "loss": 1.5792279243469238, + "step": 3550 + }, + { + "epoch": 0.6465823245653954, + "grad_norm": 11.6875, + "learning_rate": 4.5740488811543e-06, + "loss": 1.6417756080627441, + "step": 3552 + }, + { + "epoch": 0.6469463911895877, + "grad_norm": 5.40625, + "learning_rate": 4.573576111837752e-06, + "loss": 1.3600236177444458, + "step": 3554 + }, + { + "epoch": 0.6473104578137799, + "grad_norm": 6.84375, + "learning_rate": 4.573103111605286e-06, + "loss": 1.373063087463379, + "step": 3556 + }, + { + "epoch": 0.6476745244379721, + "grad_norm": 6.34375, + "learning_rate": 4.5726298805263124e-06, + "loss": 1.1180996894836426, + "step": 3558 + }, + { + "epoch": 0.6480385910621643, + "grad_norm": 33.75, + "learning_rate": 4.5721564186702785e-06, + "loss": 1.0901854038238525, + "step": 3560 + }, + { + "epoch": 0.6484026576863566, + "grad_norm": 12.375, + "learning_rate": 4.571682726106661e-06, + "loss": 0.8341667652130127, + "step": 3562 + }, + { + "epoch": 0.6487667243105488, + "grad_norm": 8.875, + "learning_rate": 4.5712088029049725e-06, + "loss": 1.074367880821228, + "step": 3564 + }, + { + "epoch": 0.649130790934741, + "grad_norm": 25.375, + "learning_rate": 4.570734649134761e-06, + "loss": 1.3583738803863525, + "step": 3566 + }, + { + "epoch": 0.6494948575589333, + "grad_norm": 19.0, + "learning_rate": 4.570260264865605e-06, + "loss": 1.3751565217971802, + "step": 3568 + }, + { + "epoch": 0.6498589241831255, + "grad_norm": 14.375, + "learning_rate": 4.569785650167119e-06, + "loss": 1.3538877964019775, + "step": 3570 + }, + { + "epoch": 0.6502229908073177, + "grad_norm": 9.375, + "learning_rate": 4.569310805108949e-06, + "loss": 1.0558316707611084, + "step": 3572 + }, + { + "epoch": 0.65058705743151, + "grad_norm": 11.0625, + "learning_rate": 4.56883572976078e-06, + "loss": 0.9923142194747925, + "step": 3574 + }, + { + "epoch": 0.6509511240557022, + "grad_norm": 8.5625, + "learning_rate": 4.568360424192325e-06, + "loss": 1.5509668588638306, + "step": 3576 + }, + { + "epoch": 0.6513151906798944, + "grad_norm": 7.40625, + "learning_rate": 4.567884888473333e-06, + "loss": 1.479922890663147, + "step": 3578 + }, + { + "epoch": 0.6516792573040866, + "grad_norm": 13.75, + "learning_rate": 4.567409122673588e-06, + "loss": 1.5896632671356201, + "step": 3580 + }, + { + "epoch": 0.6520433239282789, + "grad_norm": 11.8125, + "learning_rate": 4.566933126862907e-06, + "loss": 1.7980318069458008, + "step": 3582 + }, + { + "epoch": 0.6524073905524711, + "grad_norm": 10.375, + "learning_rate": 4.566456901111139e-06, + "loss": 1.3209612369537354, + "step": 3584 + }, + { + "epoch": 0.6527714571766633, + "grad_norm": 16.375, + "learning_rate": 4.565980445488171e-06, + "loss": 1.428575038909912, + "step": 3586 + }, + { + "epoch": 0.6531355238008556, + "grad_norm": 16.125, + "learning_rate": 4.565503760063918e-06, + "loss": 1.6164405345916748, + "step": 3588 + }, + { + "epoch": 0.6534995904250478, + "grad_norm": 10.8125, + "learning_rate": 4.565026844908331e-06, + "loss": 1.593670129776001, + "step": 3590 + }, + { + "epoch": 0.65386365704924, + "grad_norm": 7.375, + "learning_rate": 4.564549700091399e-06, + "loss": 1.7358109951019287, + "step": 3592 + }, + { + "epoch": 0.6542277236734323, + "grad_norm": 24.625, + "learning_rate": 4.564072325683138e-06, + "loss": 1.4772859811782837, + "step": 3594 + }, + { + "epoch": 0.6545917902976245, + "grad_norm": 9.8125, + "learning_rate": 4.563594721753602e-06, + "loss": 1.0298840999603271, + "step": 3596 + }, + { + "epoch": 0.6549558569218167, + "grad_norm": 9.5, + "learning_rate": 4.563116888372878e-06, + "loss": 1.46136474609375, + "step": 3598 + }, + { + "epoch": 0.6553199235460089, + "grad_norm": 8.25, + "learning_rate": 4.562638825611084e-06, + "loss": 1.1734012365341187, + "step": 3600 + }, + { + "epoch": 0.6556839901702012, + "grad_norm": 9.1875, + "learning_rate": 4.562160533538377e-06, + "loss": 1.4688090085983276, + "step": 3602 + }, + { + "epoch": 0.6560480567943934, + "grad_norm": 10.9375, + "learning_rate": 4.561682012224942e-06, + "loss": 1.4228413105010986, + "step": 3604 + }, + { + "epoch": 0.6564121234185856, + "grad_norm": 17.125, + "learning_rate": 4.561203261741e-06, + "loss": 1.6375436782836914, + "step": 3606 + }, + { + "epoch": 0.6567761900427779, + "grad_norm": 23.5, + "learning_rate": 4.560724282156806e-06, + "loss": 1.8387964963912964, + "step": 3608 + }, + { + "epoch": 0.6571402566669701, + "grad_norm": 14.5625, + "learning_rate": 4.5602450735426494e-06, + "loss": 1.4028043746948242, + "step": 3610 + }, + { + "epoch": 0.6575043232911623, + "grad_norm": 11.625, + "learning_rate": 4.5597656359688514e-06, + "loss": 1.0829812288284302, + "step": 3612 + }, + { + "epoch": 0.6578683899153545, + "grad_norm": 10.75, + "learning_rate": 4.5592859695057675e-06, + "loss": 0.827850341796875, + "step": 3614 + }, + { + "epoch": 0.6582324565395468, + "grad_norm": 12.0625, + "learning_rate": 4.558806074223787e-06, + "loss": 1.4451552629470825, + "step": 3616 + }, + { + "epoch": 0.658596523163739, + "grad_norm": 16.375, + "learning_rate": 4.558325950193332e-06, + "loss": 1.683427333831787, + "step": 3618 + }, + { + "epoch": 0.6589605897879312, + "grad_norm": 21.625, + "learning_rate": 4.557845597484859e-06, + "loss": 1.6295377016067505, + "step": 3620 + }, + { + "epoch": 0.6593246564121235, + "grad_norm": 9.5625, + "learning_rate": 4.5573650161688585e-06, + "loss": 0.23497727513313293, + "step": 3622 + }, + { + "epoch": 0.6596887230363156, + "grad_norm": 26.125, + "learning_rate": 4.556884206315853e-06, + "loss": 1.352825403213501, + "step": 3624 + }, + { + "epoch": 0.6600527896605078, + "grad_norm": 12.6875, + "learning_rate": 4.556403167996399e-06, + "loss": 1.8081603050231934, + "step": 3626 + }, + { + "epoch": 0.6604168562847001, + "grad_norm": 14.1875, + "learning_rate": 4.555921901281089e-06, + "loss": 1.4837032556533813, + "step": 3628 + }, + { + "epoch": 0.6607809229088923, + "grad_norm": 15.875, + "learning_rate": 4.5554404062405445e-06, + "loss": 1.6521873474121094, + "step": 3630 + }, + { + "epoch": 0.6611449895330845, + "grad_norm": 20.5, + "learning_rate": 4.554958682945425e-06, + "loss": 1.604405403137207, + "step": 3632 + }, + { + "epoch": 0.6615090561572767, + "grad_norm": 19.5, + "learning_rate": 4.554476731466419e-06, + "loss": 1.7262576818466187, + "step": 3634 + }, + { + "epoch": 0.661873122781469, + "grad_norm": 15.125, + "learning_rate": 4.553994551874254e-06, + "loss": 1.9060571193695068, + "step": 3636 + }, + { + "epoch": 0.6622371894056612, + "grad_norm": 12.0, + "learning_rate": 4.5535121442396855e-06, + "loss": 1.4974902868270874, + "step": 3638 + }, + { + "epoch": 0.6626012560298534, + "grad_norm": 13.5625, + "learning_rate": 4.553029508633506e-06, + "loss": 1.9952466487884521, + "step": 3640 + }, + { + "epoch": 0.6629653226540457, + "grad_norm": 20.75, + "learning_rate": 4.55254664512654e-06, + "loss": 1.4698244333267212, + "step": 3642 + }, + { + "epoch": 0.6633293892782379, + "grad_norm": 19.0, + "learning_rate": 4.552063553789645e-06, + "loss": 1.2583839893341064, + "step": 3644 + }, + { + "epoch": 0.6636934559024301, + "grad_norm": 8.0625, + "learning_rate": 4.551580234693715e-06, + "loss": 0.8891459703445435, + "step": 3646 + }, + { + "epoch": 0.6640575225266224, + "grad_norm": 17.375, + "learning_rate": 4.551096687909672e-06, + "loss": 1.5591721534729004, + "step": 3648 + }, + { + "epoch": 0.6644215891508146, + "grad_norm": 16.5, + "learning_rate": 4.550612913508478e-06, + "loss": 2.0533902645111084, + "step": 3650 + }, + { + "epoch": 0.6647856557750068, + "grad_norm": 8.625, + "learning_rate": 4.550128911561121e-06, + "loss": 1.515520453453064, + "step": 3652 + }, + { + "epoch": 0.665149722399199, + "grad_norm": 7.71875, + "learning_rate": 4.5496446821386285e-06, + "loss": 1.4978930950164795, + "step": 3654 + }, + { + "epoch": 0.6655137890233913, + "grad_norm": 9.4375, + "learning_rate": 4.54916022531206e-06, + "loss": 1.4474818706512451, + "step": 3656 + }, + { + "epoch": 0.6658778556475835, + "grad_norm": 12.25, + "learning_rate": 4.548675541152506e-06, + "loss": 2.0054242610931396, + "step": 3658 + }, + { + "epoch": 0.6662419222717757, + "grad_norm": 7.34375, + "learning_rate": 4.548190629731093e-06, + "loss": 1.6663093566894531, + "step": 3660 + }, + { + "epoch": 0.666605988895968, + "grad_norm": 3.296875, + "learning_rate": 4.547705491118979e-06, + "loss": 1.0006725788116455, + "step": 3662 + }, + { + "epoch": 0.6669700555201602, + "grad_norm": 9.5, + "learning_rate": 4.547220125387356e-06, + "loss": 1.2612828016281128, + "step": 3664 + }, + { + "epoch": 0.6673341221443524, + "grad_norm": 9.125, + "learning_rate": 4.546734532607452e-06, + "loss": 1.348300576210022, + "step": 3666 + }, + { + "epoch": 0.6676981887685447, + "grad_norm": 11.625, + "learning_rate": 4.546248712850521e-06, + "loss": 1.8749618530273438, + "step": 3668 + }, + { + "epoch": 0.6680622553927369, + "grad_norm": 16.25, + "learning_rate": 4.54576266618786e-06, + "loss": 1.9296215772628784, + "step": 3670 + }, + { + "epoch": 0.6684263220169291, + "grad_norm": 27.375, + "learning_rate": 4.54527639269079e-06, + "loss": 1.2320010662078857, + "step": 3672 + }, + { + "epoch": 0.6687903886411213, + "grad_norm": 10.75, + "learning_rate": 4.544789892430674e-06, + "loss": 1.4485838413238525, + "step": 3674 + }, + { + "epoch": 0.6691544552653136, + "grad_norm": 12.125, + "learning_rate": 4.544303165478902e-06, + "loss": 1.2830255031585693, + "step": 3676 + }, + { + "epoch": 0.6695185218895058, + "grad_norm": 13.125, + "learning_rate": 4.543816211906896e-06, + "loss": 1.4946999549865723, + "step": 3678 + }, + { + "epoch": 0.669882588513698, + "grad_norm": 10.3125, + "learning_rate": 4.54332903178612e-06, + "loss": 1.488081932067871, + "step": 3680 + }, + { + "epoch": 0.6702466551378903, + "grad_norm": 6.625, + "learning_rate": 4.542841625188062e-06, + "loss": 1.6220935583114624, + "step": 3682 + }, + { + "epoch": 0.6706107217620825, + "grad_norm": 16.875, + "learning_rate": 4.542353992184248e-06, + "loss": 1.7125155925750732, + "step": 3684 + }, + { + "epoch": 0.6709747883862747, + "grad_norm": 21.0, + "learning_rate": 4.541866132846238e-06, + "loss": 2.2610013484954834, + "step": 3686 + }, + { + "epoch": 0.6713388550104669, + "grad_norm": 16.75, + "learning_rate": 4.54137804724562e-06, + "loss": 1.7678964138031006, + "step": 3688 + }, + { + "epoch": 0.6717029216346592, + "grad_norm": 34.25, + "learning_rate": 4.54088973545402e-06, + "loss": 1.7956126928329468, + "step": 3690 + }, + { + "epoch": 0.6720669882588514, + "grad_norm": 8.125, + "learning_rate": 4.540401197543097e-06, + "loss": 0.9123656749725342, + "step": 3692 + }, + { + "epoch": 0.6724310548830436, + "grad_norm": 8.5, + "learning_rate": 4.539912433584541e-06, + "loss": 1.4581201076507568, + "step": 3694 + }, + { + "epoch": 0.6727951215072359, + "grad_norm": 9.9375, + "learning_rate": 4.539423443650076e-06, + "loss": 0.7008066177368164, + "step": 3696 + }, + { + "epoch": 0.673159188131428, + "grad_norm": 11.125, + "learning_rate": 4.538934227811459e-06, + "loss": 1.5727577209472656, + "step": 3698 + }, + { + "epoch": 0.6735232547556202, + "grad_norm": 22.375, + "learning_rate": 4.5384447861404805e-06, + "loss": 1.6482254266738892, + "step": 3700 + }, + { + "epoch": 0.6738873213798126, + "grad_norm": 24.0, + "learning_rate": 4.537955118708965e-06, + "loss": 1.1783530712127686, + "step": 3702 + }, + { + "epoch": 0.6742513880040047, + "grad_norm": 13.4375, + "learning_rate": 4.537465225588769e-06, + "loss": 0.6194881200790405, + "step": 3704 + }, + { + "epoch": 0.6746154546281969, + "grad_norm": 15.9375, + "learning_rate": 4.536975106851781e-06, + "loss": 1.563307285308838, + "step": 3706 + }, + { + "epoch": 0.6749795212523891, + "grad_norm": 10.5, + "learning_rate": 4.5364847625699245e-06, + "loss": 1.803078055381775, + "step": 3708 + }, + { + "epoch": 0.6753435878765814, + "grad_norm": 7.15625, + "learning_rate": 4.535994192815158e-06, + "loss": 1.5700092315673828, + "step": 3710 + }, + { + "epoch": 0.6757076545007736, + "grad_norm": 8.625, + "learning_rate": 4.535503397659466e-06, + "loss": 1.0744454860687256, + "step": 3712 + }, + { + "epoch": 0.6760717211249658, + "grad_norm": 17.625, + "learning_rate": 4.535012377174875e-06, + "loss": 1.1591023206710815, + "step": 3714 + }, + { + "epoch": 0.6764357877491581, + "grad_norm": 26.625, + "learning_rate": 4.534521131433438e-06, + "loss": 1.0465067625045776, + "step": 3716 + }, + { + "epoch": 0.6767998543733503, + "grad_norm": 14.625, + "learning_rate": 4.5340296605072446e-06, + "loss": 1.2641870975494385, + "step": 3718 + }, + { + "epoch": 0.6771639209975425, + "grad_norm": 9.75, + "learning_rate": 4.533537964468414e-06, + "loss": 1.6101582050323486, + "step": 3720 + }, + { + "epoch": 0.6775279876217348, + "grad_norm": 141.0, + "learning_rate": 4.533046043389102e-06, + "loss": 2.221773862838745, + "step": 3722 + }, + { + "epoch": 0.677892054245927, + "grad_norm": 3.515625, + "learning_rate": 4.532553897341497e-06, + "loss": 1.061731219291687, + "step": 3724 + }, + { + "epoch": 0.6782561208701192, + "grad_norm": 9.0625, + "learning_rate": 4.532061526397818e-06, + "loss": 1.4650909900665283, + "step": 3726 + }, + { + "epoch": 0.6786201874943114, + "grad_norm": 11.8125, + "learning_rate": 4.531568930630319e-06, + "loss": 1.4499444961547852, + "step": 3728 + }, + { + "epoch": 0.6789842541185037, + "grad_norm": 15.25, + "learning_rate": 4.531076110111286e-06, + "loss": 1.5956716537475586, + "step": 3730 + }, + { + "epoch": 0.6793483207426959, + "grad_norm": 6.84375, + "learning_rate": 4.53058306491304e-06, + "loss": 1.4599652290344238, + "step": 3732 + }, + { + "epoch": 0.6797123873668881, + "grad_norm": 9.8125, + "learning_rate": 4.530089795107931e-06, + "loss": 1.1326687335968018, + "step": 3734 + }, + { + "epoch": 0.6800764539910804, + "grad_norm": 7.0625, + "learning_rate": 4.529596300768346e-06, + "loss": 1.2323410511016846, + "step": 3736 + }, + { + "epoch": 0.6804405206152726, + "grad_norm": 6.78125, + "learning_rate": 4.529102581966702e-06, + "loss": 0.7173973321914673, + "step": 3738 + }, + { + "epoch": 0.6808045872394648, + "grad_norm": 19.25, + "learning_rate": 4.528608638775451e-06, + "loss": 1.355765700340271, + "step": 3740 + }, + { + "epoch": 0.6811686538636571, + "grad_norm": 17.125, + "learning_rate": 4.528114471267079e-06, + "loss": 0.8032264113426208, + "step": 3742 + }, + { + "epoch": 0.6815327204878493, + "grad_norm": 11.0, + "learning_rate": 4.527620079514099e-06, + "loss": 1.518304705619812, + "step": 3744 + }, + { + "epoch": 0.6818967871120415, + "grad_norm": 9.8125, + "learning_rate": 4.527125463589065e-06, + "loss": 1.0924034118652344, + "step": 3746 + }, + { + "epoch": 0.6822608537362337, + "grad_norm": 21.625, + "learning_rate": 4.526630623564557e-06, + "loss": 1.5667080879211426, + "step": 3748 + }, + { + "epoch": 0.682624920360426, + "grad_norm": 9.4375, + "learning_rate": 4.5261355595131915e-06, + "loss": 1.0316617488861084, + "step": 3750 + }, + { + "epoch": 0.6829889869846182, + "grad_norm": 11.125, + "learning_rate": 4.525640271507619e-06, + "loss": 1.391589879989624, + "step": 3752 + }, + { + "epoch": 0.6833530536088104, + "grad_norm": 14.0625, + "learning_rate": 4.525144759620518e-06, + "loss": 1.4162417650222778, + "step": 3754 + }, + { + "epoch": 0.6837171202330027, + "grad_norm": 17.0, + "learning_rate": 4.524649023924605e-06, + "loss": 1.4036498069763184, + "step": 3756 + }, + { + "epoch": 0.6840811868571949, + "grad_norm": 18.0, + "learning_rate": 4.5241530644926265e-06, + "loss": 1.4906201362609863, + "step": 3758 + }, + { + "epoch": 0.6844452534813871, + "grad_norm": 11.5625, + "learning_rate": 4.523656881397364e-06, + "loss": 1.3111437559127808, + "step": 3760 + }, + { + "epoch": 0.6848093201055793, + "grad_norm": 5.34375, + "learning_rate": 4.523160474711627e-06, + "loss": 1.0921313762664795, + "step": 3762 + }, + { + "epoch": 0.6851733867297716, + "grad_norm": 7.75, + "learning_rate": 4.5226638445082634e-06, + "loss": 1.3556110858917236, + "step": 3764 + }, + { + "epoch": 0.6855374533539638, + "grad_norm": 19.0, + "learning_rate": 4.522166990860153e-06, + "loss": 1.2254915237426758, + "step": 3766 + }, + { + "epoch": 0.685901519978156, + "grad_norm": 11.5, + "learning_rate": 4.521669913840204e-06, + "loss": 1.1682343482971191, + "step": 3768 + }, + { + "epoch": 0.6862655866023483, + "grad_norm": 13.375, + "learning_rate": 4.521172613521363e-06, + "loss": 1.7945854663848877, + "step": 3770 + }, + { + "epoch": 0.6866296532265405, + "grad_norm": 8.375, + "learning_rate": 4.520675089976605e-06, + "loss": 1.7545654773712158, + "step": 3772 + }, + { + "epoch": 0.6869937198507327, + "grad_norm": 11.5, + "learning_rate": 4.520177343278941e-06, + "loss": 1.2681553363800049, + "step": 3774 + }, + { + "epoch": 0.687357786474925, + "grad_norm": 10.875, + "learning_rate": 4.519679373501412e-06, + "loss": 1.019344687461853, + "step": 3776 + }, + { + "epoch": 0.6877218530991172, + "grad_norm": 10.6875, + "learning_rate": 4.519181180717093e-06, + "loss": 1.5107334852218628, + "step": 3778 + }, + { + "epoch": 0.6880859197233093, + "grad_norm": 15.875, + "learning_rate": 4.5186827649990925e-06, + "loss": 1.8209457397460938, + "step": 3780 + }, + { + "epoch": 0.6884499863475015, + "grad_norm": 18.125, + "learning_rate": 4.518184126420553e-06, + "loss": 1.4318132400512695, + "step": 3782 + }, + { + "epoch": 0.6888140529716938, + "grad_norm": 11.75, + "learning_rate": 4.517685265054644e-06, + "loss": 1.5755144357681274, + "step": 3784 + }, + { + "epoch": 0.689178119595886, + "grad_norm": 34.5, + "learning_rate": 4.517186180974573e-06, + "loss": 1.4664556980133057, + "step": 3786 + }, + { + "epoch": 0.6895421862200782, + "grad_norm": 9.375, + "learning_rate": 4.516686874253579e-06, + "loss": 1.6866182088851929, + "step": 3788 + }, + { + "epoch": 0.6899062528442705, + "grad_norm": 44.25, + "learning_rate": 4.516187344964932e-06, + "loss": 1.4259510040283203, + "step": 3790 + }, + { + "epoch": 0.6902703194684627, + "grad_norm": 9.375, + "learning_rate": 4.515687593181938e-06, + "loss": 1.3689099550247192, + "step": 3792 + }, + { + "epoch": 0.6906343860926549, + "grad_norm": 10.1875, + "learning_rate": 4.515187618977931e-06, + "loss": 1.239065408706665, + "step": 3794 + }, + { + "epoch": 0.6909984527168472, + "grad_norm": 47.5, + "learning_rate": 4.5146874224262825e-06, + "loss": 1.429834246635437, + "step": 3796 + }, + { + "epoch": 0.6913625193410394, + "grad_norm": 3.890625, + "learning_rate": 4.5141870036003925e-06, + "loss": 1.1575387716293335, + "step": 3798 + }, + { + "epoch": 0.6917265859652316, + "grad_norm": 11.625, + "learning_rate": 4.513686362573696e-06, + "loss": 1.2679401636123657, + "step": 3800 + }, + { + "epoch": 0.6920906525894238, + "grad_norm": 7.78125, + "learning_rate": 4.513185499419661e-06, + "loss": 1.0431429147720337, + "step": 3802 + }, + { + "epoch": 0.6924547192136161, + "grad_norm": 15.0, + "learning_rate": 4.512684414211787e-06, + "loss": 1.3145463466644287, + "step": 3804 + }, + { + "epoch": 0.6928187858378083, + "grad_norm": 8.5625, + "learning_rate": 4.512183107023603e-06, + "loss": 1.825685739517212, + "step": 3806 + }, + { + "epoch": 0.6931828524620005, + "grad_norm": 28.375, + "learning_rate": 4.511681577928678e-06, + "loss": 1.2586886882781982, + "step": 3808 + }, + { + "epoch": 0.6935469190861928, + "grad_norm": 14.4375, + "learning_rate": 4.511179827000608e-06, + "loss": 0.9188296794891357, + "step": 3810 + }, + { + "epoch": 0.693910985710385, + "grad_norm": 22.875, + "learning_rate": 4.510677854313023e-06, + "loss": 1.6138436794281006, + "step": 3812 + }, + { + "epoch": 0.6942750523345772, + "grad_norm": 3.453125, + "learning_rate": 4.510175659939584e-06, + "loss": 1.3478995561599731, + "step": 3814 + }, + { + "epoch": 0.6946391189587695, + "grad_norm": 49.0, + "learning_rate": 4.5096732439539885e-06, + "loss": 1.5111191272735596, + "step": 3816 + }, + { + "epoch": 0.6950031855829617, + "grad_norm": 19.5, + "learning_rate": 4.509170606429961e-06, + "loss": 1.4348554611206055, + "step": 3818 + }, + { + "epoch": 0.6953672522071539, + "grad_norm": 12.125, + "learning_rate": 4.508667747441264e-06, + "loss": 1.6349647045135498, + "step": 3820 + }, + { + "epoch": 0.6957313188313461, + "grad_norm": 13.9375, + "learning_rate": 4.508164667061689e-06, + "loss": 1.4164223670959473, + "step": 3822 + }, + { + "epoch": 0.6960953854555384, + "grad_norm": 40.5, + "learning_rate": 4.5076613653650614e-06, + "loss": 1.674292802810669, + "step": 3824 + }, + { + "epoch": 0.6964594520797306, + "grad_norm": 8.6875, + "learning_rate": 4.5071578424252385e-06, + "loss": 1.4369826316833496, + "step": 3826 + }, + { + "epoch": 0.6968235187039228, + "grad_norm": 11.75, + "learning_rate": 4.506654098316109e-06, + "loss": 1.206772804260254, + "step": 3828 + }, + { + "epoch": 0.6971875853281151, + "grad_norm": 14.8125, + "learning_rate": 4.506150133111598e-06, + "loss": 0.8857887387275696, + "step": 3830 + }, + { + "epoch": 0.6975516519523073, + "grad_norm": 14.3125, + "learning_rate": 4.505645946885659e-06, + "loss": 1.3125667572021484, + "step": 3832 + }, + { + "epoch": 0.6979157185764995, + "grad_norm": 7.34375, + "learning_rate": 4.505141539712278e-06, + "loss": 1.3718979358673096, + "step": 3834 + }, + { + "epoch": 0.6982797852006917, + "grad_norm": 9.875, + "learning_rate": 4.5046369116654755e-06, + "loss": 1.1708085536956787, + "step": 3836 + }, + { + "epoch": 0.698643851824884, + "grad_norm": 16.75, + "learning_rate": 4.504132062819306e-06, + "loss": 0.8497837781906128, + "step": 3838 + }, + { + "epoch": 0.6990079184490762, + "grad_norm": 103.0, + "learning_rate": 4.503626993247851e-06, + "loss": 0.7867426872253418, + "step": 3840 + }, + { + "epoch": 0.6993719850732684, + "grad_norm": 4.375, + "learning_rate": 4.503121703025227e-06, + "loss": 1.2313120365142822, + "step": 3842 + }, + { + "epoch": 0.6997360516974607, + "grad_norm": 20.0, + "learning_rate": 4.502616192225586e-06, + "loss": 1.697470784187317, + "step": 3844 + }, + { + "epoch": 0.7001001183216529, + "grad_norm": 6.59375, + "learning_rate": 4.502110460923108e-06, + "loss": 1.5227793455123901, + "step": 3846 + }, + { + "epoch": 0.7004641849458451, + "grad_norm": 10.75, + "learning_rate": 4.501604509192008e-06, + "loss": 1.295369267463684, + "step": 3848 + }, + { + "epoch": 0.7008282515700374, + "grad_norm": 23.25, + "learning_rate": 4.501098337106532e-06, + "loss": 1.2205997705459595, + "step": 3850 + }, + { + "epoch": 0.7011923181942296, + "grad_norm": 10.625, + "learning_rate": 4.5005919447409575e-06, + "loss": 1.1970067024230957, + "step": 3852 + }, + { + "epoch": 0.7015563848184218, + "grad_norm": 12.125, + "learning_rate": 4.5000853321695955e-06, + "loss": 1.3742592334747314, + "step": 3854 + }, + { + "epoch": 0.701920451442614, + "grad_norm": 13.0, + "learning_rate": 4.499578499466792e-06, + "loss": 1.643075942993164, + "step": 3856 + }, + { + "epoch": 0.7022845180668063, + "grad_norm": 17.125, + "learning_rate": 4.499071446706921e-06, + "loss": 1.6244299411773682, + "step": 3858 + }, + { + "epoch": 0.7026485846909984, + "grad_norm": 20.0, + "learning_rate": 4.49856417396439e-06, + "loss": 1.6272008419036865, + "step": 3860 + }, + { + "epoch": 0.7030126513151906, + "grad_norm": 9.125, + "learning_rate": 4.498056681313639e-06, + "loss": 1.2644447088241577, + "step": 3862 + }, + { + "epoch": 0.703376717939383, + "grad_norm": 9.0625, + "learning_rate": 4.497548968829143e-06, + "loss": 1.5149474143981934, + "step": 3864 + }, + { + "epoch": 0.7037407845635751, + "grad_norm": 12.0625, + "learning_rate": 4.497041036585404e-06, + "loss": 1.5932220220565796, + "step": 3866 + }, + { + "epoch": 0.7041048511877673, + "grad_norm": 11.9375, + "learning_rate": 4.496532884656957e-06, + "loss": 1.5577473640441895, + "step": 3868 + }, + { + "epoch": 0.7044689178119596, + "grad_norm": 13.75, + "learning_rate": 4.496024513118378e-06, + "loss": 1.675068974494934, + "step": 3870 + }, + { + "epoch": 0.7048329844361518, + "grad_norm": 5.6875, + "learning_rate": 4.495515922044264e-06, + "loss": 1.0313233137130737, + "step": 3872 + }, + { + "epoch": 0.705197051060344, + "grad_norm": 7.84375, + "learning_rate": 4.4950071115092474e-06, + "loss": 1.648760437965393, + "step": 3874 + }, + { + "epoch": 0.7055611176845362, + "grad_norm": 7.15625, + "learning_rate": 4.494498081587997e-06, + "loss": 1.5017939805984497, + "step": 3876 + }, + { + "epoch": 0.7059251843087285, + "grad_norm": 9.0625, + "learning_rate": 4.49398883235521e-06, + "loss": 1.4444807767868042, + "step": 3878 + }, + { + "epoch": 0.7062892509329207, + "grad_norm": 8.375, + "learning_rate": 4.493479363885615e-06, + "loss": 1.3903720378875732, + "step": 3880 + }, + { + "epoch": 0.7066533175571129, + "grad_norm": 6.125, + "learning_rate": 4.4929696762539774e-06, + "loss": 1.3726187944412231, + "step": 3882 + }, + { + "epoch": 0.7070173841813052, + "grad_norm": 8.75, + "learning_rate": 4.49245976953509e-06, + "loss": 1.2965294122695923, + "step": 3884 + }, + { + "epoch": 0.7073814508054974, + "grad_norm": 8.9375, + "learning_rate": 4.49194964380378e-06, + "loss": 1.366483449935913, + "step": 3886 + }, + { + "epoch": 0.7077455174296896, + "grad_norm": 29.625, + "learning_rate": 4.4914392991349055e-06, + "loss": 1.4897915124893188, + "step": 3888 + }, + { + "epoch": 0.7081095840538819, + "grad_norm": 14.375, + "learning_rate": 4.490928735603358e-06, + "loss": 1.9999818801879883, + "step": 3890 + }, + { + "epoch": 0.7084736506780741, + "grad_norm": 23.25, + "learning_rate": 4.490417953284062e-06, + "loss": 1.395628571510315, + "step": 3892 + }, + { + "epoch": 0.7088377173022663, + "grad_norm": 6.125, + "learning_rate": 4.48990695225197e-06, + "loss": 1.1906358003616333, + "step": 3894 + }, + { + "epoch": 0.7092017839264585, + "grad_norm": 11.625, + "learning_rate": 4.4893957325820725e-06, + "loss": 1.257643699645996, + "step": 3896 + }, + { + "epoch": 0.7095658505506508, + "grad_norm": 6.78125, + "learning_rate": 4.488884294349386e-06, + "loss": 1.3802566528320312, + "step": 3898 + }, + { + "epoch": 0.709929917174843, + "grad_norm": 29.625, + "learning_rate": 4.4883726376289624e-06, + "loss": 1.5334763526916504, + "step": 3900 + }, + { + "epoch": 0.7102939837990352, + "grad_norm": 6.90625, + "learning_rate": 4.487860762495888e-06, + "loss": 1.1014628410339355, + "step": 3902 + }, + { + "epoch": 0.7106580504232275, + "grad_norm": 11.1875, + "learning_rate": 4.487348669025275e-06, + "loss": 0.813360333442688, + "step": 3904 + }, + { + "epoch": 0.7110221170474197, + "grad_norm": 9.0625, + "learning_rate": 4.486836357292273e-06, + "loss": 1.1505460739135742, + "step": 3906 + }, + { + "epoch": 0.7113861836716119, + "grad_norm": 14.1875, + "learning_rate": 4.4863238273720625e-06, + "loss": 1.627068042755127, + "step": 3908 + }, + { + "epoch": 0.7117502502958041, + "grad_norm": 32.25, + "learning_rate": 4.485811079339852e-06, + "loss": 1.280656337738037, + "step": 3910 + }, + { + "epoch": 0.7121143169199964, + "grad_norm": 8.9375, + "learning_rate": 4.485298113270887e-06, + "loss": 1.5592602491378784, + "step": 3912 + }, + { + "epoch": 0.7124783835441886, + "grad_norm": 19.625, + "learning_rate": 4.484784929240445e-06, + "loss": 1.2622069120407104, + "step": 3914 + }, + { + "epoch": 0.7128424501683808, + "grad_norm": 18.25, + "learning_rate": 4.484271527323831e-06, + "loss": 1.570495843887329, + "step": 3916 + }, + { + "epoch": 0.7132065167925731, + "grad_norm": 9.875, + "learning_rate": 4.483757907596386e-06, + "loss": 1.0512316226959229, + "step": 3918 + }, + { + "epoch": 0.7135705834167653, + "grad_norm": 16.125, + "learning_rate": 4.48324407013348e-06, + "loss": 1.3812226057052612, + "step": 3920 + }, + { + "epoch": 0.7139346500409575, + "grad_norm": 16.25, + "learning_rate": 4.482730015010519e-06, + "loss": 0.9664490222930908, + "step": 3922 + }, + { + "epoch": 0.7142987166651498, + "grad_norm": 25.0, + "learning_rate": 4.482215742302937e-06, + "loss": 1.4992426633834839, + "step": 3924 + }, + { + "epoch": 0.714662783289342, + "grad_norm": 7.625, + "learning_rate": 4.481701252086201e-06, + "loss": 1.301613211631775, + "step": 3926 + }, + { + "epoch": 0.7150268499135342, + "grad_norm": 18.375, + "learning_rate": 4.481186544435812e-06, + "loss": 1.4645214080810547, + "step": 3928 + }, + { + "epoch": 0.7153909165377264, + "grad_norm": 9.6875, + "learning_rate": 4.4806716194273e-06, + "loss": 1.3729705810546875, + "step": 3930 + }, + { + "epoch": 0.7157549831619187, + "grad_norm": 7.71875, + "learning_rate": 4.480156477136229e-06, + "loss": 0.9819673299789429, + "step": 3932 + }, + { + "epoch": 0.7161190497861109, + "grad_norm": 14.25, + "learning_rate": 4.479641117638193e-06, + "loss": 1.7443373203277588, + "step": 3934 + }, + { + "epoch": 0.716483116410303, + "grad_norm": 12.25, + "learning_rate": 4.479125541008819e-06, + "loss": 1.4857553243637085, + "step": 3936 + }, + { + "epoch": 0.7168471830344953, + "grad_norm": 13.3125, + "learning_rate": 4.478609747323767e-06, + "loss": 1.4183406829833984, + "step": 3938 + }, + { + "epoch": 0.7172112496586875, + "grad_norm": 2.234375, + "learning_rate": 4.478093736658725e-06, + "loss": 0.9251596927642822, + "step": 3940 + }, + { + "epoch": 0.7175753162828797, + "grad_norm": 18.125, + "learning_rate": 4.477577509089419e-06, + "loss": 1.0102472305297852, + "step": 3942 + }, + { + "epoch": 0.717939382907072, + "grad_norm": 15.5625, + "learning_rate": 4.4770610646916025e-06, + "loss": 0.7217501401901245, + "step": 3944 + }, + { + "epoch": 0.7183034495312642, + "grad_norm": 12.5, + "learning_rate": 4.47654440354106e-06, + "loss": 1.6710395812988281, + "step": 3946 + }, + { + "epoch": 0.7186675161554564, + "grad_norm": 63.25, + "learning_rate": 4.47602752571361e-06, + "loss": 1.7212402820587158, + "step": 3948 + }, + { + "epoch": 0.7190315827796486, + "grad_norm": 10.1875, + "learning_rate": 4.475510431285102e-06, + "loss": 1.3765300512313843, + "step": 3950 + }, + { + "epoch": 0.7193956494038409, + "grad_norm": 9.0625, + "learning_rate": 4.474993120331418e-06, + "loss": 1.3453419208526611, + "step": 3952 + }, + { + "epoch": 0.7197597160280331, + "grad_norm": 11.375, + "learning_rate": 4.4744755929284714e-06, + "loss": 1.4030954837799072, + "step": 3954 + }, + { + "epoch": 0.7201237826522253, + "grad_norm": 12.8125, + "learning_rate": 4.473957849152207e-06, + "loss": 1.8009711503982544, + "step": 3956 + }, + { + "epoch": 0.7204878492764176, + "grad_norm": 6.5625, + "learning_rate": 4.473439889078602e-06, + "loss": 1.1683478355407715, + "step": 3958 + }, + { + "epoch": 0.7208519159006098, + "grad_norm": 11.3125, + "learning_rate": 4.472921712783665e-06, + "loss": 1.19291353225708, + "step": 3960 + }, + { + "epoch": 0.721215982524802, + "grad_norm": 17.125, + "learning_rate": 4.4724033203434345e-06, + "loss": 1.2174474000930786, + "step": 3962 + }, + { + "epoch": 0.7215800491489943, + "grad_norm": 6.96875, + "learning_rate": 4.471884711833985e-06, + "loss": 1.6224758625030518, + "step": 3964 + }, + { + "epoch": 0.7219441157731865, + "grad_norm": 16.125, + "learning_rate": 4.471365887331418e-06, + "loss": 1.1545883417129517, + "step": 3966 + }, + { + "epoch": 0.7223081823973787, + "grad_norm": 15.875, + "learning_rate": 4.47084684691187e-06, + "loss": 1.8949850797653198, + "step": 3968 + }, + { + "epoch": 0.7226722490215709, + "grad_norm": 9.875, + "learning_rate": 4.4703275906515095e-06, + "loss": 1.6063568592071533, + "step": 3970 + }, + { + "epoch": 0.7230363156457632, + "grad_norm": 19.0, + "learning_rate": 4.469808118626534e-06, + "loss": 1.2111185789108276, + "step": 3972 + }, + { + "epoch": 0.7234003822699554, + "grad_norm": 22.875, + "learning_rate": 4.469288430913172e-06, + "loss": 0.995964527130127, + "step": 3974 + }, + { + "epoch": 0.7237644488941476, + "grad_norm": 4.65625, + "learning_rate": 4.468768527587688e-06, + "loss": 1.2622394561767578, + "step": 3976 + }, + { + "epoch": 0.7241285155183399, + "grad_norm": 6.46875, + "learning_rate": 4.468248408726376e-06, + "loss": 1.2863210439682007, + "step": 3978 + }, + { + "epoch": 0.7244925821425321, + "grad_norm": 15.5625, + "learning_rate": 4.467728074405558e-06, + "loss": 1.3262053728103638, + "step": 3980 + }, + { + "epoch": 0.7248566487667243, + "grad_norm": 7.28125, + "learning_rate": 4.467207524701595e-06, + "loss": 1.5426204204559326, + "step": 3982 + }, + { + "epoch": 0.7252207153909166, + "grad_norm": 5.8125, + "learning_rate": 4.466686759690874e-06, + "loss": 1.0809754133224487, + "step": 3984 + }, + { + "epoch": 0.7255847820151088, + "grad_norm": 14.9375, + "learning_rate": 4.466165779449814e-06, + "loss": 1.4829933643341064, + "step": 3986 + }, + { + "epoch": 0.725948848639301, + "grad_norm": 118.5, + "learning_rate": 4.465644584054868e-06, + "loss": 1.3769590854644775, + "step": 3988 + }, + { + "epoch": 0.7263129152634932, + "grad_norm": 61.75, + "learning_rate": 4.465123173582519e-06, + "loss": 1.9018545150756836, + "step": 3990 + }, + { + "epoch": 0.7266769818876855, + "grad_norm": 10.5, + "learning_rate": 4.4646015481092805e-06, + "loss": 1.611771821975708, + "step": 3992 + }, + { + "epoch": 0.7270410485118777, + "grad_norm": 11.875, + "learning_rate": 4.464079707711703e-06, + "loss": 1.387900471687317, + "step": 3994 + }, + { + "epoch": 0.7274051151360699, + "grad_norm": 12.0, + "learning_rate": 4.46355765246636e-06, + "loss": 1.3997489213943481, + "step": 3996 + }, + { + "epoch": 0.7277691817602622, + "grad_norm": 18.375, + "learning_rate": 4.463035382449864e-06, + "loss": 1.6485671997070312, + "step": 3998 + }, + { + "epoch": 0.7281332483844544, + "grad_norm": 13.625, + "learning_rate": 4.462512897738855e-06, + "loss": 1.6315934658050537, + "step": 4000 + }, + { + "epoch": 0.7284973150086466, + "grad_norm": 13.125, + "learning_rate": 4.461990198410003e-06, + "loss": 1.6964771747589111, + "step": 4002 + }, + { + "epoch": 0.7288613816328388, + "grad_norm": 15.0, + "learning_rate": 4.461467284540016e-06, + "loss": 1.8010307550430298, + "step": 4004 + }, + { + "epoch": 0.7292254482570311, + "grad_norm": 6.5, + "learning_rate": 4.460944156205628e-06, + "loss": 1.5327054262161255, + "step": 4006 + }, + { + "epoch": 0.7295895148812233, + "grad_norm": 10.4375, + "learning_rate": 4.460420813483605e-06, + "loss": 1.2161662578582764, + "step": 4008 + }, + { + "epoch": 0.7299535815054154, + "grad_norm": 7.71875, + "learning_rate": 4.459897256450747e-06, + "loss": 1.3260434865951538, + "step": 4010 + }, + { + "epoch": 0.7303176481296078, + "grad_norm": 33.5, + "learning_rate": 4.459373485183882e-06, + "loss": 1.3870368003845215, + "step": 4012 + }, + { + "epoch": 0.7306817147538, + "grad_norm": 10.0, + "learning_rate": 4.458849499759873e-06, + "loss": 1.7670800685882568, + "step": 4014 + }, + { + "epoch": 0.7310457813779921, + "grad_norm": 16.5, + "learning_rate": 4.4583253002556126e-06, + "loss": 1.6878101825714111, + "step": 4016 + }, + { + "epoch": 0.7314098480021844, + "grad_norm": 23.75, + "learning_rate": 4.457800886748024e-06, + "loss": 0.6901309490203857, + "step": 4018 + }, + { + "epoch": 0.7317739146263766, + "grad_norm": 5.1875, + "learning_rate": 4.457276259314063e-06, + "loss": 1.0010206699371338, + "step": 4020 + }, + { + "epoch": 0.7321379812505688, + "grad_norm": 29.0, + "learning_rate": 4.456751418030717e-06, + "loss": 1.601664423942566, + "step": 4022 + }, + { + "epoch": 0.732502047874761, + "grad_norm": 14.125, + "learning_rate": 4.456226362975004e-06, + "loss": 2.0186665058135986, + "step": 4024 + }, + { + "epoch": 0.7328661144989533, + "grad_norm": 5.15625, + "learning_rate": 4.455701094223973e-06, + "loss": 1.2741717100143433, + "step": 4026 + }, + { + "epoch": 0.7332301811231455, + "grad_norm": 12.25, + "learning_rate": 4.455175611854708e-06, + "loss": 1.0950629711151123, + "step": 4028 + }, + { + "epoch": 0.7335942477473377, + "grad_norm": 9.6875, + "learning_rate": 4.454649915944317e-06, + "loss": 1.4390480518341064, + "step": 4030 + }, + { + "epoch": 0.73395831437153, + "grad_norm": 8.5, + "learning_rate": 4.454124006569948e-06, + "loss": 0.883948802947998, + "step": 4032 + }, + { + "epoch": 0.7343223809957222, + "grad_norm": 19.0, + "learning_rate": 4.4535978838087725e-06, + "loss": 1.3211839199066162, + "step": 4034 + }, + { + "epoch": 0.7346864476199144, + "grad_norm": 5.25, + "learning_rate": 4.4530715477379995e-06, + "loss": 0.5684911012649536, + "step": 4036 + }, + { + "epoch": 0.7350505142441067, + "grad_norm": 7.5, + "learning_rate": 4.452544998434864e-06, + "loss": 1.5237979888916016, + "step": 4038 + }, + { + "epoch": 0.7354145808682989, + "grad_norm": 12.5625, + "learning_rate": 4.452018235976638e-06, + "loss": 1.486379623413086, + "step": 4040 + }, + { + "epoch": 0.7357786474924911, + "grad_norm": 9.5625, + "learning_rate": 4.45149126044062e-06, + "loss": 1.3715254068374634, + "step": 4042 + }, + { + "epoch": 0.7361427141166833, + "grad_norm": 8.8125, + "learning_rate": 4.4509640719041425e-06, + "loss": 1.466097116470337, + "step": 4044 + }, + { + "epoch": 0.7365067807408756, + "grad_norm": 12.5, + "learning_rate": 4.450436670444568e-06, + "loss": 1.5135843753814697, + "step": 4046 + }, + { + "epoch": 0.7368708473650678, + "grad_norm": 27.75, + "learning_rate": 4.449909056139289e-06, + "loss": 1.5392903089523315, + "step": 4048 + }, + { + "epoch": 0.73723491398926, + "grad_norm": 16.875, + "learning_rate": 4.449381229065734e-06, + "loss": 1.4597059488296509, + "step": 4050 + }, + { + "epoch": 0.7375989806134523, + "grad_norm": 13.875, + "learning_rate": 4.448853189301354e-06, + "loss": 1.262041449546814, + "step": 4052 + }, + { + "epoch": 0.7379630472376445, + "grad_norm": 11.625, + "learning_rate": 4.448324936923643e-06, + "loss": 1.4917747974395752, + "step": 4054 + }, + { + "epoch": 0.7383271138618367, + "grad_norm": 29.5, + "learning_rate": 4.447796472010116e-06, + "loss": 2.0125930309295654, + "step": 4056 + }, + { + "epoch": 0.738691180486029, + "grad_norm": 6.0, + "learning_rate": 4.4472677946383245e-06, + "loss": 1.0335878133773804, + "step": 4058 + }, + { + "epoch": 0.7390552471102212, + "grad_norm": 10.25, + "learning_rate": 4.446738904885849e-06, + "loss": 1.2267245054244995, + "step": 4060 + }, + { + "epoch": 0.7394193137344134, + "grad_norm": 14.625, + "learning_rate": 4.446209802830303e-06, + "loss": 2.0021884441375732, + "step": 4062 + }, + { + "epoch": 0.7397833803586056, + "grad_norm": 11.3125, + "learning_rate": 4.445680488549329e-06, + "loss": 1.563736915588379, + "step": 4064 + }, + { + "epoch": 0.7401474469827979, + "grad_norm": 6.625, + "learning_rate": 4.445150962120601e-06, + "loss": 1.5454363822937012, + "step": 4066 + }, + { + "epoch": 0.7405115136069901, + "grad_norm": 13.1875, + "learning_rate": 4.444621223621827e-06, + "loss": 1.5436625480651855, + "step": 4068 + }, + { + "epoch": 0.7408755802311823, + "grad_norm": 7.25, + "learning_rate": 4.444091273130744e-06, + "loss": 1.4621388912200928, + "step": 4070 + }, + { + "epoch": 0.7412396468553746, + "grad_norm": 30.25, + "learning_rate": 4.443561110725118e-06, + "loss": 1.1844981908798218, + "step": 4072 + }, + { + "epoch": 0.7416037134795668, + "grad_norm": 42.75, + "learning_rate": 4.44303073648275e-06, + "loss": 1.3885607719421387, + "step": 4074 + }, + { + "epoch": 0.741967780103759, + "grad_norm": 17.75, + "learning_rate": 4.442500150481468e-06, + "loss": 1.5434989929199219, + "step": 4076 + }, + { + "epoch": 0.7423318467279512, + "grad_norm": 9.3125, + "learning_rate": 4.4419693527991356e-06, + "loss": 1.6302589178085327, + "step": 4078 + }, + { + "epoch": 0.7426959133521435, + "grad_norm": 9.4375, + "learning_rate": 4.441438343513644e-06, + "loss": 1.5927282571792603, + "step": 4080 + }, + { + "epoch": 0.7430599799763357, + "grad_norm": 18.75, + "learning_rate": 4.440907122702919e-06, + "loss": 1.510801911354065, + "step": 4082 + }, + { + "epoch": 0.7434240466005279, + "grad_norm": 7.375, + "learning_rate": 4.440375690444911e-06, + "loss": 0.5664615035057068, + "step": 4084 + }, + { + "epoch": 0.7437881132247202, + "grad_norm": 12.0, + "learning_rate": 4.439844046817609e-06, + "loss": 1.4451801776885986, + "step": 4086 + }, + { + "epoch": 0.7441521798489124, + "grad_norm": 10.9375, + "learning_rate": 4.439312191899028e-06, + "loss": 1.762737512588501, + "step": 4088 + }, + { + "epoch": 0.7445162464731045, + "grad_norm": 5.0, + "learning_rate": 4.438780125767216e-06, + "loss": 1.3794842958450317, + "step": 4090 + }, + { + "epoch": 0.7448803130972969, + "grad_norm": 11.625, + "learning_rate": 4.438247848500251e-06, + "loss": 1.5104601383209229, + "step": 4092 + }, + { + "epoch": 0.745244379721489, + "grad_norm": 13.5625, + "learning_rate": 4.4377153601762435e-06, + "loss": 2.0323023796081543, + "step": 4094 + }, + { + "epoch": 0.7456084463456812, + "grad_norm": 12.0, + "learning_rate": 4.437182660873334e-06, + "loss": 1.3515594005584717, + "step": 4096 + }, + { + "epoch": 0.7459725129698734, + "grad_norm": 7.75, + "learning_rate": 4.436649750669692e-06, + "loss": 1.6212824583053589, + "step": 4098 + }, + { + "epoch": 0.7463365795940657, + "grad_norm": 8.1875, + "learning_rate": 4.436116629643522e-06, + "loss": 1.426540732383728, + "step": 4100 + }, + { + "epoch": 0.7467006462182579, + "grad_norm": 18.125, + "learning_rate": 4.4355832978730566e-06, + "loss": 1.4811890125274658, + "step": 4102 + }, + { + "epoch": 0.7470647128424501, + "grad_norm": 11.5, + "learning_rate": 4.43504975543656e-06, + "loss": 1.505039930343628, + "step": 4104 + }, + { + "epoch": 0.7474287794666424, + "grad_norm": 10.25, + "learning_rate": 4.434516002412328e-06, + "loss": 1.4368852376937866, + "step": 4106 + }, + { + "epoch": 0.7477928460908346, + "grad_norm": 7.625, + "learning_rate": 4.433982038878686e-06, + "loss": 1.3733965158462524, + "step": 4108 + }, + { + "epoch": 0.7481569127150268, + "grad_norm": 3.734375, + "learning_rate": 4.4334478649139915e-06, + "loss": 0.9276962280273438, + "step": 4110 + }, + { + "epoch": 0.7485209793392191, + "grad_norm": 51.25, + "learning_rate": 4.4329134805966315e-06, + "loss": 1.2068966627120972, + "step": 4112 + }, + { + "epoch": 0.7488850459634113, + "grad_norm": 6.75, + "learning_rate": 4.432378886005025e-06, + "loss": 0.8691614866256714, + "step": 4114 + }, + { + "epoch": 0.7492491125876035, + "grad_norm": 13.25, + "learning_rate": 4.431844081217622e-06, + "loss": 1.483361005783081, + "step": 4116 + }, + { + "epoch": 0.7496131792117957, + "grad_norm": 27.25, + "learning_rate": 4.431309066312903e-06, + "loss": 1.6403098106384277, + "step": 4118 + }, + { + "epoch": 0.749977245835988, + "grad_norm": 26.25, + "learning_rate": 4.43077384136938e-06, + "loss": 1.5191655158996582, + "step": 4120 + }, + { + "epoch": 0.7503413124601802, + "grad_norm": 8.9375, + "learning_rate": 4.430238406465594e-06, + "loss": 1.4765812158584595, + "step": 4122 + }, + { + "epoch": 0.7507053790843724, + "grad_norm": 7.5625, + "learning_rate": 4.429702761680117e-06, + "loss": 1.2506628036499023, + "step": 4124 + }, + { + "epoch": 0.7510694457085647, + "grad_norm": 11.875, + "learning_rate": 4.429166907091554e-06, + "loss": 1.8009881973266602, + "step": 4126 + }, + { + "epoch": 0.7514335123327569, + "grad_norm": 14.125, + "learning_rate": 4.4286308427785394e-06, + "loss": 1.8766233921051025, + "step": 4128 + }, + { + "epoch": 0.7517975789569491, + "grad_norm": 7.03125, + "learning_rate": 4.42809456881974e-06, + "loss": 1.3045039176940918, + "step": 4130 + }, + { + "epoch": 0.7521616455811414, + "grad_norm": 5.15625, + "learning_rate": 4.4275580852938485e-06, + "loss": 0.890677273273468, + "step": 4132 + }, + { + "epoch": 0.7525257122053336, + "grad_norm": 9.1875, + "learning_rate": 4.4270213922795935e-06, + "loss": 1.0816421508789062, + "step": 4134 + }, + { + "epoch": 0.7528897788295258, + "grad_norm": 12.0, + "learning_rate": 4.426484489855733e-06, + "loss": 1.4418411254882812, + "step": 4136 + }, + { + "epoch": 0.753253845453718, + "grad_norm": 7.28125, + "learning_rate": 4.425947378101054e-06, + "loss": 1.4546067714691162, + "step": 4138 + }, + { + "epoch": 0.7536179120779103, + "grad_norm": 9.125, + "learning_rate": 4.425410057094377e-06, + "loss": 1.187925934791565, + "step": 4140 + }, + { + "epoch": 0.7539819787021025, + "grad_norm": 19.25, + "learning_rate": 4.42487252691455e-06, + "loss": 1.7679461240768433, + "step": 4142 + }, + { + "epoch": 0.7543460453262947, + "grad_norm": 23.25, + "learning_rate": 4.424334787640454e-06, + "loss": 1.4857308864593506, + "step": 4144 + }, + { + "epoch": 0.754710111950487, + "grad_norm": 10.125, + "learning_rate": 4.423796839351001e-06, + "loss": 1.3260689973831177, + "step": 4146 + }, + { + "epoch": 0.7550741785746792, + "grad_norm": 13.6875, + "learning_rate": 4.4232586821251325e-06, + "loss": 1.6781939268112183, + "step": 4148 + }, + { + "epoch": 0.7554382451988714, + "grad_norm": 6.875, + "learning_rate": 4.4227203160418185e-06, + "loss": 0.9126325845718384, + "step": 4150 + }, + { + "epoch": 0.7558023118230636, + "grad_norm": 25.75, + "learning_rate": 4.422181741180065e-06, + "loss": 1.2500287294387817, + "step": 4152 + }, + { + "epoch": 0.7561663784472559, + "grad_norm": 12.9375, + "learning_rate": 4.421642957618905e-06, + "loss": 1.8193004131317139, + "step": 4154 + }, + { + "epoch": 0.7565304450714481, + "grad_norm": 13.5, + "learning_rate": 4.421103965437401e-06, + "loss": 1.2327810525894165, + "step": 4156 + }, + { + "epoch": 0.7568945116956403, + "grad_norm": 9.5, + "learning_rate": 4.4205647647146495e-06, + "loss": 1.4102716445922852, + "step": 4158 + }, + { + "epoch": 0.7572585783198326, + "grad_norm": 29.625, + "learning_rate": 4.420025355529778e-06, + "loss": 1.4116885662078857, + "step": 4160 + }, + { + "epoch": 0.7576226449440248, + "grad_norm": 12.8125, + "learning_rate": 4.419485737961938e-06, + "loss": 1.8389171361923218, + "step": 4162 + }, + { + "epoch": 0.757986711568217, + "grad_norm": 13.8125, + "learning_rate": 4.41894591209032e-06, + "loss": 1.4878777265548706, + "step": 4164 + }, + { + "epoch": 0.7583507781924093, + "grad_norm": 14.0, + "learning_rate": 4.41840587799414e-06, + "loss": 1.5728201866149902, + "step": 4166 + }, + { + "epoch": 0.7587148448166015, + "grad_norm": 21.5, + "learning_rate": 4.417865635752644e-06, + "loss": 1.3733251094818115, + "step": 4168 + }, + { + "epoch": 0.7590789114407936, + "grad_norm": 6.0, + "learning_rate": 4.4173251854451135e-06, + "loss": 1.3655896186828613, + "step": 4170 + }, + { + "epoch": 0.7594429780649858, + "grad_norm": 8.375, + "learning_rate": 4.416784527150856e-06, + "loss": 1.2453241348266602, + "step": 4172 + }, + { + "epoch": 0.7598070446891781, + "grad_norm": 9.6875, + "learning_rate": 4.41624366094921e-06, + "loss": 1.616510272026062, + "step": 4174 + }, + { + "epoch": 0.7601711113133703, + "grad_norm": 14.5, + "learning_rate": 4.415702586919547e-06, + "loss": 1.441003680229187, + "step": 4176 + }, + { + "epoch": 0.7605351779375625, + "grad_norm": 7.84375, + "learning_rate": 4.415161305141267e-06, + "loss": 1.2992274761199951, + "step": 4178 + }, + { + "epoch": 0.7608992445617548, + "grad_norm": 16.5, + "learning_rate": 4.414619815693799e-06, + "loss": 1.259737253189087, + "step": 4180 + }, + { + "epoch": 0.761263311185947, + "grad_norm": 11.0625, + "learning_rate": 4.4140781186566075e-06, + "loss": 1.9988055229187012, + "step": 4182 + }, + { + "epoch": 0.7616273778101392, + "grad_norm": 15.75, + "learning_rate": 4.413536214109183e-06, + "loss": 1.6156141757965088, + "step": 4184 + }, + { + "epoch": 0.7619914444343315, + "grad_norm": 24.5, + "learning_rate": 4.412994102131046e-06, + "loss": 1.521872878074646, + "step": 4186 + }, + { + "epoch": 0.7623555110585237, + "grad_norm": 7.78125, + "learning_rate": 4.4124517828017534e-06, + "loss": 1.257689118385315, + "step": 4188 + }, + { + "epoch": 0.7627195776827159, + "grad_norm": 24.75, + "learning_rate": 4.411909256200884e-06, + "loss": 1.377212405204773, + "step": 4190 + }, + { + "epoch": 0.7630836443069081, + "grad_norm": 7.34375, + "learning_rate": 4.411366522408054e-06, + "loss": 1.0804977416992188, + "step": 4192 + }, + { + "epoch": 0.7634477109311004, + "grad_norm": 9.5625, + "learning_rate": 4.4108235815029056e-06, + "loss": 1.5017775297164917, + "step": 4194 + }, + { + "epoch": 0.7638117775552926, + "grad_norm": 69.0, + "learning_rate": 4.410280433565115e-06, + "loss": 1.1005418300628662, + "step": 4196 + }, + { + "epoch": 0.7641758441794848, + "grad_norm": 12.0625, + "learning_rate": 4.409737078674387e-06, + "loss": 1.6864604949951172, + "step": 4198 + }, + { + "epoch": 0.7645399108036771, + "grad_norm": 10.1875, + "learning_rate": 4.4091935169104535e-06, + "loss": 1.6279773712158203, + "step": 4200 + }, + { + "epoch": 0.7649039774278693, + "grad_norm": 19.875, + "learning_rate": 4.408649748353083e-06, + "loss": 1.8135032653808594, + "step": 4202 + }, + { + "epoch": 0.7652680440520615, + "grad_norm": 12.0625, + "learning_rate": 4.408105773082072e-06, + "loss": 1.5221168994903564, + "step": 4204 + }, + { + "epoch": 0.7656321106762538, + "grad_norm": 7.3125, + "learning_rate": 4.407561591177245e-06, + "loss": 1.4761295318603516, + "step": 4206 + }, + { + "epoch": 0.765996177300446, + "grad_norm": 8.25, + "learning_rate": 4.407017202718459e-06, + "loss": 1.5649135112762451, + "step": 4208 + }, + { + "epoch": 0.7663602439246382, + "grad_norm": 9.5625, + "learning_rate": 4.406472607785599e-06, + "loss": 1.192299485206604, + "step": 4210 + }, + { + "epoch": 0.7667243105488304, + "grad_norm": 11.0, + "learning_rate": 4.405927806458586e-06, + "loss": 1.4417957067489624, + "step": 4212 + }, + { + "epoch": 0.7670883771730227, + "grad_norm": 9.0625, + "learning_rate": 4.405382798817364e-06, + "loss": 1.2320743799209595, + "step": 4214 + }, + { + "epoch": 0.7674524437972149, + "grad_norm": 27.25, + "learning_rate": 4.404837584941911e-06, + "loss": 1.0693466663360596, + "step": 4216 + }, + { + "epoch": 0.7678165104214071, + "grad_norm": 5.1875, + "learning_rate": 4.404292164912237e-06, + "loss": 1.280682921409607, + "step": 4218 + }, + { + "epoch": 0.7681805770455994, + "grad_norm": 8.4375, + "learning_rate": 4.4037465388083785e-06, + "loss": 1.1720728874206543, + "step": 4220 + }, + { + "epoch": 0.7685446436697916, + "grad_norm": 8.9375, + "learning_rate": 4.403200706710404e-06, + "loss": 1.6981515884399414, + "step": 4222 + }, + { + "epoch": 0.7689087102939838, + "grad_norm": 31.75, + "learning_rate": 4.402654668698413e-06, + "loss": 1.984631061553955, + "step": 4224 + }, + { + "epoch": 0.769272776918176, + "grad_norm": 30.125, + "learning_rate": 4.402108424852533e-06, + "loss": 1.3658406734466553, + "step": 4226 + }, + { + "epoch": 0.7696368435423683, + "grad_norm": 19.0, + "learning_rate": 4.401561975252926e-06, + "loss": 1.6068122386932373, + "step": 4228 + }, + { + "epoch": 0.7700009101665605, + "grad_norm": 21.75, + "learning_rate": 4.401015319979777e-06, + "loss": 1.462298035621643, + "step": 4230 + }, + { + "epoch": 0.7703649767907527, + "grad_norm": 14.875, + "learning_rate": 4.400468459113308e-06, + "loss": 1.1048028469085693, + "step": 4232 + }, + { + "epoch": 0.770729043414945, + "grad_norm": 28.25, + "learning_rate": 4.399921392733769e-06, + "loss": 0.6294914484024048, + "step": 4234 + }, + { + "epoch": 0.7710931100391372, + "grad_norm": 17.0, + "learning_rate": 4.399374120921439e-06, + "loss": 1.5844395160675049, + "step": 4236 + }, + { + "epoch": 0.7714571766633294, + "grad_norm": 9.5625, + "learning_rate": 4.398826643756628e-06, + "loss": 1.5203673839569092, + "step": 4238 + }, + { + "epoch": 0.7718212432875217, + "grad_norm": 10.0625, + "learning_rate": 4.398278961319674e-06, + "loss": 1.6357944011688232, + "step": 4240 + }, + { + "epoch": 0.7721853099117139, + "grad_norm": 8.3125, + "learning_rate": 4.397731073690951e-06, + "loss": 1.2385048866271973, + "step": 4242 + }, + { + "epoch": 0.772549376535906, + "grad_norm": 7.8125, + "learning_rate": 4.397182980950857e-06, + "loss": 1.3403857946395874, + "step": 4244 + }, + { + "epoch": 0.7729134431600982, + "grad_norm": 9.6875, + "learning_rate": 4.396634683179823e-06, + "loss": 1.2890541553497314, + "step": 4246 + }, + { + "epoch": 0.7732775097842906, + "grad_norm": 17.25, + "learning_rate": 4.396086180458309e-06, + "loss": 1.4887521266937256, + "step": 4248 + }, + { + "epoch": 0.7736415764084827, + "grad_norm": 18.75, + "learning_rate": 4.395537472866805e-06, + "loss": 1.7618627548217773, + "step": 4250 + }, + { + "epoch": 0.7740056430326749, + "grad_norm": 10.25, + "learning_rate": 4.394988560485835e-06, + "loss": 1.5332757234573364, + "step": 4252 + }, + { + "epoch": 0.7743697096568672, + "grad_norm": 33.5, + "learning_rate": 4.3944394433959445e-06, + "loss": 1.649156093597412, + "step": 4254 + }, + { + "epoch": 0.7747337762810594, + "grad_norm": 10.5625, + "learning_rate": 4.393890121677718e-06, + "loss": 1.6425827741622925, + "step": 4256 + }, + { + "epoch": 0.7750978429052516, + "grad_norm": 12.6875, + "learning_rate": 4.3933405954117655e-06, + "loss": 1.2292001247406006, + "step": 4258 + }, + { + "epoch": 0.7754619095294439, + "grad_norm": 16.5, + "learning_rate": 4.392790864678728e-06, + "loss": 1.3680897951126099, + "step": 4260 + }, + { + "epoch": 0.7758259761536361, + "grad_norm": 20.125, + "learning_rate": 4.392240929559274e-06, + "loss": 1.5855176448822021, + "step": 4262 + }, + { + "epoch": 0.7761900427778283, + "grad_norm": 12.625, + "learning_rate": 4.391690790134105e-06, + "loss": 1.9958208799362183, + "step": 4264 + }, + { + "epoch": 0.7765541094020205, + "grad_norm": 5.15625, + "learning_rate": 4.3911404464839546e-06, + "loss": 1.373746395111084, + "step": 4266 + }, + { + "epoch": 0.7769181760262128, + "grad_norm": 8.0, + "learning_rate": 4.390589898689581e-06, + "loss": 1.199842929840088, + "step": 4268 + }, + { + "epoch": 0.777282242650405, + "grad_norm": 8.5625, + "learning_rate": 4.390039146831775e-06, + "loss": 1.4002423286437988, + "step": 4270 + }, + { + "epoch": 0.7776463092745972, + "grad_norm": 10.875, + "learning_rate": 4.389488190991358e-06, + "loss": 1.4751213788986206, + "step": 4272 + }, + { + "epoch": 0.7780103758987895, + "grad_norm": 17.125, + "learning_rate": 4.38893703124918e-06, + "loss": 1.281068205833435, + "step": 4274 + }, + { + "epoch": 0.7783744425229817, + "grad_norm": 14.1875, + "learning_rate": 4.38838566768612e-06, + "loss": 0.930081844329834, + "step": 4276 + }, + { + "epoch": 0.7787385091471739, + "grad_norm": 31.0, + "learning_rate": 4.387834100383093e-06, + "loss": 1.372563362121582, + "step": 4278 + }, + { + "epoch": 0.7791025757713662, + "grad_norm": 9.375, + "learning_rate": 4.387282329421033e-06, + "loss": 1.3405052423477173, + "step": 4280 + }, + { + "epoch": 0.7794666423955584, + "grad_norm": 22.0, + "learning_rate": 4.386730354880916e-06, + "loss": 1.2313324213027954, + "step": 4282 + }, + { + "epoch": 0.7798307090197506, + "grad_norm": 145.0, + "learning_rate": 4.386178176843737e-06, + "loss": 1.7453323602676392, + "step": 4284 + }, + { + "epoch": 0.7801947756439428, + "grad_norm": 8.6875, + "learning_rate": 4.385625795390529e-06, + "loss": 1.3262126445770264, + "step": 4286 + }, + { + "epoch": 0.7805588422681351, + "grad_norm": 8.125, + "learning_rate": 4.385073210602352e-06, + "loss": 1.0706943273544312, + "step": 4288 + }, + { + "epoch": 0.7809229088923273, + "grad_norm": 8.5625, + "learning_rate": 4.3845204225602935e-06, + "loss": 1.2356127500534058, + "step": 4290 + }, + { + "epoch": 0.7812869755165195, + "grad_norm": 12.375, + "learning_rate": 4.383967431345474e-06, + "loss": 1.2644524574279785, + "step": 4292 + }, + { + "epoch": 0.7816510421407118, + "grad_norm": 11.875, + "learning_rate": 4.383414237039043e-06, + "loss": 1.8036032915115356, + "step": 4294 + }, + { + "epoch": 0.782015108764904, + "grad_norm": 11.0625, + "learning_rate": 4.382860839722179e-06, + "loss": 1.3682570457458496, + "step": 4296 + }, + { + "epoch": 0.7823791753890962, + "grad_norm": 4.0625, + "learning_rate": 4.382307239476093e-06, + "loss": 1.3352452516555786, + "step": 4298 + }, + { + "epoch": 0.7827432420132885, + "grad_norm": 22.375, + "learning_rate": 4.381753436382018e-06, + "loss": 1.594243049621582, + "step": 4300 + }, + { + "epoch": 0.7831073086374807, + "grad_norm": 8.3125, + "learning_rate": 4.381199430521228e-06, + "loss": 1.4326969385147095, + "step": 4302 + }, + { + "epoch": 0.7834713752616729, + "grad_norm": 17.375, + "learning_rate": 4.3806452219750184e-06, + "loss": 1.5666792392730713, + "step": 4304 + }, + { + "epoch": 0.7838354418858651, + "grad_norm": 7.3125, + "learning_rate": 4.380090810824719e-06, + "loss": 1.482906460762024, + "step": 4306 + }, + { + "epoch": 0.7841995085100574, + "grad_norm": 15.3125, + "learning_rate": 4.379536197151685e-06, + "loss": 1.2503796815872192, + "step": 4308 + }, + { + "epoch": 0.7845635751342496, + "grad_norm": 9.375, + "learning_rate": 4.378981381037305e-06, + "loss": 1.3743815422058105, + "step": 4310 + }, + { + "epoch": 0.7849276417584418, + "grad_norm": 8.0625, + "learning_rate": 4.3784263625629965e-06, + "loss": 1.182907223701477, + "step": 4312 + }, + { + "epoch": 0.7852917083826341, + "grad_norm": 27.875, + "learning_rate": 4.377871141810205e-06, + "loss": 1.4062925577163696, + "step": 4314 + }, + { + "epoch": 0.7856557750068263, + "grad_norm": 3.90625, + "learning_rate": 4.377315718860407e-06, + "loss": 1.1224201917648315, + "step": 4316 + }, + { + "epoch": 0.7860198416310185, + "grad_norm": 11.0, + "learning_rate": 4.376760093795111e-06, + "loss": 1.3343844413757324, + "step": 4318 + }, + { + "epoch": 0.7863839082552107, + "grad_norm": 8.4375, + "learning_rate": 4.376204266695848e-06, + "loss": 1.4972634315490723, + "step": 4320 + }, + { + "epoch": 0.786747974879403, + "grad_norm": 12.625, + "learning_rate": 4.375648237644188e-06, + "loss": 1.687469482421875, + "step": 4322 + }, + { + "epoch": 0.7871120415035952, + "grad_norm": 68.0, + "learning_rate": 4.375092006721723e-06, + "loss": 1.7964059114456177, + "step": 4324 + }, + { + "epoch": 0.7874761081277873, + "grad_norm": 4.46875, + "learning_rate": 4.3745355740100796e-06, + "loss": 0.9894921779632568, + "step": 4326 + }, + { + "epoch": 0.7878401747519796, + "grad_norm": 43.5, + "learning_rate": 4.37397893959091e-06, + "loss": 1.2102699279785156, + "step": 4328 + }, + { + "epoch": 0.7882042413761718, + "grad_norm": 9.3125, + "learning_rate": 4.3734221035459e-06, + "loss": 1.2620714902877808, + "step": 4330 + }, + { + "epoch": 0.788568308000364, + "grad_norm": 8.5625, + "learning_rate": 4.3728650659567626e-06, + "loss": 0.9282611608505249, + "step": 4332 + }, + { + "epoch": 0.7889323746245563, + "grad_norm": 10.25, + "learning_rate": 4.372307826905241e-06, + "loss": 1.0102256536483765, + "step": 4334 + }, + { + "epoch": 0.7892964412487485, + "grad_norm": 5.78125, + "learning_rate": 4.371750386473107e-06, + "loss": 1.2553620338439941, + "step": 4336 + }, + { + "epoch": 0.7896605078729407, + "grad_norm": 15.1875, + "learning_rate": 4.371192744742162e-06, + "loss": 1.548854112625122, + "step": 4338 + }, + { + "epoch": 0.7900245744971329, + "grad_norm": 7.3125, + "learning_rate": 4.3706349017942395e-06, + "loss": 0.9886182546615601, + "step": 4340 + }, + { + "epoch": 0.7903886411213252, + "grad_norm": 13.8125, + "learning_rate": 4.3700768577112e-06, + "loss": 1.607574462890625, + "step": 4342 + }, + { + "epoch": 0.7907527077455174, + "grad_norm": 108.5, + "learning_rate": 4.369518612574933e-06, + "loss": 1.316232442855835, + "step": 4344 + }, + { + "epoch": 0.7911167743697096, + "grad_norm": 13.875, + "learning_rate": 4.368960166467362e-06, + "loss": 1.0501240491867065, + "step": 4346 + }, + { + "epoch": 0.7914808409939019, + "grad_norm": 9.125, + "learning_rate": 4.368401519470433e-06, + "loss": 1.7694333791732788, + "step": 4348 + }, + { + "epoch": 0.7918449076180941, + "grad_norm": 2.53125, + "learning_rate": 4.367842671666126e-06, + "loss": 0.9769310355186462, + "step": 4350 + }, + { + "epoch": 0.7922089742422863, + "grad_norm": 8.6875, + "learning_rate": 4.367283623136451e-06, + "loss": 1.5035035610198975, + "step": 4352 + }, + { + "epoch": 0.7925730408664786, + "grad_norm": 7.5625, + "learning_rate": 4.366724373963446e-06, + "loss": 1.4821988344192505, + "step": 4354 + }, + { + "epoch": 0.7929371074906708, + "grad_norm": 8.0, + "learning_rate": 4.366164924229178e-06, + "loss": 1.3605796098709106, + "step": 4356 + }, + { + "epoch": 0.793301174114863, + "grad_norm": 8.1875, + "learning_rate": 4.3656052740157426e-06, + "loss": 1.541952133178711, + "step": 4358 + }, + { + "epoch": 0.7936652407390552, + "grad_norm": 6.75, + "learning_rate": 4.365045423405269e-06, + "loss": 1.5068315267562866, + "step": 4360 + }, + { + "epoch": 0.7940293073632475, + "grad_norm": 8.9375, + "learning_rate": 4.36448537247991e-06, + "loss": 1.310437798500061, + "step": 4362 + }, + { + "epoch": 0.7943933739874397, + "grad_norm": 11.4375, + "learning_rate": 4.363925121321854e-06, + "loss": 1.3380366563796997, + "step": 4364 + }, + { + "epoch": 0.7947574406116319, + "grad_norm": 23.5, + "learning_rate": 4.363364670013312e-06, + "loss": 1.4515764713287354, + "step": 4366 + }, + { + "epoch": 0.7951215072358242, + "grad_norm": 5.90625, + "learning_rate": 4.362804018636532e-06, + "loss": 1.442319393157959, + "step": 4368 + }, + { + "epoch": 0.7954855738600164, + "grad_norm": 7.1875, + "learning_rate": 4.362243167273784e-06, + "loss": 1.3407368659973145, + "step": 4370 + }, + { + "epoch": 0.7958496404842086, + "grad_norm": 10.625, + "learning_rate": 4.361682116007372e-06, + "loss": 1.3934518098831177, + "step": 4372 + }, + { + "epoch": 0.7962137071084009, + "grad_norm": 4.90625, + "learning_rate": 4.361120864919629e-06, + "loss": 1.281641960144043, + "step": 4374 + }, + { + "epoch": 0.7965777737325931, + "grad_norm": 13.9375, + "learning_rate": 4.360559414092914e-06, + "loss": 1.5547196865081787, + "step": 4376 + }, + { + "epoch": 0.7969418403567853, + "grad_norm": 18.875, + "learning_rate": 4.359997763609619e-06, + "loss": 1.333369255065918, + "step": 4378 + }, + { + "epoch": 0.7973059069809775, + "grad_norm": 31.5, + "learning_rate": 4.359435913552163e-06, + "loss": 1.5652134418487549, + "step": 4380 + }, + { + "epoch": 0.7976699736051698, + "grad_norm": 10.0625, + "learning_rate": 4.3588738640029984e-06, + "loss": 1.4560751914978027, + "step": 4382 + }, + { + "epoch": 0.798034040229362, + "grad_norm": 9.25, + "learning_rate": 4.358311615044599e-06, + "loss": 1.3550523519515991, + "step": 4384 + }, + { + "epoch": 0.7983981068535542, + "grad_norm": 9.5625, + "learning_rate": 4.3577491667594765e-06, + "loss": 1.4192368984222412, + "step": 4386 + }, + { + "epoch": 0.7987621734777465, + "grad_norm": 11.25, + "learning_rate": 4.357186519230165e-06, + "loss": 1.5846765041351318, + "step": 4388 + }, + { + "epoch": 0.7991262401019387, + "grad_norm": 8.9375, + "learning_rate": 4.356623672539233e-06, + "loss": 1.237911343574524, + "step": 4390 + }, + { + "epoch": 0.7994903067261309, + "grad_norm": 15.3125, + "learning_rate": 4.356060626769274e-06, + "loss": 0.7316661477088928, + "step": 4392 + }, + { + "epoch": 0.7998543733503231, + "grad_norm": 19.625, + "learning_rate": 4.355497382002915e-06, + "loss": 1.2689944505691528, + "step": 4394 + }, + { + "epoch": 0.8002184399745154, + "grad_norm": 12.3125, + "learning_rate": 4.3549339383228065e-06, + "loss": 1.4916542768478394, + "step": 4396 + }, + { + "epoch": 0.8005825065987076, + "grad_norm": 8.75, + "learning_rate": 4.354370295811635e-06, + "loss": 1.2607371807098389, + "step": 4398 + }, + { + "epoch": 0.8009465732228997, + "grad_norm": 16.375, + "learning_rate": 4.353806454552111e-06, + "loss": 1.2823785543441772, + "step": 4400 + }, + { + "epoch": 0.801310639847092, + "grad_norm": 11.875, + "learning_rate": 4.353242414626977e-06, + "loss": 1.5608652830123901, + "step": 4402 + }, + { + "epoch": 0.8016747064712842, + "grad_norm": 11.125, + "learning_rate": 4.352678176119002e-06, + "loss": 1.3546953201293945, + "step": 4404 + }, + { + "epoch": 0.8020387730954764, + "grad_norm": 25.75, + "learning_rate": 4.352113739110987e-06, + "loss": 1.2929246425628662, + "step": 4406 + }, + { + "epoch": 0.8024028397196687, + "grad_norm": 9.625, + "learning_rate": 4.35154910368576e-06, + "loss": 0.9250297546386719, + "step": 4408 + }, + { + "epoch": 0.8027669063438609, + "grad_norm": 7.75, + "learning_rate": 4.35098426992618e-06, + "loss": 1.7485544681549072, + "step": 4410 + }, + { + "epoch": 0.8031309729680531, + "grad_norm": 9.625, + "learning_rate": 4.350419237915134e-06, + "loss": 1.0787283182144165, + "step": 4412 + }, + { + "epoch": 0.8034950395922453, + "grad_norm": 8.75, + "learning_rate": 4.349854007735536e-06, + "loss": 0.7617124319076538, + "step": 4414 + }, + { + "epoch": 0.8038591062164376, + "grad_norm": 6.90625, + "learning_rate": 4.349288579470333e-06, + "loss": 1.2137619256973267, + "step": 4416 + }, + { + "epoch": 0.8042231728406298, + "grad_norm": 17.0, + "learning_rate": 4.3487229532025e-06, + "loss": 1.2146120071411133, + "step": 4418 + }, + { + "epoch": 0.804587239464822, + "grad_norm": 8.25, + "learning_rate": 4.348157129015039e-06, + "loss": 1.1762677431106567, + "step": 4420 + }, + { + "epoch": 0.8049513060890143, + "grad_norm": 9.25, + "learning_rate": 4.347591106990984e-06, + "loss": 1.1438363790512085, + "step": 4422 + }, + { + "epoch": 0.8053153727132065, + "grad_norm": 10.1875, + "learning_rate": 4.347024887213393e-06, + "loss": 1.514618992805481, + "step": 4424 + }, + { + "epoch": 0.8056794393373987, + "grad_norm": 12.3125, + "learning_rate": 4.346458469765361e-06, + "loss": 1.3525161743164062, + "step": 4426 + }, + { + "epoch": 0.806043505961591, + "grad_norm": 6.25, + "learning_rate": 4.345891854730005e-06, + "loss": 1.247894048690796, + "step": 4428 + }, + { + "epoch": 0.8064075725857832, + "grad_norm": 12.875, + "learning_rate": 4.345325042190473e-06, + "loss": 1.8402509689331055, + "step": 4430 + }, + { + "epoch": 0.8067716392099754, + "grad_norm": 29.125, + "learning_rate": 4.344758032229943e-06, + "loss": 1.3306828737258911, + "step": 4432 + }, + { + "epoch": 0.8071357058341676, + "grad_norm": 24.75, + "learning_rate": 4.344190824931622e-06, + "loss": 1.2305538654327393, + "step": 4434 + }, + { + "epoch": 0.8074997724583599, + "grad_norm": 10.875, + "learning_rate": 4.343623420378745e-06, + "loss": 1.4118926525115967, + "step": 4436 + }, + { + "epoch": 0.8078638390825521, + "grad_norm": 5.71875, + "learning_rate": 4.3430558186545765e-06, + "loss": 1.3336848020553589, + "step": 4438 + }, + { + "epoch": 0.8082279057067443, + "grad_norm": 3.5, + "learning_rate": 4.34248801984241e-06, + "loss": 1.0588898658752441, + "step": 4440 + }, + { + "epoch": 0.8085919723309366, + "grad_norm": 15.25, + "learning_rate": 4.3419200240255665e-06, + "loss": 1.6459929943084717, + "step": 4442 + }, + { + "epoch": 0.8089560389551288, + "grad_norm": 45.0, + "learning_rate": 4.341351831287398e-06, + "loss": 1.7264268398284912, + "step": 4444 + }, + { + "epoch": 0.809320105579321, + "grad_norm": 7.25, + "learning_rate": 4.340783441711284e-06, + "loss": 1.2586559057235718, + "step": 4446 + }, + { + "epoch": 0.8096841722035133, + "grad_norm": 6.5625, + "learning_rate": 4.340214855380634e-06, + "loss": 1.5741991996765137, + "step": 4448 + }, + { + "epoch": 0.8100482388277055, + "grad_norm": 5.96875, + "learning_rate": 4.339646072378886e-06, + "loss": 0.9454053640365601, + "step": 4450 + }, + { + "epoch": 0.8104123054518977, + "grad_norm": 15.875, + "learning_rate": 4.339077092789505e-06, + "loss": 1.0356849431991577, + "step": 4452 + }, + { + "epoch": 0.8107763720760899, + "grad_norm": 7.125, + "learning_rate": 4.338507916695988e-06, + "loss": 0.7811887264251709, + "step": 4454 + }, + { + "epoch": 0.8111404387002822, + "grad_norm": 16.625, + "learning_rate": 4.337938544181858e-06, + "loss": 1.0020370483398438, + "step": 4456 + }, + { + "epoch": 0.8115045053244744, + "grad_norm": 13.125, + "learning_rate": 4.337368975330669e-06, + "loss": 1.3336344957351685, + "step": 4458 + }, + { + "epoch": 0.8118685719486666, + "grad_norm": 36.0, + "learning_rate": 4.336799210226003e-06, + "loss": 2.022167205810547, + "step": 4460 + }, + { + "epoch": 0.8122326385728589, + "grad_norm": 16.125, + "learning_rate": 4.3362292489514716e-06, + "loss": 2.00360107421875, + "step": 4462 + }, + { + "epoch": 0.8125967051970511, + "grad_norm": 19.5, + "learning_rate": 4.335659091590711e-06, + "loss": 2.0605931282043457, + "step": 4464 + }, + { + "epoch": 0.8129607718212433, + "grad_norm": 7.5625, + "learning_rate": 4.3350887382273934e-06, + "loss": 1.5292812585830688, + "step": 4466 + }, + { + "epoch": 0.8133248384454355, + "grad_norm": 7.90625, + "learning_rate": 4.334518188945213e-06, + "loss": 1.4856352806091309, + "step": 4468 + }, + { + "epoch": 0.8136889050696278, + "grad_norm": 8.9375, + "learning_rate": 4.333947443827897e-06, + "loss": 1.4383578300476074, + "step": 4470 + }, + { + "epoch": 0.81405297169382, + "grad_norm": 16.125, + "learning_rate": 4.3333765029592e-06, + "loss": 1.5232003927230835, + "step": 4472 + }, + { + "epoch": 0.8144170383180122, + "grad_norm": 9.75, + "learning_rate": 4.3328053664229045e-06, + "loss": 1.5454494953155518, + "step": 4474 + }, + { + "epoch": 0.8147811049422045, + "grad_norm": 12.6875, + "learning_rate": 4.332234034302824e-06, + "loss": 1.1203497648239136, + "step": 4476 + }, + { + "epoch": 0.8151451715663967, + "grad_norm": 12.0625, + "learning_rate": 4.3316625066827955e-06, + "loss": 1.3930609226226807, + "step": 4478 + }, + { + "epoch": 0.8155092381905888, + "grad_norm": 7.5625, + "learning_rate": 4.331090783646693e-06, + "loss": 1.8819421529769897, + "step": 4480 + }, + { + "epoch": 0.8158733048147812, + "grad_norm": 7.96875, + "learning_rate": 4.330518865278412e-06, + "loss": 1.3567163944244385, + "step": 4482 + }, + { + "epoch": 0.8162373714389733, + "grad_norm": 11.125, + "learning_rate": 4.32994675166188e-06, + "loss": 1.6875742673873901, + "step": 4484 + }, + { + "epoch": 0.8166014380631655, + "grad_norm": 12.9375, + "learning_rate": 4.329374442881051e-06, + "loss": 1.3831372261047363, + "step": 4486 + }, + { + "epoch": 0.8169655046873577, + "grad_norm": 15.8125, + "learning_rate": 4.32880193901991e-06, + "loss": 1.1459221839904785, + "step": 4488 + }, + { + "epoch": 0.81732957131155, + "grad_norm": 14.3125, + "learning_rate": 4.328229240162471e-06, + "loss": 0.67169189453125, + "step": 4490 + }, + { + "epoch": 0.8176936379357422, + "grad_norm": 100.0, + "learning_rate": 4.3276563463927725e-06, + "loss": 1.5136171579360962, + "step": 4492 + }, + { + "epoch": 0.8180577045599344, + "grad_norm": 16.25, + "learning_rate": 4.327083257794886e-06, + "loss": 1.5977423191070557, + "step": 4494 + }, + { + "epoch": 0.8184217711841267, + "grad_norm": 11.5, + "learning_rate": 4.326509974452909e-06, + "loss": 1.636972427368164, + "step": 4496 + }, + { + "epoch": 0.8187858378083189, + "grad_norm": 25.375, + "learning_rate": 4.325936496450971e-06, + "loss": 1.7512624263763428, + "step": 4498 + }, + { + "epoch": 0.8191499044325111, + "grad_norm": 12.0, + "learning_rate": 4.3253628238732245e-06, + "loss": 1.5959237813949585, + "step": 4500 + }, + { + "epoch": 0.8195139710567034, + "grad_norm": 10.9375, + "learning_rate": 4.3247889568038544e-06, + "loss": 1.3929579257965088, + "step": 4502 + }, + { + "epoch": 0.8198780376808956, + "grad_norm": 19.75, + "learning_rate": 4.3242148953270745e-06, + "loss": 1.0622797012329102, + "step": 4504 + }, + { + "epoch": 0.8202421043050878, + "grad_norm": 20.125, + "learning_rate": 4.323640639527126e-06, + "loss": 1.4992486238479614, + "step": 4506 + }, + { + "epoch": 0.82060617092928, + "grad_norm": 16.125, + "learning_rate": 4.323066189488277e-06, + "loss": 1.6865487098693848, + "step": 4508 + }, + { + "epoch": 0.8209702375534723, + "grad_norm": 13.1875, + "learning_rate": 4.322491545294826e-06, + "loss": 0.5326933860778809, + "step": 4510 + }, + { + "epoch": 0.8213343041776645, + "grad_norm": 9.5625, + "learning_rate": 4.321916707031101e-06, + "loss": 1.5173028707504272, + "step": 4512 + }, + { + "epoch": 0.8216983708018567, + "grad_norm": 22.5, + "learning_rate": 4.321341674781456e-06, + "loss": 1.3065694570541382, + "step": 4514 + }, + { + "epoch": 0.822062437426049, + "grad_norm": 18.25, + "learning_rate": 4.320766448630276e-06, + "loss": 1.378330111503601, + "step": 4516 + }, + { + "epoch": 0.8224265040502412, + "grad_norm": 13.625, + "learning_rate": 4.320191028661972e-06, + "loss": 1.4198566675186157, + "step": 4518 + }, + { + "epoch": 0.8227905706744334, + "grad_norm": 84.5, + "learning_rate": 4.319615414960984e-06, + "loss": 1.341217041015625, + "step": 4520 + }, + { + "epoch": 0.8231546372986257, + "grad_norm": 9.75, + "learning_rate": 4.319039607611782e-06, + "loss": 0.5010709762573242, + "step": 4522 + }, + { + "epoch": 0.8235187039228179, + "grad_norm": 6.09375, + "learning_rate": 4.318463606698865e-06, + "loss": 1.281144618988037, + "step": 4524 + }, + { + "epoch": 0.8238827705470101, + "grad_norm": 28.0, + "learning_rate": 4.317887412306755e-06, + "loss": 1.253221869468689, + "step": 4526 + }, + { + "epoch": 0.8242468371712023, + "grad_norm": 8.4375, + "learning_rate": 4.317311024520009e-06, + "loss": 1.6719715595245361, + "step": 4528 + }, + { + "epoch": 0.8246109037953946, + "grad_norm": 33.75, + "learning_rate": 4.316734443423208e-06, + "loss": 1.5020778179168701, + "step": 4530 + }, + { + "epoch": 0.8249749704195868, + "grad_norm": 6.5, + "learning_rate": 4.3161576691009646e-06, + "loss": 1.1740750074386597, + "step": 4532 + }, + { + "epoch": 0.825339037043779, + "grad_norm": 11.0, + "learning_rate": 4.315580701637917e-06, + "loss": 1.3550879955291748, + "step": 4534 + }, + { + "epoch": 0.8257031036679713, + "grad_norm": 9.0, + "learning_rate": 4.315003541118733e-06, + "loss": 1.1664139032363892, + "step": 4536 + }, + { + "epoch": 0.8260671702921635, + "grad_norm": 12.25, + "learning_rate": 4.314426187628108e-06, + "loss": 1.4160387516021729, + "step": 4538 + }, + { + "epoch": 0.8264312369163557, + "grad_norm": 13.625, + "learning_rate": 4.313848641250767e-06, + "loss": 1.502232551574707, + "step": 4540 + }, + { + "epoch": 0.8267953035405479, + "grad_norm": 6.15625, + "learning_rate": 4.313270902071463e-06, + "loss": 1.5931636095046997, + "step": 4542 + }, + { + "epoch": 0.8271593701647402, + "grad_norm": 6.84375, + "learning_rate": 4.312692970174977e-06, + "loss": 1.3021752834320068, + "step": 4544 + }, + { + "epoch": 0.8275234367889324, + "grad_norm": 10.75, + "learning_rate": 4.312114845646116e-06, + "loss": 1.3560556173324585, + "step": 4546 + }, + { + "epoch": 0.8278875034131246, + "grad_norm": 14.0625, + "learning_rate": 4.31153652856972e-06, + "loss": 1.1802349090576172, + "step": 4548 + }, + { + "epoch": 0.8282515700373169, + "grad_norm": 3.90625, + "learning_rate": 4.310958019030652e-06, + "loss": 0.9591573476791382, + "step": 4550 + }, + { + "epoch": 0.8286156366615091, + "grad_norm": 33.0, + "learning_rate": 4.310379317113809e-06, + "loss": 1.2214243412017822, + "step": 4552 + }, + { + "epoch": 0.8289797032857013, + "grad_norm": 18.5, + "learning_rate": 4.309800422904111e-06, + "loss": 1.799782633781433, + "step": 4554 + }, + { + "epoch": 0.8293437699098936, + "grad_norm": 13.875, + "learning_rate": 4.30922133648651e-06, + "loss": 1.4806492328643799, + "step": 4556 + }, + { + "epoch": 0.8297078365340858, + "grad_norm": 11.5, + "learning_rate": 4.3086420579459835e-06, + "loss": 1.4170966148376465, + "step": 4558 + }, + { + "epoch": 0.830071903158278, + "grad_norm": 9.625, + "learning_rate": 4.308062587367537e-06, + "loss": 1.4671202898025513, + "step": 4560 + }, + { + "epoch": 0.8304359697824701, + "grad_norm": 9.5, + "learning_rate": 4.307482924836208e-06, + "loss": 1.47428560256958, + "step": 4562 + }, + { + "epoch": 0.8308000364066624, + "grad_norm": 6.90625, + "learning_rate": 4.306903070437059e-06, + "loss": 0.9847546815872192, + "step": 4564 + }, + { + "epoch": 0.8311641030308546, + "grad_norm": 17.625, + "learning_rate": 4.30632302425518e-06, + "loss": 1.356264591217041, + "step": 4566 + }, + { + "epoch": 0.8315281696550468, + "grad_norm": 8.625, + "learning_rate": 4.305742786375693e-06, + "loss": 1.7652785778045654, + "step": 4568 + }, + { + "epoch": 0.8318922362792391, + "grad_norm": 21.75, + "learning_rate": 4.305162356883742e-06, + "loss": 1.3786014318466187, + "step": 4570 + }, + { + "epoch": 0.8322563029034313, + "grad_norm": 12.3125, + "learning_rate": 4.3045817358645044e-06, + "loss": 1.3860353231430054, + "step": 4572 + }, + { + "epoch": 0.8326203695276235, + "grad_norm": 8.1875, + "learning_rate": 4.304000923403186e-06, + "loss": 1.4982531070709229, + "step": 4574 + }, + { + "epoch": 0.8329844361518158, + "grad_norm": 2.59375, + "learning_rate": 4.3034199195850144e-06, + "loss": 1.2097407579421997, + "step": 4576 + }, + { + "epoch": 0.833348502776008, + "grad_norm": 10.0625, + "learning_rate": 4.302838724495253e-06, + "loss": 1.3626539707183838, + "step": 4578 + }, + { + "epoch": 0.8337125694002002, + "grad_norm": 15.6875, + "learning_rate": 4.302257338219189e-06, + "loss": 1.9367485046386719, + "step": 4580 + }, + { + "epoch": 0.8340766360243924, + "grad_norm": 27.125, + "learning_rate": 4.301675760842138e-06, + "loss": 1.4965035915374756, + "step": 4582 + }, + { + "epoch": 0.8344407026485847, + "grad_norm": 22.25, + "learning_rate": 4.301093992449445e-06, + "loss": 1.592379093170166, + "step": 4584 + }, + { + "epoch": 0.8348047692727769, + "grad_norm": 15.25, + "learning_rate": 4.3005120331264795e-06, + "loss": 1.2901054620742798, + "step": 4586 + }, + { + "epoch": 0.8351688358969691, + "grad_norm": 12.0625, + "learning_rate": 4.2999298829586455e-06, + "loss": 1.8928816318511963, + "step": 4588 + }, + { + "epoch": 0.8355329025211614, + "grad_norm": 14.75, + "learning_rate": 4.299347542031368e-06, + "loss": 1.8752672672271729, + "step": 4590 + }, + { + "epoch": 0.8358969691453536, + "grad_norm": 8.875, + "learning_rate": 4.298765010430105e-06, + "loss": 1.0792758464813232, + "step": 4592 + }, + { + "epoch": 0.8362610357695458, + "grad_norm": 5.25, + "learning_rate": 4.29818228824034e-06, + "loss": 1.0743558406829834, + "step": 4594 + }, + { + "epoch": 0.8366251023937381, + "grad_norm": 9.6875, + "learning_rate": 4.297599375547586e-06, + "loss": 1.0034387111663818, + "step": 4596 + }, + { + "epoch": 0.8369891690179303, + "grad_norm": 10.6875, + "learning_rate": 4.297016272437382e-06, + "loss": 1.4353010654449463, + "step": 4598 + }, + { + "epoch": 0.8373532356421225, + "grad_norm": 23.875, + "learning_rate": 4.296432978995296e-06, + "loss": 1.987682580947876, + "step": 4600 + }, + { + "epoch": 0.8377173022663147, + "grad_norm": 11.8125, + "learning_rate": 4.295849495306924e-06, + "loss": 2.0193583965301514, + "step": 4602 + }, + { + "epoch": 0.838081368890507, + "grad_norm": 7.0625, + "learning_rate": 4.295265821457891e-06, + "loss": 1.526608943939209, + "step": 4604 + }, + { + "epoch": 0.8384454355146992, + "grad_norm": 13.375, + "learning_rate": 4.294681957533849e-06, + "loss": 1.3104884624481201, + "step": 4606 + }, + { + "epoch": 0.8388095021388914, + "grad_norm": 17.875, + "learning_rate": 4.294097903620474e-06, + "loss": 1.399722933769226, + "step": 4608 + }, + { + "epoch": 0.8391735687630837, + "grad_norm": 30.375, + "learning_rate": 4.293513659803478e-06, + "loss": 1.5541167259216309, + "step": 4610 + }, + { + "epoch": 0.8395376353872759, + "grad_norm": 10.5, + "learning_rate": 4.292929226168594e-06, + "loss": 1.7088623046875, + "step": 4612 + }, + { + "epoch": 0.8399017020114681, + "grad_norm": 3.609375, + "learning_rate": 4.292344602801586e-06, + "loss": 1.0808995962142944, + "step": 4614 + }, + { + "epoch": 0.8402657686356603, + "grad_norm": 3.0625, + "learning_rate": 4.2917597897882445e-06, + "loss": 1.1097280979156494, + "step": 4616 + }, + { + "epoch": 0.8406298352598526, + "grad_norm": 21.375, + "learning_rate": 4.29117478721439e-06, + "loss": 1.1709375381469727, + "step": 4618 + }, + { + "epoch": 0.8409939018840448, + "grad_norm": 24.125, + "learning_rate": 4.290589595165867e-06, + "loss": 1.7756097316741943, + "step": 4620 + }, + { + "epoch": 0.841357968508237, + "grad_norm": 8.0, + "learning_rate": 4.290004213728551e-06, + "loss": 1.5190043449401855, + "step": 4622 + }, + { + "epoch": 0.8417220351324293, + "grad_norm": 7.6875, + "learning_rate": 4.289418642988346e-06, + "loss": 1.4005178213119507, + "step": 4624 + }, + { + "epoch": 0.8420861017566215, + "grad_norm": 12.3125, + "learning_rate": 4.28883288303118e-06, + "loss": 1.7801865339279175, + "step": 4626 + }, + { + "epoch": 0.8424501683808137, + "grad_norm": 16.75, + "learning_rate": 4.288246933943011e-06, + "loss": 1.6216456890106201, + "step": 4628 + }, + { + "epoch": 0.842814235005006, + "grad_norm": 17.125, + "learning_rate": 4.287660795809826e-06, + "loss": 1.7046103477478027, + "step": 4630 + }, + { + "epoch": 0.8431783016291982, + "grad_norm": 13.6875, + "learning_rate": 4.287074468717637e-06, + "loss": 2.0742743015289307, + "step": 4632 + }, + { + "epoch": 0.8435423682533904, + "grad_norm": 25.625, + "learning_rate": 4.286487952752486e-06, + "loss": 1.380745530128479, + "step": 4634 + }, + { + "epoch": 0.8439064348775825, + "grad_norm": 21.875, + "learning_rate": 4.285901248000442e-06, + "loss": 0.9399363994598389, + "step": 4636 + }, + { + "epoch": 0.8442705015017749, + "grad_norm": 12.0, + "learning_rate": 4.285314354547601e-06, + "loss": 1.4656147956848145, + "step": 4638 + }, + { + "epoch": 0.844634568125967, + "grad_norm": 6.15625, + "learning_rate": 4.284727272480087e-06, + "loss": 1.3026562929153442, + "step": 4640 + }, + { + "epoch": 0.8449986347501592, + "grad_norm": 10.8125, + "learning_rate": 4.284140001884053e-06, + "loss": 1.3558553457260132, + "step": 4642 + }, + { + "epoch": 0.8453627013743515, + "grad_norm": 6.21875, + "learning_rate": 4.2835525428456785e-06, + "loss": 1.4424870014190674, + "step": 4644 + }, + { + "epoch": 0.8457267679985437, + "grad_norm": 23.25, + "learning_rate": 4.2829648954511684e-06, + "loss": 1.5834052562713623, + "step": 4646 + }, + { + "epoch": 0.8460908346227359, + "grad_norm": 29.375, + "learning_rate": 4.2823770597867595e-06, + "loss": 1.6375362873077393, + "step": 4648 + }, + { + "epoch": 0.8464549012469282, + "grad_norm": 72.0, + "learning_rate": 4.2817890359387145e-06, + "loss": 0.601618766784668, + "step": 4650 + }, + { + "epoch": 0.8468189678711204, + "grad_norm": 20.75, + "learning_rate": 4.281200823993323e-06, + "loss": 1.486088514328003, + "step": 4652 + }, + { + "epoch": 0.8471830344953126, + "grad_norm": 8.4375, + "learning_rate": 4.280612424036904e-06, + "loss": 1.263069987297058, + "step": 4654 + }, + { + "epoch": 0.8475471011195048, + "grad_norm": 8.6875, + "learning_rate": 4.280023836155801e-06, + "loss": 1.40183687210083, + "step": 4656 + }, + { + "epoch": 0.8479111677436971, + "grad_norm": 18.125, + "learning_rate": 4.279435060436387e-06, + "loss": 1.2984257936477661, + "step": 4658 + }, + { + "epoch": 0.8482752343678893, + "grad_norm": 12.25, + "learning_rate": 4.278846096965063e-06, + "loss": 1.5261800289154053, + "step": 4660 + }, + { + "epoch": 0.8486393009920815, + "grad_norm": 17.625, + "learning_rate": 4.278256945828258e-06, + "loss": 1.8382480144500732, + "step": 4662 + }, + { + "epoch": 0.8490033676162738, + "grad_norm": 5.25, + "learning_rate": 4.277667607112425e-06, + "loss": 0.8402523994445801, + "step": 4664 + }, + { + "epoch": 0.849367434240466, + "grad_norm": 6.75, + "learning_rate": 4.2770780809040495e-06, + "loss": 0.997430682182312, + "step": 4666 + }, + { + "epoch": 0.8497315008646582, + "grad_norm": 14.4375, + "learning_rate": 4.276488367289641e-06, + "loss": 1.6727664470672607, + "step": 4668 + }, + { + "epoch": 0.8500955674888505, + "grad_norm": 26.125, + "learning_rate": 4.275898466355738e-06, + "loss": 1.6993602514266968, + "step": 4670 + }, + { + "epoch": 0.8504596341130427, + "grad_norm": 13.8125, + "learning_rate": 4.2753083781889045e-06, + "loss": 1.5225656032562256, + "step": 4672 + }, + { + "epoch": 0.8508237007372349, + "grad_norm": 81.5, + "learning_rate": 4.274718102875737e-06, + "loss": 1.5078167915344238, + "step": 4674 + }, + { + "epoch": 0.8511877673614271, + "grad_norm": 12.25, + "learning_rate": 4.274127640502852e-06, + "loss": 1.785815954208374, + "step": 4676 + }, + { + "epoch": 0.8515518339856194, + "grad_norm": 34.0, + "learning_rate": 4.2735369911569e-06, + "loss": 1.3342301845550537, + "step": 4678 + }, + { + "epoch": 0.8519159006098116, + "grad_norm": 12.3125, + "learning_rate": 4.2729461549245565e-06, + "loss": 1.3573226928710938, + "step": 4680 + }, + { + "epoch": 0.8522799672340038, + "grad_norm": 7.125, + "learning_rate": 4.272355131892523e-06, + "loss": 1.471217393875122, + "step": 4682 + }, + { + "epoch": 0.8526440338581961, + "grad_norm": 11.5625, + "learning_rate": 4.271763922147531e-06, + "loss": 1.3622182607650757, + "step": 4684 + }, + { + "epoch": 0.8530081004823883, + "grad_norm": 3.609375, + "learning_rate": 4.271172525776336e-06, + "loss": 1.1282398700714111, + "step": 4686 + }, + { + "epoch": 0.8533721671065805, + "grad_norm": 9.25, + "learning_rate": 4.270580942865725e-06, + "loss": 1.0616729259490967, + "step": 4688 + }, + { + "epoch": 0.8537362337307728, + "grad_norm": 9.0625, + "learning_rate": 4.26998917350251e-06, + "loss": 1.4764699935913086, + "step": 4690 + }, + { + "epoch": 0.854100300354965, + "grad_norm": 17.625, + "learning_rate": 4.269397217773531e-06, + "loss": 1.7590532302856445, + "step": 4692 + }, + { + "epoch": 0.8544643669791572, + "grad_norm": 44.75, + "learning_rate": 4.268805075765654e-06, + "loss": 2.0107500553131104, + "step": 4694 + }, + { + "epoch": 0.8548284336033494, + "grad_norm": 11.75, + "learning_rate": 4.268212747565774e-06, + "loss": 1.7459686994552612, + "step": 4696 + }, + { + "epoch": 0.8551925002275417, + "grad_norm": 9.6875, + "learning_rate": 4.267620233260814e-06, + "loss": 1.4347848892211914, + "step": 4698 + }, + { + "epoch": 0.8555565668517339, + "grad_norm": 12.25, + "learning_rate": 4.267027532937721e-06, + "loss": 1.373937964439392, + "step": 4700 + }, + { + "epoch": 0.8559206334759261, + "grad_norm": 8.6875, + "learning_rate": 4.266434646683473e-06, + "loss": 1.5379109382629395, + "step": 4702 + }, + { + "epoch": 0.8562847001001184, + "grad_norm": 13.5, + "learning_rate": 4.265841574585072e-06, + "loss": 1.384209156036377, + "step": 4704 + }, + { + "epoch": 0.8566487667243106, + "grad_norm": 8.5, + "learning_rate": 4.265248316729551e-06, + "loss": 1.0949008464813232, + "step": 4706 + }, + { + "epoch": 0.8570128333485028, + "grad_norm": 6.96875, + "learning_rate": 4.264654873203967e-06, + "loss": 1.5007158517837524, + "step": 4708 + }, + { + "epoch": 0.857376899972695, + "grad_norm": 16.625, + "learning_rate": 4.2640612440954055e-06, + "loss": 1.3567674160003662, + "step": 4710 + }, + { + "epoch": 0.8577409665968873, + "grad_norm": 24.375, + "learning_rate": 4.263467429490979e-06, + "loss": 0.9061417579650879, + "step": 4712 + }, + { + "epoch": 0.8581050332210794, + "grad_norm": 15.25, + "learning_rate": 4.262873429477829e-06, + "loss": 0.4820134937763214, + "step": 4714 + }, + { + "epoch": 0.8584690998452716, + "grad_norm": 10.875, + "learning_rate": 4.262279244143119e-06, + "loss": 1.7351123094558716, + "step": 4716 + }, + { + "epoch": 0.858833166469464, + "grad_norm": 15.125, + "learning_rate": 4.261684873574047e-06, + "loss": 1.369818091392517, + "step": 4718 + }, + { + "epoch": 0.8591972330936561, + "grad_norm": 14.6875, + "learning_rate": 4.261090317857831e-06, + "loss": 1.9767913818359375, + "step": 4720 + }, + { + "epoch": 0.8595612997178483, + "grad_norm": 11.875, + "learning_rate": 4.260495577081724e-06, + "loss": 1.9178898334503174, + "step": 4722 + }, + { + "epoch": 0.8599253663420406, + "grad_norm": 12.5, + "learning_rate": 4.259900651332998e-06, + "loss": 1.4070062637329102, + "step": 4724 + }, + { + "epoch": 0.8602894329662328, + "grad_norm": 21.625, + "learning_rate": 4.259305540698958e-06, + "loss": 1.573645830154419, + "step": 4726 + }, + { + "epoch": 0.860653499590425, + "grad_norm": 22.875, + "learning_rate": 4.2587102452669325e-06, + "loss": 1.4348583221435547, + "step": 4728 + }, + { + "epoch": 0.8610175662146172, + "grad_norm": 8.625, + "learning_rate": 4.25811476512428e-06, + "loss": 1.2445785999298096, + "step": 4730 + }, + { + "epoch": 0.8613816328388095, + "grad_norm": 15.0, + "learning_rate": 4.257519100358385e-06, + "loss": 1.4276273250579834, + "step": 4732 + }, + { + "epoch": 0.8617456994630017, + "grad_norm": 12.5, + "learning_rate": 4.2569232510566585e-06, + "loss": 1.5543544292449951, + "step": 4734 + }, + { + "epoch": 0.8621097660871939, + "grad_norm": 10.5625, + "learning_rate": 4.256327217306537e-06, + "loss": 1.5052167177200317, + "step": 4736 + }, + { + "epoch": 0.8624738327113862, + "grad_norm": 7.03125, + "learning_rate": 4.25573099919549e-06, + "loss": 1.2933942079544067, + "step": 4738 + }, + { + "epoch": 0.8628378993355784, + "grad_norm": 16.375, + "learning_rate": 4.255134596811007e-06, + "loss": 1.1105557680130005, + "step": 4740 + }, + { + "epoch": 0.8632019659597706, + "grad_norm": 12.5625, + "learning_rate": 4.254538010240608e-06, + "loss": 0.692167341709137, + "step": 4742 + }, + { + "epoch": 0.8635660325839629, + "grad_norm": 12.875, + "learning_rate": 4.253941239571841e-06, + "loss": 1.613133192062378, + "step": 4744 + }, + { + "epoch": 0.8639300992081551, + "grad_norm": 11.0, + "learning_rate": 4.253344284892279e-06, + "loss": 1.5525827407836914, + "step": 4746 + }, + { + "epoch": 0.8642941658323473, + "grad_norm": 9.625, + "learning_rate": 4.252747146289521e-06, + "loss": 1.4382461309432983, + "step": 4748 + }, + { + "epoch": 0.8646582324565395, + "grad_norm": 12.25, + "learning_rate": 4.252149823851198e-06, + "loss": 1.298912763595581, + "step": 4750 + }, + { + "epoch": 0.8650222990807318, + "grad_norm": 11.75, + "learning_rate": 4.251552317664962e-06, + "loss": 1.6574656963348389, + "step": 4752 + }, + { + "epoch": 0.865386365704924, + "grad_norm": 9.0625, + "learning_rate": 4.250954627818495e-06, + "loss": 1.4766242504119873, + "step": 4754 + }, + { + "epoch": 0.8657504323291162, + "grad_norm": 16.5, + "learning_rate": 4.250356754399507e-06, + "loss": 1.4572079181671143, + "step": 4756 + }, + { + "epoch": 0.8661144989533085, + "grad_norm": 6.59375, + "learning_rate": 4.249758697495733e-06, + "loss": 1.1828886270523071, + "step": 4758 + }, + { + "epoch": 0.8664785655775007, + "grad_norm": 4.1875, + "learning_rate": 4.249160457194933e-06, + "loss": 1.2524945735931396, + "step": 4760 + }, + { + "epoch": 0.8668426322016929, + "grad_norm": 13.25, + "learning_rate": 4.2485620335849e-06, + "loss": 1.5294002294540405, + "step": 4762 + }, + { + "epoch": 0.8672066988258852, + "grad_norm": 7.65625, + "learning_rate": 4.2479634267534484e-06, + "loss": 1.5452783107757568, + "step": 4764 + }, + { + "epoch": 0.8675707654500774, + "grad_norm": 6.15625, + "learning_rate": 4.247364636788421e-06, + "loss": 1.3477267026901245, + "step": 4766 + }, + { + "epoch": 0.8679348320742696, + "grad_norm": 17.375, + "learning_rate": 4.246765663777689e-06, + "loss": 1.3008713722229004, + "step": 4768 + }, + { + "epoch": 0.8682988986984618, + "grad_norm": 21.0, + "learning_rate": 4.2461665078091475e-06, + "loss": 1.7729074954986572, + "step": 4770 + }, + { + "epoch": 0.8686629653226541, + "grad_norm": 7.09375, + "learning_rate": 4.245567168970721e-06, + "loss": 1.221357822418213, + "step": 4772 + }, + { + "epoch": 0.8690270319468463, + "grad_norm": 7.75, + "learning_rate": 4.244967647350361e-06, + "loss": 1.4549627304077148, + "step": 4774 + }, + { + "epoch": 0.8693910985710385, + "grad_norm": 9.625, + "learning_rate": 4.244367943036045e-06, + "loss": 1.2176653146743774, + "step": 4776 + }, + { + "epoch": 0.8697551651952308, + "grad_norm": 17.625, + "learning_rate": 4.243768056115774e-06, + "loss": 1.3642253875732422, + "step": 4778 + }, + { + "epoch": 0.870119231819423, + "grad_norm": 24.125, + "learning_rate": 4.243167986677584e-06, + "loss": 1.2372949123382568, + "step": 4780 + }, + { + "epoch": 0.8704832984436152, + "grad_norm": 9.75, + "learning_rate": 4.242567734809529e-06, + "loss": 1.4391237497329712, + "step": 4782 + }, + { + "epoch": 0.8708473650678074, + "grad_norm": 15.1875, + "learning_rate": 4.241967300599696e-06, + "loss": 1.5092319250106812, + "step": 4784 + }, + { + "epoch": 0.8712114316919997, + "grad_norm": 11.9375, + "learning_rate": 4.241366684136192e-06, + "loss": 1.4717799425125122, + "step": 4786 + }, + { + "epoch": 0.8715754983161919, + "grad_norm": 8.4375, + "learning_rate": 4.24076588550716e-06, + "loss": 1.384933590888977, + "step": 4788 + }, + { + "epoch": 0.871939564940384, + "grad_norm": 8.125, + "learning_rate": 4.240164904800761e-06, + "loss": 1.3773796558380127, + "step": 4790 + }, + { + "epoch": 0.8723036315645764, + "grad_norm": 30.5, + "learning_rate": 4.23956374210519e-06, + "loss": 1.5136041641235352, + "step": 4792 + }, + { + "epoch": 0.8726676981887685, + "grad_norm": 11.8125, + "learning_rate": 4.238962397508662e-06, + "loss": 1.400811791419983, + "step": 4794 + }, + { + "epoch": 0.8730317648129607, + "grad_norm": 16.375, + "learning_rate": 4.238360871099424e-06, + "loss": 2.15653657913208, + "step": 4796 + }, + { + "epoch": 0.873395831437153, + "grad_norm": 31.5, + "learning_rate": 4.2377591629657465e-06, + "loss": 1.1911017894744873, + "step": 4798 + }, + { + "epoch": 0.8737598980613452, + "grad_norm": 18.0, + "learning_rate": 4.237157273195927e-06, + "loss": 1.8955740928649902, + "step": 4800 + }, + { + "epoch": 0.8741239646855374, + "grad_norm": 10.6875, + "learning_rate": 4.2365552018782925e-06, + "loss": 1.627846360206604, + "step": 4802 + }, + { + "epoch": 0.8744880313097296, + "grad_norm": 8.125, + "learning_rate": 4.235952949101195e-06, + "loss": 1.3204150199890137, + "step": 4804 + }, + { + "epoch": 0.8748520979339219, + "grad_norm": 7.1875, + "learning_rate": 4.2353505149530095e-06, + "loss": 1.3699387311935425, + "step": 4806 + }, + { + "epoch": 0.8752161645581141, + "grad_norm": 13.3125, + "learning_rate": 4.234747899522142e-06, + "loss": 1.353880763053894, + "step": 4808 + }, + { + "epoch": 0.8755802311823063, + "grad_norm": 22.875, + "learning_rate": 4.234145102897025e-06, + "loss": 0.9445521235466003, + "step": 4810 + }, + { + "epoch": 0.8759442978064986, + "grad_norm": 5.625, + "learning_rate": 4.2335421251661155e-06, + "loss": 1.4221251010894775, + "step": 4812 + }, + { + "epoch": 0.8763083644306908, + "grad_norm": 10.9375, + "learning_rate": 4.232938966417898e-06, + "loss": 1.3604538440704346, + "step": 4814 + }, + { + "epoch": 0.876672431054883, + "grad_norm": 16.0, + "learning_rate": 4.232335626740883e-06, + "loss": 1.7300820350646973, + "step": 4816 + }, + { + "epoch": 0.8770364976790753, + "grad_norm": 22.125, + "learning_rate": 4.231732106223611e-06, + "loss": 1.5328739881515503, + "step": 4818 + }, + { + "epoch": 0.8774005643032675, + "grad_norm": 17.375, + "learning_rate": 4.231128404954643e-06, + "loss": 1.9044041633605957, + "step": 4820 + }, + { + "epoch": 0.8777646309274597, + "grad_norm": 19.875, + "learning_rate": 4.230524523022571e-06, + "loss": 1.772191047668457, + "step": 4822 + }, + { + "epoch": 0.8781286975516519, + "grad_norm": 12.0, + "learning_rate": 4.2299204605160125e-06, + "loss": 1.2660330533981323, + "step": 4824 + }, + { + "epoch": 0.8784927641758442, + "grad_norm": 94.5, + "learning_rate": 4.2293162175236105e-06, + "loss": 1.0608837604522705, + "step": 4826 + }, + { + "epoch": 0.8788568308000364, + "grad_norm": 8.9375, + "learning_rate": 4.228711794134035e-06, + "loss": 1.3414355516433716, + "step": 4828 + }, + { + "epoch": 0.8792208974242286, + "grad_norm": 162.0, + "learning_rate": 4.228107190435984e-06, + "loss": 1.876403570175171, + "step": 4830 + }, + { + "epoch": 0.8795849640484209, + "grad_norm": 11.25, + "learning_rate": 4.22750240651818e-06, + "loss": 0.911278486251831, + "step": 4832 + }, + { + "epoch": 0.8799490306726131, + "grad_norm": 9.5, + "learning_rate": 4.226897442469372e-06, + "loss": 1.4786566495895386, + "step": 4834 + }, + { + "epoch": 0.8803130972968053, + "grad_norm": 4.5, + "learning_rate": 4.226292298378337e-06, + "loss": 0.9341336488723755, + "step": 4836 + }, + { + "epoch": 0.8806771639209976, + "grad_norm": 6.71875, + "learning_rate": 4.225686974333877e-06, + "loss": 1.1129498481750488, + "step": 4838 + }, + { + "epoch": 0.8810412305451898, + "grad_norm": 8.5625, + "learning_rate": 4.22508147042482e-06, + "loss": 1.2893462181091309, + "step": 4840 + }, + { + "epoch": 0.881405297169382, + "grad_norm": 18.375, + "learning_rate": 4.224475786740022e-06, + "loss": 1.4436461925506592, + "step": 4842 + }, + { + "epoch": 0.8817693637935742, + "grad_norm": 10.8125, + "learning_rate": 4.223869923368366e-06, + "loss": 0.9314145445823669, + "step": 4844 + }, + { + "epoch": 0.8821334304177665, + "grad_norm": 18.375, + "learning_rate": 4.223263880398757e-06, + "loss": 1.4014346599578857, + "step": 4846 + }, + { + "epoch": 0.8824974970419587, + "grad_norm": 8.4375, + "learning_rate": 4.222657657920131e-06, + "loss": 0.8709172606468201, + "step": 4848 + }, + { + "epoch": 0.8828615636661509, + "grad_norm": 6.21875, + "learning_rate": 4.22205125602145e-06, + "loss": 1.2982878684997559, + "step": 4850 + }, + { + "epoch": 0.8832256302903432, + "grad_norm": 19.125, + "learning_rate": 4.2214446747917e-06, + "loss": 1.6648458242416382, + "step": 4852 + }, + { + "epoch": 0.8835896969145354, + "grad_norm": 14.5, + "learning_rate": 4.2208379143198926e-06, + "loss": 0.7558045387268066, + "step": 4854 + }, + { + "epoch": 0.8839537635387276, + "grad_norm": 22.75, + "learning_rate": 4.22023097469507e-06, + "loss": 1.7609848976135254, + "step": 4856 + }, + { + "epoch": 0.8843178301629198, + "grad_norm": 8.1875, + "learning_rate": 4.219623856006296e-06, + "loss": 1.5912220478057861, + "step": 4858 + }, + { + "epoch": 0.8846818967871121, + "grad_norm": 39.5, + "learning_rate": 4.2190165583426645e-06, + "loss": 1.973174810409546, + "step": 4860 + }, + { + "epoch": 0.8850459634113043, + "grad_norm": 9.6875, + "learning_rate": 4.218409081793294e-06, + "loss": 1.7836987972259521, + "step": 4862 + }, + { + "epoch": 0.8854100300354965, + "grad_norm": 3.3125, + "learning_rate": 4.217801426447328e-06, + "loss": 1.269561529159546, + "step": 4864 + }, + { + "epoch": 0.8857740966596888, + "grad_norm": 3.796875, + "learning_rate": 4.217193592393937e-06, + "loss": 0.7881308794021606, + "step": 4866 + }, + { + "epoch": 0.886138163283881, + "grad_norm": 12.125, + "learning_rate": 4.21658557972232e-06, + "loss": 0.9713007807731628, + "step": 4868 + }, + { + "epoch": 0.8865022299080731, + "grad_norm": 9.8125, + "learning_rate": 4.215977388521699e-06, + "loss": 1.0350278615951538, + "step": 4870 + }, + { + "epoch": 0.8868662965322655, + "grad_norm": 5.0, + "learning_rate": 4.2153690188813255e-06, + "loss": 1.3495904207229614, + "step": 4872 + }, + { + "epoch": 0.8872303631564576, + "grad_norm": 20.125, + "learning_rate": 4.214760470890473e-06, + "loss": 1.4872019290924072, + "step": 4874 + }, + { + "epoch": 0.8875944297806498, + "grad_norm": 15.0, + "learning_rate": 4.214151744638444e-06, + "loss": 1.4880549907684326, + "step": 4876 + }, + { + "epoch": 0.887958496404842, + "grad_norm": 9.5625, + "learning_rate": 4.213542840214569e-06, + "loss": 1.6322938203811646, + "step": 4878 + }, + { + "epoch": 0.8883225630290343, + "grad_norm": 5.6875, + "learning_rate": 4.212933757708198e-06, + "loss": 1.224144458770752, + "step": 4880 + }, + { + "epoch": 0.8886866296532265, + "grad_norm": 19.875, + "learning_rate": 4.212324497208714e-06, + "loss": 1.2496262788772583, + "step": 4882 + }, + { + "epoch": 0.8890506962774187, + "grad_norm": 8.5625, + "learning_rate": 4.211715058805523e-06, + "loss": 1.1319372653961182, + "step": 4884 + }, + { + "epoch": 0.889414762901611, + "grad_norm": 6.3125, + "learning_rate": 4.21110544258806e-06, + "loss": 1.5279579162597656, + "step": 4886 + }, + { + "epoch": 0.8897788295258032, + "grad_norm": 7.9375, + "learning_rate": 4.210495648645778e-06, + "loss": 1.1159669160842896, + "step": 4888 + }, + { + "epoch": 0.8901428961499954, + "grad_norm": 20.125, + "learning_rate": 4.209885677068167e-06, + "loss": 1.555025339126587, + "step": 4890 + }, + { + "epoch": 0.8905069627741877, + "grad_norm": 17.75, + "learning_rate": 4.209275527944736e-06, + "loss": 1.6326406002044678, + "step": 4892 + }, + { + "epoch": 0.8908710293983799, + "grad_norm": 12.6875, + "learning_rate": 4.208665201365023e-06, + "loss": 1.3105897903442383, + "step": 4894 + }, + { + "epoch": 0.8912350960225721, + "grad_norm": 8.75, + "learning_rate": 4.208054697418589e-06, + "loss": 0.8129794001579285, + "step": 4896 + }, + { + "epoch": 0.8915991626467643, + "grad_norm": 40.75, + "learning_rate": 4.207444016195024e-06, + "loss": 0.6662251949310303, + "step": 4898 + }, + { + "epoch": 0.8919632292709566, + "grad_norm": 16.0, + "learning_rate": 4.206833157783944e-06, + "loss": 0.923011839389801, + "step": 4900 + }, + { + "epoch": 0.8923272958951488, + "grad_norm": 28.125, + "learning_rate": 4.206222122274988e-06, + "loss": 1.3391501903533936, + "step": 4902 + }, + { + "epoch": 0.892691362519341, + "grad_norm": 18.75, + "learning_rate": 4.205610909757823e-06, + "loss": 1.3556047677993774, + "step": 4904 + }, + { + "epoch": 0.8930554291435333, + "grad_norm": 20.75, + "learning_rate": 4.204999520322145e-06, + "loss": 1.1428438425064087, + "step": 4906 + }, + { + "epoch": 0.8934194957677255, + "grad_norm": 12.4375, + "learning_rate": 4.2043879540576695e-06, + "loss": 1.2189842462539673, + "step": 4908 + }, + { + "epoch": 0.8937835623919177, + "grad_norm": 13.0, + "learning_rate": 4.203776211054144e-06, + "loss": 1.8105757236480713, + "step": 4910 + }, + { + "epoch": 0.89414762901611, + "grad_norm": 5.78125, + "learning_rate": 4.203164291401336e-06, + "loss": 1.2286920547485352, + "step": 4912 + }, + { + "epoch": 0.8945116956403022, + "grad_norm": 7.46875, + "learning_rate": 4.202552195189046e-06, + "loss": 1.553588628768921, + "step": 4914 + }, + { + "epoch": 0.8948757622644944, + "grad_norm": 6.3125, + "learning_rate": 4.201939922507093e-06, + "loss": 1.4365291595458984, + "step": 4916 + }, + { + "epoch": 0.8952398288886866, + "grad_norm": 29.375, + "learning_rate": 4.201327473445329e-06, + "loss": 1.1697344779968262, + "step": 4918 + }, + { + "epoch": 0.8956038955128789, + "grad_norm": 148.0, + "learning_rate": 4.200714848093627e-06, + "loss": 1.9819934368133545, + "step": 4920 + }, + { + "epoch": 0.8959679621370711, + "grad_norm": 20.625, + "learning_rate": 4.200102046541887e-06, + "loss": 1.492671012878418, + "step": 4922 + }, + { + "epoch": 0.8963320287612633, + "grad_norm": 17.25, + "learning_rate": 4.199489068880034e-06, + "loss": 1.1183415651321411, + "step": 4924 + }, + { + "epoch": 0.8966960953854556, + "grad_norm": 10.3125, + "learning_rate": 4.198875915198021e-06, + "loss": 1.5800209045410156, + "step": 4926 + }, + { + "epoch": 0.8970601620096478, + "grad_norm": 9.6875, + "learning_rate": 4.198262585585827e-06, + "loss": 1.3528177738189697, + "step": 4928 + }, + { + "epoch": 0.89742422863384, + "grad_norm": 11.9375, + "learning_rate": 4.1976490801334555e-06, + "loss": 1.722776174545288, + "step": 4930 + }, + { + "epoch": 0.8977882952580322, + "grad_norm": 11.4375, + "learning_rate": 4.197035398930935e-06, + "loss": 1.3847436904907227, + "step": 4932 + }, + { + "epoch": 0.8981523618822245, + "grad_norm": 11.375, + "learning_rate": 4.19642154206832e-06, + "loss": 1.584481120109558, + "step": 4934 + }, + { + "epoch": 0.8985164285064167, + "grad_norm": 4.34375, + "learning_rate": 4.195807509635692e-06, + "loss": 1.181349754333496, + "step": 4936 + }, + { + "epoch": 0.8988804951306089, + "grad_norm": 12.875, + "learning_rate": 4.195193301723158e-06, + "loss": 1.2476022243499756, + "step": 4938 + }, + { + "epoch": 0.8992445617548012, + "grad_norm": 13.8125, + "learning_rate": 4.194578918420852e-06, + "loss": 1.6769561767578125, + "step": 4940 + }, + { + "epoch": 0.8996086283789934, + "grad_norm": 10.9375, + "learning_rate": 4.193964359818931e-06, + "loss": 1.8135871887207031, + "step": 4942 + }, + { + "epoch": 0.8999726950031856, + "grad_norm": 8.625, + "learning_rate": 4.193349626007578e-06, + "loss": 1.135275959968567, + "step": 4944 + }, + { + "epoch": 0.9003367616273779, + "grad_norm": 9.625, + "learning_rate": 4.192734717077004e-06, + "loss": 1.2510685920715332, + "step": 4946 + }, + { + "epoch": 0.90070082825157, + "grad_norm": 6.71875, + "learning_rate": 4.192119633117443e-06, + "loss": 1.337836503982544, + "step": 4948 + }, + { + "epoch": 0.9010648948757622, + "grad_norm": 11.0625, + "learning_rate": 4.191504374219158e-06, + "loss": 1.3526248931884766, + "step": 4950 + }, + { + "epoch": 0.9014289614999544, + "grad_norm": 19.625, + "learning_rate": 4.190888940472435e-06, + "loss": 1.9208570718765259, + "step": 4952 + }, + { + "epoch": 0.9017930281241467, + "grad_norm": 7.75, + "learning_rate": 4.1902733319675855e-06, + "loss": 1.0100975036621094, + "step": 4954 + }, + { + "epoch": 0.9021570947483389, + "grad_norm": 9.1875, + "learning_rate": 4.1896575487949485e-06, + "loss": 1.0518213510513306, + "step": 4956 + }, + { + "epoch": 0.9025211613725311, + "grad_norm": 10.125, + "learning_rate": 4.189041591044889e-06, + "loss": 1.4388116598129272, + "step": 4958 + }, + { + "epoch": 0.9028852279967234, + "grad_norm": 57.0, + "learning_rate": 4.1884254588077935e-06, + "loss": 1.567813515663147, + "step": 4960 + }, + { + "epoch": 0.9032492946209156, + "grad_norm": 39.5, + "learning_rate": 4.187809152174078e-06, + "loss": 1.2036789655685425, + "step": 4962 + }, + { + "epoch": 0.9036133612451078, + "grad_norm": 14.125, + "learning_rate": 4.187192671234182e-06, + "loss": 1.4062695503234863, + "step": 4964 + }, + { + "epoch": 0.9039774278693001, + "grad_norm": 6.5625, + "learning_rate": 4.186576016078575e-06, + "loss": 1.321616291999817, + "step": 4966 + }, + { + "epoch": 0.9043414944934923, + "grad_norm": 8.8125, + "learning_rate": 4.185959186797747e-06, + "loss": 0.9876461625099182, + "step": 4968 + }, + { + "epoch": 0.9047055611176845, + "grad_norm": 7.84375, + "learning_rate": 4.185342183482213e-06, + "loss": 1.205830693244934, + "step": 4970 + }, + { + "epoch": 0.9050696277418767, + "grad_norm": 14.375, + "learning_rate": 4.184725006222517e-06, + "loss": 1.518424391746521, + "step": 4972 + }, + { + "epoch": 0.905433694366069, + "grad_norm": 9.875, + "learning_rate": 4.184107655109227e-06, + "loss": 1.6104869842529297, + "step": 4974 + }, + { + "epoch": 0.9057977609902612, + "grad_norm": 8.875, + "learning_rate": 4.18349013023294e-06, + "loss": 1.4636800289154053, + "step": 4976 + }, + { + "epoch": 0.9061618276144534, + "grad_norm": 9.5625, + "learning_rate": 4.18287243168427e-06, + "loss": 1.4461798667907715, + "step": 4978 + }, + { + "epoch": 0.9065258942386457, + "grad_norm": 28.875, + "learning_rate": 4.182254559553867e-06, + "loss": 1.3796623945236206, + "step": 4980 + }, + { + "epoch": 0.9068899608628379, + "grad_norm": 13.0, + "learning_rate": 4.181636513932397e-06, + "loss": 1.3355103731155396, + "step": 4982 + }, + { + "epoch": 0.9072540274870301, + "grad_norm": 25.375, + "learning_rate": 4.181018294910557e-06, + "loss": 2.0719802379608154, + "step": 4984 + }, + { + "epoch": 0.9076180941112224, + "grad_norm": 17.75, + "learning_rate": 4.1803999025790695e-06, + "loss": 1.1612348556518555, + "step": 4986 + }, + { + "epoch": 0.9079821607354146, + "grad_norm": 13.5, + "learning_rate": 4.17978133702868e-06, + "loss": 1.8009529113769531, + "step": 4988 + }, + { + "epoch": 0.9083462273596068, + "grad_norm": 16.0, + "learning_rate": 4.179162598350159e-06, + "loss": 1.4866795539855957, + "step": 4990 + }, + { + "epoch": 0.908710293983799, + "grad_norm": 22.0, + "learning_rate": 4.178543686634307e-06, + "loss": 1.992185354232788, + "step": 4992 + }, + { + "epoch": 0.9090743606079913, + "grad_norm": 31.75, + "learning_rate": 4.177924601971944e-06, + "loss": 1.487194299697876, + "step": 4994 + }, + { + "epoch": 0.9094384272321835, + "grad_norm": 9.8125, + "learning_rate": 4.177305344453921e-06, + "loss": 1.4130845069885254, + "step": 4996 + }, + { + "epoch": 0.9098024938563757, + "grad_norm": 13.125, + "learning_rate": 4.176685914171109e-06, + "loss": 0.9557865858078003, + "step": 4998 + }, + { + "epoch": 0.910166560480568, + "grad_norm": 10.5625, + "learning_rate": 4.176066311214407e-06, + "loss": 1.0403658151626587, + "step": 5000 + }, + { + "epoch": 0.9105306271047602, + "grad_norm": 48.5, + "learning_rate": 4.175446535674742e-06, + "loss": 0.5787985324859619, + "step": 5002 + }, + { + "epoch": 0.9108946937289524, + "grad_norm": 8.625, + "learning_rate": 4.174826587643061e-06, + "loss": 1.4757091999053955, + "step": 5004 + }, + { + "epoch": 0.9112587603531446, + "grad_norm": 17.0, + "learning_rate": 4.174206467210337e-06, + "loss": 1.932018756866455, + "step": 5006 + }, + { + "epoch": 0.9116228269773369, + "grad_norm": 6.46875, + "learning_rate": 4.173586174467575e-06, + "loss": 1.2241672277450562, + "step": 5008 + }, + { + "epoch": 0.9119868936015291, + "grad_norm": 9.0625, + "learning_rate": 4.172965709505797e-06, + "loss": 1.5072886943817139, + "step": 5010 + }, + { + "epoch": 0.9123509602257213, + "grad_norm": 6.3125, + "learning_rate": 4.1723450724160565e-06, + "loss": 1.0905475616455078, + "step": 5012 + }, + { + "epoch": 0.9127150268499136, + "grad_norm": 15.125, + "learning_rate": 4.171724263289426e-06, + "loss": 1.4574275016784668, + "step": 5014 + }, + { + "epoch": 0.9130790934741058, + "grad_norm": 6.5, + "learning_rate": 4.171103282217009e-06, + "loss": 1.2100286483764648, + "step": 5016 + }, + { + "epoch": 0.913443160098298, + "grad_norm": 5.84375, + "learning_rate": 4.170482129289931e-06, + "loss": 1.3488829135894775, + "step": 5018 + }, + { + "epoch": 0.9138072267224903, + "grad_norm": 10.0625, + "learning_rate": 4.169860804599344e-06, + "loss": 1.435385823249817, + "step": 5020 + }, + { + "epoch": 0.9141712933466825, + "grad_norm": 11.625, + "learning_rate": 4.169239308236424e-06, + "loss": 1.5078396797180176, + "step": 5022 + }, + { + "epoch": 0.9145353599708747, + "grad_norm": 12.6875, + "learning_rate": 4.168617640292376e-06, + "loss": 1.3219835758209229, + "step": 5024 + }, + { + "epoch": 0.9148994265950668, + "grad_norm": 6.4375, + "learning_rate": 4.167995800858425e-06, + "loss": 1.2192504405975342, + "step": 5026 + }, + { + "epoch": 0.9152634932192591, + "grad_norm": 32.75, + "learning_rate": 4.167373790025824e-06, + "loss": 1.5529321432113647, + "step": 5028 + }, + { + "epoch": 0.9156275598434513, + "grad_norm": 21.0, + "learning_rate": 4.166751607885848e-06, + "loss": 1.614398717880249, + "step": 5030 + }, + { + "epoch": 0.9159916264676435, + "grad_norm": 14.125, + "learning_rate": 4.166129254529804e-06, + "loss": 1.584756851196289, + "step": 5032 + }, + { + "epoch": 0.9163556930918358, + "grad_norm": 21.875, + "learning_rate": 4.165506730049017e-06, + "loss": 1.5878582000732422, + "step": 5034 + }, + { + "epoch": 0.916719759716028, + "grad_norm": 29.625, + "learning_rate": 4.164884034534842e-06, + "loss": 1.42337965965271, + "step": 5036 + }, + { + "epoch": 0.9170838263402202, + "grad_norm": 14.8125, + "learning_rate": 4.1642611680786545e-06, + "loss": 1.5671693086624146, + "step": 5038 + }, + { + "epoch": 0.9174478929644125, + "grad_norm": 7.1875, + "learning_rate": 4.16363813077186e-06, + "loss": 1.4436068534851074, + "step": 5040 + }, + { + "epoch": 0.9178119595886047, + "grad_norm": 5.84375, + "learning_rate": 4.1630149227058846e-06, + "loss": 1.289434552192688, + "step": 5042 + }, + { + "epoch": 0.9181760262127969, + "grad_norm": 8.125, + "learning_rate": 4.1623915439721826e-06, + "loss": 0.9608882069587708, + "step": 5044 + }, + { + "epoch": 0.9185400928369891, + "grad_norm": 19.625, + "learning_rate": 4.161767994662233e-06, + "loss": 1.9189702272415161, + "step": 5046 + }, + { + "epoch": 0.9189041594611814, + "grad_norm": 7.84375, + "learning_rate": 4.161144274867538e-06, + "loss": 1.2211374044418335, + "step": 5048 + }, + { + "epoch": 0.9192682260853736, + "grad_norm": 10.0, + "learning_rate": 4.160520384679626e-06, + "loss": 0.8679113984107971, + "step": 5050 + }, + { + "epoch": 0.9196322927095658, + "grad_norm": 9.5, + "learning_rate": 4.15989632419005e-06, + "loss": 1.583971381187439, + "step": 5052 + }, + { + "epoch": 0.9199963593337581, + "grad_norm": 6.125, + "learning_rate": 4.159272093490391e-06, + "loss": 0.958564043045044, + "step": 5054 + }, + { + "epoch": 0.9203604259579503, + "grad_norm": 19.875, + "learning_rate": 4.1586476926722495e-06, + "loss": 1.3093253374099731, + "step": 5056 + }, + { + "epoch": 0.9207244925821425, + "grad_norm": 6.78125, + "learning_rate": 4.158023121827255e-06, + "loss": 1.24656081199646, + "step": 5058 + }, + { + "epoch": 0.9210885592063348, + "grad_norm": 8.25, + "learning_rate": 4.15739838104706e-06, + "loss": 1.2656464576721191, + "step": 5060 + }, + { + "epoch": 0.921452625830527, + "grad_norm": 21.5, + "learning_rate": 4.156773470423343e-06, + "loss": 1.220057725906372, + "step": 5062 + }, + { + "epoch": 0.9218166924547192, + "grad_norm": 4.875, + "learning_rate": 4.1561483900478085e-06, + "loss": 1.1945585012435913, + "step": 5064 + }, + { + "epoch": 0.9221807590789114, + "grad_norm": 10.8125, + "learning_rate": 4.155523140012182e-06, + "loss": 1.4139323234558105, + "step": 5066 + }, + { + "epoch": 0.9225448257031037, + "grad_norm": 20.625, + "learning_rate": 4.154897720408217e-06, + "loss": 1.480215311050415, + "step": 5068 + }, + { + "epoch": 0.9229088923272959, + "grad_norm": 26.5, + "learning_rate": 4.154272131327693e-06, + "loss": 1.612576961517334, + "step": 5070 + }, + { + "epoch": 0.9232729589514881, + "grad_norm": 12.875, + "learning_rate": 4.153646372862411e-06, + "loss": 0.518166184425354, + "step": 5072 + }, + { + "epoch": 0.9236370255756804, + "grad_norm": 32.25, + "learning_rate": 4.1530204451042e-06, + "loss": 1.3966941833496094, + "step": 5074 + }, + { + "epoch": 0.9240010921998726, + "grad_norm": 6.34375, + "learning_rate": 4.152394348144912e-06, + "loss": 1.0344822406768799, + "step": 5076 + }, + { + "epoch": 0.9243651588240648, + "grad_norm": 5.03125, + "learning_rate": 4.151768082076422e-06, + "loss": 0.9916671514511108, + "step": 5078 + }, + { + "epoch": 0.9247292254482571, + "grad_norm": 10.375, + "learning_rate": 4.151141646990633e-06, + "loss": 1.3234797716140747, + "step": 5080 + }, + { + "epoch": 0.9250932920724493, + "grad_norm": 12.75, + "learning_rate": 4.150515042979474e-06, + "loss": 1.350399136543274, + "step": 5082 + }, + { + "epoch": 0.9254573586966415, + "grad_norm": 5.65625, + "learning_rate": 4.149888270134895e-06, + "loss": 1.0546815395355225, + "step": 5084 + }, + { + "epoch": 0.9258214253208337, + "grad_norm": 20.125, + "learning_rate": 4.149261328548873e-06, + "loss": 1.148550271987915, + "step": 5086 + }, + { + "epoch": 0.926185491945026, + "grad_norm": 23.75, + "learning_rate": 4.148634218313406e-06, + "loss": 1.4253292083740234, + "step": 5088 + }, + { + "epoch": 0.9265495585692182, + "grad_norm": 21.75, + "learning_rate": 4.148006939520524e-06, + "loss": 1.2504509687423706, + "step": 5090 + }, + { + "epoch": 0.9269136251934104, + "grad_norm": 17.25, + "learning_rate": 4.147379492262278e-06, + "loss": 1.775317907333374, + "step": 5092 + }, + { + "epoch": 0.9272776918176027, + "grad_norm": 10.375, + "learning_rate": 4.146751876630739e-06, + "loss": 1.9931669235229492, + "step": 5094 + }, + { + "epoch": 0.9276417584417949, + "grad_norm": 5.59375, + "learning_rate": 4.14612409271801e-06, + "loss": 1.3912854194641113, + "step": 5096 + }, + { + "epoch": 0.9280058250659871, + "grad_norm": 6.78125, + "learning_rate": 4.145496140616217e-06, + "loss": 1.1869392395019531, + "step": 5098 + }, + { + "epoch": 0.9283698916901792, + "grad_norm": 5.5, + "learning_rate": 4.1448680204175054e-06, + "loss": 1.099517583847046, + "step": 5100 + }, + { + "epoch": 0.9287339583143716, + "grad_norm": 10.0, + "learning_rate": 4.144239732214052e-06, + "loss": 1.2340768575668335, + "step": 5102 + }, + { + "epoch": 0.9290980249385637, + "grad_norm": 16.625, + "learning_rate": 4.143611276098055e-06, + "loss": 1.3761390447616577, + "step": 5104 + }, + { + "epoch": 0.9294620915627559, + "grad_norm": 15.1875, + "learning_rate": 4.1429826521617385e-06, + "loss": 1.8559261560440063, + "step": 5106 + }, + { + "epoch": 0.9298261581869482, + "grad_norm": 5.65625, + "learning_rate": 4.14235386049735e-06, + "loss": 1.3722867965698242, + "step": 5108 + }, + { + "epoch": 0.9301902248111404, + "grad_norm": 12.75, + "learning_rate": 4.141724901197161e-06, + "loss": 1.351995825767517, + "step": 5110 + }, + { + "epoch": 0.9305542914353326, + "grad_norm": 6.875, + "learning_rate": 4.14109577435347e-06, + "loss": 1.1992316246032715, + "step": 5112 + }, + { + "epoch": 0.9309183580595249, + "grad_norm": 9.0625, + "learning_rate": 4.1404664800586e-06, + "loss": 0.958271861076355, + "step": 5114 + }, + { + "epoch": 0.9312824246837171, + "grad_norm": 16.75, + "learning_rate": 4.139837018404895e-06, + "loss": 1.400178074836731, + "step": 5116 + }, + { + "epoch": 0.9316464913079093, + "grad_norm": 15.9375, + "learning_rate": 4.139207389484727e-06, + "loss": 2.0021300315856934, + "step": 5118 + }, + { + "epoch": 0.9320105579321015, + "grad_norm": 6.40625, + "learning_rate": 4.1385775933904915e-06, + "loss": 1.1845452785491943, + "step": 5120 + }, + { + "epoch": 0.9323746245562938, + "grad_norm": 12.1875, + "learning_rate": 4.1379476302146085e-06, + "loss": 1.2682902812957764, + "step": 5122 + }, + { + "epoch": 0.932738691180486, + "grad_norm": 8.4375, + "learning_rate": 4.1373175000495215e-06, + "loss": 1.6483125686645508, + "step": 5124 + }, + { + "epoch": 0.9331027578046782, + "grad_norm": 13.3125, + "learning_rate": 4.136687202987701e-06, + "loss": 1.129159927368164, + "step": 5126 + }, + { + "epoch": 0.9334668244288705, + "grad_norm": 3.546875, + "learning_rate": 4.136056739121641e-06, + "loss": 1.296934723854065, + "step": 5128 + }, + { + "epoch": 0.9338308910530627, + "grad_norm": 6.53125, + "learning_rate": 4.1354261085438575e-06, + "loss": 0.9634895920753479, + "step": 5130 + }, + { + "epoch": 0.9341949576772549, + "grad_norm": 10.4375, + "learning_rate": 4.134795311346894e-06, + "loss": 1.4433799982070923, + "step": 5132 + }, + { + "epoch": 0.9345590243014472, + "grad_norm": 11.875, + "learning_rate": 4.1341643476233185e-06, + "loss": 1.0714704990386963, + "step": 5134 + }, + { + "epoch": 0.9349230909256394, + "grad_norm": 11.8125, + "learning_rate": 4.133533217465721e-06, + "loss": 1.5339690446853638, + "step": 5136 + }, + { + "epoch": 0.9352871575498316, + "grad_norm": 27.375, + "learning_rate": 4.132901920966716e-06, + "loss": 1.901155948638916, + "step": 5138 + }, + { + "epoch": 0.9356512241740238, + "grad_norm": 18.0, + "learning_rate": 4.132270458218947e-06, + "loss": 1.8158046007156372, + "step": 5140 + }, + { + "epoch": 0.9360152907982161, + "grad_norm": 7.75, + "learning_rate": 4.1316388293150765e-06, + "loss": 1.4996728897094727, + "step": 5142 + }, + { + "epoch": 0.9363793574224083, + "grad_norm": 9.5, + "learning_rate": 4.131007034347795e-06, + "loss": 1.6671572923660278, + "step": 5144 + }, + { + "epoch": 0.9367434240466005, + "grad_norm": 18.75, + "learning_rate": 4.130375073409814e-06, + "loss": 1.4469289779663086, + "step": 5146 + }, + { + "epoch": 0.9371074906707928, + "grad_norm": 6.40625, + "learning_rate": 4.129742946593872e-06, + "loss": 0.9581305384635925, + "step": 5148 + }, + { + "epoch": 0.937471557294985, + "grad_norm": 3.15625, + "learning_rate": 4.12911065399273e-06, + "loss": 0.9949471354484558, + "step": 5150 + }, + { + "epoch": 0.9378356239191772, + "grad_norm": 6.0625, + "learning_rate": 4.128478195699176e-06, + "loss": 1.3300024271011353, + "step": 5152 + }, + { + "epoch": 0.9381996905433695, + "grad_norm": 12.4375, + "learning_rate": 4.12784557180602e-06, + "loss": 1.3249174356460571, + "step": 5154 + }, + { + "epoch": 0.9385637571675617, + "grad_norm": 10.25, + "learning_rate": 4.127212782406098e-06, + "loss": 1.8071355819702148, + "step": 5156 + }, + { + "epoch": 0.9389278237917539, + "grad_norm": 7.6875, + "learning_rate": 4.1265798275922685e-06, + "loss": 1.3531473875045776, + "step": 5158 + }, + { + "epoch": 0.9392918904159461, + "grad_norm": 13.125, + "learning_rate": 4.125946707457415e-06, + "loss": 1.3463969230651855, + "step": 5160 + }, + { + "epoch": 0.9396559570401384, + "grad_norm": 13.8125, + "learning_rate": 4.125313422094443e-06, + "loss": 1.3990118503570557, + "step": 5162 + }, + { + "epoch": 0.9400200236643306, + "grad_norm": 12.1875, + "learning_rate": 4.124679971596289e-06, + "loss": 1.2347328662872314, + "step": 5164 + }, + { + "epoch": 0.9403840902885228, + "grad_norm": 11.25, + "learning_rate": 4.1240463560559066e-06, + "loss": 1.0382413864135742, + "step": 5166 + }, + { + "epoch": 0.9407481569127151, + "grad_norm": 14.625, + "learning_rate": 4.123412575566276e-06, + "loss": 1.5316096544265747, + "step": 5168 + }, + { + "epoch": 0.9411122235369073, + "grad_norm": 19.625, + "learning_rate": 4.122778630220403e-06, + "loss": 1.7390072345733643, + "step": 5170 + }, + { + "epoch": 0.9414762901610995, + "grad_norm": 24.625, + "learning_rate": 4.122144520111317e-06, + "loss": 1.4744058847427368, + "step": 5172 + }, + { + "epoch": 0.9418403567852917, + "grad_norm": 9.5, + "learning_rate": 4.12151024533207e-06, + "loss": 1.5495997667312622, + "step": 5174 + }, + { + "epoch": 0.942204423409484, + "grad_norm": 6.65625, + "learning_rate": 4.1208758059757405e-06, + "loss": 1.188522219657898, + "step": 5176 + }, + { + "epoch": 0.9425684900336762, + "grad_norm": 7.4375, + "learning_rate": 4.120241202135428e-06, + "loss": 1.4370396137237549, + "step": 5178 + }, + { + "epoch": 0.9429325566578683, + "grad_norm": 20.625, + "learning_rate": 4.119606433904259e-06, + "loss": 1.4181649684906006, + "step": 5180 + }, + { + "epoch": 0.9432966232820607, + "grad_norm": 9.0, + "learning_rate": 4.118971501375383e-06, + "loss": 1.4289231300354004, + "step": 5182 + }, + { + "epoch": 0.9436606899062528, + "grad_norm": 13.4375, + "learning_rate": 4.1183364046419726e-06, + "loss": 1.099740743637085, + "step": 5184 + }, + { + "epoch": 0.944024756530445, + "grad_norm": 14.875, + "learning_rate": 4.117701143797229e-06, + "loss": 0.9242735505104065, + "step": 5186 + }, + { + "epoch": 0.9443888231546373, + "grad_norm": 13.5625, + "learning_rate": 4.1170657189343725e-06, + "loss": 1.3909621238708496, + "step": 5188 + }, + { + "epoch": 0.9447528897788295, + "grad_norm": 6.71875, + "learning_rate": 4.116430130146648e-06, + "loss": 1.3420443534851074, + "step": 5190 + }, + { + "epoch": 0.9451169564030217, + "grad_norm": 5.1875, + "learning_rate": 4.115794377527327e-06, + "loss": 1.2318744659423828, + "step": 5192 + }, + { + "epoch": 0.9454810230272139, + "grad_norm": 9.125, + "learning_rate": 4.115158461169703e-06, + "loss": 1.52884840965271, + "step": 5194 + }, + { + "epoch": 0.9458450896514062, + "grad_norm": 11.75, + "learning_rate": 4.114522381167093e-06, + "loss": 1.4363899230957031, + "step": 5196 + }, + { + "epoch": 0.9462091562755984, + "grad_norm": 13.375, + "learning_rate": 4.11388613761284e-06, + "loss": 1.6438429355621338, + "step": 5198 + }, + { + "epoch": 0.9465732228997906, + "grad_norm": 12.5, + "learning_rate": 4.113249730600311e-06, + "loss": 1.9754016399383545, + "step": 5200 + }, + { + "epoch": 0.9469372895239829, + "grad_norm": 10.625, + "learning_rate": 4.112613160222897e-06, + "loss": 1.2382174730300903, + "step": 5202 + }, + { + "epoch": 0.9473013561481751, + "grad_norm": 16.875, + "learning_rate": 4.11197642657401e-06, + "loss": 1.3318297863006592, + "step": 5204 + }, + { + "epoch": 0.9476654227723673, + "grad_norm": 11.6875, + "learning_rate": 4.1113395297470895e-06, + "loss": 1.870293378829956, + "step": 5206 + }, + { + "epoch": 0.9480294893965596, + "grad_norm": 26.0, + "learning_rate": 4.110702469835596e-06, + "loss": 1.3904330730438232, + "step": 5208 + }, + { + "epoch": 0.9483935560207518, + "grad_norm": 3.125, + "learning_rate": 4.110065246933016e-06, + "loss": 0.9703547358512878, + "step": 5210 + }, + { + "epoch": 0.948757622644944, + "grad_norm": 12.125, + "learning_rate": 4.109427861132861e-06, + "loss": 1.6436619758605957, + "step": 5212 + }, + { + "epoch": 0.9491216892691362, + "grad_norm": 8.125, + "learning_rate": 4.108790312528662e-06, + "loss": 1.5922930240631104, + "step": 5214 + }, + { + "epoch": 0.9494857558933285, + "grad_norm": 13.5625, + "learning_rate": 4.108152601213979e-06, + "loss": 1.4536755084991455, + "step": 5216 + }, + { + "epoch": 0.9498498225175207, + "grad_norm": 38.5, + "learning_rate": 4.107514727282394e-06, + "loss": 1.1911377906799316, + "step": 5218 + }, + { + "epoch": 0.9502138891417129, + "grad_norm": 17.5, + "learning_rate": 4.106876690827508e-06, + "loss": 0.9176419973373413, + "step": 5220 + }, + { + "epoch": 0.9505779557659052, + "grad_norm": 15.4375, + "learning_rate": 4.106238491942956e-06, + "loss": 1.0802327394485474, + "step": 5222 + }, + { + "epoch": 0.9509420223900974, + "grad_norm": 5.4375, + "learning_rate": 4.105600130722387e-06, + "loss": 0.9345170259475708, + "step": 5224 + }, + { + "epoch": 0.9513060890142896, + "grad_norm": 6.75, + "learning_rate": 4.104961607259481e-06, + "loss": 1.1717047691345215, + "step": 5226 + }, + { + "epoch": 0.9516701556384819, + "grad_norm": 9.4375, + "learning_rate": 4.1043229216479364e-06, + "loss": 1.4178457260131836, + "step": 5228 + }, + { + "epoch": 0.9520342222626741, + "grad_norm": 9.1875, + "learning_rate": 4.103684073981478e-06, + "loss": 1.4470610618591309, + "step": 5230 + }, + { + "epoch": 0.9523982888868663, + "grad_norm": 5.4375, + "learning_rate": 4.103045064353854e-06, + "loss": 1.306809425354004, + "step": 5232 + }, + { + "epoch": 0.9527623555110585, + "grad_norm": 22.25, + "learning_rate": 4.1024058928588386e-06, + "loss": 1.5226964950561523, + "step": 5234 + }, + { + "epoch": 0.9531264221352508, + "grad_norm": 8.625, + "learning_rate": 4.101766559590226e-06, + "loss": 2.154179334640503, + "step": 5236 + }, + { + "epoch": 0.953490488759443, + "grad_norm": 11.0, + "learning_rate": 4.1011270646418345e-06, + "loss": 1.469681978225708, + "step": 5238 + }, + { + "epoch": 0.9538545553836352, + "grad_norm": 25.0, + "learning_rate": 4.10048740810751e-06, + "loss": 1.5788908004760742, + "step": 5240 + }, + { + "epoch": 0.9542186220078275, + "grad_norm": 18.375, + "learning_rate": 4.099847590081117e-06, + "loss": 1.4505411386489868, + "step": 5242 + }, + { + "epoch": 0.9545826886320197, + "grad_norm": 8.75, + "learning_rate": 4.099207610656548e-06, + "loss": 1.5523974895477295, + "step": 5244 + }, + { + "epoch": 0.9549467552562119, + "grad_norm": 8.375, + "learning_rate": 4.0985674699277176e-06, + "loss": 1.5758659839630127, + "step": 5246 + }, + { + "epoch": 0.9553108218804041, + "grad_norm": 10.4375, + "learning_rate": 4.097927167988562e-06, + "loss": 1.305679202079773, + "step": 5248 + }, + { + "epoch": 0.9556748885045964, + "grad_norm": 7.4375, + "learning_rate": 4.097286704933045e-06, + "loss": 0.9770964980125427, + "step": 5250 + }, + { + "epoch": 0.9560389551287886, + "grad_norm": 13.5, + "learning_rate": 4.09664608085515e-06, + "loss": 1.7222895622253418, + "step": 5252 + }, + { + "epoch": 0.9564030217529808, + "grad_norm": 18.125, + "learning_rate": 4.0960052958488885e-06, + "loss": 1.7340359687805176, + "step": 5254 + }, + { + "epoch": 0.9567670883771731, + "grad_norm": 11.5, + "learning_rate": 4.095364350008289e-06, + "loss": 1.6007449626922607, + "step": 5256 + }, + { + "epoch": 0.9571311550013653, + "grad_norm": 19.75, + "learning_rate": 4.094723243427413e-06, + "loss": 1.782277226448059, + "step": 5258 + }, + { + "epoch": 0.9574952216255574, + "grad_norm": 12.625, + "learning_rate": 4.094081976200336e-06, + "loss": 1.4803569316864014, + "step": 5260 + }, + { + "epoch": 0.9578592882497498, + "grad_norm": 6.78125, + "learning_rate": 4.093440548421162e-06, + "loss": 1.0331615209579468, + "step": 5262 + }, + { + "epoch": 0.958223354873942, + "grad_norm": 10.5625, + "learning_rate": 4.092798960184021e-06, + "loss": 1.596687912940979, + "step": 5264 + }, + { + "epoch": 0.9585874214981341, + "grad_norm": 16.5, + "learning_rate": 4.092157211583061e-06, + "loss": 1.4851155281066895, + "step": 5266 + }, + { + "epoch": 0.9589514881223263, + "grad_norm": 7.71875, + "learning_rate": 4.091515302712456e-06, + "loss": 1.4892480373382568, + "step": 5268 + }, + { + "epoch": 0.9593155547465186, + "grad_norm": 3.546875, + "learning_rate": 4.090873233666402e-06, + "loss": 1.0636554956436157, + "step": 5270 + }, + { + "epoch": 0.9596796213707108, + "grad_norm": 10.75, + "learning_rate": 4.090231004539125e-06, + "loss": 1.035035252571106, + "step": 5272 + }, + { + "epoch": 0.960043687994903, + "grad_norm": 23.125, + "learning_rate": 4.089588615424865e-06, + "loss": 1.5316381454467773, + "step": 5274 + }, + { + "epoch": 0.9604077546190953, + "grad_norm": 13.5625, + "learning_rate": 4.0889460664178904e-06, + "loss": 1.423702359199524, + "step": 5276 + }, + { + "epoch": 0.9607718212432875, + "grad_norm": 35.75, + "learning_rate": 4.088303357612494e-06, + "loss": 1.3729106187820435, + "step": 5278 + }, + { + "epoch": 0.9611358878674797, + "grad_norm": 9.125, + "learning_rate": 4.0876604891029916e-06, + "loss": 1.5042821168899536, + "step": 5280 + }, + { + "epoch": 0.961499954491672, + "grad_norm": 12.1875, + "learning_rate": 4.08701746098372e-06, + "loss": 1.5164340734481812, + "step": 5282 + }, + { + "epoch": 0.9618640211158642, + "grad_norm": 7.75, + "learning_rate": 4.086374273349041e-06, + "loss": 1.0936529636383057, + "step": 5284 + }, + { + "epoch": 0.9622280877400564, + "grad_norm": 9.625, + "learning_rate": 4.08573092629334e-06, + "loss": 1.1869004964828491, + "step": 5286 + }, + { + "epoch": 0.9625921543642486, + "grad_norm": 8.25, + "learning_rate": 4.085087419911026e-06, + "loss": 1.4046446084976196, + "step": 5288 + }, + { + "epoch": 0.9629562209884409, + "grad_norm": 36.75, + "learning_rate": 4.084443754296529e-06, + "loss": 0.8610131144523621, + "step": 5290 + }, + { + "epoch": 0.9633202876126331, + "grad_norm": 9.125, + "learning_rate": 4.0837999295443074e-06, + "loss": 1.0124895572662354, + "step": 5292 + }, + { + "epoch": 0.9636843542368253, + "grad_norm": 21.0, + "learning_rate": 4.083155945748839e-06, + "loss": 1.593213677406311, + "step": 5294 + }, + { + "epoch": 0.9640484208610176, + "grad_norm": 16.75, + "learning_rate": 4.082511803004624e-06, + "loss": 1.8866251707077026, + "step": 5296 + }, + { + "epoch": 0.9644124874852098, + "grad_norm": 28.0, + "learning_rate": 4.081867501406189e-06, + "loss": 0.991114616394043, + "step": 5298 + }, + { + "epoch": 0.964776554109402, + "grad_norm": 20.75, + "learning_rate": 4.0812230410480836e-06, + "loss": 1.0621370077133179, + "step": 5300 + }, + { + "epoch": 0.9651406207335943, + "grad_norm": 10.3125, + "learning_rate": 4.080578422024878e-06, + "loss": 1.4864797592163086, + "step": 5302 + }, + { + "epoch": 0.9655046873577865, + "grad_norm": 27.875, + "learning_rate": 4.079933644431168e-06, + "loss": 1.1908533573150635, + "step": 5304 + }, + { + "epoch": 0.9658687539819787, + "grad_norm": 10.875, + "learning_rate": 4.0792887083615714e-06, + "loss": 1.3227975368499756, + "step": 5306 + }, + { + "epoch": 0.9662328206061709, + "grad_norm": 31.0, + "learning_rate": 4.078643613910733e-06, + "loss": 1.6085257530212402, + "step": 5308 + }, + { + "epoch": 0.9665968872303632, + "grad_norm": 10.625, + "learning_rate": 4.077998361173314e-06, + "loss": 1.7413854598999023, + "step": 5310 + }, + { + "epoch": 0.9669609538545554, + "grad_norm": 7.375, + "learning_rate": 4.0773529502440055e-06, + "loss": 1.1168752908706665, + "step": 5312 + }, + { + "epoch": 0.9673250204787476, + "grad_norm": 44.25, + "learning_rate": 4.076707381217516e-06, + "loss": 1.4369099140167236, + "step": 5314 + }, + { + "epoch": 0.9676890871029399, + "grad_norm": 18.875, + "learning_rate": 4.076061654188583e-06, + "loss": 1.3208189010620117, + "step": 5316 + }, + { + "epoch": 0.9680531537271321, + "grad_norm": 11.8125, + "learning_rate": 4.075415769251963e-06, + "loss": 1.54694664478302, + "step": 5318 + }, + { + "epoch": 0.9684172203513243, + "grad_norm": 23.75, + "learning_rate": 4.074769726502438e-06, + "loss": 1.453613042831421, + "step": 5320 + }, + { + "epoch": 0.9687812869755165, + "grad_norm": 13.375, + "learning_rate": 4.07412352603481e-06, + "loss": 1.1985145807266235, + "step": 5322 + }, + { + "epoch": 0.9691453535997088, + "grad_norm": 16.375, + "learning_rate": 4.073477167943908e-06, + "loss": 1.7540076971054077, + "step": 5324 + }, + { + "epoch": 0.969509420223901, + "grad_norm": 9.875, + "learning_rate": 4.072830652324582e-06, + "loss": 1.4244577884674072, + "step": 5326 + }, + { + "epoch": 0.9698734868480932, + "grad_norm": 10.3125, + "learning_rate": 4.0721839792717055e-06, + "loss": 1.3076109886169434, + "step": 5328 + }, + { + "epoch": 0.9702375534722855, + "grad_norm": 6.375, + "learning_rate": 4.071537148880174e-06, + "loss": 1.3746185302734375, + "step": 5330 + }, + { + "epoch": 0.9706016200964777, + "grad_norm": 18.5, + "learning_rate": 4.070890161244911e-06, + "loss": 1.543274998664856, + "step": 5332 + }, + { + "epoch": 0.9709656867206699, + "grad_norm": 114.0, + "learning_rate": 4.070243016460855e-06, + "loss": 1.909379005432129, + "step": 5334 + }, + { + "epoch": 0.9713297533448622, + "grad_norm": 8.6875, + "learning_rate": 4.069595714622974e-06, + "loss": 1.3400285243988037, + "step": 5336 + }, + { + "epoch": 0.9716938199690544, + "grad_norm": 36.25, + "learning_rate": 4.068948255826257e-06, + "loss": 1.5058397054672241, + "step": 5338 + }, + { + "epoch": 0.9720578865932465, + "grad_norm": 24.375, + "learning_rate": 4.0683006401657155e-06, + "loss": 1.848771572113037, + "step": 5340 + }, + { + "epoch": 0.9724219532174387, + "grad_norm": 12.4375, + "learning_rate": 4.0676528677363845e-06, + "loss": 1.7847723960876465, + "step": 5342 + }, + { + "epoch": 0.972786019841631, + "grad_norm": 14.375, + "learning_rate": 4.067004938633322e-06, + "loss": 1.0416680574417114, + "step": 5344 + }, + { + "epoch": 0.9731500864658232, + "grad_norm": 28.125, + "learning_rate": 4.066356852951609e-06, + "loss": 1.158281922340393, + "step": 5346 + }, + { + "epoch": 0.9735141530900154, + "grad_norm": 9.5625, + "learning_rate": 4.0657086107863485e-06, + "loss": 1.5224478244781494, + "step": 5348 + }, + { + "epoch": 0.9738782197142077, + "grad_norm": 19.25, + "learning_rate": 4.0650602122326684e-06, + "loss": 1.4053987264633179, + "step": 5350 + }, + { + "epoch": 0.9742422863383999, + "grad_norm": 5.34375, + "learning_rate": 4.064411657385719e-06, + "loss": 1.400122046470642, + "step": 5352 + }, + { + "epoch": 0.9746063529625921, + "grad_norm": 2.765625, + "learning_rate": 4.063762946340673e-06, + "loss": 1.2220582962036133, + "step": 5354 + }, + { + "epoch": 0.9749704195867844, + "grad_norm": 5.0625, + "learning_rate": 4.063114079192726e-06, + "loss": 1.0634700059890747, + "step": 5356 + }, + { + "epoch": 0.9753344862109766, + "grad_norm": 11.6875, + "learning_rate": 4.062465056037095e-06, + "loss": 1.2520416975021362, + "step": 5358 + }, + { + "epoch": 0.9756985528351688, + "grad_norm": 37.5, + "learning_rate": 4.061815876969023e-06, + "loss": 1.2351628541946411, + "step": 5360 + }, + { + "epoch": 0.976062619459361, + "grad_norm": 14.9375, + "learning_rate": 4.061166542083775e-06, + "loss": 1.0903244018554688, + "step": 5362 + }, + { + "epoch": 0.9764266860835533, + "grad_norm": 14.8125, + "learning_rate": 4.060517051476637e-06, + "loss": 1.4582080841064453, + "step": 5364 + }, + { + "epoch": 0.9767907527077455, + "grad_norm": 15.8125, + "learning_rate": 4.05986740524292e-06, + "loss": 1.5534783601760864, + "step": 5366 + }, + { + "epoch": 0.9771548193319377, + "grad_norm": 11.75, + "learning_rate": 4.059217603477955e-06, + "loss": 1.6136937141418457, + "step": 5368 + }, + { + "epoch": 0.97751888595613, + "grad_norm": 6.21875, + "learning_rate": 4.058567646277101e-06, + "loss": 1.4576661586761475, + "step": 5370 + }, + { + "epoch": 0.9778829525803222, + "grad_norm": 12.8125, + "learning_rate": 4.057917533735734e-06, + "loss": 1.1450663805007935, + "step": 5372 + }, + { + "epoch": 0.9782470192045144, + "grad_norm": 20.25, + "learning_rate": 4.057267265949257e-06, + "loss": 2.101703643798828, + "step": 5374 + }, + { + "epoch": 0.9786110858287067, + "grad_norm": 16.75, + "learning_rate": 4.056616843013094e-06, + "loss": 1.921669602394104, + "step": 5376 + }, + { + "epoch": 0.9789751524528989, + "grad_norm": 17.25, + "learning_rate": 4.055966265022689e-06, + "loss": 1.2655748128890991, + "step": 5378 + }, + { + "epoch": 0.9793392190770911, + "grad_norm": 54.5, + "learning_rate": 4.055315532073517e-06, + "loss": 1.0070053339004517, + "step": 5380 + }, + { + "epoch": 0.9797032857012833, + "grad_norm": 16.5, + "learning_rate": 4.054664644261065e-06, + "loss": 1.4612586498260498, + "step": 5382 + }, + { + "epoch": 0.9800673523254756, + "grad_norm": 12.125, + "learning_rate": 4.054013601680852e-06, + "loss": 1.4487459659576416, + "step": 5384 + }, + { + "epoch": 0.9804314189496678, + "grad_norm": 12.25, + "learning_rate": 4.0533624044284145e-06, + "loss": 1.3196929693222046, + "step": 5386 + }, + { + "epoch": 0.98079548557386, + "grad_norm": 12.875, + "learning_rate": 4.052711052599313e-06, + "loss": 0.611315131187439, + "step": 5388 + }, + { + "epoch": 0.9811595521980523, + "grad_norm": 10.5625, + "learning_rate": 4.05205954628913e-06, + "loss": 0.8820526599884033, + "step": 5390 + }, + { + "epoch": 0.9815236188222445, + "grad_norm": 5.3125, + "learning_rate": 4.051407885593473e-06, + "loss": 1.5229648351669312, + "step": 5392 + }, + { + "epoch": 0.9818876854464367, + "grad_norm": 4.34375, + "learning_rate": 4.050756070607969e-06, + "loss": 1.0040631294250488, + "step": 5394 + }, + { + "epoch": 0.9822517520706289, + "grad_norm": 11.125, + "learning_rate": 4.050104101428271e-06, + "loss": 1.1378288269042969, + "step": 5396 + }, + { + "epoch": 0.9826158186948212, + "grad_norm": 7.03125, + "learning_rate": 4.049451978150052e-06, + "loss": 1.4940667152404785, + "step": 5398 + }, + { + "epoch": 0.9829798853190134, + "grad_norm": 8.4375, + "learning_rate": 4.048799700869007e-06, + "loss": 1.600928544998169, + "step": 5400 + }, + { + "epoch": 0.9833439519432056, + "grad_norm": 5.375, + "learning_rate": 4.048147269680857e-06, + "loss": 1.032188057899475, + "step": 5402 + }, + { + "epoch": 0.9837080185673979, + "grad_norm": 5.40625, + "learning_rate": 4.047494684681343e-06, + "loss": 1.52291738986969, + "step": 5404 + }, + { + "epoch": 0.9840720851915901, + "grad_norm": 10.4375, + "learning_rate": 4.046841945966229e-06, + "loss": 1.5357755422592163, + "step": 5406 + }, + { + "epoch": 0.9844361518157823, + "grad_norm": 11.0, + "learning_rate": 4.046189053631302e-06, + "loss": 1.7469202280044556, + "step": 5408 + }, + { + "epoch": 0.9848002184399746, + "grad_norm": 10.0, + "learning_rate": 4.0455360077723716e-06, + "loss": 1.3371202945709229, + "step": 5410 + }, + { + "epoch": 0.9851642850641668, + "grad_norm": 10.9375, + "learning_rate": 4.044882808485267e-06, + "loss": 1.1834675073623657, + "step": 5412 + }, + { + "epoch": 0.985528351688359, + "grad_norm": 9.875, + "learning_rate": 4.044229455865848e-06, + "loss": 1.4310107231140137, + "step": 5414 + }, + { + "epoch": 0.9858924183125511, + "grad_norm": 22.625, + "learning_rate": 4.043575950009987e-06, + "loss": 1.296782374382019, + "step": 5416 + }, + { + "epoch": 0.9862564849367434, + "grad_norm": 13.1875, + "learning_rate": 4.042922291013584e-06, + "loss": 0.7016429901123047, + "step": 5418 + }, + { + "epoch": 0.9866205515609356, + "grad_norm": 17.75, + "learning_rate": 4.042268478972562e-06, + "loss": 0.45298367738723755, + "step": 5420 + }, + { + "epoch": 0.9869846181851278, + "grad_norm": 12.4375, + "learning_rate": 4.041614513982864e-06, + "loss": 1.768958568572998, + "step": 5422 + }, + { + "epoch": 0.9873486848093201, + "grad_norm": 14.0, + "learning_rate": 4.040960396140457e-06, + "loss": 1.13344407081604, + "step": 5424 + }, + { + "epoch": 0.9877127514335123, + "grad_norm": 9.3125, + "learning_rate": 4.040306125541332e-06, + "loss": 1.4268752336502075, + "step": 5426 + }, + { + "epoch": 0.9880768180577045, + "grad_norm": 6.875, + "learning_rate": 4.039651702281499e-06, + "loss": 1.3852629661560059, + "step": 5428 + }, + { + "epoch": 0.9884408846818968, + "grad_norm": 68.5, + "learning_rate": 4.038997126456992e-06, + "loss": 1.208700180053711, + "step": 5430 + }, + { + "epoch": 0.988804951306089, + "grad_norm": 18.25, + "learning_rate": 4.038342398163866e-06, + "loss": 1.5933505296707153, + "step": 5432 + }, + { + "epoch": 0.9891690179302812, + "grad_norm": 17.875, + "learning_rate": 4.037687517498203e-06, + "loss": 1.0996559858322144, + "step": 5434 + }, + { + "epoch": 0.9895330845544734, + "grad_norm": 9.5, + "learning_rate": 4.037032484556099e-06, + "loss": 1.4849562644958496, + "step": 5436 + }, + { + "epoch": 0.9898971511786657, + "grad_norm": 9.5, + "learning_rate": 4.036377299433683e-06, + "loss": 1.2864630222320557, + "step": 5438 + }, + { + "epoch": 0.9902612178028579, + "grad_norm": 16.625, + "learning_rate": 4.035721962227098e-06, + "loss": 1.2677327394485474, + "step": 5440 + }, + { + "epoch": 0.9906252844270501, + "grad_norm": 23.125, + "learning_rate": 4.035066473032513e-06, + "loss": 0.8988248109817505, + "step": 5442 + }, + { + "epoch": 0.9909893510512424, + "grad_norm": 9.625, + "learning_rate": 4.034410831946117e-06, + "loss": 1.4658727645874023, + "step": 5444 + }, + { + "epoch": 0.9913534176754346, + "grad_norm": 8.9375, + "learning_rate": 4.033755039064124e-06, + "loss": 1.4299671649932861, + "step": 5446 + }, + { + "epoch": 0.9917174842996268, + "grad_norm": 5.96875, + "learning_rate": 4.033099094482769e-06, + "loss": 1.3738458156585693, + "step": 5448 + }, + { + "epoch": 0.9920815509238191, + "grad_norm": 11.25, + "learning_rate": 4.032442998298308e-06, + "loss": 1.3911213874816895, + "step": 5450 + }, + { + "epoch": 0.9924456175480113, + "grad_norm": 8.3125, + "learning_rate": 4.031786750607021e-06, + "loss": 1.5168060064315796, + "step": 5452 + }, + { + "epoch": 0.9928096841722035, + "grad_norm": 11.9375, + "learning_rate": 4.03113035150521e-06, + "loss": 1.37184476852417, + "step": 5454 + }, + { + "epoch": 0.9931737507963957, + "grad_norm": 12.6875, + "learning_rate": 4.0304738010891984e-06, + "loss": 1.438374638557434, + "step": 5456 + }, + { + "epoch": 0.993537817420588, + "grad_norm": 17.0, + "learning_rate": 4.029817099455333e-06, + "loss": 1.4127064943313599, + "step": 5458 + }, + { + "epoch": 0.9939018840447802, + "grad_norm": 44.0, + "learning_rate": 4.029160246699982e-06, + "loss": 2.000234603881836, + "step": 5460 + }, + { + "epoch": 0.9942659506689724, + "grad_norm": 14.25, + "learning_rate": 4.028503242919536e-06, + "loss": 1.509704828262329, + "step": 5462 + }, + { + "epoch": 0.9946300172931647, + "grad_norm": 18.25, + "learning_rate": 4.0278460882104065e-06, + "loss": 1.3360744714736938, + "step": 5464 + }, + { + "epoch": 0.9949940839173569, + "grad_norm": 13.125, + "learning_rate": 4.0271887826690285e-06, + "loss": 1.2061679363250732, + "step": 5466 + }, + { + "epoch": 0.9953581505415491, + "grad_norm": 3.90625, + "learning_rate": 4.02653132639186e-06, + "loss": 1.2287050485610962, + "step": 5468 + }, + { + "epoch": 0.9957222171657414, + "grad_norm": 13.375, + "learning_rate": 4.025873719475379e-06, + "loss": 1.4585390090942383, + "step": 5470 + }, + { + "epoch": 0.9960862837899336, + "grad_norm": 14.0, + "learning_rate": 4.025215962016088e-06, + "loss": 1.3630969524383545, + "step": 5472 + }, + { + "epoch": 0.9964503504141258, + "grad_norm": 18.5, + "learning_rate": 4.024558054110509e-06, + "loss": 0.8193284273147583, + "step": 5474 + }, + { + "epoch": 0.996814417038318, + "grad_norm": 25.875, + "learning_rate": 4.023899995855188e-06, + "loss": 1.4585412740707397, + "step": 5476 + }, + { + "epoch": 0.9971784836625103, + "grad_norm": 4.25, + "learning_rate": 4.023241787346692e-06, + "loss": 1.2779680490493774, + "step": 5478 + }, + { + "epoch": 0.9975425502867025, + "grad_norm": 15.125, + "learning_rate": 4.0225834286816115e-06, + "loss": 1.2345284223556519, + "step": 5480 + }, + { + "epoch": 0.9979066169108947, + "grad_norm": 8.5625, + "learning_rate": 4.021924919956556e-06, + "loss": 1.4153228998184204, + "step": 5482 + }, + { + "epoch": 0.998270683535087, + "grad_norm": 14.4375, + "learning_rate": 4.02126626126816e-06, + "loss": 1.317355751991272, + "step": 5484 + }, + { + "epoch": 0.9986347501592792, + "grad_norm": 11.125, + "learning_rate": 4.020607452713078e-06, + "loss": 1.4046615362167358, + "step": 5486 + }, + { + "epoch": 0.9989988167834714, + "grad_norm": 7.46875, + "learning_rate": 4.0199484943879896e-06, + "loss": 1.197911262512207, + "step": 5488 + }, + { + "epoch": 0.9993628834076635, + "grad_norm": 15.375, + "learning_rate": 4.019289386389593e-06, + "loss": 1.910951018333435, + "step": 5490 + }, + { + "epoch": 0.9997269500318559, + "grad_norm": 17.0, + "learning_rate": 4.01863012881461e-06, + "loss": 1.6107884645462036, + "step": 5492 + }, + { + "epoch": 1.0, + "grad_norm": 17.5, + "learning_rate": 4.017970721759784e-06, + "loss": 1.5087711811065674, + "step": 5494 + }, + { + "epoch": 1.0003640666241922, + "grad_norm": 3.484375, + "learning_rate": 4.0173111653218795e-06, + "loss": 1.4003055095672607, + "step": 5496 + }, + { + "epoch": 1.0007281332483844, + "grad_norm": 32.75, + "learning_rate": 4.0166514595976845e-06, + "loss": 0.9484976530075073, + "step": 5498 + }, + { + "epoch": 1.0010921998725766, + "grad_norm": 9.4375, + "learning_rate": 4.015991604684008e-06, + "loss": 1.5096923112869263, + "step": 5500 + }, + { + "epoch": 1.001456266496769, + "grad_norm": 19.375, + "learning_rate": 4.0153316006776795e-06, + "loss": 1.0394673347473145, + "step": 5502 + }, + { + "epoch": 1.0018203331209612, + "grad_norm": 132.0, + "learning_rate": 4.0146714476755555e-06, + "loss": 1.490920066833496, + "step": 5504 + }, + { + "epoch": 1.0021843997451534, + "grad_norm": 27.125, + "learning_rate": 4.0140111457745076e-06, + "loss": 0.35383227467536926, + "step": 5506 + }, + { + "epoch": 1.0025484663693456, + "grad_norm": 11.75, + "learning_rate": 4.013350695071434e-06, + "loss": 1.3834779262542725, + "step": 5508 + }, + { + "epoch": 1.0029125329935378, + "grad_norm": 4.78125, + "learning_rate": 4.012690095663253e-06, + "loss": 1.3156682252883911, + "step": 5510 + }, + { + "epoch": 1.00327659961773, + "grad_norm": 15.5625, + "learning_rate": 4.012029347646903e-06, + "loss": 1.3250792026519775, + "step": 5512 + }, + { + "epoch": 1.0036406662419222, + "grad_norm": 20.0, + "learning_rate": 4.01136845111935e-06, + "loss": 1.9037601947784424, + "step": 5514 + }, + { + "epoch": 1.0040047328661146, + "grad_norm": 71.5, + "learning_rate": 4.010707406177573e-06, + "loss": 1.2338621616363525, + "step": 5516 + }, + { + "epoch": 1.0043687994903068, + "grad_norm": 19.625, + "learning_rate": 4.010046212918581e-06, + "loss": 1.43930983543396, + "step": 5518 + }, + { + "epoch": 1.004732866114499, + "grad_norm": 6.0, + "learning_rate": 4.009384871439401e-06, + "loss": 1.3999929428100586, + "step": 5520 + }, + { + "epoch": 1.0050969327386912, + "grad_norm": 12.5625, + "learning_rate": 4.008723381837082e-06, + "loss": 1.4799768924713135, + "step": 5522 + }, + { + "epoch": 1.0054609993628834, + "grad_norm": 13.0, + "learning_rate": 4.0080617442086945e-06, + "loss": 1.5514425039291382, + "step": 5524 + }, + { + "epoch": 1.0058250659870756, + "grad_norm": 9.25, + "learning_rate": 4.007399958651331e-06, + "loss": 1.4073677062988281, + "step": 5526 + }, + { + "epoch": 1.006189132611268, + "grad_norm": 17.25, + "learning_rate": 4.006738025262106e-06, + "loss": 1.86566162109375, + "step": 5528 + }, + { + "epoch": 1.0065531992354602, + "grad_norm": 11.3125, + "learning_rate": 4.006075944138157e-06, + "loss": 1.0989599227905273, + "step": 5530 + }, + { + "epoch": 1.0069172658596524, + "grad_norm": 14.875, + "learning_rate": 4.00541371537664e-06, + "loss": 1.3692044019699097, + "step": 5532 + }, + { + "epoch": 1.0072813324838445, + "grad_norm": 242.0, + "learning_rate": 4.004751339074734e-06, + "loss": 1.4603779315948486, + "step": 5534 + }, + { + "epoch": 1.0076453991080367, + "grad_norm": 25.625, + "learning_rate": 4.004088815329641e-06, + "loss": 1.5434331893920898, + "step": 5536 + }, + { + "epoch": 1.008009465732229, + "grad_norm": 7.4375, + "learning_rate": 4.003426144238583e-06, + "loss": 1.4738571643829346, + "step": 5538 + }, + { + "epoch": 1.0083735323564211, + "grad_norm": 18.875, + "learning_rate": 4.002763325898808e-06, + "loss": 1.3381311893463135, + "step": 5540 + }, + { + "epoch": 1.0087375989806135, + "grad_norm": 9.75, + "learning_rate": 4.002100360407576e-06, + "loss": 1.3541488647460938, + "step": 5542 + }, + { + "epoch": 1.0091016656048057, + "grad_norm": 30.875, + "learning_rate": 4.0014372478621775e-06, + "loss": 1.6272519826889038, + "step": 5544 + }, + { + "epoch": 1.009465732228998, + "grad_norm": 6.03125, + "learning_rate": 4.000773988359922e-06, + "loss": 1.1985750198364258, + "step": 5546 + }, + { + "epoch": 1.0098297988531901, + "grad_norm": 2.203125, + "learning_rate": 4.000110581998139e-06, + "loss": 1.1730234622955322, + "step": 5548 + }, + { + "epoch": 1.0101938654773823, + "grad_norm": 25.875, + "learning_rate": 3.9994470288741805e-06, + "loss": 0.9922491312026978, + "step": 5550 + }, + { + "epoch": 1.0105579321015745, + "grad_norm": 18.25, + "learning_rate": 3.998783329085421e-06, + "loss": 1.6758556365966797, + "step": 5552 + }, + { + "epoch": 1.0109219987257667, + "grad_norm": 5.59375, + "learning_rate": 3.998119482729258e-06, + "loss": 1.390196442604065, + "step": 5554 + }, + { + "epoch": 1.0112860653499591, + "grad_norm": 18.375, + "learning_rate": 3.997455489903104e-06, + "loss": 1.6273266077041626, + "step": 5556 + }, + { + "epoch": 1.0116501319741513, + "grad_norm": 5.25, + "learning_rate": 3.9967913507044e-06, + "loss": 1.3647640943527222, + "step": 5558 + }, + { + "epoch": 1.0120141985983435, + "grad_norm": 9.1875, + "learning_rate": 3.996127065230604e-06, + "loss": 1.4864213466644287, + "step": 5560 + }, + { + "epoch": 1.0123782652225357, + "grad_norm": 17.375, + "learning_rate": 3.995462633579199e-06, + "loss": 1.1231017112731934, + "step": 5562 + }, + { + "epoch": 1.012742331846728, + "grad_norm": 4.9375, + "learning_rate": 3.9947980558476865e-06, + "loss": 0.7884931564331055, + "step": 5564 + }, + { + "epoch": 1.01310639847092, + "grad_norm": 20.0, + "learning_rate": 3.9941333321335904e-06, + "loss": 1.5433924198150635, + "step": 5566 + }, + { + "epoch": 1.0134704650951125, + "grad_norm": 13.5625, + "learning_rate": 3.993468462534457e-06, + "loss": 1.4204564094543457, + "step": 5568 + }, + { + "epoch": 1.0138345317193047, + "grad_norm": 30.25, + "learning_rate": 3.992803447147853e-06, + "loss": 1.3641741275787354, + "step": 5570 + }, + { + "epoch": 1.014198598343497, + "grad_norm": 13.5, + "learning_rate": 3.992138286071366e-06, + "loss": 1.415997862815857, + "step": 5572 + }, + { + "epoch": 1.014562664967689, + "grad_norm": 8.0, + "learning_rate": 3.991472979402608e-06, + "loss": 1.17242431640625, + "step": 5574 + }, + { + "epoch": 1.0149267315918813, + "grad_norm": 19.125, + "learning_rate": 3.990807527239206e-06, + "loss": 1.5586891174316406, + "step": 5576 + }, + { + "epoch": 1.0152907982160735, + "grad_norm": 4.71875, + "learning_rate": 3.9901419296788165e-06, + "loss": 1.1179163455963135, + "step": 5578 + }, + { + "epoch": 1.0156548648402657, + "grad_norm": 12.625, + "learning_rate": 3.989476186819111e-06, + "loss": 1.4072048664093018, + "step": 5580 + }, + { + "epoch": 1.016018931464458, + "grad_norm": 12.75, + "learning_rate": 3.988810298757785e-06, + "loss": 1.5453095436096191, + "step": 5582 + }, + { + "epoch": 1.0163829980886503, + "grad_norm": 4.375, + "learning_rate": 3.988144265592556e-06, + "loss": 1.5294172763824463, + "step": 5584 + }, + { + "epoch": 1.0167470647128425, + "grad_norm": 7.375, + "learning_rate": 3.987478087421159e-06, + "loss": 1.491885781288147, + "step": 5586 + }, + { + "epoch": 1.0171111313370347, + "grad_norm": 13.5625, + "learning_rate": 3.986811764341355e-06, + "loss": 1.6509900093078613, + "step": 5588 + }, + { + "epoch": 1.0174751979612269, + "grad_norm": 3.21875, + "learning_rate": 3.986145296450924e-06, + "loss": 0.958034336566925, + "step": 5590 + }, + { + "epoch": 1.017839264585419, + "grad_norm": 15.5625, + "learning_rate": 3.9854786838476674e-06, + "loss": 2.092007875442505, + "step": 5592 + }, + { + "epoch": 1.0182033312096113, + "grad_norm": 99.0, + "learning_rate": 3.984811926629408e-06, + "loss": 1.3249096870422363, + "step": 5594 + }, + { + "epoch": 1.0185673978338037, + "grad_norm": 14.3125, + "learning_rate": 3.9841450248939894e-06, + "loss": 1.4061754941940308, + "step": 5596 + }, + { + "epoch": 1.0189314644579959, + "grad_norm": 11.4375, + "learning_rate": 3.983477978739276e-06, + "loss": 1.425875186920166, + "step": 5598 + }, + { + "epoch": 1.019295531082188, + "grad_norm": 26.875, + "learning_rate": 3.982810788263155e-06, + "loss": 1.3803876638412476, + "step": 5600 + }, + { + "epoch": 1.0196595977063803, + "grad_norm": 22.375, + "learning_rate": 3.982143453563535e-06, + "loss": 1.3369160890579224, + "step": 5602 + }, + { + "epoch": 1.0200236643305725, + "grad_norm": 133.0, + "learning_rate": 3.981475974738343e-06, + "loss": 1.377591609954834, + "step": 5604 + }, + { + "epoch": 1.0203877309547646, + "grad_norm": 9.875, + "learning_rate": 3.980808351885528e-06, + "loss": 1.2054471969604492, + "step": 5606 + }, + { + "epoch": 1.0207517975789568, + "grad_norm": 44.0, + "learning_rate": 3.980140585103064e-06, + "loss": 1.5488436222076416, + "step": 5608 + }, + { + "epoch": 1.0211158642031493, + "grad_norm": 97.0, + "learning_rate": 3.97947267448894e-06, + "loss": 1.4750125408172607, + "step": 5610 + }, + { + "epoch": 1.0214799308273415, + "grad_norm": 19.875, + "learning_rate": 3.978804620141171e-06, + "loss": 1.8069579601287842, + "step": 5612 + }, + { + "epoch": 1.0218439974515336, + "grad_norm": 3.671875, + "learning_rate": 3.97813642215779e-06, + "loss": 1.0305663347244263, + "step": 5614 + }, + { + "epoch": 1.0222080640757258, + "grad_norm": 9.1875, + "learning_rate": 3.9774680806368534e-06, + "loss": 1.0835367441177368, + "step": 5616 + }, + { + "epoch": 1.022572130699918, + "grad_norm": 8.625, + "learning_rate": 3.976799595676438e-06, + "loss": 1.466511607170105, + "step": 5618 + }, + { + "epoch": 1.0229361973241102, + "grad_norm": 3.9375, + "learning_rate": 3.97613096737464e-06, + "loss": 1.1156589984893799, + "step": 5620 + }, + { + "epoch": 1.0233002639483026, + "grad_norm": 10.875, + "learning_rate": 3.9754621958295795e-06, + "loss": 1.5707851648330688, + "step": 5622 + }, + { + "epoch": 1.0236643305724948, + "grad_norm": 13.875, + "learning_rate": 3.974793281139394e-06, + "loss": 1.498622179031372, + "step": 5624 + }, + { + "epoch": 1.024028397196687, + "grad_norm": 23.375, + "learning_rate": 3.974124223402246e-06, + "loss": 1.5071301460266113, + "step": 5626 + }, + { + "epoch": 1.0243924638208792, + "grad_norm": 18.25, + "learning_rate": 3.973455022716314e-06, + "loss": 1.4509986639022827, + "step": 5628 + }, + { + "epoch": 1.0247565304450714, + "grad_norm": 20.0, + "learning_rate": 3.972785679179804e-06, + "loss": 1.3919399976730347, + "step": 5630 + }, + { + "epoch": 1.0251205970692636, + "grad_norm": 12.5, + "learning_rate": 3.972116192890937e-06, + "loss": 1.5151044130325317, + "step": 5632 + }, + { + "epoch": 1.0254846636934558, + "grad_norm": 9.0625, + "learning_rate": 3.97144656394796e-06, + "loss": 1.400679111480713, + "step": 5634 + }, + { + "epoch": 1.0258487303176482, + "grad_norm": 13.25, + "learning_rate": 3.970776792449135e-06, + "loss": 0.9440341591835022, + "step": 5636 + }, + { + "epoch": 1.0262127969418404, + "grad_norm": 10.25, + "learning_rate": 3.970106878492751e-06, + "loss": 1.6199828386306763, + "step": 5638 + }, + { + "epoch": 1.0265768635660326, + "grad_norm": 18.0, + "learning_rate": 3.9694368221771125e-06, + "loss": 0.8532905578613281, + "step": 5640 + }, + { + "epoch": 1.0269409301902248, + "grad_norm": 3.34375, + "learning_rate": 3.96876662360055e-06, + "loss": 0.8992357850074768, + "step": 5642 + }, + { + "epoch": 1.027304996814417, + "grad_norm": 4.5625, + "learning_rate": 3.968096282861412e-06, + "loss": 0.9135205745697021, + "step": 5644 + }, + { + "epoch": 1.0276690634386092, + "grad_norm": 18.75, + "learning_rate": 3.967425800058068e-06, + "loss": 1.770528793334961, + "step": 5646 + }, + { + "epoch": 1.0280331300628014, + "grad_norm": 9.0625, + "learning_rate": 3.966755175288908e-06, + "loss": 1.3742282390594482, + "step": 5648 + }, + { + "epoch": 1.0283971966869938, + "grad_norm": 17.125, + "learning_rate": 3.966084408652344e-06, + "loss": 1.5820229053497314, + "step": 5650 + }, + { + "epoch": 1.028761263311186, + "grad_norm": 9.8125, + "learning_rate": 3.965413500246807e-06, + "loss": 1.4844284057617188, + "step": 5652 + }, + { + "epoch": 1.0291253299353782, + "grad_norm": 8.8125, + "learning_rate": 3.964742450170753e-06, + "loss": 1.593349575996399, + "step": 5654 + }, + { + "epoch": 1.0294893965595704, + "grad_norm": 17.625, + "learning_rate": 3.964071258522654e-06, + "loss": 1.4519903659820557, + "step": 5656 + }, + { + "epoch": 1.0298534631837626, + "grad_norm": 6.21875, + "learning_rate": 3.9633999254010045e-06, + "loss": 1.1237472295761108, + "step": 5658 + }, + { + "epoch": 1.0302175298079548, + "grad_norm": 17.625, + "learning_rate": 3.962728450904321e-06, + "loss": 1.5210973024368286, + "step": 5660 + }, + { + "epoch": 1.030581596432147, + "grad_norm": 7.875, + "learning_rate": 3.9620568351311384e-06, + "loss": 0.661182165145874, + "step": 5662 + }, + { + "epoch": 1.0309456630563394, + "grad_norm": 7.9375, + "learning_rate": 3.961385078180013e-06, + "loss": 1.507887363433838, + "step": 5664 + }, + { + "epoch": 1.0313097296805316, + "grad_norm": 15.125, + "learning_rate": 3.960713180149526e-06, + "loss": 1.4062614440917969, + "step": 5666 + }, + { + "epoch": 1.0316737963047238, + "grad_norm": 8.75, + "learning_rate": 3.960041141138271e-06, + "loss": 1.1013820171356201, + "step": 5668 + }, + { + "epoch": 1.032037862928916, + "grad_norm": 11.25, + "learning_rate": 3.9593689612448706e-06, + "loss": 1.5551948547363281, + "step": 5670 + }, + { + "epoch": 1.0324019295531082, + "grad_norm": 18.625, + "learning_rate": 3.958696640567961e-06, + "loss": 1.3923044204711914, + "step": 5672 + }, + { + "epoch": 1.0327659961773004, + "grad_norm": 7.4375, + "learning_rate": 3.9580241792062066e-06, + "loss": 1.240286946296692, + "step": 5674 + }, + { + "epoch": 1.0331300628014928, + "grad_norm": 40.5, + "learning_rate": 3.957351577258286e-06, + "loss": 1.9395053386688232, + "step": 5676 + }, + { + "epoch": 1.033494129425685, + "grad_norm": 2.40625, + "learning_rate": 3.956678834822902e-06, + "loss": 1.061018705368042, + "step": 5678 + }, + { + "epoch": 1.0338581960498772, + "grad_norm": 14.6875, + "learning_rate": 3.956005951998775e-06, + "loss": 1.4600074291229248, + "step": 5680 + }, + { + "epoch": 1.0342222626740694, + "grad_norm": 3.234375, + "learning_rate": 3.955332928884649e-06, + "loss": 1.0646876096725464, + "step": 5682 + }, + { + "epoch": 1.0345863292982616, + "grad_norm": 10.4375, + "learning_rate": 3.9546597655792884e-06, + "loss": 1.699087381362915, + "step": 5684 + }, + { + "epoch": 1.0349503959224537, + "grad_norm": 13.0625, + "learning_rate": 3.953986462181475e-06, + "loss": 1.546065092086792, + "step": 5686 + }, + { + "epoch": 1.035314462546646, + "grad_norm": 11.1875, + "learning_rate": 3.9533130187900136e-06, + "loss": 0.25471073389053345, + "step": 5688 + }, + { + "epoch": 1.0356785291708384, + "grad_norm": 12.875, + "learning_rate": 3.952639435503732e-06, + "loss": 1.4835538864135742, + "step": 5690 + }, + { + "epoch": 1.0360425957950306, + "grad_norm": 4.9375, + "learning_rate": 3.951965712421473e-06, + "loss": 1.3616836071014404, + "step": 5692 + }, + { + "epoch": 1.0364066624192227, + "grad_norm": 45.5, + "learning_rate": 3.951291849642104e-06, + "loss": 1.5426254272460938, + "step": 5694 + }, + { + "epoch": 1.036770729043415, + "grad_norm": 41.5, + "learning_rate": 3.9506178472645106e-06, + "loss": 0.7989203929901123, + "step": 5696 + }, + { + "epoch": 1.0371347956676071, + "grad_norm": 29.0, + "learning_rate": 3.949943705387601e-06, + "loss": 0.9459846019744873, + "step": 5698 + }, + { + "epoch": 1.0374988622917993, + "grad_norm": 9.125, + "learning_rate": 3.949269424110304e-06, + "loss": 1.3828026056289673, + "step": 5700 + }, + { + "epoch": 1.0378629289159915, + "grad_norm": 14.5, + "learning_rate": 3.948595003531564e-06, + "loss": 1.2757847309112549, + "step": 5702 + }, + { + "epoch": 1.038226995540184, + "grad_norm": 7.625, + "learning_rate": 3.947920443750351e-06, + "loss": 1.0518667697906494, + "step": 5704 + }, + { + "epoch": 1.0385910621643761, + "grad_norm": 27.0, + "learning_rate": 3.947245744865657e-06, + "loss": 1.598201036453247, + "step": 5706 + }, + { + "epoch": 1.0389551287885683, + "grad_norm": 11.9375, + "learning_rate": 3.9465709069764864e-06, + "loss": 0.9058932065963745, + "step": 5708 + }, + { + "epoch": 1.0393191954127605, + "grad_norm": 7.21875, + "learning_rate": 3.945895930181873e-06, + "loss": 1.2411967515945435, + "step": 5710 + }, + { + "epoch": 1.0396832620369527, + "grad_norm": 77.0, + "learning_rate": 3.945220814580865e-06, + "loss": 1.7014228105545044, + "step": 5712 + }, + { + "epoch": 1.040047328661145, + "grad_norm": 7.90625, + "learning_rate": 3.944545560272532e-06, + "loss": 1.5376970767974854, + "step": 5714 + }, + { + "epoch": 1.040411395285337, + "grad_norm": 3.4375, + "learning_rate": 3.943870167355968e-06, + "loss": 1.106825351715088, + "step": 5716 + }, + { + "epoch": 1.0407754619095295, + "grad_norm": 13.375, + "learning_rate": 3.94319463593028e-06, + "loss": 1.4561913013458252, + "step": 5718 + }, + { + "epoch": 1.0411395285337217, + "grad_norm": 15.1875, + "learning_rate": 3.942518966094603e-06, + "loss": 1.4524739980697632, + "step": 5720 + }, + { + "epoch": 1.041503595157914, + "grad_norm": 9.0, + "learning_rate": 3.941843157948086e-06, + "loss": 1.5416932106018066, + "step": 5722 + }, + { + "epoch": 1.041867661782106, + "grad_norm": 5.84375, + "learning_rate": 3.941167211589904e-06, + "loss": 1.21150541305542, + "step": 5724 + }, + { + "epoch": 1.0422317284062983, + "grad_norm": 8.125, + "learning_rate": 3.940491127119247e-06, + "loss": 1.4245576858520508, + "step": 5726 + }, + { + "epoch": 1.0425957950304905, + "grad_norm": 13.1875, + "learning_rate": 3.939814904635329e-06, + "loss": 1.2398494482040405, + "step": 5728 + }, + { + "epoch": 1.042959861654683, + "grad_norm": 16.875, + "learning_rate": 3.939138544237382e-06, + "loss": 1.990612506866455, + "step": 5730 + }, + { + "epoch": 1.043323928278875, + "grad_norm": 8.4375, + "learning_rate": 3.93846204602466e-06, + "loss": 0.8355557918548584, + "step": 5732 + }, + { + "epoch": 1.0436879949030673, + "grad_norm": 10.0, + "learning_rate": 3.9377854100964364e-06, + "loss": 1.5349178314208984, + "step": 5734 + }, + { + "epoch": 1.0440520615272595, + "grad_norm": 6.4375, + "learning_rate": 3.937108636552004e-06, + "loss": 0.882619321346283, + "step": 5736 + }, + { + "epoch": 1.0444161281514517, + "grad_norm": 19.875, + "learning_rate": 3.936431725490676e-06, + "loss": 1.2917356491088867, + "step": 5738 + }, + { + "epoch": 1.0447801947756439, + "grad_norm": 11.6875, + "learning_rate": 3.9357546770117885e-06, + "loss": 1.7410742044448853, + "step": 5740 + }, + { + "epoch": 1.045144261399836, + "grad_norm": 11.1875, + "learning_rate": 3.935077491214694e-06, + "loss": 1.5905922651290894, + "step": 5742 + }, + { + "epoch": 1.0455083280240285, + "grad_norm": 11.6875, + "learning_rate": 3.934400168198768e-06, + "loss": 1.0633156299591064, + "step": 5744 + }, + { + "epoch": 1.0458723946482207, + "grad_norm": 6.5625, + "learning_rate": 3.933722708063402e-06, + "loss": 1.1479876041412354, + "step": 5746 + }, + { + "epoch": 1.0462364612724129, + "grad_norm": 52.25, + "learning_rate": 3.933045110908015e-06, + "loss": 2.0883524417877197, + "step": 5748 + }, + { + "epoch": 1.046600527896605, + "grad_norm": 14.3125, + "learning_rate": 3.932367376832038e-06, + "loss": 1.606029748916626, + "step": 5750 + }, + { + "epoch": 1.0469645945207973, + "grad_norm": 13.3125, + "learning_rate": 3.931689505934928e-06, + "loss": 0.7816003561019897, + "step": 5752 + }, + { + "epoch": 1.0473286611449895, + "grad_norm": 8.1875, + "learning_rate": 3.931011498316158e-06, + "loss": 1.4243874549865723, + "step": 5754 + }, + { + "epoch": 1.0476927277691817, + "grad_norm": 30.5, + "learning_rate": 3.930333354075223e-06, + "loss": 2.042520761489868, + "step": 5756 + }, + { + "epoch": 1.048056794393374, + "grad_norm": 6.09375, + "learning_rate": 3.92965507331164e-06, + "loss": 1.138911485671997, + "step": 5758 + }, + { + "epoch": 1.0484208610175663, + "grad_norm": 13.75, + "learning_rate": 3.9289766561249425e-06, + "loss": 0.16640490293502808, + "step": 5760 + }, + { + "epoch": 1.0487849276417585, + "grad_norm": 11.3125, + "learning_rate": 3.928298102614685e-06, + "loss": 1.4840346574783325, + "step": 5762 + }, + { + "epoch": 1.0491489942659507, + "grad_norm": 5.15625, + "learning_rate": 3.9276194128804425e-06, + "loss": 0.917036771774292, + "step": 5764 + }, + { + "epoch": 1.0495130608901428, + "grad_norm": 7.15625, + "learning_rate": 3.9269405870218115e-06, + "loss": 1.4044708013534546, + "step": 5766 + }, + { + "epoch": 1.049877127514335, + "grad_norm": 132.0, + "learning_rate": 3.926261625138404e-06, + "loss": 0.7152976393699646, + "step": 5768 + }, + { + "epoch": 1.0502411941385275, + "grad_norm": 7.3125, + "learning_rate": 3.92558252732986e-06, + "loss": 0.7644884586334229, + "step": 5770 + }, + { + "epoch": 1.0506052607627197, + "grad_norm": 3.9375, + "learning_rate": 3.9249032936958285e-06, + "loss": 1.206880807876587, + "step": 5772 + }, + { + "epoch": 1.0509693273869118, + "grad_norm": 21.125, + "learning_rate": 3.924223924335988e-06, + "loss": 0.6048542857170105, + "step": 5774 + }, + { + "epoch": 1.051333394011104, + "grad_norm": 28.125, + "learning_rate": 3.923544419350033e-06, + "loss": 1.5641148090362549, + "step": 5776 + }, + { + "epoch": 1.0516974606352962, + "grad_norm": 13.8125, + "learning_rate": 3.922864778837675e-06, + "loss": 1.6029363870620728, + "step": 5778 + }, + { + "epoch": 1.0520615272594884, + "grad_norm": 21.125, + "learning_rate": 3.922185002898652e-06, + "loss": 1.8753116130828857, + "step": 5780 + }, + { + "epoch": 1.0524255938836806, + "grad_norm": 4.78125, + "learning_rate": 3.921505091632718e-06, + "loss": 1.0220108032226562, + "step": 5782 + }, + { + "epoch": 1.052789660507873, + "grad_norm": 21.25, + "learning_rate": 3.920825045139646e-06, + "loss": 2.271599054336548, + "step": 5784 + }, + { + "epoch": 1.0531537271320652, + "grad_norm": 44.0, + "learning_rate": 3.920144863519228e-06, + "loss": 2.0198800563812256, + "step": 5786 + }, + { + "epoch": 1.0535177937562574, + "grad_norm": 27.75, + "learning_rate": 3.919464546871283e-06, + "loss": 1.8812106847763062, + "step": 5788 + }, + { + "epoch": 1.0538818603804496, + "grad_norm": 29.0, + "learning_rate": 3.918784095295642e-06, + "loss": 1.1237467527389526, + "step": 5790 + }, + { + "epoch": 1.0542459270046418, + "grad_norm": 10.8125, + "learning_rate": 3.918103508892157e-06, + "loss": 1.5218358039855957, + "step": 5792 + }, + { + "epoch": 1.054609993628834, + "grad_norm": 12.5, + "learning_rate": 3.917422787760704e-06, + "loss": 1.8341076374053955, + "step": 5794 + }, + { + "epoch": 1.0549740602530262, + "grad_norm": 4.1875, + "learning_rate": 3.916741932001173e-06, + "loss": 1.0572822093963623, + "step": 5796 + }, + { + "epoch": 1.0553381268772186, + "grad_norm": 7.6875, + "learning_rate": 3.916060941713481e-06, + "loss": 1.036707878112793, + "step": 5798 + }, + { + "epoch": 1.0557021935014108, + "grad_norm": 23.5, + "learning_rate": 3.915379816997558e-06, + "loss": 1.1157270669937134, + "step": 5800 + }, + { + "epoch": 1.056066260125603, + "grad_norm": 10.625, + "learning_rate": 3.914698557953355e-06, + "loss": 1.7082679271697998, + "step": 5802 + }, + { + "epoch": 1.0564303267497952, + "grad_norm": 10.0, + "learning_rate": 3.914017164680847e-06, + "loss": 1.4446851015090942, + "step": 5804 + }, + { + "epoch": 1.0567943933739874, + "grad_norm": 37.5, + "learning_rate": 3.913335637280024e-06, + "loss": 0.7253285050392151, + "step": 5806 + }, + { + "epoch": 1.0571584599981796, + "grad_norm": 11.875, + "learning_rate": 3.912653975850897e-06, + "loss": 1.3953938484191895, + "step": 5808 + }, + { + "epoch": 1.057522526622372, + "grad_norm": 13.625, + "learning_rate": 3.911972180493499e-06, + "loss": 1.1639409065246582, + "step": 5810 + }, + { + "epoch": 1.0578865932465642, + "grad_norm": 17.25, + "learning_rate": 3.9112902513078775e-06, + "loss": 2.2260875701904297, + "step": 5812 + }, + { + "epoch": 1.0582506598707564, + "grad_norm": 13.625, + "learning_rate": 3.910608188394106e-06, + "loss": 1.5188775062561035, + "step": 5814 + }, + { + "epoch": 1.0586147264949486, + "grad_norm": 19.0, + "learning_rate": 3.909925991852274e-06, + "loss": 0.23688673973083496, + "step": 5816 + }, + { + "epoch": 1.0589787931191408, + "grad_norm": 14.125, + "learning_rate": 3.909243661782488e-06, + "loss": 1.7974152565002441, + "step": 5818 + }, + { + "epoch": 1.059342859743333, + "grad_norm": 28.5, + "learning_rate": 3.908561198284881e-06, + "loss": 1.3932104110717773, + "step": 5820 + }, + { + "epoch": 1.0597069263675252, + "grad_norm": 22.5, + "learning_rate": 3.9078786014596e-06, + "loss": 1.7903791666030884, + "step": 5822 + }, + { + "epoch": 1.0600709929917176, + "grad_norm": 22.25, + "learning_rate": 3.907195871406813e-06, + "loss": 1.9884388446807861, + "step": 5824 + }, + { + "epoch": 1.0604350596159098, + "grad_norm": 11.5625, + "learning_rate": 3.90651300822671e-06, + "loss": 1.666649580001831, + "step": 5826 + }, + { + "epoch": 1.060799126240102, + "grad_norm": 8.8125, + "learning_rate": 3.905830012019496e-06, + "loss": 1.3741055727005005, + "step": 5828 + }, + { + "epoch": 1.0611631928642942, + "grad_norm": 24.75, + "learning_rate": 3.905146882885399e-06, + "loss": 1.3806006908416748, + "step": 5830 + }, + { + "epoch": 1.0615272594884864, + "grad_norm": 26.625, + "learning_rate": 3.904463620924665e-06, + "loss": 1.460968017578125, + "step": 5832 + }, + { + "epoch": 1.0618913261126786, + "grad_norm": 33.5, + "learning_rate": 3.903780226237559e-06, + "loss": 1.4285956621170044, + "step": 5834 + }, + { + "epoch": 1.0622553927368708, + "grad_norm": 4.34375, + "learning_rate": 3.9030966989243675e-06, + "loss": 1.3044016361236572, + "step": 5836 + }, + { + "epoch": 1.0626194593610632, + "grad_norm": 20.25, + "learning_rate": 3.9024130390853975e-06, + "loss": 1.442693829536438, + "step": 5838 + }, + { + "epoch": 1.0629835259852554, + "grad_norm": 13.5, + "learning_rate": 3.90172924682097e-06, + "loss": 1.4817867279052734, + "step": 5840 + }, + { + "epoch": 1.0633475926094476, + "grad_norm": 19.125, + "learning_rate": 3.901045322231429e-06, + "loss": 1.949110746383667, + "step": 5842 + }, + { + "epoch": 1.0637116592336398, + "grad_norm": 10.5625, + "learning_rate": 3.9003612654171395e-06, + "loss": 1.528233289718628, + "step": 5844 + }, + { + "epoch": 1.064075725857832, + "grad_norm": 12.1875, + "learning_rate": 3.899677076478483e-06, + "loss": 1.6203858852386475, + "step": 5846 + }, + { + "epoch": 1.0644397924820241, + "grad_norm": 7.875, + "learning_rate": 3.898992755515862e-06, + "loss": 1.0835732221603394, + "step": 5848 + }, + { + "epoch": 1.0648038591062163, + "grad_norm": 21.0, + "learning_rate": 3.898308302629697e-06, + "loss": 1.5007903575897217, + "step": 5850 + }, + { + "epoch": 1.0651679257304087, + "grad_norm": 14.875, + "learning_rate": 3.89762371792043e-06, + "loss": 1.658623218536377, + "step": 5852 + }, + { + "epoch": 1.065531992354601, + "grad_norm": 13.25, + "learning_rate": 3.896939001488519e-06, + "loss": 1.3804954290390015, + "step": 5854 + }, + { + "epoch": 1.0658960589787931, + "grad_norm": 11.125, + "learning_rate": 3.896254153434444e-06, + "loss": 1.5994998216629028, + "step": 5856 + }, + { + "epoch": 1.0662601256029853, + "grad_norm": 6.125, + "learning_rate": 3.895569173858705e-06, + "loss": 1.1914063692092896, + "step": 5858 + }, + { + "epoch": 1.0666241922271775, + "grad_norm": 6.1875, + "learning_rate": 3.89488406286182e-06, + "loss": 1.1871479749679565, + "step": 5860 + }, + { + "epoch": 1.0669882588513697, + "grad_norm": 5.5625, + "learning_rate": 3.894198820544325e-06, + "loss": 1.380682349205017, + "step": 5862 + }, + { + "epoch": 1.0673523254755621, + "grad_norm": 9.8125, + "learning_rate": 3.893513447006776e-06, + "loss": 0.861020565032959, + "step": 5864 + }, + { + "epoch": 1.0677163920997543, + "grad_norm": 33.25, + "learning_rate": 3.89282794234975e-06, + "loss": 1.5483207702636719, + "step": 5866 + }, + { + "epoch": 1.0680804587239465, + "grad_norm": 13.0625, + "learning_rate": 3.892142306673842e-06, + "loss": 1.4005470275878906, + "step": 5868 + }, + { + "epoch": 1.0684445253481387, + "grad_norm": 17.75, + "learning_rate": 3.891456540079667e-06, + "loss": 1.4869873523712158, + "step": 5870 + }, + { + "epoch": 1.068808591972331, + "grad_norm": 10.8125, + "learning_rate": 3.890770642667856e-06, + "loss": 1.5940215587615967, + "step": 5872 + }, + { + "epoch": 1.069172658596523, + "grad_norm": 8.9375, + "learning_rate": 3.890084614539063e-06, + "loss": 1.8276211023330688, + "step": 5874 + }, + { + "epoch": 1.0695367252207153, + "grad_norm": 12.0625, + "learning_rate": 3.889398455793962e-06, + "loss": 1.4968147277832031, + "step": 5876 + }, + { + "epoch": 1.0699007918449077, + "grad_norm": 17.25, + "learning_rate": 3.88871216653324e-06, + "loss": 1.6963486671447754, + "step": 5878 + }, + { + "epoch": 1.0702648584691, + "grad_norm": 9.0, + "learning_rate": 3.8880257468576114e-06, + "loss": 1.1339311599731445, + "step": 5880 + }, + { + "epoch": 1.070628925093292, + "grad_norm": 20.375, + "learning_rate": 3.887339196867801e-06, + "loss": 0.6718922257423401, + "step": 5882 + }, + { + "epoch": 1.0709929917174843, + "grad_norm": 6.125, + "learning_rate": 3.8866525166645606e-06, + "loss": 1.4228242635726929, + "step": 5884 + }, + { + "epoch": 1.0713570583416765, + "grad_norm": 9.375, + "learning_rate": 3.885965706348657e-06, + "loss": 1.4468133449554443, + "step": 5886 + }, + { + "epoch": 1.0717211249658687, + "grad_norm": 9.9375, + "learning_rate": 3.885278766020876e-06, + "loss": 1.1995999813079834, + "step": 5888 + }, + { + "epoch": 1.0720851915900609, + "grad_norm": 9.6875, + "learning_rate": 3.884591695782023e-06, + "loss": 1.6023592948913574, + "step": 5890 + }, + { + "epoch": 1.0724492582142533, + "grad_norm": 15.0, + "learning_rate": 3.883904495732925e-06, + "loss": 1.5041823387145996, + "step": 5892 + }, + { + "epoch": 1.0728133248384455, + "grad_norm": 4.71875, + "learning_rate": 3.883217165974423e-06, + "loss": 0.845273494720459, + "step": 5894 + }, + { + "epoch": 1.0731773914626377, + "grad_norm": 46.5, + "learning_rate": 3.882529706607383e-06, + "loss": 1.9611445665359497, + "step": 5896 + }, + { + "epoch": 1.0735414580868299, + "grad_norm": 11.4375, + "learning_rate": 3.8818421177326835e-06, + "loss": 1.5844045877456665, + "step": 5898 + }, + { + "epoch": 1.073905524711022, + "grad_norm": 8.0625, + "learning_rate": 3.881154399451228e-06, + "loss": 1.4252173900604248, + "step": 5900 + }, + { + "epoch": 1.0742695913352143, + "grad_norm": 8.1875, + "learning_rate": 3.880466551863935e-06, + "loss": 1.6375072002410889, + "step": 5902 + }, + { + "epoch": 1.0746336579594065, + "grad_norm": 11.875, + "learning_rate": 3.879778575071744e-06, + "loss": 1.4291961193084717, + "step": 5904 + }, + { + "epoch": 1.0749977245835989, + "grad_norm": 12.5, + "learning_rate": 3.879090469175613e-06, + "loss": 1.6476150751113892, + "step": 5906 + }, + { + "epoch": 1.075361791207791, + "grad_norm": 9.0625, + "learning_rate": 3.878402234276517e-06, + "loss": 1.1103986501693726, + "step": 5908 + }, + { + "epoch": 1.0757258578319833, + "grad_norm": 22.75, + "learning_rate": 3.877713870475454e-06, + "loss": 1.3158318996429443, + "step": 5910 + }, + { + "epoch": 1.0760899244561755, + "grad_norm": 19.875, + "learning_rate": 3.877025377873437e-06, + "loss": 2.143655776977539, + "step": 5912 + }, + { + "epoch": 1.0764539910803677, + "grad_norm": 12.625, + "learning_rate": 3.8763367565715e-06, + "loss": 1.40102219581604, + "step": 5914 + }, + { + "epoch": 1.0768180577045599, + "grad_norm": 6.40625, + "learning_rate": 3.875648006670696e-06, + "loss": 1.3365706205368042, + "step": 5916 + }, + { + "epoch": 1.0771821243287523, + "grad_norm": 9.0, + "learning_rate": 3.874959128272096e-06, + "loss": 1.4652103185653687, + "step": 5918 + }, + { + "epoch": 1.0775461909529445, + "grad_norm": 6.4375, + "learning_rate": 3.874270121476789e-06, + "loss": 1.1194751262664795, + "step": 5920 + }, + { + "epoch": 1.0779102575771367, + "grad_norm": 7.34375, + "learning_rate": 3.873580986385884e-06, + "loss": 1.4235799312591553, + "step": 5922 + }, + { + "epoch": 1.0782743242013288, + "grad_norm": 11.25, + "learning_rate": 3.872891723100512e-06, + "loss": 1.8807854652404785, + "step": 5924 + }, + { + "epoch": 1.078638390825521, + "grad_norm": 15.3125, + "learning_rate": 3.872202331721815e-06, + "loss": 1.4013569355010986, + "step": 5926 + }, + { + "epoch": 1.0790024574497132, + "grad_norm": 9.25, + "learning_rate": 3.871512812350962e-06, + "loss": 1.485183835029602, + "step": 5928 + }, + { + "epoch": 1.0793665240739054, + "grad_norm": 16.375, + "learning_rate": 3.8708231650891345e-06, + "loss": 1.5554332733154297, + "step": 5930 + }, + { + "epoch": 1.0797305906980978, + "grad_norm": 12.5625, + "learning_rate": 3.870133390037537e-06, + "loss": 1.461188554763794, + "step": 5932 + }, + { + "epoch": 1.08009465732229, + "grad_norm": 19.625, + "learning_rate": 3.869443487297392e-06, + "loss": 1.625997543334961, + "step": 5934 + }, + { + "epoch": 1.0804587239464822, + "grad_norm": 2.609375, + "learning_rate": 3.868753456969937e-06, + "loss": 1.266181230545044, + "step": 5936 + }, + { + "epoch": 1.0808227905706744, + "grad_norm": 43.25, + "learning_rate": 3.868063299156434e-06, + "loss": 2.1645095348358154, + "step": 5938 + }, + { + "epoch": 1.0811868571948666, + "grad_norm": 82.5, + "learning_rate": 3.867373013958159e-06, + "loss": 1.5211347341537476, + "step": 5940 + }, + { + "epoch": 1.0815509238190588, + "grad_norm": 4.25, + "learning_rate": 3.86668260147641e-06, + "loss": 0.9621850252151489, + "step": 5942 + }, + { + "epoch": 1.081914990443251, + "grad_norm": 15.5, + "learning_rate": 3.865992061812501e-06, + "loss": 1.035224199295044, + "step": 5944 + }, + { + "epoch": 1.0822790570674434, + "grad_norm": 17.125, + "learning_rate": 3.865301395067768e-06, + "loss": 1.3911495208740234, + "step": 5946 + }, + { + "epoch": 1.0826431236916356, + "grad_norm": 31.625, + "learning_rate": 3.864610601343562e-06, + "loss": 1.8952548503875732, + "step": 5948 + }, + { + "epoch": 1.0830071903158278, + "grad_norm": 12.0, + "learning_rate": 3.863919680741253e-06, + "loss": 1.6181720495224, + "step": 5950 + }, + { + "epoch": 1.08337125694002, + "grad_norm": 14.5, + "learning_rate": 3.863228633362232e-06, + "loss": 0.6717743873596191, + "step": 5952 + }, + { + "epoch": 1.0837353235642122, + "grad_norm": 19.75, + "learning_rate": 3.862537459307908e-06, + "loss": 1.3409478664398193, + "step": 5954 + }, + { + "epoch": 1.0840993901884044, + "grad_norm": 24.375, + "learning_rate": 3.861846158679707e-06, + "loss": 2.1179358959198, + "step": 5956 + }, + { + "epoch": 1.0844634568125966, + "grad_norm": 8.0, + "learning_rate": 3.861154731579075e-06, + "loss": 1.4280810356140137, + "step": 5958 + }, + { + "epoch": 1.084827523436789, + "grad_norm": 37.25, + "learning_rate": 3.8604631781074755e-06, + "loss": 1.9865317344665527, + "step": 5960 + }, + { + "epoch": 1.0851915900609812, + "grad_norm": 10.4375, + "learning_rate": 3.859771498366392e-06, + "loss": 1.5203263759613037, + "step": 5962 + }, + { + "epoch": 1.0855556566851734, + "grad_norm": 24.375, + "learning_rate": 3.859079692457327e-06, + "loss": 1.3228542804718018, + "step": 5964 + }, + { + "epoch": 1.0859197233093656, + "grad_norm": 13.5, + "learning_rate": 3.858387760481797e-06, + "loss": 0.8345619440078735, + "step": 5966 + }, + { + "epoch": 1.0862837899335578, + "grad_norm": 8.375, + "learning_rate": 3.857695702541343e-06, + "loss": 1.1953986883163452, + "step": 5968 + }, + { + "epoch": 1.08664785655775, + "grad_norm": 20.625, + "learning_rate": 3.85700351873752e-06, + "loss": 0.8473007678985596, + "step": 5970 + }, + { + "epoch": 1.0870119231819424, + "grad_norm": 7.53125, + "learning_rate": 3.856311209171904e-06, + "loss": 1.0867971181869507, + "step": 5972 + }, + { + "epoch": 1.0873759898061346, + "grad_norm": 24.875, + "learning_rate": 3.855618773946087e-06, + "loss": 1.3933706283569336, + "step": 5974 + }, + { + "epoch": 1.0877400564303268, + "grad_norm": 9.625, + "learning_rate": 3.854926213161684e-06, + "loss": 1.458542823791504, + "step": 5976 + }, + { + "epoch": 1.088104123054519, + "grad_norm": 16.0, + "learning_rate": 3.8542335269203235e-06, + "loss": 1.3841851949691772, + "step": 5978 + }, + { + "epoch": 1.0884681896787112, + "grad_norm": 46.25, + "learning_rate": 3.853540715323655e-06, + "loss": 1.2392947673797607, + "step": 5980 + }, + { + "epoch": 1.0888322563029034, + "grad_norm": 11.8125, + "learning_rate": 3.852847778473345e-06, + "loss": 1.506159782409668, + "step": 5982 + }, + { + "epoch": 1.0891963229270956, + "grad_norm": 9.4375, + "learning_rate": 3.8521547164710805e-06, + "loss": 1.456261157989502, + "step": 5984 + }, + { + "epoch": 1.089560389551288, + "grad_norm": 20.75, + "learning_rate": 3.8514615294185656e-06, + "loss": 2.058417797088623, + "step": 5986 + }, + { + "epoch": 1.0899244561754802, + "grad_norm": 20.125, + "learning_rate": 3.850768217417521e-06, + "loss": 1.8634366989135742, + "step": 5988 + }, + { + "epoch": 1.0902885227996724, + "grad_norm": 21.25, + "learning_rate": 3.850074780569688e-06, + "loss": 1.9875717163085938, + "step": 5990 + }, + { + "epoch": 1.0906525894238646, + "grad_norm": 27.25, + "learning_rate": 3.8493812189768266e-06, + "loss": 0.8094217777252197, + "step": 5992 + }, + { + "epoch": 1.0910166560480568, + "grad_norm": 18.5, + "learning_rate": 3.848687532740713e-06, + "loss": 1.0909367799758911, + "step": 5994 + }, + { + "epoch": 1.091380722672249, + "grad_norm": 25.125, + "learning_rate": 3.847993721963143e-06, + "loss": 1.292501449584961, + "step": 5996 + }, + { + "epoch": 1.0917447892964414, + "grad_norm": 3.828125, + "learning_rate": 3.847299786745931e-06, + "loss": 0.9557967185974121, + "step": 5998 + }, + { + "epoch": 1.0921088559206336, + "grad_norm": 16.75, + "learning_rate": 3.846605727190911e-06, + "loss": 1.5790600776672363, + "step": 6000 + }, + { + "epoch": 1.0924729225448258, + "grad_norm": 15.0, + "learning_rate": 3.845911543399931e-06, + "loss": 0.7537775039672852, + "step": 6002 + }, + { + "epoch": 1.092836989169018, + "grad_norm": 15.1875, + "learning_rate": 3.8452172354748585e-06, + "loss": 1.8361414670944214, + "step": 6004 + }, + { + "epoch": 1.0932010557932101, + "grad_norm": 24.25, + "learning_rate": 3.844522803517583e-06, + "loss": 1.5165725946426392, + "step": 6006 + }, + { + "epoch": 1.0935651224174023, + "grad_norm": 28.375, + "learning_rate": 3.84382824763001e-06, + "loss": 1.5054811239242554, + "step": 6008 + }, + { + "epoch": 1.0939291890415945, + "grad_norm": 13.5625, + "learning_rate": 3.8431335679140595e-06, + "loss": 0.9894658923149109, + "step": 6010 + }, + { + "epoch": 1.0942932556657867, + "grad_norm": 14.3125, + "learning_rate": 3.842438764471674e-06, + "loss": 1.6074790954589844, + "step": 6012 + }, + { + "epoch": 1.0946573222899791, + "grad_norm": 9.5625, + "learning_rate": 3.841743837404815e-06, + "loss": 1.5319018363952637, + "step": 6014 + }, + { + "epoch": 1.0950213889141713, + "grad_norm": 27.625, + "learning_rate": 3.84104878681546e-06, + "loss": 1.6745892763137817, + "step": 6016 + }, + { + "epoch": 1.0953854555383635, + "grad_norm": 48.0, + "learning_rate": 3.840353612805604e-06, + "loss": 2.157043695449829, + "step": 6018 + }, + { + "epoch": 1.0957495221625557, + "grad_norm": 8.25, + "learning_rate": 3.83965831547726e-06, + "loss": 1.0617197751998901, + "step": 6020 + }, + { + "epoch": 1.096113588786748, + "grad_norm": 19.0, + "learning_rate": 3.838962894932462e-06, + "loss": 1.6075338125228882, + "step": 6022 + }, + { + "epoch": 1.09647765541094, + "grad_norm": 15.5, + "learning_rate": 3.838267351273258e-06, + "loss": 1.4352613687515259, + "step": 6024 + }, + { + "epoch": 1.0968417220351325, + "grad_norm": 4.0625, + "learning_rate": 3.837571684601718e-06, + "loss": 1.0723764896392822, + "step": 6026 + }, + { + "epoch": 1.0972057886593247, + "grad_norm": 2.890625, + "learning_rate": 3.836875895019928e-06, + "loss": 1.1767336130142212, + "step": 6028 + }, + { + "epoch": 1.097569855283517, + "grad_norm": 17.0, + "learning_rate": 3.836179982629992e-06, + "loss": 1.2921134233474731, + "step": 6030 + }, + { + "epoch": 1.097933921907709, + "grad_norm": 12.3125, + "learning_rate": 3.8354839475340325e-06, + "loss": 1.462576985359192, + "step": 6032 + }, + { + "epoch": 1.0982979885319013, + "grad_norm": 5.09375, + "learning_rate": 3.8347877898341875e-06, + "loss": 1.1341749429702759, + "step": 6034 + }, + { + "epoch": 1.0986620551560935, + "grad_norm": 4.9375, + "learning_rate": 3.834091509632619e-06, + "loss": 1.294270396232605, + "step": 6036 + }, + { + "epoch": 1.0990261217802857, + "grad_norm": 4.125, + "learning_rate": 3.833395107031503e-06, + "loss": 1.5328009128570557, + "step": 6038 + }, + { + "epoch": 1.099390188404478, + "grad_norm": 4.59375, + "learning_rate": 3.832698582133031e-06, + "loss": 1.438232183456421, + "step": 6040 + }, + { + "epoch": 1.0997542550286703, + "grad_norm": 2.84375, + "learning_rate": 3.8320019350394165e-06, + "loss": 1.1747462749481201, + "step": 6042 + }, + { + "epoch": 1.1001183216528625, + "grad_norm": 25.875, + "learning_rate": 3.831305165852891e-06, + "loss": 1.3242974281311035, + "step": 6044 + }, + { + "epoch": 1.1004823882770547, + "grad_norm": 21.75, + "learning_rate": 3.8306082746757e-06, + "loss": 0.571100115776062, + "step": 6046 + }, + { + "epoch": 1.1008464549012469, + "grad_norm": 8.5, + "learning_rate": 3.829911261610112e-06, + "loss": 1.338200569152832, + "step": 6048 + }, + { + "epoch": 1.101210521525439, + "grad_norm": 10.4375, + "learning_rate": 3.829214126758409e-06, + "loss": 1.8243277072906494, + "step": 6050 + }, + { + "epoch": 1.1015745881496315, + "grad_norm": 15.6875, + "learning_rate": 3.828516870222894e-06, + "loss": 1.8486000299453735, + "step": 6052 + }, + { + "epoch": 1.1019386547738237, + "grad_norm": 7.8125, + "learning_rate": 3.827819492105885e-06, + "loss": 1.3602410554885864, + "step": 6054 + }, + { + "epoch": 1.1023027213980159, + "grad_norm": 37.25, + "learning_rate": 3.827121992509721e-06, + "loss": 1.759181261062622, + "step": 6056 + }, + { + "epoch": 1.102666788022208, + "grad_norm": 18.375, + "learning_rate": 3.826424371536756e-06, + "loss": 1.0174120664596558, + "step": 6058 + }, + { + "epoch": 1.1030308546464003, + "grad_norm": 12.9375, + "learning_rate": 3.8257266292893655e-06, + "loss": 1.3128403425216675, + "step": 6060 + }, + { + "epoch": 1.1033949212705925, + "grad_norm": 9.4375, + "learning_rate": 3.8250287658699366e-06, + "loss": 1.3344205617904663, + "step": 6062 + }, + { + "epoch": 1.1037589878947847, + "grad_norm": 39.75, + "learning_rate": 3.824330781380882e-06, + "loss": 0.82818603515625, + "step": 6064 + }, + { + "epoch": 1.104123054518977, + "grad_norm": 12.4375, + "learning_rate": 3.823632675924623e-06, + "loss": 1.4115170240402222, + "step": 6066 + }, + { + "epoch": 1.1044871211431693, + "grad_norm": 12.4375, + "learning_rate": 3.8229344496036094e-06, + "loss": 1.4783213138580322, + "step": 6068 + }, + { + "epoch": 1.1048511877673615, + "grad_norm": 18.125, + "learning_rate": 3.822236102520299e-06, + "loss": 1.4205418825149536, + "step": 6070 + }, + { + "epoch": 1.1052152543915537, + "grad_norm": 14.5625, + "learning_rate": 3.821537634777173e-06, + "loss": 1.6862735748291016, + "step": 6072 + }, + { + "epoch": 1.1055793210157459, + "grad_norm": 27.25, + "learning_rate": 3.820839046476728e-06, + "loss": 2.1544601917266846, + "step": 6074 + }, + { + "epoch": 1.105943387639938, + "grad_norm": 10.5, + "learning_rate": 3.820140337721481e-06, + "loss": 1.4112235307693481, + "step": 6076 + }, + { + "epoch": 1.1063074542641302, + "grad_norm": 8.8125, + "learning_rate": 3.819441508613962e-06, + "loss": 1.6237759590148926, + "step": 6078 + }, + { + "epoch": 1.1066715208883227, + "grad_norm": 25.125, + "learning_rate": 3.818742559256723e-06, + "loss": 1.4444926977157593, + "step": 6080 + }, + { + "epoch": 1.1070355875125149, + "grad_norm": 8.5, + "learning_rate": 3.8180434897523315e-06, + "loss": 1.264920949935913, + "step": 6082 + }, + { + "epoch": 1.107399654136707, + "grad_norm": 14.25, + "learning_rate": 3.817344300203373e-06, + "loss": 1.087512493133545, + "step": 6084 + }, + { + "epoch": 1.1077637207608992, + "grad_norm": 10.8125, + "learning_rate": 3.81664499071245e-06, + "loss": 1.535239338874817, + "step": 6086 + }, + { + "epoch": 1.1081277873850914, + "grad_norm": 2.515625, + "learning_rate": 3.815945561382185e-06, + "loss": 1.3006737232208252, + "step": 6088 + }, + { + "epoch": 1.1084918540092836, + "grad_norm": 14.625, + "learning_rate": 3.815246012315216e-06, + "loss": 1.7061011791229248, + "step": 6090 + }, + { + "epoch": 1.1088559206334758, + "grad_norm": 122.0, + "learning_rate": 3.8145463436141973e-06, + "loss": 1.8039379119873047, + "step": 6092 + }, + { + "epoch": 1.1092199872576682, + "grad_norm": 6.25, + "learning_rate": 3.813846555381805e-06, + "loss": 1.220511794090271, + "step": 6094 + }, + { + "epoch": 1.1095840538818604, + "grad_norm": 17.75, + "learning_rate": 3.813146647720728e-06, + "loss": 1.2950901985168457, + "step": 6096 + }, + { + "epoch": 1.1099481205060526, + "grad_norm": 10.8125, + "learning_rate": 3.8124466207336765e-06, + "loss": 1.3862860202789307, + "step": 6098 + }, + { + "epoch": 1.1103121871302448, + "grad_norm": 8.4375, + "learning_rate": 3.811746474523376e-06, + "loss": 1.4157158136367798, + "step": 6100 + }, + { + "epoch": 1.110676253754437, + "grad_norm": 12.5, + "learning_rate": 3.8110462091925694e-06, + "loss": 1.5394641160964966, + "step": 6102 + }, + { + "epoch": 1.1110403203786292, + "grad_norm": 16.875, + "learning_rate": 3.8103458248440196e-06, + "loss": 1.4767036437988281, + "step": 6104 + }, + { + "epoch": 1.1114043870028216, + "grad_norm": 7.71875, + "learning_rate": 3.809645321580503e-06, + "loss": 1.1851692199707031, + "step": 6106 + }, + { + "epoch": 1.1117684536270138, + "grad_norm": 13.0625, + "learning_rate": 3.8089446995048174e-06, + "loss": 1.2467987537384033, + "step": 6108 + }, + { + "epoch": 1.112132520251206, + "grad_norm": 8.8125, + "learning_rate": 3.808243958719775e-06, + "loss": 1.4570598602294922, + "step": 6110 + }, + { + "epoch": 1.1124965868753982, + "grad_norm": 4.25, + "learning_rate": 3.807543099328207e-06, + "loss": 1.2031669616699219, + "step": 6112 + }, + { + "epoch": 1.1128606534995904, + "grad_norm": 8.4375, + "learning_rate": 3.806842121432962e-06, + "loss": 1.3806991577148438, + "step": 6114 + }, + { + "epoch": 1.1132247201237826, + "grad_norm": 26.25, + "learning_rate": 3.806141025136906e-06, + "loss": 1.5639595985412598, + "step": 6116 + }, + { + "epoch": 1.1135887867479748, + "grad_norm": 9.75, + "learning_rate": 3.8054398105429212e-06, + "loss": 1.386110782623291, + "step": 6118 + }, + { + "epoch": 1.1139528533721672, + "grad_norm": 16.75, + "learning_rate": 3.8047384777539088e-06, + "loss": 1.7756434679031372, + "step": 6120 + }, + { + "epoch": 1.1143169199963594, + "grad_norm": 16.125, + "learning_rate": 3.8040370268727855e-06, + "loss": 1.5200403928756714, + "step": 6122 + }, + { + "epoch": 1.1146809866205516, + "grad_norm": 7.4375, + "learning_rate": 3.8033354580024875e-06, + "loss": 1.5931994915008545, + "step": 6124 + }, + { + "epoch": 1.1150450532447438, + "grad_norm": 218.0, + "learning_rate": 3.802633771245966e-06, + "loss": 1.413737416267395, + "step": 6126 + }, + { + "epoch": 1.115409119868936, + "grad_norm": 52.5, + "learning_rate": 3.8019319667061926e-06, + "loss": 1.872003436088562, + "step": 6128 + }, + { + "epoch": 1.1157731864931282, + "grad_norm": 8.875, + "learning_rate": 3.8012300444861514e-06, + "loss": 1.4467624425888062, + "step": 6130 + }, + { + "epoch": 1.1161372531173204, + "grad_norm": 11.5, + "learning_rate": 3.8005280046888494e-06, + "loss": 1.5086885690689087, + "step": 6132 + }, + { + "epoch": 1.1165013197415128, + "grad_norm": 15.1875, + "learning_rate": 3.7998258474173067e-06, + "loss": 1.60709810256958, + "step": 6134 + }, + { + "epoch": 1.116865386365705, + "grad_norm": 12.5, + "learning_rate": 3.7991235727745622e-06, + "loss": 1.5082638263702393, + "step": 6136 + }, + { + "epoch": 1.1172294529898972, + "grad_norm": 9.5625, + "learning_rate": 3.798421180863673e-06, + "loss": 1.2955968379974365, + "step": 6138 + }, + { + "epoch": 1.1175935196140894, + "grad_norm": 15.25, + "learning_rate": 3.7977186717877103e-06, + "loss": 1.392683744430542, + "step": 6140 + }, + { + "epoch": 1.1179575862382816, + "grad_norm": 20.875, + "learning_rate": 3.7970160456497652e-06, + "loss": 1.4873758554458618, + "step": 6142 + }, + { + "epoch": 1.1183216528624738, + "grad_norm": 8.1875, + "learning_rate": 3.7963133025529454e-06, + "loss": 0.9370136260986328, + "step": 6144 + }, + { + "epoch": 1.118685719486666, + "grad_norm": 14.25, + "learning_rate": 3.795610442600376e-06, + "loss": 1.475476622581482, + "step": 6146 + }, + { + "epoch": 1.1190497861108584, + "grad_norm": 16.875, + "learning_rate": 3.794907465895198e-06, + "loss": 2.063793420791626, + "step": 6148 + }, + { + "epoch": 1.1194138527350506, + "grad_norm": 5.6875, + "learning_rate": 3.7942043725405707e-06, + "loss": 0.985072135925293, + "step": 6150 + }, + { + "epoch": 1.1197779193592428, + "grad_norm": 8.1875, + "learning_rate": 3.793501162639671e-06, + "loss": 0.963082492351532, + "step": 6152 + }, + { + "epoch": 1.120141985983435, + "grad_norm": 7.03125, + "learning_rate": 3.792797836295691e-06, + "loss": 0.8796142339706421, + "step": 6154 + }, + { + "epoch": 1.1205060526076271, + "grad_norm": 5.125, + "learning_rate": 3.7920943936118415e-06, + "loss": 0.9691740274429321, + "step": 6156 + }, + { + "epoch": 1.1208701192318193, + "grad_norm": 11.1875, + "learning_rate": 3.79139083469135e-06, + "loss": 1.4237829446792603, + "step": 6158 + }, + { + "epoch": 1.1212341858560118, + "grad_norm": 14.5625, + "learning_rate": 3.790687159637462e-06, + "loss": 1.4156314134597778, + "step": 6160 + }, + { + "epoch": 1.121598252480204, + "grad_norm": 9.3125, + "learning_rate": 3.789983368553436e-06, + "loss": 1.5332188606262207, + "step": 6162 + }, + { + "epoch": 1.1219623191043961, + "grad_norm": 7.875, + "learning_rate": 3.789279461542552e-06, + "loss": 1.516160488128662, + "step": 6164 + }, + { + "epoch": 1.1223263857285883, + "grad_norm": 38.5, + "learning_rate": 3.7885754387081065e-06, + "loss": 2.1352405548095703, + "step": 6166 + }, + { + "epoch": 1.1226904523527805, + "grad_norm": 8.5625, + "learning_rate": 3.7878713001534106e-06, + "loss": 1.156064748764038, + "step": 6168 + }, + { + "epoch": 1.1230545189769727, + "grad_norm": 19.375, + "learning_rate": 3.7871670459817956e-06, + "loss": 1.2078046798706055, + "step": 6170 + }, + { + "epoch": 1.123418585601165, + "grad_norm": 82.5, + "learning_rate": 3.786462676296606e-06, + "loss": 1.2897133827209473, + "step": 6172 + }, + { + "epoch": 1.1237826522253573, + "grad_norm": 8.0, + "learning_rate": 3.7857581912012054e-06, + "loss": 1.4413726329803467, + "step": 6174 + }, + { + "epoch": 1.1241467188495495, + "grad_norm": 8.5, + "learning_rate": 3.785053590798975e-06, + "loss": 1.313905119895935, + "step": 6176 + }, + { + "epoch": 1.1245107854737417, + "grad_norm": 3.84375, + "learning_rate": 3.7843488751933123e-06, + "loss": 0.956428587436676, + "step": 6178 + }, + { + "epoch": 1.124874852097934, + "grad_norm": 10.0625, + "learning_rate": 3.78364404448763e-06, + "loss": 1.476861834526062, + "step": 6180 + }, + { + "epoch": 1.1252389187221261, + "grad_norm": 7.5, + "learning_rate": 3.7829390987853596e-06, + "loss": 1.4707458019256592, + "step": 6182 + }, + { + "epoch": 1.1256029853463183, + "grad_norm": 7.8125, + "learning_rate": 3.78223403818995e-06, + "loss": 1.4864747524261475, + "step": 6184 + }, + { + "epoch": 1.1259670519705107, + "grad_norm": 9.125, + "learning_rate": 3.7815288628048664e-06, + "loss": 1.188392162322998, + "step": 6186 + }, + { + "epoch": 1.126331118594703, + "grad_norm": 3.921875, + "learning_rate": 3.7808235727335884e-06, + "loss": 1.194756031036377, + "step": 6188 + }, + { + "epoch": 1.1266951852188951, + "grad_norm": 15.875, + "learning_rate": 3.780118168079615e-06, + "loss": 1.4416488409042358, + "step": 6190 + }, + { + "epoch": 1.1270592518430873, + "grad_norm": 9.25, + "learning_rate": 3.7794126489464635e-06, + "loss": 1.4451711177825928, + "step": 6192 + }, + { + "epoch": 1.1274233184672795, + "grad_norm": 22.875, + "learning_rate": 3.7787070154376624e-06, + "loss": 2.0071449279785156, + "step": 6194 + }, + { + "epoch": 1.1277873850914717, + "grad_norm": 12.0, + "learning_rate": 3.7780012676567645e-06, + "loss": 1.4033293724060059, + "step": 6196 + }, + { + "epoch": 1.1281514517156639, + "grad_norm": 78.5, + "learning_rate": 3.777295405707333e-06, + "loss": 1.1076610088348389, + "step": 6198 + }, + { + "epoch": 1.128515518339856, + "grad_norm": 10.125, + "learning_rate": 3.7765894296929505e-06, + "loss": 1.2459193468093872, + "step": 6200 + }, + { + "epoch": 1.1288795849640485, + "grad_norm": 21.25, + "learning_rate": 3.7758833397172166e-06, + "loss": 1.7686487436294556, + "step": 6202 + }, + { + "epoch": 1.1292436515882407, + "grad_norm": 3.65625, + "learning_rate": 3.7751771358837476e-06, + "loss": 0.9775285720825195, + "step": 6204 + }, + { + "epoch": 1.1296077182124329, + "grad_norm": 12.25, + "learning_rate": 3.7744708182961742e-06, + "loss": 1.4092994928359985, + "step": 6206 + }, + { + "epoch": 1.129971784836625, + "grad_norm": 13.375, + "learning_rate": 3.7737643870581474e-06, + "loss": 0.5858030319213867, + "step": 6208 + }, + { + "epoch": 1.1303358514608173, + "grad_norm": 35.75, + "learning_rate": 3.7730578422733334e-06, + "loss": 1.3565285205841064, + "step": 6210 + }, + { + "epoch": 1.1306999180850095, + "grad_norm": 14.125, + "learning_rate": 3.772351184045413e-06, + "loss": 0.991074800491333, + "step": 6212 + }, + { + "epoch": 1.1310639847092019, + "grad_norm": 10.0, + "learning_rate": 3.771644412478086e-06, + "loss": 1.883345127105713, + "step": 6214 + }, + { + "epoch": 1.131428051333394, + "grad_norm": 19.75, + "learning_rate": 3.770937527675069e-06, + "loss": 1.526034951210022, + "step": 6216 + }, + { + "epoch": 1.1317921179575863, + "grad_norm": 11.125, + "learning_rate": 3.7702305297400955e-06, + "loss": 1.4189389944076538, + "step": 6218 + }, + { + "epoch": 1.1321561845817785, + "grad_norm": 12.3125, + "learning_rate": 3.7695234187769114e-06, + "loss": 1.7195016145706177, + "step": 6220 + }, + { + "epoch": 1.1325202512059707, + "grad_norm": 15.0, + "learning_rate": 3.7688161948892854e-06, + "loss": 1.805555820465088, + "step": 6222 + }, + { + "epoch": 1.1328843178301629, + "grad_norm": 10.6875, + "learning_rate": 3.7681088581809975e-06, + "loss": 0.8778680562973022, + "step": 6224 + }, + { + "epoch": 1.133248384454355, + "grad_norm": 3.0, + "learning_rate": 3.7674014087558487e-06, + "loss": 1.0607777833938599, + "step": 6226 + }, + { + "epoch": 1.1336124510785475, + "grad_norm": 10.875, + "learning_rate": 3.7666938467176527e-06, + "loss": 1.3322945833206177, + "step": 6228 + }, + { + "epoch": 1.1339765177027397, + "grad_norm": 7.0625, + "learning_rate": 3.7659861721702416e-06, + "loss": 1.4955732822418213, + "step": 6230 + }, + { + "epoch": 1.1343405843269319, + "grad_norm": 10.0, + "learning_rate": 3.7652783852174647e-06, + "loss": 1.3816795349121094, + "step": 6232 + }, + { + "epoch": 1.134704650951124, + "grad_norm": 13.25, + "learning_rate": 3.7645704859631848e-06, + "loss": 2.000274896621704, + "step": 6234 + }, + { + "epoch": 1.1350687175753162, + "grad_norm": 8.4375, + "learning_rate": 3.763862474511286e-06, + "loss": 1.4250493049621582, + "step": 6236 + }, + { + "epoch": 1.1354327841995084, + "grad_norm": 10.1875, + "learning_rate": 3.763154350965664e-06, + "loss": 1.4186378717422485, + "step": 6238 + }, + { + "epoch": 1.1357968508237009, + "grad_norm": 20.75, + "learning_rate": 3.762446115430235e-06, + "loss": 0.6588457822799683, + "step": 6240 + }, + { + "epoch": 1.136160917447893, + "grad_norm": 17.625, + "learning_rate": 3.7617377680089274e-06, + "loss": 1.4386787414550781, + "step": 6242 + }, + { + "epoch": 1.1365249840720852, + "grad_norm": 6.65625, + "learning_rate": 3.7610293088056894e-06, + "loss": 1.1824570894241333, + "step": 6244 + }, + { + "epoch": 1.1368890506962774, + "grad_norm": 11.0, + "learning_rate": 3.7603207379244843e-06, + "loss": 1.5339453220367432, + "step": 6246 + }, + { + "epoch": 1.1372531173204696, + "grad_norm": 11.625, + "learning_rate": 3.7596120554692916e-06, + "loss": 1.4633429050445557, + "step": 6248 + }, + { + "epoch": 1.1376171839446618, + "grad_norm": 8.75, + "learning_rate": 3.7589032615441102e-06, + "loss": 1.4041224718093872, + "step": 6250 + }, + { + "epoch": 1.137981250568854, + "grad_norm": 29.625, + "learning_rate": 3.7581943562529487e-06, + "loss": 1.233878493309021, + "step": 6252 + }, + { + "epoch": 1.1383453171930462, + "grad_norm": 8.75, + "learning_rate": 3.757485339699839e-06, + "loss": 1.4670976400375366, + "step": 6254 + }, + { + "epoch": 1.1387093838172386, + "grad_norm": 5.5, + "learning_rate": 3.7567762119888262e-06, + "loss": 1.093485713005066, + "step": 6256 + }, + { + "epoch": 1.1390734504414308, + "grad_norm": 14.625, + "learning_rate": 3.7560669732239698e-06, + "loss": 1.4327590465545654, + "step": 6258 + }, + { + "epoch": 1.139437517065623, + "grad_norm": 9.5625, + "learning_rate": 3.7553576235093503e-06, + "loss": 1.494154930114746, + "step": 6260 + }, + { + "epoch": 1.1398015836898152, + "grad_norm": 16.625, + "learning_rate": 3.7546481629490606e-06, + "loss": 1.3798234462738037, + "step": 6262 + }, + { + "epoch": 1.1401656503140074, + "grad_norm": 7.875, + "learning_rate": 3.7539385916472116e-06, + "loss": 1.4378374814987183, + "step": 6264 + }, + { + "epoch": 1.1405297169381996, + "grad_norm": 11.8125, + "learning_rate": 3.753228909707929e-06, + "loss": 1.4703234434127808, + "step": 6266 + }, + { + "epoch": 1.140893783562392, + "grad_norm": 14.0, + "learning_rate": 3.752519117235356e-06, + "loss": 1.3794260025024414, + "step": 6268 + }, + { + "epoch": 1.1412578501865842, + "grad_norm": 27.5, + "learning_rate": 3.751809214333654e-06, + "loss": 1.4715687036514282, + "step": 6270 + }, + { + "epoch": 1.1416219168107764, + "grad_norm": 50.25, + "learning_rate": 3.7510992011069946e-06, + "loss": 1.395397424697876, + "step": 6272 + }, + { + "epoch": 1.1419859834349686, + "grad_norm": 11.875, + "learning_rate": 3.750389077659573e-06, + "loss": 1.6279557943344116, + "step": 6274 + }, + { + "epoch": 1.1423500500591608, + "grad_norm": 33.5, + "learning_rate": 3.7496788440955946e-06, + "loss": 1.9665371179580688, + "step": 6276 + }, + { + "epoch": 1.142714116683353, + "grad_norm": 27.125, + "learning_rate": 3.7489685005192834e-06, + "loss": 1.6656520366668701, + "step": 6278 + }, + { + "epoch": 1.1430781833075452, + "grad_norm": 11.25, + "learning_rate": 3.7482580470348805e-06, + "loss": 1.1945672035217285, + "step": 6280 + }, + { + "epoch": 1.1434422499317376, + "grad_norm": 6.6875, + "learning_rate": 3.747547483746643e-06, + "loss": 1.0218722820281982, + "step": 6282 + }, + { + "epoch": 1.1438063165559298, + "grad_norm": 9.1875, + "learning_rate": 3.7468368107588405e-06, + "loss": 1.4882116317749023, + "step": 6284 + }, + { + "epoch": 1.144170383180122, + "grad_norm": 21.875, + "learning_rate": 3.7461260281757627e-06, + "loss": 1.4471235275268555, + "step": 6286 + }, + { + "epoch": 1.1445344498043142, + "grad_norm": 9.125, + "learning_rate": 3.7454151361017143e-06, + "loss": 1.3163316249847412, + "step": 6288 + }, + { + "epoch": 1.1448985164285064, + "grad_norm": 6.96875, + "learning_rate": 3.744704134641015e-06, + "loss": 1.174575686454773, + "step": 6290 + }, + { + "epoch": 1.1452625830526986, + "grad_norm": 20.0, + "learning_rate": 3.7439930238980026e-06, + "loss": 2.2616584300994873, + "step": 6292 + }, + { + "epoch": 1.145626649676891, + "grad_norm": 3.53125, + "learning_rate": 3.743281803977029e-06, + "loss": 1.545267105102539, + "step": 6294 + }, + { + "epoch": 1.1459907163010832, + "grad_norm": 6.1875, + "learning_rate": 3.742570474982463e-06, + "loss": 1.1953632831573486, + "step": 6296 + }, + { + "epoch": 1.1463547829252754, + "grad_norm": 7.625, + "learning_rate": 3.741859037018688e-06, + "loss": 1.3737146854400635, + "step": 6298 + }, + { + "epoch": 1.1467188495494676, + "grad_norm": 53.75, + "learning_rate": 3.741147490190108e-06, + "loss": 1.0242600440979004, + "step": 6300 + }, + { + "epoch": 1.1470829161736598, + "grad_norm": 25.25, + "learning_rate": 3.740435834601136e-06, + "loss": 1.9867749214172363, + "step": 6302 + }, + { + "epoch": 1.147446982797852, + "grad_norm": 35.25, + "learning_rate": 3.7397240703562064e-06, + "loss": 2.025033473968506, + "step": 6304 + }, + { + "epoch": 1.1478110494220441, + "grad_norm": 50.75, + "learning_rate": 3.739012197559767e-06, + "loss": 0.8330847024917603, + "step": 6306 + }, + { + "epoch": 1.1481751160462363, + "grad_norm": 15.9375, + "learning_rate": 3.738300216316282e-06, + "loss": 1.5945284366607666, + "step": 6308 + }, + { + "epoch": 1.1485391826704288, + "grad_norm": 12.1875, + "learning_rate": 3.737588126730233e-06, + "loss": 1.3502593040466309, + "step": 6310 + }, + { + "epoch": 1.148903249294621, + "grad_norm": 17.125, + "learning_rate": 3.736875928906116e-06, + "loss": 1.414579153060913, + "step": 6312 + }, + { + "epoch": 1.1492673159188131, + "grad_norm": 17.125, + "learning_rate": 3.736163622948442e-06, + "loss": 1.363706111907959, + "step": 6314 + }, + { + "epoch": 1.1496313825430053, + "grad_norm": 14.5625, + "learning_rate": 3.7354512089617412e-06, + "loss": 1.519248366355896, + "step": 6316 + }, + { + "epoch": 1.1499954491671975, + "grad_norm": 14.6875, + "learning_rate": 3.734738687050554e-06, + "loss": 1.5777974128723145, + "step": 6318 + }, + { + "epoch": 1.1503595157913897, + "grad_norm": 16.25, + "learning_rate": 3.734026057319443e-06, + "loss": 1.163029432296753, + "step": 6320 + }, + { + "epoch": 1.1507235824155821, + "grad_norm": 21.125, + "learning_rate": 3.733313319872983e-06, + "loss": 0.8242368698120117, + "step": 6322 + }, + { + "epoch": 1.1510876490397743, + "grad_norm": 25.0, + "learning_rate": 3.732600474815765e-06, + "loss": 1.5769751071929932, + "step": 6324 + }, + { + "epoch": 1.1514517156639665, + "grad_norm": 6.53125, + "learning_rate": 3.7318875222523964e-06, + "loss": 1.1115974187850952, + "step": 6326 + }, + { + "epoch": 1.1518157822881587, + "grad_norm": 8.3125, + "learning_rate": 3.7311744622875e-06, + "loss": 1.3775323629379272, + "step": 6328 + }, + { + "epoch": 1.152179848912351, + "grad_norm": 7.0625, + "learning_rate": 3.7304612950257134e-06, + "loss": 0.9680616855621338, + "step": 6330 + }, + { + "epoch": 1.1525439155365431, + "grad_norm": 16.875, + "learning_rate": 3.729748020571694e-06, + "loss": 1.9834116697311401, + "step": 6332 + }, + { + "epoch": 1.1529079821607353, + "grad_norm": 14.4375, + "learning_rate": 3.7290346390301092e-06, + "loss": 1.4998986721038818, + "step": 6334 + }, + { + "epoch": 1.1532720487849277, + "grad_norm": 8.5, + "learning_rate": 3.728321150505645e-06, + "loss": 1.357297420501709, + "step": 6336 + }, + { + "epoch": 1.15363611540912, + "grad_norm": 14.25, + "learning_rate": 3.727607555103003e-06, + "loss": 1.3547457456588745, + "step": 6338 + }, + { + "epoch": 1.1540001820333121, + "grad_norm": 16.25, + "learning_rate": 3.7268938529269026e-06, + "loss": 1.536704421043396, + "step": 6340 + }, + { + "epoch": 1.1543642486575043, + "grad_norm": 9.6875, + "learning_rate": 3.726180044082075e-06, + "loss": 1.663135290145874, + "step": 6342 + }, + { + "epoch": 1.1547283152816965, + "grad_norm": 16.25, + "learning_rate": 3.7254661286732685e-06, + "loss": 2.0617427825927734, + "step": 6344 + }, + { + "epoch": 1.1550923819058887, + "grad_norm": 9.875, + "learning_rate": 3.724752106805247e-06, + "loss": 1.4491593837738037, + "step": 6346 + }, + { + "epoch": 1.1554564485300811, + "grad_norm": 10.5, + "learning_rate": 3.724037978582792e-06, + "loss": 1.3757569789886475, + "step": 6348 + }, + { + "epoch": 1.1558205151542733, + "grad_norm": 31.5, + "learning_rate": 3.723323744110697e-06, + "loss": 1.3985414505004883, + "step": 6350 + }, + { + "epoch": 1.1561845817784655, + "grad_norm": 31.5, + "learning_rate": 3.7226094034937754e-06, + "loss": 1.372774362564087, + "step": 6352 + }, + { + "epoch": 1.1565486484026577, + "grad_norm": 2.8125, + "learning_rate": 3.721894956836851e-06, + "loss": 1.2280420064926147, + "step": 6354 + }, + { + "epoch": 1.15691271502685, + "grad_norm": 11.1875, + "learning_rate": 3.7211804042447676e-06, + "loss": 1.279487133026123, + "step": 6356 + }, + { + "epoch": 1.157276781651042, + "grad_norm": 13.0, + "learning_rate": 3.7204657458223825e-06, + "loss": 0.48636630177497864, + "step": 6358 + }, + { + "epoch": 1.1576408482752343, + "grad_norm": 7.78125, + "learning_rate": 3.7197509816745693e-06, + "loss": 1.4522373676300049, + "step": 6360 + }, + { + "epoch": 1.1580049148994265, + "grad_norm": 7.1875, + "learning_rate": 3.719036111906217e-06, + "loss": 1.3992928266525269, + "step": 6362 + }, + { + "epoch": 1.158368981523619, + "grad_norm": 22.875, + "learning_rate": 3.7183211366222283e-06, + "loss": 1.4296058416366577, + "step": 6364 + }, + { + "epoch": 1.158733048147811, + "grad_norm": 7.75, + "learning_rate": 3.717606055927524e-06, + "loss": 1.2124700546264648, + "step": 6366 + }, + { + "epoch": 1.1590971147720033, + "grad_norm": 9.375, + "learning_rate": 3.7168908699270388e-06, + "loss": 1.5632113218307495, + "step": 6368 + }, + { + "epoch": 1.1594611813961955, + "grad_norm": 13.125, + "learning_rate": 3.7161755787257237e-06, + "loss": 1.3403748273849487, + "step": 6370 + }, + { + "epoch": 1.1598252480203877, + "grad_norm": 20.75, + "learning_rate": 3.715460182428546e-06, + "loss": 1.4441494941711426, + "step": 6372 + }, + { + "epoch": 1.1601893146445799, + "grad_norm": 21.125, + "learning_rate": 3.7147446811404855e-06, + "loss": 1.587996482849121, + "step": 6374 + }, + { + "epoch": 1.1605533812687723, + "grad_norm": 10.25, + "learning_rate": 3.7140290749665397e-06, + "loss": 1.7855424880981445, + "step": 6376 + }, + { + "epoch": 1.1609174478929645, + "grad_norm": 13.3125, + "learning_rate": 3.7133133640117203e-06, + "loss": 1.5762220621109009, + "step": 6378 + }, + { + "epoch": 1.1612815145171567, + "grad_norm": 15.5, + "learning_rate": 3.7125975483810562e-06, + "loss": 2.0644497871398926, + "step": 6380 + }, + { + "epoch": 1.1616455811413489, + "grad_norm": 8.6875, + "learning_rate": 3.71188162817959e-06, + "loss": 1.3270652294158936, + "step": 6382 + }, + { + "epoch": 1.162009647765541, + "grad_norm": 8.875, + "learning_rate": 3.7111656035123787e-06, + "loss": 1.0801115036010742, + "step": 6384 + }, + { + "epoch": 1.1623737143897332, + "grad_norm": 9.75, + "learning_rate": 3.7104494744844975e-06, + "loss": 1.1640353202819824, + "step": 6386 + }, + { + "epoch": 1.1627377810139254, + "grad_norm": 3.765625, + "learning_rate": 3.7097332412010357e-06, + "loss": 0.9398530721664429, + "step": 6388 + }, + { + "epoch": 1.1631018476381179, + "grad_norm": 14.625, + "learning_rate": 3.7090169037670963e-06, + "loss": 1.0305564403533936, + "step": 6390 + }, + { + "epoch": 1.16346591426231, + "grad_norm": 8.6875, + "learning_rate": 3.708300462287802e-06, + "loss": 1.4037699699401855, + "step": 6392 + }, + { + "epoch": 1.1638299808865022, + "grad_norm": 35.0, + "learning_rate": 3.7075839168682824e-06, + "loss": 1.5099254846572876, + "step": 6394 + }, + { + "epoch": 1.1641940475106944, + "grad_norm": 38.0, + "learning_rate": 3.706867267613693e-06, + "loss": 1.5266382694244385, + "step": 6396 + }, + { + "epoch": 1.1645581141348866, + "grad_norm": 19.375, + "learning_rate": 3.7061505146291943e-06, + "loss": 1.4627496004104614, + "step": 6398 + }, + { + "epoch": 1.1649221807590788, + "grad_norm": 59.75, + "learning_rate": 3.7054336580199714e-06, + "loss": 1.4535367488861084, + "step": 6400 + }, + { + "epoch": 1.1652862473832712, + "grad_norm": 4.34375, + "learning_rate": 3.7047166978912165e-06, + "loss": 1.1168949604034424, + "step": 6402 + }, + { + "epoch": 1.1656503140074634, + "grad_norm": 7.03125, + "learning_rate": 3.7039996343481434e-06, + "loss": 1.052788496017456, + "step": 6404 + }, + { + "epoch": 1.1660143806316556, + "grad_norm": 16.75, + "learning_rate": 3.7032824674959765e-06, + "loss": 0.997197151184082, + "step": 6406 + }, + { + "epoch": 1.1663784472558478, + "grad_norm": 5.5625, + "learning_rate": 3.7025651974399567e-06, + "loss": 1.4710323810577393, + "step": 6408 + }, + { + "epoch": 1.16674251388004, + "grad_norm": 16.625, + "learning_rate": 3.7018478242853427e-06, + "loss": 1.5309149026870728, + "step": 6410 + }, + { + "epoch": 1.1671065805042322, + "grad_norm": 11.6875, + "learning_rate": 3.701130348137405e-06, + "loss": 1.3008434772491455, + "step": 6412 + }, + { + "epoch": 1.1674706471284244, + "grad_norm": 19.375, + "learning_rate": 3.7004127691014303e-06, + "loss": 1.6052995920181274, + "step": 6414 + }, + { + "epoch": 1.1678347137526166, + "grad_norm": 15.75, + "learning_rate": 3.699695087282719e-06, + "loss": 1.5112760066986084, + "step": 6416 + }, + { + "epoch": 1.168198780376809, + "grad_norm": 21.125, + "learning_rate": 3.6989773027865892e-06, + "loss": 1.1832265853881836, + "step": 6418 + }, + { + "epoch": 1.1685628470010012, + "grad_norm": 19.25, + "learning_rate": 3.698259415718374e-06, + "loss": 1.4476916790008545, + "step": 6420 + }, + { + "epoch": 1.1689269136251934, + "grad_norm": 13.5, + "learning_rate": 3.6975414261834185e-06, + "loss": 1.552721381187439, + "step": 6422 + }, + { + "epoch": 1.1692909802493856, + "grad_norm": 14.9375, + "learning_rate": 3.696823334287086e-06, + "loss": 1.6933646202087402, + "step": 6424 + }, + { + "epoch": 1.1696550468735778, + "grad_norm": 12.0, + "learning_rate": 3.6961051401347537e-06, + "loss": 1.9006317853927612, + "step": 6426 + }, + { + "epoch": 1.1700191134977702, + "grad_norm": 10.25, + "learning_rate": 3.695386843831813e-06, + "loss": 1.4743210077285767, + "step": 6428 + }, + { + "epoch": 1.1703831801219624, + "grad_norm": 10.1875, + "learning_rate": 3.69466844548367e-06, + "loss": 1.2118438482284546, + "step": 6430 + }, + { + "epoch": 1.1707472467461546, + "grad_norm": 18.0, + "learning_rate": 3.6939499451957494e-06, + "loss": 1.3957982063293457, + "step": 6432 + }, + { + "epoch": 1.1711113133703468, + "grad_norm": 8.625, + "learning_rate": 3.6932313430734856e-06, + "loss": 1.3583909273147583, + "step": 6434 + }, + { + "epoch": 1.171475379994539, + "grad_norm": 13.6875, + "learning_rate": 3.692512639222332e-06, + "loss": 1.5858925580978394, + "step": 6436 + }, + { + "epoch": 1.1718394466187312, + "grad_norm": 24.5, + "learning_rate": 3.6917938337477543e-06, + "loss": 1.5727481842041016, + "step": 6438 + }, + { + "epoch": 1.1722035132429234, + "grad_norm": 16.75, + "learning_rate": 3.6910749267552358e-06, + "loss": 1.4769070148468018, + "step": 6440 + }, + { + "epoch": 1.1725675798671156, + "grad_norm": 8.6875, + "learning_rate": 3.6903559183502734e-06, + "loss": 1.0711969137191772, + "step": 6442 + }, + { + "epoch": 1.172931646491308, + "grad_norm": 10.5625, + "learning_rate": 3.6896368086383773e-06, + "loss": 1.4186828136444092, + "step": 6444 + }, + { + "epoch": 1.1732957131155002, + "grad_norm": 8.625, + "learning_rate": 3.6889175977250735e-06, + "loss": 1.549869179725647, + "step": 6446 + }, + { + "epoch": 1.1736597797396924, + "grad_norm": 10.5625, + "learning_rate": 3.688198285715904e-06, + "loss": 1.3686096668243408, + "step": 6448 + }, + { + "epoch": 1.1740238463638846, + "grad_norm": 10.0, + "learning_rate": 3.6874788727164267e-06, + "loss": 1.558035969734192, + "step": 6450 + }, + { + "epoch": 1.1743879129880768, + "grad_norm": 13.0, + "learning_rate": 3.686759358832209e-06, + "loss": 1.6602004766464233, + "step": 6452 + }, + { + "epoch": 1.174751979612269, + "grad_norm": 14.4375, + "learning_rate": 3.68603974416884e-06, + "loss": 1.914360523223877, + "step": 6454 + }, + { + "epoch": 1.1751160462364614, + "grad_norm": 13.9375, + "learning_rate": 3.6853200288319192e-06, + "loss": 1.4059867858886719, + "step": 6456 + }, + { + "epoch": 1.1754801128606536, + "grad_norm": 10.5625, + "learning_rate": 3.6846002129270595e-06, + "loss": 1.4582817554473877, + "step": 6458 + }, + { + "epoch": 1.1758441794848458, + "grad_norm": 25.5, + "learning_rate": 3.683880296559894e-06, + "loss": 1.557992696762085, + "step": 6460 + }, + { + "epoch": 1.176208246109038, + "grad_norm": 18.75, + "learning_rate": 3.683160279836068e-06, + "loss": 2.259274482727051, + "step": 6462 + }, + { + "epoch": 1.1765723127332302, + "grad_norm": 7.65625, + "learning_rate": 3.682440162861237e-06, + "loss": 1.3134491443634033, + "step": 6464 + }, + { + "epoch": 1.1769363793574223, + "grad_norm": 4.40625, + "learning_rate": 3.6817199457410802e-06, + "loss": 1.1946327686309814, + "step": 6466 + }, + { + "epoch": 1.1773004459816145, + "grad_norm": 8.5625, + "learning_rate": 3.680999628581282e-06, + "loss": 1.3952946662902832, + "step": 6468 + }, + { + "epoch": 1.177664512605807, + "grad_norm": 12.1875, + "learning_rate": 3.6802792114875494e-06, + "loss": 1.6258931159973145, + "step": 6470 + }, + { + "epoch": 1.1780285792299992, + "grad_norm": 15.0, + "learning_rate": 3.6795586945655992e-06, + "loss": 1.6228737831115723, + "step": 6472 + }, + { + "epoch": 1.1783926458541913, + "grad_norm": 9.0, + "learning_rate": 3.678838077921165e-06, + "loss": 1.9807703495025635, + "step": 6474 + }, + { + "epoch": 1.1787567124783835, + "grad_norm": 9.0, + "learning_rate": 3.678117361659993e-06, + "loss": 1.339045524597168, + "step": 6476 + }, + { + "epoch": 1.1791207791025757, + "grad_norm": 6.78125, + "learning_rate": 3.677396545887847e-06, + "loss": 1.327804446220398, + "step": 6478 + }, + { + "epoch": 1.179484845726768, + "grad_norm": 10.0625, + "learning_rate": 3.6766756307105025e-06, + "loss": 1.1708130836486816, + "step": 6480 + }, + { + "epoch": 1.1798489123509603, + "grad_norm": 41.75, + "learning_rate": 3.6759546162337525e-06, + "loss": 1.1664369106292725, + "step": 6482 + }, + { + "epoch": 1.1802129789751525, + "grad_norm": 19.625, + "learning_rate": 3.675233502563401e-06, + "loss": 1.2675286531448364, + "step": 6484 + }, + { + "epoch": 1.1805770455993447, + "grad_norm": 10.0625, + "learning_rate": 3.674512289805271e-06, + "loss": 1.5202103853225708, + "step": 6486 + }, + { + "epoch": 1.180941112223537, + "grad_norm": 8.125, + "learning_rate": 3.6737909780651936e-06, + "loss": 1.4262146949768066, + "step": 6488 + }, + { + "epoch": 1.1813051788477291, + "grad_norm": 56.0, + "learning_rate": 3.6730695674490226e-06, + "loss": 1.6234575510025024, + "step": 6490 + }, + { + "epoch": 1.1816692454719213, + "grad_norm": 28.0, + "learning_rate": 3.6723480580626203e-06, + "loss": 2.057462453842163, + "step": 6492 + }, + { + "epoch": 1.1820333120961135, + "grad_norm": 7.71875, + "learning_rate": 3.671626450011865e-06, + "loss": 1.234393835067749, + "step": 6494 + }, + { + "epoch": 1.1823973787203057, + "grad_norm": 11.625, + "learning_rate": 3.67090474340265e-06, + "loss": 1.1700392961502075, + "step": 6496 + }, + { + "epoch": 1.1827614453444981, + "grad_norm": 13.375, + "learning_rate": 3.6701829383408814e-06, + "loss": 0.8894488215446472, + "step": 6498 + }, + { + "epoch": 1.1831255119686903, + "grad_norm": 13.4375, + "learning_rate": 3.6694610349324843e-06, + "loss": 1.0495619773864746, + "step": 6500 + }, + { + "epoch": 1.1834895785928825, + "grad_norm": 12.125, + "learning_rate": 3.6687390332833937e-06, + "loss": 0.1490705907344818, + "step": 6502 + }, + { + "epoch": 1.1838536452170747, + "grad_norm": 6.5625, + "learning_rate": 3.6680169334995595e-06, + "loss": 0.5033633708953857, + "step": 6504 + }, + { + "epoch": 1.184217711841267, + "grad_norm": 11.25, + "learning_rate": 3.667294735686947e-06, + "loss": 1.5920605659484863, + "step": 6506 + }, + { + "epoch": 1.184581778465459, + "grad_norm": 26.25, + "learning_rate": 3.6665724399515367e-06, + "loss": 1.7717106342315674, + "step": 6508 + }, + { + "epoch": 1.1849458450896515, + "grad_norm": 13.5, + "learning_rate": 3.665850046399323e-06, + "loss": 1.6689963340759277, + "step": 6510 + }, + { + "epoch": 1.1853099117138437, + "grad_norm": 14.9375, + "learning_rate": 3.665127555136313e-06, + "loss": 2.112014055252075, + "step": 6512 + }, + { + "epoch": 1.185673978338036, + "grad_norm": 10.4375, + "learning_rate": 3.66440496626853e-06, + "loss": 1.7579913139343262, + "step": 6514 + }, + { + "epoch": 1.186038044962228, + "grad_norm": 22.625, + "learning_rate": 3.66368227990201e-06, + "loss": 1.1560434103012085, + "step": 6516 + }, + { + "epoch": 1.1864021115864203, + "grad_norm": 13.9375, + "learning_rate": 3.662959496142805e-06, + "loss": 1.8474457263946533, + "step": 6518 + }, + { + "epoch": 1.1867661782106125, + "grad_norm": 8.75, + "learning_rate": 3.6622366150969813e-06, + "loss": 1.4579943418502808, + "step": 6520 + }, + { + "epoch": 1.1871302448348047, + "grad_norm": 27.875, + "learning_rate": 3.6615136368706185e-06, + "loss": 1.1896178722381592, + "step": 6522 + }, + { + "epoch": 1.187494311458997, + "grad_norm": 8.3125, + "learning_rate": 3.66079056156981e-06, + "loss": 1.6693971157073975, + "step": 6524 + }, + { + "epoch": 1.1878583780831893, + "grad_norm": 14.75, + "learning_rate": 3.6600673893006646e-06, + "loss": 1.5194038152694702, + "step": 6526 + }, + { + "epoch": 1.1882224447073815, + "grad_norm": 13.0625, + "learning_rate": 3.6593441201693035e-06, + "loss": 1.5235917568206787, + "step": 6528 + }, + { + "epoch": 1.1885865113315737, + "grad_norm": 11.5, + "learning_rate": 3.6586207542818664e-06, + "loss": 1.4025870561599731, + "step": 6530 + }, + { + "epoch": 1.1889505779557659, + "grad_norm": 11.5625, + "learning_rate": 3.657897291744503e-06, + "loss": 1.5682439804077148, + "step": 6532 + }, + { + "epoch": 1.189314644579958, + "grad_norm": 10.75, + "learning_rate": 3.6571737326633783e-06, + "loss": 1.4926438331604004, + "step": 6534 + }, + { + "epoch": 1.1896787112041505, + "grad_norm": 10.875, + "learning_rate": 3.65645007714467e-06, + "loss": 1.401459813117981, + "step": 6536 + }, + { + "epoch": 1.1900427778283427, + "grad_norm": 6.90625, + "learning_rate": 3.655726325294574e-06, + "loss": 1.2278746366500854, + "step": 6538 + }, + { + "epoch": 1.1904068444525349, + "grad_norm": 9.8125, + "learning_rate": 3.655002477219297e-06, + "loss": 1.2134888172149658, + "step": 6540 + }, + { + "epoch": 1.190770911076727, + "grad_norm": 15.0625, + "learning_rate": 3.654278533025063e-06, + "loss": 1.4659929275512695, + "step": 6542 + }, + { + "epoch": 1.1911349777009193, + "grad_norm": 10.0625, + "learning_rate": 3.653554492818103e-06, + "loss": 1.490921139717102, + "step": 6544 + }, + { + "epoch": 1.1914990443251114, + "grad_norm": 10.1875, + "learning_rate": 3.6528303567046717e-06, + "loss": 1.394912838935852, + "step": 6546 + }, + { + "epoch": 1.1918631109493036, + "grad_norm": 16.5, + "learning_rate": 3.6521061247910296e-06, + "loss": 1.4490734338760376, + "step": 6548 + }, + { + "epoch": 1.1922271775734958, + "grad_norm": 9.3125, + "learning_rate": 3.6513817971834574e-06, + "loss": 1.504162073135376, + "step": 6550 + }, + { + "epoch": 1.1925912441976882, + "grad_norm": 27.0, + "learning_rate": 3.6506573739882468e-06, + "loss": 1.6482129096984863, + "step": 6552 + }, + { + "epoch": 1.1929553108218804, + "grad_norm": 15.125, + "learning_rate": 3.649932855311703e-06, + "loss": 1.8429118394851685, + "step": 6554 + }, + { + "epoch": 1.1933193774460726, + "grad_norm": 10.25, + "learning_rate": 3.649208241260147e-06, + "loss": 1.273280143737793, + "step": 6556 + }, + { + "epoch": 1.1936834440702648, + "grad_norm": 4.0, + "learning_rate": 3.6484835319399113e-06, + "loss": 0.8494876623153687, + "step": 6558 + }, + { + "epoch": 1.194047510694457, + "grad_norm": 7.25, + "learning_rate": 3.647758727457347e-06, + "loss": 1.2064921855926514, + "step": 6560 + }, + { + "epoch": 1.1944115773186492, + "grad_norm": 17.5, + "learning_rate": 3.6470338279188143e-06, + "loss": 1.4824676513671875, + "step": 6562 + }, + { + "epoch": 1.1947756439428416, + "grad_norm": 12.875, + "learning_rate": 3.6463088334306886e-06, + "loss": 1.432023525238037, + "step": 6564 + }, + { + "epoch": 1.1951397105670338, + "grad_norm": 52.25, + "learning_rate": 3.6455837440993614e-06, + "loss": 1.0832397937774658, + "step": 6566 + }, + { + "epoch": 1.195503777191226, + "grad_norm": 10.375, + "learning_rate": 3.6448585600312357e-06, + "loss": 1.2010561227798462, + "step": 6568 + }, + { + "epoch": 1.1958678438154182, + "grad_norm": 7.125, + "learning_rate": 3.64413328133273e-06, + "loss": 1.109591007232666, + "step": 6570 + }, + { + "epoch": 1.1962319104396104, + "grad_norm": 12.375, + "learning_rate": 3.6434079081102757e-06, + "loss": 1.6361263990402222, + "step": 6572 + }, + { + "epoch": 1.1965959770638026, + "grad_norm": 12.375, + "learning_rate": 3.6426824404703186e-06, + "loss": 1.4606677293777466, + "step": 6574 + }, + { + "epoch": 1.1969600436879948, + "grad_norm": 12.6875, + "learning_rate": 3.641956878519318e-06, + "loss": 1.3994128704071045, + "step": 6576 + }, + { + "epoch": 1.1973241103121872, + "grad_norm": 10.0625, + "learning_rate": 3.641231222363746e-06, + "loss": 1.3531302213668823, + "step": 6578 + }, + { + "epoch": 1.1976881769363794, + "grad_norm": 6.34375, + "learning_rate": 3.640505472110092e-06, + "loss": 1.086106300354004, + "step": 6580 + }, + { + "epoch": 1.1980522435605716, + "grad_norm": 5.53125, + "learning_rate": 3.6397796278648555e-06, + "loss": 1.4456672668457031, + "step": 6582 + }, + { + "epoch": 1.1984163101847638, + "grad_norm": 5.0625, + "learning_rate": 3.6390536897345517e-06, + "loss": 0.8753069639205933, + "step": 6584 + }, + { + "epoch": 1.198780376808956, + "grad_norm": 8.25, + "learning_rate": 3.6383276578257074e-06, + "loss": 1.2965389490127563, + "step": 6586 + }, + { + "epoch": 1.1991444434331482, + "grad_norm": 32.0, + "learning_rate": 3.6376015322448677e-06, + "loss": 1.5784127712249756, + "step": 6588 + }, + { + "epoch": 1.1995085100573406, + "grad_norm": 44.75, + "learning_rate": 3.6368753130985866e-06, + "loss": 1.6178051233291626, + "step": 6590 + }, + { + "epoch": 1.1998725766815328, + "grad_norm": 6.96875, + "learning_rate": 3.636149000493434e-06, + "loss": 1.0059093236923218, + "step": 6592 + }, + { + "epoch": 1.200236643305725, + "grad_norm": 5.9375, + "learning_rate": 3.6354225945359944e-06, + "loss": 1.343648910522461, + "step": 6594 + }, + { + "epoch": 1.2006007099299172, + "grad_norm": 49.5, + "learning_rate": 3.6346960953328637e-06, + "loss": 1.422285556793213, + "step": 6596 + }, + { + "epoch": 1.2009647765541094, + "grad_norm": 9.125, + "learning_rate": 3.633969502990653e-06, + "loss": 1.8281251192092896, + "step": 6598 + }, + { + "epoch": 1.2013288431783016, + "grad_norm": 7.53125, + "learning_rate": 3.6332428176159873e-06, + "loss": 1.6549746990203857, + "step": 6600 + }, + { + "epoch": 1.2016929098024938, + "grad_norm": 9.25, + "learning_rate": 3.6325160393155047e-06, + "loss": 1.582465648651123, + "step": 6602 + }, + { + "epoch": 1.202056976426686, + "grad_norm": 8.1875, + "learning_rate": 3.6317891681958562e-06, + "loss": 1.3914471864700317, + "step": 6604 + }, + { + "epoch": 1.2024210430508784, + "grad_norm": 18.5, + "learning_rate": 3.631062204363708e-06, + "loss": 1.3895189762115479, + "step": 6606 + }, + { + "epoch": 1.2027851096750706, + "grad_norm": 3.625, + "learning_rate": 3.6303351479257387e-06, + "loss": 1.2460017204284668, + "step": 6608 + }, + { + "epoch": 1.2031491762992628, + "grad_norm": 13.1875, + "learning_rate": 3.629607998988641e-06, + "loss": 1.2483797073364258, + "step": 6610 + }, + { + "epoch": 1.203513242923455, + "grad_norm": 11.375, + "learning_rate": 3.6288807576591213e-06, + "loss": 1.703126072883606, + "step": 6612 + }, + { + "epoch": 1.2038773095476472, + "grad_norm": 14.875, + "learning_rate": 3.6281534240438986e-06, + "loss": 1.4523193836212158, + "step": 6614 + }, + { + "epoch": 1.2042413761718394, + "grad_norm": 17.75, + "learning_rate": 3.627425998249706e-06, + "loss": 1.8474994897842407, + "step": 6616 + }, + { + "epoch": 1.2046054427960318, + "grad_norm": 17.875, + "learning_rate": 3.6266984803832917e-06, + "loss": 1.423680067062378, + "step": 6618 + }, + { + "epoch": 1.204969509420224, + "grad_norm": 9.0, + "learning_rate": 3.625970870551415e-06, + "loss": 1.3614366054534912, + "step": 6620 + }, + { + "epoch": 1.2053335760444162, + "grad_norm": 19.375, + "learning_rate": 3.62524316886085e-06, + "loss": 0.9116806983947754, + "step": 6622 + }, + { + "epoch": 1.2056976426686083, + "grad_norm": 3.484375, + "learning_rate": 3.6245153754183836e-06, + "loss": 1.1547564268112183, + "step": 6624 + }, + { + "epoch": 1.2060617092928005, + "grad_norm": 5.25, + "learning_rate": 3.6237874903308177e-06, + "loss": 0.923206090927124, + "step": 6626 + }, + { + "epoch": 1.2064257759169927, + "grad_norm": 11.25, + "learning_rate": 3.623059513704964e-06, + "loss": 1.1630260944366455, + "step": 6628 + }, + { + "epoch": 1.206789842541185, + "grad_norm": 12.0625, + "learning_rate": 3.6223314456476533e-06, + "loss": 1.4146908521652222, + "step": 6630 + }, + { + "epoch": 1.2071539091653773, + "grad_norm": 22.75, + "learning_rate": 3.621603286265725e-06, + "loss": 1.576790452003479, + "step": 6632 + }, + { + "epoch": 1.2075179757895695, + "grad_norm": 14.4375, + "learning_rate": 3.6208750356660327e-06, + "loss": 1.4390062093734741, + "step": 6634 + }, + { + "epoch": 1.2078820424137617, + "grad_norm": 15.9375, + "learning_rate": 3.6201466939554453e-06, + "loss": 1.4026565551757812, + "step": 6636 + }, + { + "epoch": 1.208246109037954, + "grad_norm": 14.1875, + "learning_rate": 3.6194182612408433e-06, + "loss": 1.3882112503051758, + "step": 6638 + }, + { + "epoch": 1.2086101756621461, + "grad_norm": 8.6875, + "learning_rate": 3.618689737629122e-06, + "loss": 1.3948407173156738, + "step": 6640 + }, + { + "epoch": 1.2089742422863383, + "grad_norm": 5.15625, + "learning_rate": 3.6179611232271904e-06, + "loss": 1.226584553718567, + "step": 6642 + }, + { + "epoch": 1.2093383089105307, + "grad_norm": 21.5, + "learning_rate": 3.6172324181419673e-06, + "loss": 0.8317216038703918, + "step": 6644 + }, + { + "epoch": 1.209702375534723, + "grad_norm": 7.65625, + "learning_rate": 3.6165036224803874e-06, + "loss": 0.9816555976867676, + "step": 6646 + }, + { + "epoch": 1.2100664421589151, + "grad_norm": 6.84375, + "learning_rate": 3.6157747363494e-06, + "loss": 0.4452129006385803, + "step": 6648 + }, + { + "epoch": 1.2104305087831073, + "grad_norm": 8.125, + "learning_rate": 3.615045759855965e-06, + "loss": 1.451916217803955, + "step": 6650 + }, + { + "epoch": 1.2107945754072995, + "grad_norm": 6.59375, + "learning_rate": 3.6143166931070596e-06, + "loss": 0.9785503149032593, + "step": 6652 + }, + { + "epoch": 1.2111586420314917, + "grad_norm": 7.28125, + "learning_rate": 3.6135875362096675e-06, + "loss": 1.3907877206802368, + "step": 6654 + }, + { + "epoch": 1.211522708655684, + "grad_norm": 26.125, + "learning_rate": 3.612858289270791e-06, + "loss": 1.390721321105957, + "step": 6656 + }, + { + "epoch": 1.211886775279876, + "grad_norm": 20.25, + "learning_rate": 3.6121289523974436e-06, + "loss": 1.5638270378112793, + "step": 6658 + }, + { + "epoch": 1.2122508419040685, + "grad_norm": 53.25, + "learning_rate": 3.611399525696654e-06, + "loss": 0.5643367767333984, + "step": 6660 + }, + { + "epoch": 1.2126149085282607, + "grad_norm": 4.71875, + "learning_rate": 3.6106700092754623e-06, + "loss": 1.0857794284820557, + "step": 6662 + }, + { + "epoch": 1.212978975152453, + "grad_norm": 21.125, + "learning_rate": 3.6099404032409212e-06, + "loss": 1.6155925989151, + "step": 6664 + }, + { + "epoch": 1.213343041776645, + "grad_norm": 9.0, + "learning_rate": 3.6092107077000982e-06, + "loss": 1.5053106546401978, + "step": 6666 + }, + { + "epoch": 1.2137071084008373, + "grad_norm": 13.3125, + "learning_rate": 3.6084809227600713e-06, + "loss": 1.4572757482528687, + "step": 6668 + }, + { + "epoch": 1.2140711750250297, + "grad_norm": 13.625, + "learning_rate": 3.6077510485279356e-06, + "loss": 1.404142141342163, + "step": 6670 + }, + { + "epoch": 1.214435241649222, + "grad_norm": 5.53125, + "learning_rate": 3.607021085110798e-06, + "loss": 1.0271953344345093, + "step": 6672 + }, + { + "epoch": 1.214799308273414, + "grad_norm": 13.25, + "learning_rate": 3.606291032615774e-06, + "loss": 1.3324660062789917, + "step": 6674 + }, + { + "epoch": 1.2151633748976063, + "grad_norm": 21.75, + "learning_rate": 3.6055608911499993e-06, + "loss": 1.538636326789856, + "step": 6676 + }, + { + "epoch": 1.2155274415217985, + "grad_norm": 39.75, + "learning_rate": 3.6048306608206174e-06, + "loss": 1.9539988040924072, + "step": 6678 + }, + { + "epoch": 1.2158915081459907, + "grad_norm": 11.375, + "learning_rate": 3.6041003417347873e-06, + "loss": 1.4843626022338867, + "step": 6680 + }, + { + "epoch": 1.2162555747701829, + "grad_norm": 15.6875, + "learning_rate": 3.6033699339996808e-06, + "loss": 1.4113197326660156, + "step": 6682 + }, + { + "epoch": 1.216619641394375, + "grad_norm": 22.25, + "learning_rate": 3.6026394377224817e-06, + "loss": 1.407170295715332, + "step": 6684 + }, + { + "epoch": 1.2169837080185675, + "grad_norm": 12.9375, + "learning_rate": 3.6019088530103863e-06, + "loss": 1.5715916156768799, + "step": 6686 + }, + { + "epoch": 1.2173477746427597, + "grad_norm": 17.625, + "learning_rate": 3.601178179970607e-06, + "loss": 1.336016058921814, + "step": 6688 + }, + { + "epoch": 1.2177118412669519, + "grad_norm": 7.625, + "learning_rate": 3.6004474187103656e-06, + "loss": 1.3757257461547852, + "step": 6690 + }, + { + "epoch": 1.218075907891144, + "grad_norm": 13.0, + "learning_rate": 3.5997165693368996e-06, + "loss": 1.4690946340560913, + "step": 6692 + }, + { + "epoch": 1.2184399745153363, + "grad_norm": 10.0, + "learning_rate": 3.598985631957458e-06, + "loss": 1.1817655563354492, + "step": 6694 + }, + { + "epoch": 1.2188040411395284, + "grad_norm": 11.5, + "learning_rate": 3.598254606679301e-06, + "loss": 0.7359495759010315, + "step": 6696 + }, + { + "epoch": 1.2191681077637209, + "grad_norm": 11.6875, + "learning_rate": 3.5975234936097048e-06, + "loss": 0.5830755233764648, + "step": 6698 + }, + { + "epoch": 1.219532174387913, + "grad_norm": 8.8125, + "learning_rate": 3.5967922928559586e-06, + "loss": 1.1584774255752563, + "step": 6700 + }, + { + "epoch": 1.2198962410121053, + "grad_norm": 12.875, + "learning_rate": 3.5960610045253618e-06, + "loss": 1.0171111822128296, + "step": 6702 + }, + { + "epoch": 1.2202603076362974, + "grad_norm": 15.3125, + "learning_rate": 3.595329628725227e-06, + "loss": 1.5831761360168457, + "step": 6704 + }, + { + "epoch": 1.2206243742604896, + "grad_norm": 35.25, + "learning_rate": 3.5945981655628838e-06, + "loss": 1.4115028381347656, + "step": 6706 + }, + { + "epoch": 1.2209884408846818, + "grad_norm": 10.6875, + "learning_rate": 3.593866615145668e-06, + "loss": 1.3418809175491333, + "step": 6708 + }, + { + "epoch": 1.221352507508874, + "grad_norm": 6.5, + "learning_rate": 3.5931349775809334e-06, + "loss": 1.3783400058746338, + "step": 6710 + }, + { + "epoch": 1.2217165741330664, + "grad_norm": 4.90625, + "learning_rate": 3.5924032529760454e-06, + "loss": 1.1789112091064453, + "step": 6712 + }, + { + "epoch": 1.2220806407572586, + "grad_norm": 8.5625, + "learning_rate": 3.59167144143838e-06, + "loss": 1.1209945678710938, + "step": 6714 + }, + { + "epoch": 1.2224447073814508, + "grad_norm": 8.4375, + "learning_rate": 3.590939543075329e-06, + "loss": 1.131537914276123, + "step": 6716 + }, + { + "epoch": 1.222808774005643, + "grad_norm": 59.75, + "learning_rate": 3.590207557994294e-06, + "loss": 0.8571414351463318, + "step": 6718 + }, + { + "epoch": 1.2231728406298352, + "grad_norm": 9.0625, + "learning_rate": 3.5894754863026926e-06, + "loss": 1.321110486984253, + "step": 6720 + }, + { + "epoch": 1.2235369072540274, + "grad_norm": 7.1875, + "learning_rate": 3.588743328107953e-06, + "loss": 1.334371566772461, + "step": 6722 + }, + { + "epoch": 1.2239009738782198, + "grad_norm": 11.8125, + "learning_rate": 3.5880110835175154e-06, + "loss": 1.2903462648391724, + "step": 6724 + }, + { + "epoch": 1.224265040502412, + "grad_norm": 46.5, + "learning_rate": 3.5872787526388343e-06, + "loss": 1.5546529293060303, + "step": 6726 + }, + { + "epoch": 1.2246291071266042, + "grad_norm": 23.375, + "learning_rate": 3.586546335579375e-06, + "loss": 1.2964411973953247, + "step": 6728 + }, + { + "epoch": 1.2249931737507964, + "grad_norm": 8.9375, + "learning_rate": 3.5858138324466195e-06, + "loss": 1.2439881563186646, + "step": 6730 + }, + { + "epoch": 1.2253572403749886, + "grad_norm": 18.375, + "learning_rate": 3.5850812433480586e-06, + "loss": 0.9473875164985657, + "step": 6732 + }, + { + "epoch": 1.2257213069991808, + "grad_norm": 14.5, + "learning_rate": 3.584348568391195e-06, + "loss": 0.9904845356941223, + "step": 6734 + }, + { + "epoch": 1.226085373623373, + "grad_norm": 13.5, + "learning_rate": 3.583615807683548e-06, + "loss": 1.6508527994155884, + "step": 6736 + }, + { + "epoch": 1.2264494402475652, + "grad_norm": 7.53125, + "learning_rate": 3.5828829613326467e-06, + "loss": 1.1980913877487183, + "step": 6738 + }, + { + "epoch": 1.2268135068717576, + "grad_norm": 11.8125, + "learning_rate": 3.582150029446032e-06, + "loss": 1.087984323501587, + "step": 6740 + }, + { + "epoch": 1.2271775734959498, + "grad_norm": 11.625, + "learning_rate": 3.581417012131261e-06, + "loss": 1.362506628036499, + "step": 6742 + }, + { + "epoch": 1.227541640120142, + "grad_norm": 9.75, + "learning_rate": 3.580683909495898e-06, + "loss": 1.174209713935852, + "step": 6744 + }, + { + "epoch": 1.2279057067443342, + "grad_norm": 13.9375, + "learning_rate": 3.579950721647526e-06, + "loss": 0.804348349571228, + "step": 6746 + }, + { + "epoch": 1.2282697733685264, + "grad_norm": 11.875, + "learning_rate": 3.579217448693735e-06, + "loss": 0.9139540791511536, + "step": 6748 + }, + { + "epoch": 1.2286338399927186, + "grad_norm": 14.625, + "learning_rate": 3.5784840907421315e-06, + "loss": 1.4969446659088135, + "step": 6750 + }, + { + "epoch": 1.228997906616911, + "grad_norm": 18.125, + "learning_rate": 3.577750647900332e-06, + "loss": 1.7645078897476196, + "step": 6752 + }, + { + "epoch": 1.2293619732411032, + "grad_norm": 12.875, + "learning_rate": 3.5770171202759664e-06, + "loss": 1.380998969078064, + "step": 6754 + }, + { + "epoch": 1.2297260398652954, + "grad_norm": 10.25, + "learning_rate": 3.576283507976677e-06, + "loss": 1.277035117149353, + "step": 6756 + }, + { + "epoch": 1.2300901064894876, + "grad_norm": 14.875, + "learning_rate": 3.575549811110117e-06, + "loss": 1.4455209970474243, + "step": 6758 + }, + { + "epoch": 1.2304541731136798, + "grad_norm": 11.0625, + "learning_rate": 3.574816029783956e-06, + "loss": 1.5612605810165405, + "step": 6760 + }, + { + "epoch": 1.230818239737872, + "grad_norm": 13.5, + "learning_rate": 3.5740821641058722e-06, + "loss": 1.288191795349121, + "step": 6762 + }, + { + "epoch": 1.2311823063620642, + "grad_norm": 12.0, + "learning_rate": 3.5733482141835575e-06, + "loss": 1.1193962097167969, + "step": 6764 + }, + { + "epoch": 1.2315463729862566, + "grad_norm": 13.4375, + "learning_rate": 3.572614180124715e-06, + "loss": 1.4512017965316772, + "step": 6766 + }, + { + "epoch": 1.2319104396104488, + "grad_norm": 7.71875, + "learning_rate": 3.571880062037062e-06, + "loss": 0.7806037664413452, + "step": 6768 + }, + { + "epoch": 1.232274506234641, + "grad_norm": 13.9375, + "learning_rate": 3.5711458600283277e-06, + "loss": 1.3409650325775146, + "step": 6770 + }, + { + "epoch": 1.2326385728588332, + "grad_norm": 13.1875, + "learning_rate": 3.570411574206254e-06, + "loss": 0.956801176071167, + "step": 6772 + }, + { + "epoch": 1.2330026394830254, + "grad_norm": 9.125, + "learning_rate": 3.5696772046785935e-06, + "loss": 1.628990650177002, + "step": 6774 + }, + { + "epoch": 1.2333667061072175, + "grad_norm": 16.75, + "learning_rate": 3.5689427515531117e-06, + "loss": 1.4250504970550537, + "step": 6776 + }, + { + "epoch": 1.23373077273141, + "grad_norm": 13.0, + "learning_rate": 3.568208214937586e-06, + "loss": 1.4149599075317383, + "step": 6778 + }, + { + "epoch": 1.2340948393556022, + "grad_norm": 13.875, + "learning_rate": 3.5674735949398075e-06, + "loss": 1.4843947887420654, + "step": 6780 + }, + { + "epoch": 1.2344589059797944, + "grad_norm": 36.5, + "learning_rate": 3.5667388916675795e-06, + "loss": 0.9764255285263062, + "step": 6782 + }, + { + "epoch": 1.2348229726039865, + "grad_norm": 7.9375, + "learning_rate": 3.5660041052287165e-06, + "loss": 0.49985015392303467, + "step": 6784 + }, + { + "epoch": 1.2351870392281787, + "grad_norm": 34.25, + "learning_rate": 3.5652692357310435e-06, + "loss": 1.3247759342193604, + "step": 6786 + }, + { + "epoch": 1.235551105852371, + "grad_norm": 7.5, + "learning_rate": 3.5645342832824002e-06, + "loss": 1.0177068710327148, + "step": 6788 + }, + { + "epoch": 1.2359151724765631, + "grad_norm": 30.125, + "learning_rate": 3.5637992479906415e-06, + "loss": 1.0612064599990845, + "step": 6790 + }, + { + "epoch": 1.2362792391007553, + "grad_norm": 31.125, + "learning_rate": 3.563064129963626e-06, + "loss": 0.7122699618339539, + "step": 6792 + }, + { + "epoch": 1.2366433057249477, + "grad_norm": 24.875, + "learning_rate": 3.5623289293092322e-06, + "loss": 1.4799864292144775, + "step": 6794 + }, + { + "epoch": 1.23700737234914, + "grad_norm": 7.03125, + "learning_rate": 3.5615936461353473e-06, + "loss": 1.387568712234497, + "step": 6796 + }, + { + "epoch": 1.2373714389733321, + "grad_norm": 16.75, + "learning_rate": 3.560858280549869e-06, + "loss": 1.446279525756836, + "step": 6798 + }, + { + "epoch": 1.2377355055975243, + "grad_norm": 18.875, + "learning_rate": 3.5601228326607118e-06, + "loss": 1.6540460586547852, + "step": 6800 + }, + { + "epoch": 1.2380995722217165, + "grad_norm": 10.0, + "learning_rate": 3.5593873025757995e-06, + "loss": 1.781539797782898, + "step": 6802 + }, + { + "epoch": 1.2384636388459087, + "grad_norm": 2.21875, + "learning_rate": 3.5586516904030676e-06, + "loss": 0.8537502884864807, + "step": 6804 + }, + { + "epoch": 1.2388277054701011, + "grad_norm": 23.375, + "learning_rate": 3.557915996250464e-06, + "loss": 1.2302873134613037, + "step": 6806 + }, + { + "epoch": 1.2391917720942933, + "grad_norm": 12.5, + "learning_rate": 3.5571802202259476e-06, + "loss": 1.3988693952560425, + "step": 6808 + }, + { + "epoch": 1.2395558387184855, + "grad_norm": 10.6875, + "learning_rate": 3.5564443624374935e-06, + "loss": 1.5080784559249878, + "step": 6810 + }, + { + "epoch": 1.2399199053426777, + "grad_norm": 9.1875, + "learning_rate": 3.555708422993084e-06, + "loss": 1.3585219383239746, + "step": 6812 + }, + { + "epoch": 1.24028397196687, + "grad_norm": 22.25, + "learning_rate": 3.554972402000716e-06, + "loss": 0.9604939818382263, + "step": 6814 + }, + { + "epoch": 1.240648038591062, + "grad_norm": 11.375, + "learning_rate": 3.5542362995683967e-06, + "loss": 1.9124616384506226, + "step": 6816 + }, + { + "epoch": 1.2410121052152543, + "grad_norm": 68.5, + "learning_rate": 3.553500115804146e-06, + "loss": 1.7408808469772339, + "step": 6818 + }, + { + "epoch": 1.2413761718394467, + "grad_norm": 13.6875, + "learning_rate": 3.552763850815997e-06, + "loss": 1.4779905080795288, + "step": 6820 + }, + { + "epoch": 1.241740238463639, + "grad_norm": 7.875, + "learning_rate": 3.5520275047119925e-06, + "loss": 1.4016799926757812, + "step": 6822 + }, + { + "epoch": 1.242104305087831, + "grad_norm": 10.875, + "learning_rate": 3.5512910776001897e-06, + "loss": 1.6315690279006958, + "step": 6824 + }, + { + "epoch": 1.2424683717120233, + "grad_norm": 10.5, + "learning_rate": 3.5505545695886546e-06, + "loss": 1.6864829063415527, + "step": 6826 + }, + { + "epoch": 1.2428324383362155, + "grad_norm": 11.4375, + "learning_rate": 3.549817980785467e-06, + "loss": 1.4699666500091553, + "step": 6828 + }, + { + "epoch": 1.2431965049604077, + "grad_norm": 16.375, + "learning_rate": 3.54908131129872e-06, + "loss": 1.476232886314392, + "step": 6830 + }, + { + "epoch": 1.2435605715846, + "grad_norm": 13.75, + "learning_rate": 3.5483445612365163e-06, + "loss": 1.5693073272705078, + "step": 6832 + }, + { + "epoch": 1.2439246382087923, + "grad_norm": 17.75, + "learning_rate": 3.54760773070697e-06, + "loss": 1.441408395767212, + "step": 6834 + }, + { + "epoch": 1.2442887048329845, + "grad_norm": 16.75, + "learning_rate": 3.546870819818209e-06, + "loss": 1.4684395790100098, + "step": 6836 + }, + { + "epoch": 1.2446527714571767, + "grad_norm": 4.90625, + "learning_rate": 3.5461338286783698e-06, + "loss": 1.380609154701233, + "step": 6838 + }, + { + "epoch": 1.2450168380813689, + "grad_norm": 9.875, + "learning_rate": 3.5453967573956063e-06, + "loss": 1.4816055297851562, + "step": 6840 + }, + { + "epoch": 1.245380904705561, + "grad_norm": 12.875, + "learning_rate": 3.544659606078078e-06, + "loss": 1.7490373849868774, + "step": 6842 + }, + { + "epoch": 1.2457449713297533, + "grad_norm": 49.75, + "learning_rate": 3.54392237483396e-06, + "loss": 1.4835480451583862, + "step": 6844 + }, + { + "epoch": 1.2461090379539455, + "grad_norm": 12.5, + "learning_rate": 3.543185063771438e-06, + "loss": 1.6208269596099854, + "step": 6846 + }, + { + "epoch": 1.2464731045781379, + "grad_norm": 11.4375, + "learning_rate": 3.542447672998709e-06, + "loss": 1.4339570999145508, + "step": 6848 + }, + { + "epoch": 1.24683717120233, + "grad_norm": 6.5625, + "learning_rate": 3.5417102026239823e-06, + "loss": 1.2629786729812622, + "step": 6850 + }, + { + "epoch": 1.2472012378265223, + "grad_norm": 15.6875, + "learning_rate": 3.540972652755479e-06, + "loss": 1.4285693168640137, + "step": 6852 + }, + { + "epoch": 1.2475653044507145, + "grad_norm": 33.25, + "learning_rate": 3.5402350235014317e-06, + "loss": 1.492363452911377, + "step": 6854 + }, + { + "epoch": 1.2479293710749066, + "grad_norm": 2.921875, + "learning_rate": 3.539497314970083e-06, + "loss": 0.8729903697967529, + "step": 6856 + }, + { + "epoch": 1.2482934376990988, + "grad_norm": 13.1875, + "learning_rate": 3.5387595272696895e-06, + "loss": 1.1616034507751465, + "step": 6858 + }, + { + "epoch": 1.2486575043232913, + "grad_norm": 37.25, + "learning_rate": 3.5380216605085205e-06, + "loss": 1.5804818868637085, + "step": 6860 + }, + { + "epoch": 1.2490215709474835, + "grad_norm": 12.25, + "learning_rate": 3.5372837147948515e-06, + "loss": 1.4918934106826782, + "step": 6862 + }, + { + "epoch": 1.2493856375716756, + "grad_norm": 14.0, + "learning_rate": 3.5365456902369755e-06, + "loss": 1.38448965549469, + "step": 6864 + }, + { + "epoch": 1.2497497041958678, + "grad_norm": 14.9375, + "learning_rate": 3.535807586943194e-06, + "loss": 1.46933913230896, + "step": 6866 + }, + { + "epoch": 1.25011377082006, + "grad_norm": 6.28125, + "learning_rate": 3.5350694050218205e-06, + "loss": 1.2928518056869507, + "step": 6868 + }, + { + "epoch": 1.2504778374442522, + "grad_norm": 7.9375, + "learning_rate": 3.53433114458118e-06, + "loss": 1.463866949081421, + "step": 6870 + }, + { + "epoch": 1.2508419040684444, + "grad_norm": 13.375, + "learning_rate": 3.53359280572961e-06, + "loss": 1.5643519163131714, + "step": 6872 + }, + { + "epoch": 1.2512059706926366, + "grad_norm": 9.4375, + "learning_rate": 3.5328543885754583e-06, + "loss": 1.3523356914520264, + "step": 6874 + }, + { + "epoch": 1.251570037316829, + "grad_norm": 9.6875, + "learning_rate": 3.532115893227084e-06, + "loss": 1.6246832609176636, + "step": 6876 + }, + { + "epoch": 1.2519341039410212, + "grad_norm": 10.0, + "learning_rate": 3.531377319792858e-06, + "loss": 1.4115750789642334, + "step": 6878 + }, + { + "epoch": 1.2522981705652134, + "grad_norm": 19.25, + "learning_rate": 3.5306386683811655e-06, + "loss": 1.6279051303863525, + "step": 6880 + }, + { + "epoch": 1.2526622371894056, + "grad_norm": 10.4375, + "learning_rate": 3.5298999391003986e-06, + "loss": 1.4184452295303345, + "step": 6882 + }, + { + "epoch": 1.2530263038135978, + "grad_norm": 9.0, + "learning_rate": 3.5291611320589624e-06, + "loss": 1.5353273153305054, + "step": 6884 + }, + { + "epoch": 1.2533903704377902, + "grad_norm": 10.9375, + "learning_rate": 3.5284222473652752e-06, + "loss": 1.601741075515747, + "step": 6886 + }, + { + "epoch": 1.2537544370619824, + "grad_norm": 13.25, + "learning_rate": 3.5276832851277643e-06, + "loss": 1.9086626768112183, + "step": 6888 + }, + { + "epoch": 1.2541185036861746, + "grad_norm": 24.875, + "learning_rate": 3.5269442454548698e-06, + "loss": 1.7674446105957031, + "step": 6890 + }, + { + "epoch": 1.2544825703103668, + "grad_norm": 9.875, + "learning_rate": 3.5262051284550424e-06, + "loss": 1.538628101348877, + "step": 6892 + }, + { + "epoch": 1.254846636934559, + "grad_norm": 43.75, + "learning_rate": 3.525465934236746e-06, + "loss": 1.3958988189697266, + "step": 6894 + }, + { + "epoch": 1.2552107035587512, + "grad_norm": 27.75, + "learning_rate": 3.5247266629084533e-06, + "loss": 1.5351393222808838, + "step": 6896 + }, + { + "epoch": 1.2555747701829434, + "grad_norm": 14.75, + "learning_rate": 3.5239873145786484e-06, + "loss": 1.6492743492126465, + "step": 6898 + }, + { + "epoch": 1.2559388368071356, + "grad_norm": 13.75, + "learning_rate": 3.5232478893558288e-06, + "loss": 1.2367078065872192, + "step": 6900 + }, + { + "epoch": 1.256302903431328, + "grad_norm": 13.4375, + "learning_rate": 3.522508387348502e-06, + "loss": 1.450238823890686, + "step": 6902 + }, + { + "epoch": 1.2566669700555202, + "grad_norm": 13.875, + "learning_rate": 3.521768808665188e-06, + "loss": 1.2624263763427734, + "step": 6904 + }, + { + "epoch": 1.2570310366797124, + "grad_norm": 10.5, + "learning_rate": 3.5210291534144147e-06, + "loss": 1.4558993577957153, + "step": 6906 + }, + { + "epoch": 1.2573951033039046, + "grad_norm": 14.3125, + "learning_rate": 3.5202894217047247e-06, + "loss": 1.3276500701904297, + "step": 6908 + }, + { + "epoch": 1.2577591699280968, + "grad_norm": 6.71875, + "learning_rate": 3.5195496136446706e-06, + "loss": 1.300504207611084, + "step": 6910 + }, + { + "epoch": 1.2581232365522892, + "grad_norm": 8.9375, + "learning_rate": 3.518809729342817e-06, + "loss": 1.1391650438308716, + "step": 6912 + }, + { + "epoch": 1.2584873031764814, + "grad_norm": 8.375, + "learning_rate": 3.518069768907738e-06, + "loss": 1.3528811931610107, + "step": 6914 + }, + { + "epoch": 1.2588513698006736, + "grad_norm": 13.875, + "learning_rate": 3.5173297324480195e-06, + "loss": 1.6795876026153564, + "step": 6916 + }, + { + "epoch": 1.2592154364248658, + "grad_norm": 20.875, + "learning_rate": 3.5165896200722582e-06, + "loss": 2.002683162689209, + "step": 6918 + }, + { + "epoch": 1.259579503049058, + "grad_norm": 8.6875, + "learning_rate": 3.515849431889066e-06, + "loss": 1.4503512382507324, + "step": 6920 + }, + { + "epoch": 1.2599435696732502, + "grad_norm": 7.9375, + "learning_rate": 3.5151091680070594e-06, + "loss": 1.5432205200195312, + "step": 6922 + }, + { + "epoch": 1.2603076362974424, + "grad_norm": 5.15625, + "learning_rate": 3.5143688285348697e-06, + "loss": 1.4279499053955078, + "step": 6924 + }, + { + "epoch": 1.2606717029216346, + "grad_norm": 8.5625, + "learning_rate": 3.5136284135811393e-06, + "loss": 0.8904904127120972, + "step": 6926 + }, + { + "epoch": 1.261035769545827, + "grad_norm": 19.125, + "learning_rate": 3.51288792325452e-06, + "loss": 0.6826170682907104, + "step": 6928 + }, + { + "epoch": 1.2613998361700192, + "grad_norm": 10.875, + "learning_rate": 3.5121473576636765e-06, + "loss": 1.694131851196289, + "step": 6930 + }, + { + "epoch": 1.2617639027942114, + "grad_norm": 10.8125, + "learning_rate": 3.5114067169172848e-06, + "loss": 1.2076680660247803, + "step": 6932 + }, + { + "epoch": 1.2621279694184036, + "grad_norm": 8.9375, + "learning_rate": 3.51066600112403e-06, + "loss": 1.386491298675537, + "step": 6934 + }, + { + "epoch": 1.2624920360425957, + "grad_norm": 5.09375, + "learning_rate": 3.5099252103926085e-06, + "loss": 1.598430871963501, + "step": 6936 + }, + { + "epoch": 1.2628561026667882, + "grad_norm": 9.1875, + "learning_rate": 3.5091843448317285e-06, + "loss": 1.107453465461731, + "step": 6938 + }, + { + "epoch": 1.2632201692909804, + "grad_norm": 13.4375, + "learning_rate": 3.5084434045501094e-06, + "loss": 1.2815744876861572, + "step": 6940 + }, + { + "epoch": 1.2635842359151725, + "grad_norm": 7.875, + "learning_rate": 3.5077023896564823e-06, + "loss": 1.457084059715271, + "step": 6942 + }, + { + "epoch": 1.2639483025393647, + "grad_norm": 9.625, + "learning_rate": 3.506961300259587e-06, + "loss": 1.4279173612594604, + "step": 6944 + }, + { + "epoch": 1.264312369163557, + "grad_norm": 14.9375, + "learning_rate": 3.5062201364681748e-06, + "loss": 1.5036423206329346, + "step": 6946 + }, + { + "epoch": 1.2646764357877491, + "grad_norm": 21.375, + "learning_rate": 3.5054788983910082e-06, + "loss": 1.290251612663269, + "step": 6948 + }, + { + "epoch": 1.2650405024119413, + "grad_norm": 17.375, + "learning_rate": 3.5047375861368626e-06, + "loss": 1.0799905061721802, + "step": 6950 + }, + { + "epoch": 1.2654045690361335, + "grad_norm": 22.25, + "learning_rate": 3.5039961998145222e-06, + "loss": 1.8073190450668335, + "step": 6952 + }, + { + "epoch": 1.2657686356603257, + "grad_norm": 30.5, + "learning_rate": 3.5032547395327823e-06, + "loss": 1.4099798202514648, + "step": 6954 + }, + { + "epoch": 1.2661327022845181, + "grad_norm": 11.0625, + "learning_rate": 3.5025132054004487e-06, + "loss": 1.5092816352844238, + "step": 6956 + }, + { + "epoch": 1.2664967689087103, + "grad_norm": 7.90625, + "learning_rate": 3.5017715975263377e-06, + "loss": 1.4591829776763916, + "step": 6958 + }, + { + "epoch": 1.2668608355329025, + "grad_norm": 11.625, + "learning_rate": 3.5010299160192786e-06, + "loss": 1.397963523864746, + "step": 6960 + }, + { + "epoch": 1.2672249021570947, + "grad_norm": 42.5, + "learning_rate": 3.50028816098811e-06, + "loss": 1.8981044292449951, + "step": 6962 + }, + { + "epoch": 1.267588968781287, + "grad_norm": 17.0, + "learning_rate": 3.4995463325416823e-06, + "loss": 1.1766111850738525, + "step": 6964 + }, + { + "epoch": 1.2679530354054793, + "grad_norm": 33.0, + "learning_rate": 3.4988044307888537e-06, + "loss": 1.437781810760498, + "step": 6966 + }, + { + "epoch": 1.2683171020296715, + "grad_norm": 9.4375, + "learning_rate": 3.4980624558384956e-06, + "loss": 1.4637047052383423, + "step": 6968 + }, + { + "epoch": 1.2686811686538637, + "grad_norm": 21.375, + "learning_rate": 3.4973204077994915e-06, + "loss": 1.5192077159881592, + "step": 6970 + }, + { + "epoch": 1.269045235278056, + "grad_norm": 17.875, + "learning_rate": 3.496578286780733e-06, + "loss": 1.261083722114563, + "step": 6972 + }, + { + "epoch": 1.269409301902248, + "grad_norm": 8.1875, + "learning_rate": 3.495836092891124e-06, + "loss": 1.743212342262268, + "step": 6974 + }, + { + "epoch": 1.2697733685264403, + "grad_norm": 13.25, + "learning_rate": 3.4950938262395774e-06, + "loss": 1.2227096557617188, + "step": 6976 + }, + { + "epoch": 1.2701374351506325, + "grad_norm": 17.875, + "learning_rate": 3.4943514869350176e-06, + "loss": 1.5062861442565918, + "step": 6978 + }, + { + "epoch": 1.2705015017748247, + "grad_norm": 13.875, + "learning_rate": 3.4936090750863816e-06, + "loss": 1.361649513244629, + "step": 6980 + }, + { + "epoch": 1.270865568399017, + "grad_norm": 11.6875, + "learning_rate": 3.4928665908026135e-06, + "loss": 1.785667896270752, + "step": 6982 + }, + { + "epoch": 1.2712296350232093, + "grad_norm": 23.0, + "learning_rate": 3.492124034192671e-06, + "loss": 1.785082221031189, + "step": 6984 + }, + { + "epoch": 1.2715937016474015, + "grad_norm": 15.0, + "learning_rate": 3.4913814053655205e-06, + "loss": 1.7949333190917969, + "step": 6986 + }, + { + "epoch": 1.2719577682715937, + "grad_norm": 7.8125, + "learning_rate": 3.4906387044301393e-06, + "loss": 1.6175525188446045, + "step": 6988 + }, + { + "epoch": 1.2723218348957859, + "grad_norm": 20.375, + "learning_rate": 3.4898959314955185e-06, + "loss": 1.064249038696289, + "step": 6990 + }, + { + "epoch": 1.2726859015199783, + "grad_norm": 45.25, + "learning_rate": 3.4891530866706534e-06, + "loss": 1.0330097675323486, + "step": 6992 + }, + { + "epoch": 1.2730499681441705, + "grad_norm": 27.875, + "learning_rate": 3.488410170064557e-06, + "loss": 1.45759117603302, + "step": 6994 + }, + { + "epoch": 1.2734140347683627, + "grad_norm": 5.5625, + "learning_rate": 3.487667181786246e-06, + "loss": 0.9440405964851379, + "step": 6996 + }, + { + "epoch": 1.2737781013925549, + "grad_norm": 15.9375, + "learning_rate": 3.486924121944753e-06, + "loss": 1.4143867492675781, + "step": 6998 + }, + { + "epoch": 1.274142168016747, + "grad_norm": 17.125, + "learning_rate": 3.4861809906491182e-06, + "loss": 1.859867811203003, + "step": 7000 + }, + { + "epoch": 1.2745062346409393, + "grad_norm": 18.625, + "learning_rate": 3.4854377880083932e-06, + "loss": 1.5393643379211426, + "step": 7002 + }, + { + "epoch": 1.2748703012651315, + "grad_norm": 13.6875, + "learning_rate": 3.4846945141316413e-06, + "loss": 1.5204248428344727, + "step": 7004 + }, + { + "epoch": 1.2752343678893237, + "grad_norm": 27.375, + "learning_rate": 3.4839511691279327e-06, + "loss": 1.5938551425933838, + "step": 7006 + }, + { + "epoch": 1.2755984345135158, + "grad_norm": 13.6875, + "learning_rate": 3.4832077531063514e-06, + "loss": 1.9839081764221191, + "step": 7008 + }, + { + "epoch": 1.2759625011377083, + "grad_norm": 8.125, + "learning_rate": 3.48246426617599e-06, + "loss": 1.6252151727676392, + "step": 7010 + }, + { + "epoch": 1.2763265677619005, + "grad_norm": 9.5, + "learning_rate": 3.4817207084459537e-06, + "loss": 1.3268239498138428, + "step": 7012 + }, + { + "epoch": 1.2766906343860926, + "grad_norm": 5.84375, + "learning_rate": 3.480977080025356e-06, + "loss": 1.513934850692749, + "step": 7014 + }, + { + "epoch": 1.2770547010102848, + "grad_norm": 2.859375, + "learning_rate": 3.48023338102332e-06, + "loss": 1.0915807485580444, + "step": 7016 + }, + { + "epoch": 1.277418767634477, + "grad_norm": 19.5, + "learning_rate": 3.479489611548982e-06, + "loss": 1.2613751888275146, + "step": 7018 + }, + { + "epoch": 1.2777828342586695, + "grad_norm": 10.8125, + "learning_rate": 3.478745771711487e-06, + "loss": 1.4437661170959473, + "step": 7020 + }, + { + "epoch": 1.2781469008828616, + "grad_norm": 10.4375, + "learning_rate": 3.4780018616199894e-06, + "loss": 1.5811054706573486, + "step": 7022 + }, + { + "epoch": 1.2785109675070538, + "grad_norm": 17.125, + "learning_rate": 3.477257881383658e-06, + "loss": 1.7467256784439087, + "step": 7024 + }, + { + "epoch": 1.278875034131246, + "grad_norm": 81.0, + "learning_rate": 3.4765138311116643e-06, + "loss": 1.7535210847854614, + "step": 7026 + }, + { + "epoch": 1.2792391007554382, + "grad_norm": 10.25, + "learning_rate": 3.475769710913197e-06, + "loss": 1.1360485553741455, + "step": 7028 + }, + { + "epoch": 1.2796031673796304, + "grad_norm": 8.6875, + "learning_rate": 3.475025520897454e-06, + "loss": 1.4009417295455933, + "step": 7030 + }, + { + "epoch": 1.2799672340038226, + "grad_norm": 9.0625, + "learning_rate": 3.4742812611736397e-06, + "loss": 1.3411486148834229, + "step": 7032 + }, + { + "epoch": 1.2803313006280148, + "grad_norm": 10.0625, + "learning_rate": 3.473536931850974e-06, + "loss": 0.9136326909065247, + "step": 7034 + }, + { + "epoch": 1.2806953672522072, + "grad_norm": 7.0, + "learning_rate": 3.4727925330386814e-06, + "loss": 1.4404561519622803, + "step": 7036 + }, + { + "epoch": 1.2810594338763994, + "grad_norm": 10.1875, + "learning_rate": 3.472048064846001e-06, + "loss": 1.3468619585037231, + "step": 7038 + }, + { + "epoch": 1.2814235005005916, + "grad_norm": 12.25, + "learning_rate": 3.47130352738218e-06, + "loss": 0.9277386665344238, + "step": 7040 + }, + { + "epoch": 1.2817875671247838, + "grad_norm": 21.5, + "learning_rate": 3.470558920756476e-06, + "loss": 1.6297842264175415, + "step": 7042 + }, + { + "epoch": 1.282151633748976, + "grad_norm": 15.875, + "learning_rate": 3.4698142450781584e-06, + "loss": 1.3305178880691528, + "step": 7044 + }, + { + "epoch": 1.2825157003731684, + "grad_norm": 9.875, + "learning_rate": 3.4690695004565044e-06, + "loss": 1.3600209951400757, + "step": 7046 + }, + { + "epoch": 1.2828797669973606, + "grad_norm": 12.6875, + "learning_rate": 3.4683246870008015e-06, + "loss": 1.4870193004608154, + "step": 7048 + }, + { + "epoch": 1.2832438336215528, + "grad_norm": 12.5, + "learning_rate": 3.467579804820348e-06, + "loss": 1.4263830184936523, + "step": 7050 + }, + { + "epoch": 1.283607900245745, + "grad_norm": 7.03125, + "learning_rate": 3.4668348540244547e-06, + "loss": 1.4817838668823242, + "step": 7052 + }, + { + "epoch": 1.2839719668699372, + "grad_norm": 10.25, + "learning_rate": 3.4660898347224377e-06, + "loss": 1.239769697189331, + "step": 7054 + }, + { + "epoch": 1.2843360334941294, + "grad_norm": 5.875, + "learning_rate": 3.4653447470236258e-06, + "loss": 1.0721032619476318, + "step": 7056 + }, + { + "epoch": 1.2847001001183216, + "grad_norm": 10.3125, + "learning_rate": 3.4645995910373585e-06, + "loss": 1.2548054456710815, + "step": 7058 + }, + { + "epoch": 1.2850641667425138, + "grad_norm": 8.5, + "learning_rate": 3.463854366872984e-06, + "loss": 1.3587526082992554, + "step": 7060 + }, + { + "epoch": 1.285428233366706, + "grad_norm": 6.90625, + "learning_rate": 3.4631090746398614e-06, + "loss": 1.4787750244140625, + "step": 7062 + }, + { + "epoch": 1.2857922999908984, + "grad_norm": 6.71875, + "learning_rate": 3.462363714447359e-06, + "loss": 1.5639283657073975, + "step": 7064 + }, + { + "epoch": 1.2861563666150906, + "grad_norm": 9.4375, + "learning_rate": 3.461618286404855e-06, + "loss": 1.121364712715149, + "step": 7066 + }, + { + "epoch": 1.2865204332392828, + "grad_norm": 15.625, + "learning_rate": 3.4608727906217387e-06, + "loss": 1.3390510082244873, + "step": 7068 + }, + { + "epoch": 1.286884499863475, + "grad_norm": 8.25, + "learning_rate": 3.460127227207407e-06, + "loss": 1.2885761260986328, + "step": 7070 + }, + { + "epoch": 1.2872485664876672, + "grad_norm": 11.25, + "learning_rate": 3.459381596271271e-06, + "loss": 1.5833566188812256, + "step": 7072 + }, + { + "epoch": 1.2876126331118596, + "grad_norm": 20.125, + "learning_rate": 3.458635897922746e-06, + "loss": 1.2856498956680298, + "step": 7074 + }, + { + "epoch": 1.2879766997360518, + "grad_norm": 11.5, + "learning_rate": 3.457890132271263e-06, + "loss": 1.6123485565185547, + "step": 7076 + }, + { + "epoch": 1.288340766360244, + "grad_norm": 11.8125, + "learning_rate": 3.4571442994262572e-06, + "loss": 1.3298993110656738, + "step": 7078 + }, + { + "epoch": 1.2887048329844362, + "grad_norm": 9.4375, + "learning_rate": 3.4563983994971794e-06, + "loss": 1.0125114917755127, + "step": 7080 + }, + { + "epoch": 1.2890688996086284, + "grad_norm": 2.875, + "learning_rate": 3.455652432593486e-06, + "loss": 1.1476223468780518, + "step": 7082 + }, + { + "epoch": 1.2894329662328206, + "grad_norm": 3.96875, + "learning_rate": 3.4549063988246445e-06, + "loss": 1.3313473463058472, + "step": 7084 + }, + { + "epoch": 1.2897970328570127, + "grad_norm": 9.25, + "learning_rate": 3.4541602983001322e-06, + "loss": 1.0937621593475342, + "step": 7086 + }, + { + "epoch": 1.290161099481205, + "grad_norm": 11.0, + "learning_rate": 3.4534141311294368e-06, + "loss": 1.4370759725570679, + "step": 7088 + }, + { + "epoch": 1.2905251661053974, + "grad_norm": 6.65625, + "learning_rate": 3.4526678974220552e-06, + "loss": 1.4123831987380981, + "step": 7090 + }, + { + "epoch": 1.2908892327295896, + "grad_norm": 6.65625, + "learning_rate": 3.4519215972874942e-06, + "loss": 1.4852166175842285, + "step": 7092 + }, + { + "epoch": 1.2912532993537817, + "grad_norm": 9.3125, + "learning_rate": 3.451175230835271e-06, + "loss": 1.3300797939300537, + "step": 7094 + }, + { + "epoch": 1.291617365977974, + "grad_norm": 16.5, + "learning_rate": 3.4504287981749103e-06, + "loss": 1.1346608400344849, + "step": 7096 + }, + { + "epoch": 1.2919814326021661, + "grad_norm": 9.0625, + "learning_rate": 3.4496822994159483e-06, + "loss": 0.5941730737686157, + "step": 7098 + }, + { + "epoch": 1.2923454992263586, + "grad_norm": 7.03125, + "learning_rate": 3.448935734667932e-06, + "loss": 1.2391228675842285, + "step": 7100 + }, + { + "epoch": 1.2927095658505507, + "grad_norm": 7.78125, + "learning_rate": 3.448189104040416e-06, + "loss": 1.575432300567627, + "step": 7102 + }, + { + "epoch": 1.293073632474743, + "grad_norm": 16.25, + "learning_rate": 3.4474424076429658e-06, + "loss": 1.3727679252624512, + "step": 7104 + }, + { + "epoch": 1.2934376990989351, + "grad_norm": 8.1875, + "learning_rate": 3.446695645585155e-06, + "loss": 1.3660389184951782, + "step": 7106 + }, + { + "epoch": 1.2938017657231273, + "grad_norm": 62.75, + "learning_rate": 3.4459488179765683e-06, + "loss": 1.4868298768997192, + "step": 7108 + }, + { + "epoch": 1.2941658323473195, + "grad_norm": 18.25, + "learning_rate": 3.4452019249268004e-06, + "loss": 1.3551559448242188, + "step": 7110 + }, + { + "epoch": 1.2945298989715117, + "grad_norm": 3.90625, + "learning_rate": 3.444454966545454e-06, + "loss": 1.2077412605285645, + "step": 7112 + }, + { + "epoch": 1.294893965595704, + "grad_norm": 9.9375, + "learning_rate": 3.443707942942143e-06, + "loss": 1.2375282049179077, + "step": 7114 + }, + { + "epoch": 1.295258032219896, + "grad_norm": 5.65625, + "learning_rate": 3.442960854226489e-06, + "loss": 1.2822661399841309, + "step": 7116 + }, + { + "epoch": 1.2956220988440885, + "grad_norm": 14.875, + "learning_rate": 3.4422137005081245e-06, + "loss": 1.3456642627716064, + "step": 7118 + }, + { + "epoch": 1.2959861654682807, + "grad_norm": 21.5, + "learning_rate": 3.441466481896692e-06, + "loss": 1.228467583656311, + "step": 7120 + }, + { + "epoch": 1.296350232092473, + "grad_norm": 21.5, + "learning_rate": 3.440719198501842e-06, + "loss": 1.5472502708435059, + "step": 7122 + }, + { + "epoch": 1.296714298716665, + "grad_norm": 8.6875, + "learning_rate": 3.439971850433237e-06, + "loss": 1.807754397392273, + "step": 7124 + }, + { + "epoch": 1.2970783653408573, + "grad_norm": 9.0625, + "learning_rate": 3.4392244378005445e-06, + "loss": 1.0491812229156494, + "step": 7126 + }, + { + "epoch": 1.2974424319650497, + "grad_norm": 23.5, + "learning_rate": 3.438476960713446e-06, + "loss": 1.295921802520752, + "step": 7128 + }, + { + "epoch": 1.297806498589242, + "grad_norm": 6.25, + "learning_rate": 3.4377294192816315e-06, + "loss": 0.13925348222255707, + "step": 7130 + }, + { + "epoch": 1.298170565213434, + "grad_norm": 25.125, + "learning_rate": 3.4369818136147976e-06, + "loss": 0.4353587031364441, + "step": 7132 + }, + { + "epoch": 1.2985346318376263, + "grad_norm": 17.25, + "learning_rate": 3.436234143822654e-06, + "loss": 1.3399690389633179, + "step": 7134 + }, + { + "epoch": 1.2988986984618185, + "grad_norm": 40.5, + "learning_rate": 3.4354864100149175e-06, + "loss": 0.9734708666801453, + "step": 7136 + }, + { + "epoch": 1.2992627650860107, + "grad_norm": 5.84375, + "learning_rate": 3.434738612301315e-06, + "loss": 1.5604196786880493, + "step": 7138 + }, + { + "epoch": 1.2996268317102029, + "grad_norm": 6.03125, + "learning_rate": 3.433990750791584e-06, + "loss": 1.4461499452590942, + "step": 7140 + }, + { + "epoch": 1.299990898334395, + "grad_norm": 6.1875, + "learning_rate": 3.4332428255954686e-06, + "loss": 0.9172034859657288, + "step": 7142 + }, + { + "epoch": 1.3003549649585875, + "grad_norm": 10.0625, + "learning_rate": 3.432494836822724e-06, + "loss": 1.5358330011367798, + "step": 7144 + }, + { + "epoch": 1.3007190315827797, + "grad_norm": 10.0625, + "learning_rate": 3.4317467845831153e-06, + "loss": 1.2882869243621826, + "step": 7146 + }, + { + "epoch": 1.3010830982069719, + "grad_norm": 15.0625, + "learning_rate": 3.430998668986415e-06, + "loss": 1.06436026096344, + "step": 7148 + }, + { + "epoch": 1.301447164831164, + "grad_norm": 14.875, + "learning_rate": 3.430250490142407e-06, + "loss": 1.4392189979553223, + "step": 7150 + }, + { + "epoch": 1.3018112314553563, + "grad_norm": 41.5, + "learning_rate": 3.429502248160883e-06, + "loss": 1.3414818048477173, + "step": 7152 + }, + { + "epoch": 1.3021752980795487, + "grad_norm": 13.9375, + "learning_rate": 3.428753943151646e-06, + "loss": 0.6288173794746399, + "step": 7154 + }, + { + "epoch": 1.3025393647037409, + "grad_norm": 338.0, + "learning_rate": 3.4280055752245044e-06, + "loss": 1.8327256441116333, + "step": 7156 + }, + { + "epoch": 1.302903431327933, + "grad_norm": 37.25, + "learning_rate": 3.427257144489279e-06, + "loss": 1.4023244380950928, + "step": 7158 + }, + { + "epoch": 1.3032674979521253, + "grad_norm": 13.6875, + "learning_rate": 3.4265086510557986e-06, + "loss": 1.4952585697174072, + "step": 7160 + }, + { + "epoch": 1.3036315645763175, + "grad_norm": 14.4375, + "learning_rate": 3.425760095033903e-06, + "loss": 1.4060511589050293, + "step": 7162 + }, + { + "epoch": 1.3039956312005097, + "grad_norm": 16.5, + "learning_rate": 3.4250114765334397e-06, + "loss": 1.3897628784179688, + "step": 7164 + }, + { + "epoch": 1.3043596978247018, + "grad_norm": 7.375, + "learning_rate": 3.424262795664264e-06, + "loss": 0.8717978596687317, + "step": 7166 + }, + { + "epoch": 1.304723764448894, + "grad_norm": 26.125, + "learning_rate": 3.4235140525362414e-06, + "loss": 1.1149710416793823, + "step": 7168 + }, + { + "epoch": 1.3050878310730865, + "grad_norm": 3.546875, + "learning_rate": 3.4227652472592487e-06, + "loss": 0.5271800756454468, + "step": 7170 + }, + { + "epoch": 1.3054518976972787, + "grad_norm": 7.71875, + "learning_rate": 3.4220163799431693e-06, + "loss": 1.3105355501174927, + "step": 7172 + }, + { + "epoch": 1.3058159643214708, + "grad_norm": 53.75, + "learning_rate": 3.421267450697897e-06, + "loss": 1.281732201576233, + "step": 7174 + }, + { + "epoch": 1.306180030945663, + "grad_norm": 7.96875, + "learning_rate": 3.420518459633333e-06, + "loss": 1.5820822715759277, + "step": 7176 + }, + { + "epoch": 1.3065440975698552, + "grad_norm": 4.0625, + "learning_rate": 3.419769406859389e-06, + "loss": 0.9697157740592957, + "step": 7178 + }, + { + "epoch": 1.3069081641940474, + "grad_norm": 8.0625, + "learning_rate": 3.419020292485986e-06, + "loss": 1.304370641708374, + "step": 7180 + }, + { + "epoch": 1.3072722308182398, + "grad_norm": 8.375, + "learning_rate": 3.418271116623053e-06, + "loss": 1.3723695278167725, + "step": 7182 + }, + { + "epoch": 1.307636297442432, + "grad_norm": 6.75, + "learning_rate": 3.4175218793805297e-06, + "loss": 1.2795096635818481, + "step": 7184 + }, + { + "epoch": 1.3080003640666242, + "grad_norm": 9.9375, + "learning_rate": 3.416772580868362e-06, + "loss": 1.1621792316436768, + "step": 7186 + }, + { + "epoch": 1.3083644306908164, + "grad_norm": 12.0625, + "learning_rate": 3.4160232211965072e-06, + "loss": 1.3562220335006714, + "step": 7188 + }, + { + "epoch": 1.3087284973150086, + "grad_norm": 41.5, + "learning_rate": 3.4152738004749297e-06, + "loss": 1.2786340713500977, + "step": 7190 + }, + { + "epoch": 1.3090925639392008, + "grad_norm": 22.5, + "learning_rate": 3.414524318813606e-06, + "loss": 0.7081055045127869, + "step": 7192 + }, + { + "epoch": 1.309456630563393, + "grad_norm": 8.5, + "learning_rate": 3.4137747763225186e-06, + "loss": 1.072160243988037, + "step": 7194 + }, + { + "epoch": 1.3098206971875852, + "grad_norm": 15.375, + "learning_rate": 3.413025173111659e-06, + "loss": 1.3887577056884766, + "step": 7196 + }, + { + "epoch": 1.3101847638117776, + "grad_norm": 16.5, + "learning_rate": 3.41227550929103e-06, + "loss": 1.5575710535049438, + "step": 7198 + }, + { + "epoch": 1.3105488304359698, + "grad_norm": 20.5, + "learning_rate": 3.4115257849706394e-06, + "loss": 1.4033201932907104, + "step": 7200 + }, + { + "epoch": 1.310912897060162, + "grad_norm": 8.125, + "learning_rate": 3.4107760002605086e-06, + "loss": 1.2694982290267944, + "step": 7202 + }, + { + "epoch": 1.3112769636843542, + "grad_norm": 26.875, + "learning_rate": 3.410026155270665e-06, + "loss": 1.1874176263809204, + "step": 7204 + }, + { + "epoch": 1.3116410303085464, + "grad_norm": 17.0, + "learning_rate": 3.409276250111143e-06, + "loss": 1.0976594686508179, + "step": 7206 + }, + { + "epoch": 1.3120050969327388, + "grad_norm": 6.1875, + "learning_rate": 3.408526284891991e-06, + "loss": 1.2550384998321533, + "step": 7208 + }, + { + "epoch": 1.312369163556931, + "grad_norm": 12.125, + "learning_rate": 3.407776259723262e-06, + "loss": 1.362528920173645, + "step": 7210 + }, + { + "epoch": 1.3127332301811232, + "grad_norm": 8.375, + "learning_rate": 3.4070261747150203e-06, + "loss": 1.1947346925735474, + "step": 7212 + }, + { + "epoch": 1.3130972968053154, + "grad_norm": 12.875, + "learning_rate": 3.4062760299773368e-06, + "loss": 1.1582088470458984, + "step": 7214 + }, + { + "epoch": 1.3134613634295076, + "grad_norm": 84.5, + "learning_rate": 3.4055258256202926e-06, + "loss": 1.554925799369812, + "step": 7216 + }, + { + "epoch": 1.3138254300536998, + "grad_norm": 17.75, + "learning_rate": 3.4047755617539755e-06, + "loss": 1.7379704713821411, + "step": 7218 + }, + { + "epoch": 1.314189496677892, + "grad_norm": 14.375, + "learning_rate": 3.4040252384884862e-06, + "loss": 1.432013988494873, + "step": 7220 + }, + { + "epoch": 1.3145535633020842, + "grad_norm": 9.25, + "learning_rate": 3.40327485593393e-06, + "loss": 1.4521740674972534, + "step": 7222 + }, + { + "epoch": 1.3149176299262766, + "grad_norm": 12.5625, + "learning_rate": 3.4025244142004244e-06, + "loss": 1.1514098644256592, + "step": 7224 + }, + { + "epoch": 1.3152816965504688, + "grad_norm": 12.0625, + "learning_rate": 3.401773913398091e-06, + "loss": 1.4416508674621582, + "step": 7226 + }, + { + "epoch": 1.315645763174661, + "grad_norm": 8.3125, + "learning_rate": 3.401023353637064e-06, + "loss": 1.4151341915130615, + "step": 7228 + }, + { + "epoch": 1.3160098297988532, + "grad_norm": 12.375, + "learning_rate": 3.4002727350274855e-06, + "loss": 1.0667016506195068, + "step": 7230 + }, + { + "epoch": 1.3163738964230454, + "grad_norm": 10.25, + "learning_rate": 3.399522057679505e-06, + "loss": 1.279942274093628, + "step": 7232 + }, + { + "epoch": 1.3167379630472378, + "grad_norm": 10.375, + "learning_rate": 3.3987713217032826e-06, + "loss": 1.5324357748031616, + "step": 7234 + }, + { + "epoch": 1.31710202967143, + "grad_norm": 11.1875, + "learning_rate": 3.3980205272089837e-06, + "loss": 1.7289388179779053, + "step": 7236 + }, + { + "epoch": 1.3174660962956222, + "grad_norm": 21.375, + "learning_rate": 3.3972696743067856e-06, + "loss": 1.7450295686721802, + "step": 7238 + }, + { + "epoch": 1.3178301629198144, + "grad_norm": 14.8125, + "learning_rate": 3.396518763106873e-06, + "loss": 1.9419946670532227, + "step": 7240 + }, + { + "epoch": 1.3181942295440066, + "grad_norm": 4.15625, + "learning_rate": 3.395767793719439e-06, + "loss": 1.2483052015304565, + "step": 7242 + }, + { + "epoch": 1.3185582961681988, + "grad_norm": 12.1875, + "learning_rate": 3.395016766254685e-06, + "loss": 1.0632487535476685, + "step": 7244 + }, + { + "epoch": 1.318922362792391, + "grad_norm": 11.0625, + "learning_rate": 3.394265680822822e-06, + "loss": 1.5236999988555908, + "step": 7246 + }, + { + "epoch": 1.3192864294165831, + "grad_norm": 9.4375, + "learning_rate": 3.3935145375340673e-06, + "loss": 1.2758264541625977, + "step": 7248 + }, + { + "epoch": 1.3196504960407753, + "grad_norm": 12.8125, + "learning_rate": 3.392763336498649e-06, + "loss": 1.4826488494873047, + "step": 7250 + }, + { + "epoch": 1.3200145626649677, + "grad_norm": 10.75, + "learning_rate": 3.3920120778268032e-06, + "loss": 1.2545771598815918, + "step": 7252 + }, + { + "epoch": 1.32037862928916, + "grad_norm": 7.46875, + "learning_rate": 3.391260761628774e-06, + "loss": 1.2271801233291626, + "step": 7254 + }, + { + "epoch": 1.3207426959133521, + "grad_norm": 5.3125, + "learning_rate": 3.390509388014813e-06, + "loss": 1.0805290937423706, + "step": 7256 + }, + { + "epoch": 1.3211067625375443, + "grad_norm": 9.125, + "learning_rate": 3.3897579570951824e-06, + "loss": 1.6068825721740723, + "step": 7258 + }, + { + "epoch": 1.3214708291617365, + "grad_norm": 7.625, + "learning_rate": 3.3890064689801504e-06, + "loss": 1.3820632696151733, + "step": 7260 + }, + { + "epoch": 1.321834895785929, + "grad_norm": 3.703125, + "learning_rate": 3.3882549237799965e-06, + "loss": 1.2232885360717773, + "step": 7262 + }, + { + "epoch": 1.3221989624101211, + "grad_norm": 10.625, + "learning_rate": 3.387503321605006e-06, + "loss": 1.37892746925354, + "step": 7264 + }, + { + "epoch": 1.3225630290343133, + "grad_norm": 10.0, + "learning_rate": 3.386751662565473e-06, + "loss": 1.171010971069336, + "step": 7266 + }, + { + "epoch": 1.3229270956585055, + "grad_norm": 13.1875, + "learning_rate": 3.3859999467717007e-06, + "loss": 0.4496696889400482, + "step": 7268 + }, + { + "epoch": 1.3232911622826977, + "grad_norm": 8.0625, + "learning_rate": 3.3852481743340006e-06, + "loss": 1.2991750240325928, + "step": 7270 + }, + { + "epoch": 1.32365522890689, + "grad_norm": 6.875, + "learning_rate": 3.384496345362692e-06, + "loss": 1.5770032405853271, + "step": 7272 + }, + { + "epoch": 1.324019295531082, + "grad_norm": 35.5, + "learning_rate": 3.383744459968104e-06, + "loss": 1.4898180961608887, + "step": 7274 + }, + { + "epoch": 1.3243833621552743, + "grad_norm": 19.375, + "learning_rate": 3.3829925182605717e-06, + "loss": 1.7032684087753296, + "step": 7276 + }, + { + "epoch": 1.3247474287794667, + "grad_norm": 24.875, + "learning_rate": 3.3822405203504383e-06, + "loss": 1.516749620437622, + "step": 7278 + }, + { + "epoch": 1.325111495403659, + "grad_norm": 8.375, + "learning_rate": 3.381488466348058e-06, + "loss": 1.2130012512207031, + "step": 7280 + }, + { + "epoch": 1.325475562027851, + "grad_norm": 20.625, + "learning_rate": 3.3807363563637907e-06, + "loss": 1.6764572858810425, + "step": 7282 + }, + { + "epoch": 1.3258396286520433, + "grad_norm": 13.6875, + "learning_rate": 3.379984190508008e-06, + "loss": 1.885354995727539, + "step": 7284 + }, + { + "epoch": 1.3262036952762355, + "grad_norm": 11.375, + "learning_rate": 3.3792319688910843e-06, + "loss": 1.601582407951355, + "step": 7286 + }, + { + "epoch": 1.326567761900428, + "grad_norm": 6.21875, + "learning_rate": 3.378479691623405e-06, + "loss": 1.2295979261398315, + "step": 7288 + }, + { + "epoch": 1.32693182852462, + "grad_norm": 20.375, + "learning_rate": 3.377727358815366e-06, + "loss": 0.8671035170555115, + "step": 7290 + }, + { + "epoch": 1.3272958951488123, + "grad_norm": 28.125, + "learning_rate": 3.376974970577367e-06, + "loss": 0.6735749840736389, + "step": 7292 + }, + { + "epoch": 1.3276599617730045, + "grad_norm": 28.875, + "learning_rate": 3.3762225270198198e-06, + "loss": 1.7925301790237427, + "step": 7294 + }, + { + "epoch": 1.3280240283971967, + "grad_norm": 18.125, + "learning_rate": 3.375470028253141e-06, + "loss": 1.4346858263015747, + "step": 7296 + }, + { + "epoch": 1.3283880950213889, + "grad_norm": 11.625, + "learning_rate": 3.374717474387757e-06, + "loss": 1.4641952514648438, + "step": 7298 + }, + { + "epoch": 1.328752161645581, + "grad_norm": 7.59375, + "learning_rate": 3.3739648655341027e-06, + "loss": 1.3167438507080078, + "step": 7300 + }, + { + "epoch": 1.3291162282697733, + "grad_norm": 6.59375, + "learning_rate": 3.373212201802619e-06, + "loss": 1.076387643814087, + "step": 7302 + }, + { + "epoch": 1.3294802948939655, + "grad_norm": 5.625, + "learning_rate": 3.3724594833037583e-06, + "loss": 0.8748537302017212, + "step": 7304 + }, + { + "epoch": 1.3298443615181579, + "grad_norm": 21.25, + "learning_rate": 3.3717067101479778e-06, + "loss": 1.14386785030365, + "step": 7306 + }, + { + "epoch": 1.33020842814235, + "grad_norm": 20.875, + "learning_rate": 3.3709538824457432e-06, + "loss": 1.7059259414672852, + "step": 7308 + }, + { + "epoch": 1.3305724947665423, + "grad_norm": 17.625, + "learning_rate": 3.3702010003075303e-06, + "loss": 1.6073601245880127, + "step": 7310 + }, + { + "epoch": 1.3309365613907345, + "grad_norm": 14.0, + "learning_rate": 3.369448063843821e-06, + "loss": 1.433451533317566, + "step": 7312 + }, + { + "epoch": 1.3313006280149267, + "grad_norm": 19.875, + "learning_rate": 3.3686950731651048e-06, + "loss": 1.8865445852279663, + "step": 7314 + }, + { + "epoch": 1.331664694639119, + "grad_norm": 11.625, + "learning_rate": 3.3679420283818814e-06, + "loss": 1.3903053998947144, + "step": 7316 + }, + { + "epoch": 1.3320287612633113, + "grad_norm": 22.0, + "learning_rate": 3.3671889296046567e-06, + "loss": 1.2783299684524536, + "step": 7318 + }, + { + "epoch": 1.3323928278875035, + "grad_norm": 30.625, + "learning_rate": 3.366435776943944e-06, + "loss": 0.7890225648880005, + "step": 7320 + }, + { + "epoch": 1.3327568945116957, + "grad_norm": 7.59375, + "learning_rate": 3.3656825705102657e-06, + "loss": 0.903577446937561, + "step": 7322 + }, + { + "epoch": 1.3331209611358878, + "grad_norm": 6.0, + "learning_rate": 3.3649293104141534e-06, + "loss": 1.0759109258651733, + "step": 7324 + }, + { + "epoch": 1.33348502776008, + "grad_norm": 16.625, + "learning_rate": 3.3641759967661435e-06, + "loss": 1.5572078227996826, + "step": 7326 + }, + { + "epoch": 1.3338490943842722, + "grad_norm": 11.3125, + "learning_rate": 3.3634226296767815e-06, + "loss": 1.291471242904663, + "step": 7328 + }, + { + "epoch": 1.3342131610084644, + "grad_norm": 272.0, + "learning_rate": 3.3626692092566214e-06, + "loss": 1.1058037281036377, + "step": 7330 + }, + { + "epoch": 1.3345772276326568, + "grad_norm": 17.25, + "learning_rate": 3.3619157356162245e-06, + "loss": 1.7693865299224854, + "step": 7332 + }, + { + "epoch": 1.334941294256849, + "grad_norm": 6.53125, + "learning_rate": 3.3611622088661605e-06, + "loss": 0.9201260209083557, + "step": 7334 + }, + { + "epoch": 1.3353053608810412, + "grad_norm": 12.75, + "learning_rate": 3.360408629117007e-06, + "loss": 1.304374098777771, + "step": 7336 + }, + { + "epoch": 1.3356694275052334, + "grad_norm": 7.875, + "learning_rate": 3.3596549964793457e-06, + "loss": 1.7526408433914185, + "step": 7338 + }, + { + "epoch": 1.3360334941294256, + "grad_norm": 6.9375, + "learning_rate": 3.3589013110637718e-06, + "loss": 1.1788661479949951, + "step": 7340 + }, + { + "epoch": 1.336397560753618, + "grad_norm": 12.375, + "learning_rate": 3.3581475729808856e-06, + "loss": 1.585633397102356, + "step": 7342 + }, + { + "epoch": 1.3367616273778102, + "grad_norm": 18.75, + "learning_rate": 3.3573937823412945e-06, + "loss": 1.6532328128814697, + "step": 7344 + }, + { + "epoch": 1.3371256940020024, + "grad_norm": 13.0, + "learning_rate": 3.3566399392556137e-06, + "loss": 1.3763723373413086, + "step": 7346 + }, + { + "epoch": 1.3374897606261946, + "grad_norm": 27.625, + "learning_rate": 3.3558860438344674e-06, + "loss": 1.4979712963104248, + "step": 7348 + }, + { + "epoch": 1.3378538272503868, + "grad_norm": 11.125, + "learning_rate": 3.355132096188487e-06, + "loss": 1.8226686716079712, + "step": 7350 + }, + { + "epoch": 1.338217893874579, + "grad_norm": 12.75, + "learning_rate": 3.35437809642831e-06, + "loss": 1.198443055152893, + "step": 7352 + }, + { + "epoch": 1.3385819604987712, + "grad_norm": 13.25, + "learning_rate": 3.3536240446645833e-06, + "loss": 0.7959929704666138, + "step": 7354 + }, + { + "epoch": 1.3389460271229634, + "grad_norm": 20.125, + "learning_rate": 3.3528699410079624e-06, + "loss": 1.5173218250274658, + "step": 7356 + }, + { + "epoch": 1.3393100937471556, + "grad_norm": 22.625, + "learning_rate": 3.3521157855691067e-06, + "loss": 1.6894701719284058, + "step": 7358 + }, + { + "epoch": 1.339674160371348, + "grad_norm": 6.78125, + "learning_rate": 3.351361578458686e-06, + "loss": 1.22219979763031, + "step": 7360 + }, + { + "epoch": 1.3400382269955402, + "grad_norm": 25.875, + "learning_rate": 3.350607319787379e-06, + "loss": 1.163588047027588, + "step": 7362 + }, + { + "epoch": 1.3404022936197324, + "grad_norm": 10.9375, + "learning_rate": 3.349853009665868e-06, + "loss": 1.2345460653305054, + "step": 7364 + }, + { + "epoch": 1.3407663602439246, + "grad_norm": 13.0625, + "learning_rate": 3.349098648204846e-06, + "loss": 1.3027905225753784, + "step": 7366 + }, + { + "epoch": 1.3411304268681168, + "grad_norm": 17.75, + "learning_rate": 3.3483442355150115e-06, + "loss": 1.7057050466537476, + "step": 7368 + }, + { + "epoch": 1.3414944934923092, + "grad_norm": 16.125, + "learning_rate": 3.347589771707072e-06, + "loss": 2.037951946258545, + "step": 7370 + }, + { + "epoch": 1.3418585601165014, + "grad_norm": 6.40625, + "learning_rate": 3.346835256891743e-06, + "loss": 1.3593381643295288, + "step": 7372 + }, + { + "epoch": 1.3422226267406936, + "grad_norm": 23.5, + "learning_rate": 3.346080691179745e-06, + "loss": 1.3068010807037354, + "step": 7374 + }, + { + "epoch": 1.3425866933648858, + "grad_norm": 27.25, + "learning_rate": 3.3453260746818093e-06, + "loss": 2.273064613342285, + "step": 7376 + }, + { + "epoch": 1.342950759989078, + "grad_norm": 30.75, + "learning_rate": 3.34457140750867e-06, + "loss": 1.2065379619598389, + "step": 7378 + }, + { + "epoch": 1.3433148266132702, + "grad_norm": 12.75, + "learning_rate": 3.343816689771074e-06, + "loss": 0.9221192002296448, + "step": 7380 + }, + { + "epoch": 1.3436788932374624, + "grad_norm": 9.0625, + "learning_rate": 3.3430619215797717e-06, + "loss": 1.5145831108093262, + "step": 7382 + }, + { + "epoch": 1.3440429598616546, + "grad_norm": 13.5, + "learning_rate": 3.3423071030455236e-06, + "loss": 1.5732219219207764, + "step": 7384 + }, + { + "epoch": 1.344407026485847, + "grad_norm": 48.75, + "learning_rate": 3.341552234279094e-06, + "loss": 1.4417767524719238, + "step": 7386 + }, + { + "epoch": 1.3447710931100392, + "grad_norm": 16.125, + "learning_rate": 3.340797315391259e-06, + "loss": 1.5921684503555298, + "step": 7388 + }, + { + "epoch": 1.3451351597342314, + "grad_norm": 14.6875, + "learning_rate": 3.340042346492799e-06, + "loss": 1.4667093753814697, + "step": 7390 + }, + { + "epoch": 1.3454992263584236, + "grad_norm": 5.28125, + "learning_rate": 3.3392873276945025e-06, + "loss": 0.9612140655517578, + "step": 7392 + }, + { + "epoch": 1.3458632929826158, + "grad_norm": 20.0, + "learning_rate": 3.3385322591071663e-06, + "loss": 0.946083664894104, + "step": 7394 + }, + { + "epoch": 1.3462273596068082, + "grad_norm": 6.65625, + "learning_rate": 3.3377771408415926e-06, + "loss": 0.8163317441940308, + "step": 7396 + }, + { + "epoch": 1.3465914262310004, + "grad_norm": 10.0625, + "learning_rate": 3.3370219730085923e-06, + "loss": 1.4150222539901733, + "step": 7398 + }, + { + "epoch": 1.3469554928551926, + "grad_norm": 38.25, + "learning_rate": 3.336266755718983e-06, + "loss": 1.4804495573043823, + "step": 7400 + }, + { + "epoch": 1.3473195594793848, + "grad_norm": 51.5, + "learning_rate": 3.3355114890835917e-06, + "loss": 1.352971076965332, + "step": 7402 + }, + { + "epoch": 1.347683626103577, + "grad_norm": 18.625, + "learning_rate": 3.3347561732132473e-06, + "loss": 1.3537341356277466, + "step": 7404 + }, + { + "epoch": 1.3480476927277691, + "grad_norm": 8.3125, + "learning_rate": 3.3340008082187917e-06, + "loss": 1.237910270690918, + "step": 7406 + }, + { + "epoch": 1.3484117593519613, + "grad_norm": 9.875, + "learning_rate": 3.333245394211071e-06, + "loss": 0.7169888615608215, + "step": 7408 + }, + { + "epoch": 1.3487758259761535, + "grad_norm": 5.875, + "learning_rate": 3.3324899313009397e-06, + "loss": 1.3834115266799927, + "step": 7410 + }, + { + "epoch": 1.3491398926003457, + "grad_norm": 5.65625, + "learning_rate": 3.331734419599258e-06, + "loss": 1.1051740646362305, + "step": 7412 + }, + { + "epoch": 1.3495039592245381, + "grad_norm": 20.375, + "learning_rate": 3.3309788592168947e-06, + "loss": 1.2976346015930176, + "step": 7414 + }, + { + "epoch": 1.3498680258487303, + "grad_norm": 11.6875, + "learning_rate": 3.3302232502647246e-06, + "loss": 1.2372989654541016, + "step": 7416 + }, + { + "epoch": 1.3502320924729225, + "grad_norm": 5.03125, + "learning_rate": 3.329467592853631e-06, + "loss": 0.9661650657653809, + "step": 7418 + }, + { + "epoch": 1.3505961590971147, + "grad_norm": 10.875, + "learning_rate": 3.3287118870945043e-06, + "loss": 1.5318711996078491, + "step": 7420 + }, + { + "epoch": 1.350960225721307, + "grad_norm": 10.0625, + "learning_rate": 3.3279561330982403e-06, + "loss": 1.3106411695480347, + "step": 7422 + }, + { + "epoch": 1.3513242923454993, + "grad_norm": 24.375, + "learning_rate": 3.3272003309757415e-06, + "loss": 1.2864681482315063, + "step": 7424 + }, + { + "epoch": 1.3516883589696915, + "grad_norm": 6.375, + "learning_rate": 3.3264444808379214e-06, + "loss": 1.2805230617523193, + "step": 7426 + }, + { + "epoch": 1.3520524255938837, + "grad_norm": 24.125, + "learning_rate": 3.3256885827956965e-06, + "loss": 1.5450854301452637, + "step": 7428 + }, + { + "epoch": 1.352416492218076, + "grad_norm": 11.625, + "learning_rate": 3.324932636959991e-06, + "loss": 1.7074167728424072, + "step": 7430 + }, + { + "epoch": 1.352780558842268, + "grad_norm": 91.0, + "learning_rate": 3.3241766434417386e-06, + "loss": 1.6927354335784912, + "step": 7432 + }, + { + "epoch": 1.3531446254664603, + "grad_norm": 14.625, + "learning_rate": 3.3234206023518776e-06, + "loss": 0.9706486463546753, + "step": 7434 + }, + { + "epoch": 1.3535086920906525, + "grad_norm": 25.625, + "learning_rate": 3.322664513801355e-06, + "loss": 1.6820039749145508, + "step": 7436 + }, + { + "epoch": 1.3538727587148447, + "grad_norm": 91.5, + "learning_rate": 3.3219083779011204e-06, + "loss": 2.1871206760406494, + "step": 7438 + }, + { + "epoch": 1.354236825339037, + "grad_norm": 24.0, + "learning_rate": 3.321152194762137e-06, + "loss": 1.5362269878387451, + "step": 7440 + }, + { + "epoch": 1.3546008919632293, + "grad_norm": 10.75, + "learning_rate": 3.320395964495371e-06, + "loss": 1.5154311656951904, + "step": 7442 + }, + { + "epoch": 1.3549649585874215, + "grad_norm": 14.0625, + "learning_rate": 3.3196396872117943e-06, + "loss": 1.6648980379104614, + "step": 7444 + }, + { + "epoch": 1.3553290252116137, + "grad_norm": 15.125, + "learning_rate": 3.3188833630223905e-06, + "loss": 1.7354077100753784, + "step": 7446 + }, + { + "epoch": 1.3556930918358059, + "grad_norm": 9.25, + "learning_rate": 3.318126992038144e-06, + "loss": 1.5520142316818237, + "step": 7448 + }, + { + "epoch": 1.3560571584599983, + "grad_norm": 11.0625, + "learning_rate": 3.3173705743700517e-06, + "loss": 1.2338981628417969, + "step": 7450 + }, + { + "epoch": 1.3564212250841905, + "grad_norm": 17.0, + "learning_rate": 3.316614110129114e-06, + "loss": 1.5115392208099365, + "step": 7452 + }, + { + "epoch": 1.3567852917083827, + "grad_norm": 16.875, + "learning_rate": 3.3158575994263383e-06, + "loss": 1.394870638847351, + "step": 7454 + }, + { + "epoch": 1.3571493583325749, + "grad_norm": 30.0, + "learning_rate": 3.3151010423727402e-06, + "loss": 1.5799367427825928, + "step": 7456 + }, + { + "epoch": 1.357513424956767, + "grad_norm": 12.9375, + "learning_rate": 3.3143444390793404e-06, + "loss": 1.8644154071807861, + "step": 7458 + }, + { + "epoch": 1.3578774915809593, + "grad_norm": 6.375, + "learning_rate": 3.313587789657169e-06, + "loss": 1.3400384187698364, + "step": 7460 + }, + { + "epoch": 1.3582415582051515, + "grad_norm": 9.0, + "learning_rate": 3.312831094217259e-06, + "loss": 1.5915063619613647, + "step": 7462 + }, + { + "epoch": 1.3586056248293437, + "grad_norm": 10.125, + "learning_rate": 3.3120743528706556e-06, + "loss": 1.1541141271591187, + "step": 7464 + }, + { + "epoch": 1.358969691453536, + "grad_norm": 16.625, + "learning_rate": 3.3113175657284048e-06, + "loss": 0.5486347675323486, + "step": 7466 + }, + { + "epoch": 1.3593337580777283, + "grad_norm": 7.3125, + "learning_rate": 3.310560732901562e-06, + "loss": 1.2431353330612183, + "step": 7468 + }, + { + "epoch": 1.3596978247019205, + "grad_norm": 15.4375, + "learning_rate": 3.3098038545011914e-06, + "loss": 1.6641631126403809, + "step": 7470 + }, + { + "epoch": 1.3600618913261127, + "grad_norm": 12.5, + "learning_rate": 3.30904693063836e-06, + "loss": 1.1571792364120483, + "step": 7472 + }, + { + "epoch": 1.3604259579503049, + "grad_norm": 13.3125, + "learning_rate": 3.308289961424145e-06, + "loss": 1.4395567178726196, + "step": 7474 + }, + { + "epoch": 1.3607900245744973, + "grad_norm": 14.75, + "learning_rate": 3.307532946969627e-06, + "loss": 1.4999630451202393, + "step": 7476 + }, + { + "epoch": 1.3611540911986895, + "grad_norm": 13.75, + "learning_rate": 3.306775887385895e-06, + "loss": 1.7517006397247314, + "step": 7478 + }, + { + "epoch": 1.3615181578228817, + "grad_norm": 34.0, + "learning_rate": 3.306018782784044e-06, + "loss": 1.6111230850219727, + "step": 7480 + }, + { + "epoch": 1.3618822244470739, + "grad_norm": 6.8125, + "learning_rate": 3.3052616332751785e-06, + "loss": 1.0036317110061646, + "step": 7482 + }, + { + "epoch": 1.362246291071266, + "grad_norm": 24.5, + "learning_rate": 3.304504438970404e-06, + "loss": 1.4738763570785522, + "step": 7484 + }, + { + "epoch": 1.3626103576954582, + "grad_norm": 12.375, + "learning_rate": 3.3037471999808383e-06, + "loss": 0.6481763124465942, + "step": 7486 + }, + { + "epoch": 1.3629744243196504, + "grad_norm": 13.75, + "learning_rate": 3.3029899164176015e-06, + "loss": 1.3967748880386353, + "step": 7488 + }, + { + "epoch": 1.3633384909438426, + "grad_norm": 31.25, + "learning_rate": 3.3022325883918226e-06, + "loss": 1.840897798538208, + "step": 7490 + }, + { + "epoch": 1.3637025575680348, + "grad_norm": 10.75, + "learning_rate": 3.301475216014636e-06, + "loss": 1.540225625038147, + "step": 7492 + }, + { + "epoch": 1.3640666241922272, + "grad_norm": 18.25, + "learning_rate": 3.300717799397183e-06, + "loss": 1.142910122871399, + "step": 7494 + }, + { + "epoch": 1.3644306908164194, + "grad_norm": 15.9375, + "learning_rate": 3.299960338650612e-06, + "loss": 0.9440374374389648, + "step": 7496 + }, + { + "epoch": 1.3647947574406116, + "grad_norm": 6.34375, + "learning_rate": 3.2992028338860758e-06, + "loss": 1.3031357526779175, + "step": 7498 + }, + { + "epoch": 1.3651588240648038, + "grad_norm": 3.484375, + "learning_rate": 3.2984452852147376e-06, + "loss": 1.032446265220642, + "step": 7500 + }, + { + "epoch": 1.365522890688996, + "grad_norm": 6.71875, + "learning_rate": 3.297687692747763e-06, + "loss": 1.207587480545044, + "step": 7502 + }, + { + "epoch": 1.3658869573131884, + "grad_norm": 6.6875, + "learning_rate": 3.296930056596326e-06, + "loss": 1.4006855487823486, + "step": 7504 + }, + { + "epoch": 1.3662510239373806, + "grad_norm": 8.1875, + "learning_rate": 3.296172376871607e-06, + "loss": 1.3404649496078491, + "step": 7506 + }, + { + "epoch": 1.3666150905615728, + "grad_norm": 10.9375, + "learning_rate": 3.295414653684791e-06, + "loss": 1.4360241889953613, + "step": 7508 + }, + { + "epoch": 1.366979157185765, + "grad_norm": 7.21875, + "learning_rate": 3.294656887147072e-06, + "loss": 1.2095770835876465, + "step": 7510 + }, + { + "epoch": 1.3673432238099572, + "grad_norm": 29.875, + "learning_rate": 3.2938990773696493e-06, + "loss": 1.4648830890655518, + "step": 7512 + }, + { + "epoch": 1.3677072904341494, + "grad_norm": 20.125, + "learning_rate": 3.293141224463728e-06, + "loss": 1.4174734354019165, + "step": 7514 + }, + { + "epoch": 1.3680713570583416, + "grad_norm": 16.25, + "learning_rate": 3.2923833285405206e-06, + "loss": 1.0182284116744995, + "step": 7516 + }, + { + "epoch": 1.3684354236825338, + "grad_norm": 6.4375, + "learning_rate": 3.2916253897112426e-06, + "loss": 1.3506553173065186, + "step": 7518 + }, + { + "epoch": 1.3687994903067262, + "grad_norm": 8.5, + "learning_rate": 3.290867408087122e-06, + "loss": 1.3320130109786987, + "step": 7520 + }, + { + "epoch": 1.3691635569309184, + "grad_norm": 8.0625, + "learning_rate": 3.2901093837793884e-06, + "loss": 1.3439966440200806, + "step": 7522 + }, + { + "epoch": 1.3695276235551106, + "grad_norm": 12.5625, + "learning_rate": 3.2893513168992773e-06, + "loss": 1.424302339553833, + "step": 7524 + }, + { + "epoch": 1.3698916901793028, + "grad_norm": 23.125, + "learning_rate": 3.2885932075580352e-06, + "loss": 1.4320043325424194, + "step": 7526 + }, + { + "epoch": 1.370255756803495, + "grad_norm": 9.625, + "learning_rate": 3.287835055866907e-06, + "loss": 1.3840255737304688, + "step": 7528 + }, + { + "epoch": 1.3706198234276874, + "grad_norm": 8.875, + "learning_rate": 3.287076861937152e-06, + "loss": 1.166704535484314, + "step": 7530 + }, + { + "epoch": 1.3709838900518796, + "grad_norm": 9.5625, + "learning_rate": 3.2863186258800307e-06, + "loss": 1.2772608995437622, + "step": 7532 + }, + { + "epoch": 1.3713479566760718, + "grad_norm": 12.625, + "learning_rate": 3.2855603478068114e-06, + "loss": 1.3996284008026123, + "step": 7534 + }, + { + "epoch": 1.371712023300264, + "grad_norm": 14.25, + "learning_rate": 3.284802027828769e-06, + "loss": 1.4800236225128174, + "step": 7536 + }, + { + "epoch": 1.3720760899244562, + "grad_norm": 14.6875, + "learning_rate": 3.2840436660571815e-06, + "loss": 1.622351884841919, + "step": 7538 + }, + { + "epoch": 1.3724401565486484, + "grad_norm": 11.0, + "learning_rate": 3.2832852626033383e-06, + "loss": 1.3180458545684814, + "step": 7540 + }, + { + "epoch": 1.3728042231728406, + "grad_norm": 45.0, + "learning_rate": 3.2825268175785312e-06, + "loss": 1.598387360572815, + "step": 7542 + }, + { + "epoch": 1.3731682897970328, + "grad_norm": 6.125, + "learning_rate": 3.2817683310940584e-06, + "loss": 1.235984444618225, + "step": 7544 + }, + { + "epoch": 1.373532356421225, + "grad_norm": 12.25, + "learning_rate": 3.2810098032612246e-06, + "loss": 1.0352967977523804, + "step": 7546 + }, + { + "epoch": 1.3738964230454174, + "grad_norm": 15.1875, + "learning_rate": 3.280251234191341e-06, + "loss": 1.1905550956726074, + "step": 7548 + }, + { + "epoch": 1.3742604896696096, + "grad_norm": 11.75, + "learning_rate": 3.2794926239957246e-06, + "loss": 1.5297881364822388, + "step": 7550 + }, + { + "epoch": 1.3746245562938018, + "grad_norm": 10.0625, + "learning_rate": 3.2787339727856993e-06, + "loss": 1.4633986949920654, + "step": 7552 + }, + { + "epoch": 1.374988622917994, + "grad_norm": 22.25, + "learning_rate": 3.277975280672592e-06, + "loss": 1.7917207479476929, + "step": 7554 + }, + { + "epoch": 1.3753526895421861, + "grad_norm": 23.125, + "learning_rate": 3.2772165477677394e-06, + "loss": 1.7453250885009766, + "step": 7556 + }, + { + "epoch": 1.3757167561663786, + "grad_norm": 13.3125, + "learning_rate": 3.276457774182481e-06, + "loss": 1.842512845993042, + "step": 7558 + }, + { + "epoch": 1.3760808227905708, + "grad_norm": 10.375, + "learning_rate": 3.2756989600281654e-06, + "loss": 1.3612158298492432, + "step": 7560 + }, + { + "epoch": 1.376444889414763, + "grad_norm": 41.25, + "learning_rate": 3.2749401054161446e-06, + "loss": 1.6692206859588623, + "step": 7562 + }, + { + "epoch": 1.3768089560389551, + "grad_norm": 13.5, + "learning_rate": 3.274181210457777e-06, + "loss": 1.8831231594085693, + "step": 7564 + }, + { + "epoch": 1.3771730226631473, + "grad_norm": 15.125, + "learning_rate": 3.2734222752644283e-06, + "loss": 1.5002413988113403, + "step": 7566 + }, + { + "epoch": 1.3775370892873395, + "grad_norm": 14.5, + "learning_rate": 3.272663299947468e-06, + "loss": 1.6364710330963135, + "step": 7568 + }, + { + "epoch": 1.3779011559115317, + "grad_norm": 7.21875, + "learning_rate": 3.2719042846182746e-06, + "loss": 0.9296509027481079, + "step": 7570 + }, + { + "epoch": 1.378265222535724, + "grad_norm": 8.9375, + "learning_rate": 3.2711452293882295e-06, + "loss": 0.6318345665931702, + "step": 7572 + }, + { + "epoch": 1.3786292891599163, + "grad_norm": 3.46875, + "learning_rate": 3.2703861343687206e-06, + "loss": 1.0426054000854492, + "step": 7574 + }, + { + "epoch": 1.3789933557841085, + "grad_norm": 10.375, + "learning_rate": 3.2696269996711417e-06, + "loss": 1.0668225288391113, + "step": 7576 + }, + { + "epoch": 1.3793574224083007, + "grad_norm": 39.75, + "learning_rate": 3.268867825406894e-06, + "loss": 1.3814303874969482, + "step": 7578 + }, + { + "epoch": 1.379721489032493, + "grad_norm": 11.5, + "learning_rate": 3.2681086116873817e-06, + "loss": 1.4663279056549072, + "step": 7580 + }, + { + "epoch": 1.3800855556566851, + "grad_norm": 18.25, + "learning_rate": 3.267349358624018e-06, + "loss": 1.427017331123352, + "step": 7582 + }, + { + "epoch": 1.3804496222808775, + "grad_norm": 11.5625, + "learning_rate": 3.266590066328219e-06, + "loss": 0.8903486132621765, + "step": 7584 + }, + { + "epoch": 1.3808136889050697, + "grad_norm": 14.4375, + "learning_rate": 3.2658307349114083e-06, + "loss": 1.2753279209136963, + "step": 7586 + }, + { + "epoch": 1.381177755529262, + "grad_norm": 13.5625, + "learning_rate": 3.2650713644850142e-06, + "loss": 1.7756521701812744, + "step": 7588 + }, + { + "epoch": 1.3815418221534541, + "grad_norm": 10.9375, + "learning_rate": 3.2643119551604718e-06, + "loss": 1.5580977201461792, + "step": 7590 + }, + { + "epoch": 1.3819058887776463, + "grad_norm": 7.09375, + "learning_rate": 3.2635525070492213e-06, + "loss": 1.588334321975708, + "step": 7592 + }, + { + "epoch": 1.3822699554018385, + "grad_norm": 9.875, + "learning_rate": 3.2627930202627077e-06, + "loss": 1.3302747011184692, + "step": 7594 + }, + { + "epoch": 1.3826340220260307, + "grad_norm": 16.125, + "learning_rate": 3.262033494912385e-06, + "loss": 1.4324854612350464, + "step": 7596 + }, + { + "epoch": 1.382998088650223, + "grad_norm": 15.3125, + "learning_rate": 3.2612739311097073e-06, + "loss": 1.295045256614685, + "step": 7598 + }, + { + "epoch": 1.383362155274415, + "grad_norm": 11.875, + "learning_rate": 3.26051432896614e-06, + "loss": 0.9888509511947632, + "step": 7600 + }, + { + "epoch": 1.3837262218986075, + "grad_norm": 10.4375, + "learning_rate": 3.259754688593151e-06, + "loss": 1.4764946699142456, + "step": 7602 + }, + { + "epoch": 1.3840902885227997, + "grad_norm": 16.5, + "learning_rate": 3.2589950101022127e-06, + "loss": 1.529345154762268, + "step": 7604 + }, + { + "epoch": 1.3844543551469919, + "grad_norm": 12.4375, + "learning_rate": 3.258235293604808e-06, + "loss": 1.0732531547546387, + "step": 7606 + }, + { + "epoch": 1.384818421771184, + "grad_norm": 4.9375, + "learning_rate": 3.257475539212419e-06, + "loss": 1.0899908542633057, + "step": 7608 + }, + { + "epoch": 1.3851824883953763, + "grad_norm": 16.875, + "learning_rate": 3.2567157470365386e-06, + "loss": 1.6812312602996826, + "step": 7610 + }, + { + "epoch": 1.3855465550195687, + "grad_norm": 26.5, + "learning_rate": 3.255955917188663e-06, + "loss": 1.6087589263916016, + "step": 7612 + }, + { + "epoch": 1.3859106216437609, + "grad_norm": 14.4375, + "learning_rate": 3.2551960497802937e-06, + "loss": 1.6894302368164062, + "step": 7614 + }, + { + "epoch": 1.386274688267953, + "grad_norm": 27.875, + "learning_rate": 3.254436144922939e-06, + "loss": 1.8321233987808228, + "step": 7616 + }, + { + "epoch": 1.3866387548921453, + "grad_norm": 10.4375, + "learning_rate": 3.2536762027281092e-06, + "loss": 1.5329599380493164, + "step": 7618 + }, + { + "epoch": 1.3870028215163375, + "grad_norm": 8.3125, + "learning_rate": 3.2529162233073263e-06, + "loss": 1.4889445304870605, + "step": 7620 + }, + { + "epoch": 1.3873668881405297, + "grad_norm": 8.25, + "learning_rate": 3.2521562067721126e-06, + "loss": 1.4516929388046265, + "step": 7622 + }, + { + "epoch": 1.3877309547647219, + "grad_norm": 8.75, + "learning_rate": 3.2513961532339965e-06, + "loss": 1.1148942708969116, + "step": 7624 + }, + { + "epoch": 1.388095021388914, + "grad_norm": 16.125, + "learning_rate": 3.2506360628045153e-06, + "loss": 1.6145505905151367, + "step": 7626 + }, + { + "epoch": 1.3884590880131065, + "grad_norm": 8.1875, + "learning_rate": 3.249875935595206e-06, + "loss": 1.5374705791473389, + "step": 7628 + }, + { + "epoch": 1.3888231546372987, + "grad_norm": 11.375, + "learning_rate": 3.2491157717176157e-06, + "loss": 1.519486665725708, + "step": 7630 + }, + { + "epoch": 1.3891872212614909, + "grad_norm": 17.125, + "learning_rate": 3.248355571283297e-06, + "loss": 1.7875479459762573, + "step": 7632 + }, + { + "epoch": 1.389551287885683, + "grad_norm": 16.625, + "learning_rate": 3.2475953344038037e-06, + "loss": 1.1493209600448608, + "step": 7634 + }, + { + "epoch": 1.3899153545098752, + "grad_norm": 15.5, + "learning_rate": 3.2468350611906997e-06, + "loss": 1.1954141855239868, + "step": 7636 + }, + { + "epoch": 1.3902794211340677, + "grad_norm": 13.125, + "learning_rate": 3.2460747517555493e-06, + "loss": 1.4001014232635498, + "step": 7638 + }, + { + "epoch": 1.3906434877582599, + "grad_norm": 12.9375, + "learning_rate": 3.245314406209926e-06, + "loss": 1.8974583148956299, + "step": 7640 + }, + { + "epoch": 1.391007554382452, + "grad_norm": 16.125, + "learning_rate": 3.2445540246654095e-06, + "loss": 1.4770878553390503, + "step": 7642 + }, + { + "epoch": 1.3913716210066442, + "grad_norm": 7.125, + "learning_rate": 3.2437936072335795e-06, + "loss": 1.1289637088775635, + "step": 7644 + }, + { + "epoch": 1.3917356876308364, + "grad_norm": 7.375, + "learning_rate": 3.2430331540260275e-06, + "loss": 1.5044817924499512, + "step": 7646 + }, + { + "epoch": 1.3920997542550286, + "grad_norm": 13.1875, + "learning_rate": 3.242272665154343e-06, + "loss": 1.4327861070632935, + "step": 7648 + }, + { + "epoch": 1.3924638208792208, + "grad_norm": 7.0625, + "learning_rate": 3.2415121407301274e-06, + "loss": 1.3112051486968994, + "step": 7650 + }, + { + "epoch": 1.392827887503413, + "grad_norm": 15.0, + "learning_rate": 3.2407515808649846e-06, + "loss": 1.1587635278701782, + "step": 7652 + }, + { + "epoch": 1.3931919541276052, + "grad_norm": 20.75, + "learning_rate": 3.2399909856705224e-06, + "loss": 1.9742004871368408, + "step": 7654 + }, + { + "epoch": 1.3935560207517976, + "grad_norm": 7.78125, + "learning_rate": 3.239230355258356e-06, + "loss": 1.7030959129333496, + "step": 7656 + }, + { + "epoch": 1.3939200873759898, + "grad_norm": 17.625, + "learning_rate": 3.2384696897401036e-06, + "loss": 1.2186884880065918, + "step": 7658 + }, + { + "epoch": 1.394284154000182, + "grad_norm": 18.25, + "learning_rate": 3.2377089892273917e-06, + "loss": 1.8685083389282227, + "step": 7660 + }, + { + "epoch": 1.3946482206243742, + "grad_norm": 7.21875, + "learning_rate": 3.2369482538318485e-06, + "loss": 1.0573270320892334, + "step": 7662 + }, + { + "epoch": 1.3950122872485664, + "grad_norm": 10.6875, + "learning_rate": 3.2361874836651085e-06, + "loss": 1.3933029174804688, + "step": 7664 + }, + { + "epoch": 1.3953763538727588, + "grad_norm": 15.3125, + "learning_rate": 3.2354266788388146e-06, + "loss": 1.525225043296814, + "step": 7666 + }, + { + "epoch": 1.395740420496951, + "grad_norm": 18.0, + "learning_rate": 3.234665839464608e-06, + "loss": 0.7856537699699402, + "step": 7668 + }, + { + "epoch": 1.3961044871211432, + "grad_norm": 11.125, + "learning_rate": 3.2339049656541404e-06, + "loss": 0.7443108558654785, + "step": 7670 + }, + { + "epoch": 1.3964685537453354, + "grad_norm": 10.3125, + "learning_rate": 3.2331440575190678e-06, + "loss": 1.477005958557129, + "step": 7672 + }, + { + "epoch": 1.3968326203695276, + "grad_norm": 15.1875, + "learning_rate": 3.2323831151710494e-06, + "loss": 1.56203031539917, + "step": 7674 + }, + { + "epoch": 1.3971966869937198, + "grad_norm": 17.0, + "learning_rate": 3.2316221387217506e-06, + "loss": 1.8665440082550049, + "step": 7676 + }, + { + "epoch": 1.397560753617912, + "grad_norm": 24.375, + "learning_rate": 3.2308611282828415e-06, + "loss": 1.714219570159912, + "step": 7678 + }, + { + "epoch": 1.3979248202421042, + "grad_norm": 14.125, + "learning_rate": 3.2301000839659972e-06, + "loss": 0.9606533050537109, + "step": 7680 + }, + { + "epoch": 1.3982888868662966, + "grad_norm": 11.1875, + "learning_rate": 3.229339005882899e-06, + "loss": 1.4967529773712158, + "step": 7682 + }, + { + "epoch": 1.3986529534904888, + "grad_norm": 10.9375, + "learning_rate": 3.2285778941452297e-06, + "loss": 1.2466202974319458, + "step": 7684 + }, + { + "epoch": 1.399017020114681, + "grad_norm": 6.03125, + "learning_rate": 3.2278167488646826e-06, + "loss": 1.1179149150848389, + "step": 7686 + }, + { + "epoch": 1.3993810867388732, + "grad_norm": 21.75, + "learning_rate": 3.2270555701529496e-06, + "loss": 1.2939362525939941, + "step": 7688 + }, + { + "epoch": 1.3997451533630654, + "grad_norm": 24.625, + "learning_rate": 3.2262943581217313e-06, + "loss": 1.6130499839782715, + "step": 7690 + }, + { + "epoch": 1.4001092199872578, + "grad_norm": 13.5, + "learning_rate": 3.225533112882734e-06, + "loss": 0.6802541017532349, + "step": 7692 + }, + { + "epoch": 1.40047328661145, + "grad_norm": 10.875, + "learning_rate": 3.2247718345476662e-06, + "loss": 1.5096487998962402, + "step": 7694 + }, + { + "epoch": 1.4008373532356422, + "grad_norm": 12.9375, + "learning_rate": 3.2240105232282433e-06, + "loss": 1.4196727275848389, + "step": 7696 + }, + { + "epoch": 1.4012014198598344, + "grad_norm": 8.5, + "learning_rate": 3.2232491790361832e-06, + "loss": 1.2425036430358887, + "step": 7698 + }, + { + "epoch": 1.4015654864840266, + "grad_norm": 3.5625, + "learning_rate": 3.2224878020832105e-06, + "loss": 1.2976609468460083, + "step": 7700 + }, + { + "epoch": 1.4019295531082188, + "grad_norm": 4.90625, + "learning_rate": 3.221726392481055e-06, + "loss": 1.1224021911621094, + "step": 7702 + }, + { + "epoch": 1.402293619732411, + "grad_norm": 9.1875, + "learning_rate": 3.22096495034145e-06, + "loss": 1.3450913429260254, + "step": 7704 + }, + { + "epoch": 1.4026576863566032, + "grad_norm": 23.625, + "learning_rate": 3.2202034757761343e-06, + "loss": 1.555578589439392, + "step": 7706 + }, + { + "epoch": 1.4030217529807956, + "grad_norm": 20.875, + "learning_rate": 3.21944196889685e-06, + "loss": 1.7363042831420898, + "step": 7708 + }, + { + "epoch": 1.4033858196049878, + "grad_norm": 23.625, + "learning_rate": 3.218680429815346e-06, + "loss": 1.8728492259979248, + "step": 7710 + }, + { + "epoch": 1.40374988622918, + "grad_norm": 27.375, + "learning_rate": 3.2179188586433763e-06, + "loss": 1.9928492307662964, + "step": 7712 + }, + { + "epoch": 1.4041139528533721, + "grad_norm": 9.0625, + "learning_rate": 3.2171572554926966e-06, + "loss": 1.4951566457748413, + "step": 7714 + }, + { + "epoch": 1.4044780194775643, + "grad_norm": 10.3125, + "learning_rate": 3.2163956204750703e-06, + "loss": 1.5097541809082031, + "step": 7716 + }, + { + "epoch": 1.4048420861017568, + "grad_norm": 6.46875, + "learning_rate": 3.215633953702263e-06, + "loss": 1.2711541652679443, + "step": 7718 + }, + { + "epoch": 1.405206152725949, + "grad_norm": 11.3125, + "learning_rate": 3.2148722552860466e-06, + "loss": 1.2776074409484863, + "step": 7720 + }, + { + "epoch": 1.4055702193501411, + "grad_norm": 123.0, + "learning_rate": 3.214110525338199e-06, + "loss": 1.2997921705245972, + "step": 7722 + }, + { + "epoch": 1.4059342859743333, + "grad_norm": 18.125, + "learning_rate": 3.2133487639704983e-06, + "loss": 1.13933265209198, + "step": 7724 + }, + { + "epoch": 1.4062983525985255, + "grad_norm": 9.375, + "learning_rate": 3.2125869712947313e-06, + "loss": 1.4762831926345825, + "step": 7726 + }, + { + "epoch": 1.4066624192227177, + "grad_norm": 9.75, + "learning_rate": 3.211825147422688e-06, + "loss": 1.1391164064407349, + "step": 7728 + }, + { + "epoch": 1.40702648584691, + "grad_norm": 13.3125, + "learning_rate": 3.2110632924661623e-06, + "loss": 1.6492605209350586, + "step": 7730 + }, + { + "epoch": 1.4073905524711021, + "grad_norm": 10.3125, + "learning_rate": 3.2103014065369543e-06, + "loss": 1.4995006322860718, + "step": 7732 + }, + { + "epoch": 1.4077546190952943, + "grad_norm": 12.8125, + "learning_rate": 3.209539489746867e-06, + "loss": 1.4288798570632935, + "step": 7734 + }, + { + "epoch": 1.4081186857194867, + "grad_norm": 12.5, + "learning_rate": 3.2087775422077087e-06, + "loss": 1.4374701976776123, + "step": 7736 + }, + { + "epoch": 1.408482752343679, + "grad_norm": 8.875, + "learning_rate": 3.2080155640312925e-06, + "loss": 1.3923237323760986, + "step": 7738 + }, + { + "epoch": 1.4088468189678711, + "grad_norm": 23.75, + "learning_rate": 3.2072535553294348e-06, + "loss": 1.6418286561965942, + "step": 7740 + }, + { + "epoch": 1.4092108855920633, + "grad_norm": 12.3125, + "learning_rate": 3.2064915162139574e-06, + "loss": 1.6202239990234375, + "step": 7742 + }, + { + "epoch": 1.4095749522162555, + "grad_norm": 4.625, + "learning_rate": 3.2057294467966882e-06, + "loss": 1.4111257791519165, + "step": 7744 + }, + { + "epoch": 1.409939018840448, + "grad_norm": 7.1875, + "learning_rate": 3.204967347189456e-06, + "loss": 1.1084165573120117, + "step": 7746 + }, + { + "epoch": 1.4103030854646401, + "grad_norm": 9.625, + "learning_rate": 3.2042052175040955e-06, + "loss": 1.6573485136032104, + "step": 7748 + }, + { + "epoch": 1.4106671520888323, + "grad_norm": 18.5, + "learning_rate": 3.203443057852448e-06, + "loss": 1.5492092370986938, + "step": 7750 + }, + { + "epoch": 1.4110312187130245, + "grad_norm": 54.75, + "learning_rate": 3.202680868346355e-06, + "loss": 1.3896853923797607, + "step": 7752 + }, + { + "epoch": 1.4113952853372167, + "grad_norm": 18.375, + "learning_rate": 3.2019186490976667e-06, + "loss": 1.7841663360595703, + "step": 7754 + }, + { + "epoch": 1.411759351961409, + "grad_norm": 8.3125, + "learning_rate": 3.201156400218235e-06, + "loss": 1.4492237567901611, + "step": 7756 + }, + { + "epoch": 1.412123418585601, + "grad_norm": 11.3125, + "learning_rate": 3.2003941218199165e-06, + "loss": 1.2060620784759521, + "step": 7758 + }, + { + "epoch": 1.4124874852097933, + "grad_norm": 13.8125, + "learning_rate": 3.1996318140145726e-06, + "loss": 1.7670797109603882, + "step": 7760 + }, + { + "epoch": 1.4128515518339857, + "grad_norm": 8.75, + "learning_rate": 3.1988694769140695e-06, + "loss": 1.3916168212890625, + "step": 7762 + }, + { + "epoch": 1.413215618458178, + "grad_norm": 13.8125, + "learning_rate": 3.1981071106302765e-06, + "loss": 1.255885124206543, + "step": 7764 + }, + { + "epoch": 1.41357968508237, + "grad_norm": 14.4375, + "learning_rate": 3.197344715275068e-06, + "loss": 1.4936281442642212, + "step": 7766 + }, + { + "epoch": 1.4139437517065623, + "grad_norm": 9.75, + "learning_rate": 3.196582290960322e-06, + "loss": 1.6796693801879883, + "step": 7768 + }, + { + "epoch": 1.4143078183307545, + "grad_norm": 14.5625, + "learning_rate": 3.195819837797921e-06, + "loss": 1.650527000427246, + "step": 7770 + }, + { + "epoch": 1.414671884954947, + "grad_norm": 9.1875, + "learning_rate": 3.1950573558997532e-06, + "loss": 1.208836317062378, + "step": 7772 + }, + { + "epoch": 1.415035951579139, + "grad_norm": 14.5, + "learning_rate": 3.194294845377709e-06, + "loss": 1.3354949951171875, + "step": 7774 + }, + { + "epoch": 1.4154000182033313, + "grad_norm": 20.5, + "learning_rate": 3.193532306343683e-06, + "loss": 1.5038433074951172, + "step": 7776 + }, + { + "epoch": 1.4157640848275235, + "grad_norm": 20.5, + "learning_rate": 3.1927697389095756e-06, + "loss": 1.6682833433151245, + "step": 7778 + }, + { + "epoch": 1.4161281514517157, + "grad_norm": 6.90625, + "learning_rate": 3.1920071431872903e-06, + "loss": 1.2686760425567627, + "step": 7780 + }, + { + "epoch": 1.4164922180759079, + "grad_norm": 11.75, + "learning_rate": 3.1912445192887348e-06, + "loss": 1.4714316129684448, + "step": 7782 + }, + { + "epoch": 1.4168562847001, + "grad_norm": 30.25, + "learning_rate": 3.1904818673258216e-06, + "loss": 1.4260897636413574, + "step": 7784 + }, + { + "epoch": 1.4172203513242922, + "grad_norm": 7.375, + "learning_rate": 3.189719187410466e-06, + "loss": 1.0889663696289062, + "step": 7786 + }, + { + "epoch": 1.4175844179484844, + "grad_norm": 9.1875, + "learning_rate": 3.188956479654588e-06, + "loss": 1.4181095361709595, + "step": 7788 + }, + { + "epoch": 1.4179484845726769, + "grad_norm": 4.4375, + "learning_rate": 3.1881937441701126e-06, + "loss": 1.2934268712997437, + "step": 7790 + }, + { + "epoch": 1.418312551196869, + "grad_norm": 50.25, + "learning_rate": 3.1874309810689686e-06, + "loss": 1.2611124515533447, + "step": 7792 + }, + { + "epoch": 1.4186766178210612, + "grad_norm": 6.5, + "learning_rate": 3.1866681904630877e-06, + "loss": 1.3409913778305054, + "step": 7794 + }, + { + "epoch": 1.4190406844452534, + "grad_norm": 31.0, + "learning_rate": 3.185905372464405e-06, + "loss": 1.5416288375854492, + "step": 7796 + }, + { + "epoch": 1.4194047510694456, + "grad_norm": 26.75, + "learning_rate": 3.185142527184864e-06, + "loss": 1.68964684009552, + "step": 7798 + }, + { + "epoch": 1.419768817693638, + "grad_norm": 13.75, + "learning_rate": 3.184379654736407e-06, + "loss": 1.438903570175171, + "step": 7800 + }, + { + "epoch": 1.4201328843178302, + "grad_norm": 22.25, + "learning_rate": 3.1836167552309827e-06, + "loss": 1.4650827646255493, + "step": 7802 + }, + { + "epoch": 1.4204969509420224, + "grad_norm": 8.4375, + "learning_rate": 3.1828538287805433e-06, + "loss": 1.2642356157302856, + "step": 7804 + }, + { + "epoch": 1.4208610175662146, + "grad_norm": 15.4375, + "learning_rate": 3.1820908754970457e-06, + "loss": 0.9210084676742554, + "step": 7806 + }, + { + "epoch": 1.4212250841904068, + "grad_norm": 7.84375, + "learning_rate": 3.1813278954924506e-06, + "loss": 1.082798957824707, + "step": 7808 + }, + { + "epoch": 1.421589150814599, + "grad_norm": 34.0, + "learning_rate": 3.180564888878721e-06, + "loss": 1.328012466430664, + "step": 7810 + }, + { + "epoch": 1.4219532174387912, + "grad_norm": 12.875, + "learning_rate": 3.1798018557678257e-06, + "loss": 0.6483087539672852, + "step": 7812 + }, + { + "epoch": 1.4223172840629834, + "grad_norm": 13.1875, + "learning_rate": 3.179038796271737e-06, + "loss": 1.42759108543396, + "step": 7814 + }, + { + "epoch": 1.4226813506871758, + "grad_norm": 5.125, + "learning_rate": 3.178275710502431e-06, + "loss": 1.2675269842147827, + "step": 7816 + }, + { + "epoch": 1.423045417311368, + "grad_norm": 3.65625, + "learning_rate": 3.1775125985718864e-06, + "loss": 0.97649085521698, + "step": 7818 + }, + { + "epoch": 1.4234094839355602, + "grad_norm": 12.125, + "learning_rate": 3.1767494605920877e-06, + "loss": 1.5684438943862915, + "step": 7820 + }, + { + "epoch": 1.4237735505597524, + "grad_norm": 11.75, + "learning_rate": 3.175986296675022e-06, + "loss": 1.8689886331558228, + "step": 7822 + }, + { + "epoch": 1.4241376171839446, + "grad_norm": 15.5625, + "learning_rate": 3.17522310693268e-06, + "loss": 1.0635731220245361, + "step": 7824 + }, + { + "epoch": 1.424501683808137, + "grad_norm": 15.1875, + "learning_rate": 3.1744598914770576e-06, + "loss": 1.0783973932266235, + "step": 7826 + }, + { + "epoch": 1.4248657504323292, + "grad_norm": 18.5, + "learning_rate": 3.1736966504201526e-06, + "loss": 1.7604498863220215, + "step": 7828 + }, + { + "epoch": 1.4252298170565214, + "grad_norm": 12.75, + "learning_rate": 3.172933383873969e-06, + "loss": 1.5512090921401978, + "step": 7830 + }, + { + "epoch": 1.4255938836807136, + "grad_norm": 11.0625, + "learning_rate": 3.1721700919505115e-06, + "loss": 1.5707006454467773, + "step": 7832 + }, + { + "epoch": 1.4259579503049058, + "grad_norm": 20.0, + "learning_rate": 3.1714067747617906e-06, + "loss": 0.6973305344581604, + "step": 7834 + }, + { + "epoch": 1.426322016929098, + "grad_norm": 24.0, + "learning_rate": 3.170643432419821e-06, + "loss": 0.4689348340034485, + "step": 7836 + }, + { + "epoch": 1.4266860835532902, + "grad_norm": 28.5, + "learning_rate": 3.169880065036618e-06, + "loss": 0.9014610052108765, + "step": 7838 + }, + { + "epoch": 1.4270501501774824, + "grad_norm": 10.0625, + "learning_rate": 3.169116672724205e-06, + "loss": 1.4136472940444946, + "step": 7840 + }, + { + "epoch": 1.4274142168016746, + "grad_norm": 16.25, + "learning_rate": 3.1683532555946052e-06, + "loss": 1.6703163385391235, + "step": 7842 + }, + { + "epoch": 1.427778283425867, + "grad_norm": 8.9375, + "learning_rate": 3.167589813759847e-06, + "loss": 1.8979815244674683, + "step": 7844 + }, + { + "epoch": 1.4281423500500592, + "grad_norm": 12.625, + "learning_rate": 3.166826347331964e-06, + "loss": 1.4808622598648071, + "step": 7846 + }, + { + "epoch": 1.4285064166742514, + "grad_norm": 19.375, + "learning_rate": 3.16606285642299e-06, + "loss": 1.4716413021087646, + "step": 7848 + }, + { + "epoch": 1.4288704832984436, + "grad_norm": 5.46875, + "learning_rate": 3.165299341144964e-06, + "loss": 1.107454538345337, + "step": 7850 + }, + { + "epoch": 1.4292345499226358, + "grad_norm": 6.21875, + "learning_rate": 3.1645358016099303e-06, + "loss": 0.9434410333633423, + "step": 7852 + }, + { + "epoch": 1.4295986165468282, + "grad_norm": 7.5, + "learning_rate": 3.163772237929935e-06, + "loss": 1.400646448135376, + "step": 7854 + }, + { + "epoch": 1.4299626831710204, + "grad_norm": 7.28125, + "learning_rate": 3.1630086502170266e-06, + "loss": 1.440133810043335, + "step": 7856 + }, + { + "epoch": 1.4303267497952126, + "grad_norm": 6.1875, + "learning_rate": 3.16224503858326e-06, + "loss": 1.3168493509292603, + "step": 7858 + }, + { + "epoch": 1.4306908164194048, + "grad_norm": 7.5625, + "learning_rate": 3.1614814031406914e-06, + "loss": 1.3910198211669922, + "step": 7860 + }, + { + "epoch": 1.431054883043597, + "grad_norm": 6.875, + "learning_rate": 3.1607177440013816e-06, + "loss": 1.4033300876617432, + "step": 7862 + }, + { + "epoch": 1.4314189496677892, + "grad_norm": 4.3125, + "learning_rate": 3.159954061277394e-06, + "loss": 1.3206058740615845, + "step": 7864 + }, + { + "epoch": 1.4317830162919813, + "grad_norm": 20.625, + "learning_rate": 3.1591903550807955e-06, + "loss": 1.4612363576889038, + "step": 7866 + }, + { + "epoch": 1.4321470829161735, + "grad_norm": 6.59375, + "learning_rate": 3.1584266255236582e-06, + "loss": 1.1208163499832153, + "step": 7868 + }, + { + "epoch": 1.432511149540366, + "grad_norm": 26.875, + "learning_rate": 3.157662872718055e-06, + "loss": 0.44327694177627563, + "step": 7870 + }, + { + "epoch": 1.4328752161645582, + "grad_norm": 13.375, + "learning_rate": 3.156899096776065e-06, + "loss": 1.3511675596237183, + "step": 7872 + }, + { + "epoch": 1.4332392827887503, + "grad_norm": 3.875, + "learning_rate": 3.156135297809768e-06, + "loss": 1.378699541091919, + "step": 7874 + }, + { + "epoch": 1.4336033494129425, + "grad_norm": 15.875, + "learning_rate": 3.155371475931249e-06, + "loss": 1.449173927307129, + "step": 7876 + }, + { + "epoch": 1.4339674160371347, + "grad_norm": 14.75, + "learning_rate": 3.1546076312525955e-06, + "loss": 1.5970392227172852, + "step": 7878 + }, + { + "epoch": 1.4343314826613272, + "grad_norm": 12.25, + "learning_rate": 3.153843763885899e-06, + "loss": 1.430195689201355, + "step": 7880 + }, + { + "epoch": 1.4346955492855193, + "grad_norm": 20.25, + "learning_rate": 3.1530798739432526e-06, + "loss": 1.3880562782287598, + "step": 7882 + }, + { + "epoch": 1.4350596159097115, + "grad_norm": 12.4375, + "learning_rate": 3.152315961536756e-06, + "loss": 1.7094707489013672, + "step": 7884 + }, + { + "epoch": 1.4354236825339037, + "grad_norm": 10.5625, + "learning_rate": 3.1515520267785095e-06, + "loss": 2.0341827869415283, + "step": 7886 + }, + { + "epoch": 1.435787749158096, + "grad_norm": 10.8125, + "learning_rate": 3.150788069780616e-06, + "loss": 1.6291069984436035, + "step": 7888 + }, + { + "epoch": 1.4361518157822881, + "grad_norm": 7.09375, + "learning_rate": 3.150024090655186e-06, + "loss": 1.4164053201675415, + "step": 7890 + }, + { + "epoch": 1.4365158824064803, + "grad_norm": 8.25, + "learning_rate": 3.1492600895143278e-06, + "loss": 1.459969401359558, + "step": 7892 + }, + { + "epoch": 1.4368799490306725, + "grad_norm": 28.0, + "learning_rate": 3.1484960664701557e-06, + "loss": 1.5984312295913696, + "step": 7894 + }, + { + "epoch": 1.4372440156548647, + "grad_norm": 11.625, + "learning_rate": 3.1477320216347885e-06, + "loss": 1.398792028427124, + "step": 7896 + }, + { + "epoch": 1.4376080822790571, + "grad_norm": 18.25, + "learning_rate": 3.1469679551203456e-06, + "loss": 1.5217562913894653, + "step": 7898 + }, + { + "epoch": 1.4379721489032493, + "grad_norm": 18.875, + "learning_rate": 3.146203867038951e-06, + "loss": 1.6921648979187012, + "step": 7900 + }, + { + "epoch": 1.4383362155274415, + "grad_norm": 6.53125, + "learning_rate": 3.145439757502732e-06, + "loss": 1.1033227443695068, + "step": 7902 + }, + { + "epoch": 1.4387002821516337, + "grad_norm": 14.25, + "learning_rate": 3.144675626623817e-06, + "loss": 1.2729337215423584, + "step": 7904 + }, + { + "epoch": 1.439064348775826, + "grad_norm": 25.125, + "learning_rate": 3.143911474514341e-06, + "loss": 1.6983637809753418, + "step": 7906 + }, + { + "epoch": 1.4394284154000183, + "grad_norm": 7.71875, + "learning_rate": 3.143147301286438e-06, + "loss": 1.3148442506790161, + "step": 7908 + }, + { + "epoch": 1.4397924820242105, + "grad_norm": 11.1875, + "learning_rate": 3.1423831070522497e-06, + "loss": 1.95399808883667, + "step": 7910 + }, + { + "epoch": 1.4401565486484027, + "grad_norm": 14.75, + "learning_rate": 3.141618891923918e-06, + "loss": 1.3040627241134644, + "step": 7912 + }, + { + "epoch": 1.440520615272595, + "grad_norm": 9.6875, + "learning_rate": 3.1408546560135865e-06, + "loss": 1.00283682346344, + "step": 7914 + }, + { + "epoch": 1.440884681896787, + "grad_norm": 17.0, + "learning_rate": 3.1400903994334054e-06, + "loss": 2.068880558013916, + "step": 7916 + }, + { + "epoch": 1.4412487485209793, + "grad_norm": 212.0, + "learning_rate": 3.1393261222955263e-06, + "loss": 1.2730417251586914, + "step": 7918 + }, + { + "epoch": 1.4416128151451715, + "grad_norm": 6.96875, + "learning_rate": 3.1385618247121035e-06, + "loss": 1.3510453701019287, + "step": 7920 + }, + { + "epoch": 1.4419768817693637, + "grad_norm": 7.125, + "learning_rate": 3.137797506795295e-06, + "loss": 1.0491981506347656, + "step": 7922 + }, + { + "epoch": 1.442340948393556, + "grad_norm": 19.125, + "learning_rate": 3.1370331686572597e-06, + "loss": 1.7947553396224976, + "step": 7924 + }, + { + "epoch": 1.4427050150177483, + "grad_norm": 8.75, + "learning_rate": 3.1362688104101622e-06, + "loss": 1.1688612699508667, + "step": 7926 + }, + { + "epoch": 1.4430690816419405, + "grad_norm": 10.375, + "learning_rate": 3.13550443216617e-06, + "loss": 1.1563042402267456, + "step": 7928 + }, + { + "epoch": 1.4434331482661327, + "grad_norm": 8.4375, + "learning_rate": 3.134740034037451e-06, + "loss": 1.5420507192611694, + "step": 7930 + }, + { + "epoch": 1.4437972148903249, + "grad_norm": 21.875, + "learning_rate": 3.133975616136178e-06, + "loss": 1.727454423904419, + "step": 7932 + }, + { + "epoch": 1.4441612815145173, + "grad_norm": 57.75, + "learning_rate": 3.1332111785745255e-06, + "loss": 1.6875288486480713, + "step": 7934 + }, + { + "epoch": 1.4445253481387095, + "grad_norm": 3.796875, + "learning_rate": 3.1324467214646736e-06, + "loss": 1.2898133993148804, + "step": 7936 + }, + { + "epoch": 1.4448894147629017, + "grad_norm": 5.09375, + "learning_rate": 3.131682244918802e-06, + "loss": 1.3041505813598633, + "step": 7938 + }, + { + "epoch": 1.4452534813870939, + "grad_norm": 6.21875, + "learning_rate": 3.1309177490490943e-06, + "loss": 1.2383619546890259, + "step": 7940 + }, + { + "epoch": 1.445617548011286, + "grad_norm": 13.6875, + "learning_rate": 3.1301532339677375e-06, + "loss": 1.3182579278945923, + "step": 7942 + }, + { + "epoch": 1.4459816146354783, + "grad_norm": 15.25, + "learning_rate": 3.129388699786922e-06, + "loss": 1.2915605306625366, + "step": 7944 + }, + { + "epoch": 1.4463456812596704, + "grad_norm": 9.5, + "learning_rate": 3.1286241466188377e-06, + "loss": 2.1337482929229736, + "step": 7946 + }, + { + "epoch": 1.4467097478838626, + "grad_norm": 9.5625, + "learning_rate": 3.127859574575681e-06, + "loss": 1.3303313255310059, + "step": 7948 + }, + { + "epoch": 1.447073814508055, + "grad_norm": 13.375, + "learning_rate": 3.1270949837696508e-06, + "loss": 1.4784893989562988, + "step": 7950 + }, + { + "epoch": 1.4474378811322473, + "grad_norm": 12.5, + "learning_rate": 3.126330374312947e-06, + "loss": 1.3828274011611938, + "step": 7952 + }, + { + "epoch": 1.4478019477564394, + "grad_norm": 8.75, + "learning_rate": 3.1255657463177723e-06, + "loss": 1.4443020820617676, + "step": 7954 + }, + { + "epoch": 1.4481660143806316, + "grad_norm": 7.90625, + "learning_rate": 3.1248010998963336e-06, + "loss": 1.1901321411132812, + "step": 7956 + }, + { + "epoch": 1.4485300810048238, + "grad_norm": 16.625, + "learning_rate": 3.1240364351608386e-06, + "loss": 1.2194774150848389, + "step": 7958 + }, + { + "epoch": 1.448894147629016, + "grad_norm": 14.3125, + "learning_rate": 3.1232717522235e-06, + "loss": 1.8177173137664795, + "step": 7960 + }, + { + "epoch": 1.4492582142532084, + "grad_norm": 4.90625, + "learning_rate": 3.122507051196531e-06, + "loss": 1.1955132484436035, + "step": 7962 + }, + { + "epoch": 1.4496222808774006, + "grad_norm": 11.0625, + "learning_rate": 3.1217423321921494e-06, + "loss": 1.3393421173095703, + "step": 7964 + }, + { + "epoch": 1.4499863475015928, + "grad_norm": 7.84375, + "learning_rate": 3.120977595322573e-06, + "loss": 1.3808822631835938, + "step": 7966 + }, + { + "epoch": 1.450350414125785, + "grad_norm": 12.4375, + "learning_rate": 3.1202128407000255e-06, + "loss": 1.3850477933883667, + "step": 7968 + }, + { + "epoch": 1.4507144807499772, + "grad_norm": 12.0625, + "learning_rate": 3.1194480684367302e-06, + "loss": 1.4826576709747314, + "step": 7970 + }, + { + "epoch": 1.4510785473741694, + "grad_norm": 10.5, + "learning_rate": 3.1186832786449152e-06, + "loss": 1.3932586908340454, + "step": 7972 + }, + { + "epoch": 1.4514426139983616, + "grad_norm": 7.4375, + "learning_rate": 3.1179184714368094e-06, + "loss": 1.2970638275146484, + "step": 7974 + }, + { + "epoch": 1.4518066806225538, + "grad_norm": 21.25, + "learning_rate": 3.1171536469246468e-06, + "loss": 1.4657469987869263, + "step": 7976 + }, + { + "epoch": 1.4521707472467462, + "grad_norm": 7.40625, + "learning_rate": 3.116388805220661e-06, + "loss": 1.4523301124572754, + "step": 7978 + }, + { + "epoch": 1.4525348138709384, + "grad_norm": 7.125, + "learning_rate": 3.1156239464370895e-06, + "loss": 1.1368095874786377, + "step": 7980 + }, + { + "epoch": 1.4528988804951306, + "grad_norm": 16.125, + "learning_rate": 3.1148590706861725e-06, + "loss": 1.4664783477783203, + "step": 7982 + }, + { + "epoch": 1.4532629471193228, + "grad_norm": 9.1875, + "learning_rate": 3.1140941780801524e-06, + "loss": 1.066304326057434, + "step": 7984 + }, + { + "epoch": 1.453627013743515, + "grad_norm": 8.1875, + "learning_rate": 3.113329268731274e-06, + "loss": 1.450404405593872, + "step": 7986 + }, + { + "epoch": 1.4539910803677074, + "grad_norm": 15.625, + "learning_rate": 3.112564342751785e-06, + "loss": 1.0457264184951782, + "step": 7988 + }, + { + "epoch": 1.4543551469918996, + "grad_norm": 32.75, + "learning_rate": 3.111799400253934e-06, + "loss": 0.5165541172027588, + "step": 7990 + }, + { + "epoch": 1.4547192136160918, + "grad_norm": 17.5, + "learning_rate": 3.111034441349975e-06, + "loss": 1.372770071029663, + "step": 7992 + }, + { + "epoch": 1.455083280240284, + "grad_norm": 14.5625, + "learning_rate": 3.1102694661521615e-06, + "loss": 1.4857652187347412, + "step": 7994 + }, + { + "epoch": 1.4554473468644762, + "grad_norm": 6.65625, + "learning_rate": 3.109504474772751e-06, + "loss": 1.6825917959213257, + "step": 7996 + }, + { + "epoch": 1.4558114134886684, + "grad_norm": 8.5625, + "learning_rate": 3.1087394673240025e-06, + "loss": 1.1532573699951172, + "step": 7998 + }, + { + "epoch": 1.4561754801128606, + "grad_norm": 12.625, + "learning_rate": 3.107974443918177e-06, + "loss": 1.6383496522903442, + "step": 8000 + }, + { + "epoch": 1.4565395467370528, + "grad_norm": 9.8125, + "learning_rate": 3.1072094046675406e-06, + "loss": 2.055211067199707, + "step": 8002 + }, + { + "epoch": 1.4569036133612452, + "grad_norm": 23.5, + "learning_rate": 3.1064443496843576e-06, + "loss": 1.3486170768737793, + "step": 8004 + }, + { + "epoch": 1.4572676799854374, + "grad_norm": 12.125, + "learning_rate": 3.105679279080898e-06, + "loss": 1.635927677154541, + "step": 8006 + }, + { + "epoch": 1.4576317466096296, + "grad_norm": 24.375, + "learning_rate": 3.1049141929694325e-06, + "loss": 1.377577543258667, + "step": 8008 + }, + { + "epoch": 1.4579958132338218, + "grad_norm": 18.375, + "learning_rate": 3.1041490914622347e-06, + "loss": 1.2570985555648804, + "step": 8010 + }, + { + "epoch": 1.458359879858014, + "grad_norm": 3.796875, + "learning_rate": 3.1033839746715793e-06, + "loss": 1.173210620880127, + "step": 8012 + }, + { + "epoch": 1.4587239464822064, + "grad_norm": 5.21875, + "learning_rate": 3.1026188427097447e-06, + "loss": 1.577376365661621, + "step": 8014 + }, + { + "epoch": 1.4590880131063986, + "grad_norm": 11.5625, + "learning_rate": 3.101853695689011e-06, + "loss": 1.3778821229934692, + "step": 8016 + }, + { + "epoch": 1.4594520797305908, + "grad_norm": 5.71875, + "learning_rate": 3.1010885337216603e-06, + "loss": 1.2397392988204956, + "step": 8018 + }, + { + "epoch": 1.459816146354783, + "grad_norm": 16.375, + "learning_rate": 3.100323356919977e-06, + "loss": 1.34848952293396, + "step": 8020 + }, + { + "epoch": 1.4601802129789752, + "grad_norm": 12.5, + "learning_rate": 3.099558165396248e-06, + "loss": 1.145046591758728, + "step": 8022 + }, + { + "epoch": 1.4605442796031674, + "grad_norm": 13.1875, + "learning_rate": 3.098792959262761e-06, + "loss": 1.056328296661377, + "step": 8024 + }, + { + "epoch": 1.4609083462273595, + "grad_norm": 32.5, + "learning_rate": 3.098027738631808e-06, + "loss": 1.5916104316711426, + "step": 8026 + }, + { + "epoch": 1.4612724128515517, + "grad_norm": 9.0625, + "learning_rate": 3.097262503615681e-06, + "loss": 1.547664999961853, + "step": 8028 + }, + { + "epoch": 1.461636479475744, + "grad_norm": 8.0625, + "learning_rate": 3.096497254326676e-06, + "loss": 1.4686845541000366, + "step": 8030 + }, + { + "epoch": 1.4620005460999363, + "grad_norm": 6.53125, + "learning_rate": 3.09573199087709e-06, + "loss": 1.4751832485198975, + "step": 8032 + }, + { + "epoch": 1.4623646127241285, + "grad_norm": 9.0625, + "learning_rate": 3.094966713379223e-06, + "loss": 1.283064842224121, + "step": 8034 + }, + { + "epoch": 1.4627286793483207, + "grad_norm": 29.375, + "learning_rate": 3.0942014219453755e-06, + "loss": 1.478798508644104, + "step": 8036 + }, + { + "epoch": 1.463092745972513, + "grad_norm": 11.4375, + "learning_rate": 3.09343611668785e-06, + "loss": 1.4340828657150269, + "step": 8038 + }, + { + "epoch": 1.4634568125967051, + "grad_norm": 11.3125, + "learning_rate": 3.0926707977189543e-06, + "loss": 1.4324859380722046, + "step": 8040 + }, + { + "epoch": 1.4638208792208975, + "grad_norm": 13.875, + "learning_rate": 3.091905465150994e-06, + "loss": 1.4226700067520142, + "step": 8042 + }, + { + "epoch": 1.4641849458450897, + "grad_norm": 13.125, + "learning_rate": 3.091140119096279e-06, + "loss": 1.43776273727417, + "step": 8044 + }, + { + "epoch": 1.464549012469282, + "grad_norm": 4.90625, + "learning_rate": 3.090374759667121e-06, + "loss": 1.3603628873825073, + "step": 8046 + }, + { + "epoch": 1.4649130790934741, + "grad_norm": 5.5625, + "learning_rate": 3.0896093869758336e-06, + "loss": 1.3050345182418823, + "step": 8048 + }, + { + "epoch": 1.4652771457176663, + "grad_norm": 8.5, + "learning_rate": 3.088844001134732e-06, + "loss": 1.2661861181259155, + "step": 8050 + }, + { + "epoch": 1.4656412123418585, + "grad_norm": 84.5, + "learning_rate": 3.088078602256133e-06, + "loss": 2.0722508430480957, + "step": 8052 + }, + { + "epoch": 1.4660052789660507, + "grad_norm": 6.59375, + "learning_rate": 3.0873131904523557e-06, + "loss": 1.0848398208618164, + "step": 8054 + }, + { + "epoch": 1.466369345590243, + "grad_norm": 9.5, + "learning_rate": 3.086547765835721e-06, + "loss": 1.4588651657104492, + "step": 8056 + }, + { + "epoch": 1.4667334122144353, + "grad_norm": 6.0625, + "learning_rate": 3.085782328518554e-06, + "loss": 1.3222707509994507, + "step": 8058 + }, + { + "epoch": 1.4670974788386275, + "grad_norm": 6.09375, + "learning_rate": 3.0850168786131766e-06, + "loss": 1.1119022369384766, + "step": 8060 + }, + { + "epoch": 1.4674615454628197, + "grad_norm": 14.75, + "learning_rate": 3.0842514162319183e-06, + "loss": 1.4139639139175415, + "step": 8062 + }, + { + "epoch": 1.467825612087012, + "grad_norm": 5.125, + "learning_rate": 3.0834859414871044e-06, + "loss": 1.336667776107788, + "step": 8064 + }, + { + "epoch": 1.468189678711204, + "grad_norm": 11.6875, + "learning_rate": 3.0827204544910682e-06, + "loss": 1.0249202251434326, + "step": 8066 + }, + { + "epoch": 1.4685537453353965, + "grad_norm": 9.25, + "learning_rate": 3.0819549553561397e-06, + "loss": 1.8818554878234863, + "step": 8068 + }, + { + "epoch": 1.4689178119595887, + "grad_norm": 22.75, + "learning_rate": 3.081189444194654e-06, + "loss": 1.6467548608779907, + "step": 8070 + }, + { + "epoch": 1.469281878583781, + "grad_norm": 13.0625, + "learning_rate": 3.0804239211189468e-06, + "loss": 1.161402702331543, + "step": 8072 + }, + { + "epoch": 1.469645945207973, + "grad_norm": 8.125, + "learning_rate": 3.0796583862413544e-06, + "loss": 1.1921584606170654, + "step": 8074 + }, + { + "epoch": 1.4700100118321653, + "grad_norm": 12.25, + "learning_rate": 3.078892839674217e-06, + "loss": 1.2623369693756104, + "step": 8076 + }, + { + "epoch": 1.4703740784563575, + "grad_norm": 29.375, + "learning_rate": 3.0781272815298746e-06, + "loss": 1.339120864868164, + "step": 8078 + }, + { + "epoch": 1.4707381450805497, + "grad_norm": 25.125, + "learning_rate": 3.077361711920671e-06, + "loss": 1.0708491802215576, + "step": 8080 + }, + { + "epoch": 1.4711022117047419, + "grad_norm": 12.6875, + "learning_rate": 3.0765961309589493e-06, + "loss": 1.5739586353302002, + "step": 8082 + }, + { + "epoch": 1.471466278328934, + "grad_norm": 10.6875, + "learning_rate": 3.0758305387570563e-06, + "loss": 1.372029185295105, + "step": 8084 + }, + { + "epoch": 1.4718303449531265, + "grad_norm": 54.5, + "learning_rate": 3.0750649354273387e-06, + "loss": 1.2132720947265625, + "step": 8086 + }, + { + "epoch": 1.4721944115773187, + "grad_norm": 2.65625, + "learning_rate": 3.074299321082146e-06, + "loss": 1.1051228046417236, + "step": 8088 + }, + { + "epoch": 1.4725584782015109, + "grad_norm": 6.625, + "learning_rate": 3.0735336958338284e-06, + "loss": 1.4189369678497314, + "step": 8090 + }, + { + "epoch": 1.472922544825703, + "grad_norm": 42.5, + "learning_rate": 3.0727680597947396e-06, + "loss": 1.394530177116394, + "step": 8092 + }, + { + "epoch": 1.4732866114498953, + "grad_norm": 5.40625, + "learning_rate": 3.072002413077233e-06, + "loss": 1.2035231590270996, + "step": 8094 + }, + { + "epoch": 1.4736506780740877, + "grad_norm": 3.546875, + "learning_rate": 3.071236755793664e-06, + "loss": 1.2046468257904053, + "step": 8096 + }, + { + "epoch": 1.4740147446982799, + "grad_norm": 5.90625, + "learning_rate": 3.0704710880563893e-06, + "loss": 0.879520833492279, + "step": 8098 + }, + { + "epoch": 1.474378811322472, + "grad_norm": 27.625, + "learning_rate": 3.069705409977769e-06, + "loss": 0.37783581018447876, + "step": 8100 + }, + { + "epoch": 1.4747428779466643, + "grad_norm": 20.875, + "learning_rate": 3.068939721670162e-06, + "loss": 0.9105134606361389, + "step": 8102 + }, + { + "epoch": 1.4751069445708564, + "grad_norm": 5.5625, + "learning_rate": 3.0681740232459297e-06, + "loss": 1.276327133178711, + "step": 8104 + }, + { + "epoch": 1.4754710111950486, + "grad_norm": 10.8125, + "learning_rate": 3.067408314817436e-06, + "loss": 1.510435700416565, + "step": 8106 + }, + { + "epoch": 1.4758350778192408, + "grad_norm": 15.3125, + "learning_rate": 3.0666425964970452e-06, + "loss": 1.6176692247390747, + "step": 8108 + }, + { + "epoch": 1.476199144443433, + "grad_norm": 5.78125, + "learning_rate": 3.065876868397124e-06, + "loss": 1.3561370372772217, + "step": 8110 + }, + { + "epoch": 1.4765632110676254, + "grad_norm": 4.59375, + "learning_rate": 3.0651111306300384e-06, + "loss": 0.8761101961135864, + "step": 8112 + }, + { + "epoch": 1.4769272776918176, + "grad_norm": 8.3125, + "learning_rate": 3.0643453833081587e-06, + "loss": 1.4022860527038574, + "step": 8114 + }, + { + "epoch": 1.4772913443160098, + "grad_norm": 10.3125, + "learning_rate": 3.0635796265438555e-06, + "loss": 1.4284610748291016, + "step": 8116 + }, + { + "epoch": 1.477655410940202, + "grad_norm": 13.75, + "learning_rate": 3.062813860449499e-06, + "loss": 1.4287099838256836, + "step": 8118 + }, + { + "epoch": 1.4780194775643942, + "grad_norm": 19.625, + "learning_rate": 3.0620480851374623e-06, + "loss": 1.8255910873413086, + "step": 8120 + }, + { + "epoch": 1.4783835441885866, + "grad_norm": 8.9375, + "learning_rate": 3.061282300720122e-06, + "loss": 1.334941029548645, + "step": 8122 + }, + { + "epoch": 1.4787476108127788, + "grad_norm": 9.0625, + "learning_rate": 3.060516507309852e-06, + "loss": 1.1309086084365845, + "step": 8124 + }, + { + "epoch": 1.479111677436971, + "grad_norm": 8.0, + "learning_rate": 3.0597507050190296e-06, + "loss": 1.3888471126556396, + "step": 8126 + }, + { + "epoch": 1.4794757440611632, + "grad_norm": 15.375, + "learning_rate": 3.058984893960033e-06, + "loss": 1.4849586486816406, + "step": 8128 + }, + { + "epoch": 1.4798398106853554, + "grad_norm": 4.28125, + "learning_rate": 3.0582190742452433e-06, + "loss": 1.0041438341140747, + "step": 8130 + }, + { + "epoch": 1.4802038773095476, + "grad_norm": 3.78125, + "learning_rate": 3.057453245987039e-06, + "loss": 0.9366267919540405, + "step": 8132 + }, + { + "epoch": 1.4805679439337398, + "grad_norm": 9.4375, + "learning_rate": 3.0566874092978048e-06, + "loss": 1.234405517578125, + "step": 8134 + }, + { + "epoch": 1.480932010557932, + "grad_norm": 5.375, + "learning_rate": 3.055921564289923e-06, + "loss": 1.4830735921859741, + "step": 8136 + }, + { + "epoch": 1.4812960771821242, + "grad_norm": 6.0, + "learning_rate": 3.055155711075778e-06, + "loss": 1.4721479415893555, + "step": 8138 + }, + { + "epoch": 1.4816601438063166, + "grad_norm": 9.25, + "learning_rate": 3.054389849767756e-06, + "loss": 1.367739200592041, + "step": 8140 + }, + { + "epoch": 1.4820242104305088, + "grad_norm": 18.75, + "learning_rate": 3.053623980478243e-06, + "loss": 1.5807201862335205, + "step": 8142 + }, + { + "epoch": 1.482388277054701, + "grad_norm": 36.5, + "learning_rate": 3.0528581033196298e-06, + "loss": 1.1614680290222168, + "step": 8144 + }, + { + "epoch": 1.4827523436788932, + "grad_norm": 6.21875, + "learning_rate": 3.0520922184043036e-06, + "loss": 1.2902480363845825, + "step": 8146 + }, + { + "epoch": 1.4831164103030854, + "grad_norm": 8.9375, + "learning_rate": 3.0513263258446545e-06, + "loss": 1.444229245185852, + "step": 8148 + }, + { + "epoch": 1.4834804769272778, + "grad_norm": 8.4375, + "learning_rate": 3.0505604257530762e-06, + "loss": 1.3943898677825928, + "step": 8150 + }, + { + "epoch": 1.48384454355147, + "grad_norm": 32.75, + "learning_rate": 3.04979451824196e-06, + "loss": 1.3271794319152832, + "step": 8152 + }, + { + "epoch": 1.4842086101756622, + "grad_norm": 11.4375, + "learning_rate": 3.0490286034237003e-06, + "loss": 1.4934171438217163, + "step": 8154 + }, + { + "epoch": 1.4845726767998544, + "grad_norm": 11.5, + "learning_rate": 3.048262681410691e-06, + "loss": 1.3100519180297852, + "step": 8156 + }, + { + "epoch": 1.4849367434240466, + "grad_norm": 13.9375, + "learning_rate": 3.0474967523153293e-06, + "loss": 1.5533921718597412, + "step": 8158 + }, + { + "epoch": 1.4853008100482388, + "grad_norm": 28.75, + "learning_rate": 3.046730816250012e-06, + "loss": 1.5534807443618774, + "step": 8160 + }, + { + "epoch": 1.485664876672431, + "grad_norm": 11.5625, + "learning_rate": 3.0459648733271365e-06, + "loss": 1.6323692798614502, + "step": 8162 + }, + { + "epoch": 1.4860289432966232, + "grad_norm": 13.9375, + "learning_rate": 3.045198923659102e-06, + "loss": 1.491867184638977, + "step": 8164 + }, + { + "epoch": 1.4863930099208156, + "grad_norm": 16.25, + "learning_rate": 3.0444329673583084e-06, + "loss": 1.481719732284546, + "step": 8166 + }, + { + "epoch": 1.4867570765450078, + "grad_norm": 18.625, + "learning_rate": 3.0436670045371573e-06, + "loss": 1.4221787452697754, + "step": 8168 + }, + { + "epoch": 1.4871211431692, + "grad_norm": 12.0625, + "learning_rate": 3.04290103530805e-06, + "loss": 1.4206541776657104, + "step": 8170 + }, + { + "epoch": 1.4874852097933922, + "grad_norm": 10.5625, + "learning_rate": 3.0421350597833897e-06, + "loss": 1.2737412452697754, + "step": 8172 + }, + { + "epoch": 1.4878492764175844, + "grad_norm": 13.0, + "learning_rate": 3.0413690780755804e-06, + "loss": 1.2841761112213135, + "step": 8174 + }, + { + "epoch": 1.4882133430417768, + "grad_norm": 10.6875, + "learning_rate": 3.0406030902970262e-06, + "loss": 0.9190031290054321, + "step": 8176 + }, + { + "epoch": 1.488577409665969, + "grad_norm": 15.375, + "learning_rate": 3.039837096560133e-06, + "loss": 1.3743256330490112, + "step": 8178 + }, + { + "epoch": 1.4889414762901612, + "grad_norm": 54.75, + "learning_rate": 3.0390710969773075e-06, + "loss": 1.454878807067871, + "step": 8180 + }, + { + "epoch": 1.4893055429143534, + "grad_norm": 14.375, + "learning_rate": 3.038305091660957e-06, + "loss": 1.7485191822052002, + "step": 8182 + }, + { + "epoch": 1.4896696095385455, + "grad_norm": 22.125, + "learning_rate": 3.0375390807234894e-06, + "loss": 1.5463649034500122, + "step": 8184 + }, + { + "epoch": 1.4900336761627377, + "grad_norm": 27.375, + "learning_rate": 3.036773064277314e-06, + "loss": 1.2868549823760986, + "step": 8186 + }, + { + "epoch": 1.49039774278693, + "grad_norm": 23.0, + "learning_rate": 3.036007042434841e-06, + "loss": 0.8589935898780823, + "step": 8188 + }, + { + "epoch": 1.4907618094111221, + "grad_norm": 4.625, + "learning_rate": 3.0352410153084792e-06, + "loss": 1.306921124458313, + "step": 8190 + }, + { + "epoch": 1.4911258760353143, + "grad_norm": 6.28125, + "learning_rate": 3.0344749830106424e-06, + "loss": 1.117292881011963, + "step": 8192 + }, + { + "epoch": 1.4914899426595067, + "grad_norm": 16.25, + "learning_rate": 3.0337089456537405e-06, + "loss": 1.467806100845337, + "step": 8194 + }, + { + "epoch": 1.491854009283699, + "grad_norm": 9.25, + "learning_rate": 3.0329429033501877e-06, + "loss": 1.3650826215744019, + "step": 8196 + }, + { + "epoch": 1.4922180759078911, + "grad_norm": 27.125, + "learning_rate": 3.0321768562123976e-06, + "loss": 1.2112196683883667, + "step": 8198 + }, + { + "epoch": 1.4925821425320833, + "grad_norm": 8.0625, + "learning_rate": 3.031410804352784e-06, + "loss": 1.5436772108078003, + "step": 8200 + }, + { + "epoch": 1.4929462091562755, + "grad_norm": 24.25, + "learning_rate": 3.0306447478837625e-06, + "loss": 1.4654940366744995, + "step": 8202 + }, + { + "epoch": 1.493310275780468, + "grad_norm": 7.125, + "learning_rate": 3.0298786869177487e-06, + "loss": 1.1727949380874634, + "step": 8204 + }, + { + "epoch": 1.4936743424046601, + "grad_norm": 29.375, + "learning_rate": 3.0291126215671575e-06, + "loss": 1.5292565822601318, + "step": 8206 + }, + { + "epoch": 1.4940384090288523, + "grad_norm": 32.5, + "learning_rate": 3.028346551944408e-06, + "loss": 1.246259093284607, + "step": 8208 + }, + { + "epoch": 1.4944024756530445, + "grad_norm": 9.875, + "learning_rate": 3.027580478161917e-06, + "loss": 0.7279324531555176, + "step": 8210 + }, + { + "epoch": 1.4947665422772367, + "grad_norm": 17.75, + "learning_rate": 3.0268144003321023e-06, + "loss": 1.494260549545288, + "step": 8212 + }, + { + "epoch": 1.495130608901429, + "grad_norm": 6.5625, + "learning_rate": 3.026048318567383e-06, + "loss": 1.555148959159851, + "step": 8214 + }, + { + "epoch": 1.495494675525621, + "grad_norm": 6.53125, + "learning_rate": 3.0252822329801785e-06, + "loss": 1.1693923473358154, + "step": 8216 + }, + { + "epoch": 1.4958587421498133, + "grad_norm": 9.5, + "learning_rate": 3.0245161436829083e-06, + "loss": 1.3028727769851685, + "step": 8218 + }, + { + "epoch": 1.4962228087740057, + "grad_norm": 8.5, + "learning_rate": 3.0237500507879948e-06, + "loss": 1.2423659563064575, + "step": 8220 + }, + { + "epoch": 1.496586875398198, + "grad_norm": 15.5625, + "learning_rate": 3.0229839544078567e-06, + "loss": 1.7578924894332886, + "step": 8222 + }, + { + "epoch": 1.49695094202239, + "grad_norm": 14.125, + "learning_rate": 3.0222178546549164e-06, + "loss": 1.4031219482421875, + "step": 8224 + }, + { + "epoch": 1.4973150086465823, + "grad_norm": 15.3125, + "learning_rate": 3.0214517516415955e-06, + "loss": 1.277764916419983, + "step": 8226 + }, + { + "epoch": 1.4976790752707745, + "grad_norm": 10.8125, + "learning_rate": 3.0206856454803173e-06, + "loss": 1.088913917541504, + "step": 8228 + }, + { + "epoch": 1.498043141894967, + "grad_norm": 9.6875, + "learning_rate": 3.0199195362835047e-06, + "loss": 1.5067713260650635, + "step": 8230 + }, + { + "epoch": 1.498407208519159, + "grad_norm": 9.4375, + "learning_rate": 3.019153424163581e-06, + "loss": 1.273909330368042, + "step": 8232 + }, + { + "epoch": 1.4987712751433513, + "grad_norm": 29.5, + "learning_rate": 3.0183873092329694e-06, + "loss": 1.5521973371505737, + "step": 8234 + }, + { + "epoch": 1.4991353417675435, + "grad_norm": 7.53125, + "learning_rate": 3.0176211916040955e-06, + "loss": 1.6984868049621582, + "step": 8236 + }, + { + "epoch": 1.4994994083917357, + "grad_norm": 13.5, + "learning_rate": 3.0168550713893824e-06, + "loss": 1.213895320892334, + "step": 8238 + }, + { + "epoch": 1.4998634750159279, + "grad_norm": 9.625, + "learning_rate": 3.0160889487012556e-06, + "loss": 1.7921996116638184, + "step": 8240 + }, + { + "epoch": 1.50022754164012, + "grad_norm": 17.625, + "learning_rate": 3.0153228236521413e-06, + "loss": 1.859833002090454, + "step": 8242 + }, + { + "epoch": 1.5005916082643123, + "grad_norm": 9.8125, + "learning_rate": 3.0145566963544654e-06, + "loss": 1.4426655769348145, + "step": 8244 + }, + { + "epoch": 1.5009556748885045, + "grad_norm": 8.8125, + "learning_rate": 3.0137905669206525e-06, + "loss": 1.1333223581314087, + "step": 8246 + }, + { + "epoch": 1.5013197415126969, + "grad_norm": 15.8125, + "learning_rate": 3.01302443546313e-06, + "loss": 1.4960157871246338, + "step": 8248 + }, + { + "epoch": 1.501683808136889, + "grad_norm": 22.875, + "learning_rate": 3.012258302094324e-06, + "loss": 1.5881778001785278, + "step": 8250 + }, + { + "epoch": 1.5020478747610813, + "grad_norm": 20.125, + "learning_rate": 3.011492166926662e-06, + "loss": 1.6777312755584717, + "step": 8252 + }, + { + "epoch": 1.5024119413852735, + "grad_norm": 13.4375, + "learning_rate": 3.0107260300725717e-06, + "loss": 1.4614934921264648, + "step": 8254 + }, + { + "epoch": 1.5027760080094659, + "grad_norm": 6.5, + "learning_rate": 3.0099598916444793e-06, + "loss": 1.0745995044708252, + "step": 8256 + }, + { + "epoch": 1.503140074633658, + "grad_norm": 20.625, + "learning_rate": 3.0091937517548137e-06, + "loss": 1.2333183288574219, + "step": 8258 + }, + { + "epoch": 1.5035041412578503, + "grad_norm": 11.125, + "learning_rate": 3.0084276105160013e-06, + "loss": 1.2908381223678589, + "step": 8260 + }, + { + "epoch": 1.5038682078820425, + "grad_norm": 12.0625, + "learning_rate": 3.0076614680404725e-06, + "loss": 1.2935439348220825, + "step": 8262 + }, + { + "epoch": 1.5042322745062346, + "grad_norm": 5.75, + "learning_rate": 3.0068953244406536e-06, + "loss": 1.1082379817962646, + "step": 8264 + }, + { + "epoch": 1.5045963411304268, + "grad_norm": 11.125, + "learning_rate": 3.0061291798289738e-06, + "loss": 1.4589749574661255, + "step": 8266 + }, + { + "epoch": 1.504960407754619, + "grad_norm": 12.3125, + "learning_rate": 3.005363034317862e-06, + "loss": 1.7751314640045166, + "step": 8268 + }, + { + "epoch": 1.5053244743788112, + "grad_norm": 11.9375, + "learning_rate": 3.004596888019746e-06, + "loss": 1.367035984992981, + "step": 8270 + }, + { + "epoch": 1.5056885410030034, + "grad_norm": 9.0625, + "learning_rate": 3.0038307410470556e-06, + "loss": 0.9916867017745972, + "step": 8272 + }, + { + "epoch": 1.5060526076271956, + "grad_norm": 8.125, + "learning_rate": 3.0030645935122198e-06, + "loss": 1.3437137603759766, + "step": 8274 + }, + { + "epoch": 1.506416674251388, + "grad_norm": 21.75, + "learning_rate": 3.002298445527667e-06, + "loss": 1.4305222034454346, + "step": 8276 + }, + { + "epoch": 1.5067807408755802, + "grad_norm": 10.9375, + "learning_rate": 3.001532297205827e-06, + "loss": 1.4024221897125244, + "step": 8278 + }, + { + "epoch": 1.5071448074997724, + "grad_norm": 14.5625, + "learning_rate": 3.000766148659129e-06, + "loss": 1.1061606407165527, + "step": 8280 + }, + { + "epoch": 1.5075088741239648, + "grad_norm": 9.75, + "learning_rate": 3.0000000000000005e-06, + "loss": 1.2565230131149292, + "step": 8282 + }, + { + "epoch": 1.507872940748157, + "grad_norm": 8.6875, + "learning_rate": 2.9992338513408724e-06, + "loss": 1.5496501922607422, + "step": 8284 + }, + { + "epoch": 1.5082370073723492, + "grad_norm": 10.5625, + "learning_rate": 2.998467702794174e-06, + "loss": 1.5667330026626587, + "step": 8286 + }, + { + "epoch": 1.5086010739965414, + "grad_norm": 7.3125, + "learning_rate": 2.9977015544723336e-06, + "loss": 1.2874367237091064, + "step": 8288 + }, + { + "epoch": 1.5089651406207336, + "grad_norm": 13.3125, + "learning_rate": 2.9969354064877817e-06, + "loss": 1.5023891925811768, + "step": 8290 + }, + { + "epoch": 1.5093292072449258, + "grad_norm": 8.625, + "learning_rate": 2.996169258952945e-06, + "loss": 1.387244462966919, + "step": 8292 + }, + { + "epoch": 1.509693273869118, + "grad_norm": 7.78125, + "learning_rate": 2.995403111980254e-06, + "loss": 1.2260160446166992, + "step": 8294 + }, + { + "epoch": 1.5100573404933102, + "grad_norm": 16.125, + "learning_rate": 2.9946369656821396e-06, + "loss": 0.8720483779907227, + "step": 8296 + }, + { + "epoch": 1.5104214071175024, + "grad_norm": 12.3125, + "learning_rate": 2.9938708201710272e-06, + "loss": 1.3226685523986816, + "step": 8298 + }, + { + "epoch": 1.5107854737416946, + "grad_norm": 29.375, + "learning_rate": 2.993104675559348e-06, + "loss": 1.5295064449310303, + "step": 8300 + }, + { + "epoch": 1.511149540365887, + "grad_norm": 27.625, + "learning_rate": 2.992338531959529e-06, + "loss": 1.7971018552780151, + "step": 8302 + }, + { + "epoch": 1.5115136069900792, + "grad_norm": 11.375, + "learning_rate": 2.991572389483999e-06, + "loss": 1.0405447483062744, + "step": 8304 + }, + { + "epoch": 1.5118776736142714, + "grad_norm": 5.40625, + "learning_rate": 2.9908062482451873e-06, + "loss": 1.3780726194381714, + "step": 8306 + }, + { + "epoch": 1.5122417402384638, + "grad_norm": 11.3125, + "learning_rate": 2.9900401083555213e-06, + "loss": 0.9142655730247498, + "step": 8308 + }, + { + "epoch": 1.512605806862656, + "grad_norm": 27.75, + "learning_rate": 2.9892739699274297e-06, + "loss": 0.9862573742866516, + "step": 8310 + }, + { + "epoch": 1.5129698734868482, + "grad_norm": 10.3125, + "learning_rate": 2.9885078330733384e-06, + "loss": 1.7985886335372925, + "step": 8312 + }, + { + "epoch": 1.5133339401110404, + "grad_norm": 30.25, + "learning_rate": 2.987741697905676e-06, + "loss": 1.7422081232070923, + "step": 8314 + }, + { + "epoch": 1.5136980067352326, + "grad_norm": 19.75, + "learning_rate": 2.9869755645368713e-06, + "loss": 2.1116433143615723, + "step": 8316 + }, + { + "epoch": 1.5140620733594248, + "grad_norm": 19.25, + "learning_rate": 2.9862094330793485e-06, + "loss": 2.140390157699585, + "step": 8318 + }, + { + "epoch": 1.514426139983617, + "grad_norm": 22.625, + "learning_rate": 2.9854433036455364e-06, + "loss": 1.7981786727905273, + "step": 8320 + }, + { + "epoch": 1.5147902066078092, + "grad_norm": 39.5, + "learning_rate": 2.984677176347859e-06, + "loss": 1.5796661376953125, + "step": 8322 + }, + { + "epoch": 1.5151542732320014, + "grad_norm": 12.1875, + "learning_rate": 2.9839110512987446e-06, + "loss": 0.5388069748878479, + "step": 8324 + }, + { + "epoch": 1.5155183398561936, + "grad_norm": 12.0, + "learning_rate": 2.9831449286106186e-06, + "loss": 1.4176826477050781, + "step": 8326 + }, + { + "epoch": 1.5158824064803857, + "grad_norm": 14.9375, + "learning_rate": 2.982378808395906e-06, + "loss": 1.4480212926864624, + "step": 8328 + }, + { + "epoch": 1.5162464731045782, + "grad_norm": 8.8125, + "learning_rate": 2.9816126907670316e-06, + "loss": 1.5152921676635742, + "step": 8330 + }, + { + "epoch": 1.5166105397287704, + "grad_norm": 22.875, + "learning_rate": 2.98084657583642e-06, + "loss": 1.5244202613830566, + "step": 8332 + }, + { + "epoch": 1.5169746063529626, + "grad_norm": 20.375, + "learning_rate": 2.980080463716496e-06, + "loss": 0.9391945600509644, + "step": 8334 + }, + { + "epoch": 1.517338672977155, + "grad_norm": 13.75, + "learning_rate": 2.9793143545196833e-06, + "loss": 1.6006522178649902, + "step": 8336 + }, + { + "epoch": 1.5177027396013472, + "grad_norm": 5.65625, + "learning_rate": 2.9785482483584055e-06, + "loss": 1.3516886234283447, + "step": 8338 + }, + { + "epoch": 1.5180668062255394, + "grad_norm": 5.4375, + "learning_rate": 2.9777821453450855e-06, + "loss": 1.4529610872268677, + "step": 8340 + }, + { + "epoch": 1.5184308728497315, + "grad_norm": 10.0625, + "learning_rate": 2.9770160455921448e-06, + "loss": 0.9903704524040222, + "step": 8342 + }, + { + "epoch": 1.5187949394739237, + "grad_norm": 17.875, + "learning_rate": 2.9762499492120062e-06, + "loss": 1.4350577592849731, + "step": 8344 + }, + { + "epoch": 1.519159006098116, + "grad_norm": 8.3125, + "learning_rate": 2.9754838563170923e-06, + "loss": 1.452880859375, + "step": 8346 + }, + { + "epoch": 1.5195230727223081, + "grad_norm": 5.84375, + "learning_rate": 2.9747177670198225e-06, + "loss": 1.024656057357788, + "step": 8348 + }, + { + "epoch": 1.5198871393465003, + "grad_norm": 9.625, + "learning_rate": 2.9739516814326185e-06, + "loss": 1.3598967790603638, + "step": 8350 + }, + { + "epoch": 1.5202512059706925, + "grad_norm": 10.6875, + "learning_rate": 2.9731855996678987e-06, + "loss": 1.1387789249420166, + "step": 8352 + }, + { + "epoch": 1.5206152725948847, + "grad_norm": 32.75, + "learning_rate": 2.9724195218380837e-06, + "loss": 1.6009889841079712, + "step": 8354 + }, + { + "epoch": 1.5209793392190771, + "grad_norm": 11.6875, + "learning_rate": 2.9716534480555924e-06, + "loss": 1.5074760913848877, + "step": 8356 + }, + { + "epoch": 1.5213434058432693, + "grad_norm": 13.9375, + "learning_rate": 2.970887378432843e-06, + "loss": 1.5610768795013428, + "step": 8358 + }, + { + "epoch": 1.5217074724674615, + "grad_norm": 11.75, + "learning_rate": 2.970121313082253e-06, + "loss": 1.690721869468689, + "step": 8360 + }, + { + "epoch": 1.522071539091654, + "grad_norm": 17.5, + "learning_rate": 2.969355252116238e-06, + "loss": 1.8271783590316772, + "step": 8362 + }, + { + "epoch": 1.5224356057158461, + "grad_norm": 11.75, + "learning_rate": 2.9685891956472163e-06, + "loss": 1.4870573282241821, + "step": 8364 + }, + { + "epoch": 1.5227996723400383, + "grad_norm": 29.125, + "learning_rate": 2.967823143787604e-06, + "loss": 1.5060391426086426, + "step": 8366 + }, + { + "epoch": 1.5231637389642305, + "grad_norm": 12.5, + "learning_rate": 2.9670570966498138e-06, + "loss": 1.550807237625122, + "step": 8368 + }, + { + "epoch": 1.5235278055884227, + "grad_norm": 10.625, + "learning_rate": 2.966291054346261e-06, + "loss": 1.4581334590911865, + "step": 8370 + }, + { + "epoch": 1.523891872212615, + "grad_norm": 42.5, + "learning_rate": 2.965525016989359e-06, + "loss": 1.057798147201538, + "step": 8372 + }, + { + "epoch": 1.524255938836807, + "grad_norm": 13.75, + "learning_rate": 2.9647589846915214e-06, + "loss": 1.9072461128234863, + "step": 8374 + }, + { + "epoch": 1.5246200054609993, + "grad_norm": 10.4375, + "learning_rate": 2.9639929575651604e-06, + "loss": 1.8109527826309204, + "step": 8376 + }, + { + "epoch": 1.5249840720851915, + "grad_norm": 10.375, + "learning_rate": 2.9632269357226866e-06, + "loss": 1.144439935684204, + "step": 8378 + }, + { + "epoch": 1.5253481387093837, + "grad_norm": 10.875, + "learning_rate": 2.962460919276512e-06, + "loss": 1.5001131296157837, + "step": 8380 + }, + { + "epoch": 1.525712205333576, + "grad_norm": 39.5, + "learning_rate": 2.9616949083390433e-06, + "loss": 1.561955213546753, + "step": 8382 + }, + { + "epoch": 1.5260762719577683, + "grad_norm": 9.4375, + "learning_rate": 2.9609289030226927e-06, + "loss": 1.3002409934997559, + "step": 8384 + }, + { + "epoch": 1.5264403385819605, + "grad_norm": 22.875, + "learning_rate": 2.960162903439868e-06, + "loss": 1.0219875574111938, + "step": 8386 + }, + { + "epoch": 1.5268044052061527, + "grad_norm": 233.0, + "learning_rate": 2.9593969097029743e-06, + "loss": 1.0393669605255127, + "step": 8388 + }, + { + "epoch": 1.527168471830345, + "grad_norm": 10.25, + "learning_rate": 2.9586309219244214e-06, + "loss": 1.3701894283294678, + "step": 8390 + }, + { + "epoch": 1.5275325384545373, + "grad_norm": 5.03125, + "learning_rate": 2.9578649402166117e-06, + "loss": 1.283165693283081, + "step": 8392 + }, + { + "epoch": 1.5278966050787295, + "grad_norm": 14.125, + "learning_rate": 2.95709896469195e-06, + "loss": 1.4996963739395142, + "step": 8394 + }, + { + "epoch": 1.5282606717029217, + "grad_norm": 10.9375, + "learning_rate": 2.956332995462844e-06, + "loss": 1.4978924989700317, + "step": 8396 + }, + { + "epoch": 1.5286247383271139, + "grad_norm": 25.125, + "learning_rate": 2.9555670326416922e-06, + "loss": 1.7476577758789062, + "step": 8398 + }, + { + "epoch": 1.528988804951306, + "grad_norm": 16.375, + "learning_rate": 2.9548010763408996e-06, + "loss": 1.6362634897232056, + "step": 8400 + }, + { + "epoch": 1.5293528715754983, + "grad_norm": 16.125, + "learning_rate": 2.954035126672865e-06, + "loss": 1.8842966556549072, + "step": 8402 + }, + { + "epoch": 1.5297169381996905, + "grad_norm": 13.6875, + "learning_rate": 2.9532691837499887e-06, + "loss": 1.5491468906402588, + "step": 8404 + }, + { + "epoch": 1.5300810048238827, + "grad_norm": 12.9375, + "learning_rate": 2.9525032476846717e-06, + "loss": 1.4289557933807373, + "step": 8406 + }, + { + "epoch": 1.5304450714480748, + "grad_norm": 10.3125, + "learning_rate": 2.9517373185893093e-06, + "loss": 1.4095780849456787, + "step": 8408 + }, + { + "epoch": 1.5308091380722673, + "grad_norm": 14.0625, + "learning_rate": 2.9509713965763015e-06, + "loss": 1.4054762125015259, + "step": 8410 + }, + { + "epoch": 1.5311732046964595, + "grad_norm": 17.5, + "learning_rate": 2.9502054817580407e-06, + "loss": 1.4419529438018799, + "step": 8412 + }, + { + "epoch": 1.5315372713206516, + "grad_norm": 7.5, + "learning_rate": 2.949439574246924e-06, + "loss": 1.496908187866211, + "step": 8414 + }, + { + "epoch": 1.531901337944844, + "grad_norm": 17.25, + "learning_rate": 2.9486736741553457e-06, + "loss": 1.5230573415756226, + "step": 8416 + }, + { + "epoch": 1.5322654045690363, + "grad_norm": 10.9375, + "learning_rate": 2.947907781595698e-06, + "loss": 1.5322203636169434, + "step": 8418 + }, + { + "epoch": 1.5326294711932285, + "grad_norm": 3.625, + "learning_rate": 2.947141896680372e-06, + "loss": 0.8272435665130615, + "step": 8420 + }, + { + "epoch": 1.5329935378174206, + "grad_norm": 40.5, + "learning_rate": 2.9463760195217573e-06, + "loss": 0.3384915590286255, + "step": 8422 + }, + { + "epoch": 1.5333576044416128, + "grad_norm": 17.75, + "learning_rate": 2.945610150232245e-06, + "loss": 0.5723781585693359, + "step": 8424 + }, + { + "epoch": 1.533721671065805, + "grad_norm": 31.25, + "learning_rate": 2.9448442889242234e-06, + "loss": 0.6992935538291931, + "step": 8426 + }, + { + "epoch": 1.5340857376899972, + "grad_norm": 6.84375, + "learning_rate": 2.9440784357100784e-06, + "loss": 1.1146118640899658, + "step": 8428 + }, + { + "epoch": 1.5344498043141894, + "grad_norm": 14.875, + "learning_rate": 2.9433125907021966e-06, + "loss": 1.372719407081604, + "step": 8430 + }, + { + "epoch": 1.5348138709383816, + "grad_norm": 15.125, + "learning_rate": 2.9425467540129615e-06, + "loss": 1.4509079456329346, + "step": 8432 + }, + { + "epoch": 1.5351779375625738, + "grad_norm": 10.125, + "learning_rate": 2.9417809257547577e-06, + "loss": 1.4932427406311035, + "step": 8434 + }, + { + "epoch": 1.5355420041867662, + "grad_norm": 14.3125, + "learning_rate": 2.941015106039968e-06, + "loss": 1.5959903001785278, + "step": 8436 + }, + { + "epoch": 1.5359060708109584, + "grad_norm": 11.375, + "learning_rate": 2.9402492949809714e-06, + "loss": 1.4296441078186035, + "step": 8438 + }, + { + "epoch": 1.5362701374351506, + "grad_norm": 9.75, + "learning_rate": 2.9394834926901496e-06, + "loss": 1.6396554708480835, + "step": 8440 + }, + { + "epoch": 1.5366342040593428, + "grad_norm": 17.375, + "learning_rate": 2.9387176992798794e-06, + "loss": 1.2531663179397583, + "step": 8442 + }, + { + "epoch": 1.5369982706835352, + "grad_norm": 46.5, + "learning_rate": 2.937951914862538e-06, + "loss": 1.5916471481323242, + "step": 8444 + }, + { + "epoch": 1.5373623373077274, + "grad_norm": 11.75, + "learning_rate": 2.9371861395505018e-06, + "loss": 1.412388563156128, + "step": 8446 + }, + { + "epoch": 1.5377264039319196, + "grad_norm": 34.0, + "learning_rate": 2.936420373456146e-06, + "loss": 1.6131799221038818, + "step": 8448 + }, + { + "epoch": 1.5380904705561118, + "grad_norm": 33.5, + "learning_rate": 2.935654616691842e-06, + "loss": 1.6662569046020508, + "step": 8450 + }, + { + "epoch": 1.538454537180304, + "grad_norm": 7.5625, + "learning_rate": 2.934888869369962e-06, + "loss": 1.488200068473816, + "step": 8452 + }, + { + "epoch": 1.5388186038044962, + "grad_norm": 8.75, + "learning_rate": 2.9341231316028767e-06, + "loss": 1.3250789642333984, + "step": 8454 + }, + { + "epoch": 1.5391826704286884, + "grad_norm": 9.375, + "learning_rate": 2.9333574035029554e-06, + "loss": 1.4266244173049927, + "step": 8456 + }, + { + "epoch": 1.5395467370528806, + "grad_norm": 8.375, + "learning_rate": 2.932591685182565e-06, + "loss": 1.4169809818267822, + "step": 8458 + }, + { + "epoch": 1.5399108036770728, + "grad_norm": 5.53125, + "learning_rate": 2.9318259767540717e-06, + "loss": 1.1201229095458984, + "step": 8460 + }, + { + "epoch": 1.540274870301265, + "grad_norm": 14.0625, + "learning_rate": 2.931060278329839e-06, + "loss": 1.3146371841430664, + "step": 8462 + }, + { + "epoch": 1.5406389369254574, + "grad_norm": 17.25, + "learning_rate": 2.9302945900222314e-06, + "loss": 1.7455590963363647, + "step": 8464 + }, + { + "epoch": 1.5410030035496496, + "grad_norm": 60.0, + "learning_rate": 2.929528911943611e-06, + "loss": 1.6149749755859375, + "step": 8466 + }, + { + "epoch": 1.5413670701738418, + "grad_norm": 25.125, + "learning_rate": 2.9287632442063375e-06, + "loss": 1.7968575954437256, + "step": 8468 + }, + { + "epoch": 1.5417311367980342, + "grad_norm": 5.875, + "learning_rate": 2.9279975869227684e-06, + "loss": 1.500647783279419, + "step": 8470 + }, + { + "epoch": 1.5420952034222264, + "grad_norm": 13.4375, + "learning_rate": 2.9272319402052614e-06, + "loss": 1.484393835067749, + "step": 8472 + }, + { + "epoch": 1.5424592700464186, + "grad_norm": 10.875, + "learning_rate": 2.926466304166172e-06, + "loss": 1.5424009561538696, + "step": 8474 + }, + { + "epoch": 1.5428233366706108, + "grad_norm": 44.5, + "learning_rate": 2.9257006789178554e-06, + "loss": 1.6324516534805298, + "step": 8476 + }, + { + "epoch": 1.543187403294803, + "grad_norm": 29.375, + "learning_rate": 2.9249350645726627e-06, + "loss": 2.130366563796997, + "step": 8478 + }, + { + "epoch": 1.5435514699189952, + "grad_norm": 92.0, + "learning_rate": 2.9241694612429455e-06, + "loss": 1.7553602457046509, + "step": 8480 + }, + { + "epoch": 1.5439155365431874, + "grad_norm": 21.375, + "learning_rate": 2.9234038690410517e-06, + "loss": 1.1325058937072754, + "step": 8482 + }, + { + "epoch": 1.5442796031673796, + "grad_norm": 13.0625, + "learning_rate": 2.92263828807933e-06, + "loss": 1.5293290615081787, + "step": 8484 + }, + { + "epoch": 1.5446436697915717, + "grad_norm": 12.4375, + "learning_rate": 2.921872718470126e-06, + "loss": 1.4125573635101318, + "step": 8486 + }, + { + "epoch": 1.545007736415764, + "grad_norm": 11.5625, + "learning_rate": 2.9211071603257845e-06, + "loss": 1.1901652812957764, + "step": 8488 + }, + { + "epoch": 1.5453718030399564, + "grad_norm": 8.6875, + "learning_rate": 2.920341613758647e-06, + "loss": 0.376153826713562, + "step": 8490 + }, + { + "epoch": 1.5457358696641486, + "grad_norm": 8.625, + "learning_rate": 2.9195760788810547e-06, + "loss": 1.1818699836730957, + "step": 8492 + }, + { + "epoch": 1.5460999362883407, + "grad_norm": 9.9375, + "learning_rate": 2.9188105558053464e-06, + "loss": 1.3671725988388062, + "step": 8494 + }, + { + "epoch": 1.546464002912533, + "grad_norm": 10.9375, + "learning_rate": 2.9180450446438613e-06, + "loss": 1.3944634199142456, + "step": 8496 + }, + { + "epoch": 1.5468280695367254, + "grad_norm": 7.5, + "learning_rate": 2.917279545508933e-06, + "loss": 1.6090569496154785, + "step": 8498 + }, + { + "epoch": 1.5471921361609176, + "grad_norm": 12.0, + "learning_rate": 2.916514058512897e-06, + "loss": 1.305992603302002, + "step": 8500 + }, + { + "epoch": 1.5475562027851097, + "grad_norm": 11.0, + "learning_rate": 2.915748583768083e-06, + "loss": 0.889894962310791, + "step": 8502 + }, + { + "epoch": 1.547920269409302, + "grad_norm": 6.1875, + "learning_rate": 2.9149831213868235e-06, + "loss": 1.3305896520614624, + "step": 8504 + }, + { + "epoch": 1.5482843360334941, + "grad_norm": 11.75, + "learning_rate": 2.9142176714814474e-06, + "loss": 1.4135520458221436, + "step": 8506 + }, + { + "epoch": 1.5486484026576863, + "grad_norm": 6.15625, + "learning_rate": 2.913452234164279e-06, + "loss": 1.272111415863037, + "step": 8508 + }, + { + "epoch": 1.5490124692818785, + "grad_norm": 10.875, + "learning_rate": 2.9126868095476457e-06, + "loss": 1.367983341217041, + "step": 8510 + }, + { + "epoch": 1.5493765359060707, + "grad_norm": 12.1875, + "learning_rate": 2.9119213977438686e-06, + "loss": 1.951696753501892, + "step": 8512 + }, + { + "epoch": 1.549740602530263, + "grad_norm": 39.75, + "learning_rate": 2.911155998865269e-06, + "loss": 1.373100996017456, + "step": 8514 + }, + { + "epoch": 1.550104669154455, + "grad_norm": 12.125, + "learning_rate": 2.910390613024168e-06, + "loss": 1.467010498046875, + "step": 8516 + }, + { + "epoch": 1.5504687357786475, + "grad_norm": 7.65625, + "learning_rate": 2.9096252403328794e-06, + "loss": 1.3421916961669922, + "step": 8518 + }, + { + "epoch": 1.5508328024028397, + "grad_norm": 10.25, + "learning_rate": 2.9088598809037227e-06, + "loss": 1.6727701425552368, + "step": 8520 + }, + { + "epoch": 1.551196869027032, + "grad_norm": 6.96875, + "learning_rate": 2.9080945348490074e-06, + "loss": 1.373345136642456, + "step": 8522 + }, + { + "epoch": 1.5515609356512243, + "grad_norm": 3.78125, + "learning_rate": 2.9073292022810463e-06, + "loss": 1.0632575750350952, + "step": 8524 + }, + { + "epoch": 1.5519250022754165, + "grad_norm": 19.625, + "learning_rate": 2.9065638833121505e-06, + "loss": 1.0813533067703247, + "step": 8526 + }, + { + "epoch": 1.5522890688996087, + "grad_norm": 22.625, + "learning_rate": 2.905798578054626e-06, + "loss": 1.6733286380767822, + "step": 8528 + }, + { + "epoch": 1.552653135523801, + "grad_norm": 43.0, + "learning_rate": 2.9050332866207786e-06, + "loss": 1.6042594909667969, + "step": 8530 + }, + { + "epoch": 1.553017202147993, + "grad_norm": 11.375, + "learning_rate": 2.90426800912291e-06, + "loss": 1.4242624044418335, + "step": 8532 + }, + { + "epoch": 1.5533812687721853, + "grad_norm": 6.75, + "learning_rate": 2.9035027456733246e-06, + "loss": 1.436721682548523, + "step": 8534 + }, + { + "epoch": 1.5537453353963775, + "grad_norm": 10.8125, + "learning_rate": 2.9027374963843203e-06, + "loss": 1.2582474946975708, + "step": 8536 + }, + { + "epoch": 1.5541094020205697, + "grad_norm": 9.875, + "learning_rate": 2.9019722613681933e-06, + "loss": 0.9072771668434143, + "step": 8538 + }, + { + "epoch": 1.5544734686447619, + "grad_norm": 47.75, + "learning_rate": 2.9012070407372405e-06, + "loss": 1.0377178192138672, + "step": 8540 + }, + { + "epoch": 1.554837535268954, + "grad_norm": 13.9375, + "learning_rate": 2.9004418346037534e-06, + "loss": 1.526257872581482, + "step": 8542 + }, + { + "epoch": 1.5552016018931465, + "grad_norm": 12.0, + "learning_rate": 2.899676643080024e-06, + "loss": 1.6981232166290283, + "step": 8544 + }, + { + "epoch": 1.5555656685173387, + "grad_norm": 11.0625, + "learning_rate": 2.8989114662783403e-06, + "loss": 1.436377763748169, + "step": 8546 + }, + { + "epoch": 1.5559297351415309, + "grad_norm": 6.9375, + "learning_rate": 2.898146304310989e-06, + "loss": 1.0867383480072021, + "step": 8548 + }, + { + "epoch": 1.556293801765723, + "grad_norm": 8.625, + "learning_rate": 2.8973811572902567e-06, + "loss": 0.9838950037956238, + "step": 8550 + }, + { + "epoch": 1.5566578683899155, + "grad_norm": 169.0, + "learning_rate": 2.8966160253284213e-06, + "loss": 0.7961543798446655, + "step": 8552 + }, + { + "epoch": 1.5570219350141077, + "grad_norm": 8.375, + "learning_rate": 2.8958509085377663e-06, + "loss": 0.9839916229248047, + "step": 8554 + }, + { + "epoch": 1.5573860016382999, + "grad_norm": 24.125, + "learning_rate": 2.895085807030568e-06, + "loss": 1.0826778411865234, + "step": 8556 + }, + { + "epoch": 1.557750068262492, + "grad_norm": 32.75, + "learning_rate": 2.8943207209191025e-06, + "loss": 1.559889554977417, + "step": 8558 + }, + { + "epoch": 1.5581141348866843, + "grad_norm": 24.625, + "learning_rate": 2.893555650315644e-06, + "loss": 1.548248052597046, + "step": 8560 + }, + { + "epoch": 1.5584782015108765, + "grad_norm": 11.625, + "learning_rate": 2.892790595332461e-06, + "loss": 1.5722010135650635, + "step": 8562 + }, + { + "epoch": 1.5588422681350687, + "grad_norm": 11.1875, + "learning_rate": 2.8920255560818233e-06, + "loss": 1.3703052997589111, + "step": 8564 + }, + { + "epoch": 1.5592063347592608, + "grad_norm": 24.75, + "learning_rate": 2.891260532675999e-06, + "loss": 1.5773344039916992, + "step": 8566 + }, + { + "epoch": 1.559570401383453, + "grad_norm": 8.625, + "learning_rate": 2.8904955252272503e-06, + "loss": 1.4237834215164185, + "step": 8568 + }, + { + "epoch": 1.5599344680076452, + "grad_norm": 21.875, + "learning_rate": 2.88973053384784e-06, + "loss": 2.098597526550293, + "step": 8570 + }, + { + "epoch": 1.5602985346318377, + "grad_norm": 31.125, + "learning_rate": 2.8889655586500258e-06, + "loss": 1.4976621866226196, + "step": 8572 + }, + { + "epoch": 1.5606626012560298, + "grad_norm": 8.0625, + "learning_rate": 2.888200599746066e-06, + "loss": 1.449190616607666, + "step": 8574 + }, + { + "epoch": 1.561026667880222, + "grad_norm": 9.8125, + "learning_rate": 2.8874356572482164e-06, + "loss": 1.4395779371261597, + "step": 8576 + }, + { + "epoch": 1.5613907345044145, + "grad_norm": 6.375, + "learning_rate": 2.886670731268727e-06, + "loss": 1.403394341468811, + "step": 8578 + }, + { + "epoch": 1.5617548011286067, + "grad_norm": 24.25, + "learning_rate": 2.885905821919849e-06, + "loss": 0.8339476585388184, + "step": 8580 + }, + { + "epoch": 1.5621188677527988, + "grad_norm": 12.4375, + "learning_rate": 2.885140929313829e-06, + "loss": 0.4694342017173767, + "step": 8582 + }, + { + "epoch": 1.562482934376991, + "grad_norm": 13.75, + "learning_rate": 2.884376053562911e-06, + "loss": 1.5944947004318237, + "step": 8584 + }, + { + "epoch": 1.5628470010011832, + "grad_norm": 52.0, + "learning_rate": 2.8836111947793397e-06, + "loss": 1.2944097518920898, + "step": 8586 + }, + { + "epoch": 1.5632110676253754, + "grad_norm": 3.453125, + "learning_rate": 2.8828463530753547e-06, + "loss": 1.4292616844177246, + "step": 8588 + }, + { + "epoch": 1.5635751342495676, + "grad_norm": 10.5625, + "learning_rate": 2.882081528563192e-06, + "loss": 1.0554499626159668, + "step": 8590 + }, + { + "epoch": 1.5639392008737598, + "grad_norm": 12.8125, + "learning_rate": 2.881316721355086e-06, + "loss": 1.5793795585632324, + "step": 8592 + }, + { + "epoch": 1.564303267497952, + "grad_norm": 5.125, + "learning_rate": 2.8805519315632708e-06, + "loss": 1.4209654331207275, + "step": 8594 + }, + { + "epoch": 1.5646673341221442, + "grad_norm": 5.96875, + "learning_rate": 2.879787159299976e-06, + "loss": 1.1931174993515015, + "step": 8596 + }, + { + "epoch": 1.5650314007463366, + "grad_norm": 6.875, + "learning_rate": 2.8790224046774284e-06, + "loss": 1.3480455875396729, + "step": 8598 + }, + { + "epoch": 1.5653954673705288, + "grad_norm": 11.875, + "learning_rate": 2.878257667807852e-06, + "loss": 1.5183436870574951, + "step": 8600 + }, + { + "epoch": 1.565759533994721, + "grad_norm": 8.5625, + "learning_rate": 2.87749294880347e-06, + "loss": 1.302595615386963, + "step": 8602 + }, + { + "epoch": 1.5661236006189134, + "grad_norm": 36.25, + "learning_rate": 2.8767282477765e-06, + "loss": 1.6306260824203491, + "step": 8604 + }, + { + "epoch": 1.5664876672431056, + "grad_norm": 27.875, + "learning_rate": 2.8759635648391628e-06, + "loss": 1.7543013095855713, + "step": 8606 + }, + { + "epoch": 1.5668517338672978, + "grad_norm": 5.71875, + "learning_rate": 2.8751989001036674e-06, + "loss": 1.1017216444015503, + "step": 8608 + }, + { + "epoch": 1.56721580049149, + "grad_norm": 14.9375, + "learning_rate": 2.874434253682229e-06, + "loss": 1.430077314376831, + "step": 8610 + }, + { + "epoch": 1.5675798671156822, + "grad_norm": 11.0, + "learning_rate": 2.8736696256870538e-06, + "loss": 1.4390859603881836, + "step": 8612 + }, + { + "epoch": 1.5679439337398744, + "grad_norm": 4.5, + "learning_rate": 2.8729050162303494e-06, + "loss": 1.0132466554641724, + "step": 8614 + }, + { + "epoch": 1.5683080003640666, + "grad_norm": 10.875, + "learning_rate": 2.872140425424319e-06, + "loss": 1.2170507907867432, + "step": 8616 + }, + { + "epoch": 1.5686720669882588, + "grad_norm": 8.125, + "learning_rate": 2.8713758533811633e-06, + "loss": 1.2501498460769653, + "step": 8618 + }, + { + "epoch": 1.569036133612451, + "grad_norm": 4.65625, + "learning_rate": 2.8706113002130796e-06, + "loss": 1.261684536933899, + "step": 8620 + }, + { + "epoch": 1.5694002002366432, + "grad_norm": 7.5625, + "learning_rate": 2.8698467660322635e-06, + "loss": 1.0582857131958008, + "step": 8622 + }, + { + "epoch": 1.5697642668608354, + "grad_norm": 11.5, + "learning_rate": 2.8690822509509063e-06, + "loss": 1.607926845550537, + "step": 8624 + }, + { + "epoch": 1.5701283334850278, + "grad_norm": 11.1875, + "learning_rate": 2.868317755081199e-06, + "loss": 0.9916055798530579, + "step": 8626 + }, + { + "epoch": 1.57049240010922, + "grad_norm": 9.5, + "learning_rate": 2.8675532785353274e-06, + "loss": 1.2218564748764038, + "step": 8628 + }, + { + "epoch": 1.5708564667334122, + "grad_norm": 6.5625, + "learning_rate": 2.866788821425475e-06, + "loss": 1.1856184005737305, + "step": 8630 + }, + { + "epoch": 1.5712205333576046, + "grad_norm": 20.125, + "learning_rate": 2.8660243838638235e-06, + "loss": 1.9854960441589355, + "step": 8632 + }, + { + "epoch": 1.5715845999817968, + "grad_norm": 9.3125, + "learning_rate": 2.8652599659625503e-06, + "loss": 1.9577082395553589, + "step": 8634 + }, + { + "epoch": 1.571948666605989, + "grad_norm": 5.0625, + "learning_rate": 2.8644955678338315e-06, + "loss": 1.0977166891098022, + "step": 8636 + }, + { + "epoch": 1.5723127332301812, + "grad_norm": 10.625, + "learning_rate": 2.8637311895898383e-06, + "loss": 1.1508939266204834, + "step": 8638 + }, + { + "epoch": 1.5726767998543734, + "grad_norm": 10.3125, + "learning_rate": 2.8629668313427417e-06, + "loss": 1.2506386041641235, + "step": 8640 + }, + { + "epoch": 1.5730408664785656, + "grad_norm": 12.5, + "learning_rate": 2.8622024932047066e-06, + "loss": 1.6508686542510986, + "step": 8642 + }, + { + "epoch": 1.5734049331027578, + "grad_norm": 10.4375, + "learning_rate": 2.861438175287897e-06, + "loss": 1.4742003679275513, + "step": 8644 + }, + { + "epoch": 1.57376899972695, + "grad_norm": 2.765625, + "learning_rate": 2.860673877704474e-06, + "loss": 1.1899914741516113, + "step": 8646 + }, + { + "epoch": 1.5741330663511421, + "grad_norm": 13.125, + "learning_rate": 2.8599096005665948e-06, + "loss": 0.8315771818161011, + "step": 8648 + }, + { + "epoch": 1.5744971329753343, + "grad_norm": 16.625, + "learning_rate": 2.859145343986415e-06, + "loss": 1.0613441467285156, + "step": 8650 + }, + { + "epoch": 1.5748611995995268, + "grad_norm": 8.375, + "learning_rate": 2.8583811080760836e-06, + "loss": 1.2905110120773315, + "step": 8652 + }, + { + "epoch": 1.575225266223719, + "grad_norm": 13.9375, + "learning_rate": 2.857616892947751e-06, + "loss": 1.5749127864837646, + "step": 8654 + }, + { + "epoch": 1.5755893328479111, + "grad_norm": 27.25, + "learning_rate": 2.8568526987135624e-06, + "loss": 1.4892123937606812, + "step": 8656 + }, + { + "epoch": 1.5759533994721036, + "grad_norm": 25.375, + "learning_rate": 2.8560885254856606e-06, + "loss": 0.6337347030639648, + "step": 8658 + }, + { + "epoch": 1.5763174660962957, + "grad_norm": 9.5625, + "learning_rate": 2.855324373376185e-06, + "loss": 1.5698970556259155, + "step": 8660 + }, + { + "epoch": 1.576681532720488, + "grad_norm": 15.125, + "learning_rate": 2.8545602424972695e-06, + "loss": 1.4183642864227295, + "step": 8662 + }, + { + "epoch": 1.5770455993446801, + "grad_norm": 7.5, + "learning_rate": 2.8537961329610496e-06, + "loss": 1.2746975421905518, + "step": 8664 + }, + { + "epoch": 1.5774096659688723, + "grad_norm": 5.21875, + "learning_rate": 2.8530320448796554e-06, + "loss": 1.3939204216003418, + "step": 8666 + }, + { + "epoch": 1.5777737325930645, + "grad_norm": 12.5, + "learning_rate": 2.8522679783652125e-06, + "loss": 1.3499265909194946, + "step": 8668 + }, + { + "epoch": 1.5781377992172567, + "grad_norm": 14.375, + "learning_rate": 2.8515039335298457e-06, + "loss": 1.814496636390686, + "step": 8670 + }, + { + "epoch": 1.578501865841449, + "grad_norm": 13.375, + "learning_rate": 2.8507399104856737e-06, + "loss": 1.1691458225250244, + "step": 8672 + }, + { + "epoch": 1.578865932465641, + "grad_norm": 12.9375, + "learning_rate": 2.8499759093448152e-06, + "loss": 1.5588127374649048, + "step": 8674 + }, + { + "epoch": 1.5792299990898333, + "grad_norm": 6.125, + "learning_rate": 2.8492119302193845e-06, + "loss": 1.2434666156768799, + "step": 8676 + }, + { + "epoch": 1.5795940657140257, + "grad_norm": 4.46875, + "learning_rate": 2.848447973221492e-06, + "loss": 1.2844034433364868, + "step": 8678 + }, + { + "epoch": 1.579958132338218, + "grad_norm": 3.96875, + "learning_rate": 2.8476840384632455e-06, + "loss": 1.0985606908798218, + "step": 8680 + }, + { + "epoch": 1.58032219896241, + "grad_norm": 14.0625, + "learning_rate": 2.846920126056748e-06, + "loss": 1.4552021026611328, + "step": 8682 + }, + { + "epoch": 1.5806862655866023, + "grad_norm": 4.53125, + "learning_rate": 2.8461562361141017e-06, + "loss": 1.0634877681732178, + "step": 8684 + }, + { + "epoch": 1.5810503322107947, + "grad_norm": 19.125, + "learning_rate": 2.8453923687474055e-06, + "loss": 1.027315616607666, + "step": 8686 + }, + { + "epoch": 1.581414398834987, + "grad_norm": 15.625, + "learning_rate": 2.844628524068752e-06, + "loss": 0.6117433905601501, + "step": 8688 + }, + { + "epoch": 1.581778465459179, + "grad_norm": 44.5, + "learning_rate": 2.843864702190233e-06, + "loss": 1.6204490661621094, + "step": 8690 + }, + { + "epoch": 1.5821425320833713, + "grad_norm": 11.8125, + "learning_rate": 2.843100903223936e-06, + "loss": 1.3569025993347168, + "step": 8692 + }, + { + "epoch": 1.5825065987075635, + "grad_norm": 40.75, + "learning_rate": 2.842337127281945e-06, + "loss": 1.1576783657073975, + "step": 8694 + }, + { + "epoch": 1.5828706653317557, + "grad_norm": 18.75, + "learning_rate": 2.8415733744763428e-06, + "loss": 1.6060115098953247, + "step": 8696 + }, + { + "epoch": 1.5832347319559479, + "grad_norm": 7.71875, + "learning_rate": 2.840809644919206e-06, + "loss": 1.812389850616455, + "step": 8698 + }, + { + "epoch": 1.58359879858014, + "grad_norm": 30.25, + "learning_rate": 2.8400459387226075e-06, + "loss": 1.0458847284317017, + "step": 8700 + }, + { + "epoch": 1.5839628652043323, + "grad_norm": 10.25, + "learning_rate": 2.83928225599862e-06, + "loss": 1.7215838432312012, + "step": 8702 + }, + { + "epoch": 1.5843269318285245, + "grad_norm": 15.75, + "learning_rate": 2.8385185968593087e-06, + "loss": 1.4653116464614868, + "step": 8704 + }, + { + "epoch": 1.5846909984527169, + "grad_norm": 16.375, + "learning_rate": 2.837754961416741e-06, + "loss": 1.867810845375061, + "step": 8706 + }, + { + "epoch": 1.585055065076909, + "grad_norm": 7.34375, + "learning_rate": 2.836991349782974e-06, + "loss": 1.445216178894043, + "step": 8708 + }, + { + "epoch": 1.5854191317011013, + "grad_norm": 10.6875, + "learning_rate": 2.8362277620700663e-06, + "loss": 1.335452914237976, + "step": 8710 + }, + { + "epoch": 1.5857831983252937, + "grad_norm": 15.4375, + "learning_rate": 2.83546419839007e-06, + "loss": 1.5468621253967285, + "step": 8712 + }, + { + "epoch": 1.5861472649494859, + "grad_norm": 14.375, + "learning_rate": 2.8347006588550363e-06, + "loss": 1.3194500207901, + "step": 8714 + }, + { + "epoch": 1.586511331573678, + "grad_norm": 10.625, + "learning_rate": 2.8339371435770114e-06, + "loss": 1.390432357788086, + "step": 8716 + }, + { + "epoch": 1.5868753981978703, + "grad_norm": 17.25, + "learning_rate": 2.8331736526680375e-06, + "loss": 1.5081697702407837, + "step": 8718 + }, + { + "epoch": 1.5872394648220625, + "grad_norm": 18.125, + "learning_rate": 2.832410186240153e-06, + "loss": 1.4467625617980957, + "step": 8720 + }, + { + "epoch": 1.5876035314462547, + "grad_norm": 18.25, + "learning_rate": 2.8316467444053958e-06, + "loss": 1.5970709323883057, + "step": 8722 + }, + { + "epoch": 1.5879675980704469, + "grad_norm": 18.625, + "learning_rate": 2.8308833272757953e-06, + "loss": 1.835404634475708, + "step": 8724 + }, + { + "epoch": 1.588331664694639, + "grad_norm": 12.9375, + "learning_rate": 2.830119934963382e-06, + "loss": 1.5871875286102295, + "step": 8726 + }, + { + "epoch": 1.5886957313188312, + "grad_norm": 7.3125, + "learning_rate": 2.8293565675801804e-06, + "loss": 1.3618942499160767, + "step": 8728 + }, + { + "epoch": 1.5890597979430234, + "grad_norm": 23.75, + "learning_rate": 2.8285932252382096e-06, + "loss": 1.1385767459869385, + "step": 8730 + }, + { + "epoch": 1.5894238645672158, + "grad_norm": 11.125, + "learning_rate": 2.82782990804949e-06, + "loss": 1.374788522720337, + "step": 8732 + }, + { + "epoch": 1.589787931191408, + "grad_norm": 11.5, + "learning_rate": 2.8270666161260318e-06, + "loss": 1.2342286109924316, + "step": 8734 + }, + { + "epoch": 1.5901519978156002, + "grad_norm": 33.5, + "learning_rate": 2.826303349579848e-06, + "loss": 1.2460260391235352, + "step": 8736 + }, + { + "epoch": 1.5905160644397924, + "grad_norm": 21.875, + "learning_rate": 2.8255401085229434e-06, + "loss": 0.7765758633613586, + "step": 8738 + }, + { + "epoch": 1.5908801310639848, + "grad_norm": 8.125, + "learning_rate": 2.824776893067321e-06, + "loss": 1.0403656959533691, + "step": 8740 + }, + { + "epoch": 1.591244197688177, + "grad_norm": 9.6875, + "learning_rate": 2.8240137033249793e-06, + "loss": 1.5699713230133057, + "step": 8742 + }, + { + "epoch": 1.5916082643123692, + "grad_norm": 11.0, + "learning_rate": 2.823250539407913e-06, + "loss": 1.5276795625686646, + "step": 8744 + }, + { + "epoch": 1.5919723309365614, + "grad_norm": 49.0, + "learning_rate": 2.8224874014281146e-06, + "loss": 1.6743950843811035, + "step": 8746 + }, + { + "epoch": 1.5923363975607536, + "grad_norm": 18.875, + "learning_rate": 2.82172428949757e-06, + "loss": 1.6582130193710327, + "step": 8748 + }, + { + "epoch": 1.5927004641849458, + "grad_norm": 15.75, + "learning_rate": 2.8209612037282634e-06, + "loss": 1.7769865989685059, + "step": 8750 + }, + { + "epoch": 1.593064530809138, + "grad_norm": 8.8125, + "learning_rate": 2.8201981442321744e-06, + "loss": 1.7454893589019775, + "step": 8752 + }, + { + "epoch": 1.5934285974333302, + "grad_norm": 14.9375, + "learning_rate": 2.819435111121279e-06, + "loss": 1.467721939086914, + "step": 8754 + }, + { + "epoch": 1.5937926640575224, + "grad_norm": 14.5, + "learning_rate": 2.8186721045075504e-06, + "loss": 1.4269483089447021, + "step": 8756 + }, + { + "epoch": 1.5941567306817146, + "grad_norm": 16.625, + "learning_rate": 2.8179091245029557e-06, + "loss": 1.0955188274383545, + "step": 8758 + }, + { + "epoch": 1.594520797305907, + "grad_norm": 15.3125, + "learning_rate": 2.817146171219458e-06, + "loss": 0.6740895509719849, + "step": 8760 + }, + { + "epoch": 1.5948848639300992, + "grad_norm": 8.9375, + "learning_rate": 2.8163832447690187e-06, + "loss": 1.601981520652771, + "step": 8762 + }, + { + "epoch": 1.5952489305542914, + "grad_norm": 6.5625, + "learning_rate": 2.8156203452635943e-06, + "loss": 1.1408567428588867, + "step": 8764 + }, + { + "epoch": 1.5956129971784838, + "grad_norm": 11.5, + "learning_rate": 2.814857472815137e-06, + "loss": 1.4998714923858643, + "step": 8766 + }, + { + "epoch": 1.595977063802676, + "grad_norm": 12.5, + "learning_rate": 2.814094627535595e-06, + "loss": 1.8614895343780518, + "step": 8768 + }, + { + "epoch": 1.5963411304268682, + "grad_norm": 10.5625, + "learning_rate": 2.8133318095369133e-06, + "loss": 0.9073572158813477, + "step": 8770 + }, + { + "epoch": 1.5967051970510604, + "grad_norm": 7.0625, + "learning_rate": 2.8125690189310324e-06, + "loss": 1.1940504312515259, + "step": 8772 + }, + { + "epoch": 1.5970692636752526, + "grad_norm": 15.625, + "learning_rate": 2.8118062558298876e-06, + "loss": 1.5961065292358398, + "step": 8774 + }, + { + "epoch": 1.5974333302994448, + "grad_norm": 15.4375, + "learning_rate": 2.811043520345412e-06, + "loss": 1.5292340517044067, + "step": 8776 + }, + { + "epoch": 1.597797396923637, + "grad_norm": 79.0, + "learning_rate": 2.8102808125895352e-06, + "loss": 0.8951650857925415, + "step": 8778 + }, + { + "epoch": 1.5981614635478292, + "grad_norm": 26.0, + "learning_rate": 2.8095181326741794e-06, + "loss": 1.8279677629470825, + "step": 8780 + }, + { + "epoch": 1.5985255301720214, + "grad_norm": 9.3125, + "learning_rate": 2.808755480711266e-06, + "loss": 1.4210129976272583, + "step": 8782 + }, + { + "epoch": 1.5988895967962136, + "grad_norm": 105.5, + "learning_rate": 2.8079928568127103e-06, + "loss": 1.294496774673462, + "step": 8784 + }, + { + "epoch": 1.599253663420406, + "grad_norm": 36.5, + "learning_rate": 2.807230261090426e-06, + "loss": 1.3026769161224365, + "step": 8786 + }, + { + "epoch": 1.5996177300445982, + "grad_norm": 9.375, + "learning_rate": 2.8064676936563177e-06, + "loss": 1.5896800756454468, + "step": 8788 + }, + { + "epoch": 1.5999817966687904, + "grad_norm": 7.5625, + "learning_rate": 2.8057051546222926e-06, + "loss": 1.3268531560897827, + "step": 8790 + }, + { + "epoch": 1.6003458632929826, + "grad_norm": 21.75, + "learning_rate": 2.804942644100248e-06, + "loss": 1.7382636070251465, + "step": 8792 + }, + { + "epoch": 1.600709929917175, + "grad_norm": 9.25, + "learning_rate": 2.804180162202079e-06, + "loss": 1.4696338176727295, + "step": 8794 + }, + { + "epoch": 1.6010739965413672, + "grad_norm": 7.25, + "learning_rate": 2.803417709039679e-06, + "loss": 1.1101782321929932, + "step": 8796 + }, + { + "epoch": 1.6014380631655594, + "grad_norm": 9.3125, + "learning_rate": 2.8026552847249334e-06, + "loss": 1.4599006175994873, + "step": 8798 + }, + { + "epoch": 1.6018021297897516, + "grad_norm": 5.59375, + "learning_rate": 2.8018928893697245e-06, + "loss": 1.1237291097640991, + "step": 8800 + }, + { + "epoch": 1.6021661964139438, + "grad_norm": 12.375, + "learning_rate": 2.8011305230859315e-06, + "loss": 1.5153921842575073, + "step": 8802 + }, + { + "epoch": 1.602530263038136, + "grad_norm": 16.375, + "learning_rate": 2.8003681859854276e-06, + "loss": 1.4775722026824951, + "step": 8804 + }, + { + "epoch": 1.6028943296623281, + "grad_norm": 15.4375, + "learning_rate": 2.7996058781800845e-06, + "loss": 1.6583020687103271, + "step": 8806 + }, + { + "epoch": 1.6032583962865203, + "grad_norm": 9.0625, + "learning_rate": 2.7988435997817666e-06, + "loss": 1.7370648384094238, + "step": 8808 + }, + { + "epoch": 1.6036224629107125, + "grad_norm": 5.21875, + "learning_rate": 2.7980813509023343e-06, + "loss": 0.975601851940155, + "step": 8810 + }, + { + "epoch": 1.6039865295349047, + "grad_norm": 11.125, + "learning_rate": 2.797319131653646e-06, + "loss": 1.0002918243408203, + "step": 8812 + }, + { + "epoch": 1.6043505961590971, + "grad_norm": 9.4375, + "learning_rate": 2.796556942147553e-06, + "loss": 1.1703486442565918, + "step": 8814 + }, + { + "epoch": 1.6047146627832893, + "grad_norm": 9.5625, + "learning_rate": 2.7957947824959055e-06, + "loss": 1.5182147026062012, + "step": 8816 + }, + { + "epoch": 1.6050787294074815, + "grad_norm": 14.75, + "learning_rate": 2.7950326528105455e-06, + "loss": 1.521026611328125, + "step": 8818 + }, + { + "epoch": 1.605442796031674, + "grad_norm": 59.25, + "learning_rate": 2.7942705532033128e-06, + "loss": 1.3177852630615234, + "step": 8820 + }, + { + "epoch": 1.6058068626558661, + "grad_norm": 147.0, + "learning_rate": 2.7935084837860427e-06, + "loss": 1.8035461902618408, + "step": 8822 + }, + { + "epoch": 1.6061709292800583, + "grad_norm": 14.0, + "learning_rate": 2.7927464446705654e-06, + "loss": 1.5627117156982422, + "step": 8824 + }, + { + "epoch": 1.6065349959042505, + "grad_norm": 44.0, + "learning_rate": 2.791984435968709e-06, + "loss": 2.0490307807922363, + "step": 8826 + }, + { + "epoch": 1.6068990625284427, + "grad_norm": 9.625, + "learning_rate": 2.791222457792292e-06, + "loss": 1.780914545059204, + "step": 8828 + }, + { + "epoch": 1.607263129152635, + "grad_norm": 9.875, + "learning_rate": 2.790460510253134e-06, + "loss": 1.4170982837677002, + "step": 8830 + }, + { + "epoch": 1.6076271957768271, + "grad_norm": 21.75, + "learning_rate": 2.7896985934630467e-06, + "loss": 1.4862158298492432, + "step": 8832 + }, + { + "epoch": 1.6079912624010193, + "grad_norm": 40.0, + "learning_rate": 2.7889367075338382e-06, + "loss": 1.8221937417984009, + "step": 8834 + }, + { + "epoch": 1.6083553290252115, + "grad_norm": 21.25, + "learning_rate": 2.788174852577313e-06, + "loss": 1.5980892181396484, + "step": 8836 + }, + { + "epoch": 1.6087193956494037, + "grad_norm": 7.4375, + "learning_rate": 2.7874130287052697e-06, + "loss": 0.985620379447937, + "step": 8838 + }, + { + "epoch": 1.609083462273596, + "grad_norm": 6.84375, + "learning_rate": 2.7866512360295027e-06, + "loss": 0.9243313074111938, + "step": 8840 + }, + { + "epoch": 1.6094475288977883, + "grad_norm": 7.8125, + "learning_rate": 2.7858894746618025e-06, + "loss": 0.6877187490463257, + "step": 8842 + }, + { + "epoch": 1.6098115955219805, + "grad_norm": 10.25, + "learning_rate": 2.7851277447139536e-06, + "loss": 1.517800211906433, + "step": 8844 + }, + { + "epoch": 1.610175662146173, + "grad_norm": 10.375, + "learning_rate": 2.784366046297738e-06, + "loss": 1.6749703884124756, + "step": 8846 + }, + { + "epoch": 1.610539728770365, + "grad_norm": 23.25, + "learning_rate": 2.783604379524931e-06, + "loss": 1.4349150657653809, + "step": 8848 + }, + { + "epoch": 1.6109037953945573, + "grad_norm": 16.875, + "learning_rate": 2.782842744507304e-06, + "loss": 1.266967535018921, + "step": 8850 + }, + { + "epoch": 1.6112678620187495, + "grad_norm": 10.5625, + "learning_rate": 2.7820811413566247e-06, + "loss": 1.286463737487793, + "step": 8852 + }, + { + "epoch": 1.6116319286429417, + "grad_norm": 12.25, + "learning_rate": 2.781319570184654e-06, + "loss": 1.4116493463516235, + "step": 8854 + }, + { + "epoch": 1.6119959952671339, + "grad_norm": 11.25, + "learning_rate": 2.780558031103151e-06, + "loss": 1.4223873615264893, + "step": 8856 + }, + { + "epoch": 1.612360061891326, + "grad_norm": 6.625, + "learning_rate": 2.7797965242238667e-06, + "loss": 1.5108402967453003, + "step": 8858 + }, + { + "epoch": 1.6127241285155183, + "grad_norm": 9.75, + "learning_rate": 2.7790350496585516e-06, + "loss": 1.3979135751724243, + "step": 8860 + }, + { + "epoch": 1.6130881951397105, + "grad_norm": 6.28125, + "learning_rate": 2.7782736075189464e-06, + "loss": 1.3624908924102783, + "step": 8862 + }, + { + "epoch": 1.6134522617639027, + "grad_norm": 12.75, + "learning_rate": 2.77751219791679e-06, + "loss": 1.5869847536087036, + "step": 8864 + }, + { + "epoch": 1.6138163283880949, + "grad_norm": 10.4375, + "learning_rate": 2.776750820963818e-06, + "loss": 1.573920488357544, + "step": 8866 + }, + { + "epoch": 1.6141803950122873, + "grad_norm": 18.375, + "learning_rate": 2.7759894767717577e-06, + "loss": 1.4915132522583008, + "step": 8868 + }, + { + "epoch": 1.6145444616364795, + "grad_norm": 19.75, + "learning_rate": 2.7752281654523348e-06, + "loss": 1.509536623954773, + "step": 8870 + }, + { + "epoch": 1.6149085282606717, + "grad_norm": 13.9375, + "learning_rate": 2.7744668871172663e-06, + "loss": 0.956229567527771, + "step": 8872 + }, + { + "epoch": 1.615272594884864, + "grad_norm": 3.765625, + "learning_rate": 2.7737056418782692e-06, + "loss": 1.08854079246521, + "step": 8874 + }, + { + "epoch": 1.6156366615090563, + "grad_norm": 13.75, + "learning_rate": 2.772944429847052e-06, + "loss": 1.0453836917877197, + "step": 8876 + }, + { + "epoch": 1.6160007281332485, + "grad_norm": 13.5, + "learning_rate": 2.7721832511353184e-06, + "loss": 1.5481067895889282, + "step": 8878 + }, + { + "epoch": 1.6163647947574407, + "grad_norm": 25.375, + "learning_rate": 2.771422105854771e-06, + "loss": 1.7072434425354004, + "step": 8880 + }, + { + "epoch": 1.6167288613816329, + "grad_norm": 22.5, + "learning_rate": 2.7706609941171025e-06, + "loss": 1.2968029975891113, + "step": 8882 + }, + { + "epoch": 1.617092928005825, + "grad_norm": 18.0, + "learning_rate": 2.7698999160340034e-06, + "loss": 1.6372756958007812, + "step": 8884 + }, + { + "epoch": 1.6174569946300172, + "grad_norm": 14.4375, + "learning_rate": 2.7691388717171595e-06, + "loss": 1.9359009265899658, + "step": 8886 + }, + { + "epoch": 1.6178210612542094, + "grad_norm": 61.0, + "learning_rate": 2.7683778612782496e-06, + "loss": 1.2515411376953125, + "step": 8888 + }, + { + "epoch": 1.6181851278784016, + "grad_norm": 9.9375, + "learning_rate": 2.767616884828952e-06, + "loss": 0.6741016507148743, + "step": 8890 + }, + { + "epoch": 1.6185491945025938, + "grad_norm": 12.75, + "learning_rate": 2.7668559424809332e-06, + "loss": 1.3022091388702393, + "step": 8892 + }, + { + "epoch": 1.6189132611267862, + "grad_norm": 28.625, + "learning_rate": 2.7660950343458597e-06, + "loss": 1.689863920211792, + "step": 8894 + }, + { + "epoch": 1.6192773277509784, + "grad_norm": 5.71875, + "learning_rate": 2.7653341605353933e-06, + "loss": 0.8395198583602905, + "step": 8896 + }, + { + "epoch": 1.6196413943751706, + "grad_norm": 4.625, + "learning_rate": 2.764573321161187e-06, + "loss": 1.459787368774414, + "step": 8898 + }, + { + "epoch": 1.620005460999363, + "grad_norm": 3.453125, + "learning_rate": 2.7638125163348916e-06, + "loss": 1.001482367515564, + "step": 8900 + }, + { + "epoch": 1.6203695276235552, + "grad_norm": 8.1875, + "learning_rate": 2.763051746168153e-06, + "loss": 1.0472490787506104, + "step": 8902 + }, + { + "epoch": 1.6207335942477474, + "grad_norm": 7.21875, + "learning_rate": 2.7622910107726092e-06, + "loss": 1.1912736892700195, + "step": 8904 + }, + { + "epoch": 1.6210976608719396, + "grad_norm": 5.4375, + "learning_rate": 2.7615303102598974e-06, + "loss": 1.2734090089797974, + "step": 8906 + }, + { + "epoch": 1.6214617274961318, + "grad_norm": 17.875, + "learning_rate": 2.7607696447416454e-06, + "loss": 1.19173002243042, + "step": 8908 + }, + { + "epoch": 1.621825794120324, + "grad_norm": 14.0, + "learning_rate": 2.760009014329479e-06, + "loss": 1.4895093441009521, + "step": 8910 + }, + { + "epoch": 1.6221898607445162, + "grad_norm": 2.828125, + "learning_rate": 2.759248419135017e-06, + "loss": 1.1691850423812866, + "step": 8912 + }, + { + "epoch": 1.6225539273687084, + "grad_norm": 4.09375, + "learning_rate": 2.7584878592698728e-06, + "loss": 0.8882176280021667, + "step": 8914 + }, + { + "epoch": 1.6229179939929006, + "grad_norm": 12.4375, + "learning_rate": 2.757727334845658e-06, + "loss": 1.1957151889801025, + "step": 8916 + }, + { + "epoch": 1.6232820606170928, + "grad_norm": 13.4375, + "learning_rate": 2.756966845973974e-06, + "loss": 1.4767260551452637, + "step": 8918 + }, + { + "epoch": 1.6236461272412852, + "grad_norm": 8.0, + "learning_rate": 2.7562063927664207e-06, + "loss": 1.5322933197021484, + "step": 8920 + }, + { + "epoch": 1.6240101938654774, + "grad_norm": 21.0, + "learning_rate": 2.755445975334592e-06, + "loss": 1.4067540168762207, + "step": 8922 + }, + { + "epoch": 1.6243742604896696, + "grad_norm": 15.5, + "learning_rate": 2.754685593790074e-06, + "loss": 1.1420401334762573, + "step": 8924 + }, + { + "epoch": 1.6247383271138618, + "grad_norm": 18.375, + "learning_rate": 2.753925248244452e-06, + "loss": 0.8038854598999023, + "step": 8926 + }, + { + "epoch": 1.6251023937380542, + "grad_norm": 10.1875, + "learning_rate": 2.7531649388093017e-06, + "loss": 1.5709344148635864, + "step": 8928 + }, + { + "epoch": 1.6254664603622464, + "grad_norm": 17.25, + "learning_rate": 2.752404665596197e-06, + "loss": 1.4348938465118408, + "step": 8930 + }, + { + "epoch": 1.6258305269864386, + "grad_norm": 12.9375, + "learning_rate": 2.7516444287167045e-06, + "loss": 1.9187887907028198, + "step": 8932 + }, + { + "epoch": 1.6261945936106308, + "grad_norm": 11.1875, + "learning_rate": 2.7508842282823844e-06, + "loss": 1.3741451501846313, + "step": 8934 + }, + { + "epoch": 1.626558660234823, + "grad_norm": 29.625, + "learning_rate": 2.7501240644047955e-06, + "loss": 1.978157877922058, + "step": 8936 + }, + { + "epoch": 1.6269227268590152, + "grad_norm": 13.125, + "learning_rate": 2.749363937195486e-06, + "loss": 1.3270370960235596, + "step": 8938 + }, + { + "epoch": 1.6272867934832074, + "grad_norm": 14.625, + "learning_rate": 2.748603846766004e-06, + "loss": 1.503261685371399, + "step": 8940 + }, + { + "epoch": 1.6276508601073996, + "grad_norm": 19.375, + "learning_rate": 2.747843793227889e-06, + "loss": 1.347365379333496, + "step": 8942 + }, + { + "epoch": 1.6280149267315918, + "grad_norm": 5.8125, + "learning_rate": 2.7470837766926743e-06, + "loss": 0.8761991262435913, + "step": 8944 + }, + { + "epoch": 1.628378993355784, + "grad_norm": 13.8125, + "learning_rate": 2.7463237972718913e-06, + "loss": 1.1706876754760742, + "step": 8946 + }, + { + "epoch": 1.6287430599799764, + "grad_norm": 52.0, + "learning_rate": 2.7455638550770625e-06, + "loss": 1.4347889423370361, + "step": 8948 + }, + { + "epoch": 1.6291071266041686, + "grad_norm": 12.0625, + "learning_rate": 2.7448039502197077e-06, + "loss": 1.2601158618927002, + "step": 8950 + }, + { + "epoch": 1.6294711932283608, + "grad_norm": 15.8125, + "learning_rate": 2.744044082811338e-06, + "loss": 1.5689654350280762, + "step": 8952 + }, + { + "epoch": 1.6298352598525532, + "grad_norm": 13.375, + "learning_rate": 2.743284252963462e-06, + "loss": 1.5567150115966797, + "step": 8954 + }, + { + "epoch": 1.6301993264767454, + "grad_norm": 12.625, + "learning_rate": 2.7425244607875825e-06, + "loss": 1.1311638355255127, + "step": 8956 + }, + { + "epoch": 1.6305633931009376, + "grad_norm": 44.25, + "learning_rate": 2.7417647063951936e-06, + "loss": 1.5073493719100952, + "step": 8958 + }, + { + "epoch": 1.6309274597251298, + "grad_norm": 13.875, + "learning_rate": 2.741004989897788e-06, + "loss": 1.303006887435913, + "step": 8960 + }, + { + "epoch": 1.631291526349322, + "grad_norm": 5.46875, + "learning_rate": 2.7402453114068504e-06, + "loss": 1.0294671058654785, + "step": 8962 + }, + { + "epoch": 1.6316555929735141, + "grad_norm": 6.9375, + "learning_rate": 2.73948567103386e-06, + "loss": 1.4539611339569092, + "step": 8964 + }, + { + "epoch": 1.6320196595977063, + "grad_norm": 13.1875, + "learning_rate": 2.7387260688902933e-06, + "loss": 1.7967519760131836, + "step": 8966 + }, + { + "epoch": 1.6323837262218985, + "grad_norm": 10.1875, + "learning_rate": 2.737966505087616e-06, + "loss": 1.634358286857605, + "step": 8968 + }, + { + "epoch": 1.6327477928460907, + "grad_norm": 34.25, + "learning_rate": 2.7372069797372925e-06, + "loss": 1.7835495471954346, + "step": 8970 + }, + { + "epoch": 1.633111859470283, + "grad_norm": 10.3125, + "learning_rate": 2.73644749295078e-06, + "loss": 1.5124640464782715, + "step": 8972 + }, + { + "epoch": 1.6334759260944753, + "grad_norm": 10.3125, + "learning_rate": 2.7356880448395284e-06, + "loss": 1.5099689960479736, + "step": 8974 + }, + { + "epoch": 1.6338399927186675, + "grad_norm": 10.125, + "learning_rate": 2.7349286355149868e-06, + "loss": 1.367443323135376, + "step": 8976 + }, + { + "epoch": 1.6342040593428597, + "grad_norm": 4.40625, + "learning_rate": 2.7341692650885927e-06, + "loss": 1.472070574760437, + "step": 8978 + }, + { + "epoch": 1.634568125967052, + "grad_norm": 6.875, + "learning_rate": 2.733409933671782e-06, + "loss": 0.9984011650085449, + "step": 8980 + }, + { + "epoch": 1.6349321925912443, + "grad_norm": 99.5, + "learning_rate": 2.7326506413759836e-06, + "loss": 1.314826250076294, + "step": 8982 + }, + { + "epoch": 1.6352962592154365, + "grad_norm": 10.5, + "learning_rate": 2.7318913883126184e-06, + "loss": 0.881316065788269, + "step": 8984 + }, + { + "epoch": 1.6356603258396287, + "grad_norm": 8.5, + "learning_rate": 2.7311321745931073e-06, + "loss": 1.4579654932022095, + "step": 8986 + }, + { + "epoch": 1.636024392463821, + "grad_norm": 4.3125, + "learning_rate": 2.7303730003288585e-06, + "loss": 1.0781248807907104, + "step": 8988 + }, + { + "epoch": 1.6363884590880131, + "grad_norm": 7.625, + "learning_rate": 2.729613865631281e-06, + "loss": 1.2237229347229004, + "step": 8990 + }, + { + "epoch": 1.6367525257122053, + "grad_norm": 16.875, + "learning_rate": 2.728854770611772e-06, + "loss": 1.513213872909546, + "step": 8992 + }, + { + "epoch": 1.6371165923363975, + "grad_norm": 8.9375, + "learning_rate": 2.7280957153817256e-06, + "loss": 1.3854542970657349, + "step": 8994 + }, + { + "epoch": 1.6374806589605897, + "grad_norm": 15.0, + "learning_rate": 2.7273367000525324e-06, + "loss": 1.1437013149261475, + "step": 8996 + }, + { + "epoch": 1.637844725584782, + "grad_norm": 7.21875, + "learning_rate": 2.7265777247355723e-06, + "loss": 1.3488255739212036, + "step": 8998 + }, + { + "epoch": 1.638208792208974, + "grad_norm": 15.25, + "learning_rate": 2.725818789542224e-06, + "loss": 1.4252893924713135, + "step": 9000 + }, + { + "epoch": 1.6385728588331665, + "grad_norm": 11.125, + "learning_rate": 2.725059894583857e-06, + "loss": 1.497605800628662, + "step": 9002 + }, + { + "epoch": 1.6389369254573587, + "grad_norm": 8.8125, + "learning_rate": 2.7243010399718356e-06, + "loss": 1.2594776153564453, + "step": 9004 + }, + { + "epoch": 1.639300992081551, + "grad_norm": 8.125, + "learning_rate": 2.7235422258175202e-06, + "loss": 1.3826425075531006, + "step": 9006 + }, + { + "epoch": 1.6396650587057433, + "grad_norm": 8.375, + "learning_rate": 2.7227834522322616e-06, + "loss": 1.3130407333374023, + "step": 9008 + }, + { + "epoch": 1.6400291253299355, + "grad_norm": 9.375, + "learning_rate": 2.7220247193274095e-06, + "loss": 1.435045838356018, + "step": 9010 + }, + { + "epoch": 1.6403931919541277, + "grad_norm": 14.8125, + "learning_rate": 2.721266027214302e-06, + "loss": 1.6278200149536133, + "step": 9012 + }, + { + "epoch": 1.6407572585783199, + "grad_norm": 16.125, + "learning_rate": 2.7205073760042755e-06, + "loss": 1.2888107299804688, + "step": 9014 + }, + { + "epoch": 1.641121325202512, + "grad_norm": 23.375, + "learning_rate": 2.7197487658086596e-06, + "loss": 1.8384628295898438, + "step": 9016 + }, + { + "epoch": 1.6414853918267043, + "grad_norm": 8.4375, + "learning_rate": 2.7189901967387755e-06, + "loss": 1.5917160511016846, + "step": 9018 + }, + { + "epoch": 1.6418494584508965, + "grad_norm": 10.6875, + "learning_rate": 2.718231668905943e-06, + "loss": 1.2195689678192139, + "step": 9020 + }, + { + "epoch": 1.6422135250750887, + "grad_norm": 8.0, + "learning_rate": 2.7174731824214693e-06, + "loss": 1.3459898233413696, + "step": 9022 + }, + { + "epoch": 1.6425775916992809, + "grad_norm": 13.3125, + "learning_rate": 2.716714737396662e-06, + "loss": 1.4541394710540771, + "step": 9024 + }, + { + "epoch": 1.642941658323473, + "grad_norm": 10.4375, + "learning_rate": 2.7159563339428186e-06, + "loss": 1.2718074321746826, + "step": 9026 + }, + { + "epoch": 1.6433057249476655, + "grad_norm": 8.8125, + "learning_rate": 2.7151979721712316e-06, + "loss": 1.1680617332458496, + "step": 9028 + }, + { + "epoch": 1.6436697915718577, + "grad_norm": 12.9375, + "learning_rate": 2.7144396521931892e-06, + "loss": 1.4630279541015625, + "step": 9030 + }, + { + "epoch": 1.6440338581960499, + "grad_norm": 16.5, + "learning_rate": 2.7136813741199703e-06, + "loss": 1.5427327156066895, + "step": 9032 + }, + { + "epoch": 1.644397924820242, + "grad_norm": 43.5, + "learning_rate": 2.7129231380628483e-06, + "loss": 1.393282413482666, + "step": 9034 + }, + { + "epoch": 1.6447619914444345, + "grad_norm": 8.375, + "learning_rate": 2.712164944133094e-06, + "loss": 1.577666997909546, + "step": 9036 + }, + { + "epoch": 1.6451260580686267, + "grad_norm": 39.75, + "learning_rate": 2.711406792441966e-06, + "loss": 1.3574159145355225, + "step": 9038 + }, + { + "epoch": 1.6454901246928189, + "grad_norm": 9.1875, + "learning_rate": 2.710648683100723e-06, + "loss": 1.2750723361968994, + "step": 9040 + }, + { + "epoch": 1.645854191317011, + "grad_norm": 10.1875, + "learning_rate": 2.709890616220613e-06, + "loss": 1.46896231174469, + "step": 9042 + }, + { + "epoch": 1.6462182579412032, + "grad_norm": 29.375, + "learning_rate": 2.7091325919128785e-06, + "loss": 1.554670810699463, + "step": 9044 + }, + { + "epoch": 1.6465823245653954, + "grad_norm": 10.875, + "learning_rate": 2.7083746102887575e-06, + "loss": 1.6285943984985352, + "step": 9046 + }, + { + "epoch": 1.6469463911895876, + "grad_norm": 4.8125, + "learning_rate": 2.707616671459481e-06, + "loss": 1.3509740829467773, + "step": 9048 + }, + { + "epoch": 1.6473104578137798, + "grad_norm": 10.3125, + "learning_rate": 2.706858775536273e-06, + "loss": 1.3567811250686646, + "step": 9050 + }, + { + "epoch": 1.647674524437972, + "grad_norm": 3.96875, + "learning_rate": 2.7061009226303513e-06, + "loss": 1.1134076118469238, + "step": 9052 + }, + { + "epoch": 1.6480385910621642, + "grad_norm": 11.25, + "learning_rate": 2.7053431128529286e-06, + "loss": 1.0791559219360352, + "step": 9054 + }, + { + "epoch": 1.6484026576863566, + "grad_norm": 10.125, + "learning_rate": 2.70458534631521e-06, + "loss": 0.8206127285957336, + "step": 9056 + }, + { + "epoch": 1.6487667243105488, + "grad_norm": 6.4375, + "learning_rate": 2.7038276231283943e-06, + "loss": 1.065165638923645, + "step": 9058 + }, + { + "epoch": 1.649130790934741, + "grad_norm": 19.875, + "learning_rate": 2.7030699434036746e-06, + "loss": 1.3392390012741089, + "step": 9060 + }, + { + "epoch": 1.6494948575589334, + "grad_norm": 17.125, + "learning_rate": 2.7023123072522377e-06, + "loss": 1.3549519777297974, + "step": 9062 + }, + { + "epoch": 1.6498589241831256, + "grad_norm": 7.125, + "learning_rate": 2.7015547147852626e-06, + "loss": 1.3336328268051147, + "step": 9064 + }, + { + "epoch": 1.6502229908073178, + "grad_norm": 54.25, + "learning_rate": 2.7007971661139244e-06, + "loss": 1.0506854057312012, + "step": 9066 + }, + { + "epoch": 1.65058705743151, + "grad_norm": 8.75, + "learning_rate": 2.7000396613493886e-06, + "loss": 0.9831037521362305, + "step": 9068 + }, + { + "epoch": 1.6509511240557022, + "grad_norm": 16.25, + "learning_rate": 2.6992822006028185e-06, + "loss": 1.522639274597168, + "step": 9070 + }, + { + "epoch": 1.6513151906798944, + "grad_norm": 10.5625, + "learning_rate": 2.6985247839853656e-06, + "loss": 1.4508020877838135, + "step": 9072 + }, + { + "epoch": 1.6516792573040866, + "grad_norm": 26.125, + "learning_rate": 2.697767411608179e-06, + "loss": 1.564084529876709, + "step": 9074 + }, + { + "epoch": 1.6520433239282788, + "grad_norm": 17.375, + "learning_rate": 2.6970100835824e-06, + "loss": 1.777204990386963, + "step": 9076 + }, + { + "epoch": 1.652407390552471, + "grad_norm": 8.1875, + "learning_rate": 2.6962528000191622e-06, + "loss": 1.308849573135376, + "step": 9078 + }, + { + "epoch": 1.6527714571766632, + "grad_norm": 10.75, + "learning_rate": 2.6954955610295963e-06, + "loss": 1.4076833724975586, + "step": 9080 + }, + { + "epoch": 1.6531355238008556, + "grad_norm": 14.375, + "learning_rate": 2.694738366724823e-06, + "loss": 1.5952646732330322, + "step": 9082 + }, + { + "epoch": 1.6534995904250478, + "grad_norm": 14.0625, + "learning_rate": 2.6939812172159565e-06, + "loss": 1.5751872062683105, + "step": 9084 + }, + { + "epoch": 1.65386365704924, + "grad_norm": 9.1875, + "learning_rate": 2.6932241126141066e-06, + "loss": 1.7137010097503662, + "step": 9086 + }, + { + "epoch": 1.6542277236734324, + "grad_norm": 21.625, + "learning_rate": 2.692467053030374e-06, + "loss": 1.4660654067993164, + "step": 9088 + }, + { + "epoch": 1.6545917902976246, + "grad_norm": 15.75, + "learning_rate": 2.6917100385758564e-06, + "loss": 1.0201032161712646, + "step": 9090 + }, + { + "epoch": 1.6549558569218168, + "grad_norm": 15.5625, + "learning_rate": 2.6909530693616408e-06, + "loss": 1.4451243877410889, + "step": 9092 + }, + { + "epoch": 1.655319923546009, + "grad_norm": 9.4375, + "learning_rate": 2.6901961454988092e-06, + "loss": 1.1621956825256348, + "step": 9094 + }, + { + "epoch": 1.6556839901702012, + "grad_norm": 8.625, + "learning_rate": 2.6894392670984386e-06, + "loss": 1.4459900856018066, + "step": 9096 + }, + { + "epoch": 1.6560480567943934, + "grad_norm": 11.0, + "learning_rate": 2.6886824342715967e-06, + "loss": 1.3991408348083496, + "step": 9098 + }, + { + "epoch": 1.6564121234185856, + "grad_norm": 27.875, + "learning_rate": 2.687925647129346e-06, + "loss": 1.618884563446045, + "step": 9100 + }, + { + "epoch": 1.6567761900427778, + "grad_norm": 15.25, + "learning_rate": 2.687168905782741e-06, + "loss": 1.824678659439087, + "step": 9102 + }, + { + "epoch": 1.65714025666697, + "grad_norm": 21.375, + "learning_rate": 2.6864122103428313e-06, + "loss": 1.3834012746810913, + "step": 9104 + }, + { + "epoch": 1.6575043232911622, + "grad_norm": 4.0625, + "learning_rate": 2.68565556092066e-06, + "loss": 1.0795729160308838, + "step": 9106 + }, + { + "epoch": 1.6578683899153543, + "grad_norm": 27.375, + "learning_rate": 2.684898957627261e-06, + "loss": 0.8226364850997925, + "step": 9108 + }, + { + "epoch": 1.6582324565395468, + "grad_norm": 16.75, + "learning_rate": 2.6841424005736627e-06, + "loss": 1.4209195375442505, + "step": 9110 + }, + { + "epoch": 1.658596523163739, + "grad_norm": 13.1875, + "learning_rate": 2.6833858898708875e-06, + "loss": 1.658259630203247, + "step": 9112 + }, + { + "epoch": 1.6589605897879312, + "grad_norm": 18.625, + "learning_rate": 2.682629425629949e-06, + "loss": 1.596691608428955, + "step": 9114 + }, + { + "epoch": 1.6593246564121236, + "grad_norm": 36.0, + "learning_rate": 2.6818730079618564e-06, + "loss": 0.21741102635860443, + "step": 9116 + }, + { + "epoch": 1.6596887230363158, + "grad_norm": 92.0, + "learning_rate": 2.6811166369776105e-06, + "loss": 1.3444486856460571, + "step": 9118 + }, + { + "epoch": 1.660052789660508, + "grad_norm": 18.875, + "learning_rate": 2.680360312788206e-06, + "loss": 1.788914442062378, + "step": 9120 + }, + { + "epoch": 1.6604168562847001, + "grad_norm": 12.875, + "learning_rate": 2.6796040355046305e-06, + "loss": 1.463478684425354, + "step": 9122 + }, + { + "epoch": 1.6607809229088923, + "grad_norm": 11.375, + "learning_rate": 2.6788478052378635e-06, + "loss": 1.6262067556381226, + "step": 9124 + }, + { + "epoch": 1.6611449895330845, + "grad_norm": 17.0, + "learning_rate": 2.6780916220988806e-06, + "loss": 1.5845322608947754, + "step": 9126 + }, + { + "epoch": 1.6615090561572767, + "grad_norm": 19.375, + "learning_rate": 2.6773354861986465e-06, + "loss": 1.7036840915679932, + "step": 9128 + }, + { + "epoch": 1.661873122781469, + "grad_norm": 25.25, + "learning_rate": 2.6765793976481234e-06, + "loss": 1.8864271640777588, + "step": 9130 + }, + { + "epoch": 1.6622371894056611, + "grad_norm": 6.3125, + "learning_rate": 2.6758233565582624e-06, + "loss": 1.4782168865203857, + "step": 9132 + }, + { + "epoch": 1.6626012560298533, + "grad_norm": 24.625, + "learning_rate": 2.675067363040009e-06, + "loss": 1.9831485748291016, + "step": 9134 + }, + { + "epoch": 1.6629653226540457, + "grad_norm": 15.3125, + "learning_rate": 2.674311417204305e-06, + "loss": 1.4451037645339966, + "step": 9136 + }, + { + "epoch": 1.663329389278238, + "grad_norm": 35.75, + "learning_rate": 2.673555519162079e-06, + "loss": 1.2403595447540283, + "step": 9138 + }, + { + "epoch": 1.6636934559024301, + "grad_norm": 7.90625, + "learning_rate": 2.67279966902426e-06, + "loss": 0.8842126727104187, + "step": 9140 + }, + { + "epoch": 1.6640575225266225, + "grad_norm": 16.25, + "learning_rate": 2.672043866901761e-06, + "loss": 1.546290636062622, + "step": 9142 + }, + { + "epoch": 1.6644215891508147, + "grad_norm": 16.75, + "learning_rate": 2.6712881129054962e-06, + "loss": 2.038055658340454, + "step": 9144 + }, + { + "epoch": 1.664785655775007, + "grad_norm": 8.75, + "learning_rate": 2.6705324071463694e-06, + "loss": 1.495240330696106, + "step": 9146 + }, + { + "epoch": 1.6651497223991991, + "grad_norm": 9.4375, + "learning_rate": 2.6697767497352755e-06, + "loss": 1.4728375673294067, + "step": 9148 + }, + { + "epoch": 1.6655137890233913, + "grad_norm": 22.0, + "learning_rate": 2.6690211407831067e-06, + "loss": 1.430677056312561, + "step": 9150 + }, + { + "epoch": 1.6658778556475835, + "grad_norm": 16.5, + "learning_rate": 2.6682655804007427e-06, + "loss": 1.9896063804626465, + "step": 9152 + }, + { + "epoch": 1.6662419222717757, + "grad_norm": 8.6875, + "learning_rate": 2.6675100686990605e-06, + "loss": 1.651367425918579, + "step": 9154 + }, + { + "epoch": 1.666605988895968, + "grad_norm": 3.9375, + "learning_rate": 2.66675460578893e-06, + "loss": 0.9940225481987, + "step": 9156 + }, + { + "epoch": 1.66697005552016, + "grad_norm": 9.9375, + "learning_rate": 2.665999191781209e-06, + "loss": 1.243117332458496, + "step": 9158 + }, + { + "epoch": 1.6673341221443523, + "grad_norm": 8.5625, + "learning_rate": 2.6652438267867537e-06, + "loss": 1.3259273767471313, + "step": 9160 + }, + { + "epoch": 1.6676981887685447, + "grad_norm": 13.9375, + "learning_rate": 2.6644885109164097e-06, + "loss": 1.8624930381774902, + "step": 9162 + }, + { + "epoch": 1.668062255392737, + "grad_norm": 18.125, + "learning_rate": 2.663733244281017e-06, + "loss": 1.9162217378616333, + "step": 9164 + }, + { + "epoch": 1.668426322016929, + "grad_norm": 17.0, + "learning_rate": 2.6629780269914083e-06, + "loss": 1.2164101600646973, + "step": 9166 + }, + { + "epoch": 1.6687903886411213, + "grad_norm": 9.6875, + "learning_rate": 2.6622228591584076e-06, + "loss": 1.4263803958892822, + "step": 9168 + }, + { + "epoch": 1.6691544552653137, + "grad_norm": 10.75, + "learning_rate": 2.6614677408928347e-06, + "loss": 1.2728183269500732, + "step": 9170 + }, + { + "epoch": 1.669518521889506, + "grad_norm": 13.0, + "learning_rate": 2.6607126723054977e-06, + "loss": 1.4746191501617432, + "step": 9172 + }, + { + "epoch": 1.669882588513698, + "grad_norm": 15.9375, + "learning_rate": 2.659957653507201e-06, + "loss": 1.4660627841949463, + "step": 9174 + }, + { + "epoch": 1.6702466551378903, + "grad_norm": 21.375, + "learning_rate": 2.6592026846087417e-06, + "loss": 1.599808692932129, + "step": 9176 + }, + { + "epoch": 1.6706107217620825, + "grad_norm": 19.125, + "learning_rate": 2.658447765720906e-06, + "loss": 1.6903605461120605, + "step": 9178 + }, + { + "epoch": 1.6709747883862747, + "grad_norm": 20.625, + "learning_rate": 2.657692896954478e-06, + "loss": 2.237009048461914, + "step": 9180 + }, + { + "epoch": 1.6713388550104669, + "grad_norm": 17.625, + "learning_rate": 2.6569380784202293e-06, + "loss": 1.7403486967086792, + "step": 9182 + }, + { + "epoch": 1.671702921634659, + "grad_norm": 23.625, + "learning_rate": 2.656183310228927e-06, + "loss": 1.7829509973526, + "step": 9184 + }, + { + "epoch": 1.6720669882588513, + "grad_norm": 10.625, + "learning_rate": 2.6554285924913308e-06, + "loss": 0.8804969191551208, + "step": 9186 + }, + { + "epoch": 1.6724310548830434, + "grad_norm": 10.5625, + "learning_rate": 2.654673925318192e-06, + "loss": 1.4425172805786133, + "step": 9188 + }, + { + "epoch": 1.6727951215072359, + "grad_norm": 9.0, + "learning_rate": 2.653919308820256e-06, + "loss": 0.6721599102020264, + "step": 9190 + }, + { + "epoch": 1.673159188131428, + "grad_norm": 13.625, + "learning_rate": 2.653164743108258e-06, + "loss": 1.5481771230697632, + "step": 9192 + }, + { + "epoch": 1.6735232547556202, + "grad_norm": 11.125, + "learning_rate": 2.652410228292928e-06, + "loss": 1.62593412399292, + "step": 9194 + }, + { + "epoch": 1.6738873213798127, + "grad_norm": 26.125, + "learning_rate": 2.6516557644849895e-06, + "loss": 1.1630219221115112, + "step": 9196 + }, + { + "epoch": 1.6742513880040049, + "grad_norm": 16.5, + "learning_rate": 2.6509013517951554e-06, + "loss": 0.5983873605728149, + "step": 9198 + }, + { + "epoch": 1.674615454628197, + "grad_norm": 18.375, + "learning_rate": 2.650146990334133e-06, + "loss": 1.5465179681777954, + "step": 9200 + }, + { + "epoch": 1.6749795212523892, + "grad_norm": 18.5, + "learning_rate": 2.6493926802126223e-06, + "loss": 1.7888509035110474, + "step": 9202 + }, + { + "epoch": 1.6753435878765814, + "grad_norm": 10.5625, + "learning_rate": 2.648638421541314e-06, + "loss": 1.5550678968429565, + "step": 9204 + }, + { + "epoch": 1.6757076545007736, + "grad_norm": 11.0625, + "learning_rate": 2.6478842144308948e-06, + "loss": 1.0708715915679932, + "step": 9206 + }, + { + "epoch": 1.6760717211249658, + "grad_norm": 11.5625, + "learning_rate": 2.6471300589920386e-06, + "loss": 1.1484529972076416, + "step": 9208 + }, + { + "epoch": 1.676435787749158, + "grad_norm": 14.8125, + "learning_rate": 2.6463759553354173e-06, + "loss": 1.0184969902038574, + "step": 9210 + }, + { + "epoch": 1.6767998543733502, + "grad_norm": 12.8125, + "learning_rate": 2.6456219035716914e-06, + "loss": 1.2476813793182373, + "step": 9212 + }, + { + "epoch": 1.6771639209975424, + "grad_norm": 31.625, + "learning_rate": 2.644867903811514e-06, + "loss": 1.5933899879455566, + "step": 9214 + }, + { + "epoch": 1.6775279876217348, + "grad_norm": 55.75, + "learning_rate": 2.644113956165534e-06, + "loss": 2.2150018215179443, + "step": 9216 + }, + { + "epoch": 1.677892054245927, + "grad_norm": 3.84375, + "learning_rate": 2.6433600607443865e-06, + "loss": 1.0567104816436768, + "step": 9218 + }, + { + "epoch": 1.6782561208701192, + "grad_norm": 17.25, + "learning_rate": 2.6426062176587065e-06, + "loss": 1.4568065404891968, + "step": 9220 + }, + { + "epoch": 1.6786201874943114, + "grad_norm": 16.375, + "learning_rate": 2.6418524270191154e-06, + "loss": 1.435537576675415, + "step": 9222 + }, + { + "epoch": 1.6789842541185038, + "grad_norm": 12.0, + "learning_rate": 2.641098688936229e-06, + "loss": 1.5779021978378296, + "step": 9224 + }, + { + "epoch": 1.679348320742696, + "grad_norm": 8.9375, + "learning_rate": 2.6403450035206557e-06, + "loss": 1.4415068626403809, + "step": 9226 + }, + { + "epoch": 1.6797123873668882, + "grad_norm": 5.84375, + "learning_rate": 2.6395913708829945e-06, + "loss": 1.1263445615768433, + "step": 9228 + }, + { + "epoch": 1.6800764539910804, + "grad_norm": 9.375, + "learning_rate": 2.63883779113384e-06, + "loss": 1.2108830213546753, + "step": 9230 + }, + { + "epoch": 1.6804405206152726, + "grad_norm": 5.71875, + "learning_rate": 2.638084264383777e-06, + "loss": 0.6980651021003723, + "step": 9232 + }, + { + "epoch": 1.6808045872394648, + "grad_norm": 21.375, + "learning_rate": 2.6373307907433792e-06, + "loss": 1.3352385759353638, + "step": 9234 + }, + { + "epoch": 1.681168653863657, + "grad_norm": 15.25, + "learning_rate": 2.63657737032322e-06, + "loss": 0.7894084453582764, + "step": 9236 + }, + { + "epoch": 1.6815327204878492, + "grad_norm": 26.375, + "learning_rate": 2.635824003233858e-06, + "loss": 1.5009653568267822, + "step": 9238 + }, + { + "epoch": 1.6818967871120414, + "grad_norm": 8.0625, + "learning_rate": 2.635070689585848e-06, + "loss": 1.0865978002548218, + "step": 9240 + }, + { + "epoch": 1.6822608537362336, + "grad_norm": 13.4375, + "learning_rate": 2.6343174294897345e-06, + "loss": 1.5583518743515015, + "step": 9242 + }, + { + "epoch": 1.682624920360426, + "grad_norm": 5.3125, + "learning_rate": 2.6335642230560566e-06, + "loss": 1.0222206115722656, + "step": 9244 + }, + { + "epoch": 1.6829889869846182, + "grad_norm": 6.78125, + "learning_rate": 2.6328110703953447e-06, + "loss": 1.3737409114837646, + "step": 9246 + }, + { + "epoch": 1.6833530536088104, + "grad_norm": 10.75, + "learning_rate": 2.632057971618119e-06, + "loss": 1.403762936592102, + "step": 9248 + }, + { + "epoch": 1.6837171202330028, + "grad_norm": 8.3125, + "learning_rate": 2.6313049268348966e-06, + "loss": 1.3826375007629395, + "step": 9250 + }, + { + "epoch": 1.684081186857195, + "grad_norm": 11.125, + "learning_rate": 2.6305519361561806e-06, + "loss": 1.4757513999938965, + "step": 9252 + }, + { + "epoch": 1.6844452534813872, + "grad_norm": 9.9375, + "learning_rate": 2.6297989996924707e-06, + "loss": 1.2882804870605469, + "step": 9254 + }, + { + "epoch": 1.6848093201055794, + "grad_norm": 18.5, + "learning_rate": 2.629046117554258e-06, + "loss": 1.0880687236785889, + "step": 9256 + }, + { + "epoch": 1.6851733867297716, + "grad_norm": 16.5, + "learning_rate": 2.628293289852023e-06, + "loss": 1.344069004058838, + "step": 9258 + }, + { + "epoch": 1.6855374533539638, + "grad_norm": 8.5, + "learning_rate": 2.627540516696243e-06, + "loss": 1.216900110244751, + "step": 9260 + }, + { + "epoch": 1.685901519978156, + "grad_norm": 16.375, + "learning_rate": 2.6267877981973815e-06, + "loss": 1.1618045568466187, + "step": 9262 + }, + { + "epoch": 1.6862655866023482, + "grad_norm": 12.5, + "learning_rate": 2.6260351344658987e-06, + "loss": 1.771470546722412, + "step": 9264 + }, + { + "epoch": 1.6866296532265403, + "grad_norm": 9.125, + "learning_rate": 2.625282525612244e-06, + "loss": 1.7320276498794556, + "step": 9266 + }, + { + "epoch": 1.6869937198507325, + "grad_norm": 5.1875, + "learning_rate": 2.62452997174686e-06, + "loss": 1.2539374828338623, + "step": 9268 + }, + { + "epoch": 1.687357786474925, + "grad_norm": 9.5, + "learning_rate": 2.6237774729801812e-06, + "loss": 1.0105149745941162, + "step": 9270 + }, + { + "epoch": 1.6877218530991172, + "grad_norm": 10.8125, + "learning_rate": 2.623025029422633e-06, + "loss": 1.4932713508605957, + "step": 9272 + }, + { + "epoch": 1.6880859197233093, + "grad_norm": 23.0, + "learning_rate": 2.622272641184635e-06, + "loss": 1.8004236221313477, + "step": 9274 + }, + { + "epoch": 1.6884499863475015, + "grad_norm": 10.9375, + "learning_rate": 2.621520308376596e-06, + "loss": 1.4048701524734497, + "step": 9276 + }, + { + "epoch": 1.688814052971694, + "grad_norm": 14.375, + "learning_rate": 2.6207680311089167e-06, + "loss": 1.559190273284912, + "step": 9278 + }, + { + "epoch": 1.6891781195958862, + "grad_norm": 8.5625, + "learning_rate": 2.6200158094919936e-06, + "loss": 1.4601926803588867, + "step": 9280 + }, + { + "epoch": 1.6895421862200783, + "grad_norm": 10.75, + "learning_rate": 2.6192636436362094e-06, + "loss": 1.6643003225326538, + "step": 9282 + }, + { + "epoch": 1.6899062528442705, + "grad_norm": 10.3125, + "learning_rate": 2.618511533651943e-06, + "loss": 1.4071576595306396, + "step": 9284 + }, + { + "epoch": 1.6902703194684627, + "grad_norm": 6.90625, + "learning_rate": 2.617759479649563e-06, + "loss": 1.356034278869629, + "step": 9286 + }, + { + "epoch": 1.690634386092655, + "grad_norm": 8.1875, + "learning_rate": 2.6170074817394297e-06, + "loss": 1.2255876064300537, + "step": 9288 + }, + { + "epoch": 1.6909984527168471, + "grad_norm": 33.5, + "learning_rate": 2.6162555400318966e-06, + "loss": 1.416519045829773, + "step": 9290 + }, + { + "epoch": 1.6913625193410393, + "grad_norm": 3.515625, + "learning_rate": 2.615503654637308e-06, + "loss": 1.1522860527038574, + "step": 9292 + }, + { + "epoch": 1.6917265859652315, + "grad_norm": 17.5, + "learning_rate": 2.614751825666e-06, + "loss": 1.2634915113449097, + "step": 9294 + }, + { + "epoch": 1.6920906525894237, + "grad_norm": 6.59375, + "learning_rate": 2.6140000532283003e-06, + "loss": 1.0256493091583252, + "step": 9296 + }, + { + "epoch": 1.6924547192136161, + "grad_norm": 13.9375, + "learning_rate": 2.6132483374345276e-06, + "loss": 1.3080412149429321, + "step": 9298 + }, + { + "epoch": 1.6928187858378083, + "grad_norm": 11.75, + "learning_rate": 2.6124966783949956e-06, + "loss": 1.8101611137390137, + "step": 9300 + }, + { + "epoch": 1.6931828524620005, + "grad_norm": 17.875, + "learning_rate": 2.611745076220005e-06, + "loss": 1.2362911701202393, + "step": 9302 + }, + { + "epoch": 1.693546919086193, + "grad_norm": 12.5625, + "learning_rate": 2.61099353101985e-06, + "loss": 0.8984555006027222, + "step": 9304 + }, + { + "epoch": 1.6939109857103851, + "grad_norm": 11.125, + "learning_rate": 2.6102420429048186e-06, + "loss": 1.5998139381408691, + "step": 9306 + }, + { + "epoch": 1.6942750523345773, + "grad_norm": 3.671875, + "learning_rate": 2.609490611985187e-06, + "loss": 1.3438420295715332, + "step": 9308 + }, + { + "epoch": 1.6946391189587695, + "grad_norm": 7.53125, + "learning_rate": 2.6087392383712274e-06, + "loss": 1.505103588104248, + "step": 9310 + }, + { + "epoch": 1.6950031855829617, + "grad_norm": 31.25, + "learning_rate": 2.6079879221731974e-06, + "loss": 1.414860486984253, + "step": 9312 + }, + { + "epoch": 1.695367252207154, + "grad_norm": 13.875, + "learning_rate": 2.6072366635013515e-06, + "loss": 1.612790584564209, + "step": 9314 + }, + { + "epoch": 1.695731318831346, + "grad_norm": 31.5, + "learning_rate": 2.606485462465934e-06, + "loss": 1.4038561582565308, + "step": 9316 + }, + { + "epoch": 1.6960953854555383, + "grad_norm": 16.625, + "learning_rate": 2.605734319177179e-06, + "loss": 1.6603057384490967, + "step": 9318 + }, + { + "epoch": 1.6964594520797305, + "grad_norm": 14.1875, + "learning_rate": 2.6049832337453163e-06, + "loss": 1.4171526432037354, + "step": 9320 + }, + { + "epoch": 1.6968235187039227, + "grad_norm": 8.8125, + "learning_rate": 2.604232206280562e-06, + "loss": 1.1852669715881348, + "step": 9322 + }, + { + "epoch": 1.697187585328115, + "grad_norm": 47.5, + "learning_rate": 2.603481236893127e-06, + "loss": 0.8682548999786377, + "step": 9324 + }, + { + "epoch": 1.6975516519523073, + "grad_norm": 46.25, + "learning_rate": 2.602730325693216e-06, + "loss": 1.2902064323425293, + "step": 9326 + }, + { + "epoch": 1.6979157185764995, + "grad_norm": 11.0, + "learning_rate": 2.6019794727910173e-06, + "loss": 1.3586435317993164, + "step": 9328 + }, + { + "epoch": 1.6982797852006917, + "grad_norm": 21.625, + "learning_rate": 2.601228678296719e-06, + "loss": 1.1663780212402344, + "step": 9330 + }, + { + "epoch": 1.698643851824884, + "grad_norm": 11.8125, + "learning_rate": 2.6004779423204963e-06, + "loss": 0.8382279276847839, + "step": 9332 + }, + { + "epoch": 1.6990079184490763, + "grad_norm": 42.0, + "learning_rate": 2.5997272649725155e-06, + "loss": 0.7721830606460571, + "step": 9334 + }, + { + "epoch": 1.6993719850732685, + "grad_norm": 7.875, + "learning_rate": 2.5989766463629373e-06, + "loss": 1.210772156715393, + "step": 9336 + }, + { + "epoch": 1.6997360516974607, + "grad_norm": 9.0625, + "learning_rate": 2.59822608660191e-06, + "loss": 1.6795170307159424, + "step": 9338 + }, + { + "epoch": 1.7001001183216529, + "grad_norm": 8.0, + "learning_rate": 2.597475585799577e-06, + "loss": 1.5050535202026367, + "step": 9340 + }, + { + "epoch": 1.700464184945845, + "grad_norm": 7.6875, + "learning_rate": 2.5967251440660705e-06, + "loss": 1.2744718790054321, + "step": 9342 + }, + { + "epoch": 1.7008282515700373, + "grad_norm": 10.8125, + "learning_rate": 2.5959747615115143e-06, + "loss": 1.2136510610580444, + "step": 9344 + }, + { + "epoch": 1.7011923181942294, + "grad_norm": 14.1875, + "learning_rate": 2.5952244382460255e-06, + "loss": 1.1773263216018677, + "step": 9346 + }, + { + "epoch": 1.7015563848184216, + "grad_norm": 7.09375, + "learning_rate": 2.594474174379709e-06, + "loss": 1.3566792011260986, + "step": 9348 + }, + { + "epoch": 1.7019204514426138, + "grad_norm": 11.0625, + "learning_rate": 2.5937239700226647e-06, + "loss": 1.6224448680877686, + "step": 9350 + }, + { + "epoch": 1.7022845180668063, + "grad_norm": 19.125, + "learning_rate": 2.5929738252849807e-06, + "loss": 1.5998003482818604, + "step": 9352 + }, + { + "epoch": 1.7026485846909984, + "grad_norm": 23.125, + "learning_rate": 2.592223740276738e-06, + "loss": 1.6076273918151855, + "step": 9354 + }, + { + "epoch": 1.7030126513151906, + "grad_norm": 17.0, + "learning_rate": 2.5914737151080095e-06, + "loss": 1.2455108165740967, + "step": 9356 + }, + { + "epoch": 1.703376717939383, + "grad_norm": 7.5625, + "learning_rate": 2.590723749888857e-06, + "loss": 1.497173547744751, + "step": 9358 + }, + { + "epoch": 1.7037407845635753, + "grad_norm": 16.25, + "learning_rate": 2.5899738447293366e-06, + "loss": 1.5745115280151367, + "step": 9360 + }, + { + "epoch": 1.7041048511877674, + "grad_norm": 13.4375, + "learning_rate": 2.5892239997394924e-06, + "loss": 1.5484111309051514, + "step": 9362 + }, + { + "epoch": 1.7044689178119596, + "grad_norm": 9.875, + "learning_rate": 2.5884742150293607e-06, + "loss": 1.6625993251800537, + "step": 9364 + }, + { + "epoch": 1.7048329844361518, + "grad_norm": 4.96875, + "learning_rate": 2.5877244907089716e-06, + "loss": 1.0240740776062012, + "step": 9366 + }, + { + "epoch": 1.705197051060344, + "grad_norm": 8.6875, + "learning_rate": 2.5869748268883414e-06, + "loss": 1.629879117012024, + "step": 9368 + }, + { + "epoch": 1.7055611176845362, + "grad_norm": 12.625, + "learning_rate": 2.586225223677483e-06, + "loss": 1.4819724559783936, + "step": 9370 + }, + { + "epoch": 1.7059251843087284, + "grad_norm": 12.625, + "learning_rate": 2.5854756811863945e-06, + "loss": 1.4234089851379395, + "step": 9372 + }, + { + "epoch": 1.7062892509329206, + "grad_norm": 6.9375, + "learning_rate": 2.5847261995250705e-06, + "loss": 1.3711363077163696, + "step": 9374 + }, + { + "epoch": 1.7066533175571128, + "grad_norm": 7.90625, + "learning_rate": 2.583976778803494e-06, + "loss": 1.3665838241577148, + "step": 9376 + }, + { + "epoch": 1.7070173841813052, + "grad_norm": 9.5, + "learning_rate": 2.583227419131639e-06, + "loss": 1.2841482162475586, + "step": 9378 + }, + { + "epoch": 1.7073814508054974, + "grad_norm": 16.5, + "learning_rate": 2.5824781206194717e-06, + "loss": 1.358459711074829, + "step": 9380 + }, + { + "epoch": 1.7077455174296896, + "grad_norm": 15.75, + "learning_rate": 2.581728883376947e-06, + "loss": 1.4769535064697266, + "step": 9382 + }, + { + "epoch": 1.708109584053882, + "grad_norm": 18.625, + "learning_rate": 2.580979707514014e-06, + "loss": 1.9856996536254883, + "step": 9384 + }, + { + "epoch": 1.7084736506780742, + "grad_norm": 10.875, + "learning_rate": 2.580230593140612e-06, + "loss": 1.3805100917816162, + "step": 9386 + }, + { + "epoch": 1.7088377173022664, + "grad_norm": 13.6875, + "learning_rate": 2.5794815403666674e-06, + "loss": 1.180823564529419, + "step": 9388 + }, + { + "epoch": 1.7092017839264586, + "grad_norm": 14.0625, + "learning_rate": 2.5787325493021042e-06, + "loss": 1.2458652257919312, + "step": 9390 + }, + { + "epoch": 1.7095658505506508, + "grad_norm": 16.0, + "learning_rate": 2.577983620056831e-06, + "loss": 1.3731491565704346, + "step": 9392 + }, + { + "epoch": 1.709929917174843, + "grad_norm": 11.5625, + "learning_rate": 2.5772347527407514e-06, + "loss": 1.512205958366394, + "step": 9394 + }, + { + "epoch": 1.7102939837990352, + "grad_norm": 13.0, + "learning_rate": 2.576485947463759e-06, + "loss": 1.0887117385864258, + "step": 9396 + }, + { + "epoch": 1.7106580504232274, + "grad_norm": 9.1875, + "learning_rate": 2.5757372043357376e-06, + "loss": 0.7916741371154785, + "step": 9398 + }, + { + "epoch": 1.7110221170474196, + "grad_norm": 17.0, + "learning_rate": 2.5749885234665617e-06, + "loss": 1.1433402299880981, + "step": 9400 + }, + { + "epoch": 1.7113861836716118, + "grad_norm": 19.375, + "learning_rate": 2.5742399049660976e-06, + "loss": 1.6151204109191895, + "step": 9402 + }, + { + "epoch": 1.711750250295804, + "grad_norm": 10.125, + "learning_rate": 2.5734913489442016e-06, + "loss": 1.2714388370513916, + "step": 9404 + }, + { + "epoch": 1.7121143169199964, + "grad_norm": 7.21875, + "learning_rate": 2.5727428555107224e-06, + "loss": 1.5417263507843018, + "step": 9406 + }, + { + "epoch": 1.7124783835441886, + "grad_norm": 9.0, + "learning_rate": 2.571994424775497e-06, + "loss": 1.2543859481811523, + "step": 9408 + }, + { + "epoch": 1.7128424501683808, + "grad_norm": 16.875, + "learning_rate": 2.5712460568483554e-06, + "loss": 1.5546578168869019, + "step": 9410 + }, + { + "epoch": 1.7132065167925732, + "grad_norm": 6.21875, + "learning_rate": 2.5704977518391176e-06, + "loss": 1.0436283349990845, + "step": 9412 + }, + { + "epoch": 1.7135705834167654, + "grad_norm": 10.0, + "learning_rate": 2.569749509857594e-06, + "loss": 1.3642622232437134, + "step": 9414 + }, + { + "epoch": 1.7139346500409576, + "grad_norm": 19.5, + "learning_rate": 2.5690013310135866e-06, + "loss": 0.9530158042907715, + "step": 9416 + }, + { + "epoch": 1.7142987166651498, + "grad_norm": 31.5, + "learning_rate": 2.568253215416886e-06, + "loss": 1.4778410196304321, + "step": 9418 + }, + { + "epoch": 1.714662783289342, + "grad_norm": 9.625, + "learning_rate": 2.5675051631772774e-06, + "loss": 1.286505937576294, + "step": 9420 + }, + { + "epoch": 1.7150268499135342, + "grad_norm": 12.375, + "learning_rate": 2.5667571744045328e-06, + "loss": 1.4485591650009155, + "step": 9422 + }, + { + "epoch": 1.7153909165377264, + "grad_norm": 9.75, + "learning_rate": 2.5660092492084163e-06, + "loss": 1.3602252006530762, + "step": 9424 + }, + { + "epoch": 1.7157549831619185, + "grad_norm": 13.125, + "learning_rate": 2.5652613876986853e-06, + "loss": 0.9761204719543457, + "step": 9426 + }, + { + "epoch": 1.7161190497861107, + "grad_norm": 26.375, + "learning_rate": 2.5645135899850826e-06, + "loss": 1.7280157804489136, + "step": 9428 + }, + { + "epoch": 1.716483116410303, + "grad_norm": 10.5, + "learning_rate": 2.5637658561773472e-06, + "loss": 1.4641187191009521, + "step": 9430 + }, + { + "epoch": 1.7168471830344953, + "grad_norm": 10.1875, + "learning_rate": 2.5630181863852034e-06, + "loss": 1.4083130359649658, + "step": 9432 + }, + { + "epoch": 1.7172112496586875, + "grad_norm": 14.0, + "learning_rate": 2.562270580718369e-06, + "loss": 0.9213528037071228, + "step": 9434 + }, + { + "epoch": 1.7175753162828797, + "grad_norm": 11.5, + "learning_rate": 2.5615230392865544e-06, + "loss": 1.0007522106170654, + "step": 9436 + }, + { + "epoch": 1.7179393829070722, + "grad_norm": 38.5, + "learning_rate": 2.560775562199456e-06, + "loss": 0.6947571039199829, + "step": 9438 + }, + { + "epoch": 1.7183034495312643, + "grad_norm": 17.75, + "learning_rate": 2.5600281495667646e-06, + "loss": 1.6564505100250244, + "step": 9440 + }, + { + "epoch": 1.7186675161554565, + "grad_norm": 25.625, + "learning_rate": 2.5592808014981586e-06, + "loss": 1.7123980522155762, + "step": 9442 + }, + { + "epoch": 1.7190315827796487, + "grad_norm": 7.5, + "learning_rate": 2.5585335181033088e-06, + "loss": 1.358048439025879, + "step": 9444 + }, + { + "epoch": 1.719395649403841, + "grad_norm": 6.0625, + "learning_rate": 2.5577862994918765e-06, + "loss": 1.3348028659820557, + "step": 9446 + }, + { + "epoch": 1.7197597160280331, + "grad_norm": 9.5625, + "learning_rate": 2.557039145773512e-06, + "loss": 1.3989973068237305, + "step": 9448 + }, + { + "epoch": 1.7201237826522253, + "grad_norm": 13.625, + "learning_rate": 2.5562920570578586e-06, + "loss": 1.7857215404510498, + "step": 9450 + }, + { + "epoch": 1.7204878492764175, + "grad_norm": 3.828125, + "learning_rate": 2.5555450334545472e-06, + "loss": 1.1638060808181763, + "step": 9452 + }, + { + "epoch": 1.7208519159006097, + "grad_norm": 6.65625, + "learning_rate": 2.5547980750732006e-06, + "loss": 1.1869091987609863, + "step": 9454 + }, + { + "epoch": 1.721215982524802, + "grad_norm": 14.6875, + "learning_rate": 2.5540511820234327e-06, + "loss": 1.2031751871109009, + "step": 9456 + }, + { + "epoch": 1.7215800491489943, + "grad_norm": 18.25, + "learning_rate": 2.5533043544148463e-06, + "loss": 1.6081607341766357, + "step": 9458 + }, + { + "epoch": 1.7219441157731865, + "grad_norm": 12.875, + "learning_rate": 2.5525575923570356e-06, + "loss": 1.149942398071289, + "step": 9460 + }, + { + "epoch": 1.7223081823973787, + "grad_norm": 16.625, + "learning_rate": 2.551810895959585e-06, + "loss": 1.8807177543640137, + "step": 9462 + }, + { + "epoch": 1.722672249021571, + "grad_norm": 10.75, + "learning_rate": 2.551064265332068e-06, + "loss": 1.5904488563537598, + "step": 9464 + }, + { + "epoch": 1.7230363156457633, + "grad_norm": 12.375, + "learning_rate": 2.5503177005840527e-06, + "loss": 1.200134038925171, + "step": 9466 + }, + { + "epoch": 1.7234003822699555, + "grad_norm": 9.75, + "learning_rate": 2.5495712018250907e-06, + "loss": 0.9712511301040649, + "step": 9468 + }, + { + "epoch": 1.7237644488941477, + "grad_norm": 6.53125, + "learning_rate": 2.5488247691647306e-06, + "loss": 1.2511146068572998, + "step": 9470 + }, + { + "epoch": 1.72412851551834, + "grad_norm": 15.3125, + "learning_rate": 2.548078402712507e-06, + "loss": 1.2763681411743164, + "step": 9472 + }, + { + "epoch": 1.724492582142532, + "grad_norm": 57.5, + "learning_rate": 2.5473321025779453e-06, + "loss": 1.3130264282226562, + "step": 9474 + }, + { + "epoch": 1.7248566487667243, + "grad_norm": 11.5625, + "learning_rate": 2.5465858688705647e-06, + "loss": 1.5271480083465576, + "step": 9476 + }, + { + "epoch": 1.7252207153909165, + "grad_norm": 4.21875, + "learning_rate": 2.5458397016998688e-06, + "loss": 1.0769906044006348, + "step": 9478 + }, + { + "epoch": 1.7255847820151087, + "grad_norm": 19.625, + "learning_rate": 2.545093601175357e-06, + "loss": 1.4715313911437988, + "step": 9480 + }, + { + "epoch": 1.7259488486393009, + "grad_norm": 6.8125, + "learning_rate": 2.5443475674065155e-06, + "loss": 1.36512291431427, + "step": 9482 + }, + { + "epoch": 1.726312915263493, + "grad_norm": 9.375, + "learning_rate": 2.543601600502821e-06, + "loss": 1.8886725902557373, + "step": 9484 + }, + { + "epoch": 1.7266769818876855, + "grad_norm": 7.96875, + "learning_rate": 2.542855700573743e-06, + "loss": 1.5973446369171143, + "step": 9486 + }, + { + "epoch": 1.7270410485118777, + "grad_norm": 11.625, + "learning_rate": 2.542109867728738e-06, + "loss": 1.3817940950393677, + "step": 9488 + }, + { + "epoch": 1.7274051151360699, + "grad_norm": 10.6875, + "learning_rate": 2.541364102077255e-06, + "loss": 1.3823630809783936, + "step": 9490 + }, + { + "epoch": 1.7277691817602623, + "grad_norm": 16.25, + "learning_rate": 2.5406184037287306e-06, + "loss": 1.6294407844543457, + "step": 9492 + }, + { + "epoch": 1.7281332483844545, + "grad_norm": 13.8125, + "learning_rate": 2.5398727727925932e-06, + "loss": 1.61515212059021, + "step": 9494 + }, + { + "epoch": 1.7284973150086467, + "grad_norm": 11.5625, + "learning_rate": 2.5391272093782628e-06, + "loss": 1.671416997909546, + "step": 9496 + }, + { + "epoch": 1.7288613816328389, + "grad_norm": 33.75, + "learning_rate": 2.5383817135951454e-06, + "loss": 1.786433219909668, + "step": 9498 + }, + { + "epoch": 1.729225448257031, + "grad_norm": 9.0625, + "learning_rate": 2.5376362855526423e-06, + "loss": 1.5212745666503906, + "step": 9500 + }, + { + "epoch": 1.7295895148812233, + "grad_norm": 16.5, + "learning_rate": 2.536890925360139e-06, + "loss": 1.1949679851531982, + "step": 9502 + }, + { + "epoch": 1.7299535815054154, + "grad_norm": 12.8125, + "learning_rate": 2.5361456331270164e-06, + "loss": 1.3068662881851196, + "step": 9504 + }, + { + "epoch": 1.7303176481296076, + "grad_norm": 22.125, + "learning_rate": 2.535400408962643e-06, + "loss": 1.3767890930175781, + "step": 9506 + }, + { + "epoch": 1.7306817147537998, + "grad_norm": 10.625, + "learning_rate": 2.5346552529763744e-06, + "loss": 1.7470083236694336, + "step": 9508 + }, + { + "epoch": 1.731045781377992, + "grad_norm": 21.5, + "learning_rate": 2.5339101652775637e-06, + "loss": 1.6628854274749756, + "step": 9510 + }, + { + "epoch": 1.7314098480021844, + "grad_norm": 27.5, + "learning_rate": 2.5331651459755467e-06, + "loss": 0.6716596484184265, + "step": 9512 + }, + { + "epoch": 1.7317739146263766, + "grad_norm": 17.75, + "learning_rate": 2.5324201951796525e-06, + "loss": 0.9974995255470276, + "step": 9514 + }, + { + "epoch": 1.7321379812505688, + "grad_norm": 26.875, + "learning_rate": 2.5316753129992e-06, + "loss": 1.5912859439849854, + "step": 9516 + }, + { + "epoch": 1.732502047874761, + "grad_norm": 13.9375, + "learning_rate": 2.530930499543497e-06, + "loss": 2.0026633739471436, + "step": 9518 + }, + { + "epoch": 1.7328661144989534, + "grad_norm": 4.625, + "learning_rate": 2.530185754921842e-06, + "loss": 1.2697510719299316, + "step": 9520 + }, + { + "epoch": 1.7332301811231456, + "grad_norm": 9.625, + "learning_rate": 2.5294410792435244e-06, + "loss": 1.0889990329742432, + "step": 9522 + }, + { + "epoch": 1.7335942477473378, + "grad_norm": 57.0, + "learning_rate": 2.528696472617821e-06, + "loss": 1.425118088722229, + "step": 9524 + }, + { + "epoch": 1.73395831437153, + "grad_norm": 7.40625, + "learning_rate": 2.5279519351540003e-06, + "loss": 0.8747223615646362, + "step": 9526 + }, + { + "epoch": 1.7343223809957222, + "grad_norm": 7.53125, + "learning_rate": 2.5272074669613188e-06, + "loss": 1.295595645904541, + "step": 9528 + }, + { + "epoch": 1.7346864476199144, + "grad_norm": 5.3125, + "learning_rate": 2.5264630681490276e-06, + "loss": 0.5483484268188477, + "step": 9530 + }, + { + "epoch": 1.7350505142441066, + "grad_norm": 12.625, + "learning_rate": 2.5257187388263605e-06, + "loss": 1.5035117864608765, + "step": 9532 + }, + { + "epoch": 1.7354145808682988, + "grad_norm": 11.5625, + "learning_rate": 2.5249744791025467e-06, + "loss": 1.470358967781067, + "step": 9534 + }, + { + "epoch": 1.735778647492491, + "grad_norm": 10.4375, + "learning_rate": 2.5242302890868035e-06, + "loss": 1.357577919960022, + "step": 9536 + }, + { + "epoch": 1.7361427141166832, + "grad_norm": 5.78125, + "learning_rate": 2.5234861688883367e-06, + "loss": 1.4563628435134888, + "step": 9538 + }, + { + "epoch": 1.7365067807408756, + "grad_norm": 8.4375, + "learning_rate": 2.5227421186163436e-06, + "loss": 1.495448350906372, + "step": 9540 + }, + { + "epoch": 1.7368708473650678, + "grad_norm": 15.75, + "learning_rate": 2.5219981383800107e-06, + "loss": 1.5220292806625366, + "step": 9542 + }, + { + "epoch": 1.73723491398926, + "grad_norm": 13.375, + "learning_rate": 2.5212542282885133e-06, + "loss": 1.4442135095596313, + "step": 9544 + }, + { + "epoch": 1.7375989806134524, + "grad_norm": 7.90625, + "learning_rate": 2.5205103884510186e-06, + "loss": 1.253954529762268, + "step": 9546 + }, + { + "epoch": 1.7379630472376446, + "grad_norm": 11.1875, + "learning_rate": 2.51976661897668e-06, + "loss": 1.4745149612426758, + "step": 9548 + }, + { + "epoch": 1.7383271138618368, + "grad_norm": 28.25, + "learning_rate": 2.5190229199746454e-06, + "loss": 2.002610683441162, + "step": 9550 + }, + { + "epoch": 1.738691180486029, + "grad_norm": 6.625, + "learning_rate": 2.5182792915540464e-06, + "loss": 1.028231143951416, + "step": 9552 + }, + { + "epoch": 1.7390552471102212, + "grad_norm": 12.8125, + "learning_rate": 2.51753573382401e-06, + "loss": 1.2153304815292358, + "step": 9554 + }, + { + "epoch": 1.7394193137344134, + "grad_norm": 28.25, + "learning_rate": 2.51679224689365e-06, + "loss": 1.9716033935546875, + "step": 9556 + }, + { + "epoch": 1.7397833803586056, + "grad_norm": 10.9375, + "learning_rate": 2.5160488308720675e-06, + "loss": 1.5472264289855957, + "step": 9558 + }, + { + "epoch": 1.7401474469827978, + "grad_norm": 7.5, + "learning_rate": 2.51530548586836e-06, + "loss": 1.529146432876587, + "step": 9560 + }, + { + "epoch": 1.74051151360699, + "grad_norm": 14.4375, + "learning_rate": 2.5145622119916073e-06, + "loss": 1.5244109630584717, + "step": 9562 + }, + { + "epoch": 1.7408755802311822, + "grad_norm": 6.78125, + "learning_rate": 2.513819009350882e-06, + "loss": 1.4480321407318115, + "step": 9564 + }, + { + "epoch": 1.7412396468553746, + "grad_norm": 33.25, + "learning_rate": 2.513075878055248e-06, + "loss": 1.1801588535308838, + "step": 9566 + }, + { + "epoch": 1.7416037134795668, + "grad_norm": 19.375, + "learning_rate": 2.5123328182137546e-06, + "loss": 1.3722394704818726, + "step": 9568 + }, + { + "epoch": 1.741967780103759, + "grad_norm": 17.5, + "learning_rate": 2.5115898299354446e-06, + "loss": 1.5290158987045288, + "step": 9570 + }, + { + "epoch": 1.7423318467279512, + "grad_norm": 10.3125, + "learning_rate": 2.5108469133293467e-06, + "loss": 1.6164394617080688, + "step": 9572 + }, + { + "epoch": 1.7426959133521436, + "grad_norm": 10.6875, + "learning_rate": 2.5101040685044825e-06, + "loss": 1.5796513557434082, + "step": 9574 + }, + { + "epoch": 1.7430599799763358, + "grad_norm": 10.5625, + "learning_rate": 2.509361295569861e-06, + "loss": 1.4934625625610352, + "step": 9576 + }, + { + "epoch": 1.743424046600528, + "grad_norm": 5.3125, + "learning_rate": 2.5086185946344805e-06, + "loss": 0.552492618560791, + "step": 9578 + }, + { + "epoch": 1.7437881132247202, + "grad_norm": 11.8125, + "learning_rate": 2.5078759658073305e-06, + "loss": 1.4284861087799072, + "step": 9580 + }, + { + "epoch": 1.7441521798489124, + "grad_norm": 8.875, + "learning_rate": 2.5071334091973875e-06, + "loss": 1.740067720413208, + "step": 9582 + }, + { + "epoch": 1.7445162464731045, + "grad_norm": 8.1875, + "learning_rate": 2.50639092491362e-06, + "loss": 1.3642146587371826, + "step": 9584 + }, + { + "epoch": 1.7448803130972967, + "grad_norm": 30.125, + "learning_rate": 2.5056485130649834e-06, + "loss": 1.5055596828460693, + "step": 9586 + }, + { + "epoch": 1.745244379721489, + "grad_norm": 18.0, + "learning_rate": 2.504906173760423e-06, + "loss": 2.0214505195617676, + "step": 9588 + }, + { + "epoch": 1.7456084463456811, + "grad_norm": 9.25, + "learning_rate": 2.5041639071088773e-06, + "loss": 1.335474967956543, + "step": 9590 + }, + { + "epoch": 1.7459725129698733, + "grad_norm": 8.125, + "learning_rate": 2.503421713219267e-06, + "loss": 1.606168508529663, + "step": 9592 + }, + { + "epoch": 1.7463365795940657, + "grad_norm": 8.3125, + "learning_rate": 2.5026795922005086e-06, + "loss": 1.4051029682159424, + "step": 9594 + }, + { + "epoch": 1.746700646218258, + "grad_norm": 17.5, + "learning_rate": 2.5019375441615046e-06, + "loss": 1.4665770530700684, + "step": 9596 + }, + { + "epoch": 1.7470647128424501, + "grad_norm": 9.3125, + "learning_rate": 2.5011955692111477e-06, + "loss": 1.4853754043579102, + "step": 9598 + }, + { + "epoch": 1.7474287794666425, + "grad_norm": 10.0625, + "learning_rate": 2.500453667458319e-06, + "loss": 1.4237127304077148, + "step": 9600 + }, + { + "epoch": 1.7477928460908347, + "grad_norm": 7.21875, + "learning_rate": 2.499711839011891e-06, + "loss": 1.3613488674163818, + "step": 9602 + }, + { + "epoch": 1.748156912715027, + "grad_norm": 10.3125, + "learning_rate": 2.498970083980722e-06, + "loss": 0.9239940643310547, + "step": 9604 + }, + { + "epoch": 1.7485209793392191, + "grad_norm": 14.9375, + "learning_rate": 2.4982284024736638e-06, + "loss": 1.1943309307098389, + "step": 9606 + }, + { + "epoch": 1.7488850459634113, + "grad_norm": 7.09375, + "learning_rate": 2.4974867945995528e-06, + "loss": 0.851227879524231, + "step": 9608 + }, + { + "epoch": 1.7492491125876035, + "grad_norm": 11.75, + "learning_rate": 2.496745260467219e-06, + "loss": 1.4715644121170044, + "step": 9610 + }, + { + "epoch": 1.7496131792117957, + "grad_norm": 20.875, + "learning_rate": 2.4960038001854788e-06, + "loss": 1.628612995147705, + "step": 9612 + }, + { + "epoch": 1.749977245835988, + "grad_norm": 10.6875, + "learning_rate": 2.4952624138631376e-06, + "loss": 1.5021353960037231, + "step": 9614 + }, + { + "epoch": 1.75034131246018, + "grad_norm": 13.0625, + "learning_rate": 2.4945211016089928e-06, + "loss": 1.4636298418045044, + "step": 9616 + }, + { + "epoch": 1.7507053790843723, + "grad_norm": 7.59375, + "learning_rate": 2.4937798635318266e-06, + "loss": 1.2379364967346191, + "step": 9618 + }, + { + "epoch": 1.7510694457085647, + "grad_norm": 12.8125, + "learning_rate": 2.4930386997404148e-06, + "loss": 1.788224458694458, + "step": 9620 + }, + { + "epoch": 1.751433512332757, + "grad_norm": 16.75, + "learning_rate": 2.492297610343519e-06, + "loss": 1.8670352697372437, + "step": 9622 + }, + { + "epoch": 1.751797578956949, + "grad_norm": 7.8125, + "learning_rate": 2.491556595449891e-06, + "loss": 1.2936989068984985, + "step": 9624 + }, + { + "epoch": 1.7521616455811415, + "grad_norm": 5.71875, + "learning_rate": 2.490815655168273e-06, + "loss": 0.8864408135414124, + "step": 9626 + }, + { + "epoch": 1.7525257122053337, + "grad_norm": 7.125, + "learning_rate": 2.490074789607393e-06, + "loss": 1.0749104022979736, + "step": 9628 + }, + { + "epoch": 1.752889778829526, + "grad_norm": 28.5, + "learning_rate": 2.4893339988759717e-06, + "loss": 1.4294183254241943, + "step": 9630 + }, + { + "epoch": 1.753253845453718, + "grad_norm": 12.25, + "learning_rate": 2.4885932830827162e-06, + "loss": 1.4415847063064575, + "step": 9632 + }, + { + "epoch": 1.7536179120779103, + "grad_norm": 5.4375, + "learning_rate": 2.4878526423363236e-06, + "loss": 1.1845145225524902, + "step": 9634 + }, + { + "epoch": 1.7539819787021025, + "grad_norm": 21.0, + "learning_rate": 2.4871120767454814e-06, + "loss": 1.7602466344833374, + "step": 9636 + }, + { + "epoch": 1.7543460453262947, + "grad_norm": 15.875, + "learning_rate": 2.486371586418862e-06, + "loss": 1.47333824634552, + "step": 9638 + }, + { + "epoch": 1.7547101119504869, + "grad_norm": 13.3125, + "learning_rate": 2.4856311714651317e-06, + "loss": 1.3083434104919434, + "step": 9640 + }, + { + "epoch": 1.755074178574679, + "grad_norm": 14.0, + "learning_rate": 2.484890831992942e-06, + "loss": 1.6655809879302979, + "step": 9642 + }, + { + "epoch": 1.7554382451988713, + "grad_norm": 16.0, + "learning_rate": 2.4841505681109348e-06, + "loss": 0.9073799848556519, + "step": 9644 + }, + { + "epoch": 1.7558023118230635, + "grad_norm": 12.75, + "learning_rate": 2.483410379927742e-06, + "loss": 1.241572618484497, + "step": 9646 + }, + { + "epoch": 1.7561663784472559, + "grad_norm": 9.75, + "learning_rate": 2.482670267551982e-06, + "loss": 1.8137145042419434, + "step": 9648 + }, + { + "epoch": 1.756530445071448, + "grad_norm": 7.4375, + "learning_rate": 2.4819302310922635e-06, + "loss": 1.2279140949249268, + "step": 9650 + }, + { + "epoch": 1.7568945116956403, + "grad_norm": 7.5, + "learning_rate": 2.4811902706571846e-06, + "loss": 1.3970588445663452, + "step": 9652 + }, + { + "epoch": 1.7572585783198327, + "grad_norm": 7.84375, + "learning_rate": 2.48045038635533e-06, + "loss": 1.402956247329712, + "step": 9654 + }, + { + "epoch": 1.7576226449440249, + "grad_norm": 16.625, + "learning_rate": 2.4797105782952767e-06, + "loss": 1.8207128047943115, + "step": 9656 + }, + { + "epoch": 1.757986711568217, + "grad_norm": 10.8125, + "learning_rate": 2.4789708465855863e-06, + "loss": 1.483075737953186, + "step": 9658 + }, + { + "epoch": 1.7583507781924093, + "grad_norm": 18.875, + "learning_rate": 2.4782311913348135e-06, + "loss": 1.5671143531799316, + "step": 9660 + }, + { + "epoch": 1.7587148448166015, + "grad_norm": 49.0, + "learning_rate": 2.477491612651499e-06, + "loss": 1.3647823333740234, + "step": 9662 + }, + { + "epoch": 1.7590789114407936, + "grad_norm": 7.15625, + "learning_rate": 2.476752110644172e-06, + "loss": 1.352724552154541, + "step": 9664 + }, + { + "epoch": 1.7594429780649858, + "grad_norm": 14.625, + "learning_rate": 2.476012685421353e-06, + "loss": 1.2368907928466797, + "step": 9666 + }, + { + "epoch": 1.759807044689178, + "grad_norm": 47.25, + "learning_rate": 2.475273337091548e-06, + "loss": 1.5982136726379395, + "step": 9668 + }, + { + "epoch": 1.7601711113133702, + "grad_norm": 9.75, + "learning_rate": 2.474534065763255e-06, + "loss": 1.4263297319412231, + "step": 9670 + }, + { + "epoch": 1.7605351779375624, + "grad_norm": 9.375, + "learning_rate": 2.473794871544958e-06, + "loss": 1.2839057445526123, + "step": 9672 + }, + { + "epoch": 1.7608992445617548, + "grad_norm": 13.6875, + "learning_rate": 2.473055754545131e-06, + "loss": 1.248408555984497, + "step": 9674 + }, + { + "epoch": 1.761263311185947, + "grad_norm": 16.875, + "learning_rate": 2.472316714872237e-06, + "loss": 1.9855908155441284, + "step": 9676 + }, + { + "epoch": 1.7616273778101392, + "grad_norm": 17.625, + "learning_rate": 2.4715777526347258e-06, + "loss": 1.597987413406372, + "step": 9678 + }, + { + "epoch": 1.7619914444343316, + "grad_norm": 17.25, + "learning_rate": 2.470838867941039e-06, + "loss": 1.511553168296814, + "step": 9680 + }, + { + "epoch": 1.7623555110585238, + "grad_norm": 6.96875, + "learning_rate": 2.470100060899603e-06, + "loss": 1.247733473777771, + "step": 9682 + }, + { + "epoch": 1.762719577682716, + "grad_norm": 35.25, + "learning_rate": 2.469361331618835e-06, + "loss": 1.3600777387619019, + "step": 9684 + }, + { + "epoch": 1.7630836443069082, + "grad_norm": 6.53125, + "learning_rate": 2.4686226802071424e-06, + "loss": 1.0719618797302246, + "step": 9686 + }, + { + "epoch": 1.7634477109311004, + "grad_norm": 12.5, + "learning_rate": 2.467884106772917e-06, + "loss": 1.487484097480774, + "step": 9688 + }, + { + "epoch": 1.7638117775552926, + "grad_norm": 19.25, + "learning_rate": 2.467145611424543e-06, + "loss": 1.0855759382247925, + "step": 9690 + }, + { + "epoch": 1.7641758441794848, + "grad_norm": 11.0625, + "learning_rate": 2.4664071942703914e-06, + "loss": 1.6698211431503296, + "step": 9692 + }, + { + "epoch": 1.764539910803677, + "grad_norm": 15.0, + "learning_rate": 2.4656688554188203e-06, + "loss": 1.6015561819076538, + "step": 9694 + }, + { + "epoch": 1.7649039774278692, + "grad_norm": 13.875, + "learning_rate": 2.464930594978181e-06, + "loss": 1.7836737632751465, + "step": 9696 + }, + { + "epoch": 1.7652680440520614, + "grad_norm": 18.125, + "learning_rate": 2.4641924130568066e-06, + "loss": 1.5052692890167236, + "step": 9698 + }, + { + "epoch": 1.7656321106762538, + "grad_norm": 8.75, + "learning_rate": 2.4634543097630255e-06, + "loss": 1.4619030952453613, + "step": 9700 + }, + { + "epoch": 1.765996177300446, + "grad_norm": 6.59375, + "learning_rate": 2.4627162852051495e-06, + "loss": 1.549086332321167, + "step": 9702 + }, + { + "epoch": 1.7663602439246382, + "grad_norm": 7.75, + "learning_rate": 2.461978339491481e-06, + "loss": 1.185326099395752, + "step": 9704 + }, + { + "epoch": 1.7667243105488304, + "grad_norm": 11.0625, + "learning_rate": 2.4612404727303115e-06, + "loss": 1.426266074180603, + "step": 9706 + }, + { + "epoch": 1.7670883771730228, + "grad_norm": 7.625, + "learning_rate": 2.460502685029918e-06, + "loss": 1.2171050310134888, + "step": 9708 + }, + { + "epoch": 1.767452443797215, + "grad_norm": 8.125, + "learning_rate": 2.45976497649857e-06, + "loss": 1.066184163093567, + "step": 9710 + }, + { + "epoch": 1.7678165104214072, + "grad_norm": 9.5, + "learning_rate": 2.4590273472445226e-06, + "loss": 1.266125202178955, + "step": 9712 + }, + { + "epoch": 1.7681805770455994, + "grad_norm": 5.84375, + "learning_rate": 2.4582897973760187e-06, + "loss": 1.1672508716583252, + "step": 9714 + }, + { + "epoch": 1.7685446436697916, + "grad_norm": 149.0, + "learning_rate": 2.4575523270012925e-06, + "loss": 1.6852155923843384, + "step": 9716 + }, + { + "epoch": 1.7689087102939838, + "grad_norm": 16.75, + "learning_rate": 2.4568149362285633e-06, + "loss": 1.973315715789795, + "step": 9718 + }, + { + "epoch": 1.769272776918176, + "grad_norm": 5.65625, + "learning_rate": 2.456077625166041e-06, + "loss": 1.35611093044281, + "step": 9720 + }, + { + "epoch": 1.7696368435423682, + "grad_norm": 19.0, + "learning_rate": 2.4553403939219233e-06, + "loss": 1.5911699533462524, + "step": 9722 + }, + { + "epoch": 1.7700009101665604, + "grad_norm": 26.0, + "learning_rate": 2.4546032426043947e-06, + "loss": 1.450231909751892, + "step": 9724 + }, + { + "epoch": 1.7703649767907526, + "grad_norm": 11.625, + "learning_rate": 2.4538661713216312e-06, + "loss": 1.0855478048324585, + "step": 9726 + }, + { + "epoch": 1.770729043414945, + "grad_norm": 21.25, + "learning_rate": 2.4531291801817926e-06, + "loss": 0.5967104434967041, + "step": 9728 + }, + { + "epoch": 1.7710931100391372, + "grad_norm": 63.25, + "learning_rate": 2.4523922692930313e-06, + "loss": 1.5665773153305054, + "step": 9730 + }, + { + "epoch": 1.7714571766633294, + "grad_norm": 11.5, + "learning_rate": 2.451655438763485e-06, + "loss": 1.5051593780517578, + "step": 9732 + }, + { + "epoch": 1.7718212432875218, + "grad_norm": 5.5625, + "learning_rate": 2.45091868870128e-06, + "loss": 1.6245388984680176, + "step": 9734 + }, + { + "epoch": 1.772185309911714, + "grad_norm": 5.9375, + "learning_rate": 2.4501820192145335e-06, + "loss": 1.2335116863250732, + "step": 9736 + }, + { + "epoch": 1.7725493765359062, + "grad_norm": 8.375, + "learning_rate": 2.4494454304113464e-06, + "loss": 1.3282036781311035, + "step": 9738 + }, + { + "epoch": 1.7729134431600984, + "grad_norm": 9.4375, + "learning_rate": 2.448708922399812e-06, + "loss": 1.277222752571106, + "step": 9740 + }, + { + "epoch": 1.7732775097842906, + "grad_norm": 28.625, + "learning_rate": 2.447972495288009e-06, + "loss": 1.4769645929336548, + "step": 9742 + }, + { + "epoch": 1.7736415764084827, + "grad_norm": 14.4375, + "learning_rate": 2.4472361491840046e-06, + "loss": 1.7494463920593262, + "step": 9744 + }, + { + "epoch": 1.774005643032675, + "grad_norm": 9.0625, + "learning_rate": 2.4464998841958554e-06, + "loss": 1.5159399509429932, + "step": 9746 + }, + { + "epoch": 1.7743697096568671, + "grad_norm": 15.75, + "learning_rate": 2.4457637004316048e-06, + "loss": 1.6344311237335205, + "step": 9748 + }, + { + "epoch": 1.7747337762810593, + "grad_norm": 13.4375, + "learning_rate": 2.4450275979992854e-06, + "loss": 1.628173828125, + "step": 9750 + }, + { + "epoch": 1.7750978429052515, + "grad_norm": 9.75, + "learning_rate": 2.444291577006917e-06, + "loss": 1.2222493886947632, + "step": 9752 + }, + { + "epoch": 1.775461909529444, + "grad_norm": 11.25, + "learning_rate": 2.443555637562507e-06, + "loss": 1.3597251176834106, + "step": 9754 + }, + { + "epoch": 1.7758259761536361, + "grad_norm": 16.375, + "learning_rate": 2.4428197797740526e-06, + "loss": 1.582828164100647, + "step": 9756 + }, + { + "epoch": 1.7761900427778283, + "grad_norm": 16.625, + "learning_rate": 2.4420840037495373e-06, + "loss": 1.9756643772125244, + "step": 9758 + }, + { + "epoch": 1.7765541094020205, + "grad_norm": 9.375, + "learning_rate": 2.441348309596934e-06, + "loss": 1.3612494468688965, + "step": 9760 + }, + { + "epoch": 1.776918176026213, + "grad_norm": 8.3125, + "learning_rate": 2.440612697424202e-06, + "loss": 1.1914498805999756, + "step": 9762 + }, + { + "epoch": 1.7772822426504051, + "grad_norm": 8.5, + "learning_rate": 2.4398771673392884e-06, + "loss": 1.3873649835586548, + "step": 9764 + }, + { + "epoch": 1.7776463092745973, + "grad_norm": 10.375, + "learning_rate": 2.4391417194501323e-06, + "loss": 1.4707486629486084, + "step": 9766 + }, + { + "epoch": 1.7780103758987895, + "grad_norm": 16.5, + "learning_rate": 2.4384063538646545e-06, + "loss": 1.2610704898834229, + "step": 9768 + }, + { + "epoch": 1.7783744425229817, + "grad_norm": 6.28125, + "learning_rate": 2.437671070690769e-06, + "loss": 0.9276196956634521, + "step": 9770 + }, + { + "epoch": 1.778738509147174, + "grad_norm": 10.6875, + "learning_rate": 2.436935870036375e-06, + "loss": 1.3663438558578491, + "step": 9772 + }, + { + "epoch": 1.779102575771366, + "grad_norm": 5.90625, + "learning_rate": 2.4362007520093595e-06, + "loss": 1.3276851177215576, + "step": 9774 + }, + { + "epoch": 1.7794666423955583, + "grad_norm": 23.5, + "learning_rate": 2.4354657167176e-06, + "loss": 1.2276180982589722, + "step": 9776 + }, + { + "epoch": 1.7798307090197505, + "grad_norm": 61.0, + "learning_rate": 2.434730764268958e-06, + "loss": 1.7314021587371826, + "step": 9778 + }, + { + "epoch": 1.7801947756439427, + "grad_norm": 14.3125, + "learning_rate": 2.4339958947712854e-06, + "loss": 1.318850040435791, + "step": 9780 + }, + { + "epoch": 1.780558842268135, + "grad_norm": 10.5, + "learning_rate": 2.433261108332422e-06, + "loss": 1.0651581287384033, + "step": 9782 + }, + { + "epoch": 1.7809229088923273, + "grad_norm": 6.34375, + "learning_rate": 2.432526405060193e-06, + "loss": 1.230223536491394, + "step": 9784 + }, + { + "epoch": 1.7812869755165195, + "grad_norm": 11.5, + "learning_rate": 2.4317917850624152e-06, + "loss": 1.2609082460403442, + "step": 9786 + }, + { + "epoch": 1.781651042140712, + "grad_norm": 13.9375, + "learning_rate": 2.4310572484468897e-06, + "loss": 1.7883930206298828, + "step": 9788 + }, + { + "epoch": 1.782015108764904, + "grad_norm": 5.03125, + "learning_rate": 2.430322795321408e-06, + "loss": 1.3512423038482666, + "step": 9790 + }, + { + "epoch": 1.7823791753890963, + "grad_norm": 6.78125, + "learning_rate": 2.429588425793747e-06, + "loss": 1.3327968120574951, + "step": 9792 + }, + { + "epoch": 1.7827432420132885, + "grad_norm": 10.625, + "learning_rate": 2.4288541399716725e-06, + "loss": 1.5858317613601685, + "step": 9794 + }, + { + "epoch": 1.7831073086374807, + "grad_norm": 10.0, + "learning_rate": 2.428119937962939e-06, + "loss": 1.4162780046463013, + "step": 9796 + }, + { + "epoch": 1.7834713752616729, + "grad_norm": 10.875, + "learning_rate": 2.4273858198752863e-06, + "loss": 1.5521361827850342, + "step": 9798 + }, + { + "epoch": 1.783835441885865, + "grad_norm": 10.1875, + "learning_rate": 2.4266517858164444e-06, + "loss": 1.471111536026001, + "step": 9800 + }, + { + "epoch": 1.7841995085100573, + "grad_norm": 15.6875, + "learning_rate": 2.425917835894129e-06, + "loss": 1.2437063455581665, + "step": 9802 + }, + { + "epoch": 1.7845635751342495, + "grad_norm": 22.75, + "learning_rate": 2.425183970216045e-06, + "loss": 1.3602137565612793, + "step": 9804 + }, + { + "epoch": 1.7849276417584417, + "grad_norm": 7.25, + "learning_rate": 2.424450188889884e-06, + "loss": 1.1773463487625122, + "step": 9806 + }, + { + "epoch": 1.785291708382634, + "grad_norm": 31.25, + "learning_rate": 2.4237164920233246e-06, + "loss": 1.4031540155410767, + "step": 9808 + }, + { + "epoch": 1.7856557750068263, + "grad_norm": 2.203125, + "learning_rate": 2.422982879724035e-06, + "loss": 1.119649887084961, + "step": 9810 + }, + { + "epoch": 1.7860198416310185, + "grad_norm": 9.4375, + "learning_rate": 2.422249352099669e-06, + "loss": 1.3279435634613037, + "step": 9812 + }, + { + "epoch": 1.7863839082552107, + "grad_norm": 12.375, + "learning_rate": 2.421515909257869e-06, + "loss": 1.4814023971557617, + "step": 9814 + }, + { + "epoch": 1.786747974879403, + "grad_norm": 13.3125, + "learning_rate": 2.4207825513062656e-06, + "loss": 1.6729483604431152, + "step": 9816 + }, + { + "epoch": 1.7871120415035953, + "grad_norm": 36.75, + "learning_rate": 2.420049278352475e-06, + "loss": 1.786515474319458, + "step": 9818 + }, + { + "epoch": 1.7874761081277875, + "grad_norm": 5.21875, + "learning_rate": 2.419316090504103e-06, + "loss": 0.9854660630226135, + "step": 9820 + }, + { + "epoch": 1.7878401747519796, + "grad_norm": 9.1875, + "learning_rate": 2.4185829878687405e-06, + "loss": 1.2063982486724854, + "step": 9822 + }, + { + "epoch": 1.7882042413761718, + "grad_norm": 8.0, + "learning_rate": 2.417849970553969e-06, + "loss": 1.247206449508667, + "step": 9824 + }, + { + "epoch": 1.788568308000364, + "grad_norm": 49.25, + "learning_rate": 2.417117038667355e-06, + "loss": 0.9150762557983398, + "step": 9826 + }, + { + "epoch": 1.7889323746245562, + "grad_norm": 9.25, + "learning_rate": 2.416384192316453e-06, + "loss": 1.0062452554702759, + "step": 9828 + }, + { + "epoch": 1.7892964412487484, + "grad_norm": 6.125, + "learning_rate": 2.4156514316088063e-06, + "loss": 1.250044345855713, + "step": 9830 + }, + { + "epoch": 1.7896605078729406, + "grad_norm": 12.0, + "learning_rate": 2.414918756651943e-06, + "loss": 1.5411306619644165, + "step": 9832 + }, + { + "epoch": 1.7900245744971328, + "grad_norm": 20.125, + "learning_rate": 2.414186167553381e-06, + "loss": 0.9801827669143677, + "step": 9834 + }, + { + "epoch": 1.7903886411213252, + "grad_norm": 20.5, + "learning_rate": 2.4134536644206254e-06, + "loss": 1.5909456014633179, + "step": 9836 + }, + { + "epoch": 1.7907527077455174, + "grad_norm": 58.0, + "learning_rate": 2.412721247361167e-06, + "loss": 1.3072903156280518, + "step": 9838 + }, + { + "epoch": 1.7911167743697096, + "grad_norm": 10.625, + "learning_rate": 2.411988916482486e-06, + "loss": 1.0310885906219482, + "step": 9840 + }, + { + "epoch": 1.791480840993902, + "grad_norm": 24.125, + "learning_rate": 2.4112566718920482e-06, + "loss": 1.7582591772079468, + "step": 9842 + }, + { + "epoch": 1.7918449076180942, + "grad_norm": 9.625, + "learning_rate": 2.4105245136973075e-06, + "loss": 0.9733285903930664, + "step": 9844 + }, + { + "epoch": 1.7922089742422864, + "grad_norm": 10.875, + "learning_rate": 2.4097924420057066e-06, + "loss": 1.4968267679214478, + "step": 9846 + }, + { + "epoch": 1.7925730408664786, + "grad_norm": 19.875, + "learning_rate": 2.409060456924672e-06, + "loss": 1.4720216989517212, + "step": 9848 + }, + { + "epoch": 1.7929371074906708, + "grad_norm": 7.75, + "learning_rate": 2.4083285585616213e-06, + "loss": 1.351499319076538, + "step": 9850 + }, + { + "epoch": 1.793301174114863, + "grad_norm": 6.78125, + "learning_rate": 2.4075967470239556e-06, + "loss": 1.5258337259292603, + "step": 9852 + }, + { + "epoch": 1.7936652407390552, + "grad_norm": 7.65625, + "learning_rate": 2.406865022419067e-06, + "loss": 1.4940929412841797, + "step": 9854 + }, + { + "epoch": 1.7940293073632474, + "grad_norm": 6.34375, + "learning_rate": 2.4061333848543332e-06, + "loss": 1.3019057512283325, + "step": 9856 + }, + { + "epoch": 1.7943933739874396, + "grad_norm": 13.8125, + "learning_rate": 2.4054018344371172e-06, + "loss": 1.3263444900512695, + "step": 9858 + }, + { + "epoch": 1.7947574406116318, + "grad_norm": 33.25, + "learning_rate": 2.404670371274774e-06, + "loss": 1.4296473264694214, + "step": 9860 + }, + { + "epoch": 1.7951215072358242, + "grad_norm": 5.28125, + "learning_rate": 2.4039389954746396e-06, + "loss": 1.4287168979644775, + "step": 9862 + }, + { + "epoch": 1.7954855738600164, + "grad_norm": 7.53125, + "learning_rate": 2.4032077071440424e-06, + "loss": 1.3336765766143799, + "step": 9864 + }, + { + "epoch": 1.7958496404842086, + "grad_norm": 6.15625, + "learning_rate": 2.4024765063902962e-06, + "loss": 1.3832186460494995, + "step": 9866 + }, + { + "epoch": 1.796213707108401, + "grad_norm": 4.3125, + "learning_rate": 2.4017453933207003e-06, + "loss": 1.2775369882583618, + "step": 9868 + }, + { + "epoch": 1.7965777737325932, + "grad_norm": 10.3125, + "learning_rate": 2.4010143680425443e-06, + "loss": 1.537932276725769, + "step": 9870 + }, + { + "epoch": 1.7969418403567854, + "grad_norm": 11.4375, + "learning_rate": 2.4002834306631014e-06, + "loss": 1.3176320791244507, + "step": 9872 + }, + { + "epoch": 1.7973059069809776, + "grad_norm": 10.125, + "learning_rate": 2.3995525812896346e-06, + "loss": 1.5509226322174072, + "step": 9874 + }, + { + "epoch": 1.7976699736051698, + "grad_norm": 12.4375, + "learning_rate": 2.3988218200293943e-06, + "loss": 1.438368320465088, + "step": 9876 + }, + { + "epoch": 1.798034040229362, + "grad_norm": 6.90625, + "learning_rate": 2.3980911469896142e-06, + "loss": 1.3350715637207031, + "step": 9878 + }, + { + "epoch": 1.7983981068535542, + "grad_norm": 24.25, + "learning_rate": 2.39736056227752e-06, + "loss": 1.4058773517608643, + "step": 9880 + }, + { + "epoch": 1.7987621734777464, + "grad_norm": 26.625, + "learning_rate": 2.3966300660003202e-06, + "loss": 1.571915626525879, + "step": 9882 + }, + { + "epoch": 1.7991262401019386, + "grad_norm": 9.125, + "learning_rate": 2.3958996582652133e-06, + "loss": 1.2185813188552856, + "step": 9884 + }, + { + "epoch": 1.7994903067261308, + "grad_norm": 11.375, + "learning_rate": 2.3951693391793836e-06, + "loss": 0.7160068154335022, + "step": 9886 + }, + { + "epoch": 1.799854373350323, + "grad_norm": 13.5, + "learning_rate": 2.3944391088500017e-06, + "loss": 1.264702558517456, + "step": 9888 + }, + { + "epoch": 1.8002184399745154, + "grad_norm": 13.5625, + "learning_rate": 2.3937089673842274e-06, + "loss": 1.4804730415344238, + "step": 9890 + }, + { + "epoch": 1.8005825065987076, + "grad_norm": 7.53125, + "learning_rate": 2.3929789148892035e-06, + "loss": 1.2566052675247192, + "step": 9892 + }, + { + "epoch": 1.8009465732228997, + "grad_norm": 51.5, + "learning_rate": 2.392248951472065e-06, + "loss": 1.2755497694015503, + "step": 9894 + }, + { + "epoch": 1.8013106398470922, + "grad_norm": 13.125, + "learning_rate": 2.39151907723993e-06, + "loss": 1.5451502799987793, + "step": 9896 + }, + { + "epoch": 1.8016747064712844, + "grad_norm": 16.875, + "learning_rate": 2.3907892922999036e-06, + "loss": 1.3404134511947632, + "step": 9898 + }, + { + "epoch": 1.8020387730954766, + "grad_norm": 34.5, + "learning_rate": 2.39005959675908e-06, + "loss": 1.2840261459350586, + "step": 9900 + }, + { + "epoch": 1.8024028397196687, + "grad_norm": 11.6875, + "learning_rate": 2.3893299907245383e-06, + "loss": 0.9123265147209167, + "step": 9902 + }, + { + "epoch": 1.802766906343861, + "grad_norm": 9.875, + "learning_rate": 2.3886004743033463e-06, + "loss": 1.7312496900558472, + "step": 9904 + }, + { + "epoch": 1.8031309729680531, + "grad_norm": 8.6875, + "learning_rate": 2.387871047602557e-06, + "loss": 1.0656354427337646, + "step": 9906 + }, + { + "epoch": 1.8034950395922453, + "grad_norm": 5.21875, + "learning_rate": 2.38714171072921e-06, + "loss": 0.7418041825294495, + "step": 9908 + }, + { + "epoch": 1.8038591062164375, + "grad_norm": 14.0, + "learning_rate": 2.386412463790334e-06, + "loss": 1.2070902585983276, + "step": 9910 + }, + { + "epoch": 1.8042231728406297, + "grad_norm": 12.125, + "learning_rate": 2.385683306892942e-06, + "loss": 1.2074546813964844, + "step": 9912 + }, + { + "epoch": 1.804587239464822, + "grad_norm": 3.875, + "learning_rate": 2.3849542401440346e-06, + "loss": 1.1747174263000488, + "step": 9914 + }, + { + "epoch": 1.8049513060890143, + "grad_norm": 9.625, + "learning_rate": 2.384225263650601e-06, + "loss": 1.1381653547286987, + "step": 9916 + }, + { + "epoch": 1.8053153727132065, + "grad_norm": 6.34375, + "learning_rate": 2.3834963775196132e-06, + "loss": 1.5001487731933594, + "step": 9918 + }, + { + "epoch": 1.8056794393373987, + "grad_norm": 6.28125, + "learning_rate": 2.382767581858035e-06, + "loss": 1.3417357206344604, + "step": 9920 + }, + { + "epoch": 1.8060435059615911, + "grad_norm": 4.65625, + "learning_rate": 2.382038876772811e-06, + "loss": 1.2414604425430298, + "step": 9922 + }, + { + "epoch": 1.8064075725857833, + "grad_norm": 17.625, + "learning_rate": 2.381310262370878e-06, + "loss": 1.8292914628982544, + "step": 9924 + }, + { + "epoch": 1.8067716392099755, + "grad_norm": 5.9375, + "learning_rate": 2.3805817387591577e-06, + "loss": 1.3193349838256836, + "step": 9926 + }, + { + "epoch": 1.8071357058341677, + "grad_norm": 8.3125, + "learning_rate": 2.379853306044556e-06, + "loss": 1.2253795862197876, + "step": 9928 + }, + { + "epoch": 1.80749977245836, + "grad_norm": 7.25, + "learning_rate": 2.379124964333969e-06, + "loss": 1.3973757028579712, + "step": 9930 + }, + { + "epoch": 1.807863839082552, + "grad_norm": 7.78125, + "learning_rate": 2.3783967137342766e-06, + "loss": 1.3232834339141846, + "step": 9932 + }, + { + "epoch": 1.8082279057067443, + "grad_norm": 3.15625, + "learning_rate": 2.3776685543523477e-06, + "loss": 1.0558030605316162, + "step": 9934 + }, + { + "epoch": 1.8085919723309365, + "grad_norm": 16.0, + "learning_rate": 2.3769404862950366e-06, + "loss": 1.6298116445541382, + "step": 9936 + }, + { + "epoch": 1.8089560389551287, + "grad_norm": 36.5, + "learning_rate": 2.3762125096691833e-06, + "loss": 1.709897756576538, + "step": 9938 + }, + { + "epoch": 1.8093201055793209, + "grad_norm": 11.0, + "learning_rate": 2.375484624581617e-06, + "loss": 1.2543293237686157, + "step": 9940 + }, + { + "epoch": 1.8096841722035133, + "grad_norm": 8.375, + "learning_rate": 2.3747568311391505e-06, + "loss": 1.5632082223892212, + "step": 9942 + }, + { + "epoch": 1.8100482388277055, + "grad_norm": 4.8125, + "learning_rate": 2.374029129448586e-06, + "loss": 0.942573070526123, + "step": 9944 + }, + { + "epoch": 1.8104123054518977, + "grad_norm": 20.75, + "learning_rate": 2.3733015196167093e-06, + "loss": 1.0287446975708008, + "step": 9946 + }, + { + "epoch": 1.8107763720760899, + "grad_norm": 17.25, + "learning_rate": 2.3725740017502946e-06, + "loss": 0.7639522552490234, + "step": 9948 + }, + { + "epoch": 1.8111404387002823, + "grad_norm": 4.5625, + "learning_rate": 2.3718465759561032e-06, + "loss": 0.999504804611206, + "step": 9950 + }, + { + "epoch": 1.8115045053244745, + "grad_norm": 20.75, + "learning_rate": 2.37111924234088e-06, + "loss": 1.3263179063796997, + "step": 9952 + }, + { + "epoch": 1.8118685719486667, + "grad_norm": 37.5, + "learning_rate": 2.3703920010113603e-06, + "loss": 2.006227731704712, + "step": 9954 + }, + { + "epoch": 1.8122326385728589, + "grad_norm": 19.0, + "learning_rate": 2.3696648520742627e-06, + "loss": 1.997079610824585, + "step": 9956 + }, + { + "epoch": 1.812596705197051, + "grad_norm": 11.6875, + "learning_rate": 2.368937795636293e-06, + "loss": 2.0477285385131836, + "step": 9958 + }, + { + "epoch": 1.8129607718212433, + "grad_norm": 12.875, + "learning_rate": 2.3682108318041448e-06, + "loss": 1.5167710781097412, + "step": 9960 + }, + { + "epoch": 1.8133248384454355, + "grad_norm": 10.1875, + "learning_rate": 2.3674839606844963e-06, + "loss": 1.478399634361267, + "step": 9962 + }, + { + "epoch": 1.8136889050696277, + "grad_norm": 13.5, + "learning_rate": 2.3667571823840132e-06, + "loss": 1.423717975616455, + "step": 9964 + }, + { + "epoch": 1.8140529716938198, + "grad_norm": 12.125, + "learning_rate": 2.366030497009348e-06, + "loss": 1.5092569589614868, + "step": 9966 + }, + { + "epoch": 1.814417038318012, + "grad_norm": 11.9375, + "learning_rate": 2.3653039046671373e-06, + "loss": 1.5333917140960693, + "step": 9968 + }, + { + "epoch": 1.8147811049422045, + "grad_norm": 11.75, + "learning_rate": 2.364577405464007e-06, + "loss": 1.1136995553970337, + "step": 9970 + }, + { + "epoch": 1.8151451715663967, + "grad_norm": 12.25, + "learning_rate": 2.3638509995065665e-06, + "loss": 1.3786582946777344, + "step": 9972 + }, + { + "epoch": 1.8155092381905888, + "grad_norm": 7.0625, + "learning_rate": 2.3631246869014144e-06, + "loss": 1.8730974197387695, + "step": 9974 + }, + { + "epoch": 1.8158733048147813, + "grad_norm": 8.0625, + "learning_rate": 2.3623984677551338e-06, + "loss": 1.3522603511810303, + "step": 9976 + }, + { + "epoch": 1.8162373714389735, + "grad_norm": 7.875, + "learning_rate": 2.3616723421742927e-06, + "loss": 1.6729565858840942, + "step": 9978 + }, + { + "epoch": 1.8166014380631657, + "grad_norm": 17.0, + "learning_rate": 2.36094631026545e-06, + "loss": 1.3685040473937988, + "step": 9980 + }, + { + "epoch": 1.8169655046873578, + "grad_norm": 26.875, + "learning_rate": 2.3602203721351455e-06, + "loss": 1.1316989660263062, + "step": 9982 + }, + { + "epoch": 1.81732957131155, + "grad_norm": 12.0, + "learning_rate": 2.3594945278899087e-06, + "loss": 0.6563942432403564, + "step": 9984 + }, + { + "epoch": 1.8176936379357422, + "grad_norm": 19.5, + "learning_rate": 2.3587687776362546e-06, + "loss": 1.5108327865600586, + "step": 9986 + }, + { + "epoch": 1.8180577045599344, + "grad_norm": 16.625, + "learning_rate": 2.3580431214806833e-06, + "loss": 1.586306095123291, + "step": 9988 + }, + { + "epoch": 1.8184217711841266, + "grad_norm": 13.5, + "learning_rate": 2.357317559529683e-06, + "loss": 1.6256462335586548, + "step": 9990 + }, + { + "epoch": 1.8187858378083188, + "grad_norm": 12.75, + "learning_rate": 2.356592091889725e-06, + "loss": 1.7333831787109375, + "step": 9992 + }, + { + "epoch": 1.819149904432511, + "grad_norm": 15.6875, + "learning_rate": 2.3558667186672705e-06, + "loss": 1.5826849937438965, + "step": 9994 + }, + { + "epoch": 1.8195139710567034, + "grad_norm": 11.125, + "learning_rate": 2.3551414399687658e-06, + "loss": 1.3787710666656494, + "step": 9996 + }, + { + "epoch": 1.8198780376808956, + "grad_norm": 15.125, + "learning_rate": 2.3544162559006396e-06, + "loss": 1.0588726997375488, + "step": 9998 + }, + { + "epoch": 1.8202421043050878, + "grad_norm": 13.5, + "learning_rate": 2.353691166569313e-06, + "loss": 1.4980621337890625, + "step": 10000 + }, + { + "epoch": 1.82060617092928, + "grad_norm": 16.75, + "learning_rate": 2.352966172081187e-06, + "loss": 1.6686599254608154, + "step": 10002 + }, + { + "epoch": 1.8209702375534724, + "grad_norm": 16.625, + "learning_rate": 2.3522412725426537e-06, + "loss": 0.5254675149917603, + "step": 10004 + }, + { + "epoch": 1.8213343041776646, + "grad_norm": 10.3125, + "learning_rate": 2.3515164680600892e-06, + "loss": 1.5052647590637207, + "step": 10006 + }, + { + "epoch": 1.8216983708018568, + "grad_norm": 15.875, + "learning_rate": 2.350791758739854e-06, + "loss": 1.2988427877426147, + "step": 10008 + }, + { + "epoch": 1.822062437426049, + "grad_norm": 12.5, + "learning_rate": 2.3500671446882985e-06, + "loss": 1.3656269311904907, + "step": 10010 + }, + { + "epoch": 1.8224265040502412, + "grad_norm": 9.875, + "learning_rate": 2.349342626011754e-06, + "loss": 1.4077694416046143, + "step": 10012 + }, + { + "epoch": 1.8227905706744334, + "grad_norm": 71.5, + "learning_rate": 2.3486182028165427e-06, + "loss": 1.3234163522720337, + "step": 10014 + }, + { + "epoch": 1.8231546372986256, + "grad_norm": 13.75, + "learning_rate": 2.347893875208971e-06, + "loss": 0.47953706979751587, + "step": 10016 + }, + { + "epoch": 1.8235187039228178, + "grad_norm": 21.125, + "learning_rate": 2.3471696432953293e-06, + "loss": 1.265723705291748, + "step": 10018 + }, + { + "epoch": 1.82388277054701, + "grad_norm": 19.75, + "learning_rate": 2.346445507181898e-06, + "loss": 1.247279405593872, + "step": 10020 + }, + { + "epoch": 1.8242468371712022, + "grad_norm": 10.6875, + "learning_rate": 2.3457214669749387e-06, + "loss": 1.6593971252441406, + "step": 10022 + }, + { + "epoch": 1.8246109037953946, + "grad_norm": 10.0625, + "learning_rate": 2.344997522780703e-06, + "loss": 1.4940249919891357, + "step": 10024 + }, + { + "epoch": 1.8249749704195868, + "grad_norm": 7.21875, + "learning_rate": 2.344273674705427e-06, + "loss": 1.1693711280822754, + "step": 10026 + }, + { + "epoch": 1.825339037043779, + "grad_norm": 10.0, + "learning_rate": 2.3435499228553306e-06, + "loss": 1.3392581939697266, + "step": 10028 + }, + { + "epoch": 1.8257031036679714, + "grad_norm": 11.375, + "learning_rate": 2.342826267336624e-06, + "loss": 1.159989595413208, + "step": 10030 + }, + { + "epoch": 1.8260671702921636, + "grad_norm": 12.3125, + "learning_rate": 2.3421027082554982e-06, + "loss": 1.4001920223236084, + "step": 10032 + }, + { + "epoch": 1.8264312369163558, + "grad_norm": 8.625, + "learning_rate": 2.3413792457181337e-06, + "loss": 1.4928545951843262, + "step": 10034 + }, + { + "epoch": 1.826795303540548, + "grad_norm": 9.125, + "learning_rate": 2.340655879830697e-06, + "loss": 1.57978093624115, + "step": 10036 + }, + { + "epoch": 1.8271593701647402, + "grad_norm": 6.625, + "learning_rate": 2.339932610699337e-06, + "loss": 1.290604829788208, + "step": 10038 + }, + { + "epoch": 1.8275234367889324, + "grad_norm": 13.0, + "learning_rate": 2.3392094384301916e-06, + "loss": 1.3449780941009521, + "step": 10040 + }, + { + "epoch": 1.8278875034131246, + "grad_norm": 7.28125, + "learning_rate": 2.338486363129383e-06, + "loss": 1.16850745677948, + "step": 10042 + }, + { + "epoch": 1.8282515700373168, + "grad_norm": 7.46875, + "learning_rate": 2.3377633849030193e-06, + "loss": 0.9517574310302734, + "step": 10044 + }, + { + "epoch": 1.828615636661509, + "grad_norm": 25.625, + "learning_rate": 2.337040503857196e-06, + "loss": 1.2145745754241943, + "step": 10046 + }, + { + "epoch": 1.8289797032857011, + "grad_norm": 20.5, + "learning_rate": 2.336317720097991e-06, + "loss": 1.7920341491699219, + "step": 10048 + }, + { + "epoch": 1.8293437699098936, + "grad_norm": 13.75, + "learning_rate": 2.335595033731472e-06, + "loss": 1.4677098989486694, + "step": 10050 + }, + { + "epoch": 1.8297078365340858, + "grad_norm": 13.1875, + "learning_rate": 2.3348724448636884e-06, + "loss": 1.4022963047027588, + "step": 10052 + }, + { + "epoch": 1.830071903158278, + "grad_norm": 12.75, + "learning_rate": 2.3341499536006777e-06, + "loss": 1.454895257949829, + "step": 10054 + }, + { + "epoch": 1.8304359697824701, + "grad_norm": 8.625, + "learning_rate": 2.3334275600484643e-06, + "loss": 1.4594972133636475, + "step": 10056 + }, + { + "epoch": 1.8308000364066626, + "grad_norm": 7.84375, + "learning_rate": 2.3327052643130534e-06, + "loss": 0.9805500507354736, + "step": 10058 + }, + { + "epoch": 1.8311641030308548, + "grad_norm": 27.125, + "learning_rate": 2.331983066500442e-06, + "loss": 1.3447250127792358, + "step": 10060 + }, + { + "epoch": 1.831528169655047, + "grad_norm": 11.75, + "learning_rate": 2.3312609667166073e-06, + "loss": 1.7461843490600586, + "step": 10062 + }, + { + "epoch": 1.8318922362792391, + "grad_norm": 16.625, + "learning_rate": 2.3305389650675163e-06, + "loss": 1.3648128509521484, + "step": 10064 + }, + { + "epoch": 1.8322563029034313, + "grad_norm": 12.0625, + "learning_rate": 2.329817061659119e-06, + "loss": 1.3718647956848145, + "step": 10066 + }, + { + "epoch": 1.8326203695276235, + "grad_norm": 6.96875, + "learning_rate": 2.3290952565973514e-06, + "loss": 1.483525276184082, + "step": 10068 + }, + { + "epoch": 1.8329844361518157, + "grad_norm": 9.0625, + "learning_rate": 2.3283735499881365e-06, + "loss": 1.2079241275787354, + "step": 10070 + }, + { + "epoch": 1.833348502776008, + "grad_norm": 16.25, + "learning_rate": 2.3276519419373807e-06, + "loss": 1.3581733703613281, + "step": 10072 + }, + { + "epoch": 1.8337125694002, + "grad_norm": 19.375, + "learning_rate": 2.326930432550978e-06, + "loss": 1.9249236583709717, + "step": 10074 + }, + { + "epoch": 1.8340766360243923, + "grad_norm": 43.0, + "learning_rate": 2.326209021934807e-06, + "loss": 1.4857887029647827, + "step": 10076 + }, + { + "epoch": 1.8344407026485847, + "grad_norm": 17.25, + "learning_rate": 2.3254877101947304e-06, + "loss": 1.573176622390747, + "step": 10078 + }, + { + "epoch": 1.834804769272777, + "grad_norm": 13.4375, + "learning_rate": 2.3247664974366e-06, + "loss": 1.2796710729599, + "step": 10080 + }, + { + "epoch": 1.835168835896969, + "grad_norm": 14.0, + "learning_rate": 2.324045383766248e-06, + "loss": 1.8859400749206543, + "step": 10082 + }, + { + "epoch": 1.8355329025211615, + "grad_norm": 17.375, + "learning_rate": 2.323324369289498e-06, + "loss": 1.8698440790176392, + "step": 10084 + }, + { + "epoch": 1.8358969691453537, + "grad_norm": 6.5, + "learning_rate": 2.3226034541121544e-06, + "loss": 1.0743274688720703, + "step": 10086 + }, + { + "epoch": 1.836261035769546, + "grad_norm": 4.0, + "learning_rate": 2.321882638340008e-06, + "loss": 1.0709354877471924, + "step": 10088 + }, + { + "epoch": 1.836625102393738, + "grad_norm": 6.4375, + "learning_rate": 2.321161922078837e-06, + "loss": 1.0000280141830444, + "step": 10090 + }, + { + "epoch": 1.8369891690179303, + "grad_norm": 15.8125, + "learning_rate": 2.3204413054344018e-06, + "loss": 1.4246320724487305, + "step": 10092 + }, + { + "epoch": 1.8373532356421225, + "grad_norm": 14.0625, + "learning_rate": 2.3197207885124516e-06, + "loss": 1.9766209125518799, + "step": 10094 + }, + { + "epoch": 1.8377173022663147, + "grad_norm": 21.125, + "learning_rate": 2.319000371418719e-06, + "loss": 1.990584373474121, + "step": 10096 + }, + { + "epoch": 1.8380813688905069, + "grad_norm": 6.5625, + "learning_rate": 2.318280054258921e-06, + "loss": 1.5160785913467407, + "step": 10098 + }, + { + "epoch": 1.838445435514699, + "grad_norm": 11.3125, + "learning_rate": 2.317559837138764e-06, + "loss": 1.3069117069244385, + "step": 10100 + }, + { + "epoch": 1.8388095021388913, + "grad_norm": 17.875, + "learning_rate": 2.3168397201639336e-06, + "loss": 1.3874542713165283, + "step": 10102 + }, + { + "epoch": 1.8391735687630837, + "grad_norm": 24.625, + "learning_rate": 2.316119703440106e-06, + "loss": 1.5388832092285156, + "step": 10104 + }, + { + "epoch": 1.8395376353872759, + "grad_norm": 19.875, + "learning_rate": 2.3153997870729415e-06, + "loss": 1.7023801803588867, + "step": 10106 + }, + { + "epoch": 1.839901702011468, + "grad_norm": 6.9375, + "learning_rate": 2.314679971168082e-06, + "loss": 1.0762262344360352, + "step": 10108 + }, + { + "epoch": 1.8402657686356603, + "grad_norm": 3.3125, + "learning_rate": 2.3139602558311614e-06, + "loss": 1.1073142290115356, + "step": 10110 + }, + { + "epoch": 1.8406298352598527, + "grad_norm": 35.75, + "learning_rate": 2.313240641167791e-06, + "loss": 1.1669108867645264, + "step": 10112 + }, + { + "epoch": 1.8409939018840449, + "grad_norm": 14.5, + "learning_rate": 2.3125211272835747e-06, + "loss": 1.7666399478912354, + "step": 10114 + }, + { + "epoch": 1.841357968508237, + "grad_norm": 8.875, + "learning_rate": 2.3118017142840967e-06, + "loss": 1.5065183639526367, + "step": 10116 + }, + { + "epoch": 1.8417220351324293, + "grad_norm": 9.0, + "learning_rate": 2.3110824022749275e-06, + "loss": 1.3965014219284058, + "step": 10118 + }, + { + "epoch": 1.8420861017566215, + "grad_norm": 9.9375, + "learning_rate": 2.310363191361624e-06, + "loss": 1.7696468830108643, + "step": 10120 + }, + { + "epoch": 1.8424501683808137, + "grad_norm": 15.5, + "learning_rate": 2.3096440816497276e-06, + "loss": 1.6102962493896484, + "step": 10122 + }, + { + "epoch": 1.8428142350050059, + "grad_norm": 14.0, + "learning_rate": 2.3089250732447644e-06, + "loss": 1.6875274181365967, + "step": 10124 + }, + { + "epoch": 1.843178301629198, + "grad_norm": 15.5, + "learning_rate": 2.3082061662522463e-06, + "loss": 2.058544635772705, + "step": 10126 + }, + { + "epoch": 1.8435423682533902, + "grad_norm": 17.5, + "learning_rate": 2.3074873607776692e-06, + "loss": 1.361794352531433, + "step": 10128 + }, + { + "epoch": 1.8439064348775824, + "grad_norm": 26.875, + "learning_rate": 2.306768656926516e-06, + "loss": 0.9231115579605103, + "step": 10130 + }, + { + "epoch": 1.8442705015017749, + "grad_norm": 11.5, + "learning_rate": 2.3060500548042516e-06, + "loss": 1.4587790966033936, + "step": 10132 + }, + { + "epoch": 1.844634568125967, + "grad_norm": 8.6875, + "learning_rate": 2.30533155451633e-06, + "loss": 1.2917771339416504, + "step": 10134 + }, + { + "epoch": 1.8449986347501592, + "grad_norm": 13.75, + "learning_rate": 2.3046131561681888e-06, + "loss": 1.3458640575408936, + "step": 10136 + }, + { + "epoch": 1.8453627013743517, + "grad_norm": 7.90625, + "learning_rate": 2.303894859865247e-06, + "loss": 1.4280368089675903, + "step": 10138 + }, + { + "epoch": 1.8457267679985438, + "grad_norm": 20.125, + "learning_rate": 2.303176665712915e-06, + "loss": 1.568877935409546, + "step": 10140 + }, + { + "epoch": 1.846090834622736, + "grad_norm": 19.75, + "learning_rate": 2.3024585738165816e-06, + "loss": 1.6109087467193604, + "step": 10142 + }, + { + "epoch": 1.8464549012469282, + "grad_norm": 46.0, + "learning_rate": 2.3017405842816263e-06, + "loss": 0.5939738750457764, + "step": 10144 + }, + { + "epoch": 1.8468189678711204, + "grad_norm": 35.5, + "learning_rate": 2.3010226972134114e-06, + "loss": 1.4714713096618652, + "step": 10146 + }, + { + "epoch": 1.8471830344953126, + "grad_norm": 12.8125, + "learning_rate": 2.300304912717282e-06, + "loss": 1.2469115257263184, + "step": 10148 + }, + { + "epoch": 1.8475471011195048, + "grad_norm": 6.09375, + "learning_rate": 2.2995872308985715e-06, + "loss": 1.3877387046813965, + "step": 10150 + }, + { + "epoch": 1.847911167743697, + "grad_norm": 6.46875, + "learning_rate": 2.298869651862596e-06, + "loss": 1.2895342111587524, + "step": 10152 + }, + { + "epoch": 1.8482752343678892, + "grad_norm": 13.5625, + "learning_rate": 2.2981521757146575e-06, + "loss": 1.5223307609558105, + "step": 10154 + }, + { + "epoch": 1.8486393009920814, + "grad_norm": 19.25, + "learning_rate": 2.297434802560044e-06, + "loss": 1.8295148611068726, + "step": 10156 + }, + { + "epoch": 1.8490033676162738, + "grad_norm": 5.59375, + "learning_rate": 2.296717532504025e-06, + "loss": 0.838250458240509, + "step": 10158 + }, + { + "epoch": 1.849367434240466, + "grad_norm": 6.875, + "learning_rate": 2.296000365651858e-06, + "loss": 0.9912216067314148, + "step": 10160 + }, + { + "epoch": 1.8497315008646582, + "grad_norm": 13.9375, + "learning_rate": 2.295283302108784e-06, + "loss": 1.6604490280151367, + "step": 10162 + }, + { + "epoch": 1.8500955674888506, + "grad_norm": 23.5, + "learning_rate": 2.29456634198003e-06, + "loss": 1.6898630857467651, + "step": 10164 + }, + { + "epoch": 1.8504596341130428, + "grad_norm": 13.0, + "learning_rate": 2.2938494853708067e-06, + "loss": 1.5062068700790405, + "step": 10166 + }, + { + "epoch": 1.850823700737235, + "grad_norm": 18.5, + "learning_rate": 2.2931327323863087e-06, + "loss": 1.5017207860946655, + "step": 10168 + }, + { + "epoch": 1.8511877673614272, + "grad_norm": 18.875, + "learning_rate": 2.2924160831317186e-06, + "loss": 1.7805989980697632, + "step": 10170 + }, + { + "epoch": 1.8515518339856194, + "grad_norm": 20.5, + "learning_rate": 2.2916995377121996e-06, + "loss": 1.3220857381820679, + "step": 10172 + }, + { + "epoch": 1.8519159006098116, + "grad_norm": 9.5, + "learning_rate": 2.2909830962329034e-06, + "loss": 1.3464634418487549, + "step": 10174 + }, + { + "epoch": 1.8522799672340038, + "grad_norm": 10.4375, + "learning_rate": 2.2902667587989653e-06, + "loss": 1.4577744007110596, + "step": 10176 + }, + { + "epoch": 1.852644033858196, + "grad_norm": 7.53125, + "learning_rate": 2.2895505255155026e-06, + "loss": 1.3538519144058228, + "step": 10178 + }, + { + "epoch": 1.8530081004823882, + "grad_norm": 3.3125, + "learning_rate": 2.2888343964876223e-06, + "loss": 1.1247426271438599, + "step": 10180 + }, + { + "epoch": 1.8533721671065804, + "grad_norm": 8.0, + "learning_rate": 2.2881183718204115e-06, + "loss": 1.0560004711151123, + "step": 10182 + }, + { + "epoch": 1.8537362337307728, + "grad_norm": 216.0, + "learning_rate": 2.2874024516189448e-06, + "loss": 1.4581248760223389, + "step": 10184 + }, + { + "epoch": 1.854100300354965, + "grad_norm": 11.875, + "learning_rate": 2.2866866359882807e-06, + "loss": 1.7488996982574463, + "step": 10186 + }, + { + "epoch": 1.8544643669791572, + "grad_norm": 16.375, + "learning_rate": 2.2859709250334617e-06, + "loss": 2.0033645629882812, + "step": 10188 + }, + { + "epoch": 1.8548284336033494, + "grad_norm": 19.5, + "learning_rate": 2.285255318859516e-06, + "loss": 1.7307451963424683, + "step": 10190 + }, + { + "epoch": 1.8551925002275418, + "grad_norm": 8.6875, + "learning_rate": 2.2845398175714552e-06, + "loss": 1.427422285079956, + "step": 10192 + }, + { + "epoch": 1.855556566851734, + "grad_norm": 10.5625, + "learning_rate": 2.2838244212742765e-06, + "loss": 1.3619296550750732, + "step": 10194 + }, + { + "epoch": 1.8559206334759262, + "grad_norm": 8.6875, + "learning_rate": 2.2831091300729622e-06, + "loss": 1.5230233669281006, + "step": 10196 + }, + { + "epoch": 1.8562847001001184, + "grad_norm": 7.875, + "learning_rate": 2.2823939440724772e-06, + "loss": 1.3712950944900513, + "step": 10198 + }, + { + "epoch": 1.8566487667243106, + "grad_norm": 12.625, + "learning_rate": 2.2816788633777735e-06, + "loss": 1.0905096530914307, + "step": 10200 + }, + { + "epoch": 1.8570128333485028, + "grad_norm": 11.625, + "learning_rate": 2.2809638880937845e-06, + "loss": 1.4866703748703003, + "step": 10202 + }, + { + "epoch": 1.857376899972695, + "grad_norm": 122.5, + "learning_rate": 2.2802490183254317e-06, + "loss": 1.3468842506408691, + "step": 10204 + }, + { + "epoch": 1.8577409665968871, + "grad_norm": 51.5, + "learning_rate": 2.2795342541776185e-06, + "loss": 0.9001464247703552, + "step": 10206 + }, + { + "epoch": 1.8581050332210793, + "grad_norm": 5.6875, + "learning_rate": 2.278819595755233e-06, + "loss": 0.4550447463989258, + "step": 10208 + }, + { + "epoch": 1.8584690998452715, + "grad_norm": 8.875, + "learning_rate": 2.27810504316315e-06, + "loss": 1.7200883626937866, + "step": 10210 + }, + { + "epoch": 1.858833166469464, + "grad_norm": 7.125, + "learning_rate": 2.2773905965062256e-06, + "loss": 1.3557761907577515, + "step": 10212 + }, + { + "epoch": 1.8591972330936561, + "grad_norm": 13.5625, + "learning_rate": 2.276676255889303e-06, + "loss": 1.967002272605896, + "step": 10214 + }, + { + "epoch": 1.8595612997178483, + "grad_norm": 16.75, + "learning_rate": 2.275962021417209e-06, + "loss": 1.9087586402893066, + "step": 10216 + }, + { + "epoch": 1.8599253663420408, + "grad_norm": 10.6875, + "learning_rate": 2.2752478931947534e-06, + "loss": 1.3895682096481323, + "step": 10218 + }, + { + "epoch": 1.860289432966233, + "grad_norm": 14.375, + "learning_rate": 2.274533871326733e-06, + "loss": 1.5569223165512085, + "step": 10220 + }, + { + "epoch": 1.8606534995904251, + "grad_norm": 13.875, + "learning_rate": 2.273819955917926e-06, + "loss": 1.4245853424072266, + "step": 10222 + }, + { + "epoch": 1.8610175662146173, + "grad_norm": 9.625, + "learning_rate": 2.273106147073098e-06, + "loss": 1.2391117811203003, + "step": 10224 + }, + { + "epoch": 1.8613816328388095, + "grad_norm": 9.5, + "learning_rate": 2.272392444896997e-06, + "loss": 1.4110950231552124, + "step": 10226 + }, + { + "epoch": 1.8617456994630017, + "grad_norm": 16.375, + "learning_rate": 2.271678849494356e-06, + "loss": 1.5389165878295898, + "step": 10228 + }, + { + "epoch": 1.862109766087194, + "grad_norm": 12.0625, + "learning_rate": 2.2709653609698926e-06, + "loss": 1.4933632612228394, + "step": 10230 + }, + { + "epoch": 1.8624738327113861, + "grad_norm": 9.0625, + "learning_rate": 2.2702519794283074e-06, + "loss": 1.2820851802825928, + "step": 10232 + }, + { + "epoch": 1.8628378993355783, + "grad_norm": 10.9375, + "learning_rate": 2.2695387049742868e-06, + "loss": 1.0977133512496948, + "step": 10234 + }, + { + "epoch": 1.8632019659597705, + "grad_norm": 26.0, + "learning_rate": 2.2688255377125016e-06, + "loss": 0.6658727526664734, + "step": 10236 + }, + { + "epoch": 1.863566032583963, + "grad_norm": 7.59375, + "learning_rate": 2.2681124777476046e-06, + "loss": 1.6029568910598755, + "step": 10238 + }, + { + "epoch": 1.8639300992081551, + "grad_norm": 12.0, + "learning_rate": 2.2673995251842364e-06, + "loss": 1.5376559495925903, + "step": 10240 + }, + { + "epoch": 1.8642941658323473, + "grad_norm": 18.25, + "learning_rate": 2.266686680127018e-06, + "loss": 1.4276180267333984, + "step": 10242 + }, + { + "epoch": 1.8646582324565395, + "grad_norm": 13.3125, + "learning_rate": 2.2659739426805576e-06, + "loss": 1.2914968729019165, + "step": 10244 + }, + { + "epoch": 1.865022299080732, + "grad_norm": 12.8125, + "learning_rate": 2.2652613129494473e-06, + "loss": 1.6507633924484253, + "step": 10246 + }, + { + "epoch": 1.865386365704924, + "grad_norm": 25.0, + "learning_rate": 2.2645487910382606e-06, + "loss": 1.4687492847442627, + "step": 10248 + }, + { + "epoch": 1.8657504323291163, + "grad_norm": 17.375, + "learning_rate": 2.263836377051559e-06, + "loss": 1.4469367265701294, + "step": 10250 + }, + { + "epoch": 1.8661144989533085, + "grad_norm": 7.59375, + "learning_rate": 2.263124071093885e-06, + "loss": 1.1711663007736206, + "step": 10252 + }, + { + "epoch": 1.8664785655775007, + "grad_norm": 12.25, + "learning_rate": 2.2624118732697676e-06, + "loss": 1.2464871406555176, + "step": 10254 + }, + { + "epoch": 1.8668426322016929, + "grad_norm": 7.34375, + "learning_rate": 2.2616997836837187e-06, + "loss": 1.5155750513076782, + "step": 10256 + }, + { + "epoch": 1.867206698825885, + "grad_norm": 7.53125, + "learning_rate": 2.2609878024402344e-06, + "loss": 1.5323436260223389, + "step": 10258 + }, + { + "epoch": 1.8675707654500773, + "grad_norm": 7.875, + "learning_rate": 2.2602759296437955e-06, + "loss": 1.3353511095046997, + "step": 10260 + }, + { + "epoch": 1.8679348320742695, + "grad_norm": 11.375, + "learning_rate": 2.2595641653988656e-06, + "loss": 1.2968274354934692, + "step": 10262 + }, + { + "epoch": 1.8682988986984617, + "grad_norm": 13.8125, + "learning_rate": 2.2588525098098936e-06, + "loss": 1.7588025331497192, + "step": 10264 + }, + { + "epoch": 1.868662965322654, + "grad_norm": 7.5625, + "learning_rate": 2.258140962981313e-06, + "loss": 1.2066104412078857, + "step": 10266 + }, + { + "epoch": 1.8690270319468463, + "grad_norm": 7.46875, + "learning_rate": 2.2574295250175383e-06, + "loss": 1.4478979110717773, + "step": 10268 + }, + { + "epoch": 1.8693910985710385, + "grad_norm": 8.6875, + "learning_rate": 2.2567181960229727e-06, + "loss": 1.2111611366271973, + "step": 10270 + }, + { + "epoch": 1.8697551651952309, + "grad_norm": 29.0, + "learning_rate": 2.2560069761019984e-06, + "loss": 1.3516755104064941, + "step": 10272 + }, + { + "epoch": 1.870119231819423, + "grad_norm": 9.25, + "learning_rate": 2.255295865358986e-06, + "loss": 1.218153953552246, + "step": 10274 + }, + { + "epoch": 1.8704832984436153, + "grad_norm": 8.25, + "learning_rate": 2.254584863898287e-06, + "loss": 1.4296220541000366, + "step": 10276 + }, + { + "epoch": 1.8708473650678075, + "grad_norm": 10.0625, + "learning_rate": 2.2538739718242383e-06, + "loss": 1.4985926151275635, + "step": 10278 + }, + { + "epoch": 1.8712114316919997, + "grad_norm": 19.25, + "learning_rate": 2.253163189241161e-06, + "loss": 1.4593578577041626, + "step": 10280 + }, + { + "epoch": 1.8715754983161919, + "grad_norm": 9.4375, + "learning_rate": 2.2524525162533583e-06, + "loss": 1.3749850988388062, + "step": 10282 + }, + { + "epoch": 1.871939564940384, + "grad_norm": 9.6875, + "learning_rate": 2.2517419529651196e-06, + "loss": 1.3718054294586182, + "step": 10284 + }, + { + "epoch": 1.8723036315645762, + "grad_norm": 10.0, + "learning_rate": 2.251031499480717e-06, + "loss": 1.5082831382751465, + "step": 10286 + }, + { + "epoch": 1.8726676981887684, + "grad_norm": 6.8125, + "learning_rate": 2.2503211559044068e-06, + "loss": 1.3963520526885986, + "step": 10288 + }, + { + "epoch": 1.8730317648129606, + "grad_norm": 11.125, + "learning_rate": 2.2496109223404285e-06, + "loss": 2.1480770111083984, + "step": 10290 + }, + { + "epoch": 1.873395831437153, + "grad_norm": 13.0, + "learning_rate": 2.2489007988930056e-06, + "loss": 1.1892786026000977, + "step": 10292 + }, + { + "epoch": 1.8737598980613452, + "grad_norm": 19.0, + "learning_rate": 2.2481907856663475e-06, + "loss": 1.8853135108947754, + "step": 10294 + }, + { + "epoch": 1.8741239646855374, + "grad_norm": 26.625, + "learning_rate": 2.2474808827646446e-06, + "loss": 1.6226311922073364, + "step": 10296 + }, + { + "epoch": 1.8744880313097296, + "grad_norm": 5.25, + "learning_rate": 2.2467710902920724e-06, + "loss": 1.3160977363586426, + "step": 10298 + }, + { + "epoch": 1.874852097933922, + "grad_norm": 8.375, + "learning_rate": 2.2460614083527902e-06, + "loss": 1.3614863157272339, + "step": 10300 + }, + { + "epoch": 1.8752161645581142, + "grad_norm": 17.0, + "learning_rate": 2.2453518370509404e-06, + "loss": 1.3437060117721558, + "step": 10302 + }, + { + "epoch": 1.8755802311823064, + "grad_norm": 14.6875, + "learning_rate": 2.2446423764906502e-06, + "loss": 0.9286806583404541, + "step": 10304 + }, + { + "epoch": 1.8759442978064986, + "grad_norm": 6.71875, + "learning_rate": 2.243933026776031e-06, + "loss": 1.4103400707244873, + "step": 10306 + }, + { + "epoch": 1.8763083644306908, + "grad_norm": 11.6875, + "learning_rate": 2.2432237880111748e-06, + "loss": 1.3581068515777588, + "step": 10308 + }, + { + "epoch": 1.876672431054883, + "grad_norm": 101.0, + "learning_rate": 2.2425146603001617e-06, + "loss": 1.7171987295150757, + "step": 10310 + }, + { + "epoch": 1.8770364976790752, + "grad_norm": 23.125, + "learning_rate": 2.241805643747052e-06, + "loss": 1.5244677066802979, + "step": 10312 + }, + { + "epoch": 1.8774005643032674, + "grad_norm": 60.0, + "learning_rate": 2.241096738455891e-06, + "loss": 1.9016815423965454, + "step": 10314 + }, + { + "epoch": 1.8777646309274596, + "grad_norm": 12.3125, + "learning_rate": 2.240387944530709e-06, + "loss": 1.7663288116455078, + "step": 10316 + }, + { + "epoch": 1.8781286975516518, + "grad_norm": 13.0625, + "learning_rate": 2.239679262075517e-06, + "loss": 1.251397967338562, + "step": 10318 + }, + { + "epoch": 1.8784927641758442, + "grad_norm": 43.0, + "learning_rate": 2.2389706911943125e-06, + "loss": 1.0551949739456177, + "step": 10320 + }, + { + "epoch": 1.8788568308000364, + "grad_norm": 15.0, + "learning_rate": 2.2382622319910744e-06, + "loss": 1.332777976989746, + "step": 10322 + }, + { + "epoch": 1.8792208974242286, + "grad_norm": 49.75, + "learning_rate": 2.237553884569767e-06, + "loss": 1.8567054271697998, + "step": 10324 + }, + { + "epoch": 1.879584964048421, + "grad_norm": 8.75, + "learning_rate": 2.2368456490343372e-06, + "loss": 0.8983043432235718, + "step": 10326 + }, + { + "epoch": 1.8799490306726132, + "grad_norm": 6.65625, + "learning_rate": 2.236137525488715e-06, + "loss": 1.4657609462738037, + "step": 10328 + }, + { + "epoch": 1.8803130972968054, + "grad_norm": 6.1875, + "learning_rate": 2.235429514036816e-06, + "loss": 0.9314013719558716, + "step": 10330 + }, + { + "epoch": 1.8806771639209976, + "grad_norm": 5.40625, + "learning_rate": 2.2347216147825367e-06, + "loss": 1.109434962272644, + "step": 10332 + }, + { + "epoch": 1.8810412305451898, + "grad_norm": 8.125, + "learning_rate": 2.234013827829759e-06, + "loss": 1.2846033573150635, + "step": 10334 + }, + { + "epoch": 1.881405297169382, + "grad_norm": 20.25, + "learning_rate": 2.2333061532823487e-06, + "loss": 1.4278689622879028, + "step": 10336 + }, + { + "epoch": 1.8817693637935742, + "grad_norm": 13.625, + "learning_rate": 2.232598591244152e-06, + "loss": 0.9231512546539307, + "step": 10338 + }, + { + "epoch": 1.8821334304177664, + "grad_norm": 11.25, + "learning_rate": 2.231891141819003e-06, + "loss": 1.3879550695419312, + "step": 10340 + }, + { + "epoch": 1.8824974970419586, + "grad_norm": 8.9375, + "learning_rate": 2.2311838051107156e-06, + "loss": 0.8556983470916748, + "step": 10342 + }, + { + "epoch": 1.8828615636661508, + "grad_norm": 65.5, + "learning_rate": 2.230476581223089e-06, + "loss": 1.2869782447814941, + "step": 10344 + }, + { + "epoch": 1.8832256302903432, + "grad_norm": 18.5, + "learning_rate": 2.2297694702599064e-06, + "loss": 1.6536414623260498, + "step": 10346 + }, + { + "epoch": 1.8835896969145354, + "grad_norm": 16.5, + "learning_rate": 2.2290624723249314e-06, + "loss": 0.7474377155303955, + "step": 10348 + }, + { + "epoch": 1.8839537635387276, + "grad_norm": 13.4375, + "learning_rate": 2.228355587521915e-06, + "loss": 1.7495023012161255, + "step": 10350 + }, + { + "epoch": 1.8843178301629198, + "grad_norm": 13.5, + "learning_rate": 2.2276488159545886e-06, + "loss": 1.574780821800232, + "step": 10352 + }, + { + "epoch": 1.8846818967871122, + "grad_norm": 16.875, + "learning_rate": 2.226942157726668e-06, + "loss": 1.9637646675109863, + "step": 10354 + }, + { + "epoch": 1.8850459634113044, + "grad_norm": 10.375, + "learning_rate": 2.226235612941854e-06, + "loss": 1.7782838344573975, + "step": 10356 + }, + { + "epoch": 1.8854100300354966, + "grad_norm": 8.5, + "learning_rate": 2.2255291817038268e-06, + "loss": 1.26701021194458, + "step": 10358 + }, + { + "epoch": 1.8857740966596888, + "grad_norm": 7.15625, + "learning_rate": 2.2248228641162542e-06, + "loss": 0.7866450548171997, + "step": 10360 + }, + { + "epoch": 1.886138163283881, + "grad_norm": 14.25, + "learning_rate": 2.2241166602827844e-06, + "loss": 0.9672653675079346, + "step": 10362 + }, + { + "epoch": 1.8865022299080731, + "grad_norm": 11.0, + "learning_rate": 2.22341057030705e-06, + "loss": 1.0276377201080322, + "step": 10364 + }, + { + "epoch": 1.8868662965322653, + "grad_norm": 15.125, + "learning_rate": 2.2227045942926684e-06, + "loss": 1.340790033340454, + "step": 10366 + }, + { + "epoch": 1.8872303631564575, + "grad_norm": 11.8125, + "learning_rate": 2.221998732343236e-06, + "loss": 1.4866089820861816, + "step": 10368 + }, + { + "epoch": 1.8875944297806497, + "grad_norm": 19.75, + "learning_rate": 2.221292984562338e-06, + "loss": 1.4822797775268555, + "step": 10370 + }, + { + "epoch": 1.887958496404842, + "grad_norm": 21.125, + "learning_rate": 2.220587351053538e-06, + "loss": 1.6251622438430786, + "step": 10372 + }, + { + "epoch": 1.8883225630290343, + "grad_norm": 5.03125, + "learning_rate": 2.2198818319203855e-06, + "loss": 1.21848726272583, + "step": 10374 + }, + { + "epoch": 1.8886866296532265, + "grad_norm": 16.125, + "learning_rate": 2.219176427266413e-06, + "loss": 1.2360891103744507, + "step": 10376 + }, + { + "epoch": 1.8890506962774187, + "grad_norm": 10.8125, + "learning_rate": 2.218471137195135e-06, + "loss": 1.1219102144241333, + "step": 10378 + }, + { + "epoch": 1.8894147629016111, + "grad_norm": 5.21875, + "learning_rate": 2.217765961810051e-06, + "loss": 1.5207395553588867, + "step": 10380 + }, + { + "epoch": 1.8897788295258033, + "grad_norm": 8.1875, + "learning_rate": 2.217060901214641e-06, + "loss": 1.1101880073547363, + "step": 10382 + }, + { + "epoch": 1.8901428961499955, + "grad_norm": 10.625, + "learning_rate": 2.2163559555123716e-06, + "loss": 1.5389623641967773, + "step": 10384 + }, + { + "epoch": 1.8905069627741877, + "grad_norm": 22.5, + "learning_rate": 2.2156511248066896e-06, + "loss": 1.6152387857437134, + "step": 10386 + }, + { + "epoch": 1.89087102939838, + "grad_norm": 12.875, + "learning_rate": 2.214946409201026e-06, + "loss": 1.2975308895111084, + "step": 10388 + }, + { + "epoch": 1.8912350960225721, + "grad_norm": 11.0625, + "learning_rate": 2.214241808798796e-06, + "loss": 0.8062430024147034, + "step": 10390 + }, + { + "epoch": 1.8915991626467643, + "grad_norm": 14.5, + "learning_rate": 2.2135373237033957e-06, + "loss": 0.6579453945159912, + "step": 10392 + }, + { + "epoch": 1.8919632292709565, + "grad_norm": 5.8125, + "learning_rate": 2.212832954018206e-06, + "loss": 0.9215497374534607, + "step": 10394 + }, + { + "epoch": 1.8923272958951487, + "grad_norm": 8.75, + "learning_rate": 2.2121286998465904e-06, + "loss": 1.3262220621109009, + "step": 10396 + }, + { + "epoch": 1.892691362519341, + "grad_norm": 33.0, + "learning_rate": 2.2114245612918945e-06, + "loss": 1.3361772298812866, + "step": 10398 + }, + { + "epoch": 1.8930554291435333, + "grad_norm": 19.25, + "learning_rate": 2.210720538457449e-06, + "loss": 1.1203155517578125, + "step": 10400 + }, + { + "epoch": 1.8934194957677255, + "grad_norm": 27.125, + "learning_rate": 2.2100166314465654e-06, + "loss": 1.2038145065307617, + "step": 10402 + }, + { + "epoch": 1.8937835623919177, + "grad_norm": 21.625, + "learning_rate": 2.20931284036254e-06, + "loss": 1.8017784357070923, + "step": 10404 + }, + { + "epoch": 1.8941476290161101, + "grad_norm": 7.3125, + "learning_rate": 2.208609165308651e-06, + "loss": 1.215720295906067, + "step": 10406 + }, + { + "epoch": 1.8945116956403023, + "grad_norm": 11.5625, + "learning_rate": 2.2079056063881595e-06, + "loss": 1.5422110557556152, + "step": 10408 + }, + { + "epoch": 1.8948757622644945, + "grad_norm": 5.5, + "learning_rate": 2.20720216370431e-06, + "loss": 1.4257352352142334, + "step": 10410 + }, + { + "epoch": 1.8952398288886867, + "grad_norm": 21.875, + "learning_rate": 2.2064988373603302e-06, + "loss": 1.1668429374694824, + "step": 10412 + }, + { + "epoch": 1.895603895512879, + "grad_norm": 27.125, + "learning_rate": 2.2057956274594303e-06, + "loss": 1.9746252298355103, + "step": 10414 + }, + { + "epoch": 1.895967962137071, + "grad_norm": 20.875, + "learning_rate": 2.205092534104804e-06, + "loss": 1.4797194004058838, + "step": 10416 + }, + { + "epoch": 1.8963320287612633, + "grad_norm": 15.875, + "learning_rate": 2.2043895573996256e-06, + "loss": 1.1040180921554565, + "step": 10418 + }, + { + "epoch": 1.8966960953854555, + "grad_norm": 18.125, + "learning_rate": 2.2036866974470565e-06, + "loss": 1.563361644744873, + "step": 10420 + }, + { + "epoch": 1.8970601620096477, + "grad_norm": 8.625, + "learning_rate": 2.202983954350236e-06, + "loss": 1.3463839292526245, + "step": 10422 + }, + { + "epoch": 1.8974242286338399, + "grad_norm": 18.625, + "learning_rate": 2.2022813282122915e-06, + "loss": 1.7155592441558838, + "step": 10424 + }, + { + "epoch": 1.897788295258032, + "grad_norm": 13.1875, + "learning_rate": 2.2015788191363293e-06, + "loss": 1.3703267574310303, + "step": 10426 + }, + { + "epoch": 1.8981523618822245, + "grad_norm": 13.4375, + "learning_rate": 2.2008764272254384e-06, + "loss": 1.5700005292892456, + "step": 10428 + }, + { + "epoch": 1.8985164285064167, + "grad_norm": 14.5625, + "learning_rate": 2.2001741525826943e-06, + "loss": 1.172662377357483, + "step": 10430 + }, + { + "epoch": 1.8988804951306089, + "grad_norm": 31.125, + "learning_rate": 2.1994719953111516e-06, + "loss": 1.2414630651474, + "step": 10432 + }, + { + "epoch": 1.8992445617548013, + "grad_norm": 12.125, + "learning_rate": 2.198769955513849e-06, + "loss": 1.6593002080917358, + "step": 10434 + }, + { + "epoch": 1.8996086283789935, + "grad_norm": 20.625, + "learning_rate": 2.1980680332938088e-06, + "loss": 1.8022005558013916, + "step": 10436 + }, + { + "epoch": 1.8999726950031857, + "grad_norm": 9.8125, + "learning_rate": 2.1973662287540345e-06, + "loss": 1.1300904750823975, + "step": 10438 + }, + { + "epoch": 1.9003367616273779, + "grad_norm": 8.3125, + "learning_rate": 2.196664541997514e-06, + "loss": 1.2492272853851318, + "step": 10440 + }, + { + "epoch": 1.90070082825157, + "grad_norm": 7.6875, + "learning_rate": 2.195962973127215e-06, + "loss": 1.329573392868042, + "step": 10442 + }, + { + "epoch": 1.9010648948757622, + "grad_norm": 9.1875, + "learning_rate": 2.1952615222460922e-06, + "loss": 1.345508337020874, + "step": 10444 + }, + { + "epoch": 1.9014289614999544, + "grad_norm": 28.375, + "learning_rate": 2.19456018945708e-06, + "loss": 1.9136571884155273, + "step": 10446 + }, + { + "epoch": 1.9017930281241466, + "grad_norm": 7.875, + "learning_rate": 2.1938589748630947e-06, + "loss": 0.9954184293746948, + "step": 10448 + }, + { + "epoch": 1.9021570947483388, + "grad_norm": 5.5, + "learning_rate": 2.193157878567039e-06, + "loss": 1.0466398000717163, + "step": 10450 + }, + { + "epoch": 1.902521161372531, + "grad_norm": 8.375, + "learning_rate": 2.192456900671794e-06, + "loss": 1.422209620475769, + "step": 10452 + }, + { + "epoch": 1.9028852279967234, + "grad_norm": 9.25, + "learning_rate": 2.191756041280226e-06, + "loss": 1.5524622201919556, + "step": 10454 + }, + { + "epoch": 1.9032492946209156, + "grad_norm": 9.8125, + "learning_rate": 2.191055300495184e-06, + "loss": 1.1897838115692139, + "step": 10456 + }, + { + "epoch": 1.9036133612451078, + "grad_norm": 25.0, + "learning_rate": 2.190354678419498e-06, + "loss": 1.3939895629882812, + "step": 10458 + }, + { + "epoch": 1.9039774278693002, + "grad_norm": 17.25, + "learning_rate": 2.1896541751559823e-06, + "loss": 1.312684178352356, + "step": 10460 + }, + { + "epoch": 1.9043414944934924, + "grad_norm": 10.5625, + "learning_rate": 2.188953790807431e-06, + "loss": 0.9836719036102295, + "step": 10462 + }, + { + "epoch": 1.9047055611176846, + "grad_norm": 12.125, + "learning_rate": 2.188253525476625e-06, + "loss": 1.1918948888778687, + "step": 10464 + }, + { + "epoch": 1.9050696277418768, + "grad_norm": 13.125, + "learning_rate": 2.187553379266325e-06, + "loss": 1.5050095319747925, + "step": 10466 + }, + { + "epoch": 1.905433694366069, + "grad_norm": 9.0, + "learning_rate": 2.186853352279273e-06, + "loss": 1.5979856252670288, + "step": 10468 + }, + { + "epoch": 1.9057977609902612, + "grad_norm": 20.125, + "learning_rate": 2.186153444618197e-06, + "loss": 1.4531993865966797, + "step": 10470 + }, + { + "epoch": 1.9061618276144534, + "grad_norm": 16.125, + "learning_rate": 2.1854536563858037e-06, + "loss": 1.4323182106018066, + "step": 10472 + }, + { + "epoch": 1.9065258942386456, + "grad_norm": 12.25, + "learning_rate": 2.1847539876847858e-06, + "loss": 1.371595859527588, + "step": 10474 + }, + { + "epoch": 1.9068899608628378, + "grad_norm": 61.0, + "learning_rate": 2.1840544386178166e-06, + "loss": 1.3252613544464111, + "step": 10476 + }, + { + "epoch": 1.90725402748703, + "grad_norm": 33.5, + "learning_rate": 2.183355009287551e-06, + "loss": 2.0604937076568604, + "step": 10478 + }, + { + "epoch": 1.9076180941112224, + "grad_norm": 7.1875, + "learning_rate": 2.182655699796629e-06, + "loss": 1.1572775840759277, + "step": 10480 + }, + { + "epoch": 1.9079821607354146, + "grad_norm": 15.375, + "learning_rate": 2.1819565102476703e-06, + "loss": 1.7954611778259277, + "step": 10482 + }, + { + "epoch": 1.9083462273596068, + "grad_norm": 20.375, + "learning_rate": 2.1812574407432786e-06, + "loss": 1.4730114936828613, + "step": 10484 + }, + { + "epoch": 1.908710293983799, + "grad_norm": 36.25, + "learning_rate": 2.1805584913860397e-06, + "loss": 1.9809212684631348, + "step": 10486 + }, + { + "epoch": 1.9090743606079914, + "grad_norm": 21.625, + "learning_rate": 2.17985966227852e-06, + "loss": 1.4753215312957764, + "step": 10488 + }, + { + "epoch": 1.9094384272321836, + "grad_norm": 9.875, + "learning_rate": 2.1791609535232728e-06, + "loss": 1.40386164188385, + "step": 10490 + }, + { + "epoch": 1.9098024938563758, + "grad_norm": 11.4375, + "learning_rate": 2.178462365222828e-06, + "loss": 0.9462644457817078, + "step": 10492 + }, + { + "epoch": 1.910166560480568, + "grad_norm": 17.125, + "learning_rate": 2.1777638974797022e-06, + "loss": 1.0285927057266235, + "step": 10494 + }, + { + "epoch": 1.9105306271047602, + "grad_norm": 16.125, + "learning_rate": 2.1770655503963924e-06, + "loss": 0.5643746256828308, + "step": 10496 + }, + { + "epoch": 1.9108946937289524, + "grad_norm": 12.125, + "learning_rate": 2.176367324075377e-06, + "loss": 1.461489200592041, + "step": 10498 + }, + { + "epoch": 1.9112587603531446, + "grad_norm": 23.375, + "learning_rate": 2.17566921861912e-06, + "loss": 1.9206092357635498, + "step": 10500 + }, + { + "epoch": 1.9116228269773368, + "grad_norm": 9.3125, + "learning_rate": 2.174971234130064e-06, + "loss": 1.2126343250274658, + "step": 10502 + }, + { + "epoch": 1.911986893601529, + "grad_norm": 6.90625, + "learning_rate": 2.1742733707106355e-06, + "loss": 1.4992780685424805, + "step": 10504 + }, + { + "epoch": 1.9123509602257212, + "grad_norm": 7.1875, + "learning_rate": 2.1735756284632444e-06, + "loss": 1.0821495056152344, + "step": 10506 + }, + { + "epoch": 1.9127150268499136, + "grad_norm": 10.0, + "learning_rate": 2.1728780074902795e-06, + "loss": 1.4519574642181396, + "step": 10508 + }, + { + "epoch": 1.9130790934741058, + "grad_norm": 5.84375, + "learning_rate": 2.172180507894116e-06, + "loss": 1.2001405954360962, + "step": 10510 + }, + { + "epoch": 1.913443160098298, + "grad_norm": 9.8125, + "learning_rate": 2.1714831297771074e-06, + "loss": 1.3419532775878906, + "step": 10512 + }, + { + "epoch": 1.9138072267224904, + "grad_norm": 7.21875, + "learning_rate": 2.170785873241592e-06, + "loss": 1.4213727712631226, + "step": 10514 + }, + { + "epoch": 1.9141712933466826, + "grad_norm": 25.0, + "learning_rate": 2.17008873838989e-06, + "loss": 1.494529366493225, + "step": 10516 + }, + { + "epoch": 1.9145353599708748, + "grad_norm": 9.4375, + "learning_rate": 2.169391725324301e-06, + "loss": 1.3124151229858398, + "step": 10518 + }, + { + "epoch": 1.914899426595067, + "grad_norm": 6.9375, + "learning_rate": 2.1686948341471108e-06, + "loss": 1.2098684310913086, + "step": 10520 + }, + { + "epoch": 1.9152634932192591, + "grad_norm": 22.0, + "learning_rate": 2.167998064960584e-06, + "loss": 1.5387494564056396, + "step": 10522 + }, + { + "epoch": 1.9156275598434513, + "grad_norm": 29.625, + "learning_rate": 2.16730141786697e-06, + "loss": 1.5989289283752441, + "step": 10524 + }, + { + "epoch": 1.9159916264676435, + "grad_norm": 43.25, + "learning_rate": 2.1666048929684993e-06, + "loss": 1.5735671520233154, + "step": 10526 + }, + { + "epoch": 1.9163556930918357, + "grad_norm": 14.0, + "learning_rate": 2.165908490367381e-06, + "loss": 1.5775954723358154, + "step": 10528 + }, + { + "epoch": 1.916719759716028, + "grad_norm": 9.4375, + "learning_rate": 2.165212210165813e-06, + "loss": 1.4131669998168945, + "step": 10530 + }, + { + "epoch": 1.9170838263402201, + "grad_norm": 10.625, + "learning_rate": 2.16451605246597e-06, + "loss": 1.5526275634765625, + "step": 10532 + }, + { + "epoch": 1.9174478929644125, + "grad_norm": 19.5, + "learning_rate": 2.163820017370009e-06, + "loss": 1.430741548538208, + "step": 10534 + }, + { + "epoch": 1.9178119595886047, + "grad_norm": 6.28125, + "learning_rate": 2.1631241049800733e-06, + "loss": 1.2797216176986694, + "step": 10536 + }, + { + "epoch": 1.918176026212797, + "grad_norm": 7.84375, + "learning_rate": 2.1624283153982822e-06, + "loss": 0.9575626254081726, + "step": 10538 + }, + { + "epoch": 1.9185400928369891, + "grad_norm": 18.875, + "learning_rate": 2.161732648726743e-06, + "loss": 1.9104572534561157, + "step": 10540 + }, + { + "epoch": 1.9189041594611815, + "grad_norm": 9.0625, + "learning_rate": 2.16103710506754e-06, + "loss": 1.2149488925933838, + "step": 10542 + }, + { + "epoch": 1.9192682260853737, + "grad_norm": 4.03125, + "learning_rate": 2.1603416845227403e-06, + "loss": 0.8648663759231567, + "step": 10544 + }, + { + "epoch": 1.919632292709566, + "grad_norm": 7.46875, + "learning_rate": 2.1596463871943977e-06, + "loss": 1.5795938968658447, + "step": 10546 + }, + { + "epoch": 1.9199963593337581, + "grad_norm": 7.15625, + "learning_rate": 2.1589512131845405e-06, + "loss": 0.9581153392791748, + "step": 10548 + }, + { + "epoch": 1.9203604259579503, + "grad_norm": 11.0625, + "learning_rate": 2.1582561625951857e-06, + "loss": 1.3011606931686401, + "step": 10550 + }, + { + "epoch": 1.9207244925821425, + "grad_norm": 7.34375, + "learning_rate": 2.1575612355283266e-06, + "loss": 1.2429723739624023, + "step": 10552 + }, + { + "epoch": 1.9210885592063347, + "grad_norm": 9.0625, + "learning_rate": 2.156866432085942e-06, + "loss": 1.2560807466506958, + "step": 10554 + }, + { + "epoch": 1.921452625830527, + "grad_norm": 3.3125, + "learning_rate": 2.1561717523699927e-06, + "loss": 1.2178103923797607, + "step": 10556 + }, + { + "epoch": 1.921816692454719, + "grad_norm": 20.5, + "learning_rate": 2.1554771964824177e-06, + "loss": 1.1888086795806885, + "step": 10558 + }, + { + "epoch": 1.9221807590789113, + "grad_norm": 9.9375, + "learning_rate": 2.154782764525143e-06, + "loss": 1.4054150581359863, + "step": 10560 + }, + { + "epoch": 1.9225448257031037, + "grad_norm": 25.875, + "learning_rate": 2.154088456600071e-06, + "loss": 1.4726853370666504, + "step": 10562 + }, + { + "epoch": 1.922908892327296, + "grad_norm": 11.9375, + "learning_rate": 2.1533942728090897e-06, + "loss": 1.5992368459701538, + "step": 10564 + }, + { + "epoch": 1.923272958951488, + "grad_norm": 13.0, + "learning_rate": 2.152700213254069e-06, + "loss": 0.5055093765258789, + "step": 10566 + }, + { + "epoch": 1.9236370255756805, + "grad_norm": 17.375, + "learning_rate": 2.152006278036857e-06, + "loss": 1.3911727666854858, + "step": 10568 + }, + { + "epoch": 1.9240010921998727, + "grad_norm": 8.9375, + "learning_rate": 2.1513124672592887e-06, + "loss": 1.0272668600082397, + "step": 10570 + }, + { + "epoch": 1.924365158824065, + "grad_norm": 4.78125, + "learning_rate": 2.1506187810231753e-06, + "loss": 0.9814562797546387, + "step": 10572 + }, + { + "epoch": 1.924729225448257, + "grad_norm": 8.4375, + "learning_rate": 2.1499252194303127e-06, + "loss": 1.3142948150634766, + "step": 10574 + }, + { + "epoch": 1.9250932920724493, + "grad_norm": 7.0625, + "learning_rate": 2.149231782582481e-06, + "loss": 1.340053915977478, + "step": 10576 + }, + { + "epoch": 1.9254573586966415, + "grad_norm": 4.65625, + "learning_rate": 2.1485384705814354e-06, + "loss": 1.0525617599487305, + "step": 10578 + }, + { + "epoch": 1.9258214253208337, + "grad_norm": 12.375, + "learning_rate": 2.1478452835289205e-06, + "loss": 1.1405552625656128, + "step": 10580 + }, + { + "epoch": 1.9261854919450259, + "grad_norm": 11.25, + "learning_rate": 2.1471522215266556e-06, + "loss": 1.4168860912322998, + "step": 10582 + }, + { + "epoch": 1.926549558569218, + "grad_norm": 8.125, + "learning_rate": 2.1464592846763454e-06, + "loss": 1.247086763381958, + "step": 10584 + }, + { + "epoch": 1.9269136251934103, + "grad_norm": 16.25, + "learning_rate": 2.1457664730796775e-06, + "loss": 1.7664971351623535, + "step": 10586 + }, + { + "epoch": 1.9272776918176027, + "grad_norm": 26.625, + "learning_rate": 2.1450737868383166e-06, + "loss": 1.989170789718628, + "step": 10588 + }, + { + "epoch": 1.9276417584417949, + "grad_norm": 5.40625, + "learning_rate": 2.1443812260539137e-06, + "loss": 1.3812612295150757, + "step": 10590 + }, + { + "epoch": 1.928005825065987, + "grad_norm": 5.9375, + "learning_rate": 2.143688790828098e-06, + "loss": 1.1844573020935059, + "step": 10592 + }, + { + "epoch": 1.9283698916901792, + "grad_norm": 3.71875, + "learning_rate": 2.142996481262481e-06, + "loss": 1.097573161125183, + "step": 10594 + }, + { + "epoch": 1.9287339583143717, + "grad_norm": 12.125, + "learning_rate": 2.142304297458659e-06, + "loss": 1.2319780588150024, + "step": 10596 + }, + { + "epoch": 1.9290980249385639, + "grad_norm": 13.0625, + "learning_rate": 2.1416122395182037e-06, + "loss": 1.3655378818511963, + "step": 10598 + }, + { + "epoch": 1.929462091562756, + "grad_norm": 14.125, + "learning_rate": 2.140920307542675e-06, + "loss": 1.8357151746749878, + "step": 10600 + }, + { + "epoch": 1.9298261581869482, + "grad_norm": 8.5625, + "learning_rate": 2.140228501633609e-06, + "loss": 1.3636448383331299, + "step": 10602 + }, + { + "epoch": 1.9301902248111404, + "grad_norm": 4.5625, + "learning_rate": 2.139536821892525e-06, + "loss": 1.3478671312332153, + "step": 10604 + }, + { + "epoch": 1.9305542914353326, + "grad_norm": 5.21875, + "learning_rate": 2.138845268420927e-06, + "loss": 1.1939629316329956, + "step": 10606 + }, + { + "epoch": 1.9309183580595248, + "grad_norm": 9.5, + "learning_rate": 2.138153841320294e-06, + "loss": 0.9429299235343933, + "step": 10608 + }, + { + "epoch": 1.931282424683717, + "grad_norm": 13.375, + "learning_rate": 2.1374625406920945e-06, + "loss": 1.3931149244308472, + "step": 10610 + }, + { + "epoch": 1.9316464913079092, + "grad_norm": 13.9375, + "learning_rate": 2.1367713666377698e-06, + "loss": 1.9956785440444946, + "step": 10612 + }, + { + "epoch": 1.9320105579321014, + "grad_norm": 5.6875, + "learning_rate": 2.1360803192587483e-06, + "loss": 1.1798756122589111, + "step": 10614 + }, + { + "epoch": 1.9323746245562938, + "grad_norm": 9.1875, + "learning_rate": 2.13538939865644e-06, + "loss": 1.2579030990600586, + "step": 10616 + }, + { + "epoch": 1.932738691180486, + "grad_norm": 8.125, + "learning_rate": 2.1346986049322327e-06, + "loss": 1.6402931213378906, + "step": 10618 + }, + { + "epoch": 1.9331027578046782, + "grad_norm": 8.3125, + "learning_rate": 2.1340079381874997e-06, + "loss": 1.1270995140075684, + "step": 10620 + }, + { + "epoch": 1.9334668244288706, + "grad_norm": 2.984375, + "learning_rate": 2.133317398523591e-06, + "loss": 1.2936630249023438, + "step": 10622 + }, + { + "epoch": 1.9338308910530628, + "grad_norm": 8.5625, + "learning_rate": 2.1326269860418413e-06, + "loss": 0.9579342603683472, + "step": 10624 + }, + { + "epoch": 1.934194957677255, + "grad_norm": 11.1875, + "learning_rate": 2.1319367008435677e-06, + "loss": 1.431419849395752, + "step": 10626 + }, + { + "epoch": 1.9345590243014472, + "grad_norm": 10.375, + "learning_rate": 2.1312465430300633e-06, + "loss": 1.0643624067306519, + "step": 10628 + }, + { + "epoch": 1.9349230909256394, + "grad_norm": 14.3125, + "learning_rate": 2.1305565127026102e-06, + "loss": 1.524848222732544, + "step": 10630 + }, + { + "epoch": 1.9352871575498316, + "grad_norm": 27.5, + "learning_rate": 2.1298666099624645e-06, + "loss": 1.893900752067566, + "step": 10632 + }, + { + "epoch": 1.9356512241740238, + "grad_norm": 19.75, + "learning_rate": 2.129176834910866e-06, + "loss": 1.8068509101867676, + "step": 10634 + }, + { + "epoch": 1.936015290798216, + "grad_norm": 12.9375, + "learning_rate": 2.1284871876490397e-06, + "loss": 1.4855549335479736, + "step": 10636 + }, + { + "epoch": 1.9363793574224082, + "grad_norm": 11.625, + "learning_rate": 2.1277976682781853e-06, + "loss": 1.6553020477294922, + "step": 10638 + }, + { + "epoch": 1.9367434240466004, + "grad_norm": 6.59375, + "learning_rate": 2.12710827689949e-06, + "loss": 1.4351539611816406, + "step": 10640 + }, + { + "epoch": 1.9371074906707928, + "grad_norm": 10.0625, + "learning_rate": 2.1264190136141166e-06, + "loss": 0.9558985233306885, + "step": 10642 + }, + { + "epoch": 1.937471557294985, + "grad_norm": 2.65625, + "learning_rate": 2.125729878523212e-06, + "loss": 0.9934003949165344, + "step": 10644 + }, + { + "epoch": 1.9378356239191772, + "grad_norm": 8.1875, + "learning_rate": 2.125040871727906e-06, + "loss": 1.3211816549301147, + "step": 10646 + }, + { + "epoch": 1.9381996905433696, + "grad_norm": 26.75, + "learning_rate": 2.1243519933293046e-06, + "loss": 1.3147072792053223, + "step": 10648 + }, + { + "epoch": 1.9385637571675618, + "grad_norm": 28.625, + "learning_rate": 2.123663243428501e-06, + "loss": 1.7976093292236328, + "step": 10650 + }, + { + "epoch": 1.938927823791754, + "grad_norm": 7.25, + "learning_rate": 2.1229746221265644e-06, + "loss": 1.3431675434112549, + "step": 10652 + }, + { + "epoch": 1.9392918904159462, + "grad_norm": 11.3125, + "learning_rate": 2.1222861295245467e-06, + "loss": 1.3423734903335571, + "step": 10654 + }, + { + "epoch": 1.9396559570401384, + "grad_norm": 13.5, + "learning_rate": 2.1215977657234843e-06, + "loss": 1.385918140411377, + "step": 10656 + }, + { + "epoch": 1.9400200236643306, + "grad_norm": 12.0, + "learning_rate": 2.1209095308243883e-06, + "loss": 1.2211955785751343, + "step": 10658 + }, + { + "epoch": 1.9403840902885228, + "grad_norm": 9.5625, + "learning_rate": 2.1202214249282573e-06, + "loss": 1.0299816131591797, + "step": 10660 + }, + { + "epoch": 1.940748156912715, + "grad_norm": 34.0, + "learning_rate": 2.1195334481360665e-06, + "loss": 1.522233486175537, + "step": 10662 + }, + { + "epoch": 1.9411122235369072, + "grad_norm": 76.5, + "learning_rate": 2.1188456005487725e-06, + "loss": 1.7297430038452148, + "step": 10664 + }, + { + "epoch": 1.9414762901610993, + "grad_norm": 12.1875, + "learning_rate": 2.1181578822673175e-06, + "loss": 1.4662880897521973, + "step": 10666 + }, + { + "epoch": 1.9418403567852915, + "grad_norm": 9.9375, + "learning_rate": 2.117470293392618e-06, + "loss": 1.5376778841018677, + "step": 10668 + }, + { + "epoch": 1.942204423409484, + "grad_norm": 9.3125, + "learning_rate": 2.116782834025578e-06, + "loss": 1.182296633720398, + "step": 10670 + }, + { + "epoch": 1.9425684900336762, + "grad_norm": 8.9375, + "learning_rate": 2.116095504267077e-06, + "loss": 1.425065040588379, + "step": 10672 + }, + { + "epoch": 1.9429325566578683, + "grad_norm": 14.8125, + "learning_rate": 2.1154083042179773e-06, + "loss": 1.4091671705245972, + "step": 10674 + }, + { + "epoch": 1.9432966232820608, + "grad_norm": 7.0625, + "learning_rate": 2.1147212339791257e-06, + "loss": 1.423144817352295, + "step": 10676 + }, + { + "epoch": 1.943660689906253, + "grad_norm": 14.375, + "learning_rate": 2.114034293651344e-06, + "loss": 1.092933177947998, + "step": 10678 + }, + { + "epoch": 1.9440247565304452, + "grad_norm": 16.25, + "learning_rate": 2.113347483335441e-06, + "loss": 0.911866307258606, + "step": 10680 + }, + { + "epoch": 1.9443888231546373, + "grad_norm": 12.875, + "learning_rate": 2.1126608031322006e-06, + "loss": 1.382908582687378, + "step": 10682 + }, + { + "epoch": 1.9447528897788295, + "grad_norm": 9.375, + "learning_rate": 2.11197425314239e-06, + "loss": 1.3330384492874146, + "step": 10684 + }, + { + "epoch": 1.9451169564030217, + "grad_norm": 15.0625, + "learning_rate": 2.1112878334667607e-06, + "loss": 1.2274844646453857, + "step": 10686 + }, + { + "epoch": 1.945481023027214, + "grad_norm": 11.5625, + "learning_rate": 2.110601544206039e-06, + "loss": 1.5134776830673218, + "step": 10688 + }, + { + "epoch": 1.9458450896514061, + "grad_norm": 84.5, + "learning_rate": 2.1099153854609377e-06, + "loss": 1.4281120300292969, + "step": 10690 + }, + { + "epoch": 1.9462091562755983, + "grad_norm": 16.25, + "learning_rate": 2.1092293573321455e-06, + "loss": 1.6313104629516602, + "step": 10692 + }, + { + "epoch": 1.9465732228997905, + "grad_norm": 13.4375, + "learning_rate": 2.1085434599203343e-06, + "loss": 1.9710571765899658, + "step": 10694 + }, + { + "epoch": 1.946937289523983, + "grad_norm": 21.0, + "learning_rate": 2.1078576933261593e-06, + "loss": 1.228401780128479, + "step": 10696 + }, + { + "epoch": 1.9473013561481751, + "grad_norm": 21.5, + "learning_rate": 2.1071720576502503e-06, + "loss": 1.3314635753631592, + "step": 10698 + }, + { + "epoch": 1.9476654227723673, + "grad_norm": 8.6875, + "learning_rate": 2.1064865529932253e-06, + "loss": 1.859405517578125, + "step": 10700 + }, + { + "epoch": 1.9480294893965597, + "grad_norm": 7.4375, + "learning_rate": 2.105801179455677e-06, + "loss": 1.380920648574829, + "step": 10702 + }, + { + "epoch": 1.948393556020752, + "grad_norm": 4.5625, + "learning_rate": 2.105115937138181e-06, + "loss": 0.9663572907447815, + "step": 10704 + }, + { + "epoch": 1.9487576226449441, + "grad_norm": 14.6875, + "learning_rate": 2.104430826141296e-06, + "loss": 1.6403248310089111, + "step": 10706 + }, + { + "epoch": 1.9491216892691363, + "grad_norm": 8.625, + "learning_rate": 2.1037458465655562e-06, + "loss": 1.5831127166748047, + "step": 10708 + }, + { + "epoch": 1.9494857558933285, + "grad_norm": 16.0, + "learning_rate": 2.103060998511483e-06, + "loss": 1.4402159452438354, + "step": 10710 + }, + { + "epoch": 1.9498498225175207, + "grad_norm": 17.875, + "learning_rate": 2.102376282079572e-06, + "loss": 1.1778827905654907, + "step": 10712 + }, + { + "epoch": 1.950213889141713, + "grad_norm": 20.0, + "learning_rate": 2.1016916973703033e-06, + "loss": 0.9038905501365662, + "step": 10714 + }, + { + "epoch": 1.950577955765905, + "grad_norm": 9.5, + "learning_rate": 2.1010072444841394e-06, + "loss": 1.0613863468170166, + "step": 10716 + }, + { + "epoch": 1.9509420223900973, + "grad_norm": 4.0625, + "learning_rate": 2.100322923521517e-06, + "loss": 0.9329920411109924, + "step": 10718 + }, + { + "epoch": 1.9513060890142895, + "grad_norm": 23.5, + "learning_rate": 2.099638734582862e-06, + "loss": 1.1665724515914917, + "step": 10720 + }, + { + "epoch": 1.951670155638482, + "grad_norm": 11.0, + "learning_rate": 2.098954677768572e-06, + "loss": 1.405213713645935, + "step": 10722 + }, + { + "epoch": 1.952034222262674, + "grad_norm": 11.375, + "learning_rate": 2.098270753179031e-06, + "loss": 1.4372384548187256, + "step": 10724 + }, + { + "epoch": 1.9523982888868663, + "grad_norm": 6.5625, + "learning_rate": 2.097586960914604e-06, + "loss": 1.3001532554626465, + "step": 10726 + }, + { + "epoch": 1.9527623555110585, + "grad_norm": 18.375, + "learning_rate": 2.096903301075632e-06, + "loss": 1.5140721797943115, + "step": 10728 + }, + { + "epoch": 1.953126422135251, + "grad_norm": 16.625, + "learning_rate": 2.0962197737624423e-06, + "loss": 2.1442830562591553, + "step": 10730 + }, + { + "epoch": 1.953490488759443, + "grad_norm": 13.375, + "learning_rate": 2.095536379075337e-06, + "loss": 1.4570618867874146, + "step": 10732 + }, + { + "epoch": 1.9538545553836353, + "grad_norm": 9.6875, + "learning_rate": 2.094853117114602e-06, + "loss": 1.5648422241210938, + "step": 10734 + }, + { + "epoch": 1.9542186220078275, + "grad_norm": 9.75, + "learning_rate": 2.0941699879805057e-06, + "loss": 1.4410163164138794, + "step": 10736 + }, + { + "epoch": 1.9545826886320197, + "grad_norm": 12.625, + "learning_rate": 2.09348699177329e-06, + "loss": 1.5370848178863525, + "step": 10738 + }, + { + "epoch": 1.9549467552562119, + "grad_norm": 13.9375, + "learning_rate": 2.092804128593187e-06, + "loss": 1.5675652027130127, + "step": 10740 + }, + { + "epoch": 1.955310821880404, + "grad_norm": 11.8125, + "learning_rate": 2.092121398540401e-06, + "loss": 1.2988542318344116, + "step": 10742 + }, + { + "epoch": 1.9556748885045963, + "grad_norm": 9.75, + "learning_rate": 2.091438801715119e-06, + "loss": 0.9742361307144165, + "step": 10744 + }, + { + "epoch": 1.9560389551287884, + "grad_norm": 9.6875, + "learning_rate": 2.0907563382175127e-06, + "loss": 1.7126858234405518, + "step": 10746 + }, + { + "epoch": 1.9564030217529806, + "grad_norm": 28.75, + "learning_rate": 2.090074008147727e-06, + "loss": 1.7257542610168457, + "step": 10748 + }, + { + "epoch": 1.956767088377173, + "grad_norm": 23.25, + "learning_rate": 2.0893918116058953e-06, + "loss": 1.5862066745758057, + "step": 10750 + }, + { + "epoch": 1.9571311550013653, + "grad_norm": 16.625, + "learning_rate": 2.0887097486921235e-06, + "loss": 1.7677816152572632, + "step": 10752 + }, + { + "epoch": 1.9574952216255574, + "grad_norm": 15.375, + "learning_rate": 2.0880278195065024e-06, + "loss": 1.473554253578186, + "step": 10754 + }, + { + "epoch": 1.9578592882497499, + "grad_norm": 8.3125, + "learning_rate": 2.087346024149104e-06, + "loss": 1.0296990871429443, + "step": 10756 + }, + { + "epoch": 1.958223354873942, + "grad_norm": 13.625, + "learning_rate": 2.0866643627199767e-06, + "loss": 1.5855239629745483, + "step": 10758 + }, + { + "epoch": 1.9585874214981343, + "grad_norm": 11.5, + "learning_rate": 2.0859828353191545e-06, + "loss": 1.4761435985565186, + "step": 10760 + }, + { + "epoch": 1.9589514881223264, + "grad_norm": 8.3125, + "learning_rate": 2.0853014420466462e-06, + "loss": 1.4762241840362549, + "step": 10762 + }, + { + "epoch": 1.9593155547465186, + "grad_norm": 6.53125, + "learning_rate": 2.0846201830024437e-06, + "loss": 1.0595502853393555, + "step": 10764 + }, + { + "epoch": 1.9596796213707108, + "grad_norm": 8.375, + "learning_rate": 2.0839390582865204e-06, + "loss": 1.0314304828643799, + "step": 10766 + }, + { + "epoch": 1.960043687994903, + "grad_norm": 26.75, + "learning_rate": 2.083258067998827e-06, + "loss": 1.520331621170044, + "step": 10768 + }, + { + "epoch": 1.9604077546190952, + "grad_norm": 25.125, + "learning_rate": 2.0825772122392984e-06, + "loss": 1.411595106124878, + "step": 10770 + }, + { + "epoch": 1.9607718212432874, + "grad_norm": 14.875, + "learning_rate": 2.0818964911078442e-06, + "loss": 1.3655226230621338, + "step": 10772 + }, + { + "epoch": 1.9611358878674796, + "grad_norm": 12.375, + "learning_rate": 2.0812159047043593e-06, + "loss": 1.4935168027877808, + "step": 10774 + }, + { + "epoch": 1.961499954491672, + "grad_norm": 9.75, + "learning_rate": 2.0805354531287185e-06, + "loss": 1.5060609579086304, + "step": 10776 + }, + { + "epoch": 1.9618640211158642, + "grad_norm": 6.78125, + "learning_rate": 2.0798551364807717e-06, + "loss": 1.092186689376831, + "step": 10778 + }, + { + "epoch": 1.9622280877400564, + "grad_norm": 56.25, + "learning_rate": 2.0791749548603567e-06, + "loss": 1.1833786964416504, + "step": 10780 + }, + { + "epoch": 1.9625921543642486, + "grad_norm": 3.875, + "learning_rate": 2.078494908367284e-06, + "loss": 1.3987946510314941, + "step": 10782 + }, + { + "epoch": 1.962956220988441, + "grad_norm": 13.0625, + "learning_rate": 2.0778149971013483e-06, + "loss": 0.859051525592804, + "step": 10784 + }, + { + "epoch": 1.9633202876126332, + "grad_norm": 20.125, + "learning_rate": 2.0771352211623264e-06, + "loss": 1.0024619102478027, + "step": 10786 + }, + { + "epoch": 1.9636843542368254, + "grad_norm": 15.5, + "learning_rate": 2.0764555806499688e-06, + "loss": 1.5861179828643799, + "step": 10788 + }, + { + "epoch": 1.9640484208610176, + "grad_norm": 21.375, + "learning_rate": 2.075776075664013e-06, + "loss": 1.8818206787109375, + "step": 10790 + }, + { + "epoch": 1.9644124874852098, + "grad_norm": 7.03125, + "learning_rate": 2.075096706304173e-06, + "loss": 0.9894894361495972, + "step": 10792 + }, + { + "epoch": 1.964776554109402, + "grad_norm": 10.4375, + "learning_rate": 2.0744174726701414e-06, + "loss": 1.0505038499832153, + "step": 10794 + }, + { + "epoch": 1.9651406207335942, + "grad_norm": 6.0625, + "learning_rate": 2.0737383748615962e-06, + "loss": 1.4778542518615723, + "step": 10796 + }, + { + "epoch": 1.9655046873577864, + "grad_norm": 22.0, + "learning_rate": 2.0730594129781895e-06, + "loss": 1.1886543035507202, + "step": 10798 + }, + { + "epoch": 1.9658687539819786, + "grad_norm": 10.0, + "learning_rate": 2.072380587119559e-06, + "loss": 1.3207370042800903, + "step": 10800 + }, + { + "epoch": 1.9662328206061708, + "grad_norm": 30.625, + "learning_rate": 2.0717018973853166e-06, + "loss": 1.598076343536377, + "step": 10802 + }, + { + "epoch": 1.9665968872303632, + "grad_norm": 10.75, + "learning_rate": 2.0710233438750585e-06, + "loss": 1.732502818107605, + "step": 10804 + }, + { + "epoch": 1.9669609538545554, + "grad_norm": 10.0625, + "learning_rate": 2.0703449266883613e-06, + "loss": 1.1132270097732544, + "step": 10806 + }, + { + "epoch": 1.9673250204787476, + "grad_norm": 14.0, + "learning_rate": 2.0696666459247773e-06, + "loss": 1.4258068799972534, + "step": 10808 + }, + { + "epoch": 1.96768908710294, + "grad_norm": 15.125, + "learning_rate": 2.0689885016838437e-06, + "loss": 1.3199408054351807, + "step": 10810 + }, + { + "epoch": 1.9680531537271322, + "grad_norm": 19.625, + "learning_rate": 2.068310494065074e-06, + "loss": 1.5398144721984863, + "step": 10812 + }, + { + "epoch": 1.9684172203513244, + "grad_norm": 10.25, + "learning_rate": 2.067632623167962e-06, + "loss": 1.44391667842865, + "step": 10814 + }, + { + "epoch": 1.9687812869755166, + "grad_norm": 18.5, + "learning_rate": 2.0669548890919865e-06, + "loss": 1.193953275680542, + "step": 10816 + }, + { + "epoch": 1.9691453535997088, + "grad_norm": 14.8125, + "learning_rate": 2.0662772919365977e-06, + "loss": 1.752061367034912, + "step": 10818 + }, + { + "epoch": 1.969509420223901, + "grad_norm": 22.0, + "learning_rate": 2.065599831801234e-06, + "loss": 1.4170674085617065, + "step": 10820 + }, + { + "epoch": 1.9698734868480932, + "grad_norm": 6.6875, + "learning_rate": 2.0649225087853074e-06, + "loss": 1.3031843900680542, + "step": 10822 + }, + { + "epoch": 1.9702375534722854, + "grad_norm": 5.96875, + "learning_rate": 2.064245322988212e-06, + "loss": 1.3610680103302002, + "step": 10824 + }, + { + "epoch": 1.9706016200964775, + "grad_norm": 14.1875, + "learning_rate": 2.063568274509325e-06, + "loss": 1.5324195623397827, + "step": 10826 + }, + { + "epoch": 1.9709656867206697, + "grad_norm": 27.0, + "learning_rate": 2.0628913634479973e-06, + "loss": 1.9046146869659424, + "step": 10828 + }, + { + "epoch": 1.9713297533448622, + "grad_norm": 190.0, + "learning_rate": 2.0622145899035654e-06, + "loss": 1.325943946838379, + "step": 10830 + }, + { + "epoch": 1.9716938199690544, + "grad_norm": 15.125, + "learning_rate": 2.0615379539753415e-06, + "loss": 1.494063377380371, + "step": 10832 + }, + { + "epoch": 1.9720578865932465, + "grad_norm": 16.5, + "learning_rate": 2.0608614557626186e-06, + "loss": 1.836858868598938, + "step": 10834 + }, + { + "epoch": 1.9724219532174387, + "grad_norm": 28.625, + "learning_rate": 2.0601850953646727e-06, + "loss": 1.7695846557617188, + "step": 10836 + }, + { + "epoch": 1.9727860198416312, + "grad_norm": 21.375, + "learning_rate": 2.0595088728807537e-06, + "loss": 1.029619812965393, + "step": 10838 + }, + { + "epoch": 1.9731500864658233, + "grad_norm": 31.625, + "learning_rate": 2.058832788410098e-06, + "loss": 1.155238389968872, + "step": 10840 + }, + { + "epoch": 1.9735141530900155, + "grad_norm": 13.75, + "learning_rate": 2.058156842051915e-06, + "loss": 1.5167967081069946, + "step": 10842 + }, + { + "epoch": 1.9738782197142077, + "grad_norm": 5.4375, + "learning_rate": 2.057481033905398e-06, + "loss": 1.4010772705078125, + "step": 10844 + }, + { + "epoch": 1.9742422863384, + "grad_norm": 7.28125, + "learning_rate": 2.0568053640697213e-06, + "loss": 1.3923753499984741, + "step": 10846 + }, + { + "epoch": 1.9746063529625921, + "grad_norm": 4.25, + "learning_rate": 2.0561298326440334e-06, + "loss": 1.2189128398895264, + "step": 10848 + }, + { + "epoch": 1.9749704195867843, + "grad_norm": 5.03125, + "learning_rate": 2.055454439727469e-06, + "loss": 1.0579004287719727, + "step": 10850 + }, + { + "epoch": 1.9753344862109765, + "grad_norm": 7.71875, + "learning_rate": 2.0547791854191365e-06, + "loss": 1.241098403930664, + "step": 10852 + }, + { + "epoch": 1.9756985528351687, + "grad_norm": 15.25, + "learning_rate": 2.0541040698181273e-06, + "loss": 1.2325248718261719, + "step": 10854 + }, + { + "epoch": 1.976062619459361, + "grad_norm": 9.8125, + "learning_rate": 2.053429093023514e-06, + "loss": 1.075623631477356, + "step": 10856 + }, + { + "epoch": 1.9764266860835533, + "grad_norm": 15.75, + "learning_rate": 2.052754255134344e-06, + "loss": 1.4465932846069336, + "step": 10858 + }, + { + "epoch": 1.9767907527077455, + "grad_norm": 12.0, + "learning_rate": 2.052079556249649e-06, + "loss": 1.5425323247909546, + "step": 10860 + }, + { + "epoch": 1.9771548193319377, + "grad_norm": 34.5, + "learning_rate": 2.0514049964684375e-06, + "loss": 1.5979787111282349, + "step": 10862 + }, + { + "epoch": 1.9775188859561301, + "grad_norm": 5.40625, + "learning_rate": 2.0507305758896973e-06, + "loss": 1.4477760791778564, + "step": 10864 + }, + { + "epoch": 1.9778829525803223, + "grad_norm": 21.0, + "learning_rate": 2.0500562946124e-06, + "loss": 1.1385356187820435, + "step": 10866 + }, + { + "epoch": 1.9782470192045145, + "grad_norm": 24.375, + "learning_rate": 2.0493821527354896e-06, + "loss": 2.09773850440979, + "step": 10868 + }, + { + "epoch": 1.9786110858287067, + "grad_norm": 16.125, + "learning_rate": 2.0487081503578978e-06, + "loss": 1.9110240936279297, + "step": 10870 + }, + { + "epoch": 1.978975152452899, + "grad_norm": 16.625, + "learning_rate": 2.0480342875785283e-06, + "loss": 1.2511754035949707, + "step": 10872 + }, + { + "epoch": 1.979339219077091, + "grad_norm": 29.0, + "learning_rate": 2.0473605644962687e-06, + "loss": 0.9926354885101318, + "step": 10874 + }, + { + "epoch": 1.9797032857012833, + "grad_norm": 19.5, + "learning_rate": 2.046686981209987e-06, + "loss": 1.4487391710281372, + "step": 10876 + }, + { + "epoch": 1.9800673523254755, + "grad_norm": 93.0, + "learning_rate": 2.0460135378185263e-06, + "loss": 1.4361932277679443, + "step": 10878 + }, + { + "epoch": 1.9804314189496677, + "grad_norm": 13.5625, + "learning_rate": 2.045340234420714e-06, + "loss": 1.3014165163040161, + "step": 10880 + }, + { + "epoch": 1.9807954855738599, + "grad_norm": 14.25, + "learning_rate": 2.044667071115352e-06, + "loss": 0.6043608784675598, + "step": 10882 + }, + { + "epoch": 1.9811595521980523, + "grad_norm": 5.1875, + "learning_rate": 2.0439940480012255e-06, + "loss": 0.8798382878303528, + "step": 10884 + }, + { + "epoch": 1.9815236188222445, + "grad_norm": 6.9375, + "learning_rate": 2.0433211651770998e-06, + "loss": 1.5133492946624756, + "step": 10886 + }, + { + "epoch": 1.9818876854464367, + "grad_norm": 5.65625, + "learning_rate": 2.0426484227417144e-06, + "loss": 1.0015044212341309, + "step": 10888 + }, + { + "epoch": 1.9822517520706289, + "grad_norm": 10.625, + "learning_rate": 2.0419758207937944e-06, + "loss": 1.1347880363464355, + "step": 10890 + }, + { + "epoch": 1.9826158186948213, + "grad_norm": 11.25, + "learning_rate": 2.0413033594320396e-06, + "loss": 1.4811532497406006, + "step": 10892 + }, + { + "epoch": 1.9829798853190135, + "grad_norm": 9.375, + "learning_rate": 2.040631038755131e-06, + "loss": 1.5913841724395752, + "step": 10894 + }, + { + "epoch": 1.9833439519432057, + "grad_norm": 7.3125, + "learning_rate": 2.0399588588617307e-06, + "loss": 1.0283632278442383, + "step": 10896 + }, + { + "epoch": 1.9837080185673979, + "grad_norm": 9.1875, + "learning_rate": 2.0392868198504755e-06, + "loss": 1.5134652853012085, + "step": 10898 + }, + { + "epoch": 1.98407208519159, + "grad_norm": 17.625, + "learning_rate": 2.038614921819988e-06, + "loss": 1.526085376739502, + "step": 10900 + }, + { + "epoch": 1.9844361518157823, + "grad_norm": 20.0, + "learning_rate": 2.0379431648688634e-06, + "loss": 1.7380069494247437, + "step": 10902 + }, + { + "epoch": 1.9848002184399745, + "grad_norm": 14.8125, + "learning_rate": 2.03727154909568e-06, + "loss": 1.3333088159561157, + "step": 10904 + }, + { + "epoch": 1.9851642850641666, + "grad_norm": 12.9375, + "learning_rate": 2.0366000745989965e-06, + "loss": 1.1788612604141235, + "step": 10906 + }, + { + "epoch": 1.9855283516883588, + "grad_norm": 11.75, + "learning_rate": 2.035928741477346e-06, + "loss": 1.420978307723999, + "step": 10908 + }, + { + "epoch": 1.985892418312551, + "grad_norm": 10.0625, + "learning_rate": 2.035257549829248e-06, + "loss": 1.2888288497924805, + "step": 10910 + }, + { + "epoch": 1.9862564849367434, + "grad_norm": 19.5, + "learning_rate": 2.0345864997531934e-06, + "loss": 0.694785475730896, + "step": 10912 + }, + { + "epoch": 1.9866205515609356, + "grad_norm": 19.875, + "learning_rate": 2.0339155913476567e-06, + "loss": 0.4282897710800171, + "step": 10914 + }, + { + "epoch": 1.9869846181851278, + "grad_norm": 17.0, + "learning_rate": 2.0332448247110937e-06, + "loss": 1.7610716819763184, + "step": 10916 + }, + { + "epoch": 1.9873486848093203, + "grad_norm": 10.5625, + "learning_rate": 2.0325741999419328e-06, + "loss": 1.1312470436096191, + "step": 10918 + }, + { + "epoch": 1.9877127514335124, + "grad_norm": 8.375, + "learning_rate": 2.0319037171385892e-06, + "loss": 1.420812726020813, + "step": 10920 + }, + { + "epoch": 1.9880768180577046, + "grad_norm": 16.875, + "learning_rate": 2.0312333763994507e-06, + "loss": 1.3763034343719482, + "step": 10922 + }, + { + "epoch": 1.9884408846818968, + "grad_norm": 67.0, + "learning_rate": 2.0305631778228877e-06, + "loss": 1.2035367488861084, + "step": 10924 + }, + { + "epoch": 1.988804951306089, + "grad_norm": 12.75, + "learning_rate": 2.0298931215072514e-06, + "loss": 1.589197039604187, + "step": 10926 + }, + { + "epoch": 1.9891690179302812, + "grad_norm": 7.65625, + "learning_rate": 2.0292232075508658e-06, + "loss": 1.0951956510543823, + "step": 10928 + }, + { + "epoch": 1.9895330845544734, + "grad_norm": 14.8125, + "learning_rate": 2.028553436052042e-06, + "loss": 1.479056477546692, + "step": 10930 + }, + { + "epoch": 1.9898971511786656, + "grad_norm": 5.875, + "learning_rate": 2.027883807109064e-06, + "loss": 1.2826660871505737, + "step": 10932 + }, + { + "epoch": 1.9902612178028578, + "grad_norm": 11.125, + "learning_rate": 2.0272143208201965e-06, + "loss": 1.260714054107666, + "step": 10934 + }, + { + "epoch": 1.99062528442705, + "grad_norm": 8.875, + "learning_rate": 2.026544977283687e-06, + "loss": 0.886364221572876, + "step": 10936 + }, + { + "epoch": 1.9909893510512424, + "grad_norm": 10.5, + "learning_rate": 2.0258757765977556e-06, + "loss": 1.4551880359649658, + "step": 10938 + }, + { + "epoch": 1.9913534176754346, + "grad_norm": 9.3125, + "learning_rate": 2.025206718860607e-06, + "loss": 1.423505187034607, + "step": 10940 + }, + { + "epoch": 1.9917174842996268, + "grad_norm": 6.78125, + "learning_rate": 2.0245378041704224e-06, + "loss": 1.3690451383590698, + "step": 10942 + }, + { + "epoch": 1.9920815509238192, + "grad_norm": 7.84375, + "learning_rate": 2.02386903262536e-06, + "loss": 1.38144850730896, + "step": 10944 + }, + { + "epoch": 1.9924456175480114, + "grad_norm": 8.0, + "learning_rate": 2.023200404323563e-06, + "loss": 1.5066239833831787, + "step": 10946 + }, + { + "epoch": 1.9928096841722036, + "grad_norm": 18.75, + "learning_rate": 2.0225319193631467e-06, + "loss": 1.362168550491333, + "step": 10948 + }, + { + "epoch": 1.9931737507963958, + "grad_norm": 10.4375, + "learning_rate": 2.021863577842211e-06, + "loss": 1.4317498207092285, + "step": 10950 + }, + { + "epoch": 1.993537817420588, + "grad_norm": 16.75, + "learning_rate": 2.0211953798588305e-06, + "loss": 1.4061001539230347, + "step": 10952 + }, + { + "epoch": 1.9939018840447802, + "grad_norm": 18.125, + "learning_rate": 2.0205273255110606e-06, + "loss": 1.9899253845214844, + "step": 10954 + }, + { + "epoch": 1.9942659506689724, + "grad_norm": 10.125, + "learning_rate": 2.019859414896938e-06, + "loss": 1.497283697128296, + "step": 10956 + }, + { + "epoch": 1.9946300172931646, + "grad_norm": 12.0, + "learning_rate": 2.0191916481144725e-06, + "loss": 1.3275463581085205, + "step": 10958 + }, + { + "epoch": 1.9949940839173568, + "grad_norm": 10.5625, + "learning_rate": 2.018524025261659e-06, + "loss": 1.2029767036437988, + "step": 10960 + }, + { + "epoch": 1.995358150541549, + "grad_norm": 3.21875, + "learning_rate": 2.0178565464364667e-06, + "loss": 1.2263057231903076, + "step": 10962 + }, + { + "epoch": 1.9957222171657414, + "grad_norm": 9.1875, + "learning_rate": 2.0171892117368453e-06, + "loss": 1.4551470279693604, + "step": 10964 + }, + { + "epoch": 1.9960862837899336, + "grad_norm": 12.125, + "learning_rate": 2.016522021260725e-06, + "loss": 1.3572474718093872, + "step": 10966 + }, + { + "epoch": 1.9964503504141258, + "grad_norm": 11.0625, + "learning_rate": 2.0158549751060116e-06, + "loss": 0.8097766637802124, + "step": 10968 + }, + { + "epoch": 1.996814417038318, + "grad_norm": 8.875, + "learning_rate": 2.0151880733705935e-06, + "loss": 1.4493364095687866, + "step": 10970 + }, + { + "epoch": 1.9971784836625104, + "grad_norm": 6.625, + "learning_rate": 2.0145213161523336e-06, + "loss": 1.2721589803695679, + "step": 10972 + }, + { + "epoch": 1.9975425502867026, + "grad_norm": 8.875, + "learning_rate": 2.013854703549076e-06, + "loss": 1.225865364074707, + "step": 10974 + }, + { + "epoch": 1.9979066169108948, + "grad_norm": 12.0, + "learning_rate": 2.013188235658646e-06, + "loss": 1.4053356647491455, + "step": 10976 + }, + { + "epoch": 1.998270683535087, + "grad_norm": 24.625, + "learning_rate": 2.0125219125788416e-06, + "loss": 1.3101599216461182, + "step": 10978 + }, + { + "epoch": 1.9986347501592792, + "grad_norm": 8.0, + "learning_rate": 2.011855734407446e-06, + "loss": 1.398789405822754, + "step": 10980 + }, + { + "epoch": 1.9989988167834714, + "grad_norm": 10.0, + "learning_rate": 2.011189701242216e-06, + "loss": 1.1941096782684326, + "step": 10982 + }, + { + "epoch": 1.9993628834076635, + "grad_norm": 15.1875, + "learning_rate": 2.010523813180889e-06, + "loss": 1.9056274890899658, + "step": 10984 + }, + { + "epoch": 1.9997269500318557, + "grad_norm": 16.0, + "learning_rate": 2.0098580703211845e-06, + "loss": 1.604621410369873, + "step": 10986 + }, + { + "epoch": 2.0, + "grad_norm": 20.375, + "learning_rate": 2.0091924727607935e-06, + "loss": 1.499759316444397, + "step": 10988 + }, + { + "epoch": 2.000364066624192, + "grad_norm": 2.625, + "learning_rate": 2.008527020597394e-06, + "loss": 1.3992056846618652, + "step": 10990 + }, + { + "epoch": 2.0007281332483844, + "grad_norm": 44.0, + "learning_rate": 2.0078617139286346e-06, + "loss": 0.9406244158744812, + "step": 10992 + }, + { + "epoch": 2.0010921998725766, + "grad_norm": 9.25, + "learning_rate": 2.0071965528521476e-06, + "loss": 1.4997491836547852, + "step": 10994 + }, + { + "epoch": 2.0014562664967688, + "grad_norm": 5.40625, + "learning_rate": 2.0065315374655443e-06, + "loss": 1.037550449371338, + "step": 10996 + }, + { + "epoch": 2.001820333120961, + "grad_norm": 17.875, + "learning_rate": 2.00586666786641e-06, + "loss": 1.4874814748764038, + "step": 10998 + }, + { + "epoch": 2.002184399745153, + "grad_norm": 19.0, + "learning_rate": 2.0052019441523153e-06, + "loss": 0.333152174949646, + "step": 11000 + }, + { + "epoch": 2.002548466369346, + "grad_norm": 7.625, + "learning_rate": 2.0045373664208024e-06, + "loss": 1.3715388774871826, + "step": 11002 + }, + { + "epoch": 2.002912532993538, + "grad_norm": 6.15625, + "learning_rate": 2.0038729347693963e-06, + "loss": 1.313523769378662, + "step": 11004 + }, + { + "epoch": 2.00327659961773, + "grad_norm": 10.75, + "learning_rate": 2.0032086492956014e-06, + "loss": 1.3113813400268555, + "step": 11006 + }, + { + "epoch": 2.0036406662419224, + "grad_norm": 21.375, + "learning_rate": 2.0025445100968965e-06, + "loss": 1.8938591480255127, + "step": 11008 + }, + { + "epoch": 2.0040047328661146, + "grad_norm": 75.5, + "learning_rate": 2.0018805172707437e-06, + "loss": 1.2183549404144287, + "step": 11010 + }, + { + "epoch": 2.0043687994903068, + "grad_norm": 13.3125, + "learning_rate": 2.0012166709145793e-06, + "loss": 1.4318671226501465, + "step": 11012 + }, + { + "epoch": 2.004732866114499, + "grad_norm": 11.625, + "learning_rate": 2.0005529711258197e-06, + "loss": 1.3979640007019043, + "step": 11014 + }, + { + "epoch": 2.005096932738691, + "grad_norm": 13.4375, + "learning_rate": 1.9998894180018627e-06, + "loss": 1.4701021909713745, + "step": 11016 + }, + { + "epoch": 2.0054609993628834, + "grad_norm": 11.5, + "learning_rate": 1.999226011640079e-06, + "loss": 1.5419895648956299, + "step": 11018 + }, + { + "epoch": 2.0058250659870756, + "grad_norm": 10.375, + "learning_rate": 1.9985627521378243e-06, + "loss": 1.3981966972351074, + "step": 11020 + }, + { + "epoch": 2.0061891326112677, + "grad_norm": 36.5, + "learning_rate": 1.997899639592426e-06, + "loss": 1.8604216575622559, + "step": 11022 + }, + { + "epoch": 2.00655319923546, + "grad_norm": 4.625, + "learning_rate": 1.9972366741011937e-06, + "loss": 1.0971505641937256, + "step": 11024 + }, + { + "epoch": 2.006917265859652, + "grad_norm": 8.5625, + "learning_rate": 1.9965738557614176e-06, + "loss": 1.357130765914917, + "step": 11026 + }, + { + "epoch": 2.0072813324838443, + "grad_norm": 11.375, + "learning_rate": 1.9959111846703594e-06, + "loss": 1.4518401622772217, + "step": 11028 + }, + { + "epoch": 2.007645399108037, + "grad_norm": 7.65625, + "learning_rate": 1.9952486609252678e-06, + "loss": 1.5343501567840576, + "step": 11030 + }, + { + "epoch": 2.008009465732229, + "grad_norm": 7.90625, + "learning_rate": 1.994586284623362e-06, + "loss": 1.4651836156845093, + "step": 11032 + }, + { + "epoch": 2.0083735323564214, + "grad_norm": 23.75, + "learning_rate": 1.9939240558618444e-06, + "loss": 1.3329646587371826, + "step": 11034 + }, + { + "epoch": 2.0087375989806135, + "grad_norm": 9.3125, + "learning_rate": 1.9932619747378953e-06, + "loss": 1.3475887775421143, + "step": 11036 + }, + { + "epoch": 2.0091016656048057, + "grad_norm": 18.375, + "learning_rate": 1.99260004134867e-06, + "loss": 1.6197303533554077, + "step": 11038 + }, + { + "epoch": 2.009465732228998, + "grad_norm": 8.6875, + "learning_rate": 1.9919382557913074e-06, + "loss": 1.1972200870513916, + "step": 11040 + }, + { + "epoch": 2.00982979885319, + "grad_norm": 9.1875, + "learning_rate": 1.9912766181629196e-06, + "loss": 1.1718153953552246, + "step": 11042 + }, + { + "epoch": 2.0101938654773823, + "grad_norm": 4.21875, + "learning_rate": 1.9906151285605993e-06, + "loss": 0.9909963607788086, + "step": 11044 + }, + { + "epoch": 2.0105579321015745, + "grad_norm": 11.1875, + "learning_rate": 1.98995378708142e-06, + "loss": 1.668025255203247, + "step": 11046 + }, + { + "epoch": 2.0109219987257667, + "grad_norm": 5.96875, + "learning_rate": 1.9892925938224274e-06, + "loss": 1.3886585235595703, + "step": 11048 + }, + { + "epoch": 2.011286065349959, + "grad_norm": 9.9375, + "learning_rate": 1.9886315488806525e-06, + "loss": 1.6192712783813477, + "step": 11050 + }, + { + "epoch": 2.011650131974151, + "grad_norm": 13.875, + "learning_rate": 1.9879706523530977e-06, + "loss": 1.3619799613952637, + "step": 11052 + }, + { + "epoch": 2.0120141985983433, + "grad_norm": 11.4375, + "learning_rate": 1.987309904336748e-06, + "loss": 1.4744445085525513, + "step": 11054 + }, + { + "epoch": 2.012378265222536, + "grad_norm": 5.15625, + "learning_rate": 1.9866493049285674e-06, + "loss": 1.1223396062850952, + "step": 11056 + }, + { + "epoch": 2.012742331846728, + "grad_norm": 4.875, + "learning_rate": 1.985988854225492e-06, + "loss": 0.7826835513114929, + "step": 11058 + }, + { + "epoch": 2.0131063984709203, + "grad_norm": 12.5, + "learning_rate": 1.985328552324446e-06, + "loss": 1.536156415939331, + "step": 11060 + }, + { + "epoch": 2.0134704650951125, + "grad_norm": 9.125, + "learning_rate": 1.984668399322321e-06, + "loss": 1.4070651531219482, + "step": 11062 + }, + { + "epoch": 2.0138345317193047, + "grad_norm": 7.09375, + "learning_rate": 1.984008395315993e-06, + "loss": 1.3552974462509155, + "step": 11064 + }, + { + "epoch": 2.014198598343497, + "grad_norm": 22.375, + "learning_rate": 1.983348540402317e-06, + "loss": 1.4086244106292725, + "step": 11066 + }, + { + "epoch": 2.014562664967689, + "grad_norm": 8.375, + "learning_rate": 1.982688834678121e-06, + "loss": 1.1711201667785645, + "step": 11068 + }, + { + "epoch": 2.0149267315918813, + "grad_norm": 10.1875, + "learning_rate": 1.9820292782402176e-06, + "loss": 1.5485522747039795, + "step": 11070 + }, + { + "epoch": 2.0152907982160735, + "grad_norm": 3.546875, + "learning_rate": 1.9813698711853912e-06, + "loss": 1.1161587238311768, + "step": 11072 + }, + { + "epoch": 2.0156548648402657, + "grad_norm": 10.75, + "learning_rate": 1.980710613610407e-06, + "loss": 1.3998215198516846, + "step": 11074 + }, + { + "epoch": 2.016018931464458, + "grad_norm": 24.75, + "learning_rate": 1.9800515056120114e-06, + "loss": 1.5405805110931396, + "step": 11076 + }, + { + "epoch": 2.01638299808865, + "grad_norm": 19.375, + "learning_rate": 1.979392547286922e-06, + "loss": 1.5268285274505615, + "step": 11078 + }, + { + "epoch": 2.0167470647128423, + "grad_norm": 8.3125, + "learning_rate": 1.978733738731842e-06, + "loss": 1.4839603900909424, + "step": 11080 + }, + { + "epoch": 2.0171111313370345, + "grad_norm": 10.0, + "learning_rate": 1.978075080043446e-06, + "loss": 1.6420220136642456, + "step": 11082 + }, + { + "epoch": 2.017475197961227, + "grad_norm": 3.421875, + "learning_rate": 1.97741657131839e-06, + "loss": 0.9530258178710938, + "step": 11084 + }, + { + "epoch": 2.0178392645854193, + "grad_norm": 22.5, + "learning_rate": 1.976758212653309e-06, + "loss": 2.084479570388794, + "step": 11086 + }, + { + "epoch": 2.0182033312096115, + "grad_norm": 8.125, + "learning_rate": 1.976100004144812e-06, + "loss": 1.3172053098678589, + "step": 11088 + }, + { + "epoch": 2.0185673978338037, + "grad_norm": 10.25, + "learning_rate": 1.9754419458894924e-06, + "loss": 1.398364543914795, + "step": 11090 + }, + { + "epoch": 2.018931464457996, + "grad_norm": 21.625, + "learning_rate": 1.974784037983913e-06, + "loss": 1.422690987586975, + "step": 11092 + }, + { + "epoch": 2.019295531082188, + "grad_norm": 21.875, + "learning_rate": 1.974126280524621e-06, + "loss": 1.3765701055526733, + "step": 11094 + }, + { + "epoch": 2.0196595977063803, + "grad_norm": 24.0, + "learning_rate": 1.9734686736081417e-06, + "loss": 1.33046555519104, + "step": 11096 + }, + { + "epoch": 2.0200236643305725, + "grad_norm": 21.25, + "learning_rate": 1.972811217330972e-06, + "loss": 1.3694443702697754, + "step": 11098 + }, + { + "epoch": 2.0203877309547646, + "grad_norm": 9.0625, + "learning_rate": 1.972153911789596e-06, + "loss": 1.1963346004486084, + "step": 11100 + }, + { + "epoch": 2.020751797578957, + "grad_norm": 19.75, + "learning_rate": 1.9714967570804665e-06, + "loss": 1.5416532754898071, + "step": 11102 + }, + { + "epoch": 2.021115864203149, + "grad_norm": 14.875, + "learning_rate": 1.9708397533000186e-06, + "loss": 1.4636294841766357, + "step": 11104 + }, + { + "epoch": 2.0214799308273412, + "grad_norm": 17.125, + "learning_rate": 1.970182900544668e-06, + "loss": 1.8029705286026, + "step": 11106 + }, + { + "epoch": 2.0218439974515334, + "grad_norm": 7.3125, + "learning_rate": 1.9695261989108017e-06, + "loss": 1.0296516418457031, + "step": 11108 + }, + { + "epoch": 2.022208064075726, + "grad_norm": 5.0, + "learning_rate": 1.9688696484947912e-06, + "loss": 1.0822944641113281, + "step": 11110 + }, + { + "epoch": 2.0225721306999183, + "grad_norm": 6.0, + "learning_rate": 1.9682132493929802e-06, + "loss": 1.46492338180542, + "step": 11112 + }, + { + "epoch": 2.0229361973241105, + "grad_norm": 8.0, + "learning_rate": 1.9675570017016925e-06, + "loss": 1.1143368482589722, + "step": 11114 + }, + { + "epoch": 2.0233002639483026, + "grad_norm": 27.375, + "learning_rate": 1.9669009055172326e-06, + "loss": 1.5624620914459229, + "step": 11116 + }, + { + "epoch": 2.023664330572495, + "grad_norm": 14.0625, + "learning_rate": 1.966244960935876e-06, + "loss": 1.489018440246582, + "step": 11118 + }, + { + "epoch": 2.024028397196687, + "grad_norm": 8.6875, + "learning_rate": 1.965589168053884e-06, + "loss": 1.501712441444397, + "step": 11120 + }, + { + "epoch": 2.0243924638208792, + "grad_norm": 39.5, + "learning_rate": 1.964933526967488e-06, + "loss": 1.4440007209777832, + "step": 11122 + }, + { + "epoch": 2.0247565304450714, + "grad_norm": 8.4375, + "learning_rate": 1.964278037772902e-06, + "loss": 1.3837101459503174, + "step": 11124 + }, + { + "epoch": 2.0251205970692636, + "grad_norm": 11.125, + "learning_rate": 1.9636227005663177e-06, + "loss": 1.5050194263458252, + "step": 11126 + }, + { + "epoch": 2.025484663693456, + "grad_norm": 21.375, + "learning_rate": 1.962967515443901e-06, + "loss": 1.394757628440857, + "step": 11128 + }, + { + "epoch": 2.025848730317648, + "grad_norm": 15.625, + "learning_rate": 1.9623124825017993e-06, + "loss": 0.9417558908462524, + "step": 11130 + }, + { + "epoch": 2.02621279694184, + "grad_norm": 9.25, + "learning_rate": 1.9616576018361355e-06, + "loss": 1.6117267608642578, + "step": 11132 + }, + { + "epoch": 2.0265768635660324, + "grad_norm": 21.0, + "learning_rate": 1.9610028735430096e-06, + "loss": 0.8419802784919739, + "step": 11134 + }, + { + "epoch": 2.026940930190225, + "grad_norm": 7.40625, + "learning_rate": 1.9603482977185028e-06, + "loss": 0.8970146179199219, + "step": 11136 + }, + { + "epoch": 2.0273049968144172, + "grad_norm": 10.0625, + "learning_rate": 1.9596938744586684e-06, + "loss": 0.9122978448867798, + "step": 11138 + }, + { + "epoch": 2.0276690634386094, + "grad_norm": 18.25, + "learning_rate": 1.959039603859543e-06, + "loss": 1.7657095193862915, + "step": 11140 + }, + { + "epoch": 2.0280331300628016, + "grad_norm": 8.75, + "learning_rate": 1.958385486017137e-06, + "loss": 1.3639525175094604, + "step": 11142 + }, + { + "epoch": 2.028397196686994, + "grad_norm": 20.125, + "learning_rate": 1.957731521027439e-06, + "loss": 1.5718979835510254, + "step": 11144 + }, + { + "epoch": 2.028761263311186, + "grad_norm": 10.8125, + "learning_rate": 1.957077708986417e-06, + "loss": 1.4763069152832031, + "step": 11146 + }, + { + "epoch": 2.029125329935378, + "grad_norm": 12.125, + "learning_rate": 1.956424049990014e-06, + "loss": 1.5811841487884521, + "step": 11148 + }, + { + "epoch": 2.0294893965595704, + "grad_norm": 8.8125, + "learning_rate": 1.9557705441341534e-06, + "loss": 1.4419233798980713, + "step": 11150 + }, + { + "epoch": 2.0298534631837626, + "grad_norm": 4.65625, + "learning_rate": 1.9551171915147334e-06, + "loss": 1.1227138042449951, + "step": 11152 + }, + { + "epoch": 2.030217529807955, + "grad_norm": 14.125, + "learning_rate": 1.9544639922276294e-06, + "loss": 1.5154244899749756, + "step": 11154 + }, + { + "epoch": 2.030581596432147, + "grad_norm": 28.125, + "learning_rate": 1.9538109463686994e-06, + "loss": 0.6497099995613098, + "step": 11156 + }, + { + "epoch": 2.030945663056339, + "grad_norm": 10.3125, + "learning_rate": 1.9531580540337715e-06, + "loss": 1.4992166757583618, + "step": 11158 + }, + { + "epoch": 2.0313097296805314, + "grad_norm": 15.875, + "learning_rate": 1.9525053153186583e-06, + "loss": 1.3977303504943848, + "step": 11160 + }, + { + "epoch": 2.0316737963047236, + "grad_norm": 4.84375, + "learning_rate": 1.9518527303191444e-06, + "loss": 1.0993247032165527, + "step": 11162 + }, + { + "epoch": 2.032037862928916, + "grad_norm": 44.0, + "learning_rate": 1.951200299130993e-06, + "loss": 1.545391321182251, + "step": 11164 + }, + { + "epoch": 2.0324019295531084, + "grad_norm": 15.875, + "learning_rate": 1.95054802184995e-06, + "loss": 1.387149453163147, + "step": 11166 + }, + { + "epoch": 2.0327659961773006, + "grad_norm": 11.5625, + "learning_rate": 1.9498958985717294e-06, + "loss": 1.2326574325561523, + "step": 11168 + }, + { + "epoch": 2.0331300628014928, + "grad_norm": 53.5, + "learning_rate": 1.9492439293920317e-06, + "loss": 1.9308810234069824, + "step": 11170 + }, + { + "epoch": 2.033494129425685, + "grad_norm": 13.375, + "learning_rate": 1.9485921144065282e-06, + "loss": 1.0600999593734741, + "step": 11172 + }, + { + "epoch": 2.033858196049877, + "grad_norm": 16.5, + "learning_rate": 1.9479404537108704e-06, + "loss": 1.4514191150665283, + "step": 11174 + }, + { + "epoch": 2.0342222626740694, + "grad_norm": 2.515625, + "learning_rate": 1.9472889474006883e-06, + "loss": 1.0639142990112305, + "step": 11176 + }, + { + "epoch": 2.0345863292982616, + "grad_norm": 17.25, + "learning_rate": 1.946637595571586e-06, + "loss": 1.6877634525299072, + "step": 11178 + }, + { + "epoch": 2.0349503959224537, + "grad_norm": 9.3125, + "learning_rate": 1.945986398319149e-06, + "loss": 1.540773868560791, + "step": 11180 + }, + { + "epoch": 2.035314462546646, + "grad_norm": 19.25, + "learning_rate": 1.9453353557389357e-06, + "loss": 0.2354702353477478, + "step": 11182 + }, + { + "epoch": 2.035678529170838, + "grad_norm": 9.5, + "learning_rate": 1.944684467926484e-06, + "loss": 1.4709038734436035, + "step": 11184 + }, + { + "epoch": 2.0360425957950303, + "grad_norm": 39.0, + "learning_rate": 1.944033734977312e-06, + "loss": 1.360809087753296, + "step": 11186 + }, + { + "epoch": 2.0364066624192225, + "grad_norm": 19.25, + "learning_rate": 1.9433831569869075e-06, + "loss": 1.529122233390808, + "step": 11188 + }, + { + "epoch": 2.036770729043415, + "grad_norm": 5.1875, + "learning_rate": 1.942732734050744e-06, + "loss": 0.7980729341506958, + "step": 11190 + }, + { + "epoch": 2.0371347956676074, + "grad_norm": 7.4375, + "learning_rate": 1.942082466264267e-06, + "loss": 0.9450342059135437, + "step": 11192 + }, + { + "epoch": 2.0374988622917996, + "grad_norm": 16.25, + "learning_rate": 1.9414323537228995e-06, + "loss": 1.3768373727798462, + "step": 11194 + }, + { + "epoch": 2.0378629289159917, + "grad_norm": 5.3125, + "learning_rate": 1.940782396522046e-06, + "loss": 1.2740161418914795, + "step": 11196 + }, + { + "epoch": 2.038226995540184, + "grad_norm": 9.3125, + "learning_rate": 1.9401325947570816e-06, + "loss": 1.0506833791732788, + "step": 11198 + }, + { + "epoch": 2.038591062164376, + "grad_norm": 17.5, + "learning_rate": 1.9394829485233645e-06, + "loss": 1.5903105735778809, + "step": 11200 + }, + { + "epoch": 2.0389551287885683, + "grad_norm": 4.625, + "learning_rate": 1.9388334579162267e-06, + "loss": 0.9030445218086243, + "step": 11202 + }, + { + "epoch": 2.0393191954127605, + "grad_norm": 6.78125, + "learning_rate": 1.9381841230309777e-06, + "loss": 1.2412681579589844, + "step": 11204 + }, + { + "epoch": 2.0396832620369527, + "grad_norm": 12.375, + "learning_rate": 1.9375349439629065e-06, + "loss": 1.6972997188568115, + "step": 11206 + }, + { + "epoch": 2.040047328661145, + "grad_norm": 15.0, + "learning_rate": 1.9368859208072755e-06, + "loss": 1.527509331703186, + "step": 11208 + }, + { + "epoch": 2.040411395285337, + "grad_norm": 3.828125, + "learning_rate": 1.936237053659328e-06, + "loss": 1.1031190156936646, + "step": 11210 + }, + { + "epoch": 2.0407754619095293, + "grad_norm": 9.375, + "learning_rate": 1.935588342614282e-06, + "loss": 1.4453120231628418, + "step": 11212 + }, + { + "epoch": 2.0411395285337215, + "grad_norm": 16.125, + "learning_rate": 1.9349397877673313e-06, + "loss": 1.4460902214050293, + "step": 11214 + }, + { + "epoch": 2.0415035951579137, + "grad_norm": 11.875, + "learning_rate": 1.934291389213653e-06, + "loss": 1.529700517654419, + "step": 11216 + }, + { + "epoch": 2.0418676617821063, + "grad_norm": 14.9375, + "learning_rate": 1.933643147048392e-06, + "loss": 1.2103297710418701, + "step": 11218 + }, + { + "epoch": 2.0422317284062985, + "grad_norm": 7.875, + "learning_rate": 1.9329950613666794e-06, + "loss": 1.4142873287200928, + "step": 11220 + }, + { + "epoch": 2.0425957950304907, + "grad_norm": 10.375, + "learning_rate": 1.932347132263617e-06, + "loss": 1.232865333557129, + "step": 11222 + }, + { + "epoch": 2.042959861654683, + "grad_norm": 21.625, + "learning_rate": 1.9316993598342846e-06, + "loss": 1.9871118068695068, + "step": 11224 + }, + { + "epoch": 2.043323928278875, + "grad_norm": 5.53125, + "learning_rate": 1.931051744173744e-06, + "loss": 0.8346189856529236, + "step": 11226 + }, + { + "epoch": 2.0436879949030673, + "grad_norm": 11.1875, + "learning_rate": 1.930404285377026e-06, + "loss": 1.5212898254394531, + "step": 11228 + }, + { + "epoch": 2.0440520615272595, + "grad_norm": 3.40625, + "learning_rate": 1.9297569835391463e-06, + "loss": 0.8817059993743896, + "step": 11230 + }, + { + "epoch": 2.0444161281514517, + "grad_norm": 14.5625, + "learning_rate": 1.9291098387550907e-06, + "loss": 1.2817615270614624, + "step": 11232 + }, + { + "epoch": 2.044780194775644, + "grad_norm": 50.5, + "learning_rate": 1.9284628511198254e-06, + "loss": 1.738089680671692, + "step": 11234 + }, + { + "epoch": 2.045144261399836, + "grad_norm": 20.125, + "learning_rate": 1.927816020728296e-06, + "loss": 1.577939510345459, + "step": 11236 + }, + { + "epoch": 2.0455083280240283, + "grad_norm": 5.0625, + "learning_rate": 1.927169347675419e-06, + "loss": 1.0617436170578003, + "step": 11238 + }, + { + "epoch": 2.0458723946482205, + "grad_norm": 5.375, + "learning_rate": 1.9265228320560934e-06, + "loss": 1.1472480297088623, + "step": 11240 + }, + { + "epoch": 2.0462364612724127, + "grad_norm": 13.0, + "learning_rate": 1.9258764739651912e-06, + "loss": 2.0814905166625977, + "step": 11242 + }, + { + "epoch": 2.0466005278966053, + "grad_norm": 13.375, + "learning_rate": 1.925230273497563e-06, + "loss": 1.6026432514190674, + "step": 11244 + }, + { + "epoch": 2.0469645945207975, + "grad_norm": 13.1875, + "learning_rate": 1.924584230748038e-06, + "loss": 0.7772610187530518, + "step": 11246 + }, + { + "epoch": 2.0473286611449897, + "grad_norm": 11.125, + "learning_rate": 1.9239383458114173e-06, + "loss": 1.4164042472839355, + "step": 11248 + }, + { + "epoch": 2.047692727769182, + "grad_norm": 21.0, + "learning_rate": 1.9232926187824848e-06, + "loss": 2.03822922706604, + "step": 11250 + }, + { + "epoch": 2.048056794393374, + "grad_norm": 6.90625, + "learning_rate": 1.9226470497559963e-06, + "loss": 1.1361656188964844, + "step": 11252 + }, + { + "epoch": 2.0484208610175663, + "grad_norm": 19.25, + "learning_rate": 1.922001638826686e-06, + "loss": 0.15727046132087708, + "step": 11254 + }, + { + "epoch": 2.0487849276417585, + "grad_norm": 10.5, + "learning_rate": 1.9213563860892687e-06, + "loss": 1.4766499996185303, + "step": 11256 + }, + { + "epoch": 2.0491489942659507, + "grad_norm": 6.34375, + "learning_rate": 1.9207112916384287e-06, + "loss": 0.9160822629928589, + "step": 11258 + }, + { + "epoch": 2.049513060890143, + "grad_norm": 15.0625, + "learning_rate": 1.9200663555688335e-06, + "loss": 1.3961005210876465, + "step": 11260 + }, + { + "epoch": 2.049877127514335, + "grad_norm": 48.25, + "learning_rate": 1.919421577975124e-06, + "loss": 0.7138979434967041, + "step": 11262 + }, + { + "epoch": 2.0502411941385272, + "grad_norm": 3.15625, + "learning_rate": 1.9187769589519174e-06, + "loss": 0.7638934254646301, + "step": 11264 + }, + { + "epoch": 2.0506052607627194, + "grad_norm": 9.0, + "learning_rate": 1.9181324985938123e-06, + "loss": 1.2059357166290283, + "step": 11266 + }, + { + "epoch": 2.0509693273869116, + "grad_norm": 24.875, + "learning_rate": 1.9174881969953766e-06, + "loss": 0.5940987467765808, + "step": 11268 + }, + { + "epoch": 2.051333394011104, + "grad_norm": 15.75, + "learning_rate": 1.916844054251163e-06, + "loss": 1.557391881942749, + "step": 11270 + }, + { + "epoch": 2.0516974606352965, + "grad_norm": 34.0, + "learning_rate": 1.916200070455694e-06, + "loss": 1.5932555198669434, + "step": 11272 + }, + { + "epoch": 2.0520615272594886, + "grad_norm": 25.125, + "learning_rate": 1.9155562457034714e-06, + "loss": 1.8712421655654907, + "step": 11274 + }, + { + "epoch": 2.052425593883681, + "grad_norm": 4.34375, + "learning_rate": 1.914912580088976e-06, + "loss": 1.0210652351379395, + "step": 11276 + }, + { + "epoch": 2.052789660507873, + "grad_norm": 28.5, + "learning_rate": 1.914269073706661e-06, + "loss": 2.2715117931365967, + "step": 11278 + }, + { + "epoch": 2.0531537271320652, + "grad_norm": 20.625, + "learning_rate": 1.913625726650961e-06, + "loss": 2.012913465499878, + "step": 11280 + }, + { + "epoch": 2.0535177937562574, + "grad_norm": 28.375, + "learning_rate": 1.9129825390162817e-06, + "loss": 1.8749830722808838, + "step": 11282 + }, + { + "epoch": 2.0538818603804496, + "grad_norm": 5.90625, + "learning_rate": 1.912339510897009e-06, + "loss": 1.1219078302383423, + "step": 11284 + }, + { + "epoch": 2.054245927004642, + "grad_norm": 9.4375, + "learning_rate": 1.9116966423875067e-06, + "loss": 1.5137276649475098, + "step": 11286 + }, + { + "epoch": 2.054609993628834, + "grad_norm": 13.3125, + "learning_rate": 1.91105393358211e-06, + "loss": 1.8291330337524414, + "step": 11288 + }, + { + "epoch": 2.054974060253026, + "grad_norm": 4.125, + "learning_rate": 1.9104113845751372e-06, + "loss": 1.0564454793930054, + "step": 11290 + }, + { + "epoch": 2.0553381268772184, + "grad_norm": 12.0, + "learning_rate": 1.9097689954608768e-06, + "loss": 1.0356847047805786, + "step": 11292 + }, + { + "epoch": 2.0557021935014106, + "grad_norm": 10.9375, + "learning_rate": 1.9091267663335975e-06, + "loss": 1.1149048805236816, + "step": 11294 + }, + { + "epoch": 2.056066260125603, + "grad_norm": 10.9375, + "learning_rate": 1.908484697287546e-06, + "loss": 1.7005622386932373, + "step": 11296 + }, + { + "epoch": 2.0564303267497954, + "grad_norm": 11.5625, + "learning_rate": 1.90784278841694e-06, + "loss": 1.4318026304244995, + "step": 11298 + }, + { + "epoch": 2.0567943933739876, + "grad_norm": 41.5, + "learning_rate": 1.90720103981598e-06, + "loss": 0.7224030494689941, + "step": 11300 + }, + { + "epoch": 2.05715845999818, + "grad_norm": 8.3125, + "learning_rate": 1.9065594515788382e-06, + "loss": 1.3860046863555908, + "step": 11302 + }, + { + "epoch": 2.057522526622372, + "grad_norm": 11.6875, + "learning_rate": 1.9059180237996646e-06, + "loss": 1.1618001461029053, + "step": 11304 + }, + { + "epoch": 2.057886593246564, + "grad_norm": 18.125, + "learning_rate": 1.9052767565725887e-06, + "loss": 2.2215492725372314, + "step": 11306 + }, + { + "epoch": 2.0582506598707564, + "grad_norm": 26.0, + "learning_rate": 1.9046356499917106e-06, + "loss": 1.507474660873413, + "step": 11308 + }, + { + "epoch": 2.0586147264949486, + "grad_norm": 14.875, + "learning_rate": 1.9039947041511136e-06, + "loss": 0.22918254137039185, + "step": 11310 + }, + { + "epoch": 2.058978793119141, + "grad_norm": 74.5, + "learning_rate": 1.903353919144851e-06, + "loss": 1.789607048034668, + "step": 11312 + }, + { + "epoch": 2.059342859743333, + "grad_norm": 19.75, + "learning_rate": 1.9027132950669557e-06, + "loss": 1.3877757787704468, + "step": 11314 + }, + { + "epoch": 2.059706926367525, + "grad_norm": 16.875, + "learning_rate": 1.902072832011439e-06, + "loss": 1.781526803970337, + "step": 11316 + }, + { + "epoch": 2.0600709929917174, + "grad_norm": 12.4375, + "learning_rate": 1.9014325300722832e-06, + "loss": 1.985640525817871, + "step": 11318 + }, + { + "epoch": 2.0604350596159096, + "grad_norm": 13.8125, + "learning_rate": 1.9007923893434533e-06, + "loss": 1.6570340394973755, + "step": 11320 + }, + { + "epoch": 2.0607991262401018, + "grad_norm": 18.875, + "learning_rate": 1.9001524099188843e-06, + "loss": 1.3645168542861938, + "step": 11322 + }, + { + "epoch": 2.061163192864294, + "grad_norm": 8.0625, + "learning_rate": 1.8995125918924915e-06, + "loss": 1.377673625946045, + "step": 11324 + }, + { + "epoch": 2.0615272594884866, + "grad_norm": 31.25, + "learning_rate": 1.898872935358167e-06, + "loss": 1.4514261484146118, + "step": 11326 + }, + { + "epoch": 2.061891326112679, + "grad_norm": 11.25, + "learning_rate": 1.8982334404097758e-06, + "loss": 1.4237110614776611, + "step": 11328 + }, + { + "epoch": 2.062255392736871, + "grad_norm": 4.28125, + "learning_rate": 1.897594107141163e-06, + "loss": 1.3030797243118286, + "step": 11330 + }, + { + "epoch": 2.062619459361063, + "grad_norm": 12.3125, + "learning_rate": 1.896954935646147e-06, + "loss": 1.4342344999313354, + "step": 11332 + }, + { + "epoch": 2.0629835259852554, + "grad_norm": 9.875, + "learning_rate": 1.8963159260185233e-06, + "loss": 1.4712165594100952, + "step": 11334 + }, + { + "epoch": 2.0633475926094476, + "grad_norm": 19.375, + "learning_rate": 1.8956770783520658e-06, + "loss": 1.9412319660186768, + "step": 11336 + }, + { + "epoch": 2.0637116592336398, + "grad_norm": 11.4375, + "learning_rate": 1.8950383927405202e-06, + "loss": 1.5182976722717285, + "step": 11338 + }, + { + "epoch": 2.064075725857832, + "grad_norm": 20.25, + "learning_rate": 1.894399869277614e-06, + "loss": 1.6106157302856445, + "step": 11340 + }, + { + "epoch": 2.064439792482024, + "grad_norm": 15.0, + "learning_rate": 1.8937615080570448e-06, + "loss": 1.080476999282837, + "step": 11342 + }, + { + "epoch": 2.0648038591062163, + "grad_norm": 10.4375, + "learning_rate": 1.893123309172492e-06, + "loss": 1.4900929927825928, + "step": 11344 + }, + { + "epoch": 2.0651679257304085, + "grad_norm": 17.0, + "learning_rate": 1.8924852727176085e-06, + "loss": 1.6501578092575073, + "step": 11346 + }, + { + "epoch": 2.0655319923546007, + "grad_norm": 37.0, + "learning_rate": 1.8918473987860214e-06, + "loss": 1.373894453048706, + "step": 11348 + }, + { + "epoch": 2.065896058978793, + "grad_norm": 7.40625, + "learning_rate": 1.8912096874713392e-06, + "loss": 1.5894296169281006, + "step": 11350 + }, + { + "epoch": 2.0662601256029856, + "grad_norm": 5.28125, + "learning_rate": 1.89057213886714e-06, + "loss": 1.190136432647705, + "step": 11352 + }, + { + "epoch": 2.0666241922271777, + "grad_norm": 14.1875, + "learning_rate": 1.8899347530669843e-06, + "loss": 1.1848104000091553, + "step": 11354 + }, + { + "epoch": 2.06698825885137, + "grad_norm": 4.34375, + "learning_rate": 1.8892975301644057e-06, + "loss": 1.3796597719192505, + "step": 11356 + }, + { + "epoch": 2.067352325475562, + "grad_norm": 3.84375, + "learning_rate": 1.8886604702529113e-06, + "loss": 0.8591499924659729, + "step": 11358 + }, + { + "epoch": 2.0677163920997543, + "grad_norm": 9.8125, + "learning_rate": 1.8880235734259911e-06, + "loss": 1.543081521987915, + "step": 11360 + }, + { + "epoch": 2.0680804587239465, + "grad_norm": 15.875, + "learning_rate": 1.8873868397771031e-06, + "loss": 1.3929495811462402, + "step": 11362 + }, + { + "epoch": 2.0684445253481387, + "grad_norm": 8.625, + "learning_rate": 1.8867502693996884e-06, + "loss": 1.478927493095398, + "step": 11364 + }, + { + "epoch": 2.068808591972331, + "grad_norm": 11.75, + "learning_rate": 1.8861138623871605e-06, + "loss": 1.5845457315444946, + "step": 11366 + }, + { + "epoch": 2.069172658596523, + "grad_norm": 10.6875, + "learning_rate": 1.885477618832908e-06, + "loss": 1.8207017183303833, + "step": 11368 + }, + { + "epoch": 2.0695367252207153, + "grad_norm": 18.875, + "learning_rate": 1.8848415388302992e-06, + "loss": 1.4906504154205322, + "step": 11370 + }, + { + "epoch": 2.0699007918449075, + "grad_norm": 50.25, + "learning_rate": 1.8842056224726742e-06, + "loss": 1.6885826587677002, + "step": 11372 + }, + { + "epoch": 2.0702648584690997, + "grad_norm": 8.875, + "learning_rate": 1.8835698698533528e-06, + "loss": 1.132664680480957, + "step": 11374 + }, + { + "epoch": 2.070628925093292, + "grad_norm": 32.75, + "learning_rate": 1.8829342810656293e-06, + "loss": 0.6545571088790894, + "step": 11376 + }, + { + "epoch": 2.0709929917174845, + "grad_norm": 11.8125, + "learning_rate": 1.882298856202771e-06, + "loss": 1.419360637664795, + "step": 11378 + }, + { + "epoch": 2.0713570583416767, + "grad_norm": 7.875, + "learning_rate": 1.8816635953580278e-06, + "loss": 1.439836859703064, + "step": 11380 + }, + { + "epoch": 2.071721124965869, + "grad_norm": 34.75, + "learning_rate": 1.8810284986246185e-06, + "loss": 1.1952608823776245, + "step": 11382 + }, + { + "epoch": 2.072085191590061, + "grad_norm": 9.75, + "learning_rate": 1.8803935660957427e-06, + "loss": 1.596019983291626, + "step": 11384 + }, + { + "epoch": 2.0724492582142533, + "grad_norm": 10.25, + "learning_rate": 1.8797587978645743e-06, + "loss": 1.488053798675537, + "step": 11386 + }, + { + "epoch": 2.0728133248384455, + "grad_norm": 2.75, + "learning_rate": 1.879124194024261e-06, + "loss": 0.8447047472000122, + "step": 11388 + }, + { + "epoch": 2.0731773914626377, + "grad_norm": 28.875, + "learning_rate": 1.8784897546679314e-06, + "loss": 1.9565701484680176, + "step": 11390 + }, + { + "epoch": 2.07354145808683, + "grad_norm": 14.375, + "learning_rate": 1.8778554798886837e-06, + "loss": 1.5755465030670166, + "step": 11392 + }, + { + "epoch": 2.073905524711022, + "grad_norm": 6.75, + "learning_rate": 1.8772213697795972e-06, + "loss": 1.4141664505004883, + "step": 11394 + }, + { + "epoch": 2.0742695913352143, + "grad_norm": 6.96875, + "learning_rate": 1.8765874244337254e-06, + "loss": 1.6290302276611328, + "step": 11396 + }, + { + "epoch": 2.0746336579594065, + "grad_norm": 10.625, + "learning_rate": 1.8759536439440944e-06, + "loss": 1.4229499101638794, + "step": 11398 + }, + { + "epoch": 2.0749977245835987, + "grad_norm": 9.375, + "learning_rate": 1.875320028403713e-06, + "loss": 1.6379119157791138, + "step": 11400 + }, + { + "epoch": 2.075361791207791, + "grad_norm": 4.25, + "learning_rate": 1.8746865779055573e-06, + "loss": 1.1094028949737549, + "step": 11402 + }, + { + "epoch": 2.075725857831983, + "grad_norm": 44.75, + "learning_rate": 1.874053292542587e-06, + "loss": 1.3150646686553955, + "step": 11404 + }, + { + "epoch": 2.0760899244561757, + "grad_norm": 21.125, + "learning_rate": 1.8734201724077333e-06, + "loss": 2.1382763385772705, + "step": 11406 + }, + { + "epoch": 2.076453991080368, + "grad_norm": 11.375, + "learning_rate": 1.8727872175939024e-06, + "loss": 1.3936569690704346, + "step": 11408 + }, + { + "epoch": 2.07681805770456, + "grad_norm": 12.6875, + "learning_rate": 1.8721544281939808e-06, + "loss": 1.3287220001220703, + "step": 11410 + }, + { + "epoch": 2.0771821243287523, + "grad_norm": 10.3125, + "learning_rate": 1.8715218043008243e-06, + "loss": 1.4587661027908325, + "step": 11412 + }, + { + "epoch": 2.0775461909529445, + "grad_norm": 11.0625, + "learning_rate": 1.8708893460072708e-06, + "loss": 1.1178665161132812, + "step": 11414 + }, + { + "epoch": 2.0779102575771367, + "grad_norm": 9.5, + "learning_rate": 1.8702570534061304e-06, + "loss": 1.4152162075042725, + "step": 11416 + }, + { + "epoch": 2.078274324201329, + "grad_norm": 23.875, + "learning_rate": 1.8696249265901872e-06, + "loss": 1.875107765197754, + "step": 11418 + }, + { + "epoch": 2.078638390825521, + "grad_norm": 6.96875, + "learning_rate": 1.868992965652207e-06, + "loss": 1.3942734003067017, + "step": 11420 + }, + { + "epoch": 2.0790024574497132, + "grad_norm": 8.9375, + "learning_rate": 1.8683611706849237e-06, + "loss": 1.4798282384872437, + "step": 11422 + }, + { + "epoch": 2.0793665240739054, + "grad_norm": 29.625, + "learning_rate": 1.8677295417810534e-06, + "loss": 1.5472404956817627, + "step": 11424 + }, + { + "epoch": 2.0797305906980976, + "grad_norm": 15.875, + "learning_rate": 1.8670980790332848e-06, + "loss": 1.4509446620941162, + "step": 11426 + }, + { + "epoch": 2.08009465732229, + "grad_norm": 11.3125, + "learning_rate": 1.8664667825342805e-06, + "loss": 1.6181693077087402, + "step": 11428 + }, + { + "epoch": 2.080458723946482, + "grad_norm": 2.640625, + "learning_rate": 1.8658356523766833e-06, + "loss": 1.261971116065979, + "step": 11430 + }, + { + "epoch": 2.080822790570674, + "grad_norm": 18.25, + "learning_rate": 1.8652046886531065e-06, + "loss": 2.1572773456573486, + "step": 11432 + }, + { + "epoch": 2.081186857194867, + "grad_norm": 8.9375, + "learning_rate": 1.8645738914561435e-06, + "loss": 1.513465404510498, + "step": 11434 + }, + { + "epoch": 2.081550923819059, + "grad_norm": 21.25, + "learning_rate": 1.863943260878361e-06, + "loss": 0.9612942934036255, + "step": 11436 + }, + { + "epoch": 2.0819149904432512, + "grad_norm": 27.125, + "learning_rate": 1.8633127970122993e-06, + "loss": 1.0310657024383545, + "step": 11438 + }, + { + "epoch": 2.0822790570674434, + "grad_norm": 8.5, + "learning_rate": 1.86268249995048e-06, + "loss": 1.3813170194625854, + "step": 11440 + }, + { + "epoch": 2.0826431236916356, + "grad_norm": 27.625, + "learning_rate": 1.862052369785393e-06, + "loss": 1.8864072561264038, + "step": 11442 + }, + { + "epoch": 2.083007190315828, + "grad_norm": 9.75, + "learning_rate": 1.8614224066095093e-06, + "loss": 1.608127474784851, + "step": 11444 + }, + { + "epoch": 2.08337125694002, + "grad_norm": 20.75, + "learning_rate": 1.8607926105152744e-06, + "loss": 0.6679896712303162, + "step": 11446 + }, + { + "epoch": 2.083735323564212, + "grad_norm": 14.0, + "learning_rate": 1.8601629815951055e-06, + "loss": 1.3346567153930664, + "step": 11448 + }, + { + "epoch": 2.0840993901884044, + "grad_norm": 45.75, + "learning_rate": 1.8595335199414014e-06, + "loss": 2.1126351356506348, + "step": 11450 + }, + { + "epoch": 2.0844634568125966, + "grad_norm": 12.9375, + "learning_rate": 1.8589042256465295e-06, + "loss": 1.4173626899719238, + "step": 11452 + }, + { + "epoch": 2.084827523436789, + "grad_norm": 136.0, + "learning_rate": 1.8582750988028392e-06, + "loss": 1.9854846000671387, + "step": 11454 + }, + { + "epoch": 2.085191590060981, + "grad_norm": 11.4375, + "learning_rate": 1.8576461395026516e-06, + "loss": 1.5123332738876343, + "step": 11456 + }, + { + "epoch": 2.085555656685173, + "grad_norm": 11.875, + "learning_rate": 1.857017347838262e-06, + "loss": 1.3140833377838135, + "step": 11458 + }, + { + "epoch": 2.085919723309366, + "grad_norm": 10.5625, + "learning_rate": 1.8563887239019459e-06, + "loss": 0.822762131690979, + "step": 11460 + }, + { + "epoch": 2.086283789933558, + "grad_norm": 8.3125, + "learning_rate": 1.8557602677859488e-06, + "loss": 1.1884052753448486, + "step": 11462 + }, + { + "epoch": 2.08664785655775, + "grad_norm": 48.5, + "learning_rate": 1.8551319795824953e-06, + "loss": 0.8358457684516907, + "step": 11464 + }, + { + "epoch": 2.0870119231819424, + "grad_norm": 6.5625, + "learning_rate": 1.8545038593837855e-06, + "loss": 1.085221529006958, + "step": 11466 + }, + { + "epoch": 2.0873759898061346, + "grad_norm": 7.78125, + "learning_rate": 1.85387590728199e-06, + "loss": 1.3838255405426025, + "step": 11468 + }, + { + "epoch": 2.087740056430327, + "grad_norm": 10.3125, + "learning_rate": 1.8532481233692624e-06, + "loss": 1.449247121810913, + "step": 11470 + }, + { + "epoch": 2.088104123054519, + "grad_norm": 10.1875, + "learning_rate": 1.8526205077377231e-06, + "loss": 1.3740648031234741, + "step": 11472 + }, + { + "epoch": 2.088468189678711, + "grad_norm": 29.875, + "learning_rate": 1.8519930604794755e-06, + "loss": 1.2248938083648682, + "step": 11474 + }, + { + "epoch": 2.0888322563029034, + "grad_norm": 16.0, + "learning_rate": 1.8513657816865946e-06, + "loss": 1.5027263164520264, + "step": 11476 + }, + { + "epoch": 2.0891963229270956, + "grad_norm": 9.75, + "learning_rate": 1.8507386714511288e-06, + "loss": 1.4471714496612549, + "step": 11478 + }, + { + "epoch": 2.0895603895512878, + "grad_norm": 18.5, + "learning_rate": 1.8501117298651067e-06, + "loss": 2.0544333457946777, + "step": 11480 + }, + { + "epoch": 2.08992445617548, + "grad_norm": 16.625, + "learning_rate": 1.8494849570205264e-06, + "loss": 1.851595401763916, + "step": 11482 + }, + { + "epoch": 2.090288522799672, + "grad_norm": 16.25, + "learning_rate": 1.8488583530093673e-06, + "loss": 1.9838097095489502, + "step": 11484 + }, + { + "epoch": 2.090652589423865, + "grad_norm": 13.0625, + "learning_rate": 1.8482319179235802e-06, + "loss": 0.8050888180732727, + "step": 11486 + }, + { + "epoch": 2.091016656048057, + "grad_norm": 23.75, + "learning_rate": 1.84760565185509e-06, + "loss": 1.089007019996643, + "step": 11488 + }, + { + "epoch": 2.091380722672249, + "grad_norm": 20.875, + "learning_rate": 1.8469795548958017e-06, + "loss": 1.2898389101028442, + "step": 11490 + }, + { + "epoch": 2.0917447892964414, + "grad_norm": 67.0, + "learning_rate": 1.8463536271375893e-06, + "loss": 0.9535794258117676, + "step": 11492 + }, + { + "epoch": 2.0921088559206336, + "grad_norm": 9.875, + "learning_rate": 1.8457278686723079e-06, + "loss": 1.5751750469207764, + "step": 11494 + }, + { + "epoch": 2.0924729225448258, + "grad_norm": 36.75, + "learning_rate": 1.8451022795917843e-06, + "loss": 0.7495129108428955, + "step": 11496 + }, + { + "epoch": 2.092836989169018, + "grad_norm": 12.8125, + "learning_rate": 1.8444768599878192e-06, + "loss": 1.828484296798706, + "step": 11498 + }, + { + "epoch": 2.09320105579321, + "grad_norm": 11.0, + "learning_rate": 1.843851609952194e-06, + "loss": 1.5117626190185547, + "step": 11500 + }, + { + "epoch": 2.0935651224174023, + "grad_norm": 17.0, + "learning_rate": 1.8432265295766575e-06, + "loss": 1.4968690872192383, + "step": 11502 + }, + { + "epoch": 2.0939291890415945, + "grad_norm": 15.25, + "learning_rate": 1.8426016189529407e-06, + "loss": 0.9843392968177795, + "step": 11504 + }, + { + "epoch": 2.0942932556657867, + "grad_norm": 11.0, + "learning_rate": 1.8419768781727465e-06, + "loss": 1.5971155166625977, + "step": 11506 + }, + { + "epoch": 2.094657322289979, + "grad_norm": 14.1875, + "learning_rate": 1.841352307327751e-06, + "loss": 1.524049997329712, + "step": 11508 + }, + { + "epoch": 2.095021388914171, + "grad_norm": 19.875, + "learning_rate": 1.8407279065096106e-06, + "loss": 1.6609448194503784, + "step": 11510 + }, + { + "epoch": 2.0953854555383633, + "grad_norm": 18.5, + "learning_rate": 1.84010367580995e-06, + "loss": 2.1521108150482178, + "step": 11512 + }, + { + "epoch": 2.095749522162556, + "grad_norm": 28.375, + "learning_rate": 1.839479615320375e-06, + "loss": 1.0616257190704346, + "step": 11514 + }, + { + "epoch": 2.096113588786748, + "grad_norm": 9.75, + "learning_rate": 1.838855725132464e-06, + "loss": 1.5993046760559082, + "step": 11516 + }, + { + "epoch": 2.0964776554109403, + "grad_norm": 15.8125, + "learning_rate": 1.8382320053377681e-06, + "loss": 1.424747109413147, + "step": 11518 + }, + { + "epoch": 2.0968417220351325, + "grad_norm": 4.71875, + "learning_rate": 1.837608456027819e-06, + "loss": 1.0702464580535889, + "step": 11520 + }, + { + "epoch": 2.0972057886593247, + "grad_norm": 7.28125, + "learning_rate": 1.8369850772941166e-06, + "loss": 1.176002860069275, + "step": 11522 + }, + { + "epoch": 2.097569855283517, + "grad_norm": 8.75, + "learning_rate": 1.8363618692281415e-06, + "loss": 1.282271385192871, + "step": 11524 + }, + { + "epoch": 2.097933921907709, + "grad_norm": 13.25, + "learning_rate": 1.8357388319213467e-06, + "loss": 1.453086256980896, + "step": 11526 + }, + { + "epoch": 2.0982979885319013, + "grad_norm": 8.0625, + "learning_rate": 1.835115965465159e-06, + "loss": 1.1277140378952026, + "step": 11528 + }, + { + "epoch": 2.0986620551560935, + "grad_norm": 6.125, + "learning_rate": 1.8344932699509838e-06, + "loss": 1.291964054107666, + "step": 11530 + }, + { + "epoch": 2.0990261217802857, + "grad_norm": 6.03125, + "learning_rate": 1.8338707454701965e-06, + "loss": 1.5287799835205078, + "step": 11532 + }, + { + "epoch": 2.099390188404478, + "grad_norm": 7.90625, + "learning_rate": 1.833248392114152e-06, + "loss": 1.4355192184448242, + "step": 11534 + }, + { + "epoch": 2.09975425502867, + "grad_norm": 3.59375, + "learning_rate": 1.8326262099741782e-06, + "loss": 1.170735239982605, + "step": 11536 + }, + { + "epoch": 2.1001183216528623, + "grad_norm": 15.3125, + "learning_rate": 1.8320041991415757e-06, + "loss": 1.3118293285369873, + "step": 11538 + }, + { + "epoch": 2.100482388277055, + "grad_norm": 22.25, + "learning_rate": 1.8313823597076249e-06, + "loss": 0.5604899525642395, + "step": 11540 + }, + { + "epoch": 2.100846454901247, + "grad_norm": 10.5, + "learning_rate": 1.8307606917635756e-06, + "loss": 1.3302524089813232, + "step": 11542 + }, + { + "epoch": 2.1012105215254393, + "grad_norm": 15.5625, + "learning_rate": 1.8301391954006568e-06, + "loss": 1.8190664052963257, + "step": 11544 + }, + { + "epoch": 2.1015745881496315, + "grad_norm": 14.125, + "learning_rate": 1.8295178707100707e-06, + "loss": 1.8432998657226562, + "step": 11546 + }, + { + "epoch": 2.1019386547738237, + "grad_norm": 19.875, + "learning_rate": 1.8288967177829922e-06, + "loss": 1.3491830825805664, + "step": 11548 + }, + { + "epoch": 2.102302721398016, + "grad_norm": 37.75, + "learning_rate": 1.8282757367105757e-06, + "loss": 1.7569167613983154, + "step": 11550 + }, + { + "epoch": 2.102666788022208, + "grad_norm": 17.75, + "learning_rate": 1.8276549275839451e-06, + "loss": 1.0134644508361816, + "step": 11552 + }, + { + "epoch": 2.1030308546464003, + "grad_norm": 31.375, + "learning_rate": 1.827034290494203e-06, + "loss": 1.3021458387374878, + "step": 11554 + }, + { + "epoch": 2.1033949212705925, + "grad_norm": 7.25, + "learning_rate": 1.8264138255324263e-06, + "loss": 1.326372504234314, + "step": 11556 + }, + { + "epoch": 2.1037589878947847, + "grad_norm": 5.34375, + "learning_rate": 1.8257935327896628e-06, + "loss": 0.8276577591896057, + "step": 11558 + }, + { + "epoch": 2.104123054518977, + "grad_norm": 11.6875, + "learning_rate": 1.8251734123569414e-06, + "loss": 1.4027292728424072, + "step": 11560 + }, + { + "epoch": 2.104487121143169, + "grad_norm": 10.0, + "learning_rate": 1.824553464325259e-06, + "loss": 1.4700348377227783, + "step": 11562 + }, + { + "epoch": 2.1048511877673612, + "grad_norm": 46.5, + "learning_rate": 1.823933688785593e-06, + "loss": 1.4165862798690796, + "step": 11564 + }, + { + "epoch": 2.1052152543915534, + "grad_norm": 9.3125, + "learning_rate": 1.8233140858288922e-06, + "loss": 1.682431697845459, + "step": 11566 + }, + { + "epoch": 2.105579321015746, + "grad_norm": 19.75, + "learning_rate": 1.8226946555460797e-06, + "loss": 2.1505484580993652, + "step": 11568 + }, + { + "epoch": 2.1059433876399383, + "grad_norm": 9.875, + "learning_rate": 1.8220753980280567e-06, + "loss": 1.4033162593841553, + "step": 11570 + }, + { + "epoch": 2.1063074542641305, + "grad_norm": 8.9375, + "learning_rate": 1.8214563133656936e-06, + "loss": 1.615618109703064, + "step": 11572 + }, + { + "epoch": 2.1066715208883227, + "grad_norm": 14.75, + "learning_rate": 1.8208374016498412e-06, + "loss": 1.4349865913391113, + "step": 11574 + }, + { + "epoch": 2.107035587512515, + "grad_norm": 26.25, + "learning_rate": 1.820218662971322e-06, + "loss": 1.258475661277771, + "step": 11576 + }, + { + "epoch": 2.107399654136707, + "grad_norm": 9.125, + "learning_rate": 1.8196000974209315e-06, + "loss": 1.0866085290908813, + "step": 11578 + }, + { + "epoch": 2.1077637207608992, + "grad_norm": 8.4375, + "learning_rate": 1.8189817050894442e-06, + "loss": 1.5266196727752686, + "step": 11580 + }, + { + "epoch": 2.1081277873850914, + "grad_norm": 2.390625, + "learning_rate": 1.8183634860676042e-06, + "loss": 1.2994287014007568, + "step": 11582 + }, + { + "epoch": 2.1084918540092836, + "grad_norm": 15.4375, + "learning_rate": 1.8177454404461344e-06, + "loss": 1.6951885223388672, + "step": 11584 + }, + { + "epoch": 2.108855920633476, + "grad_norm": 31.625, + "learning_rate": 1.8171275683157309e-06, + "loss": 1.796293020248413, + "step": 11586 + }, + { + "epoch": 2.109219987257668, + "grad_norm": 5.28125, + "learning_rate": 1.8165098697670614e-06, + "loss": 1.2194533348083496, + "step": 11588 + }, + { + "epoch": 2.10958405388186, + "grad_norm": 87.0, + "learning_rate": 1.8158923448907733e-06, + "loss": 1.293585181236267, + "step": 11590 + }, + { + "epoch": 2.1099481205060524, + "grad_norm": 7.28125, + "learning_rate": 1.8152749937774837e-06, + "loss": 1.380223035812378, + "step": 11592 + }, + { + "epoch": 2.110312187130245, + "grad_norm": 9.1875, + "learning_rate": 1.8146578165177885e-06, + "loss": 1.4090272188186646, + "step": 11594 + }, + { + "epoch": 2.1106762537544372, + "grad_norm": 43.75, + "learning_rate": 1.8140408132022554e-06, + "loss": 1.5302612781524658, + "step": 11596 + }, + { + "epoch": 2.1110403203786294, + "grad_norm": 13.5, + "learning_rate": 1.8134239839214252e-06, + "loss": 1.4695546627044678, + "step": 11598 + }, + { + "epoch": 2.1114043870028216, + "grad_norm": 10.375, + "learning_rate": 1.8128073287658183e-06, + "loss": 1.1802515983581543, + "step": 11600 + }, + { + "epoch": 2.111768453627014, + "grad_norm": 6.25, + "learning_rate": 1.812190847825923e-06, + "loss": 1.2398107051849365, + "step": 11602 + }, + { + "epoch": 2.112132520251206, + "grad_norm": 10.375, + "learning_rate": 1.8115745411922075e-06, + "loss": 1.4469610452651978, + "step": 11604 + }, + { + "epoch": 2.112496586875398, + "grad_norm": 6.65625, + "learning_rate": 1.8109584089551127e-06, + "loss": 1.2013463973999023, + "step": 11606 + }, + { + "epoch": 2.1128606534995904, + "grad_norm": 8.3125, + "learning_rate": 1.8103424512050516e-06, + "loss": 1.3670557737350464, + "step": 11608 + }, + { + "epoch": 2.1132247201237826, + "grad_norm": 12.375, + "learning_rate": 1.8097266680324155e-06, + "loss": 1.559103012084961, + "step": 11610 + }, + { + "epoch": 2.113588786747975, + "grad_norm": 11.25, + "learning_rate": 1.8091110595275657e-06, + "loss": 1.3754302263259888, + "step": 11612 + }, + { + "epoch": 2.113952853372167, + "grad_norm": 17.625, + "learning_rate": 1.8084956257808424e-06, + "loss": 1.7541909217834473, + "step": 11614 + }, + { + "epoch": 2.114316919996359, + "grad_norm": 11.4375, + "learning_rate": 1.8078803668825582e-06, + "loss": 1.5109436511993408, + "step": 11616 + }, + { + "epoch": 2.1146809866205514, + "grad_norm": 7.90625, + "learning_rate": 1.8072652829229973e-06, + "loss": 1.5829871892929077, + "step": 11618 + }, + { + "epoch": 2.115045053244744, + "grad_norm": 20.0, + "learning_rate": 1.8066503739924237e-06, + "loss": 1.410038709640503, + "step": 11620 + }, + { + "epoch": 2.115409119868936, + "grad_norm": 24.625, + "learning_rate": 1.8060356401810705e-06, + "loss": 1.8690016269683838, + "step": 11622 + }, + { + "epoch": 2.1157731864931284, + "grad_norm": 7.71875, + "learning_rate": 1.8054210815791486e-06, + "loss": 1.4407477378845215, + "step": 11624 + }, + { + "epoch": 2.1161372531173206, + "grad_norm": 41.0, + "learning_rate": 1.804806698276843e-06, + "loss": 1.4998970031738281, + "step": 11626 + }, + { + "epoch": 2.116501319741513, + "grad_norm": 15.0, + "learning_rate": 1.804192490364309e-06, + "loss": 1.6051552295684814, + "step": 11628 + }, + { + "epoch": 2.116865386365705, + "grad_norm": 10.1875, + "learning_rate": 1.8035784579316823e-06, + "loss": 1.5052504539489746, + "step": 11630 + }, + { + "epoch": 2.117229452989897, + "grad_norm": 10.3125, + "learning_rate": 1.8029646010690668e-06, + "loss": 1.2871546745300293, + "step": 11632 + }, + { + "epoch": 2.1175935196140894, + "grad_norm": 13.0625, + "learning_rate": 1.8023509198665457e-06, + "loss": 1.383264183998108, + "step": 11634 + }, + { + "epoch": 2.1179575862382816, + "grad_norm": 9.375, + "learning_rate": 1.8017374144141742e-06, + "loss": 1.477445125579834, + "step": 11636 + }, + { + "epoch": 2.1183216528624738, + "grad_norm": 5.03125, + "learning_rate": 1.8011240848019796e-06, + "loss": 0.9355295300483704, + "step": 11638 + }, + { + "epoch": 2.118685719486666, + "grad_norm": 7.4375, + "learning_rate": 1.8005109311199681e-06, + "loss": 1.4660059213638306, + "step": 11640 + }, + { + "epoch": 2.119049786110858, + "grad_norm": 15.0, + "learning_rate": 1.7998979534581152e-06, + "loss": 2.0586273670196533, + "step": 11642 + }, + { + "epoch": 2.1194138527350503, + "grad_norm": 5.90625, + "learning_rate": 1.7992851519063747e-06, + "loss": 0.9837992191314697, + "step": 11644 + }, + { + "epoch": 2.1197779193592425, + "grad_norm": 8.375, + "learning_rate": 1.7986725265546726e-06, + "loss": 0.9593755602836609, + "step": 11646 + }, + { + "epoch": 2.120141985983435, + "grad_norm": 10.125, + "learning_rate": 1.7980600774929074e-06, + "loss": 0.8791125416755676, + "step": 11648 + }, + { + "epoch": 2.1205060526076274, + "grad_norm": 3.234375, + "learning_rate": 1.7974478048109562e-06, + "loss": 0.9672107696533203, + "step": 11650 + }, + { + "epoch": 2.1208701192318196, + "grad_norm": 14.1875, + "learning_rate": 1.796835708598665e-06, + "loss": 1.4134061336517334, + "step": 11652 + }, + { + "epoch": 2.1212341858560118, + "grad_norm": 9.0625, + "learning_rate": 1.7962237889458577e-06, + "loss": 1.4089746475219727, + "step": 11654 + }, + { + "epoch": 2.121598252480204, + "grad_norm": 16.875, + "learning_rate": 1.7956120459423322e-06, + "loss": 1.520780324935913, + "step": 11656 + }, + { + "epoch": 2.121962319104396, + "grad_norm": 9.875, + "learning_rate": 1.795000479677856e-06, + "loss": 1.5086718797683716, + "step": 11658 + }, + { + "epoch": 2.1223263857285883, + "grad_norm": 34.0, + "learning_rate": 1.7943890902421779e-06, + "loss": 2.127101421356201, + "step": 11660 + }, + { + "epoch": 2.1226904523527805, + "grad_norm": 7.5, + "learning_rate": 1.7937778777250132e-06, + "loss": 1.1552338600158691, + "step": 11662 + }, + { + "epoch": 2.1230545189769727, + "grad_norm": 8.1875, + "learning_rate": 1.7931668422160572e-06, + "loss": 1.1991102695465088, + "step": 11664 + }, + { + "epoch": 2.123418585601165, + "grad_norm": 10.75, + "learning_rate": 1.792555983804977e-06, + "loss": 1.282840609550476, + "step": 11666 + }, + { + "epoch": 2.123782652225357, + "grad_norm": 12.0625, + "learning_rate": 1.7919453025814116e-06, + "loss": 1.4351963996887207, + "step": 11668 + }, + { + "epoch": 2.1241467188495493, + "grad_norm": 7.875, + "learning_rate": 1.7913347986349784e-06, + "loss": 1.3094508647918701, + "step": 11670 + }, + { + "epoch": 2.1245107854737415, + "grad_norm": 2.15625, + "learning_rate": 1.7907244720552641e-06, + "loss": 0.9559741020202637, + "step": 11672 + }, + { + "epoch": 2.1248748520979337, + "grad_norm": 12.25, + "learning_rate": 1.7901143229318333e-06, + "loss": 1.4657357931137085, + "step": 11674 + }, + { + "epoch": 2.1252389187221263, + "grad_norm": 13.875, + "learning_rate": 1.7895043513542228e-06, + "loss": 1.4646859169006348, + "step": 11676 + }, + { + "epoch": 2.1256029853463185, + "grad_norm": 10.0625, + "learning_rate": 1.788894557411942e-06, + "loss": 1.4770485162734985, + "step": 11678 + }, + { + "epoch": 2.1259670519705107, + "grad_norm": 4.34375, + "learning_rate": 1.7882849411944781e-06, + "loss": 1.1873323917388916, + "step": 11680 + }, + { + "epoch": 2.126331118594703, + "grad_norm": 3.765625, + "learning_rate": 1.7876755027912869e-06, + "loss": 1.1944563388824463, + "step": 11682 + }, + { + "epoch": 2.126695185218895, + "grad_norm": 30.5, + "learning_rate": 1.787066242291803e-06, + "loss": 1.4382654428482056, + "step": 11684 + }, + { + "epoch": 2.1270592518430873, + "grad_norm": 6.21875, + "learning_rate": 1.7864571597854338e-06, + "loss": 1.4378302097320557, + "step": 11686 + }, + { + "epoch": 2.1274233184672795, + "grad_norm": 19.25, + "learning_rate": 1.7858482553615564e-06, + "loss": 2.002408266067505, + "step": 11688 + }, + { + "epoch": 2.1277873850914717, + "grad_norm": 12.25, + "learning_rate": 1.7852395291095288e-06, + "loss": 1.3984942436218262, + "step": 11690 + }, + { + "epoch": 2.128151451715664, + "grad_norm": 10.625, + "learning_rate": 1.7846309811186757e-06, + "loss": 1.1072252988815308, + "step": 11692 + }, + { + "epoch": 2.128515518339856, + "grad_norm": 7.4375, + "learning_rate": 1.784022611478301e-06, + "loss": 1.2439546585083008, + "step": 11694 + }, + { + "epoch": 2.1288795849640483, + "grad_norm": 21.5, + "learning_rate": 1.7834144202776815e-06, + "loss": 1.7651779651641846, + "step": 11696 + }, + { + "epoch": 2.1292436515882405, + "grad_norm": 3.59375, + "learning_rate": 1.7828064076060637e-06, + "loss": 0.9770365953445435, + "step": 11698 + }, + { + "epoch": 2.1296077182124327, + "grad_norm": 9.125, + "learning_rate": 1.782198573552674e-06, + "loss": 1.3989923000335693, + "step": 11700 + }, + { + "epoch": 2.1299717848366253, + "grad_norm": 20.5, + "learning_rate": 1.781590918206707e-06, + "loss": 0.5772086381912231, + "step": 11702 + }, + { + "epoch": 2.1303358514608175, + "grad_norm": 10.375, + "learning_rate": 1.7809834416573356e-06, + "loss": 1.3486251831054688, + "step": 11704 + }, + { + "epoch": 2.1306999180850097, + "grad_norm": 25.0, + "learning_rate": 1.780376143993705e-06, + "loss": 0.9902925491333008, + "step": 11706 + }, + { + "epoch": 2.131063984709202, + "grad_norm": 10.125, + "learning_rate": 1.7797690253049307e-06, + "loss": 1.8836326599121094, + "step": 11708 + }, + { + "epoch": 2.131428051333394, + "grad_norm": 22.0, + "learning_rate": 1.7791620856801084e-06, + "loss": 1.520932912826538, + "step": 11710 + }, + { + "epoch": 2.1317921179575863, + "grad_norm": 16.875, + "learning_rate": 1.778555325208301e-06, + "loss": 1.409934401512146, + "step": 11712 + }, + { + "epoch": 2.1321561845817785, + "grad_norm": 9.875, + "learning_rate": 1.7779487439785503e-06, + "loss": 1.7099024057388306, + "step": 11714 + }, + { + "epoch": 2.1325202512059707, + "grad_norm": 14.75, + "learning_rate": 1.7773423420798697e-06, + "loss": 1.797615647315979, + "step": 11716 + }, + { + "epoch": 2.132884317830163, + "grad_norm": 6.59375, + "learning_rate": 1.7767361196012434e-06, + "loss": 0.8760505318641663, + "step": 11718 + }, + { + "epoch": 2.133248384454355, + "grad_norm": 6.03125, + "learning_rate": 1.776130076631636e-06, + "loss": 1.057438611984253, + "step": 11720 + }, + { + "epoch": 2.1336124510785472, + "grad_norm": 23.75, + "learning_rate": 1.7755242132599784e-06, + "loss": 1.3238999843597412, + "step": 11722 + }, + { + "epoch": 2.1339765177027394, + "grad_norm": 10.5625, + "learning_rate": 1.7749185295751808e-06, + "loss": 1.4893951416015625, + "step": 11724 + }, + { + "epoch": 2.1343405843269316, + "grad_norm": 8.4375, + "learning_rate": 1.7743130256661252e-06, + "loss": 1.3726822137832642, + "step": 11726 + }, + { + "epoch": 2.1347046509511243, + "grad_norm": 15.125, + "learning_rate": 1.773707701621664e-06, + "loss": 1.9946835041046143, + "step": 11728 + }, + { + "epoch": 2.1350687175753165, + "grad_norm": 10.125, + "learning_rate": 1.7731025575306294e-06, + "loss": 1.4197754859924316, + "step": 11730 + }, + { + "epoch": 2.1354327841995087, + "grad_norm": 9.5, + "learning_rate": 1.772497593481821e-06, + "loss": 1.4092371463775635, + "step": 11732 + }, + { + "epoch": 2.135796850823701, + "grad_norm": 17.375, + "learning_rate": 1.7718928095640164e-06, + "loss": 0.6433576345443726, + "step": 11734 + }, + { + "epoch": 2.136160917447893, + "grad_norm": 14.4375, + "learning_rate": 1.7712882058659664e-06, + "loss": 1.433774471282959, + "step": 11736 + }, + { + "epoch": 2.1365249840720852, + "grad_norm": 7.875, + "learning_rate": 1.7706837824763907e-06, + "loss": 1.1799637079238892, + "step": 11738 + }, + { + "epoch": 2.1368890506962774, + "grad_norm": 61.25, + "learning_rate": 1.7700795394839893e-06, + "loss": 1.524416446685791, + "step": 11740 + }, + { + "epoch": 2.1372531173204696, + "grad_norm": 11.1875, + "learning_rate": 1.7694754769774298e-06, + "loss": 1.4541678428649902, + "step": 11742 + }, + { + "epoch": 2.137617183944662, + "grad_norm": 13.3125, + "learning_rate": 1.7688715950453579e-06, + "loss": 1.3971558809280396, + "step": 11744 + }, + { + "epoch": 2.137981250568854, + "grad_norm": 26.375, + "learning_rate": 1.7682678937763908e-06, + "loss": 1.2316131591796875, + "step": 11746 + }, + { + "epoch": 2.138345317193046, + "grad_norm": 7.09375, + "learning_rate": 1.767664373259117e-06, + "loss": 1.4580987691879272, + "step": 11748 + }, + { + "epoch": 2.1387093838172384, + "grad_norm": 6.4375, + "learning_rate": 1.7670610335821037e-06, + "loss": 1.0925869941711426, + "step": 11750 + }, + { + "epoch": 2.1390734504414306, + "grad_norm": 7.25, + "learning_rate": 1.7664578748338857e-06, + "loss": 1.4267220497131348, + "step": 11752 + }, + { + "epoch": 2.1394375170656232, + "grad_norm": 15.6875, + "learning_rate": 1.765854897102976e-06, + "loss": 1.486244797706604, + "step": 11754 + }, + { + "epoch": 2.1398015836898154, + "grad_norm": 10.5625, + "learning_rate": 1.7652521004778595e-06, + "loss": 1.3745989799499512, + "step": 11756 + }, + { + "epoch": 2.1401656503140076, + "grad_norm": 14.0625, + "learning_rate": 1.7646494850469917e-06, + "loss": 1.432636022567749, + "step": 11758 + }, + { + "epoch": 2.1405297169382, + "grad_norm": 9.6875, + "learning_rate": 1.764047050898807e-06, + "loss": 1.4629637002944946, + "step": 11760 + }, + { + "epoch": 2.140893783562392, + "grad_norm": 9.8125, + "learning_rate": 1.7634447981217074e-06, + "loss": 1.3738616704940796, + "step": 11762 + }, + { + "epoch": 2.141257850186584, + "grad_norm": 13.9375, + "learning_rate": 1.7628427268040726e-06, + "loss": 1.4615758657455444, + "step": 11764 + }, + { + "epoch": 2.1416219168107764, + "grad_norm": 21.25, + "learning_rate": 1.7622408370342551e-06, + "loss": 1.390002727508545, + "step": 11766 + }, + { + "epoch": 2.1419859834349686, + "grad_norm": 13.125, + "learning_rate": 1.761639128900577e-06, + "loss": 1.6146819591522217, + "step": 11768 + }, + { + "epoch": 2.142350050059161, + "grad_norm": 12.5625, + "learning_rate": 1.7610376024913394e-06, + "loss": 1.960791826248169, + "step": 11770 + }, + { + "epoch": 2.142714116683353, + "grad_norm": 16.625, + "learning_rate": 1.7604362578948111e-06, + "loss": 1.660319209098816, + "step": 11772 + }, + { + "epoch": 2.143078183307545, + "grad_norm": 11.625, + "learning_rate": 1.7598350951992393e-06, + "loss": 1.188616156578064, + "step": 11774 + }, + { + "epoch": 2.1434422499317374, + "grad_norm": 24.75, + "learning_rate": 1.759234114492842e-06, + "loss": 1.0207582712173462, + "step": 11776 + }, + { + "epoch": 2.1438063165559296, + "grad_norm": 10.125, + "learning_rate": 1.7586333158638089e-06, + "loss": 1.4799736738204956, + "step": 11778 + }, + { + "epoch": 2.1441703831801218, + "grad_norm": 16.25, + "learning_rate": 1.758032699400307e-06, + "loss": 1.4395723342895508, + "step": 11780 + }, + { + "epoch": 2.144534449804314, + "grad_norm": 34.5, + "learning_rate": 1.7574322651904718e-06, + "loss": 1.3090319633483887, + "step": 11782 + }, + { + "epoch": 2.1448985164285066, + "grad_norm": 11.875, + "learning_rate": 1.7568320133224168e-06, + "loss": 1.1724014282226562, + "step": 11784 + }, + { + "epoch": 2.145262583052699, + "grad_norm": 33.25, + "learning_rate": 1.7562319438842263e-06, + "loss": 2.2555654048919678, + "step": 11786 + }, + { + "epoch": 2.145626649676891, + "grad_norm": 6.28125, + "learning_rate": 1.7556320569639563e-06, + "loss": 1.5441243648529053, + "step": 11788 + }, + { + "epoch": 2.145990716301083, + "grad_norm": 9.25, + "learning_rate": 1.75503235264964e-06, + "loss": 1.1939061880111694, + "step": 11790 + }, + { + "epoch": 2.1463547829252754, + "grad_norm": 2.859375, + "learning_rate": 1.754432831029279e-06, + "loss": 1.3727716207504272, + "step": 11792 + }, + { + "epoch": 2.1467188495494676, + "grad_norm": 18.125, + "learning_rate": 1.7538334921908535e-06, + "loss": 1.002747654914856, + "step": 11794 + }, + { + "epoch": 2.1470829161736598, + "grad_norm": 24.75, + "learning_rate": 1.7532343362223132e-06, + "loss": 1.9813264608383179, + "step": 11796 + }, + { + "epoch": 2.147446982797852, + "grad_norm": 22.75, + "learning_rate": 1.75263536321158e-06, + "loss": 2.0209238529205322, + "step": 11798 + }, + { + "epoch": 2.147811049422044, + "grad_norm": 14.0625, + "learning_rate": 1.7520365732465532e-06, + "loss": 0.8212375640869141, + "step": 11800 + }, + { + "epoch": 2.1481751160462363, + "grad_norm": 14.5, + "learning_rate": 1.7514379664151005e-06, + "loss": 1.586103081703186, + "step": 11802 + }, + { + "epoch": 2.1485391826704285, + "grad_norm": 8.3125, + "learning_rate": 1.7508395428050672e-06, + "loss": 1.3431743383407593, + "step": 11804 + }, + { + "epoch": 2.1489032492946207, + "grad_norm": 15.0, + "learning_rate": 1.750241302504269e-06, + "loss": 1.408813714981079, + "step": 11806 + }, + { + "epoch": 2.149267315918813, + "grad_norm": 9.3125, + "learning_rate": 1.7496432456004936e-06, + "loss": 1.3541207313537598, + "step": 11808 + }, + { + "epoch": 2.1496313825430056, + "grad_norm": 10.4375, + "learning_rate": 1.749045372181506e-06, + "loss": 1.5129245519638062, + "step": 11810 + }, + { + "epoch": 2.1499954491671978, + "grad_norm": 14.5, + "learning_rate": 1.7484476823350388e-06, + "loss": 1.571367859840393, + "step": 11812 + }, + { + "epoch": 2.15035951579139, + "grad_norm": 7.40625, + "learning_rate": 1.747850176148803e-06, + "loss": 1.1557633876800537, + "step": 11814 + }, + { + "epoch": 2.150723582415582, + "grad_norm": 19.25, + "learning_rate": 1.74725285371048e-06, + "loss": 0.8231582045555115, + "step": 11816 + }, + { + "epoch": 2.1510876490397743, + "grad_norm": 24.0, + "learning_rate": 1.7466557151077224e-06, + "loss": 1.566454291343689, + "step": 11818 + }, + { + "epoch": 2.1514517156639665, + "grad_norm": 4.75, + "learning_rate": 1.746058760428161e-06, + "loss": 1.1101126670837402, + "step": 11820 + }, + { + "epoch": 2.1518157822881587, + "grad_norm": 109.0, + "learning_rate": 1.7454619897593927e-06, + "loss": 1.3713303804397583, + "step": 11822 + }, + { + "epoch": 2.152179848912351, + "grad_norm": 8.375, + "learning_rate": 1.744865403188994e-06, + "loss": 0.9665773510932922, + "step": 11824 + }, + { + "epoch": 2.152543915536543, + "grad_norm": 21.875, + "learning_rate": 1.7442690008045119e-06, + "loss": 1.9788179397583008, + "step": 11826 + }, + { + "epoch": 2.1529079821607353, + "grad_norm": 6.03125, + "learning_rate": 1.743672782693463e-06, + "loss": 1.493281364440918, + "step": 11828 + }, + { + "epoch": 2.1532720487849275, + "grad_norm": 8.125, + "learning_rate": 1.7430767489433436e-06, + "loss": 1.3502285480499268, + "step": 11830 + }, + { + "epoch": 2.1536361154091197, + "grad_norm": 9.375, + "learning_rate": 1.742480899641616e-06, + "loss": 1.347835659980774, + "step": 11832 + }, + { + "epoch": 2.154000182033312, + "grad_norm": 9.75, + "learning_rate": 1.7418852348757203e-06, + "loss": 1.5300534963607788, + "step": 11834 + }, + { + "epoch": 2.1543642486575045, + "grad_norm": 13.4375, + "learning_rate": 1.7412897547330687e-06, + "loss": 1.653989553451538, + "step": 11836 + }, + { + "epoch": 2.1547283152816967, + "grad_norm": 21.75, + "learning_rate": 1.7406944593010434e-06, + "loss": 2.0581629276275635, + "step": 11838 + }, + { + "epoch": 2.155092381905889, + "grad_norm": 12.5625, + "learning_rate": 1.7400993486670038e-06, + "loss": 1.4431942701339722, + "step": 11840 + }, + { + "epoch": 2.155456448530081, + "grad_norm": 7.90625, + "learning_rate": 1.7395044229182773e-06, + "loss": 1.3695716857910156, + "step": 11842 + }, + { + "epoch": 2.1558205151542733, + "grad_norm": 21.0, + "learning_rate": 1.7389096821421691e-06, + "loss": 1.3919975757598877, + "step": 11844 + }, + { + "epoch": 2.1561845817784655, + "grad_norm": 12.25, + "learning_rate": 1.738315126425955e-06, + "loss": 1.3687949180603027, + "step": 11846 + }, + { + "epoch": 2.1565486484026577, + "grad_norm": 3.953125, + "learning_rate": 1.7377207558568822e-06, + "loss": 1.2273038625717163, + "step": 11848 + }, + { + "epoch": 2.15691271502685, + "grad_norm": 83.0, + "learning_rate": 1.7371265705221735e-06, + "loss": 1.2729640007019043, + "step": 11850 + }, + { + "epoch": 2.157276781651042, + "grad_norm": 18.875, + "learning_rate": 1.7365325705090213e-06, + "loss": 0.47645875811576843, + "step": 11852 + }, + { + "epoch": 2.1576408482752343, + "grad_norm": 15.0625, + "learning_rate": 1.735938755904595e-06, + "loss": 1.4482628107070923, + "step": 11854 + }, + { + "epoch": 2.1580049148994265, + "grad_norm": 5.59375, + "learning_rate": 1.735345126796034e-06, + "loss": 1.3984953165054321, + "step": 11856 + }, + { + "epoch": 2.1583689815236187, + "grad_norm": 5.15625, + "learning_rate": 1.7347516832704492e-06, + "loss": 1.4285850524902344, + "step": 11858 + }, + { + "epoch": 2.158733048147811, + "grad_norm": 5.90625, + "learning_rate": 1.7341584254149285e-06, + "loss": 1.210035800933838, + "step": 11860 + }, + { + "epoch": 2.1590971147720035, + "grad_norm": 8.1875, + "learning_rate": 1.7335653533165275e-06, + "loss": 1.5598353147506714, + "step": 11862 + }, + { + "epoch": 2.1594611813961957, + "grad_norm": 18.125, + "learning_rate": 1.7329724670622793e-06, + "loss": 1.3323031663894653, + "step": 11864 + }, + { + "epoch": 2.159825248020388, + "grad_norm": 8.0625, + "learning_rate": 1.7323797667391877e-06, + "loss": 1.4360469579696655, + "step": 11866 + }, + { + "epoch": 2.16018931464458, + "grad_norm": 31.0, + "learning_rate": 1.7317872524342262e-06, + "loss": 1.5811915397644043, + "step": 11868 + }, + { + "epoch": 2.1605533812687723, + "grad_norm": 18.875, + "learning_rate": 1.7311949242343474e-06, + "loss": 1.7797983884811401, + "step": 11870 + }, + { + "epoch": 2.1609174478929645, + "grad_norm": 24.5, + "learning_rate": 1.7306027822264699e-06, + "loss": 1.5695608854293823, + "step": 11872 + }, + { + "epoch": 2.1612815145171567, + "grad_norm": 12.875, + "learning_rate": 1.7300108264974907e-06, + "loss": 2.051684856414795, + "step": 11874 + }, + { + "epoch": 2.161645581141349, + "grad_norm": 11.125, + "learning_rate": 1.7294190571342762e-06, + "loss": 1.3239431381225586, + "step": 11876 + }, + { + "epoch": 2.162009647765541, + "grad_norm": 4.625, + "learning_rate": 1.728827474223665e-06, + "loss": 1.0789450407028198, + "step": 11878 + }, + { + "epoch": 2.1623737143897332, + "grad_norm": 4.40625, + "learning_rate": 1.7282360778524712e-06, + "loss": 1.1580252647399902, + "step": 11880 + }, + { + "epoch": 2.1627377810139254, + "grad_norm": 5.125, + "learning_rate": 1.7276448681074778e-06, + "loss": 0.9379229545593262, + "step": 11882 + }, + { + "epoch": 2.1631018476381176, + "grad_norm": 6.40625, + "learning_rate": 1.7270538450754443e-06, + "loss": 1.0298399925231934, + "step": 11884 + }, + { + "epoch": 2.16346591426231, + "grad_norm": 11.875, + "learning_rate": 1.7264630088431006e-06, + "loss": 1.3942416906356812, + "step": 11886 + }, + { + "epoch": 2.163829980886502, + "grad_norm": 24.125, + "learning_rate": 1.7258723594971483e-06, + "loss": 1.5015668869018555, + "step": 11888 + }, + { + "epoch": 2.1641940475106947, + "grad_norm": 21.375, + "learning_rate": 1.725281897124265e-06, + "loss": 1.5184601545333862, + "step": 11890 + }, + { + "epoch": 2.164558114134887, + "grad_norm": 16.25, + "learning_rate": 1.7246916218110956e-06, + "loss": 1.4565767049789429, + "step": 11892 + }, + { + "epoch": 2.164922180759079, + "grad_norm": 20.0, + "learning_rate": 1.7241015336442629e-06, + "loss": 1.44452702999115, + "step": 11894 + }, + { + "epoch": 2.1652862473832712, + "grad_norm": 4.0625, + "learning_rate": 1.7235116327103607e-06, + "loss": 1.1152594089508057, + "step": 11896 + }, + { + "epoch": 2.1656503140074634, + "grad_norm": 8.75, + "learning_rate": 1.7229219190959515e-06, + "loss": 1.0509376525878906, + "step": 11898 + }, + { + "epoch": 2.1660143806316556, + "grad_norm": 11.125, + "learning_rate": 1.7223323928875762e-06, + "loss": 0.9874260425567627, + "step": 11900 + }, + { + "epoch": 2.166378447255848, + "grad_norm": 7.84375, + "learning_rate": 1.7217430541717434e-06, + "loss": 1.4655461311340332, + "step": 11902 + }, + { + "epoch": 2.16674251388004, + "grad_norm": 19.375, + "learning_rate": 1.7211539030349379e-06, + "loss": 1.5268408060073853, + "step": 11904 + }, + { + "epoch": 2.167106580504232, + "grad_norm": 18.375, + "learning_rate": 1.7205649395636147e-06, + "loss": 1.2957861423492432, + "step": 11906 + }, + { + "epoch": 2.1674706471284244, + "grad_norm": 8.5625, + "learning_rate": 1.7199761638442003e-06, + "loss": 1.5992405414581299, + "step": 11908 + }, + { + "epoch": 2.1678347137526166, + "grad_norm": 14.3125, + "learning_rate": 1.7193875759630976e-06, + "loss": 1.5022704601287842, + "step": 11910 + }, + { + "epoch": 2.168198780376809, + "grad_norm": 10.1875, + "learning_rate": 1.7187991760066769e-06, + "loss": 1.1741129159927368, + "step": 11912 + }, + { + "epoch": 2.168562847001001, + "grad_norm": 11.9375, + "learning_rate": 1.7182109640612857e-06, + "loss": 1.4434632062911987, + "step": 11914 + }, + { + "epoch": 2.168926913625193, + "grad_norm": 10.9375, + "learning_rate": 1.7176229402132417e-06, + "loss": 1.5435893535614014, + "step": 11916 + }, + { + "epoch": 2.169290980249386, + "grad_norm": 17.25, + "learning_rate": 1.7170351045488326e-06, + "loss": 1.6838984489440918, + "step": 11918 + }, + { + "epoch": 2.169655046873578, + "grad_norm": 20.375, + "learning_rate": 1.7164474571543238e-06, + "loss": 1.8975512981414795, + "step": 11920 + }, + { + "epoch": 2.17001911349777, + "grad_norm": 12.0, + "learning_rate": 1.7158599981159477e-06, + "loss": 1.467161774635315, + "step": 11922 + }, + { + "epoch": 2.1703831801219624, + "grad_norm": 5.125, + "learning_rate": 1.7152727275199132e-06, + "loss": 1.2082945108413696, + "step": 11924 + }, + { + "epoch": 2.1707472467461546, + "grad_norm": 13.5, + "learning_rate": 1.7146856454524003e-06, + "loss": 1.388268232345581, + "step": 11926 + }, + { + "epoch": 2.171111313370347, + "grad_norm": 9.75, + "learning_rate": 1.7140987519995584e-06, + "loss": 1.3517110347747803, + "step": 11928 + }, + { + "epoch": 2.171475379994539, + "grad_norm": 29.25, + "learning_rate": 1.7135120472475148e-06, + "loss": 1.5776770114898682, + "step": 11930 + }, + { + "epoch": 2.171839446618731, + "grad_norm": 28.25, + "learning_rate": 1.7129255312823634e-06, + "loss": 1.5649540424346924, + "step": 11932 + }, + { + "epoch": 2.1722035132429234, + "grad_norm": 11.875, + "learning_rate": 1.7123392041901748e-06, + "loss": 1.4681792259216309, + "step": 11934 + }, + { + "epoch": 2.1725675798671156, + "grad_norm": 10.6875, + "learning_rate": 1.7117530660569904e-06, + "loss": 1.0693531036376953, + "step": 11936 + }, + { + "epoch": 2.1729316464913078, + "grad_norm": 8.8125, + "learning_rate": 1.711167116968821e-06, + "loss": 1.4105781316757202, + "step": 11938 + }, + { + "epoch": 2.1732957131155, + "grad_norm": 14.3125, + "learning_rate": 1.7105813570116558e-06, + "loss": 1.5430963039398193, + "step": 11940 + }, + { + "epoch": 2.173659779739692, + "grad_norm": 21.125, + "learning_rate": 1.7099957862714492e-06, + "loss": 1.362628698348999, + "step": 11942 + }, + { + "epoch": 2.174023846363885, + "grad_norm": 9.25, + "learning_rate": 1.7094104048341336e-06, + "loss": 1.5510234832763672, + "step": 11944 + }, + { + "epoch": 2.174387912988077, + "grad_norm": 26.375, + "learning_rate": 1.708825212785612e-06, + "loss": 1.6540273427963257, + "step": 11946 + }, + { + "epoch": 2.174751979612269, + "grad_norm": 21.375, + "learning_rate": 1.7082402102117559e-06, + "loss": 1.9103442430496216, + "step": 11948 + }, + { + "epoch": 2.1751160462364614, + "grad_norm": 12.625, + "learning_rate": 1.7076553971984156e-06, + "loss": 1.4040323495864868, + "step": 11950 + }, + { + "epoch": 2.1754801128606536, + "grad_norm": 12.3125, + "learning_rate": 1.7070707738314068e-06, + "loss": 1.4501792192459106, + "step": 11952 + }, + { + "epoch": 2.1758441794848458, + "grad_norm": 35.75, + "learning_rate": 1.706486340196523e-06, + "loss": 1.5558557510375977, + "step": 11954 + }, + { + "epoch": 2.176208246109038, + "grad_norm": 25.0, + "learning_rate": 1.705902096379527e-06, + "loss": 2.25014591217041, + "step": 11956 + }, + { + "epoch": 2.17657231273323, + "grad_norm": 7.4375, + "learning_rate": 1.7053180424661525e-06, + "loss": 1.3083001375198364, + "step": 11958 + }, + { + "epoch": 2.1769363793574223, + "grad_norm": 5.65625, + "learning_rate": 1.70473417854211e-06, + "loss": 1.1901150941848755, + "step": 11960 + }, + { + "epoch": 2.1773004459816145, + "grad_norm": 9.1875, + "learning_rate": 1.7041505046930762e-06, + "loss": 1.3908497095108032, + "step": 11962 + }, + { + "epoch": 2.1776645126058067, + "grad_norm": 12.0, + "learning_rate": 1.7035670210047044e-06, + "loss": 1.6196521520614624, + "step": 11964 + }, + { + "epoch": 2.178028579229999, + "grad_norm": 25.0, + "learning_rate": 1.7029837275626198e-06, + "loss": 1.610152244567871, + "step": 11966 + }, + { + "epoch": 2.178392645854191, + "grad_norm": 15.8125, + "learning_rate": 1.7024006244524148e-06, + "loss": 1.960754156112671, + "step": 11968 + }, + { + "epoch": 2.1787567124783838, + "grad_norm": 7.5, + "learning_rate": 1.7018177117596612e-06, + "loss": 1.3356890678405762, + "step": 11970 + }, + { + "epoch": 2.179120779102576, + "grad_norm": 7.65625, + "learning_rate": 1.7012349895698957e-06, + "loss": 1.3218178749084473, + "step": 11972 + }, + { + "epoch": 2.179484845726768, + "grad_norm": 12.3125, + "learning_rate": 1.7006524579686329e-06, + "loss": 1.16715407371521, + "step": 11974 + }, + { + "epoch": 2.1798489123509603, + "grad_norm": 30.375, + "learning_rate": 1.700070117041357e-06, + "loss": 1.160258412361145, + "step": 11976 + }, + { + "epoch": 2.1802129789751525, + "grad_norm": 8.0625, + "learning_rate": 1.6994879668735211e-06, + "loss": 1.2591595649719238, + "step": 11978 + }, + { + "epoch": 2.1805770455993447, + "grad_norm": 9.4375, + "learning_rate": 1.6989060075505575e-06, + "loss": 1.5157290697097778, + "step": 11980 + }, + { + "epoch": 2.180941112223537, + "grad_norm": 13.0625, + "learning_rate": 1.698324239157863e-06, + "loss": 1.4201197624206543, + "step": 11982 + }, + { + "epoch": 2.181305178847729, + "grad_norm": 25.625, + "learning_rate": 1.6977426617808118e-06, + "loss": 1.615785837173462, + "step": 11984 + }, + { + "epoch": 2.1816692454719213, + "grad_norm": 28.5, + "learning_rate": 1.6971612755047485e-06, + "loss": 2.0562760829925537, + "step": 11986 + }, + { + "epoch": 2.1820333120961135, + "grad_norm": 8.75, + "learning_rate": 1.696580080414986e-06, + "loss": 1.227978229522705, + "step": 11988 + }, + { + "epoch": 2.1823973787203057, + "grad_norm": 12.9375, + "learning_rate": 1.6959990765968162e-06, + "loss": 1.1672794818878174, + "step": 11990 + }, + { + "epoch": 2.182761445344498, + "grad_norm": 11.5625, + "learning_rate": 1.6954182641354957e-06, + "loss": 0.8809859156608582, + "step": 11992 + }, + { + "epoch": 2.18312551196869, + "grad_norm": 7.3125, + "learning_rate": 1.6948376431162588e-06, + "loss": 1.0424938201904297, + "step": 11994 + }, + { + "epoch": 2.1834895785928827, + "grad_norm": 11.625, + "learning_rate": 1.6942572136243087e-06, + "loss": 0.13582243025302887, + "step": 11996 + }, + { + "epoch": 2.183853645217075, + "grad_norm": 5.0, + "learning_rate": 1.6936769757448202e-06, + "loss": 0.4949483871459961, + "step": 11998 + }, + { + "epoch": 2.184217711841267, + "grad_norm": 32.75, + "learning_rate": 1.693096929562942e-06, + "loss": 1.5824527740478516, + "step": 12000 + }, + { + "epoch": 2.1845817784654593, + "grad_norm": 20.0, + "learning_rate": 1.6925170751637921e-06, + "loss": 1.7683151960372925, + "step": 12002 + }, + { + "epoch": 2.1849458450896515, + "grad_norm": 9.625, + "learning_rate": 1.691937412632463e-06, + "loss": 1.6637481451034546, + "step": 12004 + }, + { + "epoch": 2.1853099117138437, + "grad_norm": 35.0, + "learning_rate": 1.6913579420540182e-06, + "loss": 2.1081438064575195, + "step": 12006 + }, + { + "epoch": 2.185673978338036, + "grad_norm": 10.3125, + "learning_rate": 1.690778663513491e-06, + "loss": 1.7586209774017334, + "step": 12008 + }, + { + "epoch": 2.186038044962228, + "grad_norm": 14.0, + "learning_rate": 1.69019957709589e-06, + "loss": 1.1529549360275269, + "step": 12010 + }, + { + "epoch": 2.1864021115864203, + "grad_norm": 10.75, + "learning_rate": 1.6896206828861916e-06, + "loss": 1.8441519737243652, + "step": 12012 + }, + { + "epoch": 2.1867661782106125, + "grad_norm": 7.3125, + "learning_rate": 1.6890419809693484e-06, + "loss": 1.4527759552001953, + "step": 12014 + }, + { + "epoch": 2.1871302448348047, + "grad_norm": 12.8125, + "learning_rate": 1.6884634714302823e-06, + "loss": 1.189206600189209, + "step": 12016 + }, + { + "epoch": 2.187494311458997, + "grad_norm": 9.0625, + "learning_rate": 1.687885154353885e-06, + "loss": 1.6650750637054443, + "step": 12018 + }, + { + "epoch": 2.187858378083189, + "grad_norm": 18.25, + "learning_rate": 1.6873070298250255e-06, + "loss": 1.5120407342910767, + "step": 12020 + }, + { + "epoch": 2.1882224447073813, + "grad_norm": 8.0625, + "learning_rate": 1.6867290979285377e-06, + "loss": 1.513980746269226, + "step": 12022 + }, + { + "epoch": 2.1885865113315734, + "grad_norm": 13.0625, + "learning_rate": 1.6861513587492335e-06, + "loss": 1.393868327140808, + "step": 12024 + }, + { + "epoch": 2.188950577955766, + "grad_norm": 11.9375, + "learning_rate": 1.6855738123718935e-06, + "loss": 1.5611568689346313, + "step": 12026 + }, + { + "epoch": 2.1893146445799583, + "grad_norm": 13.375, + "learning_rate": 1.684996458881268e-06, + "loss": 1.4893863201141357, + "step": 12028 + }, + { + "epoch": 2.1896787112041505, + "grad_norm": 10.6875, + "learning_rate": 1.6844192983620846e-06, + "loss": 1.3925065994262695, + "step": 12030 + }, + { + "epoch": 2.1900427778283427, + "grad_norm": 48.75, + "learning_rate": 1.6838423308990362e-06, + "loss": 1.2226086854934692, + "step": 12032 + }, + { + "epoch": 2.190406844452535, + "grad_norm": 30.875, + "learning_rate": 1.6832655565767924e-06, + "loss": 1.2104977369308472, + "step": 12034 + }, + { + "epoch": 2.190770911076727, + "grad_norm": 22.5, + "learning_rate": 1.6826889754799925e-06, + "loss": 1.4582310914993286, + "step": 12036 + }, + { + "epoch": 2.1911349777009193, + "grad_norm": 10.8125, + "learning_rate": 1.6821125876932456e-06, + "loss": 1.4852790832519531, + "step": 12038 + }, + { + "epoch": 2.1914990443251114, + "grad_norm": 10.25, + "learning_rate": 1.6815363933011368e-06, + "loss": 1.387447476387024, + "step": 12040 + }, + { + "epoch": 2.1918631109493036, + "grad_norm": 14.75, + "learning_rate": 1.6809603923882178e-06, + "loss": 1.4417144060134888, + "step": 12042 + }, + { + "epoch": 2.192227177573496, + "grad_norm": 13.0, + "learning_rate": 1.6803845850390166e-06, + "loss": 1.4992517232894897, + "step": 12044 + }, + { + "epoch": 2.192591244197688, + "grad_norm": 15.25, + "learning_rate": 1.6798089713380297e-06, + "loss": 1.6413629055023193, + "step": 12046 + }, + { + "epoch": 2.19295531082188, + "grad_norm": 13.6875, + "learning_rate": 1.6792335513697248e-06, + "loss": 1.8382233381271362, + "step": 12048 + }, + { + "epoch": 2.1933193774460724, + "grad_norm": 9.4375, + "learning_rate": 1.6786583252185451e-06, + "loss": 1.2685561180114746, + "step": 12050 + }, + { + "epoch": 2.193683444070265, + "grad_norm": 3.046875, + "learning_rate": 1.6780832929688998e-06, + "loss": 0.8487936854362488, + "step": 12052 + }, + { + "epoch": 2.1940475106944572, + "grad_norm": 11.1875, + "learning_rate": 1.6775084547051748e-06, + "loss": 1.2040059566497803, + "step": 12054 + }, + { + "epoch": 2.1944115773186494, + "grad_norm": 13.75, + "learning_rate": 1.676933810511725e-06, + "loss": 1.4764724969863892, + "step": 12056 + }, + { + "epoch": 2.1947756439428416, + "grad_norm": 7.9375, + "learning_rate": 1.6763593604728755e-06, + "loss": 1.4257529973983765, + "step": 12058 + }, + { + "epoch": 2.195139710567034, + "grad_norm": 5.59375, + "learning_rate": 1.6757851046729267e-06, + "loss": 1.0820488929748535, + "step": 12060 + }, + { + "epoch": 2.195503777191226, + "grad_norm": 8.125, + "learning_rate": 1.675211043196146e-06, + "loss": 1.195681095123291, + "step": 12062 + }, + { + "epoch": 2.195867843815418, + "grad_norm": 7.0, + "learning_rate": 1.6746371761267765e-06, + "loss": 1.1074368953704834, + "step": 12064 + }, + { + "epoch": 2.1962319104396104, + "grad_norm": 8.5, + "learning_rate": 1.6740635035490305e-06, + "loss": 1.6307562589645386, + "step": 12066 + }, + { + "epoch": 2.1965959770638026, + "grad_norm": 34.5, + "learning_rate": 1.673490025547091e-06, + "loss": 1.4520540237426758, + "step": 12068 + }, + { + "epoch": 2.196960043687995, + "grad_norm": 26.75, + "learning_rate": 1.672916742205115e-06, + "loss": 1.390350580215454, + "step": 12070 + }, + { + "epoch": 2.197324110312187, + "grad_norm": 7.375, + "learning_rate": 1.6723436536072283e-06, + "loss": 1.3479061126708984, + "step": 12072 + }, + { + "epoch": 2.197688176936379, + "grad_norm": 7.90625, + "learning_rate": 1.6717707598375302e-06, + "loss": 1.0836222171783447, + "step": 12074 + }, + { + "epoch": 2.1980522435605714, + "grad_norm": 12.875, + "learning_rate": 1.671198060980091e-06, + "loss": 1.4379175901412964, + "step": 12076 + }, + { + "epoch": 2.198416310184764, + "grad_norm": 3.046875, + "learning_rate": 1.6706255571189501e-06, + "loss": 0.8740071058273315, + "step": 12078 + }, + { + "epoch": 2.198780376808956, + "grad_norm": 23.625, + "learning_rate": 1.6700532483381221e-06, + "loss": 1.2940067052841187, + "step": 12080 + }, + { + "epoch": 2.1991444434331484, + "grad_norm": 10.3125, + "learning_rate": 1.6694811347215889e-06, + "loss": 1.57370924949646, + "step": 12082 + }, + { + "epoch": 2.1995085100573406, + "grad_norm": 34.5, + "learning_rate": 1.6689092163533078e-06, + "loss": 1.6122078895568848, + "step": 12084 + }, + { + "epoch": 2.199872576681533, + "grad_norm": 42.0, + "learning_rate": 1.6683374933172053e-06, + "loss": 1.0024185180664062, + "step": 12086 + }, + { + "epoch": 2.200236643305725, + "grad_norm": 29.375, + "learning_rate": 1.6677659656971778e-06, + "loss": 1.3415708541870117, + "step": 12088 + }, + { + "epoch": 2.200600709929917, + "grad_norm": 13.1875, + "learning_rate": 1.6671946335770971e-06, + "loss": 1.4179677963256836, + "step": 12090 + }, + { + "epoch": 2.2009647765541094, + "grad_norm": 9.9375, + "learning_rate": 1.6666234970408012e-06, + "loss": 1.82107675075531, + "step": 12092 + }, + { + "epoch": 2.2013288431783016, + "grad_norm": 16.25, + "learning_rate": 1.6660525561721036e-06, + "loss": 1.6467400789260864, + "step": 12094 + }, + { + "epoch": 2.2016929098024938, + "grad_norm": 11.4375, + "learning_rate": 1.6654818110547888e-06, + "loss": 1.57603120803833, + "step": 12096 + }, + { + "epoch": 2.202056976426686, + "grad_norm": 8.4375, + "learning_rate": 1.6649112617726082e-06, + "loss": 1.3872840404510498, + "step": 12098 + }, + { + "epoch": 2.202421043050878, + "grad_norm": 24.0, + "learning_rate": 1.6643409084092904e-06, + "loss": 1.3849416971206665, + "step": 12100 + }, + { + "epoch": 2.2027851096750704, + "grad_norm": 5.40625, + "learning_rate": 1.66377075104853e-06, + "loss": 1.2443528175354004, + "step": 12102 + }, + { + "epoch": 2.203149176299263, + "grad_norm": 12.5, + "learning_rate": 1.6632007897739978e-06, + "loss": 1.2464863061904907, + "step": 12104 + }, + { + "epoch": 2.203513242923455, + "grad_norm": 15.375, + "learning_rate": 1.6626310246693323e-06, + "loss": 1.6990505456924438, + "step": 12106 + }, + { + "epoch": 2.2038773095476474, + "grad_norm": 22.5, + "learning_rate": 1.6620614558181427e-06, + "loss": 1.447169303894043, + "step": 12108 + }, + { + "epoch": 2.2042413761718396, + "grad_norm": 16.75, + "learning_rate": 1.6614920833040138e-06, + "loss": 1.8406569957733154, + "step": 12110 + }, + { + "epoch": 2.2046054427960318, + "grad_norm": 11.375, + "learning_rate": 1.660922907210496e-06, + "loss": 1.4172639846801758, + "step": 12112 + }, + { + "epoch": 2.204969509420224, + "grad_norm": 10.625, + "learning_rate": 1.660353927621115e-06, + "loss": 1.3564472198486328, + "step": 12114 + }, + { + "epoch": 2.205333576044416, + "grad_norm": 14.4375, + "learning_rate": 1.659785144619367e-06, + "loss": 0.9050820469856262, + "step": 12116 + }, + { + "epoch": 2.2056976426686083, + "grad_norm": 5.03125, + "learning_rate": 1.6592165582887165e-06, + "loss": 1.1530940532684326, + "step": 12118 + }, + { + "epoch": 2.2060617092928005, + "grad_norm": 4.90625, + "learning_rate": 1.6586481687126032e-06, + "loss": 0.9225291609764099, + "step": 12120 + }, + { + "epoch": 2.2064257759169927, + "grad_norm": 13.0625, + "learning_rate": 1.658079975974434e-06, + "loss": 1.1620161533355713, + "step": 12122 + }, + { + "epoch": 2.206789842541185, + "grad_norm": 10.8125, + "learning_rate": 1.6575119801575905e-06, + "loss": 1.4045647382736206, + "step": 12124 + }, + { + "epoch": 2.207153909165377, + "grad_norm": 9.9375, + "learning_rate": 1.656944181345424e-06, + "loss": 1.5692120790481567, + "step": 12126 + }, + { + "epoch": 2.2075179757895693, + "grad_norm": 9.4375, + "learning_rate": 1.656376579621255e-06, + "loss": 1.4317455291748047, + "step": 12128 + }, + { + "epoch": 2.2078820424137615, + "grad_norm": 11.4375, + "learning_rate": 1.6558091750683787e-06, + "loss": 1.3973381519317627, + "step": 12130 + }, + { + "epoch": 2.208246109037954, + "grad_norm": 12.6875, + "learning_rate": 1.655241967770057e-06, + "loss": 1.379529595375061, + "step": 12132 + }, + { + "epoch": 2.2086101756621463, + "grad_norm": 9.4375, + "learning_rate": 1.6546749578095277e-06, + "loss": 1.3875292539596558, + "step": 12134 + }, + { + "epoch": 2.2089742422863385, + "grad_norm": 7.15625, + "learning_rate": 1.6541081452699964e-06, + "loss": 1.2197068929672241, + "step": 12136 + }, + { + "epoch": 2.2093383089105307, + "grad_norm": 5.0625, + "learning_rate": 1.6535415302346398e-06, + "loss": 0.8297624588012695, + "step": 12138 + }, + { + "epoch": 2.209702375534723, + "grad_norm": 25.0, + "learning_rate": 1.6529751127866078e-06, + "loss": 0.9787057042121887, + "step": 12140 + }, + { + "epoch": 2.210066442158915, + "grad_norm": 7.75, + "learning_rate": 1.6524088930090175e-06, + "loss": 0.4380683898925781, + "step": 12142 + }, + { + "epoch": 2.2104305087831073, + "grad_norm": 7.3125, + "learning_rate": 1.6518428709849616e-06, + "loss": 1.4496345520019531, + "step": 12144 + }, + { + "epoch": 2.2107945754072995, + "grad_norm": 6.5625, + "learning_rate": 1.6512770467975014e-06, + "loss": 0.975763738155365, + "step": 12146 + }, + { + "epoch": 2.2111586420314917, + "grad_norm": 7.125, + "learning_rate": 1.6507114205296675e-06, + "loss": 1.3833062648773193, + "step": 12148 + }, + { + "epoch": 2.211522708655684, + "grad_norm": 17.375, + "learning_rate": 1.6501459922644658e-06, + "loss": 1.385900855064392, + "step": 12150 + }, + { + "epoch": 2.211886775279876, + "grad_norm": 22.5, + "learning_rate": 1.649580762084868e-06, + "loss": 1.554508924484253, + "step": 12152 + }, + { + "epoch": 2.2122508419040683, + "grad_norm": 186.0, + "learning_rate": 1.6490157300738211e-06, + "loss": 0.5537604689598083, + "step": 12154 + }, + { + "epoch": 2.2126149085282605, + "grad_norm": 8.0, + "learning_rate": 1.6484508963142411e-06, + "loss": 1.0816677808761597, + "step": 12156 + }, + { + "epoch": 2.2129789751524527, + "grad_norm": 12.875, + "learning_rate": 1.6478862608890139e-06, + "loss": 1.6150009632110596, + "step": 12158 + }, + { + "epoch": 2.2133430417766453, + "grad_norm": 12.6875, + "learning_rate": 1.6473218238809996e-06, + "loss": 1.496718406677246, + "step": 12160 + }, + { + "epoch": 2.2137071084008375, + "grad_norm": 9.8125, + "learning_rate": 1.6467575853730238e-06, + "loss": 1.4483039379119873, + "step": 12162 + }, + { + "epoch": 2.2140711750250297, + "grad_norm": 6.625, + "learning_rate": 1.6461935454478894e-06, + "loss": 1.399533987045288, + "step": 12164 + }, + { + "epoch": 2.214435241649222, + "grad_norm": 8.1875, + "learning_rate": 1.6456297041883663e-06, + "loss": 1.025894045829773, + "step": 12166 + }, + { + "epoch": 2.214799308273414, + "grad_norm": 6.625, + "learning_rate": 1.6450660616771941e-06, + "loss": 1.3313698768615723, + "step": 12168 + }, + { + "epoch": 2.2151633748976063, + "grad_norm": 24.125, + "learning_rate": 1.6445026179970871e-06, + "loss": 1.5325853824615479, + "step": 12170 + }, + { + "epoch": 2.2155274415217985, + "grad_norm": 133.0, + "learning_rate": 1.6439393732307265e-06, + "loss": 1.9374743700027466, + "step": 12172 + }, + { + "epoch": 2.2158915081459907, + "grad_norm": 11.625, + "learning_rate": 1.6433763274607677e-06, + "loss": 1.4798552989959717, + "step": 12174 + }, + { + "epoch": 2.216255574770183, + "grad_norm": 8.4375, + "learning_rate": 1.642813480769836e-06, + "loss": 1.4079176187515259, + "step": 12176 + }, + { + "epoch": 2.216619641394375, + "grad_norm": 15.875, + "learning_rate": 1.6422508332405243e-06, + "loss": 1.4054397344589233, + "step": 12178 + }, + { + "epoch": 2.2169837080185673, + "grad_norm": 15.4375, + "learning_rate": 1.6416883849554016e-06, + "loss": 1.566229224205017, + "step": 12180 + }, + { + "epoch": 2.2173477746427595, + "grad_norm": 96.5, + "learning_rate": 1.6411261359970026e-06, + "loss": 1.3326313495635986, + "step": 12182 + }, + { + "epoch": 2.2177118412669516, + "grad_norm": 7.25, + "learning_rate": 1.6405640864478367e-06, + "loss": 1.3738365173339844, + "step": 12184 + }, + { + "epoch": 2.2180759078911443, + "grad_norm": 8.9375, + "learning_rate": 1.6400022363903823e-06, + "loss": 1.4668095111846924, + "step": 12186 + }, + { + "epoch": 2.2184399745153365, + "grad_norm": 9.8125, + "learning_rate": 1.6394405859070866e-06, + "loss": 1.1771247386932373, + "step": 12188 + }, + { + "epoch": 2.2188040411395287, + "grad_norm": 15.8125, + "learning_rate": 1.6388791350803725e-06, + "loss": 0.7339882850646973, + "step": 12190 + }, + { + "epoch": 2.219168107763721, + "grad_norm": 12.5, + "learning_rate": 1.6383178839926284e-06, + "loss": 0.5722537636756897, + "step": 12192 + }, + { + "epoch": 2.219532174387913, + "grad_norm": 7.34375, + "learning_rate": 1.6377568327262163e-06, + "loss": 1.1518901586532593, + "step": 12194 + }, + { + "epoch": 2.2198962410121053, + "grad_norm": 12.0625, + "learning_rate": 1.6371959813634698e-06, + "loss": 1.0115777254104614, + "step": 12196 + }, + { + "epoch": 2.2202603076362974, + "grad_norm": 27.625, + "learning_rate": 1.636635329986688e-06, + "loss": 1.5801260471343994, + "step": 12198 + }, + { + "epoch": 2.2206243742604896, + "grad_norm": 15.4375, + "learning_rate": 1.6360748786781477e-06, + "loss": 1.4054018259048462, + "step": 12200 + }, + { + "epoch": 2.220988440884682, + "grad_norm": 8.625, + "learning_rate": 1.6355146275200906e-06, + "loss": 1.3353304862976074, + "step": 12202 + }, + { + "epoch": 2.221352507508874, + "grad_norm": 8.625, + "learning_rate": 1.6349545765947323e-06, + "loss": 1.370603084564209, + "step": 12204 + }, + { + "epoch": 2.2217165741330662, + "grad_norm": 2.96875, + "learning_rate": 1.6343947259842584e-06, + "loss": 1.177539587020874, + "step": 12206 + }, + { + "epoch": 2.2220806407572584, + "grad_norm": 7.09375, + "learning_rate": 1.6338350757708235e-06, + "loss": 1.1190351247787476, + "step": 12208 + }, + { + "epoch": 2.2224447073814506, + "grad_norm": 10.0625, + "learning_rate": 1.6332756260365556e-06, + "loss": 1.124096155166626, + "step": 12210 + }, + { + "epoch": 2.2228087740056433, + "grad_norm": 101.0, + "learning_rate": 1.6327163768635492e-06, + "loss": 0.8702162504196167, + "step": 12212 + }, + { + "epoch": 2.2231728406298354, + "grad_norm": 39.25, + "learning_rate": 1.6321573283338744e-06, + "loss": 1.3182525634765625, + "step": 12214 + }, + { + "epoch": 2.2235369072540276, + "grad_norm": 6.15625, + "learning_rate": 1.6315984805295688e-06, + "loss": 1.3298591375350952, + "step": 12216 + }, + { + "epoch": 2.22390097387822, + "grad_norm": 12.625, + "learning_rate": 1.6310398335326394e-06, + "loss": 1.2881121635437012, + "step": 12218 + }, + { + "epoch": 2.224265040502412, + "grad_norm": 17.875, + "learning_rate": 1.6304813874250674e-06, + "loss": 1.5499582290649414, + "step": 12220 + }, + { + "epoch": 2.224629107126604, + "grad_norm": 12.3125, + "learning_rate": 1.6299231422888007e-06, + "loss": 1.2902812957763672, + "step": 12222 + }, + { + "epoch": 2.2249931737507964, + "grad_norm": 18.875, + "learning_rate": 1.6293650982057607e-06, + "loss": 1.243004560470581, + "step": 12224 + }, + { + "epoch": 2.2253572403749886, + "grad_norm": 14.8125, + "learning_rate": 1.6288072552578389e-06, + "loss": 0.9448727369308472, + "step": 12226 + }, + { + "epoch": 2.225721306999181, + "grad_norm": 17.75, + "learning_rate": 1.6282496135268939e-06, + "loss": 0.9821646213531494, + "step": 12228 + }, + { + "epoch": 2.226085373623373, + "grad_norm": 17.875, + "learning_rate": 1.6276921730947603e-06, + "loss": 1.6493090391159058, + "step": 12230 + }, + { + "epoch": 2.226449440247565, + "grad_norm": 38.25, + "learning_rate": 1.6271349340432374e-06, + "loss": 1.192422866821289, + "step": 12232 + }, + { + "epoch": 2.2268135068717574, + "grad_norm": 7.6875, + "learning_rate": 1.6265778964541002e-06, + "loss": 1.085685133934021, + "step": 12234 + }, + { + "epoch": 2.2271775734959496, + "grad_norm": 18.375, + "learning_rate": 1.626021060409091e-06, + "loss": 1.3583873510360718, + "step": 12236 + }, + { + "epoch": 2.227541640120142, + "grad_norm": 6.03125, + "learning_rate": 1.6254644259899216e-06, + "loss": 1.170522928237915, + "step": 12238 + }, + { + "epoch": 2.2279057067443344, + "grad_norm": 23.125, + "learning_rate": 1.6249079932782785e-06, + "loss": 0.8008841276168823, + "step": 12240 + }, + { + "epoch": 2.2282697733685266, + "grad_norm": 9.75, + "learning_rate": 1.6243517623558135e-06, + "loss": 0.9121081829071045, + "step": 12242 + }, + { + "epoch": 2.228633839992719, + "grad_norm": 11.875, + "learning_rate": 1.623795733304153e-06, + "loss": 1.4922815561294556, + "step": 12244 + }, + { + "epoch": 2.228997906616911, + "grad_norm": 14.375, + "learning_rate": 1.623239906204892e-06, + "loss": 1.7578998804092407, + "step": 12246 + }, + { + "epoch": 2.229361973241103, + "grad_norm": 8.6875, + "learning_rate": 1.6226842811395938e-06, + "loss": 1.3746850490570068, + "step": 12248 + }, + { + "epoch": 2.2297260398652954, + "grad_norm": 8.6875, + "learning_rate": 1.6221288581897968e-06, + "loss": 1.275140404701233, + "step": 12250 + }, + { + "epoch": 2.2300901064894876, + "grad_norm": 10.5, + "learning_rate": 1.621573637437005e-06, + "loss": 1.4401323795318604, + "step": 12252 + }, + { + "epoch": 2.2304541731136798, + "grad_norm": 19.125, + "learning_rate": 1.621018618962696e-06, + "loss": 1.552638053894043, + "step": 12254 + }, + { + "epoch": 2.230818239737872, + "grad_norm": 12.0625, + "learning_rate": 1.6204638028483166e-06, + "loss": 1.2832891941070557, + "step": 12256 + }, + { + "epoch": 2.231182306362064, + "grad_norm": 10.25, + "learning_rate": 1.6199091891752822e-06, + "loss": 1.117211937904358, + "step": 12258 + }, + { + "epoch": 2.2315463729862564, + "grad_norm": 58.0, + "learning_rate": 1.6193547780249828e-06, + "loss": 1.444494366645813, + "step": 12260 + }, + { + "epoch": 2.2319104396104485, + "grad_norm": 7.90625, + "learning_rate": 1.6188005694787728e-06, + "loss": 0.7699679136276245, + "step": 12262 + }, + { + "epoch": 2.2322745062346407, + "grad_norm": 19.625, + "learning_rate": 1.6182465636179826e-06, + "loss": 1.3388326168060303, + "step": 12264 + }, + { + "epoch": 2.232638572858833, + "grad_norm": 11.125, + "learning_rate": 1.6176927605239102e-06, + "loss": 0.9454472661018372, + "step": 12266 + }, + { + "epoch": 2.2330026394830256, + "grad_norm": 17.25, + "learning_rate": 1.6171391602778214e-06, + "loss": 1.6248373985290527, + "step": 12268 + }, + { + "epoch": 2.2333667061072178, + "grad_norm": 10.9375, + "learning_rate": 1.6165857629609582e-06, + "loss": 1.4173344373703003, + "step": 12270 + }, + { + "epoch": 2.23373077273141, + "grad_norm": 6.625, + "learning_rate": 1.6160325686545263e-06, + "loss": 1.4110231399536133, + "step": 12272 + }, + { + "epoch": 2.234094839355602, + "grad_norm": 11.625, + "learning_rate": 1.6154795774397073e-06, + "loss": 1.4788658618927002, + "step": 12274 + }, + { + "epoch": 2.2344589059797944, + "grad_norm": 106.0, + "learning_rate": 1.6149267893976496e-06, + "loss": 0.9729205965995789, + "step": 12276 + }, + { + "epoch": 2.2348229726039865, + "grad_norm": 6.25, + "learning_rate": 1.6143742046094713e-06, + "loss": 0.5006702542304993, + "step": 12278 + }, + { + "epoch": 2.2351870392281787, + "grad_norm": 23.5, + "learning_rate": 1.6138218231562642e-06, + "loss": 1.3185569047927856, + "step": 12280 + }, + { + "epoch": 2.235551105852371, + "grad_norm": 12.1875, + "learning_rate": 1.6132696451190854e-06, + "loss": 1.0156015157699585, + "step": 12282 + }, + { + "epoch": 2.235915172476563, + "grad_norm": 56.0, + "learning_rate": 1.6127176705789673e-06, + "loss": 1.0495911836624146, + "step": 12284 + }, + { + "epoch": 2.2362792391007553, + "grad_norm": 54.25, + "learning_rate": 1.6121658996169092e-06, + "loss": 0.6994911432266235, + "step": 12286 + }, + { + "epoch": 2.2366433057249475, + "grad_norm": 11.375, + "learning_rate": 1.61161433231388e-06, + "loss": 1.4782464504241943, + "step": 12288 + }, + { + "epoch": 2.2370073723491397, + "grad_norm": 11.625, + "learning_rate": 1.6110629687508217e-06, + "loss": 1.3820947408676147, + "step": 12290 + }, + { + "epoch": 2.237371438973332, + "grad_norm": 14.375, + "learning_rate": 1.610511809008643e-06, + "loss": 1.4437205791473389, + "step": 12292 + }, + { + "epoch": 2.2377355055975245, + "grad_norm": 29.25, + "learning_rate": 1.6099608531682256e-06, + "loss": 1.6471656560897827, + "step": 12294 + }, + { + "epoch": 2.2380995722217167, + "grad_norm": 8.1875, + "learning_rate": 1.60941010131042e-06, + "loss": 1.7767752408981323, + "step": 12296 + }, + { + "epoch": 2.238463638845909, + "grad_norm": 2.21875, + "learning_rate": 1.6088595535160458e-06, + "loss": 0.8517869710922241, + "step": 12298 + }, + { + "epoch": 2.238827705470101, + "grad_norm": 8.4375, + "learning_rate": 1.6083092098658957e-06, + "loss": 1.2293821573257446, + "step": 12300 + }, + { + "epoch": 2.2391917720942933, + "grad_norm": 11.25, + "learning_rate": 1.6077590704407272e-06, + "loss": 1.3938345909118652, + "step": 12302 + }, + { + "epoch": 2.2395558387184855, + "grad_norm": 8.9375, + "learning_rate": 1.6072091353212737e-06, + "loss": 1.5042243003845215, + "step": 12304 + }, + { + "epoch": 2.2399199053426777, + "grad_norm": 4.25, + "learning_rate": 1.6066594045882353e-06, + "loss": 1.3572784662246704, + "step": 12306 + }, + { + "epoch": 2.24028397196687, + "grad_norm": 8.0, + "learning_rate": 1.6061098783222823e-06, + "loss": 0.9582677483558655, + "step": 12308 + }, + { + "epoch": 2.240648038591062, + "grad_norm": 11.1875, + "learning_rate": 1.6055605566040565e-06, + "loss": 1.9111570119857788, + "step": 12310 + }, + { + "epoch": 2.2410121052152543, + "grad_norm": 22.5, + "learning_rate": 1.6050114395141663e-06, + "loss": 1.7372725009918213, + "step": 12312 + }, + { + "epoch": 2.2413761718394465, + "grad_norm": 12.25, + "learning_rate": 1.6044625271331948e-06, + "loss": 1.4749634265899658, + "step": 12314 + }, + { + "epoch": 2.2417402384636387, + "grad_norm": 13.125, + "learning_rate": 1.603913819541692e-06, + "loss": 1.3942590951919556, + "step": 12316 + }, + { + "epoch": 2.242104305087831, + "grad_norm": 8.75, + "learning_rate": 1.603365316820178e-06, + "loss": 1.6264064311981201, + "step": 12318 + }, + { + "epoch": 2.2424683717120235, + "grad_norm": 8.9375, + "learning_rate": 1.6028170190491446e-06, + "loss": 1.685206651687622, + "step": 12320 + }, + { + "epoch": 2.2428324383362157, + "grad_norm": 11.0, + "learning_rate": 1.60226892630905e-06, + "loss": 1.4618518352508545, + "step": 12322 + }, + { + "epoch": 2.243196504960408, + "grad_norm": 8.75, + "learning_rate": 1.6017210386803264e-06, + "loss": 1.468754768371582, + "step": 12324 + }, + { + "epoch": 2.2435605715846, + "grad_norm": 8.875, + "learning_rate": 1.601173356243374e-06, + "loss": 1.5624163150787354, + "step": 12326 + }, + { + "epoch": 2.2439246382087923, + "grad_norm": 27.5, + "learning_rate": 1.6006258790785622e-06, + "loss": 1.4366270303726196, + "step": 12328 + }, + { + "epoch": 2.2442887048329845, + "grad_norm": 17.5, + "learning_rate": 1.6000786072662326e-06, + "loss": 1.4687341451644897, + "step": 12330 + }, + { + "epoch": 2.2446527714571767, + "grad_norm": 4.40625, + "learning_rate": 1.5995315408866925e-06, + "loss": 1.3800256252288818, + "step": 12332 + }, + { + "epoch": 2.245016838081369, + "grad_norm": 22.75, + "learning_rate": 1.5989846800202235e-06, + "loss": 1.4765992164611816, + "step": 12334 + }, + { + "epoch": 2.245380904705561, + "grad_norm": 35.25, + "learning_rate": 1.5984380247470755e-06, + "loss": 1.745290994644165, + "step": 12336 + }, + { + "epoch": 2.2457449713297533, + "grad_norm": 11.5625, + "learning_rate": 1.597891575147467e-06, + "loss": 1.479117751121521, + "step": 12338 + }, + { + "epoch": 2.2461090379539455, + "grad_norm": 25.625, + "learning_rate": 1.5973453313015884e-06, + "loss": 1.6162757873535156, + "step": 12340 + }, + { + "epoch": 2.2464731045781376, + "grad_norm": 9.8125, + "learning_rate": 1.5967992932895963e-06, + "loss": 1.4312381744384766, + "step": 12342 + }, + { + "epoch": 2.24683717120233, + "grad_norm": 9.625, + "learning_rate": 1.5962534611916223e-06, + "loss": 1.2580519914627075, + "step": 12344 + }, + { + "epoch": 2.2472012378265225, + "grad_norm": 20.5, + "learning_rate": 1.5957078350877636e-06, + "loss": 1.4227744340896606, + "step": 12346 + }, + { + "epoch": 2.2475653044507147, + "grad_norm": 9.0, + "learning_rate": 1.595162415058089e-06, + "loss": 1.4911073446273804, + "step": 12348 + }, + { + "epoch": 2.247929371074907, + "grad_norm": 3.96875, + "learning_rate": 1.5946172011826376e-06, + "loss": 0.872816264629364, + "step": 12350 + }, + { + "epoch": 2.248293437699099, + "grad_norm": 10.6875, + "learning_rate": 1.594072193541415e-06, + "loss": 1.1604639291763306, + "step": 12352 + }, + { + "epoch": 2.2486575043232913, + "grad_norm": 34.75, + "learning_rate": 1.5935273922144013e-06, + "loss": 1.5771641731262207, + "step": 12354 + }, + { + "epoch": 2.2490215709474835, + "grad_norm": 7.78125, + "learning_rate": 1.5929827972815423e-06, + "loss": 1.4863736629486084, + "step": 12356 + }, + { + "epoch": 2.2493856375716756, + "grad_norm": 21.25, + "learning_rate": 1.592438408822756e-06, + "loss": 1.3807090520858765, + "step": 12358 + }, + { + "epoch": 2.249749704195868, + "grad_norm": 9.75, + "learning_rate": 1.5918942269179294e-06, + "loss": 1.4658913612365723, + "step": 12360 + }, + { + "epoch": 2.25011377082006, + "grad_norm": 8.125, + "learning_rate": 1.591350251646917e-06, + "loss": 1.2913917303085327, + "step": 12362 + }, + { + "epoch": 2.2504778374442522, + "grad_norm": 8.3125, + "learning_rate": 1.5908064830895473e-06, + "loss": 1.4570693969726562, + "step": 12364 + }, + { + "epoch": 2.2508419040684444, + "grad_norm": 13.875, + "learning_rate": 1.5902629213256148e-06, + "loss": 1.5609384775161743, + "step": 12366 + }, + { + "epoch": 2.2512059706926366, + "grad_norm": 11.6875, + "learning_rate": 1.589719566434886e-06, + "loss": 1.3474090099334717, + "step": 12368 + }, + { + "epoch": 2.251570037316829, + "grad_norm": 12.3125, + "learning_rate": 1.5891764184970959e-06, + "loss": 1.6190539598464966, + "step": 12370 + }, + { + "epoch": 2.2519341039410214, + "grad_norm": 17.625, + "learning_rate": 1.5886334775919476e-06, + "loss": 1.40696382522583, + "step": 12372 + }, + { + "epoch": 2.252298170565213, + "grad_norm": 15.6875, + "learning_rate": 1.5880907437991172e-06, + "loss": 1.6238670349121094, + "step": 12374 + }, + { + "epoch": 2.252662237189406, + "grad_norm": 9.5625, + "learning_rate": 1.5875482171982482e-06, + "loss": 1.4120055437088013, + "step": 12376 + }, + { + "epoch": 2.253026303813598, + "grad_norm": 11.125, + "learning_rate": 1.587005897868954e-06, + "loss": 1.5295876264572144, + "step": 12378 + }, + { + "epoch": 2.2533903704377902, + "grad_norm": 9.9375, + "learning_rate": 1.5864637858908188e-06, + "loss": 1.5951356887817383, + "step": 12380 + }, + { + "epoch": 2.2537544370619824, + "grad_norm": 11.5625, + "learning_rate": 1.585921881343393e-06, + "loss": 1.8884718418121338, + "step": 12382 + }, + { + "epoch": 2.2541185036861746, + "grad_norm": 13.3125, + "learning_rate": 1.5853801843062011e-06, + "loss": 1.7638633251190186, + "step": 12384 + }, + { + "epoch": 2.254482570310367, + "grad_norm": 17.0, + "learning_rate": 1.5848386948587343e-06, + "loss": 1.535719871520996, + "step": 12386 + }, + { + "epoch": 2.254846636934559, + "grad_norm": 13.4375, + "learning_rate": 1.584297413080454e-06, + "loss": 1.3890198469161987, + "step": 12388 + }, + { + "epoch": 2.255210703558751, + "grad_norm": 11.4375, + "learning_rate": 1.5837563390507907e-06, + "loss": 1.531652569770813, + "step": 12390 + }, + { + "epoch": 2.2555747701829434, + "grad_norm": 21.375, + "learning_rate": 1.5832154728491452e-06, + "loss": 1.6434849500656128, + "step": 12392 + }, + { + "epoch": 2.2559388368071356, + "grad_norm": 11.375, + "learning_rate": 1.5826748145548873e-06, + "loss": 1.2316720485687256, + "step": 12394 + }, + { + "epoch": 2.2563029034313278, + "grad_norm": 8.5625, + "learning_rate": 1.5821343642473563e-06, + "loss": 1.4487545490264893, + "step": 12396 + }, + { + "epoch": 2.25666697005552, + "grad_norm": 6.9375, + "learning_rate": 1.5815941220058618e-06, + "loss": 1.2573072910308838, + "step": 12398 + }, + { + "epoch": 2.257031036679712, + "grad_norm": 12.5625, + "learning_rate": 1.5810540879096812e-06, + "loss": 1.4514893293380737, + "step": 12400 + }, + { + "epoch": 2.257395103303905, + "grad_norm": 11.9375, + "learning_rate": 1.5805142620380625e-06, + "loss": 1.320814609527588, + "step": 12402 + }, + { + "epoch": 2.257759169928097, + "grad_norm": 10.1875, + "learning_rate": 1.5799746444702236e-06, + "loss": 1.299497127532959, + "step": 12404 + }, + { + "epoch": 2.258123236552289, + "grad_norm": 19.625, + "learning_rate": 1.5794352352853505e-06, + "loss": 1.1353826522827148, + "step": 12406 + }, + { + "epoch": 2.2584873031764814, + "grad_norm": 10.9375, + "learning_rate": 1.5788960345625995e-06, + "loss": 1.3488101959228516, + "step": 12408 + }, + { + "epoch": 2.2588513698006736, + "grad_norm": 15.5625, + "learning_rate": 1.5783570423810965e-06, + "loss": 1.6760532855987549, + "step": 12410 + }, + { + "epoch": 2.2592154364248658, + "grad_norm": 11.6875, + "learning_rate": 1.5778182588199358e-06, + "loss": 1.9968559741973877, + "step": 12412 + }, + { + "epoch": 2.259579503049058, + "grad_norm": 11.5625, + "learning_rate": 1.5772796839581821e-06, + "loss": 1.4470630884170532, + "step": 12414 + }, + { + "epoch": 2.25994356967325, + "grad_norm": 10.875, + "learning_rate": 1.5767413178748691e-06, + "loss": 1.539105772972107, + "step": 12416 + }, + { + "epoch": 2.2603076362974424, + "grad_norm": 11.25, + "learning_rate": 1.5762031606489999e-06, + "loss": 1.4229347705841064, + "step": 12418 + }, + { + "epoch": 2.2606717029216346, + "grad_norm": 8.0, + "learning_rate": 1.5756652123595465e-06, + "loss": 0.885093629360199, + "step": 12420 + }, + { + "epoch": 2.2610357695458267, + "grad_norm": 14.125, + "learning_rate": 1.575127473085451e-06, + "loss": 0.6655827760696411, + "step": 12422 + }, + { + "epoch": 2.261399836170019, + "grad_norm": 10.8125, + "learning_rate": 1.5745899429056242e-06, + "loss": 1.6897716522216797, + "step": 12424 + }, + { + "epoch": 2.261763902794211, + "grad_norm": 6.03125, + "learning_rate": 1.5740526218989466e-06, + "loss": 1.2048144340515137, + "step": 12426 + }, + { + "epoch": 2.2621279694184038, + "grad_norm": 10.125, + "learning_rate": 1.573515510144268e-06, + "loss": 1.3830907344818115, + "step": 12428 + }, + { + "epoch": 2.262492036042596, + "grad_norm": 4.65625, + "learning_rate": 1.572978607720407e-06, + "loss": 1.5929877758026123, + "step": 12430 + }, + { + "epoch": 2.262856102666788, + "grad_norm": 10.3125, + "learning_rate": 1.5724419147061523e-06, + "loss": 1.107445478439331, + "step": 12432 + }, + { + "epoch": 2.2632201692909804, + "grad_norm": 7.46875, + "learning_rate": 1.5719054311802612e-06, + "loss": 1.279052495956421, + "step": 12434 + }, + { + "epoch": 2.2635842359151725, + "grad_norm": 8.375, + "learning_rate": 1.5713691572214607e-06, + "loss": 1.4521898031234741, + "step": 12436 + }, + { + "epoch": 2.2639483025393647, + "grad_norm": 12.9375, + "learning_rate": 1.5708330929084463e-06, + "loss": 1.4216667413711548, + "step": 12438 + }, + { + "epoch": 2.264312369163557, + "grad_norm": 17.625, + "learning_rate": 1.5702972383198836e-06, + "loss": 1.4985642433166504, + "step": 12440 + }, + { + "epoch": 2.264676435787749, + "grad_norm": 5.84375, + "learning_rate": 1.5697615935344074e-06, + "loss": 1.2892903089523315, + "step": 12442 + }, + { + "epoch": 2.2650405024119413, + "grad_norm": 8.0, + "learning_rate": 1.5692261586306209e-06, + "loss": 1.0783270597457886, + "step": 12444 + }, + { + "epoch": 2.2654045690361335, + "grad_norm": 12.25, + "learning_rate": 1.5686909336870974e-06, + "loss": 1.8017053604125977, + "step": 12446 + }, + { + "epoch": 2.2657686356603257, + "grad_norm": 27.125, + "learning_rate": 1.5681559187823785e-06, + "loss": 1.4044363498687744, + "step": 12448 + }, + { + "epoch": 2.266132702284518, + "grad_norm": 15.0, + "learning_rate": 1.567621113994976e-06, + "loss": 1.505138635635376, + "step": 12450 + }, + { + "epoch": 2.26649676890871, + "grad_norm": 7.90625, + "learning_rate": 1.56708651940337e-06, + "loss": 1.4537688493728638, + "step": 12452 + }, + { + "epoch": 2.2668608355329027, + "grad_norm": 11.75, + "learning_rate": 1.5665521350860101e-06, + "loss": 1.393488883972168, + "step": 12454 + }, + { + "epoch": 2.267224902157095, + "grad_norm": 43.5, + "learning_rate": 1.5660179611213152e-06, + "loss": 1.892049789428711, + "step": 12456 + }, + { + "epoch": 2.267588968781287, + "grad_norm": 15.875, + "learning_rate": 1.5654839975876731e-06, + "loss": 1.1731960773468018, + "step": 12458 + }, + { + "epoch": 2.2679530354054793, + "grad_norm": 11.5, + "learning_rate": 1.564950244563441e-06, + "loss": 1.429764747619629, + "step": 12460 + }, + { + "epoch": 2.2683171020296715, + "grad_norm": 19.75, + "learning_rate": 1.5644167021269444e-06, + "loss": 1.4573893547058105, + "step": 12462 + }, + { + "epoch": 2.2686811686538637, + "grad_norm": 16.0, + "learning_rate": 1.563883370356479e-06, + "loss": 1.514305591583252, + "step": 12464 + }, + { + "epoch": 2.269045235278056, + "grad_norm": 24.875, + "learning_rate": 1.5633502493303087e-06, + "loss": 1.2604882717132568, + "step": 12466 + }, + { + "epoch": 2.269409301902248, + "grad_norm": 11.75, + "learning_rate": 1.5628173391266674e-06, + "loss": 1.7389116287231445, + "step": 12468 + }, + { + "epoch": 2.2697733685264403, + "grad_norm": 31.375, + "learning_rate": 1.562284639823757e-06, + "loss": 1.2206616401672363, + "step": 12470 + }, + { + "epoch": 2.2701374351506325, + "grad_norm": 11.75, + "learning_rate": 1.5617521514997494e-06, + "loss": 1.4983587265014648, + "step": 12472 + }, + { + "epoch": 2.2705015017748247, + "grad_norm": 13.1875, + "learning_rate": 1.5612198742327846e-06, + "loss": 1.357822060585022, + "step": 12474 + }, + { + "epoch": 2.270865568399017, + "grad_norm": 24.5, + "learning_rate": 1.5606878081009724e-06, + "loss": 1.7816073894500732, + "step": 12476 + }, + { + "epoch": 2.271229635023209, + "grad_norm": 14.375, + "learning_rate": 1.5601559531823917e-06, + "loss": 1.7828789949417114, + "step": 12478 + }, + { + "epoch": 2.2715937016474017, + "grad_norm": 13.4375, + "learning_rate": 1.5596243095550891e-06, + "loss": 1.7889257669448853, + "step": 12480 + }, + { + "epoch": 2.2719577682715935, + "grad_norm": 8.75, + "learning_rate": 1.5590928772970823e-06, + "loss": 1.6126959323883057, + "step": 12482 + }, + { + "epoch": 2.272321834895786, + "grad_norm": 11.0, + "learning_rate": 1.558561656486356e-06, + "loss": 1.0573344230651855, + "step": 12484 + }, + { + "epoch": 2.2726859015199783, + "grad_norm": 46.0, + "learning_rate": 1.558030647200865e-06, + "loss": 1.019694209098816, + "step": 12486 + }, + { + "epoch": 2.2730499681441705, + "grad_norm": 10.375, + "learning_rate": 1.5574998495185325e-06, + "loss": 1.4526140689849854, + "step": 12488 + }, + { + "epoch": 2.2734140347683627, + "grad_norm": 7.84375, + "learning_rate": 1.5569692635172518e-06, + "loss": 0.9422599077224731, + "step": 12490 + }, + { + "epoch": 2.273778101392555, + "grad_norm": 11.0, + "learning_rate": 1.5564388892748827e-06, + "loss": 1.4089980125427246, + "step": 12492 + }, + { + "epoch": 2.274142168016747, + "grad_norm": 9.6875, + "learning_rate": 1.555908726869257e-06, + "loss": 1.8546347618103027, + "step": 12494 + }, + { + "epoch": 2.2745062346409393, + "grad_norm": 102.0, + "learning_rate": 1.555378776378173e-06, + "loss": 1.5285102128982544, + "step": 12496 + }, + { + "epoch": 2.2748703012651315, + "grad_norm": 12.1875, + "learning_rate": 1.554849037879399e-06, + "loss": 1.5162465572357178, + "step": 12498 + }, + { + "epoch": 2.2752343678893237, + "grad_norm": 12.0625, + "learning_rate": 1.5543195114506724e-06, + "loss": 1.5895440578460693, + "step": 12500 + }, + { + "epoch": 2.275598434513516, + "grad_norm": 31.625, + "learning_rate": 1.5537901971696984e-06, + "loss": 1.978537678718567, + "step": 12502 + }, + { + "epoch": 2.275962501137708, + "grad_norm": 6.84375, + "learning_rate": 1.553261095114152e-06, + "loss": 1.6220439672470093, + "step": 12504 + }, + { + "epoch": 2.2763265677619007, + "grad_norm": 8.25, + "learning_rate": 1.5527322053616767e-06, + "loss": 1.3209445476531982, + "step": 12506 + }, + { + "epoch": 2.2766906343860924, + "grad_norm": 8.0625, + "learning_rate": 1.552203527989885e-06, + "loss": 1.5111479759216309, + "step": 12508 + }, + { + "epoch": 2.277054701010285, + "grad_norm": 5.125, + "learning_rate": 1.551675063076358e-06, + "loss": 1.090872883796692, + "step": 12510 + }, + { + "epoch": 2.2774187676344773, + "grad_norm": 15.9375, + "learning_rate": 1.551146810698646e-06, + "loss": 1.2567442655563354, + "step": 12512 + }, + { + "epoch": 2.2777828342586695, + "grad_norm": 9.4375, + "learning_rate": 1.550618770934268e-06, + "loss": 1.4358798265457153, + "step": 12514 + }, + { + "epoch": 2.2781469008828616, + "grad_norm": 15.0625, + "learning_rate": 1.5500909438607115e-06, + "loss": 1.5757757425308228, + "step": 12516 + }, + { + "epoch": 2.278510967507054, + "grad_norm": 15.75, + "learning_rate": 1.5495633295554332e-06, + "loss": 1.732880711555481, + "step": 12518 + }, + { + "epoch": 2.278875034131246, + "grad_norm": 24.625, + "learning_rate": 1.5490359280958579e-06, + "loss": 1.745815396308899, + "step": 12520 + }, + { + "epoch": 2.2792391007554382, + "grad_norm": 13.5625, + "learning_rate": 1.54850873955938e-06, + "loss": 1.1320409774780273, + "step": 12522 + }, + { + "epoch": 2.2796031673796304, + "grad_norm": 9.8125, + "learning_rate": 1.5479817640233624e-06, + "loss": 1.3969218730926514, + "step": 12524 + }, + { + "epoch": 2.2799672340038226, + "grad_norm": 17.125, + "learning_rate": 1.547455001565136e-06, + "loss": 1.3373697996139526, + "step": 12526 + }, + { + "epoch": 2.280331300628015, + "grad_norm": 9.6875, + "learning_rate": 1.5469284522620022e-06, + "loss": 0.9119505882263184, + "step": 12528 + }, + { + "epoch": 2.280695367252207, + "grad_norm": 6.90625, + "learning_rate": 1.5464021161912285e-06, + "loss": 1.4338898658752441, + "step": 12530 + }, + { + "epoch": 2.281059433876399, + "grad_norm": 13.375, + "learning_rate": 1.5458759934300536e-06, + "loss": 1.3429218530654907, + "step": 12532 + }, + { + "epoch": 2.2814235005005914, + "grad_norm": 16.75, + "learning_rate": 1.5453500840556834e-06, + "loss": 0.916692852973938, + "step": 12534 + }, + { + "epoch": 2.281787567124784, + "grad_norm": 11.0, + "learning_rate": 1.5448243881452934e-06, + "loss": 1.6242117881774902, + "step": 12536 + }, + { + "epoch": 2.2821516337489762, + "grad_norm": 17.625, + "learning_rate": 1.5442989057760272e-06, + "loss": 1.32588529586792, + "step": 12538 + }, + { + "epoch": 2.2825157003731684, + "grad_norm": 11.5, + "learning_rate": 1.543773637024997e-06, + "loss": 1.3498769998550415, + "step": 12540 + }, + { + "epoch": 2.2828797669973606, + "grad_norm": 11.75, + "learning_rate": 1.5432485819692842e-06, + "loss": 1.480819582939148, + "step": 12542 + }, + { + "epoch": 2.283243833621553, + "grad_norm": 9.75, + "learning_rate": 1.542723740685938e-06, + "loss": 1.4214507341384888, + "step": 12544 + }, + { + "epoch": 2.283607900245745, + "grad_norm": 8.9375, + "learning_rate": 1.542199113251977e-06, + "loss": 1.4767481088638306, + "step": 12546 + }, + { + "epoch": 2.283971966869937, + "grad_norm": 8.0625, + "learning_rate": 1.5416746997443884e-06, + "loss": 1.2315850257873535, + "step": 12548 + }, + { + "epoch": 2.2843360334941294, + "grad_norm": 11.4375, + "learning_rate": 1.5411505002401275e-06, + "loss": 1.0710111856460571, + "step": 12550 + }, + { + "epoch": 2.2847001001183216, + "grad_norm": 4.5625, + "learning_rate": 1.5406265148161183e-06, + "loss": 1.2519993782043457, + "step": 12552 + }, + { + "epoch": 2.285064166742514, + "grad_norm": 10.4375, + "learning_rate": 1.540102743549254e-06, + "loss": 1.3542646169662476, + "step": 12554 + }, + { + "epoch": 2.285428233366706, + "grad_norm": 6.96875, + "learning_rate": 1.5395791865163957e-06, + "loss": 1.4727967977523804, + "step": 12556 + }, + { + "epoch": 2.285792299990898, + "grad_norm": 6.84375, + "learning_rate": 1.539055843794373e-06, + "loss": 1.5567904710769653, + "step": 12558 + }, + { + "epoch": 2.2861563666150904, + "grad_norm": 12.25, + "learning_rate": 1.5385327154599846e-06, + "loss": 1.1173269748687744, + "step": 12560 + }, + { + "epoch": 2.286520433239283, + "grad_norm": 10.9375, + "learning_rate": 1.5380098015899972e-06, + "loss": 1.3361327648162842, + "step": 12562 + }, + { + "epoch": 2.286884499863475, + "grad_norm": 8.8125, + "learning_rate": 1.5374871022611467e-06, + "loss": 1.285651445388794, + "step": 12564 + }, + { + "epoch": 2.2872485664876674, + "grad_norm": 9.75, + "learning_rate": 1.536964617550137e-06, + "loss": 1.582836627960205, + "step": 12566 + }, + { + "epoch": 2.2876126331118596, + "grad_norm": 22.875, + "learning_rate": 1.5364423475336405e-06, + "loss": 1.2831053733825684, + "step": 12568 + }, + { + "epoch": 2.2879766997360518, + "grad_norm": 33.5, + "learning_rate": 1.535920292288298e-06, + "loss": 1.605603814125061, + "step": 12570 + }, + { + "epoch": 2.288340766360244, + "grad_norm": 8.875, + "learning_rate": 1.5353984518907195e-06, + "loss": 1.3263009786605835, + "step": 12572 + }, + { + "epoch": 2.288704832984436, + "grad_norm": 6.8125, + "learning_rate": 1.5348768264174821e-06, + "loss": 1.0126454830169678, + "step": 12574 + }, + { + "epoch": 2.2890688996086284, + "grad_norm": 4.03125, + "learning_rate": 1.5343554159451336e-06, + "loss": 1.14657461643219, + "step": 12576 + }, + { + "epoch": 2.2894329662328206, + "grad_norm": 4.21875, + "learning_rate": 1.5338342205501874e-06, + "loss": 1.3293346166610718, + "step": 12578 + }, + { + "epoch": 2.2897970328570127, + "grad_norm": 12.25, + "learning_rate": 1.5333132403091278e-06, + "loss": 1.091294765472412, + "step": 12580 + }, + { + "epoch": 2.290161099481205, + "grad_norm": 7.71875, + "learning_rate": 1.532792475298406e-06, + "loss": 1.4288572072982788, + "step": 12582 + }, + { + "epoch": 2.290525166105397, + "grad_norm": 9.1875, + "learning_rate": 1.5322719255944427e-06, + "loss": 1.4103387594223022, + "step": 12584 + }, + { + "epoch": 2.2908892327295893, + "grad_norm": 10.25, + "learning_rate": 1.5317515912736259e-06, + "loss": 1.4835950136184692, + "step": 12586 + }, + { + "epoch": 2.291253299353782, + "grad_norm": 13.5625, + "learning_rate": 1.5312314724123128e-06, + "loss": 1.3269966840744019, + "step": 12588 + }, + { + "epoch": 2.291617365977974, + "grad_norm": 13.5625, + "learning_rate": 1.5307115690868289e-06, + "loss": 1.1279107332229614, + "step": 12590 + }, + { + "epoch": 2.2919814326021664, + "grad_norm": 7.9375, + "learning_rate": 1.5301918813734673e-06, + "loss": 0.5846623182296753, + "step": 12592 + }, + { + "epoch": 2.2923454992263586, + "grad_norm": 5.53125, + "learning_rate": 1.529672409348491e-06, + "loss": 1.2363202571868896, + "step": 12594 + }, + { + "epoch": 2.2927095658505507, + "grad_norm": 11.5, + "learning_rate": 1.5291531530881299e-06, + "loss": 1.5694537162780762, + "step": 12596 + }, + { + "epoch": 2.293073632474743, + "grad_norm": 12.8125, + "learning_rate": 1.5286341126685825e-06, + "loss": 1.369071364402771, + "step": 12598 + }, + { + "epoch": 2.293437699098935, + "grad_norm": 12.0625, + "learning_rate": 1.5281152881660163e-06, + "loss": 1.3592339754104614, + "step": 12600 + }, + { + "epoch": 2.2938017657231273, + "grad_norm": 10.125, + "learning_rate": 1.5275966796565665e-06, + "loss": 1.4833550453186035, + "step": 12602 + }, + { + "epoch": 2.2941658323473195, + "grad_norm": 16.125, + "learning_rate": 1.5270782872163367e-06, + "loss": 1.3515369892120361, + "step": 12604 + }, + { + "epoch": 2.2945298989715117, + "grad_norm": 4.59375, + "learning_rate": 1.526560110921399e-06, + "loss": 1.2067272663116455, + "step": 12606 + }, + { + "epoch": 2.294893965595704, + "grad_norm": 6.78125, + "learning_rate": 1.526042150847794e-06, + "loss": 1.2377903461456299, + "step": 12608 + }, + { + "epoch": 2.295258032219896, + "grad_norm": 7.0, + "learning_rate": 1.5255244070715298e-06, + "loss": 1.2793469429016113, + "step": 12610 + }, + { + "epoch": 2.2956220988440883, + "grad_norm": 8.5625, + "learning_rate": 1.5250068796685833e-06, + "loss": 1.3406494855880737, + "step": 12612 + }, + { + "epoch": 2.295986165468281, + "grad_norm": 5.9375, + "learning_rate": 1.5244895687148994e-06, + "loss": 1.2264078855514526, + "step": 12614 + }, + { + "epoch": 2.2963502320924727, + "grad_norm": 63.0, + "learning_rate": 1.5239724742863914e-06, + "loss": 1.5421836376190186, + "step": 12616 + }, + { + "epoch": 2.2967142987166653, + "grad_norm": 14.875, + "learning_rate": 1.5234555964589415e-06, + "loss": 1.8008121252059937, + "step": 12618 + }, + { + "epoch": 2.2970783653408575, + "grad_norm": 8.75, + "learning_rate": 1.522938935308399e-06, + "loss": 1.0468926429748535, + "step": 12620 + }, + { + "epoch": 2.2974424319650497, + "grad_norm": 31.375, + "learning_rate": 1.522422490910581e-06, + "loss": 1.2860817909240723, + "step": 12622 + }, + { + "epoch": 2.297806498589242, + "grad_norm": 5.59375, + "learning_rate": 1.521906263341275e-06, + "loss": 0.12845967710018158, + "step": 12624 + }, + { + "epoch": 2.298170565213434, + "grad_norm": 198.0, + "learning_rate": 1.5213902526762348e-06, + "loss": 0.43172675371170044, + "step": 12626 + }, + { + "epoch": 2.2985346318376263, + "grad_norm": 21.75, + "learning_rate": 1.5208744589911823e-06, + "loss": 1.3381685018539429, + "step": 12628 + }, + { + "epoch": 2.2988986984618185, + "grad_norm": 184.0, + "learning_rate": 1.5203588823618087e-06, + "loss": 0.9716833829879761, + "step": 12630 + }, + { + "epoch": 2.2992627650860107, + "grad_norm": 5.71875, + "learning_rate": 1.5198435228637726e-06, + "loss": 1.551675796508789, + "step": 12632 + }, + { + "epoch": 2.299626831710203, + "grad_norm": 5.4375, + "learning_rate": 1.519328380572701e-06, + "loss": 1.4420608282089233, + "step": 12634 + }, + { + "epoch": 2.299990898334395, + "grad_norm": 6.34375, + "learning_rate": 1.518813455564189e-06, + "loss": 0.9131681323051453, + "step": 12636 + }, + { + "epoch": 2.3003549649585873, + "grad_norm": 9.0, + "learning_rate": 1.5182987479137994e-06, + "loss": 1.5336073637008667, + "step": 12638 + }, + { + "epoch": 2.3007190315827795, + "grad_norm": 17.25, + "learning_rate": 1.5177842576970641e-06, + "loss": 1.2791965007781982, + "step": 12640 + }, + { + "epoch": 2.3010830982069717, + "grad_norm": 12.375, + "learning_rate": 1.5172699849894821e-06, + "loss": 1.0568195581436157, + "step": 12642 + }, + { + "epoch": 2.3014471648311643, + "grad_norm": 16.125, + "learning_rate": 1.5167559298665206e-06, + "loss": 1.4356865882873535, + "step": 12644 + }, + { + "epoch": 2.3018112314553565, + "grad_norm": 21.5, + "learning_rate": 1.5162420924036152e-06, + "loss": 1.3333125114440918, + "step": 12646 + }, + { + "epoch": 2.3021752980795487, + "grad_norm": 59.25, + "learning_rate": 1.51572847267617e-06, + "loss": 0.6198864579200745, + "step": 12648 + }, + { + "epoch": 2.302539364703741, + "grad_norm": 18.875, + "learning_rate": 1.5152150707595558e-06, + "loss": 1.824110507965088, + "step": 12650 + }, + { + "epoch": 2.302903431327933, + "grad_norm": 30.625, + "learning_rate": 1.514701886729113e-06, + "loss": 1.3971707820892334, + "step": 12652 + }, + { + "epoch": 2.3032674979521253, + "grad_norm": 9.875, + "learning_rate": 1.5141889206601488e-06, + "loss": 1.489194393157959, + "step": 12654 + }, + { + "epoch": 2.3036315645763175, + "grad_norm": 11.9375, + "learning_rate": 1.513676172627939e-06, + "loss": 1.4018131494522095, + "step": 12656 + }, + { + "epoch": 2.3039956312005097, + "grad_norm": 25.125, + "learning_rate": 1.5131636427077274e-06, + "loss": 1.383791208267212, + "step": 12658 + }, + { + "epoch": 2.304359697824702, + "grad_norm": 7.21875, + "learning_rate": 1.5126513309747255e-06, + "loss": 0.8677065372467041, + "step": 12660 + }, + { + "epoch": 2.304723764448894, + "grad_norm": 21.5, + "learning_rate": 1.512139237504113e-06, + "loss": 1.1065874099731445, + "step": 12662 + }, + { + "epoch": 2.3050878310730862, + "grad_norm": 4.0, + "learning_rate": 1.5116273623710375e-06, + "loss": 0.5159171223640442, + "step": 12664 + }, + { + "epoch": 2.3054518976972784, + "grad_norm": 10.5, + "learning_rate": 1.5111157056506155e-06, + "loss": 1.3095202445983887, + "step": 12666 + }, + { + "epoch": 2.3058159643214706, + "grad_norm": 12.125, + "learning_rate": 1.510604267417929e-06, + "loss": 1.2780394554138184, + "step": 12668 + }, + { + "epoch": 2.3061800309456633, + "grad_norm": 7.0625, + "learning_rate": 1.5100930477480305e-06, + "loss": 1.5795389413833618, + "step": 12670 + }, + { + "epoch": 2.3065440975698555, + "grad_norm": 7.21875, + "learning_rate": 1.5095820467159391e-06, + "loss": 0.9675000309944153, + "step": 12672 + }, + { + "epoch": 2.3069081641940477, + "grad_norm": 7.96875, + "learning_rate": 1.5090712643966423e-06, + "loss": 1.3001408576965332, + "step": 12674 + }, + { + "epoch": 2.30727223081824, + "grad_norm": 10.9375, + "learning_rate": 1.5085607008650955e-06, + "loss": 1.3690695762634277, + "step": 12676 + }, + { + "epoch": 2.307636297442432, + "grad_norm": 5.5625, + "learning_rate": 1.5080503561962212e-06, + "loss": 1.2771575450897217, + "step": 12678 + }, + { + "epoch": 2.3080003640666242, + "grad_norm": 7.625, + "learning_rate": 1.507540230464911e-06, + "loss": 1.159590721130371, + "step": 12680 + }, + { + "epoch": 2.3083644306908164, + "grad_norm": 13.875, + "learning_rate": 1.5070303237460235e-06, + "loss": 1.3532944917678833, + "step": 12682 + }, + { + "epoch": 2.3087284973150086, + "grad_norm": 26.875, + "learning_rate": 1.5065206361143852e-06, + "loss": 1.2736201286315918, + "step": 12684 + }, + { + "epoch": 2.309092563939201, + "grad_norm": 60.5, + "learning_rate": 1.5060111676447914e-06, + "loss": 0.7054938077926636, + "step": 12686 + }, + { + "epoch": 2.309456630563393, + "grad_norm": 7.09375, + "learning_rate": 1.505501918412004e-06, + "loss": 1.06803297996521, + "step": 12688 + }, + { + "epoch": 2.309820697187585, + "grad_norm": 9.6875, + "learning_rate": 1.5049928884907536e-06, + "loss": 1.3874268531799316, + "step": 12690 + }, + { + "epoch": 2.3101847638117774, + "grad_norm": 10.875, + "learning_rate": 1.5044840779557379e-06, + "loss": 1.5525838136672974, + "step": 12692 + }, + { + "epoch": 2.3105488304359696, + "grad_norm": 29.125, + "learning_rate": 1.5039754868816227e-06, + "loss": 1.3969459533691406, + "step": 12694 + }, + { + "epoch": 2.3109128970601622, + "grad_norm": 13.875, + "learning_rate": 1.5034671153430425e-06, + "loss": 1.265852451324463, + "step": 12696 + }, + { + "epoch": 2.3112769636843544, + "grad_norm": 27.875, + "learning_rate": 1.502958963414598e-06, + "loss": 1.1846003532409668, + "step": 12698 + }, + { + "epoch": 2.3116410303085466, + "grad_norm": 17.5, + "learning_rate": 1.5024510311708583e-06, + "loss": 1.091532826423645, + "step": 12700 + }, + { + "epoch": 2.312005096932739, + "grad_norm": 7.15625, + "learning_rate": 1.5019433186863612e-06, + "loss": 1.253321886062622, + "step": 12702 + }, + { + "epoch": 2.312369163556931, + "grad_norm": 8.625, + "learning_rate": 1.5014358260356106e-06, + "loss": 1.3607995510101318, + "step": 12704 + }, + { + "epoch": 2.312733230181123, + "grad_norm": 8.6875, + "learning_rate": 1.5009285532930796e-06, + "loss": 1.1928050518035889, + "step": 12706 + }, + { + "epoch": 2.3130972968053154, + "grad_norm": 5.78125, + "learning_rate": 1.5004215005332082e-06, + "loss": 1.1566638946533203, + "step": 12708 + }, + { + "epoch": 2.3134613634295076, + "grad_norm": 16.375, + "learning_rate": 1.4999146678304044e-06, + "loss": 1.54642653465271, + "step": 12710 + }, + { + "epoch": 2.3138254300537, + "grad_norm": 12.75, + "learning_rate": 1.4994080552590437e-06, + "loss": 1.730119228363037, + "step": 12712 + }, + { + "epoch": 2.314189496677892, + "grad_norm": 22.375, + "learning_rate": 1.4989016628934695e-06, + "loss": 1.4265369176864624, + "step": 12714 + }, + { + "epoch": 2.314553563302084, + "grad_norm": 43.5, + "learning_rate": 1.4983954908079929e-06, + "loss": 1.4489706754684448, + "step": 12716 + }, + { + "epoch": 2.3149176299262764, + "grad_norm": 5.9375, + "learning_rate": 1.4978895390768925e-06, + "loss": 1.1474939584732056, + "step": 12718 + }, + { + "epoch": 2.3152816965504686, + "grad_norm": 12.8125, + "learning_rate": 1.497383807774415e-06, + "loss": 1.4372098445892334, + "step": 12720 + }, + { + "epoch": 2.315645763174661, + "grad_norm": 9.375, + "learning_rate": 1.4968782969747736e-06, + "loss": 1.4117785692214966, + "step": 12722 + }, + { + "epoch": 2.316009829798853, + "grad_norm": 19.5, + "learning_rate": 1.496373006752151e-06, + "loss": 1.0667873620986938, + "step": 12724 + }, + { + "epoch": 2.3163738964230456, + "grad_norm": 7.9375, + "learning_rate": 1.4958679371806956e-06, + "loss": 1.2744520902633667, + "step": 12726 + }, + { + "epoch": 2.316737963047238, + "grad_norm": 15.0, + "learning_rate": 1.495363088334525e-06, + "loss": 1.5304172039031982, + "step": 12728 + }, + { + "epoch": 2.31710202967143, + "grad_norm": 14.5625, + "learning_rate": 1.4948584602877233e-06, + "loss": 1.7262049913406372, + "step": 12730 + }, + { + "epoch": 2.317466096295622, + "grad_norm": 21.75, + "learning_rate": 1.4943540531143428e-06, + "loss": 1.7382382154464722, + "step": 12732 + }, + { + "epoch": 2.3178301629198144, + "grad_norm": 11.1875, + "learning_rate": 1.493849866888403e-06, + "loss": 1.9376696348190308, + "step": 12734 + }, + { + "epoch": 2.3181942295440066, + "grad_norm": 4.15625, + "learning_rate": 1.4933459016838914e-06, + "loss": 1.2478326559066772, + "step": 12736 + }, + { + "epoch": 2.3185582961681988, + "grad_norm": 6.75, + "learning_rate": 1.492842157574763e-06, + "loss": 1.0598152875900269, + "step": 12738 + }, + { + "epoch": 2.318922362792391, + "grad_norm": 27.0, + "learning_rate": 1.4923386346349398e-06, + "loss": 1.518973708152771, + "step": 12740 + }, + { + "epoch": 2.319286429416583, + "grad_norm": 56.0, + "learning_rate": 1.4918353329383117e-06, + "loss": 1.2738940715789795, + "step": 12742 + }, + { + "epoch": 2.3196504960407753, + "grad_norm": 12.125, + "learning_rate": 1.491332252558737e-06, + "loss": 1.4766879081726074, + "step": 12744 + }, + { + "epoch": 2.3200145626649675, + "grad_norm": 14.625, + "learning_rate": 1.4908293935700398e-06, + "loss": 1.2525542974472046, + "step": 12746 + }, + { + "epoch": 2.3203786292891597, + "grad_norm": 6.5625, + "learning_rate": 1.4903267560460134e-06, + "loss": 1.22401762008667, + "step": 12748 + }, + { + "epoch": 2.320742695913352, + "grad_norm": 7.90625, + "learning_rate": 1.4898243400604169e-06, + "loss": 1.0784289836883545, + "step": 12750 + }, + { + "epoch": 2.3211067625375446, + "grad_norm": 8.375, + "learning_rate": 1.4893221456869783e-06, + "loss": 1.6045362949371338, + "step": 12752 + }, + { + "epoch": 2.3214708291617367, + "grad_norm": 6.25, + "learning_rate": 1.4888201729993925e-06, + "loss": 1.3778455257415771, + "step": 12754 + }, + { + "epoch": 2.321834895785929, + "grad_norm": 11.9375, + "learning_rate": 1.4883184220713224e-06, + "loss": 1.2218191623687744, + "step": 12756 + }, + { + "epoch": 2.322198962410121, + "grad_norm": 27.25, + "learning_rate": 1.4878168929763972e-06, + "loss": 1.3763898611068726, + "step": 12758 + }, + { + "epoch": 2.3225630290343133, + "grad_norm": 10.5625, + "learning_rate": 1.4873155857882148e-06, + "loss": 1.164544701576233, + "step": 12760 + }, + { + "epoch": 2.3229270956585055, + "grad_norm": 17.125, + "learning_rate": 1.48681450058034e-06, + "loss": 0.44405442476272583, + "step": 12762 + }, + { + "epoch": 2.3232911622826977, + "grad_norm": 7.8125, + "learning_rate": 1.4863136374263044e-06, + "loss": 1.2949330806732178, + "step": 12764 + }, + { + "epoch": 2.32365522890689, + "grad_norm": 6.5625, + "learning_rate": 1.4858129963996083e-06, + "loss": 1.571161150932312, + "step": 12766 + }, + { + "epoch": 2.324019295531082, + "grad_norm": 33.5, + "learning_rate": 1.4853125775737187e-06, + "loss": 1.4868299961090088, + "step": 12768 + }, + { + "epoch": 2.3243833621552743, + "grad_norm": 18.875, + "learning_rate": 1.4848123810220693e-06, + "loss": 1.7010656595230103, + "step": 12770 + }, + { + "epoch": 2.3247474287794665, + "grad_norm": 9.6875, + "learning_rate": 1.4843124068180632e-06, + "loss": 1.5108423233032227, + "step": 12772 + }, + { + "epoch": 2.3251114954036587, + "grad_norm": 10.0625, + "learning_rate": 1.4838126550350684e-06, + "loss": 1.2086381912231445, + "step": 12774 + }, + { + "epoch": 2.325475562027851, + "grad_norm": 15.8125, + "learning_rate": 1.483313125746422e-06, + "loss": 1.6754951477050781, + "step": 12776 + }, + { + "epoch": 2.3258396286520435, + "grad_norm": 14.0625, + "learning_rate": 1.4828138190254276e-06, + "loss": 1.8837347030639648, + "step": 12778 + }, + { + "epoch": 2.3262036952762357, + "grad_norm": 13.5625, + "learning_rate": 1.482314734945357e-06, + "loss": 1.5949786901474, + "step": 12780 + }, + { + "epoch": 2.326567761900428, + "grad_norm": 5.5, + "learning_rate": 1.4818158735794483e-06, + "loss": 1.2288943529129028, + "step": 12782 + }, + { + "epoch": 2.32693182852462, + "grad_norm": 16.5, + "learning_rate": 1.4813172350009074e-06, + "loss": 0.8689623475074768, + "step": 12784 + }, + { + "epoch": 2.3272958951488123, + "grad_norm": 18.125, + "learning_rate": 1.4808188192829076e-06, + "loss": 0.6625009775161743, + "step": 12786 + }, + { + "epoch": 2.3276599617730045, + "grad_norm": 21.625, + "learning_rate": 1.4803206264985891e-06, + "loss": 1.7881476879119873, + "step": 12788 + }, + { + "epoch": 2.3280240283971967, + "grad_norm": 9.1875, + "learning_rate": 1.4798226567210605e-06, + "loss": 1.4295833110809326, + "step": 12790 + }, + { + "epoch": 2.328388095021389, + "grad_norm": 13.0625, + "learning_rate": 1.4793249100233962e-06, + "loss": 1.4551790952682495, + "step": 12792 + }, + { + "epoch": 2.328752161645581, + "grad_norm": 6.6875, + "learning_rate": 1.4788273864786382e-06, + "loss": 1.314556360244751, + "step": 12794 + }, + { + "epoch": 2.3291162282697733, + "grad_norm": 6.96875, + "learning_rate": 1.4783300861597965e-06, + "loss": 1.07609224319458, + "step": 12796 + }, + { + "epoch": 2.3294802948939655, + "grad_norm": 7.9375, + "learning_rate": 1.4778330091398482e-06, + "loss": 0.8711978793144226, + "step": 12798 + }, + { + "epoch": 2.3298443615181577, + "grad_norm": 30.875, + "learning_rate": 1.4773361554917367e-06, + "loss": 1.1399784088134766, + "step": 12800 + }, + { + "epoch": 2.33020842814235, + "grad_norm": 17.5, + "learning_rate": 1.4768395252883737e-06, + "loss": 1.6996500492095947, + "step": 12802 + }, + { + "epoch": 2.3305724947665425, + "grad_norm": 16.5, + "learning_rate": 1.4763431186026378e-06, + "loss": 1.6008769273757935, + "step": 12804 + }, + { + "epoch": 2.3309365613907347, + "grad_norm": 12.0, + "learning_rate": 1.475846935507374e-06, + "loss": 1.4288322925567627, + "step": 12806 + }, + { + "epoch": 2.331300628014927, + "grad_norm": 12.75, + "learning_rate": 1.4753509760753956e-06, + "loss": 1.8797374963760376, + "step": 12808 + }, + { + "epoch": 2.331664694639119, + "grad_norm": 10.375, + "learning_rate": 1.4748552403794827e-06, + "loss": 1.3842031955718994, + "step": 12810 + }, + { + "epoch": 2.3320287612633113, + "grad_norm": 23.875, + "learning_rate": 1.4743597284923824e-06, + "loss": 1.2714418172836304, + "step": 12812 + }, + { + "epoch": 2.3323928278875035, + "grad_norm": 73.5, + "learning_rate": 1.473864440486809e-06, + "loss": 0.7885069847106934, + "step": 12814 + }, + { + "epoch": 2.3327568945116957, + "grad_norm": 4.25, + "learning_rate": 1.4733693764354442e-06, + "loss": 0.9025139808654785, + "step": 12816 + }, + { + "epoch": 2.333120961135888, + "grad_norm": 19.625, + "learning_rate": 1.4728745364109364e-06, + "loss": 1.0751862525939941, + "step": 12818 + }, + { + "epoch": 2.33348502776008, + "grad_norm": 8.375, + "learning_rate": 1.4723799204859016e-06, + "loss": 1.555716633796692, + "step": 12820 + }, + { + "epoch": 2.3338490943842722, + "grad_norm": 13.0, + "learning_rate": 1.4718855287329226e-06, + "loss": 1.2870285511016846, + "step": 12822 + }, + { + "epoch": 2.3342131610084644, + "grad_norm": 15.0, + "learning_rate": 1.4713913612245492e-06, + "loss": 1.0989220142364502, + "step": 12824 + }, + { + "epoch": 2.3345772276326566, + "grad_norm": 27.25, + "learning_rate": 1.470897418033299e-06, + "loss": 1.764319658279419, + "step": 12826 + }, + { + "epoch": 2.334941294256849, + "grad_norm": 4.78125, + "learning_rate": 1.470403699231655e-06, + "loss": 0.9202785491943359, + "step": 12828 + }, + { + "epoch": 2.3353053608810415, + "grad_norm": 19.0, + "learning_rate": 1.46991020489207e-06, + "loss": 1.3015995025634766, + "step": 12830 + }, + { + "epoch": 2.335669427505233, + "grad_norm": 124.0, + "learning_rate": 1.469416935086961e-06, + "loss": 1.7507283687591553, + "step": 12832 + }, + { + "epoch": 2.336033494129426, + "grad_norm": 6.3125, + "learning_rate": 1.4689238898887144e-06, + "loss": 1.1765103340148926, + "step": 12834 + }, + { + "epoch": 2.336397560753618, + "grad_norm": 11.3125, + "learning_rate": 1.4684310693696815e-06, + "loss": 1.5808298587799072, + "step": 12836 + }, + { + "epoch": 2.3367616273778102, + "grad_norm": 13.25, + "learning_rate": 1.4679384736021827e-06, + "loss": 1.6481205224990845, + "step": 12838 + }, + { + "epoch": 2.3371256940020024, + "grad_norm": 13.1875, + "learning_rate": 1.4674461026585038e-06, + "loss": 1.370032548904419, + "step": 12840 + }, + { + "epoch": 2.3374897606261946, + "grad_norm": 27.25, + "learning_rate": 1.466953956610898e-06, + "loss": 1.4947459697723389, + "step": 12842 + }, + { + "epoch": 2.337853827250387, + "grad_norm": 11.375, + "learning_rate": 1.466462035531587e-06, + "loss": 1.8175482749938965, + "step": 12844 + }, + { + "epoch": 2.338217893874579, + "grad_norm": 10.875, + "learning_rate": 1.465970339492757e-06, + "loss": 1.192372441291809, + "step": 12846 + }, + { + "epoch": 2.338581960498771, + "grad_norm": 30.5, + "learning_rate": 1.4654788685665627e-06, + "loss": 0.791659951210022, + "step": 12848 + }, + { + "epoch": 2.3389460271229634, + "grad_norm": 39.5, + "learning_rate": 1.4649876228251259e-06, + "loss": 1.5135085582733154, + "step": 12850 + }, + { + "epoch": 2.3393100937471556, + "grad_norm": 22.75, + "learning_rate": 1.464496602340534e-06, + "loss": 1.6889326572418213, + "step": 12852 + }, + { + "epoch": 2.339674160371348, + "grad_norm": 129.0, + "learning_rate": 1.4640058071848434e-06, + "loss": 1.2184324264526367, + "step": 12854 + }, + { + "epoch": 2.3400382269955404, + "grad_norm": 32.0, + "learning_rate": 1.4635152374300754e-06, + "loss": 1.1586614847183228, + "step": 12856 + }, + { + "epoch": 2.340402293619732, + "grad_norm": 15.3125, + "learning_rate": 1.46302489314822e-06, + "loss": 1.2337902784347534, + "step": 12858 + }, + { + "epoch": 2.340766360243925, + "grad_norm": 9.0, + "learning_rate": 1.4625347744112323e-06, + "loss": 1.2980743646621704, + "step": 12860 + }, + { + "epoch": 2.341130426868117, + "grad_norm": 19.375, + "learning_rate": 1.4620448812910357e-06, + "loss": 1.7018345594406128, + "step": 12862 + }, + { + "epoch": 2.341494493492309, + "grad_norm": 29.625, + "learning_rate": 1.46155521385952e-06, + "loss": 2.0326168537139893, + "step": 12864 + }, + { + "epoch": 2.3418585601165014, + "grad_norm": 25.5, + "learning_rate": 1.461065772188542e-06, + "loss": 1.3540562391281128, + "step": 12866 + }, + { + "epoch": 2.3422226267406936, + "grad_norm": 26.75, + "learning_rate": 1.460576556349925e-06, + "loss": 1.3052375316619873, + "step": 12868 + }, + { + "epoch": 2.342586693364886, + "grad_norm": 28.625, + "learning_rate": 1.46008756641546e-06, + "loss": 2.268367290496826, + "step": 12870 + }, + { + "epoch": 2.342950759989078, + "grad_norm": 22.0, + "learning_rate": 1.4595988024569032e-06, + "loss": 1.1996707916259766, + "step": 12872 + }, + { + "epoch": 2.34331482661327, + "grad_norm": 13.0625, + "learning_rate": 1.4591102645459798e-06, + "loss": 0.9140642285346985, + "step": 12874 + }, + { + "epoch": 2.3436788932374624, + "grad_norm": 32.75, + "learning_rate": 1.4586219527543808e-06, + "loss": 1.5103952884674072, + "step": 12876 + }, + { + "epoch": 2.3440429598616546, + "grad_norm": 18.375, + "learning_rate": 1.458133867153763e-06, + "loss": 1.570326566696167, + "step": 12878 + }, + { + "epoch": 2.3444070264858468, + "grad_norm": 18.625, + "learning_rate": 1.4576460078157518e-06, + "loss": 1.4378412961959839, + "step": 12880 + }, + { + "epoch": 2.344771093110039, + "grad_norm": 11.0, + "learning_rate": 1.4571583748119382e-06, + "loss": 1.5854227542877197, + "step": 12882 + }, + { + "epoch": 2.345135159734231, + "grad_norm": 9.8125, + "learning_rate": 1.4566709682138808e-06, + "loss": 1.4638748168945312, + "step": 12884 + }, + { + "epoch": 2.345499226358424, + "grad_norm": 5.9375, + "learning_rate": 1.456183788093104e-06, + "loss": 0.9610429406166077, + "step": 12886 + }, + { + "epoch": 2.345863292982616, + "grad_norm": 37.0, + "learning_rate": 1.4556968345210998e-06, + "loss": 0.9435508251190186, + "step": 12888 + }, + { + "epoch": 2.346227359606808, + "grad_norm": 13.125, + "learning_rate": 1.4552101075693268e-06, + "loss": 0.8131308555603027, + "step": 12890 + }, + { + "epoch": 2.3465914262310004, + "grad_norm": 9.3125, + "learning_rate": 1.4547236073092096e-06, + "loss": 1.4127264022827148, + "step": 12892 + }, + { + "epoch": 2.3469554928551926, + "grad_norm": 13.9375, + "learning_rate": 1.454237333812141e-06, + "loss": 1.47605562210083, + "step": 12894 + }, + { + "epoch": 2.3473195594793848, + "grad_norm": 17.0, + "learning_rate": 1.453751287149479e-06, + "loss": 1.350528597831726, + "step": 12896 + }, + { + "epoch": 2.347683626103577, + "grad_norm": 10.1875, + "learning_rate": 1.4532654673925495e-06, + "loss": 1.346376895904541, + "step": 12898 + }, + { + "epoch": 2.348047692727769, + "grad_norm": 6.40625, + "learning_rate": 1.4527798746126442e-06, + "loss": 1.2344372272491455, + "step": 12900 + }, + { + "epoch": 2.3484117593519613, + "grad_norm": 3.375, + "learning_rate": 1.4522945088810217e-06, + "loss": 0.715396523475647, + "step": 12902 + }, + { + "epoch": 2.3487758259761535, + "grad_norm": 7.28125, + "learning_rate": 1.4518093702689079e-06, + "loss": 1.3802067041397095, + "step": 12904 + }, + { + "epoch": 2.3491398926003457, + "grad_norm": 9.4375, + "learning_rate": 1.4513244588474948e-06, + "loss": 1.1022003889083862, + "step": 12906 + }, + { + "epoch": 2.349503959224538, + "grad_norm": 13.25, + "learning_rate": 1.4508397746879411e-06, + "loss": 1.291582703590393, + "step": 12908 + }, + { + "epoch": 2.34986802584873, + "grad_norm": 8.5625, + "learning_rate": 1.450355317861372e-06, + "loss": 1.2321122884750366, + "step": 12910 + }, + { + "epoch": 2.3502320924729228, + "grad_norm": 10.25, + "learning_rate": 1.44987108843888e-06, + "loss": 0.9633653163909912, + "step": 12912 + }, + { + "epoch": 2.350596159097115, + "grad_norm": 11.25, + "learning_rate": 1.449387086491524e-06, + "loss": 1.5249788761138916, + "step": 12914 + }, + { + "epoch": 2.350960225721307, + "grad_norm": 9.375, + "learning_rate": 1.4489033120903284e-06, + "loss": 1.3060550689697266, + "step": 12916 + }, + { + "epoch": 2.3513242923454993, + "grad_norm": 14.625, + "learning_rate": 1.4484197653062863e-06, + "loss": 1.2838326692581177, + "step": 12918 + }, + { + "epoch": 2.3516883589696915, + "grad_norm": 6.09375, + "learning_rate": 1.4479364462103551e-06, + "loss": 1.2784196138381958, + "step": 12920 + }, + { + "epoch": 2.3520524255938837, + "grad_norm": 21.875, + "learning_rate": 1.4474533548734607e-06, + "loss": 1.5399810075759888, + "step": 12922 + }, + { + "epoch": 2.352416492218076, + "grad_norm": 16.125, + "learning_rate": 1.4469704913664947e-06, + "loss": 1.7008895874023438, + "step": 12924 + }, + { + "epoch": 2.352780558842268, + "grad_norm": 26.125, + "learning_rate": 1.446487855760315e-06, + "loss": 1.6886892318725586, + "step": 12926 + }, + { + "epoch": 2.3531446254664603, + "grad_norm": 19.5, + "learning_rate": 1.4460054481257468e-06, + "loss": 0.9687001705169678, + "step": 12928 + }, + { + "epoch": 2.3535086920906525, + "grad_norm": 15.6875, + "learning_rate": 1.445523268533581e-06, + "loss": 1.6733012199401855, + "step": 12930 + }, + { + "epoch": 2.3538727587148447, + "grad_norm": 20.75, + "learning_rate": 1.445041317054576e-06, + "loss": 2.175306797027588, + "step": 12932 + }, + { + "epoch": 2.354236825339037, + "grad_norm": 18.25, + "learning_rate": 1.4445595937594558e-06, + "loss": 1.5332789421081543, + "step": 12934 + }, + { + "epoch": 2.354600891963229, + "grad_norm": 16.125, + "learning_rate": 1.4440780987189118e-06, + "loss": 1.5125454664230347, + "step": 12936 + }, + { + "epoch": 2.3549649585874217, + "grad_norm": 13.6875, + "learning_rate": 1.4435968320036014e-06, + "loss": 1.656247615814209, + "step": 12938 + }, + { + "epoch": 2.355329025211614, + "grad_norm": 18.125, + "learning_rate": 1.443115793684148e-06, + "loss": 1.7316515445709229, + "step": 12940 + }, + { + "epoch": 2.355693091835806, + "grad_norm": 11.0625, + "learning_rate": 1.4426349838311427e-06, + "loss": 1.5492905378341675, + "step": 12942 + }, + { + "epoch": 2.3560571584599983, + "grad_norm": 11.125, + "learning_rate": 1.4421544025151418e-06, + "loss": 1.2327100038528442, + "step": 12944 + }, + { + "epoch": 2.3564212250841905, + "grad_norm": 17.75, + "learning_rate": 1.4416740498066692e-06, + "loss": 1.5096063613891602, + "step": 12946 + }, + { + "epoch": 2.3567852917083827, + "grad_norm": 13.5625, + "learning_rate": 1.4411939257762142e-06, + "loss": 1.390622854232788, + "step": 12948 + }, + { + "epoch": 2.357149358332575, + "grad_norm": 14.3125, + "learning_rate": 1.4407140304942332e-06, + "loss": 1.573535442352295, + "step": 12950 + }, + { + "epoch": 2.357513424956767, + "grad_norm": 16.25, + "learning_rate": 1.4402343640311491e-06, + "loss": 1.849221110343933, + "step": 12952 + }, + { + "epoch": 2.3578774915809593, + "grad_norm": 9.875, + "learning_rate": 1.439754926457351e-06, + "loss": 1.3351898193359375, + "step": 12954 + }, + { + "epoch": 2.3582415582051515, + "grad_norm": 14.4375, + "learning_rate": 1.4392757178431947e-06, + "loss": 1.5842418670654297, + "step": 12956 + }, + { + "epoch": 2.3586056248293437, + "grad_norm": 7.8125, + "learning_rate": 1.438796738259001e-06, + "loss": 1.1496270895004272, + "step": 12958 + }, + { + "epoch": 2.358969691453536, + "grad_norm": 40.0, + "learning_rate": 1.4383179877750595e-06, + "loss": 0.5406174659729004, + "step": 12960 + }, + { + "epoch": 2.359333758077728, + "grad_norm": 9.875, + "learning_rate": 1.437839466461624e-06, + "loss": 1.2391849756240845, + "step": 12962 + }, + { + "epoch": 2.3596978247019207, + "grad_norm": 16.625, + "learning_rate": 1.437361174388916e-06, + "loss": 1.6558117866516113, + "step": 12964 + }, + { + "epoch": 2.3600618913261124, + "grad_norm": 10.875, + "learning_rate": 1.436883111627123e-06, + "loss": 1.1513111591339111, + "step": 12966 + }, + { + "epoch": 2.360425957950305, + "grad_norm": 12.25, + "learning_rate": 1.4364052782463985e-06, + "loss": 1.433065414428711, + "step": 12968 + }, + { + "epoch": 2.3607900245744973, + "grad_norm": 12.0, + "learning_rate": 1.4359276743168626e-06, + "loss": 1.4959712028503418, + "step": 12970 + }, + { + "epoch": 2.3611540911986895, + "grad_norm": 14.6875, + "learning_rate": 1.435450299908602e-06, + "loss": 1.7500238418579102, + "step": 12972 + }, + { + "epoch": 2.3615181578228817, + "grad_norm": 29.375, + "learning_rate": 1.4349731550916692e-06, + "loss": 1.6094770431518555, + "step": 12974 + }, + { + "epoch": 2.361882224447074, + "grad_norm": 7.0625, + "learning_rate": 1.4344962399360836e-06, + "loss": 0.9987369775772095, + "step": 12976 + }, + { + "epoch": 2.362246291071266, + "grad_norm": 22.25, + "learning_rate": 1.4340195545118304e-06, + "loss": 1.464593768119812, + "step": 12978 + }, + { + "epoch": 2.3626103576954582, + "grad_norm": 12.125, + "learning_rate": 1.433543098888861e-06, + "loss": 0.6411446928977966, + "step": 12980 + }, + { + "epoch": 2.3629744243196504, + "grad_norm": 11.625, + "learning_rate": 1.4330668731370937e-06, + "loss": 1.3931729793548584, + "step": 12982 + }, + { + "epoch": 2.3633384909438426, + "grad_norm": 51.5, + "learning_rate": 1.4325908773264125e-06, + "loss": 1.8375945091247559, + "step": 12984 + }, + { + "epoch": 2.363702557568035, + "grad_norm": 11.1875, + "learning_rate": 1.4321151115266676e-06, + "loss": 1.5373510122299194, + "step": 12986 + }, + { + "epoch": 2.364066624192227, + "grad_norm": 11.1875, + "learning_rate": 1.4316395758076765e-06, + "loss": 1.1396845579147339, + "step": 12988 + }, + { + "epoch": 2.364430690816419, + "grad_norm": 16.5, + "learning_rate": 1.4311642702392215e-06, + "loss": 0.9452848434448242, + "step": 12990 + }, + { + "epoch": 2.3647947574406114, + "grad_norm": 7.46875, + "learning_rate": 1.4306891948910517e-06, + "loss": 1.2993203401565552, + "step": 12992 + }, + { + "epoch": 2.365158824064804, + "grad_norm": 3.6875, + "learning_rate": 1.4302143498328828e-06, + "loss": 1.0312904119491577, + "step": 12994 + }, + { + "epoch": 2.3655228906889962, + "grad_norm": 8.3125, + "learning_rate": 1.4297397351343965e-06, + "loss": 1.2052390575408936, + "step": 12996 + }, + { + "epoch": 2.3658869573131884, + "grad_norm": 32.0, + "learning_rate": 1.4292653508652398e-06, + "loss": 1.3997597694396973, + "step": 12998 + }, + { + "epoch": 2.3662510239373806, + "grad_norm": 9.4375, + "learning_rate": 1.4287911970950275e-06, + "loss": 1.3373398780822754, + "step": 13000 + }, + { + "epoch": 2.366615090561573, + "grad_norm": 12.0, + "learning_rate": 1.4283172738933396e-06, + "loss": 1.4318392276763916, + "step": 13002 + }, + { + "epoch": 2.366979157185765, + "grad_norm": 7.0, + "learning_rate": 1.4278435813297223e-06, + "loss": 1.2079908847808838, + "step": 13004 + }, + { + "epoch": 2.367343223809957, + "grad_norm": 187.0, + "learning_rate": 1.427370119473688e-06, + "loss": 1.459062933921814, + "step": 13006 + }, + { + "epoch": 2.3677072904341494, + "grad_norm": 9.125, + "learning_rate": 1.4268968883947154e-06, + "loss": 1.410860300064087, + "step": 13008 + }, + { + "epoch": 2.3680713570583416, + "grad_norm": 44.0, + "learning_rate": 1.4264238881622492e-06, + "loss": 1.0161633491516113, + "step": 13010 + }, + { + "epoch": 2.368435423682534, + "grad_norm": 6.96875, + "learning_rate": 1.4259511188456998e-06, + "loss": 1.3491284847259521, + "step": 13012 + }, + { + "epoch": 2.368799490306726, + "grad_norm": 8.0625, + "learning_rate": 1.4254785805144452e-06, + "loss": 1.3301050662994385, + "step": 13014 + }, + { + "epoch": 2.369163556930918, + "grad_norm": 15.0, + "learning_rate": 1.425006273237828e-06, + "loss": 1.3427374362945557, + "step": 13016 + }, + { + "epoch": 2.3695276235551104, + "grad_norm": 16.125, + "learning_rate": 1.4245341970851568e-06, + "loss": 1.4216352701187134, + "step": 13018 + }, + { + "epoch": 2.369891690179303, + "grad_norm": 37.0, + "learning_rate": 1.424062352125708e-06, + "loss": 1.4281244277954102, + "step": 13020 + }, + { + "epoch": 2.370255756803495, + "grad_norm": 12.0, + "learning_rate": 1.4235907384287218e-06, + "loss": 1.3793773651123047, + "step": 13022 + }, + { + "epoch": 2.3706198234276874, + "grad_norm": 11.0, + "learning_rate": 1.423119356063406e-06, + "loss": 1.1660332679748535, + "step": 13024 + }, + { + "epoch": 2.3709838900518796, + "grad_norm": 6.0, + "learning_rate": 1.4226482050989345e-06, + "loss": 1.2736250162124634, + "step": 13026 + }, + { + "epoch": 2.371347956676072, + "grad_norm": 9.1875, + "learning_rate": 1.4221772856044467e-06, + "loss": 1.399125337600708, + "step": 13028 + }, + { + "epoch": 2.371712023300264, + "grad_norm": 16.875, + "learning_rate": 1.4217065976490474e-06, + "loss": 1.475401759147644, + "step": 13030 + }, + { + "epoch": 2.372076089924456, + "grad_norm": 13.9375, + "learning_rate": 1.4212361413018088e-06, + "loss": 1.6186275482177734, + "step": 13032 + }, + { + "epoch": 2.3724401565486484, + "grad_norm": 9.5625, + "learning_rate": 1.4207659166317683e-06, + "loss": 1.314518928527832, + "step": 13034 + }, + { + "epoch": 2.3728042231728406, + "grad_norm": 16.375, + "learning_rate": 1.4202959237079295e-06, + "loss": 1.5972141027450562, + "step": 13036 + }, + { + "epoch": 2.3731682897970328, + "grad_norm": 6.71875, + "learning_rate": 1.4198261625992618e-06, + "loss": 1.229283094406128, + "step": 13038 + }, + { + "epoch": 2.373532356421225, + "grad_norm": 19.625, + "learning_rate": 1.4193566333747012e-06, + "loss": 1.0320490598678589, + "step": 13040 + }, + { + "epoch": 2.373896423045417, + "grad_norm": 6.34375, + "learning_rate": 1.4188873361031482e-06, + "loss": 1.1893830299377441, + "step": 13042 + }, + { + "epoch": 2.3742604896696093, + "grad_norm": 10.0, + "learning_rate": 1.4184182708534713e-06, + "loss": 1.5255910158157349, + "step": 13044 + }, + { + "epoch": 2.374624556293802, + "grad_norm": 7.125, + "learning_rate": 1.4179494376945036e-06, + "loss": 1.460092306137085, + "step": 13046 + }, + { + "epoch": 2.374988622917994, + "grad_norm": 24.125, + "learning_rate": 1.4174808366950442e-06, + "loss": 1.7898707389831543, + "step": 13048 + }, + { + "epoch": 2.3753526895421864, + "grad_norm": 36.25, + "learning_rate": 1.4170124679238592e-06, + "loss": 1.7407951354980469, + "step": 13050 + }, + { + "epoch": 2.3757167561663786, + "grad_norm": 11.125, + "learning_rate": 1.4165443314496789e-06, + "loss": 1.8305128812789917, + "step": 13052 + }, + { + "epoch": 2.3760808227905708, + "grad_norm": 10.1875, + "learning_rate": 1.4160764273412008e-06, + "loss": 1.3565661907196045, + "step": 13054 + }, + { + "epoch": 2.376444889414763, + "grad_norm": 26.0, + "learning_rate": 1.4156087556670877e-06, + "loss": 1.6626286506652832, + "step": 13056 + }, + { + "epoch": 2.376808956038955, + "grad_norm": 18.625, + "learning_rate": 1.415141316495969e-06, + "loss": 1.8741836547851562, + "step": 13058 + }, + { + "epoch": 2.3771730226631473, + "grad_norm": 13.0, + "learning_rate": 1.4146741098964389e-06, + "loss": 1.4989471435546875, + "step": 13060 + }, + { + "epoch": 2.3775370892873395, + "grad_norm": 10.5, + "learning_rate": 1.4142071359370587e-06, + "loss": 1.6329776048660278, + "step": 13062 + }, + { + "epoch": 2.3779011559115317, + "grad_norm": 17.625, + "learning_rate": 1.4137403946863547e-06, + "loss": 0.9275729060173035, + "step": 13064 + }, + { + "epoch": 2.378265222535724, + "grad_norm": 692.0, + "learning_rate": 1.4132738862128192e-06, + "loss": 0.6249986886978149, + "step": 13066 + }, + { + "epoch": 2.378629289159916, + "grad_norm": 4.0, + "learning_rate": 1.4128076105849103e-06, + "loss": 1.0405938625335693, + "step": 13068 + }, + { + "epoch": 2.3789933557841083, + "grad_norm": 9.75, + "learning_rate": 1.4123415678710522e-06, + "loss": 1.0645229816436768, + "step": 13070 + }, + { + "epoch": 2.379357422408301, + "grad_norm": 11.1875, + "learning_rate": 1.411875758139635e-06, + "loss": 1.3818163871765137, + "step": 13072 + }, + { + "epoch": 2.3797214890324927, + "grad_norm": 11.25, + "learning_rate": 1.4114101814590143e-06, + "loss": 1.4613906145095825, + "step": 13074 + }, + { + "epoch": 2.3800855556566853, + "grad_norm": 9.625, + "learning_rate": 1.410944837897511e-06, + "loss": 1.4235641956329346, + "step": 13076 + }, + { + "epoch": 2.3804496222808775, + "grad_norm": 5.90625, + "learning_rate": 1.4104797275234131e-06, + "loss": 0.8895462155342102, + "step": 13078 + }, + { + "epoch": 2.3808136889050697, + "grad_norm": 20.5, + "learning_rate": 1.4100148504049736e-06, + "loss": 1.2731987237930298, + "step": 13080 + }, + { + "epoch": 2.381177755529262, + "grad_norm": 10.0, + "learning_rate": 1.4095502066104107e-06, + "loss": 1.770280361175537, + "step": 13082 + }, + { + "epoch": 2.381541822153454, + "grad_norm": 9.125, + "learning_rate": 1.4090857962079099e-06, + "loss": 1.5567989349365234, + "step": 13084 + }, + { + "epoch": 2.3819058887776463, + "grad_norm": 13.6875, + "learning_rate": 1.408621619265621e-06, + "loss": 1.588033676147461, + "step": 13086 + }, + { + "epoch": 2.3822699554018385, + "grad_norm": 14.1875, + "learning_rate": 1.40815767585166e-06, + "loss": 1.3291270732879639, + "step": 13088 + }, + { + "epoch": 2.3826340220260307, + "grad_norm": 14.5625, + "learning_rate": 1.407693966034109e-06, + "loss": 1.4294660091400146, + "step": 13090 + }, + { + "epoch": 2.382998088650223, + "grad_norm": 9.8125, + "learning_rate": 1.4072304898810155e-06, + "loss": 1.2907848358154297, + "step": 13092 + }, + { + "epoch": 2.383362155274415, + "grad_norm": 11.0, + "learning_rate": 1.4067672474603928e-06, + "loss": 0.9829186201095581, + "step": 13094 + }, + { + "epoch": 2.3837262218986073, + "grad_norm": 10.0625, + "learning_rate": 1.4063042388402193e-06, + "loss": 1.4701520204544067, + "step": 13096 + }, + { + "epoch": 2.3840902885228, + "grad_norm": 18.5, + "learning_rate": 1.4058414640884404e-06, + "loss": 1.5229731798171997, + "step": 13098 + }, + { + "epoch": 2.3844543551469917, + "grad_norm": 9.375, + "learning_rate": 1.4053789232729661e-06, + "loss": 1.069821834564209, + "step": 13100 + }, + { + "epoch": 2.3848184217711843, + "grad_norm": 15.5, + "learning_rate": 1.4049166164616724e-06, + "loss": 1.087325930595398, + "step": 13102 + }, + { + "epoch": 2.3851824883953765, + "grad_norm": 23.625, + "learning_rate": 1.4044545437224008e-06, + "loss": 1.6770304441452026, + "step": 13104 + }, + { + "epoch": 2.3855465550195687, + "grad_norm": 10.4375, + "learning_rate": 1.4039927051229584e-06, + "loss": 1.6059675216674805, + "step": 13106 + }, + { + "epoch": 2.385910621643761, + "grad_norm": 18.25, + "learning_rate": 1.4035311007311192e-06, + "loss": 1.6826673746109009, + "step": 13108 + }, + { + "epoch": 2.386274688267953, + "grad_norm": 22.625, + "learning_rate": 1.4030697306146205e-06, + "loss": 1.8292741775512695, + "step": 13110 + }, + { + "epoch": 2.3866387548921453, + "grad_norm": 16.125, + "learning_rate": 1.4026085948411672e-06, + "loss": 1.5285316705703735, + "step": 13112 + }, + { + "epoch": 2.3870028215163375, + "grad_norm": 8.375, + "learning_rate": 1.402147693478429e-06, + "loss": 1.4824053049087524, + "step": 13114 + }, + { + "epoch": 2.3873668881405297, + "grad_norm": 9.1875, + "learning_rate": 1.401687026594041e-06, + "loss": 1.4457030296325684, + "step": 13116 + }, + { + "epoch": 2.387730954764722, + "grad_norm": 4.90625, + "learning_rate": 1.4012265942556046e-06, + "loss": 1.1098920106887817, + "step": 13118 + }, + { + "epoch": 2.388095021388914, + "grad_norm": 11.3125, + "learning_rate": 1.4007663965306863e-06, + "loss": 1.609438180923462, + "step": 13120 + }, + { + "epoch": 2.3884590880131062, + "grad_norm": 7.75, + "learning_rate": 1.4003064334868183e-06, + "loss": 1.530211091041565, + "step": 13122 + }, + { + "epoch": 2.3888231546372984, + "grad_norm": 14.375, + "learning_rate": 1.3998467051914983e-06, + "loss": 1.5139378309249878, + "step": 13124 + }, + { + "epoch": 2.3891872212614906, + "grad_norm": 23.125, + "learning_rate": 1.399387211712189e-06, + "loss": 1.7802497148513794, + "step": 13126 + }, + { + "epoch": 2.3895512878856833, + "grad_norm": 10.5625, + "learning_rate": 1.39892795311632e-06, + "loss": 1.1451196670532227, + "step": 13128 + }, + { + "epoch": 2.3899153545098755, + "grad_norm": 12.8125, + "learning_rate": 1.398468929471285e-06, + "loss": 1.1933187246322632, + "step": 13130 + }, + { + "epoch": 2.3902794211340677, + "grad_norm": 24.625, + "learning_rate": 1.3980101408444446e-06, + "loss": 1.3968465328216553, + "step": 13132 + }, + { + "epoch": 2.39064348775826, + "grad_norm": 14.3125, + "learning_rate": 1.397551587303123e-06, + "loss": 1.8935859203338623, + "step": 13134 + }, + { + "epoch": 2.391007554382452, + "grad_norm": 10.9375, + "learning_rate": 1.3970932689146127e-06, + "loss": 1.4720077514648438, + "step": 13136 + }, + { + "epoch": 2.3913716210066442, + "grad_norm": 6.15625, + "learning_rate": 1.3966351857461688e-06, + "loss": 1.1271402835845947, + "step": 13138 + }, + { + "epoch": 2.3917356876308364, + "grad_norm": 10.3125, + "learning_rate": 1.3961773378650135e-06, + "loss": 1.4992789030075073, + "step": 13140 + }, + { + "epoch": 2.3920997542550286, + "grad_norm": 7.375, + "learning_rate": 1.3957197253383339e-06, + "loss": 1.4290014505386353, + "step": 13142 + }, + { + "epoch": 2.392463820879221, + "grad_norm": 22.125, + "learning_rate": 1.3952623482332833e-06, + "loss": 1.307976245880127, + "step": 13144 + }, + { + "epoch": 2.392827887503413, + "grad_norm": 9.625, + "learning_rate": 1.3948052066169794e-06, + "loss": 1.1577008962631226, + "step": 13146 + }, + { + "epoch": 2.393191954127605, + "grad_norm": 16.5, + "learning_rate": 1.3943483005565068e-06, + "loss": 1.9710172414779663, + "step": 13148 + }, + { + "epoch": 2.3935560207517974, + "grad_norm": 7.90625, + "learning_rate": 1.393891630118913e-06, + "loss": 1.7011219263076782, + "step": 13150 + }, + { + "epoch": 2.3939200873759896, + "grad_norm": 77.0, + "learning_rate": 1.3934351953712145e-06, + "loss": 1.2155952453613281, + "step": 13152 + }, + { + "epoch": 2.3942841540001822, + "grad_norm": 15.625, + "learning_rate": 1.3929789963803897e-06, + "loss": 1.8661668300628662, + "step": 13154 + }, + { + "epoch": 2.3946482206243744, + "grad_norm": 15.0, + "learning_rate": 1.3925230332133844e-06, + "loss": 1.055245041847229, + "step": 13156 + }, + { + "epoch": 2.3950122872485666, + "grad_norm": 9.125, + "learning_rate": 1.3920673059371095e-06, + "loss": 1.388941764831543, + "step": 13158 + }, + { + "epoch": 2.395376353872759, + "grad_norm": 11.6875, + "learning_rate": 1.3916118146184412e-06, + "loss": 1.5239473581314087, + "step": 13160 + }, + { + "epoch": 2.395740420496951, + "grad_norm": 22.375, + "learning_rate": 1.391156559324221e-06, + "loss": 0.7819581031799316, + "step": 13162 + }, + { + "epoch": 2.396104487121143, + "grad_norm": 7.125, + "learning_rate": 1.3907015401212553e-06, + "loss": 0.7420260906219482, + "step": 13164 + }, + { + "epoch": 2.3964685537453354, + "grad_norm": 17.125, + "learning_rate": 1.390246757076317e-06, + "loss": 1.4705394506454468, + "step": 13166 + }, + { + "epoch": 2.3968326203695276, + "grad_norm": 18.625, + "learning_rate": 1.3897922102561433e-06, + "loss": 1.556272268295288, + "step": 13168 + }, + { + "epoch": 2.39719668699372, + "grad_norm": 13.0, + "learning_rate": 1.3893378997274371e-06, + "loss": 1.8583375215530396, + "step": 13170 + }, + { + "epoch": 2.397560753617912, + "grad_norm": 22.125, + "learning_rate": 1.3888838255568666e-06, + "loss": 1.7090548276901245, + "step": 13172 + }, + { + "epoch": 2.397924820242104, + "grad_norm": 16.75, + "learning_rate": 1.3884299878110651e-06, + "loss": 0.9558186531066895, + "step": 13174 + }, + { + "epoch": 2.3982888868662964, + "grad_norm": 12.125, + "learning_rate": 1.3879763865566323e-06, + "loss": 1.4936777353286743, + "step": 13176 + }, + { + "epoch": 2.3986529534904886, + "grad_norm": 12.8125, + "learning_rate": 1.3875230218601315e-06, + "loss": 1.2443791627883911, + "step": 13178 + }, + { + "epoch": 2.399017020114681, + "grad_norm": 3.15625, + "learning_rate": 1.3870698937880928e-06, + "loss": 1.115652084350586, + "step": 13180 + }, + { + "epoch": 2.3993810867388734, + "grad_norm": 31.375, + "learning_rate": 1.3866170024070102e-06, + "loss": 1.2917252779006958, + "step": 13182 + }, + { + "epoch": 2.3997451533630656, + "grad_norm": 24.25, + "learning_rate": 1.3861643477833442e-06, + "loss": 1.609562873840332, + "step": 13184 + }, + { + "epoch": 2.400109219987258, + "grad_norm": 9.5, + "learning_rate": 1.3857119299835197e-06, + "loss": 0.6737778186798096, + "step": 13186 + }, + { + "epoch": 2.40047328661145, + "grad_norm": 17.25, + "learning_rate": 1.3852597490739272e-06, + "loss": 1.5048739910125732, + "step": 13188 + }, + { + "epoch": 2.400837353235642, + "grad_norm": 7.8125, + "learning_rate": 1.384807805120923e-06, + "loss": 1.414874792098999, + "step": 13190 + }, + { + "epoch": 2.4012014198598344, + "grad_norm": 55.5, + "learning_rate": 1.3843560981908274e-06, + "loss": 1.2349931001663208, + "step": 13192 + }, + { + "epoch": 2.4015654864840266, + "grad_norm": 3.875, + "learning_rate": 1.383904628349927e-06, + "loss": 1.2970219850540161, + "step": 13194 + }, + { + "epoch": 2.4019295531082188, + "grad_norm": 10.25, + "learning_rate": 1.3834533956644724e-06, + "loss": 1.1217862367630005, + "step": 13196 + }, + { + "epoch": 2.402293619732411, + "grad_norm": 35.5, + "learning_rate": 1.383002400200681e-06, + "loss": 1.3421778678894043, + "step": 13198 + }, + { + "epoch": 2.402657686356603, + "grad_norm": 13.25, + "learning_rate": 1.3825516420247342e-06, + "loss": 1.5503783226013184, + "step": 13200 + }, + { + "epoch": 2.4030217529807953, + "grad_norm": 18.375, + "learning_rate": 1.382101121202779e-06, + "loss": 1.7319862842559814, + "step": 13202 + }, + { + "epoch": 2.4033858196049875, + "grad_norm": 18.875, + "learning_rate": 1.3816508378009274e-06, + "loss": 1.868557095527649, + "step": 13204 + }, + { + "epoch": 2.40374988622918, + "grad_norm": 24.5, + "learning_rate": 1.3812007918852568e-06, + "loss": 1.9943368434906006, + "step": 13206 + }, + { + "epoch": 2.404113952853372, + "grad_norm": 9.8125, + "learning_rate": 1.3807509835218097e-06, + "loss": 1.491589903831482, + "step": 13208 + }, + { + "epoch": 2.4044780194775646, + "grad_norm": 8.75, + "learning_rate": 1.3803014127765935e-06, + "loss": 1.505659580230713, + "step": 13210 + }, + { + "epoch": 2.4048420861017568, + "grad_norm": 39.0, + "learning_rate": 1.3798520797155809e-06, + "loss": 1.269268274307251, + "step": 13212 + }, + { + "epoch": 2.405206152725949, + "grad_norm": 8.5, + "learning_rate": 1.3794029844047097e-06, + "loss": 1.273883581161499, + "step": 13214 + }, + { + "epoch": 2.405570219350141, + "grad_norm": 8.9375, + "learning_rate": 1.3789541269098827e-06, + "loss": 1.296435832977295, + "step": 13216 + }, + { + "epoch": 2.4059342859743333, + "grad_norm": 12.5625, + "learning_rate": 1.3785055072969682e-06, + "loss": 1.137270212173462, + "step": 13218 + }, + { + "epoch": 2.4062983525985255, + "grad_norm": 20.375, + "learning_rate": 1.378057125631799e-06, + "loss": 1.4742759466171265, + "step": 13220 + }, + { + "epoch": 2.4066624192227177, + "grad_norm": 38.5, + "learning_rate": 1.3776089819801738e-06, + "loss": 1.1375072002410889, + "step": 13222 + }, + { + "epoch": 2.40702648584691, + "grad_norm": 33.75, + "learning_rate": 1.3771610764078552e-06, + "loss": 1.6485258340835571, + "step": 13224 + }, + { + "epoch": 2.407390552471102, + "grad_norm": 9.25, + "learning_rate": 1.376713408980572e-06, + "loss": 1.494165062904358, + "step": 13226 + }, + { + "epoch": 2.4077546190952943, + "grad_norm": 9.25, + "learning_rate": 1.3762659797640174e-06, + "loss": 1.4244030714035034, + "step": 13228 + }, + { + "epoch": 2.4081186857194865, + "grad_norm": 21.25, + "learning_rate": 1.3758187888238496e-06, + "loss": 1.434171438217163, + "step": 13230 + }, + { + "epoch": 2.4084827523436787, + "grad_norm": 14.8125, + "learning_rate": 1.3753718362256927e-06, + "loss": 1.387436032295227, + "step": 13232 + }, + { + "epoch": 2.408846818967871, + "grad_norm": 60.0, + "learning_rate": 1.3749251220351345e-06, + "loss": 1.6377493143081665, + "step": 13234 + }, + { + "epoch": 2.4092108855920635, + "grad_norm": 9.375, + "learning_rate": 1.374478646317729e-06, + "loss": 1.6165059804916382, + "step": 13236 + }, + { + "epoch": 2.4095749522162557, + "grad_norm": 3.671875, + "learning_rate": 1.3740324091389945e-06, + "loss": 1.4072531461715698, + "step": 13238 + }, + { + "epoch": 2.409939018840448, + "grad_norm": 7.03125, + "learning_rate": 1.3735864105644142e-06, + "loss": 1.107566475868225, + "step": 13240 + }, + { + "epoch": 2.41030308546464, + "grad_norm": 13.8125, + "learning_rate": 1.3731406506594373e-06, + "loss": 1.654891848564148, + "step": 13242 + }, + { + "epoch": 2.4106671520888323, + "grad_norm": 10.4375, + "learning_rate": 1.3726951294894764e-06, + "loss": 1.5468419790267944, + "step": 13244 + }, + { + "epoch": 2.4110312187130245, + "grad_norm": 10.125, + "learning_rate": 1.3722498471199105e-06, + "loss": 1.3897886276245117, + "step": 13246 + }, + { + "epoch": 2.4113952853372167, + "grad_norm": 14.3125, + "learning_rate": 1.371804803616083e-06, + "loss": 1.7811107635498047, + "step": 13248 + }, + { + "epoch": 2.411759351961409, + "grad_norm": 9.625, + "learning_rate": 1.3713599990433018e-06, + "loss": 1.4439624547958374, + "step": 13250 + }, + { + "epoch": 2.412123418585601, + "grad_norm": 21.875, + "learning_rate": 1.3709154334668406e-06, + "loss": 1.201629877090454, + "step": 13252 + }, + { + "epoch": 2.4124874852097933, + "grad_norm": 12.875, + "learning_rate": 1.3704711069519374e-06, + "loss": 1.766385793685913, + "step": 13254 + }, + { + "epoch": 2.4128515518339855, + "grad_norm": 8.6875, + "learning_rate": 1.3700270195637954e-06, + "loss": 1.387178897857666, + "step": 13256 + }, + { + "epoch": 2.4132156184581777, + "grad_norm": 10.0625, + "learning_rate": 1.3695831713675829e-06, + "loss": 1.2538191080093384, + "step": 13258 + }, + { + "epoch": 2.41357968508237, + "grad_norm": 29.875, + "learning_rate": 1.3691395624284321e-06, + "loss": 1.4911770820617676, + "step": 13260 + }, + { + "epoch": 2.4139437517065625, + "grad_norm": 10.6875, + "learning_rate": 1.3686961928114411e-06, + "loss": 1.6768076419830322, + "step": 13262 + }, + { + "epoch": 2.4143078183307547, + "grad_norm": 9.3125, + "learning_rate": 1.3682530625816729e-06, + "loss": 1.6474988460540771, + "step": 13264 + }, + { + "epoch": 2.414671884954947, + "grad_norm": 22.625, + "learning_rate": 1.3678101718041547e-06, + "loss": 1.2055500745773315, + "step": 13266 + }, + { + "epoch": 2.415035951579139, + "grad_norm": 17.25, + "learning_rate": 1.3673675205438796e-06, + "loss": 1.3327999114990234, + "step": 13268 + }, + { + "epoch": 2.4154000182033313, + "grad_norm": 10.9375, + "learning_rate": 1.3669251088658038e-06, + "loss": 1.4991543292999268, + "step": 13270 + }, + { + "epoch": 2.4157640848275235, + "grad_norm": 13.5, + "learning_rate": 1.3664829368348504e-06, + "loss": 1.6645715236663818, + "step": 13272 + }, + { + "epoch": 2.4161281514517157, + "grad_norm": 5.5, + "learning_rate": 1.366041004515906e-06, + "loss": 1.266193151473999, + "step": 13274 + }, + { + "epoch": 2.416492218075908, + "grad_norm": 15.9375, + "learning_rate": 1.365599311973822e-06, + "loss": 1.4692879915237427, + "step": 13276 + }, + { + "epoch": 2.4168562847001, + "grad_norm": 6.84375, + "learning_rate": 1.3651578592734155e-06, + "loss": 1.4225361347198486, + "step": 13278 + }, + { + "epoch": 2.4172203513242922, + "grad_norm": 9.6875, + "learning_rate": 1.3647166464794675e-06, + "loss": 1.0867118835449219, + "step": 13280 + }, + { + "epoch": 2.4175844179484844, + "grad_norm": 11.8125, + "learning_rate": 1.3642756736567247e-06, + "loss": 1.4150571823120117, + "step": 13282 + }, + { + "epoch": 2.4179484845726766, + "grad_norm": 14.125, + "learning_rate": 1.3638349408698976e-06, + "loss": 1.2924549579620361, + "step": 13284 + }, + { + "epoch": 2.418312551196869, + "grad_norm": 11.5625, + "learning_rate": 1.3633944481836623e-06, + "loss": 1.2613023519515991, + "step": 13286 + }, + { + "epoch": 2.4186766178210615, + "grad_norm": 11.8125, + "learning_rate": 1.3629541956626592e-06, + "loss": 1.3378260135650635, + "step": 13288 + }, + { + "epoch": 2.4190406844452537, + "grad_norm": 18.5, + "learning_rate": 1.362514183371493e-06, + "loss": 1.5379137992858887, + "step": 13290 + }, + { + "epoch": 2.419404751069446, + "grad_norm": 13.3125, + "learning_rate": 1.3620744113747347e-06, + "loss": 1.6859205961227417, + "step": 13292 + }, + { + "epoch": 2.419768817693638, + "grad_norm": 15.25, + "learning_rate": 1.3616348797369183e-06, + "loss": 1.432577133178711, + "step": 13294 + }, + { + "epoch": 2.4201328843178302, + "grad_norm": 15.3125, + "learning_rate": 1.3611955885225438e-06, + "loss": 1.45803701877594, + "step": 13296 + }, + { + "epoch": 2.4204969509420224, + "grad_norm": 9.875, + "learning_rate": 1.3607565377960752e-06, + "loss": 1.2603408098220825, + "step": 13298 + }, + { + "epoch": 2.4208610175662146, + "grad_norm": 6.1875, + "learning_rate": 1.3603177276219415e-06, + "loss": 0.9207422733306885, + "step": 13300 + }, + { + "epoch": 2.421225084190407, + "grad_norm": 3.390625, + "learning_rate": 1.359879158064536e-06, + "loss": 1.081762433052063, + "step": 13302 + }, + { + "epoch": 2.421589150814599, + "grad_norm": 49.5, + "learning_rate": 1.3594408291882175e-06, + "loss": 1.3305965662002563, + "step": 13304 + }, + { + "epoch": 2.421953217438791, + "grad_norm": 38.25, + "learning_rate": 1.3590027410573085e-06, + "loss": 0.6426777839660645, + "step": 13306 + }, + { + "epoch": 2.4223172840629834, + "grad_norm": 17.875, + "learning_rate": 1.3585648937360969e-06, + "loss": 1.423840880393982, + "step": 13308 + }, + { + "epoch": 2.4226813506871756, + "grad_norm": 3.078125, + "learning_rate": 1.3581272872888348e-06, + "loss": 1.2661268711090088, + "step": 13310 + }, + { + "epoch": 2.423045417311368, + "grad_norm": 6.90625, + "learning_rate": 1.3576899217797395e-06, + "loss": 0.9752209186553955, + "step": 13312 + }, + { + "epoch": 2.4234094839355604, + "grad_norm": 54.0, + "learning_rate": 1.3572527972729927e-06, + "loss": 1.5633896589279175, + "step": 13314 + }, + { + "epoch": 2.423773550559752, + "grad_norm": 12.5625, + "learning_rate": 1.3568159138327402e-06, + "loss": 1.8636209964752197, + "step": 13316 + }, + { + "epoch": 2.424137617183945, + "grad_norm": 3.609375, + "learning_rate": 1.3563792715230932e-06, + "loss": 1.0624810457229614, + "step": 13318 + }, + { + "epoch": 2.424501683808137, + "grad_norm": 19.25, + "learning_rate": 1.355942870408127e-06, + "loss": 1.0762466192245483, + "step": 13320 + }, + { + "epoch": 2.424865750432329, + "grad_norm": 10.5625, + "learning_rate": 1.3555067105518817e-06, + "loss": 1.7579777240753174, + "step": 13322 + }, + { + "epoch": 2.4252298170565214, + "grad_norm": 15.8125, + "learning_rate": 1.3550707920183625e-06, + "loss": 1.549422025680542, + "step": 13324 + }, + { + "epoch": 2.4255938836807136, + "grad_norm": 9.0, + "learning_rate": 1.3546351148715378e-06, + "loss": 1.5649856328964233, + "step": 13326 + }, + { + "epoch": 2.425957950304906, + "grad_norm": 17.375, + "learning_rate": 1.354199679175342e-06, + "loss": 0.6884002685546875, + "step": 13328 + }, + { + "epoch": 2.426322016929098, + "grad_norm": 29.0, + "learning_rate": 1.3537644849936738e-06, + "loss": 0.46129125356674194, + "step": 13330 + }, + { + "epoch": 2.42668608355329, + "grad_norm": 18.75, + "learning_rate": 1.3533295323903954e-06, + "loss": 0.8968048095703125, + "step": 13332 + }, + { + "epoch": 2.4270501501774824, + "grad_norm": 8.125, + "learning_rate": 1.3528948214293347e-06, + "loss": 1.4076263904571533, + "step": 13334 + }, + { + "epoch": 2.4274142168016746, + "grad_norm": 15.625, + "learning_rate": 1.3524603521742842e-06, + "loss": 1.6677645444869995, + "step": 13336 + }, + { + "epoch": 2.4277782834258668, + "grad_norm": 17.625, + "learning_rate": 1.352026124689e-06, + "loss": 1.8949885368347168, + "step": 13338 + }, + { + "epoch": 2.4281423500500594, + "grad_norm": 17.625, + "learning_rate": 1.3515921390372032e-06, + "loss": 1.4756065607070923, + "step": 13340 + }, + { + "epoch": 2.428506416674251, + "grad_norm": 21.375, + "learning_rate": 1.3511583952825795e-06, + "loss": 1.4696450233459473, + "step": 13342 + }, + { + "epoch": 2.428870483298444, + "grad_norm": 4.65625, + "learning_rate": 1.3507248934887795e-06, + "loss": 1.1072717905044556, + "step": 13344 + }, + { + "epoch": 2.429234549922636, + "grad_norm": 7.40625, + "learning_rate": 1.3502916337194171e-06, + "loss": 0.9419214725494385, + "step": 13346 + }, + { + "epoch": 2.429598616546828, + "grad_norm": 15.9375, + "learning_rate": 1.3498586160380722e-06, + "loss": 1.3977391719818115, + "step": 13348 + }, + { + "epoch": 2.4299626831710204, + "grad_norm": 17.75, + "learning_rate": 1.3494258405082874e-06, + "loss": 1.435898780822754, + "step": 13350 + }, + { + "epoch": 2.4303267497952126, + "grad_norm": 7.1875, + "learning_rate": 1.3489933071935715e-06, + "loss": 1.3148350715637207, + "step": 13352 + }, + { + "epoch": 2.4306908164194048, + "grad_norm": 8.3125, + "learning_rate": 1.348561016157397e-06, + "loss": 1.386268973350525, + "step": 13354 + }, + { + "epoch": 2.431054883043597, + "grad_norm": 8.75, + "learning_rate": 1.3481289674632006e-06, + "loss": 1.4000086784362793, + "step": 13356 + }, + { + "epoch": 2.431418949667789, + "grad_norm": 16.875, + "learning_rate": 1.347697161174384e-06, + "loss": 1.3190433979034424, + "step": 13358 + }, + { + "epoch": 2.4317830162919813, + "grad_norm": 9.4375, + "learning_rate": 1.3472655973543124e-06, + "loss": 1.4581642150878906, + "step": 13360 + }, + { + "epoch": 2.4321470829161735, + "grad_norm": 8.8125, + "learning_rate": 1.3468342760663167e-06, + "loss": 1.1164554357528687, + "step": 13362 + }, + { + "epoch": 2.4325111495403657, + "grad_norm": 17.875, + "learning_rate": 1.3464031973736912e-06, + "loss": 0.44168874621391296, + "step": 13364 + }, + { + "epoch": 2.432875216164558, + "grad_norm": 44.75, + "learning_rate": 1.3459723613396949e-06, + "loss": 1.346256971359253, + "step": 13366 + }, + { + "epoch": 2.43323928278875, + "grad_norm": 3.625, + "learning_rate": 1.3455417680275518e-06, + "loss": 1.3774802684783936, + "step": 13368 + }, + { + "epoch": 2.4336033494129428, + "grad_norm": 14.125, + "learning_rate": 1.3451114175004487e-06, + "loss": 1.4471994638442993, + "step": 13370 + }, + { + "epoch": 2.433967416037135, + "grad_norm": 10.5625, + "learning_rate": 1.3446813098215388e-06, + "loss": 1.5925532579421997, + "step": 13372 + }, + { + "epoch": 2.434331482661327, + "grad_norm": 8.75, + "learning_rate": 1.3442514450539381e-06, + "loss": 1.4256774187088013, + "step": 13374 + }, + { + "epoch": 2.4346955492855193, + "grad_norm": 17.375, + "learning_rate": 1.343821823260728e-06, + "loss": 1.3854000568389893, + "step": 13376 + }, + { + "epoch": 2.4350596159097115, + "grad_norm": 16.0, + "learning_rate": 1.3433924445049532e-06, + "loss": 1.7081060409545898, + "step": 13378 + }, + { + "epoch": 2.4354236825339037, + "grad_norm": 16.875, + "learning_rate": 1.3429633088496236e-06, + "loss": 2.032484531402588, + "step": 13380 + }, + { + "epoch": 2.435787749158096, + "grad_norm": 9.75, + "learning_rate": 1.3425344163577128e-06, + "loss": 1.6272929906845093, + "step": 13382 + }, + { + "epoch": 2.436151815782288, + "grad_norm": 7.25, + "learning_rate": 1.3421057670921594e-06, + "loss": 1.413218379020691, + "step": 13384 + }, + { + "epoch": 2.4365158824064803, + "grad_norm": 10.3125, + "learning_rate": 1.341677361115866e-06, + "loss": 1.455143690109253, + "step": 13386 + }, + { + "epoch": 2.4368799490306725, + "grad_norm": 74.0, + "learning_rate": 1.3412491984916992e-06, + "loss": 1.5953041315078735, + "step": 13388 + }, + { + "epoch": 2.4372440156548647, + "grad_norm": 17.375, + "learning_rate": 1.34082127928249e-06, + "loss": 1.3930224180221558, + "step": 13390 + }, + { + "epoch": 2.437608082279057, + "grad_norm": 14.25, + "learning_rate": 1.3403936035510342e-06, + "loss": 1.5179580450057983, + "step": 13392 + }, + { + "epoch": 2.437972148903249, + "grad_norm": 51.25, + "learning_rate": 1.3399661713600912e-06, + "loss": 1.6847363710403442, + "step": 13394 + }, + { + "epoch": 2.4383362155274417, + "grad_norm": 18.125, + "learning_rate": 1.339538982772385e-06, + "loss": 1.1017258167266846, + "step": 13396 + }, + { + "epoch": 2.438700282151634, + "grad_norm": 11.5, + "learning_rate": 1.339112037850604e-06, + "loss": 1.2716079950332642, + "step": 13398 + }, + { + "epoch": 2.439064348775826, + "grad_norm": 22.5, + "learning_rate": 1.3386853366574004e-06, + "loss": 1.6934351921081543, + "step": 13400 + }, + { + "epoch": 2.4394284154000183, + "grad_norm": 53.0, + "learning_rate": 1.3382588792553908e-06, + "loss": 1.311166524887085, + "step": 13402 + }, + { + "epoch": 2.4397924820242105, + "grad_norm": 11.3125, + "learning_rate": 1.3378326657071562e-06, + "loss": 1.944077730178833, + "step": 13404 + }, + { + "epoch": 2.4401565486484027, + "grad_norm": 9.4375, + "learning_rate": 1.337406696075242e-06, + "loss": 1.2993947267532349, + "step": 13406 + }, + { + "epoch": 2.440520615272595, + "grad_norm": 12.6875, + "learning_rate": 1.336980970422157e-06, + "loss": 0.9996879696846008, + "step": 13408 + }, + { + "epoch": 2.440884681896787, + "grad_norm": 18.25, + "learning_rate": 1.336555488810375e-06, + "loss": 2.0626003742218018, + "step": 13410 + }, + { + "epoch": 2.4412487485209793, + "grad_norm": 13.0, + "learning_rate": 1.3361302513023335e-06, + "loss": 1.2710504531860352, + "step": 13412 + }, + { + "epoch": 2.4416128151451715, + "grad_norm": 8.0, + "learning_rate": 1.3357052579604347e-06, + "loss": 1.3482321500778198, + "step": 13414 + }, + { + "epoch": 2.4419768817693637, + "grad_norm": 6.75, + "learning_rate": 1.3352805088470443e-06, + "loss": 1.0457558631896973, + "step": 13416 + }, + { + "epoch": 2.442340948393556, + "grad_norm": 16.75, + "learning_rate": 1.3348560040244932e-06, + "loss": 1.7914769649505615, + "step": 13418 + }, + { + "epoch": 2.442705015017748, + "grad_norm": 9.1875, + "learning_rate": 1.334431743555075e-06, + "loss": 1.1658673286437988, + "step": 13420 + }, + { + "epoch": 2.4430690816419407, + "grad_norm": 9.9375, + "learning_rate": 1.3340077275010486e-06, + "loss": 1.1554853916168213, + "step": 13422 + }, + { + "epoch": 2.443433148266133, + "grad_norm": 12.375, + "learning_rate": 1.3335839559246364e-06, + "loss": 1.5388017892837524, + "step": 13424 + }, + { + "epoch": 2.443797214890325, + "grad_norm": 20.625, + "learning_rate": 1.3331604288880251e-06, + "loss": 1.7255887985229492, + "step": 13426 + }, + { + "epoch": 2.4441612815145173, + "grad_norm": 26.25, + "learning_rate": 1.332737146453366e-06, + "loss": 1.684288501739502, + "step": 13428 + }, + { + "epoch": 2.4445253481387095, + "grad_norm": 6.8125, + "learning_rate": 1.3323141086827736e-06, + "loss": 1.2902215719223022, + "step": 13430 + }, + { + "epoch": 2.4448894147629017, + "grad_norm": 20.0, + "learning_rate": 1.3318913156383273e-06, + "loss": 1.3036847114562988, + "step": 13432 + }, + { + "epoch": 2.445253481387094, + "grad_norm": 8.0625, + "learning_rate": 1.3314687673820703e-06, + "loss": 1.2368851900100708, + "step": 13434 + }, + { + "epoch": 2.445617548011286, + "grad_norm": 6.125, + "learning_rate": 1.33104646397601e-06, + "loss": 1.3138872385025024, + "step": 13436 + }, + { + "epoch": 2.4459816146354783, + "grad_norm": 12.6875, + "learning_rate": 1.3306244054821169e-06, + "loss": 1.2879775762557983, + "step": 13438 + }, + { + "epoch": 2.4463456812596704, + "grad_norm": 14.1875, + "learning_rate": 1.330202591962327e-06, + "loss": 2.1288092136383057, + "step": 13440 + }, + { + "epoch": 2.4467097478838626, + "grad_norm": 11.3125, + "learning_rate": 1.32978102347854e-06, + "loss": 1.3249249458312988, + "step": 13442 + }, + { + "epoch": 2.447073814508055, + "grad_norm": 14.75, + "learning_rate": 1.3293597000926185e-06, + "loss": 1.4727728366851807, + "step": 13444 + }, + { + "epoch": 2.447437881132247, + "grad_norm": 15.8125, + "learning_rate": 1.3289386218663907e-06, + "loss": 1.378298282623291, + "step": 13446 + }, + { + "epoch": 2.4478019477564397, + "grad_norm": 18.25, + "learning_rate": 1.3285177888616483e-06, + "loss": 1.4411320686340332, + "step": 13448 + }, + { + "epoch": 2.4481660143806314, + "grad_norm": 10.8125, + "learning_rate": 1.328097201140146e-06, + "loss": 1.1870819330215454, + "step": 13450 + }, + { + "epoch": 2.448530081004824, + "grad_norm": 35.5, + "learning_rate": 1.3276768587636037e-06, + "loss": 1.219420313835144, + "step": 13452 + }, + { + "epoch": 2.4488941476290162, + "grad_norm": 12.5625, + "learning_rate": 1.3272567617937054e-06, + "loss": 1.8146917819976807, + "step": 13454 + }, + { + "epoch": 2.4492582142532084, + "grad_norm": 5.03125, + "learning_rate": 1.326836910292098e-06, + "loss": 1.1944459676742554, + "step": 13456 + }, + { + "epoch": 2.4496222808774006, + "grad_norm": 10.625, + "learning_rate": 1.3264173043203934e-06, + "loss": 1.3364241123199463, + "step": 13458 + }, + { + "epoch": 2.449986347501593, + "grad_norm": 6.9375, + "learning_rate": 1.3259979439401671e-06, + "loss": 1.3773140907287598, + "step": 13460 + }, + { + "epoch": 2.450350414125785, + "grad_norm": 72.0, + "learning_rate": 1.325578829212958e-06, + "loss": 1.3781545162200928, + "step": 13462 + }, + { + "epoch": 2.450714480749977, + "grad_norm": 23.875, + "learning_rate": 1.3251599602002704e-06, + "loss": 1.4792027473449707, + "step": 13464 + }, + { + "epoch": 2.4510785473741694, + "grad_norm": 13.0, + "learning_rate": 1.324741336963571e-06, + "loss": 1.3898544311523438, + "step": 13466 + }, + { + "epoch": 2.4514426139983616, + "grad_norm": 7.90625, + "learning_rate": 1.3243229595642907e-06, + "loss": 1.2882496118545532, + "step": 13468 + }, + { + "epoch": 2.451806680622554, + "grad_norm": 14.5625, + "learning_rate": 1.3239048280638255e-06, + "loss": 1.4602361917495728, + "step": 13470 + }, + { + "epoch": 2.452170747246746, + "grad_norm": 6.09375, + "learning_rate": 1.323486942523534e-06, + "loss": 1.4485259056091309, + "step": 13472 + }, + { + "epoch": 2.452534813870938, + "grad_norm": 4.90625, + "learning_rate": 1.3230693030047398e-06, + "loss": 1.1378285884857178, + "step": 13474 + }, + { + "epoch": 2.4528988804951304, + "grad_norm": 16.25, + "learning_rate": 1.322651909568729e-06, + "loss": 1.4630377292633057, + "step": 13476 + }, + { + "epoch": 2.453262947119323, + "grad_norm": 4.15625, + "learning_rate": 1.3222347622767529e-06, + "loss": 1.0651907920837402, + "step": 13478 + }, + { + "epoch": 2.453627013743515, + "grad_norm": 7.96875, + "learning_rate": 1.321817861190026e-06, + "loss": 1.4467761516571045, + "step": 13480 + }, + { + "epoch": 2.4539910803677074, + "grad_norm": 9.4375, + "learning_rate": 1.3214012063697268e-06, + "loss": 1.0389844179153442, + "step": 13482 + }, + { + "epoch": 2.4543551469918996, + "grad_norm": 50.25, + "learning_rate": 1.320984797876998e-06, + "loss": 0.5134612321853638, + "step": 13484 + }, + { + "epoch": 2.454719213616092, + "grad_norm": 10.3125, + "learning_rate": 1.3205686357729452e-06, + "loss": 1.3701632022857666, + "step": 13486 + }, + { + "epoch": 2.455083280240284, + "grad_norm": 15.8125, + "learning_rate": 1.3201527201186396e-06, + "loss": 1.4806716442108154, + "step": 13488 + }, + { + "epoch": 2.455447346864476, + "grad_norm": 52.25, + "learning_rate": 1.3197370509751143e-06, + "loss": 1.6775561571121216, + "step": 13490 + }, + { + "epoch": 2.4558114134886684, + "grad_norm": 17.375, + "learning_rate": 1.3193216284033672e-06, + "loss": 1.1514748334884644, + "step": 13492 + }, + { + "epoch": 2.4561754801128606, + "grad_norm": 12.25, + "learning_rate": 1.3189064524643597e-06, + "loss": 1.6332601308822632, + "step": 13494 + }, + { + "epoch": 2.4565395467370528, + "grad_norm": 10.875, + "learning_rate": 1.3184915232190175e-06, + "loss": 2.0473432540893555, + "step": 13496 + }, + { + "epoch": 2.456903613361245, + "grad_norm": 105.5, + "learning_rate": 1.31807684072823e-06, + "loss": 1.3464536666870117, + "step": 13498 + }, + { + "epoch": 2.457267679985437, + "grad_norm": 31.0, + "learning_rate": 1.3176624050528498e-06, + "loss": 1.6327688694000244, + "step": 13500 + }, + { + "epoch": 2.4576317466096294, + "grad_norm": 23.625, + "learning_rate": 1.3172482162536936e-06, + "loss": 1.3721489906311035, + "step": 13502 + }, + { + "epoch": 2.457995813233822, + "grad_norm": 7.0625, + "learning_rate": 1.316834274391542e-06, + "loss": 1.2554636001586914, + "step": 13504 + }, + { + "epoch": 2.458359879858014, + "grad_norm": 7.46875, + "learning_rate": 1.3164205795271397e-06, + "loss": 1.1718668937683105, + "step": 13506 + }, + { + "epoch": 2.4587239464822064, + "grad_norm": 4.375, + "learning_rate": 1.3160071317211943e-06, + "loss": 1.576371192932129, + "step": 13508 + }, + { + "epoch": 2.4590880131063986, + "grad_norm": 12.875, + "learning_rate": 1.3155939310343773e-06, + "loss": 1.374147891998291, + "step": 13510 + }, + { + "epoch": 2.4594520797305908, + "grad_norm": 6.5, + "learning_rate": 1.315180977527325e-06, + "loss": 1.2366358041763306, + "step": 13512 + }, + { + "epoch": 2.459816146354783, + "grad_norm": 16.375, + "learning_rate": 1.3147682712606364e-06, + "loss": 1.3463104963302612, + "step": 13514 + }, + { + "epoch": 2.460180212978975, + "grad_norm": 25.125, + "learning_rate": 1.314355812294874e-06, + "loss": 1.144848346710205, + "step": 13516 + }, + { + "epoch": 2.4605442796031674, + "grad_norm": 15.25, + "learning_rate": 1.3139436006905648e-06, + "loss": 1.053707242012024, + "step": 13518 + }, + { + "epoch": 2.4609083462273595, + "grad_norm": 17.25, + "learning_rate": 1.3135316365081996e-06, + "loss": 1.5874720811843872, + "step": 13520 + }, + { + "epoch": 2.4612724128515517, + "grad_norm": 11.8125, + "learning_rate": 1.3131199198082318e-06, + "loss": 1.547202229499817, + "step": 13522 + }, + { + "epoch": 2.461636479475744, + "grad_norm": 15.4375, + "learning_rate": 1.3127084506510792e-06, + "loss": 1.4658070802688599, + "step": 13524 + }, + { + "epoch": 2.462000546099936, + "grad_norm": 8.4375, + "learning_rate": 1.3122972290971239e-06, + "loss": 1.4734885692596436, + "step": 13526 + }, + { + "epoch": 2.4623646127241283, + "grad_norm": 8.25, + "learning_rate": 1.3118862552067104e-06, + "loss": 1.2806057929992676, + "step": 13528 + }, + { + "epoch": 2.462728679348321, + "grad_norm": 8.75, + "learning_rate": 1.311475529040148e-06, + "loss": 1.4752672910690308, + "step": 13530 + }, + { + "epoch": 2.463092745972513, + "grad_norm": 9.5, + "learning_rate": 1.3110650506577083e-06, + "loss": 1.4311320781707764, + "step": 13532 + }, + { + "epoch": 2.4634568125967053, + "grad_norm": 26.125, + "learning_rate": 1.310654820119628e-06, + "loss": 1.4307315349578857, + "step": 13534 + }, + { + "epoch": 2.4638208792208975, + "grad_norm": 8.5625, + "learning_rate": 1.310244837486106e-06, + "loss": 1.4179456233978271, + "step": 13536 + }, + { + "epoch": 2.4641849458450897, + "grad_norm": 15.0625, + "learning_rate": 1.3098351028173065e-06, + "loss": 1.43107271194458, + "step": 13538 + }, + { + "epoch": 2.464549012469282, + "grad_norm": 5.09375, + "learning_rate": 1.309425616173356e-06, + "loss": 1.3568236827850342, + "step": 13540 + }, + { + "epoch": 2.464913079093474, + "grad_norm": 5.15625, + "learning_rate": 1.309016377614345e-06, + "loss": 1.3038747310638428, + "step": 13542 + }, + { + "epoch": 2.4652771457176663, + "grad_norm": 12.5625, + "learning_rate": 1.308607387200328e-06, + "loss": 1.2644062042236328, + "step": 13544 + }, + { + "epoch": 2.4656412123418585, + "grad_norm": 105.5, + "learning_rate": 1.3081986449913218e-06, + "loss": 2.071988582611084, + "step": 13546 + }, + { + "epoch": 2.4660052789660507, + "grad_norm": 6.8125, + "learning_rate": 1.3077901510473082e-06, + "loss": 1.0824509859085083, + "step": 13548 + }, + { + "epoch": 2.466369345590243, + "grad_norm": 11.9375, + "learning_rate": 1.3073819054282322e-06, + "loss": 1.4564602375030518, + "step": 13550 + }, + { + "epoch": 2.466733412214435, + "grad_norm": 6.65625, + "learning_rate": 1.306973908194002e-06, + "loss": 1.32082998752594, + "step": 13552 + }, + { + "epoch": 2.4670974788386273, + "grad_norm": 7.0625, + "learning_rate": 1.3065661594044896e-06, + "loss": 1.1079126596450806, + "step": 13554 + }, + { + "epoch": 2.46746154546282, + "grad_norm": 81.0, + "learning_rate": 1.3061586591195303e-06, + "loss": 1.4088371992111206, + "step": 13556 + }, + { + "epoch": 2.4678256120870117, + "grad_norm": 11.5, + "learning_rate": 1.305751407398923e-06, + "loss": 1.3359097242355347, + "step": 13558 + }, + { + "epoch": 2.4681896787112043, + "grad_norm": 12.3125, + "learning_rate": 1.305344404302431e-06, + "loss": 1.022810697555542, + "step": 13560 + }, + { + "epoch": 2.4685537453353965, + "grad_norm": 10.6875, + "learning_rate": 1.3049376498897794e-06, + "loss": 1.8778188228607178, + "step": 13562 + }, + { + "epoch": 2.4689178119595887, + "grad_norm": 27.875, + "learning_rate": 1.3045311442206585e-06, + "loss": 1.6414706707000732, + "step": 13564 + }, + { + "epoch": 2.469281878583781, + "grad_norm": 7.0, + "learning_rate": 1.3041248873547208e-06, + "loss": 1.1584283113479614, + "step": 13566 + }, + { + "epoch": 2.469645945207973, + "grad_norm": 6.65625, + "learning_rate": 1.3037188793515831e-06, + "loss": 1.1896963119506836, + "step": 13568 + }, + { + "epoch": 2.4700100118321653, + "grad_norm": 17.25, + "learning_rate": 1.3033131202708257e-06, + "loss": 1.256224274635315, + "step": 13570 + }, + { + "epoch": 2.4703740784563575, + "grad_norm": 18.5, + "learning_rate": 1.3029076101719917e-06, + "loss": 1.3375985622406006, + "step": 13572 + }, + { + "epoch": 2.4707381450805497, + "grad_norm": 19.625, + "learning_rate": 1.3025023491145883e-06, + "loss": 1.067662239074707, + "step": 13574 + }, + { + "epoch": 2.471102211704742, + "grad_norm": 12.25, + "learning_rate": 1.3020973371580855e-06, + "loss": 1.5700691938400269, + "step": 13576 + }, + { + "epoch": 2.471466278328934, + "grad_norm": 9.25, + "learning_rate": 1.3016925743619176e-06, + "loss": 1.3685240745544434, + "step": 13578 + }, + { + "epoch": 2.4718303449531263, + "grad_norm": 8.1875, + "learning_rate": 1.3012880607854816e-06, + "loss": 1.2128101587295532, + "step": 13580 + }, + { + "epoch": 2.472194411577319, + "grad_norm": 3.5625, + "learning_rate": 1.3008837964881387e-06, + "loss": 1.1041147708892822, + "step": 13582 + }, + { + "epoch": 2.4725584782015106, + "grad_norm": 13.0625, + "learning_rate": 1.3004797815292127e-06, + "loss": 1.4154976606369019, + "step": 13584 + }, + { + "epoch": 2.4729225448257033, + "grad_norm": 16.875, + "learning_rate": 1.3000760159679911e-06, + "loss": 1.392364501953125, + "step": 13586 + }, + { + "epoch": 2.4732866114498955, + "grad_norm": 8.8125, + "learning_rate": 1.2996724998637253e-06, + "loss": 1.1981555223464966, + "step": 13588 + }, + { + "epoch": 2.4736506780740877, + "grad_norm": 8.3125, + "learning_rate": 1.299269233275629e-06, + "loss": 1.2037687301635742, + "step": 13590 + }, + { + "epoch": 2.47401474469828, + "grad_norm": 2.890625, + "learning_rate": 1.2988662162628803e-06, + "loss": 0.8786299228668213, + "step": 13592 + }, + { + "epoch": 2.474378811322472, + "grad_norm": 10.6875, + "learning_rate": 1.2984634488846204e-06, + "loss": 0.37407782673835754, + "step": 13594 + }, + { + "epoch": 2.4747428779466643, + "grad_norm": 12.4375, + "learning_rate": 1.2980609311999535e-06, + "loss": 0.9029257893562317, + "step": 13596 + }, + { + "epoch": 2.4751069445708564, + "grad_norm": 7.0, + "learning_rate": 1.2976586632679478e-06, + "loss": 1.2732961177825928, + "step": 13598 + }, + { + "epoch": 2.4754710111950486, + "grad_norm": 11.3125, + "learning_rate": 1.297256645147634e-06, + "loss": 1.5063642263412476, + "step": 13600 + }, + { + "epoch": 2.475835077819241, + "grad_norm": 13.6875, + "learning_rate": 1.2968548768980068e-06, + "loss": 1.6139041185379028, + "step": 13602 + }, + { + "epoch": 2.476199144443433, + "grad_norm": 9.125, + "learning_rate": 1.2964533585780246e-06, + "loss": 1.352710485458374, + "step": 13604 + }, + { + "epoch": 2.4765632110676252, + "grad_norm": 3.234375, + "learning_rate": 1.2960520902466077e-06, + "loss": 0.8745146989822388, + "step": 13606 + }, + { + "epoch": 2.4769272776918174, + "grad_norm": 8.3125, + "learning_rate": 1.2956510719626413e-06, + "loss": 1.3983957767486572, + "step": 13608 + }, + { + "epoch": 2.4772913443160096, + "grad_norm": 13.5, + "learning_rate": 1.2952503037849731e-06, + "loss": 1.4222064018249512, + "step": 13610 + }, + { + "epoch": 2.4776554109402023, + "grad_norm": 41.25, + "learning_rate": 1.294849785772414e-06, + "loss": 1.4276666641235352, + "step": 13612 + }, + { + "epoch": 2.4780194775643944, + "grad_norm": 17.75, + "learning_rate": 1.2944495179837383e-06, + "loss": 1.8202000856399536, + "step": 13614 + }, + { + "epoch": 2.4783835441885866, + "grad_norm": 11.6875, + "learning_rate": 1.294049500477684e-06, + "loss": 1.3318727016448975, + "step": 13616 + }, + { + "epoch": 2.478747610812779, + "grad_norm": 7.0, + "learning_rate": 1.2936497333129519e-06, + "loss": 1.1300780773162842, + "step": 13618 + }, + { + "epoch": 2.479111677436971, + "grad_norm": 26.875, + "learning_rate": 1.2932502165482063e-06, + "loss": 1.384740948677063, + "step": 13620 + }, + { + "epoch": 2.4794757440611632, + "grad_norm": 10.9375, + "learning_rate": 1.2928509502420745e-06, + "loss": 1.481090784072876, + "step": 13622 + }, + { + "epoch": 2.4798398106853554, + "grad_norm": 4.75, + "learning_rate": 1.2924519344531472e-06, + "loss": 1.0039970874786377, + "step": 13624 + }, + { + "epoch": 2.4802038773095476, + "grad_norm": 4.03125, + "learning_rate": 1.2920531692399781e-06, + "loss": 0.9357774257659912, + "step": 13626 + }, + { + "epoch": 2.48056794393374, + "grad_norm": 6.8125, + "learning_rate": 1.2916546546610854e-06, + "loss": 1.233557105064392, + "step": 13628 + }, + { + "epoch": 2.480932010557932, + "grad_norm": 5.75, + "learning_rate": 1.2912563907749483e-06, + "loss": 1.4808787107467651, + "step": 13630 + }, + { + "epoch": 2.481296077182124, + "grad_norm": 5.25, + "learning_rate": 1.290858377640011e-06, + "loss": 1.4696505069732666, + "step": 13632 + }, + { + "epoch": 2.4816601438063164, + "grad_norm": 7.15625, + "learning_rate": 1.2904606153146803e-06, + "loss": 1.3675892353057861, + "step": 13634 + }, + { + "epoch": 2.4820242104305086, + "grad_norm": 9.1875, + "learning_rate": 1.2900631038573263e-06, + "loss": 1.5780583620071411, + "step": 13636 + }, + { + "epoch": 2.482388277054701, + "grad_norm": 10.75, + "learning_rate": 1.2896658433262817e-06, + "loss": 1.161513090133667, + "step": 13638 + }, + { + "epoch": 2.4827523436788934, + "grad_norm": 8.8125, + "learning_rate": 1.2892688337798438e-06, + "loss": 1.2856149673461914, + "step": 13640 + }, + { + "epoch": 2.4831164103030856, + "grad_norm": 14.1875, + "learning_rate": 1.288872075276271e-06, + "loss": 1.4421100616455078, + "step": 13642 + }, + { + "epoch": 2.483480476927278, + "grad_norm": 15.5, + "learning_rate": 1.2884755678737867e-06, + "loss": 1.3901095390319824, + "step": 13644 + }, + { + "epoch": 2.48384454355147, + "grad_norm": 30.0, + "learning_rate": 1.2880793116305767e-06, + "loss": 1.324196457862854, + "step": 13646 + }, + { + "epoch": 2.484208610175662, + "grad_norm": 16.875, + "learning_rate": 1.28768330660479e-06, + "loss": 1.491485834121704, + "step": 13648 + }, + { + "epoch": 2.4845726767998544, + "grad_norm": 17.25, + "learning_rate": 1.2872875528545382e-06, + "loss": 1.3070697784423828, + "step": 13650 + }, + { + "epoch": 2.4849367434240466, + "grad_norm": 17.5, + "learning_rate": 1.2868920504378973e-06, + "loss": 1.5530606508255005, + "step": 13652 + }, + { + "epoch": 2.4853008100482388, + "grad_norm": 19.625, + "learning_rate": 1.2864967994129055e-06, + "loss": 1.5536575317382812, + "step": 13654 + }, + { + "epoch": 2.485664876672431, + "grad_norm": 17.625, + "learning_rate": 1.286101799837564e-06, + "loss": 1.6284408569335938, + "step": 13656 + }, + { + "epoch": 2.486028943296623, + "grad_norm": 21.125, + "learning_rate": 1.2857070517698378e-06, + "loss": 1.4892117977142334, + "step": 13658 + }, + { + "epoch": 2.4863930099208154, + "grad_norm": 14.75, + "learning_rate": 1.285312555267654e-06, + "loss": 1.4760229587554932, + "step": 13660 + }, + { + "epoch": 2.4867570765450075, + "grad_norm": 18.5, + "learning_rate": 1.2849183103889036e-06, + "loss": 1.417324185371399, + "step": 13662 + }, + { + "epoch": 2.4871211431692, + "grad_norm": 14.875, + "learning_rate": 1.2845243171914408e-06, + "loss": 1.4205467700958252, + "step": 13664 + }, + { + "epoch": 2.4874852097933924, + "grad_norm": 22.0, + "learning_rate": 1.2841305757330824e-06, + "loss": 1.2719480991363525, + "step": 13666 + }, + { + "epoch": 2.4878492764175846, + "grad_norm": 17.125, + "learning_rate": 1.2837370860716081e-06, + "loss": 1.2793290615081787, + "step": 13668 + }, + { + "epoch": 2.4882133430417768, + "grad_norm": 13.9375, + "learning_rate": 1.2833438482647608e-06, + "loss": 0.9162566065788269, + "step": 13670 + }, + { + "epoch": 2.488577409665969, + "grad_norm": 9.1875, + "learning_rate": 1.2829508623702469e-06, + "loss": 1.3726885318756104, + "step": 13672 + }, + { + "epoch": 2.488941476290161, + "grad_norm": 31.875, + "learning_rate": 1.2825581284457354e-06, + "loss": 1.452609896659851, + "step": 13674 + }, + { + "epoch": 2.4893055429143534, + "grad_norm": 27.0, + "learning_rate": 1.2821656465488584e-06, + "loss": 1.7444946765899658, + "step": 13676 + }, + { + "epoch": 2.4896696095385455, + "grad_norm": 9.75, + "learning_rate": 1.2817734167372105e-06, + "loss": 1.543503761291504, + "step": 13678 + }, + { + "epoch": 2.4900336761627377, + "grad_norm": 70.0, + "learning_rate": 1.281381439068351e-06, + "loss": 1.2844178676605225, + "step": 13680 + }, + { + "epoch": 2.49039774278693, + "grad_norm": 17.5, + "learning_rate": 1.2809897135998e-06, + "loss": 0.8564822673797607, + "step": 13682 + }, + { + "epoch": 2.490761809411122, + "grad_norm": 8.0625, + "learning_rate": 1.280598240389042e-06, + "loss": 1.3028268814086914, + "step": 13684 + }, + { + "epoch": 2.4911258760353143, + "grad_norm": 19.625, + "learning_rate": 1.2802070194935244e-06, + "loss": 1.1162965297698975, + "step": 13686 + }, + { + "epoch": 2.4914899426595065, + "grad_norm": 9.8125, + "learning_rate": 1.2798160509706568e-06, + "loss": 1.4655704498291016, + "step": 13688 + }, + { + "epoch": 2.491854009283699, + "grad_norm": 38.0, + "learning_rate": 1.2794253348778122e-06, + "loss": 1.3618659973144531, + "step": 13690 + }, + { + "epoch": 2.492218075907891, + "grad_norm": 8.25, + "learning_rate": 1.279034871272327e-06, + "loss": 1.208526372909546, + "step": 13692 + }, + { + "epoch": 2.4925821425320835, + "grad_norm": 12.0625, + "learning_rate": 1.2786446602114998e-06, + "loss": 1.5387649536132812, + "step": 13694 + }, + { + "epoch": 2.4929462091562757, + "grad_norm": 16.625, + "learning_rate": 1.2782547017525928e-06, + "loss": 1.4620836973190308, + "step": 13696 + }, + { + "epoch": 2.493310275780468, + "grad_norm": 5.8125, + "learning_rate": 1.277864995952831e-06, + "loss": 1.17047917842865, + "step": 13698 + }, + { + "epoch": 2.49367434240466, + "grad_norm": 12.9375, + "learning_rate": 1.2774755428694017e-06, + "loss": 1.5245435237884521, + "step": 13700 + }, + { + "epoch": 2.4940384090288523, + "grad_norm": 16.75, + "learning_rate": 1.2770863425594553e-06, + "loss": 1.2447148561477661, + "step": 13702 + }, + { + "epoch": 2.4944024756530445, + "grad_norm": 9.0625, + "learning_rate": 1.2766973950801062e-06, + "loss": 0.7250251770019531, + "step": 13704 + }, + { + "epoch": 2.4947665422772367, + "grad_norm": 16.5, + "learning_rate": 1.2763087004884303e-06, + "loss": 1.4915473461151123, + "step": 13706 + }, + { + "epoch": 2.495130608901429, + "grad_norm": 10.875, + "learning_rate": 1.275920258841467e-06, + "loss": 1.5500664710998535, + "step": 13708 + }, + { + "epoch": 2.495494675525621, + "grad_norm": 6.84375, + "learning_rate": 1.275532070196219e-06, + "loss": 1.169064998626709, + "step": 13710 + }, + { + "epoch": 2.4958587421498133, + "grad_norm": 16.625, + "learning_rate": 1.2751441346096506e-06, + "loss": 1.3000593185424805, + "step": 13712 + }, + { + "epoch": 2.4962228087740055, + "grad_norm": 9.125, + "learning_rate": 1.2747564521386905e-06, + "loss": 1.241957664489746, + "step": 13714 + }, + { + "epoch": 2.4965868753981977, + "grad_norm": 10.3125, + "learning_rate": 1.2743690228402293e-06, + "loss": 1.7536050081253052, + "step": 13716 + }, + { + "epoch": 2.49695094202239, + "grad_norm": 30.25, + "learning_rate": 1.2739818467711202e-06, + "loss": 1.396217942237854, + "step": 13718 + }, + { + "epoch": 2.4973150086465825, + "grad_norm": 9.5625, + "learning_rate": 1.2735949239881807e-06, + "loss": 1.2755478620529175, + "step": 13720 + }, + { + "epoch": 2.4976790752707747, + "grad_norm": 3.609375, + "learning_rate": 1.2732082545481892e-06, + "loss": 1.0886292457580566, + "step": 13722 + }, + { + "epoch": 2.498043141894967, + "grad_norm": 13.4375, + "learning_rate": 1.2728218385078883e-06, + "loss": 1.5048471689224243, + "step": 13724 + }, + { + "epoch": 2.498407208519159, + "grad_norm": 17.0, + "learning_rate": 1.2724356759239831e-06, + "loss": 1.2710556983947754, + "step": 13726 + }, + { + "epoch": 2.4987712751433513, + "grad_norm": 18.75, + "learning_rate": 1.2720497668531409e-06, + "loss": 1.5529038906097412, + "step": 13728 + }, + { + "epoch": 2.4991353417675435, + "grad_norm": 8.0625, + "learning_rate": 1.2716641113519932e-06, + "loss": 1.694190263748169, + "step": 13730 + }, + { + "epoch": 2.4994994083917357, + "grad_norm": 13.25, + "learning_rate": 1.2712787094771326e-06, + "loss": 1.212223768234253, + "step": 13732 + }, + { + "epoch": 2.499863475015928, + "grad_norm": 29.75, + "learning_rate": 1.2708935612851153e-06, + "loss": 1.7869689464569092, + "step": 13734 + }, + { + "epoch": 2.50022754164012, + "grad_norm": 17.0, + "learning_rate": 1.2705086668324606e-06, + "loss": 1.857490062713623, + "step": 13736 + }, + { + "epoch": 2.5005916082643123, + "grad_norm": 11.875, + "learning_rate": 1.2701240261756497e-06, + "loss": 1.4428120851516724, + "step": 13738 + }, + { + "epoch": 2.5009556748885045, + "grad_norm": 9.25, + "learning_rate": 1.2697396393711281e-06, + "loss": 1.1334147453308105, + "step": 13740 + }, + { + "epoch": 2.501319741512697, + "grad_norm": 19.5, + "learning_rate": 1.2693555064753016e-06, + "loss": 1.4924687147140503, + "step": 13742 + }, + { + "epoch": 2.501683808136889, + "grad_norm": 15.0, + "learning_rate": 1.2689716275445413e-06, + "loss": 1.585281491279602, + "step": 13744 + }, + { + "epoch": 2.5020478747610815, + "grad_norm": 54.75, + "learning_rate": 1.2685880026351793e-06, + "loss": 1.6740095615386963, + "step": 13746 + }, + { + "epoch": 2.5024119413852732, + "grad_norm": 7.1875, + "learning_rate": 1.268204631803511e-06, + "loss": 1.4591107368469238, + "step": 13748 + }, + { + "epoch": 2.502776008009466, + "grad_norm": 5.90625, + "learning_rate": 1.2678215151057946e-06, + "loss": 1.0724551677703857, + "step": 13750 + }, + { + "epoch": 2.503140074633658, + "grad_norm": 14.4375, + "learning_rate": 1.267438652598251e-06, + "loss": 1.232196569442749, + "step": 13752 + }, + { + "epoch": 2.5035041412578503, + "grad_norm": 9.625, + "learning_rate": 1.267056044337064e-06, + "loss": 1.2873656749725342, + "step": 13754 + }, + { + "epoch": 2.5038682078820425, + "grad_norm": 37.5, + "learning_rate": 1.2666736903783789e-06, + "loss": 1.29099440574646, + "step": 13756 + }, + { + "epoch": 2.5042322745062346, + "grad_norm": 5.125, + "learning_rate": 1.2662915907783056e-06, + "loss": 1.1078013181686401, + "step": 13758 + }, + { + "epoch": 2.504596341130427, + "grad_norm": 12.5, + "learning_rate": 1.2659097455929147e-06, + "loss": 1.460280179977417, + "step": 13760 + }, + { + "epoch": 2.504960407754619, + "grad_norm": 13.25, + "learning_rate": 1.2655281548782417e-06, + "loss": 1.7749568223953247, + "step": 13762 + }, + { + "epoch": 2.5053244743788112, + "grad_norm": 4.09375, + "learning_rate": 1.2651468186902825e-06, + "loss": 1.3660907745361328, + "step": 13764 + }, + { + "epoch": 2.5056885410030034, + "grad_norm": 10.4375, + "learning_rate": 1.2647657370849966e-06, + "loss": 0.9901412129402161, + "step": 13766 + }, + { + "epoch": 2.5060526076271956, + "grad_norm": 8.25, + "learning_rate": 1.264384910118307e-06, + "loss": 1.341814637184143, + "step": 13768 + }, + { + "epoch": 2.506416674251388, + "grad_norm": 18.625, + "learning_rate": 1.2640043378460975e-06, + "loss": 1.4288243055343628, + "step": 13770 + }, + { + "epoch": 2.5067807408755804, + "grad_norm": 9.8125, + "learning_rate": 1.2636240203242164e-06, + "loss": 1.400880217552185, + "step": 13772 + }, + { + "epoch": 2.507144807499772, + "grad_norm": 9.5625, + "learning_rate": 1.2632439576084735e-06, + "loss": 1.1051479578018188, + "step": 13774 + }, + { + "epoch": 2.507508874123965, + "grad_norm": 9.625, + "learning_rate": 1.2628641497546412e-06, + "loss": 1.2555122375488281, + "step": 13776 + }, + { + "epoch": 2.507872940748157, + "grad_norm": 9.25, + "learning_rate": 1.262484596818455e-06, + "loss": 1.545544147491455, + "step": 13778 + }, + { + "epoch": 2.5082370073723492, + "grad_norm": 9.5625, + "learning_rate": 1.2621052988556127e-06, + "loss": 1.566070795059204, + "step": 13780 + }, + { + "epoch": 2.5086010739965414, + "grad_norm": 5.9375, + "learning_rate": 1.2617262559217745e-06, + "loss": 1.2859457731246948, + "step": 13782 + }, + { + "epoch": 2.5089651406207336, + "grad_norm": 11.25, + "learning_rate": 1.2613474680725635e-06, + "loss": 1.496281623840332, + "step": 13784 + }, + { + "epoch": 2.509329207244926, + "grad_norm": 24.0, + "learning_rate": 1.2609689353635658e-06, + "loss": 1.3814291954040527, + "step": 13786 + }, + { + "epoch": 2.509693273869118, + "grad_norm": 8.375, + "learning_rate": 1.2605906578503291e-06, + "loss": 1.2263468503952026, + "step": 13788 + }, + { + "epoch": 2.51005734049331, + "grad_norm": 15.3125, + "learning_rate": 1.260212635588364e-06, + "loss": 0.8674606084823608, + "step": 13790 + }, + { + "epoch": 2.5104214071175024, + "grad_norm": 15.9375, + "learning_rate": 1.259834868633144e-06, + "loss": 1.3211150169372559, + "step": 13792 + }, + { + "epoch": 2.5107854737416946, + "grad_norm": 15.3125, + "learning_rate": 1.2594573570401047e-06, + "loss": 1.525397539138794, + "step": 13794 + }, + { + "epoch": 2.511149540365887, + "grad_norm": 28.5, + "learning_rate": 1.2590801008646444e-06, + "loss": 1.7968785762786865, + "step": 13796 + }, + { + "epoch": 2.5115136069900794, + "grad_norm": 9.4375, + "learning_rate": 1.2587031001621242e-06, + "loss": 1.0394837856292725, + "step": 13798 + }, + { + "epoch": 2.511877673614271, + "grad_norm": 7.1875, + "learning_rate": 1.258326354987867e-06, + "loss": 1.3745152950286865, + "step": 13800 + }, + { + "epoch": 2.512241740238464, + "grad_norm": 11.0625, + "learning_rate": 1.257949865397159e-06, + "loss": 0.9093539714813232, + "step": 13802 + }, + { + "epoch": 2.512605806862656, + "grad_norm": 14.625, + "learning_rate": 1.257573631445248e-06, + "loss": 0.9860128164291382, + "step": 13804 + }, + { + "epoch": 2.512969873486848, + "grad_norm": 17.0, + "learning_rate": 1.2571976531873453e-06, + "loss": 1.7929331064224243, + "step": 13806 + }, + { + "epoch": 2.5133339401110404, + "grad_norm": 27.75, + "learning_rate": 1.2568219306786243e-06, + "loss": 1.7403910160064697, + "step": 13808 + }, + { + "epoch": 2.5136980067352326, + "grad_norm": 21.25, + "learning_rate": 1.2564464639742203e-06, + "loss": 2.1030211448669434, + "step": 13810 + }, + { + "epoch": 2.5140620733594248, + "grad_norm": 12.5625, + "learning_rate": 1.2560712531292315e-06, + "loss": 2.1319241523742676, + "step": 13812 + }, + { + "epoch": 2.514426139983617, + "grad_norm": 37.75, + "learning_rate": 1.2556962981987188e-06, + "loss": 1.795989751815796, + "step": 13814 + }, + { + "epoch": 2.514790206607809, + "grad_norm": 20.375, + "learning_rate": 1.2553215992377054e-06, + "loss": 1.5725948810577393, + "step": 13816 + }, + { + "epoch": 2.5151542732320014, + "grad_norm": 15.5, + "learning_rate": 1.254947156301177e-06, + "loss": 0.5411054491996765, + "step": 13818 + }, + { + "epoch": 2.5155183398561936, + "grad_norm": 13.9375, + "learning_rate": 1.254572969444081e-06, + "loss": 1.4140441417694092, + "step": 13820 + }, + { + "epoch": 2.5158824064803857, + "grad_norm": 12.5625, + "learning_rate": 1.2541990387213285e-06, + "loss": 1.4466546773910522, + "step": 13822 + }, + { + "epoch": 2.5162464731045784, + "grad_norm": 9.3125, + "learning_rate": 1.2538253641877915e-06, + "loss": 1.5120737552642822, + "step": 13824 + }, + { + "epoch": 2.51661053972877, + "grad_norm": 18.125, + "learning_rate": 1.253451945898306e-06, + "loss": 1.521331548690796, + "step": 13826 + }, + { + "epoch": 2.5169746063529628, + "grad_norm": 25.125, + "learning_rate": 1.2530787839076692e-06, + "loss": 0.9347392320632935, + "step": 13828 + }, + { + "epoch": 2.517338672977155, + "grad_norm": 13.125, + "learning_rate": 1.252705878270641e-06, + "loss": 1.5979176759719849, + "step": 13830 + }, + { + "epoch": 2.517702739601347, + "grad_norm": 5.59375, + "learning_rate": 1.2523332290419442e-06, + "loss": 1.349301815032959, + "step": 13832 + }, + { + "epoch": 2.5180668062255394, + "grad_norm": 31.875, + "learning_rate": 1.2519608362762637e-06, + "loss": 1.4526147842407227, + "step": 13834 + }, + { + "epoch": 2.5184308728497315, + "grad_norm": 9.6875, + "learning_rate": 1.2515887000282457e-06, + "loss": 0.9883827567100525, + "step": 13836 + }, + { + "epoch": 2.5187949394739237, + "grad_norm": 19.625, + "learning_rate": 1.2512168203525008e-06, + "loss": 1.437248706817627, + "step": 13838 + }, + { + "epoch": 2.519159006098116, + "grad_norm": 9.5625, + "learning_rate": 1.2508451973035998e-06, + "loss": 1.451150894165039, + "step": 13840 + }, + { + "epoch": 2.519523072722308, + "grad_norm": 5.34375, + "learning_rate": 1.2504738309360776e-06, + "loss": 1.0233227014541626, + "step": 13842 + }, + { + "epoch": 2.5198871393465003, + "grad_norm": 20.375, + "learning_rate": 1.2501027213044306e-06, + "loss": 1.3571885824203491, + "step": 13844 + }, + { + "epoch": 2.5202512059706925, + "grad_norm": 11.1875, + "learning_rate": 1.2497318684631174e-06, + "loss": 1.1320991516113281, + "step": 13846 + }, + { + "epoch": 2.5206152725948847, + "grad_norm": 24.75, + "learning_rate": 1.2493612724665593e-06, + "loss": 1.600059986114502, + "step": 13848 + }, + { + "epoch": 2.5209793392190774, + "grad_norm": 11.3125, + "learning_rate": 1.24899093336914e-06, + "loss": 1.5025813579559326, + "step": 13850 + }, + { + "epoch": 2.521343405843269, + "grad_norm": 16.375, + "learning_rate": 1.2486208512252048e-06, + "loss": 1.5578429698944092, + "step": 13852 + }, + { + "epoch": 2.5217074724674617, + "grad_norm": 13.6875, + "learning_rate": 1.248251026089062e-06, + "loss": 1.6880372762680054, + "step": 13854 + }, + { + "epoch": 2.522071539091654, + "grad_norm": 13.25, + "learning_rate": 1.247881458014982e-06, + "loss": 1.8128540515899658, + "step": 13856 + }, + { + "epoch": 2.522435605715846, + "grad_norm": 12.8125, + "learning_rate": 1.2475121470571972e-06, + "loss": 1.4829449653625488, + "step": 13858 + }, + { + "epoch": 2.5227996723400383, + "grad_norm": 66.5, + "learning_rate": 1.2471430932699024e-06, + "loss": 1.5019872188568115, + "step": 13860 + }, + { + "epoch": 2.5231637389642305, + "grad_norm": 9.6875, + "learning_rate": 1.2467742967072556e-06, + "loss": 1.5482354164123535, + "step": 13862 + }, + { + "epoch": 2.5235278055884227, + "grad_norm": 68.0, + "learning_rate": 1.2464057574233749e-06, + "loss": 1.4510550498962402, + "step": 13864 + }, + { + "epoch": 2.523891872212615, + "grad_norm": 19.375, + "learning_rate": 1.2460374754723427e-06, + "loss": 1.0535571575164795, + "step": 13866 + }, + { + "epoch": 2.524255938836807, + "grad_norm": 23.375, + "learning_rate": 1.245669450908203e-06, + "loss": 1.9020047187805176, + "step": 13868 + }, + { + "epoch": 2.5246200054609993, + "grad_norm": 16.625, + "learning_rate": 1.2453016837849618e-06, + "loss": 1.8082274198532104, + "step": 13870 + }, + { + "epoch": 2.5249840720851915, + "grad_norm": 17.875, + "learning_rate": 1.244934174156587e-06, + "loss": 1.144162654876709, + "step": 13872 + }, + { + "epoch": 2.5253481387093837, + "grad_norm": 11.4375, + "learning_rate": 1.2445669220770097e-06, + "loss": 1.4965258836746216, + "step": 13874 + }, + { + "epoch": 2.5257122053335763, + "grad_norm": 35.0, + "learning_rate": 1.2441999276001226e-06, + "loss": 1.5621570348739624, + "step": 13876 + }, + { + "epoch": 2.526076271957768, + "grad_norm": 5.90625, + "learning_rate": 1.2438331907797802e-06, + "loss": 1.2954308986663818, + "step": 13878 + }, + { + "epoch": 2.5264403385819607, + "grad_norm": 17.75, + "learning_rate": 1.2434667116697999e-06, + "loss": 1.0212547779083252, + "step": 13880 + }, + { + "epoch": 2.5268044052061525, + "grad_norm": 290.0, + "learning_rate": 1.243100490323961e-06, + "loss": 1.0245438814163208, + "step": 13882 + }, + { + "epoch": 2.527168471830345, + "grad_norm": 6.1875, + "learning_rate": 1.2427345267960054e-06, + "loss": 1.3658027648925781, + "step": 13884 + }, + { + "epoch": 2.5275325384545373, + "grad_norm": 9.8125, + "learning_rate": 1.2423688211396362e-06, + "loss": 1.2823487520217896, + "step": 13886 + }, + { + "epoch": 2.5278966050787295, + "grad_norm": 15.3125, + "learning_rate": 1.2420033734085193e-06, + "loss": 1.4981389045715332, + "step": 13888 + }, + { + "epoch": 2.5282606717029217, + "grad_norm": 21.625, + "learning_rate": 1.241638183656283e-06, + "loss": 1.4964405298233032, + "step": 13890 + }, + { + "epoch": 2.528624738327114, + "grad_norm": 15.4375, + "learning_rate": 1.2412732519365173e-06, + "loss": 1.746260404586792, + "step": 13892 + }, + { + "epoch": 2.528988804951306, + "grad_norm": 21.625, + "learning_rate": 1.2409085783027743e-06, + "loss": 1.6342716217041016, + "step": 13894 + }, + { + "epoch": 2.5293528715754983, + "grad_norm": 23.75, + "learning_rate": 1.2405441628085685e-06, + "loss": 1.8806421756744385, + "step": 13896 + }, + { + "epoch": 2.5297169381996905, + "grad_norm": 12.75, + "learning_rate": 1.2401800055073763e-06, + "loss": 1.5460951328277588, + "step": 13898 + }, + { + "epoch": 2.5300810048238827, + "grad_norm": 9.25, + "learning_rate": 1.2398161064526366e-06, + "loss": 1.4257985353469849, + "step": 13900 + }, + { + "epoch": 2.530445071448075, + "grad_norm": 10.625, + "learning_rate": 1.2394524656977493e-06, + "loss": 1.4062708616256714, + "step": 13902 + }, + { + "epoch": 2.530809138072267, + "grad_norm": 14.375, + "learning_rate": 1.2390890832960783e-06, + "loss": 1.4012010097503662, + "step": 13904 + }, + { + "epoch": 2.5311732046964597, + "grad_norm": 10.8125, + "learning_rate": 1.2387259593009478e-06, + "loss": 1.4376744031906128, + "step": 13906 + }, + { + "epoch": 2.5315372713206514, + "grad_norm": 7.96875, + "learning_rate": 1.2383630937656449e-06, + "loss": 1.4925472736358643, + "step": 13908 + }, + { + "epoch": 2.531901337944844, + "grad_norm": 13.8125, + "learning_rate": 1.2380004867434187e-06, + "loss": 1.521460771560669, + "step": 13910 + }, + { + "epoch": 2.5322654045690363, + "grad_norm": 14.8125, + "learning_rate": 1.2376381382874805e-06, + "loss": 1.5296789407730103, + "step": 13912 + }, + { + "epoch": 2.5326294711932285, + "grad_norm": 3.96875, + "learning_rate": 1.2372760484510033e-06, + "loss": 0.8247794508934021, + "step": 13914 + }, + { + "epoch": 2.5329935378174206, + "grad_norm": 17.5, + "learning_rate": 1.2369142172871221e-06, + "loss": 0.3369797468185425, + "step": 13916 + }, + { + "epoch": 2.533357604441613, + "grad_norm": 13.375, + "learning_rate": 1.2365526448489342e-06, + "loss": 0.5634779334068298, + "step": 13918 + }, + { + "epoch": 2.533721671065805, + "grad_norm": 12.125, + "learning_rate": 1.2361913311894994e-06, + "loss": 0.6927357912063599, + "step": 13920 + }, + { + "epoch": 2.5340857376899972, + "grad_norm": 6.8125, + "learning_rate": 1.2358302763618385e-06, + "loss": 1.1136428117752075, + "step": 13922 + }, + { + "epoch": 2.5344498043141894, + "grad_norm": 10.5, + "learning_rate": 1.235469480418935e-06, + "loss": 1.3693249225616455, + "step": 13924 + }, + { + "epoch": 2.5348138709383816, + "grad_norm": 20.875, + "learning_rate": 1.2351089434137343e-06, + "loss": 1.448167324066162, + "step": 13926 + }, + { + "epoch": 2.535177937562574, + "grad_norm": 22.125, + "learning_rate": 1.2347486653991436e-06, + "loss": 1.4910672903060913, + "step": 13928 + }, + { + "epoch": 2.535542004186766, + "grad_norm": 22.0, + "learning_rate": 1.234388646428032e-06, + "loss": 1.5934560298919678, + "step": 13930 + }, + { + "epoch": 2.5359060708109586, + "grad_norm": 7.84375, + "learning_rate": 1.2340288865532319e-06, + "loss": 1.4249051809310913, + "step": 13932 + }, + { + "epoch": 2.5362701374351504, + "grad_norm": 20.625, + "learning_rate": 1.233669385827535e-06, + "loss": 1.6374164819717407, + "step": 13934 + }, + { + "epoch": 2.536634204059343, + "grad_norm": 9.0625, + "learning_rate": 1.233310144303698e-06, + "loss": 1.2517423629760742, + "step": 13936 + }, + { + "epoch": 2.5369982706835352, + "grad_norm": 17.25, + "learning_rate": 1.2329511620344374e-06, + "loss": 1.587918758392334, + "step": 13938 + }, + { + "epoch": 2.5373623373077274, + "grad_norm": 7.90625, + "learning_rate": 1.2325924390724326e-06, + "loss": 1.4089986085891724, + "step": 13940 + }, + { + "epoch": 2.5377264039319196, + "grad_norm": 30.0, + "learning_rate": 1.2322339754703245e-06, + "loss": 1.6078845262527466, + "step": 13942 + }, + { + "epoch": 2.538090470556112, + "grad_norm": 47.0, + "learning_rate": 1.2318757712807164e-06, + "loss": 1.6663182973861694, + "step": 13944 + }, + { + "epoch": 2.538454537180304, + "grad_norm": 10.3125, + "learning_rate": 1.2315178265561733e-06, + "loss": 1.4863834381103516, + "step": 13946 + }, + { + "epoch": 2.538818603804496, + "grad_norm": 7.03125, + "learning_rate": 1.2311601413492223e-06, + "loss": 1.3234306573867798, + "step": 13948 + }, + { + "epoch": 2.5391826704286884, + "grad_norm": 13.6875, + "learning_rate": 1.2308027157123519e-06, + "loss": 1.421712040901184, + "step": 13950 + }, + { + "epoch": 2.5395467370528806, + "grad_norm": 14.5, + "learning_rate": 1.2304455496980132e-06, + "loss": 1.4140093326568604, + "step": 13952 + }, + { + "epoch": 2.539910803677073, + "grad_norm": 8.6875, + "learning_rate": 1.2300886433586186e-06, + "loss": 1.1182925701141357, + "step": 13954 + }, + { + "epoch": 2.540274870301265, + "grad_norm": 256.0, + "learning_rate": 1.2297319967465427e-06, + "loss": 1.3109902143478394, + "step": 13956 + }, + { + "epoch": 2.5406389369254576, + "grad_norm": 15.875, + "learning_rate": 1.2293756099141222e-06, + "loss": 1.7431678771972656, + "step": 13958 + }, + { + "epoch": 2.5410030035496494, + "grad_norm": 12.3125, + "learning_rate": 1.229019482913655e-06, + "loss": 1.6153838634490967, + "step": 13960 + }, + { + "epoch": 2.541367070173842, + "grad_norm": 21.625, + "learning_rate": 1.2286636157974017e-06, + "loss": 1.7951079607009888, + "step": 13962 + }, + { + "epoch": 2.541731136798034, + "grad_norm": 6.0625, + "learning_rate": 1.2283080086175842e-06, + "loss": 1.4971402883529663, + "step": 13964 + }, + { + "epoch": 2.5420952034222264, + "grad_norm": 9.5, + "learning_rate": 1.2279526614263863e-06, + "loss": 1.4799537658691406, + "step": 13966 + }, + { + "epoch": 2.5424592700464186, + "grad_norm": 15.9375, + "learning_rate": 1.2275975742759538e-06, + "loss": 1.5392813682556152, + "step": 13968 + }, + { + "epoch": 2.542823336670611, + "grad_norm": 20.125, + "learning_rate": 1.2272427472183944e-06, + "loss": 1.6199944019317627, + "step": 13970 + }, + { + "epoch": 2.543187403294803, + "grad_norm": 39.0, + "learning_rate": 1.2268881803057776e-06, + "loss": 2.1201658248901367, + "step": 13972 + }, + { + "epoch": 2.543551469918995, + "grad_norm": 24.875, + "learning_rate": 1.2265338735901344e-06, + "loss": 1.7551052570343018, + "step": 13974 + }, + { + "epoch": 2.5439155365431874, + "grad_norm": 8.4375, + "learning_rate": 1.2261798271234582e-06, + "loss": 1.130300760269165, + "step": 13976 + }, + { + "epoch": 2.5442796031673796, + "grad_norm": 11.625, + "learning_rate": 1.2258260409577035e-06, + "loss": 1.527069330215454, + "step": 13978 + }, + { + "epoch": 2.5446436697915717, + "grad_norm": 9.4375, + "learning_rate": 1.2254725151447873e-06, + "loss": 1.410813331604004, + "step": 13980 + }, + { + "epoch": 2.545007736415764, + "grad_norm": 11.0, + "learning_rate": 1.2251192497365879e-06, + "loss": 1.1892207860946655, + "step": 13982 + }, + { + "epoch": 2.5453718030399566, + "grad_norm": 14.5, + "learning_rate": 1.2247662447849457e-06, + "loss": 0.37004998326301575, + "step": 13984 + }, + { + "epoch": 2.5457358696641483, + "grad_norm": 8.5625, + "learning_rate": 1.2244135003416627e-06, + "loss": 1.1805295944213867, + "step": 13986 + }, + { + "epoch": 2.546099936288341, + "grad_norm": 10.125, + "learning_rate": 1.2240610164585028e-06, + "loss": 1.364537239074707, + "step": 13988 + }, + { + "epoch": 2.5464640029125327, + "grad_norm": 10.0625, + "learning_rate": 1.2237087931871912e-06, + "loss": 1.3939199447631836, + "step": 13990 + }, + { + "epoch": 2.5468280695367254, + "grad_norm": 14.8125, + "learning_rate": 1.2233568305794158e-06, + "loss": 1.6082172393798828, + "step": 13992 + }, + { + "epoch": 2.5471921361609176, + "grad_norm": 13.5, + "learning_rate": 1.2230051286868253e-06, + "loss": 1.3050611019134521, + "step": 13994 + }, + { + "epoch": 2.5475562027851097, + "grad_norm": 10.3125, + "learning_rate": 1.2226536875610306e-06, + "loss": 0.8881387114524841, + "step": 13996 + }, + { + "epoch": 2.547920269409302, + "grad_norm": 6.375, + "learning_rate": 1.2223025072536042e-06, + "loss": 1.3263578414916992, + "step": 13998 + }, + { + "epoch": 2.548284336033494, + "grad_norm": 6.28125, + "learning_rate": 1.2219515878160806e-06, + "loss": 1.412998914718628, + "step": 14000 + }, + { + "epoch": 2.5486484026576863, + "grad_norm": 5.46875, + "learning_rate": 1.221600929299956e-06, + "loss": 1.2710697650909424, + "step": 14002 + }, + { + "epoch": 2.5490124692818785, + "grad_norm": 8.75, + "learning_rate": 1.2212505317566872e-06, + "loss": 1.3614611625671387, + "step": 14004 + }, + { + "epoch": 2.5493765359060707, + "grad_norm": 9.6875, + "learning_rate": 1.2209003952376947e-06, + "loss": 1.9486970901489258, + "step": 14006 + }, + { + "epoch": 2.549740602530263, + "grad_norm": 7.5, + "learning_rate": 1.220550519794359e-06, + "loss": 1.3704783916473389, + "step": 14008 + }, + { + "epoch": 2.550104669154455, + "grad_norm": 13.3125, + "learning_rate": 1.2202009054780228e-06, + "loss": 1.4659056663513184, + "step": 14010 + }, + { + "epoch": 2.5504687357786473, + "grad_norm": 5.40625, + "learning_rate": 1.2198515523399912e-06, + "loss": 1.341232180595398, + "step": 14012 + }, + { + "epoch": 2.55083280240284, + "grad_norm": 15.5625, + "learning_rate": 1.2195024604315298e-06, + "loss": 1.6684938669204712, + "step": 14014 + }, + { + "epoch": 2.5511968690270317, + "grad_norm": 6.5625, + "learning_rate": 1.2191536298038664e-06, + "loss": 1.3729336261749268, + "step": 14016 + }, + { + "epoch": 2.5515609356512243, + "grad_norm": 11.375, + "learning_rate": 1.218805060508191e-06, + "loss": 1.0623828172683716, + "step": 14018 + }, + { + "epoch": 2.5519250022754165, + "grad_norm": 7.8125, + "learning_rate": 1.2184567525956544e-06, + "loss": 1.0805500745773315, + "step": 14020 + }, + { + "epoch": 2.5522890688996087, + "grad_norm": 14.25, + "learning_rate": 1.2181087061173694e-06, + "loss": 1.6687734127044678, + "step": 14022 + }, + { + "epoch": 2.552653135523801, + "grad_norm": 20.75, + "learning_rate": 1.2177609211244101e-06, + "loss": 1.6037479639053345, + "step": 14024 + }, + { + "epoch": 2.553017202147993, + "grad_norm": 8.25, + "learning_rate": 1.2174133976678133e-06, + "loss": 1.4225633144378662, + "step": 14026 + }, + { + "epoch": 2.5533812687721853, + "grad_norm": 10.875, + "learning_rate": 1.2170661357985757e-06, + "loss": 1.4339572191238403, + "step": 14028 + }, + { + "epoch": 2.5537453353963775, + "grad_norm": 53.5, + "learning_rate": 1.2167191355676573e-06, + "loss": 1.2567026615142822, + "step": 14030 + }, + { + "epoch": 2.5541094020205697, + "grad_norm": 12.6875, + "learning_rate": 1.216372397025979e-06, + "loss": 0.9043101072311401, + "step": 14032 + }, + { + "epoch": 2.554473468644762, + "grad_norm": 9.625, + "learning_rate": 1.2160259202244227e-06, + "loss": 1.0356941223144531, + "step": 14034 + }, + { + "epoch": 2.554837535268954, + "grad_norm": 25.125, + "learning_rate": 1.215679705213833e-06, + "loss": 1.522454857826233, + "step": 14036 + }, + { + "epoch": 2.5552016018931463, + "grad_norm": 12.4375, + "learning_rate": 1.2153337520450154e-06, + "loss": 1.6948034763336182, + "step": 14038 + }, + { + "epoch": 2.555565668517339, + "grad_norm": 13.75, + "learning_rate": 1.214988060768737e-06, + "loss": 1.432172179222107, + "step": 14040 + }, + { + "epoch": 2.5559297351415307, + "grad_norm": 10.625, + "learning_rate": 1.2146426314357262e-06, + "loss": 1.0863529443740845, + "step": 14042 + }, + { + "epoch": 2.5562938017657233, + "grad_norm": 7.5625, + "learning_rate": 1.2142974640966743e-06, + "loss": 0.9806747436523438, + "step": 14044 + }, + { + "epoch": 2.5566578683899155, + "grad_norm": 100.5, + "learning_rate": 1.2139525588022325e-06, + "loss": 0.7954238057136536, + "step": 14046 + }, + { + "epoch": 2.5570219350141077, + "grad_norm": 8.5625, + "learning_rate": 1.2136079156030148e-06, + "loss": 0.9820431470870972, + "step": 14048 + }, + { + "epoch": 2.5573860016383, + "grad_norm": 35.25, + "learning_rate": 1.2132635345495956e-06, + "loss": 1.075451374053955, + "step": 14050 + }, + { + "epoch": 2.557750068262492, + "grad_norm": 17.375, + "learning_rate": 1.2129194156925118e-06, + "loss": 1.5588490962982178, + "step": 14052 + }, + { + "epoch": 2.5581141348866843, + "grad_norm": 19.375, + "learning_rate": 1.2125755590822613e-06, + "loss": 1.5445644855499268, + "step": 14054 + }, + { + "epoch": 2.5584782015108765, + "grad_norm": 27.375, + "learning_rate": 1.2122319647693036e-06, + "loss": 1.5661966800689697, + "step": 14056 + }, + { + "epoch": 2.5588422681350687, + "grad_norm": 11.6875, + "learning_rate": 1.21188863280406e-06, + "loss": 1.3662261962890625, + "step": 14058 + }, + { + "epoch": 2.559206334759261, + "grad_norm": 21.5, + "learning_rate": 1.211545563236913e-06, + "loss": 1.576395869255066, + "step": 14060 + }, + { + "epoch": 2.559570401383453, + "grad_norm": 11.0, + "learning_rate": 1.2112027561182066e-06, + "loss": 1.4173095226287842, + "step": 14062 + }, + { + "epoch": 2.5599344680076452, + "grad_norm": 15.8125, + "learning_rate": 1.2108602114982463e-06, + "loss": 2.094416379928589, + "step": 14064 + }, + { + "epoch": 2.560298534631838, + "grad_norm": 9.4375, + "learning_rate": 1.2105179294272993e-06, + "loss": 1.495354413986206, + "step": 14066 + }, + { + "epoch": 2.5606626012560296, + "grad_norm": 24.875, + "learning_rate": 1.2101759099555938e-06, + "loss": 1.4475536346435547, + "step": 14068 + }, + { + "epoch": 2.5610266678802223, + "grad_norm": 16.875, + "learning_rate": 1.2098341531333202e-06, + "loss": 1.4381740093231201, + "step": 14070 + }, + { + "epoch": 2.5613907345044145, + "grad_norm": 7.6875, + "learning_rate": 1.2094926590106298e-06, + "loss": 1.404022455215454, + "step": 14072 + }, + { + "epoch": 2.5617548011286067, + "grad_norm": 12.5, + "learning_rate": 1.2091514276376355e-06, + "loss": 0.83309006690979, + "step": 14074 + }, + { + "epoch": 2.562118867752799, + "grad_norm": 7.71875, + "learning_rate": 1.208810459064411e-06, + "loss": 0.46824920177459717, + "step": 14076 + }, + { + "epoch": 2.562482934376991, + "grad_norm": 16.25, + "learning_rate": 1.2084697533409928e-06, + "loss": 1.586581826210022, + "step": 14078 + }, + { + "epoch": 2.5628470010011832, + "grad_norm": 100.5, + "learning_rate": 1.2081293105173783e-06, + "loss": 1.2888014316558838, + "step": 14080 + }, + { + "epoch": 2.5632110676253754, + "grad_norm": 4.53125, + "learning_rate": 1.207789130643525e-06, + "loss": 1.4294322729110718, + "step": 14082 + }, + { + "epoch": 2.5635751342495676, + "grad_norm": 8.75, + "learning_rate": 1.2074492137693539e-06, + "loss": 1.0541068315505981, + "step": 14084 + }, + { + "epoch": 2.56393920087376, + "grad_norm": 7.6875, + "learning_rate": 1.2071095599447464e-06, + "loss": 1.5774511098861694, + "step": 14086 + }, + { + "epoch": 2.564303267497952, + "grad_norm": 4.90625, + "learning_rate": 1.2067701692195443e-06, + "loss": 1.4180166721343994, + "step": 14088 + }, + { + "epoch": 2.564667334122144, + "grad_norm": 9.4375, + "learning_rate": 1.2064310416435532e-06, + "loss": 1.1897733211517334, + "step": 14090 + }, + { + "epoch": 2.565031400746337, + "grad_norm": 7.1875, + "learning_rate": 1.2060921772665378e-06, + "loss": 1.3457527160644531, + "step": 14092 + }, + { + "epoch": 2.5653954673705286, + "grad_norm": 22.125, + "learning_rate": 1.2057535761382253e-06, + "loss": 1.5147616863250732, + "step": 14094 + }, + { + "epoch": 2.5657595339947212, + "grad_norm": 16.125, + "learning_rate": 1.205415238308304e-06, + "loss": 1.2961853742599487, + "step": 14096 + }, + { + "epoch": 2.5661236006189134, + "grad_norm": 23.5, + "learning_rate": 1.205077163826424e-06, + "loss": 1.6319491863250732, + "step": 14098 + }, + { + "epoch": 2.5664876672431056, + "grad_norm": 21.0, + "learning_rate": 1.2047393527421956e-06, + "loss": 1.7514996528625488, + "step": 14100 + }, + { + "epoch": 2.566851733867298, + "grad_norm": 28.125, + "learning_rate": 1.204401805105192e-06, + "loss": 1.10114324092865, + "step": 14102 + }, + { + "epoch": 2.56721580049149, + "grad_norm": 7.34375, + "learning_rate": 1.2040645209649462e-06, + "loss": 1.4271674156188965, + "step": 14104 + }, + { + "epoch": 2.567579867115682, + "grad_norm": 7.75, + "learning_rate": 1.2037275003709539e-06, + "loss": 1.4360320568084717, + "step": 14106 + }, + { + "epoch": 2.5679439337398744, + "grad_norm": 6.1875, + "learning_rate": 1.203390743372671e-06, + "loss": 1.0118153095245361, + "step": 14108 + }, + { + "epoch": 2.5683080003640666, + "grad_norm": 6.09375, + "learning_rate": 1.2030542500195159e-06, + "loss": 1.2163078784942627, + "step": 14110 + }, + { + "epoch": 2.568672066988259, + "grad_norm": 28.875, + "learning_rate": 1.2027180203608668e-06, + "loss": 1.2504976987838745, + "step": 14112 + }, + { + "epoch": 2.569036133612451, + "grad_norm": 21.125, + "learning_rate": 1.2023820544460647e-06, + "loss": 1.2607107162475586, + "step": 14114 + }, + { + "epoch": 2.569400200236643, + "grad_norm": 7.09375, + "learning_rate": 1.2020463523244112e-06, + "loss": 1.0573214292526245, + "step": 14116 + }, + { + "epoch": 2.5697642668608354, + "grad_norm": 13.375, + "learning_rate": 1.2017109140451687e-06, + "loss": 1.6065982580184937, + "step": 14118 + }, + { + "epoch": 2.5701283334850276, + "grad_norm": 18.25, + "learning_rate": 1.2013757396575619e-06, + "loss": 0.9907183647155762, + "step": 14120 + }, + { + "epoch": 2.57049240010922, + "grad_norm": 11.5, + "learning_rate": 1.201040829210776e-06, + "loss": 1.2247848510742188, + "step": 14122 + }, + { + "epoch": 2.570856466733412, + "grad_norm": 6.21875, + "learning_rate": 1.200706182753958e-06, + "loss": 1.1839265823364258, + "step": 14124 + }, + { + "epoch": 2.5712205333576046, + "grad_norm": 20.125, + "learning_rate": 1.2003718003362155e-06, + "loss": 1.9839324951171875, + "step": 14126 + }, + { + "epoch": 2.571584599981797, + "grad_norm": 9.875, + "learning_rate": 1.2000376820066183e-06, + "loss": 1.9553616046905518, + "step": 14128 + }, + { + "epoch": 2.571948666605989, + "grad_norm": 3.1875, + "learning_rate": 1.1997038278141966e-06, + "loss": 1.0969654321670532, + "step": 14130 + }, + { + "epoch": 2.572312733230181, + "grad_norm": 8.8125, + "learning_rate": 1.1993702378079422e-06, + "loss": 1.151517391204834, + "step": 14132 + }, + { + "epoch": 2.5726767998543734, + "grad_norm": 15.625, + "learning_rate": 1.1990369120368082e-06, + "loss": 1.244920253753662, + "step": 14134 + }, + { + "epoch": 2.5730408664785656, + "grad_norm": 12.375, + "learning_rate": 1.1987038505497088e-06, + "loss": 1.6495258808135986, + "step": 14136 + }, + { + "epoch": 2.5734049331027578, + "grad_norm": 10.875, + "learning_rate": 1.198371053395519e-06, + "loss": 1.46974778175354, + "step": 14138 + }, + { + "epoch": 2.57376899972695, + "grad_norm": 3.828125, + "learning_rate": 1.198038520623076e-06, + "loss": 1.18740975856781, + "step": 14140 + }, + { + "epoch": 2.574133066351142, + "grad_norm": 16.25, + "learning_rate": 1.1977062522811768e-06, + "loss": 0.8307124972343445, + "step": 14142 + }, + { + "epoch": 2.5744971329753343, + "grad_norm": 19.5, + "learning_rate": 1.1973742484185818e-06, + "loss": 1.0607904195785522, + "step": 14144 + }, + { + "epoch": 2.5748611995995265, + "grad_norm": 7.90625, + "learning_rate": 1.1970425090840099e-06, + "loss": 1.2869579792022705, + "step": 14146 + }, + { + "epoch": 2.575225266223719, + "grad_norm": 9.5, + "learning_rate": 1.1967110343261434e-06, + "loss": 1.5717499256134033, + "step": 14148 + }, + { + "epoch": 2.575589332847911, + "grad_norm": 31.0, + "learning_rate": 1.1963798241936245e-06, + "loss": 1.4820665121078491, + "step": 14150 + }, + { + "epoch": 2.5759533994721036, + "grad_norm": 32.75, + "learning_rate": 1.1960488787350566e-06, + "loss": 0.6237311363220215, + "step": 14152 + }, + { + "epoch": 2.5763174660962957, + "grad_norm": 11.25, + "learning_rate": 1.1957181979990054e-06, + "loss": 1.5681191682815552, + "step": 14154 + }, + { + "epoch": 2.576681532720488, + "grad_norm": 9.3125, + "learning_rate": 1.1953877820339965e-06, + "loss": 1.417885422706604, + "step": 14156 + }, + { + "epoch": 2.57704559934468, + "grad_norm": 6.75, + "learning_rate": 1.1950576308885166e-06, + "loss": 1.273742914199829, + "step": 14158 + }, + { + "epoch": 2.5774096659688723, + "grad_norm": 8.875, + "learning_rate": 1.194727744611015e-06, + "loss": 1.3932478427886963, + "step": 14160 + }, + { + "epoch": 2.5777737325930645, + "grad_norm": 14.375, + "learning_rate": 1.1943981232499008e-06, + "loss": 1.3501145839691162, + "step": 14162 + }, + { + "epoch": 2.5781377992172567, + "grad_norm": 12.5625, + "learning_rate": 1.1940687668535444e-06, + "loss": 1.8123904466629028, + "step": 14164 + }, + { + "epoch": 2.578501865841449, + "grad_norm": 12.375, + "learning_rate": 1.1937396754702777e-06, + "loss": 1.1665410995483398, + "step": 14166 + }, + { + "epoch": 2.578865932465641, + "grad_norm": 11.9375, + "learning_rate": 1.1934108491483938e-06, + "loss": 1.5544352531433105, + "step": 14168 + }, + { + "epoch": 2.5792299990898333, + "grad_norm": 16.25, + "learning_rate": 1.1930822879361458e-06, + "loss": 1.2436751127243042, + "step": 14170 + }, + { + "epoch": 2.5795940657140255, + "grad_norm": 5.8125, + "learning_rate": 1.1927539918817496e-06, + "loss": 1.2847809791564941, + "step": 14172 + }, + { + "epoch": 2.579958132338218, + "grad_norm": 3.625, + "learning_rate": 1.1924259610333806e-06, + "loss": 1.0977860689163208, + "step": 14174 + }, + { + "epoch": 2.58032219896241, + "grad_norm": 19.625, + "learning_rate": 1.192098195439177e-06, + "loss": 1.4552431106567383, + "step": 14176 + }, + { + "epoch": 2.5806862655866025, + "grad_norm": 6.46875, + "learning_rate": 1.1917706951472358e-06, + "loss": 1.06253182888031, + "step": 14178 + }, + { + "epoch": 2.5810503322107947, + "grad_norm": 14.375, + "learning_rate": 1.191443460205617e-06, + "loss": 1.0235005617141724, + "step": 14180 + }, + { + "epoch": 2.581414398834987, + "grad_norm": 18.25, + "learning_rate": 1.1911164906623415e-06, + "loss": 0.6101502180099487, + "step": 14182 + }, + { + "epoch": 2.581778465459179, + "grad_norm": 34.25, + "learning_rate": 1.1907897865653897e-06, + "loss": 1.621131181716919, + "step": 14184 + }, + { + "epoch": 2.5821425320833713, + "grad_norm": 32.75, + "learning_rate": 1.190463347962705e-06, + "loss": 1.3554693460464478, + "step": 14186 + }, + { + "epoch": 2.5825065987075635, + "grad_norm": 17.375, + "learning_rate": 1.1901371749021905e-06, + "loss": 1.155472993850708, + "step": 14188 + }, + { + "epoch": 2.5828706653317557, + "grad_norm": 9.5, + "learning_rate": 1.1898112674317106e-06, + "loss": 1.6012673377990723, + "step": 14190 + }, + { + "epoch": 2.583234731955948, + "grad_norm": 11.0, + "learning_rate": 1.1894856255990914e-06, + "loss": 1.8098478317260742, + "step": 14192 + }, + { + "epoch": 2.58359879858014, + "grad_norm": 5.34375, + "learning_rate": 1.1891602494521192e-06, + "loss": 1.045157551765442, + "step": 14194 + }, + { + "epoch": 2.5839628652043323, + "grad_norm": 37.25, + "learning_rate": 1.1888351390385417e-06, + "loss": 1.7236558198928833, + "step": 14196 + }, + { + "epoch": 2.5843269318285245, + "grad_norm": 12.125, + "learning_rate": 1.1885102944060676e-06, + "loss": 1.464908480644226, + "step": 14198 + }, + { + "epoch": 2.584690998452717, + "grad_norm": 41.25, + "learning_rate": 1.1881857156023665e-06, + "loss": 1.8662400245666504, + "step": 14200 + }, + { + "epoch": 2.585055065076909, + "grad_norm": 14.75, + "learning_rate": 1.1878614026750688e-06, + "loss": 1.4441614151000977, + "step": 14202 + }, + { + "epoch": 2.5854191317011015, + "grad_norm": 6.625, + "learning_rate": 1.1875373556717665e-06, + "loss": 1.3337925672531128, + "step": 14204 + }, + { + "epoch": 2.5857831983252937, + "grad_norm": 15.875, + "learning_rate": 1.187213574640012e-06, + "loss": 1.5422848463058472, + "step": 14206 + }, + { + "epoch": 2.586147264949486, + "grad_norm": 9.5, + "learning_rate": 1.1868900596273186e-06, + "loss": 1.3169329166412354, + "step": 14208 + }, + { + "epoch": 2.586511331573678, + "grad_norm": 13.5, + "learning_rate": 1.1865668106811611e-06, + "loss": 1.3868608474731445, + "step": 14210 + }, + { + "epoch": 2.5868753981978703, + "grad_norm": 13.0, + "learning_rate": 1.186243827848975e-06, + "loss": 1.5084577798843384, + "step": 14212 + }, + { + "epoch": 2.5872394648220625, + "grad_norm": 12.125, + "learning_rate": 1.1859211111781568e-06, + "loss": 1.442775845527649, + "step": 14214 + }, + { + "epoch": 2.5876035314462547, + "grad_norm": 20.75, + "learning_rate": 1.1855986607160636e-06, + "loss": 1.5960636138916016, + "step": 14216 + }, + { + "epoch": 2.587967598070447, + "grad_norm": 18.5, + "learning_rate": 1.185276476510014e-06, + "loss": 1.8336906433105469, + "step": 14218 + }, + { + "epoch": 2.588331664694639, + "grad_norm": 15.1875, + "learning_rate": 1.1849545586072866e-06, + "loss": 1.584119439125061, + "step": 14220 + }, + { + "epoch": 2.5886957313188312, + "grad_norm": 11.0, + "learning_rate": 1.184632907055122e-06, + "loss": 1.3604992628097534, + "step": 14222 + }, + { + "epoch": 2.5890597979430234, + "grad_norm": 6.25, + "learning_rate": 1.1843115219007217e-06, + "loss": 1.135657548904419, + "step": 14224 + }, + { + "epoch": 2.589423864567216, + "grad_norm": 36.25, + "learning_rate": 1.1839904031912472e-06, + "loss": 1.3711371421813965, + "step": 14226 + }, + { + "epoch": 2.589787931191408, + "grad_norm": 6.5625, + "learning_rate": 1.1836695509738211e-06, + "loss": 1.2336375713348389, + "step": 14228 + }, + { + "epoch": 2.5901519978156005, + "grad_norm": 16.875, + "learning_rate": 1.1833489652955275e-06, + "loss": 1.243660807609558, + "step": 14230 + }, + { + "epoch": 2.590516064439792, + "grad_norm": 17.5, + "learning_rate": 1.1830286462034112e-06, + "loss": 0.7745146751403809, + "step": 14232 + }, + { + "epoch": 2.590880131063985, + "grad_norm": 8.0, + "learning_rate": 1.182708593744477e-06, + "loss": 1.038374423980713, + "step": 14234 + }, + { + "epoch": 2.591244197688177, + "grad_norm": 9.0, + "learning_rate": 1.1823888079656926e-06, + "loss": 1.5681627988815308, + "step": 14236 + }, + { + "epoch": 2.5916082643123692, + "grad_norm": 13.0625, + "learning_rate": 1.1820692889139838e-06, + "loss": 1.5277447700500488, + "step": 14238 + }, + { + "epoch": 2.5919723309365614, + "grad_norm": 19.875, + "learning_rate": 1.1817500366362398e-06, + "loss": 1.6709702014923096, + "step": 14240 + }, + { + "epoch": 2.5923363975607536, + "grad_norm": 15.8125, + "learning_rate": 1.1814310511793092e-06, + "loss": 1.6598896980285645, + "step": 14242 + }, + { + "epoch": 2.592700464184946, + "grad_norm": 20.0, + "learning_rate": 1.1811123325900017e-06, + "loss": 1.7732157707214355, + "step": 14244 + }, + { + "epoch": 2.593064530809138, + "grad_norm": 12.6875, + "learning_rate": 1.1807938809150883e-06, + "loss": 1.7412445545196533, + "step": 14246 + }, + { + "epoch": 2.59342859743333, + "grad_norm": 12.1875, + "learning_rate": 1.1804756962013e-06, + "loss": 1.464808702468872, + "step": 14248 + }, + { + "epoch": 2.5937926640575224, + "grad_norm": 13.5625, + "learning_rate": 1.1801577784953295e-06, + "loss": 1.4241043329238892, + "step": 14250 + }, + { + "epoch": 2.5941567306817146, + "grad_norm": 14.5, + "learning_rate": 1.1798401278438298e-06, + "loss": 1.095261812210083, + "step": 14252 + }, + { + "epoch": 2.594520797305907, + "grad_norm": 13.25, + "learning_rate": 1.179522744293415e-06, + "loss": 0.671078085899353, + "step": 14254 + }, + { + "epoch": 2.5948848639300994, + "grad_norm": 8.625, + "learning_rate": 1.1792056278906594e-06, + "loss": 1.6016740798950195, + "step": 14256 + }, + { + "epoch": 2.595248930554291, + "grad_norm": 13.875, + "learning_rate": 1.1788887786820993e-06, + "loss": 1.1386425495147705, + "step": 14258 + }, + { + "epoch": 2.595612997178484, + "grad_norm": 13.875, + "learning_rate": 1.1785721967142304e-06, + "loss": 1.4968576431274414, + "step": 14260 + }, + { + "epoch": 2.595977063802676, + "grad_norm": 34.0, + "learning_rate": 1.17825588203351e-06, + "loss": 1.8564518690109253, + "step": 14262 + }, + { + "epoch": 2.596341130426868, + "grad_norm": 6.21875, + "learning_rate": 1.177939834686356e-06, + "loss": 0.9061320424079895, + "step": 14264 + }, + { + "epoch": 2.5967051970510604, + "grad_norm": 6.03125, + "learning_rate": 1.1776240547191475e-06, + "loss": 1.1914856433868408, + "step": 14266 + }, + { + "epoch": 2.5970692636752526, + "grad_norm": 12.125, + "learning_rate": 1.177308542178223e-06, + "loss": 1.5928727388381958, + "step": 14268 + }, + { + "epoch": 2.597433330299445, + "grad_norm": 19.0, + "learning_rate": 1.1769932971098835e-06, + "loss": 1.5279734134674072, + "step": 14270 + }, + { + "epoch": 2.597797396923637, + "grad_norm": 18.625, + "learning_rate": 1.1766783195603895e-06, + "loss": 0.8941159248352051, + "step": 14272 + }, + { + "epoch": 2.598161463547829, + "grad_norm": 16.875, + "learning_rate": 1.1763636095759628e-06, + "loss": 1.8265458345413208, + "step": 14274 + }, + { + "epoch": 2.5985255301720214, + "grad_norm": 7.59375, + "learning_rate": 1.176049167202786e-06, + "loss": 1.4179325103759766, + "step": 14276 + }, + { + "epoch": 2.5988895967962136, + "grad_norm": 51.0, + "learning_rate": 1.1757349924870021e-06, + "loss": 1.2888375520706177, + "step": 14278 + }, + { + "epoch": 2.5992536634204058, + "grad_norm": 40.75, + "learning_rate": 1.175421085474715e-06, + "loss": 1.3022136688232422, + "step": 14280 + }, + { + "epoch": 2.5996177300445984, + "grad_norm": 8.625, + "learning_rate": 1.175107446211989e-06, + "loss": 1.5855441093444824, + "step": 14282 + }, + { + "epoch": 2.59998179666879, + "grad_norm": 6.3125, + "learning_rate": 1.1747940747448497e-06, + "loss": 1.3258237838745117, + "step": 14284 + }, + { + "epoch": 2.600345863292983, + "grad_norm": 35.0, + "learning_rate": 1.174480971119283e-06, + "loss": 1.7382245063781738, + "step": 14286 + }, + { + "epoch": 2.600709929917175, + "grad_norm": 15.3125, + "learning_rate": 1.1741681353812358e-06, + "loss": 1.469666600227356, + "step": 14288 + }, + { + "epoch": 2.601073996541367, + "grad_norm": 8.0625, + "learning_rate": 1.173855567576615e-06, + "loss": 1.1094344854354858, + "step": 14290 + }, + { + "epoch": 2.6014380631655594, + "grad_norm": 7.71875, + "learning_rate": 1.173543267751289e-06, + "loss": 1.4580005407333374, + "step": 14292 + }, + { + "epoch": 2.6018021297897516, + "grad_norm": 10.625, + "learning_rate": 1.1732312359510867e-06, + "loss": 1.124633550643921, + "step": 14294 + }, + { + "epoch": 2.6021661964139438, + "grad_norm": 9.625, + "learning_rate": 1.1729194722217972e-06, + "loss": 1.5115344524383545, + "step": 14296 + }, + { + "epoch": 2.602530263038136, + "grad_norm": 10.5, + "learning_rate": 1.1726079766091706e-06, + "loss": 1.4737446308135986, + "step": 14298 + }, + { + "epoch": 2.602894329662328, + "grad_norm": 24.0, + "learning_rate": 1.1722967491589175e-06, + "loss": 1.6549428701400757, + "step": 14300 + }, + { + "epoch": 2.6032583962865203, + "grad_norm": 11.5, + "learning_rate": 1.1719857899167096e-06, + "loss": 1.7363176345825195, + "step": 14302 + }, + { + "epoch": 2.6036224629107125, + "grad_norm": 3.640625, + "learning_rate": 1.1716750989281787e-06, + "loss": 0.9746482372283936, + "step": 14304 + }, + { + "epoch": 2.6039865295349047, + "grad_norm": 3.796875, + "learning_rate": 1.1713646762389174e-06, + "loss": 0.9989664554595947, + "step": 14306 + }, + { + "epoch": 2.6043505961590974, + "grad_norm": 11.125, + "learning_rate": 1.1710545218944795e-06, + "loss": 1.170009732246399, + "step": 14308 + }, + { + "epoch": 2.604714662783289, + "grad_norm": 23.25, + "learning_rate": 1.170744635940378e-06, + "loss": 1.514284372329712, + "step": 14310 + }, + { + "epoch": 2.6050787294074818, + "grad_norm": 19.625, + "learning_rate": 1.170435018422088e-06, + "loss": 1.518493413925171, + "step": 14312 + }, + { + "epoch": 2.605442796031674, + "grad_norm": 18.125, + "learning_rate": 1.1701256693850443e-06, + "loss": 1.315392017364502, + "step": 14314 + }, + { + "epoch": 2.605806862655866, + "grad_norm": 26.625, + "learning_rate": 1.1698165888746427e-06, + "loss": 1.8009157180786133, + "step": 14316 + }, + { + "epoch": 2.6061709292800583, + "grad_norm": 33.0, + "learning_rate": 1.1695077769362397e-06, + "loss": 1.5634257793426514, + "step": 14318 + }, + { + "epoch": 2.6065349959042505, + "grad_norm": 40.0, + "learning_rate": 1.1691992336151524e-06, + "loss": 2.0482845306396484, + "step": 14320 + }, + { + "epoch": 2.6068990625284427, + "grad_norm": 16.5, + "learning_rate": 1.1688909589566578e-06, + "loss": 1.777584433555603, + "step": 14322 + }, + { + "epoch": 2.607263129152635, + "grad_norm": 17.25, + "learning_rate": 1.168582953005994e-06, + "loss": 1.4168450832366943, + "step": 14324 + }, + { + "epoch": 2.607627195776827, + "grad_norm": 11.9375, + "learning_rate": 1.1682752158083598e-06, + "loss": 1.4832003116607666, + "step": 14326 + }, + { + "epoch": 2.6079912624010193, + "grad_norm": 31.875, + "learning_rate": 1.1679677474089146e-06, + "loss": 1.8173999786376953, + "step": 14328 + }, + { + "epoch": 2.6083553290252115, + "grad_norm": 11.875, + "learning_rate": 1.1676605478527777e-06, + "loss": 1.5967960357666016, + "step": 14330 + }, + { + "epoch": 2.6087193956494037, + "grad_norm": 3.234375, + "learning_rate": 1.1673536171850295e-06, + "loss": 0.9845655560493469, + "step": 14332 + }, + { + "epoch": 2.6090834622735963, + "grad_norm": 36.0, + "learning_rate": 1.1670469554507109e-06, + "loss": 0.9247639179229736, + "step": 14334 + }, + { + "epoch": 2.609447528897788, + "grad_norm": 17.625, + "learning_rate": 1.1667405626948234e-06, + "loss": 0.6797451972961426, + "step": 14336 + }, + { + "epoch": 2.6098115955219807, + "grad_norm": 11.4375, + "learning_rate": 1.166434438962329e-06, + "loss": 1.5167579650878906, + "step": 14338 + }, + { + "epoch": 2.610175662146173, + "grad_norm": 22.125, + "learning_rate": 1.1661285842981495e-06, + "loss": 1.6723637580871582, + "step": 14340 + }, + { + "epoch": 2.610539728770365, + "grad_norm": 11.4375, + "learning_rate": 1.1658229987471683e-06, + "loss": 1.4314948320388794, + "step": 14342 + }, + { + "epoch": 2.6109037953945573, + "grad_norm": 8.4375, + "learning_rate": 1.165517682354229e-06, + "loss": 1.2646710872650146, + "step": 14344 + }, + { + "epoch": 2.6112678620187495, + "grad_norm": 51.0, + "learning_rate": 1.165212635164135e-06, + "loss": 1.2827895879745483, + "step": 14346 + }, + { + "epoch": 2.6116319286429417, + "grad_norm": 18.375, + "learning_rate": 1.1649078572216512e-06, + "loss": 1.4106487035751343, + "step": 14348 + }, + { + "epoch": 2.611995995267134, + "grad_norm": 11.3125, + "learning_rate": 1.1646033485715023e-06, + "loss": 1.4193886518478394, + "step": 14350 + }, + { + "epoch": 2.612360061891326, + "grad_norm": 9.3125, + "learning_rate": 1.164299109258374e-06, + "loss": 1.5058352947235107, + "step": 14352 + }, + { + "epoch": 2.6127241285155183, + "grad_norm": 8.1875, + "learning_rate": 1.1639951393269118e-06, + "loss": 1.394639492034912, + "step": 14354 + }, + { + "epoch": 2.6130881951397105, + "grad_norm": 9.875, + "learning_rate": 1.1636914388217224e-06, + "loss": 1.3609979152679443, + "step": 14356 + }, + { + "epoch": 2.6134522617639027, + "grad_norm": 15.1875, + "learning_rate": 1.1633880077873721e-06, + "loss": 1.588275671005249, + "step": 14358 + }, + { + "epoch": 2.613816328388095, + "grad_norm": 11.5625, + "learning_rate": 1.1630848462683885e-06, + "loss": 1.5709130764007568, + "step": 14360 + }, + { + "epoch": 2.614180395012287, + "grad_norm": 21.625, + "learning_rate": 1.1627819543092597e-06, + "loss": 1.4891860485076904, + "step": 14362 + }, + { + "epoch": 2.6145444616364797, + "grad_norm": 26.5, + "learning_rate": 1.1624793319544335e-06, + "loss": 1.5109132528305054, + "step": 14364 + }, + { + "epoch": 2.6149085282606714, + "grad_norm": 5.4375, + "learning_rate": 1.1621769792483182e-06, + "loss": 0.9553173780441284, + "step": 14366 + }, + { + "epoch": 2.615272594884864, + "grad_norm": 6.375, + "learning_rate": 1.1618748962352833e-06, + "loss": 1.087977647781372, + "step": 14368 + }, + { + "epoch": 2.6156366615090563, + "grad_norm": 25.75, + "learning_rate": 1.1615730829596583e-06, + "loss": 1.0443356037139893, + "step": 14370 + }, + { + "epoch": 2.6160007281332485, + "grad_norm": 54.75, + "learning_rate": 1.1612715394657326e-06, + "loss": 1.5442872047424316, + "step": 14372 + }, + { + "epoch": 2.6163647947574407, + "grad_norm": 12.5, + "learning_rate": 1.1609702657977568e-06, + "loss": 1.707930564880371, + "step": 14374 + }, + { + "epoch": 2.616728861381633, + "grad_norm": 11.75, + "learning_rate": 1.1606692619999418e-06, + "loss": 1.2924145460128784, + "step": 14376 + }, + { + "epoch": 2.617092928005825, + "grad_norm": 17.5, + "learning_rate": 1.1603685281164585e-06, + "loss": 1.6354949474334717, + "step": 14378 + }, + { + "epoch": 2.6174569946300172, + "grad_norm": 13.6875, + "learning_rate": 1.160068064191438e-06, + "loss": 1.9317681789398193, + "step": 14380 + }, + { + "epoch": 2.6178210612542094, + "grad_norm": 16.75, + "learning_rate": 1.1597678702689724e-06, + "loss": 1.2470465898513794, + "step": 14382 + }, + { + "epoch": 2.6181851278784016, + "grad_norm": 9.0625, + "learning_rate": 1.1594679463931142e-06, + "loss": 0.6711374521255493, + "step": 14384 + }, + { + "epoch": 2.618549194502594, + "grad_norm": 14.5, + "learning_rate": 1.1591682926078762e-06, + "loss": 1.3027068376541138, + "step": 14386 + }, + { + "epoch": 2.618913261126786, + "grad_norm": 12.0625, + "learning_rate": 1.1588689089572306e-06, + "loss": 1.6903302669525146, + "step": 14388 + }, + { + "epoch": 2.6192773277509787, + "grad_norm": 11.8125, + "learning_rate": 1.1585697954851113e-06, + "loss": 0.8386199474334717, + "step": 14390 + }, + { + "epoch": 2.6196413943751704, + "grad_norm": 6.53125, + "learning_rate": 1.1582709522354117e-06, + "loss": 1.4591509103775024, + "step": 14392 + }, + { + "epoch": 2.620005460999363, + "grad_norm": 3.234375, + "learning_rate": 1.157972379251986e-06, + "loss": 0.9996927380561829, + "step": 14394 + }, + { + "epoch": 2.6203695276235552, + "grad_norm": 13.25, + "learning_rate": 1.1576740765786488e-06, + "loss": 1.0451319217681885, + "step": 14396 + }, + { + "epoch": 2.6207335942477474, + "grad_norm": 6.625, + "learning_rate": 1.1573760442591741e-06, + "loss": 1.190625786781311, + "step": 14398 + }, + { + "epoch": 2.6210976608719396, + "grad_norm": 7.65625, + "learning_rate": 1.157078282337298e-06, + "loss": 1.2719480991363525, + "step": 14400 + }, + { + "epoch": 2.621461727496132, + "grad_norm": 12.625, + "learning_rate": 1.1567807908567148e-06, + "loss": 1.1912826299667358, + "step": 14402 + }, + { + "epoch": 2.621825794120324, + "grad_norm": 7.9375, + "learning_rate": 1.1564835698610808e-06, + "loss": 1.4906561374664307, + "step": 14404 + }, + { + "epoch": 2.622189860744516, + "grad_norm": 4.78125, + "learning_rate": 1.1561866193940119e-06, + "loss": 1.1690740585327148, + "step": 14406 + }, + { + "epoch": 2.6225539273687084, + "grad_norm": 3.84375, + "learning_rate": 1.1558899394990841e-06, + "loss": 0.8879704475402832, + "step": 14408 + }, + { + "epoch": 2.6229179939929006, + "grad_norm": 6.46875, + "learning_rate": 1.1555935302198343e-06, + "loss": 1.193251609802246, + "step": 14410 + }, + { + "epoch": 2.623282060617093, + "grad_norm": 17.625, + "learning_rate": 1.1552973915997592e-06, + "loss": 1.4749467372894287, + "step": 14412 + }, + { + "epoch": 2.623646127241285, + "grad_norm": 7.3125, + "learning_rate": 1.1550015236823158e-06, + "loss": 1.5314549207687378, + "step": 14414 + }, + { + "epoch": 2.6240101938654776, + "grad_norm": 16.75, + "learning_rate": 1.154705926510922e-06, + "loss": 1.4068033695220947, + "step": 14416 + }, + { + "epoch": 2.6243742604896694, + "grad_norm": 7.46875, + "learning_rate": 1.1544106001289549e-06, + "loss": 1.1362931728363037, + "step": 14418 + }, + { + "epoch": 2.624738327113862, + "grad_norm": 9.375, + "learning_rate": 1.1541155445797527e-06, + "loss": 0.8063347339630127, + "step": 14420 + }, + { + "epoch": 2.625102393738054, + "grad_norm": 6.875, + "learning_rate": 1.1538207599066138e-06, + "loss": 1.5680792331695557, + "step": 14422 + }, + { + "epoch": 2.6254664603622464, + "grad_norm": 15.5625, + "learning_rate": 1.1535262461527962e-06, + "loss": 1.4349617958068848, + "step": 14424 + }, + { + "epoch": 2.6258305269864386, + "grad_norm": 23.375, + "learning_rate": 1.1532320033615191e-06, + "loss": 1.9161278009414673, + "step": 14426 + }, + { + "epoch": 2.626194593610631, + "grad_norm": 14.4375, + "learning_rate": 1.152938031575961e-06, + "loss": 1.3714648485183716, + "step": 14428 + }, + { + "epoch": 2.626558660234823, + "grad_norm": 26.5, + "learning_rate": 1.1526443308392615e-06, + "loss": 1.9736828804016113, + "step": 14430 + }, + { + "epoch": 2.626922726859015, + "grad_norm": 17.875, + "learning_rate": 1.15235090119452e-06, + "loss": 1.324831247329712, + "step": 14432 + }, + { + "epoch": 2.6272867934832074, + "grad_norm": 10.4375, + "learning_rate": 1.1520577426847952e-06, + "loss": 1.5042481422424316, + "step": 14434 + }, + { + "epoch": 2.6276508601073996, + "grad_norm": 21.25, + "learning_rate": 1.151764855353108e-06, + "loss": 1.3443495035171509, + "step": 14436 + }, + { + "epoch": 2.6280149267315918, + "grad_norm": 5.90625, + "learning_rate": 1.151472239242438e-06, + "loss": 0.8775898814201355, + "step": 14438 + }, + { + "epoch": 2.628378993355784, + "grad_norm": 8.8125, + "learning_rate": 1.1511798943957255e-06, + "loss": 1.1691288948059082, + "step": 14440 + }, + { + "epoch": 2.6287430599799766, + "grad_norm": 10.125, + "learning_rate": 1.150887820855871e-06, + "loss": 1.4335007667541504, + "step": 14442 + }, + { + "epoch": 2.6291071266041683, + "grad_norm": 13.6875, + "learning_rate": 1.1505960186657349e-06, + "loss": 1.2584631443023682, + "step": 14444 + }, + { + "epoch": 2.629471193228361, + "grad_norm": 10.375, + "learning_rate": 1.150304487868138e-06, + "loss": 1.5659459829330444, + "step": 14446 + }, + { + "epoch": 2.629835259852553, + "grad_norm": 9.9375, + "learning_rate": 1.1500132285058613e-06, + "loss": 1.5534355640411377, + "step": 14448 + }, + { + "epoch": 2.6301993264767454, + "grad_norm": 13.0625, + "learning_rate": 1.1497222406216463e-06, + "loss": 1.129992127418518, + "step": 14450 + }, + { + "epoch": 2.6305633931009376, + "grad_norm": 49.25, + "learning_rate": 1.1494315242581936e-06, + "loss": 1.502547025680542, + "step": 14452 + }, + { + "epoch": 2.6309274597251298, + "grad_norm": 7.78125, + "learning_rate": 1.149141079458165e-06, + "loss": 1.3004871606826782, + "step": 14454 + }, + { + "epoch": 2.631291526349322, + "grad_norm": 3.671875, + "learning_rate": 1.1488509062641825e-06, + "loss": 1.029909610748291, + "step": 14456 + }, + { + "epoch": 2.631655592973514, + "grad_norm": 7.3125, + "learning_rate": 1.148561004718827e-06, + "loss": 1.453866720199585, + "step": 14458 + }, + { + "epoch": 2.6320196595977063, + "grad_norm": 13.875, + "learning_rate": 1.148271374864641e-06, + "loss": 1.7987303733825684, + "step": 14460 + }, + { + "epoch": 2.6323837262218985, + "grad_norm": 13.9375, + "learning_rate": 1.1479820167441265e-06, + "loss": 1.6310635805130005, + "step": 14462 + }, + { + "epoch": 2.6327477928460907, + "grad_norm": 11.75, + "learning_rate": 1.1476929303997454e-06, + "loss": 1.784327745437622, + "step": 14464 + }, + { + "epoch": 2.633111859470283, + "grad_norm": 11.75, + "learning_rate": 1.1474041158739201e-06, + "loss": 1.5094164609909058, + "step": 14466 + }, + { + "epoch": 2.6334759260944756, + "grad_norm": 19.0, + "learning_rate": 1.1471155732090325e-06, + "loss": 1.506624460220337, + "step": 14468 + }, + { + "epoch": 2.6338399927186673, + "grad_norm": 7.71875, + "learning_rate": 1.1468273024474258e-06, + "loss": 1.3636056184768677, + "step": 14470 + }, + { + "epoch": 2.63420405934286, + "grad_norm": 5.25, + "learning_rate": 1.1465393036314022e-06, + "loss": 1.4712549448013306, + "step": 14472 + }, + { + "epoch": 2.6345681259670517, + "grad_norm": 7.1875, + "learning_rate": 1.146251576803224e-06, + "loss": 0.996986448764801, + "step": 14474 + }, + { + "epoch": 2.6349321925912443, + "grad_norm": 120.5, + "learning_rate": 1.1459641220051148e-06, + "loss": 1.3135651350021362, + "step": 14476 + }, + { + "epoch": 2.6352962592154365, + "grad_norm": 9.6875, + "learning_rate": 1.1456769392792568e-06, + "loss": 0.8766427636146545, + "step": 14478 + }, + { + "epoch": 2.6356603258396287, + "grad_norm": 9.875, + "learning_rate": 1.145390028667793e-06, + "loss": 1.4558296203613281, + "step": 14480 + }, + { + "epoch": 2.636024392463821, + "grad_norm": 3.984375, + "learning_rate": 1.1451033902128264e-06, + "loss": 1.076955795288086, + "step": 14482 + }, + { + "epoch": 2.636388459088013, + "grad_norm": 18.5, + "learning_rate": 1.1448170239564201e-06, + "loss": 1.2234370708465576, + "step": 14484 + }, + { + "epoch": 2.6367525257122053, + "grad_norm": 16.25, + "learning_rate": 1.1445309299405971e-06, + "loss": 1.508989691734314, + "step": 14486 + }, + { + "epoch": 2.6371165923363975, + "grad_norm": 20.625, + "learning_rate": 1.1442451082073405e-06, + "loss": 1.3836790323257446, + "step": 14488 + }, + { + "epoch": 2.6374806589605897, + "grad_norm": 13.8125, + "learning_rate": 1.1439595587985937e-06, + "loss": 1.1410776376724243, + "step": 14490 + }, + { + "epoch": 2.637844725584782, + "grad_norm": 5.125, + "learning_rate": 1.1436742817562595e-06, + "loss": 1.3450950384140015, + "step": 14492 + }, + { + "epoch": 2.638208792208974, + "grad_norm": 12.8125, + "learning_rate": 1.1433892771222018e-06, + "loss": 1.4257886409759521, + "step": 14494 + }, + { + "epoch": 2.6385728588331663, + "grad_norm": 12.75, + "learning_rate": 1.1431045449382432e-06, + "loss": 1.494485855102539, + "step": 14496 + }, + { + "epoch": 2.638936925457359, + "grad_norm": 7.46875, + "learning_rate": 1.1428200852461675e-06, + "loss": 1.257080316543579, + "step": 14498 + }, + { + "epoch": 2.6393009920815507, + "grad_norm": 9.125, + "learning_rate": 1.1425358980877177e-06, + "loss": 1.38213312625885, + "step": 14500 + }, + { + "epoch": 2.6396650587057433, + "grad_norm": 15.8125, + "learning_rate": 1.142251983504597e-06, + "loss": 1.309072732925415, + "step": 14502 + }, + { + "epoch": 2.6400291253299355, + "grad_norm": 9.75, + "learning_rate": 1.1419683415384693e-06, + "loss": 1.4315125942230225, + "step": 14504 + }, + { + "epoch": 2.6403931919541277, + "grad_norm": 20.5, + "learning_rate": 1.1416849722309574e-06, + "loss": 1.6254199743270874, + "step": 14506 + }, + { + "epoch": 2.64075725857832, + "grad_norm": 16.0, + "learning_rate": 1.1414018756236446e-06, + "loss": 1.2900720834732056, + "step": 14508 + }, + { + "epoch": 2.641121325202512, + "grad_norm": 38.25, + "learning_rate": 1.1411190517580745e-06, + "loss": 1.8353610038757324, + "step": 14510 + }, + { + "epoch": 2.6414853918267043, + "grad_norm": 16.0, + "learning_rate": 1.1408365006757504e-06, + "loss": 1.5915122032165527, + "step": 14512 + }, + { + "epoch": 2.6418494584508965, + "grad_norm": 8.625, + "learning_rate": 1.140554222418135e-06, + "loss": 1.2189075946807861, + "step": 14514 + }, + { + "epoch": 2.6422135250750887, + "grad_norm": 5.78125, + "learning_rate": 1.1402722170266517e-06, + "loss": 1.345229148864746, + "step": 14516 + }, + { + "epoch": 2.642577591699281, + "grad_norm": 9.125, + "learning_rate": 1.1399904845426837e-06, + "loss": 1.4504570960998535, + "step": 14518 + }, + { + "epoch": 2.642941658323473, + "grad_norm": 10.25, + "learning_rate": 1.1397090250075743e-06, + "loss": 1.26963210105896, + "step": 14520 + }, + { + "epoch": 2.6433057249476652, + "grad_norm": 16.25, + "learning_rate": 1.1394278384626263e-06, + "loss": 1.1671103239059448, + "step": 14522 + }, + { + "epoch": 2.643669791571858, + "grad_norm": 13.8125, + "learning_rate": 1.1391469249491027e-06, + "loss": 1.461940884590149, + "step": 14524 + }, + { + "epoch": 2.6440338581960496, + "grad_norm": 16.5, + "learning_rate": 1.1388662845082267e-06, + "loss": 1.5407819747924805, + "step": 14526 + }, + { + "epoch": 2.6443979248202423, + "grad_norm": 6.25, + "learning_rate": 1.1385859171811806e-06, + "loss": 1.3907628059387207, + "step": 14528 + }, + { + "epoch": 2.6447619914444345, + "grad_norm": 10.75, + "learning_rate": 1.1383058230091077e-06, + "loss": 1.573671579360962, + "step": 14530 + }, + { + "epoch": 2.6451260580686267, + "grad_norm": 13.4375, + "learning_rate": 1.1380260020331108e-06, + "loss": 1.3575526475906372, + "step": 14532 + }, + { + "epoch": 2.645490124692819, + "grad_norm": 9.125, + "learning_rate": 1.1377464542942517e-06, + "loss": 1.273698329925537, + "step": 14534 + }, + { + "epoch": 2.645854191317011, + "grad_norm": 9.0625, + "learning_rate": 1.1374671798335534e-06, + "loss": 1.4671393632888794, + "step": 14536 + }, + { + "epoch": 2.6462182579412032, + "grad_norm": 12.5, + "learning_rate": 1.1371881786919986e-06, + "loss": 1.550858736038208, + "step": 14538 + }, + { + "epoch": 2.6465823245653954, + "grad_norm": 11.6875, + "learning_rate": 1.1369094509105293e-06, + "loss": 1.6266849040985107, + "step": 14540 + }, + { + "epoch": 2.6469463911895876, + "grad_norm": 5.1875, + "learning_rate": 1.1366309965300474e-06, + "loss": 1.3505311012268066, + "step": 14542 + }, + { + "epoch": 2.64731045781378, + "grad_norm": 9.0, + "learning_rate": 1.136352815591415e-06, + "loss": 1.3523355722427368, + "step": 14544 + }, + { + "epoch": 2.647674524437972, + "grad_norm": 14.8125, + "learning_rate": 1.1360749081354546e-06, + "loss": 1.112898588180542, + "step": 14546 + }, + { + "epoch": 2.648038591062164, + "grad_norm": 19.0, + "learning_rate": 1.1357972742029472e-06, + "loss": 1.0763049125671387, + "step": 14548 + }, + { + "epoch": 2.648402657686357, + "grad_norm": 7.375, + "learning_rate": 1.135519913834635e-06, + "loss": 0.8174800872802734, + "step": 14550 + }, + { + "epoch": 2.6487667243105486, + "grad_norm": 6.0, + "learning_rate": 1.1352428270712198e-06, + "loss": 1.0621155500411987, + "step": 14552 + }, + { + "epoch": 2.6491307909347412, + "grad_norm": 23.625, + "learning_rate": 1.134966013953362e-06, + "loss": 1.335787296295166, + "step": 14554 + }, + { + "epoch": 2.6494948575589334, + "grad_norm": 16.875, + "learning_rate": 1.1346894745216836e-06, + "loss": 1.3539605140686035, + "step": 14556 + }, + { + "epoch": 2.6498589241831256, + "grad_norm": 9.875, + "learning_rate": 1.1344132088167652e-06, + "loss": 1.3322436809539795, + "step": 14558 + }, + { + "epoch": 2.650222990807318, + "grad_norm": 19.375, + "learning_rate": 1.1341372168791482e-06, + "loss": 1.0502952337265015, + "step": 14560 + }, + { + "epoch": 2.65058705743151, + "grad_norm": 7.21875, + "learning_rate": 1.1338614987493323e-06, + "loss": 0.9821246266365051, + "step": 14562 + }, + { + "epoch": 2.650951124055702, + "grad_norm": 19.125, + "learning_rate": 1.133586054467779e-06, + "loss": 1.519580364227295, + "step": 14564 + }, + { + "epoch": 2.6513151906798944, + "grad_norm": 11.1875, + "learning_rate": 1.1333108840749085e-06, + "loss": 1.449804663658142, + "step": 14566 + }, + { + "epoch": 2.6516792573040866, + "grad_norm": 15.75, + "learning_rate": 1.1330359876111006e-06, + "loss": 1.5614076852798462, + "step": 14568 + }, + { + "epoch": 2.652043323928279, + "grad_norm": 21.375, + "learning_rate": 1.132761365116695e-06, + "loss": 1.7726774215698242, + "step": 14570 + }, + { + "epoch": 2.652407390552471, + "grad_norm": 10.9375, + "learning_rate": 1.1324870166319923e-06, + "loss": 1.3080321550369263, + "step": 14572 + }, + { + "epoch": 2.652771457176663, + "grad_norm": 13.3125, + "learning_rate": 1.1322129421972514e-06, + "loss": 1.405094861984253, + "step": 14574 + }, + { + "epoch": 2.653135523800856, + "grad_norm": 49.0, + "learning_rate": 1.1319391418526917e-06, + "loss": 1.595885157585144, + "step": 14576 + }, + { + "epoch": 2.6534995904250476, + "grad_norm": 13.375, + "learning_rate": 1.1316656156384926e-06, + "loss": 1.574186086654663, + "step": 14578 + }, + { + "epoch": 2.65386365704924, + "grad_norm": 13.625, + "learning_rate": 1.131392363594792e-06, + "loss": 1.7120325565338135, + "step": 14580 + }, + { + "epoch": 2.6542277236734324, + "grad_norm": 57.75, + "learning_rate": 1.1311193857616901e-06, + "loss": 1.4646236896514893, + "step": 14582 + }, + { + "epoch": 2.6545917902976246, + "grad_norm": 6.03125, + "learning_rate": 1.130846682179244e-06, + "loss": 1.0188984870910645, + "step": 14584 + }, + { + "epoch": 2.654955856921817, + "grad_norm": 73.5, + "learning_rate": 1.1305742528874724e-06, + "loss": 1.4415225982666016, + "step": 14586 + }, + { + "epoch": 2.655319923546009, + "grad_norm": 7.84375, + "learning_rate": 1.130302097926353e-06, + "loss": 1.1592278480529785, + "step": 14588 + }, + { + "epoch": 2.655683990170201, + "grad_norm": 8.6875, + "learning_rate": 1.1300302173358232e-06, + "loss": 1.4418576955795288, + "step": 14590 + }, + { + "epoch": 2.6560480567943934, + "grad_norm": 10.1875, + "learning_rate": 1.1297586111557813e-06, + "loss": 1.396946907043457, + "step": 14592 + }, + { + "epoch": 2.6564121234185856, + "grad_norm": 30.25, + "learning_rate": 1.1294872794260835e-06, + "loss": 1.6139272451400757, + "step": 14594 + }, + { + "epoch": 2.6567761900427778, + "grad_norm": 25.125, + "learning_rate": 1.1292162221865468e-06, + "loss": 1.820549488067627, + "step": 14596 + }, + { + "epoch": 2.65714025666697, + "grad_norm": 7.96875, + "learning_rate": 1.128945439476948e-06, + "loss": 1.3808941841125488, + "step": 14598 + }, + { + "epoch": 2.657504323291162, + "grad_norm": 3.703125, + "learning_rate": 1.1286749313370229e-06, + "loss": 1.07881760597229, + "step": 14600 + }, + { + "epoch": 2.6578683899153543, + "grad_norm": 5.625, + "learning_rate": 1.1284046978064684e-06, + "loss": 0.8214249014854431, + "step": 14602 + }, + { + "epoch": 2.6582324565395465, + "grad_norm": 10.0625, + "learning_rate": 1.1281347389249393e-06, + "loss": 1.4165815114974976, + "step": 14604 + }, + { + "epoch": 2.658596523163739, + "grad_norm": 16.375, + "learning_rate": 1.1278650547320513e-06, + "loss": 1.6547558307647705, + "step": 14606 + }, + { + "epoch": 2.658960589787931, + "grad_norm": 22.0, + "learning_rate": 1.1275956452673793e-06, + "loss": 1.5943236351013184, + "step": 14608 + }, + { + "epoch": 2.6593246564121236, + "grad_norm": 8.6875, + "learning_rate": 1.1273265105704579e-06, + "loss": 0.21635861694812775, + "step": 14610 + }, + { + "epoch": 2.6596887230363158, + "grad_norm": 23.75, + "learning_rate": 1.1270576506807825e-06, + "loss": 1.3442341089248657, + "step": 14612 + }, + { + "epoch": 2.660052789660508, + "grad_norm": 19.625, + "learning_rate": 1.126789065637806e-06, + "loss": 1.7877488136291504, + "step": 14614 + }, + { + "epoch": 2.6604168562847, + "grad_norm": 18.375, + "learning_rate": 1.1265207554809431e-06, + "loss": 1.4619274139404297, + "step": 14616 + }, + { + "epoch": 2.6607809229088923, + "grad_norm": 10.125, + "learning_rate": 1.1262527202495663e-06, + "loss": 1.626192331314087, + "step": 14618 + }, + { + "epoch": 2.6611449895330845, + "grad_norm": 10.0, + "learning_rate": 1.1259849599830091e-06, + "loss": 1.585963249206543, + "step": 14620 + }, + { + "epoch": 2.6615090561572767, + "grad_norm": 14.0, + "learning_rate": 1.1257174747205645e-06, + "loss": 1.6999564170837402, + "step": 14622 + }, + { + "epoch": 2.661873122781469, + "grad_norm": 12.5, + "learning_rate": 1.1254502645014849e-06, + "loss": 1.8844951391220093, + "step": 14624 + }, + { + "epoch": 2.662237189405661, + "grad_norm": 9.9375, + "learning_rate": 1.1251833293649819e-06, + "loss": 1.475205421447754, + "step": 14626 + }, + { + "epoch": 2.6626012560298533, + "grad_norm": 12.1875, + "learning_rate": 1.1249166693502274e-06, + "loss": 1.9800291061401367, + "step": 14628 + }, + { + "epoch": 2.6629653226540455, + "grad_norm": 19.375, + "learning_rate": 1.1246502844963523e-06, + "loss": 1.4416791200637817, + "step": 14630 + }, + { + "epoch": 2.663329389278238, + "grad_norm": 15.125, + "learning_rate": 1.1243841748424486e-06, + "loss": 1.2385917901992798, + "step": 14632 + }, + { + "epoch": 2.66369345590243, + "grad_norm": 11.25, + "learning_rate": 1.1241183404275653e-06, + "loss": 0.8826155662536621, + "step": 14634 + }, + { + "epoch": 2.6640575225266225, + "grad_norm": 25.75, + "learning_rate": 1.1238527812907136e-06, + "loss": 1.5413708686828613, + "step": 14636 + }, + { + "epoch": 2.6644215891508147, + "grad_norm": 12.125, + "learning_rate": 1.1235874974708626e-06, + "loss": 2.031757116317749, + "step": 14638 + }, + { + "epoch": 2.664785655775007, + "grad_norm": 10.4375, + "learning_rate": 1.123322489006942e-06, + "loss": 1.492416501045227, + "step": 14640 + }, + { + "epoch": 2.665149722399199, + "grad_norm": 8.4375, + "learning_rate": 1.123057755937841e-06, + "loss": 1.468606948852539, + "step": 14642 + }, + { + "epoch": 2.6655137890233913, + "grad_norm": 12.1875, + "learning_rate": 1.1227932983024076e-06, + "loss": 1.4281564950942993, + "step": 14644 + }, + { + "epoch": 2.6658778556475835, + "grad_norm": 14.5625, + "learning_rate": 1.1225291161394498e-06, + "loss": 1.9869191646575928, + "step": 14646 + }, + { + "epoch": 2.6662419222717757, + "grad_norm": 9.8125, + "learning_rate": 1.1222652094877357e-06, + "loss": 1.6500223875045776, + "step": 14648 + }, + { + "epoch": 2.666605988895968, + "grad_norm": 3.296875, + "learning_rate": 1.1220015783859917e-06, + "loss": 0.9932403564453125, + "step": 14650 + }, + { + "epoch": 2.66697005552016, + "grad_norm": 8.75, + "learning_rate": 1.121738222872906e-06, + "loss": 1.2444219589233398, + "step": 14652 + }, + { + "epoch": 2.6673341221443523, + "grad_norm": 8.625, + "learning_rate": 1.121475142987124e-06, + "loss": 1.322481393814087, + "step": 14654 + }, + { + "epoch": 2.6676981887685445, + "grad_norm": 12.9375, + "learning_rate": 1.1212123387672516e-06, + "loss": 1.8623713254928589, + "step": 14656 + }, + { + "epoch": 2.668062255392737, + "grad_norm": 83.5, + "learning_rate": 1.1209498102518544e-06, + "loss": 1.9126713275909424, + "step": 14658 + }, + { + "epoch": 2.668426322016929, + "grad_norm": 18.625, + "learning_rate": 1.1206875574794573e-06, + "loss": 1.214752197265625, + "step": 14660 + }, + { + "epoch": 2.6687903886411215, + "grad_norm": 13.625, + "learning_rate": 1.1204255804885455e-06, + "loss": 1.4213049411773682, + "step": 14662 + }, + { + "epoch": 2.6691544552653137, + "grad_norm": 23.5, + "learning_rate": 1.1201638793175618e-06, + "loss": 1.2736343145370483, + "step": 14664 + }, + { + "epoch": 2.669518521889506, + "grad_norm": 12.875, + "learning_rate": 1.1199024540049108e-06, + "loss": 1.472684621810913, + "step": 14666 + }, + { + "epoch": 2.669882588513698, + "grad_norm": 9.9375, + "learning_rate": 1.1196413045889553e-06, + "loss": 1.4619131088256836, + "step": 14668 + }, + { + "epoch": 2.6702466551378903, + "grad_norm": 6.96875, + "learning_rate": 1.119380431108018e-06, + "loss": 1.5982983112335205, + "step": 14670 + }, + { + "epoch": 2.6706107217620825, + "grad_norm": 14.625, + "learning_rate": 1.119119833600381e-06, + "loss": 1.685892105102539, + "step": 14672 + }, + { + "epoch": 2.6709747883862747, + "grad_norm": 13.8125, + "learning_rate": 1.118859512104286e-06, + "loss": 2.2336182594299316, + "step": 14674 + }, + { + "epoch": 2.671338855010467, + "grad_norm": 12.3125, + "learning_rate": 1.1185994666579336e-06, + "loss": 1.7420251369476318, + "step": 14676 + }, + { + "epoch": 2.671702921634659, + "grad_norm": 24.625, + "learning_rate": 1.1183396972994853e-06, + "loss": 1.7761002779006958, + "step": 14678 + }, + { + "epoch": 2.6720669882588513, + "grad_norm": 9.3125, + "learning_rate": 1.1180802040670601e-06, + "loss": 0.8746968507766724, + "step": 14680 + }, + { + "epoch": 2.6724310548830434, + "grad_norm": 10.0, + "learning_rate": 1.1178209869987387e-06, + "loss": 1.4348918199539185, + "step": 14682 + }, + { + "epoch": 2.672795121507236, + "grad_norm": 9.3125, + "learning_rate": 1.1175620461325595e-06, + "loss": 0.6703236103057861, + "step": 14684 + }, + { + "epoch": 2.673159188131428, + "grad_norm": 14.5625, + "learning_rate": 1.117303381506521e-06, + "loss": 1.5436615943908691, + "step": 14686 + }, + { + "epoch": 2.6735232547556205, + "grad_norm": 17.375, + "learning_rate": 1.1170449931585816e-06, + "loss": 1.6233363151550293, + "step": 14688 + }, + { + "epoch": 2.6738873213798127, + "grad_norm": 22.875, + "learning_rate": 1.1167868811266583e-06, + "loss": 1.157469630241394, + "step": 14690 + }, + { + "epoch": 2.674251388004005, + "grad_norm": 37.5, + "learning_rate": 1.1165290454486287e-06, + "loss": 0.5947645306587219, + "step": 14692 + }, + { + "epoch": 2.674615454628197, + "grad_norm": 16.75, + "learning_rate": 1.116271486162328e-06, + "loss": 1.547325849533081, + "step": 14694 + }, + { + "epoch": 2.6749795212523892, + "grad_norm": 8.9375, + "learning_rate": 1.116014203305553e-06, + "loss": 1.7868096828460693, + "step": 14696 + }, + { + "epoch": 2.6753435878765814, + "grad_norm": 10.125, + "learning_rate": 1.1157571969160586e-06, + "loss": 1.553477168083191, + "step": 14698 + }, + { + "epoch": 2.6757076545007736, + "grad_norm": 10.75, + "learning_rate": 1.1155004670315588e-06, + "loss": 1.070258617401123, + "step": 14700 + }, + { + "epoch": 2.676071721124966, + "grad_norm": 19.375, + "learning_rate": 1.115244013689729e-06, + "loss": 1.1477819681167603, + "step": 14702 + }, + { + "epoch": 2.676435787749158, + "grad_norm": 31.0, + "learning_rate": 1.1149878369282018e-06, + "loss": 1.0180714130401611, + "step": 14704 + }, + { + "epoch": 2.67679985437335, + "grad_norm": 21.0, + "learning_rate": 1.1147319367845699e-06, + "loss": 1.2511703968048096, + "step": 14706 + }, + { + "epoch": 2.6771639209975424, + "grad_norm": 14.5, + "learning_rate": 1.1144763132963862e-06, + "loss": 1.5910966396331787, + "step": 14708 + }, + { + "epoch": 2.677527987621735, + "grad_norm": 35.5, + "learning_rate": 1.1142209665011615e-06, + "loss": 2.2161030769348145, + "step": 14710 + }, + { + "epoch": 2.677892054245927, + "grad_norm": 5.59375, + "learning_rate": 1.1139658964363687e-06, + "loss": 1.056174397468567, + "step": 14712 + }, + { + "epoch": 2.6782561208701194, + "grad_norm": 8.1875, + "learning_rate": 1.1137111031394366e-06, + "loss": 1.4566659927368164, + "step": 14714 + }, + { + "epoch": 2.678620187494311, + "grad_norm": 11.75, + "learning_rate": 1.1134565866477556e-06, + "loss": 1.432720422744751, + "step": 14716 + }, + { + "epoch": 2.678984254118504, + "grad_norm": 11.1875, + "learning_rate": 1.1132023469986752e-06, + "loss": 1.5778437852859497, + "step": 14718 + }, + { + "epoch": 2.679348320742696, + "grad_norm": 6.5625, + "learning_rate": 1.1129483842295035e-06, + "loss": 1.4382703304290771, + "step": 14720 + }, + { + "epoch": 2.679712387366888, + "grad_norm": 6.0, + "learning_rate": 1.1126946983775094e-06, + "loss": 1.1238818168640137, + "step": 14722 + }, + { + "epoch": 2.6800764539910804, + "grad_norm": 8.5, + "learning_rate": 1.1124412894799192e-06, + "loss": 1.2095158100128174, + "step": 14724 + }, + { + "epoch": 2.6804405206152726, + "grad_norm": 7.5625, + "learning_rate": 1.1121881575739208e-06, + "loss": 0.6999382972717285, + "step": 14726 + }, + { + "epoch": 2.680804587239465, + "grad_norm": 38.25, + "learning_rate": 1.111935302696659e-06, + "loss": 1.3341186046600342, + "step": 14728 + }, + { + "epoch": 2.681168653863657, + "grad_norm": 10.4375, + "learning_rate": 1.1116827248852397e-06, + "loss": 0.7850853800773621, + "step": 14730 + }, + { + "epoch": 2.681532720487849, + "grad_norm": 15.3125, + "learning_rate": 1.1114304241767287e-06, + "loss": 1.4966394901275635, + "step": 14732 + }, + { + "epoch": 2.6818967871120414, + "grad_norm": 36.25, + "learning_rate": 1.111178400608149e-06, + "loss": 1.0849692821502686, + "step": 14734 + }, + { + "epoch": 2.6822608537362336, + "grad_norm": 14.375, + "learning_rate": 1.1109266542164838e-06, + "loss": 1.555729627609253, + "step": 14736 + }, + { + "epoch": 2.6826249203604258, + "grad_norm": 7.03125, + "learning_rate": 1.110675185038677e-06, + "loss": 1.0224976539611816, + "step": 14738 + }, + { + "epoch": 2.6829889869846184, + "grad_norm": 10.25, + "learning_rate": 1.1104239931116293e-06, + "loss": 1.3707863092422485, + "step": 14740 + }, + { + "epoch": 2.68335305360881, + "grad_norm": 11.125, + "learning_rate": 1.1101730784722034e-06, + "loss": 1.4015088081359863, + "step": 14742 + }, + { + "epoch": 2.683717120233003, + "grad_norm": 8.3125, + "learning_rate": 1.1099224411572192e-06, + "loss": 1.3800102472305298, + "step": 14744 + }, + { + "epoch": 2.684081186857195, + "grad_norm": 12.6875, + "learning_rate": 1.109672081203457e-06, + "loss": 1.4732944965362549, + "step": 14746 + }, + { + "epoch": 2.684445253481387, + "grad_norm": 10.125, + "learning_rate": 1.109421998647656e-06, + "loss": 1.2888832092285156, + "step": 14748 + }, + { + "epoch": 2.6848093201055794, + "grad_norm": 5.75, + "learning_rate": 1.1091721935265148e-06, + "loss": 1.0861753225326538, + "step": 14750 + }, + { + "epoch": 2.6851733867297716, + "grad_norm": 14.4375, + "learning_rate": 1.1089226658766916e-06, + "loss": 1.341196060180664, + "step": 14752 + }, + { + "epoch": 2.6855374533539638, + "grad_norm": 15.1875, + "learning_rate": 1.1086734157348033e-06, + "loss": 1.214228868484497, + "step": 14754 + }, + { + "epoch": 2.685901519978156, + "grad_norm": 11.875, + "learning_rate": 1.1084244431374261e-06, + "loss": 1.1609221696853638, + "step": 14756 + }, + { + "epoch": 2.686265586602348, + "grad_norm": 15.1875, + "learning_rate": 1.1081757481210962e-06, + "loss": 1.769871711730957, + "step": 14758 + }, + { + "epoch": 2.6866296532265403, + "grad_norm": 10.3125, + "learning_rate": 1.107927330722308e-06, + "loss": 1.7308299541473389, + "step": 14760 + }, + { + "epoch": 2.6869937198507325, + "grad_norm": 4.9375, + "learning_rate": 1.1076791909775168e-06, + "loss": 1.2495229244232178, + "step": 14762 + }, + { + "epoch": 2.6873577864749247, + "grad_norm": 7.65625, + "learning_rate": 1.107431328923135e-06, + "loss": 1.0096275806427002, + "step": 14764 + }, + { + "epoch": 2.6877218530991174, + "grad_norm": 12.375, + "learning_rate": 1.1071837445955356e-06, + "loss": 1.4926071166992188, + "step": 14766 + }, + { + "epoch": 2.688085919723309, + "grad_norm": 27.75, + "learning_rate": 1.106936438031051e-06, + "loss": 1.8004555702209473, + "step": 14768 + }, + { + "epoch": 2.6884499863475018, + "grad_norm": 20.0, + "learning_rate": 1.1066894092659719e-06, + "loss": 1.40166175365448, + "step": 14770 + }, + { + "epoch": 2.688814052971694, + "grad_norm": 7.21875, + "learning_rate": 1.1064426583365498e-06, + "loss": 1.55739426612854, + "step": 14772 + }, + { + "epoch": 2.689178119595886, + "grad_norm": 18.75, + "learning_rate": 1.1061961852789933e-06, + "loss": 1.456763744354248, + "step": 14774 + }, + { + "epoch": 2.6895421862200783, + "grad_norm": 10.25, + "learning_rate": 1.1059499901294713e-06, + "loss": 1.6596550941467285, + "step": 14776 + }, + { + "epoch": 2.6899062528442705, + "grad_norm": 9.625, + "learning_rate": 1.1057040729241127e-06, + "loss": 1.402900218963623, + "step": 14778 + }, + { + "epoch": 2.6902703194684627, + "grad_norm": 6.09375, + "learning_rate": 1.1054584336990043e-06, + "loss": 1.3532071113586426, + "step": 14780 + }, + { + "epoch": 2.690634386092655, + "grad_norm": 9.125, + "learning_rate": 1.1052130724901932e-06, + "loss": 1.2240444421768188, + "step": 14782 + }, + { + "epoch": 2.690998452716847, + "grad_norm": 33.5, + "learning_rate": 1.1049679893336846e-06, + "loss": 1.4172601699829102, + "step": 14784 + }, + { + "epoch": 2.6913625193410393, + "grad_norm": 6.0625, + "learning_rate": 1.1047231842654436e-06, + "loss": 1.1519123315811157, + "step": 14786 + }, + { + "epoch": 2.6917265859652315, + "grad_norm": 10.5625, + "learning_rate": 1.1044786573213945e-06, + "loss": 1.2620350122451782, + "step": 14788 + }, + { + "epoch": 2.6920906525894237, + "grad_norm": 14.0625, + "learning_rate": 1.1042344085374202e-06, + "loss": 1.023705244064331, + "step": 14790 + }, + { + "epoch": 2.6924547192136163, + "grad_norm": 34.25, + "learning_rate": 1.1039904379493643e-06, + "loss": 1.3071603775024414, + "step": 14792 + }, + { + "epoch": 2.692818785837808, + "grad_norm": 33.25, + "learning_rate": 1.1037467455930272e-06, + "loss": 1.8074480295181274, + "step": 14794 + }, + { + "epoch": 2.6931828524620007, + "grad_norm": 13.5625, + "learning_rate": 1.1035033315041705e-06, + "loss": 1.2304755449295044, + "step": 14796 + }, + { + "epoch": 2.693546919086193, + "grad_norm": 11.3125, + "learning_rate": 1.1032601957185143e-06, + "loss": 0.8926444053649902, + "step": 14798 + }, + { + "epoch": 2.693910985710385, + "grad_norm": 9.0, + "learning_rate": 1.1030173382717371e-06, + "loss": 1.5995372533798218, + "step": 14800 + }, + { + "epoch": 2.6942750523345773, + "grad_norm": 3.625, + "learning_rate": 1.1027747591994782e-06, + "loss": 1.3425649404525757, + "step": 14802 + }, + { + "epoch": 2.6946391189587695, + "grad_norm": 7.5625, + "learning_rate": 1.1025324585373344e-06, + "loss": 1.5030573606491089, + "step": 14804 + }, + { + "epoch": 2.6950031855829617, + "grad_norm": 27.0, + "learning_rate": 1.1022904363208627e-06, + "loss": 1.4105416536331177, + "step": 14806 + }, + { + "epoch": 2.695367252207154, + "grad_norm": 12.6875, + "learning_rate": 1.1020486925855785e-06, + "loss": 1.6102200746536255, + "step": 14808 + }, + { + "epoch": 2.695731318831346, + "grad_norm": 17.125, + "learning_rate": 1.1018072273669567e-06, + "loss": 1.4038678407669067, + "step": 14810 + }, + { + "epoch": 2.6960953854555383, + "grad_norm": 48.75, + "learning_rate": 1.1015660407004323e-06, + "loss": 1.6581690311431885, + "step": 14812 + }, + { + "epoch": 2.6964594520797305, + "grad_norm": 10.0625, + "learning_rate": 1.1013251326213972e-06, + "loss": 1.4166088104248047, + "step": 14814 + }, + { + "epoch": 2.6968235187039227, + "grad_norm": 7.625, + "learning_rate": 1.1010845031652046e-06, + "loss": 1.1834051609039307, + "step": 14816 + }, + { + "epoch": 2.6971875853281153, + "grad_norm": 9.5, + "learning_rate": 1.1008441523671653e-06, + "loss": 0.860220730304718, + "step": 14818 + }, + { + "epoch": 2.697551651952307, + "grad_norm": 12.6875, + "learning_rate": 1.1006040802625496e-06, + "loss": 1.2868974208831787, + "step": 14820 + }, + { + "epoch": 2.6979157185764997, + "grad_norm": 9.3125, + "learning_rate": 1.1003642868865882e-06, + "loss": 1.3573637008666992, + "step": 14822 + }, + { + "epoch": 2.6982797852006914, + "grad_norm": 12.5625, + "learning_rate": 1.100124772274469e-06, + "loss": 1.1661683320999146, + "step": 14824 + }, + { + "epoch": 2.698643851824884, + "grad_norm": 16.125, + "learning_rate": 1.09988553646134e-06, + "loss": 0.8377955555915833, + "step": 14826 + }, + { + "epoch": 2.6990079184490763, + "grad_norm": 46.25, + "learning_rate": 1.0996465794823077e-06, + "loss": 0.7688472270965576, + "step": 14828 + }, + { + "epoch": 2.6993719850732685, + "grad_norm": 4.53125, + "learning_rate": 1.0994079013724385e-06, + "loss": 1.2073286771774292, + "step": 14830 + }, + { + "epoch": 2.6997360516974607, + "grad_norm": 17.25, + "learning_rate": 1.0991695021667576e-06, + "loss": 1.6766215562820435, + "step": 14832 + }, + { + "epoch": 2.700100118321653, + "grad_norm": 6.9375, + "learning_rate": 1.0989313819002488e-06, + "loss": 1.5042650699615479, + "step": 14834 + }, + { + "epoch": 2.700464184945845, + "grad_norm": 8.625, + "learning_rate": 1.0986935406078552e-06, + "loss": 1.2722575664520264, + "step": 14836 + }, + { + "epoch": 2.7008282515700373, + "grad_norm": 15.875, + "learning_rate": 1.0984559783244792e-06, + "loss": 1.2128934860229492, + "step": 14838 + }, + { + "epoch": 2.7011923181942294, + "grad_norm": 7.96875, + "learning_rate": 1.098218695084982e-06, + "loss": 1.174805760383606, + "step": 14840 + }, + { + "epoch": 2.7015563848184216, + "grad_norm": 7.125, + "learning_rate": 1.0979816909241845e-06, + "loss": 1.3532884120941162, + "step": 14842 + }, + { + "epoch": 2.701920451442614, + "grad_norm": 12.8125, + "learning_rate": 1.0977449658768653e-06, + "loss": 1.617944598197937, + "step": 14844 + }, + { + "epoch": 2.702284518066806, + "grad_norm": 17.375, + "learning_rate": 1.0975085199777633e-06, + "loss": 1.5966460704803467, + "step": 14846 + }, + { + "epoch": 2.7026485846909987, + "grad_norm": 22.5, + "learning_rate": 1.0972723532615759e-06, + "loss": 1.6074533462524414, + "step": 14848 + }, + { + "epoch": 2.7030126513151904, + "grad_norm": 13.125, + "learning_rate": 1.097036465762959e-06, + "loss": 1.2419157028198242, + "step": 14850 + }, + { + "epoch": 2.703376717939383, + "grad_norm": 6.25, + "learning_rate": 1.0968008575165297e-06, + "loss": 1.4943106174468994, + "step": 14852 + }, + { + "epoch": 2.7037407845635753, + "grad_norm": 12.125, + "learning_rate": 1.096565528556861e-06, + "loss": 1.5701483488082886, + "step": 14854 + }, + { + "epoch": 2.7041048511877674, + "grad_norm": 15.375, + "learning_rate": 1.0963304789184872e-06, + "loss": 1.543341040611267, + "step": 14856 + }, + { + "epoch": 2.7044689178119596, + "grad_norm": 22.0, + "learning_rate": 1.0960957086359005e-06, + "loss": 1.6615351438522339, + "step": 14858 + }, + { + "epoch": 2.704832984436152, + "grad_norm": 7.0, + "learning_rate": 1.0958612177435526e-06, + "loss": 1.0224934816360474, + "step": 14860 + }, + { + "epoch": 2.705197051060344, + "grad_norm": 62.0, + "learning_rate": 1.0956270062758548e-06, + "loss": 1.6262320280075073, + "step": 14862 + }, + { + "epoch": 2.705561117684536, + "grad_norm": 10.875, + "learning_rate": 1.0953930742671758e-06, + "loss": 1.477679967880249, + "step": 14864 + }, + { + "epoch": 2.7059251843087284, + "grad_norm": 11.6875, + "learning_rate": 1.0951594217518442e-06, + "loss": 1.423030138015747, + "step": 14866 + }, + { + "epoch": 2.7062892509329206, + "grad_norm": 25.125, + "learning_rate": 1.094926048764148e-06, + "loss": 1.366789698600769, + "step": 14868 + }, + { + "epoch": 2.706653317557113, + "grad_norm": 4.78125, + "learning_rate": 1.0946929553383334e-06, + "loss": 1.3657417297363281, + "step": 14870 + }, + { + "epoch": 2.707017384181305, + "grad_norm": 7.34375, + "learning_rate": 1.0944601415086064e-06, + "loss": 1.2840315103530884, + "step": 14872 + }, + { + "epoch": 2.7073814508054976, + "grad_norm": 41.5, + "learning_rate": 1.0942276073091312e-06, + "loss": 1.3589062690734863, + "step": 14874 + }, + { + "epoch": 2.7077455174296894, + "grad_norm": 16.0, + "learning_rate": 1.0939953527740309e-06, + "loss": 1.4740675687789917, + "step": 14876 + }, + { + "epoch": 2.708109584053882, + "grad_norm": 35.5, + "learning_rate": 1.0937633779373883e-06, + "loss": 1.9843084812164307, + "step": 14878 + }, + { + "epoch": 2.708473650678074, + "grad_norm": 19.25, + "learning_rate": 1.0935316828332442e-06, + "loss": 1.3780264854431152, + "step": 14880 + }, + { + "epoch": 2.7088377173022664, + "grad_norm": 7.78125, + "learning_rate": 1.0933002674956005e-06, + "loss": 1.1791101694107056, + "step": 14882 + }, + { + "epoch": 2.7092017839264586, + "grad_norm": 7.625, + "learning_rate": 1.0930691319584147e-06, + "loss": 1.2423118352890015, + "step": 14884 + }, + { + "epoch": 2.709565850550651, + "grad_norm": 8.5, + "learning_rate": 1.0928382762556056e-06, + "loss": 1.3711953163146973, + "step": 14886 + }, + { + "epoch": 2.709929917174843, + "grad_norm": 19.75, + "learning_rate": 1.0926077004210507e-06, + "loss": 1.5114002227783203, + "step": 14888 + }, + { + "epoch": 2.710293983799035, + "grad_norm": 6.59375, + "learning_rate": 1.0923774044885854e-06, + "loss": 1.085712194442749, + "step": 14890 + }, + { + "epoch": 2.7106580504232274, + "grad_norm": 15.1875, + "learning_rate": 1.0921473884920053e-06, + "loss": 0.7910588979721069, + "step": 14892 + }, + { + "epoch": 2.7110221170474196, + "grad_norm": 8.5, + "learning_rate": 1.0919176524650642e-06, + "loss": 1.142046570777893, + "step": 14894 + }, + { + "epoch": 2.7113861836716118, + "grad_norm": 22.5, + "learning_rate": 1.0916881964414749e-06, + "loss": 1.6150672435760498, + "step": 14896 + }, + { + "epoch": 2.711750250295804, + "grad_norm": 9.8125, + "learning_rate": 1.0914590204549086e-06, + "loss": 1.270698070526123, + "step": 14898 + }, + { + "epoch": 2.7121143169199966, + "grad_norm": 6.65625, + "learning_rate": 1.0912301245389965e-06, + "loss": 1.5403876304626465, + "step": 14900 + }, + { + "epoch": 2.7124783835441884, + "grad_norm": 7.125, + "learning_rate": 1.0910015087273285e-06, + "loss": 1.2519055604934692, + "step": 14902 + }, + { + "epoch": 2.712842450168381, + "grad_norm": 14.25, + "learning_rate": 1.0907731730534524e-06, + "loss": 1.5530656576156616, + "step": 14904 + }, + { + "epoch": 2.713206516792573, + "grad_norm": 10.3125, + "learning_rate": 1.0905451175508756e-06, + "loss": 1.0416339635849, + "step": 14906 + }, + { + "epoch": 2.7135705834167654, + "grad_norm": 21.5, + "learning_rate": 1.0903173422530644e-06, + "loss": 1.3582653999328613, + "step": 14908 + }, + { + "epoch": 2.7139346500409576, + "grad_norm": 12.5, + "learning_rate": 1.090089847193444e-06, + "loss": 0.950001060962677, + "step": 14910 + }, + { + "epoch": 2.7142987166651498, + "grad_norm": 15.5625, + "learning_rate": 1.0898626324053986e-06, + "loss": 1.4787170886993408, + "step": 14912 + }, + { + "epoch": 2.714662783289342, + "grad_norm": 10.0625, + "learning_rate": 1.0896356979222706e-06, + "loss": 1.2867422103881836, + "step": 14914 + }, + { + "epoch": 2.715026849913534, + "grad_norm": 15.9375, + "learning_rate": 1.089409043777362e-06, + "loss": 1.447325348854065, + "step": 14916 + }, + { + "epoch": 2.7153909165377264, + "grad_norm": 18.125, + "learning_rate": 1.089182670003933e-06, + "loss": 1.357314109802246, + "step": 14918 + }, + { + "epoch": 2.7157549831619185, + "grad_norm": 7.09375, + "learning_rate": 1.0889565766352035e-06, + "loss": 0.9751088619232178, + "step": 14920 + }, + { + "epoch": 2.7161190497861107, + "grad_norm": 9.5, + "learning_rate": 1.0887307637043517e-06, + "loss": 1.7251536846160889, + "step": 14922 + }, + { + "epoch": 2.716483116410303, + "grad_norm": 11.5625, + "learning_rate": 1.0885052312445143e-06, + "loss": 1.4645901918411255, + "step": 14924 + }, + { + "epoch": 2.7168471830344956, + "grad_norm": 9.6875, + "learning_rate": 1.088279979288788e-06, + "loss": 1.4059804677963257, + "step": 14926 + }, + { + "epoch": 2.7172112496586873, + "grad_norm": 4.9375, + "learning_rate": 1.088055007870227e-06, + "loss": 0.921481728553772, + "step": 14928 + }, + { + "epoch": 2.71757531628288, + "grad_norm": 10.8125, + "learning_rate": 1.087830317021845e-06, + "loss": 0.9975974559783936, + "step": 14930 + }, + { + "epoch": 2.717939382907072, + "grad_norm": 13.125, + "learning_rate": 1.0876059067766149e-06, + "loss": 0.6959519386291504, + "step": 14932 + }, + { + "epoch": 2.7183034495312643, + "grad_norm": 15.9375, + "learning_rate": 1.0873817771674678e-06, + "loss": 1.6531071662902832, + "step": 14934 + }, + { + "epoch": 2.7186675161554565, + "grad_norm": 30.0, + "learning_rate": 1.0871579282272937e-06, + "loss": 1.711456298828125, + "step": 14936 + }, + { + "epoch": 2.7190315827796487, + "grad_norm": 7.40625, + "learning_rate": 1.0869343599889414e-06, + "loss": 1.3563625812530518, + "step": 14938 + }, + { + "epoch": 2.719395649403841, + "grad_norm": 8.3125, + "learning_rate": 1.0867110724852184e-06, + "loss": 1.3335464000701904, + "step": 14940 + }, + { + "epoch": 2.719759716028033, + "grad_norm": 36.25, + "learning_rate": 1.0864880657488924e-06, + "loss": 1.4002331495285034, + "step": 14942 + }, + { + "epoch": 2.7201237826522253, + "grad_norm": 8.75, + "learning_rate": 1.0862653398126877e-06, + "loss": 1.7840296030044556, + "step": 14944 + }, + { + "epoch": 2.7204878492764175, + "grad_norm": 34.5, + "learning_rate": 1.0860428947092885e-06, + "loss": 1.1636714935302734, + "step": 14946 + }, + { + "epoch": 2.7208519159006097, + "grad_norm": 7.21875, + "learning_rate": 1.0858207304713381e-06, + "loss": 1.1873703002929688, + "step": 14948 + }, + { + "epoch": 2.721215982524802, + "grad_norm": 16.375, + "learning_rate": 1.0855988471314377e-06, + "loss": 1.2008424997329712, + "step": 14950 + }, + { + "epoch": 2.7215800491489945, + "grad_norm": 7.09375, + "learning_rate": 1.0853772447221484e-06, + "loss": 1.6064014434814453, + "step": 14952 + }, + { + "epoch": 2.7219441157731863, + "grad_norm": 19.375, + "learning_rate": 1.0851559232759893e-06, + "loss": 1.1498956680297852, + "step": 14954 + }, + { + "epoch": 2.722308182397379, + "grad_norm": 19.375, + "learning_rate": 1.084934882825438e-06, + "loss": 1.8827636241912842, + "step": 14956 + }, + { + "epoch": 2.7226722490215707, + "grad_norm": 15.0625, + "learning_rate": 1.0847141234029317e-06, + "loss": 1.5877909660339355, + "step": 14958 + }, + { + "epoch": 2.7230363156457633, + "grad_norm": 108.5, + "learning_rate": 1.0844936450408656e-06, + "loss": 1.1987026929855347, + "step": 14960 + }, + { + "epoch": 2.7234003822699555, + "grad_norm": 16.375, + "learning_rate": 1.0842734477715946e-06, + "loss": 0.9726529121398926, + "step": 14962 + }, + { + "epoch": 2.7237644488941477, + "grad_norm": 4.875, + "learning_rate": 1.084053531627431e-06, + "loss": 1.2493902444839478, + "step": 14964 + }, + { + "epoch": 2.72412851551834, + "grad_norm": 14.4375, + "learning_rate": 1.0838338966406473e-06, + "loss": 1.2756197452545166, + "step": 14966 + }, + { + "epoch": 2.724492582142532, + "grad_norm": 44.25, + "learning_rate": 1.0836145428434736e-06, + "loss": 1.310465931892395, + "step": 14968 + }, + { + "epoch": 2.7248566487667243, + "grad_norm": 7.65625, + "learning_rate": 1.083395470268099e-06, + "loss": 1.5246061086654663, + "step": 14970 + }, + { + "epoch": 2.7252207153909165, + "grad_norm": 3.109375, + "learning_rate": 1.0831766789466724e-06, + "loss": 1.0762794017791748, + "step": 14972 + }, + { + "epoch": 2.7255847820151087, + "grad_norm": 22.625, + "learning_rate": 1.0829581689112996e-06, + "loss": 1.4710791110992432, + "step": 14974 + }, + { + "epoch": 2.725948848639301, + "grad_norm": 10.25, + "learning_rate": 1.0827399401940464e-06, + "loss": 1.3621971607208252, + "step": 14976 + }, + { + "epoch": 2.726312915263493, + "grad_norm": 16.5, + "learning_rate": 1.082521992826937e-06, + "loss": 1.884993314743042, + "step": 14978 + }, + { + "epoch": 2.7266769818876853, + "grad_norm": 13.9375, + "learning_rate": 1.0823043268419542e-06, + "loss": 1.5959687232971191, + "step": 14980 + }, + { + "epoch": 2.727041048511878, + "grad_norm": 12.875, + "learning_rate": 1.08208694227104e-06, + "loss": 1.381267786026001, + "step": 14982 + }, + { + "epoch": 2.7274051151360696, + "grad_norm": 11.125, + "learning_rate": 1.0818698391460943e-06, + "loss": 1.377220869064331, + "step": 14984 + }, + { + "epoch": 2.7277691817602623, + "grad_norm": 40.0, + "learning_rate": 1.0816530174989762e-06, + "loss": 1.6236605644226074, + "step": 14986 + }, + { + "epoch": 2.7281332483844545, + "grad_norm": 12.0, + "learning_rate": 1.0814364773615033e-06, + "loss": 1.6138181686401367, + "step": 14988 + }, + { + "epoch": 2.7284973150086467, + "grad_norm": 12.375, + "learning_rate": 1.081220218765452e-06, + "loss": 1.669021487236023, + "step": 14990 + }, + { + "epoch": 2.728861381632839, + "grad_norm": 43.25, + "learning_rate": 1.081004241742558e-06, + "loss": 1.7873477935791016, + "step": 14992 + }, + { + "epoch": 2.729225448257031, + "grad_norm": 11.0, + "learning_rate": 1.080788546324514e-06, + "loss": 1.5181097984313965, + "step": 14994 + }, + { + "epoch": 2.7295895148812233, + "grad_norm": 8.625, + "learning_rate": 1.0805731325429731e-06, + "loss": 1.1909656524658203, + "step": 14996 + }, + { + "epoch": 2.7299535815054154, + "grad_norm": 11.5625, + "learning_rate": 1.0803580004295463e-06, + "loss": 1.3072558641433716, + "step": 14998 + }, + { + "epoch": 2.7303176481296076, + "grad_norm": 22.75, + "learning_rate": 1.080143150015803e-06, + "loss": 1.3752968311309814, + "step": 15000 + }, + { + "epoch": 2.7306817147538, + "grad_norm": 11.75, + "learning_rate": 1.0799285813332724e-06, + "loss": 1.7476496696472168, + "step": 15002 + }, + { + "epoch": 2.731045781377992, + "grad_norm": 12.625, + "learning_rate": 1.0797142944134408e-06, + "loss": 1.6604036092758179, + "step": 15004 + }, + { + "epoch": 2.7314098480021842, + "grad_norm": 576.0, + "learning_rate": 1.0795002892877543e-06, + "loss": 0.6702216863632202, + "step": 15006 + }, + { + "epoch": 2.731773914626377, + "grad_norm": 11.625, + "learning_rate": 1.0792865659876172e-06, + "loss": 0.9962186813354492, + "step": 15008 + }, + { + "epoch": 2.7321379812505686, + "grad_norm": 26.375, + "learning_rate": 1.0790731245443924e-06, + "loss": 1.5872361660003662, + "step": 15010 + }, + { + "epoch": 2.7325020478747613, + "grad_norm": 25.75, + "learning_rate": 1.0788599649894022e-06, + "loss": 2.004768133163452, + "step": 15012 + }, + { + "epoch": 2.7328661144989534, + "grad_norm": 4.59375, + "learning_rate": 1.0786470873539263e-06, + "loss": 1.2689882516860962, + "step": 15014 + }, + { + "epoch": 2.7332301811231456, + "grad_norm": 9.5625, + "learning_rate": 1.0784344916692039e-06, + "loss": 1.0874770879745483, + "step": 15016 + }, + { + "epoch": 2.733594247747338, + "grad_norm": 11.0, + "learning_rate": 1.0782221779664322e-06, + "loss": 1.4245054721832275, + "step": 15018 + }, + { + "epoch": 2.73395831437153, + "grad_norm": 7.5, + "learning_rate": 1.0780101462767672e-06, + "loss": 0.8740946054458618, + "step": 15020 + }, + { + "epoch": 2.7343223809957222, + "grad_norm": 7.1875, + "learning_rate": 1.077798396631325e-06, + "loss": 1.290091633796692, + "step": 15022 + }, + { + "epoch": 2.7346864476199144, + "grad_norm": 7.4375, + "learning_rate": 1.0775869290611774e-06, + "loss": 0.5480546355247498, + "step": 15024 + }, + { + "epoch": 2.7350505142441066, + "grad_norm": 7.9375, + "learning_rate": 1.0773757435973573e-06, + "loss": 1.5005695819854736, + "step": 15026 + }, + { + "epoch": 2.735414580868299, + "grad_norm": 8.625, + "learning_rate": 1.077164840270855e-06, + "loss": 1.4684122800827026, + "step": 15028 + }, + { + "epoch": 2.735778647492491, + "grad_norm": 8.375, + "learning_rate": 1.0769542191126199e-06, + "loss": 1.3569011688232422, + "step": 15030 + }, + { + "epoch": 2.736142714116683, + "grad_norm": 7.90625, + "learning_rate": 1.07674388015356e-06, + "loss": 1.455420732498169, + "step": 15032 + }, + { + "epoch": 2.736506780740876, + "grad_norm": 12.8125, + "learning_rate": 1.0765338234245411e-06, + "loss": 1.492271900177002, + "step": 15034 + }, + { + "epoch": 2.7368708473650676, + "grad_norm": 15.6875, + "learning_rate": 1.0763240489563885e-06, + "loss": 1.520097017288208, + "step": 15036 + }, + { + "epoch": 2.73723491398926, + "grad_norm": 100.0, + "learning_rate": 1.076114556779886e-06, + "loss": 1.4418143033981323, + "step": 15038 + }, + { + "epoch": 2.7375989806134524, + "grad_norm": 10.375, + "learning_rate": 1.0759053469257747e-06, + "loss": 1.2542225122451782, + "step": 15040 + }, + { + "epoch": 2.7379630472376446, + "grad_norm": 11.75, + "learning_rate": 1.0756964194247571e-06, + "loss": 1.473730206489563, + "step": 15042 + }, + { + "epoch": 2.738327113861837, + "grad_norm": 18.875, + "learning_rate": 1.075487774307491e-06, + "loss": 2.000340461730957, + "step": 15044 + }, + { + "epoch": 2.738691180486029, + "grad_norm": 7.4375, + "learning_rate": 1.0752794116045944e-06, + "loss": 1.027376651763916, + "step": 15046 + }, + { + "epoch": 2.739055247110221, + "grad_norm": 11.875, + "learning_rate": 1.0750713313466444e-06, + "loss": 1.216172218322754, + "step": 15048 + }, + { + "epoch": 2.7394193137344134, + "grad_norm": 15.6875, + "learning_rate": 1.074863533564175e-06, + "loss": 1.9682786464691162, + "step": 15050 + }, + { + "epoch": 2.7397833803586056, + "grad_norm": 11.4375, + "learning_rate": 1.0746560182876805e-06, + "loss": 1.5438538789749146, + "step": 15052 + }, + { + "epoch": 2.7401474469827978, + "grad_norm": 14.75, + "learning_rate": 1.0744487855476126e-06, + "loss": 1.5282137393951416, + "step": 15054 + }, + { + "epoch": 2.74051151360699, + "grad_norm": 15.75, + "learning_rate": 1.0742418353743816e-06, + "loss": 1.5225894451141357, + "step": 15056 + }, + { + "epoch": 2.740875580231182, + "grad_norm": 4.90625, + "learning_rate": 1.0740351677983567e-06, + "loss": 1.446022629737854, + "step": 15058 + }, + { + "epoch": 2.741239646855375, + "grad_norm": 12.5625, + "learning_rate": 1.0738287828498656e-06, + "loss": 1.17795991897583, + "step": 15060 + }, + { + "epoch": 2.7416037134795666, + "grad_norm": 9.0625, + "learning_rate": 1.0736226805591951e-06, + "loss": 1.3711397647857666, + "step": 15062 + }, + { + "epoch": 2.741967780103759, + "grad_norm": 15.875, + "learning_rate": 1.0734168609565892e-06, + "loss": 1.5267786979675293, + "step": 15064 + }, + { + "epoch": 2.742331846727951, + "grad_norm": 19.75, + "learning_rate": 1.073211324072251e-06, + "loss": 1.615204930305481, + "step": 15066 + }, + { + "epoch": 2.7426959133521436, + "grad_norm": 18.375, + "learning_rate": 1.0730060699363427e-06, + "loss": 1.579131841659546, + "step": 15068 + }, + { + "epoch": 2.7430599799763358, + "grad_norm": 16.75, + "learning_rate": 1.0728010985789835e-06, + "loss": 1.4976472854614258, + "step": 15070 + }, + { + "epoch": 2.743424046600528, + "grad_norm": 9.875, + "learning_rate": 1.0725964100302535e-06, + "loss": 0.5477147102355957, + "step": 15072 + }, + { + "epoch": 2.74378811322472, + "grad_norm": 12.5625, + "learning_rate": 1.0723920043201891e-06, + "loss": 1.427178144454956, + "step": 15074 + }, + { + "epoch": 2.7441521798489124, + "grad_norm": 10.125, + "learning_rate": 1.072187881478786e-06, + "loss": 1.7375693321228027, + "step": 15076 + }, + { + "epoch": 2.7445162464731045, + "grad_norm": 4.9375, + "learning_rate": 1.071984041535999e-06, + "loss": 1.363765001296997, + "step": 15078 + }, + { + "epoch": 2.7448803130972967, + "grad_norm": 16.375, + "learning_rate": 1.0717804845217395e-06, + "loss": 1.5048247575759888, + "step": 15080 + }, + { + "epoch": 2.745244379721489, + "grad_norm": 15.3125, + "learning_rate": 1.0715772104658801e-06, + "loss": 2.018571376800537, + "step": 15082 + }, + { + "epoch": 2.745608446345681, + "grad_norm": 9.6875, + "learning_rate": 1.0713742193982497e-06, + "loss": 1.3345913887023926, + "step": 15084 + }, + { + "epoch": 2.7459725129698733, + "grad_norm": 7.625, + "learning_rate": 1.0711715113486368e-06, + "loss": 1.6039857864379883, + "step": 15086 + }, + { + "epoch": 2.7463365795940655, + "grad_norm": 19.5, + "learning_rate": 1.0709690863467874e-06, + "loss": 1.3996527194976807, + "step": 15088 + }, + { + "epoch": 2.746700646218258, + "grad_norm": 17.375, + "learning_rate": 1.070766944422407e-06, + "loss": 1.4645377397537231, + "step": 15090 + }, + { + "epoch": 2.74706471284245, + "grad_norm": 11.875, + "learning_rate": 1.0705650856051591e-06, + "loss": 1.4853307008743286, + "step": 15092 + }, + { + "epoch": 2.7474287794666425, + "grad_norm": 7.96875, + "learning_rate": 1.0703635099246657e-06, + "loss": 1.4188501834869385, + "step": 15094 + }, + { + "epoch": 2.7477928460908347, + "grad_norm": 6.34375, + "learning_rate": 1.0701622174105069e-06, + "loss": 1.3596289157867432, + "step": 15096 + }, + { + "epoch": 2.748156912715027, + "grad_norm": 9.3125, + "learning_rate": 1.0699612080922218e-06, + "loss": 0.9225339889526367, + "step": 15098 + }, + { + "epoch": 2.748520979339219, + "grad_norm": 80.5, + "learning_rate": 1.0697604819993075e-06, + "loss": 1.1935278177261353, + "step": 15100 + }, + { + "epoch": 2.7488850459634113, + "grad_norm": 6.125, + "learning_rate": 1.0695600391612201e-06, + "loss": 0.844068169593811, + "step": 15102 + }, + { + "epoch": 2.7492491125876035, + "grad_norm": 12.5625, + "learning_rate": 1.0693598796073734e-06, + "loss": 1.4681004285812378, + "step": 15104 + }, + { + "epoch": 2.7496131792117957, + "grad_norm": 39.25, + "learning_rate": 1.06916000336714e-06, + "loss": 1.627410888671875, + "step": 15106 + }, + { + "epoch": 2.749977245835988, + "grad_norm": 9.0, + "learning_rate": 1.0689604104698515e-06, + "loss": 1.5033308267593384, + "step": 15108 + }, + { + "epoch": 2.75034131246018, + "grad_norm": 8.8125, + "learning_rate": 1.0687611009447966e-06, + "loss": 1.4627599716186523, + "step": 15110 + }, + { + "epoch": 2.7507053790843723, + "grad_norm": 8.4375, + "learning_rate": 1.068562074821224e-06, + "loss": 1.2369627952575684, + "step": 15112 + }, + { + "epoch": 2.7510694457085645, + "grad_norm": 37.75, + "learning_rate": 1.0683633321283392e-06, + "loss": 1.7903141975402832, + "step": 15114 + }, + { + "epoch": 2.751433512332757, + "grad_norm": 16.875, + "learning_rate": 1.068164872895307e-06, + "loss": 1.864713430404663, + "step": 15116 + }, + { + "epoch": 2.751797578956949, + "grad_norm": 6.3125, + "learning_rate": 1.0679666971512508e-06, + "loss": 1.2925167083740234, + "step": 15118 + }, + { + "epoch": 2.7521616455811415, + "grad_norm": 5.21875, + "learning_rate": 1.0677688049252517e-06, + "loss": 0.8857063055038452, + "step": 15120 + }, + { + "epoch": 2.7525257122053337, + "grad_norm": 9.9375, + "learning_rate": 1.0675711962463503e-06, + "loss": 1.0740444660186768, + "step": 15122 + }, + { + "epoch": 2.752889778829526, + "grad_norm": 19.125, + "learning_rate": 1.0673738711435443e-06, + "loss": 1.4283254146575928, + "step": 15124 + }, + { + "epoch": 2.753253845453718, + "grad_norm": 7.28125, + "learning_rate": 1.0671768296457902e-06, + "loss": 1.4395685195922852, + "step": 15126 + }, + { + "epoch": 2.7536179120779103, + "grad_norm": 6.53125, + "learning_rate": 1.0669800717820034e-06, + "loss": 1.183334469795227, + "step": 15128 + }, + { + "epoch": 2.7539819787021025, + "grad_norm": 17.5, + "learning_rate": 1.066783597581057e-06, + "loss": 1.7554134130477905, + "step": 15130 + }, + { + "epoch": 2.7543460453262947, + "grad_norm": 14.25, + "learning_rate": 1.0665874070717838e-06, + "loss": 1.4719607830047607, + "step": 15132 + }, + { + "epoch": 2.754710111950487, + "grad_norm": 9.875, + "learning_rate": 1.0663915002829727e-06, + "loss": 1.3063950538635254, + "step": 15134 + }, + { + "epoch": 2.755074178574679, + "grad_norm": 14.875, + "learning_rate": 1.0661958772433725e-06, + "loss": 1.6672691106796265, + "step": 15136 + }, + { + "epoch": 2.7554382451988713, + "grad_norm": 8.75, + "learning_rate": 1.0660005379816906e-06, + "loss": 0.9045916199684143, + "step": 15138 + }, + { + "epoch": 2.7558023118230635, + "grad_norm": 56.0, + "learning_rate": 1.0658054825265913e-06, + "loss": 1.2428569793701172, + "step": 15140 + }, + { + "epoch": 2.756166378447256, + "grad_norm": 8.375, + "learning_rate": 1.0656107109066995e-06, + "loss": 1.813591480255127, + "step": 15142 + }, + { + "epoch": 2.756530445071448, + "grad_norm": 5.875, + "learning_rate": 1.0654162231505966e-06, + "loss": 1.2263996601104736, + "step": 15144 + }, + { + "epoch": 2.7568945116956405, + "grad_norm": 11.875, + "learning_rate": 1.0652220192868224e-06, + "loss": 1.3955984115600586, + "step": 15146 + }, + { + "epoch": 2.7572585783198327, + "grad_norm": 96.5, + "learning_rate": 1.0650280993438758e-06, + "loss": 1.4035351276397705, + "step": 15148 + }, + { + "epoch": 2.757622644944025, + "grad_norm": 28.375, + "learning_rate": 1.064834463350214e-06, + "loss": 1.8159451484680176, + "step": 15150 + }, + { + "epoch": 2.757986711568217, + "grad_norm": 25.125, + "learning_rate": 1.0646411113342524e-06, + "loss": 1.4774194955825806, + "step": 15152 + }, + { + "epoch": 2.7583507781924093, + "grad_norm": 22.875, + "learning_rate": 1.064448043324364e-06, + "loss": 1.5671591758728027, + "step": 15154 + }, + { + "epoch": 2.7587148448166015, + "grad_norm": 202.0, + "learning_rate": 1.0642552593488814e-06, + "loss": 1.3637635707855225, + "step": 15156 + }, + { + "epoch": 2.7590789114407936, + "grad_norm": 5.9375, + "learning_rate": 1.0640627594360947e-06, + "loss": 1.3500739336013794, + "step": 15158 + }, + { + "epoch": 2.759442978064986, + "grad_norm": 8.75, + "learning_rate": 1.0638705436142518e-06, + "loss": 1.2351429462432861, + "step": 15160 + }, + { + "epoch": 2.759807044689178, + "grad_norm": 13.0625, + "learning_rate": 1.0636786119115609e-06, + "loss": 1.5977540016174316, + "step": 15162 + }, + { + "epoch": 2.7601711113133702, + "grad_norm": 9.75, + "learning_rate": 1.063486964356186e-06, + "loss": 1.422680139541626, + "step": 15164 + }, + { + "epoch": 2.7605351779375624, + "grad_norm": 8.8125, + "learning_rate": 1.0632956009762514e-06, + "loss": 1.2810008525848389, + "step": 15166 + }, + { + "epoch": 2.760899244561755, + "grad_norm": 17.0, + "learning_rate": 1.0631045217998384e-06, + "loss": 1.2463493347167969, + "step": 15168 + }, + { + "epoch": 2.761263311185947, + "grad_norm": 15.125, + "learning_rate": 1.062913726854987e-06, + "loss": 1.984569787979126, + "step": 15170 + }, + { + "epoch": 2.7616273778101394, + "grad_norm": 16.0, + "learning_rate": 1.0627232161696964e-06, + "loss": 1.5968174934387207, + "step": 15172 + }, + { + "epoch": 2.7619914444343316, + "grad_norm": 18.5, + "learning_rate": 1.0625329897719226e-06, + "loss": 1.5111920833587646, + "step": 15174 + }, + { + "epoch": 2.762355511058524, + "grad_norm": 7.59375, + "learning_rate": 1.0623430476895805e-06, + "loss": 1.2464303970336914, + "step": 15176 + }, + { + "epoch": 2.762719577682716, + "grad_norm": 8.6875, + "learning_rate": 1.0621533899505437e-06, + "loss": 1.359215497970581, + "step": 15178 + }, + { + "epoch": 2.7630836443069082, + "grad_norm": 11.4375, + "learning_rate": 1.0619640165826431e-06, + "loss": 1.0701346397399902, + "step": 15180 + }, + { + "epoch": 2.7634477109311004, + "grad_norm": 13.0625, + "learning_rate": 1.0617749276136696e-06, + "loss": 1.4837589263916016, + "step": 15182 + }, + { + "epoch": 2.7638117775552926, + "grad_norm": 7.78125, + "learning_rate": 1.06158612307137e-06, + "loss": 1.0844717025756836, + "step": 15184 + }, + { + "epoch": 2.764175844179485, + "grad_norm": 12.5625, + "learning_rate": 1.0613976029834513e-06, + "loss": 1.6679472923278809, + "step": 15186 + }, + { + "epoch": 2.764539910803677, + "grad_norm": 17.625, + "learning_rate": 1.061209367377578e-06, + "loss": 1.5979485511779785, + "step": 15188 + }, + { + "epoch": 2.764903977427869, + "grad_norm": 15.5, + "learning_rate": 1.0610214162813723e-06, + "loss": 1.7798595428466797, + "step": 15190 + }, + { + "epoch": 2.7652680440520614, + "grad_norm": 15.125, + "learning_rate": 1.0608337497224164e-06, + "loss": 1.5039747953414917, + "step": 15192 + }, + { + "epoch": 2.765632110676254, + "grad_norm": 7.75, + "learning_rate": 1.0606463677282487e-06, + "loss": 1.4596741199493408, + "step": 15194 + }, + { + "epoch": 2.765996177300446, + "grad_norm": 30.875, + "learning_rate": 1.060459270326367e-06, + "loss": 1.54854416847229, + "step": 15196 + }, + { + "epoch": 2.7663602439246384, + "grad_norm": 12.0625, + "learning_rate": 1.0602724575442271e-06, + "loss": 1.1845885515213013, + "step": 15198 + }, + { + "epoch": 2.76672431054883, + "grad_norm": 9.25, + "learning_rate": 1.060085929409243e-06, + "loss": 1.4231187105178833, + "step": 15200 + }, + { + "epoch": 2.767088377173023, + "grad_norm": 9.1875, + "learning_rate": 1.059899685948787e-06, + "loss": 1.2150213718414307, + "step": 15202 + }, + { + "epoch": 2.767452443797215, + "grad_norm": 24.0, + "learning_rate": 1.05971372719019e-06, + "loss": 1.0649685859680176, + "step": 15204 + }, + { + "epoch": 2.767816510421407, + "grad_norm": 8.25, + "learning_rate": 1.0595280531607397e-06, + "loss": 1.265956163406372, + "step": 15206 + }, + { + "epoch": 2.7681805770455994, + "grad_norm": 44.0, + "learning_rate": 1.0593426638876836e-06, + "loss": 1.1646571159362793, + "step": 15208 + }, + { + "epoch": 2.7685446436697916, + "grad_norm": 12.25, + "learning_rate": 1.0591575593982267e-06, + "loss": 1.683388352394104, + "step": 15210 + }, + { + "epoch": 2.7689087102939838, + "grad_norm": 27.375, + "learning_rate": 1.058972739719533e-06, + "loss": 1.9732575416564941, + "step": 15212 + }, + { + "epoch": 2.769272776918176, + "grad_norm": 5.0, + "learning_rate": 1.058788204878723e-06, + "loss": 1.3566139936447144, + "step": 15214 + }, + { + "epoch": 2.769636843542368, + "grad_norm": 15.875, + "learning_rate": 1.0586039549028768e-06, + "loss": 1.5855283737182617, + "step": 15216 + }, + { + "epoch": 2.7700009101665604, + "grad_norm": 14.0, + "learning_rate": 1.0584199898190325e-06, + "loss": 1.4462519884109497, + "step": 15218 + }, + { + "epoch": 2.7703649767907526, + "grad_norm": 28.5, + "learning_rate": 1.058236309654186e-06, + "loss": 1.0793297290802002, + "step": 15220 + }, + { + "epoch": 2.7707290434149447, + "grad_norm": 60.0, + "learning_rate": 1.0580529144352923e-06, + "loss": 0.5910310745239258, + "step": 15222 + }, + { + "epoch": 2.7710931100391374, + "grad_norm": 11.625, + "learning_rate": 1.0578698041892632e-06, + "loss": 1.5671952962875366, + "step": 15224 + }, + { + "epoch": 2.771457176663329, + "grad_norm": 11.0, + "learning_rate": 1.0576869789429692e-06, + "loss": 1.503175973892212, + "step": 15226 + }, + { + "epoch": 2.7718212432875218, + "grad_norm": 17.25, + "learning_rate": 1.0575044387232398e-06, + "loss": 1.6208741664886475, + "step": 15228 + }, + { + "epoch": 2.772185309911714, + "grad_norm": 5.09375, + "learning_rate": 1.0573221835568614e-06, + "loss": 1.2335257530212402, + "step": 15230 + }, + { + "epoch": 2.772549376535906, + "grad_norm": 7.84375, + "learning_rate": 1.0571402134705805e-06, + "loss": 1.325007438659668, + "step": 15232 + }, + { + "epoch": 2.7729134431600984, + "grad_norm": 11.0, + "learning_rate": 1.0569585284910988e-06, + "loss": 1.2743703126907349, + "step": 15234 + }, + { + "epoch": 2.7732775097842906, + "grad_norm": 34.0, + "learning_rate": 1.056777128645079e-06, + "loss": 1.4732047319412231, + "step": 15236 + }, + { + "epoch": 2.7736415764084827, + "grad_norm": 24.25, + "learning_rate": 1.0565960139591399e-06, + "loss": 1.7495557069778442, + "step": 15238 + }, + { + "epoch": 2.774005643032675, + "grad_norm": 8.75, + "learning_rate": 1.0564151844598599e-06, + "loss": 1.5108754634857178, + "step": 15240 + }, + { + "epoch": 2.774369709656867, + "grad_norm": 68.0, + "learning_rate": 1.0562346401737754e-06, + "loss": 1.63310968875885, + "step": 15242 + }, + { + "epoch": 2.7747337762810593, + "grad_norm": 14.5, + "learning_rate": 1.0560543811273797e-06, + "loss": 1.6279914379119873, + "step": 15244 + }, + { + "epoch": 2.7750978429052515, + "grad_norm": 12.0625, + "learning_rate": 1.0558744073471255e-06, + "loss": 1.22107994556427, + "step": 15246 + }, + { + "epoch": 2.7754619095294437, + "grad_norm": 8.0625, + "learning_rate": 1.0556947188594231e-06, + "loss": 1.3569004535675049, + "step": 15248 + }, + { + "epoch": 2.7758259761536364, + "grad_norm": 26.5, + "learning_rate": 1.055515315690641e-06, + "loss": 1.5827465057373047, + "step": 15250 + }, + { + "epoch": 2.776190042777828, + "grad_norm": 12.3125, + "learning_rate": 1.0553361978671064e-06, + "loss": 1.9765727519989014, + "step": 15252 + }, + { + "epoch": 2.7765541094020207, + "grad_norm": 22.75, + "learning_rate": 1.0551573654151036e-06, + "loss": 1.3594779968261719, + "step": 15254 + }, + { + "epoch": 2.776918176026213, + "grad_norm": 9.375, + "learning_rate": 1.0549788183608755e-06, + "loss": 1.1901259422302246, + "step": 15256 + }, + { + "epoch": 2.777282242650405, + "grad_norm": 13.25, + "learning_rate": 1.0548005567306235e-06, + "loss": 1.3841781616210938, + "step": 15258 + }, + { + "epoch": 2.7776463092745973, + "grad_norm": 26.75, + "learning_rate": 1.0546225805505062e-06, + "loss": 1.4691760540008545, + "step": 15260 + }, + { + "epoch": 2.7780103758987895, + "grad_norm": 8.9375, + "learning_rate": 1.054444889846642e-06, + "loss": 1.2579461336135864, + "step": 15262 + }, + { + "epoch": 2.7783744425229817, + "grad_norm": 7.15625, + "learning_rate": 1.0542674846451052e-06, + "loss": 0.9277184009552002, + "step": 15264 + }, + { + "epoch": 2.778738509147174, + "grad_norm": 9.25, + "learning_rate": 1.0540903649719294e-06, + "loss": 1.3653194904327393, + "step": 15266 + }, + { + "epoch": 2.779102575771366, + "grad_norm": 6.34375, + "learning_rate": 1.0539135308531067e-06, + "loss": 1.3270657062530518, + "step": 15268 + }, + { + "epoch": 2.7794666423955583, + "grad_norm": 30.375, + "learning_rate": 1.0537369823145866e-06, + "loss": 1.225717544555664, + "step": 15270 + }, + { + "epoch": 2.7798307090197505, + "grad_norm": 181.0, + "learning_rate": 1.0535607193822769e-06, + "loss": 1.7292187213897705, + "step": 15272 + }, + { + "epoch": 2.7801947756439427, + "grad_norm": 27.25, + "learning_rate": 1.053384742082043e-06, + "loss": 1.31608247756958, + "step": 15274 + }, + { + "epoch": 2.7805588422681353, + "grad_norm": 10.625, + "learning_rate": 1.0532090504397098e-06, + "loss": 1.0656460523605347, + "step": 15276 + }, + { + "epoch": 2.780922908892327, + "grad_norm": 5.5, + "learning_rate": 1.0530336444810586e-06, + "loss": 1.230137825012207, + "step": 15278 + }, + { + "epoch": 2.7812869755165197, + "grad_norm": 11.0, + "learning_rate": 1.0528585242318293e-06, + "loss": 1.2619385719299316, + "step": 15280 + }, + { + "epoch": 2.781651042140712, + "grad_norm": 13.5625, + "learning_rate": 1.0526836897177212e-06, + "loss": 1.7867231369018555, + "step": 15282 + }, + { + "epoch": 2.782015108764904, + "grad_norm": 5.625, + "learning_rate": 1.0525091409643894e-06, + "loss": 1.3512623310089111, + "step": 15284 + }, + { + "epoch": 2.7823791753890963, + "grad_norm": 5.03125, + "learning_rate": 1.0523348779974487e-06, + "loss": 1.3322995901107788, + "step": 15286 + }, + { + "epoch": 2.7827432420132885, + "grad_norm": 11.0, + "learning_rate": 1.0521609008424716e-06, + "loss": 1.5845931768417358, + "step": 15288 + }, + { + "epoch": 2.7831073086374807, + "grad_norm": 8.5, + "learning_rate": 1.051987209524988e-06, + "loss": 1.4134056568145752, + "step": 15290 + }, + { + "epoch": 2.783471375261673, + "grad_norm": 13.1875, + "learning_rate": 1.0518138040704873e-06, + "loss": 1.5484111309051514, + "step": 15292 + }, + { + "epoch": 2.783835441885865, + "grad_norm": 8.5, + "learning_rate": 1.0516406845044153e-06, + "loss": 1.4702564477920532, + "step": 15294 + }, + { + "epoch": 2.7841995085100573, + "grad_norm": 12.1875, + "learning_rate": 1.051467850852177e-06, + "loss": 1.244093894958496, + "step": 15296 + }, + { + "epoch": 2.7845635751342495, + "grad_norm": 12.1875, + "learning_rate": 1.0512953031391347e-06, + "loss": 1.359610676765442, + "step": 15298 + }, + { + "epoch": 2.7849276417584417, + "grad_norm": 7.625, + "learning_rate": 1.0511230413906093e-06, + "loss": 1.1750068664550781, + "step": 15300 + }, + { + "epoch": 2.7852917083826343, + "grad_norm": 18.25, + "learning_rate": 1.0509510656318796e-06, + "loss": 1.401893973350525, + "step": 15302 + }, + { + "epoch": 2.785655775006826, + "grad_norm": 7.34375, + "learning_rate": 1.0507793758881822e-06, + "loss": 1.1192243099212646, + "step": 15304 + }, + { + "epoch": 2.7860198416310187, + "grad_norm": 25.875, + "learning_rate": 1.0506079721847117e-06, + "loss": 1.3272150754928589, + "step": 15306 + }, + { + "epoch": 2.7863839082552104, + "grad_norm": 8.75, + "learning_rate": 1.0504368545466213e-06, + "loss": 1.481140375137329, + "step": 15308 + }, + { + "epoch": 2.786747974879403, + "grad_norm": 13.25, + "learning_rate": 1.0502660229990213e-06, + "loss": 1.6706256866455078, + "step": 15310 + }, + { + "epoch": 2.7871120415035953, + "grad_norm": 32.0, + "learning_rate": 1.0500954775669813e-06, + "loss": 1.7830562591552734, + "step": 15312 + }, + { + "epoch": 2.7874761081277875, + "grad_norm": 6.625, + "learning_rate": 1.0499252182755274e-06, + "loss": 0.9861568212509155, + "step": 15314 + }, + { + "epoch": 2.7878401747519796, + "grad_norm": 6.90625, + "learning_rate": 1.0497552451496447e-06, + "loss": 1.2062160968780518, + "step": 15316 + }, + { + "epoch": 2.788204241376172, + "grad_norm": 11.5625, + "learning_rate": 1.0495855582142763e-06, + "loss": 1.2470710277557373, + "step": 15318 + }, + { + "epoch": 2.788568308000364, + "grad_norm": 39.25, + "learning_rate": 1.0494161574943224e-06, + "loss": 0.9131478071212769, + "step": 15320 + }, + { + "epoch": 2.7889323746245562, + "grad_norm": 11.8125, + "learning_rate": 1.0492470430146432e-06, + "loss": 1.005089521408081, + "step": 15322 + }, + { + "epoch": 2.7892964412487484, + "grad_norm": 6.65625, + "learning_rate": 1.049078214800054e-06, + "loss": 1.2499432563781738, + "step": 15324 + }, + { + "epoch": 2.7896605078729406, + "grad_norm": 8.0, + "learning_rate": 1.0489096728753308e-06, + "loss": 1.538610816001892, + "step": 15326 + }, + { + "epoch": 2.790024574497133, + "grad_norm": 9.4375, + "learning_rate": 1.048741417265206e-06, + "loss": 0.9799821376800537, + "step": 15328 + }, + { + "epoch": 2.790388641121325, + "grad_norm": 15.25, + "learning_rate": 1.0485734479943702e-06, + "loss": 1.5890527963638306, + "step": 15330 + }, + { + "epoch": 2.7907527077455176, + "grad_norm": 15.0625, + "learning_rate": 1.0484057650874727e-06, + "loss": 1.3033627271652222, + "step": 15332 + }, + { + "epoch": 2.7911167743697094, + "grad_norm": 17.125, + "learning_rate": 1.04823836856912e-06, + "loss": 1.0314157009124756, + "step": 15334 + }, + { + "epoch": 2.791480840993902, + "grad_norm": 12.9375, + "learning_rate": 1.0480712584638769e-06, + "loss": 1.7576978206634521, + "step": 15336 + }, + { + "epoch": 2.7918449076180942, + "grad_norm": 3.828125, + "learning_rate": 1.0479044347962662e-06, + "loss": 0.9735898971557617, + "step": 15338 + }, + { + "epoch": 2.7922089742422864, + "grad_norm": 7.8125, + "learning_rate": 1.047737897590768e-06, + "loss": 1.4962714910507202, + "step": 15340 + }, + { + "epoch": 2.7925730408664786, + "grad_norm": 42.25, + "learning_rate": 1.0475716468718224e-06, + "loss": 1.4702708721160889, + "step": 15342 + }, + { + "epoch": 2.792937107490671, + "grad_norm": 14.3125, + "learning_rate": 1.0474056826638248e-06, + "loss": 1.3507142066955566, + "step": 15344 + }, + { + "epoch": 2.793301174114863, + "grad_norm": 9.3125, + "learning_rate": 1.0472400049911302e-06, + "loss": 1.5258738994598389, + "step": 15346 + }, + { + "epoch": 2.793665240739055, + "grad_norm": 9.9375, + "learning_rate": 1.0470746138780507e-06, + "loss": 1.4927797317504883, + "step": 15348 + }, + { + "epoch": 2.7940293073632474, + "grad_norm": 10.25, + "learning_rate": 1.0469095093488568e-06, + "loss": 1.3006072044372559, + "step": 15350 + }, + { + "epoch": 2.7943933739874396, + "grad_norm": 9.8125, + "learning_rate": 1.046744691427778e-06, + "loss": 1.3229343891143799, + "step": 15352 + }, + { + "epoch": 2.794757440611632, + "grad_norm": 30.875, + "learning_rate": 1.0465801601389997e-06, + "loss": 1.4262301921844482, + "step": 15354 + }, + { + "epoch": 2.795121507235824, + "grad_norm": 6.40625, + "learning_rate": 1.0464159155066662e-06, + "loss": 1.4256393909454346, + "step": 15356 + }, + { + "epoch": 2.7954855738600166, + "grad_norm": 7.65625, + "learning_rate": 1.0462519575548798e-06, + "loss": 1.3325252532958984, + "step": 15358 + }, + { + "epoch": 2.7958496404842084, + "grad_norm": 7.625, + "learning_rate": 1.0460882863077007e-06, + "loss": 1.3819911479949951, + "step": 15360 + }, + { + "epoch": 2.796213707108401, + "grad_norm": 13.625, + "learning_rate": 1.0459249017891474e-06, + "loss": 1.2774745225906372, + "step": 15362 + }, + { + "epoch": 2.796577773732593, + "grad_norm": 10.375, + "learning_rate": 1.0457618040231953e-06, + "loss": 1.5341606140136719, + "step": 15364 + }, + { + "epoch": 2.7969418403567854, + "grad_norm": 12.5625, + "learning_rate": 1.045598993033779e-06, + "loss": 1.3138628005981445, + "step": 15366 + }, + { + "epoch": 2.7973059069809776, + "grad_norm": 7.6875, + "learning_rate": 1.04543646884479e-06, + "loss": 1.5500141382217407, + "step": 15368 + }, + { + "epoch": 2.79766997360517, + "grad_norm": 34.75, + "learning_rate": 1.0452742314800775e-06, + "loss": 1.4380184412002563, + "step": 15370 + }, + { + "epoch": 2.798034040229362, + "grad_norm": 7.84375, + "learning_rate": 1.0451122809634502e-06, + "loss": 1.3354018926620483, + "step": 15372 + }, + { + "epoch": 2.798398106853554, + "grad_norm": 10.375, + "learning_rate": 1.044950617318673e-06, + "loss": 1.4029438495635986, + "step": 15374 + }, + { + "epoch": 2.7987621734777464, + "grad_norm": 21.0, + "learning_rate": 1.0447892405694696e-06, + "loss": 1.5707744359970093, + "step": 15376 + }, + { + "epoch": 2.7991262401019386, + "grad_norm": 23.0, + "learning_rate": 1.0446281507395213e-06, + "loss": 1.2131977081298828, + "step": 15378 + }, + { + "epoch": 2.7994903067261308, + "grad_norm": 15.0625, + "learning_rate": 1.0444673478524675e-06, + "loss": 0.7167494297027588, + "step": 15380 + }, + { + "epoch": 2.799854373350323, + "grad_norm": 16.0, + "learning_rate": 1.0443068319319054e-06, + "loss": 1.2614479064941406, + "step": 15382 + }, + { + "epoch": 2.8002184399745156, + "grad_norm": 8.5, + "learning_rate": 1.04414660300139e-06, + "loss": 1.4775824546813965, + "step": 15384 + }, + { + "epoch": 2.8005825065987073, + "grad_norm": 6.09375, + "learning_rate": 1.0439866610844342e-06, + "loss": 1.256793737411499, + "step": 15386 + }, + { + "epoch": 2.8009465732229, + "grad_norm": 17.25, + "learning_rate": 1.043827006204509e-06, + "loss": 1.2728979587554932, + "step": 15388 + }, + { + "epoch": 2.801310639847092, + "grad_norm": 12.1875, + "learning_rate": 1.0436676383850425e-06, + "loss": 1.5430599451065063, + "step": 15390 + }, + { + "epoch": 2.8016747064712844, + "grad_norm": 13.1875, + "learning_rate": 1.0435085576494221e-06, + "loss": 1.3391112089157104, + "step": 15392 + }, + { + "epoch": 2.8020387730954766, + "grad_norm": 99.5, + "learning_rate": 1.0433497640209921e-06, + "loss": 1.2824134826660156, + "step": 15394 + }, + { + "epoch": 2.8024028397196687, + "grad_norm": 11.5, + "learning_rate": 1.043191257523054e-06, + "loss": 0.9084633588790894, + "step": 15396 + }, + { + "epoch": 2.802766906343861, + "grad_norm": 11.3125, + "learning_rate": 1.0430330381788692e-06, + "loss": 1.7296411991119385, + "step": 15398 + }, + { + "epoch": 2.803130972968053, + "grad_norm": 9.25, + "learning_rate": 1.0428751060116547e-06, + "loss": 1.0605690479278564, + "step": 15400 + }, + { + "epoch": 2.8034950395922453, + "grad_norm": 9.375, + "learning_rate": 1.0427174610445873e-06, + "loss": 0.7418138980865479, + "step": 15402 + }, + { + "epoch": 2.8038591062164375, + "grad_norm": 5.5, + "learning_rate": 1.0425601033008e-06, + "loss": 1.2067638635635376, + "step": 15404 + }, + { + "epoch": 2.8042231728406297, + "grad_norm": 28.125, + "learning_rate": 1.042403032803385e-06, + "loss": 1.207100510597229, + "step": 15406 + }, + { + "epoch": 2.804587239464822, + "grad_norm": 3.296875, + "learning_rate": 1.042246249575391e-06, + "loss": 1.1736596822738647, + "step": 15408 + }, + { + "epoch": 2.8049513060890146, + "grad_norm": 11.125, + "learning_rate": 1.0420897536398262e-06, + "loss": 1.1369788646697998, + "step": 15410 + }, + { + "epoch": 2.8053153727132063, + "grad_norm": 5.9375, + "learning_rate": 1.0419335450196554e-06, + "loss": 1.498537540435791, + "step": 15412 + }, + { + "epoch": 2.805679439337399, + "grad_norm": 48.75, + "learning_rate": 1.0417776237378014e-06, + "loss": 1.3380601406097412, + "step": 15414 + }, + { + "epoch": 2.806043505961591, + "grad_norm": 23.875, + "learning_rate": 1.0416219898171451e-06, + "loss": 1.2410119771957397, + "step": 15416 + }, + { + "epoch": 2.8064075725857833, + "grad_norm": 20.625, + "learning_rate": 1.0414666432805252e-06, + "loss": 1.8276417255401611, + "step": 15418 + }, + { + "epoch": 2.8067716392099755, + "grad_norm": 22.75, + "learning_rate": 1.041311584150738e-06, + "loss": 1.3166488409042358, + "step": 15420 + }, + { + "epoch": 2.8071357058341677, + "grad_norm": 12.0625, + "learning_rate": 1.0411568124505384e-06, + "loss": 1.2247083187103271, + "step": 15422 + }, + { + "epoch": 2.80749977245836, + "grad_norm": 7.28125, + "learning_rate": 1.0410023282026376e-06, + "loss": 1.3944164514541626, + "step": 15424 + }, + { + "epoch": 2.807863839082552, + "grad_norm": 5.875, + "learning_rate": 1.0408481314297062e-06, + "loss": 1.3214515447616577, + "step": 15426 + }, + { + "epoch": 2.8082279057067443, + "grad_norm": 3.3125, + "learning_rate": 1.0406942221543718e-06, + "loss": 1.055877685546875, + "step": 15428 + }, + { + "epoch": 2.8085919723309365, + "grad_norm": 39.5, + "learning_rate": 1.0405406003992197e-06, + "loss": 1.6331040859222412, + "step": 15430 + }, + { + "epoch": 2.8089560389551287, + "grad_norm": 21.0, + "learning_rate": 1.0403872661867938e-06, + "loss": 1.712215542793274, + "step": 15432 + }, + { + "epoch": 2.809320105579321, + "grad_norm": 11.3125, + "learning_rate": 1.0402342195395949e-06, + "loss": 1.2522261142730713, + "step": 15434 + }, + { + "epoch": 2.8096841722035135, + "grad_norm": 5.15625, + "learning_rate": 1.040081460480082e-06, + "loss": 1.5624516010284424, + "step": 15436 + }, + { + "epoch": 2.8100482388277053, + "grad_norm": 5.90625, + "learning_rate": 1.039928989030672e-06, + "loss": 0.9424973130226135, + "step": 15438 + }, + { + "epoch": 2.810412305451898, + "grad_norm": 12.5, + "learning_rate": 1.039776805213739e-06, + "loss": 1.0278855562210083, + "step": 15440 + }, + { + "epoch": 2.8107763720760897, + "grad_norm": 13.875, + "learning_rate": 1.0396249090516163e-06, + "loss": 0.761257529258728, + "step": 15442 + }, + { + "epoch": 2.8111404387002823, + "grad_norm": 5.78125, + "learning_rate": 1.0394733005665931e-06, + "loss": 0.9988175630569458, + "step": 15444 + }, + { + "epoch": 2.8115045053244745, + "grad_norm": 12.1875, + "learning_rate": 1.039321979780918e-06, + "loss": 1.3274415731430054, + "step": 15446 + }, + { + "epoch": 2.8118685719486667, + "grad_norm": 20.875, + "learning_rate": 1.0391709467167961e-06, + "loss": 2.0050301551818848, + "step": 15448 + }, + { + "epoch": 2.812232638572859, + "grad_norm": 12.625, + "learning_rate": 1.0390202013963913e-06, + "loss": 1.9951343536376953, + "step": 15450 + }, + { + "epoch": 2.812596705197051, + "grad_norm": 84.0, + "learning_rate": 1.0388697438418251e-06, + "loss": 2.048008680343628, + "step": 15452 + }, + { + "epoch": 2.8129607718212433, + "grad_norm": 8.75, + "learning_rate": 1.038719574075176e-06, + "loss": 1.5148104429244995, + "step": 15454 + }, + { + "epoch": 2.8133248384454355, + "grad_norm": 6.53125, + "learning_rate": 1.0385696921184813e-06, + "loss": 1.4777487516403198, + "step": 15456 + }, + { + "epoch": 2.8136889050696277, + "grad_norm": 12.75, + "learning_rate": 1.0384200979937349e-06, + "loss": 1.4203996658325195, + "step": 15458 + }, + { + "epoch": 2.81405297169382, + "grad_norm": 9.125, + "learning_rate": 1.0382707917228894e-06, + "loss": 1.5079729557037354, + "step": 15460 + }, + { + "epoch": 2.814417038318012, + "grad_norm": 7.96875, + "learning_rate": 1.0381217733278555e-06, + "loss": 1.5309672355651855, + "step": 15462 + }, + { + "epoch": 2.8147811049422042, + "grad_norm": 12.5, + "learning_rate": 1.0379730428305004e-06, + "loss": 1.1129004955291748, + "step": 15464 + }, + { + "epoch": 2.815145171566397, + "grad_norm": 14.75, + "learning_rate": 1.03782460025265e-06, + "loss": 1.3766101598739624, + "step": 15466 + }, + { + "epoch": 2.8155092381905886, + "grad_norm": 28.0, + "learning_rate": 1.0376764456160873e-06, + "loss": 1.8709042072296143, + "step": 15468 + }, + { + "epoch": 2.8158733048147813, + "grad_norm": 10.625, + "learning_rate": 1.0375285789425534e-06, + "loss": 1.3518832921981812, + "step": 15470 + }, + { + "epoch": 2.8162373714389735, + "grad_norm": 8.5, + "learning_rate": 1.037381000253748e-06, + "loss": 1.6706080436706543, + "step": 15472 + }, + { + "epoch": 2.8166014380631657, + "grad_norm": 9.125, + "learning_rate": 1.0372337095713265e-06, + "loss": 1.367620825767517, + "step": 15474 + }, + { + "epoch": 2.816965504687358, + "grad_norm": 14.3125, + "learning_rate": 1.037086706916904e-06, + "loss": 1.1302152872085571, + "step": 15476 + }, + { + "epoch": 2.81732957131155, + "grad_norm": 18.625, + "learning_rate": 1.0369399923120518e-06, + "loss": 0.6499367952346802, + "step": 15478 + }, + { + "epoch": 2.8176936379357422, + "grad_norm": 75.0, + "learning_rate": 1.0367935657783005e-06, + "loss": 1.5066099166870117, + "step": 15480 + }, + { + "epoch": 2.8180577045599344, + "grad_norm": 22.25, + "learning_rate": 1.0366474273371373e-06, + "loss": 1.5841988325119019, + "step": 15482 + }, + { + "epoch": 2.8184217711841266, + "grad_norm": 27.5, + "learning_rate": 1.0365015770100071e-06, + "loss": 1.6245061159133911, + "step": 15484 + }, + { + "epoch": 2.818785837808319, + "grad_norm": 14.0625, + "learning_rate": 1.0363560148183135e-06, + "loss": 1.731898307800293, + "step": 15486 + }, + { + "epoch": 2.819149904432511, + "grad_norm": 13.375, + "learning_rate": 1.0362107407834165e-06, + "loss": 1.5783600807189941, + "step": 15488 + }, + { + "epoch": 2.819513971056703, + "grad_norm": 6.59375, + "learning_rate": 1.0360657549266346e-06, + "loss": 1.3789788484573364, + "step": 15490 + }, + { + "epoch": 2.819878037680896, + "grad_norm": 5.90625, + "learning_rate": 1.0359210572692442e-06, + "loss": 1.0591803789138794, + "step": 15492 + }, + { + "epoch": 2.8202421043050876, + "grad_norm": 18.75, + "learning_rate": 1.0357766478324792e-06, + "loss": 1.4972491264343262, + "step": 15494 + }, + { + "epoch": 2.8206061709292802, + "grad_norm": 18.5, + "learning_rate": 1.0356325266375305e-06, + "loss": 1.6652246713638306, + "step": 15496 + }, + { + "epoch": 2.8209702375534724, + "grad_norm": 12.6875, + "learning_rate": 1.0354886937055478e-06, + "loss": 0.5180586576461792, + "step": 15498 + }, + { + "epoch": 2.8213343041776646, + "grad_norm": 15.25, + "learning_rate": 1.0353451490576375e-06, + "loss": 1.503953218460083, + "step": 15500 + }, + { + "epoch": 2.821698370801857, + "grad_norm": 21.25, + "learning_rate": 1.035201892714865e-06, + "loss": 1.2995222806930542, + "step": 15502 + }, + { + "epoch": 2.822062437426049, + "grad_norm": 13.625, + "learning_rate": 1.035058924698252e-06, + "loss": 1.3643341064453125, + "step": 15504 + }, + { + "epoch": 2.822426504050241, + "grad_norm": 16.5, + "learning_rate": 1.0349162450287781e-06, + "loss": 1.4070734977722168, + "step": 15506 + }, + { + "epoch": 2.8227905706744334, + "grad_norm": 26.375, + "learning_rate": 1.034773853727382e-06, + "loss": 1.3273887634277344, + "step": 15508 + }, + { + "epoch": 2.8231546372986256, + "grad_norm": 15.1875, + "learning_rate": 1.0346317508149581e-06, + "loss": 0.47858723998069763, + "step": 15510 + }, + { + "epoch": 2.823518703922818, + "grad_norm": 27.125, + "learning_rate": 1.0344899363123603e-06, + "loss": 1.2646902799606323, + "step": 15512 + }, + { + "epoch": 2.82388277054701, + "grad_norm": 7.75, + "learning_rate": 1.0343484102403984e-06, + "loss": 1.2462102174758911, + "step": 15514 + }, + { + "epoch": 2.824246837171202, + "grad_norm": 11.1875, + "learning_rate": 1.0342071726198415e-06, + "loss": 1.6582112312316895, + "step": 15516 + }, + { + "epoch": 2.824610903795395, + "grad_norm": 8.0625, + "learning_rate": 1.0340662234714155e-06, + "loss": 1.49208402633667, + "step": 15518 + }, + { + "epoch": 2.8249749704195866, + "grad_norm": 38.5, + "learning_rate": 1.0339255628158034e-06, + "loss": 1.167761206626892, + "step": 15520 + }, + { + "epoch": 2.825339037043779, + "grad_norm": 8.3125, + "learning_rate": 1.0337851906736476e-06, + "loss": 1.338670253753662, + "step": 15522 + }, + { + "epoch": 2.8257031036679714, + "grad_norm": 9.5625, + "learning_rate": 1.0336451070655466e-06, + "loss": 1.16083824634552, + "step": 15524 + }, + { + "epoch": 2.8260671702921636, + "grad_norm": 10.5, + "learning_rate": 1.033505312012057e-06, + "loss": 1.3998286724090576, + "step": 15526 + }, + { + "epoch": 2.826431236916356, + "grad_norm": 25.125, + "learning_rate": 1.0333658055336937e-06, + "loss": 1.4924664497375488, + "step": 15528 + }, + { + "epoch": 2.826795303540548, + "grad_norm": 9.9375, + "learning_rate": 1.033226587650928e-06, + "loss": 1.5790613889694214, + "step": 15530 + }, + { + "epoch": 2.82715937016474, + "grad_norm": 7.03125, + "learning_rate": 1.0330876583841902e-06, + "loss": 1.2886697053909302, + "step": 15532 + }, + { + "epoch": 2.8275234367889324, + "grad_norm": 10.75, + "learning_rate": 1.0329490177538673e-06, + "loss": 1.3453556299209595, + "step": 15534 + }, + { + "epoch": 2.8278875034131246, + "grad_norm": 10.8125, + "learning_rate": 1.0328106657803045e-06, + "loss": 1.1671477556228638, + "step": 15536 + }, + { + "epoch": 2.8282515700373168, + "grad_norm": 8.1875, + "learning_rate": 1.0326726024838036e-06, + "loss": 0.9520739316940308, + "step": 15538 + }, + { + "epoch": 2.828615636661509, + "grad_norm": 26.25, + "learning_rate": 1.0325348278846258e-06, + "loss": 1.2173244953155518, + "step": 15540 + }, + { + "epoch": 2.828979703285701, + "grad_norm": 13.0625, + "learning_rate": 1.0323973420029885e-06, + "loss": 1.791285514831543, + "step": 15542 + }, + { + "epoch": 2.829343769909894, + "grad_norm": 21.875, + "learning_rate": 1.0322601448590673e-06, + "loss": 1.4645047187805176, + "step": 15544 + }, + { + "epoch": 2.8297078365340855, + "grad_norm": 15.3125, + "learning_rate": 1.0321232364729953e-06, + "loss": 1.399730920791626, + "step": 15546 + }, + { + "epoch": 2.830071903158278, + "grad_norm": 9.875, + "learning_rate": 1.0319866168648632e-06, + "loss": 1.454708218574524, + "step": 15548 + }, + { + "epoch": 2.83043596978247, + "grad_norm": 11.875, + "learning_rate": 1.0318502860547193e-06, + "loss": 1.4562010765075684, + "step": 15550 + }, + { + "epoch": 2.8308000364066626, + "grad_norm": 4.125, + "learning_rate": 1.03171424406257e-06, + "loss": 0.979663610458374, + "step": 15552 + }, + { + "epoch": 2.8311641030308548, + "grad_norm": 24.0, + "learning_rate": 1.0315784909083781e-06, + "loss": 1.3442797660827637, + "step": 15554 + }, + { + "epoch": 2.831528169655047, + "grad_norm": 14.25, + "learning_rate": 1.031443026612066e-06, + "loss": 1.7458198070526123, + "step": 15556 + }, + { + "epoch": 2.831892236279239, + "grad_norm": 8.3125, + "learning_rate": 1.0313078511935114e-06, + "loss": 1.3635425567626953, + "step": 15558 + }, + { + "epoch": 2.8322563029034313, + "grad_norm": 8.9375, + "learning_rate": 1.0311729646725513e-06, + "loss": 1.3710309267044067, + "step": 15560 + }, + { + "epoch": 2.8326203695276235, + "grad_norm": 14.0625, + "learning_rate": 1.03103836706898e-06, + "loss": 1.4832854270935059, + "step": 15562 + }, + { + "epoch": 2.8329844361518157, + "grad_norm": 3.5, + "learning_rate": 1.0309040584025482e-06, + "loss": 1.207900881767273, + "step": 15564 + }, + { + "epoch": 2.833348502776008, + "grad_norm": 26.5, + "learning_rate": 1.0307700386929664e-06, + "loss": 1.359100103378296, + "step": 15566 + }, + { + "epoch": 2.8337125694002, + "grad_norm": 12.375, + "learning_rate": 1.0306363079599007e-06, + "loss": 1.9217147827148438, + "step": 15568 + }, + { + "epoch": 2.8340766360243923, + "grad_norm": 26.375, + "learning_rate": 1.0305028662229752e-06, + "loss": 1.4839872121810913, + "step": 15570 + }, + { + "epoch": 2.8344407026485845, + "grad_norm": 16.625, + "learning_rate": 1.0303697135017733e-06, + "loss": 1.5705583095550537, + "step": 15572 + }, + { + "epoch": 2.834804769272777, + "grad_norm": 10.9375, + "learning_rate": 1.030236849815833e-06, + "loss": 1.2779827117919922, + "step": 15574 + }, + { + "epoch": 2.835168835896969, + "grad_norm": 16.625, + "learning_rate": 1.0301042751846524e-06, + "loss": 1.885358214378357, + "step": 15576 + }, + { + "epoch": 2.8355329025211615, + "grad_norm": 11.125, + "learning_rate": 1.0299719896276864e-06, + "loss": 1.866011619567871, + "step": 15578 + }, + { + "epoch": 2.8358969691453537, + "grad_norm": 5.1875, + "learning_rate": 1.0298399931643466e-06, + "loss": 1.074000358581543, + "step": 15580 + }, + { + "epoch": 2.836261035769546, + "grad_norm": 3.203125, + "learning_rate": 1.029708285814004e-06, + "loss": 1.0708781480789185, + "step": 15582 + }, + { + "epoch": 2.836625102393738, + "grad_norm": 5.90625, + "learning_rate": 1.029576867595985e-06, + "loss": 0.9992654323577881, + "step": 15584 + }, + { + "epoch": 2.8369891690179303, + "grad_norm": 9.125, + "learning_rate": 1.0294457385295755e-06, + "loss": 1.426224946975708, + "step": 15586 + }, + { + "epoch": 2.8373532356421225, + "grad_norm": 12.0, + "learning_rate": 1.029314898634018e-06, + "loss": 1.972790002822876, + "step": 15588 + }, + { + "epoch": 2.8377173022663147, + "grad_norm": 13.9375, + "learning_rate": 1.0291843479285123e-06, + "loss": 1.9864373207092285, + "step": 15590 + }, + { + "epoch": 2.838081368890507, + "grad_norm": 7.125, + "learning_rate": 1.0290540864322173e-06, + "loss": 1.5132185220718384, + "step": 15592 + }, + { + "epoch": 2.838445435514699, + "grad_norm": 10.6875, + "learning_rate": 1.028924114164247e-06, + "loss": 1.3053078651428223, + "step": 15594 + }, + { + "epoch": 2.8388095021388913, + "grad_norm": 29.25, + "learning_rate": 1.0287944311436748e-06, + "loss": 1.3878569602966309, + "step": 15596 + }, + { + "epoch": 2.8391735687630835, + "grad_norm": 22.875, + "learning_rate": 1.0286650373895315e-06, + "loss": 1.5359525680541992, + "step": 15598 + }, + { + "epoch": 2.839537635387276, + "grad_norm": 15.375, + "learning_rate": 1.0285359329208045e-06, + "loss": 1.7001599073410034, + "step": 15600 + }, + { + "epoch": 2.839901702011468, + "grad_norm": 31.5, + "learning_rate": 1.02840711775644e-06, + "loss": 1.0770227909088135, + "step": 15602 + }, + { + "epoch": 2.8402657686356605, + "grad_norm": 3.375, + "learning_rate": 1.0282785919153408e-06, + "loss": 1.1072921752929688, + "step": 15604 + }, + { + "epoch": 2.8406298352598527, + "grad_norm": 35.25, + "learning_rate": 1.0281503554163675e-06, + "loss": 1.1652566194534302, + "step": 15606 + }, + { + "epoch": 2.840993901884045, + "grad_norm": 21.5, + "learning_rate": 1.0280224082783383e-06, + "loss": 1.7659976482391357, + "step": 15608 + }, + { + "epoch": 2.841357968508237, + "grad_norm": 12.0625, + "learning_rate": 1.0278947505200288e-06, + "loss": 1.5059798955917358, + "step": 15610 + }, + { + "epoch": 2.8417220351324293, + "grad_norm": 12.875, + "learning_rate": 1.0277673821601728e-06, + "loss": 1.3959065675735474, + "step": 15612 + }, + { + "epoch": 2.8420861017566215, + "grad_norm": 9.6875, + "learning_rate": 1.0276403032174604e-06, + "loss": 1.7684130668640137, + "step": 15614 + }, + { + "epoch": 2.8424501683808137, + "grad_norm": 12.5, + "learning_rate": 1.0275135137105403e-06, + "loss": 1.6079057455062866, + "step": 15616 + }, + { + "epoch": 2.842814235005006, + "grad_norm": 12.25, + "learning_rate": 1.0273870136580185e-06, + "loss": 1.6850941181182861, + "step": 15618 + }, + { + "epoch": 2.843178301629198, + "grad_norm": 31.375, + "learning_rate": 1.0272608030784576e-06, + "loss": 2.0558536052703857, + "step": 15620 + }, + { + "epoch": 2.8435423682533902, + "grad_norm": 14.125, + "learning_rate": 1.0271348819903798e-06, + "loss": 1.3582508563995361, + "step": 15622 + }, + { + "epoch": 2.8439064348775824, + "grad_norm": 15.375, + "learning_rate": 1.027009250412262e-06, + "loss": 0.9238548278808594, + "step": 15624 + }, + { + "epoch": 2.844270501501775, + "grad_norm": 25.875, + "learning_rate": 1.0268839083625413e-06, + "loss": 1.4568922519683838, + "step": 15626 + }, + { + "epoch": 2.844634568125967, + "grad_norm": 10.4375, + "learning_rate": 1.0267588558596107e-06, + "loss": 1.2882832288742065, + "step": 15628 + }, + { + "epoch": 2.8449986347501595, + "grad_norm": 10.1875, + "learning_rate": 1.026634092921821e-06, + "loss": 1.344896912574768, + "step": 15630 + }, + { + "epoch": 2.8453627013743517, + "grad_norm": 9.875, + "learning_rate": 1.0265096195674808e-06, + "loss": 1.427192211151123, + "step": 15632 + }, + { + "epoch": 2.845726767998544, + "grad_norm": 12.0625, + "learning_rate": 1.026385435814856e-06, + "loss": 1.5665194988250732, + "step": 15634 + }, + { + "epoch": 2.846090834622736, + "grad_norm": 22.375, + "learning_rate": 1.0262615416821704e-06, + "loss": 1.610897421836853, + "step": 15636 + }, + { + "epoch": 2.8464549012469282, + "grad_norm": 30.125, + "learning_rate": 1.0261379371876045e-06, + "loss": 0.5919798612594604, + "step": 15638 + }, + { + "epoch": 2.8468189678711204, + "grad_norm": 49.75, + "learning_rate": 1.0260146223492972e-06, + "loss": 1.4704358577728271, + "step": 15640 + }, + { + "epoch": 2.8471830344953126, + "grad_norm": 11.25, + "learning_rate": 1.025891597185344e-06, + "loss": 1.2458178997039795, + "step": 15642 + }, + { + "epoch": 2.847547101119505, + "grad_norm": 11.0625, + "learning_rate": 1.0257688617137985e-06, + "loss": 1.385305404663086, + "step": 15644 + }, + { + "epoch": 2.847911167743697, + "grad_norm": 15.9375, + "learning_rate": 1.0256464159526718e-06, + "loss": 1.2866665124893188, + "step": 15646 + }, + { + "epoch": 2.848275234367889, + "grad_norm": 29.625, + "learning_rate": 1.0255242599199322e-06, + "loss": 1.521550178527832, + "step": 15648 + }, + { + "epoch": 2.8486393009920814, + "grad_norm": 14.125, + "learning_rate": 1.0254023936335055e-06, + "loss": 1.831729769706726, + "step": 15650 + }, + { + "epoch": 2.849003367616274, + "grad_norm": 4.59375, + "learning_rate": 1.0252808171112755e-06, + "loss": 0.8389082551002502, + "step": 15652 + }, + { + "epoch": 2.849367434240466, + "grad_norm": 6.75, + "learning_rate": 1.0251595303710823e-06, + "loss": 0.9912675023078918, + "step": 15654 + }, + { + "epoch": 2.8497315008646584, + "grad_norm": 15.0, + "learning_rate": 1.025038533430725e-06, + "loss": 1.6578004360198975, + "step": 15656 + }, + { + "epoch": 2.8500955674888506, + "grad_norm": 36.0, + "learning_rate": 1.024917826307959e-06, + "loss": 1.6915128231048584, + "step": 15658 + }, + { + "epoch": 2.850459634113043, + "grad_norm": 12.375, + "learning_rate": 1.0247974090204977e-06, + "loss": 1.5055975914001465, + "step": 15660 + }, + { + "epoch": 2.850823700737235, + "grad_norm": 11.8125, + "learning_rate": 1.0246772815860117e-06, + "loss": 1.5027272701263428, + "step": 15662 + }, + { + "epoch": 2.851187767361427, + "grad_norm": 28.25, + "learning_rate": 1.0245574440221295e-06, + "loss": 1.7784786224365234, + "step": 15664 + }, + { + "epoch": 2.8515518339856194, + "grad_norm": 26.0, + "learning_rate": 1.0244378963464366e-06, + "loss": 1.3188923597335815, + "step": 15666 + }, + { + "epoch": 2.8519159006098116, + "grad_norm": 14.5625, + "learning_rate": 1.0243186385764762e-06, + "loss": 1.3407018184661865, + "step": 15668 + }, + { + "epoch": 2.852279967234004, + "grad_norm": 10.375, + "learning_rate": 1.0241996707297485e-06, + "loss": 1.4565585851669312, + "step": 15670 + }, + { + "epoch": 2.852644033858196, + "grad_norm": 8.125, + "learning_rate": 1.0240809928237126e-06, + "loss": 1.3527569770812988, + "step": 15672 + }, + { + "epoch": 2.853008100482388, + "grad_norm": 2.671875, + "learning_rate": 1.0239626048757829e-06, + "loss": 1.1230276823043823, + "step": 15674 + }, + { + "epoch": 2.8533721671065804, + "grad_norm": 11.0625, + "learning_rate": 1.0238445069033328e-06, + "loss": 1.0557066202163696, + "step": 15676 + }, + { + "epoch": 2.853736233730773, + "grad_norm": 35.25, + "learning_rate": 1.0237266989236925e-06, + "loss": 1.4573719501495361, + "step": 15678 + }, + { + "epoch": 2.8541003003549648, + "grad_norm": 25.0, + "learning_rate": 1.02360918095415e-06, + "loss": 1.7462213039398193, + "step": 15680 + }, + { + "epoch": 2.8544643669791574, + "grad_norm": 22.875, + "learning_rate": 1.0234919530119507e-06, + "loss": 2.002379894256592, + "step": 15682 + }, + { + "epoch": 2.854828433603349, + "grad_norm": 17.75, + "learning_rate": 1.0233750151142973e-06, + "loss": 1.7310270071029663, + "step": 15684 + }, + { + "epoch": 2.855192500227542, + "grad_norm": 18.875, + "learning_rate": 1.0232583672783497e-06, + "loss": 1.4237812757492065, + "step": 15686 + }, + { + "epoch": 2.855556566851734, + "grad_norm": 17.875, + "learning_rate": 1.0231420095212258e-06, + "loss": 1.3627734184265137, + "step": 15688 + }, + { + "epoch": 2.855920633475926, + "grad_norm": 14.9375, + "learning_rate": 1.0230259418600003e-06, + "loss": 1.5233899354934692, + "step": 15690 + }, + { + "epoch": 2.8562847001001184, + "grad_norm": 10.3125, + "learning_rate": 1.0229101643117062e-06, + "loss": 1.369388222694397, + "step": 15692 + }, + { + "epoch": 2.8566487667243106, + "grad_norm": 13.5625, + "learning_rate": 1.0227946768933325e-06, + "loss": 1.090606927871704, + "step": 15694 + }, + { + "epoch": 2.8570128333485028, + "grad_norm": 7.40625, + "learning_rate": 1.0226794796218276e-06, + "loss": 1.486121654510498, + "step": 15696 + }, + { + "epoch": 2.857376899972695, + "grad_norm": 50.75, + "learning_rate": 1.0225645725140954e-06, + "loss": 1.3467481136322021, + "step": 15698 + }, + { + "epoch": 2.857740966596887, + "grad_norm": 46.25, + "learning_rate": 1.0224499555869985e-06, + "loss": 0.8996018171310425, + "step": 15700 + }, + { + "epoch": 2.8581050332210793, + "grad_norm": 23.125, + "learning_rate": 1.0223356288573564e-06, + "loss": 0.4530978798866272, + "step": 15702 + }, + { + "epoch": 2.8584690998452715, + "grad_norm": 9.5, + "learning_rate": 1.0222215923419454e-06, + "loss": 1.717079520225525, + "step": 15704 + }, + { + "epoch": 2.8588331664694637, + "grad_norm": 11.5625, + "learning_rate": 1.0221078460575013e-06, + "loss": 1.354611873626709, + "step": 15706 + }, + { + "epoch": 2.8591972330936564, + "grad_norm": 14.1875, + "learning_rate": 1.0219943900207147e-06, + "loss": 1.965169906616211, + "step": 15708 + }, + { + "epoch": 2.859561299717848, + "grad_norm": 18.0, + "learning_rate": 1.021881224248235e-06, + "loss": 1.909058690071106, + "step": 15710 + }, + { + "epoch": 2.8599253663420408, + "grad_norm": 10.4375, + "learning_rate": 1.0217683487566692e-06, + "loss": 1.3860423564910889, + "step": 15712 + }, + { + "epoch": 2.860289432966233, + "grad_norm": 20.375, + "learning_rate": 1.0216557635625813e-06, + "loss": 1.5550782680511475, + "step": 15714 + }, + { + "epoch": 2.860653499590425, + "grad_norm": 9.0, + "learning_rate": 1.0215434686824924e-06, + "loss": 1.426262378692627, + "step": 15716 + }, + { + "epoch": 2.8610175662146173, + "grad_norm": 8.9375, + "learning_rate": 1.0214314641328815e-06, + "loss": 1.237898588180542, + "step": 15718 + }, + { + "epoch": 2.8613816328388095, + "grad_norm": 9.9375, + "learning_rate": 1.0213197499301847e-06, + "loss": 1.406991958618164, + "step": 15720 + }, + { + "epoch": 2.8617456994630017, + "grad_norm": 11.9375, + "learning_rate": 1.0212083260907962e-06, + "loss": 1.5383907556533813, + "step": 15722 + }, + { + "epoch": 2.862109766087194, + "grad_norm": 10.3125, + "learning_rate": 1.021097192631066e-06, + "loss": 1.4900743961334229, + "step": 15724 + }, + { + "epoch": 2.862473832711386, + "grad_norm": 7.78125, + "learning_rate": 1.0209863495673033e-06, + "loss": 1.2796670198440552, + "step": 15726 + }, + { + "epoch": 2.8628378993355783, + "grad_norm": 15.9375, + "learning_rate": 1.0208757969157734e-06, + "loss": 1.0955740213394165, + "step": 15728 + }, + { + "epoch": 2.8632019659597705, + "grad_norm": 19.0, + "learning_rate": 1.0207655346926995e-06, + "loss": 0.6655865907669067, + "step": 15730 + }, + { + "epoch": 2.8635660325839627, + "grad_norm": 7.03125, + "learning_rate": 1.0206555629142624e-06, + "loss": 1.6001516580581665, + "step": 15732 + }, + { + "epoch": 2.8639300992081553, + "grad_norm": 15.375, + "learning_rate": 1.0205458815965997e-06, + "loss": 1.538287878036499, + "step": 15734 + }, + { + "epoch": 2.864294165832347, + "grad_norm": 10.5, + "learning_rate": 1.020436490755807e-06, + "loss": 1.4265754222869873, + "step": 15736 + }, + { + "epoch": 2.8646582324565397, + "grad_norm": 16.5, + "learning_rate": 1.020327390407937e-06, + "loss": 1.2899627685546875, + "step": 15738 + }, + { + "epoch": 2.865022299080732, + "grad_norm": 17.75, + "learning_rate": 1.020218580568999e-06, + "loss": 1.650212049484253, + "step": 15740 + }, + { + "epoch": 2.865386365704924, + "grad_norm": 9.1875, + "learning_rate": 1.0201100612549615e-06, + "loss": 1.469498872756958, + "step": 15742 + }, + { + "epoch": 2.8657504323291163, + "grad_norm": 15.875, + "learning_rate": 1.0200018324817484e-06, + "loss": 1.4468848705291748, + "step": 15744 + }, + { + "epoch": 2.8661144989533085, + "grad_norm": 7.125, + "learning_rate": 1.0198938942652425e-06, + "loss": 1.1684160232543945, + "step": 15746 + }, + { + "epoch": 2.8664785655775007, + "grad_norm": 7.21875, + "learning_rate": 1.0197862466212826e-06, + "loss": 1.2453705072402954, + "step": 15748 + }, + { + "epoch": 2.866842632201693, + "grad_norm": 10.125, + "learning_rate": 1.0196788895656657e-06, + "loss": 1.5134919881820679, + "step": 15750 + }, + { + "epoch": 2.867206698825885, + "grad_norm": 10.625, + "learning_rate": 1.0195718231141467e-06, + "loss": 1.5316256284713745, + "step": 15752 + }, + { + "epoch": 2.8675707654500773, + "grad_norm": 5.8125, + "learning_rate": 1.0194650472824367e-06, + "loss": 1.3344790935516357, + "step": 15754 + }, + { + "epoch": 2.8679348320742695, + "grad_norm": 11.9375, + "learning_rate": 1.0193585620862044e-06, + "loss": 1.2970019578933716, + "step": 15756 + }, + { + "epoch": 2.8682988986984617, + "grad_norm": 10.6875, + "learning_rate": 1.0192523675410762e-06, + "loss": 1.759852647781372, + "step": 15758 + }, + { + "epoch": 2.8686629653226543, + "grad_norm": 8.0, + "learning_rate": 1.0191464636626358e-06, + "loss": 1.201621413230896, + "step": 15760 + }, + { + "epoch": 2.869027031946846, + "grad_norm": 12.4375, + "learning_rate": 1.0190408504664245e-06, + "loss": 1.4449375867843628, + "step": 15762 + }, + { + "epoch": 2.8693910985710387, + "grad_norm": 8.4375, + "learning_rate": 1.0189355279679398e-06, + "loss": 1.209423542022705, + "step": 15764 + }, + { + "epoch": 2.869755165195231, + "grad_norm": 18.375, + "learning_rate": 1.018830496182638e-06, + "loss": 1.356123685836792, + "step": 15766 + }, + { + "epoch": 2.870119231819423, + "grad_norm": 8.75, + "learning_rate": 1.018725755125932e-06, + "loss": 1.2158262729644775, + "step": 15768 + }, + { + "epoch": 2.8704832984436153, + "grad_norm": 12.125, + "learning_rate": 1.018621304813192e-06, + "loss": 1.4276056289672852, + "step": 15770 + }, + { + "epoch": 2.8708473650678075, + "grad_norm": 13.0625, + "learning_rate": 1.018517145259746e-06, + "loss": 1.4981703758239746, + "step": 15772 + }, + { + "epoch": 2.8712114316919997, + "grad_norm": 17.5, + "learning_rate": 1.018413276480878e-06, + "loss": 1.4588618278503418, + "step": 15774 + }, + { + "epoch": 2.871575498316192, + "grad_norm": 8.5625, + "learning_rate": 1.0183096984918315e-06, + "loss": 1.37358820438385, + "step": 15776 + }, + { + "epoch": 2.871939564940384, + "grad_norm": 9.4375, + "learning_rate": 1.0182064113078055e-06, + "loss": 1.370955467224121, + "step": 15778 + }, + { + "epoch": 2.8723036315645762, + "grad_norm": 7.96875, + "learning_rate": 1.0181034149439572e-06, + "loss": 1.507272481918335, + "step": 15780 + }, + { + "epoch": 2.8726676981887684, + "grad_norm": 12.0625, + "learning_rate": 1.0180007094154008e-06, + "loss": 1.3962452411651611, + "step": 15782 + }, + { + "epoch": 2.8730317648129606, + "grad_norm": 9.9375, + "learning_rate": 1.017898294737208e-06, + "loss": 2.1494193077087402, + "step": 15784 + }, + { + "epoch": 2.8733958314371533, + "grad_norm": 16.625, + "learning_rate": 1.0177961709244076e-06, + "loss": 1.1891204118728638, + "step": 15786 + }, + { + "epoch": 2.873759898061345, + "grad_norm": 16.25, + "learning_rate": 1.017694337991986e-06, + "loss": 1.8845057487487793, + "step": 15788 + }, + { + "epoch": 2.8741239646855377, + "grad_norm": 10.125, + "learning_rate": 1.0175927959548865e-06, + "loss": 1.6219645738601685, + "step": 15790 + }, + { + "epoch": 2.8744880313097294, + "grad_norm": 5.0625, + "learning_rate": 1.0174915448280105e-06, + "loss": 1.3163926601409912, + "step": 15792 + }, + { + "epoch": 2.874852097933922, + "grad_norm": 6.625, + "learning_rate": 1.0173905846262156e-06, + "loss": 1.3613336086273193, + "step": 15794 + }, + { + "epoch": 2.8752161645581142, + "grad_norm": 16.125, + "learning_rate": 1.017289915364318e-06, + "loss": 1.3420993089675903, + "step": 15796 + }, + { + "epoch": 2.8755802311823064, + "grad_norm": 9.125, + "learning_rate": 1.01718953705709e-06, + "loss": 0.9244269132614136, + "step": 15798 + }, + { + "epoch": 2.8759442978064986, + "grad_norm": 6.09375, + "learning_rate": 1.0170894497192613e-06, + "loss": 1.4073948860168457, + "step": 15800 + }, + { + "epoch": 2.876308364430691, + "grad_norm": 14.875, + "learning_rate": 1.0169896533655206e-06, + "loss": 1.3569610118865967, + "step": 15802 + }, + { + "epoch": 2.876672431054883, + "grad_norm": 21.25, + "learning_rate": 1.0168901480105113e-06, + "loss": 1.7138009071350098, + "step": 15804 + }, + { + "epoch": 2.877036497679075, + "grad_norm": 18.625, + "learning_rate": 1.016790933668836e-06, + "loss": 1.5243194103240967, + "step": 15806 + }, + { + "epoch": 2.8774005643032674, + "grad_norm": 20.75, + "learning_rate": 1.016692010355054e-06, + "loss": 1.9013592004776, + "step": 15808 + }, + { + "epoch": 2.8777646309274596, + "grad_norm": 15.6875, + "learning_rate": 1.0165933780836818e-06, + "loss": 1.7598347663879395, + "step": 15810 + }, + { + "epoch": 2.878128697551652, + "grad_norm": 7.6875, + "learning_rate": 1.0164950368691936e-06, + "loss": 1.249074101448059, + "step": 15812 + }, + { + "epoch": 2.878492764175844, + "grad_norm": 63.75, + "learning_rate": 1.0163969867260199e-06, + "loss": 1.0511736869812012, + "step": 15814 + }, + { + "epoch": 2.8788568308000366, + "grad_norm": 11.9375, + "learning_rate": 1.0162992276685497e-06, + "loss": 1.3306910991668701, + "step": 15816 + }, + { + "epoch": 2.8792208974242284, + "grad_norm": 12.1875, + "learning_rate": 1.0162017597111287e-06, + "loss": 1.8575962781906128, + "step": 15818 + }, + { + "epoch": 2.879584964048421, + "grad_norm": 8.0625, + "learning_rate": 1.0161045828680597e-06, + "loss": 0.8926464319229126, + "step": 15820 + }, + { + "epoch": 2.879949030672613, + "grad_norm": 10.625, + "learning_rate": 1.0160076971536032e-06, + "loss": 1.4648663997650146, + "step": 15822 + }, + { + "epoch": 2.8803130972968054, + "grad_norm": 4.40625, + "learning_rate": 1.0159111025819768e-06, + "loss": 0.931498646736145, + "step": 15824 + }, + { + "epoch": 2.8806771639209976, + "grad_norm": 7.3125, + "learning_rate": 1.0158147991673554e-06, + "loss": 1.1095563173294067, + "step": 15826 + }, + { + "epoch": 2.88104123054519, + "grad_norm": 7.3125, + "learning_rate": 1.0157187869238707e-06, + "loss": 1.2842364311218262, + "step": 15828 + }, + { + "epoch": 2.881405297169382, + "grad_norm": 13.75, + "learning_rate": 1.0156230658656124e-06, + "loss": 1.4239689111709595, + "step": 15830 + }, + { + "epoch": 2.881769363793574, + "grad_norm": 15.8125, + "learning_rate": 1.0155276360066275e-06, + "loss": 0.9221434593200684, + "step": 15832 + }, + { + "epoch": 2.8821334304177664, + "grad_norm": 9.375, + "learning_rate": 1.0154324973609196e-06, + "loss": 1.387025237083435, + "step": 15834 + }, + { + "epoch": 2.8824974970419586, + "grad_norm": 11.375, + "learning_rate": 1.01533764994245e-06, + "loss": 0.856332540512085, + "step": 15836 + }, + { + "epoch": 2.8828615636661508, + "grad_norm": 5.78125, + "learning_rate": 1.015243093765137e-06, + "loss": 1.2872095108032227, + "step": 15838 + }, + { + "epoch": 2.883225630290343, + "grad_norm": 16.25, + "learning_rate": 1.0151488288428564e-06, + "loss": 1.6521106958389282, + "step": 15840 + }, + { + "epoch": 2.8835896969145356, + "grad_norm": 27.75, + "learning_rate": 1.0150548551894415e-06, + "loss": 0.745322585105896, + "step": 15842 + }, + { + "epoch": 2.8839537635387273, + "grad_norm": 11.6875, + "learning_rate": 1.014961172818682e-06, + "loss": 1.750633716583252, + "step": 15844 + }, + { + "epoch": 2.88431783016292, + "grad_norm": 7.1875, + "learning_rate": 1.014867781744326e-06, + "loss": 1.5747134685516357, + "step": 15846 + }, + { + "epoch": 2.884681896787112, + "grad_norm": 14.75, + "learning_rate": 1.014774681980078e-06, + "loss": 1.9590178728103638, + "step": 15848 + }, + { + "epoch": 2.8850459634113044, + "grad_norm": 8.8125, + "learning_rate": 1.0146818735395998e-06, + "loss": 1.7739530801773071, + "step": 15850 + }, + { + "epoch": 2.8854100300354966, + "grad_norm": 4.0625, + "learning_rate": 1.0145893564365112e-06, + "loss": 1.267188310623169, + "step": 15852 + }, + { + "epoch": 2.8857740966596888, + "grad_norm": 5.84375, + "learning_rate": 1.014497130684388e-06, + "loss": 0.7861636877059937, + "step": 15854 + }, + { + "epoch": 2.886138163283881, + "grad_norm": 8.3125, + "learning_rate": 1.0144051962967645e-06, + "loss": 0.9685366153717041, + "step": 15856 + }, + { + "epoch": 2.886502229908073, + "grad_norm": 111.5, + "learning_rate": 1.0143135532871316e-06, + "loss": 1.0216107368469238, + "step": 15858 + }, + { + "epoch": 2.8868662965322653, + "grad_norm": 7.71875, + "learning_rate": 1.0142222016689372e-06, + "loss": 1.340867042541504, + "step": 15860 + }, + { + "epoch": 2.8872303631564575, + "grad_norm": 15.25, + "learning_rate": 1.0141311414555876e-06, + "loss": 1.485758662223816, + "step": 15862 + }, + { + "epoch": 2.8875944297806497, + "grad_norm": 16.875, + "learning_rate": 1.0140403726604444e-06, + "loss": 1.4802769422531128, + "step": 15864 + }, + { + "epoch": 2.887958496404842, + "grad_norm": 41.25, + "learning_rate": 1.0139498952968283e-06, + "loss": 1.6250853538513184, + "step": 15866 + }, + { + "epoch": 2.8883225630290346, + "grad_norm": 9.5, + "learning_rate": 1.0138597093780166e-06, + "loss": 1.2184441089630127, + "step": 15868 + }, + { + "epoch": 2.8886866296532263, + "grad_norm": 9.6875, + "learning_rate": 1.0137698149172428e-06, + "loss": 1.2361960411071777, + "step": 15870 + }, + { + "epoch": 2.889050696277419, + "grad_norm": 5.15625, + "learning_rate": 1.0136802119277e-06, + "loss": 1.1188724040985107, + "step": 15872 + }, + { + "epoch": 2.889414762901611, + "grad_norm": 15.125, + "learning_rate": 1.0135909004225356e-06, + "loss": 1.520588994026184, + "step": 15874 + }, + { + "epoch": 2.8897788295258033, + "grad_norm": 12.5, + "learning_rate": 1.0135018804148566e-06, + "loss": 1.1083598136901855, + "step": 15876 + }, + { + "epoch": 2.8901428961499955, + "grad_norm": 11.4375, + "learning_rate": 1.013413151917726e-06, + "loss": 1.5388174057006836, + "step": 15878 + }, + { + "epoch": 2.8905069627741877, + "grad_norm": 22.875, + "learning_rate": 1.0133247149441643e-06, + "loss": 1.6115424633026123, + "step": 15880 + }, + { + "epoch": 2.89087102939838, + "grad_norm": 6.34375, + "learning_rate": 1.0132365695071498e-06, + "loss": 1.2949028015136719, + "step": 15882 + }, + { + "epoch": 2.891235096022572, + "grad_norm": 5.78125, + "learning_rate": 1.0131487156196168e-06, + "loss": 0.806056559085846, + "step": 15884 + }, + { + "epoch": 2.8915991626467643, + "grad_norm": 18.0, + "learning_rate": 1.0130611532944578e-06, + "loss": 0.6471291780471802, + "step": 15886 + }, + { + "epoch": 2.8919632292709565, + "grad_norm": 3.6875, + "learning_rate": 1.0129738825445221e-06, + "loss": 0.920612096786499, + "step": 15888 + }, + { + "epoch": 2.8923272958951487, + "grad_norm": 8.8125, + "learning_rate": 1.0128869033826165e-06, + "loss": 1.324739694595337, + "step": 15890 + }, + { + "epoch": 2.892691362519341, + "grad_norm": 32.25, + "learning_rate": 1.012800215821505e-06, + "loss": 1.339063048362732, + "step": 15892 + }, + { + "epoch": 2.8930554291435335, + "grad_norm": 23.75, + "learning_rate": 1.012713819873908e-06, + "loss": 1.1131782531738281, + "step": 15894 + }, + { + "epoch": 2.8934194957677253, + "grad_norm": 20.25, + "learning_rate": 1.0126277155525045e-06, + "loss": 1.2030948400497437, + "step": 15896 + }, + { + "epoch": 2.893783562391918, + "grad_norm": 12.0625, + "learning_rate": 1.0125419028699293e-06, + "loss": 1.7994149923324585, + "step": 15898 + }, + { + "epoch": 2.89414762901611, + "grad_norm": 56.0, + "learning_rate": 1.0124563818387755e-06, + "loss": 1.214991807937622, + "step": 15900 + }, + { + "epoch": 2.8945116956403023, + "grad_norm": 7.5, + "learning_rate": 1.0123711524715932e-06, + "loss": 1.540924072265625, + "step": 15902 + }, + { + "epoch": 2.8948757622644945, + "grad_norm": 4.21875, + "learning_rate": 1.0122862147808888e-06, + "loss": 1.4240427017211914, + "step": 15904 + }, + { + "epoch": 2.8952398288886867, + "grad_norm": 25.625, + "learning_rate": 1.012201568779127e-06, + "loss": 1.1652765274047852, + "step": 15906 + }, + { + "epoch": 2.895603895512879, + "grad_norm": 63.0, + "learning_rate": 1.012117214478729e-06, + "loss": 1.975993037223816, + "step": 15908 + }, + { + "epoch": 2.895967962137071, + "grad_norm": 20.25, + "learning_rate": 1.0120331518920736e-06, + "loss": 1.4749317169189453, + "step": 15910 + }, + { + "epoch": 2.8963320287612633, + "grad_norm": 13.75, + "learning_rate": 1.0119493810314968e-06, + "loss": 1.105260968208313, + "step": 15912 + }, + { + "epoch": 2.8966960953854555, + "grad_norm": 9.9375, + "learning_rate": 1.0118659019092912e-06, + "loss": 1.5636228322982788, + "step": 15914 + }, + { + "epoch": 2.8970601620096477, + "grad_norm": 6.625, + "learning_rate": 1.0117827145377075e-06, + "loss": 1.3447386026382446, + "step": 15916 + }, + { + "epoch": 2.89742422863384, + "grad_norm": 10.4375, + "learning_rate": 1.0116998189289529e-06, + "loss": 1.7130296230316162, + "step": 15918 + }, + { + "epoch": 2.897788295258032, + "grad_norm": 18.625, + "learning_rate": 1.011617215095192e-06, + "loss": 1.3692744970321655, + "step": 15920 + }, + { + "epoch": 2.8981523618822242, + "grad_norm": 16.125, + "learning_rate": 1.0115349030485467e-06, + "loss": 1.5688625574111938, + "step": 15922 + }, + { + "epoch": 2.898516428506417, + "grad_norm": 5.4375, + "learning_rate": 1.0114528828010955e-06, + "loss": 1.1722689867019653, + "step": 15924 + }, + { + "epoch": 2.8988804951306086, + "grad_norm": 7.1875, + "learning_rate": 1.011371154364875e-06, + "loss": 1.241307258605957, + "step": 15926 + }, + { + "epoch": 2.8992445617548013, + "grad_norm": 18.375, + "learning_rate": 1.0112897177518786e-06, + "loss": 1.655705213546753, + "step": 15928 + }, + { + "epoch": 2.8996086283789935, + "grad_norm": 12.9375, + "learning_rate": 1.0112085729740563e-06, + "loss": 1.8025476932525635, + "step": 15930 + }, + { + "epoch": 2.8999726950031857, + "grad_norm": 6.5625, + "learning_rate": 1.0111277200433163e-06, + "loss": 1.1286834478378296, + "step": 15932 + }, + { + "epoch": 2.900336761627378, + "grad_norm": 4.96875, + "learning_rate": 1.0110471589715228e-06, + "loss": 1.2490546703338623, + "step": 15934 + }, + { + "epoch": 2.90070082825157, + "grad_norm": 13.6875, + "learning_rate": 1.0109668897704986e-06, + "loss": 1.3304779529571533, + "step": 15936 + }, + { + "epoch": 2.9010648948757622, + "grad_norm": 14.125, + "learning_rate": 1.0108869124520224e-06, + "loss": 1.341776967048645, + "step": 15938 + }, + { + "epoch": 2.9014289614999544, + "grad_norm": 24.5, + "learning_rate": 1.0108072270278305e-06, + "loss": 1.9102349281311035, + "step": 15940 + }, + { + "epoch": 2.9017930281241466, + "grad_norm": 11.375, + "learning_rate": 1.0107278335096167e-06, + "loss": 0.9950995445251465, + "step": 15942 + }, + { + "epoch": 2.902157094748339, + "grad_norm": 20.75, + "learning_rate": 1.0106487319090313e-06, + "loss": 1.0460076332092285, + "step": 15944 + }, + { + "epoch": 2.902521161372531, + "grad_norm": 10.5, + "learning_rate": 1.0105699222376826e-06, + "loss": 1.4198015928268433, + "step": 15946 + }, + { + "epoch": 2.902885227996723, + "grad_norm": 44.5, + "learning_rate": 1.0104914045071352e-06, + "loss": 1.555677890777588, + "step": 15948 + }, + { + "epoch": 2.903249294620916, + "grad_norm": 6.09375, + "learning_rate": 1.0104131787289113e-06, + "loss": 1.1894680261611938, + "step": 15950 + }, + { + "epoch": 2.9036133612451076, + "grad_norm": 17.875, + "learning_rate": 1.0103352449144905e-06, + "loss": 1.3933613300323486, + "step": 15952 + }, + { + "epoch": 2.9039774278693002, + "grad_norm": 13.5625, + "learning_rate": 1.0102576030753092e-06, + "loss": 1.3102720975875854, + "step": 15954 + }, + { + "epoch": 2.9043414944934924, + "grad_norm": 13.6875, + "learning_rate": 1.0101802532227607e-06, + "loss": 0.9838279485702515, + "step": 15956 + }, + { + "epoch": 2.9047055611176846, + "grad_norm": 16.5, + "learning_rate": 1.010103195368196e-06, + "loss": 1.1908875703811646, + "step": 15958 + }, + { + "epoch": 2.905069627741877, + "grad_norm": 32.5, + "learning_rate": 1.0100264295229233e-06, + "loss": 1.504979133605957, + "step": 15960 + }, + { + "epoch": 2.905433694366069, + "grad_norm": 16.75, + "learning_rate": 1.0099499556982073e-06, + "loss": 1.5941108465194702, + "step": 15962 + }, + { + "epoch": 2.905797760990261, + "grad_norm": 8.6875, + "learning_rate": 1.0098737739052702e-06, + "loss": 1.4507976770401, + "step": 15964 + }, + { + "epoch": 2.9061618276144534, + "grad_norm": 11.375, + "learning_rate": 1.0097978841552916e-06, + "loss": 1.430513620376587, + "step": 15966 + }, + { + "epoch": 2.9065258942386456, + "grad_norm": 26.625, + "learning_rate": 1.009722286459408e-06, + "loss": 1.370814323425293, + "step": 15968 + }, + { + "epoch": 2.906889960862838, + "grad_norm": 14.0625, + "learning_rate": 1.0096469808287129e-06, + "loss": 1.3203754425048828, + "step": 15970 + }, + { + "epoch": 2.90725402748703, + "grad_norm": 19.0, + "learning_rate": 1.009571967274257e-06, + "loss": 2.059998035430908, + "step": 15972 + }, + { + "epoch": 2.907618094111222, + "grad_norm": 15.1875, + "learning_rate": 1.0094972458070484e-06, + "loss": 1.158681035041809, + "step": 15974 + }, + { + "epoch": 2.907982160735415, + "grad_norm": 15.5, + "learning_rate": 1.0094228164380526e-06, + "loss": 1.795695185661316, + "step": 15976 + }, + { + "epoch": 2.9083462273596066, + "grad_norm": 15.75, + "learning_rate": 1.009348679178191e-06, + "loss": 1.4717373847961426, + "step": 15978 + }, + { + "epoch": 2.908710293983799, + "grad_norm": 13.0, + "learning_rate": 1.0092748340383435e-06, + "loss": 1.9813203811645508, + "step": 15980 + }, + { + "epoch": 2.9090743606079914, + "grad_norm": 12.8125, + "learning_rate": 1.0092012810293464e-06, + "loss": 1.4742377996444702, + "step": 15982 + }, + { + "epoch": 2.9094384272321836, + "grad_norm": 19.875, + "learning_rate": 1.0091280201619931e-06, + "loss": 1.4038236141204834, + "step": 15984 + }, + { + "epoch": 2.909802493856376, + "grad_norm": 40.75, + "learning_rate": 1.0090550514470349e-06, + "loss": 0.9446954727172852, + "step": 15986 + }, + { + "epoch": 2.910166560480568, + "grad_norm": 37.25, + "learning_rate": 1.0089823748951792e-06, + "loss": 1.0276025533676147, + "step": 15988 + }, + { + "epoch": 2.91053062710476, + "grad_norm": 17.0, + "learning_rate": 1.0089099905170908e-06, + "loss": 0.5614534020423889, + "step": 15990 + }, + { + "epoch": 2.9108946937289524, + "grad_norm": 7.59375, + "learning_rate": 1.0088378983233921e-06, + "loss": 1.460458755493164, + "step": 15992 + }, + { + "epoch": 2.9112587603531446, + "grad_norm": 34.5, + "learning_rate": 1.0087660983246627e-06, + "loss": 1.9154887199401855, + "step": 15994 + }, + { + "epoch": 2.9116228269773368, + "grad_norm": 11.0, + "learning_rate": 1.0086945905314385e-06, + "loss": 1.2142599821090698, + "step": 15996 + }, + { + "epoch": 2.911986893601529, + "grad_norm": 16.25, + "learning_rate": 1.0086233749542132e-06, + "loss": 1.4964649677276611, + "step": 15998 + }, + { + "epoch": 2.912350960225721, + "grad_norm": 7.53125, + "learning_rate": 1.0085524516034368e-06, + "loss": 1.079470157623291, + "step": 16000 + }, + { + "epoch": 2.912715026849914, + "grad_norm": 7.59375, + "learning_rate": 1.008481820489518e-06, + "loss": 1.4527021646499634, + "step": 16002 + }, + { + "epoch": 2.9130790934741055, + "grad_norm": 7.59375, + "learning_rate": 1.0084114816228208e-06, + "loss": 1.2002918720245361, + "step": 16004 + }, + { + "epoch": 2.913443160098298, + "grad_norm": 5.625, + "learning_rate": 1.0083414350136677e-06, + "loss": 1.3399319648742676, + "step": 16006 + }, + { + "epoch": 2.9138072267224904, + "grad_norm": 8.625, + "learning_rate": 1.0082716806723374e-06, + "loss": 1.4220551252365112, + "step": 16008 + }, + { + "epoch": 2.9141712933466826, + "grad_norm": 10.6875, + "learning_rate": 1.0082022186090664e-06, + "loss": 1.491031289100647, + "step": 16010 + }, + { + "epoch": 2.9145353599708748, + "grad_norm": 10.4375, + "learning_rate": 1.0081330488340475e-06, + "loss": 1.3106815814971924, + "step": 16012 + }, + { + "epoch": 2.914899426595067, + "grad_norm": 6.84375, + "learning_rate": 1.0080641713574313e-06, + "loss": 1.2100751399993896, + "step": 16014 + }, + { + "epoch": 2.915263493219259, + "grad_norm": 60.25, + "learning_rate": 1.0079955861893256e-06, + "loss": 1.539379596710205, + "step": 16016 + }, + { + "epoch": 2.9156275598434513, + "grad_norm": 32.25, + "learning_rate": 1.0079272933397948e-06, + "loss": 1.5998427867889404, + "step": 16018 + }, + { + "epoch": 2.9159916264676435, + "grad_norm": 65.0, + "learning_rate": 1.0078592928188603e-06, + "loss": 1.5707483291625977, + "step": 16020 + }, + { + "epoch": 2.9163556930918357, + "grad_norm": 9.0, + "learning_rate": 1.0077915846365013e-06, + "loss": 1.5759389400482178, + "step": 16022 + }, + { + "epoch": 2.916719759716028, + "grad_norm": 15.9375, + "learning_rate": 1.0077241688026534e-06, + "loss": 1.4107835292816162, + "step": 16024 + }, + { + "epoch": 2.91708382634022, + "grad_norm": 19.75, + "learning_rate": 1.0076570453272097e-06, + "loss": 1.554302453994751, + "step": 16026 + }, + { + "epoch": 2.9174478929644128, + "grad_norm": 8.875, + "learning_rate": 1.0075902142200206e-06, + "loss": 1.4295486211776733, + "step": 16028 + }, + { + "epoch": 2.9178119595886045, + "grad_norm": 5.8125, + "learning_rate": 1.0075236754908925e-06, + "loss": 1.2804522514343262, + "step": 16030 + }, + { + "epoch": 2.918176026212797, + "grad_norm": 8.6875, + "learning_rate": 1.0074574291495908e-06, + "loss": 0.9572818279266357, + "step": 16032 + }, + { + "epoch": 2.918540092836989, + "grad_norm": 15.75, + "learning_rate": 1.0073914752058362e-06, + "loss": 1.9047865867614746, + "step": 16034 + }, + { + "epoch": 2.9189041594611815, + "grad_norm": 23.875, + "learning_rate": 1.0073258136693072e-06, + "loss": 1.2155386209487915, + "step": 16036 + }, + { + "epoch": 2.9192682260853737, + "grad_norm": 4.03125, + "learning_rate": 1.0072604445496392e-06, + "loss": 0.8650633096694946, + "step": 16038 + }, + { + "epoch": 2.919632292709566, + "grad_norm": 6.21875, + "learning_rate": 1.0071953678564254e-06, + "loss": 1.579742193222046, + "step": 16040 + }, + { + "epoch": 2.919996359333758, + "grad_norm": 6.03125, + "learning_rate": 1.0071305835992152e-06, + "loss": 0.9558074474334717, + "step": 16042 + }, + { + "epoch": 2.9203604259579503, + "grad_norm": 10.25, + "learning_rate": 1.0070660917875153e-06, + "loss": 1.2999343872070312, + "step": 16044 + }, + { + "epoch": 2.9207244925821425, + "grad_norm": 5.8125, + "learning_rate": 1.0070018924307899e-06, + "loss": 1.242353081703186, + "step": 16046 + }, + { + "epoch": 2.9210885592063347, + "grad_norm": 9.625, + "learning_rate": 1.0069379855384598e-06, + "loss": 1.256218433380127, + "step": 16048 + }, + { + "epoch": 2.921452625830527, + "grad_norm": 3.875, + "learning_rate": 1.0068743711199032e-06, + "loss": 1.217787265777588, + "step": 16050 + }, + { + "epoch": 2.921816692454719, + "grad_norm": 6.9375, + "learning_rate": 1.0068110491844552e-06, + "loss": 1.188781499862671, + "step": 16052 + }, + { + "epoch": 2.9221807590789113, + "grad_norm": 13.8125, + "learning_rate": 1.0067480197414082e-06, + "loss": 1.4043807983398438, + "step": 16054 + }, + { + "epoch": 2.9225448257031035, + "grad_norm": 25.125, + "learning_rate": 1.0066852828000112e-06, + "loss": 1.4675672054290771, + "step": 16056 + }, + { + "epoch": 2.922908892327296, + "grad_norm": 13.5, + "learning_rate": 1.0066228383694708e-06, + "loss": 1.5992004871368408, + "step": 16058 + }, + { + "epoch": 2.923272958951488, + "grad_norm": 13.6875, + "learning_rate": 1.0065606864589501e-06, + "loss": 0.5050632953643799, + "step": 16060 + }, + { + "epoch": 2.9236370255756805, + "grad_norm": 13.375, + "learning_rate": 1.0064988270775705e-06, + "loss": 1.3865153789520264, + "step": 16062 + }, + { + "epoch": 2.9240010921998727, + "grad_norm": 9.375, + "learning_rate": 1.0064372602344086e-06, + "loss": 1.027376651763916, + "step": 16064 + }, + { + "epoch": 2.924365158824065, + "grad_norm": 4.5625, + "learning_rate": 1.0063759859384998e-06, + "loss": 0.9757024645805359, + "step": 16066 + }, + { + "epoch": 2.924729225448257, + "grad_norm": 7.5625, + "learning_rate": 1.0063150041988357e-06, + "loss": 1.3136813640594482, + "step": 16068 + }, + { + "epoch": 2.9250932920724493, + "grad_norm": 9.0625, + "learning_rate": 1.0062543150243647e-06, + "loss": 1.339347243309021, + "step": 16070 + }, + { + "epoch": 2.9254573586966415, + "grad_norm": 7.46875, + "learning_rate": 1.0061939184239933e-06, + "loss": 1.0517849922180176, + "step": 16072 + }, + { + "epoch": 2.9258214253208337, + "grad_norm": 9.625, + "learning_rate": 1.0061338144065843e-06, + "loss": 1.1386849880218506, + "step": 16074 + }, + { + "epoch": 2.926185491945026, + "grad_norm": 12.625, + "learning_rate": 1.0060740029809575e-06, + "loss": 1.4157938957214355, + "step": 16076 + }, + { + "epoch": 2.926549558569218, + "grad_norm": 13.75, + "learning_rate": 1.00601448415589e-06, + "loss": 1.2470686435699463, + "step": 16078 + }, + { + "epoch": 2.9269136251934103, + "grad_norm": 14.0625, + "learning_rate": 1.0059552579401157e-06, + "loss": 1.7663663625717163, + "step": 16080 + }, + { + "epoch": 2.9272776918176024, + "grad_norm": 10.0, + "learning_rate": 1.0058963243423267e-06, + "loss": 1.9873912334442139, + "step": 16082 + }, + { + "epoch": 2.927641758441795, + "grad_norm": 6.78125, + "learning_rate": 1.0058376833711702e-06, + "loss": 1.3792403936386108, + "step": 16084 + }, + { + "epoch": 2.928005825065987, + "grad_norm": 17.875, + "learning_rate": 1.0057793350352525e-06, + "loss": 1.184129238128662, + "step": 16086 + }, + { + "epoch": 2.9283698916901795, + "grad_norm": 5.78125, + "learning_rate": 1.0057212793431356e-06, + "loss": 1.0965029001235962, + "step": 16088 + }, + { + "epoch": 2.9287339583143717, + "grad_norm": 13.1875, + "learning_rate": 1.0056635163033386e-06, + "loss": 1.2313885688781738, + "step": 16090 + }, + { + "epoch": 2.929098024938564, + "grad_norm": 15.75, + "learning_rate": 1.005606045924338e-06, + "loss": 1.3645853996276855, + "step": 16092 + }, + { + "epoch": 2.929462091562756, + "grad_norm": 17.25, + "learning_rate": 1.0055488682145678e-06, + "loss": 1.8306810855865479, + "step": 16094 + }, + { + "epoch": 2.9298261581869482, + "grad_norm": 6.21875, + "learning_rate": 1.0054919831824183e-06, + "loss": 1.362161636352539, + "step": 16096 + }, + { + "epoch": 2.9301902248111404, + "grad_norm": 4.5625, + "learning_rate": 1.0054353908362375e-06, + "loss": 1.3486385345458984, + "step": 16098 + }, + { + "epoch": 2.9305542914353326, + "grad_norm": 11.3125, + "learning_rate": 1.0053790911843296e-06, + "loss": 1.1928051710128784, + "step": 16100 + }, + { + "epoch": 2.930918358059525, + "grad_norm": 8.375, + "learning_rate": 1.0053230842349566e-06, + "loss": 0.9413809776306152, + "step": 16102 + }, + { + "epoch": 2.931282424683717, + "grad_norm": 13.0, + "learning_rate": 1.0052673699963374e-06, + "loss": 1.3937246799468994, + "step": 16104 + }, + { + "epoch": 2.931646491307909, + "grad_norm": 14.125, + "learning_rate": 1.0052119484766475e-06, + "loss": 1.998185634613037, + "step": 16106 + }, + { + "epoch": 2.9320105579321014, + "grad_norm": 5.46875, + "learning_rate": 1.0051568196840203e-06, + "loss": 1.1789721250534058, + "step": 16108 + }, + { + "epoch": 2.932374624556294, + "grad_norm": 12.875, + "learning_rate": 1.0051019836265452e-06, + "loss": 1.2565393447875977, + "step": 16110 + }, + { + "epoch": 2.932738691180486, + "grad_norm": 8.125, + "learning_rate": 1.0050474403122695e-06, + "loss": 1.640028953552246, + "step": 16112 + }, + { + "epoch": 2.9331027578046784, + "grad_norm": 17.75, + "learning_rate": 1.004993189749197e-06, + "loss": 1.1269093751907349, + "step": 16114 + }, + { + "epoch": 2.9334668244288706, + "grad_norm": 5.8125, + "learning_rate": 1.0049392319452888e-06, + "loss": 1.2945148944854736, + "step": 16116 + }, + { + "epoch": 2.933830891053063, + "grad_norm": 7.65625, + "learning_rate": 1.0048855669084632e-06, + "loss": 0.9564671516418457, + "step": 16118 + }, + { + "epoch": 2.934194957677255, + "grad_norm": 11.125, + "learning_rate": 1.004832194646595e-06, + "loss": 1.4331605434417725, + "step": 16120 + }, + { + "epoch": 2.934559024301447, + "grad_norm": 8.625, + "learning_rate": 1.0047791151675167e-06, + "loss": 1.063494324684143, + "step": 16122 + }, + { + "epoch": 2.9349230909256394, + "grad_norm": 12.75, + "learning_rate": 1.0047263284790171e-06, + "loss": 1.5241312980651855, + "step": 16124 + }, + { + "epoch": 2.9352871575498316, + "grad_norm": 35.25, + "learning_rate": 1.004673834588843e-06, + "loss": 1.893446683883667, + "step": 16126 + }, + { + "epoch": 2.935651224174024, + "grad_norm": 23.5, + "learning_rate": 1.004621633504697e-06, + "loss": 1.8110666275024414, + "step": 16128 + }, + { + "epoch": 2.936015290798216, + "grad_norm": 8.9375, + "learning_rate": 1.0045697252342396e-06, + "loss": 1.48243248462677, + "step": 16130 + }, + { + "epoch": 2.936379357422408, + "grad_norm": 13.3125, + "learning_rate": 1.0045181097850886e-06, + "loss": 1.653139352798462, + "step": 16132 + }, + { + "epoch": 2.9367434240466004, + "grad_norm": 12.25, + "learning_rate": 1.004466787164818e-06, + "loss": 1.4307513236999512, + "step": 16134 + }, + { + "epoch": 2.937107490670793, + "grad_norm": 5.34375, + "learning_rate": 1.0044157573809594e-06, + "loss": 0.9555176496505737, + "step": 16136 + }, + { + "epoch": 2.9374715572949848, + "grad_norm": 4.78125, + "learning_rate": 1.0043650204410005e-06, + "loss": 0.9934687614440918, + "step": 16138 + }, + { + "epoch": 2.9378356239191774, + "grad_norm": 6.875, + "learning_rate": 1.0043145763523875e-06, + "loss": 1.3215734958648682, + "step": 16140 + }, + { + "epoch": 2.9381996905433696, + "grad_norm": 10.25, + "learning_rate": 1.0042644251225226e-06, + "loss": 1.3117142915725708, + "step": 16142 + }, + { + "epoch": 2.938563757167562, + "grad_norm": 10.375, + "learning_rate": 1.004214566758765e-06, + "loss": 1.7948687076568604, + "step": 16144 + }, + { + "epoch": 2.938927823791754, + "grad_norm": 7.59375, + "learning_rate": 1.004165001268432e-06, + "loss": 1.3431758880615234, + "step": 16146 + }, + { + "epoch": 2.939291890415946, + "grad_norm": 18.75, + "learning_rate": 1.0041157286587965e-06, + "loss": 1.3410488367080688, + "step": 16148 + }, + { + "epoch": 2.9396559570401384, + "grad_norm": 21.625, + "learning_rate": 1.004066748937089e-06, + "loss": 1.3845548629760742, + "step": 16150 + }, + { + "epoch": 2.9400200236643306, + "grad_norm": 22.125, + "learning_rate": 1.0040180621104973e-06, + "loss": 1.2215200662612915, + "step": 16152 + }, + { + "epoch": 2.9403840902885228, + "grad_norm": 10.125, + "learning_rate": 1.0039696681861661e-06, + "loss": 1.0278096199035645, + "step": 16154 + }, + { + "epoch": 2.940748156912715, + "grad_norm": 16.25, + "learning_rate": 1.0039215671711972e-06, + "loss": 1.5213755369186401, + "step": 16156 + }, + { + "epoch": 2.941112223536907, + "grad_norm": 20.75, + "learning_rate": 1.0038737590726484e-06, + "loss": 1.7307862043380737, + "step": 16158 + }, + { + "epoch": 2.9414762901610993, + "grad_norm": 68.5, + "learning_rate": 1.003826243897536e-06, + "loss": 1.465663194656372, + "step": 16160 + }, + { + "epoch": 2.9418403567852915, + "grad_norm": 9.6875, + "learning_rate": 1.0037790216528327e-06, + "loss": 1.5341862440109253, + "step": 16162 + }, + { + "epoch": 2.9422044234094837, + "grad_norm": 7.5625, + "learning_rate": 1.003732092345468e-06, + "loss": 1.1823664903640747, + "step": 16164 + }, + { + "epoch": 2.9425684900336764, + "grad_norm": 8.5625, + "learning_rate": 1.0036854559823283e-06, + "loss": 1.4220274686813354, + "step": 16166 + }, + { + "epoch": 2.942932556657868, + "grad_norm": 12.125, + "learning_rate": 1.0036391125702577e-06, + "loss": 1.408041000366211, + "step": 16168 + }, + { + "epoch": 2.9432966232820608, + "grad_norm": 11.0, + "learning_rate": 1.0035930621160571e-06, + "loss": 1.4193745851516724, + "step": 16170 + }, + { + "epoch": 2.943660689906253, + "grad_norm": 17.75, + "learning_rate": 1.0035473046264834e-06, + "loss": 1.0945427417755127, + "step": 16172 + }, + { + "epoch": 2.944024756530445, + "grad_norm": 19.5, + "learning_rate": 1.0035018401082522e-06, + "loss": 0.9122292995452881, + "step": 16174 + }, + { + "epoch": 2.9443888231546373, + "grad_norm": 16.75, + "learning_rate": 1.0034566685680346e-06, + "loss": 1.3807919025421143, + "step": 16176 + }, + { + "epoch": 2.9447528897788295, + "grad_norm": 7.625, + "learning_rate": 1.0034117900124598e-06, + "loss": 1.3316371440887451, + "step": 16178 + }, + { + "epoch": 2.9451169564030217, + "grad_norm": 7.75, + "learning_rate": 1.0033672044481133e-06, + "loss": 1.2266490459442139, + "step": 16180 + }, + { + "epoch": 2.945481023027214, + "grad_norm": 10.5625, + "learning_rate": 1.0033229118815379e-06, + "loss": 1.5142892599105835, + "step": 16182 + }, + { + "epoch": 2.945845089651406, + "grad_norm": 14.125, + "learning_rate": 1.0032789123192335e-06, + "loss": 1.4251207113265991, + "step": 16184 + }, + { + "epoch": 2.9462091562755983, + "grad_norm": 13.1875, + "learning_rate": 1.0032352057676567e-06, + "loss": 1.6307628154754639, + "step": 16186 + }, + { + "epoch": 2.9465732228997905, + "grad_norm": 21.125, + "learning_rate": 1.003191792233221e-06, + "loss": 1.972518801689148, + "step": 16188 + }, + { + "epoch": 2.9469372895239827, + "grad_norm": 23.125, + "learning_rate": 1.0031486717222976e-06, + "loss": 1.2268198728561401, + "step": 16190 + }, + { + "epoch": 2.9473013561481753, + "grad_norm": 31.875, + "learning_rate": 1.003105844241214e-06, + "loss": 1.3296505212783813, + "step": 16192 + }, + { + "epoch": 2.947665422772367, + "grad_norm": 10.6875, + "learning_rate": 1.0030633097962552e-06, + "loss": 1.8601598739624023, + "step": 16194 + }, + { + "epoch": 2.9480294893965597, + "grad_norm": 8.9375, + "learning_rate": 1.0030210683936627e-06, + "loss": 1.3801432847976685, + "step": 16196 + }, + { + "epoch": 2.948393556020752, + "grad_norm": 6.03125, + "learning_rate": 1.0029791200396355e-06, + "loss": 0.9664236903190613, + "step": 16198 + }, + { + "epoch": 2.948757622644944, + "grad_norm": 11.375, + "learning_rate": 1.002937464740329e-06, + "loss": 1.6388170719146729, + "step": 16200 + }, + { + "epoch": 2.9491216892691363, + "grad_norm": 8.875, + "learning_rate": 1.0028961025018564e-06, + "loss": 1.5815627574920654, + "step": 16202 + }, + { + "epoch": 2.9494857558933285, + "grad_norm": 12.4375, + "learning_rate": 1.0028550333302872e-06, + "loss": 1.4404516220092773, + "step": 16204 + }, + { + "epoch": 2.9498498225175207, + "grad_norm": 40.75, + "learning_rate": 1.002814257231648e-06, + "loss": 1.1763116121292114, + "step": 16206 + }, + { + "epoch": 2.950213889141713, + "grad_norm": 23.5, + "learning_rate": 1.0027737742119227e-06, + "loss": 0.905163049697876, + "step": 16208 + }, + { + "epoch": 2.950577955765905, + "grad_norm": 31.0, + "learning_rate": 1.002733584277052e-06, + "loss": 1.0610320568084717, + "step": 16210 + }, + { + "epoch": 2.9509420223900973, + "grad_norm": 4.4375, + "learning_rate": 1.0026936874329336e-06, + "loss": 0.9330928325653076, + "step": 16212 + }, + { + "epoch": 2.9513060890142895, + "grad_norm": 8.375, + "learning_rate": 1.002654083685422e-06, + "loss": 1.1678614616394043, + "step": 16214 + }, + { + "epoch": 2.9516701556384817, + "grad_norm": 19.125, + "learning_rate": 1.0026147730403294e-06, + "loss": 1.4060074090957642, + "step": 16216 + }, + { + "epoch": 2.9520342222626743, + "grad_norm": 9.625, + "learning_rate": 1.0025757555034238e-06, + "loss": 1.4334461688995361, + "step": 16218 + }, + { + "epoch": 2.952398288886866, + "grad_norm": 7.96875, + "learning_rate": 1.0025370310804316e-06, + "loss": 1.3008579015731812, + "step": 16220 + }, + { + "epoch": 2.9527623555110587, + "grad_norm": 24.0, + "learning_rate": 1.002498599777035e-06, + "loss": 1.5103909969329834, + "step": 16222 + }, + { + "epoch": 2.953126422135251, + "grad_norm": 12.4375, + "learning_rate": 1.0024604615988734e-06, + "loss": 2.1457014083862305, + "step": 16224 + }, + { + "epoch": 2.953490488759443, + "grad_norm": 11.375, + "learning_rate": 1.002422616551544e-06, + "loss": 1.4554740190505981, + "step": 16226 + }, + { + "epoch": 2.9538545553836353, + "grad_norm": 9.5625, + "learning_rate": 1.0023850646406002e-06, + "loss": 1.5656893253326416, + "step": 16228 + }, + { + "epoch": 2.9542186220078275, + "grad_norm": 12.0625, + "learning_rate": 1.0023478058715524e-06, + "loss": 1.4388329982757568, + "step": 16230 + }, + { + "epoch": 2.9545826886320197, + "grad_norm": 9.1875, + "learning_rate": 1.0023108402498684e-06, + "loss": 1.5340616703033447, + "step": 16232 + }, + { + "epoch": 2.954946755256212, + "grad_norm": 8.75, + "learning_rate": 1.0022741677809728e-06, + "loss": 1.5657023191452026, + "step": 16234 + }, + { + "epoch": 2.955310821880404, + "grad_norm": 8.4375, + "learning_rate": 1.0022377884702468e-06, + "loss": 1.2961970567703247, + "step": 16236 + }, + { + "epoch": 2.9556748885045963, + "grad_norm": 9.6875, + "learning_rate": 1.002201702323029e-06, + "loss": 0.9746873378753662, + "step": 16238 + }, + { + "epoch": 2.9560389551287884, + "grad_norm": 13.6875, + "learning_rate": 1.0021659093446152e-06, + "loss": 1.7138454914093018, + "step": 16240 + }, + { + "epoch": 2.9564030217529806, + "grad_norm": 14.125, + "learning_rate": 1.002130409540258e-06, + "loss": 1.7269600629806519, + "step": 16242 + }, + { + "epoch": 2.9567670883771733, + "grad_norm": 16.75, + "learning_rate": 1.002095202915166e-06, + "loss": 1.586848497390747, + "step": 16244 + }, + { + "epoch": 2.957131155001365, + "grad_norm": 76.0, + "learning_rate": 1.0020602894745063e-06, + "loss": 1.769135594367981, + "step": 16246 + }, + { + "epoch": 2.9574952216255577, + "grad_norm": 20.0, + "learning_rate": 1.0020256692234023e-06, + "loss": 1.4723286628723145, + "step": 16248 + }, + { + "epoch": 2.95785928824975, + "grad_norm": 18.75, + "learning_rate": 1.0019913421669344e-06, + "loss": 1.0270074605941772, + "step": 16250 + }, + { + "epoch": 2.958223354873942, + "grad_norm": 20.875, + "learning_rate": 1.0019573083101397e-06, + "loss": 1.5859978199005127, + "step": 16252 + }, + { + "epoch": 2.9585874214981343, + "grad_norm": 10.125, + "learning_rate": 1.0019235676580124e-06, + "loss": 1.4734545946121216, + "step": 16254 + }, + { + "epoch": 2.9589514881223264, + "grad_norm": 10.3125, + "learning_rate": 1.0018901202155043e-06, + "loss": 1.4765737056732178, + "step": 16256 + }, + { + "epoch": 2.9593155547465186, + "grad_norm": 3.265625, + "learning_rate": 1.0018569659875233e-06, + "loss": 1.059638500213623, + "step": 16258 + }, + { + "epoch": 2.959679621370711, + "grad_norm": 9.5, + "learning_rate": 1.001824104978935e-06, + "loss": 1.0306956768035889, + "step": 16260 + }, + { + "epoch": 2.960043687994903, + "grad_norm": 27.75, + "learning_rate": 1.0017915371945611e-06, + "loss": 1.518880844116211, + "step": 16262 + }, + { + "epoch": 2.960407754619095, + "grad_norm": 16.375, + "learning_rate": 1.0017592626391813e-06, + "loss": 1.4115920066833496, + "step": 16264 + }, + { + "epoch": 2.9607718212432874, + "grad_norm": 16.5, + "learning_rate": 1.0017272813175315e-06, + "loss": 1.361602544784546, + "step": 16266 + }, + { + "epoch": 2.9611358878674796, + "grad_norm": 34.25, + "learning_rate": 1.0016955932343049e-06, + "loss": 1.4944262504577637, + "step": 16268 + }, + { + "epoch": 2.9614999544916722, + "grad_norm": 7.71875, + "learning_rate": 1.0016641983941513e-06, + "loss": 1.5077896118164062, + "step": 16270 + }, + { + "epoch": 2.961864021115864, + "grad_norm": 8.5, + "learning_rate": 1.0016330968016784e-06, + "loss": 1.0913604497909546, + "step": 16272 + }, + { + "epoch": 2.9622280877400566, + "grad_norm": 28.0, + "learning_rate": 1.0016022884614495e-06, + "loss": 1.1831109523773193, + "step": 16274 + }, + { + "epoch": 2.9625921543642484, + "grad_norm": 5.46875, + "learning_rate": 1.0015717733779864e-06, + "loss": 1.3958098888397217, + "step": 16276 + }, + { + "epoch": 2.962956220988441, + "grad_norm": 43.25, + "learning_rate": 1.0015415515557666e-06, + "loss": 0.8568093776702881, + "step": 16278 + }, + { + "epoch": 2.963320287612633, + "grad_norm": 13.125, + "learning_rate": 1.001511622999225e-06, + "loss": 1.001417636871338, + "step": 16280 + }, + { + "epoch": 2.9636843542368254, + "grad_norm": 16.625, + "learning_rate": 1.001481987712753e-06, + "loss": 1.5871542692184448, + "step": 16282 + }, + { + "epoch": 2.9640484208610176, + "grad_norm": 21.625, + "learning_rate": 1.0014526457007007e-06, + "loss": 1.882049322128296, + "step": 16284 + }, + { + "epoch": 2.96441248748521, + "grad_norm": 7.0, + "learning_rate": 1.001423596967373e-06, + "loss": 0.986940860748291, + "step": 16286 + }, + { + "epoch": 2.964776554109402, + "grad_norm": 47.25, + "learning_rate": 1.001394841517033e-06, + "loss": 1.0480201244354248, + "step": 16288 + }, + { + "epoch": 2.965140620733594, + "grad_norm": 117.0, + "learning_rate": 1.0013663793539003e-06, + "loss": 1.4762556552886963, + "step": 16290 + }, + { + "epoch": 2.9655046873577864, + "grad_norm": 8.875, + "learning_rate": 1.0013382104821517e-06, + "loss": 1.1868630647659302, + "step": 16292 + }, + { + "epoch": 2.9658687539819786, + "grad_norm": 37.25, + "learning_rate": 1.0013103349059209e-06, + "loss": 1.320264220237732, + "step": 16294 + }, + { + "epoch": 2.9662328206061708, + "grad_norm": 30.25, + "learning_rate": 1.0012827526292984e-06, + "loss": 1.5974578857421875, + "step": 16296 + }, + { + "epoch": 2.966596887230363, + "grad_norm": 11.5, + "learning_rate": 1.0012554636563317e-06, + "loss": 1.7325783967971802, + "step": 16298 + }, + { + "epoch": 2.9669609538545556, + "grad_norm": 13.3125, + "learning_rate": 1.0012284679910257e-06, + "loss": 1.1121164560317993, + "step": 16300 + }, + { + "epoch": 2.9673250204787474, + "grad_norm": 11.9375, + "learning_rate": 1.0012017656373417e-06, + "loss": 1.4275696277618408, + "step": 16302 + }, + { + "epoch": 2.96768908710294, + "grad_norm": 14.8125, + "learning_rate": 1.0011753565991983e-06, + "loss": 1.3188142776489258, + "step": 16304 + }, + { + "epoch": 2.968053153727132, + "grad_norm": 13.625, + "learning_rate": 1.0011492408804704e-06, + "loss": 1.5391064882278442, + "step": 16306 + }, + { + "epoch": 2.9684172203513244, + "grad_norm": 8.125, + "learning_rate": 1.0011234184849912e-06, + "loss": 1.4445359706878662, + "step": 16308 + }, + { + "epoch": 2.9687812869755166, + "grad_norm": 8.0625, + "learning_rate": 1.0010978894165493e-06, + "loss": 1.1950385570526123, + "step": 16310 + }, + { + "epoch": 2.9691453535997088, + "grad_norm": 16.625, + "learning_rate": 1.0010726536788912e-06, + "loss": 1.749969244003296, + "step": 16312 + }, + { + "epoch": 2.969509420223901, + "grad_norm": 11.9375, + "learning_rate": 1.0010477112757206e-06, + "loss": 1.4159072637557983, + "step": 16314 + }, + { + "epoch": 2.969873486848093, + "grad_norm": 17.5, + "learning_rate": 1.0010230622106972e-06, + "loss": 1.3043038845062256, + "step": 16316 + }, + { + "epoch": 2.9702375534722854, + "grad_norm": 7.96875, + "learning_rate": 1.0009987064874382e-06, + "loss": 1.3582814931869507, + "step": 16318 + }, + { + "epoch": 2.9706016200964775, + "grad_norm": 24.625, + "learning_rate": 1.000974644109518e-06, + "loss": 1.5314178466796875, + "step": 16320 + }, + { + "epoch": 2.9709656867206697, + "grad_norm": 29.375, + "learning_rate": 1.000950875080467e-06, + "loss": 1.9022207260131836, + "step": 16322 + }, + { + "epoch": 2.971329753344862, + "grad_norm": 15.875, + "learning_rate": 1.0009273994037738e-06, + "loss": 1.324299931526184, + "step": 16324 + }, + { + "epoch": 2.9716938199690546, + "grad_norm": 12.625, + "learning_rate": 1.0009042170828834e-06, + "loss": 1.4956713914871216, + "step": 16326 + }, + { + "epoch": 2.9720578865932463, + "grad_norm": 11.5625, + "learning_rate": 1.0008813281211973e-06, + "loss": 1.8363910913467407, + "step": 16328 + }, + { + "epoch": 2.972421953217439, + "grad_norm": 29.625, + "learning_rate": 1.0008587325220747e-06, + "loss": 1.766657829284668, + "step": 16330 + }, + { + "epoch": 2.972786019841631, + "grad_norm": 16.25, + "learning_rate": 1.0008364302888315e-06, + "loss": 1.0280463695526123, + "step": 16332 + }, + { + "epoch": 2.9731500864658233, + "grad_norm": 178.0, + "learning_rate": 1.0008144214247401e-06, + "loss": 1.1512819528579712, + "step": 16334 + }, + { + "epoch": 2.9735141530900155, + "grad_norm": 8.1875, + "learning_rate": 1.00079270593303e-06, + "loss": 1.5159026384353638, + "step": 16336 + }, + { + "epoch": 2.9738782197142077, + "grad_norm": 15.1875, + "learning_rate": 1.0007712838168887e-06, + "loss": 1.400529146194458, + "step": 16338 + }, + { + "epoch": 2.9742422863384, + "grad_norm": 9.875, + "learning_rate": 1.000750155079459e-06, + "loss": 1.391846776008606, + "step": 16340 + }, + { + "epoch": 2.974606352962592, + "grad_norm": 4.4375, + "learning_rate": 1.000729319723842e-06, + "loss": 1.2190568447113037, + "step": 16342 + }, + { + "epoch": 2.9749704195867843, + "grad_norm": 5.25, + "learning_rate": 1.0007087777530949e-06, + "loss": 1.0561890602111816, + "step": 16344 + }, + { + "epoch": 2.9753344862109765, + "grad_norm": 5.03125, + "learning_rate": 1.0006885291702325e-06, + "loss": 1.2405662536621094, + "step": 16346 + }, + { + "epoch": 2.9756985528351687, + "grad_norm": 16.5, + "learning_rate": 1.0006685739782257e-06, + "loss": 1.23015296459198, + "step": 16348 + }, + { + "epoch": 2.976062619459361, + "grad_norm": 10.375, + "learning_rate": 1.000648912180003e-06, + "loss": 1.0744677782058716, + "step": 16350 + }, + { + "epoch": 2.9764266860835535, + "grad_norm": 20.5, + "learning_rate": 1.0006295437784499e-06, + "loss": 1.4442644119262695, + "step": 16352 + }, + { + "epoch": 2.9767907527077453, + "grad_norm": 10.125, + "learning_rate": 1.0006104687764085e-06, + "loss": 1.543410301208496, + "step": 16354 + }, + { + "epoch": 2.977154819331938, + "grad_norm": 15.1875, + "learning_rate": 1.000591687176678e-06, + "loss": 1.5970842838287354, + "step": 16356 + }, + { + "epoch": 2.97751888595613, + "grad_norm": 6.84375, + "learning_rate": 1.0005731989820144e-06, + "loss": 1.445725917816162, + "step": 16358 + }, + { + "epoch": 2.9778829525803223, + "grad_norm": 11.4375, + "learning_rate": 1.0005550041951312e-06, + "loss": 1.137832522392273, + "step": 16360 + }, + { + "epoch": 2.9782470192045145, + "grad_norm": 48.5, + "learning_rate": 1.0005371028186977e-06, + "loss": 2.0945792198181152, + "step": 16362 + }, + { + "epoch": 2.9786110858287067, + "grad_norm": 12.75, + "learning_rate": 1.0005194948553415e-06, + "loss": 1.913609504699707, + "step": 16364 + }, + { + "epoch": 2.978975152452899, + "grad_norm": 20.75, + "learning_rate": 1.0005021803076462e-06, + "loss": 1.2479089498519897, + "step": 16366 + }, + { + "epoch": 2.979339219077091, + "grad_norm": 27.375, + "learning_rate": 1.0004851591781527e-06, + "loss": 0.9958842992782593, + "step": 16368 + }, + { + "epoch": 2.9797032857012833, + "grad_norm": 16.375, + "learning_rate": 1.0004684314693587e-06, + "loss": 1.4470421075820923, + "step": 16370 + }, + { + "epoch": 2.9800673523254755, + "grad_norm": 9.4375, + "learning_rate": 1.000451997183719e-06, + "loss": 1.43610680103302, + "step": 16372 + }, + { + "epoch": 2.9804314189496677, + "grad_norm": 14.4375, + "learning_rate": 1.0004358563236452e-06, + "loss": 1.3032466173171997, + "step": 16374 + }, + { + "epoch": 2.98079548557386, + "grad_norm": 17.625, + "learning_rate": 1.0004200088915061e-06, + "loss": 0.6024792194366455, + "step": 16376 + }, + { + "epoch": 2.9811595521980525, + "grad_norm": 18.25, + "learning_rate": 1.000404454889627e-06, + "loss": 0.8797003626823425, + "step": 16378 + }, + { + "epoch": 2.9815236188222443, + "grad_norm": 7.6875, + "learning_rate": 1.0003891943202906e-06, + "loss": 1.5118615627288818, + "step": 16380 + }, + { + "epoch": 2.981887685446437, + "grad_norm": 3.359375, + "learning_rate": 1.0003742271857359e-06, + "loss": 1.0019346475601196, + "step": 16382 + }, + { + "epoch": 2.9822517520706286, + "grad_norm": 11.5625, + "learning_rate": 1.00035955348816e-06, + "loss": 1.1364027261734009, + "step": 16384 + }, + { + "epoch": 2.9826158186948213, + "grad_norm": 19.625, + "learning_rate": 1.0003451732297156e-06, + "loss": 1.4811102151870728, + "step": 16386 + }, + { + "epoch": 2.9829798853190135, + "grad_norm": 9.25, + "learning_rate": 1.0003310864125132e-06, + "loss": 1.590872049331665, + "step": 16388 + }, + { + "epoch": 2.9833439519432057, + "grad_norm": 5.8125, + "learning_rate": 1.0003172930386198e-06, + "loss": 1.0282552242279053, + "step": 16390 + }, + { + "epoch": 2.983708018567398, + "grad_norm": 5.34375, + "learning_rate": 1.0003037931100597e-06, + "loss": 1.512000322341919, + "step": 16392 + }, + { + "epoch": 2.98407208519159, + "grad_norm": 8.3125, + "learning_rate": 1.000290586628814e-06, + "loss": 1.5235819816589355, + "step": 16394 + }, + { + "epoch": 2.9844361518157823, + "grad_norm": 17.375, + "learning_rate": 1.0002776735968207e-06, + "loss": 1.734879732131958, + "step": 16396 + }, + { + "epoch": 2.9848002184399745, + "grad_norm": 8.375, + "learning_rate": 1.0002650540159742e-06, + "loss": 1.3319530487060547, + "step": 16398 + }, + { + "epoch": 2.9851642850641666, + "grad_norm": 9.1875, + "learning_rate": 1.000252727888127e-06, + "loss": 1.1793327331542969, + "step": 16400 + }, + { + "epoch": 2.985528351688359, + "grad_norm": 26.5, + "learning_rate": 1.0002406952150878e-06, + "loss": 1.4188673496246338, + "step": 16402 + }, + { + "epoch": 2.985892418312551, + "grad_norm": 8.9375, + "learning_rate": 1.0002289559986223e-06, + "loss": 1.2849725484848022, + "step": 16404 + }, + { + "epoch": 2.9862564849367432, + "grad_norm": 18.5, + "learning_rate": 1.0002175102404531e-06, + "loss": 0.6944088935852051, + "step": 16406 + }, + { + "epoch": 2.986620551560936, + "grad_norm": 15.75, + "learning_rate": 1.00020635794226e-06, + "loss": 0.4224565923213959, + "step": 16408 + }, + { + "epoch": 2.9869846181851276, + "grad_norm": 11.875, + "learning_rate": 1.000195499105679e-06, + "loss": 1.7608332633972168, + "step": 16410 + }, + { + "epoch": 2.9873486848093203, + "grad_norm": 8.75, + "learning_rate": 1.0001849337323045e-06, + "loss": 1.1287002563476562, + "step": 16412 + }, + { + "epoch": 2.9877127514335124, + "grad_norm": 11.6875, + "learning_rate": 1.0001746618236862e-06, + "loss": 1.41616690158844, + "step": 16414 + }, + { + "epoch": 2.9880768180577046, + "grad_norm": 6.875, + "learning_rate": 1.0001646833813316e-06, + "loss": 1.374161958694458, + "step": 16416 + }, + { + "epoch": 2.988440884681897, + "grad_norm": 37.25, + "learning_rate": 1.0001549984067052e-06, + "loss": 1.2015546560287476, + "step": 16418 + }, + { + "epoch": 2.988804951306089, + "grad_norm": 12.0625, + "learning_rate": 1.0001456069012282e-06, + "loss": 1.58562171459198, + "step": 16420 + }, + { + "epoch": 2.9891690179302812, + "grad_norm": 9.6875, + "learning_rate": 1.0001365088662784e-06, + "loss": 1.0949738025665283, + "step": 16422 + }, + { + "epoch": 2.9895330845544734, + "grad_norm": 10.9375, + "learning_rate": 1.0001277043031915e-06, + "loss": 1.4776755571365356, + "step": 16424 + }, + { + "epoch": 2.9898971511786656, + "grad_norm": 9.5, + "learning_rate": 1.000119193213259e-06, + "loss": 1.2836664915084839, + "step": 16426 + }, + { + "epoch": 2.990261217802858, + "grad_norm": 15.9375, + "learning_rate": 1.0001109755977303e-06, + "loss": 1.2599329948425293, + "step": 16428 + }, + { + "epoch": 2.99062528442705, + "grad_norm": 11.625, + "learning_rate": 1.000103051457811e-06, + "loss": 0.8928711414337158, + "step": 16430 + }, + { + "epoch": 2.990989351051242, + "grad_norm": 10.125, + "learning_rate": 1.000095420794664e-06, + "loss": 1.4531464576721191, + "step": 16432 + }, + { + "epoch": 2.991353417675435, + "grad_norm": 6.9375, + "learning_rate": 1.0000880836094091e-06, + "loss": 1.4205862283706665, + "step": 16434 + }, + { + "epoch": 2.9917174842996266, + "grad_norm": 8.0625, + "learning_rate": 1.000081039903123e-06, + "loss": 1.368853211402893, + "step": 16436 + }, + { + "epoch": 2.992081550923819, + "grad_norm": 7.875, + "learning_rate": 1.0000742896768392e-06, + "loss": 1.3822475671768188, + "step": 16438 + }, + { + "epoch": 2.9924456175480114, + "grad_norm": 10.5, + "learning_rate": 1.0000678329315486e-06, + "loss": 1.5048623085021973, + "step": 16440 + }, + { + "epoch": 2.9928096841722036, + "grad_norm": 13.6875, + "learning_rate": 1.0000616696681984e-06, + "loss": 1.3586997985839844, + "step": 16442 + }, + { + "epoch": 2.993173750796396, + "grad_norm": 12.6875, + "learning_rate": 1.0000557998876933e-06, + "loss": 1.4310898780822754, + "step": 16444 + }, + { + "epoch": 2.993537817420588, + "grad_norm": 21.75, + "learning_rate": 1.0000502235908943e-06, + "loss": 1.4072825908660889, + "step": 16446 + }, + { + "epoch": 2.99390188404478, + "grad_norm": 23.125, + "learning_rate": 1.00004494077862e-06, + "loss": 1.9890415668487549, + "step": 16448 + }, + { + "epoch": 2.9942659506689724, + "grad_norm": 18.5, + "learning_rate": 1.0000399514516453e-06, + "loss": 1.4942454099655151, + "step": 16450 + }, + { + "epoch": 2.9946300172931646, + "grad_norm": 13.875, + "learning_rate": 1.0000352556107028e-06, + "loss": 1.324810266494751, + "step": 16452 + }, + { + "epoch": 2.9949940839173568, + "grad_norm": 6.9375, + "learning_rate": 1.0000308532564813e-06, + "loss": 1.202855110168457, + "step": 16454 + }, + { + "epoch": 2.995358150541549, + "grad_norm": 8.0, + "learning_rate": 1.000026744389627e-06, + "loss": 1.2264525890350342, + "step": 16456 + }, + { + "epoch": 2.995722217165741, + "grad_norm": 12.625, + "learning_rate": 1.000022929010743e-06, + "loss": 1.4550981521606445, + "step": 16458 + }, + { + "epoch": 2.996086283789934, + "grad_norm": 10.75, + "learning_rate": 1.0000194071203887e-06, + "loss": 1.3527114391326904, + "step": 16460 + }, + { + "epoch": 2.9964503504141256, + "grad_norm": 11.5625, + "learning_rate": 1.0000161787190812e-06, + "loss": 0.8071882724761963, + "step": 16462 + }, + { + "epoch": 2.996814417038318, + "grad_norm": 6.59375, + "learning_rate": 1.0000132438072942e-06, + "loss": 1.4483782052993774, + "step": 16464 + }, + { + "epoch": 2.9971784836625104, + "grad_norm": 4.40625, + "learning_rate": 1.0000106023854585e-06, + "loss": 1.272663950920105, + "step": 16466 + }, + { + "epoch": 2.9975425502867026, + "grad_norm": 9.4375, + "learning_rate": 1.0000082544539618e-06, + "loss": 1.2252750396728516, + "step": 16468 + }, + { + "epoch": 2.9979066169108948, + "grad_norm": 9.375, + "learning_rate": 1.0000062000131483e-06, + "loss": 1.4054439067840576, + "step": 16470 + }, + { + "epoch": 2.998270683535087, + "grad_norm": 11.25, + "learning_rate": 1.0000044390633198e-06, + "loss": 1.309810757637024, + "step": 16472 + }, + { + "epoch": 2.998634750159279, + "grad_norm": 8.4375, + "learning_rate": 1.0000029716047346e-06, + "loss": 1.3979363441467285, + "step": 16474 + }, + { + "epoch": 2.9989988167834714, + "grad_norm": 7.53125, + "learning_rate": 1.000001797637608e-06, + "loss": 1.1941115856170654, + "step": 16476 + }, + { + "epoch": 2.9993628834076635, + "grad_norm": 11.5625, + "learning_rate": 1.0000009171621122e-06, + "loss": 1.9067292213439941, + "step": 16478 + }, + { + "epoch": 2.9997269500318557, + "grad_norm": 15.375, + "learning_rate": 1.0000003301783765e-06, + "loss": 1.603000283241272, + "step": 16480 + }, + { + "epoch": 3.0, + "grad_norm": 25.75, + "learning_rate": 1.0000000366864873e-06, + "loss": 1.4978524446487427, + "step": 16482 + }, + { + "epoch": 3.0, + "step": 16482, + "total_flos": 3.229073396012679e+18, + "train_loss": 1.3915999769032572, + "train_runtime": 20987.8784, + "train_samples_per_second": 1.57, + "train_steps_per_second": 0.785 + } + ], + "logging_steps": 2, + "max_steps": 16482, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 9999999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.229073396012679e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}