{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 3250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012312427856868025, "grad_norm": 1.3125, "learning_rate": 1.0204081632653061e-07, "loss": 1.426961898803711, "step": 2 }, { "epoch": 0.002462485571373605, "grad_norm": 1.6953125, "learning_rate": 3.0612244897959183e-07, "loss": 1.346108078956604, "step": 4 }, { "epoch": 0.003693728357060408, "grad_norm": 2.390625, "learning_rate": 5.102040816326531e-07, "loss": 1.8839138746261597, "step": 6 }, { "epoch": 0.00492497114274721, "grad_norm": 3.765625, "learning_rate": 7.142857142857143e-07, "loss": 1.8666269779205322, "step": 8 }, { "epoch": 0.0061562139284340135, "grad_norm": 1.7109375, "learning_rate": 9.183673469387756e-07, "loss": 1.228968620300293, "step": 10 }, { "epoch": 0.007387456714120816, "grad_norm": 9.3125, "learning_rate": 1.122448979591837e-06, "loss": 1.0329455137252808, "step": 12 }, { "epoch": 0.008618699499807618, "grad_norm": 3.1875, "learning_rate": 1.3265306122448982e-06, "loss": 1.8221150636672974, "step": 14 }, { "epoch": 0.00984994228549442, "grad_norm": 3.234375, "learning_rate": 1.5306122448979593e-06, "loss": 1.8240559101104736, "step": 16 }, { "epoch": 0.011081185071181223, "grad_norm": 6.59375, "learning_rate": 1.7346938775510206e-06, "loss": 2.2386162281036377, "step": 18 }, { "epoch": 0.012312427856868027, "grad_norm": 7.09375, "learning_rate": 1.938775510204082e-06, "loss": 2.164640188217163, "step": 20 }, { "epoch": 0.01354367064255483, "grad_norm": 10.0, "learning_rate": 2.1428571428571427e-06, "loss": 2.490449905395508, "step": 22 }, { "epoch": 0.014774913428241632, "grad_norm": 6.1875, "learning_rate": 2.3469387755102044e-06, "loss": 2.384032726287842, "step": 24 }, { "epoch": 0.016006156213928435, "grad_norm": 2.453125, "learning_rate": 2.5510204081632657e-06, "loss": 1.432393193244934, "step": 26 }, { "epoch": 0.017237398999615235, "grad_norm": 1.4296875, "learning_rate": 2.7551020408163266e-06, "loss": 1.337876558303833, "step": 28 }, { "epoch": 0.01846864178530204, "grad_norm": 1.2890625, "learning_rate": 2.959183673469388e-06, "loss": 1.2051901817321777, "step": 30 }, { "epoch": 0.01969988457098884, "grad_norm": 1.0546875, "learning_rate": 3.1632653061224496e-06, "loss": 1.1560025215148926, "step": 32 }, { "epoch": 0.020931127356675645, "grad_norm": 2.46875, "learning_rate": 3.3673469387755105e-06, "loss": 1.1425062417984009, "step": 34 }, { "epoch": 0.022162370142362445, "grad_norm": 3.8125, "learning_rate": 3.5714285714285718e-06, "loss": 1.2081866264343262, "step": 36 }, { "epoch": 0.02339361292804925, "grad_norm": 8.4375, "learning_rate": 3.7755102040816327e-06, "loss": 2.156994104385376, "step": 38 }, { "epoch": 0.024624855713736054, "grad_norm": 14.375, "learning_rate": 3.979591836734694e-06, "loss": 2.4016313552856445, "step": 40 }, { "epoch": 0.025856098499422855, "grad_norm": 3.140625, "learning_rate": 4.183673469387755e-06, "loss": 1.7849466800689697, "step": 42 }, { "epoch": 0.02708734128510966, "grad_norm": 5.46875, "learning_rate": 4.3877551020408165e-06, "loss": 1.7391504049301147, "step": 44 }, { "epoch": 0.02831858407079646, "grad_norm": 5.96875, "learning_rate": 4.591836734693878e-06, "loss": 2.117762327194214, "step": 46 }, { "epoch": 0.029549826856483264, "grad_norm": 12.5, "learning_rate": 4.795918367346939e-06, "loss": 2.1546037197113037, "step": 48 }, { "epoch": 0.030781069642170065, "grad_norm": 6.375, "learning_rate": 5e-06, "loss": 1.6910151243209839, "step": 50 }, { "epoch": 0.03201231242785687, "grad_norm": 2.875, "learning_rate": 5.204081632653062e-06, "loss": 1.7401823997497559, "step": 52 }, { "epoch": 0.03324355521354367, "grad_norm": 5.3125, "learning_rate": 5.408163265306123e-06, "loss": 2.2959043979644775, "step": 54 }, { "epoch": 0.03447479799923047, "grad_norm": 4.59375, "learning_rate": 5.6122448979591834e-06, "loss": 2.2308778762817383, "step": 56 }, { "epoch": 0.03570604078491728, "grad_norm": 1.5078125, "learning_rate": 5.816326530612246e-06, "loss": 1.1782121658325195, "step": 58 }, { "epoch": 0.03693728357060408, "grad_norm": 1.15625, "learning_rate": 6.020408163265307e-06, "loss": 1.3119515180587769, "step": 60 }, { "epoch": 0.03816852635629088, "grad_norm": 4.21875, "learning_rate": 6.224489795918368e-06, "loss": 1.8275368213653564, "step": 62 }, { "epoch": 0.03939976914197768, "grad_norm": 3.140625, "learning_rate": 6.4285714285714295e-06, "loss": 1.9090592861175537, "step": 64 }, { "epoch": 0.04063101192766449, "grad_norm": 2.203125, "learning_rate": 6.63265306122449e-06, "loss": 1.7946950197219849, "step": 66 }, { "epoch": 0.04186225471335129, "grad_norm": 5.78125, "learning_rate": 6.836734693877551e-06, "loss": 1.8883665800094604, "step": 68 }, { "epoch": 0.04309349749903809, "grad_norm": 1.953125, "learning_rate": 7.0408163265306125e-06, "loss": 1.3253710269927979, "step": 70 }, { "epoch": 0.04432474028472489, "grad_norm": 1.578125, "learning_rate": 7.244897959183675e-06, "loss": 1.5811930894851685, "step": 72 }, { "epoch": 0.0455559830704117, "grad_norm": 6.875, "learning_rate": 7.448979591836736e-06, "loss": 2.326409339904785, "step": 74 }, { "epoch": 0.0467872258560985, "grad_norm": 6.0625, "learning_rate": 7.653061224489796e-06, "loss": 2.18452787399292, "step": 76 }, { "epoch": 0.0480184686417853, "grad_norm": 11.375, "learning_rate": 7.857142857142858e-06, "loss": 2.273136854171753, "step": 78 }, { "epoch": 0.04924971142747211, "grad_norm": 4.28125, "learning_rate": 8.06122448979592e-06, "loss": 2.2342689037323, "step": 80 }, { "epoch": 0.05048095421315891, "grad_norm": 7.84375, "learning_rate": 8.26530612244898e-06, "loss": 2.06551194190979, "step": 82 }, { "epoch": 0.05171219699884571, "grad_norm": 7.28125, "learning_rate": 8.469387755102042e-06, "loss": 2.0828216075897217, "step": 84 }, { "epoch": 0.05294343978453251, "grad_norm": 5.5, "learning_rate": 8.673469387755103e-06, "loss": 1.8301738500595093, "step": 86 }, { "epoch": 0.05417468257021932, "grad_norm": 3.046875, "learning_rate": 8.877551020408163e-06, "loss": 1.733726978302002, "step": 88 }, { "epoch": 0.05540592535590612, "grad_norm": 5.15625, "learning_rate": 9.081632653061225e-06, "loss": 2.214818239212036, "step": 90 }, { "epoch": 0.05663716814159292, "grad_norm": 3.359375, "learning_rate": 9.285714285714288e-06, "loss": 1.9520819187164307, "step": 92 }, { "epoch": 0.05786841092727972, "grad_norm": 1.21875, "learning_rate": 9.489795918367348e-06, "loss": 1.171900987625122, "step": 94 }, { "epoch": 0.05909965371296653, "grad_norm": 1.3203125, "learning_rate": 9.693877551020408e-06, "loss": 1.2222256660461426, "step": 96 }, { "epoch": 0.06033089649865333, "grad_norm": 5.0, "learning_rate": 9.89795918367347e-06, "loss": 1.6900241374969482, "step": 98 }, { "epoch": 0.06156213928434013, "grad_norm": 2.234375, "learning_rate": 9.999998013185654e-06, "loss": 1.8082116842269897, "step": 100 }, { "epoch": 0.06279338207002694, "grad_norm": 4.40625, "learning_rate": 9.999982118682718e-06, "loss": 2.431914806365967, "step": 102 }, { "epoch": 0.06402462485571374, "grad_norm": 10.0, "learning_rate": 9.999950329740007e-06, "loss": 2.414013624191284, "step": 104 }, { "epoch": 0.06525586764140054, "grad_norm": 5.34375, "learning_rate": 9.999902646483837e-06, "loss": 1.9995551109313965, "step": 106 }, { "epoch": 0.06648711042708734, "grad_norm": 3.828125, "learning_rate": 9.999839069103682e-06, "loss": 2.1116445064544678, "step": 108 }, { "epoch": 0.06771835321277414, "grad_norm": 9.3125, "learning_rate": 9.999759597852176e-06, "loss": 1.7808245420455933, "step": 110 }, { "epoch": 0.06894959599846094, "grad_norm": 48.0, "learning_rate": 9.99966423304511e-06, "loss": 1.754453420639038, "step": 112 }, { "epoch": 0.07018083878414776, "grad_norm": 1.8046875, "learning_rate": 9.999552975061427e-06, "loss": 1.0939162969589233, "step": 114 }, { "epoch": 0.07141208156983456, "grad_norm": 1.8828125, "learning_rate": 9.999425824343223e-06, "loss": 1.3613158464431763, "step": 116 }, { "epoch": 0.07264332435552136, "grad_norm": 2.953125, "learning_rate": 9.99928278139575e-06, "loss": 1.8957613706588745, "step": 118 }, { "epoch": 0.07387456714120816, "grad_norm": 2.375, "learning_rate": 9.999123846787406e-06, "loss": 1.8484387397766113, "step": 120 }, { "epoch": 0.07510580992689496, "grad_norm": 1.8515625, "learning_rate": 9.99894902114974e-06, "loss": 1.4110791683197021, "step": 122 }, { "epoch": 0.07633705271258176, "grad_norm": 2.046875, "learning_rate": 9.998758305177443e-06, "loss": 1.6105386018753052, "step": 124 }, { "epoch": 0.07756829549826856, "grad_norm": 1.9375, "learning_rate": 9.998551699628347e-06, "loss": 1.7053779363632202, "step": 126 }, { "epoch": 0.07879953828395536, "grad_norm": 2.796875, "learning_rate": 9.998329205323427e-06, "loss": 1.6974133253097534, "step": 128 }, { "epoch": 0.08003078106964218, "grad_norm": 1.8671875, "learning_rate": 9.998090823146794e-06, "loss": 1.7098684310913086, "step": 130 }, { "epoch": 0.08126202385532898, "grad_norm": 1.90625, "learning_rate": 9.997836554045689e-06, "loss": 1.699587345123291, "step": 132 }, { "epoch": 0.08249326664101578, "grad_norm": 3.734375, "learning_rate": 9.997566399030483e-06, "loss": 2.0180535316467285, "step": 134 }, { "epoch": 0.08372450942670258, "grad_norm": 4.3125, "learning_rate": 9.99728035917467e-06, "loss": 1.9283902645111084, "step": 136 }, { "epoch": 0.08495575221238938, "grad_norm": 3.953125, "learning_rate": 9.996978435614866e-06, "loss": 1.7642266750335693, "step": 138 }, { "epoch": 0.08618699499807618, "grad_norm": 3.59375, "learning_rate": 9.996660629550805e-06, "loss": 1.8200689554214478, "step": 140 }, { "epoch": 0.08741823778376298, "grad_norm": 3.296875, "learning_rate": 9.99632694224533e-06, "loss": 1.5669602155685425, "step": 142 }, { "epoch": 0.08864948056944978, "grad_norm": 2.75, "learning_rate": 9.995977375024389e-06, "loss": 1.6687263250350952, "step": 144 }, { "epoch": 0.0898807233551366, "grad_norm": 6.375, "learning_rate": 9.995611929277029e-06, "loss": 1.7212157249450684, "step": 146 }, { "epoch": 0.0911119661408234, "grad_norm": 2.8125, "learning_rate": 9.9952306064554e-06, "loss": 1.6282907724380493, "step": 148 }, { "epoch": 0.0923432089265102, "grad_norm": 5.6875, "learning_rate": 9.994833408074736e-06, "loss": 1.6188124418258667, "step": 150 }, { "epoch": 0.093574451712197, "grad_norm": 3.359375, "learning_rate": 9.994420335713354e-06, "loss": 1.6069148778915405, "step": 152 }, { "epoch": 0.0948056944978838, "grad_norm": 2.09375, "learning_rate": 9.99399139101265e-06, "loss": 1.7498996257781982, "step": 154 }, { "epoch": 0.0960369372835706, "grad_norm": 4.9375, "learning_rate": 9.99354657567709e-06, "loss": 1.7885342836380005, "step": 156 }, { "epoch": 0.0972681800692574, "grad_norm": 3.109375, "learning_rate": 9.993085891474208e-06, "loss": 1.6744879484176636, "step": 158 }, { "epoch": 0.09849942285494422, "grad_norm": 4.65625, "learning_rate": 9.99260934023459e-06, "loss": 1.71375572681427, "step": 160 }, { "epoch": 0.09973066564063102, "grad_norm": 1.3359375, "learning_rate": 9.992116923851869e-06, "loss": 1.1846431493759155, "step": 162 }, { "epoch": 0.10096190842631782, "grad_norm": 1.5390625, "learning_rate": 9.99160864428273e-06, "loss": 1.2758105993270874, "step": 164 }, { "epoch": 0.10219315121200462, "grad_norm": 0.9765625, "learning_rate": 9.991084503546882e-06, "loss": 1.1898852586746216, "step": 166 }, { "epoch": 0.10342439399769142, "grad_norm": 1.2265625, "learning_rate": 9.99054450372707e-06, "loss": 1.1127582788467407, "step": 168 }, { "epoch": 0.10465563678337822, "grad_norm": 5.375, "learning_rate": 9.989988646969049e-06, "loss": 1.9128127098083496, "step": 170 }, { "epoch": 0.10588687956906502, "grad_norm": 3.8125, "learning_rate": 9.989416935481586e-06, "loss": 1.824889063835144, "step": 172 }, { "epoch": 0.10711812235475182, "grad_norm": 2.28125, "learning_rate": 9.988829371536453e-06, "loss": 1.4687048196792603, "step": 174 }, { "epoch": 0.10834936514043864, "grad_norm": 2.046875, "learning_rate": 9.988225957468409e-06, "loss": 1.603397011756897, "step": 176 }, { "epoch": 0.10958060792612544, "grad_norm": 2.671875, "learning_rate": 9.987606695675196e-06, "loss": 1.3623863458633423, "step": 178 }, { "epoch": 0.11081185071181224, "grad_norm": 1.84375, "learning_rate": 9.986971588617529e-06, "loss": 1.2137137651443481, "step": 180 }, { "epoch": 0.11204309349749904, "grad_norm": 5.65625, "learning_rate": 9.986320638819092e-06, "loss": 1.8477953672409058, "step": 182 }, { "epoch": 0.11327433628318584, "grad_norm": 5.03125, "learning_rate": 9.98565384886651e-06, "loss": 1.6397606134414673, "step": 184 }, { "epoch": 0.11450557906887264, "grad_norm": 1.9453125, "learning_rate": 9.984971221409363e-06, "loss": 1.579763412475586, "step": 186 }, { "epoch": 0.11573682185455944, "grad_norm": 2.03125, "learning_rate": 9.984272759160156e-06, "loss": 1.582782506942749, "step": 188 }, { "epoch": 0.11696806464024626, "grad_norm": 2.0625, "learning_rate": 9.983558464894318e-06, "loss": 1.4831057786941528, "step": 190 }, { "epoch": 0.11819930742593306, "grad_norm": 2.9375, "learning_rate": 9.982828341450193e-06, "loss": 1.6540857553482056, "step": 192 }, { "epoch": 0.11943055021161986, "grad_norm": 1.203125, "learning_rate": 9.982082391729014e-06, "loss": 1.1849217414855957, "step": 194 }, { "epoch": 0.12066179299730666, "grad_norm": 0.92578125, "learning_rate": 9.981320618694908e-06, "loss": 1.2112078666687012, "step": 196 }, { "epoch": 0.12189303578299346, "grad_norm": 1.875, "learning_rate": 9.98054302537488e-06, "loss": 1.2096866369247437, "step": 198 }, { "epoch": 0.12312427856868026, "grad_norm": 2.015625, "learning_rate": 9.979749614858793e-06, "loss": 1.3210318088531494, "step": 200 }, { "epoch": 0.12435552135436706, "grad_norm": 1.2109375, "learning_rate": 9.978940390299372e-06, "loss": 1.232903003692627, "step": 202 }, { "epoch": 0.12558676414005387, "grad_norm": 9.0625, "learning_rate": 9.978115354912168e-06, "loss": 1.129847526550293, "step": 204 }, { "epoch": 0.12681800692574066, "grad_norm": 2.546875, "learning_rate": 9.977274511975564e-06, "loss": 1.5993540287017822, "step": 206 }, { "epoch": 0.12804924971142748, "grad_norm": 2.578125, "learning_rate": 9.976417864830761e-06, "loss": 1.5882292985916138, "step": 208 }, { "epoch": 0.12928049249711426, "grad_norm": 8.3125, "learning_rate": 9.97554541688175e-06, "loss": 0.9923779964447021, "step": 210 }, { "epoch": 0.13051173528280108, "grad_norm": 1.15625, "learning_rate": 9.974657171595321e-06, "loss": 1.1275218725204468, "step": 212 }, { "epoch": 0.1317429780684879, "grad_norm": 2.03125, "learning_rate": 9.973753132501028e-06, "loss": 1.521848201751709, "step": 214 }, { "epoch": 0.13297422085417468, "grad_norm": 3.859375, "learning_rate": 9.972833303191184e-06, "loss": 1.5856269598007202, "step": 216 }, { "epoch": 0.1342054636398615, "grad_norm": 2.84375, "learning_rate": 9.971897687320847e-06, "loss": 1.6335004568099976, "step": 218 }, { "epoch": 0.13543670642554828, "grad_norm": 2.625, "learning_rate": 9.970946288607809e-06, "loss": 1.616341471672058, "step": 220 }, { "epoch": 0.1366679492112351, "grad_norm": 1.421875, "learning_rate": 9.969979110832574e-06, "loss": 1.0930131673812866, "step": 222 }, { "epoch": 0.13789919199692188, "grad_norm": 1.0078125, "learning_rate": 9.968996157838345e-06, "loss": 0.9788625240325928, "step": 224 }, { "epoch": 0.1391304347826087, "grad_norm": 6.4375, "learning_rate": 9.967997433531014e-06, "loss": 1.7790307998657227, "step": 226 }, { "epoch": 0.1403616775682955, "grad_norm": 5.53125, "learning_rate": 9.966982941879135e-06, "loss": 1.9695312976837158, "step": 228 }, { "epoch": 0.1415929203539823, "grad_norm": 2.375, "learning_rate": 9.965952686913926e-06, "loss": 1.5633316040039062, "step": 230 }, { "epoch": 0.1428241631396691, "grad_norm": 2.78125, "learning_rate": 9.964906672729232e-06, "loss": 1.5933005809783936, "step": 232 }, { "epoch": 0.1440554059253559, "grad_norm": 1.0703125, "learning_rate": 9.963844903481525e-06, "loss": 1.1736454963684082, "step": 234 }, { "epoch": 0.14528664871104272, "grad_norm": 1.4140625, "learning_rate": 9.96276738338988e-06, "loss": 1.1599225997924805, "step": 236 }, { "epoch": 0.1465178914967295, "grad_norm": 1.1171875, "learning_rate": 9.961674116735964e-06, "loss": 0.9839186668395996, "step": 238 }, { "epoch": 0.14774913428241632, "grad_norm": 0.921875, "learning_rate": 9.960565107864008e-06, "loss": 1.1017593145370483, "step": 240 }, { "epoch": 0.1489803770681031, "grad_norm": 2.234375, "learning_rate": 9.959440361180803e-06, "loss": 1.6118263006210327, "step": 242 }, { "epoch": 0.15021161985378992, "grad_norm": 1.984375, "learning_rate": 9.958299881155673e-06, "loss": 1.5592412948608398, "step": 244 }, { "epoch": 0.15144286263947673, "grad_norm": 4.84375, "learning_rate": 9.95714367232046e-06, "loss": 1.6599498987197876, "step": 246 }, { "epoch": 0.15267410542516352, "grad_norm": 2.90625, "learning_rate": 9.955971739269507e-06, "loss": 1.6609536409378052, "step": 248 }, { "epoch": 0.15390534821085033, "grad_norm": 5.15625, "learning_rate": 9.95478408665964e-06, "loss": 1.8779211044311523, "step": 250 }, { "epoch": 0.15513659099653712, "grad_norm": 2.203125, "learning_rate": 9.953580719210152e-06, "loss": 2.085038661956787, "step": 252 }, { "epoch": 0.15636783378222394, "grad_norm": 2.328125, "learning_rate": 9.952361641702772e-06, "loss": 1.5933791399002075, "step": 254 }, { "epoch": 0.15759907656791072, "grad_norm": 2.390625, "learning_rate": 9.951126858981663e-06, "loss": 1.5544782876968384, "step": 256 }, { "epoch": 0.15883031935359754, "grad_norm": 1.96875, "learning_rate": 9.94987637595339e-06, "loss": 1.189842939376831, "step": 258 }, { "epoch": 0.16006156213928435, "grad_norm": 0.90625, "learning_rate": 9.94861019758691e-06, "loss": 1.2431635856628418, "step": 260 }, { "epoch": 0.16129280492497114, "grad_norm": 0.490234375, "learning_rate": 9.947328328913541e-06, "loss": 1.260135531425476, "step": 262 }, { "epoch": 0.16252404771065795, "grad_norm": 4.6875, "learning_rate": 9.946030775026954e-06, "loss": 1.277756929397583, "step": 264 }, { "epoch": 0.16375529049634474, "grad_norm": 1.1015625, "learning_rate": 9.944717541083144e-06, "loss": 1.1507606506347656, "step": 266 }, { "epoch": 0.16498653328203156, "grad_norm": 1.875, "learning_rate": 9.943388632300416e-06, "loss": 1.0960922241210938, "step": 268 }, { "epoch": 0.16621777606771834, "grad_norm": 2.8125, "learning_rate": 9.942044053959356e-06, "loss": 1.625441551208496, "step": 270 }, { "epoch": 0.16744901885340516, "grad_norm": 2.3125, "learning_rate": 9.940683811402821e-06, "loss": 1.5875778198242188, "step": 272 }, { "epoch": 0.16868026163909197, "grad_norm": 3.953125, "learning_rate": 9.939307910035902e-06, "loss": 1.6671921014785767, "step": 274 }, { "epoch": 0.16991150442477876, "grad_norm": 2.6875, "learning_rate": 9.937916355325924e-06, "loss": 1.627016544342041, "step": 276 }, { "epoch": 0.17114274721046557, "grad_norm": 2.1875, "learning_rate": 9.936509152802411e-06, "loss": 1.5069929361343384, "step": 278 }, { "epoch": 0.17237398999615236, "grad_norm": 2.328125, "learning_rate": 9.935086308057054e-06, "loss": 1.5513195991516113, "step": 280 }, { "epoch": 0.17360523278183917, "grad_norm": 2.296875, "learning_rate": 9.933647826743712e-06, "loss": 1.5851318836212158, "step": 282 }, { "epoch": 0.17483647556752596, "grad_norm": 2.734375, "learning_rate": 9.932193714578376e-06, "loss": 1.6008888483047485, "step": 284 }, { "epoch": 0.17606771835321278, "grad_norm": 2.453125, "learning_rate": 9.93072397733914e-06, "loss": 1.5561046600341797, "step": 286 }, { "epoch": 0.17729896113889956, "grad_norm": 5.03125, "learning_rate": 9.929238620866201e-06, "loss": 1.55705988407135, "step": 288 }, { "epoch": 0.17853020392458638, "grad_norm": 2.609375, "learning_rate": 9.927737651061806e-06, "loss": 1.5694118738174438, "step": 290 }, { "epoch": 0.1797614467102732, "grad_norm": 2.390625, "learning_rate": 9.926221073890254e-06, "loss": 1.6122347116470337, "step": 292 }, { "epoch": 0.18099268949595998, "grad_norm": 4.21875, "learning_rate": 9.924688895377858e-06, "loss": 1.6376450061798096, "step": 294 }, { "epoch": 0.1822239322816468, "grad_norm": 12.25, "learning_rate": 9.923141121612922e-06, "loss": 1.4908955097198486, "step": 296 }, { "epoch": 0.18345517506733358, "grad_norm": 3.625, "learning_rate": 9.921577758745726e-06, "loss": 1.6120257377624512, "step": 298 }, { "epoch": 0.1846864178530204, "grad_norm": 2.96875, "learning_rate": 9.91999881298849e-06, "loss": 1.654007911682129, "step": 300 }, { "epoch": 0.18591766063870718, "grad_norm": 23.125, "learning_rate": 9.918404290615358e-06, "loss": 1.1257104873657227, "step": 302 }, { "epoch": 0.187148903424394, "grad_norm": 26.875, "learning_rate": 9.916794197962367e-06, "loss": 0.9994939565658569, "step": 304 }, { "epoch": 0.1883801462100808, "grad_norm": 2.328125, "learning_rate": 9.915168541427424e-06, "loss": 1.6332511901855469, "step": 306 }, { "epoch": 0.1896113889957676, "grad_norm": 3.75, "learning_rate": 9.91352732747029e-06, "loss": 1.7764713764190674, "step": 308 }, { "epoch": 0.1908426317814544, "grad_norm": 5.03125, "learning_rate": 9.911870562612528e-06, "loss": 0.8898857831954956, "step": 310 }, { "epoch": 0.1920738745671412, "grad_norm": 4.5625, "learning_rate": 9.910198253437513e-06, "loss": 1.0964455604553223, "step": 312 }, { "epoch": 0.19330511735282802, "grad_norm": 2.578125, "learning_rate": 9.908510406590374e-06, "loss": 1.5099912881851196, "step": 314 }, { "epoch": 0.1945363601385148, "grad_norm": 3.96875, "learning_rate": 9.906807028777992e-06, "loss": 1.5241334438323975, "step": 316 }, { "epoch": 0.19576760292420162, "grad_norm": 6.28125, "learning_rate": 9.90508812676895e-06, "loss": 1.4628498554229736, "step": 318 }, { "epoch": 0.19699884570988843, "grad_norm": 9.125, "learning_rate": 9.903353707393529e-06, "loss": 1.0449775457382202, "step": 320 }, { "epoch": 0.19823008849557522, "grad_norm": 3.359375, "learning_rate": 9.90160377754366e-06, "loss": 1.5512725114822388, "step": 322 }, { "epoch": 0.19946133128126203, "grad_norm": 9.5625, "learning_rate": 9.89983834417292e-06, "loss": 1.4961134195327759, "step": 324 }, { "epoch": 0.20069257406694882, "grad_norm": 2.328125, "learning_rate": 9.898057414296481e-06, "loss": 1.173062801361084, "step": 326 }, { "epoch": 0.20192381685263563, "grad_norm": 1.8984375, "learning_rate": 9.896260994991093e-06, "loss": 1.42746102809906, "step": 328 }, { "epoch": 0.20315505963832242, "grad_norm": 2.359375, "learning_rate": 9.89444909339506e-06, "loss": 1.5326873064041138, "step": 330 }, { "epoch": 0.20438630242400924, "grad_norm": 2.734375, "learning_rate": 9.892621716708204e-06, "loss": 1.5838617086410522, "step": 332 }, { "epoch": 0.20561754520969602, "grad_norm": 2.859375, "learning_rate": 9.89077887219184e-06, "loss": 1.2709892988204956, "step": 334 }, { "epoch": 0.20684878799538284, "grad_norm": 2.46875, "learning_rate": 9.888920567168744e-06, "loss": 1.4641536474227905, "step": 336 }, { "epoch": 0.20808003078106965, "grad_norm": 2.515625, "learning_rate": 9.887046809023133e-06, "loss": 1.2603651285171509, "step": 338 }, { "epoch": 0.20931127356675644, "grad_norm": 1.296875, "learning_rate": 9.88515760520062e-06, "loss": 1.167304277420044, "step": 340 }, { "epoch": 0.21054251635244325, "grad_norm": 3.78125, "learning_rate": 9.883252963208198e-06, "loss": 2.0254769325256348, "step": 342 }, { "epoch": 0.21177375913813004, "grad_norm": 3.421875, "learning_rate": 9.881332890614206e-06, "loss": 2.1020188331604004, "step": 344 }, { "epoch": 0.21300500192381686, "grad_norm": 2.484375, "learning_rate": 9.879397395048298e-06, "loss": 1.5581285953521729, "step": 346 }, { "epoch": 0.21423624470950364, "grad_norm": 11.0625, "learning_rate": 9.877446484201411e-06, "loss": 1.601905345916748, "step": 348 }, { "epoch": 0.21546748749519046, "grad_norm": 3.5, "learning_rate": 9.875480165825742e-06, "loss": 1.564781665802002, "step": 350 }, { "epoch": 0.21669873028087727, "grad_norm": 3.53125, "learning_rate": 9.873498447734707e-06, "loss": 1.535958170890808, "step": 352 }, { "epoch": 0.21792997306656406, "grad_norm": 2.703125, "learning_rate": 9.871501337802914e-06, "loss": 1.5023021697998047, "step": 354 }, { "epoch": 0.21916121585225087, "grad_norm": 3.4375, "learning_rate": 9.869488843966132e-06, "loss": 1.558158278465271, "step": 356 }, { "epoch": 0.22039245863793766, "grad_norm": 3.5, "learning_rate": 9.86746097422127e-06, "loss": 1.9833602905273438, "step": 358 }, { "epoch": 0.22162370142362448, "grad_norm": 3.78125, "learning_rate": 9.865417736626321e-06, "loss": 1.7728450298309326, "step": 360 }, { "epoch": 0.22285494420931126, "grad_norm": 9.625, "learning_rate": 9.863359139300352e-06, "loss": 1.9449533224105835, "step": 362 }, { "epoch": 0.22408618699499808, "grad_norm": 2.984375, "learning_rate": 9.861285190423466e-06, "loss": 1.9914966821670532, "step": 364 }, { "epoch": 0.2253174297806849, "grad_norm": 9.0625, "learning_rate": 9.859195898236758e-06, "loss": 1.5391563177108765, "step": 366 }, { "epoch": 0.22654867256637168, "grad_norm": 3.515625, "learning_rate": 9.857091271042301e-06, "loss": 1.5083098411560059, "step": 368 }, { "epoch": 0.2277799153520585, "grad_norm": 4.6875, "learning_rate": 9.854971317203097e-06, "loss": 1.6200501918792725, "step": 370 }, { "epoch": 0.22901115813774528, "grad_norm": 2.328125, "learning_rate": 9.852836045143056e-06, "loss": 1.5281012058258057, "step": 372 }, { "epoch": 0.2302424009234321, "grad_norm": 2.546875, "learning_rate": 9.850685463346956e-06, "loss": 1.4439934492111206, "step": 374 }, { "epoch": 0.23147364370911888, "grad_norm": 2.921875, "learning_rate": 9.848519580360403e-06, "loss": 1.5237040519714355, "step": 376 }, { "epoch": 0.2327048864948057, "grad_norm": 3.109375, "learning_rate": 9.846338404789812e-06, "loss": 1.8433338403701782, "step": 378 }, { "epoch": 0.2339361292804925, "grad_norm": 3.015625, "learning_rate": 9.844141945302366e-06, "loss": 1.9542083740234375, "step": 380 }, { "epoch": 0.2351673720661793, "grad_norm": 4.65625, "learning_rate": 9.841930210625972e-06, "loss": 1.5500494241714478, "step": 382 }, { "epoch": 0.2363986148518661, "grad_norm": 2.359375, "learning_rate": 9.839703209549246e-06, "loss": 1.4857137203216553, "step": 384 }, { "epoch": 0.2376298576375529, "grad_norm": 2.140625, "learning_rate": 9.837460950921454e-06, "loss": 1.506941556930542, "step": 386 }, { "epoch": 0.23886110042323971, "grad_norm": 3.078125, "learning_rate": 9.835203443652502e-06, "loss": 1.4529392719268799, "step": 388 }, { "epoch": 0.2400923432089265, "grad_norm": 6.09375, "learning_rate": 9.83293069671288e-06, "loss": 0.4469324052333832, "step": 390 }, { "epoch": 0.24132358599461332, "grad_norm": 3.453125, "learning_rate": 9.830642719133646e-06, "loss": 0.5282363891601562, "step": 392 }, { "epoch": 0.2425548287803001, "grad_norm": 17.875, "learning_rate": 9.828339520006363e-06, "loss": 0.8444979190826416, "step": 394 }, { "epoch": 0.24378607156598692, "grad_norm": 6.03125, "learning_rate": 9.826021108483089e-06, "loss": 0.8458063006401062, "step": 396 }, { "epoch": 0.24501731435167373, "grad_norm": 2.0625, "learning_rate": 9.823687493776328e-06, "loss": 1.5211682319641113, "step": 398 }, { "epoch": 0.24624855713736052, "grad_norm": 2.328125, "learning_rate": 9.821338685158996e-06, "loss": 1.4932045936584473, "step": 400 }, { "epoch": 0.24747979992304733, "grad_norm": 3.859375, "learning_rate": 9.818974691964387e-06, "loss": 1.4841368198394775, "step": 402 }, { "epoch": 0.24871104270873412, "grad_norm": 9.875, "learning_rate": 9.816595523586128e-06, "loss": 1.1101207733154297, "step": 404 }, { "epoch": 0.24994228549442093, "grad_norm": 3.59375, "learning_rate": 9.814201189478146e-06, "loss": 1.877555251121521, "step": 406 }, { "epoch": 0.25117352828010775, "grad_norm": 7.625, "learning_rate": 9.811791699154639e-06, "loss": 1.7015639543533325, "step": 408 }, { "epoch": 0.2524047710657945, "grad_norm": 1.84375, "learning_rate": 9.809367062190016e-06, "loss": 1.4604737758636475, "step": 410 }, { "epoch": 0.2536360138514813, "grad_norm": 2.46875, "learning_rate": 9.806927288218888e-06, "loss": 1.491847038269043, "step": 412 }, { "epoch": 0.25486725663716814, "grad_norm": 2.8125, "learning_rate": 9.804472386936008e-06, "loss": 1.5824358463287354, "step": 414 }, { "epoch": 0.25609849942285495, "grad_norm": 2.5, "learning_rate": 9.80200236809624e-06, "loss": 1.495304822921753, "step": 416 }, { "epoch": 0.25732974220854177, "grad_norm": 3.03125, "learning_rate": 9.799517241514516e-06, "loss": 1.4013820886611938, "step": 418 }, { "epoch": 0.2585609849942285, "grad_norm": 2.671875, "learning_rate": 9.797017017065806e-06, "loss": 1.4486945867538452, "step": 420 }, { "epoch": 0.25979222777991534, "grad_norm": 2.203125, "learning_rate": 9.794501704685071e-06, "loss": 1.541428804397583, "step": 422 }, { "epoch": 0.26102347056560216, "grad_norm": 2.078125, "learning_rate": 9.791971314367226e-06, "loss": 1.5093767642974854, "step": 424 }, { "epoch": 0.26225471335128897, "grad_norm": 1.8203125, "learning_rate": 9.789425856167101e-06, "loss": 1.5601611137390137, "step": 426 }, { "epoch": 0.2634859561369758, "grad_norm": 4.46875, "learning_rate": 9.786865340199396e-06, "loss": 1.4437766075134277, "step": 428 }, { "epoch": 0.26471719892266254, "grad_norm": 2.4375, "learning_rate": 9.784289776638653e-06, "loss": 1.5902003049850464, "step": 430 }, { "epoch": 0.26594844170834936, "grad_norm": 9.0625, "learning_rate": 9.7816991757192e-06, "loss": 1.5537315607070923, "step": 432 }, { "epoch": 0.2671796844940362, "grad_norm": 2.53125, "learning_rate": 9.77909354773512e-06, "loss": 1.4479423761367798, "step": 434 }, { "epoch": 0.268410927279723, "grad_norm": 1.8828125, "learning_rate": 9.776472903040208e-06, "loss": 1.5103721618652344, "step": 436 }, { "epoch": 0.26964217006540975, "grad_norm": 5.40625, "learning_rate": 9.773837252047936e-06, "loss": 1.7486127614974976, "step": 438 }, { "epoch": 0.27087341285109656, "grad_norm": 6.5625, "learning_rate": 9.771186605231391e-06, "loss": 1.99478280544281, "step": 440 }, { "epoch": 0.2721046556367834, "grad_norm": 6.0625, "learning_rate": 9.76852097312326e-06, "loss": 1.9923797845840454, "step": 442 }, { "epoch": 0.2733358984224702, "grad_norm": 3.234375, "learning_rate": 9.76584036631578e-06, "loss": 1.859613299369812, "step": 444 }, { "epoch": 0.274567141208157, "grad_norm": 3.203125, "learning_rate": 9.763144795460676e-06, "loss": 1.4815813302993774, "step": 446 }, { "epoch": 0.27579838399384377, "grad_norm": 3.1875, "learning_rate": 9.76043427126914e-06, "loss": 1.5721994638442993, "step": 448 }, { "epoch": 0.2770296267795306, "grad_norm": 10.875, "learning_rate": 9.757708804511798e-06, "loss": 1.4801818132400513, "step": 450 }, { "epoch": 0.2782608695652174, "grad_norm": 4.59375, "learning_rate": 9.754968406018633e-06, "loss": 1.4633471965789795, "step": 452 }, { "epoch": 0.2794921123509042, "grad_norm": 2.3125, "learning_rate": 9.752213086678965e-06, "loss": 1.6192139387130737, "step": 454 }, { "epoch": 0.280723355136591, "grad_norm": 5.0625, "learning_rate": 9.749442857441414e-06, "loss": 1.481449007987976, "step": 456 }, { "epoch": 0.2819545979222778, "grad_norm": 3.640625, "learning_rate": 9.746657729313835e-06, "loss": 0.6401450037956238, "step": 458 }, { "epoch": 0.2831858407079646, "grad_norm": 4.8125, "learning_rate": 9.743857713363294e-06, "loss": 0.5937597155570984, "step": 460 }, { "epoch": 0.2844170834936514, "grad_norm": 1.15625, "learning_rate": 9.741042820716008e-06, "loss": 1.0705316066741943, "step": 462 }, { "epoch": 0.2856483262793382, "grad_norm": 1.3515625, "learning_rate": 9.738213062557315e-06, "loss": 1.071405291557312, "step": 464 }, { "epoch": 0.286879569065025, "grad_norm": 1.796875, "learning_rate": 9.735368450131622e-06, "loss": 1.434294581413269, "step": 466 }, { "epoch": 0.2881108118507118, "grad_norm": 2.0625, "learning_rate": 9.732508994742356e-06, "loss": 1.401355504989624, "step": 468 }, { "epoch": 0.2893420546363986, "grad_norm": 3.328125, "learning_rate": 9.729634707751929e-06, "loss": 1.4860631227493286, "step": 470 }, { "epoch": 0.29057329742208543, "grad_norm": 2.078125, "learning_rate": 9.72674560058169e-06, "loss": 1.5327996015548706, "step": 472 }, { "epoch": 0.29180454020777224, "grad_norm": 3.109375, "learning_rate": 9.723841684711874e-06, "loss": 0.8789864778518677, "step": 474 }, { "epoch": 0.293035782993459, "grad_norm": 4.5625, "learning_rate": 9.72092297168156e-06, "loss": 0.7144789695739746, "step": 476 }, { "epoch": 0.2942670257791458, "grad_norm": 2.40625, "learning_rate": 9.717989473088629e-06, "loss": 1.4282610416412354, "step": 478 }, { "epoch": 0.29549826856483263, "grad_norm": 1.984375, "learning_rate": 9.715041200589709e-06, "loss": 1.4713945388793945, "step": 480 }, { "epoch": 0.29672951135051945, "grad_norm": 2.65625, "learning_rate": 9.712078165900144e-06, "loss": 1.4964369535446167, "step": 482 }, { "epoch": 0.2979607541362062, "grad_norm": 2.15625, "learning_rate": 9.709100380793924e-06, "loss": 1.5220392942428589, "step": 484 }, { "epoch": 0.299191996921893, "grad_norm": 2.140625, "learning_rate": 9.706107857103662e-06, "loss": 1.4790360927581787, "step": 486 }, { "epoch": 0.30042323970757984, "grad_norm": 2.375, "learning_rate": 9.70310060672053e-06, "loss": 1.5163480043411255, "step": 488 }, { "epoch": 0.30165448249326665, "grad_norm": 1.6015625, "learning_rate": 9.700078641594224e-06, "loss": 1.3747470378875732, "step": 490 }, { "epoch": 0.30288572527895347, "grad_norm": 3.203125, "learning_rate": 9.697041973732907e-06, "loss": 1.3642088174819946, "step": 492 }, { "epoch": 0.3041169680646402, "grad_norm": 2.84375, "learning_rate": 9.693990615203169e-06, "loss": 1.56373929977417, "step": 494 }, { "epoch": 0.30534821085032704, "grad_norm": 1.8515625, "learning_rate": 9.69092457812997e-06, "loss": 1.4737236499786377, "step": 496 }, { "epoch": 0.30657945363601385, "grad_norm": 2.359375, "learning_rate": 9.687843874696601e-06, "loss": 1.704555869102478, "step": 498 }, { "epoch": 0.30781069642170067, "grad_norm": 3.515625, "learning_rate": 9.684748517144631e-06, "loss": 1.9481480121612549, "step": 500 }, { "epoch": 0.3090419392073875, "grad_norm": 3.484375, "learning_rate": 9.681638517773857e-06, "loss": 1.8212928771972656, "step": 502 }, { "epoch": 0.31027318199307424, "grad_norm": 12.6875, "learning_rate": 9.67851388894226e-06, "loss": 1.5932549238204956, "step": 504 }, { "epoch": 0.31150442477876106, "grad_norm": 3.015625, "learning_rate": 9.675374643065951e-06, "loss": 1.5155253410339355, "step": 506 }, { "epoch": 0.3127356675644479, "grad_norm": 3.84375, "learning_rate": 9.672220792619126e-06, "loss": 1.6778662204742432, "step": 508 }, { "epoch": 0.3139669103501347, "grad_norm": 3.8125, "learning_rate": 9.669052350134009e-06, "loss": 1.4104807376861572, "step": 510 }, { "epoch": 0.31519815313582145, "grad_norm": 2.796875, "learning_rate": 9.665869328200817e-06, "loss": 1.526164174079895, "step": 512 }, { "epoch": 0.31642939592150826, "grad_norm": 1.625, "learning_rate": 9.662671739467687e-06, "loss": 1.2319751977920532, "step": 514 }, { "epoch": 0.3176606387071951, "grad_norm": 2.9375, "learning_rate": 9.65945959664065e-06, "loss": 1.6358551979064941, "step": 516 }, { "epoch": 0.3188918814928819, "grad_norm": 5.625, "learning_rate": 9.656232912483566e-06, "loss": 1.5214580297470093, "step": 518 }, { "epoch": 0.3201231242785687, "grad_norm": 1.640625, "learning_rate": 9.652991699818075e-06, "loss": 1.448410987854004, "step": 520 }, { "epoch": 0.32135436706425546, "grad_norm": 1.9140625, "learning_rate": 9.64973597152355e-06, "loss": 1.494712471961975, "step": 522 }, { "epoch": 0.3225856098499423, "grad_norm": 1.6796875, "learning_rate": 9.646465740537044e-06, "loss": 1.4891849756240845, "step": 524 }, { "epoch": 0.3238168526356291, "grad_norm": 2.09375, "learning_rate": 9.643181019853237e-06, "loss": 1.5300544500350952, "step": 526 }, { "epoch": 0.3250480954213159, "grad_norm": 2.4375, "learning_rate": 9.639881822524385e-06, "loss": 1.5648609399795532, "step": 528 }, { "epoch": 0.32627933820700267, "grad_norm": 2.859375, "learning_rate": 9.636568161660271e-06, "loss": 1.7806546688079834, "step": 530 }, { "epoch": 0.3275105809926895, "grad_norm": 3.71875, "learning_rate": 9.63324005042815e-06, "loss": 1.717749834060669, "step": 532 }, { "epoch": 0.3287418237783763, "grad_norm": 0.83203125, "learning_rate": 9.629897502052697e-06, "loss": 1.0430840253829956, "step": 534 }, { "epoch": 0.3299730665640631, "grad_norm": 0.8828125, "learning_rate": 9.626540529815954e-06, "loss": 1.063704252243042, "step": 536 }, { "epoch": 0.3312043093497499, "grad_norm": 5.9375, "learning_rate": 9.62316914705728e-06, "loss": 2.059296131134033, "step": 538 }, { "epoch": 0.3324355521354367, "grad_norm": 4.71875, "learning_rate": 9.619783367173293e-06, "loss": 2.1116085052490234, "step": 540 }, { "epoch": 0.3336667949211235, "grad_norm": 0.92578125, "learning_rate": 9.61638320361782e-06, "loss": 1.157242774963379, "step": 542 }, { "epoch": 0.3348980377068103, "grad_norm": 1.4375, "learning_rate": 9.612968669901853e-06, "loss": 1.1667792797088623, "step": 544 }, { "epoch": 0.33612928049249713, "grad_norm": 2.9375, "learning_rate": 9.609539779593472e-06, "loss": 1.5730559825897217, "step": 546 }, { "epoch": 0.33736052327818394, "grad_norm": 3.359375, "learning_rate": 9.60609654631781e-06, "loss": 1.5219664573669434, "step": 548 }, { "epoch": 0.3385917660638707, "grad_norm": 1.6328125, "learning_rate": 9.602638983756993e-06, "loss": 1.00815749168396, "step": 550 }, { "epoch": 0.3398230088495575, "grad_norm": 0.78125, "learning_rate": 9.599167105650093e-06, "loss": 1.1170340776443481, "step": 552 }, { "epoch": 0.34105425163524433, "grad_norm": 2.15625, "learning_rate": 9.595680925793058e-06, "loss": 1.3734591007232666, "step": 554 }, { "epoch": 0.34228549442093115, "grad_norm": 3.15625, "learning_rate": 9.592180458038668e-06, "loss": 1.4375791549682617, "step": 556 }, { "epoch": 0.3435167372066179, "grad_norm": 3.109375, "learning_rate": 9.588665716296481e-06, "loss": 1.5281890630722046, "step": 558 }, { "epoch": 0.3447479799923047, "grad_norm": 2.21875, "learning_rate": 9.58513671453277e-06, "loss": 1.5165098905563354, "step": 560 }, { "epoch": 0.34597922277799154, "grad_norm": 4.8125, "learning_rate": 9.581593466770473e-06, "loss": 1.736721158027649, "step": 562 }, { "epoch": 0.34721046556367835, "grad_norm": 3.625, "learning_rate": 9.578035987089143e-06, "loss": 1.5587836503982544, "step": 564 }, { "epoch": 0.34844170834936516, "grad_norm": 2.171875, "learning_rate": 9.574464289624872e-06, "loss": 1.361844778060913, "step": 566 }, { "epoch": 0.3496729511350519, "grad_norm": 2.0625, "learning_rate": 9.570878388570262e-06, "loss": 1.1876953840255737, "step": 568 }, { "epoch": 0.35090419392073874, "grad_norm": 1.640625, "learning_rate": 9.567278298174348e-06, "loss": 1.1157175302505493, "step": 570 }, { "epoch": 0.35213543670642555, "grad_norm": 1.8671875, "learning_rate": 9.563664032742546e-06, "loss": 1.4032427072525024, "step": 572 }, { "epoch": 0.35336667949211237, "grad_norm": 3.609375, "learning_rate": 9.560035606636603e-06, "loss": 1.4978705644607544, "step": 574 }, { "epoch": 0.3545979222777991, "grad_norm": 2.8125, "learning_rate": 9.556393034274536e-06, "loss": 1.4692853689193726, "step": 576 }, { "epoch": 0.35582916506348594, "grad_norm": 2.6875, "learning_rate": 9.552736330130567e-06, "loss": 1.8530701398849487, "step": 578 }, { "epoch": 0.35706040784917276, "grad_norm": 5.15625, "learning_rate": 9.54906550873508e-06, "loss": 1.5222429037094116, "step": 580 }, { "epoch": 0.35829165063485957, "grad_norm": 4.78125, "learning_rate": 9.54538058467455e-06, "loss": 1.6488416194915771, "step": 582 }, { "epoch": 0.3595228934205464, "grad_norm": 2.953125, "learning_rate": 9.541681572591498e-06, "loss": 1.8949358463287354, "step": 584 }, { "epoch": 0.36075413620623314, "grad_norm": 6.1875, "learning_rate": 9.537968487184417e-06, "loss": 1.937645673751831, "step": 586 }, { "epoch": 0.36198537899191996, "grad_norm": 5.21875, "learning_rate": 9.534241343207726e-06, "loss": 1.7506264448165894, "step": 588 }, { "epoch": 0.3632166217776068, "grad_norm": 3.3125, "learning_rate": 9.530500155471706e-06, "loss": 1.7294695377349854, "step": 590 }, { "epoch": 0.3644478645632936, "grad_norm": 3.09375, "learning_rate": 9.526744938842452e-06, "loss": 1.734103798866272, "step": 592 }, { "epoch": 0.3656791073489804, "grad_norm": 3.484375, "learning_rate": 9.522975708241788e-06, "loss": 1.663370966911316, "step": 594 }, { "epoch": 0.36691035013466716, "grad_norm": 3.125, "learning_rate": 9.51919247864724e-06, "loss": 1.817090630531311, "step": 596 }, { "epoch": 0.368141592920354, "grad_norm": 6.4375, "learning_rate": 9.515395265091948e-06, "loss": 1.9766976833343506, "step": 598 }, { "epoch": 0.3693728357060408, "grad_norm": 5.0625, "learning_rate": 9.511584082664627e-06, "loss": 1.778980016708374, "step": 600 }, { "epoch": 0.3706040784917276, "grad_norm": 2.40625, "learning_rate": 9.5077589465095e-06, "loss": 1.6368603706359863, "step": 602 }, { "epoch": 0.37183532127741437, "grad_norm": 9.8125, "learning_rate": 9.503919871826231e-06, "loss": 1.3770142793655396, "step": 604 }, { "epoch": 0.3730665640631012, "grad_norm": 3.0625, "learning_rate": 9.500066873869873e-06, "loss": 1.151017189025879, "step": 606 }, { "epoch": 0.374297806848788, "grad_norm": 0.671875, "learning_rate": 9.496199967950808e-06, "loss": 1.1048874855041504, "step": 608 }, { "epoch": 0.3755290496344748, "grad_norm": 3.453125, "learning_rate": 9.492319169434678e-06, "loss": 1.4655290842056274, "step": 610 }, { "epoch": 0.3767602924201616, "grad_norm": 1.6953125, "learning_rate": 9.488424493742337e-06, "loss": 1.4659827947616577, "step": 612 }, { "epoch": 0.3779915352058484, "grad_norm": 2.578125, "learning_rate": 9.484515956349767e-06, "loss": 1.3192390203475952, "step": 614 }, { "epoch": 0.3792227779915352, "grad_norm": 2.375, "learning_rate": 9.480593572788048e-06, "loss": 1.4272172451019287, "step": 616 }, { "epoch": 0.380454020777222, "grad_norm": 3.25, "learning_rate": 9.476657358643268e-06, "loss": 1.3437293767929077, "step": 618 }, { "epoch": 0.3816852635629088, "grad_norm": 4.71875, "learning_rate": 9.472707329556478e-06, "loss": 1.0737183094024658, "step": 620 }, { "epoch": 0.3829165063485956, "grad_norm": 2.28125, "learning_rate": 9.468743501223626e-06, "loss": 1.581071376800537, "step": 622 }, { "epoch": 0.3841477491342824, "grad_norm": 2.6875, "learning_rate": 9.464765889395485e-06, "loss": 1.5359126329421997, "step": 624 }, { "epoch": 0.3853789919199692, "grad_norm": 2.078125, "learning_rate": 9.460774509877606e-06, "loss": 1.2157002687454224, "step": 626 }, { "epoch": 0.38661023470565603, "grad_norm": 1.328125, "learning_rate": 9.456769378530246e-06, "loss": 1.188981533050537, "step": 628 }, { "epoch": 0.38784147749134285, "grad_norm": 0.9921875, "learning_rate": 9.452750511268303e-06, "loss": 1.0613259077072144, "step": 630 }, { "epoch": 0.3890727202770296, "grad_norm": 2.921875, "learning_rate": 9.448717924061264e-06, "loss": 1.053981065750122, "step": 632 }, { "epoch": 0.3903039630627164, "grad_norm": 2.046875, "learning_rate": 9.444671632933124e-06, "loss": 1.4811919927597046, "step": 634 }, { "epoch": 0.39153520584840323, "grad_norm": 3.609375, "learning_rate": 9.44061165396234e-06, "loss": 1.3906296491622925, "step": 636 }, { "epoch": 0.39276644863409005, "grad_norm": 1.25, "learning_rate": 9.436538003281759e-06, "loss": 1.1125138998031616, "step": 638 }, { "epoch": 0.39399769141977686, "grad_norm": 1.546875, "learning_rate": 9.432450697078547e-06, "loss": 1.1720834970474243, "step": 640 }, { "epoch": 0.3952289342054636, "grad_norm": 3.015625, "learning_rate": 9.428349751594143e-06, "loss": 1.5647273063659668, "step": 642 }, { "epoch": 0.39646017699115044, "grad_norm": 2.4375, "learning_rate": 9.424235183124176e-06, "loss": 1.5186619758605957, "step": 644 }, { "epoch": 0.39769141977683725, "grad_norm": 2.40625, "learning_rate": 9.420107008018404e-06, "loss": 1.479695439338684, "step": 646 }, { "epoch": 0.39892266256252407, "grad_norm": 1.8828125, "learning_rate": 9.415965242680664e-06, "loss": 1.4690086841583252, "step": 648 }, { "epoch": 0.4001539053482108, "grad_norm": 0.98828125, "learning_rate": 9.41180990356879e-06, "loss": 1.0731900930404663, "step": 650 }, { "epoch": 0.40138514813389764, "grad_norm": 0.87109375, "learning_rate": 9.407641007194547e-06, "loss": 1.0590906143188477, "step": 652 }, { "epoch": 0.40261639091958445, "grad_norm": 3.46875, "learning_rate": 9.403458570123585e-06, "loss": 1.8852177858352661, "step": 654 }, { "epoch": 0.40384763370527127, "grad_norm": 4.5, "learning_rate": 9.399262608975343e-06, "loss": 1.837098479270935, "step": 656 }, { "epoch": 0.4050788764909581, "grad_norm": 0.6640625, "learning_rate": 9.395053140423013e-06, "loss": 1.0474339723587036, "step": 658 }, { "epoch": 0.40631011927664484, "grad_norm": 0.95703125, "learning_rate": 9.390830181193458e-06, "loss": 1.0683759450912476, "step": 660 }, { "epoch": 0.40754136206233166, "grad_norm": 2.453125, "learning_rate": 9.386593748067142e-06, "loss": 1.5274485349655151, "step": 662 }, { "epoch": 0.4087726048480185, "grad_norm": 2.546875, "learning_rate": 9.382343857878075e-06, "loss": 1.450246810913086, "step": 664 }, { "epoch": 0.4100038476337053, "grad_norm": 4.4375, "learning_rate": 9.378080527513738e-06, "loss": 1.6065733432769775, "step": 666 }, { "epoch": 0.41123509041939205, "grad_norm": 7.5, "learning_rate": 9.373803773915018e-06, "loss": 1.5195866823196411, "step": 668 }, { "epoch": 0.41246633320507886, "grad_norm": 0.8046875, "learning_rate": 9.369513614076142e-06, "loss": 1.011305570602417, "step": 670 }, { "epoch": 0.4136975759907657, "grad_norm": 1.359375, "learning_rate": 9.365210065044609e-06, "loss": 1.108022689819336, "step": 672 }, { "epoch": 0.4149288187764525, "grad_norm": 3.4375, "learning_rate": 9.360893143921121e-06, "loss": 1.8421379327774048, "step": 674 }, { "epoch": 0.4161600615621393, "grad_norm": 6.96875, "learning_rate": 9.356562867859511e-06, "loss": 1.377231478691101, "step": 676 }, { "epoch": 0.41739130434782606, "grad_norm": 1.390625, "learning_rate": 9.352219254066691e-06, "loss": 1.1049342155456543, "step": 678 }, { "epoch": 0.4186225471335129, "grad_norm": 0.84375, "learning_rate": 9.347862319802558e-06, "loss": 1.141373634338379, "step": 680 }, { "epoch": 0.4198537899191997, "grad_norm": 4.75, "learning_rate": 9.343492082379952e-06, "loss": 1.5442224740982056, "step": 682 }, { "epoch": 0.4210850327048865, "grad_norm": 4.03125, "learning_rate": 9.339108559164567e-06, "loss": 1.4725855588912964, "step": 684 }, { "epoch": 0.4223162754905733, "grad_norm": 1.828125, "learning_rate": 9.334711767574893e-06, "loss": 1.1119123697280884, "step": 686 }, { "epoch": 0.4235475182762601, "grad_norm": 1.4375, "learning_rate": 9.330301725082143e-06, "loss": 0.972973644733429, "step": 688 }, { "epoch": 0.4247787610619469, "grad_norm": 2.796875, "learning_rate": 9.325878449210181e-06, "loss": 1.7179160118103027, "step": 690 }, { "epoch": 0.4260100038476337, "grad_norm": 3.375, "learning_rate": 9.321441957535464e-06, "loss": 1.850766897201538, "step": 692 }, { "epoch": 0.4272412466333205, "grad_norm": 3.1875, "learning_rate": 9.316992267686955e-06, "loss": 1.4963725805282593, "step": 694 }, { "epoch": 0.4284724894190073, "grad_norm": 2.625, "learning_rate": 9.312529397346066e-06, "loss": 1.4499911069869995, "step": 696 }, { "epoch": 0.4297037322046941, "grad_norm": 15.8125, "learning_rate": 9.308053364246581e-06, "loss": 1.814996361732483, "step": 698 }, { "epoch": 0.4309349749903809, "grad_norm": 3.828125, "learning_rate": 9.303564186174593e-06, "loss": 1.6408932209014893, "step": 700 }, { "epoch": 0.43216621777606773, "grad_norm": 0.703125, "learning_rate": 9.299061880968416e-06, "loss": 0.9904305338859558, "step": 702 }, { "epoch": 0.43339746056175454, "grad_norm": 0.93359375, "learning_rate": 9.294546466518544e-06, "loss": 0.979654848575592, "step": 704 }, { "epoch": 0.4346287033474413, "grad_norm": 2.0625, "learning_rate": 9.290017960767545e-06, "loss": 1.3709770441055298, "step": 706 }, { "epoch": 0.4358599461331281, "grad_norm": 1.8359375, "learning_rate": 9.285476381710021e-06, "loss": 1.4823150634765625, "step": 708 }, { "epoch": 0.43709118891881493, "grad_norm": 4.34375, "learning_rate": 9.280921747392515e-06, "loss": 1.5808249711990356, "step": 710 }, { "epoch": 0.43832243170450175, "grad_norm": 3.0625, "learning_rate": 9.276354075913445e-06, "loss": 1.526861310005188, "step": 712 }, { "epoch": 0.43955367449018856, "grad_norm": 2.421875, "learning_rate": 9.271773385423042e-06, "loss": 1.6235942840576172, "step": 714 }, { "epoch": 0.4407849172758753, "grad_norm": 2.3125, "learning_rate": 9.267179694123259e-06, "loss": 1.514561414718628, "step": 716 }, { "epoch": 0.44201616006156214, "grad_norm": 3.25, "learning_rate": 9.26257302026772e-06, "loss": 0.8033193349838257, "step": 718 }, { "epoch": 0.44324740284724895, "grad_norm": 2.546875, "learning_rate": 9.257953382161628e-06, "loss": 0.6848942041397095, "step": 720 }, { "epoch": 0.44447864563293576, "grad_norm": 2.625, "learning_rate": 9.253320798161709e-06, "loss": 1.8998783826828003, "step": 722 }, { "epoch": 0.4457098884186225, "grad_norm": 3.078125, "learning_rate": 9.248675286676126e-06, "loss": 2.021900177001953, "step": 724 }, { "epoch": 0.44694113120430934, "grad_norm": 3.0, "learning_rate": 9.244016866164406e-06, "loss": 1.5490355491638184, "step": 726 }, { "epoch": 0.44817237398999615, "grad_norm": 2.671875, "learning_rate": 9.239345555137387e-06, "loss": 1.566870093345642, "step": 728 }, { "epoch": 0.44940361677568297, "grad_norm": 4.15625, "learning_rate": 9.234661372157114e-06, "loss": 1.9461112022399902, "step": 730 }, { "epoch": 0.4506348595613698, "grad_norm": 3.53125, "learning_rate": 9.22996433583679e-06, "loss": 1.6727873086929321, "step": 732 }, { "epoch": 0.45186610234705654, "grad_norm": 3.75, "learning_rate": 9.225254464840686e-06, "loss": 1.6863263845443726, "step": 734 }, { "epoch": 0.45309734513274336, "grad_norm": 5.71875, "learning_rate": 9.220531777884077e-06, "loss": 1.6813945770263672, "step": 736 }, { "epoch": 0.45432858791843017, "grad_norm": 1.765625, "learning_rate": 9.215796293733162e-06, "loss": 1.3949999809265137, "step": 738 }, { "epoch": 0.455559830704117, "grad_norm": 3.234375, "learning_rate": 9.21104803120499e-06, "loss": 1.4089975357055664, "step": 740 }, { "epoch": 0.45679107348980375, "grad_norm": 2.4375, "learning_rate": 9.206287009167393e-06, "loss": 1.4907400608062744, "step": 742 }, { "epoch": 0.45802231627549056, "grad_norm": 2.015625, "learning_rate": 9.201513246538901e-06, "loss": 1.4012898206710815, "step": 744 }, { "epoch": 0.4592535590611774, "grad_norm": 1.921875, "learning_rate": 9.196726762288662e-06, "loss": 1.4157438278198242, "step": 746 }, { "epoch": 0.4604848018468642, "grad_norm": 1.984375, "learning_rate": 9.191927575436388e-06, "loss": 1.4142546653747559, "step": 748 }, { "epoch": 0.461716044632551, "grad_norm": 4.4375, "learning_rate": 9.187115705052261e-06, "loss": 0.5696741342544556, "step": 750 }, { "epoch": 0.46294728741823776, "grad_norm": 6.78125, "learning_rate": 9.18229117025686e-06, "loss": 0.30877745151519775, "step": 752 }, { "epoch": 0.4641785302039246, "grad_norm": 3.890625, "learning_rate": 9.177453990221092e-06, "loss": 1.320806860923767, "step": 754 }, { "epoch": 0.4654097729896114, "grad_norm": 2.015625, "learning_rate": 9.17260418416611e-06, "loss": 1.4618957042694092, "step": 756 }, { "epoch": 0.4666410157752982, "grad_norm": 11.4375, "learning_rate": 9.167741771363234e-06, "loss": 0.4812394082546234, "step": 758 }, { "epoch": 0.467872258560985, "grad_norm": 5.46875, "learning_rate": 9.162866771133888e-06, "loss": 0.23717719316482544, "step": 760 }, { "epoch": 0.4691035013466718, "grad_norm": 2.765625, "learning_rate": 9.157979202849505e-06, "loss": 1.6795170307159424, "step": 762 }, { "epoch": 0.4703347441323586, "grad_norm": 16.5, "learning_rate": 9.15307908593146e-06, "loss": 1.8667967319488525, "step": 764 }, { "epoch": 0.4715659869180454, "grad_norm": 3.6875, "learning_rate": 9.148166439850996e-06, "loss": 1.3985189199447632, "step": 766 }, { "epoch": 0.4727972297037322, "grad_norm": 2.453125, "learning_rate": 9.143241284129136e-06, "loss": 1.398207664489746, "step": 768 }, { "epoch": 0.474028472489419, "grad_norm": 4.34375, "learning_rate": 9.138303638336623e-06, "loss": 1.7506206035614014, "step": 770 }, { "epoch": 0.4752597152751058, "grad_norm": 3.09375, "learning_rate": 9.133353522093815e-06, "loss": 1.8263590335845947, "step": 772 }, { "epoch": 0.4764909580607926, "grad_norm": 2.78125, "learning_rate": 9.128390955070634e-06, "loss": 1.791210412979126, "step": 774 }, { "epoch": 0.47772220084647943, "grad_norm": 4.90625, "learning_rate": 9.123415956986475e-06, "loss": 1.8291805982589722, "step": 776 }, { "epoch": 0.47895344363216624, "grad_norm": 1.484375, "learning_rate": 9.118428547610125e-06, "loss": 1.1543712615966797, "step": 778 }, { "epoch": 0.480184686417853, "grad_norm": 1.3984375, "learning_rate": 9.113428746759696e-06, "loss": 1.1177196502685547, "step": 780 }, { "epoch": 0.4814159292035398, "grad_norm": 2.578125, "learning_rate": 9.108416574302534e-06, "loss": 1.8869653940200806, "step": 782 }, { "epoch": 0.48264717198922663, "grad_norm": 3.53125, "learning_rate": 9.103392050155145e-06, "loss": 1.6388249397277832, "step": 784 }, { "epoch": 0.48387841477491345, "grad_norm": 1.75, "learning_rate": 9.09835519428312e-06, "loss": 1.3413951396942139, "step": 786 }, { "epoch": 0.4851096575606002, "grad_norm": 1.9296875, "learning_rate": 9.093306026701043e-06, "loss": 1.431657075881958, "step": 788 }, { "epoch": 0.486340900346287, "grad_norm": 2.765625, "learning_rate": 9.088244567472433e-06, "loss": 1.496319055557251, "step": 790 }, { "epoch": 0.48757214313197383, "grad_norm": 1.984375, "learning_rate": 9.083170836709643e-06, "loss": 1.4392622709274292, "step": 792 }, { "epoch": 0.48880338591766065, "grad_norm": 5.15625, "learning_rate": 9.078084854573788e-06, "loss": 0.24732553958892822, "step": 794 }, { "epoch": 0.49003462870334746, "grad_norm": 3.015625, "learning_rate": 9.072986641274668e-06, "loss": 0.38419798016548157, "step": 796 }, { "epoch": 0.4912658714890342, "grad_norm": 3.140625, "learning_rate": 9.067876217070686e-06, "loss": 1.5040916204452515, "step": 798 }, { "epoch": 0.49249711427472104, "grad_norm": 2.734375, "learning_rate": 9.062753602268766e-06, "loss": 1.4809836149215698, "step": 800 }, { "epoch": 0.49372835706040785, "grad_norm": 3.203125, "learning_rate": 9.057618817224268e-06, "loss": 1.7967525720596313, "step": 802 }, { "epoch": 0.49495959984609467, "grad_norm": 3.203125, "learning_rate": 9.05247188234092e-06, "loss": 1.9494469165802002, "step": 804 }, { "epoch": 0.4961908426317815, "grad_norm": 1.7578125, "learning_rate": 9.047312818070726e-06, "loss": 1.2794251441955566, "step": 806 }, { "epoch": 0.49742208541746824, "grad_norm": 1.953125, "learning_rate": 9.04214164491388e-06, "loss": 1.5466313362121582, "step": 808 }, { "epoch": 0.49865332820315506, "grad_norm": 1.90625, "learning_rate": 9.036958383418708e-06, "loss": 1.1338438987731934, "step": 810 }, { "epoch": 0.49988457098884187, "grad_norm": 0.9765625, "learning_rate": 9.031763054181554e-06, "loss": 1.1572006940841675, "step": 812 }, { "epoch": 0.5011158137745286, "grad_norm": 4.4375, "learning_rate": 9.026555677846726e-06, "loss": 1.489051103591919, "step": 814 }, { "epoch": 0.5023470565602155, "grad_norm": 3.5, "learning_rate": 9.021336275106397e-06, "loss": 1.4597687721252441, "step": 816 }, { "epoch": 0.5035782993459023, "grad_norm": 1.3046875, "learning_rate": 9.016104866700535e-06, "loss": 0.971706211566925, "step": 818 }, { "epoch": 0.504809542131589, "grad_norm": 1.9453125, "learning_rate": 9.010861473416803e-06, "loss": 1.1148239374160767, "step": 820 }, { "epoch": 0.5060407849172759, "grad_norm": 3.125, "learning_rate": 9.005606116090499e-06, "loss": 1.4972327947616577, "step": 822 }, { "epoch": 0.5072720277029626, "grad_norm": 4.125, "learning_rate": 9.000338815604452e-06, "loss": 1.5357601642608643, "step": 824 }, { "epoch": 0.5085032704886495, "grad_norm": 1.3046875, "learning_rate": 8.995059592888957e-06, "loss": 1.3044366836547852, "step": 826 }, { "epoch": 0.5097345132743363, "grad_norm": 1.5, "learning_rate": 8.989768468921675e-06, "loss": 1.261732816696167, "step": 828 }, { "epoch": 0.510965756060023, "grad_norm": 4.75, "learning_rate": 8.984465464727567e-06, "loss": 1.6821751594543457, "step": 830 }, { "epoch": 0.5121969988457099, "grad_norm": 3.578125, "learning_rate": 8.979150601378798e-06, "loss": 2.098515033721924, "step": 832 }, { "epoch": 0.5134282416313967, "grad_norm": 2.40625, "learning_rate": 8.973823899994653e-06, "loss": 1.4232302904129028, "step": 834 }, { "epoch": 0.5146594844170835, "grad_norm": 1.171875, "learning_rate": 8.968485381741464e-06, "loss": 1.1819924116134644, "step": 836 }, { "epoch": 0.5158907272027703, "grad_norm": 1.9453125, "learning_rate": 8.963135067832509e-06, "loss": 1.5029240846633911, "step": 838 }, { "epoch": 0.517121969988457, "grad_norm": 3.125, "learning_rate": 8.95777297952795e-06, "loss": 1.506210207939148, "step": 840 }, { "epoch": 0.5183532127741439, "grad_norm": 1.7734375, "learning_rate": 8.952399138134724e-06, "loss": 1.4535478353500366, "step": 842 }, { "epoch": 0.5195844555598307, "grad_norm": 2.765625, "learning_rate": 8.947013565006482e-06, "loss": 1.3872270584106445, "step": 844 }, { "epoch": 0.5208156983455176, "grad_norm": 0.71484375, "learning_rate": 8.941616281543484e-06, "loss": 1.1330012083053589, "step": 846 }, { "epoch": 0.5220469411312043, "grad_norm": 1.59375, "learning_rate": 8.936207309192522e-06, "loss": 1.159617304801941, "step": 848 }, { "epoch": 0.5232781839168911, "grad_norm": 6.84375, "learning_rate": 8.930786669446843e-06, "loss": 1.4233540296554565, "step": 850 }, { "epoch": 0.5245094267025779, "grad_norm": 6.9375, "learning_rate": 8.925354383846048e-06, "loss": 1.1373634338378906, "step": 852 }, { "epoch": 0.5257406694882647, "grad_norm": 3.1875, "learning_rate": 8.919910473976022e-06, "loss": 1.413563847541809, "step": 854 }, { "epoch": 0.5269719122739516, "grad_norm": 2.234375, "learning_rate": 8.914454961468828e-06, "loss": 1.413554310798645, "step": 856 }, { "epoch": 0.5282031550596383, "grad_norm": 0.765625, "learning_rate": 8.90898786800265e-06, "loss": 1.1362459659576416, "step": 858 }, { "epoch": 0.5294343978453251, "grad_norm": 1.1015625, "learning_rate": 8.903509215301677e-06, "loss": 1.0461921691894531, "step": 860 }, { "epoch": 0.530665640631012, "grad_norm": 3.359375, "learning_rate": 8.89801902513604e-06, "loss": 1.4780032634735107, "step": 862 }, { "epoch": 0.5318968834166987, "grad_norm": 2.171875, "learning_rate": 8.892517319321705e-06, "loss": 1.5155971050262451, "step": 864 }, { "epoch": 0.5331281262023856, "grad_norm": 2.109375, "learning_rate": 8.887004119720408e-06, "loss": 1.4275978803634644, "step": 866 }, { "epoch": 0.5343593689880723, "grad_norm": 3.828125, "learning_rate": 8.881479448239546e-06, "loss": 1.4534516334533691, "step": 868 }, { "epoch": 0.5355906117737591, "grad_norm": 1.1171875, "learning_rate": 8.875943326832113e-06, "loss": 1.1164848804473877, "step": 870 }, { "epoch": 0.536821854559446, "grad_norm": 20.875, "learning_rate": 8.87039577749659e-06, "loss": 1.2804282903671265, "step": 872 }, { "epoch": 0.5380530973451327, "grad_norm": 3.828125, "learning_rate": 8.864836822276872e-06, "loss": 1.0548572540283203, "step": 874 }, { "epoch": 0.5392843401308195, "grad_norm": 1.953125, "learning_rate": 8.859266483262183e-06, "loss": 1.043743371963501, "step": 876 }, { "epoch": 0.5405155829165064, "grad_norm": 1.2578125, "learning_rate": 8.853684782586971e-06, "loss": 1.0042893886566162, "step": 878 }, { "epoch": 0.5417468257021931, "grad_norm": 0.65234375, "learning_rate": 8.848091742430837e-06, "loss": 1.0357824563980103, "step": 880 }, { "epoch": 0.54297806848788, "grad_norm": 2.28125, "learning_rate": 8.842487385018443e-06, "loss": 1.8888530731201172, "step": 882 }, { "epoch": 0.5442093112735668, "grad_norm": 3.171875, "learning_rate": 8.836871732619419e-06, "loss": 1.7589383125305176, "step": 884 }, { "epoch": 0.5454405540592535, "grad_norm": 2.375, "learning_rate": 8.831244807548274e-06, "loss": 1.6239700317382812, "step": 886 }, { "epoch": 0.5466717968449404, "grad_norm": 2.625, "learning_rate": 8.825606632164314e-06, "loss": 1.501517653465271, "step": 888 }, { "epoch": 0.5479030396306271, "grad_norm": 2.046875, "learning_rate": 8.819957228871553e-06, "loss": 1.0660182237625122, "step": 890 }, { "epoch": 0.549134282416314, "grad_norm": 1.921875, "learning_rate": 8.81429662011861e-06, "loss": 1.187867283821106, "step": 892 }, { "epoch": 0.5503655252020008, "grad_norm": 2.265625, "learning_rate": 8.80862482839864e-06, "loss": 1.75932776927948, "step": 894 }, { "epoch": 0.5515967679876875, "grad_norm": 3.625, "learning_rate": 8.802941876249233e-06, "loss": 1.7781065702438354, "step": 896 }, { "epoch": 0.5528280107733744, "grad_norm": 1.703125, "learning_rate": 8.797247786252322e-06, "loss": 1.0148627758026123, "step": 898 }, { "epoch": 0.5540592535590612, "grad_norm": 1.21875, "learning_rate": 8.791542581034107e-06, "loss": 0.9595763087272644, "step": 900 }, { "epoch": 0.555290496344748, "grad_norm": 2.25, "learning_rate": 8.785826283264942e-06, "loss": 1.3790762424468994, "step": 902 }, { "epoch": 0.5565217391304348, "grad_norm": 3.375, "learning_rate": 8.780098915659272e-06, "loss": 1.5391640663146973, "step": 904 }, { "epoch": 0.5577529819161215, "grad_norm": 4.46875, "learning_rate": 8.774360500975518e-06, "loss": 1.2881464958190918, "step": 906 }, { "epoch": 0.5589842247018084, "grad_norm": 1.8828125, "learning_rate": 8.768611062016008e-06, "loss": 1.4236103296279907, "step": 908 }, { "epoch": 0.5602154674874952, "grad_norm": 1.3671875, "learning_rate": 8.76285062162687e-06, "loss": 1.1012616157531738, "step": 910 }, { "epoch": 0.561446710273182, "grad_norm": 3.609375, "learning_rate": 8.757079202697951e-06, "loss": 0.9929218292236328, "step": 912 }, { "epoch": 0.5626779530588688, "grad_norm": 4.25, "learning_rate": 8.751296828162721e-06, "loss": 1.483315348625183, "step": 914 }, { "epoch": 0.5639091958445556, "grad_norm": 4.84375, "learning_rate": 8.745503520998181e-06, "loss": 1.8858379125595093, "step": 916 }, { "epoch": 0.5651404386302424, "grad_norm": 2.15625, "learning_rate": 8.739699304224781e-06, "loss": 1.0241905450820923, "step": 918 }, { "epoch": 0.5663716814159292, "grad_norm": 0.859375, "learning_rate": 8.733884200906312e-06, "loss": 1.160780429840088, "step": 920 }, { "epoch": 0.567602924201616, "grad_norm": 1.0390625, "learning_rate": 8.728058234149836e-06, "loss": 1.3259217739105225, "step": 922 }, { "epoch": 0.5688341669873028, "grad_norm": 0.828125, "learning_rate": 8.722221427105573e-06, "loss": 1.1867862939834595, "step": 924 }, { "epoch": 0.5700654097729896, "grad_norm": 3.984375, "learning_rate": 8.71637380296682e-06, "loss": 1.7894140481948853, "step": 926 }, { "epoch": 0.5712966525586765, "grad_norm": 4.0625, "learning_rate": 8.71051538496986e-06, "loss": 1.620642066001892, "step": 928 }, { "epoch": 0.5725278953443632, "grad_norm": 3.109375, "learning_rate": 8.704646196393864e-06, "loss": 1.5504050254821777, "step": 930 }, { "epoch": 0.57375913813005, "grad_norm": 4.125, "learning_rate": 8.698766260560803e-06, "loss": 1.5462700128555298, "step": 932 }, { "epoch": 0.5749903809157368, "grad_norm": 2.609375, "learning_rate": 8.692875600835355e-06, "loss": 1.4665104150772095, "step": 934 }, { "epoch": 0.5762216237014236, "grad_norm": 2.765625, "learning_rate": 8.686974240624803e-06, "loss": 1.4654189348220825, "step": 936 }, { "epoch": 0.5774528664871105, "grad_norm": 2.9375, "learning_rate": 8.681062203378963e-06, "loss": 1.7840183973312378, "step": 938 }, { "epoch": 0.5786841092727972, "grad_norm": 2.875, "learning_rate": 8.675139512590063e-06, "loss": 1.550964593887329, "step": 940 }, { "epoch": 0.579915352058484, "grad_norm": 1.0859375, "learning_rate": 8.669206191792676e-06, "loss": 1.1342413425445557, "step": 942 }, { "epoch": 0.5811465948441709, "grad_norm": 1.2578125, "learning_rate": 8.663262264563607e-06, "loss": 1.0562883615493774, "step": 944 }, { "epoch": 0.5823778376298576, "grad_norm": 2.71875, "learning_rate": 8.657307754521811e-06, "loss": 1.5223665237426758, "step": 946 }, { "epoch": 0.5836090804155445, "grad_norm": 2.5, "learning_rate": 8.651342685328294e-06, "loss": 1.2530782222747803, "step": 948 }, { "epoch": 0.5848403232012312, "grad_norm": 2.15625, "learning_rate": 8.645367080686022e-06, "loss": 1.4140348434448242, "step": 950 }, { "epoch": 0.586071565986918, "grad_norm": 1.65625, "learning_rate": 8.63938096433982e-06, "loss": 1.4089921712875366, "step": 952 }, { "epoch": 0.5873028087726049, "grad_norm": 2.703125, "learning_rate": 8.633384360076288e-06, "loss": 1.5375595092773438, "step": 954 }, { "epoch": 0.5885340515582916, "grad_norm": 3.125, "learning_rate": 8.6273772917237e-06, "loss": 1.456397533416748, "step": 956 }, { "epoch": 0.5897652943439785, "grad_norm": 2.765625, "learning_rate": 8.621359783151906e-06, "loss": 0.9872013330459595, "step": 958 }, { "epoch": 0.5909965371296653, "grad_norm": 1.0, "learning_rate": 8.615331858272245e-06, "loss": 1.005414605140686, "step": 960 }, { "epoch": 0.592227779915352, "grad_norm": 1.5, "learning_rate": 8.609293541037448e-06, "loss": 1.4516929388046265, "step": 962 }, { "epoch": 0.5934590227010389, "grad_norm": 4.21875, "learning_rate": 8.603244855441541e-06, "loss": 1.4895005226135254, "step": 964 }, { "epoch": 0.5946902654867257, "grad_norm": 1.9765625, "learning_rate": 8.597185825519746e-06, "loss": 1.4036403894424438, "step": 966 }, { "epoch": 0.5959215082724124, "grad_norm": 2.96875, "learning_rate": 8.591116475348393e-06, "loss": 1.4434735774993896, "step": 968 }, { "epoch": 0.5971527510580993, "grad_norm": 2.109375, "learning_rate": 8.585036829044819e-06, "loss": 1.4209600687026978, "step": 970 }, { "epoch": 0.598383993843786, "grad_norm": 2.703125, "learning_rate": 8.578946910767277e-06, "loss": 1.5273462533950806, "step": 972 }, { "epoch": 0.5996152366294729, "grad_norm": 1.53125, "learning_rate": 8.572846744714833e-06, "loss": 1.4886844158172607, "step": 974 }, { "epoch": 0.6008464794151597, "grad_norm": 3.46875, "learning_rate": 8.566736355127278e-06, "loss": 1.7636457681655884, "step": 976 }, { "epoch": 0.6020777222008464, "grad_norm": 2.59375, "learning_rate": 8.560615766285025e-06, "loss": 1.7612440586090088, "step": 978 }, { "epoch": 0.6033089649865333, "grad_norm": 11.3125, "learning_rate": 8.554485002509015e-06, "loss": 1.7582465410232544, "step": 980 }, { "epoch": 0.6045402077722201, "grad_norm": 1.875, "learning_rate": 8.54834408816062e-06, "loss": 1.5206481218338013, "step": 982 }, { "epoch": 0.6057714505579069, "grad_norm": 3.21875, "learning_rate": 8.542193047641548e-06, "loss": 1.4135831594467163, "step": 984 }, { "epoch": 0.6070026933435937, "grad_norm": 1.7421875, "learning_rate": 8.536031905393742e-06, "loss": 1.2139555215835571, "step": 986 }, { "epoch": 0.6082339361292805, "grad_norm": 2.828125, "learning_rate": 8.529860685899291e-06, "loss": 1.0709203481674194, "step": 988 }, { "epoch": 0.6094651789149673, "grad_norm": 3.15625, "learning_rate": 8.523679413680324e-06, "loss": 1.780793309211731, "step": 990 }, { "epoch": 0.6106964217006541, "grad_norm": 4.09375, "learning_rate": 8.51748811329891e-06, "loss": 1.6577130556106567, "step": 992 }, { "epoch": 0.611927664486341, "grad_norm": 2.703125, "learning_rate": 8.51128680935698e-06, "loss": 1.820251703262329, "step": 994 }, { "epoch": 0.6131589072720277, "grad_norm": 4.15625, "learning_rate": 8.5050755264962e-06, "loss": 1.7748041152954102, "step": 996 }, { "epoch": 0.6143901500577145, "grad_norm": 2.90625, "learning_rate": 8.4988542893979e-06, "loss": 1.4135329723358154, "step": 998 }, { "epoch": 0.6156213928434013, "grad_norm": 3.234375, "learning_rate": 8.492623122782957e-06, "loss": 1.8253610134124756, "step": 1000 }, { "epoch": 0.6168526356290881, "grad_norm": 2.109375, "learning_rate": 8.48638205141171e-06, "loss": 1.2383848428726196, "step": 1002 }, { "epoch": 0.618083878414775, "grad_norm": 1.703125, "learning_rate": 8.480131100083853e-06, "loss": 1.306466817855835, "step": 1004 }, { "epoch": 0.6193151212004617, "grad_norm": 2.34375, "learning_rate": 8.473870293638335e-06, "loss": 1.4259259700775146, "step": 1006 }, { "epoch": 0.6205463639861485, "grad_norm": 1.9375, "learning_rate": 8.467599656953276e-06, "loss": 1.386904001235962, "step": 1008 }, { "epoch": 0.6217776067718354, "grad_norm": 1.4296875, "learning_rate": 8.461319214945847e-06, "loss": 1.241986632347107, "step": 1010 }, { "epoch": 0.6230088495575221, "grad_norm": 1.5703125, "learning_rate": 8.455028992572189e-06, "loss": 1.3007842302322388, "step": 1012 }, { "epoch": 0.6242400923432089, "grad_norm": 1.296875, "learning_rate": 8.448729014827305e-06, "loss": 1.1606676578521729, "step": 1014 }, { "epoch": 0.6254713351288957, "grad_norm": 1.015625, "learning_rate": 8.442419306744958e-06, "loss": 1.0195151567459106, "step": 1016 }, { "epoch": 0.6267025779145825, "grad_norm": 1.1328125, "learning_rate": 8.436099893397582e-06, "loss": 1.0201420783996582, "step": 1018 }, { "epoch": 0.6279338207002694, "grad_norm": 1.8984375, "learning_rate": 8.429770799896168e-06, "loss": 1.2197397947311401, "step": 1020 }, { "epoch": 0.6291650634859561, "grad_norm": 1.9453125, "learning_rate": 8.423432051390184e-06, "loss": 1.1003873348236084, "step": 1022 }, { "epoch": 0.6303963062716429, "grad_norm": 2.53125, "learning_rate": 8.417083673067452e-06, "loss": 1.231971025466919, "step": 1024 }, { "epoch": 0.6316275490573298, "grad_norm": 4.1875, "learning_rate": 8.410725690154067e-06, "loss": 1.5416268110275269, "step": 1026 }, { "epoch": 0.6328587918430165, "grad_norm": 2.53125, "learning_rate": 8.404358127914281e-06, "loss": 1.7177008390426636, "step": 1028 }, { "epoch": 0.6340900346287034, "grad_norm": 1.296875, "learning_rate": 8.39798101165042e-06, "loss": 1.1199826002120972, "step": 1030 }, { "epoch": 0.6353212774143902, "grad_norm": 1.4296875, "learning_rate": 8.391594366702772e-06, "loss": 1.1079318523406982, "step": 1032 }, { "epoch": 0.6365525202000769, "grad_norm": 1.234375, "learning_rate": 8.385198218449479e-06, "loss": 1.0160592794418335, "step": 1034 }, { "epoch": 0.6377837629857638, "grad_norm": 1.4453125, "learning_rate": 8.37879259230646e-06, "loss": 0.9635307788848877, "step": 1036 }, { "epoch": 0.6390150057714505, "grad_norm": 1.0546875, "learning_rate": 8.372377513727283e-06, "loss": 1.0464245080947876, "step": 1038 }, { "epoch": 0.6402462485571374, "grad_norm": 0.8203125, "learning_rate": 8.365953008203088e-06, "loss": 0.9715243577957153, "step": 1040 }, { "epoch": 0.6414774913428242, "grad_norm": 4.40625, "learning_rate": 8.359519101262464e-06, "loss": 1.554203748703003, "step": 1042 }, { "epoch": 0.6427087341285109, "grad_norm": 3.34375, "learning_rate": 8.353075818471362e-06, "loss": 1.7830839157104492, "step": 1044 }, { "epoch": 0.6439399769141978, "grad_norm": 2.890625, "learning_rate": 8.34662318543299e-06, "loss": 0.3349166512489319, "step": 1046 }, { "epoch": 0.6451712196998846, "grad_norm": 2.8125, "learning_rate": 8.340161227787709e-06, "loss": 0.26277410984039307, "step": 1048 }, { "epoch": 0.6464024624855714, "grad_norm": 2.609375, "learning_rate": 8.333689971212932e-06, "loss": 1.475071907043457, "step": 1050 }, { "epoch": 0.6476337052712582, "grad_norm": 12.5625, "learning_rate": 8.327209441423025e-06, "loss": 1.4347758293151855, "step": 1052 }, { "epoch": 0.648864948056945, "grad_norm": 4.21875, "learning_rate": 8.320719664169203e-06, "loss": 1.9238454103469849, "step": 1054 }, { "epoch": 0.6500961908426318, "grad_norm": 5.03125, "learning_rate": 8.314220665239418e-06, "loss": 1.5550142526626587, "step": 1056 }, { "epoch": 0.6513274336283186, "grad_norm": 5.9375, "learning_rate": 8.30771247045828e-06, "loss": 1.9398154020309448, "step": 1058 }, { "epoch": 0.6525586764140053, "grad_norm": 2.765625, "learning_rate": 8.301195105686927e-06, "loss": 1.9076368808746338, "step": 1060 }, { "epoch": 0.6537899191996922, "grad_norm": 2.625, "learning_rate": 8.294668596822941e-06, "loss": 1.4107842445373535, "step": 1062 }, { "epoch": 0.655021161985379, "grad_norm": 5.3125, "learning_rate": 8.28813296980024e-06, "loss": 1.4180490970611572, "step": 1064 }, { "epoch": 0.6562524047710658, "grad_norm": 2.40625, "learning_rate": 8.28158825058897e-06, "loss": 1.4826152324676514, "step": 1066 }, { "epoch": 0.6574836475567526, "grad_norm": 2.484375, "learning_rate": 8.275034465195413e-06, "loss": 1.4656550884246826, "step": 1068 }, { "epoch": 0.6587148903424394, "grad_norm": 2.6875, "learning_rate": 8.268471639661868e-06, "loss": 1.479034662246704, "step": 1070 }, { "epoch": 0.6599461331281262, "grad_norm": 1.8828125, "learning_rate": 8.261899800066561e-06, "loss": 1.4503613710403442, "step": 1072 }, { "epoch": 0.661177375913813, "grad_norm": 2.21875, "learning_rate": 8.255318972523538e-06, "loss": 1.4242353439331055, "step": 1074 }, { "epoch": 0.6624086186994999, "grad_norm": 17.5, "learning_rate": 8.248729183182556e-06, "loss": 1.3858964443206787, "step": 1076 }, { "epoch": 0.6636398614851866, "grad_norm": 7.46875, "learning_rate": 8.242130458228986e-06, "loss": 1.691213607788086, "step": 1078 }, { "epoch": 0.6648711042708734, "grad_norm": 5.125, "learning_rate": 8.235522823883702e-06, "loss": 1.9530844688415527, "step": 1080 }, { "epoch": 0.6661023470565602, "grad_norm": 1.6328125, "learning_rate": 8.228906306402984e-06, "loss": 1.4718236923217773, "step": 1082 }, { "epoch": 0.667333589842247, "grad_norm": 2.15625, "learning_rate": 8.22228093207841e-06, "loss": 1.478079080581665, "step": 1084 }, { "epoch": 0.6685648326279339, "grad_norm": 2.0, "learning_rate": 8.21564672723675e-06, "loss": 1.423277497291565, "step": 1086 }, { "epoch": 0.6697960754136206, "grad_norm": 2.328125, "learning_rate": 8.209003718239865e-06, "loss": 1.4079033136367798, "step": 1088 }, { "epoch": 0.6710273181993074, "grad_norm": 2.5625, "learning_rate": 8.2023519314846e-06, "loss": 1.510083794593811, "step": 1090 }, { "epoch": 0.6722585609849943, "grad_norm": 1.96875, "learning_rate": 8.195691393402676e-06, "loss": 1.4157885313034058, "step": 1092 }, { "epoch": 0.673489803770681, "grad_norm": 2.25, "learning_rate": 8.189022130460595e-06, "loss": 1.2089054584503174, "step": 1094 }, { "epoch": 0.6747210465563679, "grad_norm": 2.0625, "learning_rate": 8.182344169159527e-06, "loss": 1.254252552986145, "step": 1096 }, { "epoch": 0.6759522893420546, "grad_norm": 2.890625, "learning_rate": 8.175657536035195e-06, "loss": 1.513046383857727, "step": 1098 }, { "epoch": 0.6771835321277414, "grad_norm": 3.359375, "learning_rate": 8.1689622576578e-06, "loss": 1.5234147310256958, "step": 1100 }, { "epoch": 0.6784147749134283, "grad_norm": 0.78515625, "learning_rate": 8.16225836063188e-06, "loss": 1.077025294303894, "step": 1102 }, { "epoch": 0.679646017699115, "grad_norm": 2.25, "learning_rate": 8.155545871596228e-06, "loss": 1.1133958101272583, "step": 1104 }, { "epoch": 0.6808772604848018, "grad_norm": 1.25, "learning_rate": 8.148824817223775e-06, "loss": 1.2056477069854736, "step": 1106 }, { "epoch": 0.6821085032704887, "grad_norm": 1.0234375, "learning_rate": 8.14209522422149e-06, "loss": 1.15602707862854, "step": 1108 }, { "epoch": 0.6833397460561754, "grad_norm": 3.734375, "learning_rate": 8.13535711933027e-06, "loss": 1.6869274377822876, "step": 1110 }, { "epoch": 0.6845709888418623, "grad_norm": 3.703125, "learning_rate": 8.128610529324837e-06, "loss": 1.7017213106155396, "step": 1112 }, { "epoch": 0.685802231627549, "grad_norm": 10.0, "learning_rate": 8.121855481013624e-06, "loss": 1.94403076171875, "step": 1114 }, { "epoch": 0.6870334744132358, "grad_norm": 2.9375, "learning_rate": 8.11509200123868e-06, "loss": 1.6434037685394287, "step": 1116 }, { "epoch": 0.6882647171989227, "grad_norm": 2.03125, "learning_rate": 8.108320116875557e-06, "loss": 1.0867962837219238, "step": 1118 }, { "epoch": 0.6894959599846094, "grad_norm": 1.0625, "learning_rate": 8.101539854833201e-06, "loss": 1.0537822246551514, "step": 1120 }, { "epoch": 0.6907272027702963, "grad_norm": 2.359375, "learning_rate": 8.094751242053846e-06, "loss": 1.4303661584854126, "step": 1122 }, { "epoch": 0.6919584455559831, "grad_norm": 2.796875, "learning_rate": 8.087954305512923e-06, "loss": 1.4762662649154663, "step": 1124 }, { "epoch": 0.6931896883416698, "grad_norm": 6.28125, "learning_rate": 8.08114907221891e-06, "loss": 1.4999399185180664, "step": 1126 }, { "epoch": 0.6944209311273567, "grad_norm": 2.578125, "learning_rate": 8.074335569213287e-06, "loss": 1.445244312286377, "step": 1128 }, { "epoch": 0.6956521739130435, "grad_norm": 2.9375, "learning_rate": 8.067513823570368e-06, "loss": 1.4429575204849243, "step": 1130 }, { "epoch": 0.6968834166987303, "grad_norm": 3.203125, "learning_rate": 8.060683862397236e-06, "loss": 1.4105318784713745, "step": 1132 }, { "epoch": 0.6981146594844171, "grad_norm": 2.265625, "learning_rate": 8.05384571283361e-06, "loss": 1.4015456438064575, "step": 1134 }, { "epoch": 0.6993459022701038, "grad_norm": 2.75, "learning_rate": 8.046999402051754e-06, "loss": 1.5193300247192383, "step": 1136 }, { "epoch": 0.7005771450557907, "grad_norm": 6.28125, "learning_rate": 8.040144957256357e-06, "loss": 1.298558235168457, "step": 1138 }, { "epoch": 0.7018083878414775, "grad_norm": 2.359375, "learning_rate": 8.033282405684428e-06, "loss": 1.5250219106674194, "step": 1140 }, { "epoch": 0.7030396306271643, "grad_norm": 2.234375, "learning_rate": 8.026411774605198e-06, "loss": 1.498106837272644, "step": 1142 }, { "epoch": 0.7042708734128511, "grad_norm": 2.109375, "learning_rate": 8.019533091319991e-06, "loss": 1.4430673122406006, "step": 1144 }, { "epoch": 0.7055021161985379, "grad_norm": 2.0, "learning_rate": 8.012646383162138e-06, "loss": 1.4169411659240723, "step": 1146 }, { "epoch": 0.7067333589842247, "grad_norm": 1.9453125, "learning_rate": 8.00575167749685e-06, "loss": 1.4503501653671265, "step": 1148 }, { "epoch": 0.7079646017699115, "grad_norm": 1.4765625, "learning_rate": 7.998849001721123e-06, "loss": 1.3543637990951538, "step": 1150 }, { "epoch": 0.7091958445555983, "grad_norm": 3.8125, "learning_rate": 7.991938383263617e-06, "loss": 1.388875126838684, "step": 1152 }, { "epoch": 0.7104270873412851, "grad_norm": 1.921875, "learning_rate": 7.98501984958456e-06, "loss": 1.1109329462051392, "step": 1154 }, { "epoch": 0.7116583301269719, "grad_norm": 2.25, "learning_rate": 7.978093428175632e-06, "loss": 1.2034087181091309, "step": 1156 }, { "epoch": 0.7128895729126588, "grad_norm": 1.1328125, "learning_rate": 7.971159146559848e-06, "loss": 1.061422348022461, "step": 1158 }, { "epoch": 0.7141208156983455, "grad_norm": 0.8828125, "learning_rate": 7.964217032291463e-06, "loss": 1.0195786952972412, "step": 1160 }, { "epoch": 0.7153520584840323, "grad_norm": 0.85546875, "learning_rate": 7.957267112955856e-06, "loss": 0.9217634201049805, "step": 1162 }, { "epoch": 0.7165833012697191, "grad_norm": 1.2421875, "learning_rate": 7.950309416169415e-06, "loss": 1.1007217168807983, "step": 1164 }, { "epoch": 0.7178145440554059, "grad_norm": 3.546875, "learning_rate": 7.943343969579443e-06, "loss": 1.9357012510299683, "step": 1166 }, { "epoch": 0.7190457868410928, "grad_norm": 3.25, "learning_rate": 7.936370800864026e-06, "loss": 1.784494400024414, "step": 1168 }, { "epoch": 0.7202770296267795, "grad_norm": 1.796875, "learning_rate": 7.929389937731942e-06, "loss": 1.5166871547698975, "step": 1170 }, { "epoch": 0.7215082724124663, "grad_norm": 5.28125, "learning_rate": 7.922401407922546e-06, "loss": 1.3804179430007935, "step": 1172 }, { "epoch": 0.7227395151981532, "grad_norm": 3.328125, "learning_rate": 7.915405239205647e-06, "loss": 1.4463698863983154, "step": 1174 }, { "epoch": 0.7239707579838399, "grad_norm": 2.09375, "learning_rate": 7.90840145938142e-06, "loss": 1.4229485988616943, "step": 1176 }, { "epoch": 0.7252020007695268, "grad_norm": 2.046875, "learning_rate": 7.901390096280276e-06, "loss": 1.3788976669311523, "step": 1178 }, { "epoch": 0.7264332435552135, "grad_norm": 2.828125, "learning_rate": 7.894371177762765e-06, "loss": 1.4840986728668213, "step": 1180 }, { "epoch": 0.7276644863409003, "grad_norm": 1.7734375, "learning_rate": 7.88734473171945e-06, "loss": 1.4703840017318726, "step": 1182 }, { "epoch": 0.7288957291265872, "grad_norm": 2.796875, "learning_rate": 7.880310786070818e-06, "loss": 1.4176890850067139, "step": 1184 }, { "epoch": 0.7301269719122739, "grad_norm": 4.625, "learning_rate": 7.873269368767147e-06, "loss": 0.6599590182304382, "step": 1186 }, { "epoch": 0.7313582146979608, "grad_norm": 8.0625, "learning_rate": 7.866220507788409e-06, "loss": 0.6873646378517151, "step": 1188 }, { "epoch": 0.7325894574836476, "grad_norm": 2.703125, "learning_rate": 7.859164231144152e-06, "loss": 1.4601305723190308, "step": 1190 }, { "epoch": 0.7338207002693343, "grad_norm": 2.65625, "learning_rate": 7.852100566873394e-06, "loss": 1.380963683128357, "step": 1192 }, { "epoch": 0.7350519430550212, "grad_norm": 7.5625, "learning_rate": 7.845029543044506e-06, "loss": 1.0763018131256104, "step": 1194 }, { "epoch": 0.736283185840708, "grad_norm": 2.640625, "learning_rate": 7.837951187755106e-06, "loss": 1.0719996690750122, "step": 1196 }, { "epoch": 0.7375144286263947, "grad_norm": 3.1875, "learning_rate": 7.830865529131939e-06, "loss": 1.4080944061279297, "step": 1198 }, { "epoch": 0.7387456714120816, "grad_norm": 3.859375, "learning_rate": 7.82377259533078e-06, "loss": 1.4090288877487183, "step": 1200 }, { "epoch": 0.7399769141977683, "grad_norm": 0.8984375, "learning_rate": 7.816672414536299e-06, "loss": 1.0860710144042969, "step": 1202 }, { "epoch": 0.7412081569834552, "grad_norm": 0.609375, "learning_rate": 7.80956501496198e-06, "loss": 1.0601508617401123, "step": 1204 }, { "epoch": 0.742439399769142, "grad_norm": 3.265625, "learning_rate": 7.802450424849975e-06, "loss": 1.6900361776351929, "step": 1206 }, { "epoch": 0.7436706425548287, "grad_norm": 3.484375, "learning_rate": 7.795328672471024e-06, "loss": 1.7971972227096558, "step": 1208 }, { "epoch": 0.7449018853405156, "grad_norm": 6.5, "learning_rate": 7.788199786124316e-06, "loss": 1.9263969659805298, "step": 1210 }, { "epoch": 0.7461331281262024, "grad_norm": 3.28125, "learning_rate": 7.78106379413739e-06, "loss": 1.4539012908935547, "step": 1212 }, { "epoch": 0.7473643709118892, "grad_norm": 3.359375, "learning_rate": 7.773920724866022e-06, "loss": 1.5632342100143433, "step": 1214 }, { "epoch": 0.748595613697576, "grad_norm": 2.9375, "learning_rate": 7.766770606694109e-06, "loss": 1.2990467548370361, "step": 1216 }, { "epoch": 0.7498268564832627, "grad_norm": 0.83984375, "learning_rate": 7.759613468033564e-06, "loss": 1.1596078872680664, "step": 1218 }, { "epoch": 0.7510580992689496, "grad_norm": 0.734375, "learning_rate": 7.752449337324188e-06, "loss": 1.0468875169754028, "step": 1220 }, { "epoch": 0.7522893420546364, "grad_norm": 1.671875, "learning_rate": 7.74527824303357e-06, "loss": 1.1754250526428223, "step": 1222 }, { "epoch": 0.7535205848403232, "grad_norm": 1.1875, "learning_rate": 7.738100213656962e-06, "loss": 1.1995749473571777, "step": 1224 }, { "epoch": 0.75475182762601, "grad_norm": 3.921875, "learning_rate": 7.730915277717192e-06, "loss": 1.4692225456237793, "step": 1226 }, { "epoch": 0.7559830704116968, "grad_norm": 4.75, "learning_rate": 7.723723463764515e-06, "loss": 1.7058963775634766, "step": 1228 }, { "epoch": 0.7572143131973836, "grad_norm": 2.515625, "learning_rate": 7.716524800376521e-06, "loss": 1.4868927001953125, "step": 1230 }, { "epoch": 0.7584455559830704, "grad_norm": 2.65625, "learning_rate": 7.709319316158017e-06, "loss": 1.4232137203216553, "step": 1232 }, { "epoch": 0.7596767987687573, "grad_norm": 2.828125, "learning_rate": 7.70210703974092e-06, "loss": 1.4825751781463623, "step": 1234 }, { "epoch": 0.760908041554444, "grad_norm": 2.046875, "learning_rate": 7.69488799978413e-06, "loss": 1.4381661415100098, "step": 1236 }, { "epoch": 0.7621392843401308, "grad_norm": 1.984375, "learning_rate": 7.68766222497342e-06, "loss": 1.4844237565994263, "step": 1238 }, { "epoch": 0.7633705271258177, "grad_norm": 2.359375, "learning_rate": 7.680429744021333e-06, "loss": 1.7116321325302124, "step": 1240 }, { "epoch": 0.7646017699115044, "grad_norm": 2.078125, "learning_rate": 7.673190585667056e-06, "loss": 1.1763395071029663, "step": 1242 }, { "epoch": 0.7658330126971912, "grad_norm": 2.1875, "learning_rate": 7.665944778676307e-06, "loss": 1.1598796844482422, "step": 1244 }, { "epoch": 0.767064255482878, "grad_norm": 3.46875, "learning_rate": 7.658692351841226e-06, "loss": 1.3642587661743164, "step": 1246 }, { "epoch": 0.7682954982685648, "grad_norm": 2.0, "learning_rate": 7.651433333980256e-06, "loss": 1.430611491203308, "step": 1248 }, { "epoch": 0.7695267410542517, "grad_norm": 4.21875, "learning_rate": 7.644167753938035e-06, "loss": 1.4081594944000244, "step": 1250 }, { "epoch": 0.7707579838399384, "grad_norm": 3.953125, "learning_rate": 7.636895640585271e-06, "loss": 1.8488107919692993, "step": 1252 }, { "epoch": 0.7719892266256252, "grad_norm": 5.59375, "learning_rate": 7.629617022818634e-06, "loss": 1.9246183633804321, "step": 1254 }, { "epoch": 0.7732204694113121, "grad_norm": 3.625, "learning_rate": 7.622331929560643e-06, "loss": 1.4888027906417847, "step": 1256 }, { "epoch": 0.7744517121969988, "grad_norm": 0.96484375, "learning_rate": 7.615040389759547e-06, "loss": 1.1719835996627808, "step": 1258 }, { "epoch": 0.7756829549826857, "grad_norm": 1.671875, "learning_rate": 7.607742432389207e-06, "loss": 1.0067853927612305, "step": 1260 }, { "epoch": 0.7769141977683724, "grad_norm": 4.4375, "learning_rate": 7.600438086448993e-06, "loss": 1.4551739692687988, "step": 1262 }, { "epoch": 0.7781454405540592, "grad_norm": 4.1875, "learning_rate": 7.593127380963654e-06, "loss": 1.7276794910430908, "step": 1264 }, { "epoch": 0.7793766833397461, "grad_norm": 2.9375, "learning_rate": 7.5858103449832135e-06, "loss": 1.536874771118164, "step": 1266 }, { "epoch": 0.7806079261254328, "grad_norm": 2.0625, "learning_rate": 7.5784870075828446e-06, "loss": 1.4394667148590088, "step": 1268 }, { "epoch": 0.7818391689111197, "grad_norm": 1.7578125, "learning_rate": 7.571157397862767e-06, "loss": 1.2508525848388672, "step": 1270 }, { "epoch": 0.7830704116968065, "grad_norm": 1.171875, "learning_rate": 7.563821544948123e-06, "loss": 1.0572394132614136, "step": 1272 }, { "epoch": 0.7843016544824932, "grad_norm": 3.0, "learning_rate": 7.556479477988856e-06, "loss": 1.6088207960128784, "step": 1274 }, { "epoch": 0.7855328972681801, "grad_norm": 3.328125, "learning_rate": 7.54913122615961e-06, "loss": 1.9735209941864014, "step": 1276 }, { "epoch": 0.7867641400538669, "grad_norm": 3.03125, "learning_rate": 7.5417768186596006e-06, "loss": 1.5228854417800903, "step": 1278 }, { "epoch": 0.7879953828395537, "grad_norm": 1.84375, "learning_rate": 7.534416284712504e-06, "loss": 1.4340687990188599, "step": 1280 }, { "epoch": 0.7892266256252405, "grad_norm": 1.890625, "learning_rate": 7.527049653566347e-06, "loss": 1.4551016092300415, "step": 1282 }, { "epoch": 0.7904578684109272, "grad_norm": 2.828125, "learning_rate": 7.519676954493373e-06, "loss": 1.3765220642089844, "step": 1284 }, { "epoch": 0.7916891111966141, "grad_norm": 1.2421875, "learning_rate": 7.512298216789948e-06, "loss": 1.229244589805603, "step": 1286 }, { "epoch": 0.7929203539823009, "grad_norm": 1.375, "learning_rate": 7.504913469776427e-06, "loss": 1.0655384063720703, "step": 1288 }, { "epoch": 0.7941515967679876, "grad_norm": 5.6875, "learning_rate": 7.497522742797046e-06, "loss": 1.4802930355072021, "step": 1290 }, { "epoch": 0.7953828395536745, "grad_norm": 4.40625, "learning_rate": 7.490126065219798e-06, "loss": 1.3891750574111938, "step": 1292 }, { "epoch": 0.7966140823393613, "grad_norm": 2.734375, "learning_rate": 7.482723466436333e-06, "loss": 1.4222067594528198, "step": 1294 }, { "epoch": 0.7978453251250481, "grad_norm": 2.71875, "learning_rate": 7.475314975861816e-06, "loss": 1.4236258268356323, "step": 1296 }, { "epoch": 0.7990765679107349, "grad_norm": 12.3125, "learning_rate": 7.467900622934834e-06, "loss": 1.8899813890457153, "step": 1298 }, { "epoch": 0.8003078106964217, "grad_norm": 5.1875, "learning_rate": 7.460480437117266e-06, "loss": 1.1637026071548462, "step": 1300 }, { "epoch": 0.8015390534821085, "grad_norm": 2.171875, "learning_rate": 7.453054447894168e-06, "loss": 1.5327718257904053, "step": 1302 }, { "epoch": 0.8027702962677953, "grad_norm": 2.484375, "learning_rate": 7.445622684773652e-06, "loss": 1.3591474294662476, "step": 1304 }, { "epoch": 0.8040015390534822, "grad_norm": 0.96875, "learning_rate": 7.438185177286785e-06, "loss": 1.1956864595413208, "step": 1306 }, { "epoch": 0.8052327818391689, "grad_norm": 1.390625, "learning_rate": 7.430741954987446e-06, "loss": 0.9828561544418335, "step": 1308 }, { "epoch": 0.8064640246248557, "grad_norm": 6.875, "learning_rate": 7.423293047452234e-06, "loss": 1.4108320474624634, "step": 1310 }, { "epoch": 0.8076952674105425, "grad_norm": 2.859375, "learning_rate": 7.415838484280331e-06, "loss": 1.5532580614089966, "step": 1312 }, { "epoch": 0.8089265101962293, "grad_norm": 1.0, "learning_rate": 7.408378295093399e-06, "loss": 0.9883386492729187, "step": 1314 }, { "epoch": 0.8101577529819162, "grad_norm": 0.78515625, "learning_rate": 7.4009125095354494e-06, "loss": 1.1321879625320435, "step": 1316 }, { "epoch": 0.8113889957676029, "grad_norm": 3.671875, "learning_rate": 7.393441157272738e-06, "loss": 1.622273325920105, "step": 1318 }, { "epoch": 0.8126202385532897, "grad_norm": 5.15625, "learning_rate": 7.385964267993635e-06, "loss": 1.7767966985702515, "step": 1320 }, { "epoch": 0.8138514813389766, "grad_norm": 4.65625, "learning_rate": 7.3784818714085136e-06, "loss": 1.9980825185775757, "step": 1322 }, { "epoch": 0.8150827241246633, "grad_norm": 7.5, "learning_rate": 7.370993997249634e-06, "loss": 1.5187777280807495, "step": 1324 }, { "epoch": 0.8163139669103502, "grad_norm": 2.125, "learning_rate": 7.36350067527102e-06, "loss": 1.4495576620101929, "step": 1326 }, { "epoch": 0.817545209696037, "grad_norm": 2.359375, "learning_rate": 7.3560019352483444e-06, "loss": 1.4711284637451172, "step": 1328 }, { "epoch": 0.8187764524817237, "grad_norm": 2.03125, "learning_rate": 7.3484978069788075e-06, "loss": 1.474364161491394, "step": 1330 }, { "epoch": 0.8200076952674106, "grad_norm": 2.25, "learning_rate": 7.34098832028102e-06, "loss": 1.4535422325134277, "step": 1332 }, { "epoch": 0.8212389380530973, "grad_norm": 2.484375, "learning_rate": 7.333473504994888e-06, "loss": 1.6823538541793823, "step": 1334 }, { "epoch": 0.8224701808387841, "grad_norm": 2.640625, "learning_rate": 7.3259533909814905e-06, "loss": 1.761359691619873, "step": 1336 }, { "epoch": 0.823701423624471, "grad_norm": 6.78125, "learning_rate": 7.318428008122958e-06, "loss": 2.0572004318237305, "step": 1338 }, { "epoch": 0.8249326664101577, "grad_norm": 3.875, "learning_rate": 7.310897386322362e-06, "loss": 1.606192946434021, "step": 1340 }, { "epoch": 0.8261639091958446, "grad_norm": 2.296875, "learning_rate": 7.303361555503592e-06, "loss": 1.5423405170440674, "step": 1342 }, { "epoch": 0.8273951519815314, "grad_norm": 2.859375, "learning_rate": 7.295820545611232e-06, "loss": 1.3942408561706543, "step": 1344 }, { "epoch": 0.8286263947672181, "grad_norm": 0.765625, "learning_rate": 7.288274386610446e-06, "loss": 1.0209629535675049, "step": 1346 }, { "epoch": 0.829857637552905, "grad_norm": 1.3046875, "learning_rate": 7.280723108486863e-06, "loss": 1.0453615188598633, "step": 1348 }, { "epoch": 0.8310888803385917, "grad_norm": 2.859375, "learning_rate": 7.273166741246449e-06, "loss": 1.1256355047225952, "step": 1350 }, { "epoch": 0.8323201231242786, "grad_norm": 1.203125, "learning_rate": 7.265605314915399e-06, "loss": 1.0713417530059814, "step": 1352 }, { "epoch": 0.8335513659099654, "grad_norm": 2.6875, "learning_rate": 7.258038859540002e-06, "loss": 1.2602061033248901, "step": 1354 }, { "epoch": 0.8347826086956521, "grad_norm": 2.765625, "learning_rate": 7.250467405186534e-06, "loss": 1.3749901056289673, "step": 1356 }, { "epoch": 0.836013851481339, "grad_norm": 2.859375, "learning_rate": 7.242890981941137e-06, "loss": 1.319159984588623, "step": 1358 }, { "epoch": 0.8372450942670258, "grad_norm": 2.328125, "learning_rate": 7.235309619909698e-06, "loss": 1.401831865310669, "step": 1360 }, { "epoch": 0.8384763370527126, "grad_norm": 2.765625, "learning_rate": 7.227723349217728e-06, "loss": 1.5969266891479492, "step": 1362 }, { "epoch": 0.8397075798383994, "grad_norm": 2.609375, "learning_rate": 7.220132200010237e-06, "loss": 1.5466394424438477, "step": 1364 }, { "epoch": 0.8409388226240861, "grad_norm": 3.046875, "learning_rate": 7.21253620245163e-06, "loss": 1.8190059661865234, "step": 1366 }, { "epoch": 0.842170065409773, "grad_norm": 2.65625, "learning_rate": 7.204935386725573e-06, "loss": 1.8231192827224731, "step": 1368 }, { "epoch": 0.8434013081954598, "grad_norm": 2.046875, "learning_rate": 7.197329783034879e-06, "loss": 1.52850341796875, "step": 1370 }, { "epoch": 0.8446325509811466, "grad_norm": 1.9375, "learning_rate": 7.18971942160138e-06, "loss": 1.413024663925171, "step": 1372 }, { "epoch": 0.8458637937668334, "grad_norm": 5.28125, "learning_rate": 7.182104332665827e-06, "loss": 1.7647671699523926, "step": 1374 }, { "epoch": 0.8470950365525202, "grad_norm": 3.34375, "learning_rate": 7.174484546487743e-06, "loss": 1.7847025394439697, "step": 1376 }, { "epoch": 0.848326279338207, "grad_norm": 2.515625, "learning_rate": 7.1668600933453225e-06, "loss": 1.5937633514404297, "step": 1378 }, { "epoch": 0.8495575221238938, "grad_norm": 2.109375, "learning_rate": 7.159231003535305e-06, "loss": 1.3834004402160645, "step": 1380 }, { "epoch": 0.8507887649095806, "grad_norm": 2.8125, "learning_rate": 7.151597307372853e-06, "loss": 1.671330213546753, "step": 1382 }, { "epoch": 0.8520200076952674, "grad_norm": 2.765625, "learning_rate": 7.143959035191432e-06, "loss": 1.8127377033233643, "step": 1384 }, { "epoch": 0.8532512504809542, "grad_norm": 2.765625, "learning_rate": 7.136316217342691e-06, "loss": 1.3590033054351807, "step": 1386 }, { "epoch": 0.854482493266641, "grad_norm": 7.9375, "learning_rate": 7.128668884196346e-06, "loss": 1.1065455675125122, "step": 1388 }, { "epoch": 0.8557137360523278, "grad_norm": 8.5, "learning_rate": 7.12101706614005e-06, "loss": 1.72328782081604, "step": 1390 }, { "epoch": 0.8569449788380146, "grad_norm": 9.375, "learning_rate": 7.113360793579281e-06, "loss": 1.730324387550354, "step": 1392 }, { "epoch": 0.8581762216237014, "grad_norm": 6.09375, "learning_rate": 7.105700096937211e-06, "loss": 2.003936529159546, "step": 1394 }, { "epoch": 0.8594074644093882, "grad_norm": 2.984375, "learning_rate": 7.0980350066546e-06, "loss": 1.5877940654754639, "step": 1396 }, { "epoch": 0.8606387071950751, "grad_norm": 3.0625, "learning_rate": 7.090365553189664e-06, "loss": 1.6757268905639648, "step": 1398 }, { "epoch": 0.8618699499807618, "grad_norm": 3.0, "learning_rate": 7.082691767017955e-06, "loss": 1.928758144378662, "step": 1400 }, { "epoch": 0.8631011927664486, "grad_norm": 3.453125, "learning_rate": 7.075013678632239e-06, "loss": 1.6859486103057861, "step": 1402 }, { "epoch": 0.8643324355521355, "grad_norm": 7.15625, "learning_rate": 7.067331318542388e-06, "loss": 2.008883237838745, "step": 1404 }, { "epoch": 0.8655636783378222, "grad_norm": 4.0, "learning_rate": 7.059644717275234e-06, "loss": 1.767155647277832, "step": 1406 }, { "epoch": 0.8667949211235091, "grad_norm": 3.09375, "learning_rate": 7.051953905374471e-06, "loss": 1.611624002456665, "step": 1408 }, { "epoch": 0.8680261639091958, "grad_norm": 2.828125, "learning_rate": 7.044258913400521e-06, "loss": 1.3147724866867065, "step": 1410 }, { "epoch": 0.8692574066948826, "grad_norm": 0.984375, "learning_rate": 7.036559771930422e-06, "loss": 1.1153672933578491, "step": 1412 }, { "epoch": 0.8704886494805695, "grad_norm": 2.75, "learning_rate": 7.028856511557692e-06, "loss": 1.1882882118225098, "step": 1414 }, { "epoch": 0.8717198922662562, "grad_norm": 5.0, "learning_rate": 7.02114916289222e-06, "loss": 1.5670009851455688, "step": 1416 }, { "epoch": 0.8729511350519431, "grad_norm": 3.8125, "learning_rate": 7.013437756560139e-06, "loss": 1.4105370044708252, "step": 1418 }, { "epoch": 0.8741823778376299, "grad_norm": 2.703125, "learning_rate": 7.005722323203712e-06, "loss": 1.5349271297454834, "step": 1420 }, { "epoch": 0.8754136206233166, "grad_norm": 13.0, "learning_rate": 6.998002893481193e-06, "loss": 1.1651748418807983, "step": 1422 }, { "epoch": 0.8766448634090035, "grad_norm": 0.9609375, "learning_rate": 6.990279498066726e-06, "loss": 1.072435975074768, "step": 1424 }, { "epoch": 0.8778761061946903, "grad_norm": 1.921875, "learning_rate": 6.9825521676502076e-06, "loss": 1.2697044610977173, "step": 1426 }, { "epoch": 0.8791073489803771, "grad_norm": 5.0, "learning_rate": 6.97482093293717e-06, "loss": 1.482898473739624, "step": 1428 }, { "epoch": 0.8803385917660639, "grad_norm": 2.265625, "learning_rate": 6.967085824648663e-06, "loss": 1.3979582786560059, "step": 1430 }, { "epoch": 0.8815698345517506, "grad_norm": 2.34375, "learning_rate": 6.959346873521129e-06, "loss": 1.4735959768295288, "step": 1432 }, { "epoch": 0.8828010773374375, "grad_norm": 0.82421875, "learning_rate": 6.951604110306278e-06, "loss": 1.3152323961257935, "step": 1434 }, { "epoch": 0.8840323201231243, "grad_norm": 1.6328125, "learning_rate": 6.943857565770966e-06, "loss": 1.1805187463760376, "step": 1436 }, { "epoch": 0.885263562908811, "grad_norm": 11.125, "learning_rate": 6.936107270697079e-06, "loss": 1.6937435865402222, "step": 1438 }, { "epoch": 0.8864948056944979, "grad_norm": 3.0, "learning_rate": 6.928353255881406e-06, "loss": 1.6152817010879517, "step": 1440 }, { "epoch": 0.8877260484801847, "grad_norm": 1.7265625, "learning_rate": 6.920595552135509e-06, "loss": 1.4991744756698608, "step": 1442 }, { "epoch": 0.8889572912658715, "grad_norm": 2.015625, "learning_rate": 6.912834190285621e-06, "loss": 1.431369423866272, "step": 1444 }, { "epoch": 0.8901885340515583, "grad_norm": 1.0703125, "learning_rate": 6.905069201172501e-06, "loss": 1.1146457195281982, "step": 1446 }, { "epoch": 0.891419776837245, "grad_norm": 0.94140625, "learning_rate": 6.897300615651328e-06, "loss": 1.0204036235809326, "step": 1448 }, { "epoch": 0.8926510196229319, "grad_norm": 2.25, "learning_rate": 6.889528464591566e-06, "loss": 1.4429939985275269, "step": 1450 }, { "epoch": 0.8938822624086187, "grad_norm": 1.7578125, "learning_rate": 6.881752778876849e-06, "loss": 1.6094764471054077, "step": 1452 }, { "epoch": 0.8951135051943055, "grad_norm": 3.8125, "learning_rate": 6.873973589404861e-06, "loss": 1.1291427612304688, "step": 1454 }, { "epoch": 0.8963447479799923, "grad_norm": 1.7109375, "learning_rate": 6.8661909270872014e-06, "loss": 1.1262400150299072, "step": 1456 }, { "epoch": 0.8975759907656791, "grad_norm": 0.7109375, "learning_rate": 6.858404822849272e-06, "loss": 1.190704107284546, "step": 1458 }, { "epoch": 0.8988072335513659, "grad_norm": 1.1015625, "learning_rate": 6.850615307630158e-06, "loss": 1.0832605361938477, "step": 1460 }, { "epoch": 0.9000384763370527, "grad_norm": 3.96875, "learning_rate": 6.842822412382486e-06, "loss": 1.4920941591262817, "step": 1462 }, { "epoch": 0.9012697191227396, "grad_norm": 6.15625, "learning_rate": 6.8350261680723254e-06, "loss": 1.636549711227417, "step": 1464 }, { "epoch": 0.9025009619084263, "grad_norm": 1.8359375, "learning_rate": 6.827226605679045e-06, "loss": 1.2701830863952637, "step": 1466 }, { "epoch": 0.9037322046941131, "grad_norm": 1.4296875, "learning_rate": 6.819423756195205e-06, "loss": 1.0739316940307617, "step": 1468 }, { "epoch": 0.9049634474798, "grad_norm": 2.515625, "learning_rate": 6.811617650626423e-06, "loss": 1.635694980621338, "step": 1470 }, { "epoch": 0.9061946902654867, "grad_norm": 4.71875, "learning_rate": 6.8038083199912574e-06, "loss": 1.6989467144012451, "step": 1472 }, { "epoch": 0.9074259330511736, "grad_norm": 6.3125, "learning_rate": 6.795995795321079e-06, "loss": 1.4359147548675537, "step": 1474 }, { "epoch": 0.9086571758368603, "grad_norm": 2.3125, "learning_rate": 6.788180107659954e-06, "loss": 1.4350411891937256, "step": 1476 }, { "epoch": 0.9098884186225471, "grad_norm": 2.1875, "learning_rate": 6.780361288064514e-06, "loss": 1.3674818277359009, "step": 1478 }, { "epoch": 0.911119661408234, "grad_norm": 2.734375, "learning_rate": 6.772539367603839e-06, "loss": 1.445178508758545, "step": 1480 }, { "epoch": 0.9123509041939207, "grad_norm": 3.390625, "learning_rate": 6.764714377359327e-06, "loss": 1.8486201763153076, "step": 1482 }, { "epoch": 0.9135821469796075, "grad_norm": 2.46875, "learning_rate": 6.756886348424575e-06, "loss": 1.7258269786834717, "step": 1484 }, { "epoch": 0.9148133897652944, "grad_norm": 2.078125, "learning_rate": 6.749055311905259e-06, "loss": 1.5248911380767822, "step": 1486 }, { "epoch": 0.9160446325509811, "grad_norm": 2.671875, "learning_rate": 6.741221298919002e-06, "loss": 1.4051947593688965, "step": 1488 }, { "epoch": 0.917275875336668, "grad_norm": 2.703125, "learning_rate": 6.7333843405952525e-06, "loss": 1.2086796760559082, "step": 1490 }, { "epoch": 0.9185071181223547, "grad_norm": 2.140625, "learning_rate": 6.7255444680751684e-06, "loss": 1.1943715810775757, "step": 1492 }, { "epoch": 0.9197383609080415, "grad_norm": 2.328125, "learning_rate": 6.717701712511482e-06, "loss": 1.3514329195022583, "step": 1494 }, { "epoch": 0.9209696036937284, "grad_norm": 2.609375, "learning_rate": 6.7098561050683854e-06, "loss": 1.5922294855117798, "step": 1496 }, { "epoch": 0.9222008464794151, "grad_norm": 5.0625, "learning_rate": 6.7020076769214014e-06, "loss": 1.4178842306137085, "step": 1498 }, { "epoch": 0.923432089265102, "grad_norm": 2.3125, "learning_rate": 6.694156459257259e-06, "loss": 1.4526511430740356, "step": 1500 }, { "epoch": 0.9246633320507888, "grad_norm": 1.90625, "learning_rate": 6.686302483273781e-06, "loss": 1.2742807865142822, "step": 1502 }, { "epoch": 0.9258945748364755, "grad_norm": 1.2890625, "learning_rate": 6.678445780179738e-06, "loss": 1.2045783996582031, "step": 1504 }, { "epoch": 0.9271258176221624, "grad_norm": 6.21875, "learning_rate": 6.670586381194744e-06, "loss": 0.8458617329597473, "step": 1506 }, { "epoch": 0.9283570604078492, "grad_norm": 4.0, "learning_rate": 6.662724317549125e-06, "loss": 0.547357439994812, "step": 1508 }, { "epoch": 0.929588303193536, "grad_norm": 1.0234375, "learning_rate": 6.654859620483798e-06, "loss": 1.0537328720092773, "step": 1510 }, { "epoch": 0.9308195459792228, "grad_norm": 1.1953125, "learning_rate": 6.646992321250136e-06, "loss": 1.317865014076233, "step": 1512 }, { "epoch": 0.9320507887649095, "grad_norm": 1.984375, "learning_rate": 6.639122451109861e-06, "loss": 1.3471020460128784, "step": 1514 }, { "epoch": 0.9332820315505964, "grad_norm": 2.796875, "learning_rate": 6.631250041334905e-06, "loss": 1.4489096403121948, "step": 1516 }, { "epoch": 0.9345132743362832, "grad_norm": 1.1328125, "learning_rate": 6.623375123207295e-06, "loss": 1.2243424654006958, "step": 1518 }, { "epoch": 0.93574451712197, "grad_norm": 1.578125, "learning_rate": 6.6154977280190225e-06, "loss": 1.1166824102401733, "step": 1520 }, { "epoch": 0.9369757599076568, "grad_norm": 5.21875, "learning_rate": 6.607617887071924e-06, "loss": 1.246292233467102, "step": 1522 }, { "epoch": 0.9382070026933436, "grad_norm": 1.421875, "learning_rate": 6.599735631677555e-06, "loss": 1.030418038368225, "step": 1524 }, { "epoch": 0.9394382454790304, "grad_norm": 2.15625, "learning_rate": 6.591850993157063e-06, "loss": 1.056132197380066, "step": 1526 }, { "epoch": 0.9406694882647172, "grad_norm": 0.81640625, "learning_rate": 6.5839640028410635e-06, "loss": 1.0492579936981201, "step": 1528 }, { "epoch": 0.941900731050404, "grad_norm": 1.4453125, "learning_rate": 6.5760746920695225e-06, "loss": 1.1138197183609009, "step": 1530 }, { "epoch": 0.9431319738360908, "grad_norm": 1.1015625, "learning_rate": 6.568183092191624e-06, "loss": 1.1705691814422607, "step": 1532 }, { "epoch": 0.9443632166217776, "grad_norm": 2.109375, "learning_rate": 6.560289234565649e-06, "loss": 1.4367778301239014, "step": 1534 }, { "epoch": 0.9455944594074644, "grad_norm": 2.078125, "learning_rate": 6.552393150558847e-06, "loss": 1.3702847957611084, "step": 1536 }, { "epoch": 0.9468257021931512, "grad_norm": 2.203125, "learning_rate": 6.544494871547316e-06, "loss": 1.416054606437683, "step": 1538 }, { "epoch": 0.948056944978838, "grad_norm": 2.609375, "learning_rate": 6.536594428915875e-06, "loss": 1.4559407234191895, "step": 1540 }, { "epoch": 0.9492881877645248, "grad_norm": 3.375, "learning_rate": 6.528691854057945e-06, "loss": 1.782531976699829, "step": 1542 }, { "epoch": 0.9505194305502116, "grad_norm": 7.71875, "learning_rate": 6.520787178375415e-06, "loss": 1.6748610734939575, "step": 1544 }, { "epoch": 0.9517506733358985, "grad_norm": 3.671875, "learning_rate": 6.5128804332785235e-06, "loss": 1.773606777191162, "step": 1546 }, { "epoch": 0.9529819161215852, "grad_norm": 21.5, "learning_rate": 6.504971650185732e-06, "loss": 1.8693180084228516, "step": 1548 }, { "epoch": 0.954213158907272, "grad_norm": 2.40625, "learning_rate": 6.497060860523598e-06, "loss": 1.5108202695846558, "step": 1550 }, { "epoch": 0.9554444016929589, "grad_norm": 3.1875, "learning_rate": 6.4891480957266585e-06, "loss": 1.4236609935760498, "step": 1552 }, { "epoch": 0.9566756444786456, "grad_norm": 4.4375, "learning_rate": 6.481233387237292e-06, "loss": 1.7978931665420532, "step": 1554 }, { "epoch": 0.9579068872643325, "grad_norm": 2.84375, "learning_rate": 6.473316766505604e-06, "loss": 1.8097015619277954, "step": 1556 }, { "epoch": 0.9591381300500192, "grad_norm": 2.0625, "learning_rate": 6.465398264989298e-06, "loss": 1.521843433380127, "step": 1558 }, { "epoch": 0.960369372835706, "grad_norm": 3.515625, "learning_rate": 6.4574779141535515e-06, "loss": 1.451346516609192, "step": 1560 }, { "epoch": 0.9616006156213929, "grad_norm": 2.84375, "learning_rate": 6.44955574547089e-06, "loss": 1.4029566049575806, "step": 1562 }, { "epoch": 0.9628318584070796, "grad_norm": 5.1875, "learning_rate": 6.44163179042106e-06, "loss": 1.5579237937927246, "step": 1564 }, { "epoch": 0.9640631011927665, "grad_norm": 0.94140625, "learning_rate": 6.433706080490913e-06, "loss": 1.1235785484313965, "step": 1566 }, { "epoch": 0.9652943439784533, "grad_norm": 0.796875, "learning_rate": 6.425778647174267e-06, "loss": 1.038987398147583, "step": 1568 }, { "epoch": 0.96652558676414, "grad_norm": 2.984375, "learning_rate": 6.417849521971793e-06, "loss": 1.796567678451538, "step": 1570 }, { "epoch": 0.9677568295498269, "grad_norm": 8.125, "learning_rate": 6.409918736390879e-06, "loss": 1.7613965272903442, "step": 1572 }, { "epoch": 0.9689880723355137, "grad_norm": 3.546875, "learning_rate": 6.401986321945518e-06, "loss": 1.4940305948257446, "step": 1574 }, { "epoch": 0.9702193151212004, "grad_norm": 3.453125, "learning_rate": 6.3940523101561695e-06, "loss": 1.6017310619354248, "step": 1576 }, { "epoch": 0.9714505579068873, "grad_norm": 2.515625, "learning_rate": 6.386116732549641e-06, "loss": 1.7846119403839111, "step": 1578 }, { "epoch": 0.972681800692574, "grad_norm": 3.25, "learning_rate": 6.378179620658967e-06, "loss": 1.8666661977767944, "step": 1580 }, { "epoch": 0.9739130434782609, "grad_norm": 2.796875, "learning_rate": 6.370241006023274e-06, "loss": 1.5027897357940674, "step": 1582 }, { "epoch": 0.9751442862639477, "grad_norm": 2.375, "learning_rate": 6.362300920187663e-06, "loss": 1.422347068786621, "step": 1584 }, { "epoch": 0.9763755290496344, "grad_norm": 1.1328125, "learning_rate": 6.354359394703076e-06, "loss": 1.1453440189361572, "step": 1586 }, { "epoch": 0.9776067718353213, "grad_norm": 0.85546875, "learning_rate": 6.346416461126177e-06, "loss": 0.971987247467041, "step": 1588 }, { "epoch": 0.9788380146210081, "grad_norm": 4.125, "learning_rate": 6.3384721510192326e-06, "loss": 1.5504741668701172, "step": 1590 }, { "epoch": 0.9800692574066949, "grad_norm": 3.296875, "learning_rate": 6.330526495949969e-06, "loss": 1.597891092300415, "step": 1592 }, { "epoch": 0.9813005001923817, "grad_norm": 3.640625, "learning_rate": 6.322579527491465e-06, "loss": 1.8136931657791138, "step": 1594 }, { "epoch": 0.9825317429780684, "grad_norm": 2.953125, "learning_rate": 6.314631277222012e-06, "loss": 1.7769173383712769, "step": 1596 }, { "epoch": 0.9837629857637553, "grad_norm": 2.046875, "learning_rate": 6.306681776724997e-06, "loss": 1.513192892074585, "step": 1598 }, { "epoch": 0.9849942285494421, "grad_norm": 2.546875, "learning_rate": 6.2987310575887775e-06, "loss": 1.431948184967041, "step": 1600 }, { "epoch": 0.986225471335129, "grad_norm": 4.0625, "learning_rate": 6.29077915140655e-06, "loss": 1.4942731857299805, "step": 1602 }, { "epoch": 0.9874567141208157, "grad_norm": 2.671875, "learning_rate": 6.282826089776231e-06, "loss": 1.4677563905715942, "step": 1604 }, { "epoch": 0.9886879569065025, "grad_norm": 1.3046875, "learning_rate": 6.2748719043003236e-06, "loss": 1.2760637998580933, "step": 1606 }, { "epoch": 0.9899191996921893, "grad_norm": 1.1640625, "learning_rate": 6.266916626585802e-06, "loss": 1.0633869171142578, "step": 1608 }, { "epoch": 0.9911504424778761, "grad_norm": 3.515625, "learning_rate": 6.258960288243977e-06, "loss": 1.0499076843261719, "step": 1610 }, { "epoch": 0.992381685263563, "grad_norm": 4.9375, "learning_rate": 6.251002920890377e-06, "loss": 1.0250937938690186, "step": 1612 }, { "epoch": 0.9936129280492497, "grad_norm": 5.09375, "learning_rate": 6.243044556144614e-06, "loss": 1.390341877937317, "step": 1614 }, { "epoch": 0.9948441708349365, "grad_norm": 3.296875, "learning_rate": 6.2350852256302695e-06, "loss": 1.8683525323867798, "step": 1616 }, { "epoch": 0.9960754136206234, "grad_norm": 1.5546875, "learning_rate": 6.227124960974758e-06, "loss": 1.2446808815002441, "step": 1618 }, { "epoch": 0.9973066564063101, "grad_norm": 3.078125, "learning_rate": 6.219163793809209e-06, "loss": 1.4699010848999023, "step": 1620 }, { "epoch": 0.9985378991919969, "grad_norm": 1.671875, "learning_rate": 6.2112017557683364e-06, "loss": 1.0956699848175049, "step": 1622 }, { "epoch": 0.9997691419776837, "grad_norm": 0.890625, "learning_rate": 6.203238878490316e-06, "loss": 1.0894286632537842, "step": 1624 }, { "epoch": 1.0006156213928434, "grad_norm": 0.46875, "learning_rate": 6.195275193616654e-06, "loss": 1.1105027198791504, "step": 1626 }, { "epoch": 1.0018468641785303, "grad_norm": 0.609375, "learning_rate": 6.187310732792075e-06, "loss": 1.3475432395935059, "step": 1628 }, { "epoch": 1.003078106964217, "grad_norm": 2.0625, "learning_rate": 6.179345527664378e-06, "loss": 1.3273013830184937, "step": 1630 }, { "epoch": 1.0043093497499038, "grad_norm": 1.6953125, "learning_rate": 6.171379609884323e-06, "loss": 1.4933701753616333, "step": 1632 }, { "epoch": 1.0055405925355907, "grad_norm": 0.9296875, "learning_rate": 6.163413011105499e-06, "loss": 1.3138911724090576, "step": 1634 }, { "epoch": 1.0067718353212773, "grad_norm": 0.9921875, "learning_rate": 6.155445762984209e-06, "loss": 0.9952437281608582, "step": 1636 }, { "epoch": 1.0080030781069642, "grad_norm": 2.53125, "learning_rate": 6.147477897179328e-06, "loss": 1.2402042150497437, "step": 1638 }, { "epoch": 1.009234320892651, "grad_norm": 3.65625, "learning_rate": 6.1395094453521875e-06, "loss": 1.377193570137024, "step": 1640 }, { "epoch": 1.0104655636783377, "grad_norm": 5.03125, "learning_rate": 6.13154043916645e-06, "loss": 1.6374868154525757, "step": 1642 }, { "epoch": 1.0116968064640246, "grad_norm": 2.015625, "learning_rate": 6.123570910287979e-06, "loss": 1.6595889329910278, "step": 1644 }, { "epoch": 1.0129280492497115, "grad_norm": 3.140625, "learning_rate": 6.1156008903847164e-06, "loss": 1.840135097503662, "step": 1646 }, { "epoch": 1.0141592920353983, "grad_norm": 3.515625, "learning_rate": 6.10763041112655e-06, "loss": 1.8915746212005615, "step": 1648 }, { "epoch": 1.015390534821085, "grad_norm": 1.015625, "learning_rate": 6.0996595041852e-06, "loss": 1.5289441347122192, "step": 1650 }, { "epoch": 1.0166217776067719, "grad_norm": 1.203125, "learning_rate": 6.091688201234085e-06, "loss": 1.2102296352386475, "step": 1652 }, { "epoch": 1.0178530203924587, "grad_norm": 0.8125, "learning_rate": 6.083716533948192e-06, "loss": 1.161301612854004, "step": 1654 }, { "epoch": 1.0190842631781454, "grad_norm": 0.8515625, "learning_rate": 6.07574453400396e-06, "loss": 1.096801519393921, "step": 1656 }, { "epoch": 1.0203155059638322, "grad_norm": 0.74609375, "learning_rate": 6.06777223307915e-06, "loss": 1.015903115272522, "step": 1658 }, { "epoch": 1.0215467487495191, "grad_norm": 0.80078125, "learning_rate": 6.0597996628527155e-06, "loss": 1.0650248527526855, "step": 1660 }, { "epoch": 1.0227779915352058, "grad_norm": 5.40625, "learning_rate": 6.051826855004683e-06, "loss": 1.2022972106933594, "step": 1662 }, { "epoch": 1.0240092343208926, "grad_norm": 4.46875, "learning_rate": 6.043853841216025e-06, "loss": 1.3405088186264038, "step": 1664 }, { "epoch": 1.0252404771065795, "grad_norm": 1.9921875, "learning_rate": 6.035880653168529e-06, "loss": 1.6969496011734009, "step": 1666 }, { "epoch": 1.0264717198922662, "grad_norm": 2.265625, "learning_rate": 6.027907322544675e-06, "loss": 1.3863646984100342, "step": 1668 }, { "epoch": 1.027702962677953, "grad_norm": 4.28125, "learning_rate": 6.019933881027508e-06, "loss": 1.4729995727539062, "step": 1670 }, { "epoch": 1.02893420546364, "grad_norm": 3.03125, "learning_rate": 6.0119603603005235e-06, "loss": 1.8226011991500854, "step": 1672 }, { "epoch": 1.0301654482493268, "grad_norm": 1.7578125, "learning_rate": 6.0039867920475185e-06, "loss": 1.4725655317306519, "step": 1674 }, { "epoch": 1.0313966910350134, "grad_norm": 3.921875, "learning_rate": 5.996013207952484e-06, "loss": 1.3765413761138916, "step": 1676 }, { "epoch": 1.0326279338207003, "grad_norm": 4.65625, "learning_rate": 5.9880396396994785e-06, "loss": 1.5706751346588135, "step": 1678 }, { "epoch": 1.0338591766063872, "grad_norm": 6.65625, "learning_rate": 5.980066118972492e-06, "loss": 1.954294204711914, "step": 1680 }, { "epoch": 1.0350904193920738, "grad_norm": 0.734375, "learning_rate": 5.972092677455326e-06, "loss": 1.3637402057647705, "step": 1682 }, { "epoch": 1.0363216621777607, "grad_norm": 4.75, "learning_rate": 5.964119346831474e-06, "loss": 1.143527626991272, "step": 1684 }, { "epoch": 1.0375529049634475, "grad_norm": 1.96875, "learning_rate": 5.956146158783977e-06, "loss": 1.2583506107330322, "step": 1686 }, { "epoch": 1.0387841477491342, "grad_norm": 2.0625, "learning_rate": 5.948173144995318e-06, "loss": 1.4587152004241943, "step": 1688 }, { "epoch": 1.040015390534821, "grad_norm": 2.625, "learning_rate": 5.940200337147286e-06, "loss": 1.4821643829345703, "step": 1690 }, { "epoch": 1.041246633320508, "grad_norm": 2.375, "learning_rate": 5.9322277669208526e-06, "loss": 1.3718185424804688, "step": 1692 }, { "epoch": 1.0424778761061946, "grad_norm": 1.4453125, "learning_rate": 5.92425546599604e-06, "loss": 1.3705888986587524, "step": 1694 }, { "epoch": 1.0437091188918814, "grad_norm": 1.609375, "learning_rate": 5.916283466051808e-06, "loss": 1.1750788688659668, "step": 1696 }, { "epoch": 1.0449403616775683, "grad_norm": 4.0625, "learning_rate": 5.908311798765918e-06, "loss": 1.7212889194488525, "step": 1698 }, { "epoch": 1.0461716044632552, "grad_norm": 4.59375, "learning_rate": 5.900340495814802e-06, "loss": 1.6617670059204102, "step": 1700 }, { "epoch": 1.0474028472489418, "grad_norm": 13.125, "learning_rate": 5.892369588873452e-06, "loss": 1.8269574642181396, "step": 1702 }, { "epoch": 1.0486340900346287, "grad_norm": 4.03125, "learning_rate": 5.884399109615286e-06, "loss": 1.8086248636245728, "step": 1704 }, { "epoch": 1.0498653328203156, "grad_norm": 5.125, "learning_rate": 5.876429089712021e-06, "loss": 1.7379831075668335, "step": 1706 }, { "epoch": 1.0510965756060022, "grad_norm": 3.828125, "learning_rate": 5.86845956083355e-06, "loss": 1.6424190998077393, "step": 1708 }, { "epoch": 1.052327818391689, "grad_norm": 2.21875, "learning_rate": 5.860490554647813e-06, "loss": 1.4312442541122437, "step": 1710 }, { "epoch": 1.053559061177376, "grad_norm": 2.328125, "learning_rate": 5.8525221028206735e-06, "loss": 1.5711675882339478, "step": 1712 }, { "epoch": 1.0547903039630626, "grad_norm": 8.1875, "learning_rate": 5.844554237015793e-06, "loss": 1.5525487661361694, "step": 1714 }, { "epoch": 1.0560215467487495, "grad_norm": 3.875, "learning_rate": 5.8365869888945015e-06, "loss": 1.7739177942276, "step": 1716 }, { "epoch": 1.0572527895344364, "grad_norm": 1.5625, "learning_rate": 5.82862039011568e-06, "loss": 1.198196530342102, "step": 1718 }, { "epoch": 1.0584840323201232, "grad_norm": 1.703125, "learning_rate": 5.820654472335624e-06, "loss": 1.0741544961929321, "step": 1720 }, { "epoch": 1.0597152751058099, "grad_norm": 2.203125, "learning_rate": 5.812689267207925e-06, "loss": 1.2043887376785278, "step": 1722 }, { "epoch": 1.0609465178914967, "grad_norm": 2.046875, "learning_rate": 5.804724806383346e-06, "loss": 1.3711163997650146, "step": 1724 }, { "epoch": 1.0621777606771836, "grad_norm": 4.75, "learning_rate": 5.796761121509686e-06, "loss": 1.7536944150924683, "step": 1726 }, { "epoch": 1.0634090034628703, "grad_norm": 4.5625, "learning_rate": 5.7887982442316656e-06, "loss": 1.9879385232925415, "step": 1728 }, { "epoch": 1.0646402462485571, "grad_norm": 2.84375, "learning_rate": 5.780836206190793e-06, "loss": 1.881171464920044, "step": 1730 }, { "epoch": 1.065871489034244, "grad_norm": 6.09375, "learning_rate": 5.772875039025244e-06, "loss": 1.7399423122406006, "step": 1732 }, { "epoch": 1.0671027318199306, "grad_norm": 5.0625, "learning_rate": 5.764914774369732e-06, "loss": 1.1776556968688965, "step": 1734 }, { "epoch": 1.0683339746056175, "grad_norm": 2.359375, "learning_rate": 5.756955443855388e-06, "loss": 0.3018825352191925, "step": 1736 }, { "epoch": 1.0695652173913044, "grad_norm": 0.92578125, "learning_rate": 5.748997079109625e-06, "loss": 0.6248791217803955, "step": 1738 }, { "epoch": 1.0707964601769913, "grad_norm": 4.90625, "learning_rate": 5.741039711756025e-06, "loss": 1.0930382013320923, "step": 1740 }, { "epoch": 1.072027702962678, "grad_norm": 2.484375, "learning_rate": 5.733083373414201e-06, "loss": 1.4368818998336792, "step": 1742 }, { "epoch": 1.0732589457483648, "grad_norm": 6.125, "learning_rate": 5.725128095699678e-06, "loss": 1.4669115543365479, "step": 1744 }, { "epoch": 1.0744901885340516, "grad_norm": 3.609375, "learning_rate": 5.717173910223772e-06, "loss": 1.3312410116195679, "step": 1746 }, { "epoch": 1.0757214313197383, "grad_norm": 1.78125, "learning_rate": 5.709220848593452e-06, "loss": 1.3409535884857178, "step": 1748 }, { "epoch": 1.0769526741054252, "grad_norm": 3.125, "learning_rate": 5.7012689424112245e-06, "loss": 1.3486981391906738, "step": 1750 }, { "epoch": 1.078183916891112, "grad_norm": 2.90625, "learning_rate": 5.6933182232750036e-06, "loss": 1.460854172706604, "step": 1752 }, { "epoch": 1.0794151596767987, "grad_norm": 1.8125, "learning_rate": 5.685368722777991e-06, "loss": 1.3764128684997559, "step": 1754 }, { "epoch": 1.0806464024624856, "grad_norm": 1.9453125, "learning_rate": 5.677420472508537e-06, "loss": 1.4338797330856323, "step": 1756 }, { "epoch": 1.0818776452481724, "grad_norm": 3.671875, "learning_rate": 5.669473504050033e-06, "loss": 1.5261905193328857, "step": 1758 }, { "epoch": 1.083108888033859, "grad_norm": 2.71875, "learning_rate": 5.6615278489807694e-06, "loss": 1.8875455856323242, "step": 1760 }, { "epoch": 1.084340130819546, "grad_norm": 3.9375, "learning_rate": 5.6535835388738235e-06, "loss": 1.7816144227981567, "step": 1762 }, { "epoch": 1.0855713736052328, "grad_norm": 3.5, "learning_rate": 5.645640605296927e-06, "loss": 1.541454792022705, "step": 1764 }, { "epoch": 1.0868026163909197, "grad_norm": 2.0625, "learning_rate": 5.6376990798123385e-06, "loss": 1.4051401615142822, "step": 1766 }, { "epoch": 1.0880338591766063, "grad_norm": 2.359375, "learning_rate": 5.629758993976727e-06, "loss": 1.4245781898498535, "step": 1768 }, { "epoch": 1.0892651019622932, "grad_norm": 1.7265625, "learning_rate": 5.6218203793410346e-06, "loss": 1.4115514755249023, "step": 1770 }, { "epoch": 1.09049634474798, "grad_norm": 2.96875, "learning_rate": 5.61388326745036e-06, "loss": 1.4192876815795898, "step": 1772 }, { "epoch": 1.0917275875336667, "grad_norm": 2.125, "learning_rate": 5.605947689843833e-06, "loss": 1.346639633178711, "step": 1774 }, { "epoch": 1.0929588303193536, "grad_norm": 1.90625, "learning_rate": 5.598013678054484e-06, "loss": 1.359352707862854, "step": 1776 }, { "epoch": 1.0941900731050405, "grad_norm": 3.453125, "learning_rate": 5.590081263609122e-06, "loss": 1.602827548980713, "step": 1778 }, { "epoch": 1.095421315890727, "grad_norm": 3.421875, "learning_rate": 5.5821504780282086e-06, "loss": 1.5239148139953613, "step": 1780 }, { "epoch": 1.096652558676414, "grad_norm": 3.28125, "learning_rate": 5.574221352825735e-06, "loss": 1.657865047454834, "step": 1782 }, { "epoch": 1.0978838014621009, "grad_norm": 3.734375, "learning_rate": 5.566293919509089e-06, "loss": 1.5900053977966309, "step": 1784 }, { "epoch": 1.0991150442477875, "grad_norm": 1.328125, "learning_rate": 5.558368209578941e-06, "loss": 1.2565845251083374, "step": 1786 }, { "epoch": 1.1003462870334744, "grad_norm": 1.015625, "learning_rate": 5.550444254529113e-06, "loss": 1.064510464668274, "step": 1788 }, { "epoch": 1.1015775298191612, "grad_norm": 6.09375, "learning_rate": 5.542522085846451e-06, "loss": 1.0777498483657837, "step": 1790 }, { "epoch": 1.102808772604848, "grad_norm": 1.515625, "learning_rate": 5.534601735010703e-06, "loss": 1.164319396018982, "step": 1792 }, { "epoch": 1.1040400153905348, "grad_norm": 2.796875, "learning_rate": 5.5266832334943975e-06, "loss": 1.3410156965255737, "step": 1794 }, { "epoch": 1.1052712581762216, "grad_norm": 3.890625, "learning_rate": 5.518766612762712e-06, "loss": 1.8330609798431396, "step": 1796 }, { "epoch": 1.1065025009619085, "grad_norm": 2.234375, "learning_rate": 5.510851904273344e-06, "loss": 1.292941689491272, "step": 1798 }, { "epoch": 1.1077337437475951, "grad_norm": 2.171875, "learning_rate": 5.502939139476403e-06, "loss": 1.4467616081237793, "step": 1800 }, { "epoch": 1.108964986533282, "grad_norm": 1.5, "learning_rate": 5.495028349814271e-06, "loss": 1.3929224014282227, "step": 1802 }, { "epoch": 1.1101962293189689, "grad_norm": 1.3828125, "learning_rate": 5.487119566721477e-06, "loss": 1.1502635478973389, "step": 1804 }, { "epoch": 1.1114274721046555, "grad_norm": 6.03125, "learning_rate": 5.479212821624587e-06, "loss": 1.2217051982879639, "step": 1806 }, { "epoch": 1.1126587148903424, "grad_norm": 4.09375, "learning_rate": 5.4713081459420555e-06, "loss": 1.8544467687606812, "step": 1808 }, { "epoch": 1.1138899576760293, "grad_norm": 3.375, "learning_rate": 5.463405571084127e-06, "loss": 1.2114042043685913, "step": 1810 }, { "epoch": 1.1151212004617161, "grad_norm": 2.4375, "learning_rate": 5.455505128452687e-06, "loss": 1.358185052871704, "step": 1812 }, { "epoch": 1.1163524432474028, "grad_norm": 2.25, "learning_rate": 5.447606849441156e-06, "loss": 1.330447793006897, "step": 1814 }, { "epoch": 1.1175836860330897, "grad_norm": 4.65625, "learning_rate": 5.4397107654343515e-06, "loss": 1.4095932245254517, "step": 1816 }, { "epoch": 1.1188149288187765, "grad_norm": 1.0703125, "learning_rate": 5.431816907808376e-06, "loss": 1.2272474765777588, "step": 1818 }, { "epoch": 1.1200461716044632, "grad_norm": 0.8359375, "learning_rate": 5.423925307930478e-06, "loss": 1.1478188037872314, "step": 1820 }, { "epoch": 1.12127741439015, "grad_norm": 1.2578125, "learning_rate": 5.416035997158937e-06, "loss": 1.2050681114196777, "step": 1822 }, { "epoch": 1.122508657175837, "grad_norm": 0.98046875, "learning_rate": 5.408149006842941e-06, "loss": 1.1752591133117676, "step": 1824 }, { "epoch": 1.1237398999615236, "grad_norm": 2.671875, "learning_rate": 5.400264368322448e-06, "loss": 1.2104952335357666, "step": 1826 }, { "epoch": 1.1249711427472104, "grad_norm": 0.828125, "learning_rate": 5.392382112928078e-06, "loss": 1.0541216135025024, "step": 1828 }, { "epoch": 1.1262023855328973, "grad_norm": 4.0, "learning_rate": 5.38450227198098e-06, "loss": 1.2567408084869385, "step": 1830 }, { "epoch": 1.1274336283185842, "grad_norm": 3.546875, "learning_rate": 5.376624876792706e-06, "loss": 1.3701972961425781, "step": 1832 }, { "epoch": 1.1286648711042708, "grad_norm": 4.125, "learning_rate": 5.368749958665096e-06, "loss": 1.1659176349639893, "step": 1834 }, { "epoch": 1.1298961138899577, "grad_norm": 1.703125, "learning_rate": 5.36087754889014e-06, "loss": 1.0371299982070923, "step": 1836 }, { "epoch": 1.1311273566756446, "grad_norm": 3.265625, "learning_rate": 5.353007678749867e-06, "loss": 1.2042335271835327, "step": 1838 }, { "epoch": 1.1323585994613312, "grad_norm": 2.1875, "learning_rate": 5.345140379516205e-06, "loss": 1.3575973510742188, "step": 1840 }, { "epoch": 1.133589842247018, "grad_norm": 4.46875, "learning_rate": 5.337275682450875e-06, "loss": 1.4255024194717407, "step": 1842 }, { "epoch": 1.134821085032705, "grad_norm": 2.328125, "learning_rate": 5.329413618805257e-06, "loss": 1.4426860809326172, "step": 1844 }, { "epoch": 1.1360523278183916, "grad_norm": 2.46875, "learning_rate": 5.321554219820264e-06, "loss": 1.2452255487442017, "step": 1846 }, { "epoch": 1.1372835706040785, "grad_norm": 1.375, "learning_rate": 5.313697516726219e-06, "loss": 0.993253767490387, "step": 1848 }, { "epoch": 1.1385148133897653, "grad_norm": 4.6875, "learning_rate": 5.305843540742741e-06, "loss": 1.4551208019256592, "step": 1850 }, { "epoch": 1.139746056175452, "grad_norm": 3.9375, "learning_rate": 5.297992323078602e-06, "loss": 1.603740930557251, "step": 1852 }, { "epoch": 1.1409772989611389, "grad_norm": 2.140625, "learning_rate": 5.2901438949316166e-06, "loss": 1.4689736366271973, "step": 1854 }, { "epoch": 1.1422085417468257, "grad_norm": 2.4375, "learning_rate": 5.282298287488521e-06, "loss": 1.4280515909194946, "step": 1856 }, { "epoch": 1.1434397845325126, "grad_norm": 0.9453125, "learning_rate": 5.2744555319248336e-06, "loss": 1.2956243753433228, "step": 1858 }, { "epoch": 1.1446710273181993, "grad_norm": 0.7734375, "learning_rate": 5.2666156594047495e-06, "loss": 1.0923516750335693, "step": 1860 }, { "epoch": 1.1459022701038861, "grad_norm": 1.515625, "learning_rate": 5.258778701081e-06, "loss": 1.063520073890686, "step": 1862 }, { "epoch": 1.147133512889573, "grad_norm": 1.3984375, "learning_rate": 5.250944688094741e-06, "loss": 0.9760624170303345, "step": 1864 }, { "epoch": 1.1483647556752596, "grad_norm": 2.65625, "learning_rate": 5.243113651575426e-06, "loss": 1.2185200452804565, "step": 1866 }, { "epoch": 1.1495959984609465, "grad_norm": 3.09375, "learning_rate": 5.2352856226406765e-06, "loss": 1.452656626701355, "step": 1868 }, { "epoch": 1.1508272412466334, "grad_norm": 2.15625, "learning_rate": 5.227460632396164e-06, "loss": 1.4224770069122314, "step": 1870 }, { "epoch": 1.15205848403232, "grad_norm": 2.5, "learning_rate": 5.219638711935489e-06, "loss": 1.4585014581680298, "step": 1872 }, { "epoch": 1.153289726818007, "grad_norm": 3.5, "learning_rate": 5.2118198923400485e-06, "loss": 1.6782886981964111, "step": 1874 }, { "epoch": 1.1545209696036938, "grad_norm": 6.09375, "learning_rate": 5.2040042046789215e-06, "loss": 1.87554132938385, "step": 1876 }, { "epoch": 1.1557522123893804, "grad_norm": 3.28125, "learning_rate": 5.1961916800087446e-06, "loss": 1.6450105905532837, "step": 1878 }, { "epoch": 1.1569834551750673, "grad_norm": 2.234375, "learning_rate": 5.18838234937358e-06, "loss": 1.4129842519760132, "step": 1880 }, { "epoch": 1.1582146979607542, "grad_norm": 0.80078125, "learning_rate": 5.180576243804796e-06, "loss": 1.189549207687378, "step": 1882 }, { "epoch": 1.159445940746441, "grad_norm": 1.65625, "learning_rate": 5.172773394320956e-06, "loss": 1.2712268829345703, "step": 1884 }, { "epoch": 1.1606771835321277, "grad_norm": 0.59375, "learning_rate": 5.1649738319276766e-06, "loss": 1.157703161239624, "step": 1886 }, { "epoch": 1.1619084263178145, "grad_norm": 0.5703125, "learning_rate": 5.1571775876175154e-06, "loss": 1.3145527839660645, "step": 1888 }, { "epoch": 1.1631396691035014, "grad_norm": 2.046875, "learning_rate": 5.149384692369845e-06, "loss": 1.08966863155365, "step": 1890 }, { "epoch": 1.164370911889188, "grad_norm": 2.859375, "learning_rate": 5.1415951771507285e-06, "loss": 1.065926194190979, "step": 1892 }, { "epoch": 1.165602154674875, "grad_norm": 3.09375, "learning_rate": 5.1338090729128006e-06, "loss": 1.2760896682739258, "step": 1894 }, { "epoch": 1.1668333974605618, "grad_norm": 2.546875, "learning_rate": 5.126026410595142e-06, "loss": 1.4395681619644165, "step": 1896 }, { "epoch": 1.1680646402462487, "grad_norm": 2.0, "learning_rate": 5.1182472211231514e-06, "loss": 1.4737911224365234, "step": 1898 }, { "epoch": 1.1692958830319353, "grad_norm": 2.140625, "learning_rate": 5.110471535408437e-06, "loss": 1.4659451246261597, "step": 1900 }, { "epoch": 1.1705271258176222, "grad_norm": 3.515625, "learning_rate": 5.102699384348672e-06, "loss": 1.361763596534729, "step": 1902 }, { "epoch": 1.1717583686033088, "grad_norm": 2.296875, "learning_rate": 5.0949307988275e-06, "loss": 1.3668726682662964, "step": 1904 }, { "epoch": 1.1729896113889957, "grad_norm": 2.796875, "learning_rate": 5.087165809714381e-06, "loss": 1.4556688070297241, "step": 1906 }, { "epoch": 1.1742208541746826, "grad_norm": 2.828125, "learning_rate": 5.0794044478644934e-06, "loss": 1.443922758102417, "step": 1908 }, { "epoch": 1.1754520969603695, "grad_norm": 2.65625, "learning_rate": 5.071646744118598e-06, "loss": 1.4225409030914307, "step": 1910 }, { "epoch": 1.176683339746056, "grad_norm": 7.21875, "learning_rate": 5.063892729302924e-06, "loss": 1.427457571029663, "step": 1912 }, { "epoch": 1.177914582531743, "grad_norm": 2.8125, "learning_rate": 5.056142434229037e-06, "loss": 1.4073694944381714, "step": 1914 }, { "epoch": 1.1791458253174298, "grad_norm": 7.9375, "learning_rate": 5.048395889693725e-06, "loss": 1.456547498703003, "step": 1916 }, { "epoch": 1.1803770681031165, "grad_norm": 2.140625, "learning_rate": 5.040653126478871e-06, "loss": 1.4025368690490723, "step": 1918 }, { "epoch": 1.1816083108888034, "grad_norm": 1.765625, "learning_rate": 5.032914175351337e-06, "loss": 1.4511933326721191, "step": 1920 }, { "epoch": 1.1828395536744902, "grad_norm": 2.234375, "learning_rate": 5.025179067062833e-06, "loss": 1.3442623615264893, "step": 1922 }, { "epoch": 1.184070796460177, "grad_norm": 6.40625, "learning_rate": 5.017447832349795e-06, "loss": 1.610106110572815, "step": 1924 }, { "epoch": 1.1853020392458637, "grad_norm": 7.9375, "learning_rate": 5.009720501933276e-06, "loss": 0.8935953974723816, "step": 1926 }, { "epoch": 1.1865332820315506, "grad_norm": 8.375, "learning_rate": 5.001997106518808e-06, "loss": 0.2732890248298645, "step": 1928 }, { "epoch": 1.1877645248172375, "grad_norm": 27.5, "learning_rate": 4.9942776767962894e-06, "loss": 0.7279981374740601, "step": 1930 }, { "epoch": 1.1889957676029241, "grad_norm": 3.3125, "learning_rate": 4.986562243439861e-06, "loss": 1.6977769136428833, "step": 1932 }, { "epoch": 1.190227010388611, "grad_norm": 3.96875, "learning_rate": 4.978850837107782e-06, "loss": 1.213844895362854, "step": 1934 }, { "epoch": 1.1914582531742979, "grad_norm": 4.875, "learning_rate": 4.971143488442311e-06, "loss": 0.7499221563339233, "step": 1936 }, { "epoch": 1.1926894959599845, "grad_norm": 5.15625, "learning_rate": 4.96344022806958e-06, "loss": 1.049713373184204, "step": 1938 }, { "epoch": 1.1939207387456714, "grad_norm": 3.1875, "learning_rate": 4.955741086599481e-06, "loss": 1.568014144897461, "step": 1940 }, { "epoch": 1.1951519815313583, "grad_norm": 4.125, "learning_rate": 4.948046094625532e-06, "loss": 1.2914780378341675, "step": 1942 }, { "epoch": 1.196383224317045, "grad_norm": 2.65625, "learning_rate": 4.940355282724769e-06, "loss": 1.070427656173706, "step": 1944 }, { "epoch": 1.1976144671027318, "grad_norm": 7.1875, "learning_rate": 4.932668681457615e-06, "loss": 1.1766752004623413, "step": 1946 }, { "epoch": 1.1988457098884187, "grad_norm": 2.84375, "learning_rate": 4.9249863213677615e-06, "loss": 1.4076998233795166, "step": 1948 }, { "epoch": 1.2000769526741055, "grad_norm": 3.40625, "learning_rate": 4.917308232982048e-06, "loss": 1.2345445156097412, "step": 1950 }, { "epoch": 1.2013081954597922, "grad_norm": 2.484375, "learning_rate": 4.909634446810339e-06, "loss": 1.2328537702560425, "step": 1952 }, { "epoch": 1.202539438245479, "grad_norm": 2.25, "learning_rate": 4.901964993345402e-06, "loss": 1.3884059190750122, "step": 1954 }, { "epoch": 1.203770681031166, "grad_norm": 2.28125, "learning_rate": 4.8942999030627915e-06, "loss": 1.4215402603149414, "step": 1956 }, { "epoch": 1.2050019238168526, "grad_norm": 2.125, "learning_rate": 4.886639206420722e-06, "loss": 1.2735439538955688, "step": 1958 }, { "epoch": 1.2062331666025394, "grad_norm": 3.421875, "learning_rate": 4.878982933859951e-06, "loss": 1.3453803062438965, "step": 1960 }, { "epoch": 1.2074644093882263, "grad_norm": 4.28125, "learning_rate": 4.8713311158036544e-06, "loss": 1.2609363794326782, "step": 1962 }, { "epoch": 1.208695652173913, "grad_norm": 1.21875, "learning_rate": 4.863683782657311e-06, "loss": 1.1702247858047485, "step": 1964 }, { "epoch": 1.2099268949595998, "grad_norm": 4.28125, "learning_rate": 4.8560409648085706e-06, "loss": 1.4844826459884644, "step": 1966 }, { "epoch": 1.2111581377452867, "grad_norm": 4.28125, "learning_rate": 4.84840269262715e-06, "loss": 1.9572813510894775, "step": 1968 }, { "epoch": 1.2123893805309733, "grad_norm": 2.328125, "learning_rate": 4.840768996464696e-06, "loss": 1.7173551321029663, "step": 1970 }, { "epoch": 1.2136206233166602, "grad_norm": 2.265625, "learning_rate": 4.8331399066546795e-06, "loss": 1.4655386209487915, "step": 1972 }, { "epoch": 1.214851866102347, "grad_norm": 7.46875, "learning_rate": 4.825515453512259e-06, "loss": 1.461625337600708, "step": 1974 }, { "epoch": 1.216083108888034, "grad_norm": 2.28125, "learning_rate": 4.8178956673341745e-06, "loss": 1.3926430940628052, "step": 1976 }, { "epoch": 1.2173143516737206, "grad_norm": 2.953125, "learning_rate": 4.810280578398621e-06, "loss": 1.408326506614685, "step": 1978 }, { "epoch": 1.2185455944594075, "grad_norm": 2.703125, "learning_rate": 4.802670216965125e-06, "loss": 1.4212825298309326, "step": 1980 }, { "epoch": 1.2197768372450943, "grad_norm": 4.1875, "learning_rate": 4.79506461327443e-06, "loss": 1.7488731145858765, "step": 1982 }, { "epoch": 1.221008080030781, "grad_norm": 3.453125, "learning_rate": 4.787463797548373e-06, "loss": 1.6178581714630127, "step": 1984 }, { "epoch": 1.2222393228164679, "grad_norm": 3.609375, "learning_rate": 4.779867799989765e-06, "loss": 1.9020529985427856, "step": 1986 }, { "epoch": 1.2234705656021547, "grad_norm": 4.78125, "learning_rate": 4.772276650782275e-06, "loss": 1.8441529273986816, "step": 1988 }, { "epoch": 1.2247018083878416, "grad_norm": 5.0, "learning_rate": 4.764690380090303e-06, "loss": 1.6788111925125122, "step": 1990 }, { "epoch": 1.2259330511735282, "grad_norm": 2.375, "learning_rate": 4.757109018058865e-06, "loss": 1.3692415952682495, "step": 1992 }, { "epoch": 1.2271642939592151, "grad_norm": 3.6875, "learning_rate": 4.749532594813469e-06, "loss": 1.4384074211120605, "step": 1994 }, { "epoch": 1.2283955367449018, "grad_norm": 3.359375, "learning_rate": 4.7419611404600005e-06, "loss": 1.49677312374115, "step": 1996 }, { "epoch": 1.2296267795305886, "grad_norm": 1.8359375, "learning_rate": 4.734394685084603e-06, "loss": 1.3461604118347168, "step": 1998 }, { "epoch": 1.2308580223162755, "grad_norm": 1.9453125, "learning_rate": 4.726833258753552e-06, "loss": 1.3872590065002441, "step": 2000 }, { "epoch": 1.2320892651019624, "grad_norm": 3.046875, "learning_rate": 4.719276891513139e-06, "loss": 1.5854167938232422, "step": 2002 }, { "epoch": 1.233320507887649, "grad_norm": 3.859375, "learning_rate": 4.711725613389557e-06, "loss": 1.751989722251892, "step": 2004 }, { "epoch": 1.234551750673336, "grad_norm": 2.15625, "learning_rate": 4.704179454388773e-06, "loss": 1.7362277507781982, "step": 2006 }, { "epoch": 1.2357829934590228, "grad_norm": 2.96875, "learning_rate": 4.696638444496411e-06, "loss": 1.4224112033843994, "step": 2008 }, { "epoch": 1.2370142362447094, "grad_norm": 2.234375, "learning_rate": 4.68910261367764e-06, "loss": 1.4005614519119263, "step": 2010 }, { "epoch": 1.2382454790303963, "grad_norm": 2.375, "learning_rate": 4.681571991877043e-06, "loss": 1.3552170991897583, "step": 2012 }, { "epoch": 1.2394767218160831, "grad_norm": 4.15625, "learning_rate": 4.674046609018512e-06, "loss": 0.833554208278656, "step": 2014 }, { "epoch": 1.24070796460177, "grad_norm": 5.875, "learning_rate": 4.666526495005115e-06, "loss": 0.39474761486053467, "step": 2016 }, { "epoch": 1.2419392073874567, "grad_norm": 5.03125, "learning_rate": 4.659011679718981e-06, "loss": 0.4157543480396271, "step": 2018 }, { "epoch": 1.2431704501731435, "grad_norm": 2.96875, "learning_rate": 4.651502193021195e-06, "loss": 0.6850585341453552, "step": 2020 }, { "epoch": 1.2444016929588304, "grad_norm": 1.78125, "learning_rate": 4.643998064751658e-06, "loss": 1.0783367156982422, "step": 2022 }, { "epoch": 1.245632935744517, "grad_norm": 2.328125, "learning_rate": 4.636499324728982e-06, "loss": 1.420037031173706, "step": 2024 }, { "epoch": 1.246864178530204, "grad_norm": 4.875, "learning_rate": 4.629006002750368e-06, "loss": 1.3482452630996704, "step": 2026 }, { "epoch": 1.2480954213158908, "grad_norm": 9.375, "learning_rate": 4.6215181285914884e-06, "loss": 1.141502022743225, "step": 2028 }, { "epoch": 1.2493266641015774, "grad_norm": 2.40625, "learning_rate": 4.614035732006368e-06, "loss": 1.4624545574188232, "step": 2030 }, { "epoch": 1.2505579068872643, "grad_norm": 6.1875, "learning_rate": 4.606558842727265e-06, "loss": 1.8598597049713135, "step": 2032 }, { "epoch": 1.2517891496729512, "grad_norm": 2.21875, "learning_rate": 4.599087490464553e-06, "loss": 1.3582333326339722, "step": 2034 }, { "epoch": 1.2530203924586378, "grad_norm": 1.8984375, "learning_rate": 4.591621704906603e-06, "loss": 1.374417781829834, "step": 2036 }, { "epoch": 1.2542516352443247, "grad_norm": 2.125, "learning_rate": 4.584161515719672e-06, "loss": 1.430912971496582, "step": 2038 }, { "epoch": 1.2554828780300116, "grad_norm": 3.265625, "learning_rate": 4.576706952547769e-06, "loss": 1.5239249467849731, "step": 2040 }, { "epoch": 1.2567141208156984, "grad_norm": 1.6796875, "learning_rate": 4.569258045012557e-06, "loss": 1.2920567989349365, "step": 2042 }, { "epoch": 1.257945363601385, "grad_norm": 2.3125, "learning_rate": 4.561814822713218e-06, "loss": 1.380325436592102, "step": 2044 }, { "epoch": 1.259176606387072, "grad_norm": 2.859375, "learning_rate": 4.554377315226348e-06, "loss": 1.4241161346435547, "step": 2046 }, { "epoch": 1.2604078491727588, "grad_norm": 2.84375, "learning_rate": 4.546945552105836e-06, "loss": 1.4067350625991821, "step": 2048 }, { "epoch": 1.2616390919584455, "grad_norm": 2.96875, "learning_rate": 4.539519562882736e-06, "loss": 1.4366530179977417, "step": 2050 }, { "epoch": 1.2628703347441324, "grad_norm": 2.328125, "learning_rate": 4.532099377065168e-06, "loss": 1.4625372886657715, "step": 2052 }, { "epoch": 1.2641015775298192, "grad_norm": 2.640625, "learning_rate": 4.524685024138187e-06, "loss": 1.398501992225647, "step": 2054 }, { "epoch": 1.265332820315506, "grad_norm": 2.453125, "learning_rate": 4.51727653356367e-06, "loss": 1.5378193855285645, "step": 2056 }, { "epoch": 1.2665640631011927, "grad_norm": 3.765625, "learning_rate": 4.509873934780204e-06, "loss": 1.3529696464538574, "step": 2058 }, { "epoch": 1.2677953058868796, "grad_norm": 5.25, "learning_rate": 4.502477257202957e-06, "loss": 1.392851710319519, "step": 2060 }, { "epoch": 1.2690265486725663, "grad_norm": 10.3125, "learning_rate": 4.495086530223576e-06, "loss": 1.705520510673523, "step": 2062 }, { "epoch": 1.2702577914582531, "grad_norm": 3.859375, "learning_rate": 4.487701783210054e-06, "loss": 1.6694080829620361, "step": 2064 }, { "epoch": 1.27148903424394, "grad_norm": 3.84375, "learning_rate": 4.480323045506628e-06, "loss": 1.893826961517334, "step": 2066 }, { "epoch": 1.2727202770296269, "grad_norm": 2.828125, "learning_rate": 4.472950346433655e-06, "loss": 1.817057490348816, "step": 2068 }, { "epoch": 1.2739515198153135, "grad_norm": 2.125, "learning_rate": 4.465583715287496e-06, "loss": 1.6186152696609497, "step": 2070 }, { "epoch": 1.2751827626010004, "grad_norm": 3.078125, "learning_rate": 4.4582231813404014e-06, "loss": 1.4072115421295166, "step": 2072 }, { "epoch": 1.2764140053866873, "grad_norm": 9.625, "learning_rate": 4.450868773840392e-06, "loss": 1.1940281391143799, "step": 2074 }, { "epoch": 1.277645248172374, "grad_norm": 3.40625, "learning_rate": 4.443520522011146e-06, "loss": 1.7899526357650757, "step": 2076 }, { "epoch": 1.2788764909580608, "grad_norm": 2.25, "learning_rate": 4.43617845505188e-06, "loss": 1.3512349128723145, "step": 2078 }, { "epoch": 1.2801077337437476, "grad_norm": 2.40625, "learning_rate": 4.428842602137235e-06, "loss": 1.4610764980316162, "step": 2080 }, { "epoch": 1.2813389765294345, "grad_norm": 4.34375, "learning_rate": 4.421512992417158e-06, "loss": 0.9584015011787415, "step": 2082 }, { "epoch": 1.2825702193151212, "grad_norm": 5.375, "learning_rate": 4.414189655016789e-06, "loss": 0.5581737756729126, "step": 2084 }, { "epoch": 1.283801462100808, "grad_norm": 2.234375, "learning_rate": 4.406872619036348e-06, "loss": 0.6756957769393921, "step": 2086 }, { "epoch": 1.2850327048864947, "grad_norm": 2.203125, "learning_rate": 4.399561913551009e-06, "loss": 1.0549678802490234, "step": 2088 }, { "epoch": 1.2862639476721816, "grad_norm": 2.078125, "learning_rate": 4.392257567610794e-06, "loss": 1.180040717124939, "step": 2090 }, { "epoch": 1.2874951904578684, "grad_norm": 2.234375, "learning_rate": 4.384959610240456e-06, "loss": 1.3322702646255493, "step": 2092 }, { "epoch": 1.2887264332435553, "grad_norm": 2.828125, "learning_rate": 4.377668070439359e-06, "loss": 1.3996449708938599, "step": 2094 }, { "epoch": 1.289957676029242, "grad_norm": 2.4375, "learning_rate": 4.3703829771813685e-06, "loss": 1.412023901939392, "step": 2096 }, { "epoch": 1.2911889188149288, "grad_norm": 8.5625, "learning_rate": 4.363104359414732e-06, "loss": 1.0849536657333374, "step": 2098 }, { "epoch": 1.2924201616006157, "grad_norm": 2.625, "learning_rate": 4.3558322460619666e-06, "loss": 0.7259770631790161, "step": 2100 }, { "epoch": 1.2936514043863023, "grad_norm": 2.0625, "learning_rate": 4.3485666660197445e-06, "loss": 0.9858816266059875, "step": 2102 }, { "epoch": 1.2948826471719892, "grad_norm": 2.484375, "learning_rate": 4.3413076481587755e-06, "loss": 1.346143364906311, "step": 2104 }, { "epoch": 1.296113889957676, "grad_norm": 1.8515625, "learning_rate": 4.334055221323696e-06, "loss": 1.4789454936981201, "step": 2106 }, { "epoch": 1.297345132743363, "grad_norm": 2.40625, "learning_rate": 4.326809414332947e-06, "loss": 1.4302010536193848, "step": 2108 }, { "epoch": 1.2985763755290496, "grad_norm": 2.09375, "learning_rate": 4.319570255978668e-06, "loss": 1.4737108945846558, "step": 2110 }, { "epoch": 1.2998076183147365, "grad_norm": 1.890625, "learning_rate": 4.3123377750265804e-06, "loss": 1.4115092754364014, "step": 2112 }, { "epoch": 1.301038861100423, "grad_norm": 8.1875, "learning_rate": 4.305112000215872e-06, "loss": 1.4483823776245117, "step": 2114 }, { "epoch": 1.30227010388611, "grad_norm": 3.328125, "learning_rate": 4.297892960259081e-06, "loss": 1.314262866973877, "step": 2116 }, { "epoch": 1.3035013466717968, "grad_norm": 4.03125, "learning_rate": 4.290680683841983e-06, "loss": 1.3376444578170776, "step": 2118 }, { "epoch": 1.3047325894574837, "grad_norm": 4.25, "learning_rate": 4.283475199623483e-06, "loss": 1.4766546487808228, "step": 2120 }, { "epoch": 1.3059638322431706, "grad_norm": 2.546875, "learning_rate": 4.276276536235488e-06, "loss": 1.6876254081726074, "step": 2122 }, { "epoch": 1.3071950750288572, "grad_norm": 2.421875, "learning_rate": 4.2690847222828105e-06, "loss": 1.5043630599975586, "step": 2124 }, { "epoch": 1.308426317814544, "grad_norm": 4.625, "learning_rate": 4.261899786343038e-06, "loss": 1.9094319343566895, "step": 2126 }, { "epoch": 1.3096575606002308, "grad_norm": 4.6875, "learning_rate": 4.254721756966434e-06, "loss": 1.5988795757293701, "step": 2128 }, { "epoch": 1.3108888033859176, "grad_norm": 4.15625, "learning_rate": 4.247550662675814e-06, "loss": 1.7517191171646118, "step": 2130 }, { "epoch": 1.3121200461716045, "grad_norm": 6.09375, "learning_rate": 4.240386531966436e-06, "loss": 1.20930814743042, "step": 2132 }, { "epoch": 1.3133512889572914, "grad_norm": 3.0, "learning_rate": 4.233229393305891e-06, "loss": 1.524349570274353, "step": 2134 }, { "epoch": 1.314582531742978, "grad_norm": 4.6875, "learning_rate": 4.226079275133981e-06, "loss": 1.5060465335845947, "step": 2136 }, { "epoch": 1.3158137745286649, "grad_norm": 2.921875, "learning_rate": 4.218936205862614e-06, "loss": 1.264224648475647, "step": 2138 }, { "epoch": 1.3170450173143518, "grad_norm": 2.859375, "learning_rate": 4.211800213875687e-06, "loss": 1.440643310546875, "step": 2140 }, { "epoch": 1.3182762601000384, "grad_norm": 2.34375, "learning_rate": 4.204671327528978e-06, "loss": 1.4793394804000854, "step": 2142 }, { "epoch": 1.3195075028857253, "grad_norm": 2.609375, "learning_rate": 4.197549575150026e-06, "loss": 1.4133528470993042, "step": 2144 }, { "epoch": 1.3207387456714121, "grad_norm": 2.609375, "learning_rate": 4.190434985038023e-06, "loss": 1.4079787731170654, "step": 2146 }, { "epoch": 1.321969988457099, "grad_norm": 2.53125, "learning_rate": 4.183327585463704e-06, "loss": 1.4368443489074707, "step": 2148 }, { "epoch": 1.3232012312427857, "grad_norm": 2.0, "learning_rate": 4.1762274046692244e-06, "loss": 1.4060487747192383, "step": 2150 }, { "epoch": 1.3244324740284725, "grad_norm": 2.921875, "learning_rate": 4.1691344708680634e-06, "loss": 1.4592519998550415, "step": 2152 }, { "epoch": 1.3256637168141592, "grad_norm": 4.6875, "learning_rate": 4.162048812244897e-06, "loss": 1.6927399635314941, "step": 2154 }, { "epoch": 1.326894959599846, "grad_norm": 2.1875, "learning_rate": 4.154970456955495e-06, "loss": 1.5677540302276611, "step": 2156 }, { "epoch": 1.328126202385533, "grad_norm": 0.8984375, "learning_rate": 4.147899433126607e-06, "loss": 1.36515474319458, "step": 2158 }, { "epoch": 1.3293574451712198, "grad_norm": 4.75, "learning_rate": 4.140835768855848e-06, "loss": 0.9946126341819763, "step": 2160 }, { "epoch": 1.3305886879569064, "grad_norm": 3.90625, "learning_rate": 4.133779492211595e-06, "loss": 1.485393762588501, "step": 2162 }, { "epoch": 1.3318199307425933, "grad_norm": 4.3125, "learning_rate": 4.126730631232855e-06, "loss": 1.871983289718628, "step": 2164 }, { "epoch": 1.3330511735282802, "grad_norm": 0.90625, "learning_rate": 4.1196892139291836e-06, "loss": 1.5479357242584229, "step": 2166 }, { "epoch": 1.3342824163139668, "grad_norm": 1.8046875, "learning_rate": 4.112655268280551e-06, "loss": 1.1007100343704224, "step": 2168 }, { "epoch": 1.3355136590996537, "grad_norm": 3.09375, "learning_rate": 4.1056288222372385e-06, "loss": 1.3004124164581299, "step": 2170 }, { "epoch": 1.3367449018853406, "grad_norm": 2.6875, "learning_rate": 4.098609903719724e-06, "loss": 1.4792126417160034, "step": 2172 }, { "epoch": 1.3379761446710274, "grad_norm": 0.83203125, "learning_rate": 4.0915985406185815e-06, "loss": 1.221348762512207, "step": 2174 }, { "epoch": 1.339207387456714, "grad_norm": 0.82421875, "learning_rate": 4.084594760794356e-06, "loss": 1.0107485055923462, "step": 2176 }, { "epoch": 1.340438630242401, "grad_norm": 3.140625, "learning_rate": 4.077598592077458e-06, "loss": 1.1750391721725464, "step": 2178 }, { "epoch": 1.3416698730280876, "grad_norm": 2.421875, "learning_rate": 4.070610062268059e-06, "loss": 1.3896641731262207, "step": 2180 }, { "epoch": 1.3429011158137745, "grad_norm": 2.390625, "learning_rate": 4.063629199135977e-06, "loss": 1.4235678911209106, "step": 2182 }, { "epoch": 1.3441323585994613, "grad_norm": 2.96875, "learning_rate": 4.056656030420561e-06, "loss": 1.469269037246704, "step": 2184 }, { "epoch": 1.3453636013851482, "grad_norm": 8.25, "learning_rate": 4.049690583830588e-06, "loss": 1.6964664459228516, "step": 2186 }, { "epoch": 1.3465948441708349, "grad_norm": 4.71875, "learning_rate": 4.042732887044146e-06, "loss": 1.4317083358764648, "step": 2188 }, { "epoch": 1.3478260869565217, "grad_norm": 2.71875, "learning_rate": 4.03578296770854e-06, "loss": 1.290687918663025, "step": 2190 }, { "epoch": 1.3490573297422086, "grad_norm": 1.0234375, "learning_rate": 4.028840853440155e-06, "loss": 1.2132869958877563, "step": 2192 }, { "epoch": 1.3502885725278952, "grad_norm": 0.72265625, "learning_rate": 4.021906571824371e-06, "loss": 1.1403286457061768, "step": 2194 }, { "epoch": 1.3515198153135821, "grad_norm": 1.2265625, "learning_rate": 4.01498015041544e-06, "loss": 1.2438344955444336, "step": 2196 }, { "epoch": 1.352751058099269, "grad_norm": 2.78125, "learning_rate": 4.008061616736384e-06, "loss": 1.3489577770233154, "step": 2198 }, { "epoch": 1.3539823008849559, "grad_norm": 4.28125, "learning_rate": 4.00115099827888e-06, "loss": 1.4866937398910522, "step": 2200 }, { "epoch": 1.3552135436706425, "grad_norm": 3.265625, "learning_rate": 3.994248322503152e-06, "loss": 1.4686030149459839, "step": 2202 }, { "epoch": 1.3564447864563294, "grad_norm": 3.296875, "learning_rate": 3.987353616837864e-06, "loss": 1.7023189067840576, "step": 2204 }, { "epoch": 1.357676029242016, "grad_norm": 6.3125, "learning_rate": 3.980466908680011e-06, "loss": 1.3478009700775146, "step": 2206 }, { "epoch": 1.358907272027703, "grad_norm": 6.4375, "learning_rate": 3.973588225394804e-06, "loss": 1.93203604221344, "step": 2208 }, { "epoch": 1.3601385148133898, "grad_norm": 4.28125, "learning_rate": 3.966717594315573e-06, "loss": 1.782301425933838, "step": 2210 }, { "epoch": 1.3613697575990766, "grad_norm": 5.03125, "learning_rate": 3.959855042743644e-06, "loss": 1.6604218482971191, "step": 2212 }, { "epoch": 1.3626010003847635, "grad_norm": 6.0, "learning_rate": 3.953000597948246e-06, "loss": 1.6294022798538208, "step": 2214 }, { "epoch": 1.3638322431704502, "grad_norm": 3.1875, "learning_rate": 3.946154287166391e-06, "loss": 1.921142578125, "step": 2216 }, { "epoch": 1.365063485956137, "grad_norm": 3.359375, "learning_rate": 3.939316137602767e-06, "loss": 1.590577244758606, "step": 2218 }, { "epoch": 1.3662947287418237, "grad_norm": 3.0625, "learning_rate": 3.932486176429633e-06, "loss": 1.6776155233383179, "step": 2220 }, { "epoch": 1.3675259715275105, "grad_norm": 3.234375, "learning_rate": 3.925664430786715e-06, "loss": 1.7855265140533447, "step": 2222 }, { "epoch": 1.3687572143131974, "grad_norm": 4.1875, "learning_rate": 3.918850927781091e-06, "loss": 1.6806097030639648, "step": 2224 }, { "epoch": 1.3699884570988843, "grad_norm": 3.6875, "learning_rate": 3.912045694487083e-06, "loss": 1.7721450328826904, "step": 2226 }, { "epoch": 1.371219699884571, "grad_norm": 3.515625, "learning_rate": 3.905248757946154e-06, "loss": 1.5471529960632324, "step": 2228 }, { "epoch": 1.3724509426702578, "grad_norm": 1.1640625, "learning_rate": 3.898460145166802e-06, "loss": 1.17193603515625, "step": 2230 }, { "epoch": 1.3736821854559447, "grad_norm": 1.890625, "learning_rate": 3.891679883124446e-06, "loss": 1.124650239944458, "step": 2232 }, { "epoch": 1.3749134282416313, "grad_norm": 2.15625, "learning_rate": 3.884907998761323e-06, "loss": 1.1825627088546753, "step": 2234 }, { "epoch": 1.3761446710273182, "grad_norm": 4.125, "learning_rate": 3.8781445189863784e-06, "loss": 1.3950858116149902, "step": 2236 }, { "epoch": 1.377375913813005, "grad_norm": 2.078125, "learning_rate": 3.871389470675166e-06, "loss": 1.1823573112487793, "step": 2238 }, { "epoch": 1.378607156598692, "grad_norm": 2.640625, "learning_rate": 3.864642880669731e-06, "loss": 1.4916805028915405, "step": 2240 }, { "epoch": 1.3798383993843786, "grad_norm": 4.0625, "learning_rate": 3.857904775778511e-06, "loss": 1.5051604509353638, "step": 2242 }, { "epoch": 1.3810696421700654, "grad_norm": 2.53125, "learning_rate": 3.851175182776226e-06, "loss": 0.89178866147995, "step": 2244 }, { "epoch": 1.382300884955752, "grad_norm": 2.328125, "learning_rate": 3.844454128403774e-06, "loss": 1.2950024604797363, "step": 2246 }, { "epoch": 1.383532127741439, "grad_norm": 1.828125, "learning_rate": 3.837741639368122e-06, "loss": 1.510188102722168, "step": 2248 }, { "epoch": 1.3847633705271258, "grad_norm": 2.609375, "learning_rate": 3.831037742342203e-06, "loss": 1.3180665969848633, "step": 2250 }, { "epoch": 1.3859946133128127, "grad_norm": 2.875, "learning_rate": 3.824342463964806e-06, "loss": 1.1624681949615479, "step": 2252 }, { "epoch": 1.3872258560984994, "grad_norm": 1.0625, "learning_rate": 3.817655830840477e-06, "loss": 1.0815974473953247, "step": 2254 }, { "epoch": 1.3884570988841862, "grad_norm": 1.8359375, "learning_rate": 3.8109778695394064e-06, "loss": 0.9283170700073242, "step": 2256 }, { "epoch": 1.389688341669873, "grad_norm": 2.375, "learning_rate": 3.8043086065973258e-06, "loss": 1.3523941040039062, "step": 2258 }, { "epoch": 1.3909195844555597, "grad_norm": 2.28125, "learning_rate": 3.7976480685154028e-06, "loss": 1.3457260131835938, "step": 2260 }, { "epoch": 1.3921508272412466, "grad_norm": 1.578125, "learning_rate": 3.7909962817601377e-06, "loss": 1.2352582216262817, "step": 2262 }, { "epoch": 1.3933820700269335, "grad_norm": 1.6875, "learning_rate": 3.7843532727632525e-06, "loss": 1.0768884420394897, "step": 2264 }, { "epoch": 1.3946133128126204, "grad_norm": 2.796875, "learning_rate": 3.7777190679215927e-06, "loss": 1.3136581182479858, "step": 2266 }, { "epoch": 1.395844555598307, "grad_norm": 2.25, "learning_rate": 3.7710936935970173e-06, "loss": 1.4727461338043213, "step": 2268 }, { "epoch": 1.3970757983839939, "grad_norm": 7.75, "learning_rate": 3.7644771761163e-06, "loss": 1.4279669523239136, "step": 2270 }, { "epoch": 1.3983070411696805, "grad_norm": 2.296875, "learning_rate": 3.7578695417710164e-06, "loss": 1.4044800996780396, "step": 2272 }, { "epoch": 1.3995382839553674, "grad_norm": 1.40625, "learning_rate": 3.7512708168174457e-06, "loss": 1.259403109550476, "step": 2274 }, { "epoch": 1.4007695267410543, "grad_norm": 1.609375, "learning_rate": 3.744681027476464e-06, "loss": 1.0513827800750732, "step": 2276 }, { "epoch": 1.4020007695267411, "grad_norm": 3.28125, "learning_rate": 3.738100199933441e-06, "loss": 1.343108057975769, "step": 2278 }, { "epoch": 1.4032320123124278, "grad_norm": 3.03125, "learning_rate": 3.731528360338135e-06, "loss": 1.8145740032196045, "step": 2280 }, { "epoch": 1.4044632550981146, "grad_norm": 0.62890625, "learning_rate": 3.724965534804588e-06, "loss": 1.4862263202667236, "step": 2282 }, { "epoch": 1.4056944978838015, "grad_norm": 1.4296875, "learning_rate": 3.71841174941103e-06, "loss": 1.0445220470428467, "step": 2284 }, { "epoch": 1.4069257406694882, "grad_norm": 2.84375, "learning_rate": 3.7118670301997613e-06, "loss": 1.2141282558441162, "step": 2286 }, { "epoch": 1.408156983455175, "grad_norm": 2.109375, "learning_rate": 3.7053314031770617e-06, "loss": 1.4393140077590942, "step": 2288 }, { "epoch": 1.409388226240862, "grad_norm": 5.0625, "learning_rate": 3.698804894313075e-06, "loss": 1.3356846570968628, "step": 2290 }, { "epoch": 1.4106194690265488, "grad_norm": 4.15625, "learning_rate": 3.6922875295417222e-06, "loss": 1.8149614334106445, "step": 2292 }, { "epoch": 1.4118507118122354, "grad_norm": 0.7890625, "learning_rate": 3.6857793347605824e-06, "loss": 1.0744565725326538, "step": 2294 }, { "epoch": 1.4130819545979223, "grad_norm": 0.99609375, "learning_rate": 3.6792803358308e-06, "loss": 1.0420913696289062, "step": 2296 }, { "epoch": 1.414313197383609, "grad_norm": 4.8125, "learning_rate": 3.672790558576975e-06, "loss": 1.4709570407867432, "step": 2298 }, { "epoch": 1.4155444401692958, "grad_norm": 5.375, "learning_rate": 3.666310028787069e-06, "loss": 1.267836093902588, "step": 2300 }, { "epoch": 1.4167756829549827, "grad_norm": 1.265625, "learning_rate": 3.6598387722122945e-06, "loss": 1.403648853302002, "step": 2302 }, { "epoch": 1.4180069257406696, "grad_norm": 1.234375, "learning_rate": 3.653376814567014e-06, "loss": 1.0699892044067383, "step": 2304 }, { "epoch": 1.4192381685263564, "grad_norm": 8.625, "learning_rate": 3.646924181528641e-06, "loss": 1.3329198360443115, "step": 2306 }, { "epoch": 1.420469411312043, "grad_norm": 3.640625, "learning_rate": 3.6404808987375388e-06, "loss": 1.5011190176010132, "step": 2308 }, { "epoch": 1.42170065409773, "grad_norm": 0.99609375, "learning_rate": 3.6340469917969146e-06, "loss": 1.1454980373382568, "step": 2310 }, { "epoch": 1.4229318968834166, "grad_norm": 1.3359375, "learning_rate": 3.627622486272718e-06, "loss": 1.0486154556274414, "step": 2312 }, { "epoch": 1.4241631396691035, "grad_norm": 3.125, "learning_rate": 3.6212074076935417e-06, "loss": 1.3061400651931763, "step": 2314 }, { "epoch": 1.4253943824547903, "grad_norm": 3.8125, "learning_rate": 3.6148017815505232e-06, "loss": 1.856020450592041, "step": 2316 }, { "epoch": 1.4266256252404772, "grad_norm": 1.859375, "learning_rate": 3.6084056332972324e-06, "loss": 1.5413720607757568, "step": 2318 }, { "epoch": 1.4278568680261639, "grad_norm": 2.96875, "learning_rate": 3.602018988349582e-06, "loss": 1.4303263425827026, "step": 2320 }, { "epoch": 1.4290881108118507, "grad_norm": 4.03125, "learning_rate": 3.59564187208572e-06, "loss": 1.5900418758392334, "step": 2322 }, { "epoch": 1.4303193535975376, "grad_norm": 4.15625, "learning_rate": 3.589274309845936e-06, "loss": 1.7773805856704712, "step": 2324 }, { "epoch": 1.4315505963832242, "grad_norm": 1.1796875, "learning_rate": 3.5829163269325496e-06, "loss": 1.1708474159240723, "step": 2326 }, { "epoch": 1.432781839168911, "grad_norm": 3.546875, "learning_rate": 3.576567948609817e-06, "loss": 0.9795735478401184, "step": 2328 }, { "epoch": 1.434013081954598, "grad_norm": 2.5, "learning_rate": 3.570229200103832e-06, "loss": 1.0455710887908936, "step": 2330 }, { "epoch": 1.4352443247402848, "grad_norm": 2.203125, "learning_rate": 3.5639001066024205e-06, "loss": 1.4353384971618652, "step": 2332 }, { "epoch": 1.4364755675259715, "grad_norm": 2.609375, "learning_rate": 3.557580693255043e-06, "loss": 1.44502854347229, "step": 2334 }, { "epoch": 1.4377068103116584, "grad_norm": 2.3125, "learning_rate": 3.5512709851726968e-06, "loss": 1.4663447141647339, "step": 2336 }, { "epoch": 1.438938053097345, "grad_norm": 2.25, "learning_rate": 3.544971007427811e-06, "loss": 1.5666968822479248, "step": 2338 }, { "epoch": 1.4401692958830319, "grad_norm": 3.421875, "learning_rate": 3.538680785054154e-06, "loss": 1.4446769952774048, "step": 2340 }, { "epoch": 1.4414005386687188, "grad_norm": 3.21875, "learning_rate": 3.5324003430467265e-06, "loss": 1.1215463876724243, "step": 2342 }, { "epoch": 1.4426317814544056, "grad_norm": 2.390625, "learning_rate": 3.526129706361668e-06, "loss": 0.6768141984939575, "step": 2344 }, { "epoch": 1.4438630242400923, "grad_norm": 4.65625, "learning_rate": 3.5198688999161507e-06, "loss": 1.2538286447525024, "step": 2346 }, { "epoch": 1.4450942670257791, "grad_norm": 4.59375, "learning_rate": 3.5136179485882928e-06, "loss": 1.9145801067352295, "step": 2348 }, { "epoch": 1.446325509811466, "grad_norm": 2.875, "learning_rate": 3.5073768772170454e-06, "loss": 1.7400861978530884, "step": 2350 }, { "epoch": 1.4475567525971527, "grad_norm": 2.4375, "learning_rate": 3.501145710602103e-06, "loss": 1.4013432264328003, "step": 2352 }, { "epoch": 1.4487879953828395, "grad_norm": 4.71875, "learning_rate": 3.494924473503801e-06, "loss": 1.7890344858169556, "step": 2354 }, { "epoch": 1.4500192381685264, "grad_norm": 2.625, "learning_rate": 3.4887131906430216e-06, "loss": 1.8322502374649048, "step": 2356 }, { "epoch": 1.4512504809542133, "grad_norm": 11.375, "learning_rate": 3.482511886701091e-06, "loss": 1.65338933467865, "step": 2358 }, { "epoch": 1.4524817237399, "grad_norm": 5.875, "learning_rate": 3.4763205863196795e-06, "loss": 1.3839627504348755, "step": 2360 }, { "epoch": 1.4537129665255868, "grad_norm": 1.7734375, "learning_rate": 3.4701393141007102e-06, "loss": 1.5974239110946655, "step": 2362 }, { "epoch": 1.4549442093112734, "grad_norm": 18.75, "learning_rate": 3.4639680946062595e-06, "loss": 1.3460378646850586, "step": 2364 }, { "epoch": 1.4561754520969603, "grad_norm": 2.109375, "learning_rate": 3.457806952358456e-06, "loss": 1.401890516281128, "step": 2366 }, { "epoch": 1.4574066948826472, "grad_norm": 1.6640625, "learning_rate": 3.4516559118393827e-06, "loss": 1.390883445739746, "step": 2368 }, { "epoch": 1.458637937668334, "grad_norm": 1.8828125, "learning_rate": 3.4455149974909864e-06, "loss": 1.3334836959838867, "step": 2370 }, { "epoch": 1.4598691804540207, "grad_norm": 1.8515625, "learning_rate": 3.4393842337149775e-06, "loss": 1.331444501876831, "step": 2372 }, { "epoch": 1.4611004232397076, "grad_norm": 2.859375, "learning_rate": 3.433263644872724e-06, "loss": 0.9283863306045532, "step": 2374 }, { "epoch": 1.4623316660253944, "grad_norm": 5.0625, "learning_rate": 3.4271532552851692e-06, "loss": 0.4115677773952484, "step": 2376 }, { "epoch": 1.463562908811081, "grad_norm": 2.0, "learning_rate": 3.421053089232725e-06, "loss": 0.6582207679748535, "step": 2378 }, { "epoch": 1.464794151596768, "grad_norm": 2.046875, "learning_rate": 3.4149631709551833e-06, "loss": 1.38728928565979, "step": 2380 }, { "epoch": 1.4660253943824548, "grad_norm": 2.0, "learning_rate": 3.4088835246516098e-06, "loss": 0.8790442943572998, "step": 2382 }, { "epoch": 1.4672566371681417, "grad_norm": 18.875, "learning_rate": 3.402814174480257e-06, "loss": 0.3224247694015503, "step": 2384 }, { "epoch": 1.4684878799538283, "grad_norm": 3.40625, "learning_rate": 3.3967551445584617e-06, "loss": 0.8255885243415833, "step": 2386 }, { "epoch": 1.4697191227395152, "grad_norm": 6.78125, "learning_rate": 3.3907064589625538e-06, "loss": 1.838360071182251, "step": 2388 }, { "epoch": 1.4709503655252019, "grad_norm": 2.5625, "learning_rate": 3.384668141727757e-06, "loss": 1.5447924137115479, "step": 2390 }, { "epoch": 1.4721816083108887, "grad_norm": 1.8203125, "learning_rate": 3.3786402168480976e-06, "loss": 1.358646035194397, "step": 2392 }, { "epoch": 1.4734128510965756, "grad_norm": 2.90625, "learning_rate": 3.372622708276302e-06, "loss": 1.4267942905426025, "step": 2394 }, { "epoch": 1.4746440938822625, "grad_norm": 4.4375, "learning_rate": 3.3666156399237125e-06, "loss": 1.7402775287628174, "step": 2396 }, { "epoch": 1.4758753366679493, "grad_norm": 2.5, "learning_rate": 3.360619035660181e-06, "loss": 1.8984918594360352, "step": 2398 }, { "epoch": 1.477106579453636, "grad_norm": 3.546875, "learning_rate": 3.354632919313979e-06, "loss": 1.718995451927185, "step": 2400 }, { "epoch": 1.4783378222393229, "grad_norm": 1.6484375, "learning_rate": 3.3486573146717066e-06, "loss": 1.5234237909317017, "step": 2402 }, { "epoch": 1.4795690650250095, "grad_norm": 1.1328125, "learning_rate": 3.3426922454781907e-06, "loss": 1.084274172782898, "step": 2404 }, { "epoch": 1.4808003078106964, "grad_norm": 5.625, "learning_rate": 3.336737735436395e-06, "loss": 1.345269799232483, "step": 2406 }, { "epoch": 1.4820315505963833, "grad_norm": 2.359375, "learning_rate": 3.3307938082073256e-06, "loss": 1.7669227123260498, "step": 2408 }, { "epoch": 1.4832627933820701, "grad_norm": 1.671875, "learning_rate": 3.3248604874099377e-06, "loss": 1.3955857753753662, "step": 2410 }, { "epoch": 1.4844940361677568, "grad_norm": 2.015625, "learning_rate": 3.318937796621039e-06, "loss": 1.4464383125305176, "step": 2412 }, { "epoch": 1.4857252789534436, "grad_norm": 3.078125, "learning_rate": 3.313025759375198e-06, "loss": 1.3337277173995972, "step": 2414 }, { "epoch": 1.4869565217391305, "grad_norm": 2.03125, "learning_rate": 3.3071243991646473e-06, "loss": 1.4842052459716797, "step": 2416 }, { "epoch": 1.4881877645248172, "grad_norm": 1.71875, "learning_rate": 3.301233739439198e-06, "loss": 0.7694661617279053, "step": 2418 }, { "epoch": 1.489419007310504, "grad_norm": 9.1875, "learning_rate": 3.2953538036061373e-06, "loss": 0.2676663398742676, "step": 2420 }, { "epoch": 1.490650250096191, "grad_norm": 3.515625, "learning_rate": 3.289484615030142e-06, "loss": 0.8886584043502808, "step": 2422 }, { "epoch": 1.4918814928818778, "grad_norm": 2.75, "learning_rate": 3.2836261970331807e-06, "loss": 1.4449005126953125, "step": 2424 }, { "epoch": 1.4931127356675644, "grad_norm": 3.796875, "learning_rate": 3.2777785728944282e-06, "loss": 1.6887394189834595, "step": 2426 }, { "epoch": 1.4943439784532513, "grad_norm": 4.71875, "learning_rate": 3.2719417658501663e-06, "loss": 1.7763824462890625, "step": 2428 }, { "epoch": 1.495575221238938, "grad_norm": 1.328125, "learning_rate": 3.2661157990936897e-06, "loss": 1.5290007591247559, "step": 2430 }, { "epoch": 1.4968064640246248, "grad_norm": 1.9609375, "learning_rate": 3.2603006957752215e-06, "loss": 1.2895843982696533, "step": 2432 }, { "epoch": 1.4980377068103117, "grad_norm": 1.0390625, "learning_rate": 3.2544964790018207e-06, "loss": 1.420303225517273, "step": 2434 }, { "epoch": 1.4992689495959985, "grad_norm": 1.875, "learning_rate": 3.248703171837282e-06, "loss": 1.1444916725158691, "step": 2436 }, { "epoch": 1.5005001923816854, "grad_norm": 2.1875, "learning_rate": 3.242920797302051e-06, "loss": 1.2554428577423096, "step": 2438 }, { "epoch": 1.501731435167372, "grad_norm": 1.7578125, "learning_rate": 3.23714937837313e-06, "loss": 1.4247585535049438, "step": 2440 }, { "epoch": 1.5029626779530587, "grad_norm": 0.84375, "learning_rate": 3.2313889379839945e-06, "loss": 1.1883597373962402, "step": 2442 }, { "epoch": 1.5041939207387456, "grad_norm": 0.65625, "learning_rate": 3.2256394990244842e-06, "loss": 1.0033619403839111, "step": 2444 }, { "epoch": 1.5054251635244325, "grad_norm": 3.28125, "learning_rate": 3.2199010843407317e-06, "loss": 1.2857462167739868, "step": 2446 }, { "epoch": 1.5066564063101193, "grad_norm": 2.96875, "learning_rate": 3.214173716735059e-06, "loss": 1.4051223993301392, "step": 2448 }, { "epoch": 1.5078876490958062, "grad_norm": 1.984375, "learning_rate": 3.208457418965895e-06, "loss": 1.3864938020706177, "step": 2450 }, { "epoch": 1.5091188918814928, "grad_norm": 1.0859375, "learning_rate": 3.202752213747678e-06, "loss": 1.2609646320343018, "step": 2452 }, { "epoch": 1.5103501346671797, "grad_norm": 3.90625, "learning_rate": 3.197058123750769e-06, "loss": 1.5066922903060913, "step": 2454 }, { "epoch": 1.5115813774528664, "grad_norm": 39.75, "learning_rate": 3.191375171601362e-06, "loss": 1.7706074714660645, "step": 2456 }, { "epoch": 1.5128126202385532, "grad_norm": 1.0859375, "learning_rate": 3.185703379881393e-06, "loss": 1.743719458580017, "step": 2458 }, { "epoch": 1.51404386302424, "grad_norm": 8.125, "learning_rate": 3.1800427711284516e-06, "loss": 1.3321908712387085, "step": 2460 }, { "epoch": 1.515275105809927, "grad_norm": 1.6171875, "learning_rate": 3.1743933678356884e-06, "loss": 1.2848858833312988, "step": 2462 }, { "epoch": 1.5165063485956138, "grad_norm": 2.09375, "learning_rate": 3.1687551924517283e-06, "loss": 1.413031816482544, "step": 2464 }, { "epoch": 1.5177375913813005, "grad_norm": 8.9375, "learning_rate": 3.1631282673805838e-06, "loss": 1.4807164669036865, "step": 2466 }, { "epoch": 1.5189688341669871, "grad_norm": 1.8046875, "learning_rate": 3.1575126149815584e-06, "loss": 1.3795205354690552, "step": 2468 }, { "epoch": 1.520200076952674, "grad_norm": 0.8515625, "learning_rate": 3.1519082575691647e-06, "loss": 1.2714817523956299, "step": 2470 }, { "epoch": 1.5214313197383609, "grad_norm": 1.0546875, "learning_rate": 3.1463152174130318e-06, "loss": 1.0713388919830322, "step": 2472 }, { "epoch": 1.5226625625240477, "grad_norm": 3.203125, "learning_rate": 3.1407335167378194e-06, "loss": 1.0308473110198975, "step": 2474 }, { "epoch": 1.5238938053097346, "grad_norm": 14.5, "learning_rate": 3.1351631777231288e-06, "loss": 1.3537421226501465, "step": 2476 }, { "epoch": 1.5251250480954213, "grad_norm": 1.75, "learning_rate": 3.1296042225034128e-06, "loss": 1.3415385484695435, "step": 2478 }, { "epoch": 1.5263562908811081, "grad_norm": 2.109375, "learning_rate": 3.1240566731678884e-06, "loss": 1.407192587852478, "step": 2480 }, { "epoch": 1.5275875336667948, "grad_norm": 1.4765625, "learning_rate": 3.118520551760454e-06, "loss": 1.2609741687774658, "step": 2482 }, { "epoch": 1.5288187764524817, "grad_norm": 1.7890625, "learning_rate": 3.112995880279594e-06, "loss": 1.061118245124817, "step": 2484 }, { "epoch": 1.5300500192381685, "grad_norm": 5.0, "learning_rate": 3.107482680678297e-06, "loss": 1.2044938802719116, "step": 2486 }, { "epoch": 1.5312812620238554, "grad_norm": 2.875, "learning_rate": 3.1019809748639617e-06, "loss": 1.4915175437927246, "step": 2488 }, { "epoch": 1.5325125048095423, "grad_norm": 2.109375, "learning_rate": 3.096490784698323e-06, "loss": 1.3962339162826538, "step": 2490 }, { "epoch": 1.533743747595229, "grad_norm": 3.9375, "learning_rate": 3.091012131997352e-06, "loss": 1.4024924039840698, "step": 2492 }, { "epoch": 1.5349749903809158, "grad_norm": 1.6328125, "learning_rate": 3.0855450385311736e-06, "loss": 1.266303539276123, "step": 2494 }, { "epoch": 1.5362062331666024, "grad_norm": 1.578125, "learning_rate": 3.0800895260239815e-06, "loss": 1.1494388580322266, "step": 2496 }, { "epoch": 1.5374374759522893, "grad_norm": 1.0859375, "learning_rate": 3.0746456161539534e-06, "loss": 1.2219388484954834, "step": 2498 }, { "epoch": 1.5386687187379762, "grad_norm": 1.734375, "learning_rate": 3.06921333055316e-06, "loss": 0.9378917813301086, "step": 2500 }, { "epoch": 1.539899961523663, "grad_norm": 0.96875, "learning_rate": 3.063792690807481e-06, "loss": 1.071752667427063, "step": 2502 }, { "epoch": 1.54113120430935, "grad_norm": 1.6328125, "learning_rate": 3.0583837184565192e-06, "loss": 1.0082283020019531, "step": 2504 }, { "epoch": 1.5423624470950366, "grad_norm": 7.21875, "learning_rate": 3.0529864349935196e-06, "loss": 1.4392098188400269, "step": 2506 }, { "epoch": 1.5435936898807232, "grad_norm": 5.8125, "learning_rate": 3.047600861865277e-06, "loss": 1.6693429946899414, "step": 2508 }, { "epoch": 1.54482493266641, "grad_norm": 1.8046875, "learning_rate": 3.0422270204720528e-06, "loss": 1.7065497636795044, "step": 2510 }, { "epoch": 1.546056175452097, "grad_norm": 3.65625, "learning_rate": 3.0368649321674914e-06, "loss": 1.5884374380111694, "step": 2512 }, { "epoch": 1.5472874182377838, "grad_norm": 6.75, "learning_rate": 3.03151461825854e-06, "loss": 1.2509660720825195, "step": 2514 }, { "epoch": 1.5485186610234707, "grad_norm": 1.0078125, "learning_rate": 3.026176100005349e-06, "loss": 1.0895438194274902, "step": 2516 }, { "epoch": 1.5497499038091573, "grad_norm": 2.140625, "learning_rate": 3.020849398621204e-06, "loss": 1.5385931730270386, "step": 2518 }, { "epoch": 1.5509811465948442, "grad_norm": 3.359375, "learning_rate": 3.015534535272433e-06, "loss": 1.6532469987869263, "step": 2520 }, { "epoch": 1.5522123893805309, "grad_norm": 1.375, "learning_rate": 3.0102315310783257e-06, "loss": 1.404496669769287, "step": 2522 }, { "epoch": 1.5534436321662177, "grad_norm": 1.6484375, "learning_rate": 3.004940407111046e-06, "loss": 0.9684979915618896, "step": 2524 }, { "epoch": 1.5546748749519046, "grad_norm": 2.609375, "learning_rate": 2.9996611843955505e-06, "loss": 1.1497083902359009, "step": 2526 }, { "epoch": 1.5559061177375915, "grad_norm": 5.71875, "learning_rate": 2.9943938839095038e-06, "loss": 1.3859484195709229, "step": 2528 }, { "epoch": 1.5571373605232783, "grad_norm": 1.2734375, "learning_rate": 2.9891385265831984e-06, "loss": 1.2651927471160889, "step": 2530 }, { "epoch": 1.558368603308965, "grad_norm": 2.28125, "learning_rate": 2.9838951332994676e-06, "loss": 1.4481619596481323, "step": 2532 }, { "epoch": 1.5595998460946516, "grad_norm": 1.0625, "learning_rate": 2.9786637248936025e-06, "loss": 1.2641582489013672, "step": 2534 }, { "epoch": 1.5608310888803385, "grad_norm": 1.3515625, "learning_rate": 2.973444322153275e-06, "loss": 0.9729331731796265, "step": 2536 }, { "epoch": 1.5620623316660254, "grad_norm": 2.734375, "learning_rate": 2.968236945818447e-06, "loss": 1.1042187213897705, "step": 2538 }, { "epoch": 1.5632935744517122, "grad_norm": 4.25, "learning_rate": 2.9630416165812946e-06, "loss": 1.7580437660217285, "step": 2540 }, { "epoch": 1.5645248172373991, "grad_norm": 1.265625, "learning_rate": 2.9578583550861207e-06, "loss": 1.3896489143371582, "step": 2542 }, { "epoch": 1.5657560600230858, "grad_norm": 0.9765625, "learning_rate": 2.9526871819292774e-06, "loss": 1.0869468450546265, "step": 2544 }, { "epoch": 1.5669873028087726, "grad_norm": 24.125, "learning_rate": 2.947528117659082e-06, "loss": 1.2554739713668823, "step": 2546 }, { "epoch": 1.5682185455944593, "grad_norm": 2.65625, "learning_rate": 2.9423811827757336e-06, "loss": 1.2301050424575806, "step": 2548 }, { "epoch": 1.5694497883801461, "grad_norm": 6.5625, "learning_rate": 2.9372463977312364e-06, "loss": 1.5220141410827637, "step": 2550 }, { "epoch": 1.570681031165833, "grad_norm": 3.890625, "learning_rate": 2.932123782929315e-06, "loss": 1.6439367532730103, "step": 2552 }, { "epoch": 1.57191227395152, "grad_norm": 10.0625, "learning_rate": 2.927013358725333e-06, "loss": 1.434930682182312, "step": 2554 }, { "epoch": 1.5731435167372068, "grad_norm": 3.953125, "learning_rate": 2.9219151454262152e-06, "loss": 1.3206510543823242, "step": 2556 }, { "epoch": 1.5743747595228934, "grad_norm": 3.359375, "learning_rate": 2.9168291632903593e-06, "loss": 1.6884095668792725, "step": 2558 }, { "epoch": 1.5756060023085803, "grad_norm": 3.53125, "learning_rate": 2.911755432527568e-06, "loss": 1.4005990028381348, "step": 2560 }, { "epoch": 1.576837245094267, "grad_norm": 2.6875, "learning_rate": 2.906693973298958e-06, "loss": 1.689469575881958, "step": 2562 }, { "epoch": 1.5780684878799538, "grad_norm": 2.203125, "learning_rate": 2.901644805716884e-06, "loss": 1.4509550333023071, "step": 2564 }, { "epoch": 1.5792997306656407, "grad_norm": 0.8359375, "learning_rate": 2.8966079498448564e-06, "loss": 1.4247888326644897, "step": 2566 }, { "epoch": 1.5805309734513275, "grad_norm": 0.8046875, "learning_rate": 2.891583425697467e-06, "loss": 1.0635548830032349, "step": 2568 }, { "epoch": 1.5817622162370142, "grad_norm": 4.3125, "learning_rate": 2.8865712532403056e-06, "loss": 1.169121265411377, "step": 2570 }, { "epoch": 1.582993459022701, "grad_norm": 2.171875, "learning_rate": 2.881571452389877e-06, "loss": 1.51327383518219, "step": 2572 }, { "epoch": 1.5842247018083877, "grad_norm": 2.375, "learning_rate": 2.876584043013527e-06, "loss": 1.2776228189468384, "step": 2574 }, { "epoch": 1.5854559445940746, "grad_norm": 2.46875, "learning_rate": 2.8716090449293675e-06, "loss": 1.3650319576263428, "step": 2576 }, { "epoch": 1.5866871873797614, "grad_norm": 2.59375, "learning_rate": 2.8666464779061878e-06, "loss": 1.4736026525497437, "step": 2578 }, { "epoch": 1.5879184301654483, "grad_norm": 2.109375, "learning_rate": 2.8616963616633796e-06, "loss": 1.456394076347351, "step": 2580 }, { "epoch": 1.5891496729511352, "grad_norm": 1.7421875, "learning_rate": 2.856758715870863e-06, "loss": 1.1904287338256836, "step": 2582 }, { "epoch": 1.5903809157368218, "grad_norm": 2.125, "learning_rate": 2.851833560149007e-06, "loss": 0.9508844017982483, "step": 2584 }, { "epoch": 1.5916121585225087, "grad_norm": 3.328125, "learning_rate": 2.846920914068543e-06, "loss": 1.1540579795837402, "step": 2586 }, { "epoch": 1.5928434013081954, "grad_norm": 1.8125, "learning_rate": 2.8420207971504983e-06, "loss": 1.4928700923919678, "step": 2588 }, { "epoch": 1.5940746440938822, "grad_norm": 2.765625, "learning_rate": 2.8371332288661134e-06, "loss": 1.4172389507293701, "step": 2590 }, { "epoch": 1.595305886879569, "grad_norm": 2.734375, "learning_rate": 2.832258228636766e-06, "loss": 1.3628898859024048, "step": 2592 }, { "epoch": 1.596537129665256, "grad_norm": 2.5, "learning_rate": 2.8273958158338925e-06, "loss": 1.4539599418640137, "step": 2594 }, { "epoch": 1.5977683724509428, "grad_norm": 1.8671875, "learning_rate": 2.82254600977891e-06, "loss": 1.3956893682479858, "step": 2596 }, { "epoch": 1.5989996152366295, "grad_norm": 2.734375, "learning_rate": 2.817708829743142e-06, "loss": 1.6055045127868652, "step": 2598 }, { "epoch": 1.6002308580223161, "grad_norm": 2.5, "learning_rate": 2.812884294947742e-06, "loss": 1.4774987697601318, "step": 2600 }, { "epoch": 1.601462100808003, "grad_norm": 2.828125, "learning_rate": 2.8080724245636142e-06, "loss": 1.6516894102096558, "step": 2602 }, { "epoch": 1.6026933435936899, "grad_norm": 5.21875, "learning_rate": 2.8032732377113414e-06, "loss": 1.8592480421066284, "step": 2604 }, { "epoch": 1.6039245863793767, "grad_norm": 2.4375, "learning_rate": 2.798486753461103e-06, "loss": 1.561418056488037, "step": 2606 }, { "epoch": 1.6051558291650636, "grad_norm": 2.875, "learning_rate": 2.7937129908326083e-06, "loss": 1.4258840084075928, "step": 2608 }, { "epoch": 1.6063870719507503, "grad_norm": 1.1796875, "learning_rate": 2.7889519687950113e-06, "loss": 1.3105173110961914, "step": 2610 }, { "epoch": 1.6076183147364371, "grad_norm": 1.5703125, "learning_rate": 2.7842037062668425e-06, "loss": 1.2106075286865234, "step": 2612 }, { "epoch": 1.6088495575221238, "grad_norm": 4.9375, "learning_rate": 2.7794682221159266e-06, "loss": 1.4307101964950562, "step": 2614 }, { "epoch": 1.6100808003078106, "grad_norm": 3.609375, "learning_rate": 2.7747455351593167e-06, "loss": 1.6504584550857544, "step": 2616 }, { "epoch": 1.6113120430934975, "grad_norm": 3.46875, "learning_rate": 2.7700356641632126e-06, "loss": 1.661940574645996, "step": 2618 }, { "epoch": 1.6125432858791844, "grad_norm": 4.25, "learning_rate": 2.765338627842887e-06, "loss": 1.7384159564971924, "step": 2620 }, { "epoch": 1.6137745286648713, "grad_norm": 75.0, "learning_rate": 2.7606544448626137e-06, "loss": 1.7225772142410278, "step": 2622 }, { "epoch": 1.615005771450558, "grad_norm": 4.6875, "learning_rate": 2.7559831338355943e-06, "loss": 1.4637809991836548, "step": 2624 }, { "epoch": 1.6162370142362446, "grad_norm": 2.625, "learning_rate": 2.7513247133238787e-06, "loss": 1.4859167337417603, "step": 2626 }, { "epoch": 1.6174682570219314, "grad_norm": 2.625, "learning_rate": 2.746679201838294e-06, "loss": 1.2690590620040894, "step": 2628 }, { "epoch": 1.6186994998076183, "grad_norm": 3.296875, "learning_rate": 2.7420466178383726e-06, "loss": 1.333600640296936, "step": 2630 }, { "epoch": 1.6199307425933052, "grad_norm": 2.671875, "learning_rate": 2.7374269797322824e-06, "loss": 1.341450572013855, "step": 2632 }, { "epoch": 1.621161985378992, "grad_norm": 1.703125, "learning_rate": 2.7328203058767424e-06, "loss": 1.355420708656311, "step": 2634 }, { "epoch": 1.6223932281646787, "grad_norm": 1.84375, "learning_rate": 2.728226614576961e-06, "loss": 1.169846534729004, "step": 2636 }, { "epoch": 1.6236244709503656, "grad_norm": 2.03125, "learning_rate": 2.7236459240865555e-06, "loss": 1.2084237337112427, "step": 2638 }, { "epoch": 1.6248557137360522, "grad_norm": 1.3984375, "learning_rate": 2.7190782526074885e-06, "loss": 1.0683963298797607, "step": 2640 }, { "epoch": 1.626086956521739, "grad_norm": 1.171875, "learning_rate": 2.7145236182899817e-06, "loss": 0.9492171406745911, "step": 2642 }, { "epoch": 1.627318199307426, "grad_norm": 0.984375, "learning_rate": 2.7099820392324572e-06, "loss": 1.1876254081726074, "step": 2644 }, { "epoch": 1.6285494420931128, "grad_norm": 1.15625, "learning_rate": 2.705453533481459e-06, "loss": 1.1485795974731445, "step": 2646 }, { "epoch": 1.6297806848787997, "grad_norm": 1.5859375, "learning_rate": 2.700938119031586e-06, "loss": 1.128466248512268, "step": 2648 }, { "epoch": 1.6310119276644863, "grad_norm": 3.671875, "learning_rate": 2.696435813825411e-06, "loss": 1.4734625816345215, "step": 2650 }, { "epoch": 1.6322431704501732, "grad_norm": 3.296875, "learning_rate": 2.69194663575342e-06, "loss": 1.65286123752594, "step": 2652 }, { "epoch": 1.6334744132358598, "grad_norm": 1.765625, "learning_rate": 2.687470602653936e-06, "loss": 1.244763731956482, "step": 2654 }, { "epoch": 1.6347056560215467, "grad_norm": 1.65625, "learning_rate": 2.683007732313046e-06, "loss": 1.109053134918213, "step": 2656 }, { "epoch": 1.6359368988072336, "grad_norm": 1.6484375, "learning_rate": 2.6785580424645376e-06, "loss": 0.9938783645629883, "step": 2658 }, { "epoch": 1.6371681415929205, "grad_norm": 1.21875, "learning_rate": 2.674121550789819e-06, "loss": 0.9947565793991089, "step": 2660 }, { "epoch": 1.638399384378607, "grad_norm": 0.74609375, "learning_rate": 2.6696982749178596e-06, "loss": 0.9547367691993713, "step": 2662 }, { "epoch": 1.639630627164294, "grad_norm": 2.28125, "learning_rate": 2.6652882324251095e-06, "loss": 1.0342196226119995, "step": 2664 }, { "epoch": 1.6408618699499806, "grad_norm": 4.625, "learning_rate": 2.6608914408354355e-06, "loss": 1.2955328226089478, "step": 2666 }, { "epoch": 1.6420931127356675, "grad_norm": 3.96875, "learning_rate": 2.6565079176200503e-06, "loss": 1.697257399559021, "step": 2668 }, { "epoch": 1.6433243555213544, "grad_norm": 6.46875, "learning_rate": 2.6521376801974437e-06, "loss": 1.003428339958191, "step": 2670 }, { "epoch": 1.6445555983070412, "grad_norm": 59.25, "learning_rate": 2.647780745933312e-06, "loss": 0.1837526559829712, "step": 2672 }, { "epoch": 1.645786841092728, "grad_norm": 2.296875, "learning_rate": 2.64343713214049e-06, "loss": 0.8456532955169678, "step": 2674 }, { "epoch": 1.6470180838784148, "grad_norm": 4.5625, "learning_rate": 2.6391068560788814e-06, "loss": 1.4209729433059692, "step": 2676 }, { "epoch": 1.6482493266641016, "grad_norm": 3.46875, "learning_rate": 2.6347899349553916e-06, "loss": 1.708794355392456, "step": 2678 }, { "epoch": 1.6494805694497883, "grad_norm": 3.328125, "learning_rate": 2.630486385923858e-06, "loss": 1.7653048038482666, "step": 2680 }, { "epoch": 1.6507118122354751, "grad_norm": 7.75, "learning_rate": 2.6261962260849845e-06, "loss": 1.6216604709625244, "step": 2682 }, { "epoch": 1.651943055021162, "grad_norm": 3.6875, "learning_rate": 2.6219194724862636e-06, "loss": 1.884316325187683, "step": 2684 }, { "epoch": 1.6531742978068489, "grad_norm": 2.53125, "learning_rate": 2.617656142121927e-06, "loss": 1.6673423051834106, "step": 2686 }, { "epoch": 1.6544055405925358, "grad_norm": 2.140625, "learning_rate": 2.6134062519328596e-06, "loss": 1.3633517026901245, "step": 2688 }, { "epoch": 1.6556367833782224, "grad_norm": 2.640625, "learning_rate": 2.609169818806544e-06, "loss": 1.4055132865905762, "step": 2690 }, { "epoch": 1.656868026163909, "grad_norm": 2.109375, "learning_rate": 2.6049468595769866e-06, "loss": 1.4592944383621216, "step": 2692 }, { "epoch": 1.658099268949596, "grad_norm": 3.0625, "learning_rate": 2.6007373910246586e-06, "loss": 1.468887448310852, "step": 2694 }, { "epoch": 1.6593305117352828, "grad_norm": 3.0, "learning_rate": 2.596541429876419e-06, "loss": 1.43437659740448, "step": 2696 }, { "epoch": 1.6605617545209697, "grad_norm": 4.15625, "learning_rate": 2.5923589928054545e-06, "loss": 1.3961412906646729, "step": 2698 }, { "epoch": 1.6617929973066565, "grad_norm": 2.21875, "learning_rate": 2.588190096431212e-06, "loss": 1.3536356687545776, "step": 2700 }, { "epoch": 1.6630242400923432, "grad_norm": 7.09375, "learning_rate": 2.5840347573193364e-06, "loss": 1.5625301599502563, "step": 2702 }, { "epoch": 1.66425548287803, "grad_norm": 3.75, "learning_rate": 2.579892991981597e-06, "loss": 1.809969186782837, "step": 2704 }, { "epoch": 1.6654867256637167, "grad_norm": 2.578125, "learning_rate": 2.5757648168758277e-06, "loss": 1.6495802402496338, "step": 2706 }, { "epoch": 1.6667179684494036, "grad_norm": 2.890625, "learning_rate": 2.571650248405858e-06, "loss": 1.403519630432129, "step": 2708 }, { "epoch": 1.6679492112350904, "grad_norm": 1.7734375, "learning_rate": 2.5675493029214544e-06, "loss": 1.4832587242126465, "step": 2710 }, { "epoch": 1.6691804540207773, "grad_norm": 2.203125, "learning_rate": 2.563461996718244e-06, "loss": 1.3312305212020874, "step": 2712 }, { "epoch": 1.6704116968064642, "grad_norm": 3.71875, "learning_rate": 2.559388346037662e-06, "loss": 1.5110353231430054, "step": 2714 }, { "epoch": 1.6716429395921508, "grad_norm": 1.8125, "learning_rate": 2.5553283670668783e-06, "loss": 1.3926719427108765, "step": 2716 }, { "epoch": 1.6728741823778375, "grad_norm": 2.921875, "learning_rate": 2.551282075938739e-06, "loss": 1.2947973012924194, "step": 2718 }, { "epoch": 1.6741054251635243, "grad_norm": 1.765625, "learning_rate": 2.547249488731698e-06, "loss": 1.1865016222000122, "step": 2720 }, { "epoch": 1.6753366679492112, "grad_norm": 3.609375, "learning_rate": 2.5432306214697565e-06, "loss": 1.3613277673721313, "step": 2722 }, { "epoch": 1.676567910734898, "grad_norm": 6.15625, "learning_rate": 2.5392254901223955e-06, "loss": 1.4786993265151978, "step": 2724 }, { "epoch": 1.677799153520585, "grad_norm": 1.4453125, "learning_rate": 2.535234110604517e-06, "loss": 1.3303709030151367, "step": 2726 }, { "epoch": 1.6790303963062716, "grad_norm": 1.0859375, "learning_rate": 2.531256498776376e-06, "loss": 1.0553029775619507, "step": 2728 }, { "epoch": 1.6802616390919585, "grad_norm": 1.4765625, "learning_rate": 2.5272926704435224e-06, "loss": 1.0534037351608276, "step": 2730 }, { "epoch": 1.6814928818776451, "grad_norm": 0.98828125, "learning_rate": 2.523342641356733e-06, "loss": 1.185265064239502, "step": 2732 }, { "epoch": 1.682724124663332, "grad_norm": 2.734375, "learning_rate": 2.519406427211954e-06, "loss": 1.3950031995773315, "step": 2734 }, { "epoch": 1.6839553674490189, "grad_norm": 3.265625, "learning_rate": 2.5154840436502343e-06, "loss": 1.6567586660385132, "step": 2736 }, { "epoch": 1.6851866102347057, "grad_norm": 4.53125, "learning_rate": 2.5115755062576675e-06, "loss": 1.7554434537887573, "step": 2738 }, { "epoch": 1.6864178530203926, "grad_norm": 6.9375, "learning_rate": 2.5076808305653223e-06, "loss": 1.8998305797576904, "step": 2740 }, { "epoch": 1.6876490958060792, "grad_norm": 0.69921875, "learning_rate": 2.503800032049194e-06, "loss": 1.280155897140503, "step": 2742 }, { "epoch": 1.6888803385917661, "grad_norm": 1.7109375, "learning_rate": 2.499933126130129e-06, "loss": 1.109460473060608, "step": 2744 }, { "epoch": 1.6901115813774528, "grad_norm": 2.640625, "learning_rate": 2.4960801281737722e-06, "loss": 1.156549096107483, "step": 2746 }, { "epoch": 1.6913428241631396, "grad_norm": 1.9921875, "learning_rate": 2.492241053490502e-06, "loss": 1.4689304828643799, "step": 2748 }, { "epoch": 1.6925740669488265, "grad_norm": 9.125, "learning_rate": 2.488415917335374e-06, "loss": 1.446475625038147, "step": 2750 }, { "epoch": 1.6938053097345134, "grad_norm": 2.578125, "learning_rate": 2.4846047349080552e-06, "loss": 1.4750339984893799, "step": 2752 }, { "epoch": 1.6950365525202, "grad_norm": 2.921875, "learning_rate": 2.480807521352764e-06, "loss": 1.373404622077942, "step": 2754 }, { "epoch": 1.696267795305887, "grad_norm": 2.34375, "learning_rate": 2.4770242917582134e-06, "loss": 1.4248535633087158, "step": 2756 }, { "epoch": 1.6974990380915735, "grad_norm": 2.03125, "learning_rate": 2.4732550611575503e-06, "loss": 1.3744779825210571, "step": 2758 }, { "epoch": 1.6987302808772604, "grad_norm": 2.5625, "learning_rate": 2.4694998445282937e-06, "loss": 1.3755619525909424, "step": 2760 }, { "epoch": 1.6999615236629473, "grad_norm": 2.234375, "learning_rate": 2.4657586567922766e-06, "loss": 1.4600454568862915, "step": 2762 }, { "epoch": 1.7011927664486342, "grad_norm": 2.671875, "learning_rate": 2.462031512815585e-06, "loss": 1.3545056581497192, "step": 2764 }, { "epoch": 1.702424009234321, "grad_norm": 8.6875, "learning_rate": 2.4583184274085044e-06, "loss": 1.5093567371368408, "step": 2766 }, { "epoch": 1.7036552520200077, "grad_norm": 1.78125, "learning_rate": 2.4546194153254504e-06, "loss": 1.4223270416259766, "step": 2768 }, { "epoch": 1.7048864948056945, "grad_norm": 2.421875, "learning_rate": 2.4509344912649222e-06, "loss": 1.4608110189437866, "step": 2770 }, { "epoch": 1.7061177375913812, "grad_norm": 2.234375, "learning_rate": 2.447263669869434e-06, "loss": 1.4189188480377197, "step": 2772 }, { "epoch": 1.707348980377068, "grad_norm": 2.609375, "learning_rate": 2.443606965725466e-06, "loss": 1.347614049911499, "step": 2774 }, { "epoch": 1.708580223162755, "grad_norm": 1.8984375, "learning_rate": 2.4399643933633977e-06, "loss": 1.2820149660110474, "step": 2776 }, { "epoch": 1.7098114659484418, "grad_norm": 1.3984375, "learning_rate": 2.4363359672574557e-06, "loss": 1.3065999746322632, "step": 2778 }, { "epoch": 1.7110427087341287, "grad_norm": 1.8046875, "learning_rate": 2.4327217018256545e-06, "loss": 1.141822338104248, "step": 2780 }, { "epoch": 1.7122739515198153, "grad_norm": 0.91796875, "learning_rate": 2.4291216114297395e-06, "loss": 1.1083295345306396, "step": 2782 }, { "epoch": 1.713505194305502, "grad_norm": 1.046875, "learning_rate": 2.4255357103751298e-06, "loss": 1.0195274353027344, "step": 2784 }, { "epoch": 1.7147364370911888, "grad_norm": 1.375, "learning_rate": 2.42196401291086e-06, "loss": 0.925314724445343, "step": 2786 }, { "epoch": 1.7159676798768757, "grad_norm": 1.0703125, "learning_rate": 2.4184065332295276e-06, "loss": 1.0302790403366089, "step": 2788 }, { "epoch": 1.7171989226625626, "grad_norm": 15.875, "learning_rate": 2.414863285467232e-06, "loss": 1.4890433549880981, "step": 2790 }, { "epoch": 1.7184301654482494, "grad_norm": 6.625, "learning_rate": 2.4113342837035206e-06, "loss": 1.8335363864898682, "step": 2792 }, { "epoch": 1.719661408233936, "grad_norm": 5.6875, "learning_rate": 2.4078195419613325e-06, "loss": 1.7545239925384521, "step": 2794 }, { "epoch": 1.720892651019623, "grad_norm": 2.453125, "learning_rate": 2.4043190742069432e-06, "loss": 1.3311874866485596, "step": 2796 }, { "epoch": 1.7221238938053096, "grad_norm": 4.46875, "learning_rate": 2.4008328943499077e-06, "loss": 1.400950312614441, "step": 2798 }, { "epoch": 1.7233551365909965, "grad_norm": 7.3125, "learning_rate": 2.397361016243007e-06, "loss": 1.402868390083313, "step": 2800 }, { "epoch": 1.7245863793766834, "grad_norm": 3.03125, "learning_rate": 2.3939034536821925e-06, "loss": 1.3764790296554565, "step": 2802 }, { "epoch": 1.7258176221623702, "grad_norm": 2.65625, "learning_rate": 2.390460220406531e-06, "loss": 1.4015588760375977, "step": 2804 }, { "epoch": 1.727048864948057, "grad_norm": 8.5625, "learning_rate": 2.3870313300981492e-06, "loss": 1.4938013553619385, "step": 2806 }, { "epoch": 1.7282801077337437, "grad_norm": 2.125, "learning_rate": 2.383616796382181e-06, "loss": 1.3828613758087158, "step": 2808 }, { "epoch": 1.7295113505194304, "grad_norm": 4.75, "learning_rate": 2.3802166328267104e-06, "loss": 1.0383518934249878, "step": 2810 }, { "epoch": 1.7307425933051173, "grad_norm": 3.59375, "learning_rate": 2.3768308529427235e-06, "loss": 0.5984392762184143, "step": 2812 }, { "epoch": 1.7319738360908041, "grad_norm": 2.84375, "learning_rate": 2.3734594701840484e-06, "loss": 1.0914403200149536, "step": 2814 }, { "epoch": 1.733205078876491, "grad_norm": 2.46875, "learning_rate": 2.370102497947305e-06, "loss": 1.3296228647232056, "step": 2816 }, { "epoch": 1.7344363216621779, "grad_norm": 1.203125, "learning_rate": 2.366759949571851e-06, "loss": 1.2498970031738281, "step": 2818 }, { "epoch": 1.7356675644478645, "grad_norm": 1.1796875, "learning_rate": 2.3634318383397303e-06, "loss": 1.0500065088272095, "step": 2820 }, { "epoch": 1.7368988072335514, "grad_norm": 2.46875, "learning_rate": 2.3601181774756173e-06, "loss": 1.252281665802002, "step": 2822 }, { "epoch": 1.738130050019238, "grad_norm": 4.4375, "learning_rate": 2.3568189801467657e-06, "loss": 1.373681664466858, "step": 2824 }, { "epoch": 1.739361292804925, "grad_norm": 0.734375, "learning_rate": 2.353534259462958e-06, "loss": 1.2320070266723633, "step": 2826 }, { "epoch": 1.7405925355906118, "grad_norm": 1.40625, "learning_rate": 2.350264028476452e-06, "loss": 1.0676991939544678, "step": 2828 }, { "epoch": 1.7418237783762986, "grad_norm": 3.796875, "learning_rate": 2.3470083001819276e-06, "loss": 1.2847479581832886, "step": 2830 }, { "epoch": 1.7430550211619855, "grad_norm": 4.78125, "learning_rate": 2.343767087516437e-06, "loss": 1.801426649093628, "step": 2832 }, { "epoch": 1.7442862639476722, "grad_norm": 4.3125, "learning_rate": 2.3405404033593516e-06, "loss": 1.8096853494644165, "step": 2834 }, { "epoch": 1.745517506733359, "grad_norm": 3.0625, "learning_rate": 2.3373282605323154e-06, "loss": 1.4246339797973633, "step": 2836 }, { "epoch": 1.7467487495190457, "grad_norm": 9.4375, "learning_rate": 2.3341306717991864e-06, "loss": 1.9062341451644897, "step": 2838 }, { "epoch": 1.7479799923047326, "grad_norm": 5.5, "learning_rate": 2.330947649865992e-06, "loss": 1.4905970096588135, "step": 2840 }, { "epoch": 1.7492112350904194, "grad_norm": 0.9375, "learning_rate": 2.327779207380876e-06, "loss": 0.9517359137535095, "step": 2842 }, { "epoch": 1.7504424778761063, "grad_norm": 2.671875, "learning_rate": 2.3246253569340506e-06, "loss": 1.1234976053237915, "step": 2844 }, { "epoch": 1.751673720661793, "grad_norm": 2.671875, "learning_rate": 2.3214861110577416e-06, "loss": 1.0962259769439697, "step": 2846 }, { "epoch": 1.7529049634474798, "grad_norm": 1.46875, "learning_rate": 2.318361482226145e-06, "loss": 1.1401879787445068, "step": 2848 }, { "epoch": 1.7541362062331665, "grad_norm": 4.09375, "learning_rate": 2.3152514828553716e-06, "loss": 1.123967170715332, "step": 2850 }, { "epoch": 1.7553674490188533, "grad_norm": 4.0625, "learning_rate": 2.312156125303401e-06, "loss": 1.9278596639633179, "step": 2852 }, { "epoch": 1.7565986918045402, "grad_norm": 2.359375, "learning_rate": 2.309075421870032e-06, "loss": 1.457679033279419, "step": 2854 }, { "epoch": 1.757829934590227, "grad_norm": 3.046875, "learning_rate": 2.3060093847968333e-06, "loss": 1.4130196571350098, "step": 2856 }, { "epoch": 1.759061177375914, "grad_norm": 2.984375, "learning_rate": 2.302958026267094e-06, "loss": 1.3906093835830688, "step": 2858 }, { "epoch": 1.7602924201616006, "grad_norm": 2.265625, "learning_rate": 2.299921358405778e-06, "loss": 1.4181544780731201, "step": 2860 }, { "epoch": 1.7615236629472875, "grad_norm": 2.3125, "learning_rate": 2.2968993932794724e-06, "loss": 1.576407551765442, "step": 2862 }, { "epoch": 1.762754905732974, "grad_norm": 2.734375, "learning_rate": 2.293892142896341e-06, "loss": 1.506213903427124, "step": 2864 }, { "epoch": 1.763986148518661, "grad_norm": 2.296875, "learning_rate": 2.290899619206078e-06, "loss": 1.5014433860778809, "step": 2866 }, { "epoch": 1.7652173913043478, "grad_norm": 1.6875, "learning_rate": 2.2879218340998575e-06, "loss": 1.075061559677124, "step": 2868 }, { "epoch": 1.7664486340900347, "grad_norm": 1.7109375, "learning_rate": 2.2849587994102908e-06, "loss": 1.2030928134918213, "step": 2870 }, { "epoch": 1.7676798768757216, "grad_norm": 2.796875, "learning_rate": 2.2820105269113725e-06, "loss": 1.3955581188201904, "step": 2872 }, { "epoch": 1.7689111196614082, "grad_norm": 6.625, "learning_rate": 2.2790770283184404e-06, "loss": 1.318220853805542, "step": 2874 }, { "epoch": 1.7701423624470949, "grad_norm": 3.40625, "learning_rate": 2.276158315288127e-06, "loss": 1.6458066701889038, "step": 2876 }, { "epoch": 1.7713736052327818, "grad_norm": 4.6875, "learning_rate": 2.2732543994183104e-06, "loss": 1.8615339994430542, "step": 2878 }, { "epoch": 1.7726048480184686, "grad_norm": 4.28125, "learning_rate": 2.2703652922480716e-06, "loss": 1.877263069152832, "step": 2880 }, { "epoch": 1.7738360908041555, "grad_norm": 2.8125, "learning_rate": 2.2674910052576456e-06, "loss": 1.2448543310165405, "step": 2882 }, { "epoch": 1.7750673335898424, "grad_norm": 1.6015625, "learning_rate": 2.26463154986838e-06, "loss": 0.9634664058685303, "step": 2884 }, { "epoch": 1.776298576375529, "grad_norm": 3.796875, "learning_rate": 2.261786937442686e-06, "loss": 0.9872276186943054, "step": 2886 }, { "epoch": 1.7775298191612159, "grad_norm": 4.40625, "learning_rate": 2.2589571792839933e-06, "loss": 1.7003523111343384, "step": 2888 }, { "epoch": 1.7787610619469025, "grad_norm": 3.234375, "learning_rate": 2.256142286636708e-06, "loss": 1.7901766300201416, "step": 2890 }, { "epoch": 1.7799923047325894, "grad_norm": 2.140625, "learning_rate": 2.2533422706861665e-06, "loss": 1.3721871376037598, "step": 2892 }, { "epoch": 1.7812235475182763, "grad_norm": 1.921875, "learning_rate": 2.2505571425585893e-06, "loss": 1.4578273296356201, "step": 2894 }, { "epoch": 1.7824547903039631, "grad_norm": 1.875, "learning_rate": 2.247786913321037e-06, "loss": 1.0729336738586426, "step": 2896 }, { "epoch": 1.78368603308965, "grad_norm": 2.4375, "learning_rate": 2.245031593981371e-06, "loss": 1.1757901906967163, "step": 2898 }, { "epoch": 1.7849172758753367, "grad_norm": 4.34375, "learning_rate": 2.242291195488204e-06, "loss": 1.8881491422653198, "step": 2900 }, { "epoch": 1.7861485186610233, "grad_norm": 3.046875, "learning_rate": 2.2395657287308597e-06, "loss": 1.7370243072509766, "step": 2902 }, { "epoch": 1.7873797614467102, "grad_norm": 2.984375, "learning_rate": 2.2368552045393277e-06, "loss": 1.454010009765625, "step": 2904 }, { "epoch": 1.788611004232397, "grad_norm": 2.890625, "learning_rate": 2.2341596336842223e-06, "loss": 1.459133267402649, "step": 2906 }, { "epoch": 1.789842247018084, "grad_norm": 2.375, "learning_rate": 2.2314790268767393e-06, "loss": 1.3337326049804688, "step": 2908 }, { "epoch": 1.7910734898037708, "grad_norm": 5.875, "learning_rate": 2.2288133947686115e-06, "loss": 1.2916251420974731, "step": 2910 }, { "epoch": 1.7923047325894574, "grad_norm": 1.8046875, "learning_rate": 2.226162747952068e-06, "loss": 1.0963714122772217, "step": 2912 }, { "epoch": 1.7935359753751443, "grad_norm": 1.9609375, "learning_rate": 2.223527096959793e-06, "loss": 1.1820143461227417, "step": 2914 }, { "epoch": 1.794767218160831, "grad_norm": 2.984375, "learning_rate": 2.220906452264882e-06, "loss": 1.2962974309921265, "step": 2916 }, { "epoch": 1.7959984609465178, "grad_norm": 16.125, "learning_rate": 2.2183008242808025e-06, "loss": 1.6060822010040283, "step": 2918 }, { "epoch": 1.7972297037322047, "grad_norm": 3.125, "learning_rate": 2.215710223361349e-06, "loss": 1.433869481086731, "step": 2920 }, { "epoch": 1.7984609465178916, "grad_norm": 4.65625, "learning_rate": 2.2131346598006046e-06, "loss": 1.6119310855865479, "step": 2922 }, { "epoch": 1.7996921893035784, "grad_norm": 3.640625, "learning_rate": 2.210574143832902e-06, "loss": 1.397140622138977, "step": 2924 }, { "epoch": 1.800923432089265, "grad_norm": 2.546875, "learning_rate": 2.208028685632776e-06, "loss": 1.4758628606796265, "step": 2926 }, { "epoch": 1.802154674874952, "grad_norm": 2.796875, "learning_rate": 2.205498295314931e-06, "loss": 1.3613635301589966, "step": 2928 }, { "epoch": 1.8033859176606386, "grad_norm": 4.09375, "learning_rate": 2.2029829829341963e-06, "loss": 1.2514557838439941, "step": 2930 }, { "epoch": 1.8046171604463255, "grad_norm": 0.9296875, "learning_rate": 2.200482758485486e-06, "loss": 1.0420831441879272, "step": 2932 }, { "epoch": 1.8058484032320123, "grad_norm": 1.671875, "learning_rate": 2.197997631903763e-06, "loss": 1.0888323783874512, "step": 2934 }, { "epoch": 1.8070796460176992, "grad_norm": 3.484375, "learning_rate": 2.1955276130639934e-06, "loss": 1.4852358102798462, "step": 2936 }, { "epoch": 1.8083108888033859, "grad_norm": 1.171875, "learning_rate": 2.1930727117811135e-06, "loss": 1.3741511106491089, "step": 2938 }, { "epoch": 1.8095421315890727, "grad_norm": 1.1328125, "learning_rate": 2.1906329378099856e-06, "loss": 1.0535756349563599, "step": 2940 }, { "epoch": 1.8107733743747594, "grad_norm": 8.625, "learning_rate": 2.1882083008453653e-06, "loss": 1.3297520875930786, "step": 2942 }, { "epoch": 1.8120046171604463, "grad_norm": 4.25, "learning_rate": 2.185798810521855e-06, "loss": 1.585155963897705, "step": 2944 }, { "epoch": 1.8132358599461331, "grad_norm": 3.03125, "learning_rate": 2.183404476413874e-06, "loss": 1.9898563623428345, "step": 2946 }, { "epoch": 1.81446710273182, "grad_norm": 3.640625, "learning_rate": 2.181025308035614e-06, "loss": 1.9423363208770752, "step": 2948 }, { "epoch": 1.8156983455175069, "grad_norm": 42.0, "learning_rate": 2.178661314841005e-06, "loss": 1.3260115385055542, "step": 2950 }, { "epoch": 1.8169295883031935, "grad_norm": 1.875, "learning_rate": 2.1763125062236744e-06, "loss": 1.3768151998519897, "step": 2952 }, { "epoch": 1.8181608310888804, "grad_norm": 2.03125, "learning_rate": 2.1739788915169138e-06, "loss": 1.415548324584961, "step": 2954 }, { "epoch": 1.819392073874567, "grad_norm": 3.65625, "learning_rate": 2.17166047999364e-06, "loss": 1.480286955833435, "step": 2956 }, { "epoch": 1.820623316660254, "grad_norm": 4.5, "learning_rate": 2.1693572808663567e-06, "loss": 1.5904080867767334, "step": 2958 }, { "epoch": 1.8218545594459408, "grad_norm": 5.625, "learning_rate": 2.167069303287119e-06, "loss": 1.708762288093567, "step": 2960 }, { "epoch": 1.8230858022316276, "grad_norm": 3.46875, "learning_rate": 2.1647965563474997e-06, "loss": 1.8747527599334717, "step": 2962 }, { "epoch": 1.8243170450173145, "grad_norm": 2.671875, "learning_rate": 2.1625390490785485e-06, "loss": 1.7154184579849243, "step": 2964 }, { "epoch": 1.8255482878030012, "grad_norm": 4.15625, "learning_rate": 2.1602967904507578e-06, "loss": 1.6695775985717773, "step": 2966 }, { "epoch": 1.8267795305886878, "grad_norm": 2.484375, "learning_rate": 2.1580697893740287e-06, "loss": 1.418200969696045, "step": 2968 }, { "epoch": 1.8280107733743747, "grad_norm": 3.484375, "learning_rate": 2.1558580546976363e-06, "loss": 1.2916027307510376, "step": 2970 }, { "epoch": 1.8292420161600615, "grad_norm": 0.7578125, "learning_rate": 2.1536615952101886e-06, "loss": 0.9269257187843323, "step": 2972 }, { "epoch": 1.8304732589457484, "grad_norm": 0.984375, "learning_rate": 2.151480419639599e-06, "loss": 1.0978361368179321, "step": 2974 }, { "epoch": 1.8317045017314353, "grad_norm": 5.28125, "learning_rate": 2.1493145366530464e-06, "loss": 1.0932646989822388, "step": 2976 }, { "epoch": 1.832935744517122, "grad_norm": 0.92578125, "learning_rate": 2.1471639548569448e-06, "loss": 1.1995317935943604, "step": 2978 }, { "epoch": 1.8341669873028088, "grad_norm": 1.015625, "learning_rate": 2.1450286827969046e-06, "loss": 1.2731231451034546, "step": 2980 }, { "epoch": 1.8353982300884955, "grad_norm": 2.0, "learning_rate": 2.1429087289577018e-06, "loss": 1.3012076616287231, "step": 2982 }, { "epoch": 1.8366294728741823, "grad_norm": 2.8125, "learning_rate": 2.1408041017632443e-06, "loss": 1.3655699491500854, "step": 2984 }, { "epoch": 1.8378607156598692, "grad_norm": 3.046875, "learning_rate": 2.1387148095765366e-06, "loss": 1.5270923376083374, "step": 2986 }, { "epoch": 1.839091958445556, "grad_norm": 3.21875, "learning_rate": 2.1366408606996488e-06, "loss": 1.518768548965454, "step": 2988 }, { "epoch": 1.840323201231243, "grad_norm": 3.65625, "learning_rate": 2.1345822633736804e-06, "loss": 1.5318608283996582, "step": 2990 }, { "epoch": 1.8415544440169296, "grad_norm": 4.84375, "learning_rate": 2.1325390257787324e-06, "loss": 2.0122017860412598, "step": 2992 }, { "epoch": 1.8427856868026162, "grad_norm": 2.484375, "learning_rate": 2.1305111560338686e-06, "loss": 1.6048575639724731, "step": 2994 }, { "epoch": 1.844016929588303, "grad_norm": 2.96875, "learning_rate": 2.1284986621970894e-06, "loss": 1.4295393228530884, "step": 2996 }, { "epoch": 1.84524817237399, "grad_norm": 8.125, "learning_rate": 2.126501552265296e-06, "loss": 1.5011441707611084, "step": 2998 }, { "epoch": 1.8464794151596768, "grad_norm": 3.71875, "learning_rate": 2.1245198341742587e-06, "loss": 1.8417302370071411, "step": 3000 }, { "epoch": 1.8477106579453637, "grad_norm": 2.859375, "learning_rate": 2.1225535157985893e-06, "loss": 1.7056752443313599, "step": 3002 }, { "epoch": 1.8489419007310504, "grad_norm": 3.65625, "learning_rate": 2.120602604951704e-06, "loss": 1.4045121669769287, "step": 3004 }, { "epoch": 1.8501731435167372, "grad_norm": 3.65625, "learning_rate": 2.118667109385796e-06, "loss": 1.4713330268859863, "step": 3006 }, { "epoch": 1.8514043863024239, "grad_norm": 3.640625, "learning_rate": 2.1167470367918042e-06, "loss": 1.7798014879226685, "step": 3008 }, { "epoch": 1.8526356290881107, "grad_norm": 2.625, "learning_rate": 2.114842394799382e-06, "loss": 1.5931851863861084, "step": 3010 }, { "epoch": 1.8538668718737976, "grad_norm": 1.7734375, "learning_rate": 2.112953190976869e-06, "loss": 1.183977484703064, "step": 3012 }, { "epoch": 1.8550981146594845, "grad_norm": 2.34375, "learning_rate": 2.111079432831256e-06, "loss": 1.3038005828857422, "step": 3014 }, { "epoch": 1.8563293574451714, "grad_norm": 3.0625, "learning_rate": 2.109221127808161e-06, "loss": 1.7839789390563965, "step": 3016 }, { "epoch": 1.857560600230858, "grad_norm": 4.09375, "learning_rate": 2.1073782832917974e-06, "loss": 1.9610928297042847, "step": 3018 }, { "epoch": 1.8587918430165449, "grad_norm": 6.25, "learning_rate": 2.105550906604942e-06, "loss": 1.7422419786453247, "step": 3020 }, { "epoch": 1.8600230858022315, "grad_norm": 3.296875, "learning_rate": 2.103739005008909e-06, "loss": 1.6052628755569458, "step": 3022 }, { "epoch": 1.8612543285879184, "grad_norm": 17.75, "learning_rate": 2.101942585703522e-06, "loss": 1.737350583076477, "step": 3024 }, { "epoch": 1.8624855713736053, "grad_norm": 8.4375, "learning_rate": 2.1001616558270812e-06, "loss": 1.894784688949585, "step": 3026 }, { "epoch": 1.8637168141592921, "grad_norm": 4.34375, "learning_rate": 2.098396222456341e-06, "loss": 1.7377785444259644, "step": 3028 }, { "epoch": 1.8649480569449788, "grad_norm": 3.578125, "learning_rate": 2.0966462926064745e-06, "loss": 1.8874013423919678, "step": 3030 }, { "epoch": 1.8661792997306657, "grad_norm": 3.328125, "learning_rate": 2.0949118732310513e-06, "loss": 1.5274291038513184, "step": 3032 }, { "epoch": 1.8674105425163523, "grad_norm": 3.0625, "learning_rate": 2.09319297122201e-06, "loss": 1.6374666690826416, "step": 3034 }, { "epoch": 1.8686417853020392, "grad_norm": 4.1875, "learning_rate": 2.0914895934096256e-06, "loss": 1.0928645133972168, "step": 3036 }, { "epoch": 1.869873028087726, "grad_norm": 3.09375, "learning_rate": 2.089801746562489e-06, "loss": 1.419162392616272, "step": 3038 }, { "epoch": 1.871104270873413, "grad_norm": 4.6875, "learning_rate": 2.088129437387473e-06, "loss": 0.9393894076347351, "step": 3040 }, { "epoch": 1.8723355136590998, "grad_norm": 2.859375, "learning_rate": 2.086472672529713e-06, "loss": 1.576371431350708, "step": 3042 }, { "epoch": 1.8735667564447864, "grad_norm": 2.484375, "learning_rate": 2.0848314585725764e-06, "loss": 1.6301262378692627, "step": 3044 }, { "epoch": 1.8747979992304733, "grad_norm": 1.4453125, "learning_rate": 2.0832058020376353e-06, "loss": 1.2185789346694946, "step": 3046 }, { "epoch": 1.87602924201616, "grad_norm": 0.9296875, "learning_rate": 2.081595709384644e-06, "loss": 1.14590585231781, "step": 3048 }, { "epoch": 1.8772604848018468, "grad_norm": 1.8203125, "learning_rate": 2.080001187011511e-06, "loss": 1.0869334936141968, "step": 3050 }, { "epoch": 1.8784917275875337, "grad_norm": 4.4375, "learning_rate": 2.0784222412542755e-06, "loss": 1.4626398086547852, "step": 3052 }, { "epoch": 1.8797229703732206, "grad_norm": 3.734375, "learning_rate": 2.0768588783870785e-06, "loss": 1.3795206546783447, "step": 3054 }, { "epoch": 1.8809542131589074, "grad_norm": 3.53125, "learning_rate": 2.0753111046221435e-06, "loss": 1.4011050462722778, "step": 3056 }, { "epoch": 1.882185455944594, "grad_norm": 1.5234375, "learning_rate": 2.073778926109746e-06, "loss": 1.3742821216583252, "step": 3058 }, { "epoch": 1.8834166987302807, "grad_norm": 1.6328125, "learning_rate": 2.0722623489381943e-06, "loss": 1.282598614692688, "step": 3060 }, { "epoch": 1.8846479415159676, "grad_norm": 2.765625, "learning_rate": 2.0707613791338006e-06, "loss": 1.3038278818130493, "step": 3062 }, { "epoch": 1.8858791843016545, "grad_norm": 3.265625, "learning_rate": 2.06927602266086e-06, "loss": 1.6639541387557983, "step": 3064 }, { "epoch": 1.8871104270873413, "grad_norm": 2.734375, "learning_rate": 2.0678062854216265e-06, "loss": 1.669793725013733, "step": 3066 }, { "epoch": 1.8883416698730282, "grad_norm": 2.109375, "learning_rate": 2.0663521732562896e-06, "loss": 1.4260603189468384, "step": 3068 }, { "epoch": 1.8895729126587149, "grad_norm": 1.84375, "learning_rate": 2.064913691942947e-06, "loss": 1.317846417427063, "step": 3070 }, { "epoch": 1.8908041554444017, "grad_norm": 1.21875, "learning_rate": 2.063490847197591e-06, "loss": 0.9934130907058716, "step": 3072 }, { "epoch": 1.8920353982300884, "grad_norm": 2.484375, "learning_rate": 2.062083644674075e-06, "loss": 1.2410752773284912, "step": 3074 }, { "epoch": 1.8932666410157752, "grad_norm": 5.46875, "learning_rate": 2.0606920899640993e-06, "loss": 1.5223228931427002, "step": 3076 }, { "epoch": 1.8944978838014621, "grad_norm": 1.7109375, "learning_rate": 2.059316188597182e-06, "loss": 1.3340084552764893, "step": 3078 }, { "epoch": 1.895729126587149, "grad_norm": 0.8515625, "learning_rate": 2.057955946040645e-06, "loss": 1.0385233163833618, "step": 3080 }, { "epoch": 1.8969603693728359, "grad_norm": 1.09375, "learning_rate": 2.0566113676995854e-06, "loss": 1.1956368684768677, "step": 3082 }, { "epoch": 1.8981916121585225, "grad_norm": 0.7109375, "learning_rate": 2.055282458916856e-06, "loss": 1.0864170789718628, "step": 3084 }, { "epoch": 1.8994228549442091, "grad_norm": 3.25, "learning_rate": 2.0539692249730468e-06, "loss": 1.3644320964813232, "step": 3086 }, { "epoch": 1.900654097729896, "grad_norm": 12.625, "learning_rate": 2.05267167108646e-06, "loss": 1.8048946857452393, "step": 3088 }, { "epoch": 1.901885340515583, "grad_norm": 4.5625, "learning_rate": 2.051389802413092e-06, "loss": 1.1744332313537598, "step": 3090 }, { "epoch": 1.9031165833012698, "grad_norm": 1.1953125, "learning_rate": 2.050123624046611e-06, "loss": 1.1367355585098267, "step": 3092 }, { "epoch": 1.9043478260869566, "grad_norm": 2.109375, "learning_rate": 2.0488731410183385e-06, "loss": 1.2757030725479126, "step": 3094 }, { "epoch": 1.9055790688726433, "grad_norm": 3.5, "learning_rate": 2.0476383582972295e-06, "loss": 1.6981068849563599, "step": 3096 }, { "epoch": 1.9068103116583301, "grad_norm": 2.34375, "learning_rate": 2.04641928078985e-06, "loss": 1.582550287246704, "step": 3098 }, { "epoch": 1.9080415544440168, "grad_norm": 3.46875, "learning_rate": 2.04521591334036e-06, "loss": 1.4223381280899048, "step": 3100 }, { "epoch": 1.9092727972297037, "grad_norm": 2.0, "learning_rate": 2.044028260730494e-06, "loss": 1.3338019847869873, "step": 3102 }, { "epoch": 1.9105040400153905, "grad_norm": 3.109375, "learning_rate": 2.042856327679542e-06, "loss": 1.4374977350234985, "step": 3104 }, { "epoch": 1.9117352828010774, "grad_norm": 2.875, "learning_rate": 2.041700118844329e-06, "loss": 1.6318196058273315, "step": 3106 }, { "epoch": 1.9129665255867643, "grad_norm": 2.890625, "learning_rate": 2.0405596388191977e-06, "loss": 1.8509365320205688, "step": 3108 }, { "epoch": 1.914197768372451, "grad_norm": 2.71875, "learning_rate": 2.0394348921359923e-06, "loss": 1.6199047565460205, "step": 3110 }, { "epoch": 1.9154290111581378, "grad_norm": 2.234375, "learning_rate": 2.0383258832640375e-06, "loss": 1.4047884941101074, "step": 3112 }, { "epoch": 1.9166602539438244, "grad_norm": 1.84375, "learning_rate": 2.0372326166101207e-06, "loss": 1.2699506282806396, "step": 3114 }, { "epoch": 1.9178914967295113, "grad_norm": 1.0859375, "learning_rate": 2.036155096518477e-06, "loss": 1.227724313735962, "step": 3116 }, { "epoch": 1.9191227395151982, "grad_norm": 1.734375, "learning_rate": 2.035093327270771e-06, "loss": 1.1748602390289307, "step": 3118 }, { "epoch": 1.920353982300885, "grad_norm": 2.296875, "learning_rate": 2.0340473130860763e-06, "loss": 1.5392735004425049, "step": 3120 }, { "epoch": 1.921585225086572, "grad_norm": 2.703125, "learning_rate": 2.0330170581208658e-06, "loss": 1.4597396850585938, "step": 3122 }, { "epoch": 1.9228164678722586, "grad_norm": 2.4375, "learning_rate": 2.032002566468988e-06, "loss": 1.4114183187484741, "step": 3124 }, { "epoch": 1.9240477106579452, "grad_norm": 1.9140625, "learning_rate": 2.031003842161656e-06, "loss": 1.3674333095550537, "step": 3126 }, { "epoch": 1.925278953443632, "grad_norm": 2.046875, "learning_rate": 2.0300208891674274e-06, "loss": 1.216900110244751, "step": 3128 }, { "epoch": 1.926510196229319, "grad_norm": 2.015625, "learning_rate": 2.0290537113921924e-06, "loss": 1.058232307434082, "step": 3130 }, { "epoch": 1.9277414390150058, "grad_norm": 3.578125, "learning_rate": 2.028102312679155e-06, "loss": 0.627967119216919, "step": 3132 }, { "epoch": 1.9289726818006927, "grad_norm": 3.765625, "learning_rate": 2.027166696808819e-06, "loss": 0.7637472152709961, "step": 3134 }, { "epoch": 1.9302039245863793, "grad_norm": 1.4765625, "learning_rate": 2.0262468674989744e-06, "loss": 1.1989485025405884, "step": 3136 }, { "epoch": 1.9314351673720662, "grad_norm": 1.28125, "learning_rate": 2.0253428284046796e-06, "loss": 1.3709150552749634, "step": 3138 }, { "epoch": 1.9326664101577529, "grad_norm": 2.203125, "learning_rate": 2.0244545831182504e-06, "loss": 1.3663153648376465, "step": 3140 }, { "epoch": 1.9338976529434397, "grad_norm": 2.421875, "learning_rate": 2.0235821351692415e-06, "loss": 1.3322885036468506, "step": 3142 }, { "epoch": 1.9351288957291266, "grad_norm": 1.1796875, "learning_rate": 2.022725488024437e-06, "loss": 1.0995405912399292, "step": 3144 }, { "epoch": 1.9363601385148135, "grad_norm": 1.1640625, "learning_rate": 2.021884645087835e-06, "loss": 1.2344316244125366, "step": 3146 }, { "epoch": 1.9375913813005003, "grad_norm": 2.109375, "learning_rate": 2.02105960970063e-06, "loss": 1.1434847116470337, "step": 3148 }, { "epoch": 1.938822624086187, "grad_norm": 1.390625, "learning_rate": 2.0202503851412066e-06, "loss": 1.007233738899231, "step": 3150 }, { "epoch": 1.9400538668718736, "grad_norm": 3.5, "learning_rate": 2.019456974625122e-06, "loss": 1.0748920440673828, "step": 3152 }, { "epoch": 1.9412851096575605, "grad_norm": 4.0, "learning_rate": 2.0186793813050944e-06, "loss": 1.0976916551589966, "step": 3154 }, { "epoch": 1.9425163524432474, "grad_norm": 0.9296875, "learning_rate": 2.0179176082709897e-06, "loss": 1.1037846803665161, "step": 3156 }, { "epoch": 1.9437475952289343, "grad_norm": 2.390625, "learning_rate": 2.0171716585498098e-06, "loss": 1.2665685415267944, "step": 3158 }, { "epoch": 1.9449788380146211, "grad_norm": 2.03125, "learning_rate": 2.0164415351056825e-06, "loss": 1.4245879650115967, "step": 3160 }, { "epoch": 1.9462100808003078, "grad_norm": 3.9375, "learning_rate": 2.0157272408398456e-06, "loss": 1.3685814142227173, "step": 3162 }, { "epoch": 1.9474413235859946, "grad_norm": 2.546875, "learning_rate": 2.0150287785906396e-06, "loss": 1.4000155925750732, "step": 3164 }, { "epoch": 1.9486725663716813, "grad_norm": 2.640625, "learning_rate": 2.0143461511334917e-06, "loss": 1.5712698698043823, "step": 3166 }, { "epoch": 1.9499038091573682, "grad_norm": 3.453125, "learning_rate": 2.0136793611809104e-06, "loss": 1.8835021257400513, "step": 3168 }, { "epoch": 1.951135051943055, "grad_norm": 2.453125, "learning_rate": 2.0130284113824712e-06, "loss": 1.5735080242156982, "step": 3170 }, { "epoch": 1.952366294728742, "grad_norm": 3.484375, "learning_rate": 2.0123933043248056e-06, "loss": 1.8424159288406372, "step": 3172 }, { "epoch": 1.9535975375144288, "grad_norm": 2.875, "learning_rate": 2.0117740425315924e-06, "loss": 1.6976765394210815, "step": 3174 }, { "epoch": 1.9548287803001154, "grad_norm": 2.21875, "learning_rate": 2.0111706284635478e-06, "loss": 1.4014792442321777, "step": 3176 }, { "epoch": 1.956060023085802, "grad_norm": 3.8125, "learning_rate": 2.0105830645184145e-06, "loss": 1.4296321868896484, "step": 3178 }, { "epoch": 1.957291265871489, "grad_norm": 2.625, "learning_rate": 2.010011353030953e-06, "loss": 1.9288471937179565, "step": 3180 }, { "epoch": 1.9585225086571758, "grad_norm": 2.4375, "learning_rate": 2.0094554962729317e-06, "loss": 1.6823272705078125, "step": 3182 }, { "epoch": 1.9597537514428627, "grad_norm": 3.265625, "learning_rate": 2.008915496453119e-06, "loss": 1.494765043258667, "step": 3184 }, { "epoch": 1.9609849942285495, "grad_norm": 2.359375, "learning_rate": 2.0083913557172724e-06, "loss": 1.4550701379776, "step": 3186 }, { "epoch": 1.9622162370142362, "grad_norm": 2.484375, "learning_rate": 2.007883076148133e-06, "loss": 1.4107712507247925, "step": 3188 }, { "epoch": 1.963447479799923, "grad_norm": 2.09375, "learning_rate": 2.0073906597654126e-06, "loss": 1.3898992538452148, "step": 3190 }, { "epoch": 1.9646787225856097, "grad_norm": 0.58984375, "learning_rate": 2.0069141085257925e-06, "loss": 0.9835103750228882, "step": 3192 }, { "epoch": 1.9659099653712966, "grad_norm": 4.03125, "learning_rate": 2.00645342432291e-06, "loss": 1.4404178857803345, "step": 3194 }, { "epoch": 1.9671412081569835, "grad_norm": 4.25, "learning_rate": 2.006008608987351e-06, "loss": 1.7752964496612549, "step": 3196 }, { "epoch": 1.9683724509426703, "grad_norm": 3.375, "learning_rate": 2.0055796642866476e-06, "loss": 1.7659649848937988, "step": 3198 }, { "epoch": 1.9696036937283572, "grad_norm": 2.453125, "learning_rate": 2.0051665919252657e-06, "loss": 1.4808554649353027, "step": 3200 }, { "epoch": 1.9708349365140438, "grad_norm": 3.140625, "learning_rate": 2.004769393544601e-06, "loss": 1.6572133302688599, "step": 3202 }, { "epoch": 1.9720661792997307, "grad_norm": 2.578125, "learning_rate": 2.004388070722972e-06, "loss": 1.6950914859771729, "step": 3204 }, { "epoch": 1.9732974220854174, "grad_norm": 2.578125, "learning_rate": 2.0040226249756135e-06, "loss": 1.7044365406036377, "step": 3206 }, { "epoch": 1.9745286648711042, "grad_norm": 2.5, "learning_rate": 2.0036730577546716e-06, "loss": 1.5040702819824219, "step": 3208 }, { "epoch": 1.975759907656791, "grad_norm": 2.859375, "learning_rate": 2.0033393704491954e-06, "loss": 1.2767277956008911, "step": 3210 }, { "epoch": 1.976991150442478, "grad_norm": 2.328125, "learning_rate": 2.003021564385135e-06, "loss": 1.0338983535766602, "step": 3212 }, { "epoch": 1.9782223932281648, "grad_norm": 4.3125, "learning_rate": 2.002719640825332e-06, "loss": 1.162734866142273, "step": 3214 }, { "epoch": 1.9794536360138515, "grad_norm": 6.96875, "learning_rate": 2.00243360096952e-06, "loss": 1.7887216806411743, "step": 3216 }, { "epoch": 1.9806848787995381, "grad_norm": 4.78125, "learning_rate": 2.0021634459543126e-06, "loss": 1.5415432453155518, "step": 3218 }, { "epoch": 1.981916121585225, "grad_norm": 7.5625, "learning_rate": 2.0019091768532075e-06, "loss": 1.8647511005401611, "step": 3220 }, { "epoch": 1.9831473643709119, "grad_norm": 4.90625, "learning_rate": 2.001670794676574e-06, "loss": 1.5809669494628906, "step": 3222 }, { "epoch": 1.9843786071565988, "grad_norm": 4.4375, "learning_rate": 2.0014483003716547e-06, "loss": 1.4375505447387695, "step": 3224 }, { "epoch": 1.9856098499422856, "grad_norm": 1.984375, "learning_rate": 2.001241694822559e-06, "loss": 1.4724972248077393, "step": 3226 }, { "epoch": 1.9868410927279723, "grad_norm": 2.640625, "learning_rate": 2.0010509788502606e-06, "loss": 1.4315625429153442, "step": 3228 }, { "epoch": 1.9880723355136591, "grad_norm": 2.21875, "learning_rate": 2.0008761532125942e-06, "loss": 1.3821440935134888, "step": 3230 }, { "epoch": 1.9893035782993458, "grad_norm": 1.015625, "learning_rate": 2.000717218604251e-06, "loss": 1.0991805791854858, "step": 3232 }, { "epoch": 1.9905348210850327, "grad_norm": 2.28125, "learning_rate": 2.0005741756567775e-06, "loss": 1.0305452346801758, "step": 3234 }, { "epoch": 1.9917660638707195, "grad_norm": 7.09375, "learning_rate": 2.000447024938574e-06, "loss": 1.0397870540618896, "step": 3236 }, { "epoch": 1.9929973066564064, "grad_norm": 5.375, "learning_rate": 2.000335766954891e-06, "loss": 0.9876999258995056, "step": 3238 }, { "epoch": 1.9942285494420933, "grad_norm": 2.84375, "learning_rate": 2.0002404021478243e-06, "loss": 1.9116934537887573, "step": 3240 }, { "epoch": 1.99545979222778, "grad_norm": 4.53125, "learning_rate": 2.0001609308963195e-06, "loss": 1.5504405498504639, "step": 3242 }, { "epoch": 1.9966910350134666, "grad_norm": 2.421875, "learning_rate": 2.000097353516165e-06, "loss": 1.3082109689712524, "step": 3244 }, { "epoch": 1.9979222777991534, "grad_norm": 1.5703125, "learning_rate": 2.000049670259995e-06, "loss": 1.2466901540756226, "step": 3246 }, { "epoch": 1.9991535205848403, "grad_norm": 1.2578125, "learning_rate": 2.0000178813172827e-06, "loss": 1.0891525745391846, "step": 3248 }, { "epoch": 2.0, "grad_norm": 5.8125, "learning_rate": 2.0000019868143473e-06, "loss": 0.9806809425354004, "step": 3250 }, { "epoch": 2.0, "step": 3250, "total_flos": 2.574483891351978e+18, "train_loss": 1.4189628758430481, "train_runtime": 74292.9455, "train_samples_per_second": 0.35, "train_steps_per_second": 0.044 } ], "logging_steps": 2, "max_steps": 3250, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 9999999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.574483891351978e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }