| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 2408, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0004152823920265781, | |
| "grad_norm": 25.422981813437236, | |
| "learning_rate": 4.1493775933609963e-08, | |
| "loss": 1.3975, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0020764119601328905, | |
| "grad_norm": 23.65282908395334, | |
| "learning_rate": 2.074688796680498e-07, | |
| "loss": 1.4281, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.004152823920265781, | |
| "grad_norm": 16.38973942245371, | |
| "learning_rate": 4.149377593360996e-07, | |
| "loss": 1.3933, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.006229235880398671, | |
| "grad_norm": 8.620332321861904, | |
| "learning_rate": 6.224066390041494e-07, | |
| "loss": 1.2986, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.008305647840531562, | |
| "grad_norm": 10.289897317705874, | |
| "learning_rate": 8.298755186721992e-07, | |
| "loss": 1.1565, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.010382059800664452, | |
| "grad_norm": 4.429779856244459, | |
| "learning_rate": 1.037344398340249e-06, | |
| "loss": 1.051, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.012458471760797342, | |
| "grad_norm": 3.3098208738585213, | |
| "learning_rate": 1.2448132780082988e-06, | |
| "loss": 0.9902, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.014534883720930232, | |
| "grad_norm": 3.4349888460346687, | |
| "learning_rate": 1.4522821576763488e-06, | |
| "loss": 0.9652, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.016611295681063124, | |
| "grad_norm": 3.1515624301454133, | |
| "learning_rate": 1.6597510373443984e-06, | |
| "loss": 0.9415, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.018687707641196014, | |
| "grad_norm": 3.1235312209606505, | |
| "learning_rate": 1.8672199170124482e-06, | |
| "loss": 0.93, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.020764119601328904, | |
| "grad_norm": 3.1741829648141926, | |
| "learning_rate": 2.074688796680498e-06, | |
| "loss": 0.9238, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.022840531561461794, | |
| "grad_norm": 3.232116295196654, | |
| "learning_rate": 2.282157676348548e-06, | |
| "loss": 0.9123, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.024916943521594685, | |
| "grad_norm": 3.1515595029223396, | |
| "learning_rate": 2.4896265560165977e-06, | |
| "loss": 0.9031, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.026993355481727575, | |
| "grad_norm": 3.1003061370301617, | |
| "learning_rate": 2.6970954356846475e-06, | |
| "loss": 0.8947, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.029069767441860465, | |
| "grad_norm": 2.9767060692194844, | |
| "learning_rate": 2.9045643153526977e-06, | |
| "loss": 0.8919, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.031146179401993355, | |
| "grad_norm": 3.0759553041103205, | |
| "learning_rate": 3.112033195020747e-06, | |
| "loss": 0.8702, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.03322259136212625, | |
| "grad_norm": 3.285827319776166, | |
| "learning_rate": 3.319502074688797e-06, | |
| "loss": 0.8727, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.03529900332225914, | |
| "grad_norm": 3.3462993523967186, | |
| "learning_rate": 3.526970954356847e-06, | |
| "loss": 0.8736, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.03737541528239203, | |
| "grad_norm": 3.256004424550593, | |
| "learning_rate": 3.7344398340248965e-06, | |
| "loss": 0.8858, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.03945182724252492, | |
| "grad_norm": 3.159488005717498, | |
| "learning_rate": 3.941908713692946e-06, | |
| "loss": 0.8559, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.04152823920265781, | |
| "grad_norm": 2.9634363451500114, | |
| "learning_rate": 4.149377593360996e-06, | |
| "loss": 0.8586, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0436046511627907, | |
| "grad_norm": 3.159728031235876, | |
| "learning_rate": 4.356846473029046e-06, | |
| "loss": 0.8674, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.04568106312292359, | |
| "grad_norm": 2.970452415217835, | |
| "learning_rate": 4.564315352697096e-06, | |
| "loss": 0.8542, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.04775747508305648, | |
| "grad_norm": 3.1788047641427513, | |
| "learning_rate": 4.771784232365146e-06, | |
| "loss": 0.8701, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.04983388704318937, | |
| "grad_norm": 3.451301944053267, | |
| "learning_rate": 4.979253112033195e-06, | |
| "loss": 0.8714, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.05191029900332226, | |
| "grad_norm": 3.388326009403783, | |
| "learning_rate": 5.1867219917012455e-06, | |
| "loss": 0.8483, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.05398671096345515, | |
| "grad_norm": 2.9959540593135645, | |
| "learning_rate": 5.394190871369295e-06, | |
| "loss": 0.8481, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.05606312292358804, | |
| "grad_norm": 3.1213953476841856, | |
| "learning_rate": 5.601659751037345e-06, | |
| "loss": 0.8387, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.05813953488372093, | |
| "grad_norm": 3.403963416369247, | |
| "learning_rate": 5.809128630705395e-06, | |
| "loss": 0.8399, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.06021594684385382, | |
| "grad_norm": 3.0893053330914695, | |
| "learning_rate": 6.016597510373444e-06, | |
| "loss": 0.8386, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.06229235880398671, | |
| "grad_norm": 3.166492177328262, | |
| "learning_rate": 6.224066390041494e-06, | |
| "loss": 0.8457, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0643687707641196, | |
| "grad_norm": 3.2851556967703117, | |
| "learning_rate": 6.431535269709544e-06, | |
| "loss": 0.8421, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.0664451827242525, | |
| "grad_norm": 2.9899317822541454, | |
| "learning_rate": 6.639004149377594e-06, | |
| "loss": 0.8373, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.06852159468438539, | |
| "grad_norm": 3.0509892785590456, | |
| "learning_rate": 6.846473029045644e-06, | |
| "loss": 0.8334, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.07059800664451828, | |
| "grad_norm": 3.00742757115455, | |
| "learning_rate": 7.053941908713694e-06, | |
| "loss": 0.8233, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.07267441860465117, | |
| "grad_norm": 3.0518393701751485, | |
| "learning_rate": 7.261410788381743e-06, | |
| "loss": 0.8296, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.07475083056478406, | |
| "grad_norm": 3.1984146233667263, | |
| "learning_rate": 7.468879668049793e-06, | |
| "loss": 0.8155, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.07682724252491695, | |
| "grad_norm": 3.05629449726749, | |
| "learning_rate": 7.676348547717844e-06, | |
| "loss": 0.8377, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.07890365448504984, | |
| "grad_norm": 3.249088059891964, | |
| "learning_rate": 7.883817427385892e-06, | |
| "loss": 0.8432, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.08098006644518273, | |
| "grad_norm": 3.0028481508425515, | |
| "learning_rate": 8.091286307053943e-06, | |
| "loss": 0.8173, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.08305647840531562, | |
| "grad_norm": 3.059733445916786, | |
| "learning_rate": 8.298755186721992e-06, | |
| "loss": 0.8227, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.08513289036544851, | |
| "grad_norm": 3.0867633236533365, | |
| "learning_rate": 8.506224066390042e-06, | |
| "loss": 0.8181, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.0872093023255814, | |
| "grad_norm": 2.997953986592159, | |
| "learning_rate": 8.713692946058093e-06, | |
| "loss": 0.821, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.08928571428571429, | |
| "grad_norm": 3.2351659520743072, | |
| "learning_rate": 8.921161825726142e-06, | |
| "loss": 0.8294, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.09136212624584718, | |
| "grad_norm": 3.1494481731597586, | |
| "learning_rate": 9.128630705394191e-06, | |
| "loss": 0.8261, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.09343853820598007, | |
| "grad_norm": 3.105511823234228, | |
| "learning_rate": 9.33609958506224e-06, | |
| "loss": 0.8165, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.09551495016611296, | |
| "grad_norm": 3.023901781664328, | |
| "learning_rate": 9.543568464730292e-06, | |
| "loss": 0.8123, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.09759136212624585, | |
| "grad_norm": 3.4303556589177187, | |
| "learning_rate": 9.751037344398341e-06, | |
| "loss": 0.8093, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.09966777408637874, | |
| "grad_norm": 3.6054989714255408, | |
| "learning_rate": 9.95850622406639e-06, | |
| "loss": 0.8201, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.10174418604651163, | |
| "grad_norm": 2.990225009601177, | |
| "learning_rate": 9.999915930067828e-06, | |
| "loss": 0.8208, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.10382059800664452, | |
| "grad_norm": 2.9957103647324264, | |
| "learning_rate": 9.999574400813641e-06, | |
| "loss": 0.816, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.10589700996677741, | |
| "grad_norm": 2.8988415018010287, | |
| "learning_rate": 9.998970175798065e-06, | |
| "loss": 0.8044, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.1079734219269103, | |
| "grad_norm": 2.893907971746992, | |
| "learning_rate": 9.998103286769267e-06, | |
| "loss": 0.799, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.11004983388704319, | |
| "grad_norm": 2.898946354458808, | |
| "learning_rate": 9.996973779276743e-06, | |
| "loss": 0.8113, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.11212624584717608, | |
| "grad_norm": 3.0485697591450998, | |
| "learning_rate": 9.99558171266891e-06, | |
| "loss": 0.8194, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.11420265780730897, | |
| "grad_norm": 2.933613250090363, | |
| "learning_rate": 9.993927160089991e-06, | |
| "loss": 0.7981, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.11627906976744186, | |
| "grad_norm": 2.900283777987733, | |
| "learning_rate": 9.992010208476178e-06, | |
| "loss": 0.8114, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.11835548172757475, | |
| "grad_norm": 2.869639926652705, | |
| "learning_rate": 9.989830958551058e-06, | |
| "loss": 0.8026, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.12043189368770764, | |
| "grad_norm": 3.0764284732072236, | |
| "learning_rate": 9.98738952482032e-06, | |
| "loss": 0.7816, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.12250830564784053, | |
| "grad_norm": 2.872848930860205, | |
| "learning_rate": 9.984686035565742e-06, | |
| "loss": 0.7851, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.12458471760797342, | |
| "grad_norm": 2.7170384439590367, | |
| "learning_rate": 9.98172063283845e-06, | |
| "loss": 0.8054, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.12666112956810632, | |
| "grad_norm": 2.785739578421159, | |
| "learning_rate": 9.978493472451451e-06, | |
| "loss": 0.7824, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.1287375415282392, | |
| "grad_norm": 2.955753943035507, | |
| "learning_rate": 9.975004723971452e-06, | |
| "loss": 0.7788, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.1308139534883721, | |
| "grad_norm": 2.7566534229071378, | |
| "learning_rate": 9.971254570709939e-06, | |
| "loss": 0.7804, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.132890365448505, | |
| "grad_norm": 3.0399050026271945, | |
| "learning_rate": 9.967243209713563e-06, | |
| "loss": 0.7712, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.13496677740863788, | |
| "grad_norm": 3.227011718605211, | |
| "learning_rate": 9.962970851753767e-06, | |
| "loss": 0.7852, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.13704318936877077, | |
| "grad_norm": 2.894940556652265, | |
| "learning_rate": 9.95843772131573e-06, | |
| "loss": 0.767, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.13911960132890366, | |
| "grad_norm": 3.137972193410393, | |
| "learning_rate": 9.95364405658655e-06, | |
| "loss": 0.77, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.14119601328903655, | |
| "grad_norm": 2.7913612546678426, | |
| "learning_rate": 9.948590109442755e-06, | |
| "loss": 0.7768, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.14327242524916944, | |
| "grad_norm": 2.893979747266515, | |
| "learning_rate": 9.94327614543704e-06, | |
| "loss": 0.7827, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.14534883720930233, | |
| "grad_norm": 2.665071280290936, | |
| "learning_rate": 9.937702443784343e-06, | |
| "loss": 0.7474, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.14742524916943522, | |
| "grad_norm": 2.741350083908129, | |
| "learning_rate": 9.931869297347146e-06, | |
| "loss": 0.7638, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.14950166112956811, | |
| "grad_norm": 2.9878149207237357, | |
| "learning_rate": 9.925777012620111e-06, | |
| "loss": 0.7419, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.151578073089701, | |
| "grad_norm": 2.801227928713699, | |
| "learning_rate": 9.919425909713958e-06, | |
| "loss": 0.769, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.1536544850498339, | |
| "grad_norm": 3.023770968839729, | |
| "learning_rate": 9.912816322338659e-06, | |
| "loss": 0.7447, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.15573089700996678, | |
| "grad_norm": 2.9927287523796715, | |
| "learning_rate": 9.905948597785888e-06, | |
| "loss": 0.754, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.15780730897009967, | |
| "grad_norm": 11.785492453222856, | |
| "learning_rate": 9.89882309691079e-06, | |
| "loss": 0.7497, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.15988372093023256, | |
| "grad_norm": 2.8752234411604682, | |
| "learning_rate": 9.891440194113008e-06, | |
| "loss": 0.7427, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.16196013289036545, | |
| "grad_norm": 3.097207390376622, | |
| "learning_rate": 9.88380027731702e-06, | |
| "loss": 0.7542, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.16403654485049834, | |
| "grad_norm": 2.921991118334764, | |
| "learning_rate": 9.875903747951742e-06, | |
| "loss": 0.7621, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.16611295681063123, | |
| "grad_norm": 2.8395297947865963, | |
| "learning_rate": 9.867751020929454e-06, | |
| "loss": 0.735, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.16818936877076412, | |
| "grad_norm": 2.726116425089643, | |
| "learning_rate": 9.859342524623985e-06, | |
| "loss": 0.7124, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.17026578073089702, | |
| "grad_norm": 3.2173444091652943, | |
| "learning_rate": 9.850678700848208e-06, | |
| "loss": 0.7374, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.1723421926910299, | |
| "grad_norm": 2.716930762983964, | |
| "learning_rate": 9.84176000483083e-06, | |
| "loss": 0.7138, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.1744186046511628, | |
| "grad_norm": 2.985441779621083, | |
| "learning_rate": 9.832586905192469e-06, | |
| "loss": 0.731, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.17649501661129569, | |
| "grad_norm": 3.032790315651323, | |
| "learning_rate": 9.823159883921028e-06, | |
| "loss": 0.7215, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.17857142857142858, | |
| "grad_norm": 2.6988344818168155, | |
| "learning_rate": 9.813479436346378e-06, | |
| "loss": 0.7183, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.18064784053156147, | |
| "grad_norm": 2.973146607192177, | |
| "learning_rate": 9.803546071114323e-06, | |
| "loss": 0.7311, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.18272425249169436, | |
| "grad_norm": 2.9093506646801344, | |
| "learning_rate": 9.793360310159878e-06, | |
| "loss": 0.7049, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.18480066445182725, | |
| "grad_norm": 3.01100096145872, | |
| "learning_rate": 9.782922688679847e-06, | |
| "loss": 0.7118, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.18687707641196014, | |
| "grad_norm": 2.716470652939527, | |
| "learning_rate": 9.772233755104695e-06, | |
| "loss": 0.7277, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.18895348837209303, | |
| "grad_norm": 2.7134248053870165, | |
| "learning_rate": 9.761294071069736e-06, | |
| "loss": 0.7205, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.19102990033222592, | |
| "grad_norm": 2.6251507638777163, | |
| "learning_rate": 9.750104211385625e-06, | |
| "loss": 0.7152, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.1931063122923588, | |
| "grad_norm": 2.8023948010803483, | |
| "learning_rate": 9.738664764008149e-06, | |
| "loss": 0.7233, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.1951827242524917, | |
| "grad_norm": 3.714290449563204, | |
| "learning_rate": 9.726976330007341e-06, | |
| "loss": 0.6998, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.1972591362126246, | |
| "grad_norm": 2.8670419197216512, | |
| "learning_rate": 9.71503952353589e-06, | |
| "loss": 0.6985, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.19933554817275748, | |
| "grad_norm": 3.1683988394439107, | |
| "learning_rate": 9.702854971796876e-06, | |
| "loss": 0.7089, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.20141196013289037, | |
| "grad_norm": 3.2223078839261166, | |
| "learning_rate": 9.690423315010814e-06, | |
| "loss": 0.7053, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.20348837209302326, | |
| "grad_norm": 2.77875488832717, | |
| "learning_rate": 9.677745206382014e-06, | |
| "loss": 0.7271, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.20556478405315615, | |
| "grad_norm": 2.888271933836237, | |
| "learning_rate": 9.664821312064258e-06, | |
| "loss": 0.7018, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.20764119601328904, | |
| "grad_norm": 3.2746008040723815, | |
| "learning_rate": 9.651652311125803e-06, | |
| "loss": 0.6991, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.20971760797342193, | |
| "grad_norm": 2.76622547311742, | |
| "learning_rate": 9.638238895513687e-06, | |
| "loss": 0.7075, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.21179401993355482, | |
| "grad_norm": 2.9972446036957114, | |
| "learning_rate": 9.624581770017392e-06, | |
| "loss": 0.6857, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.2138704318936877, | |
| "grad_norm": 2.869516499460042, | |
| "learning_rate": 9.610681652231794e-06, | |
| "loss": 0.6916, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.2159468438538206, | |
| "grad_norm": 2.742923434452921, | |
| "learning_rate": 9.596539272519468e-06, | |
| "loss": 0.6811, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.2180232558139535, | |
| "grad_norm": 2.8482023108565677, | |
| "learning_rate": 9.582155373972303e-06, | |
| "loss": 0.6744, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.22009966777408638, | |
| "grad_norm": 2.9348099403663124, | |
| "learning_rate": 9.56753071237247e-06, | |
| "loss": 0.6776, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.22217607973421927, | |
| "grad_norm": 2.786772996017183, | |
| "learning_rate": 9.552666056152704e-06, | |
| "loss": 0.6798, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.22425249169435216, | |
| "grad_norm": 2.92722689041533, | |
| "learning_rate": 9.537562186355918e-06, | |
| "loss": 0.6843, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.22632890365448505, | |
| "grad_norm": 2.7694998172195207, | |
| "learning_rate": 9.52221989659418e-06, | |
| "loss": 0.6938, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.22840531561461794, | |
| "grad_norm": 2.9300442858036244, | |
| "learning_rate": 9.506639993007012e-06, | |
| "loss": 0.6944, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.23048172757475083, | |
| "grad_norm": 3.1035204783454993, | |
| "learning_rate": 9.490823294219015e-06, | |
| "loss": 0.672, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.23255813953488372, | |
| "grad_norm": 2.6193387690961245, | |
| "learning_rate": 9.474770631296882e-06, | |
| "loss": 0.6561, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.2346345514950166, | |
| "grad_norm": 2.61646550507026, | |
| "learning_rate": 9.458482847705705e-06, | |
| "loss": 0.6576, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.2367109634551495, | |
| "grad_norm": 2.756473668019519, | |
| "learning_rate": 9.441960799264678e-06, | |
| "loss": 0.6851, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.2387873754152824, | |
| "grad_norm": 2.6995089678231614, | |
| "learning_rate": 9.425205354102111e-06, | |
| "loss": 0.6648, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.24086378737541528, | |
| "grad_norm": 2.7140254791209677, | |
| "learning_rate": 9.408217392609831e-06, | |
| "loss": 0.6451, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.24294019933554817, | |
| "grad_norm": 2.607599787114018, | |
| "learning_rate": 9.390997807396912e-06, | |
| "loss": 0.67, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.24501661129568106, | |
| "grad_norm": 2.8420050898692764, | |
| "learning_rate": 9.373547503242775e-06, | |
| "loss": 0.6657, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.24709302325581395, | |
| "grad_norm": 2.9228965685399095, | |
| "learning_rate": 9.355867397049658e-06, | |
| "loss": 0.6566, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.24916943521594684, | |
| "grad_norm": 2.8048600929777403, | |
| "learning_rate": 9.337958417794425e-06, | |
| "loss": 0.6457, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.25124584717607973, | |
| "grad_norm": 2.6983485281997415, | |
| "learning_rate": 9.319821506479762e-06, | |
| "loss": 0.6376, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.25332225913621265, | |
| "grad_norm": 2.801805288954333, | |
| "learning_rate": 9.301457616084733e-06, | |
| "loss": 0.6523, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.2553986710963455, | |
| "grad_norm": 2.820864396273499, | |
| "learning_rate": 9.282867711514703e-06, | |
| "loss": 0.6365, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.2574750830564784, | |
| "grad_norm": 2.9932167823643043, | |
| "learning_rate": 9.264052769550643e-06, | |
| "loss": 0.6425, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.2595514950166113, | |
| "grad_norm": 2.6556108045628544, | |
| "learning_rate": 9.245013778797802e-06, | |
| "loss": 0.6562, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.2616279069767442, | |
| "grad_norm": 2.676416816690246, | |
| "learning_rate": 9.225751739633772e-06, | |
| "loss": 0.6387, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.26370431893687707, | |
| "grad_norm": 2.702226526508375, | |
| "learning_rate": 9.206267664155906e-06, | |
| "loss": 0.6348, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.26578073089701, | |
| "grad_norm": 2.637563222880754, | |
| "learning_rate": 9.186562576128159e-06, | |
| "loss": 0.6263, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.26785714285714285, | |
| "grad_norm": 2.7815352111724603, | |
| "learning_rate": 9.16663751092728e-06, | |
| "loss": 0.6362, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.26993355481727577, | |
| "grad_norm": 2.8822755136904528, | |
| "learning_rate": 9.146493515488418e-06, | |
| "loss": 0.6164, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.27200996677740863, | |
| "grad_norm": 2.5755107274498146, | |
| "learning_rate": 9.126131648250112e-06, | |
| "loss": 0.6342, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.27408637873754155, | |
| "grad_norm": 2.584492766117294, | |
| "learning_rate": 9.105552979098675e-06, | |
| "loss": 0.6329, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.2761627906976744, | |
| "grad_norm": 2.6805388863449036, | |
| "learning_rate": 9.084758589311977e-06, | |
| "loss": 0.6307, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.2782392026578073, | |
| "grad_norm": 2.7584115266730693, | |
| "learning_rate": 9.063749571502633e-06, | |
| "loss": 0.6374, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.2803156146179402, | |
| "grad_norm": 2.8092430217085145, | |
| "learning_rate": 9.04252702956059e-06, | |
| "loss": 0.6282, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.2823920265780731, | |
| "grad_norm": 2.6353604501522168, | |
| "learning_rate": 9.021092078595132e-06, | |
| "loss": 0.6332, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.28446843853820597, | |
| "grad_norm": 2.7859177417571486, | |
| "learning_rate": 8.999445844876276e-06, | |
| "loss": 0.6381, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.2865448504983389, | |
| "grad_norm": 2.6603634875986457, | |
| "learning_rate": 8.977589465775607e-06, | |
| "loss": 0.6312, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.28862126245847175, | |
| "grad_norm": 2.6293766795824354, | |
| "learning_rate": 8.955524089706506e-06, | |
| "loss": 0.5999, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.29069767441860467, | |
| "grad_norm": 2.8986723382239967, | |
| "learning_rate": 8.933250876063815e-06, | |
| "loss": 0.6297, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.29277408637873753, | |
| "grad_norm": 2.6589365161649834, | |
| "learning_rate": 8.910770995162913e-06, | |
| "loss": 0.6303, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.29485049833887045, | |
| "grad_norm": 2.64992234535583, | |
| "learning_rate": 8.88808562817823e-06, | |
| "loss": 0.6114, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.2969269102990033, | |
| "grad_norm": 2.7322760412568776, | |
| "learning_rate": 8.865195967081174e-06, | |
| "loss": 0.6215, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.29900332225913623, | |
| "grad_norm": 2.576473302210113, | |
| "learning_rate": 8.842103214577511e-06, | |
| "loss": 0.6147, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.3010797342192691, | |
| "grad_norm": 2.507546434543662, | |
| "learning_rate": 8.818808584044163e-06, | |
| "loss": 0.6089, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.303156146179402, | |
| "grad_norm": 2.953501799132662, | |
| "learning_rate": 8.795313299465455e-06, | |
| "loss": 0.6147, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.30523255813953487, | |
| "grad_norm": 2.58266860044093, | |
| "learning_rate": 8.771618595368806e-06, | |
| "loss": 0.6024, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.3073089700996678, | |
| "grad_norm": 2.7291039422306613, | |
| "learning_rate": 8.747725716759859e-06, | |
| "loss": 0.6152, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.30938538205980065, | |
| "grad_norm": 2.696653736904745, | |
| "learning_rate": 8.723635919057058e-06, | |
| "loss": 0.6082, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.31146179401993357, | |
| "grad_norm": 2.639188973608746, | |
| "learning_rate": 8.699350468025699e-06, | |
| "loss": 0.5924, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.31353820598006643, | |
| "grad_norm": 2.5960120065556294, | |
| "learning_rate": 8.674870639711403e-06, | |
| "loss": 0.5871, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.31561461794019935, | |
| "grad_norm": 2.691098687645451, | |
| "learning_rate": 8.650197720373091e-06, | |
| "loss": 0.5937, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.3176910299003322, | |
| "grad_norm": 2.7922815680081947, | |
| "learning_rate": 8.625333006415372e-06, | |
| "loss": 0.5806, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.31976744186046513, | |
| "grad_norm": 2.5989983221444635, | |
| "learning_rate": 8.600277804320452e-06, | |
| "loss": 0.5889, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.321843853820598, | |
| "grad_norm": 2.7500580415708553, | |
| "learning_rate": 8.575033430579465e-06, | |
| "loss": 0.5929, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.3239202657807309, | |
| "grad_norm": 2.9863748696055485, | |
| "learning_rate": 8.549601211623316e-06, | |
| "loss": 0.5905, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.32599667774086377, | |
| "grad_norm": 2.7128601524461966, | |
| "learning_rate": 8.523982483752973e-06, | |
| "loss": 0.5838, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.3280730897009967, | |
| "grad_norm": 2.6273588590853727, | |
| "learning_rate": 8.498178593069262e-06, | |
| "loss": 0.579, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.33014950166112955, | |
| "grad_norm": 2.6424251208940714, | |
| "learning_rate": 8.472190895402131e-06, | |
| "loss": 0.568, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.33222591362126247, | |
| "grad_norm": 2.774060760650428, | |
| "learning_rate": 8.446020756239418e-06, | |
| "loss": 0.5881, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.33430232558139533, | |
| "grad_norm": 2.7429673227633193, | |
| "learning_rate": 8.419669550655093e-06, | |
| "loss": 0.5807, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.33637873754152825, | |
| "grad_norm": 2.4588138685140164, | |
| "learning_rate": 8.393138663237015e-06, | |
| "loss": 0.5699, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.3384551495016611, | |
| "grad_norm": 2.8894345220890845, | |
| "learning_rate": 8.366429488014178e-06, | |
| "loss": 0.5644, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.34053156146179403, | |
| "grad_norm": 2.6417969175920253, | |
| "learning_rate": 8.339543428383467e-06, | |
| "loss": 0.577, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.3426079734219269, | |
| "grad_norm": 2.639049529021501, | |
| "learning_rate": 8.312481897035906e-06, | |
| "loss": 0.5835, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.3446843853820598, | |
| "grad_norm": 2.791601353912272, | |
| "learning_rate": 8.285246315882448e-06, | |
| "loss": 0.5873, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.3467607973421927, | |
| "grad_norm": 2.760486538247162, | |
| "learning_rate": 8.257838115979244e-06, | |
| "loss": 0.5743, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.3488372093023256, | |
| "grad_norm": 2.6084506349864114, | |
| "learning_rate": 8.230258737452473e-06, | |
| "loss": 0.5835, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.35091362126245845, | |
| "grad_norm": 2.568077365967415, | |
| "learning_rate": 8.202509629422647e-06, | |
| "loss": 0.5663, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.35299003322259137, | |
| "grad_norm": 3.338586543406698, | |
| "learning_rate": 8.17459224992849e-06, | |
| "loss": 0.561, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.35506644518272423, | |
| "grad_norm": 2.550936924190995, | |
| "learning_rate": 8.14650806585031e-06, | |
| "loss": 0.5748, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.35714285714285715, | |
| "grad_norm": 2.730568567607308, | |
| "learning_rate": 8.118258552832945e-06, | |
| "loss": 0.5526, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.35921926910299, | |
| "grad_norm": 2.7922640713365765, | |
| "learning_rate": 8.0898451952082e-06, | |
| "loss": 0.5636, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.36129568106312293, | |
| "grad_norm": 2.4817520439108782, | |
| "learning_rate": 8.061269485916881e-06, | |
| "loss": 0.565, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.3633720930232558, | |
| "grad_norm": 2.5897869437416814, | |
| "learning_rate": 8.032532926430335e-06, | |
| "loss": 0.5718, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.3654485049833887, | |
| "grad_norm": 2.6233407361081196, | |
| "learning_rate": 8.003637026671558e-06, | |
| "loss": 0.5495, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.3675249169435216, | |
| "grad_norm": 2.590608968830393, | |
| "learning_rate": 7.974583304935867e-06, | |
| "loss": 0.5701, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.3696013289036545, | |
| "grad_norm": 2.676185626796156, | |
| "learning_rate": 7.945373287811116e-06, | |
| "loss": 0.5476, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.37167774086378735, | |
| "grad_norm": 2.588749653152642, | |
| "learning_rate": 7.916008510097483e-06, | |
| "loss": 0.5363, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.37375415282392027, | |
| "grad_norm": 2.648109565452331, | |
| "learning_rate": 7.88649051472683e-06, | |
| "loss": 0.5566, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.37583056478405313, | |
| "grad_norm": 2.6123078212762567, | |
| "learning_rate": 7.856820852681634e-06, | |
| "loss": 0.5481, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.37790697674418605, | |
| "grad_norm": 2.5715025577779107, | |
| "learning_rate": 7.82700108291348e-06, | |
| "loss": 0.5554, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.3799833887043189, | |
| "grad_norm": 2.6810117688521333, | |
| "learning_rate": 7.797032772261164e-06, | |
| "loss": 0.5396, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.38205980066445183, | |
| "grad_norm": 2.828001329589521, | |
| "learning_rate": 7.766917495368356e-06, | |
| "loss": 0.549, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.3841362126245847, | |
| "grad_norm": 2.6073301891312455, | |
| "learning_rate": 7.736656834600866e-06, | |
| "loss": 0.5403, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.3862126245847176, | |
| "grad_norm": 2.7467154847057107, | |
| "learning_rate": 7.706252379963498e-06, | |
| "loss": 0.5395, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.3882890365448505, | |
| "grad_norm": 2.6418072073420067, | |
| "learning_rate": 7.675705729016508e-06, | |
| "loss": 0.5363, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.3903654485049834, | |
| "grad_norm": 2.632007372607857, | |
| "learning_rate": 7.645018486791664e-06, | |
| "loss": 0.5377, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.39244186046511625, | |
| "grad_norm": 2.4652302347093364, | |
| "learning_rate": 7.6141922657079045e-06, | |
| "loss": 0.5321, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.3945182724252492, | |
| "grad_norm": 2.5492866422631764, | |
| "learning_rate": 7.583228685486623e-06, | |
| "loss": 0.5433, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.39659468438538203, | |
| "grad_norm": 2.4794671881341936, | |
| "learning_rate": 7.552129373066565e-06, | |
| "loss": 0.5423, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.39867109634551495, | |
| "grad_norm": 2.565377450639672, | |
| "learning_rate": 7.520895962518329e-06, | |
| "loss": 0.5357, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.4007475083056478, | |
| "grad_norm": 2.7376349329000504, | |
| "learning_rate": 7.489530094958521e-06, | |
| "loss": 0.5529, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.40282392026578073, | |
| "grad_norm": 2.5470062145134778, | |
| "learning_rate": 7.458033418463517e-06, | |
| "loss": 0.5167, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.4049003322259136, | |
| "grad_norm": 2.5915393940286724, | |
| "learning_rate": 7.426407587982869e-06, | |
| "loss": 0.5359, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.4069767441860465, | |
| "grad_norm": 2.5521473612501118, | |
| "learning_rate": 7.394654265252348e-06, | |
| "loss": 0.5448, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.4090531561461794, | |
| "grad_norm": 2.540390049884069, | |
| "learning_rate": 7.362775118706627e-06, | |
| "loss": 0.5224, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.4111295681063123, | |
| "grad_norm": 2.67106563437947, | |
| "learning_rate": 7.330771823391622e-06, | |
| "loss": 0.547, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.41320598006644516, | |
| "grad_norm": 2.5844286453504752, | |
| "learning_rate": 7.298646060876473e-06, | |
| "loss": 0.5245, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.4152823920265781, | |
| "grad_norm": 2.489462893225223, | |
| "learning_rate": 7.266399519165193e-06, | |
| "loss": 0.5177, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.417358803986711, | |
| "grad_norm": 2.548885028848683, | |
| "learning_rate": 7.234033892607969e-06, | |
| "loss": 0.5285, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 0.41943521594684385, | |
| "grad_norm": 2.515732979636329, | |
| "learning_rate": 7.201550881812138e-06, | |
| "loss": 0.5295, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.42151162790697677, | |
| "grad_norm": 2.580813201220608, | |
| "learning_rate": 7.168952193552831e-06, | |
| "loss": 0.5144, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.42358803986710963, | |
| "grad_norm": 2.8605769340325544, | |
| "learning_rate": 7.136239540683297e-06, | |
| "loss": 0.5189, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.42566445182724255, | |
| "grad_norm": 2.7042921962644773, | |
| "learning_rate": 7.103414642044888e-06, | |
| "loss": 0.516, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.4277408637873754, | |
| "grad_norm": 2.5935305392513475, | |
| "learning_rate": 7.070479222376765e-06, | |
| "loss": 0.5273, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.42981727574750833, | |
| "grad_norm": 2.521806447567166, | |
| "learning_rate": 7.037435012225259e-06, | |
| "loss": 0.514, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 0.4318936877076412, | |
| "grad_norm": 2.4922095571026808, | |
| "learning_rate": 7.00428374785295e-06, | |
| "loss": 0.5191, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.4339700996677741, | |
| "grad_norm": 2.53445755137843, | |
| "learning_rate": 6.971027171147436e-06, | |
| "loss": 0.5175, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 0.436046511627907, | |
| "grad_norm": 2.5854663493896815, | |
| "learning_rate": 6.937667029529803e-06, | |
| "loss": 0.5052, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.4381229235880399, | |
| "grad_norm": 2.6149256231235767, | |
| "learning_rate": 6.904205075862816e-06, | |
| "loss": 0.5155, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 0.44019933554817275, | |
| "grad_norm": 2.5728069972099643, | |
| "learning_rate": 6.870643068358813e-06, | |
| "loss": 0.5164, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.44227574750830567, | |
| "grad_norm": 2.610034601385569, | |
| "learning_rate": 6.8369827704873225e-06, | |
| "loss": 0.515, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 0.44435215946843853, | |
| "grad_norm": 2.559653943614866, | |
| "learning_rate": 6.803225950882407e-06, | |
| "loss": 0.5103, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.44642857142857145, | |
| "grad_norm": 2.744659999074845, | |
| "learning_rate": 6.769374383249728e-06, | |
| "loss": 0.5144, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.4485049833887043, | |
| "grad_norm": 2.500834722382555, | |
| "learning_rate": 6.735429846273356e-06, | |
| "loss": 0.509, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.45058139534883723, | |
| "grad_norm": 2.571303478772175, | |
| "learning_rate": 6.701394123522303e-06, | |
| "loss": 0.5061, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.4526578073089701, | |
| "grad_norm": 2.6726371126474042, | |
| "learning_rate": 6.667269003356815e-06, | |
| "loss": 0.4872, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.454734219269103, | |
| "grad_norm": 2.314624945694432, | |
| "learning_rate": 6.633056278834403e-06, | |
| "loss": 0.4978, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 0.4568106312292359, | |
| "grad_norm": 2.5660125412801986, | |
| "learning_rate": 6.598757747615625e-06, | |
| "loss": 0.4873, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.4588870431893688, | |
| "grad_norm": 2.5055302944005655, | |
| "learning_rate": 6.564375211869638e-06, | |
| "loss": 0.4955, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 0.46096345514950166, | |
| "grad_norm": 2.3161654964295963, | |
| "learning_rate": 6.529910478179499e-06, | |
| "loss": 0.4996, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.4630398671096346, | |
| "grad_norm": 2.713583584390501, | |
| "learning_rate": 6.495365357447242e-06, | |
| "loss": 0.4837, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 0.46511627906976744, | |
| "grad_norm": 2.6986080979156597, | |
| "learning_rate": 6.4607416647987285e-06, | |
| "loss": 0.503, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.46719269102990035, | |
| "grad_norm": 2.3758745672703614, | |
| "learning_rate": 6.426041219488275e-06, | |
| "loss": 0.4917, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.4692691029900332, | |
| "grad_norm": 2.468317610874025, | |
| "learning_rate": 6.39126584480306e-06, | |
| "loss": 0.4947, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.47134551495016613, | |
| "grad_norm": 2.672466601805675, | |
| "learning_rate": 6.3564173679673225e-06, | |
| "loss": 0.4956, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 0.473421926910299, | |
| "grad_norm": 2.686387722109422, | |
| "learning_rate": 6.321497620046353e-06, | |
| "loss": 0.4958, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.4754983388704319, | |
| "grad_norm": 2.4115883144762105, | |
| "learning_rate": 6.286508435850282e-06, | |
| "loss": 0.4884, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 0.4775747508305648, | |
| "grad_norm": 2.473062095275494, | |
| "learning_rate": 6.251451653837679e-06, | |
| "loss": 0.4873, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.4796511627906977, | |
| "grad_norm": 2.4611172122096034, | |
| "learning_rate": 6.216329116018943e-06, | |
| "loss": 0.4828, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.48172757475083056, | |
| "grad_norm": 2.438501558434762, | |
| "learning_rate": 6.181142667859521e-06, | |
| "loss": 0.4743, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.4838039867109635, | |
| "grad_norm": 2.4623748153401586, | |
| "learning_rate": 6.145894158182945e-06, | |
| "loss": 0.4813, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 0.48588039867109634, | |
| "grad_norm": 2.5841330806095093, | |
| "learning_rate": 6.11058543907368e-06, | |
| "loss": 0.4757, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.48795681063122925, | |
| "grad_norm": 2.420645551171905, | |
| "learning_rate": 6.075218365779814e-06, | |
| "loss": 0.4717, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.4900332225913621, | |
| "grad_norm": 2.41753538282735, | |
| "learning_rate": 6.039794796615575e-06, | |
| "loss": 0.4683, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.49210963455149503, | |
| "grad_norm": 2.6345922483315993, | |
| "learning_rate": 6.004316592863693e-06, | |
| "loss": 0.4758, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 0.4941860465116279, | |
| "grad_norm": 2.580357854248359, | |
| "learning_rate": 5.96878561867759e-06, | |
| "loss": 0.4923, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.4962624584717608, | |
| "grad_norm": 2.3693846881679463, | |
| "learning_rate": 5.9332037409834466e-06, | |
| "loss": 0.4732, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 0.4983388704318937, | |
| "grad_norm": 2.769567429139866, | |
| "learning_rate": 5.89757282938209e-06, | |
| "loss": 0.4713, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.5004152823920266, | |
| "grad_norm": 2.41622785319668, | |
| "learning_rate": 5.86189475605077e-06, | |
| "loss": 0.476, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 0.5024916943521595, | |
| "grad_norm": 2.499791289384567, | |
| "learning_rate": 5.826171395644786e-06, | |
| "loss": 0.4749, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.5045681063122923, | |
| "grad_norm": 2.417525944289692, | |
| "learning_rate": 5.790404625198982e-06, | |
| "loss": 0.4726, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 0.5066445182724253, | |
| "grad_norm": 2.5878334687029114, | |
| "learning_rate": 5.754596324029125e-06, | |
| "loss": 0.4761, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.5087209302325582, | |
| "grad_norm": 2.4962102663667043, | |
| "learning_rate": 5.7187483736331554e-06, | |
| "loss": 0.4578, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.510797342192691, | |
| "grad_norm": 2.6263564446414636, | |
| "learning_rate": 5.682862657592327e-06, | |
| "loss": 0.4825, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.5128737541528239, | |
| "grad_norm": 2.880797119411763, | |
| "learning_rate": 5.646941061472242e-06, | |
| "loss": 0.469, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 0.5149501661129569, | |
| "grad_norm": 2.555965100494747, | |
| "learning_rate": 5.610985472723764e-06, | |
| "loss": 0.4712, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.5170265780730897, | |
| "grad_norm": 2.502236357284136, | |
| "learning_rate": 5.5749977805838615e-06, | |
| "loss": 0.4681, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 0.5191029900332226, | |
| "grad_norm": 2.4360635002482347, | |
| "learning_rate": 5.538979875976324e-06, | |
| "loss": 0.4636, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.5211794019933554, | |
| "grad_norm": 2.488011716508302, | |
| "learning_rate": 5.502933651412417e-06, | |
| "loss": 0.4699, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 0.5232558139534884, | |
| "grad_norm": 2.3770436189443696, | |
| "learning_rate": 5.466861000891439e-06, | |
| "loss": 0.4592, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.5253322259136213, | |
| "grad_norm": 2.7541846157024876, | |
| "learning_rate": 5.430763819801205e-06, | |
| "loss": 0.4692, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 0.5274086378737541, | |
| "grad_norm": 2.7287082031019745, | |
| "learning_rate": 5.394644004818452e-06, | |
| "loss": 0.4745, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.529485049833887, | |
| "grad_norm": 2.5164954994115094, | |
| "learning_rate": 5.3585034538091885e-06, | |
| "loss": 0.4525, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.53156146179402, | |
| "grad_norm": 2.347205777105881, | |
| "learning_rate": 5.322344065728964e-06, | |
| "loss": 0.4689, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.5336378737541528, | |
| "grad_norm": 2.582827989286747, | |
| "learning_rate": 5.286167740523099e-06, | |
| "loss": 0.4691, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 0.5357142857142857, | |
| "grad_norm": 2.5061090934097843, | |
| "learning_rate": 5.249976379026851e-06, | |
| "loss": 0.4436, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.5377906976744186, | |
| "grad_norm": 2.4524559965169748, | |
| "learning_rate": 5.213771882865538e-06, | |
| "loss": 0.4643, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 0.5398671096345515, | |
| "grad_norm": 2.560097527019471, | |
| "learning_rate": 5.177556154354622e-06, | |
| "loss": 0.4464, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.5419435215946844, | |
| "grad_norm": 2.397260026201424, | |
| "learning_rate": 5.141331096399755e-06, | |
| "loss": 0.4501, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 0.5440199335548173, | |
| "grad_norm": 2.351541148312247, | |
| "learning_rate": 5.1050986123967884e-06, | |
| "loss": 0.4398, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.5460963455149501, | |
| "grad_norm": 2.452194040455103, | |
| "learning_rate": 5.068860606131766e-06, | |
| "loss": 0.4516, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 0.5481727574750831, | |
| "grad_norm": 2.593569889967618, | |
| "learning_rate": 5.032618981680893e-06, | |
| "loss": 0.4534, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.550249169435216, | |
| "grad_norm": 2.491194365967403, | |
| "learning_rate": 4.9963756433104875e-06, | |
| "loss": 0.4561, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.5523255813953488, | |
| "grad_norm": 2.5315048028501432, | |
| "learning_rate": 4.960132495376919e-06, | |
| "loss": 0.4387, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.5544019933554817, | |
| "grad_norm": 2.4221610492026566, | |
| "learning_rate": 4.923891442226554e-06, | |
| "loss": 0.4526, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.5564784053156147, | |
| "grad_norm": 2.4574741459986043, | |
| "learning_rate": 4.887654388095691e-06, | |
| "loss": 0.4388, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.5585548172757475, | |
| "grad_norm": 2.5581004359073565, | |
| "learning_rate": 4.851423237010504e-06, | |
| "loss": 0.4512, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 0.5606312292358804, | |
| "grad_norm": 2.5084567945271634, | |
| "learning_rate": 4.815199892687006e-06, | |
| "loss": 0.464, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.5627076411960132, | |
| "grad_norm": 2.4656070255557294, | |
| "learning_rate": 4.778986258431005e-06, | |
| "loss": 0.4471, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 0.5647840531561462, | |
| "grad_norm": 2.494517722129321, | |
| "learning_rate": 4.742784237038113e-06, | |
| "loss": 0.4352, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.5668604651162791, | |
| "grad_norm": 2.5383042319953995, | |
| "learning_rate": 4.70659573069376e-06, | |
| "loss": 0.421, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 0.5689368770764119, | |
| "grad_norm": 2.3933135171603936, | |
| "learning_rate": 4.670422640873242e-06, | |
| "loss": 0.4379, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.5710132890365448, | |
| "grad_norm": 2.4020680375977133, | |
| "learning_rate": 4.63426686824182e-06, | |
| "loss": 0.4323, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.5730897009966778, | |
| "grad_norm": 2.43581294994139, | |
| "learning_rate": 4.598130312554843e-06, | |
| "loss": 0.4397, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.5751661129568106, | |
| "grad_norm": 2.5772706634163027, | |
| "learning_rate": 4.562014872557936e-06, | |
| "loss": 0.4362, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 0.5772425249169435, | |
| "grad_norm": 2.448863408768738, | |
| "learning_rate": 4.525922445887224e-06, | |
| "loss": 0.4349, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.5793189368770764, | |
| "grad_norm": 2.535308434878213, | |
| "learning_rate": 4.489854928969635e-06, | |
| "loss": 0.4516, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 0.5813953488372093, | |
| "grad_norm": 2.3973615256768768, | |
| "learning_rate": 4.453814216923242e-06, | |
| "loss": 0.4336, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.5834717607973422, | |
| "grad_norm": 2.3119199540164965, | |
| "learning_rate": 4.4178022034576976e-06, | |
| "loss": 0.4226, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 0.5855481727574751, | |
| "grad_norm": 2.3014825037296633, | |
| "learning_rate": 4.381820780774724e-06, | |
| "loss": 0.4322, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.5876245847176079, | |
| "grad_norm": 2.5351337278959556, | |
| "learning_rate": 4.345871839468694e-06, | |
| "loss": 0.4055, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 0.5897009966777409, | |
| "grad_norm": 2.611286820208639, | |
| "learning_rate": 4.309957268427292e-06, | |
| "loss": 0.4216, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.5917774086378738, | |
| "grad_norm": 2.3889570520642684, | |
| "learning_rate": 4.274078954732262e-06, | |
| "loss": 0.4427, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.5938538205980066, | |
| "grad_norm": 2.384724624598042, | |
| "learning_rate": 4.2382387835602565e-06, | |
| "loss": 0.4246, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.5959302325581395, | |
| "grad_norm": 2.3536762842777126, | |
| "learning_rate": 4.20243863808378e-06, | |
| "loss": 0.4352, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.5980066445182725, | |
| "grad_norm": 2.367560729519929, | |
| "learning_rate": 4.166680399372248e-06, | |
| "loss": 0.4226, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.6000830564784053, | |
| "grad_norm": 2.401186140827422, | |
| "learning_rate": 4.130965946293135e-06, | |
| "loss": 0.4529, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.6021594684385382, | |
| "grad_norm": 2.3503805374006457, | |
| "learning_rate": 4.095297155413264e-06, | |
| "loss": 0.4213, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.604235880398671, | |
| "grad_norm": 2.404199762232402, | |
| "learning_rate": 4.059675900900199e-06, | |
| "loss": 0.4309, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.606312292358804, | |
| "grad_norm": 2.5304024582625053, | |
| "learning_rate": 4.024104054423772e-06, | |
| "loss": 0.4215, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.6083887043189369, | |
| "grad_norm": 2.4035116235125473, | |
| "learning_rate": 3.9885834850577375e-06, | |
| "loss": 0.4282, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.6104651162790697, | |
| "grad_norm": 2.3499844076305156, | |
| "learning_rate": 3.953116059181563e-06, | |
| "loss": 0.422, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.6125415282392026, | |
| "grad_norm": 2.5288170114153585, | |
| "learning_rate": 3.9177036403823645e-06, | |
| "loss": 0.4329, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.6146179401993356, | |
| "grad_norm": 2.3290974062316057, | |
| "learning_rate": 3.882348089356992e-06, | |
| "loss": 0.4137, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.6166943521594684, | |
| "grad_norm": 2.4328677326588894, | |
| "learning_rate": 3.84705126381425e-06, | |
| "loss": 0.4297, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.6187707641196013, | |
| "grad_norm": 2.3908310630477954, | |
| "learning_rate": 3.8118150183772974e-06, | |
| "loss": 0.4293, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.6208471760797342, | |
| "grad_norm": 2.4893827738846808, | |
| "learning_rate": 3.776641204486191e-06, | |
| "loss": 0.4214, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 0.6229235880398671, | |
| "grad_norm": 2.3486377563484133, | |
| "learning_rate": 3.7415316703006116e-06, | |
| "loss": 0.405, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.625, | |
| "grad_norm": 2.466506888817687, | |
| "learning_rate": 3.7064882606027497e-06, | |
| "loss": 0.426, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 0.6270764119601329, | |
| "grad_norm": 2.496662130115367, | |
| "learning_rate": 3.671512816700375e-06, | |
| "loss": 0.4201, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.6291528239202658, | |
| "grad_norm": 2.265163717312505, | |
| "learning_rate": 3.636607176330088e-06, | |
| "loss": 0.4205, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 0.6312292358803987, | |
| "grad_norm": 2.2703878574783163, | |
| "learning_rate": 3.60177317356076e-06, | |
| "loss": 0.4101, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.6333056478405316, | |
| "grad_norm": 2.423443407995488, | |
| "learning_rate": 3.5670126386971625e-06, | |
| "loss": 0.4171, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.6353820598006644, | |
| "grad_norm": 2.44608682526587, | |
| "learning_rate": 3.5323273981837965e-06, | |
| "loss": 0.416, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.6374584717607974, | |
| "grad_norm": 2.2051417207338173, | |
| "learning_rate": 3.497719274508925e-06, | |
| "loss": 0.4019, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 0.6395348837209303, | |
| "grad_norm": 2.4800578989548034, | |
| "learning_rate": 3.4631900861088132e-06, | |
| "loss": 0.4029, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.6416112956810631, | |
| "grad_norm": 2.3268282845100035, | |
| "learning_rate": 3.4287416472721795e-06, | |
| "loss": 0.4111, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 0.643687707641196, | |
| "grad_norm": 2.3872453059218532, | |
| "learning_rate": 3.3943757680448697e-06, | |
| "loss": 0.4061, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.645764119601329, | |
| "grad_norm": 2.42558490404232, | |
| "learning_rate": 3.360094254134746e-06, | |
| "loss": 0.403, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 0.6478405315614618, | |
| "grad_norm": 2.441847356983534, | |
| "learning_rate": 3.3258989068168123e-06, | |
| "loss": 0.417, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.6499169435215947, | |
| "grad_norm": 2.356616246546388, | |
| "learning_rate": 3.2917915228385676e-06, | |
| "loss": 0.4008, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 0.6519933554817275, | |
| "grad_norm": 2.457529466848808, | |
| "learning_rate": 3.257773894325599e-06, | |
| "loss": 0.4166, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.6540697674418605, | |
| "grad_norm": 2.5688010790796154, | |
| "learning_rate": 3.223847808687415e-06, | |
| "loss": 0.3982, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.6561461794019934, | |
| "grad_norm": 2.2695295812005836, | |
| "learning_rate": 3.190015048523528e-06, | |
| "loss": 0.3912, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.6582225913621262, | |
| "grad_norm": 2.5664307243505227, | |
| "learning_rate": 3.156277391529796e-06, | |
| "loss": 0.4044, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 0.6602990033222591, | |
| "grad_norm": 2.421377162101449, | |
| "learning_rate": 3.1226366104050067e-06, | |
| "loss": 0.4061, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.6623754152823921, | |
| "grad_norm": 2.50702313044333, | |
| "learning_rate": 3.089094472757742e-06, | |
| "loss": 0.3986, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 0.6644518272425249, | |
| "grad_norm": 2.2015982709846122, | |
| "learning_rate": 3.055652741013497e-06, | |
| "loss": 0.3773, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.6665282392026578, | |
| "grad_norm": 2.484025604844624, | |
| "learning_rate": 3.0223131723220756e-06, | |
| "loss": 0.4043, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 0.6686046511627907, | |
| "grad_norm": 2.2673450694224426, | |
| "learning_rate": 2.9890775184652666e-06, | |
| "loss": 0.3975, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.6706810631229236, | |
| "grad_norm": 2.411243052140437, | |
| "learning_rate": 2.955947525764796e-06, | |
| "loss": 0.4162, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 0.6727574750830565, | |
| "grad_norm": 2.467788088547966, | |
| "learning_rate": 2.9229249349905686e-06, | |
| "loss": 0.3905, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.6748338870431894, | |
| "grad_norm": 2.441034044229084, | |
| "learning_rate": 2.890011481269204e-06, | |
| "loss": 0.404, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.6769102990033222, | |
| "grad_norm": 2.4310426686498507, | |
| "learning_rate": 2.8572088939928623e-06, | |
| "loss": 0.3985, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.6789867109634552, | |
| "grad_norm": 2.5154739727394397, | |
| "learning_rate": 2.824518896728386e-06, | |
| "loss": 0.3972, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 0.6810631229235881, | |
| "grad_norm": 2.4239374759188066, | |
| "learning_rate": 2.7919432071267212e-06, | |
| "loss": 0.3986, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.6831395348837209, | |
| "grad_norm": 2.401230714452262, | |
| "learning_rate": 2.759483536832682e-06, | |
| "loss": 0.3961, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 0.6852159468438538, | |
| "grad_norm": 2.3945770626194425, | |
| "learning_rate": 2.7271415913950027e-06, | |
| "loss": 0.3987, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.6872923588039868, | |
| "grad_norm": 2.5083750676716248, | |
| "learning_rate": 2.6949190701767323e-06, | |
| "loss": 0.3987, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 0.6893687707641196, | |
| "grad_norm": 2.359597868105036, | |
| "learning_rate": 2.662817666265932e-06, | |
| "loss": 0.3992, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.6914451827242525, | |
| "grad_norm": 2.3950900870588305, | |
| "learning_rate": 2.6308390663867247e-06, | |
| "loss": 0.3755, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 0.6935215946843853, | |
| "grad_norm": 2.2726643843793783, | |
| "learning_rate": 2.5989849508106663e-06, | |
| "loss": 0.3788, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.6955980066445183, | |
| "grad_norm": 2.3688642141053644, | |
| "learning_rate": 2.5672569932684486e-06, | |
| "loss": 0.3923, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.6976744186046512, | |
| "grad_norm": 2.4674555381530543, | |
| "learning_rate": 2.5356568608619737e-06, | |
| "loss": 0.3784, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.699750830564784, | |
| "grad_norm": 2.348080957902949, | |
| "learning_rate": 2.504186213976736e-06, | |
| "loss": 0.3888, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.7018272425249169, | |
| "grad_norm": 2.2245908133987506, | |
| "learning_rate": 2.4728467061946017e-06, | |
| "loss": 0.383, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.7039036544850499, | |
| "grad_norm": 2.308262964854599, | |
| "learning_rate": 2.441639984206903e-06, | |
| "loss": 0.3873, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.7059800664451827, | |
| "grad_norm": 2.3316191201720726, | |
| "learning_rate": 2.4105676877279376e-06, | |
| "loss": 0.3764, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.7080564784053156, | |
| "grad_norm": 2.2575654898253363, | |
| "learning_rate": 2.379631449408788e-06, | |
| "loss": 0.3857, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.7101328903654485, | |
| "grad_norm": 2.295434521334263, | |
| "learning_rate": 2.3488328947515566e-06, | |
| "loss": 0.3825, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.7122093023255814, | |
| "grad_norm": 2.3045365012329704, | |
| "learning_rate": 2.318173642023939e-06, | |
| "loss": 0.3851, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 2.3117392889776665, | |
| "learning_rate": 2.287655302174208e-06, | |
| "loss": 0.3897, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.7163621262458472, | |
| "grad_norm": 2.422532892044474, | |
| "learning_rate": 2.257279478746564e-06, | |
| "loss": 0.3799, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.71843853820598, | |
| "grad_norm": 2.2839185079742514, | |
| "learning_rate": 2.2270477677968727e-06, | |
| "loss": 0.3703, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.720514950166113, | |
| "grad_norm": 2.7279247585921786, | |
| "learning_rate": 2.196961757808813e-06, | |
| "loss": 0.3794, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.7225913621262459, | |
| "grad_norm": 2.396361579385602, | |
| "learning_rate": 2.167023029610402e-06, | |
| "loss": 0.3642, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.7246677740863787, | |
| "grad_norm": 2.340856081292544, | |
| "learning_rate": 2.1372331562909453e-06, | |
| "loss": 0.372, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.7267441860465116, | |
| "grad_norm": 2.413915292833693, | |
| "learning_rate": 2.1075937031183636e-06, | |
| "loss": 0.3767, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.7288205980066446, | |
| "grad_norm": 2.2094868525489386, | |
| "learning_rate": 2.0781062274569657e-06, | |
| "loss": 0.3713, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.7308970099667774, | |
| "grad_norm": 2.2242377702402663, | |
| "learning_rate": 2.0487722786856107e-06, | |
| "loss": 0.3808, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.7329734219269103, | |
| "grad_norm": 2.451226818715509, | |
| "learning_rate": 2.019593398116292e-06, | |
| "loss": 0.3752, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.7350498338870431, | |
| "grad_norm": 2.5070300923436006, | |
| "learning_rate": 1.990571118913166e-06, | |
| "loss": 0.3754, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.7371262458471761, | |
| "grad_norm": 2.4891905395473963, | |
| "learning_rate": 1.961706966011978e-06, | |
| "loss": 0.3877, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.739202657807309, | |
| "grad_norm": 2.4842650358701905, | |
| "learning_rate": 1.9330024560399507e-06, | |
| "loss": 0.3836, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.7412790697674418, | |
| "grad_norm": 2.250133568783516, | |
| "learning_rate": 1.9044590972360822e-06, | |
| "loss": 0.3725, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.7433554817275747, | |
| "grad_norm": 2.341904795212687, | |
| "learning_rate": 1.876078389371911e-06, | |
| "loss": 0.3679, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.7454318936877077, | |
| "grad_norm": 2.3068998565270746, | |
| "learning_rate": 1.8478618236726992e-06, | |
| "loss": 0.3757, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 0.7475083056478405, | |
| "grad_norm": 2.2619310866276203, | |
| "learning_rate": 1.8198108827390892e-06, | |
| "loss": 0.3742, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.7495847176079734, | |
| "grad_norm": 2.406091048606607, | |
| "learning_rate": 1.791927040469198e-06, | |
| "loss": 0.3805, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 0.7516611295681063, | |
| "grad_norm": 2.3430777426784077, | |
| "learning_rate": 1.7642117619811672e-06, | |
| "loss": 0.3744, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.7537375415282392, | |
| "grad_norm": 2.309496934162411, | |
| "learning_rate": 1.7366665035361947e-06, | |
| "loss": 0.3856, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 0.7558139534883721, | |
| "grad_norm": 2.3680236136606085, | |
| "learning_rate": 1.7092927124620007e-06, | |
| "loss": 0.3747, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.757890365448505, | |
| "grad_norm": 2.3303370070854066, | |
| "learning_rate": 1.682091827076796e-06, | |
| "loss": 0.3724, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.7599667774086378, | |
| "grad_norm": 2.308665058379731, | |
| "learning_rate": 1.6550652766136932e-06, | |
| "loss": 0.3701, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.7620431893687708, | |
| "grad_norm": 2.423141151726278, | |
| "learning_rate": 1.6282144811456196e-06, | |
| "loss": 0.3749, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 0.7641196013289037, | |
| "grad_norm": 2.310790310097539, | |
| "learning_rate": 1.6015408515107e-06, | |
| "loss": 0.3649, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.7661960132890365, | |
| "grad_norm": 2.350953218186428, | |
| "learning_rate": 1.5750457892381183e-06, | |
| "loss": 0.3766, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 0.7682724252491694, | |
| "grad_norm": 2.3685044215677826, | |
| "learning_rate": 1.5487306864744878e-06, | |
| "loss": 0.3626, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.7703488372093024, | |
| "grad_norm": 2.4283396349263384, | |
| "learning_rate": 1.5225969259106909e-06, | |
| "loss": 0.358, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 0.7724252491694352, | |
| "grad_norm": 2.515904865078178, | |
| "learning_rate": 1.4966458807092404e-06, | |
| "loss": 0.3703, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.7745016611295681, | |
| "grad_norm": 2.369156818267499, | |
| "learning_rate": 1.470878914432115e-06, | |
| "loss": 0.3628, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 0.776578073089701, | |
| "grad_norm": 2.3898762463795302, | |
| "learning_rate": 1.4452973809691245e-06, | |
| "loss": 0.3491, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.7786544850498339, | |
| "grad_norm": 2.307405290268551, | |
| "learning_rate": 1.4199026244667636e-06, | |
| "loss": 0.3715, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.7807308970099668, | |
| "grad_norm": 2.3679557325362808, | |
| "learning_rate": 1.3946959792575915e-06, | |
| "loss": 0.3716, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.7828073089700996, | |
| "grad_norm": 2.368304219604154, | |
| "learning_rate": 1.3696787697901131e-06, | |
| "loss": 0.3661, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 0.7848837209302325, | |
| "grad_norm": 2.337789695422565, | |
| "learning_rate": 1.3448523105591976e-06, | |
| "loss": 0.3605, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.7869601328903655, | |
| "grad_norm": 2.611258973451232, | |
| "learning_rate": 1.3202179060370041e-06, | |
| "loss": 0.3699, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 0.7890365448504983, | |
| "grad_norm": 2.437657572808606, | |
| "learning_rate": 1.2957768506044383e-06, | |
| "loss": 0.3651, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.7911129568106312, | |
| "grad_norm": 2.388228690853508, | |
| "learning_rate": 1.2715304284831492e-06, | |
| "loss": 0.3664, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 0.7931893687707641, | |
| "grad_norm": 2.2860587085497235, | |
| "learning_rate": 1.2474799136680394e-06, | |
| "loss": 0.3577, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.795265780730897, | |
| "grad_norm": 2.2178621526275077, | |
| "learning_rate": 1.223626569860339e-06, | |
| "loss": 0.3441, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 0.7973421926910299, | |
| "grad_norm": 2.518415787103085, | |
| "learning_rate": 1.1999716504011917e-06, | |
| "loss": 0.3673, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.7994186046511628, | |
| "grad_norm": 2.359475880122496, | |
| "learning_rate": 1.1765163982058109e-06, | |
| "loss": 0.3567, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.8014950166112956, | |
| "grad_norm": 2.407404285602653, | |
| "learning_rate": 1.1532620456981685e-06, | |
| "loss": 0.3476, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.8035714285714286, | |
| "grad_norm": 2.562334088122669, | |
| "learning_rate": 1.1302098147462348e-06, | |
| "loss": 0.3658, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 0.8056478405315615, | |
| "grad_norm": 2.4467720130350163, | |
| "learning_rate": 1.1073609165977866e-06, | |
| "loss": 0.348, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.8077242524916943, | |
| "grad_norm": 2.3514873698583574, | |
| "learning_rate": 1.0847165518167513e-06, | |
| "loss": 0.3601, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 0.8098006644518272, | |
| "grad_norm": 2.258063143891622, | |
| "learning_rate": 1.062277910220138e-06, | |
| "loss": 0.3548, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.8118770764119602, | |
| "grad_norm": 2.3377988411022246, | |
| "learning_rate": 1.0400461708155095e-06, | |
| "loss": 0.3591, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 0.813953488372093, | |
| "grad_norm": 2.4485426821221004, | |
| "learning_rate": 1.0180225017390416e-06, | |
| "loss": 0.3583, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.8160299003322259, | |
| "grad_norm": 2.3726559534317797, | |
| "learning_rate": 9.962080601941365e-07, | |
| "loss": 0.3426, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 0.8181063122923588, | |
| "grad_norm": 2.2417751776494543, | |
| "learning_rate": 9.746039923906258e-07, | |
| "loss": 0.343, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.8201827242524917, | |
| "grad_norm": 2.5294843157217906, | |
| "learning_rate": 9.532114334845444e-07, | |
| "loss": 0.3664, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.8222591362126246, | |
| "grad_norm": 2.5572851406694235, | |
| "learning_rate": 9.320315075184771e-07, | |
| "loss": 0.3483, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.8243355481727574, | |
| "grad_norm": 2.4014306355585973, | |
| "learning_rate": 9.110653273625103e-07, | |
| "loss": 0.3454, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 0.8264119601328903, | |
| "grad_norm": 2.3699223457500715, | |
| "learning_rate": 8.903139946557437e-07, | |
| "loss": 0.3527, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.8284883720930233, | |
| "grad_norm": 2.4489197804834197, | |
| "learning_rate": 8.697785997484198e-07, | |
| "loss": 0.3535, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 0.8305647840531561, | |
| "grad_norm": 2.4381698669696044, | |
| "learning_rate": 8.494602216446213e-07, | |
| "loss": 0.3522, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.832641196013289, | |
| "grad_norm": 2.373612659548005, | |
| "learning_rate": 8.293599279455838e-07, | |
| "loss": 0.352, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 0.834717607973422, | |
| "grad_norm": 2.5001126967401763, | |
| "learning_rate": 8.094787747935995e-07, | |
| "loss": 0.3533, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.8367940199335548, | |
| "grad_norm": 2.4033229472375637, | |
| "learning_rate": 7.898178068165175e-07, | |
| "loss": 0.3569, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 0.8388704318936877, | |
| "grad_norm": 2.34177766700727, | |
| "learning_rate": 7.703780570728637e-07, | |
| "loss": 0.3485, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.8409468438538206, | |
| "grad_norm": 2.345211689975521, | |
| "learning_rate": 7.511605469975524e-07, | |
| "loss": 0.3541, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.8430232558139535, | |
| "grad_norm": 2.491346976334481, | |
| "learning_rate": 7.321662863482248e-07, | |
| "loss": 0.357, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.8450996677740864, | |
| "grad_norm": 2.4991193300068515, | |
| "learning_rate": 7.133962731521837e-07, | |
| "loss": 0.3504, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 0.8471760797342193, | |
| "grad_norm": 2.4131651786978376, | |
| "learning_rate": 6.948514936539596e-07, | |
| "loss": 0.3413, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.8492524916943521, | |
| "grad_norm": 2.4158508388648046, | |
| "learning_rate": 6.765329222634892e-07, | |
| "loss": 0.3368, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 0.8513289036544851, | |
| "grad_norm": 2.444048773418729, | |
| "learning_rate": 6.584415215049145e-07, | |
| "loss": 0.3478, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.853405315614618, | |
| "grad_norm": 2.3067727734077854, | |
| "learning_rate": 6.405782419660073e-07, | |
| "loss": 0.3539, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 0.8554817275747508, | |
| "grad_norm": 2.389540542776719, | |
| "learning_rate": 6.229440222482258e-07, | |
| "loss": 0.3568, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.8575581395348837, | |
| "grad_norm": 2.490728442827626, | |
| "learning_rate": 6.055397889173947e-07, | |
| "loss": 0.3425, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 0.8596345514950167, | |
| "grad_norm": 2.4309142506564116, | |
| "learning_rate": 5.88366456455019e-07, | |
| "loss": 0.3556, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.8617109634551495, | |
| "grad_norm": 2.577695548294538, | |
| "learning_rate": 5.714249272102368e-07, | |
| "loss": 0.3479, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.8637873754152824, | |
| "grad_norm": 2.3780994980865513, | |
| "learning_rate": 5.547160913524024e-07, | |
| "loss": 0.3407, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.8658637873754153, | |
| "grad_norm": 2.3471940728385645, | |
| "learning_rate": 5.382408268243194e-07, | |
| "loss": 0.327, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 0.8679401993355482, | |
| "grad_norm": 2.5308209588235964, | |
| "learning_rate": 5.219999992961044e-07, | |
| "loss": 0.3486, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.8700166112956811, | |
| "grad_norm": 2.347529844497377, | |
| "learning_rate": 5.05994462119705e-07, | |
| "loss": 0.3507, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 0.872093023255814, | |
| "grad_norm": 2.4490768218202428, | |
| "learning_rate": 4.902250562840622e-07, | |
| "loss": 0.3484, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.8741694352159468, | |
| "grad_norm": 2.4607053819399227, | |
| "learning_rate": 4.7469261037091765e-07, | |
| "loss": 0.355, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 0.8762458471760798, | |
| "grad_norm": 2.37905091425431, | |
| "learning_rate": 4.5939794051128363e-07, | |
| "loss": 0.3544, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.8783222591362126, | |
| "grad_norm": 2.3898177002048397, | |
| "learning_rate": 4.443418503425517e-07, | |
| "loss": 0.3459, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 0.8803986710963455, | |
| "grad_norm": 2.3945638825763336, | |
| "learning_rate": 4.295251309662768e-07, | |
| "loss": 0.3475, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.8824750830564784, | |
| "grad_norm": 2.376437633901908, | |
| "learning_rate": 4.149485609066001e-07, | |
| "loss": 0.3448, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.8845514950166113, | |
| "grad_norm": 2.4682795986451884, | |
| "learning_rate": 4.0061290606935145e-07, | |
| "loss": 0.3501, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.8866279069767442, | |
| "grad_norm": 2.307696986215917, | |
| "learning_rate": 3.8651891970179876e-07, | |
| "loss": 0.3509, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 0.8887043189368771, | |
| "grad_norm": 2.2638655900879323, | |
| "learning_rate": 3.7266734235307357e-07, | |
| "loss": 0.3494, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.8907807308970099, | |
| "grad_norm": 2.4074516319355865, | |
| "learning_rate": 3.5905890183525916e-07, | |
| "loss": 0.3381, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 0.8928571428571429, | |
| "grad_norm": 2.4580735039851263, | |
| "learning_rate": 3.4569431318514647e-07, | |
| "loss": 0.3506, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.8949335548172758, | |
| "grad_norm": 2.223651003352099, | |
| "learning_rate": 3.3257427862666894e-07, | |
| "loss": 0.3426, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 0.8970099667774086, | |
| "grad_norm": 2.5240054200803925, | |
| "learning_rate": 3.196994875339976e-07, | |
| "loss": 0.3394, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.8990863787375415, | |
| "grad_norm": 2.650418412385108, | |
| "learning_rate": 3.0707061639532687e-07, | |
| "loss": 0.3469, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 0.9011627906976745, | |
| "grad_norm": 2.5283079967315256, | |
| "learning_rate": 2.946883287773211e-07, | |
| "loss": 0.3572, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.9032392026578073, | |
| "grad_norm": 2.482824449172331, | |
| "learning_rate": 2.82553275290256e-07, | |
| "loss": 0.3469, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 0.9053156146179402, | |
| "grad_norm": 2.42162117653704, | |
| "learning_rate": 2.706660935538297e-07, | |
| "loss": 0.3522, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.907392026578073, | |
| "grad_norm": 2.610628055343181, | |
| "learning_rate": 2.590274081636568e-07, | |
| "loss": 0.3326, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 0.909468438538206, | |
| "grad_norm": 2.337754822501405, | |
| "learning_rate": 2.476378306584576e-07, | |
| "loss": 0.3472, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.9115448504983389, | |
| "grad_norm": 2.422013772805342, | |
| "learning_rate": 2.3649795948791744e-07, | |
| "loss": 0.3291, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 0.9136212624584718, | |
| "grad_norm": 2.5260012444754865, | |
| "learning_rate": 2.2560837998124862e-07, | |
| "loss": 0.3443, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.9156976744186046, | |
| "grad_norm": 2.5167784300702203, | |
| "learning_rate": 2.1496966431642895e-07, | |
| "loss": 0.344, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 0.9177740863787376, | |
| "grad_norm": 2.5184080924547976, | |
| "learning_rate": 2.0458237149014347e-07, | |
| "loss": 0.3431, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.9198504983388704, | |
| "grad_norm": 2.6121850478268915, | |
| "learning_rate": 1.944470472884097e-07, | |
| "loss": 0.3469, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 0.9219269102990033, | |
| "grad_norm": 2.4250182138955987, | |
| "learning_rate": 1.8456422425789822e-07, | |
| "loss": 0.346, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.9240033222591362, | |
| "grad_norm": 2.4126854578567056, | |
| "learning_rate": 1.7493442167795526e-07, | |
| "loss": 0.3394, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 0.9260797342192691, | |
| "grad_norm": 2.2732400743037546, | |
| "learning_rate": 1.6555814553331328e-07, | |
| "loss": 0.3474, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.928156146179402, | |
| "grad_norm": 2.4576436036196867, | |
| "learning_rate": 1.5643588848750944e-07, | |
| "loss": 0.3455, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 0.9302325581395349, | |
| "grad_norm": 2.417373647969096, | |
| "learning_rate": 1.4756812985699364e-07, | |
| "loss": 0.3389, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.9323089700996677, | |
| "grad_norm": 2.314864797926019, | |
| "learning_rate": 1.3895533558594853e-07, | |
| "loss": 0.3307, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 0.9343853820598007, | |
| "grad_norm": 2.4942438872944375, | |
| "learning_rate": 1.305979582218042e-07, | |
| "loss": 0.3413, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.9364617940199336, | |
| "grad_norm": 2.4271492623044733, | |
| "learning_rate": 1.224964368914622e-07, | |
| "loss": 0.3533, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 0.9385382059800664, | |
| "grad_norm": 2.404072393255019, | |
| "learning_rate": 1.1465119727821828e-07, | |
| "loss": 0.3388, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.9406146179401993, | |
| "grad_norm": 2.4291366569357233, | |
| "learning_rate": 1.0706265159939944e-07, | |
| "loss": 0.329, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 0.9426910299003323, | |
| "grad_norm": 2.370319609790916, | |
| "learning_rate": 9.973119858470326e-08, | |
| "loss": 0.3435, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.9447674418604651, | |
| "grad_norm": 2.612518036597659, | |
| "learning_rate": 9.265722345524475e-08, | |
| "loss": 0.3544, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 0.946843853820598, | |
| "grad_norm": 2.325383175606347, | |
| "learning_rate": 8.584109790331918e-08, | |
| "loss": 0.334, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.9489202657807309, | |
| "grad_norm": 2.483650038896797, | |
| "learning_rate": 7.92831800728705e-08, | |
| "loss": 0.3495, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 0.9509966777408638, | |
| "grad_norm": 2.3917415303858323, | |
| "learning_rate": 7.29838145406725e-08, | |
| "loss": 0.3525, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.9530730897009967, | |
| "grad_norm": 2.3491361755345297, | |
| "learning_rate": 6.69433322982238e-08, | |
| "loss": 0.3261, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 0.9551495016611296, | |
| "grad_norm": 2.395546616132027, | |
| "learning_rate": 6.116205073435632e-08, | |
| "loss": 0.3572, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.9572259136212624, | |
| "grad_norm": 2.38127790933904, | |
| "learning_rate": 5.5640273618560724e-08, | |
| "loss": 0.3477, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 0.9593023255813954, | |
| "grad_norm": 2.437345577309693, | |
| "learning_rate": 5.0378291085020905e-08, | |
| "loss": 0.3498, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.9613787375415282, | |
| "grad_norm": 2.3743576771009125, | |
| "learning_rate": 4.537637961737285e-08, | |
| "loss": 0.3537, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 0.9634551495016611, | |
| "grad_norm": 2.4186159686143816, | |
| "learning_rate": 4.063480203417625e-08, | |
| "loss": 0.3491, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.965531561461794, | |
| "grad_norm": 2.43658746364112, | |
| "learning_rate": 3.6153807475103886e-08, | |
| "loss": 0.3372, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 0.967607973421927, | |
| "grad_norm": 2.433100952556644, | |
| "learning_rate": 3.1933631387853215e-08, | |
| "loss": 0.34, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.9696843853820598, | |
| "grad_norm": 2.3533082714101288, | |
| "learning_rate": 2.7974495515772915e-08, | |
| "loss": 0.3478, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 0.9717607973421927, | |
| "grad_norm": 2.3042817476032296, | |
| "learning_rate": 2.427660788621222e-08, | |
| "loss": 0.3522, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.9738372093023255, | |
| "grad_norm": 2.5758246509298184, | |
| "learning_rate": 2.0840162799591335e-08, | |
| "loss": 0.3518, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 0.9759136212624585, | |
| "grad_norm": 2.2839484862848254, | |
| "learning_rate": 1.7665340819192356e-08, | |
| "loss": 0.3412, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.9779900332225914, | |
| "grad_norm": 2.3825796160738184, | |
| "learning_rate": 1.475230876166911e-08, | |
| "loss": 0.3484, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 0.9800664451827242, | |
| "grad_norm": 2.436946151591597, | |
| "learning_rate": 1.2101219688285815e-08, | |
| "loss": 0.3406, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.9821428571428571, | |
| "grad_norm": 2.3004422772721385, | |
| "learning_rate": 9.712212896871854e-09, | |
| "loss": 0.3483, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 0.9842192691029901, | |
| "grad_norm": 2.243191260776767, | |
| "learning_rate": 7.585413914503182e-09, | |
| "loss": 0.3279, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.9862956810631229, | |
| "grad_norm": 2.4579139874339213, | |
| "learning_rate": 5.720934490907604e-09, | |
| "loss": 0.3539, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 0.9883720930232558, | |
| "grad_norm": 2.2560463637497885, | |
| "learning_rate": 4.118872592592804e-09, | |
| "loss": 0.3376, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.9904485049833887, | |
| "grad_norm": 2.4374426699588327, | |
| "learning_rate": 2.7793123976976866e-09, | |
| "loss": 0.337, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 0.9925249169435216, | |
| "grad_norm": 2.4021295260594466, | |
| "learning_rate": 1.7023242915703563e-09, | |
| "loss": 0.3422, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.9946013289036545, | |
| "grad_norm": 2.3647029145641847, | |
| "learning_rate": 8.879648630705229e-10, | |
| "loss": 0.3402, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 0.9966777408637874, | |
| "grad_norm": 2.377691718973852, | |
| "learning_rate": 3.362769015941014e-10, | |
| "loss": 0.3437, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.9987541528239202, | |
| "grad_norm": 2.277353937646912, | |
| "learning_rate": 4.7289394825567046e-11, | |
| "loss": 0.3486, | |
| "step": 2405 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_runtime": 3.4135, | |
| "eval_samples_per_second": 2.93, | |
| "eval_steps_per_second": 0.879, | |
| "step": 2408 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 2408, | |
| "total_flos": 252093105438720.0, | |
| "train_loss": 0.0, | |
| "train_runtime": 0.0085, | |
| "train_samples_per_second": 4527521.09, | |
| "train_steps_per_second": 283028.837 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2408, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 252093105438720.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
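
The JSON above is the complete trainer state, so the `log_history` list can be inspected programmatically. Below is a minimal sketch (not part of the original file) of how one might plot the loss and learning-rate curves recorded in it; it assumes the JSON has been saved as a plain `trainer_state.json` file and that matplotlib is available, both of which are assumptions for illustration rather than anything specified by this log.

```python
# Minimal sketch: load a Hugging Face trainer_state.json and plot its log_history.
# The path "trainer_state.json" and the use of matplotlib are illustrative assumptions.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the periodic training records (they carry both "loss" and "step");
# this skips the eval entry and the final summary block, which lack "loss".
train_logs = [e for e in state["log_history"] if "loss" in e and "step" in e]

steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]
lrs = [e["learning_rate"] for e in train_logs]

fig, (ax_loss, ax_lr) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(steps, losses)
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_xlabel("step")
ax_lr.set_ylabel("learning rate")
fig.tight_layout()
plt.show()
```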