{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 76.92307692307692, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.038461538461538464, "grad_norm": 0.8129774928092957, "learning_rate": 2.7e-07, "loss": 1.3508, "step": 10 }, { "epoch": 0.07692307692307693, "grad_norm": 0.6433653235435486, "learning_rate": 5.7e-07, "loss": 1.3552, "step": 20 }, { "epoch": 0.11538461538461539, "grad_norm": 0.6810219883918762, "learning_rate": 8.7e-07, "loss": 1.3478, "step": 30 }, { "epoch": 0.15384615384615385, "grad_norm": 0.6231173276901245, "learning_rate": 1.17e-06, "loss": 1.3472, "step": 40 }, { "epoch": 0.19230769230769232, "grad_norm": 0.829264223575592, "learning_rate": 1.4700000000000001e-06, "loss": 1.3416, "step": 50 }, { "epoch": 0.23076923076923078, "grad_norm": 0.9087441563606262, "learning_rate": 1.77e-06, "loss": 1.3256, "step": 60 }, { "epoch": 0.2692307692307692, "grad_norm": 1.0764439105987549, "learning_rate": 2.07e-06, "loss": 1.3159, "step": 70 }, { "epoch": 0.3076923076923077, "grad_norm": 1.0241045951843262, "learning_rate": 2.37e-06, "loss": 1.3131, "step": 80 }, { "epoch": 0.34615384615384615, "grad_norm": 1.0223745107650757, "learning_rate": 2.67e-06, "loss": 1.2953, "step": 90 }, { "epoch": 0.38461538461538464, "grad_norm": 1.3292489051818848, "learning_rate": 2.9700000000000004e-06, "loss": 1.2714, "step": 100 }, { "epoch": 0.4230769230769231, "grad_norm": 1.2786927223205566, "learning_rate": 3.27e-06, "loss": 1.2593, "step": 110 }, { "epoch": 0.46153846153846156, "grad_norm": 1.0608361959457397, "learning_rate": 3.57e-06, "loss": 1.2464, "step": 120 }, { "epoch": 0.5, "grad_norm": 0.9192166328430176, "learning_rate": 3.87e-06, "loss": 1.2222, "step": 130 }, { "epoch": 0.5384615384615384, "grad_norm": 0.8084186315536499, "learning_rate": 4.170000000000001e-06, "loss": 1.2077, "step": 140 }, { "epoch": 0.5769230769230769, "grad_norm": 0.8090039491653442, "learning_rate": 4.4699999999999996e-06, "loss": 1.1963, "step": 150 }, { "epoch": 0.6153846153846154, "grad_norm": 0.7544081807136536, "learning_rate": 4.77e-06, "loss": 1.1736, "step": 160 }, { "epoch": 0.6538461538461539, "grad_norm": 0.7331106662750244, "learning_rate": 5.070000000000001e-06, "loss": 1.1634, "step": 170 }, { "epoch": 0.6923076923076923, "grad_norm": 0.8381819128990173, "learning_rate": 5.37e-06, "loss": 1.154, "step": 180 }, { "epoch": 0.7307692307692307, "grad_norm": 0.8138782382011414, "learning_rate": 5.67e-06, "loss": 1.1325, "step": 190 }, { "epoch": 0.7692307692307693, "grad_norm": 0.7330286502838135, "learning_rate": 5.9700000000000004e-06, "loss": 1.1344, "step": 200 }, { "epoch": 0.8076923076923077, "grad_norm": 0.8166984915733337, "learning_rate": 6.27e-06, "loss": 1.1302, "step": 210 }, { "epoch": 0.8461538461538461, "grad_norm": 0.7504987120628357, "learning_rate": 6.57e-06, "loss": 1.1089, "step": 220 }, { "epoch": 0.8846153846153846, "grad_norm": 0.7398339509963989, "learning_rate": 6.87e-06, "loss": 1.0961, "step": 230 }, { "epoch": 0.9230769230769231, "grad_norm": 0.6914257407188416, "learning_rate": 7.17e-06, "loss": 1.0939, "step": 240 }, { "epoch": 0.9615384615384616, "grad_norm": 0.753593921661377, "learning_rate": 7.4700000000000005e-06, "loss": 1.0844, "step": 250 }, { "epoch": 1.0, "grad_norm": 0.775452733039856, "learning_rate": 7.77e-06, "loss": 1.0757, "step": 260 }, { "epoch": 1.0384615384615385, "grad_norm": 0.8942003846168518, "learning_rate": 8.07e-06, "loss": 1.0727, "step": 270 }, { "epoch": 1.0769230769230769, "grad_norm": 0.8823027610778809, "learning_rate": 8.370000000000001e-06, "loss": 1.073, "step": 280 }, { "epoch": 1.1153846153846154, "grad_norm": 0.7825962901115417, "learning_rate": 8.67e-06, "loss": 1.0681, "step": 290 }, { "epoch": 1.1538461538461537, "grad_norm": 1.042375087738037, "learning_rate": 8.97e-06, "loss": 1.0604, "step": 300 }, { "epoch": 1.1923076923076923, "grad_norm": 0.9922687411308289, "learning_rate": 9.27e-06, "loss": 1.0616, "step": 310 }, { "epoch": 1.2307692307692308, "grad_norm": 0.8586735129356384, "learning_rate": 9.57e-06, "loss": 1.0532, "step": 320 }, { "epoch": 1.2692307692307692, "grad_norm": 0.9278848171234131, "learning_rate": 9.87e-06, "loss": 1.0478, "step": 330 }, { "epoch": 1.3076923076923077, "grad_norm": 0.8375242948532104, "learning_rate": 1.0170000000000001e-05, "loss": 1.0513, "step": 340 }, { "epoch": 1.3461538461538463, "grad_norm": 0.7845088839530945, "learning_rate": 1.047e-05, "loss": 1.0438, "step": 350 }, { "epoch": 1.3846153846153846, "grad_norm": 0.9647091031074524, "learning_rate": 1.077e-05, "loss": 1.0285, "step": 360 }, { "epoch": 1.4230769230769231, "grad_norm": 1.1311838626861572, "learning_rate": 1.107e-05, "loss": 1.023, "step": 370 }, { "epoch": 1.4615384615384617, "grad_norm": 1.1281001567840576, "learning_rate": 1.137e-05, "loss": 1.0075, "step": 380 }, { "epoch": 1.5, "grad_norm": 1.461525321006775, "learning_rate": 1.167e-05, "loss": 0.9779, "step": 390 }, { "epoch": 1.5384615384615383, "grad_norm": 1.534185767173767, "learning_rate": 1.197e-05, "loss": 0.9208, "step": 400 }, { "epoch": 1.5769230769230769, "grad_norm": 1.9967076778411865, "learning_rate": 1.227e-05, "loss": 0.8734, "step": 410 }, { "epoch": 1.6153846153846154, "grad_norm": 1.6220159530639648, "learning_rate": 1.257e-05, "loss": 0.8197, "step": 420 }, { "epoch": 1.6538461538461537, "grad_norm": 1.832636833190918, "learning_rate": 1.287e-05, "loss": 0.771, "step": 430 }, { "epoch": 1.6923076923076923, "grad_norm": 2.2742176055908203, "learning_rate": 1.3170000000000001e-05, "loss": 0.7211, "step": 440 }, { "epoch": 1.7307692307692308, "grad_norm": 1.9380594491958618, "learning_rate": 1.3470000000000001e-05, "loss": 0.6881, "step": 450 }, { "epoch": 1.7692307692307692, "grad_norm": 2.589594602584839, "learning_rate": 1.377e-05, "loss": 0.6551, "step": 460 }, { "epoch": 1.8076923076923077, "grad_norm": 2.73272442817688, "learning_rate": 1.4069999999999999e-05, "loss": 0.64, "step": 470 }, { "epoch": 1.8461538461538463, "grad_norm": 2.789524555206299, "learning_rate": 1.437e-05, "loss": 0.6142, "step": 480 }, { "epoch": 1.8846153846153846, "grad_norm": 2.5425634384155273, "learning_rate": 1.467e-05, "loss": 0.5954, "step": 490 }, { "epoch": 1.9230769230769231, "grad_norm": 2.916409969329834, "learning_rate": 1.497e-05, "loss": 0.574, "step": 500 }, { "epoch": 1.9615384615384617, "grad_norm": 2.5941741466522217, "learning_rate": 1.527e-05, "loss": 0.5612, "step": 510 }, { "epoch": 2.0, "grad_norm": 3.0018510818481445, "learning_rate": 1.5570000000000002e-05, "loss": 0.545, "step": 520 }, { "epoch": 2.0384615384615383, "grad_norm": 2.7710626125335693, "learning_rate": 1.5870000000000002e-05, "loss": 0.5413, "step": 530 }, { "epoch": 2.076923076923077, "grad_norm": 3.6336352825164795, "learning_rate": 1.6170000000000003e-05, "loss": 0.5231, "step": 540 }, { "epoch": 2.1153846153846154, "grad_norm": 3.880500555038452, "learning_rate": 1.6470000000000003e-05, "loss": 0.5153, "step": 550 }, { "epoch": 2.1538461538461537, "grad_norm": 3.505535840988159, "learning_rate": 1.677e-05, "loss": 0.5065, "step": 560 }, { "epoch": 2.1923076923076925, "grad_norm": 3.137913465499878, "learning_rate": 1.7069999999999998e-05, "loss": 0.4962, "step": 570 }, { "epoch": 2.230769230769231, "grad_norm": 3.880664110183716, "learning_rate": 1.7369999999999998e-05, "loss": 0.4966, "step": 580 }, { "epoch": 2.269230769230769, "grad_norm": 3.5915846824645996, "learning_rate": 1.767e-05, "loss": 0.498, "step": 590 }, { "epoch": 2.3076923076923075, "grad_norm": 3.1810109615325928, "learning_rate": 1.797e-05, "loss": 0.4988, "step": 600 }, { "epoch": 2.3461538461538463, "grad_norm": 3.6019811630249023, "learning_rate": 1.827e-05, "loss": 0.4974, "step": 610 }, { "epoch": 2.3846153846153846, "grad_norm": 3.5532901287078857, "learning_rate": 1.857e-05, "loss": 0.4819, "step": 620 }, { "epoch": 2.423076923076923, "grad_norm": 3.921673536300659, "learning_rate": 1.887e-05, "loss": 0.4798, "step": 630 }, { "epoch": 2.4615384615384617, "grad_norm": 3.2884128093719482, "learning_rate": 1.917e-05, "loss": 0.4826, "step": 640 }, { "epoch": 2.5, "grad_norm": 3.231353759765625, "learning_rate": 1.947e-05, "loss": 0.4825, "step": 650 }, { "epoch": 2.5384615384615383, "grad_norm": 4.012421131134033, "learning_rate": 1.9770000000000002e-05, "loss": 0.4782, "step": 660 }, { "epoch": 2.5769230769230766, "grad_norm": 3.7715535163879395, "learning_rate": 2.0070000000000003e-05, "loss": 0.4854, "step": 670 }, { "epoch": 2.6153846153846154, "grad_norm": 3.2601428031921387, "learning_rate": 2.0370000000000003e-05, "loss": 0.4762, "step": 680 }, { "epoch": 2.6538461538461537, "grad_norm": 3.3098268508911133, "learning_rate": 2.067e-05, "loss": 0.4699, "step": 690 }, { "epoch": 2.6923076923076925, "grad_norm": 3.7813925743103027, "learning_rate": 2.097e-05, "loss": 0.4739, "step": 700 }, { "epoch": 2.730769230769231, "grad_norm": 3.242644786834717, "learning_rate": 2.1269999999999998e-05, "loss": 0.476, "step": 710 }, { "epoch": 2.769230769230769, "grad_norm": 3.092524766921997, "learning_rate": 2.157e-05, "loss": 0.4687, "step": 720 }, { "epoch": 2.8076923076923075, "grad_norm": 4.034473896026611, "learning_rate": 2.187e-05, "loss": 0.4655, "step": 730 }, { "epoch": 2.8461538461538463, "grad_norm": 3.64107084274292, "learning_rate": 2.217e-05, "loss": 0.4637, "step": 740 }, { "epoch": 2.8846153846153846, "grad_norm": 4.293366432189941, "learning_rate": 2.247e-05, "loss": 0.4662, "step": 750 }, { "epoch": 2.9230769230769234, "grad_norm": 3.5577127933502197, "learning_rate": 2.277e-05, "loss": 0.4571, "step": 760 }, { "epoch": 2.9615384615384617, "grad_norm": 3.0843167304992676, "learning_rate": 2.307e-05, "loss": 0.4625, "step": 770 }, { "epoch": 3.0, "grad_norm": 3.5912671089172363, "learning_rate": 2.337e-05, "loss": 0.457, "step": 780 }, { "epoch": 3.0384615384615383, "grad_norm": 4.415438175201416, "learning_rate": 2.3670000000000002e-05, "loss": 0.4594, "step": 790 }, { "epoch": 3.076923076923077, "grad_norm": 3.8929693698883057, "learning_rate": 2.3970000000000003e-05, "loss": 0.4613, "step": 800 }, { "epoch": 3.1153846153846154, "grad_norm": 4.137643814086914, "learning_rate": 2.4270000000000003e-05, "loss": 0.4628, "step": 810 }, { "epoch": 3.1538461538461537, "grad_norm": 3.0388357639312744, "learning_rate": 2.457e-05, "loss": 0.457, "step": 820 }, { "epoch": 3.1923076923076925, "grad_norm": 3.6971359252929688, "learning_rate": 2.487e-05, "loss": 0.4546, "step": 830 }, { "epoch": 3.230769230769231, "grad_norm": 3.4712138175964355, "learning_rate": 2.517e-05, "loss": 0.4511, "step": 840 }, { "epoch": 3.269230769230769, "grad_norm": 3.3456554412841797, "learning_rate": 2.547e-05, "loss": 0.4505, "step": 850 }, { "epoch": 3.3076923076923075, "grad_norm": 3.5414817333221436, "learning_rate": 2.577e-05, "loss": 0.4545, "step": 860 }, { "epoch": 3.3461538461538463, "grad_norm": 3.727144241333008, "learning_rate": 2.607e-05, "loss": 0.4537, "step": 870 }, { "epoch": 3.3846153846153846, "grad_norm": 3.6448793411254883, "learning_rate": 2.637e-05, "loss": 0.4434, "step": 880 }, { "epoch": 3.423076923076923, "grad_norm": 3.6482105255126953, "learning_rate": 2.667e-05, "loss": 0.4438, "step": 890 }, { "epoch": 3.4615384615384617, "grad_norm": 3.7402617931365967, "learning_rate": 2.697e-05, "loss": 0.4499, "step": 900 }, { "epoch": 3.5, "grad_norm": 3.1410586833953857, "learning_rate": 2.727e-05, "loss": 0.4423, "step": 910 }, { "epoch": 3.5384615384615383, "grad_norm": 4.063307762145996, "learning_rate": 2.7570000000000002e-05, "loss": 0.4476, "step": 920 }, { "epoch": 3.5769230769230766, "grad_norm": 3.781724214553833, "learning_rate": 2.7870000000000003e-05, "loss": 0.4445, "step": 930 }, { "epoch": 3.6153846153846154, "grad_norm": 4.0924787521362305, "learning_rate": 2.817e-05, "loss": 0.4449, "step": 940 }, { "epoch": 3.6538461538461537, "grad_norm": 3.8442327976226807, "learning_rate": 2.847e-05, "loss": 0.452, "step": 950 }, { "epoch": 3.6923076923076925, "grad_norm": 3.5104334354400635, "learning_rate": 2.877e-05, "loss": 0.4363, "step": 960 }, { "epoch": 3.730769230769231, "grad_norm": 4.416885852813721, "learning_rate": 2.907e-05, "loss": 0.4423, "step": 970 }, { "epoch": 3.769230769230769, "grad_norm": 3.5241246223449707, "learning_rate": 2.9370000000000002e-05, "loss": 0.4426, "step": 980 }, { "epoch": 3.8076923076923075, "grad_norm": 3.1697614192962646, "learning_rate": 2.967e-05, "loss": 0.4437, "step": 990 }, { "epoch": 3.8461538461538463, "grad_norm": 3.7319610118865967, "learning_rate": 2.997e-05, "loss": 0.4419, "step": 1000 }, { "epoch": 3.8846153846153846, "grad_norm": 4.392916679382324, "learning_rate": 2.9999983391181253e-05, "loss": 0.4371, "step": 1010 }, { "epoch": 3.9230769230769234, "grad_norm": 3.606473207473755, "learning_rate": 2.9999925978027876e-05, "loss": 0.4403, "step": 1020 }, { "epoch": 3.9615384615384617, "grad_norm": 3.2601609230041504, "learning_rate": 2.9999827555649637e-05, "loss": 0.4357, "step": 1030 }, { "epoch": 4.0, "grad_norm": 3.616961717605591, "learning_rate": 2.999968812431563e-05, "loss": 0.4394, "step": 1040 }, { "epoch": 4.038461538461538, "grad_norm": 4.0653204917907715, "learning_rate": 2.999950768440706e-05, "loss": 0.435, "step": 1050 }, { "epoch": 4.076923076923077, "grad_norm": 4.050017833709717, "learning_rate": 2.999928623641723e-05, "loss": 0.4448, "step": 1060 }, { "epoch": 4.115384615384615, "grad_norm": 3.190156936645508, "learning_rate": 2.9999023780951575e-05, "loss": 0.4356, "step": 1070 }, { "epoch": 4.153846153846154, "grad_norm": 3.323148250579834, "learning_rate": 2.999872031872764e-05, "loss": 0.4384, "step": 1080 }, { "epoch": 4.1923076923076925, "grad_norm": 3.506814479827881, "learning_rate": 2.999837585057508e-05, "loss": 0.4287, "step": 1090 }, { "epoch": 4.230769230769231, "grad_norm": 4.119077205657959, "learning_rate": 2.999799037743565e-05, "loss": 0.427, "step": 1100 }, { "epoch": 4.269230769230769, "grad_norm": 3.2304561138153076, "learning_rate": 2.999756390036323e-05, "loss": 0.4326, "step": 1110 }, { "epoch": 4.3076923076923075, "grad_norm": 3.1747395992279053, "learning_rate": 2.9997096420523788e-05, "loss": 0.4325, "step": 1120 }, { "epoch": 4.346153846153846, "grad_norm": 3.7520697116851807, "learning_rate": 2.9996587939195395e-05, "loss": 0.4372, "step": 1130 }, { "epoch": 4.384615384615385, "grad_norm": 3.395843744277954, "learning_rate": 2.999603845776822e-05, "loss": 0.4344, "step": 1140 }, { "epoch": 4.423076923076923, "grad_norm": 4.427037715911865, "learning_rate": 2.999544797774452e-05, "loss": 0.4342, "step": 1150 }, { "epoch": 4.461538461538462, "grad_norm": 3.310183525085449, "learning_rate": 2.9994816500738648e-05, "loss": 0.433, "step": 1160 }, { "epoch": 4.5, "grad_norm": 3.460261583328247, "learning_rate": 2.999414402847704e-05, "loss": 0.4279, "step": 1170 }, { "epoch": 4.538461538461538, "grad_norm": 3.523247718811035, "learning_rate": 2.999343056279821e-05, "loss": 0.4281, "step": 1180 }, { "epoch": 4.576923076923077, "grad_norm": 3.0562286376953125, "learning_rate": 2.9992676105652746e-05, "loss": 0.4281, "step": 1190 }, { "epoch": 4.615384615384615, "grad_norm": 3.179258108139038, "learning_rate": 2.9991880659103298e-05, "loss": 0.4321, "step": 1200 }, { "epoch": 4.653846153846154, "grad_norm": 3.0872421264648438, "learning_rate": 2.9991044225324593e-05, "loss": 0.4255, "step": 1210 }, { "epoch": 4.6923076923076925, "grad_norm": 3.896038293838501, "learning_rate": 2.9990166806603407e-05, "loss": 0.4221, "step": 1220 }, { "epoch": 4.730769230769231, "grad_norm": 3.345339775085449, "learning_rate": 2.9989248405338573e-05, "loss": 0.4236, "step": 1230 }, { "epoch": 4.769230769230769, "grad_norm": 3.4517908096313477, "learning_rate": 2.9988289024040962e-05, "loss": 0.419, "step": 1240 }, { "epoch": 4.8076923076923075, "grad_norm": 3.4892146587371826, "learning_rate": 2.998728866533348e-05, "loss": 0.4235, "step": 1250 }, { "epoch": 4.846153846153846, "grad_norm": 3.3247246742248535, "learning_rate": 2.9986247331951083e-05, "loss": 0.4143, "step": 1260 }, { "epoch": 4.884615384615385, "grad_norm": 3.2198266983032227, "learning_rate": 2.998516502674072e-05, "loss": 0.4167, "step": 1270 }, { "epoch": 4.923076923076923, "grad_norm": 3.2690086364746094, "learning_rate": 2.9984041752661386e-05, "loss": 0.4142, "step": 1280 }, { "epoch": 4.961538461538462, "grad_norm": 3.7173590660095215, "learning_rate": 2.9982877512784067e-05, "loss": 0.4141, "step": 1290 }, { "epoch": 5.0, "grad_norm": 4.059051990509033, "learning_rate": 2.998167231029174e-05, "loss": 0.4073, "step": 1300 }, { "epoch": 5.038461538461538, "grad_norm": 3.8667192459106445, "learning_rate": 2.99804261484794e-05, "loss": 0.4082, "step": 1310 }, { "epoch": 5.076923076923077, "grad_norm": 3.3809549808502197, "learning_rate": 2.997913903075399e-05, "loss": 0.408, "step": 1320 }, { "epoch": 5.115384615384615, "grad_norm": 3.0045559406280518, "learning_rate": 2.997781096063445e-05, "loss": 0.4067, "step": 1330 }, { "epoch": 5.153846153846154, "grad_norm": 3.27519154548645, "learning_rate": 2.9976441941751663e-05, "loss": 0.4023, "step": 1340 }, { "epoch": 5.1923076923076925, "grad_norm": 3.353156089782715, "learning_rate": 2.997503197784849e-05, "loss": 0.4034, "step": 1350 }, { "epoch": 5.230769230769231, "grad_norm": 3.568828821182251, "learning_rate": 2.9973581072779702e-05, "loss": 0.4025, "step": 1360 }, { "epoch": 5.269230769230769, "grad_norm": 3.06388258934021, "learning_rate": 2.9972089230512035e-05, "loss": 0.3948, "step": 1370 }, { "epoch": 5.3076923076923075, "grad_norm": 3.4603991508483887, "learning_rate": 2.997055645512411e-05, "loss": 0.4045, "step": 1380 }, { "epoch": 5.346153846153846, "grad_norm": 2.7319891452789307, "learning_rate": 2.9968982750806492e-05, "loss": 0.3956, "step": 1390 }, { "epoch": 5.384615384615385, "grad_norm": 3.0891315937042236, "learning_rate": 2.9967368121861623e-05, "loss": 0.4019, "step": 1400 }, { "epoch": 5.423076923076923, "grad_norm": 3.29032039642334, "learning_rate": 2.9965712572703834e-05, "loss": 0.3962, "step": 1410 }, { "epoch": 5.461538461538462, "grad_norm": 3.2606821060180664, "learning_rate": 2.996401610785934e-05, "loss": 0.4016, "step": 1420 }, { "epoch": 5.5, "grad_norm": 3.013518810272217, "learning_rate": 2.99622787319662e-05, "loss": 0.398, "step": 1430 }, { "epoch": 5.538461538461538, "grad_norm": 2.9983019828796387, "learning_rate": 2.9960500449774338e-05, "loss": 0.4002, "step": 1440 }, { "epoch": 5.576923076923077, "grad_norm": 3.4363009929656982, "learning_rate": 2.9958681266145517e-05, "loss": 0.4017, "step": 1450 }, { "epoch": 5.615384615384615, "grad_norm": 3.917628049850464, "learning_rate": 2.995682118605331e-05, "loss": 0.3938, "step": 1460 }, { "epoch": 5.653846153846154, "grad_norm": 2.4763236045837402, "learning_rate": 2.9954920214583107e-05, "loss": 0.3976, "step": 1470 }, { "epoch": 5.6923076923076925, "grad_norm": 3.2113523483276367, "learning_rate": 2.9952978356932084e-05, "loss": 0.3984, "step": 1480 }, { "epoch": 5.730769230769231, "grad_norm": 3.3912851810455322, "learning_rate": 2.9950995618409215e-05, "loss": 0.3925, "step": 1490 }, { "epoch": 5.769230769230769, "grad_norm": 2.927689790725708, "learning_rate": 2.9948972004435228e-05, "loss": 0.4036, "step": 1500 }, { "epoch": 5.8076923076923075, "grad_norm": 3.0390853881835938, "learning_rate": 2.9946907520542602e-05, "loss": 0.3954, "step": 1510 }, { "epoch": 5.846153846153846, "grad_norm": 3.3175177574157715, "learning_rate": 2.9944802172375566e-05, "loss": 0.3984, "step": 1520 }, { "epoch": 5.884615384615385, "grad_norm": 3.478698968887329, "learning_rate": 2.9942655965690053e-05, "loss": 0.3971, "step": 1530 }, { "epoch": 5.923076923076923, "grad_norm": 2.9315621852874756, "learning_rate": 2.9940468906353712e-05, "loss": 0.3989, "step": 1540 }, { "epoch": 5.961538461538462, "grad_norm": 3.185990810394287, "learning_rate": 2.9938241000345887e-05, "loss": 0.3948, "step": 1550 }, { "epoch": 6.0, "grad_norm": 4.063650608062744, "learning_rate": 2.993597225375758e-05, "loss": 0.3937, "step": 1560 }, { "epoch": 6.038461538461538, "grad_norm": 4.225824356079102, "learning_rate": 2.993366267279146e-05, "loss": 0.3931, "step": 1570 }, { "epoch": 6.076923076923077, "grad_norm": 3.3525872230529785, "learning_rate": 2.993131226376183e-05, "loss": 0.3918, "step": 1580 }, { "epoch": 6.115384615384615, "grad_norm": 3.352668523788452, "learning_rate": 2.9928921033094626e-05, "loss": 0.3901, "step": 1590 }, { "epoch": 6.153846153846154, "grad_norm": 3.500871181488037, "learning_rate": 2.9926488987327376e-05, "loss": 0.3905, "step": 1600 }, { "epoch": 6.1923076923076925, "grad_norm": 3.547381639480591, "learning_rate": 2.99240161331092e-05, "loss": 0.3847, "step": 1610 }, { "epoch": 6.230769230769231, "grad_norm": 3.7568917274475098, "learning_rate": 2.992150247720079e-05, "loss": 0.383, "step": 1620 }, { "epoch": 6.269230769230769, "grad_norm": 2.754397392272949, "learning_rate": 2.991894802647438e-05, "loss": 0.3833, "step": 1630 }, { "epoch": 6.3076923076923075, "grad_norm": 2.8265631198883057, "learning_rate": 2.9916352787913746e-05, "loss": 0.3875, "step": 1640 }, { "epoch": 6.346153846153846, "grad_norm": 3.522829532623291, "learning_rate": 2.991371676861417e-05, "loss": 0.3867, "step": 1650 }, { "epoch": 6.384615384615385, "grad_norm": 4.345142841339111, "learning_rate": 2.991103997578243e-05, "loss": 0.3754, "step": 1660 }, { "epoch": 6.423076923076923, "grad_norm": 3.212238073348999, "learning_rate": 2.9908322416736767e-05, "loss": 0.382, "step": 1670 }, { "epoch": 6.461538461538462, "grad_norm": 3.8595285415649414, "learning_rate": 2.990556409890689e-05, "loss": 0.3831, "step": 1680 }, { "epoch": 6.5, "grad_norm": 3.5283541679382324, "learning_rate": 2.990276502983394e-05, "loss": 0.3785, "step": 1690 }, { "epoch": 6.538461538461538, "grad_norm": 2.779311180114746, "learning_rate": 2.9899925217170455e-05, "loss": 0.3722, "step": 1700 }, { "epoch": 6.576923076923077, "grad_norm": 3.0596842765808105, "learning_rate": 2.989704466868038e-05, "loss": 0.373, "step": 1710 }, { "epoch": 6.615384615384615, "grad_norm": 3.0654571056365967, "learning_rate": 2.9894123392239018e-05, "loss": 0.3717, "step": 1720 }, { "epoch": 6.653846153846154, "grad_norm": 3.738708734512329, "learning_rate": 2.9891161395833037e-05, "loss": 0.3652, "step": 1730 }, { "epoch": 6.6923076923076925, "grad_norm": 3.2960824966430664, "learning_rate": 2.988815868756042e-05, "loss": 0.3606, "step": 1740 }, { "epoch": 6.730769230769231, "grad_norm": 3.8682546615600586, "learning_rate": 2.9885115275630447e-05, "loss": 0.3616, "step": 1750 }, { "epoch": 6.769230769230769, "grad_norm": 5.847049236297607, "learning_rate": 2.9882031168363703e-05, "loss": 0.3594, "step": 1760 }, { "epoch": 6.8076923076923075, "grad_norm": 4.252906322479248, "learning_rate": 2.9878906374192013e-05, "loss": 0.3599, "step": 1770 }, { "epoch": 6.846153846153846, "grad_norm": 3.371454954147339, "learning_rate": 2.9875740901658446e-05, "loss": 0.3522, "step": 1780 }, { "epoch": 6.884615384615385, "grad_norm": 3.9505934715270996, "learning_rate": 2.987253475941728e-05, "loss": 0.3518, "step": 1790 }, { "epoch": 6.923076923076923, "grad_norm": 4.2967047691345215, "learning_rate": 2.9869287956233986e-05, "loss": 0.3453, "step": 1800 }, { "epoch": 6.961538461538462, "grad_norm": 3.76397705078125, "learning_rate": 2.9866000500985207e-05, "loss": 0.3433, "step": 1810 }, { "epoch": 7.0, "grad_norm": 3.6324551105499268, "learning_rate": 2.9862672402658712e-05, "loss": 0.3322, "step": 1820 }, { "epoch": 7.038461538461538, "grad_norm": 4.136635780334473, "learning_rate": 2.98593036703534e-05, "loss": 0.3367, "step": 1830 }, { "epoch": 7.076923076923077, "grad_norm": 4.801270008087158, "learning_rate": 2.9855894313279256e-05, "loss": 0.3327, "step": 1840 }, { "epoch": 7.115384615384615, "grad_norm": 3.6734988689422607, "learning_rate": 2.9852444340757326e-05, "loss": 0.33, "step": 1850 }, { "epoch": 7.153846153846154, "grad_norm": 3.806365966796875, "learning_rate": 2.9848953762219707e-05, "loss": 0.3315, "step": 1860 }, { "epoch": 7.1923076923076925, "grad_norm": 4.397286891937256, "learning_rate": 2.984542258720951e-05, "loss": 0.3221, "step": 1870 }, { "epoch": 7.230769230769231, "grad_norm": 4.844172954559326, "learning_rate": 2.984185082538083e-05, "loss": 0.3104, "step": 1880 }, { "epoch": 7.269230769230769, "grad_norm": 3.5059876441955566, "learning_rate": 2.983823848649873e-05, "loss": 0.3044, "step": 1890 }, { "epoch": 7.3076923076923075, "grad_norm": 3.5109610557556152, "learning_rate": 2.9834585580439203e-05, "loss": 0.3145, "step": 1900 }, { "epoch": 7.346153846153846, "grad_norm": 3.935485601425171, "learning_rate": 2.9830892117189157e-05, "loss": 0.3047, "step": 1910 }, { "epoch": 7.384615384615385, "grad_norm": 4.239902019500732, "learning_rate": 2.982715810684638e-05, "loss": 0.3004, "step": 1920 }, { "epoch": 7.423076923076923, "grad_norm": 3.725754737854004, "learning_rate": 2.982338355961951e-05, "loss": 0.2974, "step": 1930 }, { "epoch": 7.461538461538462, "grad_norm": 4.275974750518799, "learning_rate": 2.981956848582802e-05, "loss": 0.2943, "step": 1940 }, { "epoch": 7.5, "grad_norm": 4.536103248596191, "learning_rate": 2.981571289590217e-05, "loss": 0.2879, "step": 1950 }, { "epoch": 7.538461538461538, "grad_norm": 5.503205299377441, "learning_rate": 2.9811816800383003e-05, "loss": 0.2769, "step": 1960 }, { "epoch": 7.576923076923077, "grad_norm": 4.398034572601318, "learning_rate": 2.9807880209922288e-05, "loss": 0.2841, "step": 1970 }, { "epoch": 7.615384615384615, "grad_norm": 4.218934059143066, "learning_rate": 2.9803903135282518e-05, "loss": 0.2787, "step": 1980 }, { "epoch": 7.653846153846154, "grad_norm": 3.5821478366851807, "learning_rate": 2.9799885587336862e-05, "loss": 0.2669, "step": 1990 }, { "epoch": 7.6923076923076925, "grad_norm": 4.140866279602051, "learning_rate": 2.9795827577069145e-05, "loss": 0.2558, "step": 2000 }, { "epoch": 7.730769230769231, "grad_norm": 3.9685962200164795, "learning_rate": 2.9791729115573808e-05, "loss": 0.2493, "step": 2010 }, { "epoch": 7.769230769230769, "grad_norm": 6.242632865905762, "learning_rate": 2.9787590214055887e-05, "loss": 0.2538, "step": 2020 }, { "epoch": 7.8076923076923075, "grad_norm": 5.438572406768799, "learning_rate": 2.9783410883830983e-05, "loss": 0.2555, "step": 2030 }, { "epoch": 7.846153846153846, "grad_norm": 4.109585762023926, "learning_rate": 2.9779191136325233e-05, "loss": 0.2533, "step": 2040 }, { "epoch": 7.884615384615385, "grad_norm": 4.627237319946289, "learning_rate": 2.977493098307525e-05, "loss": 0.2371, "step": 2050 }, { "epoch": 7.923076923076923, "grad_norm": 4.566162109375, "learning_rate": 2.9770630435728142e-05, "loss": 0.2214, "step": 2060 }, { "epoch": 7.961538461538462, "grad_norm": 5.047729969024658, "learning_rate": 2.976628950604144e-05, "loss": 0.222, "step": 2070 }, { "epoch": 8.0, "grad_norm": 5.952073574066162, "learning_rate": 2.9761908205883073e-05, "loss": 0.2164, "step": 2080 }, { "epoch": 8.038461538461538, "grad_norm": 6.023576736450195, "learning_rate": 2.9757486547231357e-05, "loss": 0.2171, "step": 2090 }, { "epoch": 8.076923076923077, "grad_norm": 5.403975486755371, "learning_rate": 2.9753024542174934e-05, "loss": 0.2039, "step": 2100 }, { "epoch": 8.115384615384615, "grad_norm": 6.422732830047607, "learning_rate": 2.9748522202912755e-05, "loss": 0.2025, "step": 2110 }, { "epoch": 8.153846153846153, "grad_norm": 4.933073043823242, "learning_rate": 2.974397954175404e-05, "loss": 0.1943, "step": 2120 }, { "epoch": 8.192307692307692, "grad_norm": 3.9752209186553955, "learning_rate": 2.973939657111826e-05, "loss": 0.1947, "step": 2130 }, { "epoch": 8.23076923076923, "grad_norm": 5.760082721710205, "learning_rate": 2.9734773303535078e-05, "loss": 0.1982, "step": 2140 }, { "epoch": 8.26923076923077, "grad_norm": 4.883352279663086, "learning_rate": 2.9730109751644325e-05, "loss": 0.1932, "step": 2150 }, { "epoch": 8.307692307692308, "grad_norm": 5.062183856964111, "learning_rate": 2.9725405928195985e-05, "loss": 0.1808, "step": 2160 }, { "epoch": 8.346153846153847, "grad_norm": 5.349424362182617, "learning_rate": 2.9720661846050123e-05, "loss": 0.1648, "step": 2170 }, { "epoch": 8.384615384615385, "grad_norm": 4.347816467285156, "learning_rate": 2.971587751817688e-05, "loss": 0.1733, "step": 2180 }, { "epoch": 8.423076923076923, "grad_norm": 6.218843460083008, "learning_rate": 2.9711052957656425e-05, "loss": 0.1568, "step": 2190 }, { "epoch": 8.461538461538462, "grad_norm": 4.51317024230957, "learning_rate": 2.9706188177678924e-05, "loss": 0.1544, "step": 2200 }, { "epoch": 8.5, "grad_norm": 6.40587854385376, "learning_rate": 2.97012831915445e-05, "loss": 0.1501, "step": 2210 }, { "epoch": 8.538461538461538, "grad_norm": 3.9102859497070312, "learning_rate": 2.96963380126632e-05, "loss": 0.1443, "step": 2220 }, { "epoch": 8.576923076923077, "grad_norm": 5.443079948425293, "learning_rate": 2.9691352654554953e-05, "loss": 0.1472, "step": 2230 }, { "epoch": 8.615384615384615, "grad_norm": 6.355679988861084, "learning_rate": 2.9686327130849536e-05, "loss": 0.1383, "step": 2240 }, { "epoch": 8.653846153846153, "grad_norm": 6.52142333984375, "learning_rate": 2.9681261455286538e-05, "loss": 0.1482, "step": 2250 }, { "epoch": 8.692307692307692, "grad_norm": 5.2175703048706055, "learning_rate": 2.9676155641715318e-05, "loss": 0.1405, "step": 2260 }, { "epoch": 8.73076923076923, "grad_norm": 5.178348541259766, "learning_rate": 2.9671009704094988e-05, "loss": 0.1307, "step": 2270 }, { "epoch": 8.76923076923077, "grad_norm": 5.276549816131592, "learning_rate": 2.9665823656494335e-05, "loss": 0.1402, "step": 2280 }, { "epoch": 8.807692307692308, "grad_norm": 4.72429084777832, "learning_rate": 2.9660597513091824e-05, "loss": 0.129, "step": 2290 }, { "epoch": 8.846153846153847, "grad_norm": 4.0139312744140625, "learning_rate": 2.965533128817552e-05, "loss": 0.1181, "step": 2300 }, { "epoch": 8.884615384615385, "grad_norm": 4.549901485443115, "learning_rate": 2.9650024996143084e-05, "loss": 0.1293, "step": 2310 }, { "epoch": 8.923076923076923, "grad_norm": 4.0546183586120605, "learning_rate": 2.964467865150172e-05, "loss": 0.1098, "step": 2320 }, { "epoch": 8.961538461538462, "grad_norm": 5.741022109985352, "learning_rate": 2.9639292268868133e-05, "loss": 0.1255, "step": 2330 }, { "epoch": 9.0, "grad_norm": 5.321122169494629, "learning_rate": 2.9633865862968478e-05, "loss": 0.112, "step": 2340 }, { "epoch": 9.038461538461538, "grad_norm": 4.528491020202637, "learning_rate": 2.9628399448638352e-05, "loss": 0.1198, "step": 2350 }, { "epoch": 9.076923076923077, "grad_norm": 3.827465295791626, "learning_rate": 2.9622893040822714e-05, "loss": 0.1039, "step": 2360 }, { "epoch": 9.115384615384615, "grad_norm": 4.378634452819824, "learning_rate": 2.9617346654575875e-05, "loss": 0.1049, "step": 2370 }, { "epoch": 9.153846153846153, "grad_norm": 4.433044910430908, "learning_rate": 2.9611760305061447e-05, "loss": 0.099, "step": 2380 }, { "epoch": 9.192307692307692, "grad_norm": 4.418703556060791, "learning_rate": 2.9606134007552292e-05, "loss": 0.1097, "step": 2390 }, { "epoch": 9.23076923076923, "grad_norm": 4.773401737213135, "learning_rate": 2.9600467777430497e-05, "loss": 0.1039, "step": 2400 }, { "epoch": 9.26923076923077, "grad_norm": 5.771136283874512, "learning_rate": 2.9594761630187312e-05, "loss": 0.0954, "step": 2410 }, { "epoch": 9.307692307692308, "grad_norm": 4.874659061431885, "learning_rate": 2.9589015581423132e-05, "loss": 0.0973, "step": 2420 }, { "epoch": 9.346153846153847, "grad_norm": 5.661646366119385, "learning_rate": 2.958322964684743e-05, "loss": 0.0958, "step": 2430 }, { "epoch": 9.384615384615385, "grad_norm": 5.317569732666016, "learning_rate": 2.9577403842278735e-05, "loss": 0.1032, "step": 2440 }, { "epoch": 9.423076923076923, "grad_norm": 4.762622356414795, "learning_rate": 2.957153818364457e-05, "loss": 0.0972, "step": 2450 }, { "epoch": 9.461538461538462, "grad_norm": 5.307681083679199, "learning_rate": 2.9565632686981428e-05, "loss": 0.1038, "step": 2460 }, { "epoch": 9.5, "grad_norm": 3.2064321041107178, "learning_rate": 2.9559687368434702e-05, "loss": 0.0902, "step": 2470 }, { "epoch": 9.538461538461538, "grad_norm": 4.020598888397217, "learning_rate": 2.9553702244258674e-05, "loss": 0.0886, "step": 2480 }, { "epoch": 9.576923076923077, "grad_norm": 5.292928218841553, "learning_rate": 2.954767733081644e-05, "loss": 0.0839, "step": 2490 }, { "epoch": 9.615384615384615, "grad_norm": 4.200097560882568, "learning_rate": 2.9541612644579887e-05, "loss": 0.0943, "step": 2500 }, { "epoch": 9.653846153846153, "grad_norm": 4.739330291748047, "learning_rate": 2.9535508202129634e-05, "loss": 0.0903, "step": 2510 }, { "epoch": 9.692307692307692, "grad_norm": 4.430630683898926, "learning_rate": 2.9529364020154994e-05, "loss": 0.0819, "step": 2520 }, { "epoch": 9.73076923076923, "grad_norm": 4.4350481033325195, "learning_rate": 2.9523180115453922e-05, "loss": 0.0798, "step": 2530 }, { "epoch": 9.76923076923077, "grad_norm": 4.632157325744629, "learning_rate": 2.9516956504932984e-05, "loss": 0.0917, "step": 2540 }, { "epoch": 9.807692307692308, "grad_norm": 3.786306619644165, "learning_rate": 2.9510693205607286e-05, "loss": 0.0978, "step": 2550 }, { "epoch": 9.846153846153847, "grad_norm": 4.0172929763793945, "learning_rate": 2.9504390234600456e-05, "loss": 0.0884, "step": 2560 }, { "epoch": 9.884615384615385, "grad_norm": 4.171654224395752, "learning_rate": 2.9498047609144577e-05, "loss": 0.0821, "step": 2570 }, { "epoch": 9.923076923076923, "grad_norm": 5.381162643432617, "learning_rate": 2.9491665346580134e-05, "loss": 0.0872, "step": 2580 }, { "epoch": 9.961538461538462, "grad_norm": 3.150331497192383, "learning_rate": 2.9485243464356e-05, "loss": 0.0859, "step": 2590 }, { "epoch": 10.0, "grad_norm": 4.065011978149414, "learning_rate": 2.9478781980029352e-05, "loss": 0.0823, "step": 2600 }, { "epoch": 10.038461538461538, "grad_norm": 3.450819492340088, "learning_rate": 2.9472280911265642e-05, "loss": 0.079, "step": 2610 }, { "epoch": 10.076923076923077, "grad_norm": 3.7067441940307617, "learning_rate": 2.9465740275838543e-05, "loss": 0.0802, "step": 2620 }, { "epoch": 10.115384615384615, "grad_norm": 3.485081434249878, "learning_rate": 2.94591600916299e-05, "loss": 0.0772, "step": 2630 }, { "epoch": 10.153846153846153, "grad_norm": 2.740015983581543, "learning_rate": 2.9452540376629692e-05, "loss": 0.0712, "step": 2640 }, { "epoch": 10.192307692307692, "grad_norm": 3.398298501968384, "learning_rate": 2.944588114893596e-05, "loss": 0.0826, "step": 2650 }, { "epoch": 10.23076923076923, "grad_norm": 3.7506258487701416, "learning_rate": 2.9439182426754784e-05, "loss": 0.0896, "step": 2660 }, { "epoch": 10.26923076923077, "grad_norm": 3.0388355255126953, "learning_rate": 2.9432444228400208e-05, "loss": 0.0794, "step": 2670 }, { "epoch": 10.307692307692308, "grad_norm": 2.9595770835876465, "learning_rate": 2.9425666572294218e-05, "loss": 0.0823, "step": 2680 }, { "epoch": 10.346153846153847, "grad_norm": 3.7701125144958496, "learning_rate": 2.941884947696666e-05, "loss": 0.0777, "step": 2690 }, { "epoch": 10.384615384615385, "grad_norm": 3.5903477668762207, "learning_rate": 2.9411992961055214e-05, "loss": 0.0864, "step": 2700 }, { "epoch": 10.423076923076923, "grad_norm": 3.5986275672912598, "learning_rate": 2.9405097043305334e-05, "loss": 0.0809, "step": 2710 }, { "epoch": 10.461538461538462, "grad_norm": 3.5043976306915283, "learning_rate": 2.9398161742570196e-05, "loss": 0.0787, "step": 2720 }, { "epoch": 10.5, "grad_norm": 3.9481887817382812, "learning_rate": 2.9391187077810644e-05, "loss": 0.0828, "step": 2730 }, { "epoch": 10.538461538461538, "grad_norm": 3.953305721282959, "learning_rate": 2.9384173068095145e-05, "loss": 0.079, "step": 2740 }, { "epoch": 10.576923076923077, "grad_norm": 4.111423492431641, "learning_rate": 2.937711973259974e-05, "loss": 0.0702, "step": 2750 }, { "epoch": 10.615384615384615, "grad_norm": 3.0147547721862793, "learning_rate": 2.9370027090607974e-05, "loss": 0.0767, "step": 2760 }, { "epoch": 10.653846153846153, "grad_norm": 3.451683759689331, "learning_rate": 2.936289516151086e-05, "loss": 0.0724, "step": 2770 }, { "epoch": 10.692307692307692, "grad_norm": 3.6806628704071045, "learning_rate": 2.935572396480682e-05, "loss": 0.0753, "step": 2780 }, { "epoch": 10.73076923076923, "grad_norm": 3.9452641010284424, "learning_rate": 2.9348513520101636e-05, "loss": 0.0692, "step": 2790 }, { "epoch": 10.76923076923077, "grad_norm": 3.2497544288635254, "learning_rate": 2.9341263847108383e-05, "loss": 0.0745, "step": 2800 }, { "epoch": 10.807692307692308, "grad_norm": 3.549978494644165, "learning_rate": 2.933397496564739e-05, "loss": 0.0749, "step": 2810 }, { "epoch": 10.846153846153847, "grad_norm": 3.3335323333740234, "learning_rate": 2.9326646895646178e-05, "loss": 0.0692, "step": 2820 }, { "epoch": 10.884615384615385, "grad_norm": 3.405339479446411, "learning_rate": 2.931927965713942e-05, "loss": 0.0746, "step": 2830 }, { "epoch": 10.923076923076923, "grad_norm": 3.82893443107605, "learning_rate": 2.931187327026886e-05, "loss": 0.0721, "step": 2840 }, { "epoch": 10.961538461538462, "grad_norm": 3.7267136573791504, "learning_rate": 2.9304427755283278e-05, "loss": 0.0734, "step": 2850 }, { "epoch": 11.0, "grad_norm": 3.892670154571533, "learning_rate": 2.9296943132538425e-05, "loss": 0.0695, "step": 2860 }, { "epoch": 11.038461538461538, "grad_norm": 3.6918320655822754, "learning_rate": 2.928941942249697e-05, "loss": 0.0715, "step": 2870 }, { "epoch": 11.076923076923077, "grad_norm": 3.9371860027313232, "learning_rate": 2.928185664572846e-05, "loss": 0.0698, "step": 2880 }, { "epoch": 11.115384615384615, "grad_norm": 3.618164539337158, "learning_rate": 2.927425482290923e-05, "loss": 0.0722, "step": 2890 }, { "epoch": 11.153846153846153, "grad_norm": 3.7436001300811768, "learning_rate": 2.926661397482238e-05, "loss": 0.0763, "step": 2900 }, { "epoch": 11.192307692307692, "grad_norm": 3.660889148712158, "learning_rate": 2.9258934122357685e-05, "loss": 0.0917, "step": 2910 }, { "epoch": 11.23076923076923, "grad_norm": 3.8613357543945312, "learning_rate": 2.9251215286511574e-05, "loss": 0.072, "step": 2920 }, { "epoch": 11.26923076923077, "grad_norm": 3.738790273666382, "learning_rate": 2.924345748838706e-05, "loss": 0.0709, "step": 2930 }, { "epoch": 11.307692307692308, "grad_norm": 3.2637104988098145, "learning_rate": 2.923566074919365e-05, "loss": 0.0707, "step": 2940 }, { "epoch": 11.346153846153847, "grad_norm": 2.518782615661621, "learning_rate": 2.9227825090247346e-05, "loss": 0.0679, "step": 2950 }, { "epoch": 11.384615384615385, "grad_norm": 3.1399145126342773, "learning_rate": 2.9219950532970526e-05, "loss": 0.0806, "step": 2960 }, { "epoch": 11.423076923076923, "grad_norm": 3.190922498703003, "learning_rate": 2.921203709889194e-05, "loss": 0.0766, "step": 2970 }, { "epoch": 11.461538461538462, "grad_norm": 3.0159084796905518, "learning_rate": 2.9204084809646607e-05, "loss": 0.0671, "step": 2980 }, { "epoch": 11.5, "grad_norm": 3.547037124633789, "learning_rate": 2.9196093686975793e-05, "loss": 0.0661, "step": 2990 }, { "epoch": 11.538461538461538, "grad_norm": 3.7780942916870117, "learning_rate": 2.918806375272691e-05, "loss": 0.0772, "step": 3000 }, { "epoch": 11.576923076923077, "grad_norm": 3.440263509750366, "learning_rate": 2.9179995028853498e-05, "loss": 0.0697, "step": 3010 }, { "epoch": 11.615384615384615, "grad_norm": 3.568605661392212, "learning_rate": 2.917188753741514e-05, "loss": 0.0779, "step": 3020 }, { "epoch": 11.653846153846153, "grad_norm": 3.969608783721924, "learning_rate": 2.916374130057741e-05, "loss": 0.0684, "step": 3030 }, { "epoch": 11.692307692307692, "grad_norm": 3.102283000946045, "learning_rate": 2.91555563406118e-05, "loss": 0.0624, "step": 3040 }, { "epoch": 11.73076923076923, "grad_norm": 2.713280200958252, "learning_rate": 2.9147332679895683e-05, "loss": 0.0723, "step": 3050 }, { "epoch": 11.76923076923077, "grad_norm": 3.4983530044555664, "learning_rate": 2.9139070340912236e-05, "loss": 0.0714, "step": 3060 }, { "epoch": 11.807692307692308, "grad_norm": 3.3425889015197754, "learning_rate": 2.9130769346250376e-05, "loss": 0.0776, "step": 3070 }, { "epoch": 11.846153846153847, "grad_norm": 3.1763546466827393, "learning_rate": 2.9122429718604704e-05, "loss": 0.0705, "step": 3080 }, { "epoch": 11.884615384615385, "grad_norm": 2.6937384605407715, "learning_rate": 2.911405148077545e-05, "loss": 0.0691, "step": 3090 }, { "epoch": 11.923076923076923, "grad_norm": 2.8746800422668457, "learning_rate": 2.9105634655668385e-05, "loss": 0.0715, "step": 3100 }, { "epoch": 11.961538461538462, "grad_norm": 3.535343647003174, "learning_rate": 2.9097179266294794e-05, "loss": 0.0754, "step": 3110 }, { "epoch": 12.0, "grad_norm": 3.433488607406616, "learning_rate": 2.9088685335771396e-05, "loss": 0.0619, "step": 3120 }, { "epoch": 12.038461538461538, "grad_norm": 3.0864665508270264, "learning_rate": 2.9080152887320255e-05, "loss": 0.0732, "step": 3130 }, { "epoch": 12.076923076923077, "grad_norm": 3.4718246459960938, "learning_rate": 2.9071581944268778e-05, "loss": 0.0702, "step": 3140 }, { "epoch": 12.115384615384615, "grad_norm": 3.164069175720215, "learning_rate": 2.906297253004958e-05, "loss": 0.0692, "step": 3150 }, { "epoch": 12.153846153846153, "grad_norm": 2.8084299564361572, "learning_rate": 2.9054324668200483e-05, "loss": 0.0681, "step": 3160 }, { "epoch": 12.192307692307692, "grad_norm": 2.749694347381592, "learning_rate": 2.9045638382364404e-05, "loss": 0.0712, "step": 3170 }, { "epoch": 12.23076923076923, "grad_norm": 2.85776948928833, "learning_rate": 2.9036913696289318e-05, "loss": 0.0653, "step": 3180 }, { "epoch": 12.26923076923077, "grad_norm": 2.447458505630493, "learning_rate": 2.9028150633828186e-05, "loss": 0.068, "step": 3190 }, { "epoch": 12.307692307692308, "grad_norm": 2.5854172706604004, "learning_rate": 2.9019349218938887e-05, "loss": 0.0759, "step": 3200 }, { "epoch": 12.346153846153847, "grad_norm": 2.299642324447632, "learning_rate": 2.9010509475684146e-05, "loss": 0.0663, "step": 3210 }, { "epoch": 12.384615384615385, "grad_norm": 2.9513206481933594, "learning_rate": 2.900163142823149e-05, "loss": 0.0608, "step": 3220 }, { "epoch": 12.423076923076923, "grad_norm": 3.0884900093078613, "learning_rate": 2.8992715100853166e-05, "loss": 0.0658, "step": 3230 }, { "epoch": 12.461538461538462, "grad_norm": 3.2073280811309814, "learning_rate": 2.898376051792606e-05, "loss": 0.0596, "step": 3240 }, { "epoch": 12.5, "grad_norm": 2.7891628742218018, "learning_rate": 2.897476770393167e-05, "loss": 0.0592, "step": 3250 }, { "epoch": 12.538461538461538, "grad_norm": 3.262420892715454, "learning_rate": 2.8965736683456e-05, "loss": 0.0762, "step": 3260 }, { "epoch": 12.576923076923077, "grad_norm": 3.5874862670898438, "learning_rate": 2.895666748118952e-05, "loss": 0.0682, "step": 3270 }, { "epoch": 12.615384615384615, "grad_norm": 3.6109778881073, "learning_rate": 2.8947560121927077e-05, "loss": 0.0615, "step": 3280 }, { "epoch": 12.653846153846153, "grad_norm": 3.210784435272217, "learning_rate": 2.8938414630567852e-05, "loss": 0.0658, "step": 3290 }, { "epoch": 12.692307692307692, "grad_norm": 3.263617992401123, "learning_rate": 2.892923103211526e-05, "loss": 0.0607, "step": 3300 }, { "epoch": 12.73076923076923, "grad_norm": 3.226895332336426, "learning_rate": 2.892000935167691e-05, "loss": 0.058, "step": 3310 }, { "epoch": 12.76923076923077, "grad_norm": 3.4264261722564697, "learning_rate": 2.8910749614464536e-05, "loss": 0.0587, "step": 3320 }, { "epoch": 12.807692307692308, "grad_norm": 3.2235159873962402, "learning_rate": 2.890145184579389e-05, "loss": 0.068, "step": 3330 }, { "epoch": 12.846153846153847, "grad_norm": 3.570420503616333, "learning_rate": 2.8892116071084727e-05, "loss": 0.0631, "step": 3340 }, { "epoch": 12.884615384615385, "grad_norm": 3.176832914352417, "learning_rate": 2.8882742315860692e-05, "loss": 0.0683, "step": 3350 }, { "epoch": 12.923076923076923, "grad_norm": 3.430499792098999, "learning_rate": 2.8873330605749275e-05, "loss": 0.0616, "step": 3360 }, { "epoch": 12.961538461538462, "grad_norm": 3.4460554122924805, "learning_rate": 2.886388096648174e-05, "loss": 0.0755, "step": 3370 }, { "epoch": 13.0, "grad_norm": 3.3051247596740723, "learning_rate": 2.8854393423893024e-05, "loss": 0.0663, "step": 3380 }, { "epoch": 13.038461538461538, "grad_norm": 3.3149800300598145, "learning_rate": 2.8844868003921723e-05, "loss": 0.0687, "step": 3390 }, { "epoch": 13.076923076923077, "grad_norm": 3.245965003967285, "learning_rate": 2.8835304732609962e-05, "loss": 0.0726, "step": 3400 }, { "epoch": 13.115384615384615, "grad_norm": 2.479734420776367, "learning_rate": 2.882570363610336e-05, "loss": 0.064, "step": 3410 }, { "epoch": 13.153846153846153, "grad_norm": 3.601377487182617, "learning_rate": 2.8816064740650954e-05, "loss": 0.0675, "step": 3420 }, { "epoch": 13.192307692307692, "grad_norm": 3.4560110569000244, "learning_rate": 2.880638807260511e-05, "loss": 0.0607, "step": 3430 }, { "epoch": 13.23076923076923, "grad_norm": 2.8063180446624756, "learning_rate": 2.8796673658421472e-05, "loss": 0.0647, "step": 3440 }, { "epoch": 13.26923076923077, "grad_norm": 2.8318352699279785, "learning_rate": 2.8786921524658877e-05, "loss": 0.057, "step": 3450 }, { "epoch": 13.307692307692308, "grad_norm": 2.682603597640991, "learning_rate": 2.8777131697979283e-05, "loss": 0.0701, "step": 3460 }, { "epoch": 13.346153846153847, "grad_norm": 2.213459014892578, "learning_rate": 2.876730420514771e-05, "loss": 0.0572, "step": 3470 }, { "epoch": 13.384615384615385, "grad_norm": 3.3933420181274414, "learning_rate": 2.8757439073032136e-05, "loss": 0.0645, "step": 3480 }, { "epoch": 13.423076923076923, "grad_norm": 2.8533976078033447, "learning_rate": 2.874753632860347e-05, "loss": 0.064, "step": 3490 }, { "epoch": 13.461538461538462, "grad_norm": 2.714489459991455, "learning_rate": 2.873759599893543e-05, "loss": 0.0729, "step": 3500 }, { "epoch": 13.5, "grad_norm": 2.61195969581604, "learning_rate": 2.8727618111204494e-05, "loss": 0.0665, "step": 3510 }, { "epoch": 13.538461538461538, "grad_norm": 2.846728801727295, "learning_rate": 2.871760269268983e-05, "loss": 0.0595, "step": 3520 }, { "epoch": 13.576923076923077, "grad_norm": 2.8665387630462646, "learning_rate": 2.870754977077321e-05, "loss": 0.0626, "step": 3530 }, { "epoch": 13.615384615384615, "grad_norm": 2.974710464477539, "learning_rate": 2.869745937293894e-05, "loss": 0.0685, "step": 3540 }, { "epoch": 13.653846153846153, "grad_norm": 2.7472543716430664, "learning_rate": 2.8687331526773775e-05, "loss": 0.0639, "step": 3550 }, { "epoch": 13.692307692307692, "grad_norm": 2.984053611755371, "learning_rate": 2.867716625996687e-05, "loss": 0.0642, "step": 3560 }, { "epoch": 13.73076923076923, "grad_norm": 2.6401329040527344, "learning_rate": 2.8666963600309672e-05, "loss": 0.0698, "step": 3570 }, { "epoch": 13.76923076923077, "grad_norm": 3.483025550842285, "learning_rate": 2.8656723575695862e-05, "loss": 0.0633, "step": 3580 }, { "epoch": 13.807692307692308, "grad_norm": 3.1316492557525635, "learning_rate": 2.8646446214121276e-05, "loss": 0.0599, "step": 3590 }, { "epoch": 13.846153846153847, "grad_norm": 2.7110755443573, "learning_rate": 2.8636131543683828e-05, "loss": 0.0657, "step": 3600 }, { "epoch": 13.884615384615385, "grad_norm": 2.8898301124572754, "learning_rate": 2.8625779592583436e-05, "loss": 0.0569, "step": 3610 }, { "epoch": 13.923076923076923, "grad_norm": 3.3383960723876953, "learning_rate": 2.861539038912193e-05, "loss": 0.0618, "step": 3620 }, { "epoch": 13.961538461538462, "grad_norm": 2.7277491092681885, "learning_rate": 2.860496396170301e-05, "loss": 0.0626, "step": 3630 }, { "epoch": 14.0, "grad_norm": 3.5116193294525146, "learning_rate": 2.859450033883212e-05, "loss": 0.0632, "step": 3640 }, { "epoch": 14.038461538461538, "grad_norm": 3.129054069519043, "learning_rate": 2.8583999549116413e-05, "loss": 0.065, "step": 3650 }, { "epoch": 14.076923076923077, "grad_norm": 2.848735809326172, "learning_rate": 2.857346162126464e-05, "loss": 0.0536, "step": 3660 }, { "epoch": 14.115384615384615, "grad_norm": 2.7207109928131104, "learning_rate": 2.8562886584087092e-05, "loss": 0.0624, "step": 3670 }, { "epoch": 14.153846153846153, "grad_norm": 2.6145403385162354, "learning_rate": 2.8552274466495525e-05, "loss": 0.0638, "step": 3680 }, { "epoch": 14.192307692307692, "grad_norm": 2.5214338302612305, "learning_rate": 2.8541625297503056e-05, "loss": 0.0645, "step": 3690 }, { "epoch": 14.23076923076923, "grad_norm": 2.453113555908203, "learning_rate": 2.8530939106224106e-05, "loss": 0.059, "step": 3700 }, { "epoch": 14.26923076923077, "grad_norm": 2.9459733963012695, "learning_rate": 2.8520215921874325e-05, "loss": 0.0619, "step": 3710 }, { "epoch": 14.307692307692308, "grad_norm": 3.1279542446136475, "learning_rate": 2.850945577377048e-05, "loss": 0.0652, "step": 3720 }, { "epoch": 14.346153846153847, "grad_norm": 3.1692466735839844, "learning_rate": 2.8498658691330406e-05, "loss": 0.056, "step": 3730 }, { "epoch": 14.384615384615385, "grad_norm": 2.7388417720794678, "learning_rate": 2.8487824704072913e-05, "loss": 0.0641, "step": 3740 }, { "epoch": 14.423076923076923, "grad_norm": 3.1448442935943604, "learning_rate": 2.8476953841617713e-05, "loss": 0.063, "step": 3750 }, { "epoch": 14.461538461538462, "grad_norm": 2.5632693767547607, "learning_rate": 2.846604613368532e-05, "loss": 0.0562, "step": 3760 }, { "epoch": 14.5, "grad_norm": 2.959221601486206, "learning_rate": 2.8455101610097002e-05, "loss": 0.0675, "step": 3770 }, { "epoch": 14.538461538461538, "grad_norm": 2.6775829792022705, "learning_rate": 2.8444120300774666e-05, "loss": 0.0606, "step": 3780 }, { "epoch": 14.576923076923077, "grad_norm": 2.562490940093994, "learning_rate": 2.8433102235740788e-05, "loss": 0.0573, "step": 3790 }, { "epoch": 14.615384615384615, "grad_norm": 2.8279271125793457, "learning_rate": 2.842204744511834e-05, "loss": 0.0595, "step": 3800 }, { "epoch": 14.653846153846153, "grad_norm": 2.3390533924102783, "learning_rate": 2.8410955959130693e-05, "loss": 0.0533, "step": 3810 }, { "epoch": 14.692307692307692, "grad_norm": 2.221428871154785, "learning_rate": 2.8399827808101554e-05, "loss": 0.0623, "step": 3820 }, { "epoch": 14.73076923076923, "grad_norm": 2.7612838745117188, "learning_rate": 2.8388663022454857e-05, "loss": 0.0616, "step": 3830 }, { "epoch": 14.76923076923077, "grad_norm": 2.730405330657959, "learning_rate": 2.83774616327147e-05, "loss": 0.0575, "step": 3840 }, { "epoch": 14.807692307692308, "grad_norm": 2.5589001178741455, "learning_rate": 2.836622366950526e-05, "loss": 0.0576, "step": 3850 }, { "epoch": 14.846153846153847, "grad_norm": 2.984184741973877, "learning_rate": 2.835494916355069e-05, "loss": 0.0635, "step": 3860 }, { "epoch": 14.884615384615385, "grad_norm": 2.6934828758239746, "learning_rate": 2.8343638145675072e-05, "loss": 0.0616, "step": 3870 }, { "epoch": 14.923076923076923, "grad_norm": 2.6913018226623535, "learning_rate": 2.8332290646802282e-05, "loss": 0.0601, "step": 3880 }, { "epoch": 14.961538461538462, "grad_norm": 2.394535779953003, "learning_rate": 2.8320906697955963e-05, "loss": 0.0562, "step": 3890 }, { "epoch": 15.0, "grad_norm": 2.9435136318206787, "learning_rate": 2.8309486330259385e-05, "loss": 0.0626, "step": 3900 }, { "epoch": 15.038461538461538, "grad_norm": 2.9360132217407227, "learning_rate": 2.82980295749354e-05, "loss": 0.0624, "step": 3910 }, { "epoch": 15.076923076923077, "grad_norm": 3.197340965270996, "learning_rate": 2.828653646330634e-05, "loss": 0.0612, "step": 3920 }, { "epoch": 15.115384615384615, "grad_norm": 2.8745882511138916, "learning_rate": 2.8275007026793938e-05, "loss": 0.0597, "step": 3930 }, { "epoch": 15.153846153846153, "grad_norm": 2.903532028198242, "learning_rate": 2.826344129691923e-05, "loss": 0.0632, "step": 3940 }, { "epoch": 15.192307692307692, "grad_norm": 2.7144417762756348, "learning_rate": 2.8251839305302478e-05, "loss": 0.0636, "step": 3950 }, { "epoch": 15.23076923076923, "grad_norm": 2.7780587673187256, "learning_rate": 2.8240201083663088e-05, "loss": 0.0569, "step": 3960 }, { "epoch": 15.26923076923077, "grad_norm": 2.372857093811035, "learning_rate": 2.8228526663819504e-05, "loss": 0.0615, "step": 3970 }, { "epoch": 15.307692307692308, "grad_norm": 2.953294277191162, "learning_rate": 2.8216816077689158e-05, "loss": 0.0569, "step": 3980 }, { "epoch": 15.346153846153847, "grad_norm": 3.1452503204345703, "learning_rate": 2.8205069357288337e-05, "loss": 0.0555, "step": 3990 }, { "epoch": 15.384615384615385, "grad_norm": 2.4111666679382324, "learning_rate": 2.8193286534732128e-05, "loss": 0.0612, "step": 4000 }, { "epoch": 15.423076923076923, "grad_norm": 2.9192652702331543, "learning_rate": 2.8181467642234317e-05, "loss": 0.0578, "step": 4010 }, { "epoch": 15.461538461538462, "grad_norm": 2.613917589187622, "learning_rate": 2.8169612712107306e-05, "loss": 0.0553, "step": 4020 }, { "epoch": 15.5, "grad_norm": 2.7946548461914062, "learning_rate": 2.8157721776762017e-05, "loss": 0.0632, "step": 4030 }, { "epoch": 15.538461538461538, "grad_norm": 2.815171003341675, "learning_rate": 2.814579486870782e-05, "loss": 0.0602, "step": 4040 }, { "epoch": 15.576923076923077, "grad_norm": 2.570457696914673, "learning_rate": 2.813383202055242e-05, "loss": 0.0551, "step": 4050 }, { "epoch": 15.615384615384615, "grad_norm": 2.0170300006866455, "learning_rate": 2.8121833265001792e-05, "loss": 0.0642, "step": 4060 }, { "epoch": 15.653846153846153, "grad_norm": 2.3233728408813477, "learning_rate": 2.8109798634860072e-05, "loss": 0.0588, "step": 4070 }, { "epoch": 15.692307692307692, "grad_norm": 2.1397790908813477, "learning_rate": 2.8097728163029482e-05, "loss": 0.0529, "step": 4080 }, { "epoch": 15.73076923076923, "grad_norm": 2.950807571411133, "learning_rate": 2.8085621882510233e-05, "loss": 0.0569, "step": 4090 }, { "epoch": 15.76923076923077, "grad_norm": 2.839064121246338, "learning_rate": 2.8073479826400425e-05, "loss": 0.0615, "step": 4100 }, { "epoch": 15.807692307692308, "grad_norm": 2.629194736480713, "learning_rate": 2.806130202789598e-05, "loss": 0.0596, "step": 4110 }, { "epoch": 15.846153846153847, "grad_norm": 2.859807252883911, "learning_rate": 2.804908852029054e-05, "loss": 0.0568, "step": 4120 }, { "epoch": 15.884615384615385, "grad_norm": 2.6774954795837402, "learning_rate": 2.8036839336975367e-05, "loss": 0.0573, "step": 4130 }, { "epoch": 15.923076923076923, "grad_norm": 3.763676166534424, "learning_rate": 2.8024554511439253e-05, "loss": 0.0612, "step": 4140 }, { "epoch": 15.961538461538462, "grad_norm": 2.945574998855591, "learning_rate": 2.801223407726844e-05, "loss": 0.0553, "step": 4150 }, { "epoch": 16.0, "grad_norm": 2.359447717666626, "learning_rate": 2.7999878068146537e-05, "loss": 0.049, "step": 4160 }, { "epoch": 16.03846153846154, "grad_norm": 2.7345235347747803, "learning_rate": 2.7987486517854396e-05, "loss": 0.0595, "step": 4170 }, { "epoch": 16.076923076923077, "grad_norm": 2.2802300453186035, "learning_rate": 2.7975059460270037e-05, "loss": 0.0622, "step": 4180 }, { "epoch": 16.115384615384617, "grad_norm": 2.078882932662964, "learning_rate": 2.7962596929368566e-05, "loss": 0.0585, "step": 4190 }, { "epoch": 16.153846153846153, "grad_norm": 2.182929039001465, "learning_rate": 2.795009895922207e-05, "loss": 0.0536, "step": 4200 }, { "epoch": 16.192307692307693, "grad_norm": 3.1578166484832764, "learning_rate": 2.7937565583999513e-05, "loss": 0.0517, "step": 4210 }, { "epoch": 16.23076923076923, "grad_norm": 2.627866268157959, "learning_rate": 2.792499683796667e-05, "loss": 0.0542, "step": 4220 }, { "epoch": 16.26923076923077, "grad_norm": 2.430981159210205, "learning_rate": 2.791239275548601e-05, "loss": 0.0484, "step": 4230 }, { "epoch": 16.307692307692307, "grad_norm": 2.130453586578369, "learning_rate": 2.789975337101662e-05, "loss": 0.0603, "step": 4240 }, { "epoch": 16.346153846153847, "grad_norm": 2.2407262325286865, "learning_rate": 2.788707871911409e-05, "loss": 0.0606, "step": 4250 }, { "epoch": 16.384615384615383, "grad_norm": 2.404421091079712, "learning_rate": 2.7874368834430426e-05, "loss": 0.0561, "step": 4260 }, { "epoch": 16.423076923076923, "grad_norm": 2.6223599910736084, "learning_rate": 2.7861623751713982e-05, "loss": 0.053, "step": 4270 }, { "epoch": 16.46153846153846, "grad_norm": 2.6188464164733887, "learning_rate": 2.7848843505809317e-05, "loss": 0.0559, "step": 4280 }, { "epoch": 16.5, "grad_norm": 2.1133105754852295, "learning_rate": 2.7836028131657142e-05, "loss": 0.0524, "step": 4290 }, { "epoch": 16.53846153846154, "grad_norm": 2.4061028957366943, "learning_rate": 2.7823177664294197e-05, "loss": 0.0545, "step": 4300 }, { "epoch": 16.576923076923077, "grad_norm": 2.635202169418335, "learning_rate": 2.7810292138853168e-05, "loss": 0.0526, "step": 4310 }, { "epoch": 16.615384615384617, "grad_norm": 2.4512650966644287, "learning_rate": 2.779737159056259e-05, "loss": 0.0527, "step": 4320 }, { "epoch": 16.653846153846153, "grad_norm": 2.650878429412842, "learning_rate": 2.7784416054746753e-05, "loss": 0.0518, "step": 4330 }, { "epoch": 16.692307692307693, "grad_norm": 2.4242632389068604, "learning_rate": 2.7771425566825593e-05, "loss": 0.0527, "step": 4340 }, { "epoch": 16.73076923076923, "grad_norm": 2.464691162109375, "learning_rate": 2.7758400162314605e-05, "loss": 0.0547, "step": 4350 }, { "epoch": 16.76923076923077, "grad_norm": 2.6380252838134766, "learning_rate": 2.7745339876824756e-05, "loss": 0.0503, "step": 4360 }, { "epoch": 16.807692307692307, "grad_norm": 2.551723003387451, "learning_rate": 2.7732244746062363e-05, "loss": 0.0537, "step": 4370 }, { "epoch": 16.846153846153847, "grad_norm": 2.8268020153045654, "learning_rate": 2.7719114805829015e-05, "loss": 0.0516, "step": 4380 }, { "epoch": 16.884615384615383, "grad_norm": 2.501936435699463, "learning_rate": 2.7705950092021465e-05, "loss": 0.0583, "step": 4390 }, { "epoch": 16.923076923076923, "grad_norm": 2.710240602493286, "learning_rate": 2.7692750640631533e-05, "loss": 0.0658, "step": 4400 }, { "epoch": 16.96153846153846, "grad_norm": 2.378331422805786, "learning_rate": 2.767951648774603e-05, "loss": 0.0548, "step": 4410 }, { "epoch": 17.0, "grad_norm": 2.894601345062256, "learning_rate": 2.766624766954661e-05, "loss": 0.0567, "step": 4420 }, { "epoch": 17.03846153846154, "grad_norm": 2.357337236404419, "learning_rate": 2.7652944222309727e-05, "loss": 0.0538, "step": 4430 }, { "epoch": 17.076923076923077, "grad_norm": 2.6987788677215576, "learning_rate": 2.7639606182406484e-05, "loss": 0.0617, "step": 4440 }, { "epoch": 17.115384615384617, "grad_norm": 1.861840009689331, "learning_rate": 2.7626233586302583e-05, "loss": 0.0487, "step": 4450 }, { "epoch": 17.153846153846153, "grad_norm": 1.9388928413391113, "learning_rate": 2.7612826470558192e-05, "loss": 0.0568, "step": 4460 }, { "epoch": 17.192307692307693, "grad_norm": 2.5828299522399902, "learning_rate": 2.7599384871827846e-05, "loss": 0.0535, "step": 4470 }, { "epoch": 17.23076923076923, "grad_norm": 2.6794981956481934, "learning_rate": 2.7585908826860368e-05, "loss": 0.0528, "step": 4480 }, { "epoch": 17.26923076923077, "grad_norm": 2.3347725868225098, "learning_rate": 2.757239837249875e-05, "loss": 0.0551, "step": 4490 }, { "epoch": 17.307692307692307, "grad_norm": 3.0247833728790283, "learning_rate": 2.7558853545680057e-05, "loss": 0.0524, "step": 4500 }, { "epoch": 17.346153846153847, "grad_norm": 2.5298354625701904, "learning_rate": 2.754527438343533e-05, "loss": 0.0624, "step": 4510 }, { "epoch": 17.384615384615383, "grad_norm": 2.508488893508911, "learning_rate": 2.7531660922889477e-05, "loss": 0.0548, "step": 4520 }, { "epoch": 17.423076923076923, "grad_norm": 2.869568347930908, "learning_rate": 2.751801320126118e-05, "loss": 0.0567, "step": 4530 }, { "epoch": 17.46153846153846, "grad_norm": 3.067972421646118, "learning_rate": 2.750433125586279e-05, "loss": 0.0527, "step": 4540 }, { "epoch": 17.5, "grad_norm": 2.691254138946533, "learning_rate": 2.7490615124100225e-05, "loss": 0.049, "step": 4550 }, { "epoch": 17.53846153846154, "grad_norm": 2.420832395553589, "learning_rate": 2.747686484347286e-05, "loss": 0.0488, "step": 4560 }, { "epoch": 17.576923076923077, "grad_norm": 2.2831807136535645, "learning_rate": 2.7463080451573447e-05, "loss": 0.0547, "step": 4570 }, { "epoch": 17.615384615384617, "grad_norm": 2.23234486579895, "learning_rate": 2.744926198608798e-05, "loss": 0.0521, "step": 4580 }, { "epoch": 17.653846153846153, "grad_norm": 2.116222620010376, "learning_rate": 2.743540948479561e-05, "loss": 0.0526, "step": 4590 }, { "epoch": 17.692307692307693, "grad_norm": 2.2791247367858887, "learning_rate": 2.7421522985568562e-05, "loss": 0.054, "step": 4600 }, { "epoch": 17.73076923076923, "grad_norm": 2.1204259395599365, "learning_rate": 2.7407602526371983e-05, "loss": 0.0544, "step": 4610 }, { "epoch": 17.76923076923077, "grad_norm": 2.725435733795166, "learning_rate": 2.7393648145263873e-05, "loss": 0.0545, "step": 4620 }, { "epoch": 17.807692307692307, "grad_norm": 2.7043561935424805, "learning_rate": 2.7379659880394996e-05, "loss": 0.0554, "step": 4630 }, { "epoch": 17.846153846153847, "grad_norm": 3.0216851234436035, "learning_rate": 2.7365637770008717e-05, "loss": 0.0551, "step": 4640 }, { "epoch": 17.884615384615383, "grad_norm": 2.6223690509796143, "learning_rate": 2.7351581852440953e-05, "loss": 0.0545, "step": 4650 }, { "epoch": 17.923076923076923, "grad_norm": 2.883230209350586, "learning_rate": 2.7337492166120053e-05, "loss": 0.0632, "step": 4660 }, { "epoch": 17.96153846153846, "grad_norm": 2.9789953231811523, "learning_rate": 2.732336874956667e-05, "loss": 0.0558, "step": 4670 }, { "epoch": 18.0, "grad_norm": 2.189687967300415, "learning_rate": 2.7309211641393696e-05, "loss": 0.0439, "step": 4680 }, { "epoch": 18.03846153846154, "grad_norm": 2.24808931350708, "learning_rate": 2.7295020880306123e-05, "loss": 0.0567, "step": 4690 }, { "epoch": 18.076923076923077, "grad_norm": 2.401685953140259, "learning_rate": 2.7280796505100946e-05, "loss": 0.0578, "step": 4700 }, { "epoch": 18.115384615384617, "grad_norm": 1.9197814464569092, "learning_rate": 2.7266538554667065e-05, "loss": 0.0525, "step": 4710 }, { "epoch": 18.153846153846153, "grad_norm": 2.022991180419922, "learning_rate": 2.725224706798517e-05, "loss": 0.0567, "step": 4720 }, { "epoch": 18.192307692307693, "grad_norm": 2.132225513458252, "learning_rate": 2.7237922084127643e-05, "loss": 0.0524, "step": 4730 }, { "epoch": 18.23076923076923, "grad_norm": 2.158095598220825, "learning_rate": 2.7223563642258446e-05, "loss": 0.0496, "step": 4740 }, { "epoch": 18.26923076923077, "grad_norm": 2.268658399581909, "learning_rate": 2.7209171781633e-05, "loss": 0.0536, "step": 4750 }, { "epoch": 18.307692307692307, "grad_norm": 2.788698673248291, "learning_rate": 2.7194746541598113e-05, "loss": 0.0527, "step": 4760 }, { "epoch": 18.346153846153847, "grad_norm": 2.46181321144104, "learning_rate": 2.7180287961591835e-05, "loss": 0.0547, "step": 4770 }, { "epoch": 18.384615384615383, "grad_norm": 2.1883716583251953, "learning_rate": 2.7165796081143377e-05, "loss": 0.0554, "step": 4780 }, { "epoch": 18.423076923076923, "grad_norm": 2.5119552612304688, "learning_rate": 2.715127093987298e-05, "loss": 0.0502, "step": 4790 }, { "epoch": 18.46153846153846, "grad_norm": 2.3125741481781006, "learning_rate": 2.713671257749183e-05, "loss": 0.0521, "step": 4800 }, { "epoch": 18.5, "grad_norm": 2.01935076713562, "learning_rate": 2.712212103380193e-05, "loss": 0.049, "step": 4810 }, { "epoch": 18.53846153846154, "grad_norm": 2.192258834838867, "learning_rate": 2.7107496348696004e-05, "loss": 0.0468, "step": 4820 }, { "epoch": 18.576923076923077, "grad_norm": 2.0872395038604736, "learning_rate": 2.7092838562157386e-05, "loss": 0.0608, "step": 4830 }, { "epoch": 18.615384615384617, "grad_norm": 2.0910723209381104, "learning_rate": 2.7078147714259905e-05, "loss": 0.0597, "step": 4840 }, { "epoch": 18.653846153846153, "grad_norm": 2.1819539070129395, "learning_rate": 2.7063423845167773e-05, "loss": 0.0514, "step": 4850 }, { "epoch": 18.692307692307693, "grad_norm": 2.541072368621826, "learning_rate": 2.7048666995135494e-05, "loss": 0.0518, "step": 4860 }, { "epoch": 18.73076923076923, "grad_norm": 2.253460168838501, "learning_rate": 2.7033877204507722e-05, "loss": 0.0519, "step": 4870 }, { "epoch": 18.76923076923077, "grad_norm": 2.454606294631958, "learning_rate": 2.701905451371919e-05, "loss": 0.0559, "step": 4880 }, { "epoch": 18.807692307692307, "grad_norm": 2.596813917160034, "learning_rate": 2.7004198963294558e-05, "loss": 0.0557, "step": 4890 }, { "epoch": 18.846153846153847, "grad_norm": 2.3953702449798584, "learning_rate": 2.6989310593848345e-05, "loss": 0.0553, "step": 4900 }, { "epoch": 18.884615384615383, "grad_norm": 2.1888365745544434, "learning_rate": 2.6974389446084776e-05, "loss": 0.0488, "step": 4910 }, { "epoch": 18.923076923076923, "grad_norm": 2.3826677799224854, "learning_rate": 2.6959435560797706e-05, "loss": 0.0495, "step": 4920 }, { "epoch": 18.96153846153846, "grad_norm": 2.1377201080322266, "learning_rate": 2.6944448978870478e-05, "loss": 0.0566, "step": 4930 }, { "epoch": 19.0, "grad_norm": 2.3483967781066895, "learning_rate": 2.6929429741275845e-05, "loss": 0.0488, "step": 4940 }, { "epoch": 19.03846153846154, "grad_norm": 1.9696838855743408, "learning_rate": 2.691437788907582e-05, "loss": 0.051, "step": 4950 }, { "epoch": 19.076923076923077, "grad_norm": 2.692110776901245, "learning_rate": 2.689929346342159e-05, "loss": 0.0461, "step": 4960 }, { "epoch": 19.115384615384617, "grad_norm": 2.2084898948669434, "learning_rate": 2.688417650555341e-05, "loss": 0.0599, "step": 4970 }, { "epoch": 19.153846153846153, "grad_norm": 2.121561288833618, "learning_rate": 2.686902705680046e-05, "loss": 0.0578, "step": 4980 }, { "epoch": 19.192307692307693, "grad_norm": 2.3033506870269775, "learning_rate": 2.6853845158580756e-05, "loss": 0.0519, "step": 4990 }, { "epoch": 19.23076923076923, "grad_norm": 2.0581743717193604, "learning_rate": 2.6838630852401028e-05, "loss": 0.0538, "step": 5000 }, { "epoch": 19.26923076923077, "grad_norm": 2.1499931812286377, "learning_rate": 2.6823384179856602e-05, "loss": 0.0515, "step": 5010 }, { "epoch": 19.307692307692307, "grad_norm": 2.4993667602539062, "learning_rate": 2.6808105182631303e-05, "loss": 0.0537, "step": 5020 }, { "epoch": 19.346153846153847, "grad_norm": 2.7367970943450928, "learning_rate": 2.6792793902497328e-05, "loss": 0.0507, "step": 5030 }, { "epoch": 19.384615384615383, "grad_norm": 2.3506581783294678, "learning_rate": 2.6777450381315133e-05, "loss": 0.0501, "step": 5040 }, { "epoch": 19.423076923076923, "grad_norm": 1.9951107501983643, "learning_rate": 2.676207466103331e-05, "loss": 0.0519, "step": 5050 }, { "epoch": 19.46153846153846, "grad_norm": 2.0051987171173096, "learning_rate": 2.6746666783688503e-05, "loss": 0.0505, "step": 5060 }, { "epoch": 19.5, "grad_norm": 1.8392553329467773, "learning_rate": 2.673122679140525e-05, "loss": 0.0421, "step": 5070 }, { "epoch": 19.53846153846154, "grad_norm": 1.9367773532867432, "learning_rate": 2.671575472639591e-05, "loss": 0.0514, "step": 5080 }, { "epoch": 19.576923076923077, "grad_norm": 2.1854703426361084, "learning_rate": 2.6700250630960506e-05, "loss": 0.0535, "step": 5090 }, { "epoch": 19.615384615384617, "grad_norm": 2.0213778018951416, "learning_rate": 2.6684714547486654e-05, "loss": 0.0546, "step": 5100 }, { "epoch": 19.653846153846153, "grad_norm": 2.09897780418396, "learning_rate": 2.6669146518449407e-05, "loss": 0.0477, "step": 5110 }, { "epoch": 19.692307692307693, "grad_norm": 2.4380099773406982, "learning_rate": 2.665354658641117e-05, "loss": 0.0611, "step": 5120 }, { "epoch": 19.73076923076923, "grad_norm": 2.0932047367095947, "learning_rate": 2.6637914794021552e-05, "loss": 0.0462, "step": 5130 }, { "epoch": 19.76923076923077, "grad_norm": 2.186757802963257, "learning_rate": 2.6622251184017274e-05, "loss": 0.0487, "step": 5140 }, { "epoch": 19.807692307692307, "grad_norm": 1.9822323322296143, "learning_rate": 2.660655579922206e-05, "loss": 0.0498, "step": 5150 }, { "epoch": 19.846153846153847, "grad_norm": 2.4814295768737793, "learning_rate": 2.6590828682546487e-05, "loss": 0.0541, "step": 5160 }, { "epoch": 19.884615384615383, "grad_norm": 2.3453330993652344, "learning_rate": 2.657506987698789e-05, "loss": 0.0403, "step": 5170 }, { "epoch": 19.923076923076923, "grad_norm": 2.3614869117736816, "learning_rate": 2.655927942563024e-05, "loss": 0.0491, "step": 5180 }, { "epoch": 19.96153846153846, "grad_norm": 2.508418083190918, "learning_rate": 2.6543457371644027e-05, "loss": 0.0506, "step": 5190 }, { "epoch": 20.0, "grad_norm": 1.809973955154419, "learning_rate": 2.652760375828615e-05, "loss": 0.0553, "step": 5200 }, { "epoch": 20.03846153846154, "grad_norm": 2.3840510845184326, "learning_rate": 2.651171862889978e-05, "loss": 0.0511, "step": 5210 }, { "epoch": 20.076923076923077, "grad_norm": 2.154362440109253, "learning_rate": 2.649580202691425e-05, "loss": 0.0522, "step": 5220 }, { "epoch": 20.115384615384617, "grad_norm": 2.4772872924804688, "learning_rate": 2.6479853995844942e-05, "loss": 0.0524, "step": 5230 }, { "epoch": 20.153846153846153, "grad_norm": 2.3529508113861084, "learning_rate": 2.646387457929317e-05, "loss": 0.0494, "step": 5240 }, { "epoch": 20.192307692307693, "grad_norm": 2.08630633354187, "learning_rate": 2.6447863820946047e-05, "loss": 0.0462, "step": 5250 }, { "epoch": 20.23076923076923, "grad_norm": 1.6905722618103027, "learning_rate": 2.6431821764576367e-05, "loss": 0.0512, "step": 5260 }, { "epoch": 20.26923076923077, "grad_norm": 1.8629543781280518, "learning_rate": 2.641574845404251e-05, "loss": 0.0502, "step": 5270 }, { "epoch": 20.307692307692307, "grad_norm": 2.0543575286865234, "learning_rate": 2.639964393328829e-05, "loss": 0.0545, "step": 5280 }, { "epoch": 20.346153846153847, "grad_norm": 2.485801935195923, "learning_rate": 2.6383508246342844e-05, "loss": 0.0514, "step": 5290 }, { "epoch": 20.384615384615383, "grad_norm": 1.7313271760940552, "learning_rate": 2.636734143732054e-05, "loss": 0.0488, "step": 5300 }, { "epoch": 20.423076923076923, "grad_norm": 1.8892971277236938, "learning_rate": 2.63511435504208e-05, "loss": 0.05, "step": 5310 }, { "epoch": 20.46153846153846, "grad_norm": 1.8958619832992554, "learning_rate": 2.633491462992804e-05, "loss": 0.0509, "step": 5320 }, { "epoch": 20.5, "grad_norm": 2.0657594203948975, "learning_rate": 2.63186547202115e-05, "loss": 0.0457, "step": 5330 }, { "epoch": 20.53846153846154, "grad_norm": 2.0821566581726074, "learning_rate": 2.6302363865725158e-05, "loss": 0.0515, "step": 5340 }, { "epoch": 20.576923076923077, "grad_norm": 1.938544750213623, "learning_rate": 2.628604211100759e-05, "loss": 0.0505, "step": 5350 }, { "epoch": 20.615384615384617, "grad_norm": 2.1884799003601074, "learning_rate": 2.6269689500681846e-05, "loss": 0.0514, "step": 5360 }, { "epoch": 20.653846153846153, "grad_norm": 2.3804664611816406, "learning_rate": 2.6253306079455337e-05, "loss": 0.0541, "step": 5370 }, { "epoch": 20.692307692307693, "grad_norm": 2.18882155418396, "learning_rate": 2.6236891892119713e-05, "loss": 0.0492, "step": 5380 }, { "epoch": 20.73076923076923, "grad_norm": 2.1746103763580322, "learning_rate": 2.6220446983550738e-05, "loss": 0.0487, "step": 5390 }, { "epoch": 20.76923076923077, "grad_norm": 1.9990772008895874, "learning_rate": 2.6203971398708162e-05, "loss": 0.0492, "step": 5400 }, { "epoch": 20.807692307692307, "grad_norm": 2.243596315383911, "learning_rate": 2.6187465182635598e-05, "loss": 0.0521, "step": 5410 }, { "epoch": 20.846153846153847, "grad_norm": 2.5800552368164062, "learning_rate": 2.6170928380460424e-05, "loss": 0.0468, "step": 5420 }, { "epoch": 20.884615384615383, "grad_norm": 2.2908005714416504, "learning_rate": 2.615436103739362e-05, "loss": 0.051, "step": 5430 }, { "epoch": 20.923076923076923, "grad_norm": 2.4502663612365723, "learning_rate": 2.6137763198729665e-05, "loss": 0.0476, "step": 5440 }, { "epoch": 20.96153846153846, "grad_norm": 2.216867208480835, "learning_rate": 2.6121134909846416e-05, "loss": 0.0553, "step": 5450 }, { "epoch": 21.0, "grad_norm": 2.4995601177215576, "learning_rate": 2.6104476216204985e-05, "loss": 0.0453, "step": 5460 }, { "epoch": 21.03846153846154, "grad_norm": 1.9124618768692017, "learning_rate": 2.6087787163349605e-05, "loss": 0.0479, "step": 5470 }, { "epoch": 21.076923076923077, "grad_norm": 1.9513589143753052, "learning_rate": 2.60710677969075e-05, "loss": 0.0476, "step": 5480 }, { "epoch": 21.115384615384617, "grad_norm": 1.7153360843658447, "learning_rate": 2.6054318162588792e-05, "loss": 0.044, "step": 5490 }, { "epoch": 21.153846153846153, "grad_norm": 2.320129871368408, "learning_rate": 2.6037538306186337e-05, "loss": 0.0456, "step": 5500 }, { "epoch": 21.192307692307693, "grad_norm": 1.6431825160980225, "learning_rate": 2.602072827357562e-05, "loss": 0.0483, "step": 5510 }, { "epoch": 21.23076923076923, "grad_norm": 2.124098539352417, "learning_rate": 2.6003888110714624e-05, "loss": 0.0402, "step": 5520 }, { "epoch": 21.26923076923077, "grad_norm": 2.669579267501831, "learning_rate": 2.5987017863643714e-05, "loss": 0.0547, "step": 5530 }, { "epoch": 21.307692307692307, "grad_norm": 2.1754443645477295, "learning_rate": 2.5970117578485506e-05, "loss": 0.0481, "step": 5540 }, { "epoch": 21.346153846153847, "grad_norm": 2.0452888011932373, "learning_rate": 2.5953187301444733e-05, "loss": 0.0436, "step": 5550 }, { "epoch": 21.384615384615383, "grad_norm": 2.2750635147094727, "learning_rate": 2.5936227078808123e-05, "loss": 0.0498, "step": 5560 }, { "epoch": 21.423076923076923, "grad_norm": 1.7035436630249023, "learning_rate": 2.5919236956944277e-05, "loss": 0.0455, "step": 5570 }, { "epoch": 21.46153846153846, "grad_norm": 2.014969825744629, "learning_rate": 2.5902216982303544e-05, "loss": 0.0502, "step": 5580 }, { "epoch": 21.5, "grad_norm": 2.2059478759765625, "learning_rate": 2.588516720141788e-05, "loss": 0.0466, "step": 5590 }, { "epoch": 21.53846153846154, "grad_norm": 1.8128308057785034, "learning_rate": 2.5868087660900735e-05, "loss": 0.0494, "step": 5600 }, { "epoch": 21.576923076923077, "grad_norm": 2.221346139907837, "learning_rate": 2.5850978407446924e-05, "loss": 0.0532, "step": 5610 }, { "epoch": 21.615384615384617, "grad_norm": 1.9685431718826294, "learning_rate": 2.5833839487832488e-05, "loss": 0.0506, "step": 5620 }, { "epoch": 21.653846153846153, "grad_norm": 1.538164734840393, "learning_rate": 2.5816670948914583e-05, "loss": 0.0446, "step": 5630 }, { "epoch": 21.692307692307693, "grad_norm": 1.8389371633529663, "learning_rate": 2.5799472837631338e-05, "loss": 0.053, "step": 5640 }, { "epoch": 21.73076923076923, "grad_norm": 2.2778820991516113, "learning_rate": 2.578224520100173e-05, "loss": 0.0423, "step": 5650 }, { "epoch": 21.76923076923077, "grad_norm": 2.5849103927612305, "learning_rate": 2.576498808612546e-05, "loss": 0.0489, "step": 5660 }, { "epoch": 21.807692307692307, "grad_norm": 2.3590989112854004, "learning_rate": 2.5747701540182825e-05, "loss": 0.0514, "step": 5670 }, { "epoch": 21.846153846153847, "grad_norm": 1.8261209726333618, "learning_rate": 2.573038561043458e-05, "loss": 0.0459, "step": 5680 }, { "epoch": 21.884615384615383, "grad_norm": 2.492357015609741, "learning_rate": 2.5713040344221815e-05, "loss": 0.053, "step": 5690 }, { "epoch": 21.923076923076923, "grad_norm": 2.3282649517059326, "learning_rate": 2.5695665788965823e-05, "loss": 0.0422, "step": 5700 }, { "epoch": 21.96153846153846, "grad_norm": 2.3369557857513428, "learning_rate": 2.5678261992167978e-05, "loss": 0.0542, "step": 5710 }, { "epoch": 22.0, "grad_norm": 2.2342748641967773, "learning_rate": 2.5660829001409594e-05, "loss": 0.0553, "step": 5720 }, { "epoch": 22.03846153846154, "grad_norm": 1.9400802850723267, "learning_rate": 2.5643366864351806e-05, "loss": 0.0554, "step": 5730 }, { "epoch": 22.076923076923077, "grad_norm": 2.1915154457092285, "learning_rate": 2.5625875628735423e-05, "loss": 0.0463, "step": 5740 }, { "epoch": 22.115384615384617, "grad_norm": 1.7244971990585327, "learning_rate": 2.560835534238082e-05, "loss": 0.0466, "step": 5750 }, { "epoch": 22.153846153846153, "grad_norm": 2.1806528568267822, "learning_rate": 2.5590806053187793e-05, "loss": 0.0483, "step": 5760 }, { "epoch": 22.192307692307693, "grad_norm": 1.9255647659301758, "learning_rate": 2.557322780913542e-05, "loss": 0.0489, "step": 5770 }, { "epoch": 22.23076923076923, "grad_norm": 2.2142374515533447, "learning_rate": 2.555562065828196e-05, "loss": 0.0527, "step": 5780 }, { "epoch": 22.26923076923077, "grad_norm": 2.4573400020599365, "learning_rate": 2.5537984648764684e-05, "loss": 0.0451, "step": 5790 }, { "epoch": 22.307692307692307, "grad_norm": 2.1143500804901123, "learning_rate": 2.5520319828799766e-05, "loss": 0.0479, "step": 5800 }, { "epoch": 22.346153846153847, "grad_norm": 1.8646248579025269, "learning_rate": 2.550262624668216e-05, "loss": 0.0435, "step": 5810 }, { "epoch": 22.384615384615383, "grad_norm": 1.5074269771575928, "learning_rate": 2.5484903950785432e-05, "loss": 0.0446, "step": 5820 }, { "epoch": 22.423076923076923, "grad_norm": 2.103839874267578, "learning_rate": 2.546715298956167e-05, "loss": 0.0527, "step": 5830 }, { "epoch": 22.46153846153846, "grad_norm": 1.9880903959274292, "learning_rate": 2.5449373411541322e-05, "loss": 0.0492, "step": 5840 }, { "epoch": 22.5, "grad_norm": 1.8882006406784058, "learning_rate": 2.5431565265333074e-05, "loss": 0.0552, "step": 5850 }, { "epoch": 22.53846153846154, "grad_norm": 2.3421194553375244, "learning_rate": 2.541372859962372e-05, "loss": 0.0487, "step": 5860 }, { "epoch": 22.576923076923077, "grad_norm": 2.0303971767425537, "learning_rate": 2.5395863463178023e-05, "loss": 0.0467, "step": 5870 }, { "epoch": 22.615384615384617, "grad_norm": 1.9958126544952393, "learning_rate": 2.537796990483858e-05, "loss": 0.0593, "step": 5880 }, { "epoch": 22.653846153846153, "grad_norm": 2.40570068359375, "learning_rate": 2.53600479735257e-05, "loss": 0.046, "step": 5890 }, { "epoch": 22.692307692307693, "grad_norm": 1.7980605363845825, "learning_rate": 2.5342097718237262e-05, "loss": 0.0496, "step": 5900 }, { "epoch": 22.73076923076923, "grad_norm": 2.364152431488037, "learning_rate": 2.5324119188048567e-05, "loss": 0.0485, "step": 5910 }, { "epoch": 22.76923076923077, "grad_norm": 2.065056085586548, "learning_rate": 2.530611243211224e-05, "loss": 0.0483, "step": 5920 }, { "epoch": 22.807692307692307, "grad_norm": 1.8822139501571655, "learning_rate": 2.5288077499658064e-05, "loss": 0.0445, "step": 5930 }, { "epoch": 22.846153846153847, "grad_norm": 1.9192858934402466, "learning_rate": 2.527001443999285e-05, "loss": 0.0462, "step": 5940 }, { "epoch": 22.884615384615383, "grad_norm": 2.175851821899414, "learning_rate": 2.5251923302500318e-05, "loss": 0.0448, "step": 5950 }, { "epoch": 22.923076923076923, "grad_norm": 2.1367673873901367, "learning_rate": 2.523380413664095e-05, "loss": 0.0461, "step": 5960 }, { "epoch": 22.96153846153846, "grad_norm": 2.0076870918273926, "learning_rate": 2.5215656991951844e-05, "loss": 0.0429, "step": 5970 }, { "epoch": 23.0, "grad_norm": 2.2983851432800293, "learning_rate": 2.5197481918046606e-05, "loss": 0.0473, "step": 5980 }, { "epoch": 23.03846153846154, "grad_norm": 1.547244906425476, "learning_rate": 2.5179278964615192e-05, "loss": 0.0441, "step": 5990 }, { "epoch": 23.076923076923077, "grad_norm": 1.8851597309112549, "learning_rate": 2.516104818142379e-05, "loss": 0.0494, "step": 6000 }, { "epoch": 23.115384615384617, "grad_norm": 1.918543815612793, "learning_rate": 2.5142789618314654e-05, "loss": 0.0417, "step": 6010 }, { "epoch": 23.153846153846153, "grad_norm": 1.7837193012237549, "learning_rate": 2.5124503325206006e-05, "loss": 0.0469, "step": 6020 }, { "epoch": 23.192307692307693, "grad_norm": 1.9671348333358765, "learning_rate": 2.5106189352091867e-05, "loss": 0.0429, "step": 6030 }, { "epoch": 23.23076923076923, "grad_norm": 1.8569093942642212, "learning_rate": 2.5087847749041944e-05, "loss": 0.0498, "step": 6040 }, { "epoch": 23.26923076923077, "grad_norm": 2.0214736461639404, "learning_rate": 2.506947856620148e-05, "loss": 0.0395, "step": 6050 }, { "epoch": 23.307692307692307, "grad_norm": 1.9467337131500244, "learning_rate": 2.505108185379111e-05, "loss": 0.046, "step": 6060 }, { "epoch": 23.346153846153847, "grad_norm": 1.8502717018127441, "learning_rate": 2.503265766210676e-05, "loss": 0.044, "step": 6070 }, { "epoch": 23.384615384615383, "grad_norm": 2.310283660888672, "learning_rate": 2.5014206041519456e-05, "loss": 0.0521, "step": 6080 }, { "epoch": 23.423076923076923, "grad_norm": 1.6830577850341797, "learning_rate": 2.499572704247523e-05, "loss": 0.0458, "step": 6090 }, { "epoch": 23.46153846153846, "grad_norm": 1.6736561059951782, "learning_rate": 2.497722071549495e-05, "loss": 0.0422, "step": 6100 }, { "epoch": 23.5, "grad_norm": 1.5868667364120483, "learning_rate": 2.4958687111174216e-05, "loss": 0.0432, "step": 6110 }, { "epoch": 23.53846153846154, "grad_norm": 2.2007408142089844, "learning_rate": 2.494012628018319e-05, "loss": 0.0495, "step": 6120 }, { "epoch": 23.576923076923077, "grad_norm": 1.7047786712646484, "learning_rate": 2.4921538273266475e-05, "loss": 0.0491, "step": 6130 }, { "epoch": 23.615384615384617, "grad_norm": 2.0366289615631104, "learning_rate": 2.490292314124298e-05, "loss": 0.0413, "step": 6140 }, { "epoch": 23.653846153846153, "grad_norm": 2.104093551635742, "learning_rate": 2.4884280935005755e-05, "loss": 0.0395, "step": 6150 }, { "epoch": 23.692307692307693, "grad_norm": 2.0423264503479004, "learning_rate": 2.486561170552188e-05, "loss": 0.051, "step": 6160 }, { "epoch": 23.73076923076923, "grad_norm": 2.2798447608947754, "learning_rate": 2.4846915503832326e-05, "loss": 0.0504, "step": 6170 }, { "epoch": 23.76923076923077, "grad_norm": 2.168039083480835, "learning_rate": 2.4828192381051787e-05, "loss": 0.0494, "step": 6180 }, { "epoch": 23.807692307692307, "grad_norm": 2.1671855449676514, "learning_rate": 2.480944238836857e-05, "loss": 0.0532, "step": 6190 }, { "epoch": 23.846153846153847, "grad_norm": 1.7856274843215942, "learning_rate": 2.4790665577044428e-05, "loss": 0.046, "step": 6200 }, { "epoch": 23.884615384615383, "grad_norm": 2.0235636234283447, "learning_rate": 2.4771861998414458e-05, "loss": 0.0465, "step": 6210 }, { "epoch": 23.923076923076923, "grad_norm": 2.179847478866577, "learning_rate": 2.475303170388692e-05, "loss": 0.0447, "step": 6220 }, { "epoch": 23.96153846153846, "grad_norm": 1.6464632749557495, "learning_rate": 2.4734174744943122e-05, "loss": 0.0449, "step": 6230 }, { "epoch": 24.0, "grad_norm": 1.7687687873840332, "learning_rate": 2.471529117313727e-05, "loss": 0.0447, "step": 6240 }, { "epoch": 24.03846153846154, "grad_norm": 1.8370416164398193, "learning_rate": 2.4696381040096335e-05, "loss": 0.0476, "step": 6250 }, { "epoch": 24.076923076923077, "grad_norm": 1.8568719625473022, "learning_rate": 2.4677444397519883e-05, "loss": 0.0478, "step": 6260 }, { "epoch": 24.115384615384617, "grad_norm": 1.9673206806182861, "learning_rate": 2.4658481297179987e-05, "loss": 0.0466, "step": 6270 }, { "epoch": 24.153846153846153, "grad_norm": 2.064671516418457, "learning_rate": 2.4639491790921028e-05, "loss": 0.0482, "step": 6280 }, { "epoch": 24.192307692307693, "grad_norm": 2.0179781913757324, "learning_rate": 2.4620475930659596e-05, "loss": 0.0491, "step": 6290 }, { "epoch": 24.23076923076923, "grad_norm": 1.8786386251449585, "learning_rate": 2.4601433768384327e-05, "loss": 0.042, "step": 6300 }, { "epoch": 24.26923076923077, "grad_norm": 2.0720064640045166, "learning_rate": 2.4582365356155766e-05, "loss": 0.0452, "step": 6310 }, { "epoch": 24.307692307692307, "grad_norm": 2.04176926612854, "learning_rate": 2.4563270746106224e-05, "loss": 0.0432, "step": 6320 }, { "epoch": 24.346153846153847, "grad_norm": 1.7975056171417236, "learning_rate": 2.4544149990439632e-05, "loss": 0.0431, "step": 6330 }, { "epoch": 24.384615384615383, "grad_norm": 1.9778639078140259, "learning_rate": 2.4525003141431413e-05, "loss": 0.0431, "step": 6340 }, { "epoch": 24.423076923076923, "grad_norm": 1.931883454322815, "learning_rate": 2.450583025142831e-05, "loss": 0.0411, "step": 6350 }, { "epoch": 24.46153846153846, "grad_norm": 2.43259596824646, "learning_rate": 2.4486631372848286e-05, "loss": 0.0464, "step": 6360 }, { "epoch": 24.5, "grad_norm": 2.0310606956481934, "learning_rate": 2.4467406558180328e-05, "loss": 0.0453, "step": 6370 }, { "epoch": 24.53846153846154, "grad_norm": 1.961578130722046, "learning_rate": 2.4448155859984357e-05, "loss": 0.0425, "step": 6380 }, { "epoch": 24.576923076923077, "grad_norm": 1.9241677522659302, "learning_rate": 2.442887933089104e-05, "loss": 0.0489, "step": 6390 }, { "epoch": 24.615384615384617, "grad_norm": 1.6629705429077148, "learning_rate": 2.440957702360167e-05, "loss": 0.0449, "step": 6400 }, { "epoch": 24.653846153846153, "grad_norm": 1.9020706415176392, "learning_rate": 2.4390248990888026e-05, "loss": 0.0468, "step": 6410 }, { "epoch": 24.692307692307693, "grad_norm": 1.9919699430465698, "learning_rate": 2.4370895285592202e-05, "loss": 0.047, "step": 6420 }, { "epoch": 24.73076923076923, "grad_norm": 1.7475947141647339, "learning_rate": 2.43515159606265e-05, "loss": 0.041, "step": 6430 }, { "epoch": 24.76923076923077, "grad_norm": 1.8714221715927124, "learning_rate": 2.4332111068973243e-05, "loss": 0.045, "step": 6440 }, { "epoch": 24.807692307692307, "grad_norm": 1.8813892602920532, "learning_rate": 2.4312680663684674e-05, "loss": 0.0416, "step": 6450 }, { "epoch": 24.846153846153847, "grad_norm": 1.9131197929382324, "learning_rate": 2.429322479788277e-05, "loss": 0.0423, "step": 6460 }, { "epoch": 24.884615384615383, "grad_norm": 1.9985467195510864, "learning_rate": 2.4273743524759132e-05, "loss": 0.0496, "step": 6470 }, { "epoch": 24.923076923076923, "grad_norm": 2.1156630516052246, "learning_rate": 2.4254236897574818e-05, "loss": 0.0425, "step": 6480 }, { "epoch": 24.96153846153846, "grad_norm": 2.0164928436279297, "learning_rate": 2.4234704969660192e-05, "loss": 0.0421, "step": 6490 }, { "epoch": 25.0, "grad_norm": 1.9131778478622437, "learning_rate": 2.4215147794414806e-05, "loss": 0.0431, "step": 6500 }, { "epoch": 25.03846153846154, "grad_norm": 2.0394511222839355, "learning_rate": 2.419556542530723e-05, "loss": 0.0462, "step": 6510 }, { "epoch": 25.076923076923077, "grad_norm": 1.879845142364502, "learning_rate": 2.4175957915874916e-05, "loss": 0.0489, "step": 6520 }, { "epoch": 25.115384615384617, "grad_norm": 1.9313360452651978, "learning_rate": 2.4156325319724037e-05, "loss": 0.0473, "step": 6530 }, { "epoch": 25.153846153846153, "grad_norm": 1.9558639526367188, "learning_rate": 2.4136667690529372e-05, "loss": 0.0432, "step": 6540 }, { "epoch": 25.192307692307693, "grad_norm": 1.932648777961731, "learning_rate": 2.4116985082034126e-05, "loss": 0.0441, "step": 6550 }, { "epoch": 25.23076923076923, "grad_norm": 1.8341952562332153, "learning_rate": 2.409727754804979e-05, "loss": 0.0484, "step": 6560 }, { "epoch": 25.26923076923077, "grad_norm": 2.142963171005249, "learning_rate": 2.4077545142456025e-05, "loss": 0.0491, "step": 6570 }, { "epoch": 25.307692307692307, "grad_norm": 2.0177972316741943, "learning_rate": 2.405778791920046e-05, "loss": 0.0581, "step": 6580 }, { "epoch": 25.346153846153847, "grad_norm": 2.0351908206939697, "learning_rate": 2.4038005932298594e-05, "loss": 0.0448, "step": 6590 }, { "epoch": 25.384615384615383, "grad_norm": 1.8626478910446167, "learning_rate": 2.4018199235833624e-05, "loss": 0.0428, "step": 6600 }, { "epoch": 25.423076923076923, "grad_norm": 1.7484854459762573, "learning_rate": 2.3998367883956306e-05, "loss": 0.0462, "step": 6610 }, { "epoch": 25.46153846153846, "grad_norm": 1.729616403579712, "learning_rate": 2.3978511930884795e-05, "loss": 0.0421, "step": 6620 }, { "epoch": 25.5, "grad_norm": 1.5235844850540161, "learning_rate": 2.3958631430904504e-05, "loss": 0.0489, "step": 6630 }, { "epoch": 25.53846153846154, "grad_norm": 1.9413248300552368, "learning_rate": 2.393872643836797e-05, "loss": 0.0464, "step": 6640 }, { "epoch": 25.576923076923077, "grad_norm": 1.6310465335845947, "learning_rate": 2.3918797007694675e-05, "loss": 0.0412, "step": 6650 }, { "epoch": 25.615384615384617, "grad_norm": 2.049593925476074, "learning_rate": 2.3898843193370923e-05, "loss": 0.0451, "step": 6660 }, { "epoch": 25.653846153846153, "grad_norm": 1.554084062576294, "learning_rate": 2.387886504994969e-05, "loss": 0.0434, "step": 6670 }, { "epoch": 25.692307692307693, "grad_norm": 2.2045257091522217, "learning_rate": 2.385886263205044e-05, "loss": 0.0458, "step": 6680 }, { "epoch": 25.73076923076923, "grad_norm": 2.1516737937927246, "learning_rate": 2.3838835994359036e-05, "loss": 0.0445, "step": 6690 }, { "epoch": 25.76923076923077, "grad_norm": 1.9684396982192993, "learning_rate": 2.3818785191627525e-05, "loss": 0.0442, "step": 6700 }, { "epoch": 25.807692307692307, "grad_norm": 1.7377302646636963, "learning_rate": 2.379871027867405e-05, "loss": 0.0443, "step": 6710 }, { "epoch": 25.846153846153847, "grad_norm": 1.860579490661621, "learning_rate": 2.3778611310382653e-05, "loss": 0.043, "step": 6720 }, { "epoch": 25.884615384615383, "grad_norm": 2.0663163661956787, "learning_rate": 2.3758488341703137e-05, "loss": 0.0444, "step": 6730 }, { "epoch": 25.923076923076923, "grad_norm": 1.8501702547073364, "learning_rate": 2.3738341427650945e-05, "loss": 0.0497, "step": 6740 }, { "epoch": 25.96153846153846, "grad_norm": 1.7028001546859741, "learning_rate": 2.3718170623306955e-05, "loss": 0.0418, "step": 6750 }, { "epoch": 26.0, "grad_norm": 1.7601794004440308, "learning_rate": 2.369797598381739e-05, "loss": 0.0413, "step": 6760 }, { "epoch": 26.03846153846154, "grad_norm": 1.8938238620758057, "learning_rate": 2.3677757564393612e-05, "loss": 0.0397, "step": 6770 }, { "epoch": 26.076923076923077, "grad_norm": 2.0491912364959717, "learning_rate": 2.3657515420312015e-05, "loss": 0.0414, "step": 6780 }, { "epoch": 26.115384615384617, "grad_norm": 2.1569526195526123, "learning_rate": 2.3637249606913847e-05, "loss": 0.045, "step": 6790 }, { "epoch": 26.153846153846153, "grad_norm": 1.8800678253173828, "learning_rate": 2.3616960179605064e-05, "loss": 0.0532, "step": 6800 }, { "epoch": 26.192307692307693, "grad_norm": 1.7340847253799438, "learning_rate": 2.3596647193856188e-05, "loss": 0.0409, "step": 6810 }, { "epoch": 26.23076923076923, "grad_norm": 2.0529165267944336, "learning_rate": 2.3576310705202143e-05, "loss": 0.0409, "step": 6820 }, { "epoch": 26.26923076923077, "grad_norm": 1.351607322692871, "learning_rate": 2.3555950769242122e-05, "loss": 0.0378, "step": 6830 }, { "epoch": 26.307692307692307, "grad_norm": 1.7300976514816284, "learning_rate": 2.3535567441639396e-05, "loss": 0.0416, "step": 6840 }, { "epoch": 26.346153846153847, "grad_norm": 1.606675148010254, "learning_rate": 2.351516077812122e-05, "loss": 0.0385, "step": 6850 }, { "epoch": 26.384615384615383, "grad_norm": 2.1588430404663086, "learning_rate": 2.349473083447863e-05, "loss": 0.0479, "step": 6860 }, { "epoch": 26.423076923076923, "grad_norm": 1.7339332103729248, "learning_rate": 2.3474277666566307e-05, "loss": 0.0499, "step": 6870 }, { "epoch": 26.46153846153846, "grad_norm": 1.6982123851776123, "learning_rate": 2.345380133030243e-05, "loss": 0.0507, "step": 6880 }, { "epoch": 26.5, "grad_norm": 1.991782307624817, "learning_rate": 2.343330188166853e-05, "loss": 0.0455, "step": 6890 }, { "epoch": 26.53846153846154, "grad_norm": 1.6237735748291016, "learning_rate": 2.3412779376709304e-05, "loss": 0.043, "step": 6900 }, { "epoch": 26.576923076923077, "grad_norm": 1.8890820741653442, "learning_rate": 2.3392233871532504e-05, "loss": 0.0465, "step": 6910 }, { "epoch": 26.615384615384617, "grad_norm": 1.9966788291931152, "learning_rate": 2.337166542230876e-05, "loss": 0.0509, "step": 6920 }, { "epoch": 26.653846153846153, "grad_norm": 2.028984546661377, "learning_rate": 2.335107408527142e-05, "loss": 0.0391, "step": 6930 }, { "epoch": 26.692307692307693, "grad_norm": 1.8344041109085083, "learning_rate": 2.3330459916716417e-05, "loss": 0.0387, "step": 6940 }, { "epoch": 26.73076923076923, "grad_norm": 1.8030184507369995, "learning_rate": 2.3309822973002097e-05, "loss": 0.0476, "step": 6950 }, { "epoch": 26.76923076923077, "grad_norm": 2.0031795501708984, "learning_rate": 2.328916331054908e-05, "loss": 0.0437, "step": 6960 }, { "epoch": 26.807692307692307, "grad_norm": 1.6738231182098389, "learning_rate": 2.3268480985840093e-05, "loss": 0.0423, "step": 6970 }, { "epoch": 26.846153846153847, "grad_norm": 1.5337883234024048, "learning_rate": 2.3247776055419826e-05, "loss": 0.0417, "step": 6980 }, { "epoch": 26.884615384615383, "grad_norm": 1.6017706394195557, "learning_rate": 2.3227048575894758e-05, "loss": 0.0451, "step": 6990 }, { "epoch": 26.923076923076923, "grad_norm": 1.8597776889801025, "learning_rate": 2.3206298603933037e-05, "loss": 0.04, "step": 7000 }, { "epoch": 26.96153846153846, "grad_norm": 1.6399952173233032, "learning_rate": 2.3185526196264288e-05, "loss": 0.0434, "step": 7010 }, { "epoch": 27.0, "grad_norm": 1.3050568103790283, "learning_rate": 2.3164731409679476e-05, "loss": 0.0364, "step": 7020 }, { "epoch": 27.03846153846154, "grad_norm": 1.4773322343826294, "learning_rate": 2.3143914301030765e-05, "loss": 0.0443, "step": 7030 }, { "epoch": 27.076923076923077, "grad_norm": 1.4294204711914062, "learning_rate": 2.3123074927231332e-05, "loss": 0.0451, "step": 7040 }, { "epoch": 27.115384615384617, "grad_norm": 1.8464568853378296, "learning_rate": 2.310221334525522e-05, "loss": 0.0509, "step": 7050 }, { "epoch": 27.153846153846153, "grad_norm": 1.5100023746490479, "learning_rate": 2.3081329612137207e-05, "loss": 0.041, "step": 7060 }, { "epoch": 27.192307692307693, "grad_norm": 1.688292145729065, "learning_rate": 2.3060423784972625e-05, "loss": 0.045, "step": 7070 }, { "epoch": 27.23076923076923, "grad_norm": 1.5028406381607056, "learning_rate": 2.3039495920917193e-05, "loss": 0.0398, "step": 7080 }, { "epoch": 27.26923076923077, "grad_norm": 1.6817067861557007, "learning_rate": 2.301854607718691e-05, "loss": 0.0399, "step": 7090 }, { "epoch": 27.307692307692307, "grad_norm": 1.7924225330352783, "learning_rate": 2.299757431105783e-05, "loss": 0.0435, "step": 7100 }, { "epoch": 27.346153846153847, "grad_norm": 1.8683700561523438, "learning_rate": 2.2976580679865972e-05, "loss": 0.0438, "step": 7110 }, { "epoch": 27.384615384615383, "grad_norm": 1.6464298963546753, "learning_rate": 2.2955565241007123e-05, "loss": 0.0462, "step": 7120 }, { "epoch": 27.423076923076923, "grad_norm": 1.7487648725509644, "learning_rate": 2.293452805193669e-05, "loss": 0.0407, "step": 7130 }, { "epoch": 27.46153846153846, "grad_norm": 1.8740178346633911, "learning_rate": 2.291346917016954e-05, "loss": 0.046, "step": 7140 }, { "epoch": 27.5, "grad_norm": 1.5780128240585327, "learning_rate": 2.289238865327985e-05, "loss": 0.0451, "step": 7150 }, { "epoch": 27.53846153846154, "grad_norm": 1.7798570394515991, "learning_rate": 2.2871286558900956e-05, "loss": 0.0461, "step": 7160 }, { "epoch": 27.576923076923077, "grad_norm": 2.028824806213379, "learning_rate": 2.285016294472517e-05, "loss": 0.0448, "step": 7170 }, { "epoch": 27.615384615384617, "grad_norm": 1.669351577758789, "learning_rate": 2.2829017868503658e-05, "loss": 0.042, "step": 7180 }, { "epoch": 27.653846153846153, "grad_norm": 2.1708717346191406, "learning_rate": 2.280785138804624e-05, "loss": 0.0513, "step": 7190 }, { "epoch": 27.692307692307693, "grad_norm": 1.5223692655563354, "learning_rate": 2.2786663561221265e-05, "loss": 0.0407, "step": 7200 }, { "epoch": 27.73076923076923, "grad_norm": 1.670336365699768, "learning_rate": 2.2765454445955452e-05, "loss": 0.0369, "step": 7210 }, { "epoch": 27.76923076923077, "grad_norm": 1.4955790042877197, "learning_rate": 2.2744224100233705e-05, "loss": 0.0479, "step": 7220 }, { "epoch": 27.807692307692307, "grad_norm": 1.7686519622802734, "learning_rate": 2.2722972582098984e-05, "loss": 0.0408, "step": 7230 }, { "epoch": 27.846153846153847, "grad_norm": 2.064610004425049, "learning_rate": 2.2701699949652118e-05, "loss": 0.0415, "step": 7240 }, { "epoch": 27.884615384615383, "grad_norm": 2.065558433532715, "learning_rate": 2.2680406261051685e-05, "loss": 0.0386, "step": 7250 }, { "epoch": 27.923076923076923, "grad_norm": 1.6095936298370361, "learning_rate": 2.2659091574513805e-05, "loss": 0.0433, "step": 7260 }, { "epoch": 27.96153846153846, "grad_norm": 1.7477508783340454, "learning_rate": 2.263775594831202e-05, "loss": 0.0431, "step": 7270 }, { "epoch": 28.0, "grad_norm": 1.827488899230957, "learning_rate": 2.2616399440777128e-05, "loss": 0.0389, "step": 7280 }, { "epoch": 28.03846153846154, "grad_norm": 1.7892285585403442, "learning_rate": 2.2595022110296988e-05, "loss": 0.042, "step": 7290 }, { "epoch": 28.076923076923077, "grad_norm": 2.0402352809906006, "learning_rate": 2.2573624015316418e-05, "loss": 0.0459, "step": 7300 }, { "epoch": 28.115384615384617, "grad_norm": 1.7068613767623901, "learning_rate": 2.2552205214336986e-05, "loss": 0.0445, "step": 7310 }, { "epoch": 28.153846153846153, "grad_norm": 1.5043061971664429, "learning_rate": 2.253076576591688e-05, "loss": 0.041, "step": 7320 }, { "epoch": 28.192307692307693, "grad_norm": 1.7505626678466797, "learning_rate": 2.2509305728670733e-05, "loss": 0.0458, "step": 7330 }, { "epoch": 28.23076923076923, "grad_norm": 1.7237753868103027, "learning_rate": 2.2487825161269463e-05, "loss": 0.0417, "step": 7340 }, { "epoch": 28.26923076923077, "grad_norm": 1.7082198858261108, "learning_rate": 2.2466324122440125e-05, "loss": 0.0397, "step": 7350 }, { "epoch": 28.307692307692307, "grad_norm": 1.798220157623291, "learning_rate": 2.2444802670965732e-05, "loss": 0.0403, "step": 7360 }, { "epoch": 28.346153846153847, "grad_norm": 2.100522994995117, "learning_rate": 2.2423260865685124e-05, "loss": 0.0401, "step": 7370 }, { "epoch": 28.384615384615383, "grad_norm": 2.088228702545166, "learning_rate": 2.2401698765492762e-05, "loss": 0.0438, "step": 7380 }, { "epoch": 28.423076923076923, "grad_norm": 1.6894526481628418, "learning_rate": 2.2380116429338612e-05, "loss": 0.0445, "step": 7390 }, { "epoch": 28.46153846153846, "grad_norm": 1.8538814783096313, "learning_rate": 2.2358513916227945e-05, "loss": 0.0418, "step": 7400 }, { "epoch": 28.5, "grad_norm": 1.6178901195526123, "learning_rate": 2.233689128522122e-05, "loss": 0.0372, "step": 7410 }, { "epoch": 28.53846153846154, "grad_norm": 1.6388591527938843, "learning_rate": 2.2315248595433883e-05, "loss": 0.0455, "step": 7420 }, { "epoch": 28.576923076923077, "grad_norm": 1.7331197261810303, "learning_rate": 2.2293585906036214e-05, "loss": 0.0421, "step": 7430 }, { "epoch": 28.615384615384617, "grad_norm": 1.5612764358520508, "learning_rate": 2.2271903276253183e-05, "loss": 0.0426, "step": 7440 }, { "epoch": 28.653846153846153, "grad_norm": 1.5970007181167603, "learning_rate": 2.2250200765364273e-05, "loss": 0.0393, "step": 7450 }, { "epoch": 28.692307692307693, "grad_norm": 1.7833430767059326, "learning_rate": 2.2228478432703317e-05, "loss": 0.043, "step": 7460 }, { "epoch": 28.73076923076923, "grad_norm": 1.7582666873931885, "learning_rate": 2.2206736337658348e-05, "loss": 0.0426, "step": 7470 }, { "epoch": 28.76923076923077, "grad_norm": 1.8394339084625244, "learning_rate": 2.2184974539671417e-05, "loss": 0.0421, "step": 7480 }, { "epoch": 28.807692307692307, "grad_norm": 2.0509567260742188, "learning_rate": 2.2163193098238453e-05, "loss": 0.0468, "step": 7490 }, { "epoch": 28.846153846153847, "grad_norm": 1.9507832527160645, "learning_rate": 2.2141392072909082e-05, "loss": 0.0427, "step": 7500 }, { "epoch": 28.884615384615383, "grad_norm": 2.1092071533203125, "learning_rate": 2.2119571523286484e-05, "loss": 0.037, "step": 7510 }, { "epoch": 28.923076923076923, "grad_norm": 1.6048542261123657, "learning_rate": 2.2097731509027196e-05, "loss": 0.045, "step": 7520 }, { "epoch": 28.96153846153846, "grad_norm": 1.5821419954299927, "learning_rate": 2.207587208984099e-05, "loss": 0.0443, "step": 7530 }, { "epoch": 29.0, "grad_norm": 1.5150803327560425, "learning_rate": 2.205399332549068e-05, "loss": 0.0385, "step": 7540 }, { "epoch": 29.03846153846154, "grad_norm": 1.6221213340759277, "learning_rate": 2.2032095275791974e-05, "loss": 0.0407, "step": 7550 }, { "epoch": 29.076923076923077, "grad_norm": 1.737960934638977, "learning_rate": 2.2010178000613307e-05, "loss": 0.0407, "step": 7560 }, { "epoch": 29.115384615384617, "grad_norm": 1.7430760860443115, "learning_rate": 2.1988241559875666e-05, "loss": 0.0417, "step": 7570 }, { "epoch": 29.153846153846153, "grad_norm": 1.7095967531204224, "learning_rate": 2.1966286013552448e-05, "loss": 0.0466, "step": 7580 }, { "epoch": 29.192307692307693, "grad_norm": 1.701668381690979, "learning_rate": 2.1944311421669274e-05, "loss": 0.0432, "step": 7590 }, { "epoch": 29.23076923076923, "grad_norm": 1.6135324239730835, "learning_rate": 2.1922317844303846e-05, "loss": 0.045, "step": 7600 }, { "epoch": 29.26923076923077, "grad_norm": 1.5123496055603027, "learning_rate": 2.1900305341585756e-05, "loss": 0.037, "step": 7610 }, { "epoch": 29.307692307692307, "grad_norm": 1.4518818855285645, "learning_rate": 2.187827397369635e-05, "loss": 0.0387, "step": 7620 }, { "epoch": 29.346153846153847, "grad_norm": 1.8243811130523682, "learning_rate": 2.1856223800868542e-05, "loss": 0.0408, "step": 7630 }, { "epoch": 29.384615384615383, "grad_norm": 1.6765333414077759, "learning_rate": 2.183415488338667e-05, "loss": 0.0421, "step": 7640 }, { "epoch": 29.423076923076923, "grad_norm": 1.7374074459075928, "learning_rate": 2.1812067281586312e-05, "loss": 0.0449, "step": 7650 }, { "epoch": 29.46153846153846, "grad_norm": 1.3828516006469727, "learning_rate": 2.178996105585412e-05, "loss": 0.0416, "step": 7660 }, { "epoch": 29.5, "grad_norm": 1.41438627243042, "learning_rate": 2.1767836266627676e-05, "loss": 0.0416, "step": 7670 }, { "epoch": 29.53846153846154, "grad_norm": 1.4291772842407227, "learning_rate": 2.174569297439531e-05, "loss": 0.0381, "step": 7680 }, { "epoch": 29.576923076923077, "grad_norm": 1.4206595420837402, "learning_rate": 2.1723531239695932e-05, "loss": 0.0411, "step": 7690 }, { "epoch": 29.615384615384617, "grad_norm": 1.4999078512191772, "learning_rate": 2.1701351123118886e-05, "loss": 0.0406, "step": 7700 }, { "epoch": 29.653846153846153, "grad_norm": 1.9684511423110962, "learning_rate": 2.167915268530376e-05, "loss": 0.0438, "step": 7710 }, { "epoch": 29.692307692307693, "grad_norm": 1.7212599515914917, "learning_rate": 2.165693598694023e-05, "loss": 0.0397, "step": 7720 }, { "epoch": 29.73076923076923, "grad_norm": 1.5021146535873413, "learning_rate": 2.163470108876791e-05, "loss": 0.0478, "step": 7730 }, { "epoch": 29.76923076923077, "grad_norm": 1.5587773323059082, "learning_rate": 2.161244805157616e-05, "loss": 0.0357, "step": 7740 }, { "epoch": 29.807692307692307, "grad_norm": 1.5270205736160278, "learning_rate": 2.159017693620393e-05, "loss": 0.0415, "step": 7750 }, { "epoch": 29.846153846153847, "grad_norm": 1.5055110454559326, "learning_rate": 2.15678878035396e-05, "loss": 0.04, "step": 7760 }, { "epoch": 29.884615384615383, "grad_norm": 1.9528456926345825, "learning_rate": 2.1545580714520817e-05, "loss": 0.0351, "step": 7770 }, { "epoch": 29.923076923076923, "grad_norm": 1.9646743535995483, "learning_rate": 2.1523255730134294e-05, "loss": 0.0434, "step": 7780 }, { "epoch": 29.96153846153846, "grad_norm": 1.6051669120788574, "learning_rate": 2.15009129114157e-05, "loss": 0.0373, "step": 7790 }, { "epoch": 30.0, "grad_norm": 1.7047405242919922, "learning_rate": 2.1478552319449443e-05, "loss": 0.0408, "step": 7800 }, { "epoch": 30.03846153846154, "grad_norm": 1.537502646446228, "learning_rate": 2.1456174015368527e-05, "loss": 0.0426, "step": 7810 }, { "epoch": 30.076923076923077, "grad_norm": 1.4524402618408203, "learning_rate": 2.1433778060354375e-05, "loss": 0.0464, "step": 7820 }, { "epoch": 30.115384615384617, "grad_norm": 1.7234159708023071, "learning_rate": 2.1411364515636685e-05, "loss": 0.0512, "step": 7830 }, { "epoch": 30.153846153846153, "grad_norm": 1.512885570526123, "learning_rate": 2.1388933442493232e-05, "loss": 0.0438, "step": 7840 }, { "epoch": 30.192307692307693, "grad_norm": 1.5967854261398315, "learning_rate": 2.13664849022497e-05, "loss": 0.039, "step": 7850 }, { "epoch": 30.23076923076923, "grad_norm": 2.0385942459106445, "learning_rate": 2.1344018956279547e-05, "loss": 0.0369, "step": 7860 }, { "epoch": 30.26923076923077, "grad_norm": 1.9069523811340332, "learning_rate": 2.1321535666003817e-05, "loss": 0.0392, "step": 7870 }, { "epoch": 30.307692307692307, "grad_norm": 1.7070564031600952, "learning_rate": 2.1299035092890966e-05, "loss": 0.0447, "step": 7880 }, { "epoch": 30.346153846153847, "grad_norm": 1.4670286178588867, "learning_rate": 2.12765172984567e-05, "loss": 0.0416, "step": 7890 }, { "epoch": 30.384615384615383, "grad_norm": 1.7784276008605957, "learning_rate": 2.1253982344263803e-05, "loss": 0.0378, "step": 7900 }, { "epoch": 30.423076923076923, "grad_norm": 1.4911285638809204, "learning_rate": 2.1231430291921987e-05, "loss": 0.0459, "step": 7910 }, { "epoch": 30.46153846153846, "grad_norm": 1.4901173114776611, "learning_rate": 2.1208861203087695e-05, "loss": 0.0404, "step": 7920 }, { "epoch": 30.5, "grad_norm": 1.8902019262313843, "learning_rate": 2.1186275139463967e-05, "loss": 0.0421, "step": 7930 }, { "epoch": 30.53846153846154, "grad_norm": 1.5284932851791382, "learning_rate": 2.1163672162800222e-05, "loss": 0.0405, "step": 7940 }, { "epoch": 30.576923076923077, "grad_norm": 1.6077547073364258, "learning_rate": 2.114105233489215e-05, "loss": 0.0505, "step": 7950 }, { "epoch": 30.615384615384617, "grad_norm": 1.7635397911071777, "learning_rate": 2.1118415717581487e-05, "loss": 0.0509, "step": 7960 }, { "epoch": 30.653846153846153, "grad_norm": 1.7803516387939453, "learning_rate": 2.1095762372755885e-05, "loss": 0.0428, "step": 7970 }, { "epoch": 30.692307692307693, "grad_norm": 1.7161511182785034, "learning_rate": 2.1073092362348716e-05, "loss": 0.0496, "step": 7980 }, { "epoch": 30.73076923076923, "grad_norm": 1.8986924886703491, "learning_rate": 2.1050405748338933e-05, "loss": 0.0425, "step": 7990 }, { "epoch": 30.76923076923077, "grad_norm": 1.8261290788650513, "learning_rate": 2.102770259275087e-05, "loss": 0.0395, "step": 8000 }, { "epoch": 30.807692307692307, "grad_norm": 1.6952645778656006, "learning_rate": 2.100498295765408e-05, "loss": 0.0454, "step": 8010 }, { "epoch": 30.846153846153847, "grad_norm": 1.8140101432800293, "learning_rate": 2.098224690516319e-05, "loss": 0.0411, "step": 8020 }, { "epoch": 30.884615384615383, "grad_norm": 1.8083970546722412, "learning_rate": 2.0959494497437688e-05, "loss": 0.0427, "step": 8030 }, { "epoch": 30.923076923076923, "grad_norm": 1.270798921585083, "learning_rate": 2.0936725796681796e-05, "loss": 0.0379, "step": 8040 }, { "epoch": 30.96153846153846, "grad_norm": 1.6783995628356934, "learning_rate": 2.0913940865144266e-05, "loss": 0.0372, "step": 8050 }, { "epoch": 31.0, "grad_norm": 1.6331335306167603, "learning_rate": 2.0891139765118235e-05, "loss": 0.0415, "step": 8060 }, { "epoch": 31.03846153846154, "grad_norm": 1.3328851461410522, "learning_rate": 2.086832255894104e-05, "loss": 0.0386, "step": 8070 }, { "epoch": 31.076923076923077, "grad_norm": 1.5463865995407104, "learning_rate": 2.084548930899405e-05, "loss": 0.0392, "step": 8080 }, { "epoch": 31.115384615384617, "grad_norm": 1.4242238998413086, "learning_rate": 2.08226400777025e-05, "loss": 0.0371, "step": 8090 }, { "epoch": 31.153846153846153, "grad_norm": 1.5513811111450195, "learning_rate": 2.0799774927535313e-05, "loss": 0.0368, "step": 8100 }, { "epoch": 31.192307692307693, "grad_norm": 1.7626824378967285, "learning_rate": 2.0776893921004936e-05, "loss": 0.0394, "step": 8110 }, { "epoch": 31.23076923076923, "grad_norm": 1.5672152042388916, "learning_rate": 2.0753997120667172e-05, "loss": 0.0431, "step": 8120 }, { "epoch": 31.26923076923077, "grad_norm": 1.6233484745025635, "learning_rate": 2.0731084589120995e-05, "loss": 0.0358, "step": 8130 }, { "epoch": 31.307692307692307, "grad_norm": 1.7482249736785889, "learning_rate": 2.070815638900839e-05, "loss": 0.0446, "step": 8140 }, { "epoch": 31.346153846153847, "grad_norm": 1.4374932050704956, "learning_rate": 2.0685212583014186e-05, "loss": 0.0407, "step": 8150 }, { "epoch": 31.384615384615383, "grad_norm": 1.583405613899231, "learning_rate": 2.0662253233865866e-05, "loss": 0.0395, "step": 8160 }, { "epoch": 31.423076923076923, "grad_norm": 1.5504549741744995, "learning_rate": 2.063927840433342e-05, "loss": 0.0396, "step": 8170 }, { "epoch": 31.46153846153846, "grad_norm": 1.6861332654953003, "learning_rate": 2.0616288157229154e-05, "loss": 0.0427, "step": 8180 }, { "epoch": 31.5, "grad_norm": 1.3032209873199463, "learning_rate": 2.0593282555407522e-05, "loss": 0.0409, "step": 8190 }, { "epoch": 31.53846153846154, "grad_norm": 1.734984040260315, "learning_rate": 2.057026166176496e-05, "loss": 0.0405, "step": 8200 }, { "epoch": 31.576923076923077, "grad_norm": 1.5121088027954102, "learning_rate": 2.0547225539239715e-05, "loss": 0.04, "step": 8210 }, { "epoch": 31.615384615384617, "grad_norm": 1.6952282190322876, "learning_rate": 2.0524174250811665e-05, "loss": 0.0442, "step": 8220 }, { "epoch": 31.653846153846153, "grad_norm": 1.8165638446807861, "learning_rate": 2.050110785950216e-05, "loss": 0.0415, "step": 8230 }, { "epoch": 31.692307692307693, "grad_norm": 1.6110750436782837, "learning_rate": 2.047802642837382e-05, "loss": 0.0377, "step": 8240 }, { "epoch": 31.73076923076923, "grad_norm": 1.6655876636505127, "learning_rate": 2.0454930020530403e-05, "loss": 0.0385, "step": 8250 }, { "epoch": 31.76923076923077, "grad_norm": 1.491156816482544, "learning_rate": 2.0431818699116606e-05, "loss": 0.0375, "step": 8260 }, { "epoch": 31.807692307692307, "grad_norm": 1.4836500883102417, "learning_rate": 2.04086925273179e-05, "loss": 0.0521, "step": 8270 }, { "epoch": 31.846153846153847, "grad_norm": 1.317879319190979, "learning_rate": 2.0385551568360357e-05, "loss": 0.0393, "step": 8280 }, { "epoch": 31.884615384615383, "grad_norm": 1.4740430116653442, "learning_rate": 2.036239588551047e-05, "loss": 0.0401, "step": 8290 }, { "epoch": 31.923076923076923, "grad_norm": 1.7209266424179077, "learning_rate": 2.0339225542074996e-05, "loss": 0.0352, "step": 8300 }, { "epoch": 31.96153846153846, "grad_norm": 1.6838961839675903, "learning_rate": 2.0316040601400765e-05, "loss": 0.0433, "step": 8310 }, { "epoch": 32.0, "grad_norm": 1.6667869091033936, "learning_rate": 2.029284112687453e-05, "loss": 0.0417, "step": 8320 }, { "epoch": 32.03846153846154, "grad_norm": 1.8445954322814941, "learning_rate": 2.0269627181922752e-05, "loss": 0.0414, "step": 8330 }, { "epoch": 32.07692307692308, "grad_norm": 1.4822769165039062, "learning_rate": 2.0246398830011482e-05, "loss": 0.0382, "step": 8340 }, { "epoch": 32.11538461538461, "grad_norm": 1.6866445541381836, "learning_rate": 2.0223156134646142e-05, "loss": 0.042, "step": 8350 }, { "epoch": 32.15384615384615, "grad_norm": 1.7119044065475464, "learning_rate": 2.019989915937138e-05, "loss": 0.0403, "step": 8360 }, { "epoch": 32.19230769230769, "grad_norm": 2.0892767906188965, "learning_rate": 2.0176627967770873e-05, "loss": 0.035, "step": 8370 }, { "epoch": 32.23076923076923, "grad_norm": 1.5972812175750732, "learning_rate": 2.015334262346717e-05, "loss": 0.0409, "step": 8380 }, { "epoch": 32.26923076923077, "grad_norm": 1.5595967769622803, "learning_rate": 2.0130043190121515e-05, "loss": 0.04, "step": 8390 }, { "epoch": 32.30769230769231, "grad_norm": 1.349557876586914, "learning_rate": 2.0106729731433663e-05, "loss": 0.0337, "step": 8400 }, { "epoch": 32.34615384615385, "grad_norm": 1.450696587562561, "learning_rate": 2.008340231114173e-05, "loss": 0.0388, "step": 8410 }, { "epoch": 32.38461538461539, "grad_norm": 1.7912949323654175, "learning_rate": 2.006006099302199e-05, "loss": 0.0378, "step": 8420 }, { "epoch": 32.42307692307692, "grad_norm": 1.6425917148590088, "learning_rate": 2.003670584088871e-05, "loss": 0.0384, "step": 8430 }, { "epoch": 32.46153846153846, "grad_norm": 1.5332478284835815, "learning_rate": 2.001333691859399e-05, "loss": 0.0348, "step": 8440 }, { "epoch": 32.5, "grad_norm": 1.5672633647918701, "learning_rate": 1.9989954290027565e-05, "loss": 0.0379, "step": 8450 }, { "epoch": 32.53846153846154, "grad_norm": 1.715303897857666, "learning_rate": 1.9966558019116654e-05, "loss": 0.0344, "step": 8460 }, { "epoch": 32.57692307692308, "grad_norm": 1.3653415441513062, "learning_rate": 1.9943148169825766e-05, "loss": 0.0391, "step": 8470 }, { "epoch": 32.61538461538461, "grad_norm": 1.5256359577178955, "learning_rate": 1.991972480615653e-05, "loss": 0.046, "step": 8480 }, { "epoch": 32.65384615384615, "grad_norm": 1.432858943939209, "learning_rate": 1.989628799214754e-05, "loss": 0.0408, "step": 8490 }, { "epoch": 32.69230769230769, "grad_norm": 1.5813000202178955, "learning_rate": 1.987283779187414e-05, "loss": 0.0359, "step": 8500 }, { "epoch": 32.73076923076923, "grad_norm": 1.5625938177108765, "learning_rate": 1.9849374269448288e-05, "loss": 0.0407, "step": 8510 }, { "epoch": 32.76923076923077, "grad_norm": 1.6737438440322876, "learning_rate": 1.982589748901836e-05, "loss": 0.0387, "step": 8520 }, { "epoch": 32.80769230769231, "grad_norm": 1.765540361404419, "learning_rate": 1.9802407514768964e-05, "loss": 0.0369, "step": 8530 }, { "epoch": 32.84615384615385, "grad_norm": 1.6643508672714233, "learning_rate": 1.9778904410920808e-05, "loss": 0.036, "step": 8540 }, { "epoch": 32.88461538461539, "grad_norm": 1.470862865447998, "learning_rate": 1.9755388241730475e-05, "loss": 0.0414, "step": 8550 }, { "epoch": 32.92307692307692, "grad_norm": 1.748119831085205, "learning_rate": 1.973185907149027e-05, "loss": 0.0362, "step": 8560 }, { "epoch": 32.96153846153846, "grad_norm": 1.780103325843811, "learning_rate": 1.970831696452805e-05, "loss": 0.0421, "step": 8570 }, { "epoch": 33.0, "grad_norm": 1.5416651964187622, "learning_rate": 1.9684761985207038e-05, "loss": 0.0369, "step": 8580 }, { "epoch": 33.03846153846154, "grad_norm": 1.6990591287612915, "learning_rate": 1.9661194197925644e-05, "loss": 0.0356, "step": 8590 }, { "epoch": 33.07692307692308, "grad_norm": 1.5676331520080566, "learning_rate": 1.9637613667117303e-05, "loss": 0.0437, "step": 8600 }, { "epoch": 33.11538461538461, "grad_norm": 1.9157164096832275, "learning_rate": 1.961402045725028e-05, "loss": 0.0408, "step": 8610 }, { "epoch": 33.15384615384615, "grad_norm": 1.409117341041565, "learning_rate": 1.9590414632827513e-05, "loss": 0.0414, "step": 8620 }, { "epoch": 33.19230769230769, "grad_norm": 1.700792670249939, "learning_rate": 1.9566796258386424e-05, "loss": 0.037, "step": 8630 }, { "epoch": 33.23076923076923, "grad_norm": 1.9453880786895752, "learning_rate": 1.9543165398498743e-05, "loss": 0.0385, "step": 8640 }, { "epoch": 33.26923076923077, "grad_norm": 1.4440393447875977, "learning_rate": 1.9519522117770355e-05, "loss": 0.0409, "step": 8650 }, { "epoch": 33.30769230769231, "grad_norm": 1.8256205320358276, "learning_rate": 1.9495866480841063e-05, "loss": 0.038, "step": 8660 }, { "epoch": 33.34615384615385, "grad_norm": 2.067312479019165, "learning_rate": 1.9472198552384494e-05, "loss": 0.0378, "step": 8670 }, { "epoch": 33.38461538461539, "grad_norm": 1.8080998659133911, "learning_rate": 1.9448518397107848e-05, "loss": 0.0446, "step": 8680 }, { "epoch": 33.42307692307692, "grad_norm": 1.553263545036316, "learning_rate": 1.942482607975177e-05, "loss": 0.0385, "step": 8690 }, { "epoch": 33.46153846153846, "grad_norm": 1.4473556280136108, "learning_rate": 1.940112166509016e-05, "loss": 0.0382, "step": 8700 }, { "epoch": 33.5, "grad_norm": 1.6522272825241089, "learning_rate": 1.937740521792996e-05, "loss": 0.0339, "step": 8710 }, { "epoch": 33.53846153846154, "grad_norm": 1.5314304828643799, "learning_rate": 1.935367680311106e-05, "loss": 0.0381, "step": 8720 }, { "epoch": 33.57692307692308, "grad_norm": 1.5628886222839355, "learning_rate": 1.9329936485506012e-05, "loss": 0.0403, "step": 8730 }, { "epoch": 33.61538461538461, "grad_norm": 1.291195034980774, "learning_rate": 1.930618433001996e-05, "loss": 0.0386, "step": 8740 }, { "epoch": 33.65384615384615, "grad_norm": 1.1823079586029053, "learning_rate": 1.9282420401590377e-05, "loss": 0.0402, "step": 8750 }, { "epoch": 33.69230769230769, "grad_norm": 1.515400767326355, "learning_rate": 1.925864476518694e-05, "loss": 0.0431, "step": 8760 }, { "epoch": 33.73076923076923, "grad_norm": 1.39093017578125, "learning_rate": 1.9234857485811336e-05, "loss": 0.0366, "step": 8770 }, { "epoch": 33.76923076923077, "grad_norm": 1.5924241542816162, "learning_rate": 1.9211058628497066e-05, "loss": 0.0412, "step": 8780 }, { "epoch": 33.80769230769231, "grad_norm": 1.3143709897994995, "learning_rate": 1.918724825830931e-05, "loss": 0.0447, "step": 8790 }, { "epoch": 33.84615384615385, "grad_norm": 1.6090748310089111, "learning_rate": 1.9163426440344702e-05, "loss": 0.0398, "step": 8800 }, { "epoch": 33.88461538461539, "grad_norm": 1.4839627742767334, "learning_rate": 1.913959323973119e-05, "loss": 0.0405, "step": 8810 }, { "epoch": 33.92307692307692, "grad_norm": 1.691476583480835, "learning_rate": 1.9115748721627827e-05, "loss": 0.041, "step": 8820 }, { "epoch": 33.96153846153846, "grad_norm": 1.4554405212402344, "learning_rate": 1.9091892951224614e-05, "loss": 0.0394, "step": 8830 }, { "epoch": 34.0, "grad_norm": 1.4189049005508423, "learning_rate": 1.906802599374233e-05, "loss": 0.0357, "step": 8840 }, { "epoch": 34.03846153846154, "grad_norm": 1.9339512586593628, "learning_rate": 1.904414791443231e-05, "loss": 0.0432, "step": 8850 }, { "epoch": 34.07692307692308, "grad_norm": 1.4830939769744873, "learning_rate": 1.9020258778576324e-05, "loss": 0.042, "step": 8860 }, { "epoch": 34.11538461538461, "grad_norm": 1.5076172351837158, "learning_rate": 1.8996358651486347e-05, "loss": 0.0365, "step": 8870 }, { "epoch": 34.15384615384615, "grad_norm": 1.5338919162750244, "learning_rate": 1.8972447598504417e-05, "loss": 0.0406, "step": 8880 }, { "epoch": 34.19230769230769, "grad_norm": 1.9823853969573975, "learning_rate": 1.8948525685002438e-05, "loss": 0.0349, "step": 8890 }, { "epoch": 34.23076923076923, "grad_norm": 1.429854393005371, "learning_rate": 1.892459297638201e-05, "loss": 0.0427, "step": 8900 }, { "epoch": 34.26923076923077, "grad_norm": 1.8108021020889282, "learning_rate": 1.890064953807425e-05, "loss": 0.0341, "step": 8910 }, { "epoch": 34.30769230769231, "grad_norm": 1.07712721824646, "learning_rate": 1.8876695435539596e-05, "loss": 0.0371, "step": 8920 }, { "epoch": 34.34615384615385, "grad_norm": 1.3517080545425415, "learning_rate": 1.8852730734267653e-05, "loss": 0.0348, "step": 8930 }, { "epoch": 34.38461538461539, "grad_norm": 1.369992733001709, "learning_rate": 1.8828755499776997e-05, "loss": 0.0379, "step": 8940 }, { "epoch": 34.42307692307692, "grad_norm": 1.4494092464447021, "learning_rate": 1.8804769797615007e-05, "loss": 0.0386, "step": 8950 }, { "epoch": 34.46153846153846, "grad_norm": 1.806056022644043, "learning_rate": 1.8780773693357675e-05, "loss": 0.0416, "step": 8960 }, { "epoch": 34.5, "grad_norm": 1.4004135131835938, "learning_rate": 1.8756767252609433e-05, "loss": 0.034, "step": 8970 }, { "epoch": 34.53846153846154, "grad_norm": 1.3696749210357666, "learning_rate": 1.8732750541002974e-05, "loss": 0.0393, "step": 8980 }, { "epoch": 34.57692307692308, "grad_norm": 1.2756446599960327, "learning_rate": 1.870872362419907e-05, "loss": 0.0398, "step": 8990 }, { "epoch": 34.61538461538461, "grad_norm": 1.3034799098968506, "learning_rate": 1.8684686567886398e-05, "loss": 0.0435, "step": 9000 }, { "epoch": 34.65384615384615, "grad_norm": 1.5392398834228516, "learning_rate": 1.8660639437781344e-05, "loss": 0.0349, "step": 9010 }, { "epoch": 34.69230769230769, "grad_norm": 1.5113469362258911, "learning_rate": 1.8636582299627854e-05, "loss": 0.0359, "step": 9020 }, { "epoch": 34.73076923076923, "grad_norm": 1.1920346021652222, "learning_rate": 1.8612515219197215e-05, "loss": 0.0361, "step": 9030 }, { "epoch": 34.76923076923077, "grad_norm": 1.304843783378601, "learning_rate": 1.858843826228791e-05, "loss": 0.0357, "step": 9040 }, { "epoch": 34.80769230769231, "grad_norm": 1.7710132598876953, "learning_rate": 1.8564351494725423e-05, "loss": 0.0373, "step": 9050 }, { "epoch": 34.84615384615385, "grad_norm": 1.4856317043304443, "learning_rate": 1.8540254982362053e-05, "loss": 0.0371, "step": 9060 }, { "epoch": 34.88461538461539, "grad_norm": 1.8330937623977661, "learning_rate": 1.8516148791076743e-05, "loss": 0.0389, "step": 9070 }, { "epoch": 34.92307692307692, "grad_norm": 1.504800796508789, "learning_rate": 1.8492032986774904e-05, "loss": 0.0359, "step": 9080 }, { "epoch": 34.96153846153846, "grad_norm": 1.2242687940597534, "learning_rate": 1.8467907635388225e-05, "loss": 0.0371, "step": 9090 }, { "epoch": 35.0, "grad_norm": 1.3216480016708374, "learning_rate": 1.844377280287449e-05, "loss": 0.0386, "step": 9100 }, { "epoch": 35.03846153846154, "grad_norm": 1.0509378910064697, "learning_rate": 1.8419628555217407e-05, "loss": 0.0405, "step": 9110 }, { "epoch": 35.07692307692308, "grad_norm": 1.1122572422027588, "learning_rate": 1.839547495842644e-05, "loss": 0.039, "step": 9120 }, { "epoch": 35.11538461538461, "grad_norm": 1.334999680519104, "learning_rate": 1.8371312078536587e-05, "loss": 0.0316, "step": 9130 }, { "epoch": 35.15384615384615, "grad_norm": 1.2729421854019165, "learning_rate": 1.834713998160825e-05, "loss": 0.039, "step": 9140 }, { "epoch": 35.19230769230769, "grad_norm": 1.4823824167251587, "learning_rate": 1.832295873372701e-05, "loss": 0.0428, "step": 9150 }, { "epoch": 35.23076923076923, "grad_norm": 1.5749338865280151, "learning_rate": 1.8298768401003477e-05, "loss": 0.0386, "step": 9160 }, { "epoch": 35.26923076923077, "grad_norm": 1.4943759441375732, "learning_rate": 1.8274569049573103e-05, "loss": 0.0409, "step": 9170 }, { "epoch": 35.30769230769231, "grad_norm": 1.4229377508163452, "learning_rate": 1.8250360745595983e-05, "loss": 0.0398, "step": 9180 }, { "epoch": 35.34615384615385, "grad_norm": 1.192239761352539, "learning_rate": 1.8226143555256703e-05, "loss": 0.0348, "step": 9190 }, { "epoch": 35.38461538461539, "grad_norm": 1.7158582210540771, "learning_rate": 1.820191754476413e-05, "loss": 0.04, "step": 9200 }, { "epoch": 35.42307692307692, "grad_norm": 1.2287282943725586, "learning_rate": 1.8177682780351256e-05, "loss": 0.0381, "step": 9210 }, { "epoch": 35.46153846153846, "grad_norm": 1.578681468963623, "learning_rate": 1.8153439328275e-05, "loss": 0.0434, "step": 9220 }, { "epoch": 35.5, "grad_norm": 1.7834547758102417, "learning_rate": 1.8129187254816035e-05, "loss": 0.0393, "step": 9230 }, { "epoch": 35.53846153846154, "grad_norm": 1.6021229028701782, "learning_rate": 1.81049266262786e-05, "loss": 0.0372, "step": 9240 }, { "epoch": 35.57692307692308, "grad_norm": 1.3831838369369507, "learning_rate": 1.808065750899033e-05, "loss": 0.0409, "step": 9250 }, { "epoch": 35.61538461538461, "grad_norm": 1.727046012878418, "learning_rate": 1.8056379969302066e-05, "loss": 0.0377, "step": 9260 }, { "epoch": 35.65384615384615, "grad_norm": 1.710699439048767, "learning_rate": 1.8032094073587675e-05, "loss": 0.0376, "step": 9270 }, { "epoch": 35.69230769230769, "grad_norm": 1.6480785608291626, "learning_rate": 1.800779988824387e-05, "loss": 0.0343, "step": 9280 }, { "epoch": 35.73076923076923, "grad_norm": 1.4451653957366943, "learning_rate": 1.7983497479690018e-05, "loss": 0.0347, "step": 9290 }, { "epoch": 35.76923076923077, "grad_norm": 1.520155668258667, "learning_rate": 1.795918691436798e-05, "loss": 0.039, "step": 9300 }, { "epoch": 35.80769230769231, "grad_norm": 1.372151494026184, "learning_rate": 1.7934868258741917e-05, "loss": 0.0366, "step": 9310 }, { "epoch": 35.84615384615385, "grad_norm": 1.39862859249115, "learning_rate": 1.79105415792981e-05, "loss": 0.0377, "step": 9320 }, { "epoch": 35.88461538461539, "grad_norm": 1.4081544876098633, "learning_rate": 1.788620694254475e-05, "loss": 0.0412, "step": 9330 }, { "epoch": 35.92307692307692, "grad_norm": 1.39034903049469, "learning_rate": 1.7861864415011827e-05, "loss": 0.0411, "step": 9340 }, { "epoch": 35.96153846153846, "grad_norm": 1.650275468826294, "learning_rate": 1.783751406325087e-05, "loss": 0.0336, "step": 9350 }, { "epoch": 36.0, "grad_norm": 1.45979905128479, "learning_rate": 1.7813155953834814e-05, "loss": 0.0402, "step": 9360 }, { "epoch": 36.03846153846154, "grad_norm": 1.596880555152893, "learning_rate": 1.7788790153357803e-05, "loss": 0.0439, "step": 9370 }, { "epoch": 36.07692307692308, "grad_norm": 1.4605010747909546, "learning_rate": 1.7764416728435e-05, "loss": 0.0429, "step": 9380 }, { "epoch": 36.11538461538461, "grad_norm": 1.4628403186798096, "learning_rate": 1.774003574570242e-05, "loss": 0.0416, "step": 9390 }, { "epoch": 36.15384615384615, "grad_norm": 1.8021140098571777, "learning_rate": 1.7715647271816744e-05, "loss": 0.0346, "step": 9400 }, { "epoch": 36.19230769230769, "grad_norm": 1.4157359600067139, "learning_rate": 1.769125137345512e-05, "loss": 0.0357, "step": 9410 }, { "epoch": 36.23076923076923, "grad_norm": 1.421094536781311, "learning_rate": 1.7666848117315008e-05, "loss": 0.0392, "step": 9420 }, { "epoch": 36.26923076923077, "grad_norm": 1.7203149795532227, "learning_rate": 1.7642437570113974e-05, "loss": 0.0358, "step": 9430 }, { "epoch": 36.30769230769231, "grad_norm": 1.436547040939331, "learning_rate": 1.7618019798589525e-05, "loss": 0.043, "step": 9440 }, { "epoch": 36.34615384615385, "grad_norm": 1.5459340810775757, "learning_rate": 1.7593594869498915e-05, "loss": 0.035, "step": 9450 }, { "epoch": 36.38461538461539, "grad_norm": 1.5073521137237549, "learning_rate": 1.7569162849618966e-05, "loss": 0.0393, "step": 9460 }, { "epoch": 36.42307692307692, "grad_norm": 1.3095002174377441, "learning_rate": 1.75447238057459e-05, "loss": 0.0383, "step": 9470 }, { "epoch": 36.46153846153846, "grad_norm": 1.308945655822754, "learning_rate": 1.752027780469511e-05, "loss": 0.0364, "step": 9480 }, { "epoch": 36.5, "grad_norm": 1.4354068040847778, "learning_rate": 1.7495824913301043e-05, "loss": 0.0417, "step": 9490 }, { "epoch": 36.53846153846154, "grad_norm": 1.756820559501648, "learning_rate": 1.7471365198416957e-05, "loss": 0.0393, "step": 9500 }, { "epoch": 36.57692307692308, "grad_norm": 1.3326730728149414, "learning_rate": 1.7446898726914797e-05, "loss": 0.0369, "step": 9510 }, { "epoch": 36.61538461538461, "grad_norm": 1.3120198249816895, "learning_rate": 1.742242556568495e-05, "loss": 0.0426, "step": 9520 }, { "epoch": 36.65384615384615, "grad_norm": 1.121603012084961, "learning_rate": 1.73979457816361e-05, "loss": 0.0387, "step": 9530 }, { "epoch": 36.69230769230769, "grad_norm": 1.3100221157073975, "learning_rate": 1.7373459441695058e-05, "loss": 0.0423, "step": 9540 }, { "epoch": 36.73076923076923, "grad_norm": 1.3542190790176392, "learning_rate": 1.7348966612806524e-05, "loss": 0.0421, "step": 9550 }, { "epoch": 36.76923076923077, "grad_norm": 1.59927499294281, "learning_rate": 1.7324467361932973e-05, "loss": 0.0397, "step": 9560 }, { "epoch": 36.80769230769231, "grad_norm": 1.6307555437088013, "learning_rate": 1.729996175605441e-05, "loss": 0.0411, "step": 9570 }, { "epoch": 36.84615384615385, "grad_norm": 1.3856720924377441, "learning_rate": 1.7275449862168235e-05, "loss": 0.0401, "step": 9580 }, { "epoch": 36.88461538461539, "grad_norm": 1.2768700122833252, "learning_rate": 1.725093174728902e-05, "loss": 0.0324, "step": 9590 }, { "epoch": 36.92307692307692, "grad_norm": 1.9689397811889648, "learning_rate": 1.7226407478448357e-05, "loss": 0.038, "step": 9600 }, { "epoch": 36.96153846153846, "grad_norm": 1.4385662078857422, "learning_rate": 1.7201877122694666e-05, "loss": 0.0423, "step": 9610 }, { "epoch": 37.0, "grad_norm": 1.3675845861434937, "learning_rate": 1.7177340747093e-05, "loss": 0.0397, "step": 9620 }, { "epoch": 37.03846153846154, "grad_norm": 1.733004093170166, "learning_rate": 1.7152798418724873e-05, "loss": 0.0417, "step": 9630 }, { "epoch": 37.07692307692308, "grad_norm": 1.5616117715835571, "learning_rate": 1.712825020468807e-05, "loss": 0.0359, "step": 9640 }, { "epoch": 37.11538461538461, "grad_norm": 1.513261079788208, "learning_rate": 1.710369617209648e-05, "loss": 0.0414, "step": 9650 }, { "epoch": 37.15384615384615, "grad_norm": 1.6468145847320557, "learning_rate": 1.7079136388079884e-05, "loss": 0.038, "step": 9660 }, { "epoch": 37.19230769230769, "grad_norm": 1.6188808679580688, "learning_rate": 1.7054570919783796e-05, "loss": 0.0382, "step": 9670 }, { "epoch": 37.23076923076923, "grad_norm": 1.556585669517517, "learning_rate": 1.7029999834369264e-05, "loss": 0.0387, "step": 9680 }, { "epoch": 37.26923076923077, "grad_norm": 1.5919781923294067, "learning_rate": 1.7005423199012696e-05, "loss": 0.034, "step": 9690 }, { "epoch": 37.30769230769231, "grad_norm": 1.565858244895935, "learning_rate": 1.6980841080905687e-05, "loss": 0.0344, "step": 9700 }, { "epoch": 37.34615384615385, "grad_norm": 1.190491795539856, "learning_rate": 1.6956253547254798e-05, "loss": 0.0405, "step": 9710 }, { "epoch": 37.38461538461539, "grad_norm": 1.2053945064544678, "learning_rate": 1.693166066528141e-05, "loss": 0.0366, "step": 9720 }, { "epoch": 37.42307692307692, "grad_norm": 1.7797147035598755, "learning_rate": 1.690706250222152e-05, "loss": 0.0404, "step": 9730 }, { "epoch": 37.46153846153846, "grad_norm": 1.3981484174728394, "learning_rate": 1.6882459125325573e-05, "loss": 0.0372, "step": 9740 }, { "epoch": 37.5, "grad_norm": 1.3930727243423462, "learning_rate": 1.685785060185826e-05, "loss": 0.036, "step": 9750 }, { "epoch": 37.53846153846154, "grad_norm": 1.2622334957122803, "learning_rate": 1.683323699909834e-05, "loss": 0.036, "step": 9760 }, { "epoch": 37.57692307692308, "grad_norm": 1.3782292604446411, "learning_rate": 1.6808618384338472e-05, "loss": 0.0375, "step": 9770 }, { "epoch": 37.61538461538461, "grad_norm": 1.3075623512268066, "learning_rate": 1.6783994824885e-05, "loss": 0.0357, "step": 9780 }, { "epoch": 37.65384615384615, "grad_norm": 1.449067234992981, "learning_rate": 1.6759366388057795e-05, "loss": 0.04, "step": 9790 }, { "epoch": 37.69230769230769, "grad_norm": 1.53331458568573, "learning_rate": 1.6734733141190073e-05, "loss": 0.0322, "step": 9800 }, { "epoch": 37.73076923076923, "grad_norm": 1.3725186586380005, "learning_rate": 1.6710095151628182e-05, "loss": 0.0394, "step": 9810 }, { "epoch": 37.76923076923077, "grad_norm": 1.6133525371551514, "learning_rate": 1.668545248673144e-05, "loss": 0.0329, "step": 9820 }, { "epoch": 37.80769230769231, "grad_norm": 1.2607682943344116, "learning_rate": 1.6660805213871962e-05, "loss": 0.0432, "step": 9830 }, { "epoch": 37.84615384615385, "grad_norm": 1.4924263954162598, "learning_rate": 1.663615340043445e-05, "loss": 0.0392, "step": 9840 }, { "epoch": 37.88461538461539, "grad_norm": 1.3537297248840332, "learning_rate": 1.6611497113816014e-05, "loss": 0.0426, "step": 9850 }, { "epoch": 37.92307692307692, "grad_norm": 1.1831531524658203, "learning_rate": 1.6586836421426007e-05, "loss": 0.0383, "step": 9860 }, { "epoch": 37.96153846153846, "grad_norm": 1.317609429359436, "learning_rate": 1.6562171390685815e-05, "loss": 0.033, "step": 9870 }, { "epoch": 38.0, "grad_norm": 1.4860436916351318, "learning_rate": 1.653750208902869e-05, "loss": 0.0377, "step": 9880 }, { "epoch": 38.03846153846154, "grad_norm": 1.7705858945846558, "learning_rate": 1.6512828583899562e-05, "loss": 0.0428, "step": 9890 }, { "epoch": 38.07692307692308, "grad_norm": 1.4256800413131714, "learning_rate": 1.648815094275486e-05, "loss": 0.0374, "step": 9900 }, { "epoch": 38.11538461538461, "grad_norm": 1.430487871170044, "learning_rate": 1.6463469233062302e-05, "loss": 0.0382, "step": 9910 }, { "epoch": 38.15384615384615, "grad_norm": 1.273598074913025, "learning_rate": 1.6438783522300742e-05, "loss": 0.0403, "step": 9920 }, { "epoch": 38.19230769230769, "grad_norm": 1.6827222108840942, "learning_rate": 1.641409387795997e-05, "loss": 0.0318, "step": 9930 }, { "epoch": 38.23076923076923, "grad_norm": 1.5564827919006348, "learning_rate": 1.6389400367540534e-05, "loss": 0.0443, "step": 9940 }, { "epoch": 38.26923076923077, "grad_norm": 1.5248053073883057, "learning_rate": 1.6364703058553552e-05, "loss": 0.0408, "step": 9950 }, { "epoch": 38.30769230769231, "grad_norm": 1.2614673376083374, "learning_rate": 1.6340002018520512e-05, "loss": 0.0372, "step": 9960 }, { "epoch": 38.34615384615385, "grad_norm": 1.43790602684021, "learning_rate": 1.6315297314973126e-05, "loss": 0.0354, "step": 9970 }, { "epoch": 38.38461538461539, "grad_norm": 1.3772172927856445, "learning_rate": 1.6290589015453102e-05, "loss": 0.0362, "step": 9980 }, { "epoch": 38.42307692307692, "grad_norm": 1.5918662548065186, "learning_rate": 1.6265877187511993e-05, "loss": 0.0378, "step": 9990 }, { "epoch": 38.46153846153846, "grad_norm": 1.3942394256591797, "learning_rate": 1.6241161898710993e-05, "loss": 0.0408, "step": 10000 }, { "epoch": 38.5, "grad_norm": 0.9701500535011292, "learning_rate": 1.6216443216620752e-05, "loss": 0.0364, "step": 10010 }, { "epoch": 38.53846153846154, "grad_norm": 1.424035668373108, "learning_rate": 1.6191721208821208e-05, "loss": 0.0372, "step": 10020 }, { "epoch": 38.57692307692308, "grad_norm": 1.392910361289978, "learning_rate": 1.6166995942901382e-05, "loss": 0.0374, "step": 10030 }, { "epoch": 38.61538461538461, "grad_norm": 1.6318131685256958, "learning_rate": 1.614226748645921e-05, "loss": 0.0436, "step": 10040 }, { "epoch": 38.65384615384615, "grad_norm": 1.4503819942474365, "learning_rate": 1.6117535907101354e-05, "loss": 0.0332, "step": 10050 }, { "epoch": 38.69230769230769, "grad_norm": 1.5287636518478394, "learning_rate": 1.6092801272442996e-05, "loss": 0.0395, "step": 10060 }, { "epoch": 38.73076923076923, "grad_norm": 1.3624824285507202, "learning_rate": 1.606806365010769e-05, "loss": 0.037, "step": 10070 }, { "epoch": 38.76923076923077, "grad_norm": 1.548240065574646, "learning_rate": 1.6043323107727143e-05, "loss": 0.0361, "step": 10080 }, { "epoch": 38.80769230769231, "grad_norm": 1.3337862491607666, "learning_rate": 1.6018579712941064e-05, "loss": 0.0344, "step": 10090 }, { "epoch": 38.84615384615385, "grad_norm": 1.3645668029785156, "learning_rate": 1.599383353339694e-05, "loss": 0.0361, "step": 10100 }, { "epoch": 38.88461538461539, "grad_norm": 1.2591993808746338, "learning_rate": 1.596908463674989e-05, "loss": 0.0381, "step": 10110 }, { "epoch": 38.92307692307692, "grad_norm": 1.321970820426941, "learning_rate": 1.5944333090662442e-05, "loss": 0.0363, "step": 10120 }, { "epoch": 38.96153846153846, "grad_norm": 1.4175691604614258, "learning_rate": 1.5919578962804386e-05, "loss": 0.0349, "step": 10130 }, { "epoch": 39.0, "grad_norm": 1.2944371700286865, "learning_rate": 1.5894822320852563e-05, "loss": 0.0344, "step": 10140 }, { "epoch": 39.03846153846154, "grad_norm": 1.2625024318695068, "learning_rate": 1.5870063232490677e-05, "loss": 0.0344, "step": 10150 }, { "epoch": 39.07692307692308, "grad_norm": 1.521975040435791, "learning_rate": 1.5845301765409144e-05, "loss": 0.0347, "step": 10160 }, { "epoch": 39.11538461538461, "grad_norm": 1.0716321468353271, "learning_rate": 1.5820537987304856e-05, "loss": 0.0378, "step": 10170 }, { "epoch": 39.15384615384615, "grad_norm": 1.3439924716949463, "learning_rate": 1.5795771965881044e-05, "loss": 0.0372, "step": 10180 }, { "epoch": 39.19230769230769, "grad_norm": 1.193076252937317, "learning_rate": 1.577100376884707e-05, "loss": 0.0375, "step": 10190 }, { "epoch": 39.23076923076923, "grad_norm": 1.5394845008850098, "learning_rate": 1.5746233463918226e-05, "loss": 0.0352, "step": 10200 }, { "epoch": 39.26923076923077, "grad_norm": 1.3572865724563599, "learning_rate": 1.572146111881559e-05, "loss": 0.0388, "step": 10210 }, { "epoch": 39.30769230769231, "grad_norm": 1.4613553285598755, "learning_rate": 1.56966868012658e-05, "loss": 0.0356, "step": 10220 }, { "epoch": 39.34615384615385, "grad_norm": 1.1900838613510132, "learning_rate": 1.56719105790009e-05, "loss": 0.038, "step": 10230 }, { "epoch": 39.38461538461539, "grad_norm": 1.154110312461853, "learning_rate": 1.5647132519758135e-05, "loss": 0.034, "step": 10240 }, { "epoch": 39.42307692307692, "grad_norm": 1.509911060333252, "learning_rate": 1.562235269127977e-05, "loss": 0.0357, "step": 10250 }, { "epoch": 39.46153846153846, "grad_norm": 1.6270045042037964, "learning_rate": 1.5597571161312914e-05, "loss": 0.0373, "step": 10260 }, { "epoch": 39.5, "grad_norm": 1.1288354396820068, "learning_rate": 1.557278799760932e-05, "loss": 0.0348, "step": 10270 }, { "epoch": 39.53846153846154, "grad_norm": 1.4229916334152222, "learning_rate": 1.5548003267925214e-05, "loss": 0.0449, "step": 10280 }, { "epoch": 39.57692307692308, "grad_norm": 1.5001567602157593, "learning_rate": 1.5523217040021094e-05, "loss": 0.0411, "step": 10290 }, { "epoch": 39.61538461538461, "grad_norm": 1.4716676473617554, "learning_rate": 1.549842938166157e-05, "loss": 0.0346, "step": 10300 }, { "epoch": 39.65384615384615, "grad_norm": 1.4848761558532715, "learning_rate": 1.5473640360615146e-05, "loss": 0.0362, "step": 10310 }, { "epoch": 39.69230769230769, "grad_norm": 1.4209644794464111, "learning_rate": 1.5448850044654063e-05, "loss": 0.0393, "step": 10320 }, { "epoch": 39.73076923076923, "grad_norm": 1.292456030845642, "learning_rate": 1.5424058501554102e-05, "loss": 0.0346, "step": 10330 }, { "epoch": 39.76923076923077, "grad_norm": 1.3563344478607178, "learning_rate": 1.5399265799094383e-05, "loss": 0.0329, "step": 10340 }, { "epoch": 39.80769230769231, "grad_norm": 1.2830549478530884, "learning_rate": 1.537447200505722e-05, "loss": 0.0358, "step": 10350 }, { "epoch": 39.84615384615385, "grad_norm": 1.562695860862732, "learning_rate": 1.5349677187227892e-05, "loss": 0.03, "step": 10360 }, { "epoch": 39.88461538461539, "grad_norm": 1.3443552255630493, "learning_rate": 1.532488141339449e-05, "loss": 0.0394, "step": 10370 }, { "epoch": 39.92307692307692, "grad_norm": 1.4095454216003418, "learning_rate": 1.5300084751347703e-05, "loss": 0.0367, "step": 10380 }, { "epoch": 39.96153846153846, "grad_norm": 1.2375614643096924, "learning_rate": 1.527528726888067e-05, "loss": 0.0324, "step": 10390 }, { "epoch": 40.0, "grad_norm": 1.1956071853637695, "learning_rate": 1.5250489033788757e-05, "loss": 0.0371, "step": 10400 }, { "epoch": 40.03846153846154, "grad_norm": 1.4053899049758911, "learning_rate": 1.5225690113869383e-05, "loss": 0.0355, "step": 10410 }, { "epoch": 40.07692307692308, "grad_norm": 1.349990725517273, "learning_rate": 1.5200890576921863e-05, "loss": 0.0358, "step": 10420 }, { "epoch": 40.11538461538461, "grad_norm": 1.2344571352005005, "learning_rate": 1.5176090490747174e-05, "loss": 0.0344, "step": 10430 }, { "epoch": 40.15384615384615, "grad_norm": 1.5571675300598145, "learning_rate": 1.5151289923147806e-05, "loss": 0.0384, "step": 10440 }, { "epoch": 40.19230769230769, "grad_norm": 1.5244865417480469, "learning_rate": 1.5126488941927568e-05, "loss": 0.0371, "step": 10450 }, { "epoch": 40.23076923076923, "grad_norm": 1.5313900709152222, "learning_rate": 1.5101687614891385e-05, "loss": 0.0351, "step": 10460 }, { "epoch": 40.26923076923077, "grad_norm": 1.3361246585845947, "learning_rate": 1.5076886009845157e-05, "loss": 0.0354, "step": 10470 }, { "epoch": 40.30769230769231, "grad_norm": 1.0437862873077393, "learning_rate": 1.5052084194595507e-05, "loss": 0.0359, "step": 10480 }, { "epoch": 40.34615384615385, "grad_norm": 1.0374122858047485, "learning_rate": 1.5027282236949662e-05, "loss": 0.0337, "step": 10490 }, { "epoch": 40.38461538461539, "grad_norm": 1.1165339946746826, "learning_rate": 1.5002480204715218e-05, "loss": 0.0326, "step": 10500 }, { "epoch": 40.42307692307692, "grad_norm": 1.3532160520553589, "learning_rate": 1.4977678165699992e-05, "loss": 0.0389, "step": 10510 }, { "epoch": 40.46153846153846, "grad_norm": 1.457037329673767, "learning_rate": 1.4952876187711806e-05, "loss": 0.0414, "step": 10520 }, { "epoch": 40.5, "grad_norm": 0.9408591985702515, "learning_rate": 1.4928074338558326e-05, "loss": 0.0395, "step": 10530 }, { "epoch": 40.53846153846154, "grad_norm": 1.212058424949646, "learning_rate": 1.4903272686046857e-05, "loss": 0.0341, "step": 10540 }, { "epoch": 40.57692307692308, "grad_norm": 1.4682952165603638, "learning_rate": 1.4878471297984174e-05, "loss": 0.0335, "step": 10550 }, { "epoch": 40.61538461538461, "grad_norm": 1.4514968395233154, "learning_rate": 1.4853670242176318e-05, "loss": 0.0367, "step": 10560 }, { "epoch": 40.65384615384615, "grad_norm": 1.0562829971313477, "learning_rate": 1.4828869586428433e-05, "loss": 0.0357, "step": 10570 }, { "epoch": 40.69230769230769, "grad_norm": 1.3134219646453857, "learning_rate": 1.4804069398544563e-05, "loss": 0.0333, "step": 10580 }, { "epoch": 40.73076923076923, "grad_norm": 1.3238235712051392, "learning_rate": 1.477926974632748e-05, "loss": 0.0375, "step": 10590 }, { "epoch": 40.76923076923077, "grad_norm": 1.2255327701568604, "learning_rate": 1.4754470697578478e-05, "loss": 0.0354, "step": 10600 }, { "epoch": 40.80769230769231, "grad_norm": 1.4414708614349365, "learning_rate": 1.4729672320097214e-05, "loss": 0.0358, "step": 10610 }, { "epoch": 40.84615384615385, "grad_norm": 1.1872987747192383, "learning_rate": 1.4704874681681504e-05, "loss": 0.0342, "step": 10620 }, { "epoch": 40.88461538461539, "grad_norm": 1.2035821676254272, "learning_rate": 1.4680077850127146e-05, "loss": 0.0378, "step": 10630 }, { "epoch": 40.92307692307692, "grad_norm": 1.3482985496520996, "learning_rate": 1.465528189322773e-05, "loss": 0.0341, "step": 10640 }, { "epoch": 40.96153846153846, "grad_norm": 1.2167506217956543, "learning_rate": 1.4630486878774455e-05, "loss": 0.0318, "step": 10650 }, { "epoch": 41.0, "grad_norm": 1.1671282052993774, "learning_rate": 1.4605692874555942e-05, "loss": 0.0357, "step": 10660 }, { "epoch": 41.03846153846154, "grad_norm": 1.4723036289215088, "learning_rate": 1.4580899948358054e-05, "loss": 0.0341, "step": 10670 }, { "epoch": 41.07692307692308, "grad_norm": 1.1682496070861816, "learning_rate": 1.4556108167963702e-05, "loss": 0.0373, "step": 10680 }, { "epoch": 41.11538461538461, "grad_norm": 1.1913172006607056, "learning_rate": 1.4531317601152675e-05, "loss": 0.0328, "step": 10690 }, { "epoch": 41.15384615384615, "grad_norm": 1.110813856124878, "learning_rate": 1.4506528315701425e-05, "loss": 0.0348, "step": 10700 }, { "epoch": 41.19230769230769, "grad_norm": 1.4629855155944824, "learning_rate": 1.4481740379382916e-05, "loss": 0.0398, "step": 10710 }, { "epoch": 41.23076923076923, "grad_norm": 1.0551105737686157, "learning_rate": 1.445695385996642e-05, "loss": 0.0308, "step": 10720 }, { "epoch": 41.26923076923077, "grad_norm": 1.0418416261672974, "learning_rate": 1.4432168825217344e-05, "loss": 0.0325, "step": 10730 }, { "epoch": 41.30769230769231, "grad_norm": 1.2222310304641724, "learning_rate": 1.4407385342897005e-05, "loss": 0.0379, "step": 10740 }, { "epoch": 41.34615384615385, "grad_norm": 1.45067298412323, "learning_rate": 1.4382603480762514e-05, "loss": 0.0334, "step": 10750 }, { "epoch": 41.38461538461539, "grad_norm": 1.6197105646133423, "learning_rate": 1.4357823306566529e-05, "loss": 0.0385, "step": 10760 }, { "epoch": 41.42307692307692, "grad_norm": 1.3169426918029785, "learning_rate": 1.4333044888057104e-05, "loss": 0.0362, "step": 10770 }, { "epoch": 41.46153846153846, "grad_norm": 1.2981311082839966, "learning_rate": 1.4308268292977496e-05, "loss": 0.0353, "step": 10780 }, { "epoch": 41.5, "grad_norm": 1.1880425214767456, "learning_rate": 1.4283493589065948e-05, "loss": 0.0365, "step": 10790 }, { "epoch": 41.53846153846154, "grad_norm": 1.3107539415359497, "learning_rate": 1.4258720844055573e-05, "loss": 0.0352, "step": 10800 }, { "epoch": 41.57692307692308, "grad_norm": 0.9910938739776611, "learning_rate": 1.4233950125674105e-05, "loss": 0.0354, "step": 10810 }, { "epoch": 41.61538461538461, "grad_norm": 0.9883972406387329, "learning_rate": 1.420918150164374e-05, "loss": 0.0393, "step": 10820 }, { "epoch": 41.65384615384615, "grad_norm": 1.3997018337249756, "learning_rate": 1.4184415039680958e-05, "loss": 0.0391, "step": 10830 }, { "epoch": 41.69230769230769, "grad_norm": 1.234336018562317, "learning_rate": 1.415965080749631e-05, "loss": 0.0374, "step": 10840 }, { "epoch": 41.73076923076923, "grad_norm": 1.1024219989776611, "learning_rate": 1.4134888872794265e-05, "loss": 0.0343, "step": 10850 }, { "epoch": 41.76923076923077, "grad_norm": 1.2052985429763794, "learning_rate": 1.411012930327301e-05, "loss": 0.034, "step": 10860 }, { "epoch": 41.80769230769231, "grad_norm": 1.3438876867294312, "learning_rate": 1.4085372166624263e-05, "loss": 0.0333, "step": 10870 }, { "epoch": 41.84615384615385, "grad_norm": 1.418308138847351, "learning_rate": 1.406061753053308e-05, "loss": 0.0343, "step": 10880 }, { "epoch": 41.88461538461539, "grad_norm": 1.53734290599823, "learning_rate": 1.40358654626777e-05, "loss": 0.0338, "step": 10890 }, { "epoch": 41.92307692307692, "grad_norm": 1.1732251644134521, "learning_rate": 1.4011116030729333e-05, "loss": 0.035, "step": 10900 }, { "epoch": 41.96153846153846, "grad_norm": 1.2510560750961304, "learning_rate": 1.3986369302351974e-05, "loss": 0.033, "step": 10910 }, { "epoch": 42.0, "grad_norm": 1.1198493242263794, "learning_rate": 1.3961625345202245e-05, "loss": 0.0305, "step": 10920 }, { "epoch": 42.03846153846154, "grad_norm": 1.0447126626968384, "learning_rate": 1.3936884226929163e-05, "loss": 0.0384, "step": 10930 }, { "epoch": 42.07692307692308, "grad_norm": 1.4311703443527222, "learning_rate": 1.391214601517401e-05, "loss": 0.0334, "step": 10940 }, { "epoch": 42.11538461538461, "grad_norm": 1.3343842029571533, "learning_rate": 1.3887410777570116e-05, "loss": 0.0358, "step": 10950 }, { "epoch": 42.15384615384615, "grad_norm": 1.2059050798416138, "learning_rate": 1.3862678581742667e-05, "loss": 0.0349, "step": 10960 }, { "epoch": 42.19230769230769, "grad_norm": 1.4710291624069214, "learning_rate": 1.3837949495308558e-05, "loss": 0.0334, "step": 10970 }, { "epoch": 42.23076923076923, "grad_norm": 1.166894793510437, "learning_rate": 1.3813223585876145e-05, "loss": 0.0364, "step": 10980 }, { "epoch": 42.26923076923077, "grad_norm": 1.233723521232605, "learning_rate": 1.3788500921045135e-05, "loss": 0.0358, "step": 10990 }, { "epoch": 42.30769230769231, "grad_norm": 1.1590265035629272, "learning_rate": 1.3763781568406343e-05, "loss": 0.0429, "step": 11000 }, { "epoch": 42.34615384615385, "grad_norm": 1.349654197692871, "learning_rate": 1.3739065595541548e-05, "loss": 0.0321, "step": 11010 }, { "epoch": 42.38461538461539, "grad_norm": 1.0555387735366821, "learning_rate": 1.3714353070023257e-05, "loss": 0.0355, "step": 11020 }, { "epoch": 42.42307692307692, "grad_norm": 1.2509338855743408, "learning_rate": 1.368964405941458e-05, "loss": 0.0405, "step": 11030 }, { "epoch": 42.46153846153846, "grad_norm": 1.2438158988952637, "learning_rate": 1.366493863126901e-05, "loss": 0.0317, "step": 11040 }, { "epoch": 42.5, "grad_norm": 0.9446445107460022, "learning_rate": 1.3640236853130243e-05, "loss": 0.0364, "step": 11050 }, { "epoch": 42.53846153846154, "grad_norm": 1.0268769264221191, "learning_rate": 1.3615538792532002e-05, "loss": 0.0299, "step": 11060 }, { "epoch": 42.57692307692308, "grad_norm": 1.1675711870193481, "learning_rate": 1.3590844516997832e-05, "loss": 0.0399, "step": 11070 }, { "epoch": 42.61538461538461, "grad_norm": 1.2040306329727173, "learning_rate": 1.356615409404094e-05, "loss": 0.0365, "step": 11080 }, { "epoch": 42.65384615384615, "grad_norm": 1.488957405090332, "learning_rate": 1.354146759116401e-05, "loss": 0.035, "step": 11090 }, { "epoch": 42.69230769230769, "grad_norm": 1.195846676826477, "learning_rate": 1.3516785075858988e-05, "loss": 0.0365, "step": 11100 }, { "epoch": 42.73076923076923, "grad_norm": 1.2420823574066162, "learning_rate": 1.3492106615606941e-05, "loss": 0.0425, "step": 11110 }, { "epoch": 42.76923076923077, "grad_norm": 1.2918013334274292, "learning_rate": 1.346743227787782e-05, "loss": 0.0356, "step": 11120 }, { "epoch": 42.80769230769231, "grad_norm": 1.0789659023284912, "learning_rate": 1.344276213013033e-05, "loss": 0.0333, "step": 11130 }, { "epoch": 42.84615384615385, "grad_norm": 1.1360996961593628, "learning_rate": 1.3418096239811712e-05, "loss": 0.0388, "step": 11140 }, { "epoch": 42.88461538461539, "grad_norm": 1.3978289365768433, "learning_rate": 1.3393434674357579e-05, "loss": 0.0426, "step": 11150 }, { "epoch": 42.92307692307692, "grad_norm": 1.1686747074127197, "learning_rate": 1.3368777501191692e-05, "loss": 0.0388, "step": 11160 }, { "epoch": 42.96153846153846, "grad_norm": 1.410258173942566, "learning_rate": 1.334412478772583e-05, "loss": 0.0369, "step": 11170 }, { "epoch": 43.0, "grad_norm": 1.2414584159851074, "learning_rate": 1.3319476601359565e-05, "loss": 0.0342, "step": 11180 }, { "epoch": 43.03846153846154, "grad_norm": 1.326174259185791, "learning_rate": 1.3294833009480105e-05, "loss": 0.0409, "step": 11190 }, { "epoch": 43.07692307692308, "grad_norm": 1.3536320924758911, "learning_rate": 1.3270194079462091e-05, "loss": 0.0399, "step": 11200 }, { "epoch": 43.11538461538461, "grad_norm": 1.1759998798370361, "learning_rate": 1.3245559878667405e-05, "loss": 0.0432, "step": 11210 }, { "epoch": 43.15384615384615, "grad_norm": 1.3741729259490967, "learning_rate": 1.3220930474445019e-05, "loss": 0.0346, "step": 11220 }, { "epoch": 43.19230769230769, "grad_norm": 1.5057193040847778, "learning_rate": 1.3196305934130778e-05, "loss": 0.0406, "step": 11230 }, { "epoch": 43.23076923076923, "grad_norm": 1.0771777629852295, "learning_rate": 1.3171686325047241e-05, "loss": 0.033, "step": 11240 }, { "epoch": 43.26923076923077, "grad_norm": 1.219438910484314, "learning_rate": 1.3147071714503484e-05, "loss": 0.0401, "step": 11250 }, { "epoch": 43.30769230769231, "grad_norm": 1.2763108015060425, "learning_rate": 1.3122462169794903e-05, "loss": 0.0341, "step": 11260 }, { "epoch": 43.34615384615385, "grad_norm": 1.3366858959197998, "learning_rate": 1.3097857758203053e-05, "loss": 0.0332, "step": 11270 }, { "epoch": 43.38461538461539, "grad_norm": 1.2299692630767822, "learning_rate": 1.3073258546995455e-05, "loss": 0.0342, "step": 11280 }, { "epoch": 43.42307692307692, "grad_norm": 1.306052803993225, "learning_rate": 1.3048664603425429e-05, "loss": 0.0374, "step": 11290 }, { "epoch": 43.46153846153846, "grad_norm": 1.087632656097412, "learning_rate": 1.3024075994731859e-05, "loss": 0.0365, "step": 11300 }, { "epoch": 43.5, "grad_norm": 0.9882426857948303, "learning_rate": 1.2999492788139068e-05, "loss": 0.0341, "step": 11310 }, { "epoch": 43.53846153846154, "grad_norm": 1.2657909393310547, "learning_rate": 1.2974915050856605e-05, "loss": 0.0322, "step": 11320 }, { "epoch": 43.57692307692308, "grad_norm": 1.2123744487762451, "learning_rate": 1.2950342850079061e-05, "loss": 0.0376, "step": 11330 }, { "epoch": 43.61538461538461, "grad_norm": 1.2592309713363647, "learning_rate": 1.292577625298591e-05, "loss": 0.0387, "step": 11340 }, { "epoch": 43.65384615384615, "grad_norm": 1.4374959468841553, "learning_rate": 1.2901215326741273e-05, "loss": 0.0322, "step": 11350 }, { "epoch": 43.69230769230769, "grad_norm": 1.2394741773605347, "learning_rate": 1.287666013849379e-05, "loss": 0.0324, "step": 11360 }, { "epoch": 43.73076923076923, "grad_norm": 1.3554021120071411, "learning_rate": 1.285211075537641e-05, "loss": 0.0399, "step": 11370 }, { "epoch": 43.76923076923077, "grad_norm": 1.1661251783370972, "learning_rate": 1.2827567244506203e-05, "loss": 0.0318, "step": 11380 }, { "epoch": 43.80769230769231, "grad_norm": 1.3110450506210327, "learning_rate": 1.2803029672984208e-05, "loss": 0.0323, "step": 11390 }, { "epoch": 43.84615384615385, "grad_norm": 1.064651370048523, "learning_rate": 1.2778498107895186e-05, "loss": 0.03, "step": 11400 }, { "epoch": 43.88461538461539, "grad_norm": 1.1394022703170776, "learning_rate": 1.275397261630751e-05, "loss": 0.0366, "step": 11410 }, { "epoch": 43.92307692307692, "grad_norm": 1.3582385778427124, "learning_rate": 1.2729453265272935e-05, "loss": 0.0375, "step": 11420 }, { "epoch": 43.96153846153846, "grad_norm": 1.1929188966751099, "learning_rate": 1.270494012182644e-05, "loss": 0.0334, "step": 11430 }, { "epoch": 44.0, "grad_norm": 1.1532018184661865, "learning_rate": 1.268043325298601e-05, "loss": 0.0329, "step": 11440 }, { "epoch": 44.03846153846154, "grad_norm": 1.142674446105957, "learning_rate": 1.2655932725752494e-05, "loss": 0.0365, "step": 11450 }, { "epoch": 44.07692307692308, "grad_norm": 1.3638461828231812, "learning_rate": 1.26314386071094e-05, "loss": 0.0436, "step": 11460 }, { "epoch": 44.11538461538461, "grad_norm": 0.972291886806488, "learning_rate": 1.2606950964022701e-05, "loss": 0.0376, "step": 11470 }, { "epoch": 44.15384615384615, "grad_norm": 1.145645022392273, "learning_rate": 1.2582469863440704e-05, "loss": 0.0343, "step": 11480 }, { "epoch": 44.19230769230769, "grad_norm": 1.0924091339111328, "learning_rate": 1.2557995372293778e-05, "loss": 0.0307, "step": 11490 }, { "epoch": 44.23076923076923, "grad_norm": 1.480159878730774, "learning_rate": 1.2533527557494257e-05, "loss": 0.0368, "step": 11500 }, { "epoch": 44.26923076923077, "grad_norm": 1.4256212711334229, "learning_rate": 1.250906648593621e-05, "loss": 0.0329, "step": 11510 }, { "epoch": 44.30769230769231, "grad_norm": 1.190695881843567, "learning_rate": 1.2484612224495275e-05, "loss": 0.0342, "step": 11520 }, { "epoch": 44.34615384615385, "grad_norm": 1.098670482635498, "learning_rate": 1.2460164840028477e-05, "loss": 0.0368, "step": 11530 }, { "epoch": 44.38461538461539, "grad_norm": 1.0962218046188354, "learning_rate": 1.2435724399374016e-05, "loss": 0.031, "step": 11540 }, { "epoch": 44.42307692307692, "grad_norm": 1.2497434616088867, "learning_rate": 1.2411290969351129e-05, "loss": 0.0367, "step": 11550 }, { "epoch": 44.46153846153846, "grad_norm": 1.417742133140564, "learning_rate": 1.2386864616759883e-05, "loss": 0.0356, "step": 11560 }, { "epoch": 44.5, "grad_norm": 0.9971799850463867, "learning_rate": 1.2362445408380996e-05, "loss": 0.0332, "step": 11570 }, { "epoch": 44.53846153846154, "grad_norm": 1.040932059288025, "learning_rate": 1.2338033410975644e-05, "loss": 0.0304, "step": 11580 }, { "epoch": 44.57692307692308, "grad_norm": 1.272524356842041, "learning_rate": 1.2313628691285301e-05, "loss": 0.0364, "step": 11590 }, { "epoch": 44.61538461538461, "grad_norm": 1.156447410583496, "learning_rate": 1.2289231316031536e-05, "loss": 0.0333, "step": 11600 }, { "epoch": 44.65384615384615, "grad_norm": 1.2335338592529297, "learning_rate": 1.2264841351915842e-05, "loss": 0.0339, "step": 11610 }, { "epoch": 44.69230769230769, "grad_norm": 1.2152777910232544, "learning_rate": 1.2240458865619455e-05, "loss": 0.0372, "step": 11620 }, { "epoch": 44.73076923076923, "grad_norm": 1.3504607677459717, "learning_rate": 1.2216083923803152e-05, "loss": 0.0318, "step": 11630 }, { "epoch": 44.76923076923077, "grad_norm": 1.2215933799743652, "learning_rate": 1.2191716593107097e-05, "loss": 0.0363, "step": 11640 }, { "epoch": 44.80769230769231, "grad_norm": 1.2269892692565918, "learning_rate": 1.2167356940150645e-05, "loss": 0.0331, "step": 11650 }, { "epoch": 44.84615384615385, "grad_norm": 1.2709892988204956, "learning_rate": 1.2143005031532152e-05, "loss": 0.0313, "step": 11660 }, { "epoch": 44.88461538461539, "grad_norm": 1.2237647771835327, "learning_rate": 1.2118660933828813e-05, "loss": 0.0358, "step": 11670 }, { "epoch": 44.92307692307692, "grad_norm": 1.1469601392745972, "learning_rate": 1.2094324713596453e-05, "loss": 0.0317, "step": 11680 }, { "epoch": 44.96153846153846, "grad_norm": 1.236554503440857, "learning_rate": 1.2069996437369374e-05, "loss": 0.0353, "step": 11690 }, { "epoch": 45.0, "grad_norm": 1.014963150024414, "learning_rate": 1.2045676171660154e-05, "loss": 0.035, "step": 11700 }, { "epoch": 45.03846153846154, "grad_norm": 1.3070363998413086, "learning_rate": 1.2021363982959472e-05, "loss": 0.0399, "step": 11710 }, { "epoch": 45.07692307692308, "grad_norm": 1.0885117053985596, "learning_rate": 1.1997059937735919e-05, "loss": 0.0313, "step": 11720 }, { "epoch": 45.11538461538461, "grad_norm": 1.3546298742294312, "learning_rate": 1.197276410243583e-05, "loss": 0.0324, "step": 11730 }, { "epoch": 45.15384615384615, "grad_norm": 1.2902907133102417, "learning_rate": 1.194847654348309e-05, "loss": 0.0323, "step": 11740 }, { "epoch": 45.19230769230769, "grad_norm": 1.403416633605957, "learning_rate": 1.1924197327278957e-05, "loss": 0.0336, "step": 11750 }, { "epoch": 45.23076923076923, "grad_norm": 1.0065228939056396, "learning_rate": 1.1899926520201885e-05, "loss": 0.035, "step": 11760 }, { "epoch": 45.26923076923077, "grad_norm": 1.3786506652832031, "learning_rate": 1.1875664188607327e-05, "loss": 0.0288, "step": 11770 }, { "epoch": 45.30769230769231, "grad_norm": 1.0867382287979126, "learning_rate": 1.1851410398827578e-05, "loss": 0.0316, "step": 11780 }, { "epoch": 45.34615384615385, "grad_norm": 1.1923028230667114, "learning_rate": 1.1827165217171567e-05, "loss": 0.0345, "step": 11790 }, { "epoch": 45.38461538461539, "grad_norm": 1.1290911436080933, "learning_rate": 1.18029287099247e-05, "loss": 0.0334, "step": 11800 }, { "epoch": 45.42307692307692, "grad_norm": 1.5147455930709839, "learning_rate": 1.1778700943348662e-05, "loss": 0.0329, "step": 11810 }, { "epoch": 45.46153846153846, "grad_norm": 1.2994474172592163, "learning_rate": 1.1754481983681238e-05, "loss": 0.0371, "step": 11820 }, { "epoch": 45.5, "grad_norm": 1.4459302425384521, "learning_rate": 1.173027189713614e-05, "loss": 0.0352, "step": 11830 }, { "epoch": 45.53846153846154, "grad_norm": 1.4037601947784424, "learning_rate": 1.170607074990282e-05, "loss": 0.0305, "step": 11840 }, { "epoch": 45.57692307692308, "grad_norm": 0.9631344079971313, "learning_rate": 1.1681878608146297e-05, "loss": 0.0346, "step": 11850 }, { "epoch": 45.61538461538461, "grad_norm": 1.2066413164138794, "learning_rate": 1.1657695538006952e-05, "loss": 0.0389, "step": 11860 }, { "epoch": 45.65384615384615, "grad_norm": 0.925900399684906, "learning_rate": 1.163352160560038e-05, "loss": 0.0386, "step": 11870 }, { "epoch": 45.69230769230769, "grad_norm": 1.1876864433288574, "learning_rate": 1.1609356877017191e-05, "loss": 0.0327, "step": 11880 }, { "epoch": 45.73076923076923, "grad_norm": 1.419493317604065, "learning_rate": 1.1585201418322828e-05, "loss": 0.0365, "step": 11890 }, { "epoch": 45.76923076923077, "grad_norm": 1.2383546829223633, "learning_rate": 1.1561055295557397e-05, "loss": 0.0367, "step": 11900 }, { "epoch": 45.80769230769231, "grad_norm": 1.1757643222808838, "learning_rate": 1.1536918574735469e-05, "loss": 0.035, "step": 11910 }, { "epoch": 45.84615384615385, "grad_norm": 1.1894476413726807, "learning_rate": 1.1512791321845921e-05, "loss": 0.0332, "step": 11920 }, { "epoch": 45.88461538461539, "grad_norm": 1.043144702911377, "learning_rate": 1.148867360285174e-05, "loss": 0.0298, "step": 11930 }, { "epoch": 45.92307692307692, "grad_norm": 1.3999884128570557, "learning_rate": 1.1464565483689853e-05, "loss": 0.0358, "step": 11940 }, { "epoch": 45.96153846153846, "grad_norm": 0.8913671970367432, "learning_rate": 1.144046703027093e-05, "loss": 0.0321, "step": 11950 }, { "epoch": 46.0, "grad_norm": 0.9930167198181152, "learning_rate": 1.1416378308479223e-05, "loss": 0.0324, "step": 11960 }, { "epoch": 46.03846153846154, "grad_norm": 1.0561189651489258, "learning_rate": 1.1392299384172383e-05, "loss": 0.0356, "step": 11970 }, { "epoch": 46.07692307692308, "grad_norm": 1.3580750226974487, "learning_rate": 1.1368230323181267e-05, "loss": 0.0335, "step": 11980 }, { "epoch": 46.11538461538461, "grad_norm": 1.4222067594528198, "learning_rate": 1.1344171191309772e-05, "loss": 0.0325, "step": 11990 }, { "epoch": 46.15384615384615, "grad_norm": 1.2112866640090942, "learning_rate": 1.1320122054334636e-05, "loss": 0.0318, "step": 12000 }, { "epoch": 46.19230769230769, "grad_norm": 1.3446886539459229, "learning_rate": 1.1296082978005292e-05, "loss": 0.0361, "step": 12010 }, { "epoch": 46.23076923076923, "grad_norm": 1.4501997232437134, "learning_rate": 1.127205402804365e-05, "loss": 0.0388, "step": 12020 }, { "epoch": 46.26923076923077, "grad_norm": 1.107108473777771, "learning_rate": 1.1248035270143946e-05, "loss": 0.0343, "step": 12030 }, { "epoch": 46.30769230769231, "grad_norm": 1.1503781080245972, "learning_rate": 1.1224026769972545e-05, "loss": 0.0308, "step": 12040 }, { "epoch": 46.34615384615385, "grad_norm": 1.2170590162277222, "learning_rate": 1.1200028593167769e-05, "loss": 0.0347, "step": 12050 }, { "epoch": 46.38461538461539, "grad_norm": 1.3022524118423462, "learning_rate": 1.1176040805339718e-05, "loss": 0.0373, "step": 12060 }, { "epoch": 46.42307692307692, "grad_norm": 1.3600707054138184, "learning_rate": 1.1152063472070086e-05, "loss": 0.0335, "step": 12070 }, { "epoch": 46.46153846153846, "grad_norm": 1.551127552986145, "learning_rate": 1.1128096658911992e-05, "loss": 0.032, "step": 12080 }, { "epoch": 46.5, "grad_norm": 1.3627465963363647, "learning_rate": 1.1104140431389782e-05, "loss": 0.0292, "step": 12090 }, { "epoch": 46.53846153846154, "grad_norm": 1.3775311708450317, "learning_rate": 1.1080194854998868e-05, "loss": 0.0361, "step": 12100 }, { "epoch": 46.57692307692308, "grad_norm": 1.0679173469543457, "learning_rate": 1.1056259995205545e-05, "loss": 0.0323, "step": 12110 }, { "epoch": 46.61538461538461, "grad_norm": 1.3462945222854614, "learning_rate": 1.1032335917446803e-05, "loss": 0.0351, "step": 12120 }, { "epoch": 46.65384615384615, "grad_norm": 1.1379890441894531, "learning_rate": 1.100842268713016e-05, "loss": 0.0321, "step": 12130 }, { "epoch": 46.69230769230769, "grad_norm": 0.7943074703216553, "learning_rate": 1.098452036963347e-05, "loss": 0.0334, "step": 12140 }, { "epoch": 46.73076923076923, "grad_norm": 1.2128022909164429, "learning_rate": 1.096062903030476e-05, "loss": 0.0353, "step": 12150 }, { "epoch": 46.76923076923077, "grad_norm": 1.0873559713363647, "learning_rate": 1.0936748734462036e-05, "loss": 0.0405, "step": 12160 }, { "epoch": 46.80769230769231, "grad_norm": 1.2984329462051392, "learning_rate": 1.0912879547393119e-05, "loss": 0.0335, "step": 12170 }, { "epoch": 46.84615384615385, "grad_norm": 0.9440507292747498, "learning_rate": 1.0889021534355456e-05, "loss": 0.0366, "step": 12180 }, { "epoch": 46.88461538461539, "grad_norm": 1.3121964931488037, "learning_rate": 1.0865174760575936e-05, "loss": 0.037, "step": 12190 }, { "epoch": 46.92307692307692, "grad_norm": 1.1670479774475098, "learning_rate": 1.0841339291250733e-05, "loss": 0.0296, "step": 12200 }, { "epoch": 46.96153846153846, "grad_norm": 1.306760311126709, "learning_rate": 1.081751519154511e-05, "loss": 0.0386, "step": 12210 }, { "epoch": 47.0, "grad_norm": 1.1474354267120361, "learning_rate": 1.079370252659325e-05, "loss": 0.0337, "step": 12220 }, { "epoch": 47.03846153846154, "grad_norm": 1.2645727396011353, "learning_rate": 1.076990136149806e-05, "loss": 0.0394, "step": 12230 }, { "epoch": 47.07692307692308, "grad_norm": 1.2392700910568237, "learning_rate": 1.0746111761331021e-05, "loss": 0.0317, "step": 12240 }, { "epoch": 47.11538461538461, "grad_norm": 0.9455440044403076, "learning_rate": 1.0722333791131996e-05, "loss": 0.0332, "step": 12250 }, { "epoch": 47.15384615384615, "grad_norm": 1.4143307209014893, "learning_rate": 1.0698567515909041e-05, "loss": 0.0357, "step": 12260 }, { "epoch": 47.19230769230769, "grad_norm": 1.264585256576538, "learning_rate": 1.0674813000638252e-05, "loss": 0.0339, "step": 12270 }, { "epoch": 47.23076923076923, "grad_norm": 1.1998621225357056, "learning_rate": 1.0651070310263559e-05, "loss": 0.0337, "step": 12280 }, { "epoch": 47.26923076923077, "grad_norm": 1.1370961666107178, "learning_rate": 1.0627339509696574e-05, "loss": 0.0304, "step": 12290 }, { "epoch": 47.30769230769231, "grad_norm": 1.2680944204330444, "learning_rate": 1.06036206638164e-05, "loss": 0.032, "step": 12300 }, { "epoch": 47.34615384615385, "grad_norm": 1.0663670301437378, "learning_rate": 1.0579913837469455e-05, "loss": 0.032, "step": 12310 }, { "epoch": 47.38461538461539, "grad_norm": 1.1136479377746582, "learning_rate": 1.0556219095469303e-05, "loss": 0.0325, "step": 12320 }, { "epoch": 47.42307692307692, "grad_norm": 1.046266794204712, "learning_rate": 1.0532536502596455e-05, "loss": 0.0312, "step": 12330 }, { "epoch": 47.46153846153846, "grad_norm": 1.133286476135254, "learning_rate": 1.0508866123598218e-05, "loss": 0.0316, "step": 12340 }, { "epoch": 47.5, "grad_norm": 1.1869333982467651, "learning_rate": 1.0485208023188505e-05, "loss": 0.0332, "step": 12350 }, { "epoch": 47.53846153846154, "grad_norm": 1.3524519205093384, "learning_rate": 1.0461562266047668e-05, "loss": 0.0295, "step": 12360 }, { "epoch": 47.57692307692308, "grad_norm": 1.1688059568405151, "learning_rate": 1.0437928916822286e-05, "loss": 0.0285, "step": 12370 }, { "epoch": 47.61538461538461, "grad_norm": 1.3284744024276733, "learning_rate": 1.0414308040125043e-05, "loss": 0.0393, "step": 12380 }, { "epoch": 47.65384615384615, "grad_norm": 1.016032099723816, "learning_rate": 1.0390699700534517e-05, "loss": 0.0309, "step": 12390 }, { "epoch": 47.69230769230769, "grad_norm": 0.8657578229904175, "learning_rate": 1.0367103962595003e-05, "loss": 0.0377, "step": 12400 }, { "epoch": 47.73076923076923, "grad_norm": 1.0467973947525024, "learning_rate": 1.0343520890816356e-05, "loss": 0.0356, "step": 12410 }, { "epoch": 47.76923076923077, "grad_norm": 0.9654638767242432, "learning_rate": 1.0319950549673779e-05, "loss": 0.035, "step": 12420 }, { "epoch": 47.80769230769231, "grad_norm": 1.0706387758255005, "learning_rate": 1.0296393003607692e-05, "loss": 0.0344, "step": 12430 }, { "epoch": 47.84615384615385, "grad_norm": 1.2317215204238892, "learning_rate": 1.0272848317023526e-05, "loss": 0.0321, "step": 12440 }, { "epoch": 47.88461538461539, "grad_norm": 1.0294604301452637, "learning_rate": 1.0249316554291556e-05, "loss": 0.0297, "step": 12450 }, { "epoch": 47.92307692307692, "grad_norm": 1.147530198097229, "learning_rate": 1.022579777974673e-05, "loss": 0.0352, "step": 12460 }, { "epoch": 47.96153846153846, "grad_norm": 1.0180110931396484, "learning_rate": 1.0202292057688462e-05, "loss": 0.0317, "step": 12470 }, { "epoch": 48.0, "grad_norm": 0.9370880126953125, "learning_rate": 1.0178799452380511e-05, "loss": 0.036, "step": 12480 }, { "epoch": 48.03846153846154, "grad_norm": 0.9655492305755615, "learning_rate": 1.0155320028050757e-05, "loss": 0.032, "step": 12490 }, { "epoch": 48.07692307692308, "grad_norm": 0.9434256553649902, "learning_rate": 1.0131853848891063e-05, "loss": 0.0325, "step": 12500 }, { "epoch": 48.11538461538461, "grad_norm": 0.9240254163742065, "learning_rate": 1.0108400979057048e-05, "loss": 0.0302, "step": 12510 }, { "epoch": 48.15384615384615, "grad_norm": 1.0489293336868286, "learning_rate": 1.008496148266797e-05, "loss": 0.0306, "step": 12520 }, { "epoch": 48.19230769230769, "grad_norm": 1.136711835861206, "learning_rate": 1.0061535423806519e-05, "loss": 0.0304, "step": 12530 }, { "epoch": 48.23076923076923, "grad_norm": 1.0447853803634644, "learning_rate": 1.0038122866518647e-05, "loss": 0.0309, "step": 12540 }, { "epoch": 48.26923076923077, "grad_norm": 1.01454496383667, "learning_rate": 1.0014723874813394e-05, "loss": 0.0329, "step": 12550 }, { "epoch": 48.30769230769231, "grad_norm": 1.0103205442428589, "learning_rate": 9.991338512662696e-06, "loss": 0.0282, "step": 12560 }, { "epoch": 48.34615384615385, "grad_norm": 1.1427966356277466, "learning_rate": 9.96796684400125e-06, "loss": 0.0353, "step": 12570 }, { "epoch": 48.38461538461539, "grad_norm": 0.9953744411468506, "learning_rate": 9.944608932726306e-06, "loss": 0.0333, "step": 12580 }, { "epoch": 48.42307692307692, "grad_norm": 0.9882763028144836, "learning_rate": 9.921264842697501e-06, "loss": 0.0308, "step": 12590 }, { "epoch": 48.46153846153846, "grad_norm": 1.046663761138916, "learning_rate": 9.897934637736692e-06, "loss": 0.0345, "step": 12600 }, { "epoch": 48.5, "grad_norm": 1.264633297920227, "learning_rate": 9.874618381627751e-06, "loss": 0.0305, "step": 12610 }, { "epoch": 48.53846153846154, "grad_norm": 0.9879583716392517, "learning_rate": 9.851316138116446e-06, "loss": 0.0324, "step": 12620 }, { "epoch": 48.57692307692308, "grad_norm": 1.2528743743896484, "learning_rate": 9.828027970910217e-06, "loss": 0.0342, "step": 12630 }, { "epoch": 48.61538461538461, "grad_norm": 1.0247045755386353, "learning_rate": 9.80475394367803e-06, "loss": 0.0314, "step": 12640 }, { "epoch": 48.65384615384615, "grad_norm": 1.2082724571228027, "learning_rate": 9.781494120050176e-06, "loss": 0.0325, "step": 12650 }, { "epoch": 48.69230769230769, "grad_norm": 1.0873223543167114, "learning_rate": 9.758248563618126e-06, "loss": 0.0345, "step": 12660 }, { "epoch": 48.73076923076923, "grad_norm": 1.2322958707809448, "learning_rate": 9.735017337934349e-06, "loss": 0.0353, "step": 12670 }, { "epoch": 48.76923076923077, "grad_norm": 0.9945014119148254, "learning_rate": 9.71180050651213e-06, "loss": 0.0288, "step": 12680 }, { "epoch": 48.80769230769231, "grad_norm": 1.0278196334838867, "learning_rate": 9.688598132825402e-06, "loss": 0.0398, "step": 12690 }, { "epoch": 48.84615384615385, "grad_norm": 0.9396235942840576, "learning_rate": 9.665410280308555e-06, "loss": 0.0371, "step": 12700 }, { "epoch": 48.88461538461539, "grad_norm": 0.8512292504310608, "learning_rate": 9.642237012356302e-06, "loss": 0.028, "step": 12710 }, { "epoch": 48.92307692307692, "grad_norm": 1.0575693845748901, "learning_rate": 9.619078392323471e-06, "loss": 0.0342, "step": 12720 }, { "epoch": 48.96153846153846, "grad_norm": 1.2560358047485352, "learning_rate": 9.595934483524847e-06, "loss": 0.0318, "step": 12730 }, { "epoch": 49.0, "grad_norm": 1.3950382471084595, "learning_rate": 9.572805349234997e-06, "loss": 0.0324, "step": 12740 }, { "epoch": 49.03846153846154, "grad_norm": 1.0843040943145752, "learning_rate": 9.54969105268808e-06, "loss": 0.0308, "step": 12750 }, { "epoch": 49.07692307692308, "grad_norm": 1.165537714958191, "learning_rate": 9.526591657077701e-06, "loss": 0.0337, "step": 12760 }, { "epoch": 49.11538461538461, "grad_norm": 0.8965687155723572, "learning_rate": 9.503507225556734e-06, "loss": 0.0303, "step": 12770 }, { "epoch": 49.15384615384615, "grad_norm": 0.9655594229698181, "learning_rate": 9.480437821237134e-06, "loss": 0.038, "step": 12780 }, { "epoch": 49.19230769230769, "grad_norm": 1.0516133308410645, "learning_rate": 9.457383507189763e-06, "loss": 0.0372, "step": 12790 }, { "epoch": 49.23076923076923, "grad_norm": 0.9941837191581726, "learning_rate": 9.434344346444237e-06, "loss": 0.0356, "step": 12800 }, { "epoch": 49.26923076923077, "grad_norm": 1.3109407424926758, "learning_rate": 9.411320401988744e-06, "loss": 0.0371, "step": 12810 }, { "epoch": 49.30769230769231, "grad_norm": 1.0958704948425293, "learning_rate": 9.388311736769867e-06, "loss": 0.0356, "step": 12820 }, { "epoch": 49.34615384615385, "grad_norm": 1.2955561876296997, "learning_rate": 9.365318413692429e-06, "loss": 0.0331, "step": 12830 }, { "epoch": 49.38461538461539, "grad_norm": 0.9428987503051758, "learning_rate": 9.34234049561928e-06, "loss": 0.0343, "step": 12840 }, { "epoch": 49.42307692307692, "grad_norm": 1.1973485946655273, "learning_rate": 9.31937804537118e-06, "loss": 0.0303, "step": 12850 }, { "epoch": 49.46153846153846, "grad_norm": 1.0412033796310425, "learning_rate": 9.296431125726587e-06, "loss": 0.0331, "step": 12860 }, { "epoch": 49.5, "grad_norm": 1.315387487411499, "learning_rate": 9.27349979942151e-06, "loss": 0.0334, "step": 12870 }, { "epoch": 49.53846153846154, "grad_norm": 0.8955021500587463, "learning_rate": 9.250584129149321e-06, "loss": 0.0328, "step": 12880 }, { "epoch": 49.57692307692308, "grad_norm": 1.017458200454712, "learning_rate": 9.227684177560575e-06, "loss": 0.033, "step": 12890 }, { "epoch": 49.61538461538461, "grad_norm": 0.8650226593017578, "learning_rate": 9.204800007262874e-06, "loss": 0.0299, "step": 12900 }, { "epoch": 49.65384615384615, "grad_norm": 0.750944197177887, "learning_rate": 9.18193168082066e-06, "loss": 0.0311, "step": 12910 }, { "epoch": 49.69230769230769, "grad_norm": 0.9948349595069885, "learning_rate": 9.159079260755079e-06, "loss": 0.0357, "step": 12920 }, { "epoch": 49.73076923076923, "grad_norm": 1.1247837543487549, "learning_rate": 9.136242809543754e-06, "loss": 0.0331, "step": 12930 }, { "epoch": 49.76923076923077, "grad_norm": 1.0161885023117065, "learning_rate": 9.113422389620685e-06, "loss": 0.0271, "step": 12940 }, { "epoch": 49.80769230769231, "grad_norm": 0.9860794544219971, "learning_rate": 9.090618063376021e-06, "loss": 0.0331, "step": 12950 }, { "epoch": 49.84615384615385, "grad_norm": 0.9928134083747864, "learning_rate": 9.067829893155922e-06, "loss": 0.0324, "step": 12960 }, { "epoch": 49.88461538461539, "grad_norm": 1.0498042106628418, "learning_rate": 9.045057941262384e-06, "loss": 0.0342, "step": 12970 }, { "epoch": 49.92307692307692, "grad_norm": 1.02463698387146, "learning_rate": 9.02230226995304e-06, "loss": 0.0341, "step": 12980 }, { "epoch": 49.96153846153846, "grad_norm": 1.065952181816101, "learning_rate": 8.999562941441031e-06, "loss": 0.0333, "step": 12990 }, { "epoch": 50.0, "grad_norm": 0.989569902420044, "learning_rate": 8.976840017894814e-06, "loss": 0.0348, "step": 13000 }, { "epoch": 50.03846153846154, "grad_norm": 1.242495059967041, "learning_rate": 8.954133561437993e-06, "loss": 0.0368, "step": 13010 }, { "epoch": 50.07692307692308, "grad_norm": 1.2487726211547852, "learning_rate": 8.931443634149163e-06, "loss": 0.0323, "step": 13020 }, { "epoch": 50.11538461538461, "grad_norm": 1.0911444425582886, "learning_rate": 8.908770298061702e-06, "loss": 0.034, "step": 13030 }, { "epoch": 50.15384615384615, "grad_norm": 1.0673996210098267, "learning_rate": 8.886113615163655e-06, "loss": 0.0305, "step": 13040 }, { "epoch": 50.19230769230769, "grad_norm": 1.1423349380493164, "learning_rate": 8.863473647397522e-06, "loss": 0.0322, "step": 13050 }, { "epoch": 50.23076923076923, "grad_norm": 1.0116785764694214, "learning_rate": 8.840850456660121e-06, "loss": 0.0349, "step": 13060 }, { "epoch": 50.26923076923077, "grad_norm": 0.9031907916069031, "learning_rate": 8.818244104802384e-06, "loss": 0.0246, "step": 13070 }, { "epoch": 50.30769230769231, "grad_norm": 1.0316237211227417, "learning_rate": 8.79565465362921e-06, "loss": 0.0314, "step": 13080 }, { "epoch": 50.34615384615385, "grad_norm": 1.0059611797332764, "learning_rate": 8.773082164899305e-06, "loss": 0.0341, "step": 13090 }, { "epoch": 50.38461538461539, "grad_norm": 0.974493145942688, "learning_rate": 8.750526700324984e-06, "loss": 0.0336, "step": 13100 }, { "epoch": 50.42307692307692, "grad_norm": 1.0661152601242065, "learning_rate": 8.72798832157203e-06, "loss": 0.0323, "step": 13110 }, { "epoch": 50.46153846153846, "grad_norm": 0.9538970589637756, "learning_rate": 8.705467090259507e-06, "loss": 0.0353, "step": 13120 }, { "epoch": 50.5, "grad_norm": 0.8963907361030579, "learning_rate": 8.682963067959607e-06, "loss": 0.0346, "step": 13130 }, { "epoch": 50.53846153846154, "grad_norm": 0.9397483468055725, "learning_rate": 8.660476316197457e-06, "loss": 0.0308, "step": 13140 }, { "epoch": 50.57692307692308, "grad_norm": 0.9931941032409668, "learning_rate": 8.638006896450991e-06, "loss": 0.0303, "step": 13150 }, { "epoch": 50.61538461538461, "grad_norm": 0.9436623454093933, "learning_rate": 8.61555487015074e-06, "loss": 0.0268, "step": 13160 }, { "epoch": 50.65384615384615, "grad_norm": 0.9443449974060059, "learning_rate": 8.593120298679676e-06, "loss": 0.034, "step": 13170 }, { "epoch": 50.69230769230769, "grad_norm": 0.9173842072486877, "learning_rate": 8.570703243373076e-06, "loss": 0.0315, "step": 13180 }, { "epoch": 50.73076923076923, "grad_norm": 1.0726054906845093, "learning_rate": 8.5483037655183e-06, "loss": 0.0278, "step": 13190 }, { "epoch": 50.76923076923077, "grad_norm": 0.9434815049171448, "learning_rate": 8.525921926354686e-06, "loss": 0.0404, "step": 13200 }, { "epoch": 50.80769230769231, "grad_norm": 1.0521373748779297, "learning_rate": 8.5035577870733e-06, "loss": 0.035, "step": 13210 }, { "epoch": 50.84615384615385, "grad_norm": 1.1391912698745728, "learning_rate": 8.481211408816858e-06, "loss": 0.0344, "step": 13220 }, { "epoch": 50.88461538461539, "grad_norm": 1.4031716585159302, "learning_rate": 8.45888285267951e-06, "loss": 0.0344, "step": 13230 }, { "epoch": 50.92307692307692, "grad_norm": 0.8375698924064636, "learning_rate": 8.436572179706666e-06, "loss": 0.0359, "step": 13240 }, { "epoch": 50.96153846153846, "grad_norm": 1.113251805305481, "learning_rate": 8.41427945089487e-06, "loss": 0.0341, "step": 13250 }, { "epoch": 51.0, "grad_norm": 1.0186192989349365, "learning_rate": 8.39200472719157e-06, "loss": 0.0372, "step": 13260 }, { "epoch": 51.03846153846154, "grad_norm": 1.0115083456039429, "learning_rate": 8.369748069495017e-06, "loss": 0.0389, "step": 13270 }, { "epoch": 51.07692307692308, "grad_norm": 1.3340504169464111, "learning_rate": 8.347509538654074e-06, "loss": 0.0331, "step": 13280 }, { "epoch": 51.11538461538461, "grad_norm": 1.0206196308135986, "learning_rate": 8.325289195468023e-06, "loss": 0.0362, "step": 13290 }, { "epoch": 51.15384615384615, "grad_norm": 0.978456974029541, "learning_rate": 8.303087100686449e-06, "loss": 0.038, "step": 13300 }, { "epoch": 51.19230769230769, "grad_norm": 0.9769437909126282, "learning_rate": 8.280903315009009e-06, "loss": 0.0281, "step": 13310 }, { "epoch": 51.23076923076923, "grad_norm": 0.979179322719574, "learning_rate": 8.258737899085338e-06, "loss": 0.0328, "step": 13320 }, { "epoch": 51.26923076923077, "grad_norm": 1.1295559406280518, "learning_rate": 8.236590913514841e-06, "loss": 0.035, "step": 13330 }, { "epoch": 51.30769230769231, "grad_norm": 0.948234498500824, "learning_rate": 8.214462418846529e-06, "loss": 0.0316, "step": 13340 }, { "epoch": 51.34615384615385, "grad_norm": 0.9799079298973083, "learning_rate": 8.192352475578857e-06, "loss": 0.0275, "step": 13350 }, { "epoch": 51.38461538461539, "grad_norm": 1.1674693822860718, "learning_rate": 8.170261144159563e-06, "loss": 0.0319, "step": 13360 }, { "epoch": 51.42307692307692, "grad_norm": 1.1795536279678345, "learning_rate": 8.148188484985505e-06, "loss": 0.0298, "step": 13370 }, { "epoch": 51.46153846153846, "grad_norm": 0.9101167917251587, "learning_rate": 8.126134558402501e-06, "loss": 0.0325, "step": 13380 }, { "epoch": 51.5, "grad_norm": 0.995457112789154, "learning_rate": 8.104099424705135e-06, "loss": 0.0325, "step": 13390 }, { "epoch": 51.53846153846154, "grad_norm": 1.003032922744751, "learning_rate": 8.082083144136625e-06, "loss": 0.0283, "step": 13400 }, { "epoch": 51.57692307692308, "grad_norm": 0.9459449648857117, "learning_rate": 8.060085776888634e-06, "loss": 0.0325, "step": 13410 }, { "epoch": 51.61538461538461, "grad_norm": 1.151404619216919, "learning_rate": 8.038107383101126e-06, "loss": 0.0338, "step": 13420 }, { "epoch": 51.65384615384615, "grad_norm": 0.9389851689338684, "learning_rate": 8.016148022862201e-06, "loss": 0.0351, "step": 13430 }, { "epoch": 51.69230769230769, "grad_norm": 0.9662908315658569, "learning_rate": 7.994207756207903e-06, "loss": 0.0363, "step": 13440 }, { "epoch": 51.73076923076923, "grad_norm": 0.9525715708732605, "learning_rate": 7.972286643122083e-06, "loss": 0.0343, "step": 13450 }, { "epoch": 51.76923076923077, "grad_norm": 1.1760188341140747, "learning_rate": 7.950384743536225e-06, "loss": 0.0345, "step": 13460 }, { "epoch": 51.80769230769231, "grad_norm": 0.9653248190879822, "learning_rate": 7.928502117329285e-06, "loss": 0.0348, "step": 13470 }, { "epoch": 51.84615384615385, "grad_norm": 1.0553956031799316, "learning_rate": 7.906638824327545e-06, "loss": 0.0287, "step": 13480 }, { "epoch": 51.88461538461539, "grad_norm": 1.1215946674346924, "learning_rate": 7.884794924304392e-06, "loss": 0.0424, "step": 13490 }, { "epoch": 51.92307692307692, "grad_norm": 0.754399836063385, "learning_rate": 7.862970476980218e-06, "loss": 0.0353, "step": 13500 }, { "epoch": 51.96153846153846, "grad_norm": 1.1638495922088623, "learning_rate": 7.841165542022242e-06, "loss": 0.0382, "step": 13510 }, { "epoch": 52.0, "grad_norm": 1.0180604457855225, "learning_rate": 7.819380179044307e-06, "loss": 0.0276, "step": 13520 }, { "epoch": 52.03846153846154, "grad_norm": 1.0161025524139404, "learning_rate": 7.797614447606783e-06, "loss": 0.0301, "step": 13530 }, { "epoch": 52.07692307692308, "grad_norm": 0.8754862546920776, "learning_rate": 7.775868407216326e-06, "loss": 0.0381, "step": 13540 }, { "epoch": 52.11538461538461, "grad_norm": 0.9107174277305603, "learning_rate": 7.754142117325792e-06, "loss": 0.0328, "step": 13550 }, { "epoch": 52.15384615384615, "grad_norm": 0.9813603162765503, "learning_rate": 7.73243563733403e-06, "loss": 0.0356, "step": 13560 }, { "epoch": 52.19230769230769, "grad_norm": 1.0126020908355713, "learning_rate": 7.710749026585726e-06, "loss": 0.0276, "step": 13570 }, { "epoch": 52.23076923076923, "grad_norm": 0.9731197357177734, "learning_rate": 7.689082344371244e-06, "loss": 0.0356, "step": 13580 }, { "epoch": 52.26923076923077, "grad_norm": 1.082835078239441, "learning_rate": 7.667435649926459e-06, "loss": 0.0315, "step": 13590 }, { "epoch": 52.30769230769231, "grad_norm": 0.8532876372337341, "learning_rate": 7.645809002432616e-06, "loss": 0.0333, "step": 13600 }, { "epoch": 52.34615384615385, "grad_norm": 0.8078252673149109, "learning_rate": 7.6242024610161454e-06, "loss": 0.0321, "step": 13610 }, { "epoch": 52.38461538461539, "grad_norm": 0.9187402725219727, "learning_rate": 7.602616084748501e-06, "loss": 0.0327, "step": 13620 }, { "epoch": 52.42307692307692, "grad_norm": 0.8912934064865112, "learning_rate": 7.5810499326460145e-06, "loss": 0.0359, "step": 13630 }, { "epoch": 52.46153846153846, "grad_norm": 0.8820334672927856, "learning_rate": 7.5595040636697145e-06, "loss": 0.0338, "step": 13640 }, { "epoch": 52.5, "grad_norm": 1.0421593189239502, "learning_rate": 7.537978536725189e-06, "loss": 0.0293, "step": 13650 }, { "epoch": 52.53846153846154, "grad_norm": 0.9799807667732239, "learning_rate": 7.5164734106624135e-06, "loss": 0.0345, "step": 13660 }, { "epoch": 52.57692307692308, "grad_norm": 0.8654934167861938, "learning_rate": 7.494988744275575e-06, "loss": 0.0288, "step": 13670 }, { "epoch": 52.61538461538461, "grad_norm": 0.939386248588562, "learning_rate": 7.473524596302931e-06, "loss": 0.0372, "step": 13680 }, { "epoch": 52.65384615384615, "grad_norm": 1.0695780515670776, "learning_rate": 7.452081025426639e-06, "loss": 0.0316, "step": 13690 }, { "epoch": 52.69230769230769, "grad_norm": 1.0574803352355957, "learning_rate": 7.430658090272606e-06, "loss": 0.0337, "step": 13700 }, { "epoch": 52.73076923076923, "grad_norm": 1.0280009508132935, "learning_rate": 7.40925584941033e-06, "loss": 0.0362, "step": 13710 }, { "epoch": 52.76923076923077, "grad_norm": 1.196169376373291, "learning_rate": 7.3878743613527e-06, "loss": 0.0285, "step": 13720 }, { "epoch": 52.80769230769231, "grad_norm": 0.9964907169342041, "learning_rate": 7.366513684555903e-06, "loss": 0.0346, "step": 13730 }, { "epoch": 52.84615384615385, "grad_norm": 1.0160105228424072, "learning_rate": 7.345173877419204e-06, "loss": 0.031, "step": 13740 }, { "epoch": 52.88461538461539, "grad_norm": 1.034250020980835, "learning_rate": 7.323854998284823e-06, "loss": 0.0351, "step": 13750 }, { "epoch": 52.92307692307692, "grad_norm": 1.0437781810760498, "learning_rate": 7.302557105437775e-06, "loss": 0.0306, "step": 13760 }, { "epoch": 52.96153846153846, "grad_norm": 0.9846848249435425, "learning_rate": 7.281280257105666e-06, "loss": 0.0312, "step": 13770 }, { "epoch": 53.0, "grad_norm": 1.1539181470870972, "learning_rate": 7.260024511458599e-06, "loss": 0.03, "step": 13780 }, { "epoch": 53.03846153846154, "grad_norm": 1.0355321168899536, "learning_rate": 7.238789926608963e-06, "loss": 0.0335, "step": 13790 }, { "epoch": 53.07692307692308, "grad_norm": 0.8215237855911255, "learning_rate": 7.217576560611305e-06, "loss": 0.0346, "step": 13800 }, { "epoch": 53.11538461538461, "grad_norm": 1.086424469947815, "learning_rate": 7.196384471462172e-06, "loss": 0.0366, "step": 13810 }, { "epoch": 53.15384615384615, "grad_norm": 1.2629507780075073, "learning_rate": 7.175213717099903e-06, "loss": 0.0326, "step": 13820 }, { "epoch": 53.19230769230769, "grad_norm": 1.1072007417678833, "learning_rate": 7.154064355404547e-06, "loss": 0.0325, "step": 13830 }, { "epoch": 53.23076923076923, "grad_norm": 1.0039595365524292, "learning_rate": 7.132936444197641e-06, "loss": 0.0325, "step": 13840 }, { "epoch": 53.26923076923077, "grad_norm": 0.990537166595459, "learning_rate": 7.111830041242101e-06, "loss": 0.0291, "step": 13850 }, { "epoch": 53.30769230769231, "grad_norm": 1.0518862009048462, "learning_rate": 7.090745204242018e-06, "loss": 0.0358, "step": 13860 }, { "epoch": 53.34615384615385, "grad_norm": 0.9419360756874084, "learning_rate": 7.069681990842527e-06, "loss": 0.0281, "step": 13870 }, { "epoch": 53.38461538461539, "grad_norm": 1.0219368934631348, "learning_rate": 7.04864045862966e-06, "loss": 0.0316, "step": 13880 }, { "epoch": 53.42307692307692, "grad_norm": 1.0230674743652344, "learning_rate": 7.027620665130152e-06, "loss": 0.0338, "step": 13890 }, { "epoch": 53.46153846153846, "grad_norm": 0.796351432800293, "learning_rate": 7.006622667811326e-06, "loss": 0.0352, "step": 13900 }, { "epoch": 53.5, "grad_norm": 1.2250455617904663, "learning_rate": 6.9856465240809e-06, "loss": 0.0295, "step": 13910 }, { "epoch": 53.53846153846154, "grad_norm": 0.9623807072639465, "learning_rate": 6.964692291286844e-06, "loss": 0.0352, "step": 13920 }, { "epoch": 53.57692307692308, "grad_norm": 0.8724016547203064, "learning_rate": 6.943760026717243e-06, "loss": 0.0314, "step": 13930 }, { "epoch": 53.61538461538461, "grad_norm": 1.101204514503479, "learning_rate": 6.922849787600097e-06, "loss": 0.0318, "step": 13940 }, { "epoch": 53.65384615384615, "grad_norm": 0.8270919322967529, "learning_rate": 6.90196163110321e-06, "loss": 0.0363, "step": 13950 }, { "epoch": 53.69230769230769, "grad_norm": 0.8336949944496155, "learning_rate": 6.881095614334002e-06, "loss": 0.0313, "step": 13960 }, { "epoch": 53.73076923076923, "grad_norm": 0.8287408351898193, "learning_rate": 6.860251794339359e-06, "loss": 0.0299, "step": 13970 }, { "epoch": 53.76923076923077, "grad_norm": 0.8003503680229187, "learning_rate": 6.839430228105501e-06, "loss": 0.0298, "step": 13980 }, { "epoch": 53.80769230769231, "grad_norm": 1.0847970247268677, "learning_rate": 6.818630972557788e-06, "loss": 0.0313, "step": 13990 }, { "epoch": 53.84615384615385, "grad_norm": 0.8157468438148499, "learning_rate": 6.797854084560585e-06, "loss": 0.0288, "step": 14000 }, { "epoch": 53.88461538461539, "grad_norm": 0.7339880466461182, "learning_rate": 6.777099620917124e-06, "loss": 0.0316, "step": 14010 }, { "epoch": 53.92307692307692, "grad_norm": 0.9050057530403137, "learning_rate": 6.756367638369301e-06, "loss": 0.0339, "step": 14020 }, { "epoch": 53.96153846153846, "grad_norm": 0.8340151309967041, "learning_rate": 6.735658193597579e-06, "loss": 0.031, "step": 14030 }, { "epoch": 54.0, "grad_norm": 1.0625113248825073, "learning_rate": 6.7149713432207825e-06, "loss": 0.0295, "step": 14040 }, { "epoch": 54.03846153846154, "grad_norm": 0.9300896525382996, "learning_rate": 6.694307143795966e-06, "loss": 0.027, "step": 14050 }, { "epoch": 54.07692307692308, "grad_norm": 1.1176495552062988, "learning_rate": 6.6736656518182704e-06, "loss": 0.0321, "step": 14060 }, { "epoch": 54.11538461538461, "grad_norm": 0.96674644947052, "learning_rate": 6.6530469237207375e-06, "loss": 0.0333, "step": 14070 }, { "epoch": 54.15384615384615, "grad_norm": 0.9425223469734192, "learning_rate": 6.632451015874193e-06, "loss": 0.029, "step": 14080 }, { "epoch": 54.19230769230769, "grad_norm": 0.8956887125968933, "learning_rate": 6.611877984587058e-06, "loss": 0.0296, "step": 14090 }, { "epoch": 54.23076923076923, "grad_norm": 1.0595916509628296, "learning_rate": 6.591327886105207e-06, "loss": 0.0316, "step": 14100 }, { "epoch": 54.26923076923077, "grad_norm": 0.6846816539764404, "learning_rate": 6.570800776611836e-06, "loss": 0.0267, "step": 14110 }, { "epoch": 54.30769230769231, "grad_norm": 0.8474279046058655, "learning_rate": 6.55029671222727e-06, "loss": 0.0293, "step": 14120 }, { "epoch": 54.34615384615385, "grad_norm": 1.0645068883895874, "learning_rate": 6.529815749008846e-06, "loss": 0.0325, "step": 14130 }, { "epoch": 54.38461538461539, "grad_norm": 1.061000108718872, "learning_rate": 6.50935794295073e-06, "loss": 0.0344, "step": 14140 }, { "epoch": 54.42307692307692, "grad_norm": 0.8858121633529663, "learning_rate": 6.488923349983779e-06, "loss": 0.0294, "step": 14150 }, { "epoch": 54.46153846153846, "grad_norm": 0.9322943091392517, "learning_rate": 6.468512025975401e-06, "loss": 0.0302, "step": 14160 }, { "epoch": 54.5, "grad_norm": 1.0522525310516357, "learning_rate": 6.448124026729363e-06, "loss": 0.0321, "step": 14170 }, { "epoch": 54.53846153846154, "grad_norm": 0.885651171207428, "learning_rate": 6.427759407985691e-06, "loss": 0.0308, "step": 14180 }, { "epoch": 54.57692307692308, "grad_norm": 1.0464199781417847, "learning_rate": 6.407418225420465e-06, "loss": 0.0313, "step": 14190 }, { "epoch": 54.61538461538461, "grad_norm": 0.9101268649101257, "learning_rate": 6.387100534645698e-06, "loss": 0.0313, "step": 14200 }, { "epoch": 54.65384615384615, "grad_norm": 0.970328688621521, "learning_rate": 6.366806391209194e-06, "loss": 0.0319, "step": 14210 }, { "epoch": 54.69230769230769, "grad_norm": 0.8179218173027039, "learning_rate": 6.346535850594352e-06, "loss": 0.0332, "step": 14220 }, { "epoch": 54.73076923076923, "grad_norm": 0.9124367833137512, "learning_rate": 6.326288968220069e-06, "loss": 0.0344, "step": 14230 }, { "epoch": 54.76923076923077, "grad_norm": 1.0613592863082886, "learning_rate": 6.306065799440542e-06, "loss": 0.0286, "step": 14240 }, { "epoch": 54.80769230769231, "grad_norm": 1.0514345169067383, "learning_rate": 6.285866399545137e-06, "loss": 0.0334, "step": 14250 }, { "epoch": 54.84615384615385, "grad_norm": 1.174297571182251, "learning_rate": 6.2656908237582515e-06, "loss": 0.0342, "step": 14260 }, { "epoch": 54.88461538461539, "grad_norm": 0.7678318619728088, "learning_rate": 6.245539127239135e-06, "loss": 0.0289, "step": 14270 }, { "epoch": 54.92307692307692, "grad_norm": 0.9058740139007568, "learning_rate": 6.225411365081752e-06, "loss": 0.0307, "step": 14280 }, { "epoch": 54.96153846153846, "grad_norm": 0.941781759262085, "learning_rate": 6.205307592314645e-06, "loss": 0.0253, "step": 14290 }, { "epoch": 55.0, "grad_norm": 0.7861696481704712, "learning_rate": 6.185227863900751e-06, "loss": 0.0296, "step": 14300 }, { "epoch": 55.03846153846154, "grad_norm": 0.7829078435897827, "learning_rate": 6.165172234737291e-06, "loss": 0.0279, "step": 14310 }, { "epoch": 55.07692307692308, "grad_norm": 0.9816953539848328, "learning_rate": 6.145140759655586e-06, "loss": 0.0371, "step": 14320 }, { "epoch": 55.11538461538461, "grad_norm": 0.965496838092804, "learning_rate": 6.125133493420914e-06, "loss": 0.0346, "step": 14330 }, { "epoch": 55.15384615384615, "grad_norm": 0.8535565137863159, "learning_rate": 6.1051504907323915e-06, "loss": 0.0335, "step": 14340 }, { "epoch": 55.19230769230769, "grad_norm": 0.6699849367141724, "learning_rate": 6.085191806222774e-06, "loss": 0.03, "step": 14350 }, { "epoch": 55.23076923076923, "grad_norm": 0.927868664264679, "learning_rate": 6.065257494458352e-06, "loss": 0.0349, "step": 14360 }, { "epoch": 55.26923076923077, "grad_norm": 0.9079675674438477, "learning_rate": 6.045347609938767e-06, "loss": 0.0319, "step": 14370 }, { "epoch": 55.30769230769231, "grad_norm": 0.927964985370636, "learning_rate": 6.025462207096879e-06, "loss": 0.0335, "step": 14380 }, { "epoch": 55.34615384615385, "grad_norm": 0.8558468818664551, "learning_rate": 6.005601340298631e-06, "loss": 0.03, "step": 14390 }, { "epoch": 55.38461538461539, "grad_norm": 0.8809120059013367, "learning_rate": 5.985765063842862e-06, "loss": 0.0312, "step": 14400 }, { "epoch": 55.42307692307692, "grad_norm": 0.8774912357330322, "learning_rate": 5.965953431961206e-06, "loss": 0.0325, "step": 14410 }, { "epoch": 55.46153846153846, "grad_norm": 0.7501303553581238, "learning_rate": 5.946166498817903e-06, "loss": 0.0303, "step": 14420 }, { "epoch": 55.5, "grad_norm": 0.8086189031600952, "learning_rate": 5.926404318509668e-06, "loss": 0.0266, "step": 14430 }, { "epoch": 55.53846153846154, "grad_norm": 1.08782958984375, "learning_rate": 5.906666945065556e-06, "loss": 0.0279, "step": 14440 }, { "epoch": 55.57692307692308, "grad_norm": 1.0755653381347656, "learning_rate": 5.886954432446784e-06, "loss": 0.0386, "step": 14450 }, { "epoch": 55.61538461538461, "grad_norm": 0.9763014912605286, "learning_rate": 5.867266834546617e-06, "loss": 0.029, "step": 14460 }, { "epoch": 55.65384615384615, "grad_norm": 1.0998519659042358, "learning_rate": 5.847604205190192e-06, "loss": 0.0331, "step": 14470 }, { "epoch": 55.69230769230769, "grad_norm": 0.977335512638092, "learning_rate": 5.827966598134383e-06, "loss": 0.0342, "step": 14480 }, { "epoch": 55.73076923076923, "grad_norm": 0.8687805533409119, "learning_rate": 5.808354067067665e-06, "loss": 0.0323, "step": 14490 }, { "epoch": 55.76923076923077, "grad_norm": 0.7466011047363281, "learning_rate": 5.788766665609941e-06, "loss": 0.0257, "step": 14500 }, { "epoch": 55.80769230769231, "grad_norm": 0.7289785742759705, "learning_rate": 5.7692044473124276e-06, "loss": 0.0309, "step": 14510 }, { "epoch": 55.84615384615385, "grad_norm": 0.9711834788322449, "learning_rate": 5.749667465657479e-06, "loss": 0.0273, "step": 14520 }, { "epoch": 55.88461538461539, "grad_norm": 0.629587709903717, "learning_rate": 5.730155774058451e-06, "loss": 0.0316, "step": 14530 }, { "epoch": 55.92307692307692, "grad_norm": 1.080928921699524, "learning_rate": 5.710669425859575e-06, "loss": 0.0357, "step": 14540 }, { "epoch": 55.96153846153846, "grad_norm": 0.8124915361404419, "learning_rate": 5.691208474335774e-06, "loss": 0.0329, "step": 14550 }, { "epoch": 56.0, "grad_norm": 0.8106907606124878, "learning_rate": 5.6717729726925446e-06, "loss": 0.031, "step": 14560 }, { "epoch": 56.03846153846154, "grad_norm": 0.9612008929252625, "learning_rate": 5.652362974065816e-06, "loss": 0.0327, "step": 14570 }, { "epoch": 56.07692307692308, "grad_norm": 0.8096701502799988, "learning_rate": 5.6329785315217726e-06, "loss": 0.0316, "step": 14580 }, { "epoch": 56.11538461538461, "grad_norm": 0.6411515474319458, "learning_rate": 5.6136196980567495e-06, "loss": 0.0287, "step": 14590 }, { "epoch": 56.15384615384615, "grad_norm": 0.7341135144233704, "learning_rate": 5.594286526597054e-06, "loss": 0.0303, "step": 14600 }, { "epoch": 56.19230769230769, "grad_norm": 0.7750659584999084, "learning_rate": 5.574979069998833e-06, "loss": 0.0317, "step": 14610 }, { "epoch": 56.23076923076923, "grad_norm": 0.962619423866272, "learning_rate": 5.5556973810479486e-06, "loss": 0.0295, "step": 14620 }, { "epoch": 56.26923076923077, "grad_norm": 0.8387047648429871, "learning_rate": 5.536441512459787e-06, "loss": 0.0357, "step": 14630 }, { "epoch": 56.30769230769231, "grad_norm": 0.8797601461410522, "learning_rate": 5.517211516879172e-06, "loss": 0.0321, "step": 14640 }, { "epoch": 56.34615384615385, "grad_norm": 0.8170955181121826, "learning_rate": 5.49800744688017e-06, "loss": 0.0304, "step": 14650 }, { "epoch": 56.38461538461539, "grad_norm": 0.7207375168800354, "learning_rate": 5.4788293549659694e-06, "loss": 0.0291, "step": 14660 }, { "epoch": 56.42307692307692, "grad_norm": 0.6312052011489868, "learning_rate": 5.459677293568753e-06, "loss": 0.0279, "step": 14670 }, { "epoch": 56.46153846153846, "grad_norm": 0.8333424925804138, "learning_rate": 5.440551315049515e-06, "loss": 0.0309, "step": 14680 }, { "epoch": 56.5, "grad_norm": 0.8187565803527832, "learning_rate": 5.421451471697966e-06, "loss": 0.0281, "step": 14690 }, { "epoch": 56.53846153846154, "grad_norm": 0.877469539642334, "learning_rate": 5.402377815732326e-06, "loss": 0.0308, "step": 14700 }, { "epoch": 56.57692307692308, "grad_norm": 0.7964321374893188, "learning_rate": 5.383330399299253e-06, "loss": 0.0409, "step": 14710 }, { "epoch": 56.61538461538461, "grad_norm": 0.863561749458313, "learning_rate": 5.364309274473663e-06, "loss": 0.0341, "step": 14720 }, { "epoch": 56.65384615384615, "grad_norm": 0.9740127921104431, "learning_rate": 5.345314493258573e-06, "loss": 0.0312, "step": 14730 }, { "epoch": 56.69230769230769, "grad_norm": 1.0954939126968384, "learning_rate": 5.326346107585e-06, "loss": 0.0384, "step": 14740 }, { "epoch": 56.73076923076923, "grad_norm": 0.9019908905029297, "learning_rate": 5.307404169311782e-06, "loss": 0.0328, "step": 14750 }, { "epoch": 56.76923076923077, "grad_norm": 0.773690402507782, "learning_rate": 5.288488730225449e-06, "loss": 0.0302, "step": 14760 }, { "epoch": 56.80769230769231, "grad_norm": 1.0760575532913208, "learning_rate": 5.2695998420401e-06, "loss": 0.0372, "step": 14770 }, { "epoch": 56.84615384615385, "grad_norm": 0.898847758769989, "learning_rate": 5.2507375563972236e-06, "loss": 0.0307, "step": 14780 }, { "epoch": 56.88461538461539, "grad_norm": 0.8067259192466736, "learning_rate": 5.231901924865596e-06, "loss": 0.0325, "step": 14790 }, { "epoch": 56.92307692307692, "grad_norm": 0.8250156044960022, "learning_rate": 5.213092998941113e-06, "loss": 0.0248, "step": 14800 }, { "epoch": 56.96153846153846, "grad_norm": 0.8855283260345459, "learning_rate": 5.1943108300466555e-06, "loss": 0.0288, "step": 14810 }, { "epoch": 57.0, "grad_norm": 0.9341421127319336, "learning_rate": 5.175555469531964e-06, "loss": 0.0303, "step": 14820 }, { "epoch": 57.03846153846154, "grad_norm": 0.8757314085960388, "learning_rate": 5.1568269686734716e-06, "loss": 0.0316, "step": 14830 }, { "epoch": 57.07692307692308, "grad_norm": 0.816425621509552, "learning_rate": 5.138125378674182e-06, "loss": 0.0298, "step": 14840 }, { "epoch": 57.11538461538461, "grad_norm": 0.9273233413696289, "learning_rate": 5.119450750663539e-06, "loss": 0.0289, "step": 14850 }, { "epoch": 57.15384615384615, "grad_norm": 0.8298301100730896, "learning_rate": 5.100803135697248e-06, "loss": 0.0305, "step": 14860 }, { "epoch": 57.19230769230769, "grad_norm": 0.8186129927635193, "learning_rate": 5.0821825847571904e-06, "loss": 0.0309, "step": 14870 }, { "epoch": 57.23076923076923, "grad_norm": 0.7907426953315735, "learning_rate": 5.063589148751236e-06, "loss": 0.0313, "step": 14880 }, { "epoch": 57.26923076923077, "grad_norm": 0.7023813724517822, "learning_rate": 5.045022878513122e-06, "loss": 0.0286, "step": 14890 }, { "epoch": 57.30769230769231, "grad_norm": 0.9657378196716309, "learning_rate": 5.026483824802333e-06, "loss": 0.0279, "step": 14900 }, { "epoch": 57.34615384615385, "grad_norm": 0.830165445804596, "learning_rate": 5.0079720383039245e-06, "loss": 0.0281, "step": 14910 }, { "epoch": 57.38461538461539, "grad_norm": 0.8461522459983826, "learning_rate": 4.989487569628425e-06, "loss": 0.0383, "step": 14920 }, { "epoch": 57.42307692307692, "grad_norm": 0.7930967807769775, "learning_rate": 4.971030469311658e-06, "loss": 0.0285, "step": 14930 }, { "epoch": 57.46153846153846, "grad_norm": 0.9181236028671265, "learning_rate": 4.952600787814628e-06, "loss": 0.0328, "step": 14940 }, { "epoch": 57.5, "grad_norm": 0.9219228029251099, "learning_rate": 4.934198575523391e-06, "loss": 0.0304, "step": 14950 }, { "epoch": 57.53846153846154, "grad_norm": 0.8909518122673035, "learning_rate": 4.915823882748882e-06, "loss": 0.0265, "step": 14960 }, { "epoch": 57.57692307692308, "grad_norm": 0.6028658151626587, "learning_rate": 4.897476759726823e-06, "loss": 0.0268, "step": 14970 }, { "epoch": 57.61538461538461, "grad_norm": 0.6926416754722595, "learning_rate": 4.87915725661753e-06, "loss": 0.0279, "step": 14980 }, { "epoch": 57.65384615384615, "grad_norm": 0.9338682889938354, "learning_rate": 4.860865423505833e-06, "loss": 0.0349, "step": 14990 }, { "epoch": 57.69230769230769, "grad_norm": 1.1470394134521484, "learning_rate": 4.842601310400912e-06, "loss": 0.029, "step": 15000 }, { "epoch": 57.73076923076923, "grad_norm": 0.8884938955307007, "learning_rate": 4.824364967236145e-06, "loss": 0.0303, "step": 15010 }, { "epoch": 57.76923076923077, "grad_norm": 0.7509388327598572, "learning_rate": 4.8061564438690095e-06, "loss": 0.0268, "step": 15020 }, { "epoch": 57.80769230769231, "grad_norm": 0.8904589414596558, "learning_rate": 4.787975790080896e-06, "loss": 0.0332, "step": 15030 }, { "epoch": 57.84615384615385, "grad_norm": 0.8939962387084961, "learning_rate": 4.769823055577029e-06, "loss": 0.0319, "step": 15040 }, { "epoch": 57.88461538461539, "grad_norm": 0.8094428181648254, "learning_rate": 4.7516982899862934e-06, "loss": 0.0309, "step": 15050 }, { "epoch": 57.92307692307692, "grad_norm": 0.804418683052063, "learning_rate": 4.733601542861098e-06, "loss": 0.0291, "step": 15060 }, { "epoch": 57.96153846153846, "grad_norm": 0.782423198223114, "learning_rate": 4.7155328636772735e-06, "loss": 0.0313, "step": 15070 }, { "epoch": 58.0, "grad_norm": 0.8305570483207703, "learning_rate": 4.697492301833878e-06, "loss": 0.0272, "step": 15080 }, { "epoch": 58.03846153846154, "grad_norm": 0.6236631870269775, "learning_rate": 4.679479906653128e-06, "loss": 0.0312, "step": 15090 }, { "epoch": 58.07692307692308, "grad_norm": 0.8579105138778687, "learning_rate": 4.661495727380232e-06, "loss": 0.0287, "step": 15100 }, { "epoch": 58.11538461538461, "grad_norm": 0.8176283836364746, "learning_rate": 4.64353981318324e-06, "loss": 0.0367, "step": 15110 }, { "epoch": 58.15384615384615, "grad_norm": 0.7548969984054565, "learning_rate": 4.62561221315294e-06, "loss": 0.0357, "step": 15120 }, { "epoch": 58.19230769230769, "grad_norm": 0.9920035004615784, "learning_rate": 4.6077129763026995e-06, "loss": 0.0318, "step": 15130 }, { "epoch": 58.23076923076923, "grad_norm": 0.7022545337677002, "learning_rate": 4.589842151568354e-06, "loss": 0.027, "step": 15140 }, { "epoch": 58.26923076923077, "grad_norm": 0.8829279541969299, "learning_rate": 4.571999787808057e-06, "loss": 0.0285, "step": 15150 }, { "epoch": 58.30769230769231, "grad_norm": 0.7242911458015442, "learning_rate": 4.554185933802151e-06, "loss": 0.0284, "step": 15160 }, { "epoch": 58.34615384615385, "grad_norm": 0.8165299296379089, "learning_rate": 4.5364006382530285e-06, "loss": 0.0289, "step": 15170 }, { "epoch": 58.38461538461539, "grad_norm": 0.7144998908042908, "learning_rate": 4.518643949785004e-06, "loss": 0.028, "step": 15180 }, { "epoch": 58.42307692307692, "grad_norm": 0.7504596710205078, "learning_rate": 4.500915916944193e-06, "loss": 0.0299, "step": 15190 }, { "epoch": 58.46153846153846, "grad_norm": 0.8009552359580994, "learning_rate": 4.483216588198366e-06, "loss": 0.033, "step": 15200 }, { "epoch": 58.5, "grad_norm": 0.7826159596443176, "learning_rate": 4.465546011936797e-06, "loss": 0.0278, "step": 15210 }, { "epoch": 58.53846153846154, "grad_norm": 0.6764227151870728, "learning_rate": 4.447904236470177e-06, "loss": 0.0287, "step": 15220 }, { "epoch": 58.57692307692308, "grad_norm": 0.7171785235404968, "learning_rate": 4.43029131003044e-06, "loss": 0.0291, "step": 15230 }, { "epoch": 58.61538461538461, "grad_norm": 0.7673807740211487, "learning_rate": 4.412707280770658e-06, "loss": 0.0314, "step": 15240 }, { "epoch": 58.65384615384615, "grad_norm": 0.82426917552948, "learning_rate": 4.395152196764905e-06, "loss": 0.0342, "step": 15250 }, { "epoch": 58.69230769230769, "grad_norm": 0.7604185938835144, "learning_rate": 4.3776261060080916e-06, "loss": 0.0299, "step": 15260 }, { "epoch": 58.73076923076923, "grad_norm": 0.7163047194480896, "learning_rate": 4.360129056415895e-06, "loss": 0.0329, "step": 15270 }, { "epoch": 58.76923076923077, "grad_norm": 0.7846920490264893, "learning_rate": 4.34266109582457e-06, "loss": 0.0289, "step": 15280 }, { "epoch": 58.80769230769231, "grad_norm": 0.7941290140151978, "learning_rate": 4.325222271990861e-06, "loss": 0.0327, "step": 15290 }, { "epoch": 58.84615384615385, "grad_norm": 0.7879597544670105, "learning_rate": 4.307812632591853e-06, "loss": 0.0296, "step": 15300 }, { "epoch": 58.88461538461539, "grad_norm": 0.7094414830207825, "learning_rate": 4.2904322252248186e-06, "loss": 0.0332, "step": 15310 }, { "epoch": 58.92307692307692, "grad_norm": 0.7299686670303345, "learning_rate": 4.273081097407142e-06, "loss": 0.0374, "step": 15320 }, { "epoch": 58.96153846153846, "grad_norm": 0.8516938090324402, "learning_rate": 4.255759296576133e-06, "loss": 0.0335, "step": 15330 }, { "epoch": 59.0, "grad_norm": 0.8119964599609375, "learning_rate": 4.238466870088945e-06, "loss": 0.0305, "step": 15340 }, { "epoch": 59.03846153846154, "grad_norm": 0.7987461686134338, "learning_rate": 4.221203865222405e-06, "loss": 0.0291, "step": 15350 }, { "epoch": 59.07692307692308, "grad_norm": 0.8725585341453552, "learning_rate": 4.203970329172907e-06, "loss": 0.0287, "step": 15360 }, { "epoch": 59.11538461538461, "grad_norm": 0.7841514348983765, "learning_rate": 4.186766309056286e-06, "loss": 0.0274, "step": 15370 }, { "epoch": 59.15384615384615, "grad_norm": 0.7789570689201355, "learning_rate": 4.16959185190767e-06, "loss": 0.0292, "step": 15380 }, { "epoch": 59.19230769230769, "grad_norm": 0.8352637887001038, "learning_rate": 4.152447004681379e-06, "loss": 0.0364, "step": 15390 }, { "epoch": 59.23076923076923, "grad_norm": 0.6365222334861755, "learning_rate": 4.135331814250764e-06, "loss": 0.0346, "step": 15400 }, { "epoch": 59.26923076923077, "grad_norm": 0.6212939023971558, "learning_rate": 4.118246327408095e-06, "loss": 0.0278, "step": 15410 }, { "epoch": 59.30769230769231, "grad_norm": 0.7841902375221252, "learning_rate": 4.101190590864457e-06, "loss": 0.0367, "step": 15420 }, { "epoch": 59.34615384615385, "grad_norm": 0.6365123987197876, "learning_rate": 4.084164651249566e-06, "loss": 0.0275, "step": 15430 }, { "epoch": 59.38461538461539, "grad_norm": 0.7116932272911072, "learning_rate": 4.0671685551117035e-06, "loss": 0.0275, "step": 15440 }, { "epoch": 59.42307692307692, "grad_norm": 0.7276926040649414, "learning_rate": 4.050202348917544e-06, "loss": 0.0304, "step": 15450 }, { "epoch": 59.46153846153846, "grad_norm": 0.8002246022224426, "learning_rate": 4.033266079052039e-06, "loss": 0.0296, "step": 15460 }, { "epoch": 59.5, "grad_norm": 0.7436348795890808, "learning_rate": 4.016359791818314e-06, "loss": 0.0306, "step": 15470 }, { "epoch": 59.53846153846154, "grad_norm": 0.9387431740760803, "learning_rate": 3.999483533437511e-06, "loss": 0.0275, "step": 15480 }, { "epoch": 59.57692307692308, "grad_norm": 0.8010867834091187, "learning_rate": 3.982637350048669e-06, "loss": 0.0337, "step": 15490 }, { "epoch": 59.61538461538461, "grad_norm": 0.8753929734230042, "learning_rate": 3.965821287708619e-06, "loss": 0.0289, "step": 15500 }, { "epoch": 59.65384615384615, "grad_norm": 0.7646387219429016, "learning_rate": 3.949035392391825e-06, "loss": 0.0281, "step": 15510 }, { "epoch": 59.69230769230769, "grad_norm": 0.8426647782325745, "learning_rate": 3.932279709990293e-06, "loss": 0.0312, "step": 15520 }, { "epoch": 59.73076923076923, "grad_norm": 0.9022755026817322, "learning_rate": 3.915554286313413e-06, "loss": 0.037, "step": 15530 }, { "epoch": 59.76923076923077, "grad_norm": 0.7830395102500916, "learning_rate": 3.898859167087853e-06, "loss": 0.0346, "step": 15540 }, { "epoch": 59.80769230769231, "grad_norm": 0.7000887393951416, "learning_rate": 3.882194397957437e-06, "loss": 0.0309, "step": 15550 }, { "epoch": 59.84615384615385, "grad_norm": 0.9535983204841614, "learning_rate": 3.865560024483002e-06, "loss": 0.0285, "step": 15560 }, { "epoch": 59.88461538461539, "grad_norm": 0.6389148235321045, "learning_rate": 3.848956092142294e-06, "loss": 0.027, "step": 15570 }, { "epoch": 59.92307692307692, "grad_norm": 0.6393333077430725, "learning_rate": 3.832382646329831e-06, "loss": 0.0298, "step": 15580 }, { "epoch": 59.96153846153846, "grad_norm": 0.7785714864730835, "learning_rate": 3.8158397323567725e-06, "loss": 0.0314, "step": 15590 }, { "epoch": 60.0, "grad_norm": 0.6326451897621155, "learning_rate": 3.7993273954508262e-06, "loss": 0.0284, "step": 15600 }, { "epoch": 60.03846153846154, "grad_norm": 0.6587042212486267, "learning_rate": 3.782845680756078e-06, "loss": 0.0302, "step": 15610 }, { "epoch": 60.07692307692308, "grad_norm": 0.5870890021324158, "learning_rate": 3.7663946333329186e-06, "loss": 0.0287, "step": 15620 }, { "epoch": 60.11538461538461, "grad_norm": 0.7662439942359924, "learning_rate": 3.7499742981578753e-06, "loss": 0.0248, "step": 15630 }, { "epoch": 60.15384615384615, "grad_norm": 0.7695785760879517, "learning_rate": 3.7335847201235166e-06, "loss": 0.0289, "step": 15640 }, { "epoch": 60.19230769230769, "grad_norm": 0.6776568293571472, "learning_rate": 3.717225944038331e-06, "loss": 0.0326, "step": 15650 }, { "epoch": 60.23076923076923, "grad_norm": 0.5910949110984802, "learning_rate": 3.7008980146265776e-06, "loss": 0.0292, "step": 15660 }, { "epoch": 60.26923076923077, "grad_norm": 0.7144111394882202, "learning_rate": 3.6846009765282013e-06, "loss": 0.0296, "step": 15670 }, { "epoch": 60.30769230769231, "grad_norm": 0.8671786785125732, "learning_rate": 3.6683348742986784e-06, "loss": 0.0288, "step": 15680 }, { "epoch": 60.34615384615385, "grad_norm": 0.7827692627906799, "learning_rate": 3.6520997524089057e-06, "loss": 0.0309, "step": 15690 }, { "epoch": 60.38461538461539, "grad_norm": 0.7179871797561646, "learning_rate": 3.635895655245096e-06, "loss": 0.0281, "step": 15700 }, { "epoch": 60.42307692307692, "grad_norm": 0.6734188199043274, "learning_rate": 3.619722627108624e-06, "loss": 0.0304, "step": 15710 }, { "epoch": 60.46153846153846, "grad_norm": 0.8591067790985107, "learning_rate": 3.603580712215937e-06, "loss": 0.0306, "step": 15720 }, { "epoch": 60.5, "grad_norm": 0.7745352983474731, "learning_rate": 3.587469954698413e-06, "loss": 0.0274, "step": 15730 }, { "epoch": 60.53846153846154, "grad_norm": 0.680931568145752, "learning_rate": 3.5713903986022425e-06, "loss": 0.0313, "step": 15740 }, { "epoch": 60.57692307692308, "grad_norm": 0.7778944969177246, "learning_rate": 3.555342087888326e-06, "loss": 0.0291, "step": 15750 }, { "epoch": 60.61538461538461, "grad_norm": 0.6938156485557556, "learning_rate": 3.539325066432127e-06, "loss": 0.031, "step": 15760 }, { "epoch": 60.65384615384615, "grad_norm": 0.8834248185157776, "learning_rate": 3.523339378023569e-06, "loss": 0.0329, "step": 15770 }, { "epoch": 60.69230769230769, "grad_norm": 0.716575026512146, "learning_rate": 3.5073850663669193e-06, "loss": 0.0327, "step": 15780 }, { "epoch": 60.73076923076923, "grad_norm": 0.6148329973220825, "learning_rate": 3.4914621750806503e-06, "loss": 0.0308, "step": 15790 }, { "epoch": 60.76923076923077, "grad_norm": 0.745103657245636, "learning_rate": 3.475570747697346e-06, "loss": 0.0307, "step": 15800 }, { "epoch": 60.80769230769231, "grad_norm": 0.824196457862854, "learning_rate": 3.4597108276635577e-06, "loss": 0.0285, "step": 15810 }, { "epoch": 60.84615384615385, "grad_norm": 0.7248411774635315, "learning_rate": 3.443882458339699e-06, "loss": 0.0338, "step": 15820 }, { "epoch": 60.88461538461539, "grad_norm": 0.7202903032302856, "learning_rate": 3.4280856829999323e-06, "loss": 0.0302, "step": 15830 }, { "epoch": 60.92307692307692, "grad_norm": 0.8248746991157532, "learning_rate": 3.412320544832033e-06, "loss": 0.0273, "step": 15840 }, { "epoch": 60.96153846153846, "grad_norm": 0.6239380240440369, "learning_rate": 3.396587086937294e-06, "loss": 0.0315, "step": 15850 }, { "epoch": 61.0, "grad_norm": 0.8295191526412964, "learning_rate": 3.380885352330383e-06, "loss": 0.0274, "step": 15860 }, { "epoch": 61.03846153846154, "grad_norm": 0.7094337344169617, "learning_rate": 3.3652153839392414e-06, "loss": 0.0283, "step": 15870 }, { "epoch": 61.07692307692308, "grad_norm": 0.5938734412193298, "learning_rate": 3.34957722460497e-06, "loss": 0.0358, "step": 15880 }, { "epoch": 61.11538461538461, "grad_norm": 0.6732869148254395, "learning_rate": 3.333970917081691e-06, "loss": 0.0276, "step": 15890 }, { "epoch": 61.15384615384615, "grad_norm": 0.6122872233390808, "learning_rate": 3.318396504036465e-06, "loss": 0.0328, "step": 15900 }, { "epoch": 61.19230769230769, "grad_norm": 0.7387085556983948, "learning_rate": 3.302854028049134e-06, "loss": 0.0345, "step": 15910 }, { "epoch": 61.23076923076923, "grad_norm": 0.6732428073883057, "learning_rate": 3.287343531612233e-06, "loss": 0.0304, "step": 15920 }, { "epoch": 61.26923076923077, "grad_norm": 0.7445382475852966, "learning_rate": 3.271865057130874e-06, "loss": 0.0394, "step": 15930 }, { "epoch": 61.30769230769231, "grad_norm": 0.8848334550857544, "learning_rate": 3.2564186469226064e-06, "loss": 0.0282, "step": 15940 }, { "epoch": 61.34615384615385, "grad_norm": 0.7744347453117371, "learning_rate": 3.2410043432173353e-06, "loss": 0.0303, "step": 15950 }, { "epoch": 61.38461538461539, "grad_norm": 0.7968832850456238, "learning_rate": 3.2256221881571734e-06, "loss": 0.0347, "step": 15960 }, { "epoch": 61.42307692307692, "grad_norm": 0.8091844916343689, "learning_rate": 3.210272223796341e-06, "loss": 0.0307, "step": 15970 }, { "epoch": 61.46153846153846, "grad_norm": 0.7867940664291382, "learning_rate": 3.1949544921010637e-06, "loss": 0.0302, "step": 15980 }, { "epoch": 61.5, "grad_norm": 0.654680609703064, "learning_rate": 3.1796690349494273e-06, "loss": 0.0283, "step": 15990 }, { "epoch": 61.53846153846154, "grad_norm": 0.6281628608703613, "learning_rate": 3.1644158941312935e-06, "loss": 0.028, "step": 16000 }, { "epoch": 61.57692307692308, "grad_norm": 0.5741403102874756, "learning_rate": 3.149195111348166e-06, "loss": 0.028, "step": 16010 }, { "epoch": 61.61538461538461, "grad_norm": 0.717573881149292, "learning_rate": 3.1340067282130765e-06, "loss": 0.0252, "step": 16020 }, { "epoch": 61.65384615384615, "grad_norm": 0.6507997512817383, "learning_rate": 3.118850786250495e-06, "loss": 0.0271, "step": 16030 }, { "epoch": 61.69230769230769, "grad_norm": 0.7002608776092529, "learning_rate": 3.1037273268961836e-06, "loss": 0.0319, "step": 16040 }, { "epoch": 61.73076923076923, "grad_norm": 0.5722702145576477, "learning_rate": 3.0886363914970994e-06, "loss": 0.0316, "step": 16050 }, { "epoch": 61.76923076923077, "grad_norm": 0.653465986251831, "learning_rate": 3.0735780213112896e-06, "loss": 0.0269, "step": 16060 }, { "epoch": 61.80769230769231, "grad_norm": 0.846031904220581, "learning_rate": 3.0585522575077558e-06, "loss": 0.0305, "step": 16070 }, { "epoch": 61.84615384615385, "grad_norm": 0.6906000375747681, "learning_rate": 3.043559141166372e-06, "loss": 0.0331, "step": 16080 }, { "epoch": 61.88461538461539, "grad_norm": 0.7020779252052307, "learning_rate": 3.02859871327774e-06, "loss": 0.03, "step": 16090 }, { "epoch": 61.92307692307692, "grad_norm": 0.6151683330535889, "learning_rate": 3.0136710147430945e-06, "loss": 0.029, "step": 16100 }, { "epoch": 61.96153846153846, "grad_norm": 0.6461793780326843, "learning_rate": 2.998776086374202e-06, "loss": 0.0286, "step": 16110 }, { "epoch": 62.0, "grad_norm": 0.6847963929176331, "learning_rate": 2.983913968893221e-06, "loss": 0.0304, "step": 16120 }, { "epoch": 62.03846153846154, "grad_norm": 0.6221253871917725, "learning_rate": 2.969084702932619e-06, "loss": 0.0285, "step": 16130 }, { "epoch": 62.07692307692308, "grad_norm": 0.6729203462600708, "learning_rate": 2.95428832903504e-06, "loss": 0.032, "step": 16140 }, { "epoch": 62.11538461538461, "grad_norm": 0.6860134601593018, "learning_rate": 2.939524887653201e-06, "loss": 0.029, "step": 16150 }, { "epoch": 62.15384615384615, "grad_norm": 0.7285041809082031, "learning_rate": 2.924794419149796e-06, "loss": 0.0329, "step": 16160 }, { "epoch": 62.19230769230769, "grad_norm": 0.7282090783119202, "learning_rate": 2.910096963797354e-06, "loss": 0.0277, "step": 16170 }, { "epoch": 62.23076923076923, "grad_norm": 0.6759335994720459, "learning_rate": 2.895432561778164e-06, "loss": 0.0349, "step": 16180 }, { "epoch": 62.26923076923077, "grad_norm": 0.6853289604187012, "learning_rate": 2.8808012531841393e-06, "loss": 0.0297, "step": 16190 }, { "epoch": 62.30769230769231, "grad_norm": 0.7826913595199585, "learning_rate": 2.8662030780167138e-06, "loss": 0.0335, "step": 16200 }, { "epoch": 62.34615384615385, "grad_norm": 0.6961445212364197, "learning_rate": 2.85163807618675e-06, "loss": 0.0254, "step": 16210 }, { "epoch": 62.38461538461539, "grad_norm": 0.7920289635658264, "learning_rate": 2.837106287514397e-06, "loss": 0.0292, "step": 16220 }, { "epoch": 62.42307692307692, "grad_norm": 0.7149428129196167, "learning_rate": 2.822607751729018e-06, "loss": 0.0306, "step": 16230 }, { "epoch": 62.46153846153846, "grad_norm": 0.5912485718727112, "learning_rate": 2.808142508469054e-06, "loss": 0.0303, "step": 16240 }, { "epoch": 62.5, "grad_norm": 0.6906529068946838, "learning_rate": 2.7937105972819237e-06, "loss": 0.0289, "step": 16250 }, { "epoch": 62.53846153846154, "grad_norm": 0.6463748216629028, "learning_rate": 2.7793120576239285e-06, "loss": 0.0319, "step": 16260 }, { "epoch": 62.57692307692308, "grad_norm": 0.6151840686798096, "learning_rate": 2.7649469288601175e-06, "loss": 0.0292, "step": 16270 }, { "epoch": 62.61538461538461, "grad_norm": 0.613445520401001, "learning_rate": 2.7506152502642125e-06, "loss": 0.0275, "step": 16280 }, { "epoch": 62.65384615384615, "grad_norm": 0.6083470582962036, "learning_rate": 2.7363170610184716e-06, "loss": 0.0324, "step": 16290 }, { "epoch": 62.69230769230769, "grad_norm": 0.7266085147857666, "learning_rate": 2.722052400213595e-06, "loss": 0.03, "step": 16300 }, { "epoch": 62.73076923076923, "grad_norm": 0.8478745222091675, "learning_rate": 2.707821306848627e-06, "loss": 0.0314, "step": 16310 }, { "epoch": 62.76923076923077, "grad_norm": 0.7400423288345337, "learning_rate": 2.6936238198308318e-06, "loss": 0.0285, "step": 16320 }, { "epoch": 62.80769230769231, "grad_norm": 0.6447364687919617, "learning_rate": 2.67945997797559e-06, "loss": 0.0324, "step": 16330 }, { "epoch": 62.84615384615385, "grad_norm": 0.6548041701316833, "learning_rate": 2.665329820006314e-06, "loss": 0.0286, "step": 16340 }, { "epoch": 62.88461538461539, "grad_norm": 0.5898657441139221, "learning_rate": 2.6512333845543086e-06, "loss": 0.0274, "step": 16350 }, { "epoch": 62.92307692307692, "grad_norm": 0.6900472044944763, "learning_rate": 2.637170710158697e-06, "loss": 0.0317, "step": 16360 }, { "epoch": 62.96153846153846, "grad_norm": 0.7050812840461731, "learning_rate": 2.6231418352662895e-06, "loss": 0.03, "step": 16370 }, { "epoch": 63.0, "grad_norm": 0.6556162238121033, "learning_rate": 2.609146798231493e-06, "loss": 0.0332, "step": 16380 }, { "epoch": 63.03846153846154, "grad_norm": 0.5093095898628235, "learning_rate": 2.5951856373162097e-06, "loss": 0.0297, "step": 16390 }, { "epoch": 63.07692307692308, "grad_norm": 0.5848261117935181, "learning_rate": 2.581258390689712e-06, "loss": 0.0293, "step": 16400 }, { "epoch": 63.11538461538461, "grad_norm": 0.8164099454879761, "learning_rate": 2.5673650964285718e-06, "loss": 0.0329, "step": 16410 }, { "epoch": 63.15384615384615, "grad_norm": 0.8447076082229614, "learning_rate": 2.553505792516518e-06, "loss": 0.0268, "step": 16420 }, { "epoch": 63.19230769230769, "grad_norm": 0.5706382989883423, "learning_rate": 2.539680516844356e-06, "loss": 0.0251, "step": 16430 }, { "epoch": 63.23076923076923, "grad_norm": 0.7153502702713013, "learning_rate": 2.5258893072098678e-06, "loss": 0.0287, "step": 16440 }, { "epoch": 63.26923076923077, "grad_norm": 0.5721958875656128, "learning_rate": 2.512132201317688e-06, "loss": 0.0269, "step": 16450 }, { "epoch": 63.30769230769231, "grad_norm": 0.9266186356544495, "learning_rate": 2.4984092367792272e-06, "loss": 0.0306, "step": 16460 }, { "epoch": 63.34615384615385, "grad_norm": 0.683887243270874, "learning_rate": 2.484720451112536e-06, "loss": 0.0286, "step": 16470 }, { "epoch": 63.38461538461539, "grad_norm": 0.7552659511566162, "learning_rate": 2.471065881742236e-06, "loss": 0.0309, "step": 16480 }, { "epoch": 63.42307692307692, "grad_norm": 0.6858226656913757, "learning_rate": 2.4574455659994023e-06, "loss": 0.033, "step": 16490 }, { "epoch": 63.46153846153846, "grad_norm": 0.707927942276001, "learning_rate": 2.4438595411214528e-06, "loss": 0.0292, "step": 16500 }, { "epoch": 63.5, "grad_norm": 0.6420449614524841, "learning_rate": 2.430307844252069e-06, "loss": 0.0266, "step": 16510 }, { "epoch": 63.53846153846154, "grad_norm": 0.7699747681617737, "learning_rate": 2.4167905124410587e-06, "loss": 0.0282, "step": 16520 }, { "epoch": 63.57692307692308, "grad_norm": 0.5155813694000244, "learning_rate": 2.4033075826442995e-06, "loss": 0.0283, "step": 16530 }, { "epoch": 63.61538461538461, "grad_norm": 0.6115219593048096, "learning_rate": 2.389859091723608e-06, "loss": 0.0284, "step": 16540 }, { "epoch": 63.65384615384615, "grad_norm": 0.6672840714454651, "learning_rate": 2.376445076446641e-06, "loss": 0.031, "step": 16550 }, { "epoch": 63.69230769230769, "grad_norm": 0.6997214555740356, "learning_rate": 2.3630655734868117e-06, "loss": 0.0276, "step": 16560 }, { "epoch": 63.73076923076923, "grad_norm": 0.6843510270118713, "learning_rate": 2.349720619423158e-06, "loss": 0.0293, "step": 16570 }, { "epoch": 63.76923076923077, "grad_norm": 1.0205272436141968, "learning_rate": 2.3364102507402817e-06, "loss": 0.0334, "step": 16580 }, { "epoch": 63.80769230769231, "grad_norm": 0.6957825422286987, "learning_rate": 2.3231345038282243e-06, "loss": 0.0289, "step": 16590 }, { "epoch": 63.84615384615385, "grad_norm": 0.756861686706543, "learning_rate": 2.3098934149823686e-06, "loss": 0.0297, "step": 16600 }, { "epoch": 63.88461538461539, "grad_norm": 0.7331545948982239, "learning_rate": 2.296687020403346e-06, "loss": 0.031, "step": 16610 }, { "epoch": 63.92307692307692, "grad_norm": 0.655592679977417, "learning_rate": 2.2835153561969322e-06, "loss": 0.0311, "step": 16620 }, { "epoch": 63.96153846153846, "grad_norm": 0.6528260111808777, "learning_rate": 2.270378458373956e-06, "loss": 0.0277, "step": 16630 }, { "epoch": 64.0, "grad_norm": 0.5519979596138, "learning_rate": 2.257276362850199e-06, "loss": 0.0288, "step": 16640 }, { "epoch": 64.03846153846153, "grad_norm": 0.5662660598754883, "learning_rate": 2.244209105446286e-06, "loss": 0.0311, "step": 16650 }, { "epoch": 64.07692307692308, "grad_norm": 0.6251816749572754, "learning_rate": 2.2311767218875995e-06, "loss": 0.0291, "step": 16660 }, { "epoch": 64.11538461538461, "grad_norm": 0.7042757868766785, "learning_rate": 2.218179247804177e-06, "loss": 0.0305, "step": 16670 }, { "epoch": 64.15384615384616, "grad_norm": 0.6656632423400879, "learning_rate": 2.2052167187306167e-06, "loss": 0.0274, "step": 16680 }, { "epoch": 64.1923076923077, "grad_norm": 0.7627696990966797, "learning_rate": 2.192289170105989e-06, "loss": 0.029, "step": 16690 }, { "epoch": 64.23076923076923, "grad_norm": 0.7777524590492249, "learning_rate": 2.1793966372737003e-06, "loss": 0.034, "step": 16700 }, { "epoch": 64.26923076923077, "grad_norm": 0.6019226312637329, "learning_rate": 2.166539155481455e-06, "loss": 0.0252, "step": 16710 }, { "epoch": 64.3076923076923, "grad_norm": 0.7408103942871094, "learning_rate": 2.1537167598811118e-06, "loss": 0.0322, "step": 16720 }, { "epoch": 64.34615384615384, "grad_norm": 0.7054068446159363, "learning_rate": 2.140929485528612e-06, "loss": 0.0269, "step": 16730 }, { "epoch": 64.38461538461539, "grad_norm": 0.5298335552215576, "learning_rate": 2.1281773673838838e-06, "loss": 0.0246, "step": 16740 }, { "epoch": 64.42307692307692, "grad_norm": 0.6683807969093323, "learning_rate": 2.1154604403107175e-06, "loss": 0.0264, "step": 16750 }, { "epoch": 64.46153846153847, "grad_norm": 0.6651745438575745, "learning_rate": 2.102778739076715e-06, "loss": 0.0302, "step": 16760 }, { "epoch": 64.5, "grad_norm": 0.6783315539360046, "learning_rate": 2.0901322983531574e-06, "loss": 0.0327, "step": 16770 }, { "epoch": 64.53846153846153, "grad_norm": 0.63167804479599, "learning_rate": 2.0775211527149357e-06, "loss": 0.0274, "step": 16780 }, { "epoch": 64.57692307692308, "grad_norm": 0.7201244831085205, "learning_rate": 2.0649453366404438e-06, "loss": 0.0309, "step": 16790 }, { "epoch": 64.61538461538461, "grad_norm": 0.6224427819252014, "learning_rate": 2.052404884511472e-06, "loss": 0.0286, "step": 16800 }, { "epoch": 64.65384615384616, "grad_norm": 0.6468524932861328, "learning_rate": 2.039899830613145e-06, "loss": 0.0333, "step": 16810 }, { "epoch": 64.6923076923077, "grad_norm": 0.5740170478820801, "learning_rate": 2.0274302091337987e-06, "loss": 0.0301, "step": 16820 }, { "epoch": 64.73076923076923, "grad_norm": 0.6893084049224854, "learning_rate": 2.0149960541649076e-06, "loss": 0.0283, "step": 16830 }, { "epoch": 64.76923076923077, "grad_norm": 0.782490074634552, "learning_rate": 2.002597399700974e-06, "loss": 0.0299, "step": 16840 }, { "epoch": 64.8076923076923, "grad_norm": 0.6436896324157715, "learning_rate": 1.990234279639441e-06, "loss": 0.0297, "step": 16850 }, { "epoch": 64.84615384615384, "grad_norm": 0.5762495398521423, "learning_rate": 1.977906727780614e-06, "loss": 0.0281, "step": 16860 }, { "epoch": 64.88461538461539, "grad_norm": 0.5494686961174011, "learning_rate": 1.9656147778275423e-06, "loss": 0.0277, "step": 16870 }, { "epoch": 64.92307692307692, "grad_norm": 0.46614858508110046, "learning_rate": 1.953358463385954e-06, "loss": 0.0333, "step": 16880 }, { "epoch": 64.96153846153847, "grad_norm": 0.5312842130661011, "learning_rate": 1.9411378179641435e-06, "loss": 0.035, "step": 16890 }, { "epoch": 65.0, "grad_norm": 0.610146701335907, "learning_rate": 1.9289528749728834e-06, "loss": 0.0382, "step": 16900 }, { "epoch": 65.03846153846153, "grad_norm": 0.6586548686027527, "learning_rate": 1.916803667725351e-06, "loss": 0.0293, "step": 16910 }, { "epoch": 65.07692307692308, "grad_norm": 0.6412225961685181, "learning_rate": 1.9046902294370044e-06, "loss": 0.0288, "step": 16920 }, { "epoch": 65.11538461538461, "grad_norm": 0.6662336587905884, "learning_rate": 1.8926125932255328e-06, "loss": 0.0282, "step": 16930 }, { "epoch": 65.15384615384616, "grad_norm": 0.7287715673446655, "learning_rate": 1.8805707921107262e-06, "loss": 0.0297, "step": 16940 }, { "epoch": 65.1923076923077, "grad_norm": 0.6192464828491211, "learning_rate": 1.8685648590144066e-06, "loss": 0.0274, "step": 16950 }, { "epoch": 65.23076923076923, "grad_norm": 0.5435850620269775, "learning_rate": 1.8565948267603444e-06, "loss": 0.0331, "step": 16960 }, { "epoch": 65.26923076923077, "grad_norm": 0.4770941734313965, "learning_rate": 1.8446607280741435e-06, "loss": 0.0342, "step": 16970 }, { "epoch": 65.3076923076923, "grad_norm": 0.6695992350578308, "learning_rate": 1.8327625955831763e-06, "loss": 0.0272, "step": 16980 }, { "epoch": 65.34615384615384, "grad_norm": 0.6568714380264282, "learning_rate": 1.8209004618164837e-06, "loss": 0.0313, "step": 16990 }, { "epoch": 65.38461538461539, "grad_norm": 0.6583348512649536, "learning_rate": 1.8090743592046843e-06, "loss": 0.035, "step": 17000 }, { "epoch": 65.42307692307692, "grad_norm": 0.5825269222259521, "learning_rate": 1.7972843200798932e-06, "loss": 0.0283, "step": 17010 }, { "epoch": 65.46153846153847, "grad_norm": 0.6032260060310364, "learning_rate": 1.7855303766756316e-06, "loss": 0.0307, "step": 17020 }, { "epoch": 65.5, "grad_norm": 0.6155828833580017, "learning_rate": 1.7738125611267204e-06, "loss": 0.0295, "step": 17030 }, { "epoch": 65.53846153846153, "grad_norm": 0.5321171283721924, "learning_rate": 1.7621309054692302e-06, "loss": 0.0271, "step": 17040 }, { "epoch": 65.57692307692308, "grad_norm": 0.4425484836101532, "learning_rate": 1.7504854416403542e-06, "loss": 0.0289, "step": 17050 }, { "epoch": 65.61538461538461, "grad_norm": 0.5637415051460266, "learning_rate": 1.7388762014783493e-06, "loss": 0.0289, "step": 17060 }, { "epoch": 65.65384615384616, "grad_norm": 0.705307126045227, "learning_rate": 1.7273032167224418e-06, "loss": 0.0327, "step": 17070 }, { "epoch": 65.6923076923077, "grad_norm": 0.6594562530517578, "learning_rate": 1.7157665190127154e-06, "loss": 0.0331, "step": 17080 }, { "epoch": 65.73076923076923, "grad_norm": 0.5829503536224365, "learning_rate": 1.7042661398900733e-06, "loss": 0.0257, "step": 17090 }, { "epoch": 65.76923076923077, "grad_norm": 0.5575122833251953, "learning_rate": 1.692802110796105e-06, "loss": 0.0295, "step": 17100 }, { "epoch": 65.8076923076923, "grad_norm": 0.8589024543762207, "learning_rate": 1.6813744630730343e-06, "loss": 0.0331, "step": 17110 }, { "epoch": 65.84615384615384, "grad_norm": 0.7978380918502808, "learning_rate": 1.6699832279636113e-06, "loss": 0.0317, "step": 17120 }, { "epoch": 65.88461538461539, "grad_norm": 0.6307493448257446, "learning_rate": 1.6586284366110355e-06, "loss": 0.0366, "step": 17130 }, { "epoch": 65.92307692307692, "grad_norm": 0.49484503269195557, "learning_rate": 1.647310120058878e-06, "loss": 0.034, "step": 17140 }, { "epoch": 65.96153846153847, "grad_norm": 0.7164108157157898, "learning_rate": 1.6360283092509765e-06, "loss": 0.0295, "step": 17150 }, { "epoch": 66.0, "grad_norm": 0.730033278465271, "learning_rate": 1.6247830350313797e-06, "loss": 0.0313, "step": 17160 }, { "epoch": 66.03846153846153, "grad_norm": 0.6059457659721375, "learning_rate": 1.6135743281442333e-06, "loss": 0.0285, "step": 17170 }, { "epoch": 66.07692307692308, "grad_norm": 0.7369277477264404, "learning_rate": 1.6024022192337112e-06, "loss": 0.0297, "step": 17180 }, { "epoch": 66.11538461538461, "grad_norm": 0.7592663764953613, "learning_rate": 1.591266738843939e-06, "loss": 0.0279, "step": 17190 }, { "epoch": 66.15384615384616, "grad_norm": 0.7213373780250549, "learning_rate": 1.5801679174188888e-06, "loss": 0.0297, "step": 17200 }, { "epoch": 66.1923076923077, "grad_norm": 0.5568493604660034, "learning_rate": 1.5691057853023199e-06, "loss": 0.0316, "step": 17210 }, { "epoch": 66.23076923076923, "grad_norm": 0.6144545674324036, "learning_rate": 1.5580803727376786e-06, "loss": 0.0279, "step": 17220 }, { "epoch": 66.26923076923077, "grad_norm": 0.5706826448440552, "learning_rate": 1.5470917098680142e-06, "loss": 0.028, "step": 17230 }, { "epoch": 66.3076923076923, "grad_norm": 0.5628615617752075, "learning_rate": 1.5361398267359205e-06, "loss": 0.0294, "step": 17240 }, { "epoch": 66.34615384615384, "grad_norm": 0.6618927121162415, "learning_rate": 1.5252247532834246e-06, "loss": 0.032, "step": 17250 }, { "epoch": 66.38461538461539, "grad_norm": 0.6899990439414978, "learning_rate": 1.5143465193519173e-06, "loss": 0.0297, "step": 17260 }, { "epoch": 66.42307692307692, "grad_norm": 0.6864995956420898, "learning_rate": 1.5035051546820821e-06, "loss": 0.0301, "step": 17270 }, { "epoch": 66.46153846153847, "grad_norm": 0.7423065900802612, "learning_rate": 1.4927006889137862e-06, "loss": 0.0319, "step": 17280 }, { "epoch": 66.5, "grad_norm": 0.634519100189209, "learning_rate": 1.4819331515860357e-06, "loss": 0.0282, "step": 17290 }, { "epoch": 66.53846153846153, "grad_norm": 0.6282957792282104, "learning_rate": 1.4712025721368644e-06, "loss": 0.0327, "step": 17300 }, { "epoch": 66.57692307692308, "grad_norm": 0.5965334177017212, "learning_rate": 1.46050897990326e-06, "loss": 0.0318, "step": 17310 }, { "epoch": 66.61538461538461, "grad_norm": 0.5001717209815979, "learning_rate": 1.449852404121103e-06, "loss": 0.0264, "step": 17320 }, { "epoch": 66.65384615384616, "grad_norm": 0.6770375967025757, "learning_rate": 1.4392328739250615e-06, "loss": 0.0315, "step": 17330 }, { "epoch": 66.6923076923077, "grad_norm": 0.5028340816497803, "learning_rate": 1.4286504183485277e-06, "loss": 0.0308, "step": 17340 }, { "epoch": 66.73076923076923, "grad_norm": 0.7518753409385681, "learning_rate": 1.4181050663235284e-06, "loss": 0.0321, "step": 17350 }, { "epoch": 66.76923076923077, "grad_norm": 0.5301283001899719, "learning_rate": 1.4075968466806533e-06, "loss": 0.0302, "step": 17360 }, { "epoch": 66.8076923076923, "grad_norm": 0.5376285314559937, "learning_rate": 1.3971257881489762e-06, "loss": 0.028, "step": 17370 }, { "epoch": 66.84615384615384, "grad_norm": 0.5453165769577026, "learning_rate": 1.386691919355968e-06, "loss": 0.0257, "step": 17380 }, { "epoch": 66.88461538461539, "grad_norm": 0.526111364364624, "learning_rate": 1.3762952688274316e-06, "loss": 0.0257, "step": 17390 }, { "epoch": 66.92307692307692, "grad_norm": 0.587806224822998, "learning_rate": 1.3659358649874104e-06, "loss": 0.026, "step": 17400 }, { "epoch": 66.96153846153847, "grad_norm": 0.6474411487579346, "learning_rate": 1.3556137361581155e-06, "loss": 0.0298, "step": 17410 }, { "epoch": 67.0, "grad_norm": 0.7550439834594727, "learning_rate": 1.3453289105598616e-06, "loss": 0.0321, "step": 17420 }, { "epoch": 67.03846153846153, "grad_norm": 0.6548799276351929, "learning_rate": 1.3350814163109592e-06, "loss": 0.0336, "step": 17430 }, { "epoch": 67.07692307692308, "grad_norm": 0.5987898111343384, "learning_rate": 1.3248712814276732e-06, "loss": 0.0264, "step": 17440 }, { "epoch": 67.11538461538461, "grad_norm": 0.5832239389419556, "learning_rate": 1.3146985338241207e-06, "loss": 0.0284, "step": 17450 }, { "epoch": 67.15384615384616, "grad_norm": 0.6605148315429688, "learning_rate": 1.3045632013122032e-06, "loss": 0.0296, "step": 17460 }, { "epoch": 67.1923076923077, "grad_norm": 0.6073583364486694, "learning_rate": 1.294465311601537e-06, "loss": 0.0287, "step": 17470 }, { "epoch": 67.23076923076923, "grad_norm": 0.5388942956924438, "learning_rate": 1.2844048922993602e-06, "loss": 0.0314, "step": 17480 }, { "epoch": 67.26923076923077, "grad_norm": 0.48449668288230896, "learning_rate": 1.2743819709104826e-06, "loss": 0.0283, "step": 17490 }, { "epoch": 67.3076923076923, "grad_norm": 0.6949400305747986, "learning_rate": 1.264396574837185e-06, "loss": 0.024, "step": 17500 }, { "epoch": 67.34615384615384, "grad_norm": 0.5057098865509033, "learning_rate": 1.2544487313791564e-06, "loss": 0.0258, "step": 17510 }, { "epoch": 67.38461538461539, "grad_norm": 0.6099366545677185, "learning_rate": 1.2445384677334282e-06, "loss": 0.0295, "step": 17520 }, { "epoch": 67.42307692307692, "grad_norm": 0.7301434874534607, "learning_rate": 1.2346658109942755e-06, "loss": 0.0316, "step": 17530 }, { "epoch": 67.46153846153847, "grad_norm": 0.5584269165992737, "learning_rate": 1.2248307881531656e-06, "loss": 0.0341, "step": 17540 }, { "epoch": 67.5, "grad_norm": 0.5226390361785889, "learning_rate": 1.2150334260986818e-06, "loss": 0.0333, "step": 17550 }, { "epoch": 67.53846153846153, "grad_norm": 0.7313436269760132, "learning_rate": 1.2052737516164292e-06, "loss": 0.0307, "step": 17560 }, { "epoch": 67.57692307692308, "grad_norm": 0.4775332808494568, "learning_rate": 1.1955517913889924e-06, "loss": 0.0291, "step": 17570 }, { "epoch": 67.61538461538461, "grad_norm": 0.6237944960594177, "learning_rate": 1.185867571995835e-06, "loss": 0.031, "step": 17580 }, { "epoch": 67.65384615384616, "grad_norm": 0.6194935441017151, "learning_rate": 1.1762211199132433e-06, "loss": 0.0262, "step": 17590 }, { "epoch": 67.6923076923077, "grad_norm": 0.5005141496658325, "learning_rate": 1.1666124615142525e-06, "loss": 0.0254, "step": 17600 }, { "epoch": 67.73076923076923, "grad_norm": 0.34853067994117737, "learning_rate": 1.1570416230685627e-06, "loss": 0.0255, "step": 17610 }, { "epoch": 67.76923076923077, "grad_norm": 0.5736085772514343, "learning_rate": 1.147508630742486e-06, "loss": 0.0277, "step": 17620 }, { "epoch": 67.8076923076923, "grad_norm": 0.5523407459259033, "learning_rate": 1.1380135105988576e-06, "loss": 0.0292, "step": 17630 }, { "epoch": 67.84615384615384, "grad_norm": 0.6663440465927124, "learning_rate": 1.128556288596969e-06, "loss": 0.0319, "step": 17640 }, { "epoch": 67.88461538461539, "grad_norm": 0.5403256416320801, "learning_rate": 1.1191369905925096e-06, "loss": 0.0278, "step": 17650 }, { "epoch": 67.92307692307692, "grad_norm": 0.5839226245880127, "learning_rate": 1.1097556423374765e-06, "loss": 0.0304, "step": 17660 }, { "epoch": 67.96153846153847, "grad_norm": 0.5336809754371643, "learning_rate": 1.1004122694801233e-06, "loss": 0.0369, "step": 17670 }, { "epoch": 68.0, "grad_norm": 0.38012614846229553, "learning_rate": 1.0911068975648697e-06, "loss": 0.0261, "step": 17680 }, { "epoch": 68.03846153846153, "grad_norm": 0.5304232239723206, "learning_rate": 1.0818395520322456e-06, "loss": 0.0264, "step": 17690 }, { "epoch": 68.07692307692308, "grad_norm": 0.5830591320991516, "learning_rate": 1.072610258218825e-06, "loss": 0.0369, "step": 17700 }, { "epoch": 68.11538461538461, "grad_norm": 0.5707952976226807, "learning_rate": 1.0634190413571415e-06, "loss": 0.0283, "step": 17710 }, { "epoch": 68.15384615384616, "grad_norm": 0.6779969334602356, "learning_rate": 1.0542659265756337e-06, "loss": 0.0306, "step": 17720 }, { "epoch": 68.1923076923077, "grad_norm": 0.5678604245185852, "learning_rate": 1.0451509388985663e-06, "loss": 0.0329, "step": 17730 }, { "epoch": 68.23076923076923, "grad_norm": 0.5494369864463806, "learning_rate": 1.0360741032459636e-06, "loss": 0.031, "step": 17740 }, { "epoch": 68.26923076923077, "grad_norm": 0.520933985710144, "learning_rate": 1.027035444433555e-06, "loss": 0.0262, "step": 17750 }, { "epoch": 68.3076923076923, "grad_norm": 0.49139437079429626, "learning_rate": 1.0180349871726819e-06, "loss": 0.0306, "step": 17760 }, { "epoch": 68.34615384615384, "grad_norm": 0.6331795454025269, "learning_rate": 1.0090727560702572e-06, "loss": 0.0325, "step": 17770 }, { "epoch": 68.38461538461539, "grad_norm": 0.4937455952167511, "learning_rate": 1.0001487756286748e-06, "loss": 0.0294, "step": 17780 }, { "epoch": 68.42307692307692, "grad_norm": 0.6751633882522583, "learning_rate": 9.912630702457548e-07, "loss": 0.0278, "step": 17790 }, { "epoch": 68.46153846153847, "grad_norm": 0.4086829423904419, "learning_rate": 9.824156642146798e-07, "loss": 0.0323, "step": 17800 }, { "epoch": 68.5, "grad_norm": 0.5629752278327942, "learning_rate": 9.736065817239192e-07, "loss": 0.0296, "step": 17810 }, { "epoch": 68.53846153846153, "grad_norm": 0.7791491746902466, "learning_rate": 9.648358468571667e-07, "loss": 0.0288, "step": 17820 }, { "epoch": 68.57692307692308, "grad_norm": 0.6236022114753723, "learning_rate": 9.561034835932774e-07, "loss": 0.0286, "step": 17830 }, { "epoch": 68.61538461538461, "grad_norm": 0.5426509380340576, "learning_rate": 9.474095158061996e-07, "loss": 0.0322, "step": 17840 }, { "epoch": 68.65384615384616, "grad_norm": 0.5536707639694214, "learning_rate": 9.387539672649082e-07, "loss": 0.0404, "step": 17850 }, { "epoch": 68.6923076923077, "grad_norm": 0.5977795720100403, "learning_rate": 9.301368616333456e-07, "loss": 0.0272, "step": 17860 }, { "epoch": 68.73076923076923, "grad_norm": 0.5283776521682739, "learning_rate": 9.215582224703417e-07, "loss": 0.0271, "step": 17870 }, { "epoch": 68.76923076923077, "grad_norm": 0.6370693445205688, "learning_rate": 9.13018073229579e-07, "loss": 0.0335, "step": 17880 }, { "epoch": 68.8076923076923, "grad_norm": 0.4692783057689667, "learning_rate": 9.045164372594889e-07, "loss": 0.0273, "step": 17890 }, { "epoch": 68.84615384615384, "grad_norm": 0.6345952153205872, "learning_rate": 8.960533378032288e-07, "loss": 0.0313, "step": 17900 }, { "epoch": 68.88461538461539, "grad_norm": 0.5102418661117554, "learning_rate": 8.876287979985853e-07, "loss": 0.0271, "step": 17910 }, { "epoch": 68.92307692307692, "grad_norm": 0.43029651045799255, "learning_rate": 8.792428408779246e-07, "loss": 0.0331, "step": 17920 }, { "epoch": 68.96153846153847, "grad_norm": 0.5155785083770752, "learning_rate": 8.708954893681421e-07, "loss": 0.0311, "step": 17930 }, { "epoch": 69.0, "grad_norm": 0.5972561836242676, "learning_rate": 8.62586766290569e-07, "loss": 0.0319, "step": 17940 }, { "epoch": 69.03846153846153, "grad_norm": 0.39445289969444275, "learning_rate": 8.543166943609448e-07, "loss": 0.0273, "step": 17950 }, { "epoch": 69.07692307692308, "grad_norm": 0.7934895157814026, "learning_rate": 8.460852961893234e-07, "loss": 0.0313, "step": 17960 }, { "epoch": 69.11538461538461, "grad_norm": 0.6706607341766357, "learning_rate": 8.378925942800364e-07, "loss": 0.0355, "step": 17970 }, { "epoch": 69.15384615384616, "grad_norm": 0.45455625653266907, "learning_rate": 8.297386110316202e-07, "loss": 0.0278, "step": 17980 }, { "epoch": 69.1923076923077, "grad_norm": 0.49573683738708496, "learning_rate": 8.216233687367491e-07, "loss": 0.0322, "step": 17990 }, { "epoch": 69.23076923076923, "grad_norm": 0.5711278915405273, "learning_rate": 8.135468895821924e-07, "loss": 0.0324, "step": 18000 }, { "epoch": 69.26923076923077, "grad_norm": 0.5714309215545654, "learning_rate": 8.05509195648727e-07, "loss": 0.0302, "step": 18010 }, { "epoch": 69.3076923076923, "grad_norm": 0.46931174397468567, "learning_rate": 7.975103089111052e-07, "loss": 0.0331, "step": 18020 }, { "epoch": 69.34615384615384, "grad_norm": 0.5957428812980652, "learning_rate": 7.895502512379805e-07, "loss": 0.0328, "step": 18030 }, { "epoch": 69.38461538461539, "grad_norm": 0.43586134910583496, "learning_rate": 7.816290443918411e-07, "loss": 0.0298, "step": 18040 }, { "epoch": 69.42307692307692, "grad_norm": 0.5497664213180542, "learning_rate": 7.737467100289725e-07, "loss": 0.033, "step": 18050 }, { "epoch": 69.46153846153847, "grad_norm": 0.5081692337989807, "learning_rate": 7.659032696993661e-07, "loss": 0.0309, "step": 18060 }, { "epoch": 69.5, "grad_norm": 0.5081343054771423, "learning_rate": 7.580987448466925e-07, "loss": 0.0279, "step": 18070 }, { "epoch": 69.53846153846153, "grad_norm": 0.45474639534950256, "learning_rate": 7.503331568082267e-07, "loss": 0.0295, "step": 18080 }, { "epoch": 69.57692307692308, "grad_norm": 0.5449734330177307, "learning_rate": 7.426065268147875e-07, "loss": 0.0269, "step": 18090 }, { "epoch": 69.61538461538461, "grad_norm": 0.5013516545295715, "learning_rate": 7.349188759906889e-07, "loss": 0.0293, "step": 18100 }, { "epoch": 69.65384615384616, "grad_norm": 0.5508989691734314, "learning_rate": 7.272702253536683e-07, "loss": 0.0292, "step": 18110 }, { "epoch": 69.6923076923077, "grad_norm": 0.5623137354850769, "learning_rate": 7.196605958148505e-07, "loss": 0.0291, "step": 18120 }, { "epoch": 69.73076923076923, "grad_norm": 0.37221282720565796, "learning_rate": 7.120900081786719e-07, "loss": 0.0242, "step": 18130 }, { "epoch": 69.76923076923077, "grad_norm": 0.4855639934539795, "learning_rate": 7.045584831428276e-07, "loss": 0.034, "step": 18140 }, { "epoch": 69.8076923076923, "grad_norm": 0.635829508304596, "learning_rate": 6.970660412982199e-07, "loss": 0.0256, "step": 18150 }, { "epoch": 69.84615384615384, "grad_norm": 0.5936337113380432, "learning_rate": 6.896127031288985e-07, "loss": 0.033, "step": 18160 }, { "epoch": 69.88461538461539, "grad_norm": 0.39154934883117676, "learning_rate": 6.821984890120064e-07, "loss": 0.0257, "step": 18170 }, { "epoch": 69.92307692307692, "grad_norm": 0.47772216796875, "learning_rate": 6.748234192177227e-07, "loss": 0.0324, "step": 18180 }, { "epoch": 69.96153846153847, "grad_norm": 0.559112012386322, "learning_rate": 6.674875139092051e-07, "loss": 0.0311, "step": 18190 }, { "epoch": 70.0, "grad_norm": 0.5065118670463562, "learning_rate": 6.601907931425388e-07, "loss": 0.0249, "step": 18200 }, { "epoch": 70.03846153846153, "grad_norm": 0.47782960534095764, "learning_rate": 6.529332768666779e-07, "loss": 0.0265, "step": 18210 }, { "epoch": 70.07692307692308, "grad_norm": 0.5119221806526184, "learning_rate": 6.457149849233973e-07, "loss": 0.0285, "step": 18220 }, { "epoch": 70.11538461538461, "grad_norm": 0.4450240433216095, "learning_rate": 6.385359370472343e-07, "loss": 0.031, "step": 18230 }, { "epoch": 70.15384615384616, "grad_norm": 0.6436985731124878, "learning_rate": 6.313961528654239e-07, "loss": 0.0298, "step": 18240 }, { "epoch": 70.1923076923077, "grad_norm": 0.6557655930519104, "learning_rate": 6.242956518978682e-07, "loss": 0.0327, "step": 18250 }, { "epoch": 70.23076923076923, "grad_norm": 0.4954908490180969, "learning_rate": 6.172344535570673e-07, "loss": 0.0254, "step": 18260 }, { "epoch": 70.26923076923077, "grad_norm": 0.4294760823249817, "learning_rate": 6.102125771480655e-07, "loss": 0.0284, "step": 18270 }, { "epoch": 70.3076923076923, "grad_norm": 0.4691609740257263, "learning_rate": 6.032300418684062e-07, "loss": 0.0255, "step": 18280 }, { "epoch": 70.34615384615384, "grad_norm": 0.567426323890686, "learning_rate": 5.962868668080706e-07, "loss": 0.0315, "step": 18290 }, { "epoch": 70.38461538461539, "grad_norm": 0.7433890700340271, "learning_rate": 5.89383070949438e-07, "loss": 0.0324, "step": 18300 }, { "epoch": 70.42307692307692, "grad_norm": 0.36738789081573486, "learning_rate": 5.825186731672217e-07, "loss": 0.0291, "step": 18310 }, { "epoch": 70.46153846153847, "grad_norm": 0.4836757481098175, "learning_rate": 5.756936922284228e-07, "loss": 0.0237, "step": 18320 }, { "epoch": 70.5, "grad_norm": 0.5915025472640991, "learning_rate": 5.689081467922791e-07, "loss": 0.0304, "step": 18330 }, { "epoch": 70.53846153846153, "grad_norm": 0.6776860952377319, "learning_rate": 5.621620554102108e-07, "loss": 0.0269, "step": 18340 }, { "epoch": 70.57692307692308, "grad_norm": 0.367214173078537, "learning_rate": 5.554554365257747e-07, "loss": 0.0257, "step": 18350 }, { "epoch": 70.61538461538461, "grad_norm": 0.43015265464782715, "learning_rate": 5.487883084746137e-07, "loss": 0.0311, "step": 18360 }, { "epoch": 70.65384615384616, "grad_norm": 0.5791113972663879, "learning_rate": 5.421606894843989e-07, "loss": 0.0342, "step": 18370 }, { "epoch": 70.6923076923077, "grad_norm": 0.6974865794181824, "learning_rate": 5.355725976747878e-07, "loss": 0.03, "step": 18380 }, { "epoch": 70.73076923076923, "grad_norm": 0.44641560316085815, "learning_rate": 5.290240510573707e-07, "loss": 0.0275, "step": 18390 }, { "epoch": 70.76923076923077, "grad_norm": 0.3478294610977173, "learning_rate": 5.22515067535625e-07, "loss": 0.0333, "step": 18400 }, { "epoch": 70.8076923076923, "grad_norm": 0.5026068091392517, "learning_rate": 5.160456649048656e-07, "loss": 0.034, "step": 18410 }, { "epoch": 70.84615384615384, "grad_norm": 0.42307016253471375, "learning_rate": 5.096158608521878e-07, "loss": 0.0289, "step": 18420 }, { "epoch": 70.88461538461539, "grad_norm": 0.3856401741504669, "learning_rate": 5.032256729564349e-07, "loss": 0.0306, "step": 18430 }, { "epoch": 70.92307692307692, "grad_norm": 0.44772571325302124, "learning_rate": 4.968751186881321e-07, "loss": 0.0298, "step": 18440 }, { "epoch": 70.96153846153847, "grad_norm": 0.4006417989730835, "learning_rate": 4.905642154094526e-07, "loss": 0.0285, "step": 18450 }, { "epoch": 71.0, "grad_norm": 0.5411732792854309, "learning_rate": 4.842929803741713e-07, "loss": 0.0333, "step": 18460 }, { "epoch": 71.03846153846153, "grad_norm": 0.566062331199646, "learning_rate": 4.780614307275987e-07, "loss": 0.0286, "step": 18470 }, { "epoch": 71.07692307692308, "grad_norm": 0.7110042572021484, "learning_rate": 4.7186958350655506e-07, "loss": 0.026, "step": 18480 }, { "epoch": 71.11538461538461, "grad_norm": 0.5046260356903076, "learning_rate": 4.65717455639314e-07, "loss": 0.0297, "step": 18490 }, { "epoch": 71.15384615384616, "grad_norm": 0.46450793743133545, "learning_rate": 4.5960506394555956e-07, "loss": 0.0302, "step": 18500 }, { "epoch": 71.1923076923077, "grad_norm": 0.6306203603744507, "learning_rate": 4.53532425136341e-07, "loss": 0.0325, "step": 18510 }, { "epoch": 71.23076923076923, "grad_norm": 0.42232760787010193, "learning_rate": 4.4749955581401103e-07, "loss": 0.028, "step": 18520 }, { "epoch": 71.26923076923077, "grad_norm": 0.4966919720172882, "learning_rate": 4.415064724722129e-07, "loss": 0.0299, "step": 18530 }, { "epoch": 71.3076923076923, "grad_norm": 0.5042365193367004, "learning_rate": 4.355531914958016e-07, "loss": 0.0373, "step": 18540 }, { "epoch": 71.34615384615384, "grad_norm": 0.5430654883384705, "learning_rate": 4.2963972916082286e-07, "loss": 0.0272, "step": 18550 }, { "epoch": 71.38461538461539, "grad_norm": 0.4358469843864441, "learning_rate": 4.2376610163446074e-07, "loss": 0.0328, "step": 18560 }, { "epoch": 71.42307692307692, "grad_norm": 0.4239481985569, "learning_rate": 4.1793232497498167e-07, "loss": 0.0301, "step": 18570 }, { "epoch": 71.46153846153847, "grad_norm": 0.49865809082984924, "learning_rate": 4.1213841513171257e-07, "loss": 0.0278, "step": 18580 }, { "epoch": 71.5, "grad_norm": 0.6041271686553955, "learning_rate": 4.0638438794497743e-07, "loss": 0.0317, "step": 18590 }, { "epoch": 71.53846153846153, "grad_norm": 0.5827446579933167, "learning_rate": 4.0067025914607257e-07, "loss": 0.0313, "step": 18600 }, { "epoch": 71.57692307692308, "grad_norm": 0.6713128685951233, "learning_rate": 3.9499604435720483e-07, "loss": 0.0308, "step": 18610 }, { "epoch": 71.61538461538461, "grad_norm": 0.42750629782676697, "learning_rate": 3.8936175909146e-07, "loss": 0.0258, "step": 18620 }, { "epoch": 71.65384615384616, "grad_norm": 0.4124327600002289, "learning_rate": 3.837674187527629e-07, "loss": 0.0292, "step": 18630 }, { "epoch": 71.6923076923077, "grad_norm": 0.49467605352401733, "learning_rate": 3.7821303863581904e-07, "loss": 0.0253, "step": 18640 }, { "epoch": 71.73076923076923, "grad_norm": 0.3977210819721222, "learning_rate": 3.726986339260996e-07, "loss": 0.0284, "step": 18650 }, { "epoch": 71.76923076923077, "grad_norm": 0.5405611991882324, "learning_rate": 3.672242196997733e-07, "loss": 0.0285, "step": 18660 }, { "epoch": 71.8076923076923, "grad_norm": 0.5646962523460388, "learning_rate": 3.6178981092367615e-07, "loss": 0.039, "step": 18670 }, { "epoch": 71.84615384615384, "grad_norm": 0.3832261264324188, "learning_rate": 3.5639542245527847e-07, "loss": 0.0294, "step": 18680 }, { "epoch": 71.88461538461539, "grad_norm": 0.6208793520927429, "learning_rate": 3.5104106904263134e-07, "loss": 0.0308, "step": 18690 }, { "epoch": 71.92307692307692, "grad_norm": 0.3466988801956177, "learning_rate": 3.4572676532433345e-07, "loss": 0.0314, "step": 18700 }, { "epoch": 71.96153846153847, "grad_norm": 0.4137439727783203, "learning_rate": 3.4045252582948603e-07, "loss": 0.0309, "step": 18710 }, { "epoch": 72.0, "grad_norm": 0.4454915225505829, "learning_rate": 3.3521836497765803e-07, "loss": 0.0273, "step": 18720 }, { "epoch": 72.03846153846153, "grad_norm": 0.3448931872844696, "learning_rate": 3.300242970788492e-07, "loss": 0.0271, "step": 18730 }, { "epoch": 72.07692307692308, "grad_norm": 0.46295422315597534, "learning_rate": 3.248703363334404e-07, "loss": 0.0283, "step": 18740 }, { "epoch": 72.11538461538461, "grad_norm": 0.46668925881385803, "learning_rate": 3.197564968321637e-07, "loss": 0.0257, "step": 18750 }, { "epoch": 72.15384615384616, "grad_norm": 0.36828577518463135, "learning_rate": 3.1468279255606027e-07, "loss": 0.0271, "step": 18760 }, { "epoch": 72.1923076923077, "grad_norm": 0.5191018581390381, "learning_rate": 3.096492373764442e-07, "loss": 0.029, "step": 18770 }, { "epoch": 72.23076923076923, "grad_norm": 0.5372118353843689, "learning_rate": 3.046558450548642e-07, "loss": 0.0289, "step": 18780 }, { "epoch": 72.26923076923077, "grad_norm": 0.49187055230140686, "learning_rate": 2.997026292430632e-07, "loss": 0.0297, "step": 18790 }, { "epoch": 72.3076923076923, "grad_norm": 0.4657900929450989, "learning_rate": 2.9478960348294393e-07, "loss": 0.0298, "step": 18800 }, { "epoch": 72.34615384615384, "grad_norm": 0.4845152795314789, "learning_rate": 2.8991678120653343e-07, "loss": 0.0311, "step": 18810 }, { "epoch": 72.38461538461539, "grad_norm": 0.3693923056125641, "learning_rate": 2.850841757359385e-07, "loss": 0.0292, "step": 18820 }, { "epoch": 72.42307692307692, "grad_norm": 0.5879788994789124, "learning_rate": 2.802918002833188e-07, "loss": 0.0278, "step": 18830 }, { "epoch": 72.46153846153847, "grad_norm": 0.474170058965683, "learning_rate": 2.7553966795084875e-07, "loss": 0.0263, "step": 18840 }, { "epoch": 72.5, "grad_norm": 0.5131663680076599, "learning_rate": 2.708277917306723e-07, "loss": 0.0295, "step": 18850 }, { "epoch": 72.53846153846153, "grad_norm": 0.5183109641075134, "learning_rate": 2.661561845048832e-07, "loss": 0.026, "step": 18860 }, { "epoch": 72.57692307692308, "grad_norm": 0.37531840801239014, "learning_rate": 2.615248590454733e-07, "loss": 0.0285, "step": 18870 }, { "epoch": 72.61538461538461, "grad_norm": 0.38698941469192505, "learning_rate": 2.569338280143124e-07, "loss": 0.0296, "step": 18880 }, { "epoch": 72.65384615384616, "grad_norm": 0.40946704149246216, "learning_rate": 2.523831039631036e-07, "loss": 0.0259, "step": 18890 }, { "epoch": 72.6923076923077, "grad_norm": 0.456138014793396, "learning_rate": 2.478726993333513e-07, "loss": 0.0297, "step": 18900 }, { "epoch": 72.73076923076923, "grad_norm": 0.46513912081718445, "learning_rate": 2.434026264563299e-07, "loss": 0.0264, "step": 18910 }, { "epoch": 72.76923076923077, "grad_norm": 0.5075211524963379, "learning_rate": 2.389728975530486e-07, "loss": 0.0339, "step": 18920 }, { "epoch": 72.8076923076923, "grad_norm": 0.3680661916732788, "learning_rate": 2.3458352473421986e-07, "loss": 0.0295, "step": 18930 }, { "epoch": 72.84615384615384, "grad_norm": 0.362907737493515, "learning_rate": 2.3023452000021594e-07, "loss": 0.0292, "step": 18940 }, { "epoch": 72.88461538461539, "grad_norm": 0.4176541864871979, "learning_rate": 2.2592589524105255e-07, "loss": 0.0259, "step": 18950 }, { "epoch": 72.92307692307692, "grad_norm": 0.5750277042388916, "learning_rate": 2.216576622363453e-07, "loss": 0.0316, "step": 18960 }, { "epoch": 72.96153846153847, "grad_norm": 0.3795614242553711, "learning_rate": 2.1742983265527984e-07, "loss": 0.0272, "step": 18970 }, { "epoch": 73.0, "grad_norm": 0.3043051064014435, "learning_rate": 2.1324241805658006e-07, "loss": 0.0276, "step": 18980 }, { "epoch": 73.03846153846153, "grad_norm": 0.5338976383209229, "learning_rate": 2.0909542988848007e-07, "loss": 0.0239, "step": 18990 }, { "epoch": 73.07692307692308, "grad_norm": 0.35654547810554504, "learning_rate": 2.0498887948868395e-07, "loss": 0.026, "step": 19000 }, { "epoch": 73.11538461538461, "grad_norm": 0.5859305262565613, "learning_rate": 2.009227780843459e-07, "loss": 0.0274, "step": 19010 }, { "epoch": 73.15384615384616, "grad_norm": 0.5016968250274658, "learning_rate": 1.968971367920319e-07, "loss": 0.0318, "step": 19020 }, { "epoch": 73.1923076923077, "grad_norm": 0.5447234511375427, "learning_rate": 1.9291196661768984e-07, "loss": 0.0335, "step": 19030 }, { "epoch": 73.23076923076923, "grad_norm": 0.5071997046470642, "learning_rate": 1.8896727845662432e-07, "loss": 0.0278, "step": 19040 }, { "epoch": 73.26923076923077, "grad_norm": 0.29972633719444275, "learning_rate": 1.8506308309346022e-07, "loss": 0.025, "step": 19050 }, { "epoch": 73.3076923076923, "grad_norm": 0.4020128548145294, "learning_rate": 1.811993912021226e-07, "loss": 0.0259, "step": 19060 }, { "epoch": 73.34615384615384, "grad_norm": 0.4244936406612396, "learning_rate": 1.7737621334579346e-07, "loss": 0.0281, "step": 19070 }, { "epoch": 73.38461538461539, "grad_norm": 0.4991587698459625, "learning_rate": 1.735935599768951e-07, "loss": 0.0286, "step": 19080 }, { "epoch": 73.42307692307692, "grad_norm": 0.4140017628669739, "learning_rate": 1.6985144143706166e-07, "loss": 0.0278, "step": 19090 }, { "epoch": 73.46153846153847, "grad_norm": 0.6661912202835083, "learning_rate": 1.6614986795709774e-07, "loss": 0.0292, "step": 19100 }, { "epoch": 73.5, "grad_norm": 0.39654165506362915, "learning_rate": 1.6248884965696654e-07, "loss": 0.0269, "step": 19110 }, { "epoch": 73.53846153846153, "grad_norm": 0.4546181261539459, "learning_rate": 1.5886839654575158e-07, "loss": 0.0268, "step": 19120 }, { "epoch": 73.57692307692308, "grad_norm": 0.5141550898551941, "learning_rate": 1.5528851852163183e-07, "loss": 0.0316, "step": 19130 }, { "epoch": 73.61538461538461, "grad_norm": 0.3499520421028137, "learning_rate": 1.5174922537185997e-07, "loss": 0.0279, "step": 19140 }, { "epoch": 73.65384615384616, "grad_norm": 0.4141696095466614, "learning_rate": 1.4825052677272576e-07, "loss": 0.0307, "step": 19150 }, { "epoch": 73.6923076923077, "grad_norm": 0.39404889941215515, "learning_rate": 1.4479243228953942e-07, "loss": 0.031, "step": 19160 }, { "epoch": 73.73076923076923, "grad_norm": 0.5340063571929932, "learning_rate": 1.4137495137659827e-07, "loss": 0.0318, "step": 19170 }, { "epoch": 73.76923076923077, "grad_norm": 0.5623757839202881, "learning_rate": 1.3799809337716517e-07, "loss": 0.0274, "step": 19180 }, { "epoch": 73.8076923076923, "grad_norm": 0.4724770486354828, "learning_rate": 1.3466186752344178e-07, "loss": 0.0304, "step": 19190 }, { "epoch": 73.84615384615384, "grad_norm": 0.46430504322052, "learning_rate": 1.3136628293653863e-07, "loss": 0.0279, "step": 19200 }, { "epoch": 73.88461538461539, "grad_norm": 0.36791667342185974, "learning_rate": 1.2811134862646178e-07, "loss": 0.0258, "step": 19210 }, { "epoch": 73.92307692307692, "grad_norm": 0.3952659070491791, "learning_rate": 1.2489707349207623e-07, "loss": 0.0279, "step": 19220 }, { "epoch": 73.96153846153847, "grad_norm": 0.5908829569816589, "learning_rate": 1.2172346632108754e-07, "loss": 0.0313, "step": 19230 }, { "epoch": 74.0, "grad_norm": 0.49030816555023193, "learning_rate": 1.1859053579001688e-07, "loss": 0.0279, "step": 19240 }, { "epoch": 74.03846153846153, "grad_norm": 0.537586510181427, "learning_rate": 1.1549829046417437e-07, "loss": 0.0306, "step": 19250 }, { "epoch": 74.07692307692308, "grad_norm": 0.2679162919521332, "learning_rate": 1.1244673879764411e-07, "loss": 0.028, "step": 19260 }, { "epoch": 74.11538461538461, "grad_norm": 0.45097458362579346, "learning_rate": 1.094358891332492e-07, "loss": 0.0277, "step": 19270 }, { "epoch": 74.15384615384616, "grad_norm": 0.5424238443374634, "learning_rate": 1.0646574970253842e-07, "loss": 0.0336, "step": 19280 }, { "epoch": 74.1923076923077, "grad_norm": 0.4734244644641876, "learning_rate": 1.0353632862576124e-07, "loss": 0.0315, "step": 19290 }, { "epoch": 74.23076923076923, "grad_norm": 0.39023080468177795, "learning_rate": 1.0064763391184118e-07, "loss": 0.0269, "step": 19300 }, { "epoch": 74.26923076923077, "grad_norm": 0.48881417512893677, "learning_rate": 9.779967345835917e-08, "loss": 0.0272, "step": 19310 }, { "epoch": 74.3076923076923, "grad_norm": 0.33827894926071167, "learning_rate": 9.49924550515302e-08, "loss": 0.0267, "step": 19320 }, { "epoch": 74.34615384615384, "grad_norm": 0.48151615262031555, "learning_rate": 9.22259863661834e-08, "loss": 0.0283, "step": 19330 }, { "epoch": 74.38461538461539, "grad_norm": 0.3296300172805786, "learning_rate": 8.950027496573865e-08, "loss": 0.0288, "step": 19340 }, { "epoch": 74.42307692307692, "grad_norm": 0.6360703110694885, "learning_rate": 8.681532830218497e-08, "loss": 0.0305, "step": 19350 }, { "epoch": 74.46153846153847, "grad_norm": 0.3356926143169403, "learning_rate": 8.417115371606554e-08, "loss": 0.0262, "step": 19360 }, { "epoch": 74.5, "grad_norm": 0.4273458421230316, "learning_rate": 8.15677584364527e-08, "loss": 0.0278, "step": 19370 }, { "epoch": 74.53846153846153, "grad_norm": 0.3618144094944, "learning_rate": 7.900514958092964e-08, "loss": 0.0298, "step": 19380 }, { "epoch": 74.57692307692308, "grad_norm": 0.47049006819725037, "learning_rate": 7.64833341555704e-08, "loss": 0.0317, "step": 19390 }, { "epoch": 74.61538461538461, "grad_norm": 0.3946126401424408, "learning_rate": 7.400231905492328e-08, "loss": 0.028, "step": 19400 }, { "epoch": 74.65384615384616, "grad_norm": 0.682574450969696, "learning_rate": 7.15621110619874e-08, "loss": 0.0321, "step": 19410 }, { "epoch": 74.6923076923077, "grad_norm": 0.4793030917644501, "learning_rate": 6.916271684819787e-08, "loss": 0.0269, "step": 19420 }, { "epoch": 74.73076923076923, "grad_norm": 0.38368162512779236, "learning_rate": 6.680414297340897e-08, "loss": 0.0269, "step": 19430 }, { "epoch": 74.76923076923077, "grad_norm": 0.4740610420703888, "learning_rate": 6.448639588587103e-08, "loss": 0.0296, "step": 19440 }, { "epoch": 74.8076923076923, "grad_norm": 0.36339718103408813, "learning_rate": 6.22094819222152e-08, "loss": 0.0282, "step": 19450 }, { "epoch": 74.84615384615384, "grad_norm": 0.44646188616752625, "learning_rate": 5.997340730743705e-08, "loss": 0.025, "step": 19460 }, { "epoch": 74.88461538461539, "grad_norm": 0.32935407757759094, "learning_rate": 5.7778178154879736e-08, "loss": 0.0312, "step": 19470 }, { "epoch": 74.92307692307692, "grad_norm": 0.5153601169586182, "learning_rate": 5.56238004662174e-08, "loss": 0.035, "step": 19480 }, { "epoch": 74.96153846153847, "grad_norm": 0.5570949912071228, "learning_rate": 5.351028013143355e-08, "loss": 0.0346, "step": 19490 }, { "epoch": 75.0, "grad_norm": 0.40673136711120605, "learning_rate": 5.1437622928814374e-08, "loss": 0.0271, "step": 19500 }, { "epoch": 75.03846153846153, "grad_norm": 0.48403364419937134, "learning_rate": 4.940583452492542e-08, "loss": 0.0291, "step": 19510 }, { "epoch": 75.07692307692308, "grad_norm": 0.32624363899230957, "learning_rate": 4.741492047459661e-08, "loss": 0.0275, "step": 19520 }, { "epoch": 75.11538461538461, "grad_norm": 0.5291919708251953, "learning_rate": 4.5464886220912275e-08, "loss": 0.03, "step": 19530 }, { "epoch": 75.15384615384616, "grad_norm": 0.3304661810398102, "learning_rate": 4.355573709519112e-08, "loss": 0.028, "step": 19540 }, { "epoch": 75.1923076923077, "grad_norm": 0.5096902847290039, "learning_rate": 4.168747831697628e-08, "loss": 0.0329, "step": 19550 }, { "epoch": 75.23076923076923, "grad_norm": 0.6049113869667053, "learning_rate": 3.986011499401199e-08, "loss": 0.0301, "step": 19560 }, { "epoch": 75.26923076923077, "grad_norm": 0.43766793608665466, "learning_rate": 3.80736521222419e-08, "loss": 0.0327, "step": 19570 }, { "epoch": 75.3076923076923, "grad_norm": 0.3489794135093689, "learning_rate": 3.6328094585789116e-08, "loss": 0.0307, "step": 19580 }, { "epoch": 75.34615384615384, "grad_norm": 0.3926312327384949, "learning_rate": 3.462344715693788e-08, "loss": 0.0257, "step": 19590 }, { "epoch": 75.38461538461539, "grad_norm": 0.33839303255081177, "learning_rate": 3.295971449613022e-08, "loss": 0.0262, "step": 19600 }, { "epoch": 75.42307692307692, "grad_norm": 0.34823545813560486, "learning_rate": 3.1336901151949316e-08, "loss": 0.0242, "step": 19610 }, { "epoch": 75.46153846153847, "grad_norm": 0.5097415447235107, "learning_rate": 2.975501156110283e-08, "loss": 0.0289, "step": 19620 }, { "epoch": 75.5, "grad_norm": 0.4118276834487915, "learning_rate": 2.821405004841793e-08, "loss": 0.0276, "step": 19630 }, { "epoch": 75.53846153846153, "grad_norm": 0.4702577590942383, "learning_rate": 2.671402082682295e-08, "loss": 0.0263, "step": 19640 }, { "epoch": 75.57692307692308, "grad_norm": 0.47985705733299255, "learning_rate": 2.5254927997342415e-08, "loss": 0.0257, "step": 19650 }, { "epoch": 75.61538461538461, "grad_norm": 0.4477846920490265, "learning_rate": 2.3836775549078703e-08, "loss": 0.032, "step": 19660 }, { "epoch": 75.65384615384616, "grad_norm": 0.45571255683898926, "learning_rate": 2.245956735920873e-08, "loss": 0.0314, "step": 19670 }, { "epoch": 75.6923076923077, "grad_norm": 0.43451234698295593, "learning_rate": 2.1123307192965625e-08, "loss": 0.0259, "step": 19680 }, { "epoch": 75.73076923076923, "grad_norm": 0.33790716528892517, "learning_rate": 1.9827998703632056e-08, "loss": 0.0267, "step": 19690 }, { "epoch": 75.76923076923077, "grad_norm": 0.5504989624023438, "learning_rate": 1.8573645432535258e-08, "loss": 0.0348, "step": 19700 }, { "epoch": 75.8076923076923, "grad_norm": 0.32850709557533264, "learning_rate": 1.7360250809027034e-08, "loss": 0.0277, "step": 19710 }, { "epoch": 75.84615384615384, "grad_norm": 0.3690672218799591, "learning_rate": 1.618781815048209e-08, "loss": 0.0292, "step": 19720 }, { "epoch": 75.88461538461539, "grad_norm": 0.3160001039505005, "learning_rate": 1.5056350662286388e-08, "loss": 0.0303, "step": 19730 }, { "epoch": 75.92307692307692, "grad_norm": 0.5542304515838623, "learning_rate": 1.3965851437830468e-08, "loss": 0.0299, "step": 19740 }, { "epoch": 75.96153846153847, "grad_norm": 0.25911015272140503, "learning_rate": 1.2916323458494473e-08, "loss": 0.0306, "step": 19750 }, { "epoch": 76.0, "grad_norm": 0.524623692035675, "learning_rate": 1.1907769593651474e-08, "loss": 0.0294, "step": 19760 }, { "epoch": 76.03846153846153, "grad_norm": 0.3299170136451721, "learning_rate": 1.0940192600647491e-08, "loss": 0.0304, "step": 19770 }, { "epoch": 76.07692307692308, "grad_norm": 0.3935050964355469, "learning_rate": 1.0013595124801488e-08, "loss": 0.0298, "step": 19780 }, { "epoch": 76.11538461538461, "grad_norm": 0.5473943948745728, "learning_rate": 9.127979699393719e-09, "loss": 0.0308, "step": 19790 }, { "epoch": 76.15384615384616, "grad_norm": 0.38750502467155457, "learning_rate": 8.283348745665719e-09, "loss": 0.0276, "step": 19800 }, { "epoch": 76.1923076923077, "grad_norm": 0.31281858682632446, "learning_rate": 7.479704572805336e-09, "loss": 0.0306, "step": 19810 }, { "epoch": 76.23076923076923, "grad_norm": 0.6501541137695312, "learning_rate": 6.717049377943374e-09, "loss": 0.0287, "step": 19820 }, { "epoch": 76.26923076923077, "grad_norm": 0.32253095507621765, "learning_rate": 5.995385246151952e-09, "loss": 0.0307, "step": 19830 }, { "epoch": 76.3076923076923, "grad_norm": 0.2738461494445801, "learning_rate": 5.314714150432831e-09, "loss": 0.0342, "step": 19840 }, { "epoch": 76.34615384615384, "grad_norm": 0.6626792550086975, "learning_rate": 4.67503795171409e-09, "loss": 0.0336, "step": 19850 }, { "epoch": 76.38461538461539, "grad_norm": 0.6046369671821594, "learning_rate": 4.076358398846791e-09, "loss": 0.0337, "step": 19860 }, { "epoch": 76.42307692307692, "grad_norm": 0.38855934143066406, "learning_rate": 3.518677128598324e-09, "loss": 0.0318, "step": 19870 }, { "epoch": 76.46153846153847, "grad_norm": 0.4337220788002014, "learning_rate": 3.0019956656457404e-09, "loss": 0.0272, "step": 19880 }, { "epoch": 76.5, "grad_norm": 0.3263535499572754, "learning_rate": 2.526315422579084e-09, "loss": 0.0303, "step": 19890 }, { "epoch": 76.53846153846153, "grad_norm": 0.5594188570976257, "learning_rate": 2.091637699889737e-09, "loss": 0.0288, "step": 19900 }, { "epoch": 76.57692307692308, "grad_norm": 0.4034026563167572, "learning_rate": 1.6979636859687509e-09, "loss": 0.0273, "step": 19910 }, { "epoch": 76.61538461538461, "grad_norm": 0.40827733278274536, "learning_rate": 1.3452944571051839e-09, "loss": 0.0267, "step": 19920 }, { "epoch": 76.65384615384616, "grad_norm": 0.3561939597129822, "learning_rate": 1.0336309774860998e-09, "loss": 0.025, "step": 19930 }, { "epoch": 76.6923076923077, "grad_norm": 0.5761282444000244, "learning_rate": 7.629740991849099e-10, "loss": 0.0276, "step": 19940 }, { "epoch": 76.73076923076923, "grad_norm": 0.4117676615715027, "learning_rate": 5.333245621680361e-10, "loss": 0.0315, "step": 19950 }, { "epoch": 76.76923076923077, "grad_norm": 0.35498183965682983, "learning_rate": 3.446829942882479e-10, "loss": 0.0278, "step": 19960 }, { "epoch": 76.8076923076923, "grad_norm": 0.5719658732414246, "learning_rate": 1.9704991128632887e-10, "loss": 0.0318, "step": 19970 }, { "epoch": 76.84615384615384, "grad_norm": 0.5071206092834473, "learning_rate": 9.042571678274936e-11, "loss": 0.0263, "step": 19980 }, { "epoch": 76.88461538461539, "grad_norm": 0.4370606243610382, "learning_rate": 2.4810702284328024e-11, "loss": 0.0311, "step": 19990 }, { "epoch": 76.92307692307692, "grad_norm": 0.5554015040397644, "learning_rate": 2.050471825665312e-13, "loss": 0.0256, "step": 20000 }, { "epoch": 76.92307692307692, "step": 20000, "total_flos": 0.0, "train_loss": 0.09584806797504425, "train_runtime": 7325.8064, "train_samples_per_second": 174.725, "train_steps_per_second": 2.73 } ], "logging_steps": 10, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 77, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }