| { |
| "best_metric": 0.5654382109642029, |
| "best_model_checkpoint": "./results_t5_base/checkpoint-1100", |
| "epoch": 20.0, |
| "eval_steps": 500, |
| "global_step": 1100, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.18181818181818182, |
| "grad_norm": 34.49951934814453, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 15.0621, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.36363636363636365, |
| "grad_norm": 34.444114685058594, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 14.3473, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.5454545454545454, |
| "grad_norm": 27.946786880493164, |
| "learning_rate": 3e-06, |
| "loss": 13.5965, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.7272727272727273, |
| "grad_norm": 26.316389083862305, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 12.6354, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.9090909090909091, |
| "grad_norm": 31.084760665893555, |
| "learning_rate": 5e-06, |
| "loss": 11.446, |
| "step": 50 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 9.390619277954102, |
| "eval_runtime": 1.9144, |
| "eval_samples_per_second": 50.67, |
| "eval_steps_per_second": 3.657, |
| "step": 55 |
| }, |
| { |
| "epoch": 1.0909090909090908, |
| "grad_norm": 27.938682556152344, |
| "learning_rate": 6e-06, |
| "loss": 9.6083, |
| "step": 60 |
| }, |
| { |
| "epoch": 1.2727272727272727, |
| "grad_norm": 24.73709487915039, |
| "learning_rate": 7.000000000000001e-06, |
| "loss": 7.8641, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.4545454545454546, |
| "grad_norm": 20.69359588623047, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 5.7202, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.6363636363636362, |
| "grad_norm": 9.500884056091309, |
| "learning_rate": 9e-06, |
| "loss": 4.0409, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.8181818181818183, |
| "grad_norm": 3.8369476795196533, |
| "learning_rate": 1e-05, |
| "loss": 3.3363, |
| "step": 100 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 2.6282877922058105, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 2.4721, |
| "step": 110 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 1.9182844161987305, |
| "eval_runtime": 1.9094, |
| "eval_samples_per_second": 50.801, |
| "eval_steps_per_second": 3.666, |
| "step": 110 |
| }, |
| { |
| "epoch": 2.1818181818181817, |
| "grad_norm": 1.8732331991195679, |
| "learning_rate": 1.2e-05, |
| "loss": 2.2906, |
| "step": 120 |
| }, |
| { |
| "epoch": 2.3636363636363638, |
| "grad_norm": 1.2134524583816528, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 2.012, |
| "step": 130 |
| }, |
| { |
| "epoch": 2.5454545454545454, |
| "grad_norm": 1.0808675289154053, |
| "learning_rate": 1.4000000000000001e-05, |
| "loss": 1.9923, |
| "step": 140 |
| }, |
| { |
| "epoch": 2.7272727272727275, |
| "grad_norm": 0.7511759996414185, |
| "learning_rate": 1.5e-05, |
| "loss": 1.7195, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.909090909090909, |
| "grad_norm": 0.7951436638832092, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 1.7114, |
| "step": 160 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 1.3082835674285889, |
| "eval_runtime": 1.9145, |
| "eval_samples_per_second": 50.666, |
| "eval_steps_per_second": 3.656, |
| "step": 165 |
| }, |
| { |
| "epoch": 3.090909090909091, |
| "grad_norm": 1.0560592412948608, |
| "learning_rate": 1.7000000000000003e-05, |
| "loss": 1.4839, |
| "step": 170 |
| }, |
| { |
| "epoch": 3.2727272727272725, |
| "grad_norm": 0.8097766041755676, |
| "learning_rate": 1.8e-05, |
| "loss": 1.4752, |
| "step": 180 |
| }, |
| { |
| "epoch": 3.4545454545454546, |
| "grad_norm": 0.7408425211906433, |
| "learning_rate": 1.9e-05, |
| "loss": 1.4855, |
| "step": 190 |
| }, |
| { |
| "epoch": 3.6363636363636362, |
| "grad_norm": 0.902044415473938, |
| "learning_rate": 2e-05, |
| "loss": 1.439, |
| "step": 200 |
| }, |
| { |
| "epoch": 3.8181818181818183, |
| "grad_norm": 0.8487627506256104, |
| "learning_rate": 2.1e-05, |
| "loss": 1.3827, |
| "step": 210 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 1.330592393875122, |
| "learning_rate": 2.2000000000000003e-05, |
| "loss": 1.3552, |
| "step": 220 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 1.0556784868240356, |
| "eval_runtime": 1.9095, |
| "eval_samples_per_second": 50.8, |
| "eval_steps_per_second": 3.666, |
| "step": 220 |
| }, |
| { |
| "epoch": 4.181818181818182, |
| "grad_norm": 0.8066139221191406, |
| "learning_rate": 2.3000000000000003e-05, |
| "loss": 1.2733, |
| "step": 230 |
| }, |
| { |
| "epoch": 4.363636363636363, |
| "grad_norm": 0.6790980696678162, |
| "learning_rate": 2.4e-05, |
| "loss": 1.2248, |
| "step": 240 |
| }, |
| { |
| "epoch": 4.545454545454545, |
| "grad_norm": 0.8204854726791382, |
| "learning_rate": 2.5e-05, |
| "loss": 1.2713, |
| "step": 250 |
| }, |
| { |
| "epoch": 4.7272727272727275, |
| "grad_norm": 0.7819919586181641, |
| "learning_rate": 2.6000000000000002e-05, |
| "loss": 1.2195, |
| "step": 260 |
| }, |
| { |
| "epoch": 4.909090909090909, |
| "grad_norm": 0.5966542363166809, |
| "learning_rate": 2.7000000000000002e-05, |
| "loss": 1.1837, |
| "step": 270 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.9253256916999817, |
| "eval_runtime": 1.9159, |
| "eval_samples_per_second": 50.628, |
| "eval_steps_per_second": 3.654, |
| "step": 275 |
| }, |
| { |
| "epoch": 5.090909090909091, |
| "grad_norm": 0.772972583770752, |
| "learning_rate": 2.8000000000000003e-05, |
| "loss": 1.151, |
| "step": 280 |
| }, |
| { |
| "epoch": 5.2727272727272725, |
| "grad_norm": 0.6945077180862427, |
| "learning_rate": 2.9e-05, |
| "loss": 1.0236, |
| "step": 290 |
| }, |
| { |
| "epoch": 5.454545454545454, |
| "grad_norm": 0.6860663294792175, |
| "learning_rate": 3e-05, |
| "loss": 1.123, |
| "step": 300 |
| }, |
| { |
| "epoch": 5.636363636363637, |
| "grad_norm": 0.640788197517395, |
| "learning_rate": 3.1e-05, |
| "loss": 1.1041, |
| "step": 310 |
| }, |
| { |
| "epoch": 5.818181818181818, |
| "grad_norm": 0.6724874377250671, |
| "learning_rate": 3.2000000000000005e-05, |
| "loss": 1.0774, |
| "step": 320 |
| }, |
| { |
| "epoch": 6.0, |
| "grad_norm": 1.2789943218231201, |
| "learning_rate": 3.3e-05, |
| "loss": 1.0695, |
| "step": 330 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.8376194834709167, |
| "eval_runtime": 1.9165, |
| "eval_samples_per_second": 50.613, |
| "eval_steps_per_second": 3.652, |
| "step": 330 |
| }, |
| { |
| "epoch": 6.181818181818182, |
| "grad_norm": 0.6990534067153931, |
| "learning_rate": 3.4000000000000007e-05, |
| "loss": 1.0048, |
| "step": 340 |
| }, |
| { |
| "epoch": 6.363636363636363, |
| "grad_norm": 0.8184038400650024, |
| "learning_rate": 3.5e-05, |
| "loss": 0.9969, |
| "step": 350 |
| }, |
| { |
| "epoch": 6.545454545454545, |
| "grad_norm": 0.6995009183883667, |
| "learning_rate": 3.6e-05, |
| "loss": 0.949, |
| "step": 360 |
| }, |
| { |
| "epoch": 6.7272727272727275, |
| "grad_norm": 0.784795343875885, |
| "learning_rate": 3.7e-05, |
| "loss": 1.0163, |
| "step": 370 |
| }, |
| { |
| "epoch": 6.909090909090909, |
| "grad_norm": 0.6257941126823425, |
| "learning_rate": 3.8e-05, |
| "loss": 1.0053, |
| "step": 380 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 0.7738495469093323, |
| "eval_runtime": 1.9131, |
| "eval_samples_per_second": 50.703, |
| "eval_steps_per_second": 3.659, |
| "step": 385 |
| }, |
| { |
| "epoch": 7.090909090909091, |
| "grad_norm": 0.6166519522666931, |
| "learning_rate": 3.9000000000000006e-05, |
| "loss": 0.929, |
| "step": 390 |
| }, |
| { |
| "epoch": 7.2727272727272725, |
| "grad_norm": 0.6807654500007629, |
| "learning_rate": 4e-05, |
| "loss": 0.8915, |
| "step": 400 |
| }, |
| { |
| "epoch": 7.454545454545454, |
| "grad_norm": 0.6417834162712097, |
| "learning_rate": 4.1e-05, |
| "loss": 0.9398, |
| "step": 410 |
| }, |
| { |
| "epoch": 7.636363636363637, |
| "grad_norm": 0.7357299327850342, |
| "learning_rate": 4.2e-05, |
| "loss": 0.9189, |
| "step": 420 |
| }, |
| { |
| "epoch": 7.818181818181818, |
| "grad_norm": 0.6846190094947815, |
| "learning_rate": 4.3e-05, |
| "loss": 0.9205, |
| "step": 430 |
| }, |
| { |
| "epoch": 8.0, |
| "grad_norm": 0.9817469120025635, |
| "learning_rate": 4.4000000000000006e-05, |
| "loss": 0.8928, |
| "step": 440 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 0.7215824723243713, |
| "eval_runtime": 1.9094, |
| "eval_samples_per_second": 50.801, |
| "eval_steps_per_second": 3.666, |
| "step": 440 |
| }, |
| { |
| "epoch": 8.181818181818182, |
| "grad_norm": 0.6967756748199463, |
| "learning_rate": 4.5e-05, |
| "loss": 0.9265, |
| "step": 450 |
| }, |
| { |
| "epoch": 8.363636363636363, |
| "grad_norm": 0.6210283041000366, |
| "learning_rate": 4.600000000000001e-05, |
| "loss": 0.8445, |
| "step": 460 |
| }, |
| { |
| "epoch": 8.545454545454545, |
| "grad_norm": 0.6811444759368896, |
| "learning_rate": 4.7e-05, |
| "loss": 0.8298, |
| "step": 470 |
| }, |
| { |
| "epoch": 8.727272727272727, |
| "grad_norm": 0.5957173109054565, |
| "learning_rate": 4.8e-05, |
| "loss": 0.7891, |
| "step": 480 |
| }, |
| { |
| "epoch": 8.909090909090908, |
| "grad_norm": 0.6470558047294617, |
| "learning_rate": 4.9e-05, |
| "loss": 0.8546, |
| "step": 490 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 0.6784611940383911, |
| "eval_runtime": 1.9123, |
| "eval_samples_per_second": 50.725, |
| "eval_steps_per_second": 3.661, |
| "step": 495 |
| }, |
| { |
| "epoch": 9.090909090909092, |
| "grad_norm": 0.5924862623214722, |
| "learning_rate": 5e-05, |
| "loss": 0.8455, |
| "step": 500 |
| }, |
| { |
| "epoch": 9.272727272727273, |
| "grad_norm": 0.6424144506454468, |
| "learning_rate": 4.9166666666666665e-05, |
| "loss": 0.8007, |
| "step": 510 |
| }, |
| { |
| "epoch": 9.454545454545455, |
| "grad_norm": 0.6752752065658569, |
| "learning_rate": 4.8333333333333334e-05, |
| "loss": 0.8317, |
| "step": 520 |
| }, |
| { |
| "epoch": 9.636363636363637, |
| "grad_norm": 0.5884845852851868, |
| "learning_rate": 4.75e-05, |
| "loss": 0.7812, |
| "step": 530 |
| }, |
| { |
| "epoch": 9.818181818181818, |
| "grad_norm": 0.6853846907615662, |
| "learning_rate": 4.666666666666667e-05, |
| "loss": 0.8089, |
| "step": 540 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 1.07851243019104, |
| "learning_rate": 4.5833333333333334e-05, |
| "loss": 0.7501, |
| "step": 550 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 0.6473520994186401, |
| "eval_runtime": 1.9092, |
| "eval_samples_per_second": 50.807, |
| "eval_steps_per_second": 3.666, |
| "step": 550 |
| }, |
| { |
| "epoch": 10.181818181818182, |
| "grad_norm": 1.17769455909729, |
| "learning_rate": 4.5e-05, |
| "loss": 0.7607, |
| "step": 560 |
| }, |
| { |
| "epoch": 10.363636363636363, |
| "grad_norm": 0.5850676894187927, |
| "learning_rate": 4.4166666666666665e-05, |
| "loss": 0.7473, |
| "step": 570 |
| }, |
| { |
| "epoch": 10.545454545454545, |
| "grad_norm": 0.5133849382400513, |
| "learning_rate": 4.3333333333333334e-05, |
| "loss": 0.7539, |
| "step": 580 |
| }, |
| { |
| "epoch": 10.727272727272727, |
| "grad_norm": 0.5890449285507202, |
| "learning_rate": 4.25e-05, |
| "loss": 0.7221, |
| "step": 590 |
| }, |
| { |
| "epoch": 10.909090909090908, |
| "grad_norm": 2.0515189170837402, |
| "learning_rate": 4.166666666666667e-05, |
| "loss": 0.7381, |
| "step": 600 |
| }, |
| { |
| "epoch": 11.0, |
| "eval_loss": 0.6253225207328796, |
| "eval_runtime": 1.9123, |
| "eval_samples_per_second": 50.724, |
| "eval_steps_per_second": 3.66, |
| "step": 605 |
| }, |
| { |
| "epoch": 11.090909090909092, |
| "grad_norm": 0.5024580955505371, |
| "learning_rate": 4.0833333333333334e-05, |
| "loss": 0.7502, |
| "step": 610 |
| }, |
| { |
| "epoch": 11.272727272727273, |
| "grad_norm": 0.5601004362106323, |
| "learning_rate": 4e-05, |
| "loss": 0.7421, |
| "step": 620 |
| }, |
| { |
| "epoch": 11.454545454545455, |
| "grad_norm": 0.5731512308120728, |
| "learning_rate": 3.9166666666666665e-05, |
| "loss": 0.7159, |
| "step": 630 |
| }, |
| { |
| "epoch": 11.636363636363637, |
| "grad_norm": 0.5588123202323914, |
| "learning_rate": 3.8333333333333334e-05, |
| "loss": 0.6927, |
| "step": 640 |
| }, |
| { |
| "epoch": 11.818181818181818, |
| "grad_norm": 0.6156173348426819, |
| "learning_rate": 3.7500000000000003e-05, |
| "loss": 0.6928, |
| "step": 650 |
| }, |
| { |
| "epoch": 12.0, |
| "grad_norm": 0.8520174622535706, |
| "learning_rate": 3.6666666666666666e-05, |
| "loss": 0.7357, |
| "step": 660 |
| }, |
| { |
| "epoch": 12.0, |
| "eval_loss": 0.605716347694397, |
| "eval_runtime": 1.9125, |
| "eval_samples_per_second": 50.72, |
| "eval_steps_per_second": 3.66, |
| "step": 660 |
| }, |
| { |
| "epoch": 12.181818181818182, |
| "grad_norm": 0.702212929725647, |
| "learning_rate": 3.5833333333333335e-05, |
| "loss": 0.7132, |
| "step": 670 |
| }, |
| { |
| "epoch": 12.363636363636363, |
| "grad_norm": 0.5611622333526611, |
| "learning_rate": 3.5e-05, |
| "loss": 0.7224, |
| "step": 680 |
| }, |
| { |
| "epoch": 12.545454545454545, |
| "grad_norm": 0.5733953714370728, |
| "learning_rate": 3.4166666666666666e-05, |
| "loss": 0.7207, |
| "step": 690 |
| }, |
| { |
| "epoch": 12.727272727272727, |
| "grad_norm": 0.5382534265518188, |
| "learning_rate": 3.3333333333333335e-05, |
| "loss": 0.6554, |
| "step": 700 |
| }, |
| { |
| "epoch": 12.909090909090908, |
| "grad_norm": 0.6537340879440308, |
| "learning_rate": 3.2500000000000004e-05, |
| "loss": 0.6237, |
| "step": 710 |
| }, |
| { |
| "epoch": 13.0, |
| "eval_loss": 0.5956346392631531, |
| "eval_runtime": 1.9108, |
| "eval_samples_per_second": 50.765, |
| "eval_steps_per_second": 3.663, |
| "step": 715 |
| }, |
| { |
| "epoch": 13.090909090909092, |
| "grad_norm": 0.5932813882827759, |
| "learning_rate": 3.1666666666666666e-05, |
| "loss": 0.7095, |
| "step": 720 |
| }, |
| { |
| "epoch": 13.272727272727273, |
| "grad_norm": 0.5455180406570435, |
| "learning_rate": 3.0833333333333335e-05, |
| "loss": 0.6415, |
| "step": 730 |
| }, |
| { |
| "epoch": 13.454545454545455, |
| "grad_norm": 0.548633337020874, |
| "learning_rate": 3e-05, |
| "loss": 0.6528, |
| "step": 740 |
| }, |
| { |
| "epoch": 13.636363636363637, |
| "grad_norm": 0.5608177185058594, |
| "learning_rate": 2.916666666666667e-05, |
| "loss": 0.6676, |
| "step": 750 |
| }, |
| { |
| "epoch": 13.818181818181818, |
| "grad_norm": 0.5515778660774231, |
| "learning_rate": 2.8333333333333335e-05, |
| "loss": 0.6927, |
| "step": 760 |
| }, |
| { |
| "epoch": 14.0, |
| "grad_norm": 1.0543361902236938, |
| "learning_rate": 2.7500000000000004e-05, |
| "loss": 0.6932, |
| "step": 770 |
| }, |
| { |
| "epoch": 14.0, |
| "eval_loss": 0.5847615599632263, |
| "eval_runtime": 1.9117, |
| "eval_samples_per_second": 50.741, |
| "eval_steps_per_second": 3.662, |
| "step": 770 |
| }, |
| { |
| "epoch": 14.181818181818182, |
| "grad_norm": 0.5957939624786377, |
| "learning_rate": 2.6666666666666667e-05, |
| "loss": 0.6294, |
| "step": 780 |
| }, |
| { |
| "epoch": 14.363636363636363, |
| "grad_norm": 0.495108962059021, |
| "learning_rate": 2.5833333333333336e-05, |
| "loss": 0.6384, |
| "step": 790 |
| }, |
| { |
| "epoch": 14.545454545454545, |
| "grad_norm": 0.45685040950775146, |
| "learning_rate": 2.5e-05, |
| "loss": 0.6492, |
| "step": 800 |
| }, |
| { |
| "epoch": 14.727272727272727, |
| "grad_norm": 0.5110467672348022, |
| "learning_rate": 2.4166666666666667e-05, |
| "loss": 0.6384, |
| "step": 810 |
| }, |
| { |
| "epoch": 14.909090909090908, |
| "grad_norm": 0.48996323347091675, |
| "learning_rate": 2.3333333333333336e-05, |
| "loss": 0.6802, |
| "step": 820 |
| }, |
| { |
| "epoch": 15.0, |
| "eval_loss": 0.5759454965591431, |
| "eval_runtime": 1.9161, |
| "eval_samples_per_second": 50.624, |
| "eval_steps_per_second": 3.653, |
| "step": 825 |
| }, |
| { |
| "epoch": 15.090909090909092, |
| "grad_norm": 0.5354136228561401, |
| "learning_rate": 2.25e-05, |
| "loss": 0.6623, |
| "step": 830 |
| }, |
| { |
| "epoch": 15.272727272727273, |
| "grad_norm": 0.5101513266563416, |
| "learning_rate": 2.1666666666666667e-05, |
| "loss": 0.664, |
| "step": 840 |
| }, |
| { |
| "epoch": 15.454545454545455, |
| "grad_norm": 0.593004047870636, |
| "learning_rate": 2.0833333333333336e-05, |
| "loss": 0.6559, |
| "step": 850 |
| }, |
| { |
| "epoch": 15.636363636363637, |
| "grad_norm": 0.5267238020896912, |
| "learning_rate": 2e-05, |
| "loss": 0.6319, |
| "step": 860 |
| }, |
| { |
| "epoch": 15.818181818181818, |
| "grad_norm": 0.5737471580505371, |
| "learning_rate": 1.9166666666666667e-05, |
| "loss": 0.5883, |
| "step": 870 |
| }, |
| { |
| "epoch": 16.0, |
| "grad_norm": 0.805059015750885, |
| "learning_rate": 1.8333333333333333e-05, |
| "loss": 0.654, |
| "step": 880 |
| }, |
| { |
| "epoch": 16.0, |
| "eval_loss": 0.5736147165298462, |
| "eval_runtime": 1.9086, |
| "eval_samples_per_second": 50.824, |
| "eval_steps_per_second": 3.668, |
| "step": 880 |
| }, |
| { |
| "epoch": 16.181818181818183, |
| "grad_norm": 0.5428951978683472, |
| "learning_rate": 1.75e-05, |
| "loss": 0.613, |
| "step": 890 |
| }, |
| { |
| "epoch": 16.363636363636363, |
| "grad_norm": 0.538959264755249, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 0.6214, |
| "step": 900 |
| }, |
| { |
| "epoch": 16.545454545454547, |
| "grad_norm": 0.6322575211524963, |
| "learning_rate": 1.5833333333333333e-05, |
| "loss": 0.6162, |
| "step": 910 |
| }, |
| { |
| "epoch": 16.727272727272727, |
| "grad_norm": 0.6376714110374451, |
| "learning_rate": 1.5e-05, |
| "loss": 0.6161, |
| "step": 920 |
| }, |
| { |
| "epoch": 16.90909090909091, |
| "grad_norm": 0.5397374033927917, |
| "learning_rate": 1.4166666666666668e-05, |
| "loss": 0.6439, |
| "step": 930 |
| }, |
| { |
| "epoch": 17.0, |
| "eval_loss": 0.5707854628562927, |
| "eval_runtime": 1.9113, |
| "eval_samples_per_second": 50.752, |
| "eval_steps_per_second": 3.663, |
| "step": 935 |
| }, |
| { |
| "epoch": 17.09090909090909, |
| "grad_norm": 0.43974068760871887, |
| "learning_rate": 1.3333333333333333e-05, |
| "loss": 0.6028, |
| "step": 940 |
| }, |
| { |
| "epoch": 17.272727272727273, |
| "grad_norm": 0.5189619660377502, |
| "learning_rate": 1.25e-05, |
| "loss": 0.5696, |
| "step": 950 |
| }, |
| { |
| "epoch": 17.454545454545453, |
| "grad_norm": 0.5635692477226257, |
| "learning_rate": 1.1666666666666668e-05, |
| "loss": 0.63, |
| "step": 960 |
| }, |
| { |
| "epoch": 17.636363636363637, |
| "grad_norm": 0.5299723744392395, |
| "learning_rate": 1.0833333333333334e-05, |
| "loss": 0.6387, |
| "step": 970 |
| }, |
| { |
| "epoch": 17.818181818181817, |
| "grad_norm": 0.6093310117721558, |
| "learning_rate": 1e-05, |
| "loss": 0.6522, |
| "step": 980 |
| }, |
| { |
| "epoch": 18.0, |
| "grad_norm": 0.7209185361862183, |
| "learning_rate": 9.166666666666666e-06, |
| "loss": 0.6005, |
| "step": 990 |
| }, |
| { |
| "epoch": 18.0, |
| "eval_loss": 0.5690126419067383, |
| "eval_runtime": 1.9113, |
| "eval_samples_per_second": 50.75, |
| "eval_steps_per_second": 3.662, |
| "step": 990 |
| }, |
| { |
| "epoch": 18.181818181818183, |
| "grad_norm": 0.523536205291748, |
| "learning_rate": 8.333333333333334e-06, |
| "loss": 0.6112, |
| "step": 1000 |
| }, |
| { |
| "epoch": 18.363636363636363, |
| "grad_norm": 0.5092321634292603, |
| "learning_rate": 7.5e-06, |
| "loss": 0.6261, |
| "step": 1010 |
| }, |
| { |
| "epoch": 18.545454545454547, |
| "grad_norm": 0.5760687589645386, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 0.591, |
| "step": 1020 |
| }, |
| { |
| "epoch": 18.727272727272727, |
| "grad_norm": 0.5239204168319702, |
| "learning_rate": 5.833333333333334e-06, |
| "loss": 0.6091, |
| "step": 1030 |
| }, |
| { |
| "epoch": 18.90909090909091, |
| "grad_norm": 0.5258241891860962, |
| "learning_rate": 5e-06, |
| "loss": 0.6392, |
| "step": 1040 |
| }, |
| { |
| "epoch": 19.0, |
| "eval_loss": 0.5662071108818054, |
| "eval_runtime": 1.9107, |
| "eval_samples_per_second": 50.767, |
| "eval_steps_per_second": 3.664, |
| "step": 1045 |
| }, |
| { |
| "epoch": 19.09090909090909, |
| "grad_norm": 0.5691114664077759, |
| "learning_rate": 4.166666666666667e-06, |
| "loss": 0.5851, |
| "step": 1050 |
| }, |
| { |
| "epoch": 19.272727272727273, |
| "grad_norm": 0.5294829607009888, |
| "learning_rate": 3.3333333333333333e-06, |
| "loss": 0.5744, |
| "step": 1060 |
| }, |
| { |
| "epoch": 19.454545454545453, |
| "grad_norm": 0.5142855644226074, |
| "learning_rate": 2.5e-06, |
| "loss": 0.6367, |
| "step": 1070 |
| }, |
| { |
| "epoch": 19.636363636363637, |
| "grad_norm": 0.6378938555717468, |
| "learning_rate": 1.6666666666666667e-06, |
| "loss": 0.5809, |
| "step": 1080 |
| }, |
| { |
| "epoch": 19.818181818181817, |
| "grad_norm": 0.5766515731811523, |
| "learning_rate": 8.333333333333333e-07, |
| "loss": 0.6091, |
| "step": 1090 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 0.8583983778953552, |
| "learning_rate": 0.0, |
| "loss": 0.6453, |
| "step": 1100 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_loss": 0.5654382109642029, |
| "eval_runtime": 1.9129, |
| "eval_samples_per_second": 50.709, |
| "eval_steps_per_second": 3.659, |
| "step": 1100 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1100, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 20, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.0595867295744e+16, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|