| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 31.0, | |
| "eval_steps": 500, | |
| "global_step": 6200, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 12.375, | |
| "learning_rate": 8e-05, | |
| "loss": 2.9668, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 8.75, | |
| "learning_rate": 8e-05, | |
| "loss": 1.5447, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 18.375, | |
| "learning_rate": 8e-05, | |
| "loss": 1.3061, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 19.5, | |
| "learning_rate": 8e-05, | |
| "loss": 1.2262, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 19.625, | |
| "learning_rate": 8e-05, | |
| "loss": 1.106, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 20.5, | |
| "learning_rate": 8e-05, | |
| "loss": 1.1051, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 1.0603, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 17.625, | |
| "learning_rate": 8e-05, | |
| "loss": 1.0426, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 18.5, | |
| "learning_rate": 8e-05, | |
| "loss": 1.0278, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 8.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.9811, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 8.375, | |
| "learning_rate": 8e-05, | |
| "loss": 1.0554, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 14.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.981, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 13.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.9207, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.9944, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 14.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.7818, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 13.8125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6701, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 7.46875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.8406, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 8.375, | |
| "learning_rate": 8e-05, | |
| "loss": 1.1126, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 7.84375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.9493, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 15.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.8737, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.8025, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6186, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 14.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.7477, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 12.8125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6547, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 8.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.7623, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 7.09375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6258, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 8.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.7048, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 13.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.7504, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 8.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.7502, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 8.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.7787, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 7.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.7807, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.7065, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 7.78125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.7102, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 13.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6821, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5562, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 13.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6497, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 8.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.7489, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 9.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.726, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 8.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5764, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 11.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6491, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5339, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 8.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.7454, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 15.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5912, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 14.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6097, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 7.65625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6417, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 15.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5642, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 14.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6748, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6594, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 13.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6317, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 7.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5976, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "eval_loss": 5.992827415466309, | |
| "eval_runtime": 7.763, | |
| "eval_samples_per_second": 64.408, | |
| "eval_steps_per_second": 64.408, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 13.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6125, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 14.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.621, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 7.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5758, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 11.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5952, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 7.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6743, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 11.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5858, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 7.3125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6662, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 6.78125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6263, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 13.8125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5472, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 11.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.574, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.05, | |
| "grad_norm": 6.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4992, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 6.53125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5446, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 9.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5207, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 12.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5025, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 7.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.524, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 3.3, | |
| "grad_norm": 6.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5135, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 3.35, | |
| "grad_norm": 6.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4906, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 15.8125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4418, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 3.45, | |
| "grad_norm": 7.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5683, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 6.65625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5697, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 3.55, | |
| "grad_norm": 7.40625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4701, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 11.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5056, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 3.65, | |
| "grad_norm": 13.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.498, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "grad_norm": 12.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5773, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5411, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 13.8125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.6051, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "grad_norm": 12.8125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5179, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 6.65625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5243, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 3.95, | |
| "grad_norm": 11.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5162, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 7.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5591, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 4.05, | |
| "grad_norm": 10.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4792, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 4.1, | |
| "grad_norm": 12.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4441, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 4.15, | |
| "grad_norm": 6.46875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4185, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 4.2, | |
| "grad_norm": 7.8125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4334, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "grad_norm": 10.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4924, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 4.3, | |
| "grad_norm": 9.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4366, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 4.35, | |
| "grad_norm": 14.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4441, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "grad_norm": 7.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5339, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 4.45, | |
| "grad_norm": 13.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4497, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 6.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4611, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 4.55, | |
| "grad_norm": 13.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4251, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 4.6, | |
| "grad_norm": 12.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4453, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 4.65, | |
| "grad_norm": 14.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4904, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 4.7, | |
| "grad_norm": 14.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.526, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4325, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 6.46875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3979, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 4.85, | |
| "grad_norm": 12.3125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4622, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 4.9, | |
| "grad_norm": 12.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4522, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 4.95, | |
| "grad_norm": 13.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4131, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 13.5625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.464, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 6.443065166473389, | |
| "eval_runtime": 7.5814, | |
| "eval_samples_per_second": 65.951, | |
| "eval_steps_per_second": 65.951, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 5.05, | |
| "grad_norm": 6.90625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4061, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 5.1, | |
| "grad_norm": 6.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.384, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 5.15, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3238, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 5.2, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4739, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 5.25, | |
| "grad_norm": 4.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3773, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 5.3, | |
| "grad_norm": 6.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3674, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 5.35, | |
| "grad_norm": 5.71875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3371, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 5.4, | |
| "grad_norm": 14.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4022, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 5.45, | |
| "grad_norm": 5.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3692, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 5.5, | |
| "grad_norm": 12.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3847, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 5.55, | |
| "grad_norm": 13.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4535, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 5.6, | |
| "grad_norm": 7.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4559, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 5.65, | |
| "grad_norm": 13.3125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3967, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 5.7, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3578, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 5.75, | |
| "grad_norm": 5.53125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4006, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 5.8, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3664, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 5.85, | |
| "grad_norm": 13.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3775, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 5.9, | |
| "grad_norm": 13.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4148, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 5.95, | |
| "grad_norm": 5.40625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3621, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 13.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5599, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 6.05, | |
| "grad_norm": 7.03125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4049, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 6.1, | |
| "grad_norm": 13.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4218, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 6.15, | |
| "grad_norm": 11.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3504, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 6.2, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4073, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 6.25, | |
| "grad_norm": 7.3125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3338, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 6.3, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3149, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 6.35, | |
| "grad_norm": 7.21875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3903, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 6.4, | |
| "grad_norm": 4.15625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3733, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 6.45, | |
| "grad_norm": 6.21875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3173, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 6.5, | |
| "grad_norm": 4.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4605, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 6.55, | |
| "grad_norm": 13.3125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4063, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 6.6, | |
| "grad_norm": 12.5625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3575, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 6.65, | |
| "grad_norm": 5.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3622, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 6.7, | |
| "grad_norm": 13.5625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3387, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 6.75, | |
| "grad_norm": 4.96875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3373, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 6.8, | |
| "grad_norm": 6.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3143, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 6.85, | |
| "grad_norm": 13.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3434, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 6.9, | |
| "grad_norm": 4.34375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3255, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 6.95, | |
| "grad_norm": 6.65625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.357, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "grad_norm": 5.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3876, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 7.05, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3567, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 7.1, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3101, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 7.15, | |
| "grad_norm": 7.03125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3498, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 7.2, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2873, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 7.25, | |
| "grad_norm": 3.609375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2651, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 7.3, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2505, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 7.35, | |
| "grad_norm": 6.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2421, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 7.4, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3338, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 7.45, | |
| "grad_norm": 12.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4454, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 7.5, | |
| "grad_norm": 4.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4056, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 7.5, | |
| "eval_loss": 7.630816459655762, | |
| "eval_runtime": 7.5449, | |
| "eval_samples_per_second": 66.27, | |
| "eval_steps_per_second": 66.27, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 7.55, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2285, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 7.6, | |
| "grad_norm": 14.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3527, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 7.65, | |
| "grad_norm": 13.3125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4113, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 7.7, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4207, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 7.75, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3073, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 7.8, | |
| "grad_norm": 9.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3525, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 7.85, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4336, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 7.9, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3153, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 7.95, | |
| "grad_norm": 12.3125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.311, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2972, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 8.05, | |
| "grad_norm": 12.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3153, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 8.1, | |
| "grad_norm": 4.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3684, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 8.15, | |
| "grad_norm": 11.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1658, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 8.2, | |
| "grad_norm": 11.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.317, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 8.25, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2382, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 8.3, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3048, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 8.35, | |
| "grad_norm": 13.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3565, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 8.4, | |
| "grad_norm": 11.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2439, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 8.45, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3198, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 8.5, | |
| "grad_norm": 11.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3728, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 8.55, | |
| "grad_norm": 5.21875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3061, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 8.6, | |
| "grad_norm": 12.8125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5014, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 8.65, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2326, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 8.7, | |
| "grad_norm": 4.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2521, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 8.75, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3909, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 8.8, | |
| "grad_norm": 13.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3824, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 8.85, | |
| "grad_norm": 13.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3043, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 8.9, | |
| "grad_norm": 7.21875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3079, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 8.95, | |
| "grad_norm": 2.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.336, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 13.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3156, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 9.05, | |
| "grad_norm": 13.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.258, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 9.1, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3333, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 9.15, | |
| "grad_norm": 12.5625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4112, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 9.2, | |
| "grad_norm": 3.890625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3298, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 9.25, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3552, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 9.3, | |
| "grad_norm": 11.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.196, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 9.35, | |
| "grad_norm": 5.09375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2146, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 9.4, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2204, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 9.45, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3717, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 9.5, | |
| "grad_norm": 11.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2952, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 9.55, | |
| "grad_norm": 11.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3634, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 9.6, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.327, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 9.65, | |
| "grad_norm": 11.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2675, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 9.7, | |
| "grad_norm": 13.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.289, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 9.75, | |
| "grad_norm": 12.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4264, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 9.8, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2596, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 9.85, | |
| "grad_norm": 3.546875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2564, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 9.9, | |
| "grad_norm": 4.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3128, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 9.95, | |
| "grad_norm": 13.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4754, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 13.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3452, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 7.939090728759766, | |
| "eval_runtime": 7.5613, | |
| "eval_samples_per_second": 66.126, | |
| "eval_steps_per_second": 66.126, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 10.05, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3269, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 10.1, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3554, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 10.15, | |
| "grad_norm": 11.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3173, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 10.2, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2155, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 10.25, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2992, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 10.3, | |
| "grad_norm": 13.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4483, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 10.35, | |
| "grad_norm": 10.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4116, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 10.4, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2901, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 10.45, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3026, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 10.5, | |
| "grad_norm": 13.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3183, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 10.55, | |
| "grad_norm": 12.5625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.263, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 10.6, | |
| "grad_norm": 11.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2742, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 10.65, | |
| "grad_norm": 3.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3583, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 10.7, | |
| "grad_norm": 12.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.244, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 10.75, | |
| "grad_norm": 13.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2986, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 10.8, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2685, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 10.85, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.239, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 10.9, | |
| "grad_norm": 2.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.25, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 10.95, | |
| "grad_norm": 2.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2264, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "grad_norm": 13.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.403, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 11.05, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3125, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 11.1, | |
| "grad_norm": 53.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3731, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 11.15, | |
| "grad_norm": 11.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2595, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 11.2, | |
| "grad_norm": 3.953125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3029, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 11.25, | |
| "grad_norm": 13.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2089, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 11.3, | |
| "grad_norm": 4.90625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3185, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 11.35, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.254, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 11.4, | |
| "grad_norm": 11.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2984, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 11.45, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3858, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 11.5, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3145, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 11.55, | |
| "grad_norm": 12.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3565, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 11.6, | |
| "grad_norm": 11.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3711, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 11.65, | |
| "grad_norm": 2.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3212, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 11.7, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2068, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 11.75, | |
| "grad_norm": 13.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3058, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 11.8, | |
| "grad_norm": 3.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.201, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 11.85, | |
| "grad_norm": 12.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3403, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 11.9, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2613, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 11.95, | |
| "grad_norm": 11.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3407, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "grad_norm": 12.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3051, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 12.05, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2997, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 12.1, | |
| "grad_norm": 11.3125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3016, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 12.15, | |
| "grad_norm": 12.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3798, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 12.2, | |
| "grad_norm": 2.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2223, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 12.25, | |
| "grad_norm": 13.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2249, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 12.3, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2264, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 12.35, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3619, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 12.4, | |
| "grad_norm": 13.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3665, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 12.45, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3139, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 12.5, | |
| "grad_norm": 12.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4703, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 12.5, | |
| "eval_loss": 7.897547721862793, | |
| "eval_runtime": 7.6709, | |
| "eval_samples_per_second": 65.181, | |
| "eval_steps_per_second": 65.181, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 12.55, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2518, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 12.6, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2905, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 12.65, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2674, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 12.7, | |
| "grad_norm": 12.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2041, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 12.75, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.258, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 12.8, | |
| "grad_norm": 13.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2959, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 12.85, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2758, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 12.9, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2456, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 12.95, | |
| "grad_norm": 11.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3101, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2981, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 13.05, | |
| "grad_norm": 1.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2467, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 13.1, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.249, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 13.15, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.382, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 13.2, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3436, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 13.25, | |
| "grad_norm": 2.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2819, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 13.3, | |
| "grad_norm": 3.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2695, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 13.35, | |
| "grad_norm": 1.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1805, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 13.4, | |
| "grad_norm": 11.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3324, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 13.45, | |
| "grad_norm": 12.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4207, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 13.5, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.36, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 13.55, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3061, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 13.6, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2009, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 13.65, | |
| "grad_norm": 10.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3731, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 13.7, | |
| "grad_norm": 11.8125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2775, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 13.75, | |
| "grad_norm": 12.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2231, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 13.8, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3238, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 13.85, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2291, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 13.9, | |
| "grad_norm": 13.3125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2505, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 13.95, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.352, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "grad_norm": 13.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1966, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 14.05, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2186, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 14.1, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2466, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 14.15, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1781, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 14.2, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3363, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 14.25, | |
| "grad_norm": 12.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3329, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 14.3, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3858, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 14.35, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2778, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 14.4, | |
| "grad_norm": 1.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1987, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 14.45, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1794, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 14.5, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2878, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 14.55, | |
| "grad_norm": 13.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3154, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 14.6, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3468, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 14.65, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2626, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 14.7, | |
| "grad_norm": 13.3125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2456, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 14.75, | |
| "grad_norm": 2.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2751, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 14.8, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2335, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 14.85, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3561, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 14.9, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3794, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 14.95, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2901, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 12.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2891, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 7.215864181518555, | |
| "eval_runtime": 7.9685, | |
| "eval_samples_per_second": 62.747, | |
| "eval_steps_per_second": 62.747, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 15.05, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1937, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 15.1, | |
| "grad_norm": 13.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3008, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 15.15, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1713, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 15.2, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2894, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 15.25, | |
| "grad_norm": 12.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.404, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 15.3, | |
| "grad_norm": 10.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3772, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 15.35, | |
| "grad_norm": 13.5625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2001, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 15.4, | |
| "grad_norm": 12.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2303, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 15.45, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2211, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 15.5, | |
| "grad_norm": 11.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3988, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 15.55, | |
| "grad_norm": 11.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1864, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 15.6, | |
| "grad_norm": 13.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3078, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 15.65, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2891, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 15.7, | |
| "grad_norm": 11.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4039, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 15.75, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2887, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 15.8, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3268, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 15.85, | |
| "grad_norm": 12.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2726, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 15.9, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1886, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 15.95, | |
| "grad_norm": 12.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3537, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2877, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 16.05, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2848, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 16.1, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1388, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 16.15, | |
| "grad_norm": 13.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2291, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 16.2, | |
| "grad_norm": 12.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3616, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 16.25, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2676, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 16.3, | |
| "grad_norm": 12.8125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4425, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 16.35, | |
| "grad_norm": 13.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3095, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 16.4, | |
| "grad_norm": 2.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2071, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 16.45, | |
| "grad_norm": 12.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3159, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 16.5, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3892, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 16.55, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2224, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 16.6, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2233, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 16.65, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3661, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 16.7, | |
| "grad_norm": 12.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3293, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 16.75, | |
| "grad_norm": 12.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3403, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 16.8, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2894, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 16.85, | |
| "grad_norm": 13.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2419, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 16.9, | |
| "grad_norm": 13.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3061, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 16.95, | |
| "grad_norm": 6.09375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2204, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2497, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 17.05, | |
| "grad_norm": 10.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2295, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 17.1, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3195, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 17.15, | |
| "grad_norm": 13.3125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2639, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 17.2, | |
| "grad_norm": 12.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.283, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 17.25, | |
| "grad_norm": 13.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3231, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 17.3, | |
| "grad_norm": 13.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.204, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 17.35, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2669, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 17.4, | |
| "grad_norm": 11.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2475, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 17.45, | |
| "grad_norm": 12.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3369, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 17.5, | |
| "grad_norm": 2.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3026, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 17.5, | |
| "eval_loss": 8.5922269821167, | |
| "eval_runtime": 7.9526, | |
| "eval_samples_per_second": 62.873, | |
| "eval_steps_per_second": 62.873, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 17.55, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3049, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 17.6, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2318, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 17.65, | |
| "grad_norm": 13.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3433, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 17.7, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.265, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 17.75, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3195, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 17.8, | |
| "grad_norm": 11.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3147, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 17.85, | |
| "grad_norm": 12.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2251, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 17.9, | |
| "grad_norm": 13.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1943, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 17.95, | |
| "grad_norm": 13.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3524, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2027, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 18.05, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1753, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 18.1, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3361, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 18.15, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2835, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 18.2, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.247, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 18.25, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2648, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 18.3, | |
| "grad_norm": 12.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2733, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 18.35, | |
| "grad_norm": 12.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2856, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 18.4, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2245, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 18.45, | |
| "grad_norm": 13.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2948, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 18.5, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2622, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 18.55, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3569, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 18.6, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2437, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 18.65, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3894, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 18.7, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2614, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 18.75, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2579, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 18.8, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2146, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 18.85, | |
| "grad_norm": 12.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3762, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 18.9, | |
| "grad_norm": 11.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3321, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 18.95, | |
| "grad_norm": 12.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3717, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "grad_norm": 13.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2763, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 19.05, | |
| "grad_norm": 12.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1778, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 19.1, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1328, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 19.15, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1773, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 19.2, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2006, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 19.25, | |
| "grad_norm": 13.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2921, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 19.3, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1976, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 19.35, | |
| "grad_norm": 13.8125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3143, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 19.4, | |
| "grad_norm": 13.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2905, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 19.45, | |
| "grad_norm": 1.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3518, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 19.5, | |
| "grad_norm": 12.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3462, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 19.55, | |
| "grad_norm": 13.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2707, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 19.6, | |
| "grad_norm": 5.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3661, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 19.65, | |
| "grad_norm": 12.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2653, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 19.7, | |
| "grad_norm": 11.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2482, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 19.75, | |
| "grad_norm": 14.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3152, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 19.8, | |
| "grad_norm": 12.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4428, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 19.85, | |
| "grad_norm": 12.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2869, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 19.9, | |
| "grad_norm": 13.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3191, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 19.95, | |
| "grad_norm": 4.90625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3486, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 12.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3294, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 8.210659980773926, | |
| "eval_runtime": 7.5705, | |
| "eval_samples_per_second": 66.046, | |
| "eval_steps_per_second": 66.046, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 20.05, | |
| "grad_norm": 11.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2655, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 20.1, | |
| "grad_norm": 13.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2366, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 20.15, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.306, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 20.2, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1857, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 20.25, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2701, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 20.3, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2085, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 20.35, | |
| "grad_norm": 11.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2882, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 20.4, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1754, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 20.45, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2088, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 20.5, | |
| "grad_norm": 12.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2538, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 20.55, | |
| "grad_norm": 10.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.27, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 20.6, | |
| "grad_norm": 13.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3837, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 20.65, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2489, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 20.7, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2951, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 20.75, | |
| "grad_norm": 10.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3018, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 20.8, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3647, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 20.85, | |
| "grad_norm": 12.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2971, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 20.9, | |
| "grad_norm": 12.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2797, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 20.95, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2959, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3338, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 21.05, | |
| "grad_norm": 12.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3054, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 21.1, | |
| "grad_norm": 12.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2705, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 21.15, | |
| "grad_norm": 11.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3126, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 21.2, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1907, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 21.25, | |
| "grad_norm": 12.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3798, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 21.3, | |
| "grad_norm": 11.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2596, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 21.35, | |
| "grad_norm": 13.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2034, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 21.4, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3202, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 21.45, | |
| "grad_norm": 12.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3012, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 21.5, | |
| "grad_norm": 1.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2908, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 21.55, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2747, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 21.6, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2408, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 21.65, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2384, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 21.7, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3229, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 21.75, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.273, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 21.8, | |
| "grad_norm": 13.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2094, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 21.85, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3144, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 21.9, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.296, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 21.95, | |
| "grad_norm": 12.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3715, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2191, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 22.05, | |
| "grad_norm": 3.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2787, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 22.1, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1906, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 22.15, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2839, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 22.2, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2678, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 22.25, | |
| "grad_norm": 13.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1711, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 22.3, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2485, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 22.35, | |
| "grad_norm": 12.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2903, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 22.4, | |
| "grad_norm": 12.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3669, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 22.45, | |
| "grad_norm": 12.8125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3196, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 22.5, | |
| "grad_norm": 12.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3304, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 22.5, | |
| "eval_loss": 8.893681526184082, | |
| "eval_runtime": 7.8064, | |
| "eval_samples_per_second": 64.05, | |
| "eval_steps_per_second": 64.05, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 22.55, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2211, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 22.6, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2872, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 22.65, | |
| "grad_norm": 1.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2652, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 22.7, | |
| "grad_norm": 13.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3234, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 22.75, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3285, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 22.8, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2371, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 22.85, | |
| "grad_norm": 1.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1858, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 22.9, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2691, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 22.95, | |
| "grad_norm": 4.84375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4012, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "grad_norm": 12.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3667, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 23.05, | |
| "grad_norm": 11.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.298, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 23.1, | |
| "grad_norm": 12.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3351, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 23.15, | |
| "grad_norm": 12.5625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.298, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 23.2, | |
| "grad_norm": 0.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.0958, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 23.25, | |
| "grad_norm": 0.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2265, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 23.3, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3006, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 23.35, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3545, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 23.4, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.276, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 23.45, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2633, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 23.5, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2083, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 23.55, | |
| "grad_norm": 2.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3108, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 23.6, | |
| "grad_norm": 12.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3238, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 23.65, | |
| "grad_norm": 12.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3359, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 23.7, | |
| "grad_norm": 11.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3277, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 23.75, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2861, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 23.8, | |
| "grad_norm": 12.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2311, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 23.85, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3383, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 23.9, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2087, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 23.95, | |
| "grad_norm": 12.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3536, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2426, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 24.05, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2743, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 24.1, | |
| "grad_norm": 13.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2697, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 24.15, | |
| "grad_norm": 12.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1881, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 24.2, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2347, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 24.25, | |
| "grad_norm": 12.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2888, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 24.3, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1057, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 24.35, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2993, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 24.4, | |
| "grad_norm": 12.8125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3406, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 24.45, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2368, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 24.5, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2919, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 24.55, | |
| "grad_norm": 12.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3382, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 24.6, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2537, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 24.65, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2991, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 24.7, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2682, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 24.75, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2845, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 24.8, | |
| "grad_norm": 12.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3295, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 24.85, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3334, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 24.9, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3461, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 24.95, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.211, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "grad_norm": 12.8125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2913, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "eval_loss": 8.996872901916504, | |
| "eval_runtime": 7.5487, | |
| "eval_samples_per_second": 66.236, | |
| "eval_steps_per_second": 66.236, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 25.05, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2376, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 25.1, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.175, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 25.15, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3706, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 25.2, | |
| "grad_norm": 12.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3528, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 25.25, | |
| "grad_norm": 12.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.44, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 25.3, | |
| "grad_norm": 12.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1827, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 25.35, | |
| "grad_norm": 13.3125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3465, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 25.4, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2745, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 25.45, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1351, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 25.5, | |
| "grad_norm": 13.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3016, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 25.55, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1977, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 25.6, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1249, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 25.65, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3311, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 25.7, | |
| "grad_norm": 1.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2879, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 25.75, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1996, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 25.8, | |
| "grad_norm": 13.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3853, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 25.85, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2886, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 25.9, | |
| "grad_norm": 13.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3065, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 25.95, | |
| "grad_norm": 12.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2465, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2765, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 26.05, | |
| "grad_norm": 2.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2821, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 26.1, | |
| "grad_norm": 11.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2685, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 26.15, | |
| "grad_norm": 11.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2782, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 26.2, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2616, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 26.25, | |
| "grad_norm": 13.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3842, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 26.3, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2202, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 26.35, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1973, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 26.4, | |
| "grad_norm": 13.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3689, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 26.45, | |
| "grad_norm": 12.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2377, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 26.5, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2211, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 26.55, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3614, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 26.6, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2357, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 26.65, | |
| "grad_norm": 13.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2854, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 26.7, | |
| "grad_norm": 10.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2299, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 26.75, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3128, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 26.8, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2182, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 26.85, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1037, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 26.9, | |
| "grad_norm": 12.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2412, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 26.95, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3913, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "grad_norm": 11.3125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2973, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 27.05, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2552, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 27.1, | |
| "grad_norm": 12.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1857, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 27.15, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2013, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 27.2, | |
| "grad_norm": 12.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3292, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 27.25, | |
| "grad_norm": 1.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3218, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 27.3, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2718, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 27.35, | |
| "grad_norm": 12.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2765, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 27.4, | |
| "grad_norm": 12.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3151, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 27.45, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2165, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 27.5, | |
| "grad_norm": 2.0, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2343, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 27.5, | |
| "eval_loss": 8.912619590759277, | |
| "eval_runtime": 7.8779, | |
| "eval_samples_per_second": 63.469, | |
| "eval_steps_per_second": 63.469, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 27.55, | |
| "grad_norm": 11.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3697, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 27.6, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1904, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 27.65, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3406, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 27.7, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2541, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 27.75, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2346, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 27.8, | |
| "grad_norm": 7.90625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2101, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 27.85, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2812, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 27.9, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2815, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 27.95, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2582, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "grad_norm": 11.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4554, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 28.05, | |
| "grad_norm": 10.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3659, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 28.1, | |
| "grad_norm": 10.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1781, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 28.15, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2764, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 28.2, | |
| "grad_norm": 10.6875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3022, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 28.25, | |
| "grad_norm": 13.5, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2352, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 28.3, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2052, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 28.35, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2885, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 28.4, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3102, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 28.45, | |
| "grad_norm": 14.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2526, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 28.5, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1431, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 28.55, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2638, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 28.6, | |
| "grad_norm": 13.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3692, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 28.65, | |
| "grad_norm": 3.625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3725, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 28.7, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2955, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 28.75, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1972, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 28.8, | |
| "grad_norm": 3.046875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3902, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 28.85, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2459, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 28.9, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2117, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 28.95, | |
| "grad_norm": 11.125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3475, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.342, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 29.05, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1948, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 29.1, | |
| "grad_norm": 12.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3632, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 29.15, | |
| "grad_norm": 12.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3307, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 29.2, | |
| "grad_norm": 11.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2275, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 29.25, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2147, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 29.3, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2111, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 29.35, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3288, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 29.4, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1918, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 29.45, | |
| "grad_norm": 14.8125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2553, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 29.5, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2064, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 29.55, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3478, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 29.6, | |
| "grad_norm": 13.4375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2593, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 29.65, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3139, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 29.7, | |
| "grad_norm": 13.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1809, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 29.75, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.276, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 29.8, | |
| "grad_norm": 12.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3907, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 29.85, | |
| "grad_norm": 13.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3121, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 29.9, | |
| "grad_norm": 11.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3276, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 29.95, | |
| "grad_norm": 11.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2805, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2789, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "eval_loss": 8.706538200378418, | |
| "eval_runtime": 8.1518, | |
| "eval_samples_per_second": 61.336, | |
| "eval_steps_per_second": 61.336, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 30.05, | |
| "grad_norm": 8.25, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2017, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 30.1, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3184, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 30.15, | |
| "grad_norm": 11.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2877, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 30.2, | |
| "grad_norm": 12.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2002, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 30.25, | |
| "grad_norm": 11.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2738, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 30.3, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2625, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 30.35, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2488, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 30.4, | |
| "grad_norm": 13.0625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3019, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 30.45, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2733, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 30.5, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2455, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 30.55, | |
| "grad_norm": 11.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3836, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 30.6, | |
| "grad_norm": 1.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2377, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 30.65, | |
| "grad_norm": 12.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.315, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 30.7, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.3438, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 30.75, | |
| "grad_norm": 12.875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.4268, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 30.8, | |
| "grad_norm": 13.375, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2822, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 30.85, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2607, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 30.9, | |
| "grad_norm": 11.75, | |
| "learning_rate": 8e-05, | |
| "loss": 0.1853, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 30.95, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2499, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 8e-05, | |
| "loss": 0.2878, | |
| "step": 6200 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 8000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 40, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3565855315034112.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |