| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.16666666666666666, | |
| "eval_steps": 1000, | |
| "global_step": 1000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 21.75, | |
| "learning_rate": 3.7125e-06, | |
| "loss": 1.6333, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 11.75, | |
| "learning_rate": 3.675e-06, | |
| "loss": 1.8863, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 30.25, | |
| "learning_rate": 3.6375e-06, | |
| "loss": 2.1646, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 20.0, | |
| "learning_rate": 3.6e-06, | |
| "loss": 2.0098, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 21.125, | |
| "learning_rate": 3.5624999999999998e-06, | |
| "loss": 1.7331, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 18.125, | |
| "learning_rate": 3.5249999999999997e-06, | |
| "loss": 1.8913, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 17.5, | |
| "learning_rate": 3.4875e-06, | |
| "loss": 1.8686, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 37.0, | |
| "learning_rate": 3.4500000000000004e-06, | |
| "loss": 1.6756, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 21.125, | |
| "learning_rate": 3.4125000000000004e-06, | |
| "loss": 2.0029, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 14.0, | |
| "learning_rate": 3.3750000000000003e-06, | |
| "loss": 1.4412, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 12.75, | |
| "learning_rate": 3.3375000000000002e-06, | |
| "loss": 2.0556, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 31.625, | |
| "learning_rate": 3.3e-06, | |
| "loss": 1.9298, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 15.0625, | |
| "learning_rate": 3.2625e-06, | |
| "loss": 2.0686, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 15.75, | |
| "learning_rate": 3.225e-06, | |
| "loss": 1.6107, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 28.5, | |
| "learning_rate": 3.1875e-06, | |
| "loss": 2.4047, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 17.375, | |
| "learning_rate": 3.15e-06, | |
| "loss": 1.5916, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 23.0, | |
| "learning_rate": 3.1125e-06, | |
| "loss": 1.9109, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 27.5, | |
| "learning_rate": 3.0749999999999998e-06, | |
| "loss": 2.1359, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 18.0, | |
| "learning_rate": 3.0375e-06, | |
| "loss": 2.1144, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 14.4375, | |
| "learning_rate": 3e-06, | |
| "loss": 1.6672, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 22.125, | |
| "learning_rate": 2.9625e-06, | |
| "loss": 1.6704, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 28.5, | |
| "learning_rate": 2.9250000000000004e-06, | |
| "loss": 1.9675, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 13.0, | |
| "learning_rate": 2.8875000000000003e-06, | |
| "loss": 1.9873, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 13.8125, | |
| "learning_rate": 2.8500000000000002e-06, | |
| "loss": 1.7557, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 19.125, | |
| "learning_rate": 2.8125e-06, | |
| "loss": 1.7556, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 16.5, | |
| "learning_rate": 2.775e-06, | |
| "loss": 2.0693, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 18.25, | |
| "learning_rate": 2.7375e-06, | |
| "loss": 1.6011, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 21.875, | |
| "learning_rate": 2.7e-06, | |
| "loss": 1.6058, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 18.0, | |
| "learning_rate": 2.6625e-06, | |
| "loss": 1.8085, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 14.875, | |
| "learning_rate": 2.625e-06, | |
| "loss": 1.777, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 20.75, | |
| "learning_rate": 2.5875e-06, | |
| "loss": 1.8708, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 14.625, | |
| "learning_rate": 2.55e-06, | |
| "loss": 1.5629, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 16.125, | |
| "learning_rate": 2.5125e-06, | |
| "loss": 1.7416, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 19.5, | |
| "learning_rate": 2.475e-06, | |
| "loss": 1.9123, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 17.375, | |
| "learning_rate": 2.4375e-06, | |
| "loss": 1.6956, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 30.125, | |
| "learning_rate": 2.4000000000000003e-06, | |
| "loss": 1.863, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 24.25, | |
| "learning_rate": 2.3625000000000003e-06, | |
| "loss": 1.7432, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 15.5625, | |
| "learning_rate": 2.325e-06, | |
| "loss": 1.6194, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 20.75, | |
| "learning_rate": 2.2875e-06, | |
| "loss": 2.1294, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 24.125, | |
| "learning_rate": 2.25e-06, | |
| "loss": 1.8663, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 22.625, | |
| "learning_rate": 2.2125e-06, | |
| "loss": 1.8445, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 23.375, | |
| "learning_rate": 2.175e-06, | |
| "loss": 2.1122, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 20.875, | |
| "learning_rate": 2.1375e-06, | |
| "loss": 1.8364, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 20.0, | |
| "learning_rate": 2.1000000000000002e-06, | |
| "loss": 1.6375, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 18.25, | |
| "learning_rate": 2.0625e-06, | |
| "loss": 1.5882, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 34.25, | |
| "learning_rate": 2.025e-06, | |
| "loss": 2.0043, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 28.75, | |
| "learning_rate": 1.9875e-06, | |
| "loss": 1.9231, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 17.5, | |
| "learning_rate": 1.95e-06, | |
| "loss": 2.0631, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 17.125, | |
| "learning_rate": 1.9125e-06, | |
| "loss": 2.0359, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 21.0, | |
| "learning_rate": 1.875e-06, | |
| "loss": 1.8175, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 17.5, | |
| "learning_rate": 1.8375e-06, | |
| "loss": 2.0844, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 20.25, | |
| "learning_rate": 1.8e-06, | |
| "loss": 1.6329, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 12.375, | |
| "learning_rate": 1.7624999999999999e-06, | |
| "loss": 1.9352, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 21.375, | |
| "learning_rate": 1.7250000000000002e-06, | |
| "loss": 1.8842, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 16.375, | |
| "learning_rate": 1.6875000000000001e-06, | |
| "loss": 2.1543, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 15.625, | |
| "learning_rate": 1.65e-06, | |
| "loss": 2.0122, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 16.0, | |
| "learning_rate": 1.6125e-06, | |
| "loss": 2.3606, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 22.375, | |
| "learning_rate": 1.575e-06, | |
| "loss": 1.8354, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 18.75, | |
| "learning_rate": 1.5374999999999999e-06, | |
| "loss": 1.962, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 22.625, | |
| "learning_rate": 1.5e-06, | |
| "loss": 1.6487, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 17.875, | |
| "learning_rate": 1.4625000000000002e-06, | |
| "loss": 1.5496, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 20.25, | |
| "learning_rate": 1.4250000000000001e-06, | |
| "loss": 2.0923, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 20.125, | |
| "learning_rate": 1.3875e-06, | |
| "loss": 2.0688, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 14.875, | |
| "learning_rate": 1.35e-06, | |
| "loss": 2.1762, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 51.0, | |
| "learning_rate": 1.3125e-06, | |
| "loss": 1.7076, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 27.0, | |
| "learning_rate": 1.275e-06, | |
| "loss": 2.1918, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 16.125, | |
| "learning_rate": 1.2375e-06, | |
| "loss": 2.0706, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 21.0, | |
| "learning_rate": 1.2000000000000002e-06, | |
| "loss": 1.8599, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 19.375, | |
| "learning_rate": 1.1625e-06, | |
| "loss": 2.1312, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 22.5, | |
| "learning_rate": 1.125e-06, | |
| "loss": 1.9219, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 24.75, | |
| "learning_rate": 1.0875e-06, | |
| "loss": 2.0628, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 16.875, | |
| "learning_rate": 1.0500000000000001e-06, | |
| "loss": 1.999, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 20.875, | |
| "learning_rate": 1.0125e-06, | |
| "loss": 1.6709, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 20.125, | |
| "learning_rate": 9.75e-07, | |
| "loss": 1.9329, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 15.125, | |
| "learning_rate": 9.375e-07, | |
| "loss": 1.8998, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 20.0, | |
| "learning_rate": 9e-07, | |
| "loss": 1.4411, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 18.125, | |
| "learning_rate": 8.625000000000001e-07, | |
| "loss": 1.758, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 17.0, | |
| "learning_rate": 8.25e-07, | |
| "loss": 2.0136, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 19.25, | |
| "learning_rate": 7.875e-07, | |
| "loss": 1.8072, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 14.625, | |
| "learning_rate": 7.5e-07, | |
| "loss": 1.9614, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 17.5, | |
| "learning_rate": 7.125000000000001e-07, | |
| "loss": 1.9077, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 25.625, | |
| "learning_rate": 6.75e-07, | |
| "loss": 1.6237, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 23.875, | |
| "learning_rate": 6.375e-07, | |
| "loss": 1.875, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 29.5, | |
| "learning_rate": 6.000000000000001e-07, | |
| "loss": 2.0392, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 15.6875, | |
| "learning_rate": 5.625e-07, | |
| "loss": 1.9254, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 11.8125, | |
| "learning_rate": 5.250000000000001e-07, | |
| "loss": 1.4531, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 17.875, | |
| "learning_rate": 4.875e-07, | |
| "loss": 1.8028, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 15.875, | |
| "learning_rate": 4.5e-07, | |
| "loss": 1.7695, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 20.375, | |
| "learning_rate": 4.125e-07, | |
| "loss": 1.8477, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 16.25, | |
| "learning_rate": 3.75e-07, | |
| "loss": 2.0242, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 30.5, | |
| "learning_rate": 3.375e-07, | |
| "loss": 1.79, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 21.875, | |
| "learning_rate": 3.0000000000000004e-07, | |
| "loss": 2.0096, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 18.75, | |
| "learning_rate": 2.6250000000000003e-07, | |
| "loss": 1.8059, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 15.9375, | |
| "learning_rate": 2.25e-07, | |
| "loss": 1.4159, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 21.25, | |
| "learning_rate": 1.875e-07, | |
| "loss": 2.0914, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 19.25, | |
| "learning_rate": 1.5000000000000002e-07, | |
| "loss": 1.9735, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 18.0, | |
| "learning_rate": 1.125e-07, | |
| "loss": 1.7791, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 20.25, | |
| "learning_rate": 7.500000000000001e-08, | |
| "loss": 2.0176, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 17.5, | |
| "learning_rate": 3.7500000000000005e-08, | |
| "loss": 2.1275, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 15.875, | |
| "learning_rate": 0.0, | |
| "loss": 1.9038, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "eval_loss": 1.9000076055526733, | |
| "eval_runtime": 30.02, | |
| "eval_samples_per_second": 33.311, | |
| "eval_steps_per_second": 33.311, | |
| "step": 1000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1000, | |
| "total_flos": 8069610209280000.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |