{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.16666666666666666, "eval_steps": 1000, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 21.75, "learning_rate": 3.7125e-06, "loss": 1.6333, "step": 10 }, { "epoch": 0.0, "grad_norm": 11.75, "learning_rate": 3.675e-06, "loss": 1.8863, "step": 20 }, { "epoch": 0.01, "grad_norm": 30.25, "learning_rate": 3.6375e-06, "loss": 2.1646, "step": 30 }, { "epoch": 0.01, "grad_norm": 20.0, "learning_rate": 3.6e-06, "loss": 2.0098, "step": 40 }, { "epoch": 0.01, "grad_norm": 21.125, "learning_rate": 3.5624999999999998e-06, "loss": 1.7331, "step": 50 }, { "epoch": 0.01, "grad_norm": 18.125, "learning_rate": 3.5249999999999997e-06, "loss": 1.8913, "step": 60 }, { "epoch": 0.01, "grad_norm": 17.5, "learning_rate": 3.4875e-06, "loss": 1.8686, "step": 70 }, { "epoch": 0.01, "grad_norm": 37.0, "learning_rate": 3.4500000000000004e-06, "loss": 1.6756, "step": 80 }, { "epoch": 0.01, "grad_norm": 21.125, "learning_rate": 3.4125000000000004e-06, "loss": 2.0029, "step": 90 }, { "epoch": 0.02, "grad_norm": 14.0, "learning_rate": 3.3750000000000003e-06, "loss": 1.4412, "step": 100 }, { "epoch": 0.02, "grad_norm": 12.75, "learning_rate": 3.3375000000000002e-06, "loss": 2.0556, "step": 110 }, { "epoch": 0.02, "grad_norm": 31.625, "learning_rate": 3.3e-06, "loss": 1.9298, "step": 120 }, { "epoch": 0.02, "grad_norm": 15.0625, "learning_rate": 3.2625e-06, "loss": 2.0686, "step": 130 }, { "epoch": 0.02, "grad_norm": 15.75, "learning_rate": 3.225e-06, "loss": 1.6107, "step": 140 }, { "epoch": 0.03, "grad_norm": 28.5, "learning_rate": 3.1875e-06, "loss": 2.4047, "step": 150 }, { "epoch": 0.03, "grad_norm": 17.375, "learning_rate": 3.15e-06, "loss": 1.5916, "step": 160 }, { "epoch": 0.03, "grad_norm": 23.0, "learning_rate": 3.1125e-06, "loss": 1.9109, "step": 170 }, { "epoch": 0.03, "grad_norm": 27.5, "learning_rate": 3.0749999999999998e-06, "loss": 2.1359, "step": 180 }, { "epoch": 0.03, "grad_norm": 18.0, "learning_rate": 3.0375e-06, "loss": 2.1144, "step": 190 }, { "epoch": 0.03, "grad_norm": 14.4375, "learning_rate": 3e-06, "loss": 1.6672, "step": 200 }, { "epoch": 0.04, "grad_norm": 22.125, "learning_rate": 2.9625e-06, "loss": 1.6704, "step": 210 }, { "epoch": 0.04, "grad_norm": 28.5, "learning_rate": 2.9250000000000004e-06, "loss": 1.9675, "step": 220 }, { "epoch": 0.04, "grad_norm": 13.0, "learning_rate": 2.8875000000000003e-06, "loss": 1.9873, "step": 230 }, { "epoch": 0.04, "grad_norm": 13.8125, "learning_rate": 2.8500000000000002e-06, "loss": 1.7557, "step": 240 }, { "epoch": 0.04, "grad_norm": 19.125, "learning_rate": 2.8125e-06, "loss": 1.7556, "step": 250 }, { "epoch": 0.04, "grad_norm": 16.5, "learning_rate": 2.775e-06, "loss": 2.0693, "step": 260 }, { "epoch": 0.04, "grad_norm": 18.25, "learning_rate": 2.7375e-06, "loss": 1.6011, "step": 270 }, { "epoch": 0.05, "grad_norm": 21.875, "learning_rate": 2.7e-06, "loss": 1.6058, "step": 280 }, { "epoch": 0.05, "grad_norm": 18.0, "learning_rate": 2.6625e-06, "loss": 1.8085, "step": 290 }, { "epoch": 0.05, "grad_norm": 14.875, "learning_rate": 2.625e-06, "loss": 1.777, "step": 300 }, { "epoch": 0.05, "grad_norm": 20.75, "learning_rate": 2.5875e-06, "loss": 1.8708, "step": 310 }, { "epoch": 0.05, "grad_norm": 14.625, "learning_rate": 2.55e-06, "loss": 1.5629, "step": 320 }, { "epoch": 0.06, "grad_norm": 16.125, "learning_rate": 2.5125e-06, "loss": 1.7416, "step": 330 }, { "epoch": 0.06, "grad_norm": 19.5, "learning_rate": 2.475e-06, "loss": 1.9123, "step": 340 }, { "epoch": 0.06, "grad_norm": 17.375, "learning_rate": 2.4375e-06, "loss": 1.6956, "step": 350 }, { "epoch": 0.06, "grad_norm": 30.125, "learning_rate": 2.4000000000000003e-06, "loss": 1.863, "step": 360 }, { "epoch": 0.06, "grad_norm": 24.25, "learning_rate": 2.3625000000000003e-06, "loss": 1.7432, "step": 370 }, { "epoch": 0.06, "grad_norm": 15.5625, "learning_rate": 2.325e-06, "loss": 1.6194, "step": 380 }, { "epoch": 0.07, "grad_norm": 20.75, "learning_rate": 2.2875e-06, "loss": 2.1294, "step": 390 }, { "epoch": 0.07, "grad_norm": 24.125, "learning_rate": 2.25e-06, "loss": 1.8663, "step": 400 }, { "epoch": 0.07, "grad_norm": 22.625, "learning_rate": 2.2125e-06, "loss": 1.8445, "step": 410 }, { "epoch": 0.07, "grad_norm": 23.375, "learning_rate": 2.175e-06, "loss": 2.1122, "step": 420 }, { "epoch": 0.07, "grad_norm": 20.875, "learning_rate": 2.1375e-06, "loss": 1.8364, "step": 430 }, { "epoch": 0.07, "grad_norm": 20.0, "learning_rate": 2.1000000000000002e-06, "loss": 1.6375, "step": 440 }, { "epoch": 0.07, "grad_norm": 18.25, "learning_rate": 2.0625e-06, "loss": 1.5882, "step": 450 }, { "epoch": 0.08, "grad_norm": 34.25, "learning_rate": 2.025e-06, "loss": 2.0043, "step": 460 }, { "epoch": 0.08, "grad_norm": 28.75, "learning_rate": 1.9875e-06, "loss": 1.9231, "step": 470 }, { "epoch": 0.08, "grad_norm": 17.5, "learning_rate": 1.95e-06, "loss": 2.0631, "step": 480 }, { "epoch": 0.08, "grad_norm": 17.125, "learning_rate": 1.9125e-06, "loss": 2.0359, "step": 490 }, { "epoch": 0.08, "grad_norm": 21.0, "learning_rate": 1.875e-06, "loss": 1.8175, "step": 500 }, { "epoch": 0.09, "grad_norm": 17.5, "learning_rate": 1.8375e-06, "loss": 2.0844, "step": 510 }, { "epoch": 0.09, "grad_norm": 20.25, "learning_rate": 1.8e-06, "loss": 1.6329, "step": 520 }, { "epoch": 0.09, "grad_norm": 12.375, "learning_rate": 1.7624999999999999e-06, "loss": 1.9352, "step": 530 }, { "epoch": 0.09, "grad_norm": 21.375, "learning_rate": 1.7250000000000002e-06, "loss": 1.8842, "step": 540 }, { "epoch": 0.09, "grad_norm": 16.375, "learning_rate": 1.6875000000000001e-06, "loss": 2.1543, "step": 550 }, { "epoch": 0.09, "grad_norm": 15.625, "learning_rate": 1.65e-06, "loss": 2.0122, "step": 560 }, { "epoch": 0.1, "grad_norm": 16.0, "learning_rate": 1.6125e-06, "loss": 2.3606, "step": 570 }, { "epoch": 0.1, "grad_norm": 22.375, "learning_rate": 1.575e-06, "loss": 1.8354, "step": 580 }, { "epoch": 0.1, "grad_norm": 18.75, "learning_rate": 1.5374999999999999e-06, "loss": 1.962, "step": 590 }, { "epoch": 0.1, "grad_norm": 22.625, "learning_rate": 1.5e-06, "loss": 1.6487, "step": 600 }, { "epoch": 0.1, "grad_norm": 17.875, "learning_rate": 1.4625000000000002e-06, "loss": 1.5496, "step": 610 }, { "epoch": 0.1, "grad_norm": 20.25, "learning_rate": 1.4250000000000001e-06, "loss": 2.0923, "step": 620 }, { "epoch": 0.1, "grad_norm": 20.125, "learning_rate": 1.3875e-06, "loss": 2.0688, "step": 630 }, { "epoch": 0.11, "grad_norm": 14.875, "learning_rate": 1.35e-06, "loss": 2.1762, "step": 640 }, { "epoch": 0.11, "grad_norm": 51.0, "learning_rate": 1.3125e-06, "loss": 1.7076, "step": 650 }, { "epoch": 0.11, "grad_norm": 27.0, "learning_rate": 1.275e-06, "loss": 2.1918, "step": 660 }, { "epoch": 0.11, "grad_norm": 16.125, "learning_rate": 1.2375e-06, "loss": 2.0706, "step": 670 }, { "epoch": 0.11, "grad_norm": 21.0, "learning_rate": 1.2000000000000002e-06, "loss": 1.8599, "step": 680 }, { "epoch": 0.12, "grad_norm": 19.375, "learning_rate": 1.1625e-06, "loss": 2.1312, "step": 690 }, { "epoch": 0.12, "grad_norm": 22.5, "learning_rate": 1.125e-06, "loss": 1.9219, "step": 700 }, { "epoch": 0.12, "grad_norm": 24.75, "learning_rate": 1.0875e-06, "loss": 2.0628, "step": 710 }, { "epoch": 0.12, "grad_norm": 16.875, "learning_rate": 1.0500000000000001e-06, "loss": 1.999, "step": 720 }, { "epoch": 0.12, "grad_norm": 20.875, "learning_rate": 1.0125e-06, "loss": 1.6709, "step": 730 }, { "epoch": 0.12, "grad_norm": 20.125, "learning_rate": 9.75e-07, "loss": 1.9329, "step": 740 }, { "epoch": 0.12, "grad_norm": 15.125, "learning_rate": 9.375e-07, "loss": 1.8998, "step": 750 }, { "epoch": 0.13, "grad_norm": 20.0, "learning_rate": 9e-07, "loss": 1.4411, "step": 760 }, { "epoch": 0.13, "grad_norm": 18.125, "learning_rate": 8.625000000000001e-07, "loss": 1.758, "step": 770 }, { "epoch": 0.13, "grad_norm": 17.0, "learning_rate": 8.25e-07, "loss": 2.0136, "step": 780 }, { "epoch": 0.13, "grad_norm": 19.25, "learning_rate": 7.875e-07, "loss": 1.8072, "step": 790 }, { "epoch": 0.13, "grad_norm": 14.625, "learning_rate": 7.5e-07, "loss": 1.9614, "step": 800 }, { "epoch": 0.14, "grad_norm": 17.5, "learning_rate": 7.125000000000001e-07, "loss": 1.9077, "step": 810 }, { "epoch": 0.14, "grad_norm": 25.625, "learning_rate": 6.75e-07, "loss": 1.6237, "step": 820 }, { "epoch": 0.14, "grad_norm": 23.875, "learning_rate": 6.375e-07, "loss": 1.875, "step": 830 }, { "epoch": 0.14, "grad_norm": 29.5, "learning_rate": 6.000000000000001e-07, "loss": 2.0392, "step": 840 }, { "epoch": 0.14, "grad_norm": 15.6875, "learning_rate": 5.625e-07, "loss": 1.9254, "step": 850 }, { "epoch": 0.14, "grad_norm": 11.8125, "learning_rate": 5.250000000000001e-07, "loss": 1.4531, "step": 860 }, { "epoch": 0.14, "grad_norm": 17.875, "learning_rate": 4.875e-07, "loss": 1.8028, "step": 870 }, { "epoch": 0.15, "grad_norm": 15.875, "learning_rate": 4.5e-07, "loss": 1.7695, "step": 880 }, { "epoch": 0.15, "grad_norm": 20.375, "learning_rate": 4.125e-07, "loss": 1.8477, "step": 890 }, { "epoch": 0.15, "grad_norm": 16.25, "learning_rate": 3.75e-07, "loss": 2.0242, "step": 900 }, { "epoch": 0.15, "grad_norm": 30.5, "learning_rate": 3.375e-07, "loss": 1.79, "step": 910 }, { "epoch": 0.15, "grad_norm": 21.875, "learning_rate": 3.0000000000000004e-07, "loss": 2.0096, "step": 920 }, { "epoch": 0.15, "grad_norm": 18.75, "learning_rate": 2.6250000000000003e-07, "loss": 1.8059, "step": 930 }, { "epoch": 0.16, "grad_norm": 15.9375, "learning_rate": 2.25e-07, "loss": 1.4159, "step": 940 }, { "epoch": 0.16, "grad_norm": 21.25, "learning_rate": 1.875e-07, "loss": 2.0914, "step": 950 }, { "epoch": 0.16, "grad_norm": 19.25, "learning_rate": 1.5000000000000002e-07, "loss": 1.9735, "step": 960 }, { "epoch": 0.16, "grad_norm": 18.0, "learning_rate": 1.125e-07, "loss": 1.7791, "step": 970 }, { "epoch": 0.16, "grad_norm": 20.25, "learning_rate": 7.500000000000001e-08, "loss": 2.0176, "step": 980 }, { "epoch": 0.17, "grad_norm": 17.5, "learning_rate": 3.7500000000000005e-08, "loss": 2.1275, "step": 990 }, { "epoch": 0.17, "grad_norm": 15.875, "learning_rate": 0.0, "loss": 1.9038, "step": 1000 }, { "epoch": 0.17, "eval_loss": 1.9000076055526733, "eval_runtime": 30.02, "eval_samples_per_second": 33.311, "eval_steps_per_second": 33.311, "step": 1000 } ], "logging_steps": 10, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 8069610209280000.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }