{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 8.0, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 10.272224426269531, "learning_rate": 4.5e-06, "loss": 12.8379, "step": 10 }, { "epoch": 0.16, "grad_norm": 11.09077262878418, "learning_rate": 9.5e-06, "loss": 12.3314, "step": 20 }, { "epoch": 0.24, "grad_norm": 7.8299150466918945, "learning_rate": 9.908163265306123e-06, "loss": 10.8693, "step": 30 }, { "epoch": 0.32, "grad_norm": 5.720259189605713, "learning_rate": 9.806122448979593e-06, "loss": 9.5772, "step": 40 }, { "epoch": 0.4, "grad_norm": 4.4982500076293945, "learning_rate": 9.704081632653061e-06, "loss": 8.8462, "step": 50 }, { "epoch": 0.48, "grad_norm": 5.525058269500732, "learning_rate": 9.60204081632653e-06, "loss": 8.3404, "step": 60 }, { "epoch": 0.56, "grad_norm": 4.1277570724487305, "learning_rate": 9.5e-06, "loss": 8.1087, "step": 70 }, { "epoch": 0.64, "grad_norm": 4.99296760559082, "learning_rate": 9.39795918367347e-06, "loss": 7.9132, "step": 80 }, { "epoch": 0.72, "grad_norm": 4.5625319480896, "learning_rate": 9.29591836734694e-06, "loss": 7.5496, "step": 90 }, { "epoch": 0.8, "grad_norm": 4.021756649017334, "learning_rate": 9.19387755102041e-06, "loss": 7.6371, "step": 100 }, { "epoch": 0.88, "grad_norm": 4.0445098876953125, "learning_rate": 9.091836734693878e-06, "loss": 7.0593, "step": 110 }, { "epoch": 0.96, "grad_norm": 4.158232688903809, "learning_rate": 8.989795918367348e-06, "loss": 6.8773, "step": 120 }, { "epoch": 1.04, "grad_norm": 4.3605546951293945, "learning_rate": 8.887755102040817e-06, "loss": 6.365, "step": 130 }, { "epoch": 1.12, "grad_norm": 3.7614684104919434, "learning_rate": 8.785714285714286e-06, "loss": 6.3579, "step": 140 }, { "epoch": 1.2, "grad_norm": 3.5187251567840576, "learning_rate": 8.683673469387755e-06, "loss": 6.4043, "step": 150 }, { "epoch": 1.28, "grad_norm": 3.6821653842926025, "learning_rate": 8.581632653061225e-06, "loss": 6.219, "step": 160 }, { "epoch": 1.3599999999999999, "grad_norm": 4.2614970207214355, "learning_rate": 8.479591836734695e-06, "loss": 6.2059, "step": 170 }, { "epoch": 1.44, "grad_norm": 3.459669351577759, "learning_rate": 8.377551020408165e-06, "loss": 5.9866, "step": 180 }, { "epoch": 1.52, "grad_norm": 3.8360798358917236, "learning_rate": 8.275510204081634e-06, "loss": 5.8988, "step": 190 }, { "epoch": 1.6, "grad_norm": 3.6687963008880615, "learning_rate": 8.173469387755103e-06, "loss": 5.603, "step": 200 }, { "epoch": 1.6800000000000002, "grad_norm": 4.398599147796631, "learning_rate": 8.071428571428572e-06, "loss": 5.5413, "step": 210 }, { "epoch": 1.76, "grad_norm": 4.392071723937988, "learning_rate": 7.969387755102042e-06, "loss": 5.6413, "step": 220 }, { "epoch": 1.8399999999999999, "grad_norm": 3.310082197189331, "learning_rate": 7.86734693877551e-06, "loss": 5.4796, "step": 230 }, { "epoch": 1.92, "grad_norm": 3.3684332370758057, "learning_rate": 7.76530612244898e-06, "loss": 5.6276, "step": 240 }, { "epoch": 2.0, "grad_norm": 4.198359966278076, "learning_rate": 7.66326530612245e-06, "loss": 5.3979, "step": 250 }, { "epoch": 2.08, "grad_norm": 3.726461410522461, "learning_rate": 7.561224489795919e-06, "loss": 5.2551, "step": 260 }, { "epoch": 2.16, "grad_norm": 4.147603511810303, "learning_rate": 7.459183673469388e-06, "loss": 5.1481, "step": 270 }, { "epoch": 2.24, "grad_norm": 4.2914204597473145, "learning_rate": 7.357142857142858e-06, "loss": 5.079, "step": 280 }, { "epoch": 2.32, "grad_norm": 3.718191146850586, "learning_rate": 7.255102040816327e-06, "loss": 4.9855, "step": 290 }, { "epoch": 2.4, "grad_norm": 3.8326470851898193, "learning_rate": 7.153061224489797e-06, "loss": 5.1467, "step": 300 }, { "epoch": 2.48, "grad_norm": 3.780733585357666, "learning_rate": 7.051020408163266e-06, "loss": 4.9813, "step": 310 }, { "epoch": 2.56, "grad_norm": 4.059634685516357, "learning_rate": 6.948979591836736e-06, "loss": 5.0103, "step": 320 }, { "epoch": 2.64, "grad_norm": 4.013712406158447, "learning_rate": 6.8469387755102046e-06, "loss": 4.9963, "step": 330 }, { "epoch": 2.7199999999999998, "grad_norm": 4.24052619934082, "learning_rate": 6.7448979591836735e-06, "loss": 4.8682, "step": 340 }, { "epoch": 2.8, "grad_norm": 4.262918949127197, "learning_rate": 6.642857142857143e-06, "loss": 4.8391, "step": 350 }, { "epoch": 2.88, "grad_norm": 4.253732681274414, "learning_rate": 6.540816326530612e-06, "loss": 4.8529, "step": 360 }, { "epoch": 2.96, "grad_norm": 4.445166110992432, "learning_rate": 6.438775510204082e-06, "loss": 4.6523, "step": 370 }, { "epoch": 3.04, "grad_norm": 4.086543560028076, "learning_rate": 6.336734693877552e-06, "loss": 4.6866, "step": 380 }, { "epoch": 3.12, "grad_norm": 4.3391265869140625, "learning_rate": 6.2346938775510215e-06, "loss": 4.8265, "step": 390 }, { "epoch": 3.2, "grad_norm": 4.4690327644348145, "learning_rate": 6.1326530612244905e-06, "loss": 4.5729, "step": 400 }, { "epoch": 3.2800000000000002, "grad_norm": 4.506042957305908, "learning_rate": 6.03061224489796e-06, "loss": 4.645, "step": 410 }, { "epoch": 3.36, "grad_norm": 3.8643500804901123, "learning_rate": 5.928571428571429e-06, "loss": 4.4849, "step": 420 }, { "epoch": 3.44, "grad_norm": 4.425049781799316, "learning_rate": 5.826530612244898e-06, "loss": 4.4227, "step": 430 }, { "epoch": 3.52, "grad_norm": 4.880602836608887, "learning_rate": 5.724489795918368e-06, "loss": 4.5872, "step": 440 }, { "epoch": 3.6, "grad_norm": 4.425590991973877, "learning_rate": 5.622448979591837e-06, "loss": 4.4351, "step": 450 }, { "epoch": 3.68, "grad_norm": 4.5981125831604, "learning_rate": 5.520408163265306e-06, "loss": 4.2647, "step": 460 }, { "epoch": 3.76, "grad_norm": 5.0482611656188965, "learning_rate": 5.4183673469387755e-06, "loss": 4.4483, "step": 470 }, { "epoch": 3.84, "grad_norm": 4.352452754974365, "learning_rate": 5.316326530612246e-06, "loss": 4.2195, "step": 480 }, { "epoch": 3.92, "grad_norm": 4.14626932144165, "learning_rate": 5.214285714285715e-06, "loss": 4.2741, "step": 490 }, { "epoch": 4.0, "grad_norm": 4.289257526397705, "learning_rate": 5.112244897959184e-06, "loss": 4.3662, "step": 500 }, { "epoch": 4.08, "grad_norm": 4.666240215301514, "learning_rate": 5.010204081632654e-06, "loss": 4.1023, "step": 510 }, { "epoch": 4.16, "grad_norm": 4.825807094573975, "learning_rate": 4.908163265306123e-06, "loss": 4.0558, "step": 520 }, { "epoch": 4.24, "grad_norm": 5.112582683563232, "learning_rate": 4.8061224489795925e-06, "loss": 4.1276, "step": 530 }, { "epoch": 4.32, "grad_norm": 5.386510848999023, "learning_rate": 4.704081632653061e-06, "loss": 4.133, "step": 540 }, { "epoch": 4.4, "grad_norm": 5.501555442810059, "learning_rate": 4.602040816326531e-06, "loss": 4.0733, "step": 550 }, { "epoch": 4.48, "grad_norm": 4.4781060218811035, "learning_rate": 4.5e-06, "loss": 4.1908, "step": 560 }, { "epoch": 4.5600000000000005, "grad_norm": 4.725682258605957, "learning_rate": 4.39795918367347e-06, "loss": 4.2183, "step": 570 }, { "epoch": 4.64, "grad_norm": 4.902810573577881, "learning_rate": 4.295918367346939e-06, "loss": 3.9457, "step": 580 }, { "epoch": 4.72, "grad_norm": 4.52883243560791, "learning_rate": 4.193877551020409e-06, "loss": 4.1341, "step": 590 }, { "epoch": 4.8, "grad_norm": 4.9895124435424805, "learning_rate": 4.091836734693878e-06, "loss": 3.9592, "step": 600 }, { "epoch": 4.88, "grad_norm": 4.645427227020264, "learning_rate": 3.989795918367347e-06, "loss": 4.042, "step": 610 }, { "epoch": 4.96, "grad_norm": 4.708733558654785, "learning_rate": 3.887755102040816e-06, "loss": 4.2402, "step": 620 }, { "epoch": 5.04, "grad_norm": 4.388338088989258, "learning_rate": 3.785714285714286e-06, "loss": 3.8437, "step": 630 }, { "epoch": 5.12, "grad_norm": 4.25796365737915, "learning_rate": 3.6836734693877554e-06, "loss": 3.8926, "step": 640 }, { "epoch": 5.2, "grad_norm": 4.4803996086120605, "learning_rate": 3.581632653061225e-06, "loss": 3.7283, "step": 650 }, { "epoch": 5.28, "grad_norm": 4.613894939422607, "learning_rate": 3.479591836734694e-06, "loss": 4.0191, "step": 660 }, { "epoch": 5.36, "grad_norm": 4.880568027496338, "learning_rate": 3.3775510204081634e-06, "loss": 3.7278, "step": 670 }, { "epoch": 5.44, "grad_norm": 4.900217533111572, "learning_rate": 3.2755102040816328e-06, "loss": 3.7275, "step": 680 }, { "epoch": 5.52, "grad_norm": 5.112405300140381, "learning_rate": 3.173469387755102e-06, "loss": 3.6629, "step": 690 }, { "epoch": 5.6, "grad_norm": 5.060956954956055, "learning_rate": 3.071428571428572e-06, "loss": 4.1205, "step": 700 }, { "epoch": 5.68, "grad_norm": 4.825809001922607, "learning_rate": 2.9693877551020413e-06, "loss": 3.6707, "step": 710 }, { "epoch": 5.76, "grad_norm": 4.704409122467041, "learning_rate": 2.86734693877551e-06, "loss": 3.9554, "step": 720 }, { "epoch": 5.84, "grad_norm": 4.779526710510254, "learning_rate": 2.7653061224489795e-06, "loss": 3.707, "step": 730 }, { "epoch": 5.92, "grad_norm": 4.754640102386475, "learning_rate": 2.663265306122449e-06, "loss": 3.9676, "step": 740 }, { "epoch": 6.0, "grad_norm": 4.465866565704346, "learning_rate": 2.5612244897959187e-06, "loss": 3.5938, "step": 750 }, { "epoch": 6.08, "grad_norm": 5.0297722816467285, "learning_rate": 2.459183673469388e-06, "loss": 3.4993, "step": 760 }, { "epoch": 6.16, "grad_norm": 5.041810989379883, "learning_rate": 2.3571428571428574e-06, "loss": 3.2929, "step": 770 }, { "epoch": 6.24, "grad_norm": 5.438796043395996, "learning_rate": 2.2551020408163267e-06, "loss": 3.724, "step": 780 }, { "epoch": 6.32, "grad_norm": 5.112025260925293, "learning_rate": 2.153061224489796e-06, "loss": 3.5256, "step": 790 }, { "epoch": 6.4, "grad_norm": 4.986135959625244, "learning_rate": 2.0510204081632654e-06, "loss": 3.607, "step": 800 }, { "epoch": 6.48, "grad_norm": 4.837615489959717, "learning_rate": 1.948979591836735e-06, "loss": 3.7234, "step": 810 }, { "epoch": 6.5600000000000005, "grad_norm": 4.9461750984191895, "learning_rate": 1.8469387755102042e-06, "loss": 3.551, "step": 820 }, { "epoch": 6.64, "grad_norm": 5.679599285125732, "learning_rate": 1.7448979591836737e-06, "loss": 3.694, "step": 830 }, { "epoch": 6.72, "grad_norm": 4.861215114593506, "learning_rate": 1.642857142857143e-06, "loss": 3.5914, "step": 840 }, { "epoch": 6.8, "grad_norm": 4.849487781524658, "learning_rate": 1.5408163265306122e-06, "loss": 3.5168, "step": 850 }, { "epoch": 6.88, "grad_norm": 5.109979629516602, "learning_rate": 1.4387755102040818e-06, "loss": 3.8957, "step": 860 }, { "epoch": 6.96, "grad_norm": 4.744449138641357, "learning_rate": 1.3367346938775511e-06, "loss": 3.6189, "step": 870 }, { "epoch": 7.04, "grad_norm": 5.0839080810546875, "learning_rate": 1.2346938775510205e-06, "loss": 3.5112, "step": 880 }, { "epoch": 7.12, "grad_norm": 5.095026016235352, "learning_rate": 1.1326530612244898e-06, "loss": 3.4154, "step": 890 }, { "epoch": 7.2, "grad_norm": 5.221547603607178, "learning_rate": 1.0306122448979592e-06, "loss": 3.5512, "step": 900 }, { "epoch": 7.28, "grad_norm": 4.755325794219971, "learning_rate": 9.285714285714287e-07, "loss": 3.5292, "step": 910 }, { "epoch": 7.36, "grad_norm": 5.137501239776611, "learning_rate": 8.265306122448981e-07, "loss": 3.4896, "step": 920 }, { "epoch": 7.44, "grad_norm": 4.977344989776611, "learning_rate": 7.244897959183674e-07, "loss": 3.4731, "step": 930 }, { "epoch": 7.52, "grad_norm": 4.887359142303467, "learning_rate": 6.224489795918367e-07, "loss": 3.7047, "step": 940 }, { "epoch": 7.6, "grad_norm": 5.050560474395752, "learning_rate": 5.204081632653062e-07, "loss": 3.316, "step": 950 }, { "epoch": 7.68, "grad_norm": 4.847559452056885, "learning_rate": 4.183673469387756e-07, "loss": 3.373, "step": 960 }, { "epoch": 7.76, "grad_norm": 4.991832256317139, "learning_rate": 3.163265306122449e-07, "loss": 3.4879, "step": 970 }, { "epoch": 7.84, "grad_norm": 5.611753463745117, "learning_rate": 2.142857142857143e-07, "loss": 3.4826, "step": 980 }, { "epoch": 7.92, "grad_norm": 4.520301818847656, "learning_rate": 1.1224489795918368e-07, "loss": 3.5984, "step": 990 }, { "epoch": 8.0, "grad_norm": 5.4338178634643555, "learning_rate": 1.0204081632653063e-08, "loss": 3.3724, "step": 1000 } ], "logging_steps": 10, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4845344768e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }