{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 8500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11764705882352941, "grad_norm": 3.437100410461426, "learning_rate": 4.941764705882353e-05, "loss": 0.6786, "step": 100 }, { "epoch": 0.23529411764705882, "grad_norm": 2.240630626678467, "learning_rate": 4.8829411764705885e-05, "loss": 0.6502, "step": 200 }, { "epoch": 0.35294117647058826, "grad_norm": 4.658847808837891, "learning_rate": 4.824117647058824e-05, "loss": 0.554, "step": 300 }, { "epoch": 0.47058823529411764, "grad_norm": 2.393118143081665, "learning_rate": 4.765294117647059e-05, "loss": 0.5091, "step": 400 }, { "epoch": 0.5882352941176471, "grad_norm": 6.558012008666992, "learning_rate": 4.7064705882352944e-05, "loss": 0.4821, "step": 500 }, { "epoch": 0.7058823529411765, "grad_norm": 10.292436599731445, "learning_rate": 4.64764705882353e-05, "loss": 0.4832, "step": 600 }, { "epoch": 0.8235294117647058, "grad_norm": 2.2930338382720947, "learning_rate": 4.588823529411765e-05, "loss": 0.4906, "step": 700 }, { "epoch": 0.9411764705882353, "grad_norm": 4.4116692543029785, "learning_rate": 4.53e-05, "loss": 0.4212, "step": 800 }, { "epoch": 1.0588235294117647, "grad_norm": 11.04600715637207, "learning_rate": 4.4711764705882356e-05, "loss": 0.4047, "step": 900 }, { "epoch": 1.1764705882352942, "grad_norm": 3.98223614692688, "learning_rate": 4.412352941176471e-05, "loss": 0.3968, "step": 1000 }, { "epoch": 1.2941176470588236, "grad_norm": 5.051950931549072, "learning_rate": 4.353529411764706e-05, "loss": 0.3654, "step": 1100 }, { "epoch": 1.4117647058823528, "grad_norm": 3.390256404876709, "learning_rate": 4.2947058823529415e-05, "loss": 0.4115, "step": 1200 }, { "epoch": 1.5294117647058822, "grad_norm": 6.051326751708984, "learning_rate": 4.235882352941177e-05, "loss": 0.3766, "step": 1300 }, { "epoch": 1.6470588235294117, "grad_norm": 3.2749335765838623, "learning_rate": 4.1770588235294115e-05, "loss": 0.3872, "step": 1400 }, { "epoch": 1.7647058823529411, "grad_norm": 3.5865375995635986, "learning_rate": 4.1182352941176474e-05, "loss": 0.3628, "step": 1500 }, { "epoch": 1.8823529411764706, "grad_norm": 9.934630393981934, "learning_rate": 4.059411764705883e-05, "loss": 0.3044, "step": 1600 }, { "epoch": 2.0, "grad_norm": 3.085042953491211, "learning_rate": 4.0005882352941174e-05, "loss": 0.3483, "step": 1700 }, { "epoch": 2.1176470588235294, "grad_norm": 10.278831481933594, "learning_rate": 3.941764705882353e-05, "loss": 0.3355, "step": 1800 }, { "epoch": 2.235294117647059, "grad_norm": 4.414193630218506, "learning_rate": 3.8829411764705886e-05, "loss": 0.3181, "step": 1900 }, { "epoch": 2.3529411764705883, "grad_norm": 9.129111289978027, "learning_rate": 3.824117647058823e-05, "loss": 0.3096, "step": 2000 }, { "epoch": 2.4705882352941178, "grad_norm": 37.28619384765625, "learning_rate": 3.765294117647059e-05, "loss": 0.2821, "step": 2100 }, { "epoch": 2.588235294117647, "grad_norm": 0.5300999283790588, "learning_rate": 3.706470588235294e-05, "loss": 0.3126, "step": 2200 }, { "epoch": 2.7058823529411766, "grad_norm": 1.9004733562469482, "learning_rate": 3.64764705882353e-05, "loss": 0.3397, "step": 2300 }, { "epoch": 2.8235294117647056, "grad_norm": 0.971808910369873, "learning_rate": 3.588823529411765e-05, "loss": 0.3461, "step": 2400 }, { "epoch": 2.9411764705882355, "grad_norm": 10.828816413879395, "learning_rate": 3.53e-05, "loss": 0.2998, "step": 2500 }, { "epoch": 3.0588235294117645, "grad_norm": 5.21161413192749, "learning_rate": 3.471176470588236e-05, "loss": 0.287, "step": 2600 }, { "epoch": 3.176470588235294, "grad_norm": 17.639314651489258, "learning_rate": 3.412352941176471e-05, "loss": 0.2408, "step": 2700 }, { "epoch": 3.2941176470588234, "grad_norm": 5.808989524841309, "learning_rate": 3.3535294117647056e-05, "loss": 0.2514, "step": 2800 }, { "epoch": 3.411764705882353, "grad_norm": 5.669056415557861, "learning_rate": 3.2947058823529416e-05, "loss": 0.2401, "step": 2900 }, { "epoch": 3.5294117647058822, "grad_norm": 17.544368743896484, "learning_rate": 3.235882352941176e-05, "loss": 0.304, "step": 3000 }, { "epoch": 3.6470588235294117, "grad_norm": 15.652566909790039, "learning_rate": 3.1770588235294115e-05, "loss": 0.2169, "step": 3100 }, { "epoch": 3.764705882352941, "grad_norm": 38.110477447509766, "learning_rate": 3.1182352941176475e-05, "loss": 0.3245, "step": 3200 }, { "epoch": 3.8823529411764706, "grad_norm": 15.126811027526855, "learning_rate": 3.059411764705882e-05, "loss": 0.2366, "step": 3300 }, { "epoch": 4.0, "grad_norm": 1.916968822479248, "learning_rate": 3.0005882352941178e-05, "loss": 0.2394, "step": 3400 }, { "epoch": 4.117647058823529, "grad_norm": 6.685044765472412, "learning_rate": 2.9417647058823534e-05, "loss": 0.239, "step": 3500 }, { "epoch": 4.235294117647059, "grad_norm": 0.2686949074268341, "learning_rate": 2.8829411764705884e-05, "loss": 0.2042, "step": 3600 }, { "epoch": 4.352941176470588, "grad_norm": 4.416964530944824, "learning_rate": 2.8241176470588237e-05, "loss": 0.235, "step": 3700 }, { "epoch": 4.470588235294118, "grad_norm": 14.782841682434082, "learning_rate": 2.7652941176470586e-05, "loss": 0.2145, "step": 3800 }, { "epoch": 4.588235294117647, "grad_norm": 3.3013288974761963, "learning_rate": 2.7064705882352943e-05, "loss": 0.2301, "step": 3900 }, { "epoch": 4.705882352941177, "grad_norm": 12.687139511108398, "learning_rate": 2.6476470588235296e-05, "loss": 0.2459, "step": 4000 }, { "epoch": 4.823529411764706, "grad_norm": 12.49465560913086, "learning_rate": 2.5888235294117645e-05, "loss": 0.2523, "step": 4100 }, { "epoch": 4.9411764705882355, "grad_norm": 7.881172180175781, "learning_rate": 2.5300000000000002e-05, "loss": 0.2301, "step": 4200 }, { "epoch": 5.0588235294117645, "grad_norm": 7.88573694229126, "learning_rate": 2.4711764705882355e-05, "loss": 0.1962, "step": 4300 }, { "epoch": 5.176470588235294, "grad_norm": 8.924189567565918, "learning_rate": 2.4123529411764704e-05, "loss": 0.1782, "step": 4400 }, { "epoch": 5.294117647058823, "grad_norm": 0.7592102289199829, "learning_rate": 2.353529411764706e-05, "loss": 0.2081, "step": 4500 }, { "epoch": 5.411764705882353, "grad_norm": 4.540928363800049, "learning_rate": 2.2947058823529414e-05, "loss": 0.1991, "step": 4600 }, { "epoch": 5.529411764705882, "grad_norm": 2.462392807006836, "learning_rate": 2.2358823529411767e-05, "loss": 0.226, "step": 4700 }, { "epoch": 5.647058823529412, "grad_norm": 0.7198309898376465, "learning_rate": 2.1770588235294116e-05, "loss": 0.2093, "step": 4800 }, { "epoch": 5.764705882352941, "grad_norm": 0.04426976293325424, "learning_rate": 2.1182352941176473e-05, "loss": 0.1616, "step": 4900 }, { "epoch": 5.882352941176471, "grad_norm": 19.01175880432129, "learning_rate": 2.0594117647058826e-05, "loss": 0.2451, "step": 5000 }, { "epoch": 6.0, "grad_norm": 9.680488586425781, "learning_rate": 2.000588235294118e-05, "loss": 0.1954, "step": 5100 }, { "epoch": 6.117647058823529, "grad_norm": 13.620891571044922, "learning_rate": 1.9417647058823528e-05, "loss": 0.1739, "step": 5200 }, { "epoch": 6.235294117647059, "grad_norm": 23.215810775756836, "learning_rate": 1.8829411764705885e-05, "loss": 0.1989, "step": 5300 }, { "epoch": 6.352941176470588, "grad_norm": 34.69459915161133, "learning_rate": 1.8241176470588238e-05, "loss": 0.1845, "step": 5400 }, { "epoch": 6.470588235294118, "grad_norm": 0.17689703404903412, "learning_rate": 1.7652941176470587e-05, "loss": 0.1888, "step": 5500 }, { "epoch": 6.588235294117647, "grad_norm": 0.1681346297264099, "learning_rate": 1.706470588235294e-05, "loss": 0.1808, "step": 5600 }, { "epoch": 6.705882352941177, "grad_norm": 0.21678882837295532, "learning_rate": 1.6476470588235297e-05, "loss": 0.1527, "step": 5700 }, { "epoch": 6.823529411764706, "grad_norm": 6.419234752655029, "learning_rate": 1.588823529411765e-05, "loss": 0.2019, "step": 5800 }, { "epoch": 6.9411764705882355, "grad_norm": 0.09294360131025314, "learning_rate": 1.53e-05, "loss": 0.1881, "step": 5900 }, { "epoch": 7.0588235294117645, "grad_norm": 14.81273365020752, "learning_rate": 1.4711764705882352e-05, "loss": 0.1606, "step": 6000 }, { "epoch": 7.176470588235294, "grad_norm": 64.77528381347656, "learning_rate": 1.4123529411764707e-05, "loss": 0.103, "step": 6100 }, { "epoch": 7.294117647058823, "grad_norm": 0.01647937297821045, "learning_rate": 1.353529411764706e-05, "loss": 0.144, "step": 6200 }, { "epoch": 7.411764705882353, "grad_norm": 2.1152279376983643, "learning_rate": 1.2947058823529413e-05, "loss": 0.1687, "step": 6300 }, { "epoch": 7.529411764705882, "grad_norm": 5.980208396911621, "learning_rate": 1.2358823529411766e-05, "loss": 0.134, "step": 6400 }, { "epoch": 7.647058823529412, "grad_norm": 20.869848251342773, "learning_rate": 1.1770588235294117e-05, "loss": 0.1346, "step": 6500 }, { "epoch": 7.764705882352941, "grad_norm": 46.88475799560547, "learning_rate": 1.1182352941176472e-05, "loss": 0.1871, "step": 6600 }, { "epoch": 7.882352941176471, "grad_norm": 0.02130250073969364, "learning_rate": 1.0594117647058823e-05, "loss": 0.1287, "step": 6700 }, { "epoch": 8.0, "grad_norm": 0.03343911096453667, "learning_rate": 1.0005882352941178e-05, "loss": 0.1196, "step": 6800 }, { "epoch": 8.117647058823529, "grad_norm": 0.6227496862411499, "learning_rate": 9.417647058823529e-06, "loss": 0.1297, "step": 6900 }, { "epoch": 8.235294117647058, "grad_norm": 0.6179134249687195, "learning_rate": 8.829411764705884e-06, "loss": 0.1624, "step": 7000 }, { "epoch": 8.352941176470589, "grad_norm": 0.015830175951123238, "learning_rate": 8.241176470588235e-06, "loss": 0.1167, "step": 7100 }, { "epoch": 8.470588235294118, "grad_norm": 0.9566899538040161, "learning_rate": 7.65294117647059e-06, "loss": 0.1552, "step": 7200 }, { "epoch": 8.588235294117647, "grad_norm": 0.0542168989777565, "learning_rate": 7.064705882352941e-06, "loss": 0.1505, "step": 7300 }, { "epoch": 8.705882352941176, "grad_norm": 20.501676559448242, "learning_rate": 6.476470588235295e-06, "loss": 0.1435, "step": 7400 }, { "epoch": 8.823529411764707, "grad_norm": 24.469806671142578, "learning_rate": 5.888235294117647e-06, "loss": 0.1411, "step": 7500 }, { "epoch": 8.941176470588236, "grad_norm": 0.4506582021713257, "learning_rate": 5.3e-06, "loss": 0.1091, "step": 7600 }, { "epoch": 9.058823529411764, "grad_norm": 0.023853685706853867, "learning_rate": 4.711764705882353e-06, "loss": 0.1062, "step": 7700 }, { "epoch": 9.176470588235293, "grad_norm": 0.06553351134061813, "learning_rate": 4.123529411764706e-06, "loss": 0.0913, "step": 7800 }, { "epoch": 9.294117647058824, "grad_norm": 0.1247912049293518, "learning_rate": 3.535294117647059e-06, "loss": 0.0952, "step": 7900 }, { "epoch": 9.411764705882353, "grad_norm": 0.03167188912630081, "learning_rate": 2.9470588235294116e-06, "loss": 0.1452, "step": 8000 }, { "epoch": 9.529411764705882, "grad_norm": 0.014582249335944653, "learning_rate": 2.3588235294117646e-06, "loss": 0.0852, "step": 8100 }, { "epoch": 9.647058823529411, "grad_norm": 0.022731494158506393, "learning_rate": 1.7705882352941176e-06, "loss": 0.1289, "step": 8200 }, { "epoch": 9.764705882352942, "grad_norm": 0.9212973713874817, "learning_rate": 1.1823529411764708e-06, "loss": 0.1184, "step": 8300 }, { "epoch": 9.882352941176471, "grad_norm": 0.4289371967315674, "learning_rate": 5.941176470588236e-07, "loss": 0.0689, "step": 8400 }, { "epoch": 10.0, "grad_norm": 0.018015075474977493, "learning_rate": 5.882352941176471e-09, "loss": 0.119, "step": 8500 }, { "epoch": 10.0, "step": 8500, "total_flos": 5.269455293792256e+18, "train_loss": 0.24702390199549057, "train_runtime": 4430.2336, "train_samples_per_second": 15.349, "train_steps_per_second": 1.919 } ], "logging_steps": 100, "max_steps": 8500, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.269455293792256e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }