{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 8500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.11764705882352941,
      "grad_norm": 3.437100410461426,
      "learning_rate": 4.941764705882353e-05,
      "loss": 0.6786,
      "step": 100
    },
    {
      "epoch": 0.23529411764705882,
      "grad_norm": 2.240630626678467,
      "learning_rate": 4.8829411764705885e-05,
      "loss": 0.6502,
      "step": 200
    },
    {
      "epoch": 0.35294117647058826,
      "grad_norm": 4.658847808837891,
      "learning_rate": 4.824117647058824e-05,
      "loss": 0.554,
      "step": 300
    },
    {
      "epoch": 0.47058823529411764,
      "grad_norm": 2.393118143081665,
      "learning_rate": 4.765294117647059e-05,
      "loss": 0.5091,
      "step": 400
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 6.558012008666992,
      "learning_rate": 4.7064705882352944e-05,
      "loss": 0.4821,
      "step": 500
    },
    {
      "epoch": 0.7058823529411765,
      "grad_norm": 10.292436599731445,
      "learning_rate": 4.64764705882353e-05,
      "loss": 0.4832,
      "step": 600
    },
    {
      "epoch": 0.8235294117647058,
      "grad_norm": 2.2930338382720947,
      "learning_rate": 4.588823529411765e-05,
      "loss": 0.4906,
      "step": 700
    },
    {
      "epoch": 0.9411764705882353,
      "grad_norm": 4.4116692543029785,
      "learning_rate": 4.53e-05,
      "loss": 0.4212,
      "step": 800
    },
    {
      "epoch": 1.0588235294117647,
      "grad_norm": 11.04600715637207,
      "learning_rate": 4.4711764705882356e-05,
      "loss": 0.4047,
      "step": 900
    },
    {
      "epoch": 1.1764705882352942,
      "grad_norm": 3.98223614692688,
      "learning_rate": 4.412352941176471e-05,
      "loss": 0.3968,
      "step": 1000
    },
    {
      "epoch": 1.2941176470588236,
      "grad_norm": 5.051950931549072,
      "learning_rate": 4.353529411764706e-05,
      "loss": 0.3654,
      "step": 1100
    },
    {
      "epoch": 1.4117647058823528,
      "grad_norm": 3.390256404876709,
      "learning_rate": 4.2947058823529415e-05,
      "loss": 0.4115,
      "step": 1200
    },
    {
      "epoch": 1.5294117647058822,
      "grad_norm": 6.051326751708984,
      "learning_rate": 4.235882352941177e-05,
      "loss": 0.3766,
      "step": 1300
    },
    {
      "epoch": 1.6470588235294117,
      "grad_norm": 3.2749335765838623,
      "learning_rate": 4.1770588235294115e-05,
      "loss": 0.3872,
      "step": 1400
    },
    {
      "epoch": 1.7647058823529411,
      "grad_norm": 3.5865375995635986,
      "learning_rate": 4.1182352941176474e-05,
      "loss": 0.3628,
      "step": 1500
    },
    {
      "epoch": 1.8823529411764706,
      "grad_norm": 9.934630393981934,
      "learning_rate": 4.059411764705883e-05,
      "loss": 0.3044,
      "step": 1600
    },
    {
      "epoch": 2.0,
      "grad_norm": 3.085042953491211,
      "learning_rate": 4.0005882352941174e-05,
      "loss": 0.3483,
      "step": 1700
    },
    {
      "epoch": 2.1176470588235294,
      "grad_norm": 10.278831481933594,
      "learning_rate": 3.941764705882353e-05,
      "loss": 0.3355,
      "step": 1800
    },
    {
      "epoch": 2.235294117647059,
      "grad_norm": 4.414193630218506,
      "learning_rate": 3.8829411764705886e-05,
      "loss": 0.3181,
      "step": 1900
    },
    {
      "epoch": 2.3529411764705883,
      "grad_norm": 9.129111289978027,
      "learning_rate": 3.824117647058823e-05,
      "loss": 0.3096,
      "step": 2000
    },
    {
      "epoch": 2.4705882352941178,
      "grad_norm": 37.28619384765625,
      "learning_rate": 3.765294117647059e-05,
      "loss": 0.2821,
      "step": 2100
    },
    {
      "epoch": 2.588235294117647,
      "grad_norm": 0.5300999283790588,
      "learning_rate": 3.706470588235294e-05,
      "loss": 0.3126,
      "step": 2200
    },
    {
      "epoch": 2.7058823529411766,
      "grad_norm": 1.9004733562469482,
      "learning_rate": 3.64764705882353e-05,
      "loss": 0.3397,
      "step": 2300
    },
    {
      "epoch": 2.8235294117647056,
      "grad_norm": 0.971808910369873,
      "learning_rate": 3.588823529411765e-05,
      "loss": 0.3461,
      "step": 2400
    },
    {
      "epoch": 2.9411764705882355,
      "grad_norm": 10.828816413879395,
      "learning_rate": 3.53e-05,
      "loss": 0.2998,
      "step": 2500
    },
    {
      "epoch": 3.0588235294117645,
      "grad_norm": 5.21161413192749,
      "learning_rate": 3.471176470588236e-05,
      "loss": 0.287,
      "step": 2600
    },
    {
      "epoch": 3.176470588235294,
      "grad_norm": 17.639314651489258,
      "learning_rate": 3.412352941176471e-05,
      "loss": 0.2408,
      "step": 2700
    },
    {
      "epoch": 3.2941176470588234,
      "grad_norm": 5.808989524841309,
      "learning_rate": 3.3535294117647056e-05,
      "loss": 0.2514,
      "step": 2800
    },
    {
      "epoch": 3.411764705882353,
      "grad_norm": 5.669056415557861,
      "learning_rate": 3.2947058823529416e-05,
      "loss": 0.2401,
      "step": 2900
    },
    {
      "epoch": 3.5294117647058822,
      "grad_norm": 17.544368743896484,
      "learning_rate": 3.235882352941176e-05,
      "loss": 0.304,
      "step": 3000
    },
    {
      "epoch": 3.6470588235294117,
      "grad_norm": 15.652566909790039,
      "learning_rate": 3.1770588235294115e-05,
      "loss": 0.2169,
      "step": 3100
    },
    {
      "epoch": 3.764705882352941,
      "grad_norm": 38.110477447509766,
      "learning_rate": 3.1182352941176475e-05,
      "loss": 0.3245,
      "step": 3200
    },
    {
      "epoch": 3.8823529411764706,
      "grad_norm": 15.126811027526855,
      "learning_rate": 3.059411764705882e-05,
      "loss": 0.2366,
      "step": 3300
    },
    {
      "epoch": 4.0,
      "grad_norm": 1.916968822479248,
      "learning_rate": 3.0005882352941178e-05,
      "loss": 0.2394,
      "step": 3400
    },
    {
      "epoch": 4.117647058823529,
      "grad_norm": 6.685044765472412,
      "learning_rate": 2.9417647058823534e-05,
      "loss": 0.239,
      "step": 3500
    },
    {
      "epoch": 4.235294117647059,
      "grad_norm": 0.2686949074268341,
      "learning_rate": 2.8829411764705884e-05,
      "loss": 0.2042,
      "step": 3600
    },
    {
      "epoch": 4.352941176470588,
      "grad_norm": 4.416964530944824,
      "learning_rate": 2.8241176470588237e-05,
      "loss": 0.235,
      "step": 3700
    },
    {
      "epoch": 4.470588235294118,
      "grad_norm": 14.782841682434082,
      "learning_rate": 2.7652941176470586e-05,
      "loss": 0.2145,
      "step": 3800
    },
    {
      "epoch": 4.588235294117647,
      "grad_norm": 3.3013288974761963,
      "learning_rate": 2.7064705882352943e-05,
      "loss": 0.2301,
      "step": 3900
    },
    {
      "epoch": 4.705882352941177,
      "grad_norm": 12.687139511108398,
      "learning_rate": 2.6476470588235296e-05,
      "loss": 0.2459,
      "step": 4000
    },
    {
      "epoch": 4.823529411764706,
      "grad_norm": 12.49465560913086,
      "learning_rate": 2.5888235294117645e-05,
      "loss": 0.2523,
      "step": 4100
    },
    {
      "epoch": 4.9411764705882355,
      "grad_norm": 7.881172180175781,
      "learning_rate": 2.5300000000000002e-05,
      "loss": 0.2301,
      "step": 4200
    },
    {
      "epoch": 5.0588235294117645,
      "grad_norm": 7.88573694229126,
      "learning_rate": 2.4711764705882355e-05,
      "loss": 0.1962,
      "step": 4300
    },
    {
      "epoch": 5.176470588235294,
      "grad_norm": 8.924189567565918,
      "learning_rate": 2.4123529411764704e-05,
      "loss": 0.1782,
      "step": 4400
    },
    {
      "epoch": 5.294117647058823,
      "grad_norm": 0.7592102289199829,
      "learning_rate": 2.353529411764706e-05,
      "loss": 0.2081,
      "step": 4500
    },
    {
      "epoch": 5.411764705882353,
      "grad_norm": 4.540928363800049,
      "learning_rate": 2.2947058823529414e-05,
      "loss": 0.1991,
      "step": 4600
    },
    {
      "epoch": 5.529411764705882,
      "grad_norm": 2.462392807006836,
      "learning_rate": 2.2358823529411767e-05,
      "loss": 0.226,
      "step": 4700
    },
    {
      "epoch": 5.647058823529412,
      "grad_norm": 0.7198309898376465,
      "learning_rate": 2.1770588235294116e-05,
      "loss": 0.2093,
      "step": 4800
    },
    {
      "epoch": 5.764705882352941,
      "grad_norm": 0.04426976293325424,
      "learning_rate": 2.1182352941176473e-05,
      "loss": 0.1616,
      "step": 4900
    },
    {
      "epoch": 5.882352941176471,
      "grad_norm": 19.01175880432129,
      "learning_rate": 2.0594117647058826e-05,
      "loss": 0.2451,
      "step": 5000
    },
    {
      "epoch": 6.0,
      "grad_norm": 9.680488586425781,
      "learning_rate": 2.000588235294118e-05,
      "loss": 0.1954,
      "step": 5100
    },
    {
      "epoch": 6.117647058823529,
      "grad_norm": 13.620891571044922,
      "learning_rate": 1.9417647058823528e-05,
      "loss": 0.1739,
      "step": 5200
    },
    {
      "epoch": 6.235294117647059,
      "grad_norm": 23.215810775756836,
      "learning_rate": 1.8829411764705885e-05,
      "loss": 0.1989,
      "step": 5300
    },
    {
      "epoch": 6.352941176470588,
      "grad_norm": 34.69459915161133,
      "learning_rate": 1.8241176470588238e-05,
      "loss": 0.1845,
      "step": 5400
    },
    {
      "epoch": 6.470588235294118,
      "grad_norm": 0.17689703404903412,
      "learning_rate": 1.7652941176470587e-05,
      "loss": 0.1888,
      "step": 5500
    },
    {
      "epoch": 6.588235294117647,
      "grad_norm": 0.1681346297264099,
      "learning_rate": 1.706470588235294e-05,
      "loss": 0.1808,
      "step": 5600
    },
    {
      "epoch": 6.705882352941177,
      "grad_norm": 0.21678882837295532,
      "learning_rate": 1.6476470588235297e-05,
      "loss": 0.1527,
      "step": 5700
    },
    {
      "epoch": 6.823529411764706,
      "grad_norm": 6.419234752655029,
      "learning_rate": 1.588823529411765e-05,
      "loss": 0.2019,
      "step": 5800
    },
    {
      "epoch": 6.9411764705882355,
      "grad_norm": 0.09294360131025314,
      "learning_rate": 1.53e-05,
      "loss": 0.1881,
      "step": 5900
    },
    {
      "epoch": 7.0588235294117645,
      "grad_norm": 14.81273365020752,
      "learning_rate": 1.4711764705882352e-05,
      "loss": 0.1606,
      "step": 6000
    },
    {
      "epoch": 7.176470588235294,
      "grad_norm": 64.77528381347656,
      "learning_rate": 1.4123529411764707e-05,
      "loss": 0.103,
      "step": 6100
    },
    {
      "epoch": 7.294117647058823,
      "grad_norm": 0.01647937297821045,
      "learning_rate": 1.353529411764706e-05,
      "loss": 0.144,
      "step": 6200
    },
    {
      "epoch": 7.411764705882353,
      "grad_norm": 2.1152279376983643,
      "learning_rate": 1.2947058823529413e-05,
      "loss": 0.1687,
      "step": 6300
    },
    {
      "epoch": 7.529411764705882,
      "grad_norm": 5.980208396911621,
      "learning_rate": 1.2358823529411766e-05,
      "loss": 0.134,
      "step": 6400
    },
    {
      "epoch": 7.647058823529412,
      "grad_norm": 20.869848251342773,
      "learning_rate": 1.1770588235294117e-05,
      "loss": 0.1346,
      "step": 6500
    },
    {
      "epoch": 7.764705882352941,
      "grad_norm": 46.88475799560547,
      "learning_rate": 1.1182352941176472e-05,
      "loss": 0.1871,
      "step": 6600
    },
    {
      "epoch": 7.882352941176471,
      "grad_norm": 0.02130250073969364,
      "learning_rate": 1.0594117647058823e-05,
      "loss": 0.1287,
      "step": 6700
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.03343911096453667,
      "learning_rate": 1.0005882352941178e-05,
      "loss": 0.1196,
      "step": 6800
    },
    {
      "epoch": 8.117647058823529,
      "grad_norm": 0.6227496862411499,
      "learning_rate": 9.417647058823529e-06,
      "loss": 0.1297,
      "step": 6900
    },
    {
      "epoch": 8.235294117647058,
      "grad_norm": 0.6179134249687195,
      "learning_rate": 8.829411764705884e-06,
      "loss": 0.1624,
      "step": 7000
    },
    {
      "epoch": 8.352941176470589,
      "grad_norm": 0.015830175951123238,
      "learning_rate": 8.241176470588235e-06,
      "loss": 0.1167,
      "step": 7100
    },
    {
      "epoch": 8.470588235294118,
      "grad_norm": 0.9566899538040161,
      "learning_rate": 7.65294117647059e-06,
      "loss": 0.1552,
      "step": 7200
    },
    {
      "epoch": 8.588235294117647,
      "grad_norm": 0.0542168989777565,
      "learning_rate": 7.064705882352941e-06,
      "loss": 0.1505,
      "step": 7300
    },
    {
      "epoch": 8.705882352941176,
      "grad_norm": 20.501676559448242,
      "learning_rate": 6.476470588235295e-06,
      "loss": 0.1435,
      "step": 7400
    },
    {
      "epoch": 8.823529411764707,
      "grad_norm": 24.469806671142578,
      "learning_rate": 5.888235294117647e-06,
      "loss": 0.1411,
      "step": 7500
    },
    {
      "epoch": 8.941176470588236,
      "grad_norm": 0.4506582021713257,
      "learning_rate": 5.3e-06,
      "loss": 0.1091,
      "step": 7600
    },
    {
      "epoch": 9.058823529411764,
      "grad_norm": 0.023853685706853867,
      "learning_rate": 4.711764705882353e-06,
      "loss": 0.1062,
      "step": 7700
    },
    {
      "epoch": 9.176470588235293,
      "grad_norm": 0.06553351134061813,
      "learning_rate": 4.123529411764706e-06,
      "loss": 0.0913,
      "step": 7800
    },
    {
      "epoch": 9.294117647058824,
      "grad_norm": 0.1247912049293518,
      "learning_rate": 3.535294117647059e-06,
      "loss": 0.0952,
      "step": 7900
    },
    {
      "epoch": 9.411764705882353,
      "grad_norm": 0.03167188912630081,
      "learning_rate": 2.9470588235294116e-06,
      "loss": 0.1452,
      "step": 8000
    },
    {
      "epoch": 9.529411764705882,
      "grad_norm": 0.014582249335944653,
      "learning_rate": 2.3588235294117646e-06,
      "loss": 0.0852,
      "step": 8100
    },
    {
      "epoch": 9.647058823529411,
      "grad_norm": 0.022731494158506393,
      "learning_rate": 1.7705882352941176e-06,
      "loss": 0.1289,
      "step": 8200
    },
    {
      "epoch": 9.764705882352942,
      "grad_norm": 0.9212973713874817,
      "learning_rate": 1.1823529411764708e-06,
      "loss": 0.1184,
      "step": 8300
    },
    {
      "epoch": 9.882352941176471,
      "grad_norm": 0.4289371967315674,
      "learning_rate": 5.941176470588236e-07,
      "loss": 0.0689,
      "step": 8400
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.018015075474977493,
      "learning_rate": 5.882352941176471e-09,
      "loss": 0.119,
      "step": 8500
    },
    {
      "epoch": 10.0,
      "step": 8500,
      "total_flos": 5.269455293792256e+18,
      "train_loss": 0.24702390199549057,
      "train_runtime": 4430.2336,
      "train_samples_per_second": 15.349,
      "train_steps_per_second": 1.919
    }
  ],
  "logging_steps": 100,
  "max_steps": 8500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.269455293792256e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}