{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.929233772571987, "eval_steps": 50, "global_step": 1280, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3904343582235237, "grad_norm": 1104.4410400390625, "learning_rate": 9.609375e-05, "loss": 24.0156, "step": 50 }, { "epoch": 0.3904343582235237, "eval_runtime": 19.6625, "eval_samples_per_second": 21.971, "eval_steps_per_second": 5.493, "step": 50 }, { "epoch": 0.7808687164470474, "grad_norm": 808.6486206054688, "learning_rate": 9.21875e-05, "loss": 8.6753, "step": 100 }, { "epoch": 0.7808687164470474, "eval_runtime": 19.6174, "eval_samples_per_second": 22.021, "eval_steps_per_second": 5.505, "step": 100 }, { "epoch": 1.16398243045388, "grad_norm": 668.7958984375, "learning_rate": 8.828125000000001e-05, "loss": 6.9236, "step": 150 }, { "epoch": 1.16398243045388, "eval_runtime": 19.6375, "eval_samples_per_second": 21.999, "eval_steps_per_second": 5.5, "step": 150 }, { "epoch": 1.5544167886774036, "grad_norm": 1052.2135009765625, "learning_rate": 8.4375e-05, "loss": 5.8306, "step": 200 }, { "epoch": 1.5544167886774036, "eval_runtime": 19.6418, "eval_samples_per_second": 21.994, "eval_steps_per_second": 5.498, "step": 200 }, { "epoch": 1.9448511469009273, "grad_norm": 445.023681640625, "learning_rate": 8.046875e-05, "loss": 4.2927, "step": 250 }, { "epoch": 1.9448511469009273, "eval_runtime": 19.66, "eval_samples_per_second": 21.974, "eval_steps_per_second": 5.493, "step": 250 }, { "epoch": 2.32796486090776, "grad_norm": 1359.267333984375, "learning_rate": 7.65625e-05, "loss": 3.9746, "step": 300 }, { "epoch": 2.32796486090776, "eval_runtime": 19.6437, "eval_samples_per_second": 21.992, "eval_steps_per_second": 5.498, "step": 300 }, { "epoch": 2.7183992191312836, "grad_norm": 524.243408203125, "learning_rate": 7.265625000000001e-05, "loss": 4.0115, "step": 350 }, { "epoch": 2.7183992191312836, "eval_runtime": 19.6341, "eval_samples_per_second": 22.003, "eval_steps_per_second": 5.501, "step": 350 }, { "epoch": 3.101512933138116, "grad_norm": 512.4241943359375, "learning_rate": 6.875e-05, "loss": 3.5169, "step": 400 }, { "epoch": 3.101512933138116, "eval_runtime": 19.6203, "eval_samples_per_second": 22.018, "eval_steps_per_second": 5.505, "step": 400 }, { "epoch": 3.49194729136164, "grad_norm": 845.7166137695312, "learning_rate": 6.484375e-05, "loss": 2.8932, "step": 450 }, { "epoch": 3.49194729136164, "eval_runtime": 19.6341, "eval_samples_per_second": 22.003, "eval_steps_per_second": 5.501, "step": 450 }, { "epoch": 3.8823816495851635, "grad_norm": 446.70782470703125, "learning_rate": 6.0937500000000004e-05, "loss": 2.4259, "step": 500 }, { "epoch": 3.8823816495851635, "eval_runtime": 19.6466, "eval_samples_per_second": 21.988, "eval_steps_per_second": 5.497, "step": 500 }, { "epoch": 4.265495363591996, "grad_norm": 787.6598510742188, "learning_rate": 5.703125e-05, "loss": 2.3731, "step": 550 }, { "epoch": 4.265495363591996, "eval_runtime": 19.665, "eval_samples_per_second": 21.968, "eval_steps_per_second": 5.492, "step": 550 }, { "epoch": 4.65592972181552, "grad_norm": 322.1536560058594, "learning_rate": 5.3125000000000004e-05, "loss": 2.2039, "step": 600 }, { "epoch": 4.65592972181552, "eval_runtime": 19.6404, "eval_samples_per_second": 21.995, "eval_steps_per_second": 5.499, "step": 600 }, { "epoch": 5.039043435822352, "grad_norm": 385.4160461425781, "learning_rate": 4.921875e-05, "loss": 1.9896, "step": 650 }, { "epoch": 5.039043435822352, "eval_runtime": 19.5642, "eval_samples_per_second": 22.081, "eval_steps_per_second": 5.52, "step": 650 }, { "epoch": 5.4294777940458765, "grad_norm": 784.8404541015625, "learning_rate": 4.5312500000000004e-05, "loss": 2.2482, "step": 700 }, { "epoch": 5.4294777940458765, "eval_runtime": 19.6391, "eval_samples_per_second": 21.997, "eval_steps_per_second": 5.499, "step": 700 }, { "epoch": 5.819912152269399, "grad_norm": 796.7393188476562, "learning_rate": 4.140625e-05, "loss": 1.9074, "step": 750 }, { "epoch": 5.819912152269399, "eval_runtime": 19.6326, "eval_samples_per_second": 22.004, "eval_steps_per_second": 5.501, "step": 750 }, { "epoch": 6.203025866276232, "grad_norm": 300.7341613769531, "learning_rate": 3.7500000000000003e-05, "loss": 1.7721, "step": 800 }, { "epoch": 6.203025866276232, "eval_runtime": 19.6207, "eval_samples_per_second": 22.018, "eval_steps_per_second": 5.504, "step": 800 }, { "epoch": 6.593460224499756, "grad_norm": 141.74777221679688, "learning_rate": 3.359375e-05, "loss": 1.4889, "step": 850 }, { "epoch": 6.593460224499756, "eval_runtime": 19.6434, "eval_samples_per_second": 21.992, "eval_steps_per_second": 5.498, "step": 850 }, { "epoch": 6.98389458272328, "grad_norm": 283.6333923339844, "learning_rate": 2.96875e-05, "loss": 1.5119, "step": 900 }, { "epoch": 6.98389458272328, "eval_runtime": 19.6441, "eval_samples_per_second": 21.991, "eval_steps_per_second": 5.498, "step": 900 }, { "epoch": 7.367008296730113, "grad_norm": 564.9349975585938, "learning_rate": 2.578125e-05, "loss": 1.441, "step": 950 }, { "epoch": 7.367008296730113, "eval_runtime": 19.6472, "eval_samples_per_second": 21.988, "eval_steps_per_second": 5.497, "step": 950 }, { "epoch": 7.7574426549536355, "grad_norm": 549.8765869140625, "learning_rate": 2.1875e-05, "loss": 1.3326, "step": 1000 }, { "epoch": 7.7574426549536355, "eval_runtime": 19.6413, "eval_samples_per_second": 21.995, "eval_steps_per_second": 5.499, "step": 1000 }, { "epoch": 8.140556368960468, "grad_norm": 726.1790161132812, "learning_rate": 1.796875e-05, "loss": 1.335, "step": 1050 }, { "epoch": 8.140556368960468, "eval_runtime": 19.6045, "eval_samples_per_second": 22.036, "eval_steps_per_second": 5.509, "step": 1050 }, { "epoch": 8.530990727183992, "grad_norm": 391.7787780761719, "learning_rate": 1.4062500000000001e-05, "loss": 1.2137, "step": 1100 }, { "epoch": 8.530990727183992, "eval_runtime": 19.6376, "eval_samples_per_second": 21.999, "eval_steps_per_second": 5.5, "step": 1100 }, { "epoch": 8.921425085407517, "grad_norm": 194.22886657714844, "learning_rate": 1.0156250000000001e-05, "loss": 1.1869, "step": 1150 }, { "epoch": 8.921425085407517, "eval_runtime": 19.647, "eval_samples_per_second": 21.988, "eval_steps_per_second": 5.497, "step": 1150 }, { "epoch": 9.304538799414349, "grad_norm": 266.43023681640625, "learning_rate": 6.25e-06, "loss": 1.049, "step": 1200 }, { "epoch": 9.304538799414349, "eval_runtime": 19.6329, "eval_samples_per_second": 22.004, "eval_steps_per_second": 5.501, "step": 1200 }, { "epoch": 9.694973157637872, "grad_norm": 144.0484161376953, "learning_rate": 2.3437500000000002e-06, "loss": 1.0052, "step": 1250 }, { "epoch": 9.694973157637872, "eval_runtime": 19.6249, "eval_samples_per_second": 22.013, "eval_steps_per_second": 5.503, "step": 1250 } ], "logging_steps": 50, "max_steps": 1280, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }