{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.929233772571987,
  "eval_steps": 50,
  "global_step": 1280,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.3904343582235237,
      "grad_norm": 1104.4410400390625,
      "learning_rate": 9.609375e-05,
      "loss": 24.0156,
      "step": 50
    },
    {
      "epoch": 0.3904343582235237,
      "eval_runtime": 19.6625,
      "eval_samples_per_second": 21.971,
      "eval_steps_per_second": 5.493,
      "step": 50
    },
    {
      "epoch": 0.7808687164470474,
      "grad_norm": 808.6486206054688,
      "learning_rate": 9.21875e-05,
      "loss": 8.6753,
      "step": 100
    },
    {
      "epoch": 0.7808687164470474,
      "eval_runtime": 19.6174,
      "eval_samples_per_second": 22.021,
      "eval_steps_per_second": 5.505,
      "step": 100
    },
    {
      "epoch": 1.16398243045388,
      "grad_norm": 668.7958984375,
      "learning_rate": 8.828125000000001e-05,
      "loss": 6.9236,
      "step": 150
    },
    {
      "epoch": 1.16398243045388,
      "eval_runtime": 19.6375,
      "eval_samples_per_second": 21.999,
      "eval_steps_per_second": 5.5,
      "step": 150
    },
    {
      "epoch": 1.5544167886774036,
      "grad_norm": 1052.2135009765625,
      "learning_rate": 8.4375e-05,
      "loss": 5.8306,
      "step": 200
    },
    {
      "epoch": 1.5544167886774036,
      "eval_runtime": 19.6418,
      "eval_samples_per_second": 21.994,
      "eval_steps_per_second": 5.498,
      "step": 200
    },
    {
      "epoch": 1.9448511469009273,
      "grad_norm": 445.023681640625,
      "learning_rate": 8.046875e-05,
      "loss": 4.2927,
      "step": 250
    },
    {
      "epoch": 1.9448511469009273,
      "eval_runtime": 19.66,
      "eval_samples_per_second": 21.974,
      "eval_steps_per_second": 5.493,
      "step": 250
    },
    {
      "epoch": 2.32796486090776,
      "grad_norm": 1359.267333984375,
      "learning_rate": 7.65625e-05,
      "loss": 3.9746,
      "step": 300
    },
    {
      "epoch": 2.32796486090776,
      "eval_runtime": 19.6437,
      "eval_samples_per_second": 21.992,
      "eval_steps_per_second": 5.498,
      "step": 300
    },
    {
      "epoch": 2.7183992191312836,
      "grad_norm": 524.243408203125,
      "learning_rate": 7.265625000000001e-05,
      "loss": 4.0115,
      "step": 350
    },
    {
      "epoch": 2.7183992191312836,
      "eval_runtime": 19.6341,
      "eval_samples_per_second": 22.003,
      "eval_steps_per_second": 5.501,
      "step": 350
    },
    {
      "epoch": 3.101512933138116,
      "grad_norm": 512.4241943359375,
      "learning_rate": 6.875e-05,
      "loss": 3.5169,
      "step": 400
    },
    {
      "epoch": 3.101512933138116,
      "eval_runtime": 19.6203,
      "eval_samples_per_second": 22.018,
      "eval_steps_per_second": 5.505,
      "step": 400
    },
    {
      "epoch": 3.49194729136164,
      "grad_norm": 845.7166137695312,
      "learning_rate": 6.484375e-05,
      "loss": 2.8932,
      "step": 450
    },
    {
      "epoch": 3.49194729136164,
      "eval_runtime": 19.6341,
      "eval_samples_per_second": 22.003,
      "eval_steps_per_second": 5.501,
      "step": 450
    },
    {
      "epoch": 3.8823816495851635,
      "grad_norm": 446.70782470703125,
      "learning_rate": 6.0937500000000004e-05,
      "loss": 2.4259,
      "step": 500
    },
    {
      "epoch": 3.8823816495851635,
      "eval_runtime": 19.6466,
      "eval_samples_per_second": 21.988,
      "eval_steps_per_second": 5.497,
      "step": 500
    },
    {
      "epoch": 4.265495363591996,
      "grad_norm": 787.6598510742188,
      "learning_rate": 5.703125e-05,
      "loss": 2.3731,
      "step": 550
    },
    {
      "epoch": 4.265495363591996,
      "eval_runtime": 19.665,
      "eval_samples_per_second": 21.968,
      "eval_steps_per_second": 5.492,
      "step": 550
    },
    {
      "epoch": 4.65592972181552,
      "grad_norm": 322.1536560058594,
      "learning_rate": 5.3125000000000004e-05,
      "loss": 2.2039,
      "step": 600
    },
    {
      "epoch": 4.65592972181552,
      "eval_runtime": 19.6404,
      "eval_samples_per_second": 21.995,
      "eval_steps_per_second": 5.499,
      "step": 600
    },
    {
      "epoch": 5.039043435822352,
      "grad_norm": 385.4160461425781,
      "learning_rate": 4.921875e-05,
      "loss": 1.9896,
      "step": 650
    },
    {
      "epoch": 5.039043435822352,
      "eval_runtime": 19.5642,
      "eval_samples_per_second": 22.081,
      "eval_steps_per_second": 5.52,
      "step": 650
    },
    {
      "epoch": 5.4294777940458765,
      "grad_norm": 784.8404541015625,
      "learning_rate": 4.5312500000000004e-05,
      "loss": 2.2482,
      "step": 700
    },
    {
      "epoch": 5.4294777940458765,
      "eval_runtime": 19.6391,
      "eval_samples_per_second": 21.997,
      "eval_steps_per_second": 5.499,
      "step": 700
    },
    {
      "epoch": 5.819912152269399,
      "grad_norm": 796.7393188476562,
      "learning_rate": 4.140625e-05,
      "loss": 1.9074,
      "step": 750
    },
    {
      "epoch": 5.819912152269399,
      "eval_runtime": 19.6326,
      "eval_samples_per_second": 22.004,
      "eval_steps_per_second": 5.501,
      "step": 750
    },
    {
      "epoch": 6.203025866276232,
      "grad_norm": 300.7341613769531,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 1.7721,
      "step": 800
    },
    {
      "epoch": 6.203025866276232,
      "eval_runtime": 19.6207,
      "eval_samples_per_second": 22.018,
      "eval_steps_per_second": 5.504,
      "step": 800
    },
    {
      "epoch": 6.593460224499756,
      "grad_norm": 141.74777221679688,
      "learning_rate": 3.359375e-05,
      "loss": 1.4889,
      "step": 850
    },
    {
      "epoch": 6.593460224499756,
      "eval_runtime": 19.6434,
      "eval_samples_per_second": 21.992,
      "eval_steps_per_second": 5.498,
      "step": 850
    },
    {
      "epoch": 6.98389458272328,
      "grad_norm": 283.6333923339844,
      "learning_rate": 2.96875e-05,
      "loss": 1.5119,
      "step": 900
    },
    {
      "epoch": 6.98389458272328,
      "eval_runtime": 19.6441,
      "eval_samples_per_second": 21.991,
      "eval_steps_per_second": 5.498,
      "step": 900
    },
    {
      "epoch": 7.367008296730113,
      "grad_norm": 564.9349975585938,
      "learning_rate": 2.578125e-05,
      "loss": 1.441,
      "step": 950
    },
    {
      "epoch": 7.367008296730113,
      "eval_runtime": 19.6472,
      "eval_samples_per_second": 21.988,
      "eval_steps_per_second": 5.497,
      "step": 950
    },
    {
      "epoch": 7.7574426549536355,
      "grad_norm": 549.8765869140625,
      "learning_rate": 2.1875e-05,
      "loss": 1.3326,
      "step": 1000
    },
    {
      "epoch": 7.7574426549536355,
      "eval_runtime": 19.6413,
      "eval_samples_per_second": 21.995,
      "eval_steps_per_second": 5.499,
      "step": 1000
    },
    {
      "epoch": 8.140556368960468,
      "grad_norm": 726.1790161132812,
      "learning_rate": 1.796875e-05,
      "loss": 1.335,
      "step": 1050
    },
    {
      "epoch": 8.140556368960468,
      "eval_runtime": 19.6045,
      "eval_samples_per_second": 22.036,
      "eval_steps_per_second": 5.509,
      "step": 1050
    },
    {
      "epoch": 8.530990727183992,
      "grad_norm": 391.7787780761719,
      "learning_rate": 1.4062500000000001e-05,
      "loss": 1.2137,
      "step": 1100
    },
    {
      "epoch": 8.530990727183992,
      "eval_runtime": 19.6376,
      "eval_samples_per_second": 21.999,
      "eval_steps_per_second": 5.5,
      "step": 1100
    },
    {
      "epoch": 8.921425085407517,
      "grad_norm": 194.22886657714844,
      "learning_rate": 1.0156250000000001e-05,
      "loss": 1.1869,
      "step": 1150
    },
    {
      "epoch": 8.921425085407517,
      "eval_runtime": 19.647,
      "eval_samples_per_second": 21.988,
      "eval_steps_per_second": 5.497,
      "step": 1150
    },
    {
      "epoch": 9.304538799414349,
      "grad_norm": 266.43023681640625,
      "learning_rate": 6.25e-06,
      "loss": 1.049,
      "step": 1200
    },
    {
      "epoch": 9.304538799414349,
      "eval_runtime": 19.6329,
      "eval_samples_per_second": 22.004,
      "eval_steps_per_second": 5.501,
      "step": 1200
    },
    {
      "epoch": 9.694973157637872,
      "grad_norm": 144.0484161376953,
      "learning_rate": 2.3437500000000002e-06,
      "loss": 1.0052,
      "step": 1250
    },
    {
      "epoch": 9.694973157637872,
      "eval_runtime": 19.6249,
      "eval_samples_per_second": 22.013,
      "eval_steps_per_second": 5.503,
      "step": 1250
    }
  ],
  "logging_steps": 50,
  "max_steps": 1280,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}