{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1000.0,
  "eval_steps": 500,
  "global_step": 8000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 62.5,
      "grad_norm": 0.2072206288576126,
      "learning_rate": 0.0009375,
      "loss": 0.3349,
      "step": 500
    },
    {
      "epoch": 62.5,
      "eval_loss": 0.21073544025421143,
      "eval_runtime": 1.7576,
      "eval_samples_per_second": 4470.361,
      "eval_steps_per_second": 35.276,
      "step": 500
    },
    {
      "epoch": 125.0,
      "grad_norm": 0.20736312866210938,
      "learning_rate": 0.000875,
      "loss": 0.1147,
      "step": 1000
    },
    {
      "epoch": 125.0,
      "eval_loss": 0.2409573197364807,
      "eval_runtime": 1.7582,
      "eval_samples_per_second": 4468.854,
      "eval_steps_per_second": 35.264,
      "step": 1000
    },
    {
      "epoch": 187.5,
      "grad_norm": 0.17418691515922546,
      "learning_rate": 0.0008125000000000001,
      "loss": 0.071,
      "step": 1500
    },
    {
      "epoch": 187.5,
      "eval_loss": 0.2686581015586853,
      "eval_runtime": 1.7698,
      "eval_samples_per_second": 4439.56,
      "eval_steps_per_second": 35.033,
      "step": 1500
    },
    {
      "epoch": 250.0,
      "grad_norm": 0.1891753375530243,
      "learning_rate": 0.00075,
      "loss": 0.0502,
      "step": 2000
    },
    {
      "epoch": 250.0,
      "eval_loss": 0.29014259576797485,
      "eval_runtime": 1.7819,
      "eval_samples_per_second": 4409.274,
      "eval_steps_per_second": 34.794,
      "step": 2000
    },
    {
      "epoch": 312.5,
      "grad_norm": 0.16434858739376068,
      "learning_rate": 0.0006875,
      "loss": 0.0373,
      "step": 2500
    },
    {
      "epoch": 312.5,
      "eval_loss": 0.3033287525177002,
      "eval_runtime": 1.7585,
      "eval_samples_per_second": 4468.05,
      "eval_steps_per_second": 35.258,
      "step": 2500
    },
    {
      "epoch": 375.0,
      "grad_norm": 0.11065812408924103,
      "learning_rate": 0.000625,
      "loss": 0.0301,
      "step": 3000
    },
    {
      "epoch": 375.0,
      "eval_loss": 0.31410983204841614,
      "eval_runtime": 1.7765,
      "eval_samples_per_second": 4422.629,
      "eval_steps_per_second": 34.899,
      "step": 3000
    },
    {
      "epoch": 437.5,
      "grad_norm": 0.12541820108890533,
      "learning_rate": 0.0005625000000000001,
      "loss": 0.025,
      "step": 3500
    },
    {
      "epoch": 437.5,
      "eval_loss": 0.3234836161136627,
      "eval_runtime": 1.8714,
      "eval_samples_per_second": 4198.44,
      "eval_steps_per_second": 33.13,
      "step": 3500
    },
    {
      "epoch": 500.0,
      "grad_norm": 0.12353639304637909,
      "learning_rate": 0.0005,
      "loss": 0.0212,
      "step": 4000
    },
    {
      "epoch": 500.0,
      "eval_loss": 0.33116382360458374,
      "eval_runtime": 2.0737,
      "eval_samples_per_second": 3788.95,
      "eval_steps_per_second": 29.899,
      "step": 4000
    },
    {
      "epoch": 562.5,
      "grad_norm": 0.14147008955478668,
      "learning_rate": 0.0004375,
      "loss": 0.0187,
      "step": 4500
    },
    {
      "epoch": 562.5,
      "eval_loss": 0.3403891623020172,
      "eval_runtime": 1.7439,
      "eval_samples_per_second": 4505.522,
      "eval_steps_per_second": 35.553,
      "step": 4500
    },
    {
      "epoch": 625.0,
      "grad_norm": 0.17466695606708527,
      "learning_rate": 0.000375,
      "loss": 0.017,
      "step": 5000
    },
    {
      "epoch": 625.0,
      "eval_loss": 0.3371485471725464,
      "eval_runtime": 1.7727,
      "eval_samples_per_second": 4432.252,
      "eval_steps_per_second": 34.975,
      "step": 5000
    },
    {
      "epoch": 687.5,
      "grad_norm": 0.1278233528137207,
      "learning_rate": 0.0003125,
      "loss": 0.0148,
      "step": 5500
    },
    {
      "epoch": 687.5,
      "eval_loss": 0.3466494679450989,
      "eval_runtime": 1.7776,
      "eval_samples_per_second": 4420.019,
      "eval_steps_per_second": 34.879,
      "step": 5500
    },
    {
      "epoch": 750.0,
      "grad_norm": 0.08440423011779785,
      "learning_rate": 0.00025,
      "loss": 0.0139,
      "step": 6000
    },
    {
      "epoch": 750.0,
      "eval_loss": 0.34804055094718933,
      "eval_runtime": 1.7744,
      "eval_samples_per_second": 4427.887,
      "eval_steps_per_second": 34.941,
      "step": 6000
    },
    {
      "epoch": 812.5,
      "grad_norm": 0.1064932644367218,
      "learning_rate": 0.0001875,
      "loss": 0.0124,
      "step": 6500
    },
    {
      "epoch": 812.5,
      "eval_loss": 0.3551761209964752,
      "eval_runtime": 1.7547,
      "eval_samples_per_second": 4477.816,
      "eval_steps_per_second": 35.335,
      "step": 6500
    },
    {
      "epoch": 875.0,
      "grad_norm": 0.07584909349679947,
      "learning_rate": 0.000125,
      "loss": 0.0118,
      "step": 7000
    },
    {
      "epoch": 875.0,
      "eval_loss": 0.3594026267528534,
      "eval_runtime": 1.7559,
      "eval_samples_per_second": 4474.601,
      "eval_steps_per_second": 35.309,
      "step": 7000
    },
    {
      "epoch": 937.5,
      "grad_norm": 0.04748733341693878,
      "learning_rate": 6.25e-05,
      "loss": 0.0112,
      "step": 7500
    },
    {
      "epoch": 937.5,
      "eval_loss": 0.35810717940330505,
      "eval_runtime": 1.7577,
      "eval_samples_per_second": 4469.966,
      "eval_steps_per_second": 35.273,
      "step": 7500
    },
    {
      "epoch": 1000.0,
      "grad_norm": 0.0842234194278717,
      "learning_rate": 0.0,
      "loss": 0.0106,
      "step": 8000
    },
    {
      "epoch": 1000.0,
      "eval_loss": 0.35844165086746216,
      "eval_runtime": 1.7498,
      "eval_samples_per_second": 4490.355,
      "eval_steps_per_second": 35.434,
      "step": 8000
    },
    {
      "epoch": 1000.0,
      "step": 8000,
      "total_flos": 3.308878023570227e+16,
      "train_loss": 0.04967061561346054,
      "train_runtime": 721.4078,
      "train_samples_per_second": 1386.178,
      "train_steps_per_second": 11.089
    }
  ],
  "logging_steps": 500,
  "max_steps": 8000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1000,
  "save_steps": 500,
  "total_flos": 3.308878023570227e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}