| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 7.0, | |
| "eval_steps": 500, | |
| "global_step": 11655, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.3003003003003003, | |
| "grad_norm": 4.6720170974731445, | |
| "learning_rate": 1.9399399399399402e-05, | |
| "loss": 0.4843, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.6006006006006006, | |
| "grad_norm": 17.49016571044922, | |
| "learning_rate": 1.87987987987988e-05, | |
| "loss": 0.3955, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.9009009009009009, | |
| "grad_norm": 50.96406936645508, | |
| "learning_rate": 1.81981981981982e-05, | |
| "loss": 0.3807, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.2911546230316162, | |
| "eval_runtime": 1.0985, | |
| "eval_samples_per_second": 1346.393, | |
| "eval_steps_per_second": 168.413, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 1.2012012012012012, | |
| "grad_norm": 41.78978729248047, | |
| "learning_rate": 1.7597597597597598e-05, | |
| "loss": 0.3176, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.5015015015015014, | |
| "grad_norm": 15.492409706115723, | |
| "learning_rate": 1.6996996996997e-05, | |
| "loss": 0.2729, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.8018018018018018, | |
| "grad_norm": 15.4843111038208, | |
| "learning_rate": 1.6396396396396396e-05, | |
| "loss": 0.2802, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.4967462718486786, | |
| "eval_runtime": 1.1211, | |
| "eval_samples_per_second": 1319.266, | |
| "eval_steps_per_second": 165.02, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 2.1021021021021022, | |
| "grad_norm": 0.058736126869916916, | |
| "learning_rate": 1.5795795795795797e-05, | |
| "loss": 0.2095, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.4024024024024024, | |
| "grad_norm": 41.070953369140625, | |
| "learning_rate": 1.5195195195195196e-05, | |
| "loss": 0.1309, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.7027027027027026, | |
| "grad_norm": 26.841840744018555, | |
| "learning_rate": 1.4594594594594596e-05, | |
| "loss": 0.169, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.5553244948387146, | |
| "eval_runtime": 1.1109, | |
| "eval_samples_per_second": 1331.404, | |
| "eval_steps_per_second": 166.538, | |
| "step": 4995 | |
| }, | |
| { | |
| "epoch": 3.003003003003003, | |
| "grad_norm": 0.2728441655635834, | |
| "learning_rate": 1.3993993993993995e-05, | |
| "loss": 0.1481, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 3.3033033033033035, | |
| "grad_norm": 0.013637651689350605, | |
| "learning_rate": 1.3393393393393394e-05, | |
| "loss": 0.0638, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 3.6036036036036037, | |
| "grad_norm": 0.5741263031959534, | |
| "learning_rate": 1.2792792792792795e-05, | |
| "loss": 0.0753, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 3.903903903903904, | |
| "grad_norm": 0.03777342289686203, | |
| "learning_rate": 1.2192192192192194e-05, | |
| "loss": 0.0823, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.7062426805496216, | |
| "eval_runtime": 1.1008, | |
| "eval_samples_per_second": 1343.529, | |
| "eval_steps_per_second": 168.055, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 4.2042042042042045, | |
| "grad_norm": 0.0037255873903632164, | |
| "learning_rate": 1.1591591591591593e-05, | |
| "loss": 0.0377, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 4.504504504504505, | |
| "grad_norm": 0.004647154361009598, | |
| "learning_rate": 1.0990990990990992e-05, | |
| "loss": 0.037, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 4.804804804804805, | |
| "grad_norm": 0.00761270709335804, | |
| "learning_rate": 1.039039039039039e-05, | |
| "loss": 0.0534, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.7775081396102905, | |
| "eval_runtime": 1.0937, | |
| "eval_samples_per_second": 1352.232, | |
| "eval_steps_per_second": 169.143, | |
| "step": 8325 | |
| }, | |
| { | |
| "epoch": 5.105105105105105, | |
| "grad_norm": 0.004814255982637405, | |
| "learning_rate": 9.78978978978979e-06, | |
| "loss": 0.0287, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 5.405405405405405, | |
| "grad_norm": 0.0026015336625277996, | |
| "learning_rate": 9.189189189189191e-06, | |
| "loss": 0.025, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 5.7057057057057055, | |
| "grad_norm": 0.012158718891441822, | |
| "learning_rate": 8.588588588588589e-06, | |
| "loss": 0.0246, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.8445501923561096, | |
| "eval_runtime": 1.0956, | |
| "eval_samples_per_second": 1349.885, | |
| "eval_steps_per_second": 168.85, | |
| "step": 9990 | |
| }, | |
| { | |
| "epoch": 6.006006006006006, | |
| "grad_norm": 0.0015228951815515757, | |
| "learning_rate": 7.987987987987988e-06, | |
| "loss": 0.0288, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 6.306306306306306, | |
| "grad_norm": 0.0007786727510392666, | |
| "learning_rate": 7.387387387387388e-06, | |
| "loss": 0.0152, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 6.606606606606607, | |
| "grad_norm": 0.0005245794309303164, | |
| "learning_rate": 6.786786786786788e-06, | |
| "loss": 0.0191, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 6.906906906906907, | |
| "grad_norm": 0.0009558356832712889, | |
| "learning_rate": 6.186186186186187e-06, | |
| "loss": 0.0111, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 0.9164330363273621, | |
| "eval_runtime": 1.0989, | |
| "eval_samples_per_second": 1345.937, | |
| "eval_steps_per_second": 168.356, | |
| "step": 11655 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 16650, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3066329128047360.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |