| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.22456140350877193, | |
| "eval_steps": 500, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.011228070175438596, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 0.00024749999999999994, | |
| "loss": 8.5199, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.02245614035087719, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 0.0002998664031981949, | |
| "loss": 6.415, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.03368421052631579, | |
| "grad_norm": 0.48046875, | |
| "learning_rate": 0.0002993145411731054, | |
| "loss": 5.8846, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.04491228070175438, | |
| "grad_norm": 0.5, | |
| "learning_rate": 0.00029833654740795074, | |
| "loss": 5.5615, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.056140350877192984, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.00029693521301859697, | |
| "loss": 5.3436, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.056140350877192984, | |
| "eval_loss": 5.2718000411987305, | |
| "eval_runtime": 296.6131, | |
| "eval_samples_per_second": 50.571, | |
| "eval_steps_per_second": 6.321, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.06736842105263158, | |
| "grad_norm": 0.484375, | |
| "learning_rate": 0.00029511453730114126, | |
| "loss": 5.1964, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.07859649122807018, | |
| "grad_norm": 0.45703125, | |
| "learning_rate": 0.0002928797163182408, | |
| "loss": 5.0532, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.08982456140350877, | |
| "grad_norm": 0.3984375, | |
| "learning_rate": 0.00029023712806996646, | |
| "loss": 4.9288, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.10105263157894737, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 0.0002871943142915013, | |
| "loss": 4.8118, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.11228070175438597, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 0.0002837599589296326, | |
| "loss": 4.6787, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.11228070175438597, | |
| "eval_loss": 4.601714134216309, | |
| "eval_runtime": 296.6412, | |
| "eval_samples_per_second": 50.566, | |
| "eval_steps_per_second": 6.321, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.12350877192982457, | |
| "grad_norm": 0.49609375, | |
| "learning_rate": 0.00027994386335946324, | |
| "loss": 4.5393, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.13473684210526315, | |
| "grad_norm": 0.4375, | |
| "learning_rate": 0.0002757569184120724, | |
| "loss": 4.3818, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.14596491228070174, | |
| "grad_norm": 0.431640625, | |
| "learning_rate": 0.00027121107329295584, | |
| "loss": 4.2778, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.15719298245614036, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 0.0002663193014799507, | |
| "loss": 4.2072, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.16842105263157894, | |
| "grad_norm": 0.462890625, | |
| "learning_rate": 0.000261095563697969, | |
| "loss": 4.1242, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.16842105263157894, | |
| "eval_loss": 4.072988510131836, | |
| "eval_runtime": 296.6152, | |
| "eval_samples_per_second": 50.571, | |
| "eval_steps_per_second": 6.321, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.17964912280701753, | |
| "grad_norm": 0.40625, | |
| "learning_rate": 0.0002555547680762069, | |
| "loss": 4.0549, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.19087719298245615, | |
| "grad_norm": 0.47265625, | |
| "learning_rate": 0.00024971272760153834, | |
| "loss": 4.0018, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.20210526315789473, | |
| "grad_norm": 0.4453125, | |
| "learning_rate": 0.00024358611498951694, | |
| "loss": 3.9404, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.21333333333333335, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 0.0002371924151017814, | |
| "loss": 3.9074, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.22456140350877193, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 0.00023054987504566113, | |
| "loss": 3.8638, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.22456140350877193, | |
| "eval_loss": 3.8390953540802, | |
| "eval_runtime": 296.6249, | |
| "eval_samples_per_second": 50.569, | |
| "eval_steps_per_second": 6.321, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 6000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.7273358843904e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |