{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.22456140350877193, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011228070175438596, "grad_norm": 0.60546875, "learning_rate": 0.00024749999999999994, "loss": 8.5199, "step": 100 }, { "epoch": 0.02245614035087719, "grad_norm": 0.55859375, "learning_rate": 0.0002998664031981949, "loss": 6.415, "step": 200 }, { "epoch": 0.03368421052631579, "grad_norm": 0.48046875, "learning_rate": 0.0002993145411731054, "loss": 5.8846, "step": 300 }, { "epoch": 0.04491228070175438, "grad_norm": 0.5, "learning_rate": 0.00029833654740795074, "loss": 5.5615, "step": 400 }, { "epoch": 0.056140350877192984, "grad_norm": 0.63671875, "learning_rate": 0.00029693521301859697, "loss": 5.3436, "step": 500 }, { "epoch": 0.056140350877192984, "eval_loss": 5.2718000411987305, "eval_runtime": 296.6131, "eval_samples_per_second": 50.571, "eval_steps_per_second": 6.321, "step": 500 }, { "epoch": 0.06736842105263158, "grad_norm": 0.484375, "learning_rate": 0.00029511453730114126, "loss": 5.1964, "step": 600 }, { "epoch": 0.07859649122807018, "grad_norm": 0.45703125, "learning_rate": 0.0002928797163182408, "loss": 5.0532, "step": 700 }, { "epoch": 0.08982456140350877, "grad_norm": 0.3984375, "learning_rate": 0.00029023712806996646, "loss": 4.9288, "step": 800 }, { "epoch": 0.10105263157894737, "grad_norm": 0.408203125, "learning_rate": 0.0002871943142915013, "loss": 4.8118, "step": 900 }, { "epoch": 0.11228070175438597, "grad_norm": 0.609375, "learning_rate": 0.0002837599589296326, "loss": 4.6787, "step": 1000 }, { "epoch": 0.11228070175438597, "eval_loss": 4.601714134216309, "eval_runtime": 296.6412, "eval_samples_per_second": 50.566, "eval_steps_per_second": 6.321, "step": 1000 }, { "epoch": 0.12350877192982457, "grad_norm": 0.49609375, "learning_rate": 0.00027994386335946324, "loss": 4.5393, "step": 1100 }, { "epoch": 0.13473684210526315, "grad_norm": 0.4375, "learning_rate": 0.0002757569184120724, "loss": 4.3818, "step": 1200 }, { "epoch": 0.14596491228070174, "grad_norm": 0.431640625, "learning_rate": 0.00027121107329295584, "loss": 4.2778, "step": 1300 }, { "epoch": 0.15719298245614036, "grad_norm": 0.380859375, "learning_rate": 0.0002663193014799507, "loss": 4.2072, "step": 1400 }, { "epoch": 0.16842105263157894, "grad_norm": 0.462890625, "learning_rate": 0.000261095563697969, "loss": 4.1242, "step": 1500 }, { "epoch": 0.16842105263157894, "eval_loss": 4.072988510131836, "eval_runtime": 296.6152, "eval_samples_per_second": 50.571, "eval_steps_per_second": 6.321, "step": 1500 }, { "epoch": 0.17964912280701753, "grad_norm": 0.40625, "learning_rate": 0.0002555547680762069, "loss": 4.0549, "step": 1600 }, { "epoch": 0.19087719298245615, "grad_norm": 0.47265625, "learning_rate": 0.00024971272760153834, "loss": 4.0018, "step": 1700 }, { "epoch": 0.20210526315789473, "grad_norm": 0.4453125, "learning_rate": 0.00024358611498951694, "loss": 3.9404, "step": 1800 }, { "epoch": 0.21333333333333335, "grad_norm": 0.40234375, "learning_rate": 0.0002371924151017814, "loss": 3.9074, "step": 1900 }, { "epoch": 0.22456140350877193, "grad_norm": 0.39453125, "learning_rate": 0.00023054987504566113, "loss": 3.8638, "step": 2000 }, { "epoch": 0.22456140350877193, "eval_loss": 3.8390953540802, "eval_runtime": 296.6249, "eval_samples_per_second": 50.569, "eval_steps_per_second": 6.321, "step": 2000 } ], "logging_steps": 100, "max_steps": 6000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.7273358843904e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }