{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2553191489361702, "eval_steps": 500, "global_step": 21, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0121580547112462, "grad_norm": 0.0986328125, "learning_rate": 2e-05, "loss": 0.8436, "step": 1 }, { "epoch": 0.0243161094224924, "grad_norm": 0.09765625, "learning_rate": 4e-05, "loss": 0.9406, "step": 2 }, { "epoch": 0.0364741641337386, "grad_norm": 0.11083984375, "learning_rate": 6e-05, "loss": 1.1161, "step": 3 }, { "epoch": 0.0486322188449848, "grad_norm": 0.09521484375, "learning_rate": 8e-05, "loss": 0.9467, "step": 4 }, { "epoch": 0.060790273556231005, "grad_norm": 0.10009765625, "learning_rate": 0.0001, "loss": 0.8456, "step": 5 }, { "epoch": 0.0729483282674772, "grad_norm": 0.10009765625, "learning_rate": 0.00012, "loss": 0.9215, "step": 6 }, { "epoch": 0.0851063829787234, "grad_norm": 0.107421875, "learning_rate": 0.00014, "loss": 0.8381, "step": 7 }, { "epoch": 0.0972644376899696, "grad_norm": 0.1123046875, "learning_rate": 0.00016, "loss": 0.9411, "step": 8 }, { "epoch": 0.1094224924012158, "grad_norm": 0.11572265625, "learning_rate": 0.00018, "loss": 0.9348, "step": 9 }, { "epoch": 0.12158054711246201, "grad_norm": 0.1279296875, "learning_rate": 0.0002, "loss": 0.9752, "step": 10 }, { "epoch": 0.1337386018237082, "grad_norm": 0.1416015625, "learning_rate": 0.0001999048221581858, "loss": 1.1067, "step": 11 }, { "epoch": 0.1458966565349544, "grad_norm": 0.1328125, "learning_rate": 0.00019961946980917456, "loss": 0.904, "step": 12 }, { "epoch": 0.1580547112462006, "grad_norm": 0.146484375, "learning_rate": 0.00019914448613738106, "loss": 0.9718, "step": 13 }, { "epoch": 0.1702127659574468, "grad_norm": 0.1435546875, "learning_rate": 0.00019848077530122083, "loss": 0.9354, "step": 14 }, { "epoch": 0.182370820668693, "grad_norm": 0.12890625, "learning_rate": 0.00019762960071199333, "loss": 1.0536, "step": 15 }, { "epoch": 0.1945288753799392, "grad_norm": 0.1259765625, "learning_rate": 0.00019659258262890683, "loss": 1.0129, "step": 16 }, { "epoch": 0.2066869300911854, "grad_norm": 0.1328125, "learning_rate": 0.0001953716950748227, "loss": 1.0696, "step": 17 }, { "epoch": 0.2188449848024316, "grad_norm": 0.1025390625, "learning_rate": 0.00019396926207859084, "loss": 0.802, "step": 18 }, { "epoch": 0.23100303951367782, "grad_norm": 0.12255859375, "learning_rate": 0.0001923879532511287, "loss": 0.9757, "step": 19 }, { "epoch": 0.24316109422492402, "grad_norm": 0.123046875, "learning_rate": 0.000190630778703665, "loss": 1.1032, "step": 20 }, { "epoch": 0.2553191489361702, "grad_norm": 0.12158203125, "learning_rate": 0.00018870108331782217, "loss": 0.9319, "step": 21 } ], "logging_steps": 1, "max_steps": 82, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 21, "total_flos": 1.5579632954769408e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }