{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025, "grad_norm": 28.533191680908203, "learning_rate": 0.00019950000000000002, "loss": 10.2028, "step": 10 }, { "epoch": 0.005, "grad_norm": 1.5962224006652832, "learning_rate": 0.000199, "loss": 1.8731, "step": 20 }, { "epoch": 0.0075, "grad_norm": 0.8784465193748474, "learning_rate": 0.00019850000000000003, "loss": 0.5834, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.6551746726036072, "learning_rate": 0.00019800000000000002, "loss": 0.4274, "step": 40 }, { "epoch": 0.0125, "grad_norm": 0.7458402514457703, "learning_rate": 0.00019750000000000003, "loss": 0.2921, "step": 50 }, { "epoch": 0.015, "grad_norm": 0.6910417675971985, "learning_rate": 0.00019700000000000002, "loss": 0.1889, "step": 60 }, { "epoch": 0.0175, "grad_norm": 0.575136125087738, "learning_rate": 0.0001965, "loss": 0.1379, "step": 70 }, { "epoch": 0.02, "grad_norm": 1.2535680532455444, "learning_rate": 0.000196, "loss": 0.1053, "step": 80 }, { "epoch": 0.0225, "grad_norm": 0.9697864651679993, "learning_rate": 0.0001955, "loss": 0.0756, "step": 90 }, { "epoch": 0.025, "grad_norm": 0.508269727230072, "learning_rate": 0.000195, "loss": 0.055, "step": 100 }, { "epoch": 0.0275, "grad_norm": 0.6621774435043335, "learning_rate": 0.0001945, "loss": 0.0487, "step": 110 }, { "epoch": 0.03, "grad_norm": 0.5406679511070251, "learning_rate": 0.000194, "loss": 0.0406, "step": 120 }, { "epoch": 0.0325, "grad_norm": 0.35967350006103516, "learning_rate": 0.00019350000000000001, "loss": 0.0347, "step": 130 }, { "epoch": 0.035, "grad_norm": 0.6122244000434875, "learning_rate": 0.000193, "loss": 0.0334, "step": 140 }, { "epoch": 0.0375, "grad_norm": 0.4679579734802246, "learning_rate": 0.00019250000000000002, "loss": 0.0342, "step": 150 }, { "epoch": 0.04, "grad_norm": 0.6229879856109619, "learning_rate": 0.000192, "loss": 0.0328, "step": 160 }, { "epoch": 0.0425, "grad_norm": 0.4741787314414978, "learning_rate": 0.00019150000000000002, "loss": 0.0328, "step": 170 }, { "epoch": 0.045, "grad_norm": 0.3581089377403259, "learning_rate": 0.000191, "loss": 0.0329, "step": 180 }, { "epoch": 0.0475, "grad_norm": 0.2805705964565277, "learning_rate": 0.00019050000000000002, "loss": 0.0316, "step": 190 }, { "epoch": 0.05, "grad_norm": 0.36797094345092773, "learning_rate": 0.00019, "loss": 0.0314, "step": 200 }, { "epoch": 0.0525, "grad_norm": 0.22872206568717957, "learning_rate": 0.0001895, "loss": 0.0304, "step": 210 }, { "epoch": 0.055, "grad_norm": 0.3525296151638031, "learning_rate": 0.00018899999999999999, "loss": 0.0315, "step": 220 }, { "epoch": 0.0575, "grad_norm": 0.21026159822940826, "learning_rate": 0.0001885, "loss": 0.0302, "step": 230 }, { "epoch": 0.06, "grad_norm": 0.1741417497396469, "learning_rate": 0.000188, "loss": 0.0307, "step": 240 }, { "epoch": 0.0625, "grad_norm": 0.35116010904312134, "learning_rate": 0.0001875, "loss": 0.0305, "step": 250 }, { "epoch": 0.065, "grad_norm": 0.2572971284389496, "learning_rate": 0.00018700000000000002, "loss": 0.0313, "step": 260 }, { "epoch": 0.0675, "grad_norm": 0.2466694414615631, "learning_rate": 0.0001865, "loss": 0.0299, "step": 270 }, { "epoch": 0.07, "grad_norm": 0.19943873584270477, "learning_rate": 0.00018600000000000002, "loss": 0.0304, "step": 280 }, { "epoch": 0.0725, "grad_norm": 0.3378709852695465, "learning_rate": 0.0001855, "loss": 0.0299, "step": 290 }, { "epoch": 0.075, "grad_norm": 0.23438668251037598, "learning_rate": 0.00018500000000000002, "loss": 0.0301, "step": 300 }, { "epoch": 0.0775, "grad_norm": 0.3201534152030945, "learning_rate": 0.0001845, "loss": 0.0307, "step": 310 }, { "epoch": 0.08, "grad_norm": 0.19868455827236176, "learning_rate": 0.00018400000000000003, "loss": 0.0294, "step": 320 }, { "epoch": 0.0825, "grad_norm": 0.20487827062606812, "learning_rate": 0.00018350000000000002, "loss": 0.0309, "step": 330 }, { "epoch": 0.085, "grad_norm": 0.3057793378829956, "learning_rate": 0.000183, "loss": 0.0307, "step": 340 }, { "epoch": 0.0875, "grad_norm": 0.12229125201702118, "learning_rate": 0.0001825, "loss": 0.0303, "step": 350 }, { "epoch": 0.09, "grad_norm": 0.18177232146263123, "learning_rate": 0.000182, "loss": 0.0302, "step": 360 }, { "epoch": 0.0925, "grad_norm": 0.28575578331947327, "learning_rate": 0.0001815, "loss": 0.0304, "step": 370 }, { "epoch": 0.095, "grad_norm": 0.19034205377101898, "learning_rate": 0.000181, "loss": 0.03, "step": 380 }, { "epoch": 0.0975, "grad_norm": 0.23103861510753632, "learning_rate": 0.0001805, "loss": 0.0305, "step": 390 }, { "epoch": 0.1, "grad_norm": 0.15927983820438385, "learning_rate": 0.00018, "loss": 0.0292, "step": 400 } ], "logging_steps": 10, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 440449577779200.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }