{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9049489395129615, "eval_steps": 500, "global_step": 360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012568735271013355, "grad_norm": 0.06298828125, "learning_rate": 0.0001, "loss": 0.6012, "step": 5 }, { "epoch": 0.02513747054202671, "grad_norm": 0.11767578125, "learning_rate": 0.0001, "loss": 0.3895, "step": 10 }, { "epoch": 0.037706205813040065, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.2298, "step": 15 }, { "epoch": 0.05027494108405342, "grad_norm": 0.068359375, "learning_rate": 0.0001, "loss": 0.1486, "step": 20 }, { "epoch": 0.06284367635506677, "grad_norm": 0.06396484375, "learning_rate": 0.0001, "loss": 0.1333, "step": 25 }, { "epoch": 0.07541241162608013, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.1203, "step": 30 }, { "epoch": 0.08798114689709348, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.0904, "step": 35 }, { "epoch": 0.10054988216810684, "grad_norm": 0.05859375, "learning_rate": 0.0001, "loss": 0.0617, "step": 40 }, { "epoch": 0.11311861743912019, "grad_norm": 0.0478515625, "learning_rate": 0.0001, "loss": 0.0515, "step": 45 }, { "epoch": 0.12568735271013354, "grad_norm": 0.0634765625, "learning_rate": 0.0001, "loss": 0.0295, "step": 50 }, { "epoch": 0.13825608798114689, "grad_norm": 0.07421875, "learning_rate": 0.0001, "loss": 0.2835, "step": 55 }, { "epoch": 0.15082482325216026, "grad_norm": 0.057861328125, "learning_rate": 0.0001, "loss": 0.0973, "step": 60 }, { "epoch": 0.1633935585231736, "grad_norm": 0.026611328125, "learning_rate": 0.0001, "loss": 0.0755, "step": 65 }, { "epoch": 0.17596229379418696, "grad_norm": 0.0244140625, "learning_rate": 0.0001, "loss": 0.0547, "step": 70 }, { "epoch": 0.1885310290652003, "grad_norm": 0.0274658203125, "learning_rate": 0.0001, "loss": 0.0638, "step": 75 }, { "epoch": 0.20109976433621368, "grad_norm": 0.029052734375, "learning_rate": 0.0001, "loss": 0.0541, "step": 80 }, { "epoch": 0.21366849960722703, "grad_norm": 0.039306640625, "learning_rate": 0.0001, "loss": 0.0511, "step": 85 }, { "epoch": 0.22623723487824038, "grad_norm": 0.0196533203125, "learning_rate": 0.0001, "loss": 0.0392, "step": 90 }, { "epoch": 0.23880597014925373, "grad_norm": 0.0269775390625, "learning_rate": 0.0001, "loss": 0.0373, "step": 95 }, { "epoch": 0.2513747054202671, "grad_norm": 0.02734375, "learning_rate": 0.0001, "loss": 0.0168, "step": 100 }, { "epoch": 0.26394344069128045, "grad_norm": 0.0556640625, "learning_rate": 0.0001, "loss": 0.2346, "step": 105 }, { "epoch": 0.27651217596229377, "grad_norm": 0.0301513671875, "learning_rate": 0.0001, "loss": 0.0746, "step": 110 }, { "epoch": 0.28908091123330715, "grad_norm": 0.0294189453125, "learning_rate": 0.0001, "loss": 0.0534, "step": 115 }, { "epoch": 0.3016496465043205, "grad_norm": 0.0247802734375, "learning_rate": 0.0001, "loss": 0.0371, "step": 120 }, { "epoch": 0.31421838177533384, "grad_norm": 0.0225830078125, "learning_rate": 0.0001, "loss": 0.0488, "step": 125 }, { "epoch": 0.3267871170463472, "grad_norm": 0.02490234375, "learning_rate": 0.0001, "loss": 0.0444, "step": 130 }, { "epoch": 0.33935585231736054, "grad_norm": 0.0250244140625, "learning_rate": 0.0001, "loss": 0.038, "step": 135 }, { "epoch": 0.3519245875883739, "grad_norm": 0.0264892578125, "learning_rate": 0.0001, "loss": 0.0308, "step": 140 }, { "epoch": 0.3644933228593873, "grad_norm": 0.039306640625, "learning_rate": 0.0001, "loss": 0.0291, "step": 145 }, { "epoch": 0.3770620581304006, "grad_norm": 0.031982421875, "learning_rate": 0.0001, "loss": 0.0142, "step": 150 }, { "epoch": 0.389630793401414, "grad_norm": 0.045654296875, "learning_rate": 0.0001, "loss": 0.2053, "step": 155 }, { "epoch": 0.40219952867242736, "grad_norm": 0.0400390625, "learning_rate": 0.0001, "loss": 0.0658, "step": 160 }, { "epoch": 0.4147682639434407, "grad_norm": 0.0272216796875, "learning_rate": 0.0001, "loss": 0.045, "step": 165 }, { "epoch": 0.42733699921445406, "grad_norm": 0.021240234375, "learning_rate": 0.0001, "loss": 0.0343, "step": 170 }, { "epoch": 0.4399057344854674, "grad_norm": 0.0263671875, "learning_rate": 0.0001, "loss": 0.041, "step": 175 }, { "epoch": 0.45247446975648076, "grad_norm": 0.0311279296875, "learning_rate": 0.0001, "loss": 0.0382, "step": 180 }, { "epoch": 0.46504320502749413, "grad_norm": 0.022705078125, "learning_rate": 0.0001, "loss": 0.0295, "step": 185 }, { "epoch": 0.47761194029850745, "grad_norm": 0.0301513671875, "learning_rate": 0.0001, "loss": 0.0257, "step": 190 }, { "epoch": 0.49018067556952083, "grad_norm": 0.02490234375, "learning_rate": 0.0001, "loss": 0.0217, "step": 195 }, { "epoch": 0.5027494108405341, "grad_norm": 0.006866455078125, "learning_rate": 0.0001, "loss": 0.0073, "step": 200 }, { "epoch": 0.5153181461115475, "grad_norm": 0.04443359375, "learning_rate": 0.0001, "loss": 0.1655, "step": 205 }, { "epoch": 0.5278868813825609, "grad_norm": 0.056640625, "learning_rate": 0.0001, "loss": 0.051, "step": 210 }, { "epoch": 0.5404556166535742, "grad_norm": 0.026123046875, "learning_rate": 0.0001, "loss": 0.0393, "step": 215 }, { "epoch": 0.5530243519245875, "grad_norm": 0.01806640625, "learning_rate": 0.0001, "loss": 0.0255, "step": 220 }, { "epoch": 0.565593087195601, "grad_norm": 0.0230712890625, "learning_rate": 0.0001, "loss": 0.0333, "step": 225 }, { "epoch": 0.5781618224666143, "grad_norm": 0.024658203125, "learning_rate": 0.0001, "loss": 0.0289, "step": 230 }, { "epoch": 0.5907305577376276, "grad_norm": 0.0301513671875, "learning_rate": 0.0001, "loss": 0.0235, "step": 235 }, { "epoch": 0.603299293008641, "grad_norm": 0.0284423828125, "learning_rate": 0.0001, "loss": 0.0208, "step": 240 }, { "epoch": 0.6158680282796544, "grad_norm": 0.025634765625, "learning_rate": 0.0001, "loss": 0.0119, "step": 245 }, { "epoch": 0.6284367635506677, "grad_norm": 0.0125732421875, "learning_rate": 0.0001, "loss": 0.0093, "step": 250 }, { "epoch": 0.6410054988216811, "grad_norm": 0.051025390625, "learning_rate": 0.0001, "loss": 0.1598, "step": 255 }, { "epoch": 0.6535742340926944, "grad_norm": 0.0546875, "learning_rate": 0.0001, "loss": 0.0457, "step": 260 }, { "epoch": 0.6661429693637078, "grad_norm": 0.03564453125, "learning_rate": 0.0001, "loss": 0.0352, "step": 265 }, { "epoch": 0.6787117046347211, "grad_norm": 0.019775390625, "learning_rate": 0.0001, "loss": 0.024, "step": 270 }, { "epoch": 0.6912804399057345, "grad_norm": 0.0234375, "learning_rate": 0.0001, "loss": 0.0296, "step": 275 }, { "epoch": 0.7038491751767478, "grad_norm": 0.0264892578125, "learning_rate": 0.0001, "loss": 0.0249, "step": 280 }, { "epoch": 0.7164179104477612, "grad_norm": 0.029541015625, "learning_rate": 0.0001, "loss": 0.0199, "step": 285 }, { "epoch": 0.7289866457187746, "grad_norm": 0.02294921875, "learning_rate": 0.0001, "loss": 0.0154, "step": 290 }, { "epoch": 0.7415553809897879, "grad_norm": 0.0220947265625, "learning_rate": 0.0001, "loss": 0.0116, "step": 295 }, { "epoch": 0.7541241162608012, "grad_norm": 0.00531005859375, "learning_rate": 0.0001, "loss": 0.0058, "step": 300 }, { "epoch": 0.7666928515318147, "grad_norm": 0.049560546875, "learning_rate": 0.0001, "loss": 0.1521, "step": 305 }, { "epoch": 0.779261586802828, "grad_norm": 0.140625, "learning_rate": 0.0001, "loss": 0.0482, "step": 310 }, { "epoch": 0.7918303220738413, "grad_norm": 0.035888671875, "learning_rate": 0.0001, "loss": 0.0372, "step": 315 }, { "epoch": 0.8043990573448547, "grad_norm": 0.036865234375, "learning_rate": 0.0001, "loss": 0.0244, "step": 320 }, { "epoch": 0.816967792615868, "grad_norm": 0.030517578125, "learning_rate": 0.0001, "loss": 0.0263, "step": 325 }, { "epoch": 0.8295365278868814, "grad_norm": 0.024169921875, "learning_rate": 0.0001, "loss": 0.0218, "step": 330 }, { "epoch": 0.8421052631578947, "grad_norm": 0.0308837890625, "learning_rate": 0.0001, "loss": 0.0182, "step": 335 }, { "epoch": 0.8546739984289081, "grad_norm": 0.02880859375, "learning_rate": 0.0001, "loss": 0.014, "step": 340 }, { "epoch": 0.8672427336999214, "grad_norm": 0.03173828125, "learning_rate": 0.0001, "loss": 0.0109, "step": 345 }, { "epoch": 0.8798114689709348, "grad_norm": 0.01483154296875, "learning_rate": 0.0001, "loss": 0.0044, "step": 350 }, { "epoch": 0.8923802042419482, "grad_norm": 0.03955078125, "learning_rate": 0.0001, "loss": 0.1312, "step": 355 }, { "epoch": 0.9049489395129615, "grad_norm": 0.031982421875, "learning_rate": 0.0001, "loss": 0.0403, "step": 360 } ], "logging_steps": 5, "max_steps": 360, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 90, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6220305320330854e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }