{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9049489395129615,
  "eval_steps": 500,
  "global_step": 360,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.012568735271013355,
      "grad_norm": 0.06298828125,
      "learning_rate": 0.0001,
      "loss": 0.6012,
      "step": 5
    },
    {
      "epoch": 0.02513747054202671,
      "grad_norm": 0.11767578125,
      "learning_rate": 0.0001,
      "loss": 0.3895,
      "step": 10
    },
    {
      "epoch": 0.037706205813040065,
      "grad_norm": 0.0908203125,
      "learning_rate": 0.0001,
      "loss": 0.2298,
      "step": 15
    },
    {
      "epoch": 0.05027494108405342,
      "grad_norm": 0.068359375,
      "learning_rate": 0.0001,
      "loss": 0.1486,
      "step": 20
    },
    {
      "epoch": 0.06284367635506677,
      "grad_norm": 0.06396484375,
      "learning_rate": 0.0001,
      "loss": 0.1333,
      "step": 25
    },
    {
      "epoch": 0.07541241162608013,
      "grad_norm": 0.0849609375,
      "learning_rate": 0.0001,
      "loss": 0.1203,
      "step": 30
    },
    {
      "epoch": 0.08798114689709348,
      "grad_norm": 0.0908203125,
      "learning_rate": 0.0001,
      "loss": 0.0904,
      "step": 35
    },
    {
      "epoch": 0.10054988216810684,
      "grad_norm": 0.05859375,
      "learning_rate": 0.0001,
      "loss": 0.0617,
      "step": 40
    },
    {
      "epoch": 0.11311861743912019,
      "grad_norm": 0.0478515625,
      "learning_rate": 0.0001,
      "loss": 0.0515,
      "step": 45
    },
    {
      "epoch": 0.12568735271013354,
      "grad_norm": 0.0634765625,
      "learning_rate": 0.0001,
      "loss": 0.0295,
      "step": 50
    },
    {
      "epoch": 0.13825608798114689,
      "grad_norm": 0.07421875,
      "learning_rate": 0.0001,
      "loss": 0.2835,
      "step": 55
    },
    {
      "epoch": 0.15082482325216026,
      "grad_norm": 0.057861328125,
      "learning_rate": 0.0001,
      "loss": 0.0973,
      "step": 60
    },
    {
      "epoch": 0.1633935585231736,
      "grad_norm": 0.026611328125,
      "learning_rate": 0.0001,
      "loss": 0.0755,
      "step": 65
    },
    {
      "epoch": 0.17596229379418696,
      "grad_norm": 0.0244140625,
      "learning_rate": 0.0001,
      "loss": 0.0547,
      "step": 70
    },
    {
      "epoch": 0.1885310290652003,
      "grad_norm": 0.0274658203125,
      "learning_rate": 0.0001,
      "loss": 0.0638,
      "step": 75
    },
    {
      "epoch": 0.20109976433621368,
      "grad_norm": 0.029052734375,
      "learning_rate": 0.0001,
      "loss": 0.0541,
      "step": 80
    },
    {
      "epoch": 0.21366849960722703,
      "grad_norm": 0.039306640625,
      "learning_rate": 0.0001,
      "loss": 0.0511,
      "step": 85
    },
    {
      "epoch": 0.22623723487824038,
      "grad_norm": 0.0196533203125,
      "learning_rate": 0.0001,
      "loss": 0.0392,
      "step": 90
    },
    {
      "epoch": 0.23880597014925373,
      "grad_norm": 0.0269775390625,
      "learning_rate": 0.0001,
      "loss": 0.0373,
      "step": 95
    },
    {
      "epoch": 0.2513747054202671,
      "grad_norm": 0.02734375,
      "learning_rate": 0.0001,
      "loss": 0.0168,
      "step": 100
    },
    {
      "epoch": 0.26394344069128045,
      "grad_norm": 0.0556640625,
      "learning_rate": 0.0001,
      "loss": 0.2346,
      "step": 105
    },
    {
      "epoch": 0.27651217596229377,
      "grad_norm": 0.0301513671875,
      "learning_rate": 0.0001,
      "loss": 0.0746,
      "step": 110
    },
    {
      "epoch": 0.28908091123330715,
      "grad_norm": 0.0294189453125,
      "learning_rate": 0.0001,
      "loss": 0.0534,
      "step": 115
    },
    {
      "epoch": 0.3016496465043205,
      "grad_norm": 0.0247802734375,
      "learning_rate": 0.0001,
      "loss": 0.0371,
      "step": 120
    },
    {
      "epoch": 0.31421838177533384,
      "grad_norm": 0.0225830078125,
      "learning_rate": 0.0001,
      "loss": 0.0488,
      "step": 125
    },
    {
      "epoch": 0.3267871170463472,
      "grad_norm": 0.02490234375,
      "learning_rate": 0.0001,
      "loss": 0.0444,
      "step": 130
    },
    {
      "epoch": 0.33935585231736054,
      "grad_norm": 0.0250244140625,
      "learning_rate": 0.0001,
      "loss": 0.038,
      "step": 135
    },
    {
      "epoch": 0.3519245875883739,
      "grad_norm": 0.0264892578125,
      "learning_rate": 0.0001,
      "loss": 0.0308,
      "step": 140
    },
    {
      "epoch": 0.3644933228593873,
      "grad_norm": 0.039306640625,
      "learning_rate": 0.0001,
      "loss": 0.0291,
      "step": 145
    },
    {
      "epoch": 0.3770620581304006,
      "grad_norm": 0.031982421875,
      "learning_rate": 0.0001,
      "loss": 0.0142,
      "step": 150
    },
    {
      "epoch": 0.389630793401414,
      "grad_norm": 0.045654296875,
      "learning_rate": 0.0001,
      "loss": 0.2053,
      "step": 155
    },
    {
      "epoch": 0.40219952867242736,
      "grad_norm": 0.0400390625,
      "learning_rate": 0.0001,
      "loss": 0.0658,
      "step": 160
    },
    {
      "epoch": 0.4147682639434407,
      "grad_norm": 0.0272216796875,
      "learning_rate": 0.0001,
      "loss": 0.045,
      "step": 165
    },
    {
      "epoch": 0.42733699921445406,
      "grad_norm": 0.021240234375,
      "learning_rate": 0.0001,
      "loss": 0.0343,
      "step": 170
    },
    {
      "epoch": 0.4399057344854674,
      "grad_norm": 0.0263671875,
      "learning_rate": 0.0001,
      "loss": 0.041,
      "step": 175
    },
    {
      "epoch": 0.45247446975648076,
      "grad_norm": 0.0311279296875,
      "learning_rate": 0.0001,
      "loss": 0.0382,
      "step": 180
    },
    {
      "epoch": 0.46504320502749413,
      "grad_norm": 0.022705078125,
      "learning_rate": 0.0001,
      "loss": 0.0295,
      "step": 185
    },
    {
      "epoch": 0.47761194029850745,
      "grad_norm": 0.0301513671875,
      "learning_rate": 0.0001,
      "loss": 0.0257,
      "step": 190
    },
    {
      "epoch": 0.49018067556952083,
      "grad_norm": 0.02490234375,
      "learning_rate": 0.0001,
      "loss": 0.0217,
      "step": 195
    },
    {
      "epoch": 0.5027494108405341,
      "grad_norm": 0.006866455078125,
      "learning_rate": 0.0001,
      "loss": 0.0073,
      "step": 200
    },
    {
      "epoch": 0.5153181461115475,
      "grad_norm": 0.04443359375,
      "learning_rate": 0.0001,
      "loss": 0.1655,
      "step": 205
    },
    {
      "epoch": 0.5278868813825609,
      "grad_norm": 0.056640625,
      "learning_rate": 0.0001,
      "loss": 0.051,
      "step": 210
    },
    {
      "epoch": 0.5404556166535742,
      "grad_norm": 0.026123046875,
      "learning_rate": 0.0001,
      "loss": 0.0393,
      "step": 215
    },
    {
      "epoch": 0.5530243519245875,
      "grad_norm": 0.01806640625,
      "learning_rate": 0.0001,
      "loss": 0.0255,
      "step": 220
    },
    {
      "epoch": 0.565593087195601,
      "grad_norm": 0.0230712890625,
      "learning_rate": 0.0001,
      "loss": 0.0333,
      "step": 225
    },
    {
      "epoch": 0.5781618224666143,
      "grad_norm": 0.024658203125,
      "learning_rate": 0.0001,
      "loss": 0.0289,
      "step": 230
    },
    {
      "epoch": 0.5907305577376276,
      "grad_norm": 0.0301513671875,
      "learning_rate": 0.0001,
      "loss": 0.0235,
      "step": 235
    },
    {
      "epoch": 0.603299293008641,
      "grad_norm": 0.0284423828125,
      "learning_rate": 0.0001,
      "loss": 0.0208,
      "step": 240
    },
    {
      "epoch": 0.6158680282796544,
      "grad_norm": 0.025634765625,
      "learning_rate": 0.0001,
      "loss": 0.0119,
      "step": 245
    },
    {
      "epoch": 0.6284367635506677,
      "grad_norm": 0.0125732421875,
      "learning_rate": 0.0001,
      "loss": 0.0093,
      "step": 250
    },
    {
      "epoch": 0.6410054988216811,
      "grad_norm": 0.051025390625,
      "learning_rate": 0.0001,
      "loss": 0.1598,
      "step": 255
    },
    {
      "epoch": 0.6535742340926944,
      "grad_norm": 0.0546875,
      "learning_rate": 0.0001,
      "loss": 0.0457,
      "step": 260
    },
    {
      "epoch": 0.6661429693637078,
      "grad_norm": 0.03564453125,
      "learning_rate": 0.0001,
      "loss": 0.0352,
      "step": 265
    },
    {
      "epoch": 0.6787117046347211,
      "grad_norm": 0.019775390625,
      "learning_rate": 0.0001,
      "loss": 0.024,
      "step": 270
    },
    {
      "epoch": 0.6912804399057345,
      "grad_norm": 0.0234375,
      "learning_rate": 0.0001,
      "loss": 0.0296,
      "step": 275
    },
    {
      "epoch": 0.7038491751767478,
      "grad_norm": 0.0264892578125,
      "learning_rate": 0.0001,
      "loss": 0.0249,
      "step": 280
    },
    {
      "epoch": 0.7164179104477612,
      "grad_norm": 0.029541015625,
      "learning_rate": 0.0001,
      "loss": 0.0199,
      "step": 285
    },
    {
      "epoch": 0.7289866457187746,
      "grad_norm": 0.02294921875,
      "learning_rate": 0.0001,
      "loss": 0.0154,
      "step": 290
    },
    {
      "epoch": 0.7415553809897879,
      "grad_norm": 0.0220947265625,
      "learning_rate": 0.0001,
      "loss": 0.0116,
      "step": 295
    },
    {
      "epoch": 0.7541241162608012,
      "grad_norm": 0.00531005859375,
      "learning_rate": 0.0001,
      "loss": 0.0058,
      "step": 300
    },
    {
      "epoch": 0.7666928515318147,
      "grad_norm": 0.049560546875,
      "learning_rate": 0.0001,
      "loss": 0.1521,
      "step": 305
    },
    {
      "epoch": 0.779261586802828,
      "grad_norm": 0.140625,
      "learning_rate": 0.0001,
      "loss": 0.0482,
      "step": 310
    },
    {
      "epoch": 0.7918303220738413,
      "grad_norm": 0.035888671875,
      "learning_rate": 0.0001,
      "loss": 0.0372,
      "step": 315
    },
    {
      "epoch": 0.8043990573448547,
      "grad_norm": 0.036865234375,
      "learning_rate": 0.0001,
      "loss": 0.0244,
      "step": 320
    },
    {
      "epoch": 0.816967792615868,
      "grad_norm": 0.030517578125,
      "learning_rate": 0.0001,
      "loss": 0.0263,
      "step": 325
    },
    {
      "epoch": 0.8295365278868814,
      "grad_norm": 0.024169921875,
      "learning_rate": 0.0001,
      "loss": 0.0218,
      "step": 330
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 0.0308837890625,
      "learning_rate": 0.0001,
      "loss": 0.0182,
      "step": 335
    },
    {
      "epoch": 0.8546739984289081,
      "grad_norm": 0.02880859375,
      "learning_rate": 0.0001,
      "loss": 0.014,
      "step": 340
    },
    {
      "epoch": 0.8672427336999214,
      "grad_norm": 0.03173828125,
      "learning_rate": 0.0001,
      "loss": 0.0109,
      "step": 345
    },
    {
      "epoch": 0.8798114689709348,
      "grad_norm": 0.01483154296875,
      "learning_rate": 0.0001,
      "loss": 0.0044,
      "step": 350
    },
    {
      "epoch": 0.8923802042419482,
      "grad_norm": 0.03955078125,
      "learning_rate": 0.0001,
      "loss": 0.1312,
      "step": 355
    },
    {
      "epoch": 0.9049489395129615,
      "grad_norm": 0.031982421875,
      "learning_rate": 0.0001,
      "loss": 0.0403,
      "step": 360
    }
  ],
  "logging_steps": 5,
  "max_steps": 360,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 90,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.6220305320330854e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}