{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.7171029042667623,
  "eval_steps": 2000,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04,
      "grad_norm": 2.6033709049224854,
      "learning_rate": 9.9e-07,
      "loss": 1.0789,
      "step": 100
    },
    {
      "epoch": 0.07,
      "grad_norm": 3.7718708515167236,
      "learning_rate": 9.9e-07,
      "loss": 0.8366,
      "step": 200
    },
    {
      "epoch": 0.11,
      "grad_norm": 4.209937572479248,
      "learning_rate": 9.8e-07,
      "loss": 0.6863,
      "step": 300
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.8079888820648193,
      "learning_rate": 9.698989898989898e-07,
      "loss": 0.6659,
      "step": 400
    },
    {
      "epoch": 0.18,
      "grad_norm": 4.12679386138916,
      "learning_rate": 9.598989898989899e-07,
      "loss": 0.6546,
      "step": 500
    },
    {
      "epoch": 0.22,
      "grad_norm": 3.909221887588501,
      "learning_rate": 9.497979797979798e-07,
      "loss": 0.6581,
      "step": 600
    },
    {
      "epoch": 0.25,
      "grad_norm": 3.234121322631836,
      "learning_rate": 9.396969696969696e-07,
      "loss": 0.6224,
      "step": 700
    },
    {
      "epoch": 0.29,
      "grad_norm": 3.9791226387023926,
      "learning_rate": 9.295959595959596e-07,
      "loss": 0.6139,
      "step": 800
    },
    {
      "epoch": 0.32,
      "grad_norm": 3.557487726211548,
      "learning_rate": 9.194949494949495e-07,
      "loss": 0.6272,
      "step": 900
    },
    {
      "epoch": 0.36,
      "grad_norm": 3.946579694747925,
      "learning_rate": 9.093939393939394e-07,
      "loss": 0.6312,
      "step": 1000
    },
    {
      "epoch": 0.39,
      "grad_norm": 3.4085192680358887,
      "learning_rate": 8.992929292929292e-07,
      "loss": 0.6139,
      "step": 1100
    },
    {
      "epoch": 0.43,
      "grad_norm": 3.036348342895508,
      "learning_rate": 8.891919191919191e-07,
      "loss": 0.637,
      "step": 1200
    },
    {
      "epoch": 0.47,
      "grad_norm": 5.595344066619873,
      "learning_rate": 8.790909090909091e-07,
      "loss": 0.6447,
      "step": 1300
    },
    {
      "epoch": 0.5,
      "grad_norm": 4.76248025894165,
      "learning_rate": 8.68989898989899e-07,
      "loss": 0.6419,
      "step": 1400
    },
    {
      "epoch": 0.54,
      "grad_norm": 2.9645845890045166,
      "learning_rate": 8.588888888888888e-07,
      "loss": 0.6569,
      "step": 1500
    },
    {
      "epoch": 0.57,
      "grad_norm": 2.357501268386841,
      "learning_rate": 8.487878787878787e-07,
      "loss": 0.6345,
      "step": 1600
    },
    {
      "epoch": 0.61,
      "grad_norm": 4.0495171546936035,
      "learning_rate": 8.386868686868687e-07,
      "loss": 0.6004,
      "step": 1700
    },
    {
      "epoch": 0.65,
      "grad_norm": 3.0601418018341064,
      "learning_rate": 8.285858585858585e-07,
      "loss": 0.6444,
      "step": 1800
    },
    {
      "epoch": 0.68,
      "grad_norm": 4.655466079711914,
      "learning_rate": 8.184848484848484e-07,
      "loss": 0.621,
      "step": 1900
    },
    {
      "epoch": 0.72,
      "grad_norm": 4.7475666999816895,
      "learning_rate": 8.083838383838384e-07,
      "loss": 0.6554,
      "step": 2000
    },
    {
      "epoch": 0.72,
      "eval_loss": 0.7412441372871399,
      "eval_runtime": 565.2186,
      "eval_samples_per_second": 1.769,
      "eval_steps_per_second": 0.442,
      "step": 2000
    }
  ],
  "logging_steps": 100,
  "max_steps": 10000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 2000,
  "total_flos": 1.88564197343232e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}