{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.008923830643542047,
  "eval_steps": 500,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 0.8500534296035767,
      "learning_rate": 4.997769042339115e-05,
      "loss": 5.7067,
      "step": 100
    },
    {
      "epoch": 0.0,
      "grad_norm": 0.29711076617240906,
      "learning_rate": 4.995538084678229e-05,
      "loss": 4.9888,
      "step": 200
    },
    {
      "epoch": 0.0,
      "grad_norm": 0.21760140359401703,
      "learning_rate": 4.9933071270173436e-05,
      "loss": 4.7622,
      "step": 300
    },
    {
      "epoch": 0.0,
      "grad_norm": 0.2236577719449997,
      "learning_rate": 4.991076169356459e-05,
      "loss": 4.7175,
      "step": 400
    },
    {
      "epoch": 0.0,
      "grad_norm": 0.18726830184459686,
      "learning_rate": 4.9888452116955725e-05,
      "loss": 4.6896,
      "step": 500
    },
    {
      "epoch": 0.0,
      "grad_norm": 0.3301438093185425,
      "learning_rate": 4.986614254034687e-05,
      "loss": 4.6776,
      "step": 600
    },
    {
      "epoch": 0.0,
      "grad_norm": 0.23270975053310394,
      "learning_rate": 4.984383296373802e-05,
      "loss": 4.6656,
      "step": 700
    },
    {
      "epoch": 0.0,
      "grad_norm": 0.26907047629356384,
      "learning_rate": 4.982152338712916e-05,
      "loss": 4.6493,
      "step": 800
    },
    {
      "epoch": 0.0,
      "grad_norm": 0.22385388612747192,
      "learning_rate": 4.979921381052031e-05,
      "loss": 4.6447,
      "step": 900
    },
    {
      "epoch": 0.0,
      "grad_norm": 0.28366366028785706,
      "learning_rate": 4.9776904233911454e-05,
      "loss": 4.632,
      "step": 1000
    },
    {
      "epoch": 0.0,
      "grad_norm": 0.26614850759506226,
      "learning_rate": 4.975459465730259e-05,
      "loss": 4.6319,
      "step": 1100
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.3065544068813324,
      "learning_rate": 4.973228508069374e-05,
      "loss": 4.6223,
      "step": 1200
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.23633620142936707,
      "learning_rate": 4.970997550408489e-05,
      "loss": 4.619,
      "step": 1300
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.2925296127796173,
      "learning_rate": 4.968766592747603e-05,
      "loss": 4.616,
      "step": 1400
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.3636851906776428,
      "learning_rate": 4.9665356350867176e-05,
      "loss": 4.6118,
      "step": 1500
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.3567110598087311,
      "learning_rate": 4.964304677425832e-05,
      "loss": 4.6092,
      "step": 1600
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.35951370000839233,
      "learning_rate": 4.9620737197649465e-05,
      "loss": 4.6004,
      "step": 1700
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.2747381329536438,
      "learning_rate": 4.959842762104061e-05,
      "loss": 4.5976,
      "step": 1800
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.33775028586387634,
      "learning_rate": 4.9576118044431754e-05,
      "loss": 4.594,
      "step": 1900
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.2855284810066223,
      "learning_rate": 4.95538084678229e-05,
      "loss": 4.5928,
      "step": 2000
    }
  ],
  "logging_steps": 100,
  "max_steps": 224119,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "total_flos": 1.5251220029472768e+16,
  "train_batch_size": 256,
  "trial_name": null,
  "trial_params": null
}